#/usr/bin/python #################################################### # # Author: Brian Chan (birdchan@ucdavis.edu) # Supervisor : Alex Kozik (akozik@atgc.org) # Date: February 27 2004 # #################################################### # # In short, this script reads in the contig ID info # and contig mismatch info, then output details # about possible polymorphism. # ############ Detailed description ################# # # It first asks for misMatch file, which is in this format: # ContigName \t seqName \t (index1:symbol)* # For example, a line could be like the following: # QH_CA_Contig23 QHA10B27.yg.ab1+ 72:D|29:I # The symbol D means deletion, S means substitution, I # means insertion. So, say 72:D means that at index # 72 of such seq of such contig there is a deletion. # # It then asks for a contig ID file, which is in this format: # ContigName \t numOfSeq \t (seqName w/ start and end indexes)* # For example, a line could look like this: # QG_CA_Contig72 2 Seq1+(2,200)|Seq2+(1,234) # # It then asks for whether we want to use prefix or # suffix to distinguish seqs in two groups. # # It then asks how long the pattern is. This pattern # corresponds to the prefix or suffix we chose. # # Then it asks for the pattern for group 1. Then for # group 2. # # Say, if we input QGA, QGB, QGC, QGD and QGI for # group 1, and QGE, QGF, QGG, QGH, and QGJ for # group 2. Then all the seq with the prefix or # suffix (depending what we chose) in the group # 1 patterns will be placed in group 1. Same # applies to group 2. For example, seq QGB1122 # will be placed in group 1. All unidentified # seq will be ignored. # # After all these input, the script then analysze # the content of the two input files. It forms # groups and checks if possibly polymorphism # exists. If so, output findings to corresponding # output file. # # The output format is as follow: # ContigName DIS ALL PartialFraction LowPriority # For example, the followings are valid outputs: # QH_CA_Contig921 S 1 2/5 1 # QH_CA_Contig3499 I 7 1/13 0 # # DIS is an element of {D, I, S} # # ALL means the number of DIS occuring in a group # whose sequences having a particular index all # have this DIS, while the other sequences in # the other group all do not have this DIS at # that particular index. # # PartialFraction is similar to ALL, except that # some sequences don't have that particular # DIS, while the sequences in the other group # all don't have that DIS. # # LowPriority is either 0 or 1. Basically, when # ALL is greater than 1, then LowPriority is 0, # meaning this contig has some priority. When # ALL is less than or equal to 1, then LowPriority # is 1, meaning this contig has low priority. # # As an illustration, # -----D--------- # -----D-------------S---------D--- # -----D-------------S--- # --D--------- # ========I==S=======D=== # ========I==S==== # ========I===== # # The output would be # ContigName D 4 0/1 0 # ContigName I 3 0/1 0 # ContigName S 2 2/3 0 # # Notice, the default output of the partial fraction # is 0/1 . # # The following outputs are wrong # ContigName D 1 0/1 1 (since we have 4 D's for ALL) # ContigName S 0 2/3 0 (since we do have something for ALL) # # As rules given for this script, we keep the # biggest "ALL" and "PartialFraction". # #################################################### # importing the libraries import sys from string import * from string import atoi from string import rstrip from math import log10 #################################################### # class definition def mycmp(left, right): lSeqNum = int( left[12:len(left)] ) # get seq ID rSeqNum = int( right[12:len(right)] ) # get seq ID if lSeqNum > rSeqNum: return 1 else: return -1 class seqInfo: def __init__(self, name): self.seqName = name self.info = "" def setSeqName(self, seqName): self.seqName = seqName def addInfo(self, info): self.info = info def printInfo(self): print self.info class seqMMGroupClass: def __init__(self, contigID): self.contigID = contigID self.seqList_G1 = {} self.seqList_G1_index = [] self.seqList_G2 = {} self.seqList_G2_index = [] def readinSeq(self, name, info): global contig_pre_or_suf global contig_pre_or_suf_size global contig_G1_def global contig_G2_def if contig_pre_or_suf == "p": # if prefix type_str = name[0:contig_pre_or_suf_size] else: # else, assume it is suffix type_str = name[contig_pre_or_suf_size*(-1):] if (type_str in contig_G1_def): if info == "no polymorphism found": self.seqList_G1[name] = "" else: self.seqList_G1[name] = info allInfos = split(info, "|") for info in allInfos: num, type = split(info, ":") if not num in self.seqList_G1_index: self.seqList_G1_index.append(num) elif (type_str in contig_G2_def): if info == "no polymorphism found": self.seqList_G2[name] = "" else: self.seqList_G2[name] = info allInfos = split(info, "|") for info in allInfos: num, type = split(info, ":") if not num in self.seqList_G2_index: self.seqList_G2_index.append(num) def get_G1_Dict(self): return self.seqList_G1 def get_G2_Dict(self): return self.seqList_G2 def get_G1_index(self): return self.seqList_G1_index def get_G2_index(self): return self.seqList_G2_index def clear(self): self.seqList_G1.clear() self.seqList_G1_index.clear() self.seqList_G2.clear() self.seqList_G2_index.clear() def printInfo(self): # print out G1 print ">>>> G1" allSeqs = self.seqList_G1.items() allSeqs.sort(lambda f, s: cmp(f[0], s[0])) for name, info in allSeqs: print name, info # print out G2 print ">>>> G2" allSeqs = self.seqList_G2.items() allSeqs.sort(lambda f, s: cmp(f[0], s[0])) for name, info in allSeqs: print name, info #def analyse(self): class seqMMClass: def __init__(self): self.seqMMGroups = {} self.contigID = "" self.currMMGroupList = {} def setContigName(self, contigID): self.contigID = contigID def readinSeq(self, seqName, info): self.currMMGroupList[seqName] = info def finalizeGroup(self): MMGroup = seqMMGroupClass(self.contigID) allSeqs = self.currMMGroupList.items() allSeqs.sort(lambda f, s: cmp(f[0], s[0])) for sn, info in allSeqs: MMGroup.readinSeq(sn, info) self.seqMMGroups[self.contigID] = MMGroup self.contigID = "" self.currMMGroupList = {} def getMMGroup(self, contigID): return self.seqMMGroups[contigID] def clear(self): self.seqMMGroups = {} self.contigID = "" self.currMMGroupList = {} def printInfo(self): allMMGroups = self.seqMMGroups.items() allMMGroups.sort(lambda f, s: mycmp(f[0], s[0])) for contigID, group in allMMGroups: print contigID, "-------------------------" group.printInfo() class seqContigClass: def __init__(self): self.seqs = {} def readinSeq(self, name, info): self.seqs[name] = info def getContig(self, name): return self.seqs[name] def getKeys(self): return self.seqs.keys() def clear(self): self.seqs.clear() def printInfo(self): allSeqs = self.seqs.items() allSeqs.sort(lambda f, s: mycmp(f[0], s[0])) for name, info in allSeqs: print name, info #def analyse(self): #################################################### #global fp_in_MM = 0 # input file handler for the mismatch file fp_in_contig = 0 # input file handler for the contig file fp_out_del = 0 # output file handler for the deletion file fp_out_ins = 0 # output file handler for the insertion file fp_out_sub = 0 # output file handler for the substitution file seqMM = seqMMClass() # obj that stores G1 and G2 seq's seqContig = seqContigClass() # obj that stores the seq's info contig_pre_or_suf = "" # do we distinguish the contigs by prefix or suffix ? contig_pre_or_suf_size = 0 # what is the size of it ? contig_G1_def = [] # The definition of G1 (what is defined as in group 1) contig_G2_def = [] # The definition of G2 (what is defined as in group 2) #################################################### # function definition def main_routine(): read_in_contig_data() # read data from the contig file read_in_MM_data() # read data from the mismatch file check_polymorphism() # check possible polymorphism #-------------------------------------- def read_in_contig_data(): global fp_in_contig lines = fp_in_contig.readlines() # read all lines from the contig file if not lines: print "No input in the input file" sys.exit(0) # if no input, exit i = 0 # the line index, starting from the top seqContig.clear() # clear up the internal storage while i < len(lines): # for each line print i line = getLine(lines, i) contigName, numSeq, info = split(line, "\t") # retrieve info seqContig.readinSeq(contigName, info) # store the info i = i + 1 # for debugging #seqContig.printInfo() #-------------------------------------- def read_in_MM_data(): global fp_in_MM lines = fp_in_MM.readlines() # read all lines from the mismatch file if not lines: print "No input in the input file" sys.exit(0) # if no input, exit i = 0 # the line index, starting from the top seqMM.clear() # clear up the internal memory while i < len(lines): print i line = getLine(lines, i) curr_contigName, t1, t2 = split(line, "\t") # retrieve contig name seqMM.setContigName(curr_contigName) # store contig name next_contig_line_num = next_diff_contig_line(lines, i, curr_contigName) while i < next_contig_line_num: line = getLine(lines, i) contigName, seqName, info = split(line, "\t") # retrieve info seqMM.readinSeq(seqName, info) # store info i = i + 1 seqMM.finalizeGroup() # um... this is necessary for memory arrangement # for debugging #seqMM.printInfo() #-------------------------------------- def check_polymorphism(): contigIDs = seqContig.getKeys() # unordered contigIDs for cid in contigIDs: # check for each contig """ This is for debugging (to focus on one contig) if cid != "QH_CA_Contig921": continue """ # get info for individual contig contigInfo = seqContig.getContig(cid) MMGroup = seqMM.getMMGroup(cid) # retrieve data out from contigInfo contigInfoList = [] contigInfoList = retrieveContigInfo(contigInfo) # Here we get the seqNames DictG1 = MMGroup.get_G1_Dict() # get the set of subseqs DictG2 = MMGroup.get_G2_Dict() # get the set of subseqs # Here we get indexes (where D, I, or S occurs) G1_index = MMGroup.get_G1_index() # get the indexes that need to check G2_index = MMGroup.get_G2_index() # get the indexes that need to check # do a union on D's, I's and S's to have all the indexes for checking all_index = G1_index # all_index is the union of the two for i in G2_index: if not i in all_index: all_index.append(i) all_index.sort(lambda x, y: cmp(atoi(x), atoi(y))) #-- search for D, I, S print "Checking polymorphism for", cid check_polymorphism_contig("D", DictG1, DictG2, G1_index, G2_index, all_index, cid, contigInfoList) check_polymorphism_contig("I", DictG1, DictG2, G1_index, G2_index, all_index, cid, contigInfoList) check_polymorphism_contig("S", DictG1, DictG2, G1_index, G2_index, all_index, cid, contigInfoList) ################################################# # helper funcions def getLine(lines, index): line = lines[index] line = rstrip(line) # rid of \n return line #-------------------------------------- def retrieveContigInfo(contigInfo): contigInfoList = [] # each element is a list of (seqName, start, end) sign = " " t_contigInfoList = split(contigInfo, "|") for t_ci in t_contigInfoList: if find(t_ci, "+") != -1: # get sign sign = "+" if find(t_ci, "-") != -1: sign = "-" t_seqName, t_indexes = split(t_ci, sign) t_indexes = t_indexes[1:len(t_indexes)-1] start, end = split(t_indexes, ",") # get start and end indexes start = atoi(start) # we store integer internally end = atoi(end) c = (t_seqName+sign, start, end) # package the three into one contigInfoList.append(c) # store it into the list """ This is for debugging (printing out the seqName in a group) print "===========" for i in range( len(contigInfoList) ): print contigInfoList[i] print "//////////////" """ return contigInfoList #--------------------------------- def check_polymorphism_contig(DIS, DictG1, DictG2, G1_index, G2_index, all_index, cid, contigInfoList): low_priority = 0 # the Low_Priority field max_occur_all = 0 # the ALL field max_occur_partial_num = 0 # numerator for partialFraction max_occur_partial_den = 1 # denominator for partialFraction position_all = [] # location of where DIS occurs (assuming refering ALL) # get num of seqs in group 1 and 2 (should be held constant) numOfSeqG1_const = len( DictG1.keys() ) # num of seqs in group 1 numOfSeqG2_const = len( DictG2.keys() ) # num of seqs in group 2 for i in all_index: # we go from left to right, looking for possible polymorphism i = atoi(i) # i needs to be converted to type int total_occur_G1 = 0 # total occurance of DIS in group 1 total_occur_G2 = 0 # total occurance of DIS in group 2 # these are for local use numOfSeqG1 = numOfSeqG1_const numOfSeqG2 = numOfSeqG2_const # analyze the group for contigInfo in contigInfoList: # should be (for each seq's info) c, startIndex, endIndex = contigInfo # decompose it # if seq belongs to group 1, update vars corresponding to group 1 if DictG1.has_key(c): numOfSeqG1, total_occur_G1 = updateTotalOccur(DIS, DictG1, c, i, total_occur_G1, numOfSeqG1, startIndex, endIndex) # if seq belongs to group 2, update vars corresponding to group 2 elif DictG2.has_key(c): numOfSeqG2, total_occur_G2 = updateTotalOccur(DIS, DictG2, c, i, total_occur_G2, numOfSeqG2, startIndex, endIndex) else: print "neither in DictG1 or DictG2..." sys.exit(0) # update max occurance of ALL if total_occur_G1 == numOfSeqG1 and total_occur_G1 > max_occur_all and total_occur_G2 == 0: max_occur_all = numOfSeqG1 position_all.append(i) if total_occur_G2 == numOfSeqG2 and total_occur_G2 > max_occur_all and total_occur_G1 == 0: max_occur_all = numOfSeqG2 position_all.append(i) # update max occurance of PARTIAL if numOfSeqG1 != 0 and total_occur_G1 < numOfSeqG1 and total_occur_G2 == 0: fractionC = ( total_occur_G1 * 1.0 ) / numOfSeqG1 fractionM = ( max_occur_partial_num * 1.0 ) / max_occur_partial_den if fractionC > fractionM or (fractionC==fractionM and numOfSeqG1>max_occur_partial_den): max_occur_partial_num = total_occur_G1 max_occur_partial_den = numOfSeqG1 if numOfSeqG2 != 0 and total_occur_G2 < numOfSeqG2 and total_occur_G1 == 0: fractionC = ( total_occur_G2 * 1.0 ) / numOfSeqG2 fractionM = ( max_occur_partial_num * 1.0 ) / max_occur_partial_den if fractionC > fractionM or (fractionC==fractionM and numOfSeqG2>max_occur_partial_den): max_occur_partial_num = total_occur_G2 max_occur_partial_den = numOfSeqG2 # for debugging (check indexes and other local variables) # print DIS, i, ">", total_occur_G1, numOfSeqG1, "|", total_occur_G2, numOfSeqG2, "<", max_occur_all, max_occur_partial_num, max_occur_partial_den # update low_priority (only one parent?) if max_occur_all == 1: low_priority = 1 # output findings OutputGood(DIS, cid, max_occur_all, max_occur_partial_num, max_occur_partial_den, low_priority, position_all) #--------------------------------- def updateTotalOccur(DIS, Dict, c, i, total_occur_arg, numOfSeq_arg, start, end): total_occur = total_occur_arg # save args to local vars numOfSeq = numOfSeq_arg MM_info = Dict[c] # this is something like 121:S|134:I|341:S if find(MM_info, str(i)) != -1: # see if MM_info contain index i t1 = index(MM_info, str(i)) # if so, get the DIS symbol t2 = int(log10(i)) t = MM_info[t1+t2+2] # 2 coz DIS is after : if t == DIS: # if it's the provided one, we got a hit total_occur = total_occur + 1 elif start > i or end < i: # if location i is not in this seq numOfSeq = numOfSeq - 1 # treat this seq as not existent return numOfSeq, total_occur #--------------------------------- def next_diff_contig_line(lines, index, contigName): i = index line = getLine(lines, i) curr_contigName, t1, t2 = split(line, "\t") contigName = curr_contigName while contigName == curr_contigName: i = i + 1 if i >= len(lines): return len(lines) line = getLine(lines, i) contigName, t1, t2 = split(line, "\t") return i #--------------------------------- def OutputGood(DIS, cid, All, Partial_num, Partial_den, low_priority, position_all): # format output position string position_str = "|" i = 0 # index var linking_flag = 0 # indicator of linking (eg. 1-4) previous_p = -1 # previous p in position_all for p in position_all: if i == 0: position_str = position_str + str(p) else: if p == previous_p + 1: linking_flag = 1 else: if linking_flag == 0: # normal output position_str = position_str + ":" + str(p) if linking_flag == 1: # output the end part position_str = position_str + "-" + str(previous_p) + ":" + str(p) linking_flag = 0 previous_p = p i = i + 1 if linking_flag == 1: position_str = position_str + "-" + str(previous_p) position_str = position_str + "|" # output all the strings if DIS == "D": OutputDeletionsGood(cid, All, Partial_num, Partial_den, low_priority, position_str) elif DIS == "I": OutputInsertionsGood(cid, All, Partial_num, Partial_den, low_priority, position_str) elif DIS == "S": OutputSubstitutionsGood(cid, All, Partial_num, Partial_den, low_priority, position_str) #--------------------------------- def OutputDeletionsGood(contigID, All, Partial_num, Partial_den, low_priority, position_str): global fp_out_del s = "Congratulation: " + contigID + " contains deletion with possible polymorphism between parents!!!" print s outputStr = contigID + "\tD\t" + str(All) + "\t" + str(Partial_num) + "/" + str(Partial_den) + "\t" + str(low_priority) + "\t" + position_str + "\n" fp_out_del.write(outputStr) #--------------------------------- def OutputInsertionsGood(contigID, All, Partial_num, Partial_den, low_priority, position_str): global fp_out_ins s = "Congratulation: " + contigID + " contains insertion with possible polymorphism between parents!!!" print s outputStr = contigID + "\tI\t" + str(All) + "\t" + str(Partial_num) + "/" + str(Partial_den) + "\t" + str(low_priority) + "\t" + position_str + "\n" fp_out_ins.write(outputStr) #--------------------------------- def OutputSubstitutionsGood(contigID, All, Partial_num, Partial_den, low_priority, position_str): global fp_out_sub s = "Congratulation: " + contigID + " contains substitution with possible polymorphism between parents!!!" print s outputStr = contigID + "\tS\t" + str(All) + "\t" + str(Partial_num) + "/" + str(Partial_den) + "\t" + str(low_priority) + "\t" + position_str + "\n" fp_out_sub.write(outputStr) #################################################### # main #ask for input file and open them s = raw_input("Enter the mismatch file name : ") #s = "Sunflower_Contigs_MM.mySQL" #s = "sun_contig.mismatch.out" input_filename_MM = s fp_in_MM = open(input_filename_MM, "rb") s = raw_input("Enter the contig info file name : ") #s = "Sunflower_Contigs_ID.mySQL" #s = "cap3_QH_CA.contig.info" input_filename_contig = s fp_in_contig = open(input_filename_contig, "rb") #assign output filename output_filename_deletion = "deletions.good" output_filename_insertion = "insertions.good" output_filename_substitution = "substitutions.good" #open the output file fp_out_del = open(output_filename_deletion, "wb") fp_out_ins = open(output_filename_insertion, "wb") fp_out_sub = open(output_filename_substitution, "wb") #ask if we use prefix or suffix to distinguish between group 1 and group 2 #contig_pre_or_suf = "p" while 1: contig_pre_or_suf = raw_input("Distinguish by (p)refix or (s)uffix? : ") if contig_pre_or_suf != "p" and contig_pre_or_suf != "s": print "Please enter either 'p' or 's', but not", contig_pre_or_suf else: break #ask how long the pre or suffix is #contig_pre_or_suf_size = 3 while 1: if contig_pre_or_suf == "p": s = raw_input("Please input the size of the prefix : ") contig_pre_or_suf_size = int(s) elif contig_pre_or_suf == "s": s = raw_input("Please input the size of the suffix : ") contig_pre_or_suf_size = int(s) if contig_pre_or_suf_size > 0: break # ask for the definition of G1 (what is defined as in group 1) # contig_G1_def.append("QHA") # contig_G1_def.append("QHB") # contig_G1_def.append("QHC") # contig_G1_def.append("QHD") # contig_G1_def.append("QHI") print "Specify Group 1" while 1: print "Group 1 has", contig_G1_def #getting new element s = raw_input("Please input element : ") if len(s) != contig_pre_or_suf_size: print "element must be of size", contig_pre_or_suf_size continue if s in contig_G1_def: print "element is already in Group 1" continue #add the new element to Group 1 contig_G1_def.append(s) #ask if the user wants to add more s = raw_input("Do you want to add more? (y/n) : ") if s != "y": break # ask for the definition of G2 (what is defined as in group 2) # contig_G2_def.append("QHE") # contig_G2_def.append("QHF") # contig_G2_def.append("QHG") # contig_G2_def.append("QHH") # contig_G2_def.append("QHJ") print "Specify Group 2" while 1: print "Group 2 has", contig_G2_def #getting new element s = raw_input("Please input element : ") if len(s) != contig_pre_or_suf_size: print "element must be of size", contig_pre_or_suf_size continue if s in contig_G1_def: print "element is already in Group 1" continue if s in contig_G2_def: print "element is already in Group 2" continue #add the new element to Group 2 contig_G2_def.append(s) #ask if the user wants to add more s = raw_input("Do you want to add more? (y/n) : ") if s != "y": break #open the input and output file print "reading data form the input file", "..." main_routine() #close the input and output files fp_in_MM.close() fp_in_contig.close() fp_out_del.close() fp_out_ins.close() fp_out_sub.close()