#!/usr/bin/python #################################################### # # # SEQUENCE EXTRACTION FROM SUBJECT AND QUERY FILES # # WITHOUT STOP CODONS WITHIN SEQUECE # # # # INPUT - TWO FILES WITH DNA SEQUENCES # # (QUERY AND SUBJECT) # # AND # # PARSED BLAST REPORT BY tcl_blast_parser_123 # # # # NOTE! BLAST REPORT CORRESPONDS TO BLASTX # # (TRANSLATED DNA VERSUS PROTEIN SEQUENCES) # # IT MEANS THAT SUBJECT DNA FILE SHOULD CORRESPOND # # EXACTLY TO PROTEIN QUERY FILE FOR PROPER RESULTS # # # #################################################### #################################################### # # # COPYRIGHT, ALEXANDER KOZIK, 2003 # # # #################################################### def Seqs_Extractor(in_name1, in_name2, in_name3, out_name, out_dir): global stop_codon_status print "INPUT FILE 1 (QR_SEQS) : " + in_name1 print "INPUT FILE 2 (DB_SEQS) : " + in_name2 print "INPUT FILE 3 (BLASTX) : " + in_name3 print "OUTPUT FILE (SEQ FRAGM): " + out_name print "OUTPUT DIR : " + out_dir cod_name = out_name + '.triplets' hit_name = out_name + '.subj_seq' info_name = out_name + '.overlap_info' cons_codn = out_name + '.x_triplets' kska_name = out_name + '.x_kska' in_file1 = open(in_name1, "rb") in_file2 = open(in_name2, "rb") in_file3 = open(in_name3, "rb") # out_file = open(out_name, "wb") out_file = open(out_name + '.query_seq', "wb") out_file2 = open(hit_name, "wb") cod_file = open(cod_name, "wb") kska_file = open(kska_name, "wb") info_file = open(info_name, "wb") xcdn_file = open(cons_codn, "wb") os.mkdir(out_dir) group_number = 1 seq_list = [] hit_list = [] seq_array = {} hit_array = {} ########################################## # READ SEQUENCE DATA INTO ARRAY : STEP 1 # # QUERY DNA FASTA FILE # ########################################## line_counter = 0 ################################# while 1: ### LAST SEQUENCE IN FASTA FILE ### t = in_file1.readline() if t == '': ### SUB_SEQ FUNCTION ### have_seqs = "".join(my_seqs) seqs_len = len(have_seqs) seq_array[proper_id] = have_seqs #### END OF SUB_SEQ #### break ### SEQ PROCESSING ### if '\n' in t: t = t[:-1] if '\r' in t: t = t[:-1] fasta_match = t[0:1] if fasta_match == ">": gi_test = t[0:4] if gi_test == ">gi|": # print gi_test descr_line = t descr_line = re.sub("^>gi\|", "", descr_line) descr_line = re.sub("\|", '\t', descr_line, 1) # print line_counter line_counter += 1 else: descr_line = t descr_line = re.sub("^>", "", descr_line) descr_line = re.sub(" ", '\t', descr_line, 1) # print line_counter line_counter += 1 good_head = string.split(descr_line, '\t')[0] try: long_tail = string.split(descr_line, '\t')[1] except: long_tail = "no description found" if good_head in seq_list: running_text = "\ \n\n Ooops... ID duplication \n\n check input for duplications \n\n\n ID: " + good_head + "\n\n\n" print running_text ### ### INSERT TKINTER TEXT MESSAGE BOX ### break seq_list.append(good_head) print `line_counter` + '\t' + good_head if line_counter != 1: ### SUB_SEQ FUNCTION ### have_seqs = "".join(my_seqs) seqs_len = len(have_seqs) seq_array[proper_id] = have_seqs #print good_head #print have_seqs #### END OF SUB_SEQ #### have_seqs = "" my_seqs = [] if fasta_match != ">" and fasta_match != "": proper_id = good_head good_name = long_tail my_seqs.append(t) ################################# ### END OF STEP 1 # ################################# seq_list2 = [] seq_array2 = {} ########################################## # READ SEQUENCE DATA INTO ARRAY : STEP 2 # # SUBJECT DNA FASTA FILE # ########################################## line_counter = 0 ################################# while 1: ### LAST SEQUENCE IN FASTA FILE ### t = in_file2.readline() if t == '': ### SUB_SEQ FUNCTION ### have_seqs = "".join(my_seqs) seqs_len = len(have_seqs) seq_array2[proper_id] = have_seqs #### END OF SUB_SEQ #### break ### SEQ PROCESSING ### if '\n' in t: t = t[:-1] if '\r' in t: t = t[:-1] fasta_match = t[0:1] if fasta_match == ">": gi_test = t[0:4] if gi_test == ">gi|": # print gi_test descr_line = t descr_line = re.sub("^>gi\|", "", descr_line) descr_line = re.sub("\|", '\t', descr_line, 1) # print line_counter line_counter += 1 else: descr_line = t descr_line = re.sub("^>", "", descr_line) descr_line = re.sub(" ", '\t', descr_line, 1) # print line_counter line_counter += 1 good_head = string.split(descr_line, '\t')[0] try: long_tail = string.split(descr_line, '\t')[1] except: long_tail = "no description found" if good_head in seq_list2: running_text = "\ \n\n Ooops... ID duplication \n\n check input for duplications \n\n\n ID: " + good_head + "\n\n\n" print running_text ### ### INSERT TKINTER TEXT MESSAGE BOX ### break seq_list2.append(good_head) print `line_counter` + '\t' + good_head if line_counter != 1: ### SUB_SEQ FUNCTION ### have_seqs = "".join(my_seqs) seqs_len = len(have_seqs) seq_array2[proper_id] = have_seqs #print good_head #print have_seqs #### END OF SUB_SEQ #### have_seqs = "" my_seqs = [] if fasta_match != ">" and fasta_match != "": proper_id = good_head good_name = long_tail my_seqs.append(t) ################################# # END OF STEP 2 # ################################# ################################# hit_count = 0 ################################# # READ BLASTX DATA # ################################# while 1: t = in_file3.readline() if t == '': break if '\n' in t: t = t[:-1] if '\r' in t: t = t[:-1] t = t.split('\t') ################# id = t[0] # print 'id' + '\t' + id hit = t[1] # print 'hit' + '\t' + hit ds = t[2] # print 'ds' + '\t' + ds if ds != 'no_hits_found': aln_idn = t[4] hnb = t[7] hnb = int(hnb) # print 'hnb' + '\t' + hnb fr = t[8] # print 'fr' + '\t' + fr qst = t[9] qst = int(qst) # print 'qst' + '\t' + qst qen = t[10] qen = int(qen) # print 'qen' + '\t' + qen hst = t[11] hst = int(hst) # print 'hst' + '\t' + hst hen = t[12] hen = int(hen) # print 'hen' + '\t' + hen gaps = t[14] ################################# # SUBSEQUENCE EXTRACTION # ################################# if hnb == 1: hit_count += 1 print `hit_count` + '\t' + id ######################## sequence = seq_array[id] # print sequence seq_len = len(sequence) if qen > qst: direction = 'forward' sub_seq = sequence[(qst-1):(qen-0)] sub_len = len(sub_seq) if qen < qst: direction = 'reverse' t_r = list(sequence) # string -> list of chars t_r.reverse() # inplace reverse the list t_r = string.join(t_r, '') # list of strings -> string t_r = string.upper(t_r) # all uppercase t_r = re.sub("A", "t", t_r) # A -> t t_r = re.sub("T", "a", t_r) # T -> a t_r = re.sub("G", "c", t_r) # G -> c t_r = re.sub("C", "g", t_r) # C -> g t_r = string.upper(t_r) # back to uppercase sequence = t_r sub_seq = sequence[(seq_len-qst):(seq_len-qen + 1)] sub_len = len(sub_seq) # print sub_seq mod_3 = math.fmod(sub_len, 3) if mod_3 != 0: print "MOD IS NOT 3 !!!" time.sleep(1) ################################ sequence2 = seq_array2[hit] # print sequence2 seq_len2 = len(sequence2) ## PROTEIN POSITION INTO DNA POSITION ### hst = hst*3 - 2 hen = hen*3 direction2 = 'forward' sub_seq2 = sequence2[(hst-1):(hen-0)] sub_len2 = len(sub_seq2) mod_3s = math.fmod(sub_len2, 3) if mod_3s != 0: print "MOD IS NOT 3 !!!" time.sleep(1) n_find = sub_seq.rfind('N') n_find2 = sub_seq2.rfind('N') if n_find == -1 and n_find2 == -1: ################################ Count_Triplets(id, hit, sub_len, sub_seq, sub_len2, sub_seq2, kska_file, gaps) ################################ if stop_codon_status == 'GOOD': out_file.write('>' + id + ' length: ' + `sub_len` + ' ' + 'mod: ' + `mod_3` + ' ' + \ direction + ' ' + `qst` + '-' + `qen` + \ ' [' + hit + ' ' + `hst` + '-' + `hen` + ']' + '\n' + sub_seq + '\n') ################################ out_file2.write('>' + hit + ' length: ' + `sub_len2` + ' ' + 'mod: ' + `mod_3s` + ' ' + \ direction2 + ' ' + `hst` + '-' + `hen` + ' subsequence ' + \ '\n' + sub_seq2 + '\n') ################################ info_file.write(id + '\t' + `qst` + '\t' + `qen` + '\t' \ + hit + '\t' + `hst` + '\t' + `hen` + '\t' + gaps + '\t' + aln_idn + '\n') ################################ if sub_len == sub_len2 and gaps == "NO_GAPS": current_group = "group_A_" + `group_number` + ".fasta" current_group = out_dir + "/" + current_group current_group_file = open(current_group, "wb") current_group_file.write('>' + id + '\n' + sub_seq + '\n' + \ '>' + hit + '\n' + sub_seq2 + '\n') if sub_len == sub_len2 and gaps != "NO_GAPS": current_group = "group_B_" + `group_number` + ".fasta" current_group = out_dir + "/" + current_group current_group_file = open(current_group, "wb") current_group_file.write('>' + id + '\n' + sub_seq + '\n' + \ '>' + hit + '\n' + sub_seq2 + '\n') if sub_len != sub_len2: current_group = "group_C_" + `group_number` + ".fasta" current_group = out_dir + "/" + current_group current_group_file = open(current_group, "wb") current_group_file.write('>' + id + '\n' + sub_seq + '\n' + \ '>' + hit + '\n' + sub_seq2 + '\n') ################################ group_number += 1 current_group_file.close() ################################ ### TRIPLET SUMMARY ### Triplet_Summary(cod_file, xcdn_file) in_file1.close() in_file2.close() in_file3.close() cod_file.close() out_file.close() kska_file.close() info_file.close() xcdn_file.close() def Triplet_Summary(cod_file, xcdn_file): global atgc_array global atgc_list global atgc_array2 global atgc_list2 global aa_array global cons_atgc_array global cons_atgc_array2 global aa_count_array global cons_aa_count_array global aa_count_array2 global cons_aa_count_array2 global amac_list test1 = 0 test2 = 0 test1a = 0 test2a = 0 test1c = 0 test2c = 0 test1ca = 0 test2ca = 0 ### ALL DATA ### for item in atgc_list: fraction = (atgc_array[item]*100000.00)/atgc_array['atgc'] test1 = test1 + atgc_array[item] fraction2 = (atgc_array2[item]*100000.00)/atgc_array2['atgc'] test2 = test2 + atgc_array2[item] fraction = round(fraction) fraction2 = round(fraction2) diff = fraction - fraction2 if fraction2 != 0: diff100 = ((fraction*1.00)/fraction2)*100 diff100 = round(diff100) if fraction2 == 0: diff100 = 0.0 am_ac = aa_array[item] #### PROPORTION OF AM_AC #### total_amac = aa_count_array[am_ac] total_amac2 = aa_count_array2[am_ac] tot_amac_fr = (total_amac*100.00)/aa_count_array['ALL'] tot_amac_fr2 = (total_amac2*100.00)/aa_count_array2['ALL'] if total_amac != 0: fract_amac = (atgc_array[item]*100.00)/total_amac if total_amac == 0: fract_amac = 0 if total_amac2 != 0: fract_amac2 = (atgc_array2[item]*100.00)/total_amac2 if total_amac2 == 0: fract_amac2 = 0 fract_amac = round(fract_amac,2) fract_amac2 = round(fract_amac2,2) tot_amac_fr = round(tot_amac_fr,2) tot_amac_fr2 = round(tot_amac_fr2,2) tot_amac_diff = tot_amac_fr - tot_amac_fr2 diff_amac = fract_amac - fract_amac2 fract_amac = str(fract_amac) fract_amac2 = str(fract_amac2) diff_amac = str(diff_amac) tot_amac_fr = str(tot_amac_fr) tot_amac_fr2 = str(tot_amac_fr2) tot_amac_diff = str(tot_amac_diff) ############################# cod_file.write(am_ac + '\t' + item + '\t' + `fraction` + '\t' + `fraction2` + '\t' + \ `diff` + '\t' + `diff100` + '\t' + \ `atgc_array[item]` + '\t' + `atgc_array['atgc']` + '\t' + \ `atgc_array2[item]` + '\t' + `atgc_array2['atgc']` + '\t' + \ fract_amac + '\t' + fract_amac2 + '\t' + diff_amac + '\t' + \ '***' + '\t' + tot_amac_fr + '\t' + tot_amac_fr2 + '\t' + tot_amac_diff + '\n') ###################### print '************************' print ' COUNT TEST 1: ' print `test1` + '\t' + `atgc_array['atgc']` + '\t' + 'QUERY TEST' print `test2` + '\t' + `atgc_array2['atgc']` + '\t' + 'SUBJECT TEST' for item in amac_list: test1a = test1a + aa_count_array[item] test2a = test2a + aa_count_array2[item] print `test1a` + '\t' + `aa_count_array['ALL']` + '\t' + 'AA QUERY TEST' print `test2a` + '\t' + `aa_count_array2['ALL']` + '\t' + 'AA SUBJECT TEST' print '************************' ### CONSERVED DATA ### for item in atgc_list: fraction = (cons_atgc_array[item]*100000.00)/cons_atgc_array['atgc'] test1c = test1c + cons_atgc_array[item] fraction2 = (cons_atgc_array2[item]*100000.00)/cons_atgc_array2['atgc'] test2c = test2c + cons_atgc_array2[item] fraction = round(fraction) fraction2 = round(fraction2) diff = fraction - fraction2 if fraction2 != 0: diff100 = ((fraction*1.00)/fraction2)*100 diff100 = round(diff100) if fraction2 == 0: diff100 = 0.0 am_ac = aa_array[item] #### PROPORTION OF AM_AC #### total_amac = cons_aa_count_array[am_ac] total_amac2 = cons_aa_count_array2[am_ac] tot_amac_fr = (total_amac*100.00)/cons_aa_count_array['ALL'] tot_amac_fr2 = (total_amac2*100.00)/cons_aa_count_array2['ALL'] if total_amac != 0: fract_amac = (cons_atgc_array[item]*100.00)/total_amac if total_amac == 0: fract_amac = 0 if total_amac2 != 0: fract_amac2 = (cons_atgc_array2[item]*100.00)/total_amac2 if total_amac2 == 0: fract_amac2 = 0 fract_amac = round(fract_amac,2) fract_amac2 = round(fract_amac2,2) tot_amac_fr = round(tot_amac_fr,2) tot_amac_fr2 = round(tot_amac_fr2,2) tot_amac_diff = tot_amac_fr - tot_amac_fr2 diff_amac = fract_amac - fract_amac2 fract_amac = str(fract_amac) fract_amac2 = str(fract_amac2) diff_amac = str(diff_amac) tot_amac_fr = str(tot_amac_fr) tot_amac_fr2 = str(tot_amac_fr2) tot_amac_diff = str(tot_amac_diff) ############################# xcdn_file.write(am_ac + '\t' + item + '\t' + `fraction` + '\t' + `fraction2` + '\t' + \ `diff` + '\t' + `diff100` + '\t' + \ `cons_atgc_array[item]` + '\t' + `cons_atgc_array['atgc']` + '\t' + \ `cons_atgc_array2[item]` + '\t' + `cons_atgc_array2['atgc']` + '\t' + \ fract_amac + '\t' + fract_amac2 + '\t' + diff_amac + '\t' + \ '***' + '\t' + tot_amac_fr + '\t' + tot_amac_fr2 + '\t' + tot_amac_diff + '\n') ###################### print '' print '************************' print ' COUNT TEST 2: ' print `test1c` + '\t' + `cons_atgc_array['atgc']` + '\t' + 'QUERY TEST' print `test2c` + '\t' + `cons_atgc_array2['atgc']` + '\t' + 'SUBJECT TEST' for item in amac_list: test1ca = test1ca + cons_aa_count_array[item] test2ca = test2ca + cons_aa_count_array2[item] print `test1ca` + '\t' + `cons_aa_count_array['ALL']` + '\t' + 'AA QUERY TEST' print `test2ca` + '\t' + `cons_aa_count_array2['ALL']` + '\t' + 'AA SUBJECT TEST' print '************************' def Count_Triplets(id, hit, sub_len, sub_seq, sub_len2, sub_seq2, kska_file, gaps): global atgc_array global atgc_list global atgc_array2 global atgc_list2 global aa_array global cons_atgc_array global cons_atgc_array2 global aa_count_array global cons_aa_count_array global aa_count_array2 global cons_aa_count_array2 global amac_list global stop_codon_status stop_codon_status = 'GOOD' triplet_array = {} triplet_array2 = {} amac_array = {} amac_array2 = {} k = 0 while k < (sub_len - 3): triplet = sub_seq[k:(k+3)] if triplet == 'TAA' or triplet == 'TAG' or triplet == 'TGA': stop_codon_status = 'BAD' print 'BAD CODON QUERY' k = k + 3 k = 0 while k < (sub_len2 - 3): triplet = sub_seq2[k:(k+3)] if triplet == 'TAA' or triplet == 'TAG' or triplet == 'TGA': stop_codon_status = 'BAD' print 'BAD CODON SUBJECT' k = k + 3 count_cons = "FALSE" if stop_codon_status == 'GOOD': if sub_len == sub_len2 and gaps == "NO_GAPS": ##### CONSERVATIVE CASE ##### ##### NO GAPS, IDENTICAL SEQUENCE LENGTH ##### k = 0 q = 1 while k < sub_len: triplet = sub_seq[k:(k+3)] triplet2 = sub_seq2[k:(k+3)] ####################### triplet = string.lower(triplet) triplet2 = string.lower(triplet2) if triplet in atgc_list and triplet2 in atgc_list: triplet_array[q] = triplet triplet_array2[q] = triplet2 amac = aa_array[triplet] amac2 = aa_array[triplet2] cons_atgc_array[triplet] += 1 cons_atgc_array2[triplet2] += 1 cons_atgc_array['atgc'] += 1 cons_atgc_array2['atgc'] += 1 cons_aa_count_array[amac] += 1 cons_aa_count_array2[amac2] += 1 cons_aa_count_array['ALL'] += 1 cons_aa_count_array2['ALL'] += 1 amac_array[q] = amac amac_array2[q] = amac2 count_cons = "TRUE" ####################### k = k + 3 q = q + 1 ##### ALL CASES ##### #### QUERY SEQUENCE #### k = 0 q = 1 while k < sub_len: triplet = sub_seq[k:(k+3)] ####################### triplet = string.lower(triplet) if triplet in atgc_list: am_ac = aa_array[triplet] aa_count_array[am_ac] += 1 atgc_array[triplet] += 1 atgc_array['atgc'] += 1 aa_count_array['ALL'] += 1 ####################### k = k + 3 q = q + 1 #### SUBJECT SEQUENCE #### k = 0 q = 1 while k < sub_len2: triplet = sub_seq2[k:(k+3)] ####################### triplet = string.lower(triplet) if triplet in atgc_list2: am_ac = aa_array[triplet] aa_count_array2[am_ac] += 1 atgc_array2[triplet] += 1 atgc_array2['atgc'] += 1 aa_count_array2['ALL'] += 1 ####################### k = k + 3 q = q + 1 ####################### if count_cons == "TRUE": ### COUNT KsKa ### p = 1 ks = 0 ka = 0 pm = 0 am = 0 while p < q: try: oops1 = triplet_array[p] oops2 = triplet_array2[p] test_pair = "GOOD" except: test_pair = "BAD TRIPLET" print test_pair if test_pair == "GOOD": if triplet_array[p] == triplet_array2[p]: pm = pm + 1 am = am + 1 if triplet_array[p] != triplet_array2[p] and amac_array[p] != amac_array2[p]: ka = ka + 1 if triplet_array[p] != triplet_array2[p] and amac_array[p] == amac_array2[p]: ks = ks + 1 am = am + 1 p = p + 1 prot_len = sub_len/3 kska_diff = ks - ka # kska_frct = (pm*100.0)/am # FRACTION OF CODON MATCHES WITHIN AMINO ACIDS MATCHES kska_frct = ((am - ks)*100.0)/am # FRACTION OF CODON MATCHES WITHIN AMINO ACIDS MATCHES kska_frct = round(kska_frct) identity = (am*1.0/prot_len)*100 identity = round(identity) kska_file.write(id + '\t' + hit + '\t' + `ks` + '\t' + `ka` + '\t' + `kska_diff` + \ '\t' + '***' + '\t' + `pm` + '\t' + `am` + '\t' +`prot_len` + '\t' + \ '***' + '\t' + `identity` + '\t' + `kska_frct` + '\n') def Set_Zero_Values(): global atgc_array global atgc_list global aa_array global cons_atgc_array global cons_atgc_array2 global aa_count_array global cons_aa_count_array global aa_count_array2 global cons_aa_count_array2 global amac_list atgc_array = {} aa_array = {} cons_atgc_array = {} cons_atgc_array2 = {} aa_count_array = {} cons_aa_count_array = {} aa_count_array2 = {} cons_aa_count_array2 = {} aa_array['aaa'] = 'Lys' aa_array['aat'] = 'Asn' aa_array['aag'] = 'Lys' aa_array['aac'] = 'Asn' aa_array['ata'] = 'Ile' aa_array['att'] = 'Ile' aa_array['atg'] = 'Met' aa_array['atc'] = 'Ile' aa_array['aga'] = 'Arg' aa_array['agt'] = 'Ser' aa_array['agg'] = 'Arg' aa_array['agc'] = 'Ser' aa_array['aca'] = 'Thr' aa_array['act'] = 'Thr' aa_array['acg'] = 'Thr' aa_array['acc'] = 'Thr' aa_array['taa'] = 'End' aa_array['tat'] = 'Tyr' aa_array['tag'] = 'End' aa_array['tac'] = 'Tyr' aa_array['tta'] = 'Leu' aa_array['ttt'] = 'Phe' aa_array['ttg'] = 'Leu' aa_array['ttc'] = 'Phe' aa_array['tga'] = 'End' aa_array['tgt'] = 'Cys' aa_array['tgg'] = 'Trp' aa_array['tgc'] = 'Cys' aa_array['tca'] = 'Ser' aa_array['tct'] = 'Ser' aa_array['tcg'] = 'Ser' aa_array['tcc'] = 'Ser' aa_array['gaa'] = 'Glu' aa_array['gat'] = 'Asp' aa_array['gag'] = 'Glu' aa_array['gac'] = 'Asp' aa_array['gta'] = 'Val' aa_array['gtt'] = 'Val' aa_array['gtg'] = 'Val' aa_array['gtc'] = 'Val' aa_array['gga'] = 'Gly' aa_array['ggt'] = 'Gly' aa_array['ggg'] = 'Gly' aa_array['ggc'] = 'Gly' aa_array['gca'] = 'Ala' aa_array['gct'] = 'Ala' aa_array['gcg'] = 'Ala' aa_array['gcc'] = 'Ala' aa_array['caa'] = 'Gln' aa_array['cat'] = 'His' aa_array['cag'] = 'Gln' aa_array['cac'] = 'His' aa_array['cta'] = 'Leu' aa_array['ctt'] = 'Leu' aa_array['ctg'] = 'Leu' aa_array['ctc'] = 'Leu' aa_array['cga'] = 'Arg' aa_array['cgt'] = 'Arg' aa_array['cgg'] = 'Arg' aa_array['cgc'] = 'Arg' aa_array['cca'] = 'Pro' aa_array['cct'] = 'Pro' aa_array['ccg'] = 'Pro' aa_array['ccc'] = 'Pro' amac_list = ['Ala', 'Asn', 'Asp', 'Arg', 'Cys', 'Glu', \ 'Phe', 'Gly', 'His', 'Ile', 'Lys', 'Leu', \ 'Met', 'Pro', 'Gln', 'Ser', 'Thr', 'Val', \ 'Trp', 'Tyr', 'End'] for item in amac_list: aa_count_array[item] = 0 aa_count_array2[item] = 0 cons_aa_count_array[item] = 0 cons_aa_count_array2[item] = 0 aa_count_array['ALL'] = 0 aa_count_array2['ALL'] = 0 cons_aa_count_array['ALL'] = 0 cons_aa_count_array2['ALL'] = 0 atgc_list = ['aaa', 'aat', 'aag', 'aac', \ 'ata', 'att', 'atg', 'atc', \ 'aga', 'agt', 'agg', 'agc', \ 'aca', 'act', 'acg', 'acc', \ 'taa', 'tat', 'tag', 'tac', \ 'tta', 'ttt', 'ttg', 'ttc', \ 'tga', 'tgt', 'tgg', 'tgc', \ 'tca', 'tct', 'tcg', 'tcc', \ 'gaa', 'gat', 'gag', 'gac', \ 'gta', 'gtt', 'gtg', 'gtc', \ 'gga', 'ggt', 'ggg', 'ggc', \ 'gca', 'gct', 'gcg', 'gcc', \ 'caa', 'cat', 'cag', 'cac', \ 'cta', 'ctt', 'ctg', 'ctc', \ 'cga', 'cgt', 'cgg', 'cgc', \ 'cca', 'cct', 'ccg', 'ccc'] for item in atgc_list: atgc_array[item] = 0 atgc_array['atgc'] = 0 ################# global atgc_array2 global atgc_list2 atgc_array2 = {} atgc_list2 = atgc_list for item in atgc_list2: atgc_array2[item] = 0 atgc_array2['atgc'] = 0 ################# for item in atgc_list: cons_atgc_array[item] = 0 cons_atgc_array['atgc'] = 0 for item in atgc_list2: cons_atgc_array2[item] = 0 cons_atgc_array2['atgc'] = 0 ################# ################## # # # MAIN BODY # # # ################## import math import re import sys import string import time import os if __name__ == "__main__": if len(sys.argv) <= 5 or len(sys.argv) > 6: print "Program usage: " print "input_file1(query_seqs) input_file2(db_seqs) input_file3(blastx.info2) output_file output_dir" exit if len(sys.argv) == 6: in_name1 = sys.argv[1] in_name2 = sys.argv[2] in_name3 = sys.argv[3] out_name = sys.argv[4] out_dir = sys.argv[5] Set_Zero_Values() Seqs_Extractor(in_name1, in_name2, in_name3, out_name, out_dir) ### THE END ###