#!/usr/bin/python ####################################################################### # # # Author: Elena Kochetkova # # Supervisor: Alexander Kozik # # Date: February 25, 2004 # # Description: This program extracts specified substrings of a given # # dna sequence and translates those to proteins... # # # ####################################################################### ################################################### import os import sys import string import re import copy from string import * import dna_to_prot ################################################## class _cls: def __init__(self): if sys.platform in ('linux-i386', 'linux2'): self._command = 'clear' elif sys.platform in ('win32', 'dos') or \ sys.platform.startswith('ms-dos'): self._command = 'cls' else: self._command = None def __call__(self): """Clears the screen.""" if self._command: os.system(self._command) clear_screen = _cls() global input_filename, output_filename def ask_filename(): global input_filename input = strip(raw_input("\n\nEnter the SOURCE file name: ")) return input def file_open_err(file_name): print "Error opening file ","\"",file_name,"\"" def would_you_like_prompt(prompt_line): ans ="" ans = lstrip(raw_input(prompt_line)) while ans == "": print "\nEnter y or n." ans = lstrip(raw_input(prompt_line)) return ans def fileExists(f): try: file = open (f) except IOError: exists = 0 else: exists = 1 return exists def open_inputfile(): global input_filename proper_name = 1 while proper_name != 0: try: # ask for input file name input_filename = ask_filename() # open the input file fp_in = open(input_filename, "rb") proper_name = 0 except: file_open_err(input_filename) prompt_line = "\nWould you like to continue (y/n)? " ans = would_you_like_prompt(prompt_line) if ans[0] == "n" or ans[0] == "N": sys.exit(0) return fp_in def open_outputfile(ext): global input_filename, output_filename return_value_arr = [] output_filename = input_filename + ext print "Your default destination file is: ", output_filename # open the output file if fileExists(output_filename) == 1: print "\nFile",output_filename,"already exists.\n" prompt_line = "\nWould you like to overwrite it (y/n)? " ans = would_you_like_prompt(prompt_line) print "\n\n" if ans[0] == "n" or ans[0] == "N": output_filename = lstrip(raw_input("\n\nEnter the DESTINATION file name: ")) fp_out = open(output_filename, "wb") else: fp_out = open(output_filename, "wb") else: try: fp_out = open(output_filename, "wb") except: file_open_err(output_filename) print "\nTry again later...\n\n" sys.exit (1) return_value_arr.append (fp_out) return_value_arr.append (output_filename) return return_value_arr def get_posns(str): if str[0].isdigit(): return str else: open_parenths_idx = str.find("(") if (open_parenths_idx != -1): close_parenths_idx = str.rfind(")") return get_posns(str[open_parenths_idx + 1:close_parenths_idx]) else: return get_posns(str[1:]) def get_dir_pos(data_str): return_arr = [] direction = "" dir_c = "" direction = "forward" if re.match (".*complement\(", data_str) and not re.match (".*join\(", data_str): positions = get_posns(data_str) direction = "reverse" elif not re.match (".*complement\(", data_str) and re.match (".*join\(", data_str): positions = get_posns(data_str) elif re.match (".*complement.*\(join\(", data_str): positions = get_posns(data_str) direction = "reverse" else: positions = get_posns(data_str) return direction, positions def revcomp(dna): #reverse complement of a DNA sequence comp = dna.translate(maketrans("ATGCatgc", "TACGtacg")) lcomp = list(comp) lcomp.reverse() return join(lcomp, "") def dna_translate(cdna, code = dna_to_prot.standard): """ translate a cDNA sequence to a protein """ return "".join([ code.get(cdna[i:i+3], "X") for i in xrange(0,len(cdna),3) ]) def choose_format(): clear_screen () print("\n\n") print("****************************************") print("* Please ascribe the order to the *") print("* following header elements: *") print("* gene_name (1 2 3) *") print("* protein_name (1 2 3) *") print("* genbank_id (1 2 3) *") print("****************************************") order1 = 0 order2 = 0 order3 = 0 name = "GENE_NAME" order1 = valid_int_input(name, order2, order3) name = "PROTEIN_NAME" order2 = valid_int_input(name, order1, order3) name = "GENEBANK_ID" order3 = valid_int_input(name, order1, order2) order_list = [] order_list.insert(0, int(order1)-1) order_list.insert(1, int(order2)-1) order_list.insert(2, int(order3)-1) return order_list def make_header(order_list, gene_name, protein_name, genbank_id): header_order_list = [] for i in range(len(order_list)): header_order_list.insert(i, " ") header_order_list[int(order_list[0])]= gene_name header_order_list[int(order_list[1])]= protein_name header_order_list[int(order_list[2])]= genbank_id header = "" for j in range(len(header_order_list)): header = header + header_order_list[j]+ " " return header def valid_int_input(name, num1, num2): int_input = 1 while int_input != 0: order = strip(raw_input("\n\nEnter the sequence order for " + name + ": ")) try: test = string.atoi(order) if (int(order) > 3) or (int(order) < 1): print ("Enter number 1-3...") else: if (order == num1) or (order == num2): print("Number was previously selected...") else: int_input = 0 except ValueError: print "\nNot a number...\n" return order def duplication_option_menu(): clear_screen () print("\n\n") print("****************************************") print("* In case of duplicated sequences, *") print("* would you like to extract *") print("* *") print("* 1. longest sequence *") print("* 2. first sequence *") print("* *") print("****************************************") int_input = 1 while int_input != 0: answer = strip(raw_input("\n\nEnter your answer: ")) try: test = string.atoi(answer) if (int(answer) > 2) or (int(answer) < 1): print ("Enter either 1 or 2...") else: int_input = 0 except ValueError: print "\nNot a number...\n" return answer def get_data(fp_in, fp_out): dna_beg_pos = 0; data_lines = 0; format_cntr = 0 data_array = []; dna_array = []; pos_array = [] while 1: data = fp_in.readline() data_array.append(data) if re.match ("^( {5}(((CDS)|(mRNA)|(tRNA)|(gene)|(misc_feature)).*)|ORIGIN)", data_array[data_lines]): pos_array.append(data_lines) if format_cntr == 0 or format_cntr == 1000: clear_screen() print "READING: Processing line "+str(data_lines)+"... Please wait..." format_cntr = 0 format_cntr +=1 # match the beginning of dna sequence if re.match("^ORIGIN", data): dna_beg_pos = data_lines + 1 if dna_beg_pos > 0 and data_lines >= dna_beg_pos: new_line = string.translate(data_array[data_lines],id,nonAZStr) dna_array.append(new_line) # match the end of dna sequence if re.match("//", data): break data_lines += 1 if not data: # end of file break if dna_beg_pos == 0: fp_out.write('WARNING --- No "ORIGIN" found!') sys.exit(0) del data_array[dna_beg_pos:] return dna_array, data_array, pos_array def defRegionOfInterest(data_array, pos_array): region_id_array = [] CDS_array = [] gene_array = [] format_cntr = 0 for count in range(len(pos_array)): if count > 0: if re.match ("^ {5}(CDS)", data_array [pos_array[count-1]]): CDS_str = ''.join (data_array[pos_array[count-1]: pos_array[count]]) CDS_array.append(CDS_str) if re.match ("^ {5}(gene)", data_array [pos_array[count-1]]): gene_str = ''.join (data_array[pos_array[count-1]: pos_array[count]]) gene_array.append(gene_str) if format_cntr == 0 or format_cntr == 1000: clear_screen () print "SCANNING REGIONS: Processing line "+str(count)+"... Please wait..." format_cntr = 0 format_cntr +=1 del data_array[0:] del pos_array[0:] return CDS_array, gene_array def get_gene_name_desc(gene_array): gene_info_array = [] format_cntr = 0 substr = [] first_warning = 0 id = string.maketrans("","") for k in range (len(gene_array)): gname_flag = 0 gene_info = [] warning_msg = [] fp_out9.write(gene_array[k]) temp_str = string.translate(gene_array[k], id,'\n''\r') temp_str = re.sub(' {1,}', ' ', temp_str) if not re.match(".*/note=", temp_str): gene_name_gene = strip(string.split(string.split(temp_str, "gene=\"")[1], "\"")[0]) warning_msg.append("WARNING: " + gene_name_gene + " does not have \"note\" field\n") temp = string.split(string.split(temp_str, "gene")[1], "/") fixed = strip(string.translate(temp[0],id,'\n''>''<')) if (get_posns(fixed)): positions = get_posns(fixed) pos_bounds = string.split(positions, "..") gene_info.append(gene_name_gene) gene_info.append("") #gene at name gene_info.append("") #description gene_info.append(pos_bounds[0]) gene_info.append(pos_bounds[-1]) substr = string.split(gene_array[k], ' /') for k1 in range (len(substr)): if re.match ("gene=", substr[k1]): gn_g = string.split(substr[k1], '\"') if re.match("At\dg", gn_g[1]): gname_flag = 1 at_name_gene = gn_g[1] gene_name_gene = gn_g[1] if len(gene_info_array) > 0: if gene_name_gene == gene_info_array[-1][0]: warning_msg.append("WARNING: " + gene_name_gene + " duplicated gene name in GENE region\n") if re.match ("note=", substr[k1]): # if gene name does not start with At, get At name if re.match (".*locus_tag:", substr[k1]): if gname_flag == 0: # get description if (substr[k1].find(';') == -1): at_gene_info = string.split(string.split(substr[k1], "locus_tag:")[1], "\"") at_name_gene = strip(at_gene_info[0]) desc = "" else: at_gene_info = string.split(substr[k1], "locus_tag:") semicol_idx = at_gene_info[1].find(';') at_gene_desc = strip(at_gene_info[1][semicol_idx+1:]) at_name_gene = strip(string.split(at_gene_info[1], ';')[0]) desc = re.sub(' {1,}', ' ', at_gene_desc) desc = string.translate(desc,id,'\n''\r') desc = desc[0:-1] + " " elif not re.match (".*locus_tag:", substr[k1]): at_name_gene = "" # get description idx = substr[k1].find('=') desc = strip(substr[k1][idx+1:]) desc = re.sub(' {1,}', ' ', desc) desc = string.translate(desc,id,'\n''\r') desc = desc[1:-1] + " " temp = string.split(substr[0], "gene") fixed = strip(string.translate(temp[1],id,'\n''>''<')) if (get_posns(fixed)): positions = get_posns(fixed) pos_bounds = string.split(positions, "..") gene_info.append(gene_name_gene) gene_info.append(at_name_gene) gene_info.append(desc) gene_info.append(pos_bounds[0]) gene_info.append(pos_bounds[-1]) msg_str = ''.join(warning_msg) if len(msg_str) > 0: if first_warning == 0: fp_out8.write("************************\n") fp_out8.write("* GENE REGION WARNINGS *\n") fp_out8.write("************************\n\n") fp_out8.write(msg_str) first_warning = 1 if len(gene_info) > 0: gene_info_array.append(gene_info) if format_cntr == 0 or format_cntr == 1000: clear_screen () print "GETTING INFO ON GENE REGION: Processing line " + str(k) + "... Please wait..." format_cntr = 0 format_cntr +=1 if first_warning == 1: fp_out8.write('\n\n\n') del gene_array[0:] return gene_info_array def put100charsPerLine (length, string, file): global input_filename line_begin = 0 line_end = 100 count = 0 format_cntr = 0 fp_out1.write(">" + input_filename + "\n") while length > 0: substr = string [line_begin:line_end] file.write(substr+'\n') line_begin += 100 line_end += 100 length -=100 if format_cntr == 0 or format_cntr == 1000: clear_screen () print "WRITING: Processing line " + str(count) + "... Please wait..." format_cntr = 0 count += 1 format_cntr += 1 def get_CDS_region_desc(CDS_array, duplication_option): cds_info_array = [] dupl_array = [] dupl_count = 0 unique = 0 format_cntr = 0 id = string.maketrans("","") for l in range (len(CDS_array)): fp_out4.write(CDS_array[l]) desc = "" substr = [] cds_join_list = [] substr = string.split(CDS_array[l], ' /') for l1 in range (len(substr)): if re.match ("gene=", substr[l1]): gnc = string.split(substr[l1], '\"') gene_name_cds = gnc[1] if re.match ("protein_id=", substr[l1]): prot_id = string.split(substr[l1], '\"') protein_id = prot_id[1] if re.match ("db_xref=\"GI:", substr[l1]): gb_id1 = string.split(substr[l1], ':') gb_id2 = string.split(gb_id1[1], '\"') genbank_id = gb_id2[0] if re.match ("note=", substr[l1]): cds_desc = string.split(substr[l1], 'note="') desc = strip(string.split(cds_desc[1], ' /')[0]) desc = re.sub(' {1,}', ' ', desc) desc = string.translate(desc,id,'\n''\r') desc = desc[0:-1] + " " cds_join_list.append (substr[0]) # join broken strings to retrieve numbers within the brackets id = string.maketrans("","") joined = string.translate(''.join(cds_join_list), id, '\n') direction = "" pos_line = "" warning_msg = [] direction, pos_line = get_dir_pos(joined) pos_subline = string.translate(pos_line, id, ' ' '\r') positions = re.sub ("\.\.", "-", pos_subline) positions = re.sub (",", "|", positions) positions = re.sub (">", "", positions) positions = re.sub ("<", "", positions) pos_pairs = string.split (pos_subline, ",") pos_arr = [] pos_array = [] for m in range (len(pos_pairs)): if re.match (".*\.\.", pos_pairs[m]): pair_split = string.split (pos_pairs[m], "..") if len(pair_split) == 2: pos_arr.append(pair_split[0]) pos_arr.append(pair_split[1]) else: pos_arr.append(pos_pairs[m]) pos_arr.append(pos_pairs[m]) warning_msg.append("WARNING: " + pos_pairs[m] + " does not have a pair\n") for l2 in range(len(pos_arr)): if re.match(".*<", pos_arr[l2])or re.match(".*>", pos_arr[l2]): # WARNING ABOUT > and < warning_msg.append ("WARNING: " + pos_arr[l2]+"contains > or <\n") #get rid of > and < pos_array.append(int(string.translate(pos_arr[l2], id, '>' '<'))) else: pos_array.append(int(pos_arr[l2])) cds_info = [] cds_info.append(gene_name_cds) cds_info.append(protein_id) cds_info.append(genbank_id) cds_info.append(direction) cds_info.append(pos_array) cds_info.append(positions) cds_info.append(warning_msg) cds_info.append(desc) if l == 0: cds_info_array.append(cds_info) if l > 0: if len(dupl_array) > 0: temp_gene_name = dupl_array[-1][0] else: temp_gene_name = cds_info_array[len(cds_info_array)-1][0] if (gene_name_cds == temp_gene_name ): unique = 1 if dupl_count == 0: warning_msg.append("WARNING: Gene name " + gene_name_cds + " is not UNIQUE\n") dupl_array.append(cds_info_array[l-1]) del cds_info_array[-1] dupl_array.append(cds_info) else: dupl_array.append(cds_info) dupl_count += 1 else: if unique == 1: dupl_count = 0 if int(duplication_option) == 1: idx = find_longest(dupl_array) else: idx = 0 for k in range(len(dupl_array)): if k == idx: dupl_array[k].append("UNIQUE_SET") else: dupl_array[k].append("DUPLIC_SET") cds_info_array.append(dupl_array[k]) del dupl_array[0:] unique = 0 cds_info_array.append(cds_info) else: cds_info_array.append(cds_info) if format_cntr == 0 or format_cntr == 1000: clear_screen () print "GETTING INFO ON CDS REGION: Processing line " + str(l) + "... Please wait..." format_cntr = 0 format_cntr +=1 del CDS_array[0:] return cds_info_array def find_longest(dupl_array): length_array = [] pos_array = [] for i in range (len(dupl_array)): pos_array.append(dupl_array[i][4]) j = 1 length = 0 while j < len(pos_array[i]): length += (int(pos_array[i][j]) + 1) - int(pos_array[i][j-1]) j +=2 length_array.append(length) if i == 0: longest = length_array[i] longest_idx = i if i > 0: if length_array[i] > longest: longest = length_array[i] longest_idx = i return longest_idx ################################################### # main # ################################################### clear_screen () fp_in = open_inputfile() print "\n\n\nreading data from the input file" , input_filename, "...\n\n" fd1 = open_outputfile(".out") fp_out1 = fd1[0] fd2 = open_outputfile(".join") fp_out2 = fd2[0] fd3 = open_outputfile(".protein") fp_out3 = fd3[0] fd4 = open_outputfile(".cds") fp_out4 = fd4[0] fd5 = open_outputfile(".position") fp_out5 = fd5[0] fd6 = open_outputfile(".500") fp_out6 = fd6[0] fd7 = open_outputfile(".inbetween") fp_out7 = fd7[0] fd8 = open_outputfile(".msg") fp_out8 = fd8[0] fd9 = open_outputfile(".gene") fp_out9 = fd9[0] fd10 = open_outputfile(".protein.dupl") fp_out10 = fd10[0] #test output11 = input_filename + ".test" fp_out11 = open(output11, "wb") order_list = choose_format() # 1 - longest, 2 - first duplication_option = duplication_option_menu() id = string.maketrans("","") # No actual translation nonAZList = [chr(x) for x in range(256) if not ("a"<=chr(x)<="z" or "A"<=chr(x)<="Z")] nonAZStr="".join(nonAZList) dna_array = []; data_array = []; pos_array =[] #dna_array contains only dna sequence #data_array contains information about all regions (CDS, gene, mRNA) dna_array, data_array, pos_array = get_data(fp_in, fp_out1) fp_in.close() # close the input file CDS_array = []; gene_array = [] CDS_array, gene_array = defRegionOfInterest(data_array, pos_array) dna_str = ''.join (dna_array) #fp_out11.write (dna_str) # retrieving information from gene region # retrieved is a list of tuples # first element is gene name, second - at_name_gene,third - description # fourth - beginning of gene interval, fifth - the end gene_info_array = [] gene_info_array = get_gene_name_desc(gene_array) CDS_info_array = [] CDS_dupl_array = [] # CDS_info_array is a list of lists: # 1st element is gene_name_cds, then - protein_id, genbank_id, # direction, pos_arr, positions, warning, [duplication_desc] CDS_info_array = get_CDS_region_desc(CDS_array, duplication_option) dna_modif = [] last_from_before = 0 first_warning = 0 format_cntr = 0 m = 0 at_gname_arr = [] # processing information from CDS region for l in range(len(CDS_info_array)): warning_msg = [] gene_name_cds = CDS_info_array[l][0] protein_id = CDS_info_array[l][1] genbank_id = CDS_info_array[l][2] direction = CDS_info_array[l][3] pos_arr = CDS_info_array[l][4] positions = CDS_info_array[l][5] warning = ''.join(CDS_info_array[l][6]) if warning != "": warning_msg.append(warning) desc_cds = CDS_info_array[l][7] if len(CDS_info_array[l]) == 8: unique = 0 elif len(CDS_info_array[l]) == 9: dupl_desc = CDS_info_array[l][8] if dupl_desc =='UNIQUE_SET': unique = 0 else: unique = 1 # check if gene name in gene regions matches the one in CDS region gene_name_gene = gene_info_array[m][0] if gene_info_array[m][1] != "": at_name_gene = gene_info_array[m][1] else: at_name_gene = gene_info_array[m][0] if gene_info_array[m][2] != "": desc = gene_info_array[m][2] elif desc_cds != "": desc = desc_cds else: desc = "NO DESCRIPTION FOUND" gene_beg_interval = int(gene_info_array[m][3]) gene_end_interval = int(gene_info_array[m][4]) gene_name_match = 0 header = make_header(order_list, gene_name_cds, protein_id, genbank_id) if ((pos_arr[0] >= gene_beg_interval) and (pos_arr[len(pos_arr)-1 ] <= gene_end_interval)) \ and (gene_name_gene == gene_name_cds): fp_out11.write("*" + gene_name_gene + " == " + gene_name_cds+"\n") gene_name_match = 1 else : fp_out11.write("WARNING: gene name/interval mismatch: "+gene_name_gene+" (in gene region) and "+gene_name_cds+" (in CDS region)\n") temp_m = m while ((m > 0)) and gene_name_match != 1: m -= 1 fp_out11.write("m counts down m = "+str(m)+"\n") if (gene_info_array[m][0] == gene_name_cds): if (pos_arr[0] >= int(gene_info_array[m][3])) and (pos_arr[len(pos_arr)-1 ] <= int(gene_info_array[m][4])): fp_out11.write(str(m) + " " + gene_info_array[m][0] + " == " + gene_name_cds + "\n") gene_name_gene = gene_info_array[m][0] if gene_info_array[m][1] != "": at_name_gene = gene_info_array[m][1] else: at_name_gene = gene_info_array[m][0] desc = gene_info_array[m][2] gene_name_match = 1 if gene_name_match == 0: m = temp_m while ((m < len(gene_info_array)-1)) and gene_name_match != 1: m += 1 fp_out11.write("m counts up m = "+str(m)+ "\n") if (gene_info_array[m][0] == gene_name_cds): if (pos_arr[0] >= int(gene_info_array[m][3])) and (pos_arr[len(pos_arr)-1 ] <= int(gene_info_array[m][4])): fp_out11.write(str(m) + " " + gene_info_array[m][0] + " == " + gene_name_cds+ "\n") gene_name_gene = gene_info_array[m][0] if gene_info_array[m][1] != "": at_name_gene = gene_info_array[m][1] else: at_name_gene = gene_info_array[m][0] desc = gene_info_array[m][2] gene_name_match = 1 if gene_name_match == 0: m = temp_m gene_name_gene = "None" at_name_gene = "None" m += 1 fp_out11.write("*********************************\n") if unique == 0: at_gname_arr.append(at_name_gene) l3 = 1 l4 = 0 beg_dna_seq = [] end_dna_seq = [] subseq_dna = [] beg_in_between = [] end_in_between = [] dna_str_modif = [] temp_dna_str = '' substr_arr = [] # while not last position in a position array while l3 <= len(pos_arr): beg_dna_seq.append (pos_arr[l3-1]) end_dna_seq.append (pos_arr[l3]) # capitalized substring corresponding to specified positions -> .join subseq_dna.append(string.upper(dna_str[int(beg_dna_seq[l4])-1:int(end_dna_seq[l4])])) # get 500 characters before the very first valid subsequence in a whole DNA sequence if (l4 == 0) and (l == 0): substr_arr.append(dna_str[int(beg_dna_seq[l4])-1-500:int(beg_dna_seq[l4])-1]) substr_arr.append(string.upper(dna_str[int(beg_dna_seq[l4])-1:int(end_dna_seq[l4])])) dna_str_modif_zero = dna_str[:int(beg_dna_seq[l4])-1] dna_str_modif.append(string.upper(dna_str[int(beg_dna_seq[l4])-1:int(end_dna_seq[l4])])) #lowercase sequence in between 2 genes #(extact -500 before first valid subsequense in each CDS -> substr_arr) elif (l4 == 0) and (l != 0): substr_arr.append(dna_str[int(beg_dna_seq[l4])-1-500:int(beg_dna_seq[l4])-1]) substr_arr.append(string.upper(dna_str[int(beg_dna_seq[l4])-1:int(end_dna_seq[l4])])) if unique == 0: #if end position of one gene overlaps with the beginning position of another if last_from_before >= int(beg_dna_seq[0]): warning_msg.append ("WARNING: "+CDS_info_array[l-1][0]+"-"+gene_name_cds+": overlapping sequence fragment\n") dna_str_modif.append(string.upper(dna_str[last_from_before+1:int(end_dna_seq[l4])])) else: dna_str_modif.append(dna_str[last_from_before+1:int(beg_dna_seq[l4])-1]) dna_str_modif.append(string.upper(dna_str[int(beg_dna_seq[l4])-1:int(end_dna_seq[l4])])) fp_out7.write('>'+at_gname_arr[0]+'_'+at_name_gene+' [ '+CDS_info_array[l-1][0]+'_'+gene_name_cds+' ] '\ +string.upper(CDS_info_array[l-1][3][0])+ string.upper(direction[0])+'\n') fp_out7.write(dna_str[last_from_before+1:int(beg_dna_seq[l4])-1] +'\n') del at_gname_arr[0] last_from_before = 0 #lowercase sequence in between 2 position pairs elif (l4 > 0): substr_arr.append(dna_str[int(end_dna_seq[l4-1]):int(beg_dna_seq[l4])-1]) substr_arr.append(string.upper(dna_str[int(beg_dna_seq[l4])-1:int(end_dna_seq[l4])])) dna_str_modif.append(dna_str[int(end_dna_seq[l4-1]):int(beg_dna_seq[l4])-1]) dna_str_modif.append(string.upper(dna_str[int(beg_dna_seq[l4])-1:int(end_dna_seq[l4])])) else: dna_str_modif.append(string.upper(dna_str[int(beg_dna_seq[l4])-1:int(end_dna_seq[l4])])) substr_arr.append(string.upper(dna_str[int(beg_dna_seq[l4])-1:int(end_dna_seq[l4])])) l4 = l4+1 l3 = l3+2 # extact +500 after last valid subsequense in each CDS -> substr_arr substr_arr.append(dna_str[int(end_dna_seq[l4-1]):int(end_dna_seq[l4-1])+500]) #data for .500 file join_substr = ''.join (substr_arr) if (l == len(CDS_info_array)-1): dna_str_modif_last = dna_str[int(end_dna_seq[l4-1]):] if unique == 0: last_from_before = int(end_dna_seq[l4-1])-1 # temp_dna_str = ''.join (dna_str_modif) dna_modif.append(''.join(dna_str_modif)) if direction == "reverse": rev_join_substr = revcomp(join_substr) if len(CDS_info_array[l]) == 9: fp_out6.write(">" + at_name_gene + " " + desc + "[ "+ header + "CDS " + direction + " ] "+dupl_desc +"\n") else: fp_out6.write(">" + at_name_gene + " " + desc + "[ "+ header + "CDS " + direction + " ]\n") fp_out6.write(rev_join_substr + '\n') else: if len(CDS_info_array[l]) == 9: fp_out6.write(">" + at_name_gene + " " + desc + "[ "+ header + "CDS " + direction + " ] "+dupl_desc +"\n") else: fp_out6.write(">" + at_name_gene + " " + desc + "[ "+ header + "CDS " + direction + " ]\n") fp_out6.write(join_substr + '\n') else: if direction == "reverse": rev_join_substr = revcomp(join_substr) fp_out6.write(">" + at_name_gene + " " + desc + "[ "+ header + "CDS " + direction + " ] "+dupl_desc +"\n") fp_out6.write(rev_join_substr + '\n') else: fp_out6.write(">" + at_name_gene + " " + desc + "[ "+ header + "CDS " + direction + " ] "+dupl_desc +"\n") fp_out6.write(join_substr + '\n') #data for .join file dna_substr = ''.join(subseq_dna) if direction == "reverse": rev_comp_dna = revcomp(dna_substr) prot_str = dna_translate (rev_comp_dna) else: prot_str = dna_translate (dna_substr) #find middle position mid_pos = int((int(beg_dna_seq[0])+ int(end_dna_seq[len(end_dna_seq)-1]))/2) if unique == 0: fp_out5.write(at_name_gene+'\t'+gene_name_cds +'\t'+protein_id +'\t'+'CDS'+'\t'+direction+'\t'+ \ genbank_id+'\t'+str(mid_pos)+'\t'+positions+'\n') fp_out3.write(">" + at_name_gene + " " + desc + "[ "+ header + "CDS " + direction + " ]\n") fp_out3.write(prot_str +"\n") if len(CDS_info_array[l]) == 9: fp_out10.write(">" + at_name_gene + " " + desc + "[ "+ header + "CDS " + direction + " ] " + dupl_desc + '\n') fp_out10.write(prot_str +"\n") if direction == "reverse": rev_dna_substr = revcomp(dna_substr) fp_out2.write(">" + at_name_gene + " " + desc + "[ "+ header + "CDS " + direction + " ]\n") fp_out2.write(rev_dna_substr +'\n') else: fp_out2.write(">" + at_name_gene + " " + desc + "[ "+ header + "CDS " + direction + " ]\n") fp_out2.write(dna_substr +'\n') else: #write translated protein sequence into fp_out10 fp_out10.write(">" + at_name_gene + " " + desc + "[ "+ header + "CDS " + direction + " ] " + dupl_desc + '\n') fp_out10.write(prot_str +"\n") if len(dna_substr)%3 != 0: warning_msg.append ("WARNING: String length is not divisible by 3"+'\n') stop_cod_num = string.count(prot_str, "*") # count stop codons if stop_cod_num == 0: warning_msg.append ("WARNING: NO STOP CODON"+'\n') if (stop_cod_num == 1) and (prot_str[len(prot_str)-1] != '*') : warning_msg.append ("WARNING: PREMATURE STOP CODON"+'\n') if stop_cod_num >=2 : warning_msg.append ("WARNING: >1 STOP CODONS"+'\n') msg_str = ''.join(warning_msg) if len(msg_str) > 0: if first_warning == 0: fp_out8.write("***********************\n") fp_out8.write("* CDS REGION WARNINGS *\n") fp_out8.write("***********************\n\n") fp_out8.write(">" + at_name_gene + " " + desc + "[ "+ header + "CDS " + direction + " ]\n") fp_out8.write(msg_str) first_warning = 1 if format_cntr == 0 or format_cntr == 1000: clear_screen () print "TRANSLATING: Processing line " + str(l) + "... Please wait..." format_cntr = 0 format_cntr += 1 fp_out11.close() # close the output11 file fp_out10.close() # close the output10 file fp_out9.close() # close the output9 file fp_out8.close() # close the output8 file fp_out7.close() # close the output7 file fp_out6.close() # close the output6 file fp_out5.close() # close the output5 file fp_out4.close() # close the output4 file fp_out3.close() # close the output3 file fp_out2.close() # close the output2 file fp_in.close() # close the input file dna_modif.insert (0, dna_str_modif_zero) dna_modif.insert (len(dna_modif), dna_str_modif_last) modif_dna_str = ''.join (dna_modif) put100charsPerLine (len(modif_dna_str), modif_dna_str, fp_out1) fp_out1.close() # close the output1 file print "DNA length: " + str(len(dna_str)) print "\n\nModified file",fd1[1],"is created..." print "Modified file",fd2[1],"is created..." print "Modified file",fd3[1],"is created..." print "Modified file",fd4[1],"is created..." print "Modified file",fd5[1],"is created..." print "Modified file",fd6[1],"is created..." print "Modified file",fd7[1],"is created..." print "Modified file",fd8[1],"is created..." print "Modified file",fd9[1],"is created...\n\n"