#!/usr/bin/python ################################################################# # # # MAD MAPPING PROGRAM # # PART 1 ( CLUSTERING ) # # # # COPYRIGHT 2004 2005 # # Alexander Kozik # # # # http://www.atgc.org/ # # # # UCD Genome Center. R.Michelmore group # # # ################################################################# ################################################################# # +-------+ # # | BIT | # # SCORING SYSTEM: | | # # | REC | # # +-------+ # # # # . +-------+-------+-------+-------+-------+-------+ # # . | | | | | | | # # . | A | B | C | D | H | - | # # .| | | | | | | # # +-------*-------+-------+-------+-------+-------+-------+ # # | | . 6 | -6 | -4 | 4 | -2 | 0 | # # | A | | | | | | | # # | | 0 .| 1 | 1 | 0 | 0.5 | 0 | # # +-------+-------*-------+-------+-------+-------+-------+ # # | | -6 | . 6 | 4 | -4 | -2 | 0 | # # | B | | | | | | | # # | | 1 | 0 .| 0 | 1 | 0.5 | 0 | # # +-------+-------+-------*-------+-------+-------+-------+ # # | | -4 | 4 | . 4 | -4 | 0 | 0 | # # | C | | | | | | | # # | | 1 | 0 | 0 .| 1 | 0 | 0 | # # +-------+-------+-------+-------*-------+-------+-------+ # # | | 4 | -4 | -4 | . 4 | 0 | 0 | # # | D | | | | | | | # # | | 0 | 1 | 1 | 0 .| 0 | 0 | # # +-------+-------+-------+-------+-------*-------+-------+ # # | | -2 | -2 | 0 | 0 | . 2 | 0 | # # | H | | | | | | | # # | | 0.5 | 0.5 | 0 | 0 | 0 .| 0 | # # +-------+-------+-------+-------+-------+-------*-------+ # # | | 0 | 0 | 0 | 0 | 0 | . 0 | # # | - | | | | | | | # # | | 0 | 0 | 0 | 0 | 0 | 0 .| # # +-------+-------+-------+-------+-------+-------+-------*. # # # # # # NOTES: # # C - NOT A ( H or B ) # # D - NOT B ( H or A ) # # H - A and B # # # ################################################################# ################################################################# # # # EXAMPLES OF SCORING: # # # # # # POSITIVE LINKAGE: # # # # AAAAAAAAAAAAAAAAAAAA BIT SCORE = 6*20 = 120 # # AAAAAAAAAAAAAAAAAAAA REC SCORE = 0 (0.0) # # .. # # AAAAAAAAAAAAAAAAAAAA BIT SCORE = 6*18 - 6*2 = 96 # # AAAAAAAAAAAAAAAAAABB REC SCORE = 2 (2/20 = 0.1) # # # # AAAAAAAAAABBBBBBBBBB BIT SCORE = 6*10 + 6*10 = 120 # # AAAAAAAAAABBBBBBBBBB REC SCORE = 0 (0.0) # # .. # # AAAAAAAAABABBBBBBBBB BIT SCORE = 6*18 - 6*2 = 96 # # AAAAAAAAAABBBBBBBBBB REC SCORE = 2 (2/20 = 0.1) # # # # # # NO LINKAGE: # # .......... # # AAAAAAAAAAAAAAAAAAAA BIT SCORE = 6*10 - 6*10 = 0 # # AAAAAAAAAABBBBBBBBBB REC SCORE = 10 (10/20 = 0.5) # # . . . . . . . . . . # # BBBAABBAAAAAAABAABBB BIT SCORE = 6*10 - 6*10 = 0 # # BABBAABBABABABBBAABA REC SCORE = 10 (10/20 = 0.5) # # # # # # NEGATIVE LINKAGE: # # .................. # # AAAAAAAAAAAAAAAAAAAA BIT SCORE = 6*2 - 6*18 = -96 # # AABBBBBBBBBBBBBBBBBB REC SCORE = 18 (18/20 = 0.9) # # .................. # # ABABABABABABABABABAB BIT SCORE = 6*2 - 6*18 = -96 # # ABBABABABABABABABABA REC SCORE = 18 (18/20 = 0.9) # # # # # ################################################################# def Define_Bit_Scores(): ################### global bit_score_AA global bit_score_AB global bit_score_AC global bit_score_AD global bit_score_AH global bit_score_BA global bit_score_BB global bit_score_BC global bit_score_BD global bit_score_BH global bit_score_CA global bit_score_CB global bit_score_CC global bit_score_CD global bit_score_CH global bit_score_DA global bit_score_DB global bit_score_DC global bit_score_DD global bit_score_DH global bit_score_HA global bit_score_HB global bit_score_HC global bit_score_HD global bit_score_HH ################### global rec_score_AA global rec_score_AB global rec_score_AC global rec_score_AD global rec_score_AH global rec_score_BA global rec_score_BB global rec_score_BC global rec_score_BD global rec_score_BH global rec_score_CA global rec_score_CB global rec_score_CC global rec_score_CD global rec_score_CH global rec_score_DA global rec_score_DB global rec_score_DC global rec_score_DD global rec_score_DH global rec_score_HA global rec_score_HB global rec_score_HC global rec_score_HD global rec_score_HH ################### bit_score_AA = 6 bit_score_AB = -6 bit_score_AC = -4 bit_score_AD = 4 bit_score_AH = -2 bit_score_BA = -6 bit_score_BB = 6 bit_score_BC = 4 bit_score_BD = -4 bit_score_BH = -2 bit_score_CA = -4 bit_score_CB = 4 bit_score_CC = 4 bit_score_CD = -4 bit_score_CH = 0 bit_score_DA = 4 bit_score_DB = -4 bit_score_DC = -4 bit_score_DD = 4 bit_score_DH = 0 bit_score_HA = -2 bit_score_HB = -2 bit_score_HC = 0 bit_score_HD = 0 bit_score_HH = 2 ################# rec_score_AA = 0 rec_score_AB = 1 rec_score_AC = 1 rec_score_AD = 0 rec_score_AH = 0.5 rec_score_BA = 1 rec_score_BB = 0 rec_score_BC = 0 rec_score_BD = 1 rec_score_BH = 0.5 rec_score_CA = 1 rec_score_CB = 0 rec_score_CC = 0 rec_score_CD = 1 rec_score_CH = 0 rec_score_DA = 0 rec_score_DB = 1 rec_score_DC = 1 rec_score_DD = 0 rec_score_DH = 0 rec_score_HA = 0.5 rec_score_HB = 0.5 rec_score_HC = 0 rec_score_HD = 0 rec_score_HH = 0 ################# def Read_Data_File(in_name, out_name, rec_cut, bit_cut, dat_cut, frame_file_name): ################### global bit_score_AA global bit_score_AB global bit_score_AC global bit_score_AD global bit_score_AH global bit_score_BA global bit_score_BB global bit_score_BC global bit_score_BD global bit_score_BH global bit_score_CA global bit_score_CB global bit_score_CC global bit_score_CD global bit_score_CH global bit_score_DA global bit_score_DB global bit_score_DC global bit_score_DD global bit_score_DH global bit_score_HA global bit_score_HB global bit_score_HC global bit_score_HD global bit_score_HH ################### global rec_score_AA global rec_score_AB global rec_score_AC global rec_score_AD global rec_score_AH global rec_score_BA global rec_score_BB global rec_score_BC global rec_score_BD global rec_score_BH global rec_score_CA global rec_score_CB global rec_score_CC global rec_score_CD global rec_score_CH global rec_score_DA global rec_score_DB global rec_score_DC global rec_score_DD global rec_score_DH global rec_score_HA global rec_score_HB global rec_score_HC global rec_score_HD global rec_score_HH ################### print "=============================================" print "INPUT FILE: " + in_name print "OUTPUT FILE: " + out_name print "RECM CUTOFF: " + str(rec_cut) print "BITS CUTOFF: " + str(bit_cut) print "DATA CUTOFF: " + str(dat_cut) in_file = open(in_name, "rb") out_file0 = open(out_name + '.pairs_positive', "wb") out_file1 = open(out_name + '.pairs_all', "wb") out_file2 = open(out_name + '.pairs_negative', "wb") ############# GOOD AND BAD SET ############### out_file4 = open(out_name + '.set_uniq', "wb") out_file5 = open(out_name + '.set_dupl', "wb") ############################################## out_file6 = open(out_name + '.x_log_file', "wb") ############################################## out_file7 = open(out_name + '.x_scores_stat', "wb") ############################################## global out_file8 out_file8 = open(out_name + '.x_tree_clust', "wb") ############################################## out_file6.write("=============================================" + '\n') out_file6.write("INPUT FILE: " + in_name + '\n') out_file6.write("OUTPUT FILE: " + out_name + '\n') out_file6.write("RECM CUTOFF: " + str(rec_cut) + '\n') out_file6.write("BITS CUTOFF: " + str(bit_cut) + '\n') out_file6.write("DATA CUTOFF: " + str(dat_cut) + '\n') time.sleep(2) global id_list global id_array global pairs_array global pairs_array_N global frame_array global tree_clust_array id_list = [] ; # LIST OF ALL NON-REDUNDANT IDs id_array = {} ; # TO CHECK AND CREATE NON-REDUNDANT LIST matrix_array = {} ; # PAIRWISE DATA FOR GOOD PAIRS ( <= 0.4 ) matrix_array_X = {} ; # PAIRWISE DATA FOR GOOD PAIRS ( <= 0.25 ) matrix_array_1 = {} ; # 0.20 matrix_array_2 = {} ; # 0.18 matrix_array_3 = {} ; # 0.16 matrix_array_4 = {} ; # 0.14 matrix_array_5 = {} ; # 0.12 matrix_array_6 = {} ; # 0.10 matrix_array_7 = {} ; # 0.09 matrix_array_8 = {} ; # 0.08 matrix_array_9 = {} ; # 0.07 matrix_array_A = {} ; # 0.06 matrix_array_B = {} ; # 0.05 matrix_array_C = {} ; # 0.04 matrix_array_D = {} ; # 0.03 matrix_array_E = {} ; # 0.02 matrix_array_F = {} ; # 0.01 matrix_array_G = {} ; # 0.00 matrix_array_N = {} ; # PAIRWISE DATA FOR NEGATIVE LINKAGE ( >= 0.6 ) matrix_array_N1 = {} ; # -0.7 matrix_array_N2 = {} ; # -0.8 matrix_array_N3 = {} ; # -0.9 pairs_array = {} ; # ARRAY OF ALL POSITIVE PAIRS IN THE MATRIX pair_counter = 0 ; # COUNTER OF POSITIVE PAIRS IN THE MATRIX pairs_array_N = {} ; # ARRAY OF ALL NEGATIVE PAIRS IN THE MATRIX pair_counter_N = 0 ; # COUNTER OF NEGATIVE PAIRS IN THE MATRIX sequence_array = {} ; # ARRAY OF ALL SEQUENCES frame_array = {} ; # ARRAY OF FRAME MARKERS frame_position = {} ; # ARRAY WITH FRAME MARKERS COORDINATES bit_sum_array = {} ; # SUMMARY OF BIT SCORES rec_sum_array = {} ; # SUMMARY OF REC SCORES bit_abs_array = {} ; # SUMMARY OF BIT SCORES rec_abs_array = {} ; # SUMMARY OF REC SCORES tree_clust_array = {} ; # SUMMARY OF CLUSTERING ARRAY rec_list = ["00_01", "01_02", "02_03", "03_04", "04_05", "05_06", "06_07", "07_08", "08_09", "09_10"] bit_list = ["BIT_POS", "BIT_MED", "BIT_NEG"] ### HEADER IN SUMMARY FILE ### out_file7.write("MARKER_ID" + '\t') for item in rec_list: out_file7.write(item + '\t') out_file7.write("REC:P-N" + '\t') out_file7.write("REC_ABS" + '\t') out_file7.write("***" + '\t') for item in bit_list: out_file7.write(item + '\t') out_file7.write("BIT:P-N" + '\t') out_file7.write("BIT_ABS" + '\t') out_file7.write("***" + '\t') out_file7.write("LG_SUMMARY" + '\t') out_file7.write("NP_SUMMARY" + '\n') global print_frame print_frame = "FALSE" ### TRY TO READ FRAME MARKERS LIST ### try: frame_file = open(frame_file_name, "rb") print "USING FRAME MARKERS LIST" while 1: u = frame_file.readline() if u == '': break if '\n' in u: u = u[:-1] if '\r' in u: u = u[:-1] u = u.split('\t') fm = u[1] fl = u[0] try: fp = u[2] except: fp = "-" frame_array[fm] = fl frame_position[fm] = fp print_frame = "TRUE" except: print "DID NOT FIND FRAME MARKERS FILE: " + frame_file_name print "WORKING WITHOUT FRAME MARKERS LIST" # continue ### READ DATA FILE ### print "=============================================" out_file6.write("=============================================" + '\n') n = 0 d = 0 l = 1 init_len = 10000 duplic_id = [] while 1: t = in_file.readline() if t == '': break if '\n' in t: t = t[:-1] if '\r' in t: t = t[:-1] tl = t.split('\t') curr_len = len(tl) if tl[0][0] == ";": print "=============================================" print tl print "JUNK LINE" print "=============================================" time.sleep(2) if l == 1 and curr_len >= 12 and tl[0][0] != ";": init_len = curr_len if curr_len == init_len and tl[0][0] != ";": #### id = tl[0] # if id not in id_list: try: id_test = id_array[id] duplic_id.append(id) out_file5.write(t + '\n') print '\n' print id + " IS DUPLICATED -=- CHECK DATA FOR DUPLICATION" out_file6.write( id + " IS DUPLICATED -=- CHECK DATA FOR DUPLICATION" + '\n') d = d + 1 except: id_array[id] = 1 id_list.append(id) out_file4.write(t + '\n') q = 1 while q < init_len: data_point = tl[q] if data_point == "A" or data_point == "B" or data_point == "C" \ or data_point == "D" or data_point == "H" or data_point == "-": sequence_array[id,q] = data_point else: print '\n' print "WRONG DATA FORMAT" print "CHECK LINE: " + `l` + '\n' print t print "=============================================" print "VALUE: " + data_point print "=============================================" print "TERMINATED.........." print "=============================================" out_file6.write("TERMINATED") out_file6.write(".........." + '\n') sys.exit() q = q + 1 sys.stdout.write(".") n = n + 1 l = l + 1 if curr_len != init_len and l > 1: print '\n' print "WRONG NUMBER OF DATA POINTS" print "CHECK LINE: " + `l` + '\n' print t print "=============================================" print "TERMINATED.........." print "=============================================" out_file6.write("TERMINATED") out_file6.write(".........." + '\n') sys.exit() print '\n' print "=============================================" print `n` + " UNIQ IDs IN THE SET FOUND" print `d` + " IDs ARE DUPLICATED" out_file6.write("=============================================" + '\n') out_file6.write(`n` + " UNIQ IDs IN THE SET FOUND" + '\n') out_file6.write(`d` + " IDs ARE DUPLICATED" + '\n') duplic_id.sort() print duplic_id out_file6.write("=============================================" + '\n') print "CONTINUE ANALYSIS WITH " + `n` + " SEQUENCES OUT OF " + `n+d` out_file6.write("CONTINUE ANALYSIS WITH " + `n` + " SEQUENCES OUT OF " + `n+d` + '\n') print "=============================================" print "SUCCESS!!!" print "=============================================" out_file6.write("=============================================" + '\n') ######### PAIRWISE COMPARISON ######### id_list.sort() ### SET SUMMARY VALUES FOR MARKERS #### for item in id_list: tree_clust_array[item] = [] rec_abs_array[item] = 0 bit_abs_array[item] = 0 for rec_range in rec_list: rec_sum_array[item,rec_range] = 0 for bit_range in bit_list: bit_sum_array[item,bit_range] = 0 ### START ### dummy_cat = 1 dummy_len = len(id_list) dummy_value = dummy_len*dummy_len print `dummy_value` + " PAIRWISE COMPARISONS HAVE TO BE DONE" for item_a in id_list: for item_b in id_list: p = 1 bit_score = 0 recomb = 0 data_loss = 0 data_points = 0 line_tick_update = math.fmod(dummy_cat, 1000) if line_tick_update == 0: print `dummy_cat` + " COMPARISONS DONE OUT OF " + `dummy_value` while p < init_len: score_a = sequence_array[item_a,p] score_b = sequence_array[item_b,p] #### SCORES FOR PAIRWISE COMPARISON #### #### NO DATA #### if score_a == "-" or score_b == "-": data_loss = data_loss + 1 #### PERFECT MATCH #### if score_a == "A" and score_b == "A": recomb = recomb + rec_score_AA bit_score = bit_score + bit_score_AA data_points = data_points + 1 if score_a == "B" and score_b == "B": recomb = recomb + rec_score_BB bit_score = bit_score + bit_score_BB data_points = data_points + 1 #### REPULSION #### if score_a == "A" and score_b == "B": recomb = recomb + rec_score_AB bit_score = bit_score + bit_score_AB data_points = data_points + 1 if score_a == "B" and score_b == "A": recomb = recomb + rec_score_BA bit_score = bit_score + bit_score_BA data_points = data_points + 1 #### MONO - HETERO #### if score_a == "A" and score_b == "H": recomb = recomb + rec_score_AH bit_score = bit_score + bit_score_AH data_points = data_points + 1 if score_a == "B" and score_b == "H": recomb = recomb + rec_score_BH bit_score = bit_score + bit_score_BH data_points = data_points + 1 #### HETERO - MONO #### if score_a == "H" and score_b == "A": recomb = recomb + rec_score_HA bit_score = bit_score + bit_score_HA data_points = data_points + 1 if score_a == "H" and score_b == "B": recomb = recomb + rec_score_HB bit_score = bit_score + bit_score_HB data_points = data_points + 1 ### HETERO - HETERO ### if score_a == "H" and score_b == "H": recomb = recomb + rec_score_HH bit_score = bit_score + bit_score_HH data_points = data_points + 1 #### C CASE ALL #### if score_a == "C" and score_b == "C": recomb = recomb + rec_score_CC bit_score = bit_score + bit_score_CC data_points = data_points + 1 #### C - CASE (L) #### if score_a == "C" and score_b == "A": recomb = recomb + rec_score_CA bit_score = bit_score + bit_score_CA data_points = data_points + 1 if score_a == "C" and score_b == "B": recomb = recomb + rec_score_CB bit_score = bit_score + bit_score_CB data_points = data_points + 1 if score_a == "C" and score_b == "H": recomb = recomb + rec_score_CH bit_score = bit_score + bit_score_CH data_points = data_points + 1 #### C - CASE (R) #### if score_a == "A" and score_b == "C": recomb = recomb + rec_score_AC bit_score = bit_score + bit_score_AC data_points = data_points + 1 if score_a == "B" and score_b == "C": recomb = recomb + rec_score_BC bit_score = bit_score + bit_score_BC data_points = data_points + 1 if score_a == "H" and score_b == "C": recomb = recomb + rec_score_HC bit_score = bit_score + bit_score_HC data_points = data_points + 1 #### D CASE ALL #### if score_a == "D" and score_b == "D": recomb = recomb + rec_score_DD bit_score = bit_score + bit_score_DD data_points = data_points + 1 #### D - CASE (L) #### if score_a == "D" and score_b == "A": recomb = recomb + rec_score_DA bit_score = bit_score + bit_score_DA data_points = data_points + 1 if score_a == "D" and score_b == "B": recomb = recomb + rec_score_DB bit_score = bit_score + bit_score_DB data_points = data_points + 1 if score_a == "D" and score_b == "H": recomb = recomb + rec_score_DH bit_score = bit_score + bit_score_DH data_points = data_points + 1 #### D - CASE (R) #### if score_a == "A" and score_b == "D": recomb = recomb + rec_score_AD bit_score = bit_score + bit_score_AD data_points = data_points + 1 if score_a == "B" and score_b == "D": recomb = recomb + rec_score_BD bit_score = bit_score + bit_score_BD data_points = data_points + 1 if score_a == "H" and score_b == "D": recomb = recomb + rec_score_HD bit_score = bit_score + bit_score_HD data_points = data_points + 1 ##################################### #### D -=- C #### if score_a == "D" and score_b == "C": recomb = recomb + rec_score_DC bit_score = bit_score + bit_score_DC data_points = data_points + 1 #### C -=- D #### if score_a == "C" and score_b == "D": recomb = recomb + rec_score_CD bit_score = bit_score + bit_score_CD data_points = data_points + 1 ##################################### p = p + 1 dummy_cat = dummy_cat + 1 ### DATA POINT TEST ### if data_points + data_loss != init_len - 1: print "...SOMETHING IS WRONG..." sys.exit() ### ANALYZE ONLY VALUABLE CRAP ### data_fr = data_points*1.00/(data_points + data_loss) if data_fr >= dat_cut: rec_dec = recomb*1.00/data_points rec_dec = round(rec_dec,2) rec_str = str(rec_dec) data_fr = round(data_fr,2) data_fr_str = str(data_fr) ### SUMMARY DATA ### rec_abs_array[item_a] = rec_abs_array[item_a] + (0.5 - rec_dec) bit_abs_array[item_a] = bit_abs_array[item_a] + bit_score rec_data = rec_dec if rec_data <= 0.1 and rec_data >= 0: rec_sum_array[item_a,"00_01"] = rec_sum_array[item_a,"00_01"] + 1 if rec_data <= 0.2 and rec_data > 0.1: rec_sum_array[item_a,"01_02"] = rec_sum_array[item_a,"01_02"] + 1 if rec_data <= 0.3 and rec_data > 0.2: rec_sum_array[item_a,"02_03"] = rec_sum_array[item_a,"02_03"] + 1 if rec_data <= 0.4 and rec_data > 0.3: rec_sum_array[item_a,"03_04"] = rec_sum_array[item_a,"03_04"] + 1 if rec_data <= 0.5 and rec_data > 0.4: rec_sum_array[item_a,"04_05"] = rec_sum_array[item_a,"04_05"] + 1 if rec_data <= 0.6 and rec_data > 0.5: rec_sum_array[item_a,"05_06"] = rec_sum_array[item_a,"05_06"] + 1 if rec_data <= 0.7 and rec_data > 0.6: rec_sum_array[item_a,"06_07"] = rec_sum_array[item_a,"06_07"] + 1 if rec_data <= 0.8 and rec_data > 0.7: rec_sum_array[item_a,"07_08"] = rec_sum_array[item_a,"07_08"] + 1 if rec_data <= 0.9 and rec_data > 0.8: rec_sum_array[item_a,"08_09"] = rec_sum_array[item_a,"08_09"] + 1 if rec_data <= 1.0 and rec_data > 0.9: rec_sum_array[item_a,"09_10"] = rec_sum_array[item_a,"09_10"] + 1 if bit_score >= 100: bit_sum_array[item_a,"BIT_POS"] = bit_sum_array[item_a,"BIT_POS"] + 1 if bit_score <= -100: bit_sum_array[item_a,"BIT_NEG"] = bit_sum_array[item_a,"BIT_NEG"] + 1 if bit_score > -100 and bit_score < 100: bit_sum_array[item_a,"BIT_MED"] = bit_sum_array[item_a,"BIT_MED"] + 1 ### MATRIX DATA ### if rec_dec >= 0.6: pair_counter_N = pair_counter_N + 1 current_pair = [item_a, item_b] pairs_array_N[pair_counter_N] = current_pair matrix_values = [rec_dec, bit_score, data_fr] matrix_array_N[item_a,item_b] = matrix_values matrix_array_N1[item_a,item_b] = matrix_values matrix_array_N2[item_a,item_b] = matrix_values matrix_array_N3[item_a,item_b] = matrix_values out_file2.write(item_a + '\t' + item_b + '\t' + rec_str + '\t' + \ `bit_score` + '\t' + data_fr_str + '\t' + "***" + '\t' + `recomb` + \ '\t' + `data_points` + '\t' + `data_loss` + '\t' + `data_points + data_loss` + '\n') if rec_dec <= 0.4: pair_counter = pair_counter + 1 current_pair = [item_a, item_b] pairs_array[pair_counter] = current_pair matrix_values = [rec_dec, bit_score, data_fr] matrix_array[item_a,item_b] = matrix_values if rec_dec <= 0.25: matrix_array_X[item_a,item_b] = matrix_values ; # 0.25 matrix_array_1[item_a,item_b] = matrix_values ; # 0.20 matrix_array_2[item_a,item_b] = matrix_values ; # 0.18 matrix_array_3[item_a,item_b] = matrix_values ; # 0.16 matrix_array_4[item_a,item_b] = matrix_values ; # 0.14 matrix_array_5[item_a,item_b] = matrix_values ; # 0.12 matrix_array_6[item_a,item_b] = matrix_values ; # 0.10 matrix_array_7[item_a,item_b] = matrix_values ; # 0.09 matrix_array_8[item_a,item_b] = matrix_values ; # 0.08 matrix_array_9[item_a,item_b] = matrix_values ; # 0.07 matrix_array_A[item_a,item_b] = matrix_values ; # 0.06 matrix_array_B[item_a,item_b] = matrix_values ; # 0.05 matrix_array_C[item_a,item_b] = matrix_values ; # 0.04 matrix_array_D[item_a,item_b] = matrix_values ; # 0.03 matrix_array_E[item_a,item_b] = matrix_values ; # 0.02 matrix_array_F[item_a,item_b] = matrix_values ; # 0.01 matrix_array_G[item_a,item_b] = matrix_values ; # 0.00 out_file0.write(item_a + '\t' + item_b + '\t' + rec_str + '\t' + \ `bit_score` + '\t' + data_fr_str + '\t' + "***" + '\t' + `recomb` + \ '\t' + `data_points` + '\t' + `data_loss` + '\t' + `data_points + data_loss` + '\n') out_file1.write( item_a + '\t' + item_b + '\t' + rec_str + '\t' + \ `bit_score` + '\t' + data_fr_str + '\t' + "***" + '\t' + `recomb` + \ '\t' + `data_points` + '\t' + `data_loss` + '\t' + `data_points + data_loss` + '\n') ######################################## if data_fr < dat_cut: print "DATA POINTS ARE LESS THAN CUTOFF FOR PAIR: " + item_a + " " + item_b + " " + `data_points` ######################################## ### SUMMARY FILE ### rec_sum_diff1 = rec_sum_array[item_a,"00_01"] + rec_sum_array[item_a,"01_02"] \ - rec_sum_array[item_a,"08_09"] - rec_sum_array[item_a,"09_10"] bit_sum_diff1 = bit_sum_array[item_a,"BIT_POS"] - bit_sum_array[item_a,"BIT_NEG"] rec_sum_diff2 = rec_abs_array[item_a] bit_sum_diff2 = bit_abs_array[item_a] sum_text = "---" lnkg_sum = "CLASS_X" ### NP SUMMARY ### if rec_sum_diff2 > 0 and bit_sum_diff2 > 0: sum_text = "POSITIVE" if rec_sum_diff2 ==0 and bit_sum_diff2 == 0: sum_text = "NEUTRAL" if rec_sum_diff2 < 0 and bit_sum_diff2 >= 0: sum_text = "NEGATIVE REC" if bit_sum_diff2 < 0 and rec_sum_diff2 >= 0: sum_text = "NEGATIVE BIT" if bit_sum_diff2 < 0 and rec_sum_diff2 < 0: sum_text = "NEGATIVE ALL" ### LINKAGE SUMMARY ### min_best = 6 min_good = 3 min_step = 3 ### CLASS E if rec_sum_array[item_a,"00_01"] >= 0 and rec_sum_array[item_a,"00_01"] < min_good: if rec_sum_array[item_a,"01_02"] >= rec_sum_array[item_a,"00_01"] and rec_sum_array[item_a,"02_03"] >= rec_sum_array[item_a,"01_02"]: lnkg_sum = "CLASS_E" ### CLASS C AND D if rec_sum_array[item_a,"00_01"] >= min_good and rec_sum_array[item_a,"00_01"] < min_best: if rec_sum_array[item_a,"01_02"] >= rec_sum_array[item_a,"00_01"] and rec_sum_array[item_a,"02_03"] >= rec_sum_array[item_a,"01_02"]: lnkg_sum = "CLASS_D" if rec_sum_array[item_a,"01_02"] >= (rec_sum_array[item_a,"00_01"]+min_step) and rec_sum_array[item_a,"02_03"] >= (rec_sum_array[item_a,"01_02"]+min_step): lnkg_sum = "CLASS_C" ### CLASS A AND B if rec_sum_array[item_a,"00_01"] >= min_best: if rec_sum_array[item_a,"01_02"] >= rec_sum_array[item_a,"00_01"] and rec_sum_array[item_a,"02_03"] >= rec_sum_array[item_a,"01_02"]: lnkg_sum = "CLASS_B" if rec_sum_array[item_a,"01_02"] >= (rec_sum_array[item_a,"00_01"]+min_step) and rec_sum_array[item_a,"02_03"] >= (rec_sum_array[item_a,"01_02"]+min_step): lnkg_sum = "CLASS_A" ### CLASS F if rec_sum_array[item_a,"00_01"] == 0 and rec_sum_array[item_a,"01_02"] == 0: lnkg_sum = "CLASS_F" ####################### out_file7.write( item_a + '\t' + str(rec_sum_array[item_a,"00_01"]) + '\t' + \ str(rec_sum_array[item_a,"01_02"]) + '\t' + \ str(rec_sum_array[item_a,"02_03"]) + '\t' + \ str(rec_sum_array[item_a,"03_04"]) + '\t' + \ str(rec_sum_array[item_a,"04_05"]) + '\t' + \ str(rec_sum_array[item_a,"05_06"]) + '\t' + \ str(rec_sum_array[item_a,"06_07"]) + '\t' + \ str(rec_sum_array[item_a,"07_08"]) + '\t' + \ str(rec_sum_array[item_a,"08_09"]) + '\t' + \ str(rec_sum_array[item_a,"09_10"]) + '\t' + \ str(rec_sum_diff1) + '\t' + \ str(round(rec_sum_diff2,2)) + '\t' + \ "***" + '\t' + \ str(bit_sum_array[item_a,"BIT_POS"]) + '\t' + \ str(bit_sum_array[item_a,"BIT_MED"]) + '\t' + \ str(bit_sum_array[item_a,"BIT_NEG"]) + '\t' + \ str(bit_sum_diff1) + '\t' + \ str(bit_sum_diff2) + '\t' + \ "***" + '\t' + \ lnkg_sum + '\t' + \ sum_text + '\n') ##################### sys.stdout.write(".") ##################### print "" print "=============================================" print "GLOBAL MATRIX CREATED" print "=============================================" #### POSITIVE CLUSTERING #### print "" print "STARTING POSITIVE CLUSTERING" print "=============================================" global out_file14 out_file14 = open(out_name + '.group_info' + "_Summary", "wb") time.sleep(2) Seqs_Clustering(rec_cut, bit_cut, dat_cut, "01", matrix_array_1) ; # 0.20 print "ROUND 1 DONE" print `group_count` + " GROUPS WERE FOUND (STEP 01)" out_file6.write(`group_count` + " GROUPS WERE FOUND (STEP 01)" + '\n') time.sleep(2) Seqs_Clustering(rec_cut, bit_cut, dat_cut, "02", matrix_array_2) ; # 0.18 print "ROUND 2 DONE" print `group_count` + " GROUPS WERE FOUND (STEP 02)" out_file6.write(`group_count` + " GROUPS WERE FOUND (STEP 02)" + '\n') time.sleep(2) Seqs_Clustering(rec_cut, bit_cut, dat_cut, "03", matrix_array_3) ; # 0.16 print "ROUND 3 DONE" print `group_count` + " GROUPS WERE FOUND (STEP 03)" out_file6.write(`group_count` + " GROUPS WERE FOUND (STEP 03)" + '\n') time.sleep(2) Seqs_Clustering(rec_cut, bit_cut, dat_cut, "04", matrix_array_4) ; # 0.14 print "ROUND 4 DONE" print `group_count` + " GROUPS WERE FOUND (STEP 04)" out_file6.write(`group_count` + " GROUPS WERE FOUND (STEP 04)" + '\n') time.sleep(2) Seqs_Clustering(rec_cut, bit_cut, dat_cut, "05", matrix_array_5) ; # 0.12 print "ROUND 5 DONE" print `group_count` + " GROUPS WERE FOUND (STEP 05)" out_file6.write(`group_count` + " GROUPS WERE FOUND (STEP 05)" + '\n') time.sleep(2) Seqs_Clustering(rec_cut, bit_cut, dat_cut, "06", matrix_array_6) ; # 0.10 print "ROUND 6 DONE" print `group_count` + " GROUPS WERE FOUND (STEP 06)" out_file6.write(`group_count` + " GROUPS WERE FOUND (STEP 06)" + '\n') time.sleep(2) Seqs_Clustering(rec_cut, bit_cut, dat_cut, "07", matrix_array_7) ; # 0.09 print "ROUND 7 DONE" print `group_count` + " GROUPS WERE FOUND (STEP 07)" out_file6.write(`group_count` + " GROUPS WERE FOUND (STEP 07)" + '\n') time.sleep(2) Seqs_Clustering(rec_cut, bit_cut, dat_cut, "08", matrix_array_8) ; # 0.08 print "ROUND 8 DONE" print `group_count` + " GROUPS WERE FOUND (STEP 08)" out_file6.write(`group_count` + " GROUPS WERE FOUND (STEP 08)" + '\n') time.sleep(2) Seqs_Clustering(rec_cut, bit_cut, dat_cut, "09", matrix_array_9) ; # 0.07 print "ROUND 9 DONE" print `group_count` + " GROUPS WERE FOUND (STEP 09)" out_file6.write(`group_count` + " GROUPS WERE FOUND (STEP 09)" + '\n') time.sleep(2) Seqs_Clustering(rec_cut, bit_cut, dat_cut, "10", matrix_array_A) ; # 0.06 print "ROUND 10 DONE" print `group_count` + " GROUPS WERE FOUND (STEP 10)" out_file6.write(`group_count` + " GROUPS WERE FOUND (STEP 10)" + '\n') time.sleep(2) Seqs_Clustering(rec_cut, bit_cut, dat_cut, "11", matrix_array_B) ; # 0.05 print "ROUND 11 DONE" print `group_count` + " GROUPS WERE FOUND (STEP 11)" out_file6.write(`group_count` + " GROUPS WERE FOUND (STEP 11)" + '\n') time.sleep(2) Seqs_Clustering(rec_cut, bit_cut, dat_cut, "12", matrix_array_C) ; # 0.04 print "ROUND 12 DONE" print `group_count` + " GROUPS WERE FOUND (STEP 12)" out_file6.write(`group_count` + " GROUPS WERE FOUND (STEP 12)" + '\n') time.sleep(2) Seqs_Clustering(rec_cut, bit_cut, dat_cut, "13", matrix_array_D) ; # 0.03 print "ROUND 13 DONE" print `group_count` + " GROUPS WERE FOUND (STEP 13)" out_file6.write(`group_count` + " GROUPS WERE FOUND (STEP 13)" + '\n') time.sleep(2) Seqs_Clustering(rec_cut, bit_cut, dat_cut, "14", matrix_array_E) ; # 0.02 print "ROUND 14 DONE" print `group_count` + " GROUPS WERE FOUND (STEP 14)" out_file6.write(`group_count` + " GROUPS WERE FOUND (STEP 14)" + '\n') time.sleep(2) Seqs_Clustering(rec_cut, bit_cut, dat_cut, "15", matrix_array_F) ; # 0.01 print "ROUND 15 DONE" print `group_count` + " GROUPS WERE FOUND (STEP 15)" out_file6.write(`group_count` + " GROUPS WERE FOUND (STEP 15)" + '\n') time.sleep(2) Seqs_Clustering(rec_cut, bit_cut, dat_cut, "16", matrix_array_G) ; # 0.00 print "ROUND 16 DONE" print `group_count` + " GROUPS WERE FOUND (STEP 16)" out_file6.write(`group_count` + " GROUPS WERE FOUND (STEP 16)" + '\n') time.sleep(2) #### NEGATIVE CLUSTERING #### print "" print "=============================================" print "STARTING NEGATIVE CLUSTERING" print "=============================================" time.sleep(2) Seqs_Neg_Clustering(rec_cut, bit_cut, dat_cut, "N1", matrix_array_N1) ; # -0.7 print "ROUND N1 DONE" print `group_count` + " GROUPS WERE FOUND (STEP N1)" out_file6.write(`group_count` + " GROUPS WERE FOUND (STEP N1)" + '\n') time.sleep(2) Seqs_Neg_Clustering(rec_cut, bit_cut, dat_cut, "N2", matrix_array_N2) ; # -0.8 print "ROUND N2 DONE" print `group_count` + " GROUPS WERE FOUND (STEP N2)" out_file6.write(`group_count` + " GROUPS WERE FOUND (STEP N2)" + '\n') time.sleep(2) Seqs_Neg_Clustering(rec_cut, bit_cut, dat_cut, "N3", matrix_array_N3) ; # -0.9 print "ROUND N3 DONE" print `group_count` + " GROUPS WERE FOUND (STEP N3)" out_file6.write(`group_count` + " GROUPS WERE FOUND (STEP N3)" + '\n') time.sleep(2) out_file6.write("=============================================" + '\n') ### GROUP LIST ### f = 0 for item in id_list: ### TRY FRAME MARKER ID ### if print_frame == "TRUE": try: frame_group = frame_array[item] except: frame_group = "-" try: frame_coords = frame_position[item] except: frame_coords = "-" if print_frame == "FALSE": frame_group = "-" frame_coords = "-" tree_clust_array[item].append("***") tree_clust_array[item].append(frame_group) tree_clust_array[item].append(frame_coords) tree_clust_array[item].append("***") tree_clust_array[item].append(str(f)) tree_clust_array[item].append("***") tree_clust_array[item].append("LG") tree_clust_array[item].append(item) f = f + 1 print tree_clust_array[item] print "" print "=============================================" print " FINAL STEP " print " PROCESSING MEGALIST " print " DENDRO SORTING " print "=============================================" print "" time.sleep(2) ### MEGA LIST ### mega_list = [] for item in id_list: mega_list.append(tree_clust_array[item]) # print mega_list mega_list.sort() f = 0 for item in mega_list: current_list = [] item.append(str(f)) for subitem in item: subitem = str(subitem) current_list.append(subitem) current_list = "\t".join(current_list) print current_list out_file8.write(current_list + '\n') f = f + 1 # print mega_list ####################################################################### # for item in id_list: # ### TRY FRAME MARKER ID ### # if print_frame == "TRUE": # try: # frame_group = frame_array[element] # except: # frame_group = "-" # if print_frame == "FALSE": # frame_group = "-" # current_list = tree_clust_array[item] # current_list = "\t".join(current_list) # out_file8.write(item + '\t' + current_list + '\t' + "***" + '\t' + frame_group + '\t' + "***" + '\t' + `f` + '\n') # f = f + 1 ####################################################################### print "" print "=============================================" print " ANALYSIS DONE " print " ENJOY MAPPING " print "=============================================" print "" in_file.close() out_file0.close() out_file1.close() out_file2.close() out_file4.close() out_file5.close() out_file6.close() out_file7.close() out_file8.close() out_file14.close() #### POSITIVE CLUSTERING #### def Seqs_Clustering(rec_cut, bit_cut, dat_cut, x_counter, matrix_array_current): global id_list global id_array global adj_array global already_done global group_count global dfs_counter global working_group global group_depth global node_count global node_array global sequence_array global tree_clust_array global pairs_array global frame_array global print_frame global out_file14 global out_file8 node_array = {} adj_array = {} already_done = [] group_count = 0 pair_matrix_count = 0 pair_matrix_array = {} print "" print "ALREADY DONE" print already_done print "" print "" print "MATRIX ARRAY" print len(matrix_array_current) print "" time.sleep(2) out_file10 = open(out_name + '.matrix' + "_" + x_counter, "wb") out_file12 = open(out_name + '.adj_list' + "_" + x_counter, "wb") out_file13 = open(out_name + '.group_info' + "_" + x_counter, "wb") if x_counter == "01": rec_cut = rec_cut out_file14.write("### CLUSTERING RESULTS ITERATION 01 REC CUTOFF: " + str(rec_cut) + " ###" + '\n') if x_counter == "02": rec_cut = rec_cut - 0.02 out_file14.write("### CLUSTERING RESULTS ITERATION 02 REC CUTOFF: " + str(rec_cut) + " ###" + '\n') if x_counter == "03": rec_cut = rec_cut - 0.04 out_file14.write("### CLUSTERING RESULTS ITERATION 03 REC CUTOFF: " + str(rec_cut) + " ###" + '\n') if x_counter == "04": rec_cut = rec_cut - 0.06 out_file14.write("### CLUSTERING RESULTS ITERATION 04 REC CUTOFF: " + str(rec_cut) + " ###" + '\n') if x_counter == "05": rec_cut = rec_cut - 0.08 out_file14.write("### CLUSTERING RESULTS ITERATION 05 REC CUTOFF: " + str(rec_cut) + " ###" + '\n') if x_counter == "06": rec_cut = rec_cut - 0.10 out_file14.write("### CLUSTERING RESULTS ITERATION 06 REC CUTOFF: " + str(rec_cut) + " ###" + '\n') if x_counter == "07": rec_cut = rec_cut - 0.11 out_file14.write("### CLUSTERING RESULTS ITERATION 07 REC CUTOFF: " + str(rec_cut) + " ###" + '\n') if x_counter == "08": rec_cut = rec_cut - 0.12 out_file14.write("### CLUSTERING RESULTS ITERATION 08 REC CUTOFF: " + str(rec_cut) + " ###" + '\n') if x_counter == "09": rec_cut = rec_cut - 0.13 out_file14.write("### CLUSTERING RESULTS ITERATION 09 REC CUTOFF: " + str(rec_cut) + " ###" + '\n') if x_counter == "10": rec_cut = rec_cut - 0.14 out_file14.write("### CLUSTERING RESULTS ITERATION 10 REC CUTOFF: " + str(rec_cut) + " ###" + '\n') if x_counter == "11": rec_cut = rec_cut - 0.15 out_file14.write("### CLUSTERING RESULTS ITERATION 11 REC CUTOFF: " + str(rec_cut) + " ###" + '\n') if x_counter == "12": rec_cut = rec_cut - 0.16 out_file14.write("### CLUSTERING RESULTS ITERATION 12 REC CUTOFF: " + str(rec_cut) + " ###" + '\n') if x_counter == "13": rec_cut = rec_cut - 0.17 out_file14.write("### CLUSTERING RESULTS ITERATION 13 REC CUTOFF: " + str(rec_cut) + " ###" + '\n') if x_counter == "14": rec_cut = rec_cut - 0.18 out_file14.write("### CLUSTERING RESULTS ITERATION 14 REC CUTOFF: " + str(rec_cut) + " ###" + '\n') if x_counter == "15": rec_cut = rec_cut - 0.19 out_file14.write("### CLUSTERING RESULTS ITERATION 15 REC CUTOFF: " + str(rec_cut) + " ###" + '\n') if x_counter == "16": rec_cut = rec_cut - 0.20 out_file14.write("### CLUSTERING RESULTS ITERATION 16 REC CUTOFF: " + str(rec_cut) + " ###" + '\n') ######################################## ### CLUSTERING ### ######################################## # print id_list print "------------------------------" print "FOUND " + `len(id_list)` + " UNIQ IDs" print "------------------------------" r = 0 ### CREATE NON-REDUNDANT MATRIX ### for key in pairs_array: id_a = pairs_array[key][0] id_b = pairs_array[key][1] try: cur_rec1 = float(matrix_array_current[id_a,id_b][0]) cur_bit1 = int(matrix_array_current[id_a,id_b][1]) cur_dat1 = float(matrix_array_current[id_a,id_b][2]) query1 = 1 print key except: # print "ALREADY PROCESSED" sys.stdout.write(".") query1 = 0 try: cur_rec2 = float(matrix_array_current[id_b,id_a][0]) cur_bit2 = int(matrix_array_current[id_b,id_a][1]) cur_dat2 = float(matrix_array_current[id_b,id_a][2]) r = r + 1 # print `r` + " REVERSE PAIR FOUND" query2 = 1 except: # print "NO REVERSE PAIR FOUND" query2 = 0 if query1 == 1 and query2 == 0 and cur_rec1 <= rec_cut and cur_bit1 >= bit_cut and \ cur_dat1 >= dat_cut and id_a != id_b: ### STRING CONVERSION ### cur_rec1_str = str(round(cur_rec1,2)) cur_dat1_str = str(round(cur_dat1,2)) ######################### out_file10.write(id_a + '\t' + id_b + '\t' + cur_rec1_str + '\t' + \ `cur_bit1` + '\t' + cur_dat1_str + '\n') pair_matrix_count = pair_matrix_count + 1 current_matrix_pair = [id_a, id_b] pair_matrix_array[id_a,id_b] = current_matrix_pair ### NEED TO UNSET ARRAY try: del matrix_array_current[id_a,id_b] except: # print "ALREADY REMOVED" sys.stdout.write(".") if query1 == 1 and query2 == 1 and id_a != id_b: if cur_rec1 <= cur_rec2: # print "CASE 1" if cur_dat1 >= dat_cut and cur_bit1 >= bit_cut and cur_rec1 <= rec_cut: ### STRING CONVERSION ### cur_rec1_str = str(round(cur_rec1,2)) cur_dat1_str = str(round(cur_dat1,2)) ######################### out_file10.write(id_a + '\t' + id_b + '\t' + cur_rec1_str + '\t' + \ `cur_bit1` + '\t' + cur_dat1_str + '\n') pair_matrix_count = pair_matrix_count + 1 current_matrix_pair = [id_a, id_b] pair_matrix_array[id_a,id_b] = current_matrix_pair ### NEED TO UNSET ARRAY try: del matrix_array_current[id_a,id_b] del matrix_array_current[id_b,id_a] except: # print "ALREADY REMOVED" sys.stdout.write(".") if cur_rec1 > cur_rec2: # print "CASE 2" if cur_dat2 >= dat_cut and cur_bit2 >= bit_cut and cur_rec2 <= rec_cut: ### STRING CONVERSION ### cur_rec2_str = str(round(cur_rec2,2)) cur_dat2_str = str(round(cur_dat2,2)) ######################### out_file10.write(id_b + '\t' + id_a + '\t' + cur_rec2_str + '\t' + \ `cur_bit2` + '\t' + cur_dat2_str + '\n') pair_matrix_count = pair_matrix_count + 1 current_matrix_pair = [id_b, id_a] pair_matrix_array[id_b,id_a] = current_matrix_pair ### NEED TO UNSET ARRAY try: del matrix_array_current[id_a,id_b] del matrix_array_current[id_b,id_a] except: # print "ALREADY REMOVED" sys.stdout.write(".") print "-------------------------------------" print `pair_matrix_count` + " PAIRS IN REDUNDANT MATRIX" print "-------------------------------------" print "BEGIN CLUSTERING" time.sleep(2) item_count = 0 id_list.sort() ### CREATE ADJACENCY LIST ### for item in id_list: item_count = item_count + 1 print `item_count` + '\t' + item item_list = [item] for key in pair_matrix_array: id_a = pair_matrix_array[key][0] id_b = pair_matrix_array[key][1] if id_a == item: item_list.append(id_b) if id_b == item: item_list.append(id_a) adj_array[item] = item_list item_string = " ".join(item_list) out_file12.write(item_string + '\n') print "GROUP ANALYSIS" time.sleep(2) ### GROUP ANALYSIS ### node_count = 0 for item in id_list: # if item not in already_done: try: node_test = node_array[item] except: node_array[item] = 1 group_count = group_count + 1 already_done.append(item) node_count = node_count + 1 working_group = [item] current_adj_list = adj_array[item] current_adj_len = len(current_adj_list) q = 0 while q <= (current_adj_len - 1): current_adj_item = current_adj_list[q] # print `q` + '\t' + current_adj_item if current_adj_item in already_done: go_to_dfs = 0 # if current_adj_item not in already_done: try: node_test = node_array[current_adj_item] except: node_array[current_adj_item] = 1 already_done.append(current_adj_item) node_count = node_count + 1 if current_adj_item not in working_group: working_group.append(current_adj_item) go_to_dfs = 1 dfs_counter = 0 # print 'Processing Group: ' + `group_count` DFS_procedure(current_adj_item) q = q + 1 # if item not in already_done: # already_done.append(item) working_group.sort() # print working_group print 'Processing Group: ' + `group_count` print 'Number of processed nodes: ' + `node_count` i = 0 for element in working_group: current_adj_list1 = adj_array[element] current_adj_len1 = len(current_adj_list1) current_group_len1 = len(working_group) ### TRY FRAME MARKER ID ### if print_frame == "TRUE": try: frame_marker = frame_array[element] frame_marker = ' ' + frame_marker + '_LinkageGroup ' except: frame_marker = " __LinkageGroup " if print_frame == "FALSE": frame_marker = "" ########################### # tree_clust_array[element].append(str(group_count)) tree_clust_array[element].append(group_count) print tree_clust_array[element] ########################### if i == 0: out_file13.write(element + '\t' + `(current_adj_len1 - 1)` + '\t' \ + `current_group_len1` + '\t' + `group_count` + '\t' + '*****' \ + '\t' + frame_marker + '\n') out_file14.write(element + '\t' + `(current_adj_len1 - 1)` + '\t' \ + `current_group_len1` + '\t' + `group_count` + '\t' + '*****' \ + '\t' + frame_marker + '\t' + "*" + x_counter + "*" + '\n') if i != 0: out_file13.write(element + '\t' + `(current_adj_len1 - 1)` + '\t' \ + `current_group_len1` + '\t' + `group_count` + '\t' + '-----' \ + '\t' + frame_marker + '\n') out_file14.write(element + '\t' + `(current_adj_len1 - 1)` + '\t' \ + `current_group_len1` + '\t' + `group_count` + '\t' + '-----' \ + '\t' + frame_marker + '\t' + "*" + x_counter + "*" + '\n') i = i + 1 ########################### print '======================' # print already_done print len(already_done) print `group_count` + '\t' + 'GROUPS FOUND' print '======================' out_file10.close() out_file12.close() out_file13.close() #### NEGATIVE CLUSTERING #### def Seqs_Neg_Clustering(rec_cut, bit_cut, dat_cut, x_counter, matrix_array_current): global id_list global id_array global adj_array global already_done global group_count global dfs_counter global working_group global group_depth global node_count global node_array global sequence_array global pairs_array_N global frame_array global print_frame node_array = {} adj_array = {} already_done = [] group_count = 0 pair_matrix_count = 0 pair_matrix_array = {} print "" print "ALREADY DONE" print already_done print "" print "" print "MATRIX ARRAY" print len(matrix_array_current) print "" time.sleep(2) if x_counter == "N1": rec_cut = 1.0 - rec_cut - 0.1 bit_cut = -1*bit_cut if x_counter == "N2": rec_cut = 1.0 - rec_cut bit_cut = -1*bit_cut if x_counter == "N3": rec_cut = 1.0 - rec_cut + 0.1 bit_cut = -1*bit_cut ######################################## ### CLUSTERING ### ######################################## out_file10 = open(out_name + '.matrix' + "_" + x_counter, "wb") out_file12 = open(out_name + '.adj_list' + "_" + x_counter, "wb") out_file13 = open(out_name + '.group_info' + "_" + x_counter, "wb") # print id_list print "------------------------------" print "FOUND " + `len(id_list)` + " UNIQ IDs" print "------------------------------" r = 0 ### CREATE NON-REDUNDANT MATRIX ### for key in pairs_array_N: id_a = pairs_array_N[key][0] id_b = pairs_array_N[key][1] try: cur_rec1 = float(matrix_array_current[id_a,id_b][0]) cur_bit1 = int(matrix_array_current[id_a,id_b][1]) cur_dat1 = float(matrix_array_current[id_a,id_b][2]) query1 = 1 print key except: # print "ALREADY PROCESSED" sys.stdout.write(".") query1 = 0 try: cur_rec2 = float(matrix_array_current[id_b,id_a][0]) cur_bit2 = int(matrix_array_current[id_b,id_a][1]) cur_dat2 = float(matrix_array_current[id_b,id_a][2]) r = r + 1 # print `r` + " REVERSE PAIR FOUND" query2 = 1 except: # print "NO REVERSE PAIR FOUND" query2 = 0 if query1 == 1 and query2 == 0 and cur_rec1 >= rec_cut and cur_bit1 <= bit_cut and \ cur_dat1 >= dat_cut and id_a != id_b: ### STRING CONVERSION ### cur_rec1_str = str(round(cur_rec1,2)) cur_dat1_str = str(round(cur_dat1,2)) ######################### out_file10.write(id_a + '\t' + id_b + '\t' + cur_rec1_str + '\t' + \ `cur_bit1` + '\t' + cur_dat1_str + '\n') pair_matrix_count = pair_matrix_count + 1 current_matrix_pair = [id_a, id_b] pair_matrix_array[id_a,id_b] = current_matrix_pair ### NEED TO UNSET ARRAY try: del matrix_array_current[id_a,id_b] except: # print "ALREADY REMOVED" sys.stdout.write(".") if query1 == 1 and query2 == 1 and id_a != id_b: if cur_rec1 >= cur_rec2: # print "CASE 1" if cur_dat1 >= dat_cut and cur_bit1 <= bit_cut and cur_rec1 >= rec_cut: ### STRING CONVERSION ### cur_rec1_str = str(round(cur_rec1,2)) cur_dat1_str = str(round(cur_dat1,2)) ######################### out_file10.write(id_a + '\t' + id_b + '\t' + cur_rec1_str + '\t' + \ `cur_bit1` + '\t' + cur_dat1_str + '\n') pair_matrix_count = pair_matrix_count + 1 current_matrix_pair = [id_a, id_b] pair_matrix_array[id_a,id_b] = current_matrix_pair ### NEED TO UNSET ARRAY try: del matrix_array_current[id_a,id_b] del matrix_array_current[id_b,id_a] except: # print "ALREADY REMOVED" sys.stdout.write(".") if cur_rec1 < cur_rec2: # print "CASE 2" if cur_dat2 >= dat_cut and cur_bit2 <= bit_cut and cur_rec2 >= rec_cut: ### STRING CONVERSION ### cur_rec2_str = str(round(cur_rec2,2)) cur_dat2_str = str(round(cur_dat2,2)) ######################### out_file10.write(id_b + '\t' + id_a + '\t' + cur_rec2_str + '\t' + \ `cur_bit2` + '\t' + cur_dat2_str + '\n') pair_matrix_count = pair_matrix_count + 1 current_matrix_pair = [id_b, id_a] pair_matrix_array[id_b,id_a] = current_matrix_pair ### NEED TO UNSET ARRAY try: del matrix_array_current[id_a,id_b] del matrix_array_current[id_b,id_a] except: # print "ALREADY REMOVED" sys.stdout.write(".") print "-------------------------------------" print `pair_matrix_count` + " PAIRS IN REDUNDANT MATRIX" print "-------------------------------------" print "BEGIN CLUSTERING" time.sleep(2) item_count = 0 id_list.sort() ### CREATE ADJACENCY LIST ### for item in id_list: item_count = item_count + 1 print `item_count` + '\t' + item item_list = [item] for key in pair_matrix_array: id_a = pair_matrix_array[key][0] id_b = pair_matrix_array[key][1] if id_a == item: item_list.append(id_b) if id_b == item: item_list.append(id_a) adj_array[item] = item_list item_string = " ".join(item_list) out_file12.write(item_string + '\n') print "GROUP ANALYSIS" time.sleep(2) ### GROUP ANALYSIS ### node_count = 0 for item in id_list: # if item not in already_done: try: node_test = node_array[item] except: node_array[item] = 1 group_count = group_count + 1 already_done.append(item) node_count = node_count + 1 working_group = [item] current_adj_list = adj_array[item] current_adj_len = len(current_adj_list) q = 0 while q <= (current_adj_len - 1): current_adj_item = current_adj_list[q] # print `q` + '\t' + current_adj_item if current_adj_item in already_done: go_to_dfs = 0 # if current_adj_item not in already_done: try: node_test = node_array[current_adj_item] except: node_array[current_adj_item] = 1 already_done.append(current_adj_item) node_count = node_count + 1 if current_adj_item not in working_group: working_group.append(current_adj_item) go_to_dfs = 1 dfs_counter = 0 # print 'Processing Group: ' + `group_count` DFS_procedure(current_adj_item) q = q + 1 # if item not in already_done: # already_done.append(item) working_group.sort() # print working_group print 'Processing Group: ' + `group_count` print 'Number of processed nodes: ' + `node_count` i = 0 for element in working_group: current_adj_list1 = adj_array[element] current_adj_len1 = len(current_adj_list1) current_group_len1 = len(working_group) ### TRY FRAME MARKER ID ### if print_frame == "TRUE": try: frame_marker = frame_array[element] frame_marker = ' ' + frame_marker + '_LinkageGroup ' except: frame_marker = " __LinkageGroup " if print_frame == "FALSE": frame_marker = "" ########################### if i == 0: out_file13.write(element + '\t' + `(current_adj_len1 - 1)` + '\t' \ + `current_group_len1` + '\t' + `group_count` + '\t' + '*****' \ + '\t' + frame_marker + '\n') if i != 0: out_file13.write(element + '\t' + `(current_adj_len1 - 1)` + '\t' \ + `current_group_len1` + '\t' + `group_count` + '\t' + '-----' \ + '\t' + frame_marker + '\n') i = i + 1 ########################### print '======================' # print already_done print len(already_done) print `group_count` + '\t' + 'GROUPS FOUND' print '======================' out_file10.close() out_file12.close() out_file13.close() ############################# def DFS_procedure(current_adj_item): global adj_array global already_done global group_count global dfs_counter global working_group global group_depth global node_count global node_array print "DFS" + '\t' + `dfs_counter` for key in adj_array: current_adj_list = adj_array[key] current_adj_len = len(current_adj_list) if current_adj_item in current_adj_list: q = 0 while q <= (current_adj_len - 1): current_adj_item1 = current_adj_list[q] # print `q` + '\t' + current_adj_item1 if current_adj_item1 in already_done: go_to_dfs = 0 # if current_adj_item1 not in already_done: try: node_test = node_array[current_adj_item1] except: node_array[current_adj_item1] = 1 already_done.append(current_adj_item1) node_count = node_count + 1 if current_adj_item1 not in working_group: working_group.append(current_adj_item1) go_to_dfs = 1 DFS_procedure(current_adj_item1) dfs_counter = dfs_counter + 1 print "DFS" + '\t' + `dfs_counter` group_depth = dfs_counter q = q + 1 ###################################################################### import math import re import sys import string import time if __name__ == "__main__": if len(sys.argv) <= 6 or len(sys.argv) > 7: print "" print "Program usage: " print "input_file[REC DATA] output_file[NAME] rec_cut[0.2] bit_cut[100] dat_cut[0.25] group_file_name[OPTIONAL]" print "if group file does not exist just enter [X]" print "" sys.exit() if len(sys.argv) == 7: in_name = sys.argv[1] out_name = sys.argv[2] rec_cut = sys.argv[3] bit_cut = sys.argv[4] dat_cut = sys.argv[5] frame_file_name = sys.argv[6] rec_cut = float(rec_cut) bit_cut = float(bit_cut) dat_cut = float(dat_cut) if rec_cut > 0.25 or rec_cut < 0.2: print "======================================================" print " Recombination cutoff must be between 0.25 and 0.20 " print "======================================================" sys.exit() if dat_cut > 1.00 or dat_cut < 0.1: print "======================================================" print " DataPoints cutoff must be between 1.00 and 0.10 " print "======================================================" sys.exit() if bit_cut > 1000 or bit_cut < 60: print "======================================================" print " BIT Score cutoff must be between 1000 and 60 " print "======================================================" sys.exit() sys.setrecursionlimit(10000) Define_Bit_Scores() Read_Data_File(in_name, out_name, rec_cut, bit_cut, dat_cut, frame_file_name) #### THE END ####