def LGM3_verification(input_file, output_file, new_update, table_name): #### If file has sender column with dashes change the value of the variable handicapped_sender_values from False to True #### #### If file has different entries in the test categories change the value of the variable handicapped_test_values from False to True #### #### If file has missing cross extension in image file name change the value of the variable handicapped_image_values from false to True #### handicapped_test_values = False #True handicapped_sender_values = False #True # handicapped_image_values = False #True handicapped_image_values = True ########################################### #file check variables column_length_counter = 0 check_line_length_list = [] check_blank_entry_list = [] check_column_label_flag = False check_column_labels = [] check_column_label_index = [] missing_column_label = [] #post file check variables matrix = {} rows = 0 columns = 17 repeat_marker_counter=0 failure_counter = 0 failure_list = [] warning_list = [] warning_counter = 0 #handicapped variables handicapped_sender = ["LKM", "SSM", "DLM", "AKM", "RWM", '-'] ############################################ #the file is checked for critical errors, if they are found the script halts try: check_file = open(input_file) while True: check_line = check_file.readline() if check_line == '': break if '\n' in check_line: check_line = check_line[:-1] if '\r' in check_line: check_line = check_line[:-1] check_line = check_line.split('\t') check_line_length_list.append(len(check_line)) for i in range (0,len(check_line)): if (check_line[i] == ''): check_blank_entry = column_length_counter +1, i+1 check_blank_entry_list.append(check_blank_entry) if (column_length_counter == 0): check_column_labels = check_line for i in range (0, len(check_line) ): if (check_line[i] in lgm_dictionary_005.column_label_lgm3): continue else: check_column_label_index.append(i) check_column_label_flag = True column_length_counter = column_length_counter + 1 except: print "DID NOT FIND FILE!: " + input_file return check_file.close() #Blank entries were found if (len(check_blank_entry_list) != 0): print "\nBlank entries were found at the following locations\n\n" for i in range (0, len(check_blank_entry_list)): print "Row", repr(check_blank_entry_list[i][0]), ", Column" , lgm_dictionary_005.excel_column_label_dictionary[check_blank_entry_list[i][1]] return #the following code checks that all lines in the file have an equal length. #Matrices handed to the script must be rectangular for i in range (0, len(check_line_length_list)): for j in range (i+1 , len(check_line_length_list)): if (check_line_length_list[i] != check_line_length_list[j]): print "\nThe matrix is not rectangular check the list of lines for discrepancies\n\n" for k in range (0, len(check_line_length_list)): print "line", k+1, "has", check_line_length_list[k], "entries" print "All lines must have the same number of entries" return #The column labels do not match the defined column labels in column_label_lgm3 if (check_column_label_flag == True): print "\nThe following column labels are incorrect\n\n" for i in check_column_label_index: print "Column", lgm_dictionary_005.excel_column_label_dictionary[i+1], "The label is", check_column_labels[i], "it should be", lgm_dictionary_005.column_label_lgm3[i], '\n' return #Checks that all columns are in the file for i in lgm_dictionary_005.column_label_lgm3: if i in check_column_labels: continue else: check_column_label_flag = True missing_column_label.append(i) if (check_column_label_flag == True): print "\nThe following columns are missing or they are not in the correct order.\n\n" for i in range (0,len(missing_column_label)): print missing_column_label[i] return #After file is checked for serious errors #The file is read line by line into a 2d matrix try: text_file = open(input_file) while True: line = text_file.readline() if line == '': break if '' in line: line = line[:-1] if '\r' in line: line = line[:-1] line = line.split('\t') #removes all whitespace characters from the front and rear of the entry for i in range (0, len(line)): line[i].lstrip() line[i].rstrip() if (line[0] != '-'): for i in range (0,len(line)): matrix[rows,i] = line[i] else: failure_string = (rows + 1) , "Row " + repr(rows + 1) + " was ignored because it had no marker_id" failure_list.append(failure_string) failure_counter = failure_counter + 1 for j in range(0,columns): matrix[rows, j] = "ignored_row" rows = rows + 1 text_file.close() except: print "DID NOT FIND FILE: " + input_file return #checks for repeated marker_id's while (repeat_marker_counter != rows): for i in range (repeat_marker_counter + 1, rows): if (matrix[i,0] == "ignored_row"): continue if (matrix[repeat_marker_counter,0] == matrix [i,0]): failure_string = (repeat_marker_counter+ 1) , "Row " + repr(repeat_marker_counter+ 1)+ ", marker_id " + repr(matrix[repeat_marker_counter,0]) + " has been repeated in row " + repr(i+1) failure_list.append(failure_string) failure_counter = failure_counter + 1 repeat_marker_counter= repeat_marker_counter + 1 #Cross for i in range(1, rows): if (matrix[i,0] == "ignored_row"): continue if (matrix[i,1] in lgm_dictionary_005.cross): continue else: failure_string = (i+1) , "Row " + repr(i+1) + ", marker_id " + matrix[i,0] + " Cross has a value which is not recognized." failure_list.append(failure_string) failure_counter = failure_counter + 1 #Amplification for i in range(1, rows): if (matrix[i,0] == "ignored_row"): continue if (matrix[i,2] in lgm_dictionary_005.amplification): continue else: failure_string = (i+1) , "Row " + repr(i+1) + ", marker_id " + matrix[i,0] + " Amplification has a value which is not recognized." failure_list.append(failure_string) failure_counter = failure_counter + 1 #band_n for i in range(1, rows): if (matrix[i,0] == "ignored_row"): continue if (matrix[i,3] in lgm_dictionary_005.band_n): continue else: failure_string = (i+1) , "Row " + repr(i+1) + ", marker_id " + matrix[i,0] + " has a band_n value which is not recognized." failure_list.append(failure_string) failure_counter = failure_counter + 1 #Test quality if (handicapped_test_values == False): #agarose for i in range(1, rows): if (matrix[i,0] == "ignored_row"): continue if (matrix[i,7] in lgm_dictionary_005.quality): continue else: failure_string = (i+1) , "Row " + repr(i+1) + ", marker_id " + matrix[i,0] + " agarose test quality entry is not recognized." failure_list.append(failure_string) failure_counter = failure_counter + 1 #sscp for i in range(1, rows): if (matrix[i,0] == "ignored_row"): continue if (matrix[i,8] in lgm_dictionary_005.quality): continue else: failure_string = (i+1) , "Row " + repr(i+1) + ", marker_id " + matrix[i,0] + " sscp test quality entry is not recognized." failure_list.append(failure_string) failure_counter = failure_counter + 1 #tgce for i in range(1, rows): if (matrix[i,0] == "ignored_row"): continue if (matrix[i,9] in lgm_dictionary_005.quality): continue else: failure_string = (i+1) , "Row " + repr(i+1) + ", marker_id " + matrix[i,0] + " tgce test quality entry is not recognized." failure_list.append(failure_string) failure_counter = failure_counter + 1 #dcap for i in range(1, rows): if (matrix[i,0] == "ignored_row"): continue if (matrix[i,10] in lgm_dictionary_005.quality): continue else: failure_string = (i+1) , "Row " + repr(i+1) + ", marker_id " + matrix[i,0] + " dcap test quality entry is not recognized." failure_list.append(failure_string) failure_counter = failure_counter + 1 #other for i in range(1, rows): if (matrix[i,0] == "ignored_row"): continue if (matrix[i,11].count(' ') != 0): failure_string = (i+1) , "Row " + repr(i+1) + ", marker_id " + matrix[i,0] + " other test entry is not recognized, it may only be one word." failure_list.append(failure_string) failure_counter = failure_counter + 1 continue #handicapped_test_values set to True else: #agarose for i in range(1, rows): if (matrix[i,0] == "ignored_row"): continue if (matrix[i,7] in lgm_dictionary_005.quality): continue else: failure_string = (i+1) , "Row " + repr(i+1) + ", marker_id " + matrix[i,0] + " agarose test quality entry is not recognized." failure_list.append(failure_string) failure_counter = failure_counter + 1 #sscp for i in range(1, rows): if (matrix[i,0] == "ignored_row"): continue if (matrix[i,8] in lgm_dictionary_005.quality): continue else: failure_string = (i+1) , "Row " + repr(i+1) + ", marker_id " + matrix[i,0] + " sscp test quality entry is not recognized." failure_list.append(failure_string) failure_counter = failure_counter + 1 #tgce for i in range(1, rows): if (matrix[i,0] == "ignored_row"): continue if (matrix[i,9] in lgm_dictionary_005.quality): continue else: warning_string = (i+1) , "Row " + repr(i+1) + " tgce test quality entry is not the standard type." warning_list.append(warning_string) warning_counter = warning_counter + 1 #dcap for i in range(1, rows): if (matrix[i,0] == "ignored_row"): continue if (matrix[i,10] in lgm_dictionary_005.quality): continue else: warning_string = (i+1) , "Row " + repr(i+1) + " dcap test quality entry is not the standard type." warning_list.append(warning_string) warning_counter = warning_counter + 1 #other for i in range(1, rows): if (matrix[i,0] == "ignored_row"): continue if (matrix[i,11].count(' ') != 0): warning_string = (i+1) , "Row " + repr(i+1) + " other test quality entry is not the standard type." warning_list.append(warning_string) warning_counter = warning_counter + 1 continue #Annotations for i in range (1, rows): if (matrix[i,0] == "ignored_row"): continue annotation_string = matrix[i,12] if (annotation_string.count(' ') > 11): failure_string = (i+1) , "Row " + repr(i+1) + ", marker_id " + matrix[i,0] + " has an annotation which is too long. Only 12 words are allowed." failure_list.append(failure_string) failure_counter = failure_counter + 1 #Image file if (handicapped_image_values == False): for i in range (1, rows): if (matrix[i,0] == "ignored_row"): continue if (matrix[i,13] != '-'): image_file = matrix[i,13] image_file_split = image_file.split('.') if (image_file_split[len(image_file_split)-1] in lgm_dictionary_005.image_file_extension): if(image_file_split[len(image_file_split)-2] == matrix[i,1]): continue else: failure_string = (i+1) , "Row " + repr(i+1) + ", marker_id " + matrix[i,0] + " image file title requires proper cross label before file extension." failure_list.append(failure_string) failure_counter = failure_counter + 1 elif (image_file_split[len(image_file_split)-1] in lgm_dictionary_005.image_file_extension_upper): for j in range (0, len(lgm_dictionary_005.image_file_extension)): if (image_file_split[len(image_file_split)-1] == lgm_dictionary_005.image_file_extension[j].upper()): image_file_split[len(image_file_split)-1] = lgm_dictionary_005.image_file_extension[j] matrix[i,13] = string.join(image_file_split, '.') warning_string = (i+1) , "Row " + repr(i+1) + " The image file extension was changed from " + image_file_split[len(image_file_split)-1].upper() + " to " + lgm_dictionary_005.image_file_extension[j] warning_list.append(warning_string) warning_counter = warning_counter + 1 if (image_file_split[len(image_file_split)-1] in lgm_dictionary_005.image_file_extension): if(image_file_split[len(image_file_split)-2] == matrix[i,1]): continue else: failure_string = (i+1) , "Row " + repr(i+1) + ", marker_id " + matrix[i,0] + " image file title is requires proper cross label before file extension." failure_list.append(failure_string) failure_counter = failure_counter + 1 else: failure_string = (i+1) , "Row " + repr(i+1) + ", marker_id " + matrix[i,0] + " image file title is not in the proper format, check the file extension." failure_list.append(failure_string) failure_counter = failure_counter + 1 else: for i in range (1, rows): if (matrix[i,0] == "ignored_row"): continue if (matrix[i,13] != '-'): image_file = matrix[i,13] image_file_split = image_file.split('.') if (image_file_split[len(image_file_split)-1] in lgm_dictionary_005.image_file_extension): if(image_file_split[len(image_file_split)-2] == matrix[i,1]): continue else: warning_string = (i+1) , "Row " + repr(i+1) + ", marker_id " + matrix[i,0] + " image file title does not have proper cross label before file extension." warning_list.append(warning_string) warning_counter = warning_counter + 1 elif (image_file_split[len(image_file_split)-1] in lgm_dictionary_005.image_file_extension_upper): for j in range (0, len(lgm_dictionary_005.image_file_extension)): if (image_file_split[len(image_file_split)-1] == lgm_dictionary_005.image_file_extension[j].upper()): image_file_split[len(image_file_split)-1] = lgm_dictionary_005.image_file_extension[j] matrix[i,13] = string.join(image_file_split, '.') warning_string = (i+1) , "Row " + repr(i+1) + " The image file extension was changed from " + image_file_split[len(image_file_split)-1].upper() + " to " + lgm_dictionary_005.image_file_extension[j] warning_list.append(warning_string) warning_counter = warning_counter + 1 if (image_file_split[len(image_file_split)-1] in lgm_dictionary_005.image_file_extension): if(image_file_split[len(image_file_split)-2] == matrix[i,1]): continue else: warning_string = (i+1) , "Row " + repr(i+1) + ", marker_id " + matrix[i,0] + " image file title does not have proper cross label before file extension." warning_list.append(warning_string) warning_counter = warning_counter + 1 else: failure_string = (i+1) , "Row " + repr(i+1) + ", marker_id " + matrix[i,0] + " image file title is not in the proper format, check the file extension." failure_list.append(failure_string) failure_counter = failure_counter + 1 #Status for i in range (1, rows): if (matrix[i,0] == "ignored_row"): continue if matrix[i,14] in lgm_dictionary_005.status: continue else: failure_string = (i+1) , "Row " + repr(i+1) + ", marker_id " + matrix[i,0] + " has a status which is not recognized." failure_list.append(failure_string) failure_counter = failure_counter + 1 #Sender if (handicapped_sender_values == False): for i in range (1, rows): if (matrix[i,0] == "ignored_row"): continue if matrix[i,15] in lgm_dictionary_005.sender: continue else: failure_string = (i+1) , "Row " + repr(i+1) + ", marker_id " + matrix[i,0] + " has a sender which is not recognized." failure_list.append(failure_string) failure_counter = failure_counter + 1 else: for i in range (1, rows): if (matrix[i,0] == "ignored_row"): continue if matrix[i,15] in handicapped_sender: if (matrix[i,15] == '-'): warning_string = (i+1) , "Row " + repr(i+1) + " sender field is empty" warning_list.append(warning_string) warning_counter = warning_counter + 1 continue else: failure_string = (i+1) , "Row " + repr(i+1) + ", marker_id " + matrix[i,0] + " has a sender which is not recognized." failure_list.append(failure_string) failure_counter = failure_counter + 1 #Date_t3 for i in range (1,rows): if (matrix[i,0] == "ignored_row"): continue date = matrix[i,16] for j in range (0, len(date)): if date[j].isdigit(): continue elif date[j] == '/': continue elif date[j] == '\\': matrix[i,16] = string.replace(date,'\\','/') warning_string = (i+1) , "Row " + repr(i+1) + " Date used backslashes, use foward slashes in your dates!" warning_list.append(warning_string) warning_counter = warning_counter + 1 break else: failure_string = (i+1) , "Row " + repr(i+1) + ", marker_id " + matrix[i,0] + " date is not in the correct format, date should be mm/dd/yyyy." failure_list.append(failure_string) failure_counter = failure_counter + 1 break ### REFORMAT DUMMY DATE for i in range (1,rows): if (matrix[i,0] == "ignored_row"): continue date = matrix[i,16] ### REFORMAT INTO MYSQL date = date.split('/') mm = date[0] dd = date[1] yy = date[2] ############ if len(mm) == 1: mm = "0" + mm if len(dd) == 1: dd = "0" + dd matrix[i,16] = yy + "-" + mm + "-" + dd print matrix[i,16] #Newline #assings _newline value based on os if (os.name == 'nt'): _newline = '\r\n' elif (os.name == 'mac'): _newline = '\r' else: _newline = '\n' #Stops file output and outputs failure file if (failure_counter != 0): failure_file = open(output_file + ".failure" , "wbU") failure_list.sort() for i in range (0,len(failure_list)): failure_file.write(failure_list[i][1]) failure_file.write(_newline) print "\nOutput file was not created, check the failure file." failure_file.close() return #Output warning file if (warning_counter !=0): warning_file = open(output_file + ".warning" , "wbU") warning_list.sort() for j in range (0, len(warning_list)): warning_file.write(warning_list[j][1]) warning_file.write(_newline) print "\nWarning file was created, please check for data entry mistakes." warning_file.close() #Create output file file_output = open(output_file + ".clean", "wbU") for i in range (0,rows): for j in range (0,columns): file_output.write(matrix[i,j]) file_output.write('\t') file_output.write(_newline) file_output.close() if new_update == "UPDATE": file_mysql = open(output_file + ".update.table", "wbU") file_mysql.write("table_name" + '\t' + "marker_id" + '\t' + "field" + '\t' + "new_value" + '\n') for i in range (0,rows): for j in range (0,columns): ### WRITE DATA TO FILE ### ### i AND j > 0 TO SKIP REPLACEMENT OF MARKER_ID if matrix[i,j] != "X" and i > 0 and j > 0: file_mysql.write(table_name + '\t' + matrix[i,0] + '\t' + matrix[0,j] + '\t' + matrix[i,j] + '\n') file_mysql.close() print "\nprogram exited succesfully" import array import string import os import sys import string import lgm_dictionary_005 if __name__ == "__main__": if len(sys.argv) <= 4 or len(sys.argv) > 5: print "Program usage: " print "Input_file, Output_file, NEW/UPDATE, Table_Name" exit else: input_file = sys.argv[1] output_file = sys.argv[2] new_update = sys.argv[3] table_name = sys.argv[4] LGM3_verification(input_file, output_file, new_update, table_name)