def LGM1_verification(input_file, output_file, new_update, table_name): ################################## #file check variables column_length_counter = 0 check_line_length_list = [] check_blank_entry_list = [] check_column_label_flag = False check_column_labels = [] check_column_label_index = [] missing_column_label = [] #post file check variables matrix = {} rows = 0 columns = 11 repeat_marker_counter=0 failure_counter = 0 failure_list = [] warning_list = [] warning_counter = 0 ################################## try: check_file = open(input_file) while True: check_line = check_file.readline() if check_line == '': break if '\n' in check_line: check_line = check_line[:-1] if '\r' in check_line: check_line = check_line[:-1] check_line = check_line.split('\t') check_line_length_list.append(len(check_line)) for i in range (0,len(check_line)): if (check_line[i] == ''): check_blank_entry = column_length_counter +1, i+1 check_blank_entry_list.append(check_blank_entry) if (column_length_counter == 0): check_column_labels = check_line for i in range (0, len(check_line) ): if (check_line[i] in lgm_dictionary_005.column_label_lgm1): continue else: check_column_label_index.append(i) check_column_label_flag = True column_length_counter = column_length_counter + 1 except: print "DID NOT FIND FILE!: " + input_file return check_file.close() #Blank entries were found if (len(check_blank_entry_list) != 0): print "\nBlank entries were found at the following locations\n\n" for i in range (0, len(check_blank_entry_list)): print "Row", repr(check_blank_entry_list[i][0]), ", Column" , lgm_dictionary_005.excel_column_label_dictionary[check_blank_entry_list[i][1]] return #the following code checks that all lines in the file have an equal length. #Matrices handed to the script must be rectangular for i in range (0, len(check_line_length_list)): for j in range (i+1 , len(check_line_length_list)): if (check_line_length_list[i] != check_line_length_list[j]): print "\nThe matrix is not rectangular check the list of lines for discrepancies\n\n" for k in range (0, len(check_line_length_list)): print "line", k+1, "has", check_line_length_list[k], "entries" print "All lines must have the same number of entries" return #The column labels do not match the defined column labels in column_label_lgm1 if (check_column_label_flag == True): print "\nThe following column labels are incorrect\n\n" for i in check_column_label_index: print "Column", lgm_dictionary_005.excel_column_label_dictionary[i+1], "The label is", check_column_labels[i], "it should be", lgm_dictionary_005.column_label_lgm1[i], '\n' return #Checks that all columns are in the file for i in lgm_dictionary_005.column_label_lgm1: if i in check_column_labels: continue else: check_column_label_flag = True missing_column_label.append(i) if (check_column_label_flag == True): print "\nThe following columns are missing or they are not in the correct order.\n\n" for i in range (0,len(missing_column_label)): print missing_column_label[i] return #After file is checked for serious errors #The file is read line by line into a 2d matrix try: text_file = open(input_file) while True: line = text_file.readline() if line == '': break if '' in line: line = line[:-1] if '\r' in line: line = line[:-1] line = line.split('\t') #removes all whitespace characters from the front and rear of the entry for i in range (0, len(line)): line[i].lstrip() line[i].rstrip() if (line[0] != '-'): for i in range (0,len(line)): matrix[rows,i] = line[i] else: failure_string = (rows + 1) , "Row " + repr(rows + 1) + " was ignored because it had no marker_id" failure_list.append(failure_string) failure_counter = failure_counter + 1 for j in range(0,columns): matrix[rows, j] = "ignored_row" rows = rows + 1 text_file.close() except: print "DID NOT FIND FILE: " + input_file return while (repeat_marker_counter != rows): for i in range (repeat_marker_counter + 1, rows): if (matrix[i,0] == "ignored_row"): continue if (matrix[repeat_marker_counter,0] == matrix [i,0]): failure_string = (repeat_marker_counter+ 1) , "Row " + repr(repeat_marker_counter+ 1) + ", marker_id " + repr(matrix[repeat_marker_counter,0]) + " has been repeated in row " + repr(i+1) failure_list.append(failure_string) failure_counter = failure_counter + 1 repeat_marker_counter= repeat_marker_counter + 1 #marker_type for i in range(1, rows): if (matrix[i,0] == "ignored_row"): continue if (matrix[i,2] in lgm_dictionary_005.marker_type): continue else: failure_string = (i+1) , "Row " + repr(i+1) + ", marker_id " + matrix[i,0] + " has a marker_type which is not recognized." failure_list.append(failure_string) failure_counter = failure_counter + 1 #marker_origin for i in range(1, rows): if (matrix[i,0] == "ignored_row"): continue if matrix[i,3]in lgm_dictionary_005.marker_origin: continue else: failure_string = (i+1) , "Row " + repr(i+1) + ", marker_id " + matrix[i,0] + " has a marker_origin which is not recognized." failure_list.append(failure_string) failure_counter = failure_counter + 1 #Later should log in and check that est_id is valid #marker_origin, marker_type, est_id for i in range(1, rows): if (matrix[i,0] == "ignored_row"): continue if matrix[i,3] in lgm_dictionary_005.marker_origin: if (matrix[i,3] == lgm_dictionary_005.marker_origin[0]):# marker_origin = CGPDB if (matrix[i,2] == lgm_dictionary_005.marker_type[0]):#marker_type = EST if (matrix[i,1] == '-'): failure_string = (i+1) , "Row " + repr(i+1) + ", marker_id " + matrix[i,0] + " requires an est_id."#works assuming all fields at least have a - failure_list.append(failure_string) failure_counter = failure_counter + 1 #COS for i in range(1, rows): if (matrix[i,0] == "ignored_row"): continue if matrix[i,4]in lgm_dictionary_005.cos: continue else: failure_string = (i+1) , "Row " + repr(i+1) + ", marker_id " + matrix[i,0] + " has a cos label which is not recognized." failure_list.append(failure_string) failure_counter = failure_counter + 1 #Class for i in range(1, rows): if (matrix[i,0] == "ignored_row"): continue if matrix[i,5] in lgm_dictionary_005._class: continue else: failure_string = (i+1) , "Row " + repr(i+1) + ", marker_id " + matrix[i,0] + " has a class which is not recognized." failure_list.append(failure_string) failure_counter = failure_counter + 1 #Annotations for i in range (1, rows): if (matrix[i,0] == "ignored_row"): continue annotation_string = matrix[i,6] if (annotation_string.count(' ') > 11): failure_string = (i+1) , "Row " + repr(i+1) + ", marker_id " + matrix[i,0] + " has an annotation which is too long. Only 12 words are allowed." failure_list.append(failure_string) failure_counter = failure_counter + 1 #should login and check status #Status for i in range (1, rows): if (matrix[i,0] == "ignored_row"): continue if matrix[i,7] in lgm_dictionary_005.status: continue else: failure_string = (i+1) , "Row " + repr(i+1) + ", marker_id " + matrix[i,0] + " has a status which is not recognized." failure_list.append(failure_string) failure_counter = failure_counter + 1 #Sender for i in range (1, rows): if (matrix[i,0] == "ignored_row"): continue if matrix[i,8] in lgm_dictionary_005.sender: continue else: failure_string = (i+1) , "Row " + repr(i+1) + ", marker_id " + matrix[i,0] + " has a sender which is not recognized." failure_list.append(failure_string) failure_counter = failure_counter + 1 #Date for i in range (1,rows): if (matrix[i,0] == "ignored_row"): continue date = matrix[i,9] for j in range (0, len(date)): if date[j].isdigit(): continue elif date[j] == '/': continue elif date[j] == '\\': matrix[i,9] = string.replace(date,'\\','/') warning_string = (i+1) , "Row " + repr(i+1) + " Date used backslashes, use foward slashes in your dates!" warning_list.append(warning_string) warning_counter = warning_counter + 1 break else: failure_string = (i+1) , "Row " + repr(i+1) + ", marker_id " + matrix[i,0] + " date is not in the correct format, date should be mm/dd/yyyy." failure_list.append(failure_string) failure_counter = failure_counter + 1 break ### REFORMAT DUMMY DATE for i in range (1,rows): if (matrix[i,0] == "ignored_row"): continue date = matrix[i,9] ### REFORMAT INTO MYSQL date = date.split('/') mm = date[0] dd = date[1] yy = date[2] ############ if len(mm) == 1: mm = "0" + mm if len(dd) == 1: dd = "0" + dd matrix[i,9] = yy + "-" + mm + "-" + dd print matrix[i,9] #Public/Private for i in range (1,rows): if (matrix[i,0] == "ignored_row"): continue if matrix[i,10] in lgm_dictionary_005.public_private: continue elif matrix[i,10] == '-': matrix[i,10] = "public" warning_string = (i+1) , "Row " + repr(i+1) + " public_private entry '-' was changed to public, enter public or private!" warning_list.append(warning_string) warning_counter = warning_counter + 1 else: failure_string = (i+1) , "Row " + repr(i+1) + ", marker_id " + matrix[i,0] + " public_private label is not recognized check row " + repr(i) + '' failure_list.append(failure_string) failure_counter = failure_counter + 1 if (os.name == 'nt'): _newline = '\r\n' elif (os.name == 'mac'): _newline = '\r' else: _newline = '\n' #Stops file output and outputs failure file if (failure_counter != 0): failure_file = open(output_file + ".failure" , "wbU") failure_list.sort() for i in range (0,len(failure_list)): failure_file.write(failure_list[i][1]) failure_file.write(_newline) print "\nOutput file was not created, check the failure file." failure_file.close() return #Output warning file if (warning_counter !=0): warning_file = open(output_file + ".warning" , "wbU") warning_list.sort() for j in range (0, len(warning_list)): warning_file.write(warning_list[j][1]) warning_file.write(_newline) print "\nWarning file was created, please check for data entry mistakes." warning_file.close() #Create output file file_output = open(output_file + ".clean", "wbU") for i in range (0,rows): if (matrix[i,0] == "ignored_row"): continue for j in range (0,columns): file_output.write(matrix[i,j]) file_output.write('\t') file_output.write(_newline) file_output.close() if new_update == "UPDATE": file_mysql = open(output_file + ".update.table", "wbU") file_mysql.write("table_name" + '\t' + "marker_id" + '\t' + "field" + '\t' + "new_value" + '\n') for i in range (0,rows): for j in range (0,columns): ### WRITE DATA TO FILE ### ### i AND j > 0 TO SKIP REPLACEMENT OF MARKER_ID if matrix[i,j] != "X" and i > 0 and j > 0: file_mysql.write(table_name + '\t' + matrix[i,0] + '\t' + matrix[0,j] + '\t' + matrix[i,j] + '\n') file_mysql.close() print "\nprogram exited succesfully" import array import string import os import sys import string import lgm_dictionary_005 if __name__ == "__main__": if len(sys.argv) <= 4 or len(sys.argv) > 5: print "Program usage: " print "Input_file, Output_file, NEW/UPDATE, Table_Name" exit else: input_file = sys.argv[1] output_file = sys.argv[2] new_update = sys.argv[3] table_name = sys.argv[4] LGM1_verification(input_file, output_file, new_update, table_name)