"""Convert the ``aaindex2`` and ``aaindex3`` flat files into pickled dicts.

Each input file is a sequence of records terminated by a ``//`` line.  Every
record is turned into a nested ``{row: {column: value}}`` dictionary keyed by
the one-character labels found on its ``M rows = ..., cols = ...`` line, and
all records of a file are stored under the last token of their ``H`` line.
"""
from collections import defaultdict
import os
import pickle

DATA_FOLDER = "/home/chmrodrigues/Documents/ppi2/reverse_mutations/data/aaindex"


def _read_space_joined(path):
    """Return the lines of *path* joined by a single space.

    The join prefixes every line except the first with one blank; the
    matrix-layout lookup in ``_parse_record`` matches on that prefix.
    """
    with open(path, 'r') as handle:
        return ' '.join(handle.readlines())


def _parse_record(record):
    """Parse one record and return ``(name, matrix)``.

    ``name`` is the last whitespace-separated token of the first line that
    starts with ``"H "`` once stripped.  ``matrix`` maps every row label to a
    dict of column labels; cells default to ``None``, hold a ``float`` when
    the token is numeric and the raw token string otherwise.

    Raises:
        ValueError: if the record has no ``H`` line or no usable
            ``M rows =`` line.
    """
    lines = record.split('\n')

    name_lines = [item for item in lines if item.strip().startswith("H ")]
    if not name_lines:
        raise ValueError("record has no 'H' line: {0!r}".format(record[:60]))
    name = name_lines[0].split()[-1]

    layout_index = next(
        (pos for pos, item in enumerate(lines)
         if item.startswith(" M rows =")),
        None,
    )
    if layout_index is None:
        raise ValueError("record {0} has no 'M rows =' line".format(name))

    layout = lines[layout_index].split()
    if len(layout) < 4:
        raise ValueError(
            "record {0} has a malformed 'M rows =' line".format(name))
    # Fourth token is the row labels followed by a comma; the last token is
    # the column labels.
    rows = layout[3].replace(",", "")
    columns = layout[-1]

    matrix = {row: {col: None for col in columns} for row in rows}

    # zip() stops at the shorter input, so data lines beyond the declared
    # rows and tokens beyond the declared columns are ignored, and short
    # lines (e.g. triangular matrices) leave the remaining cells as None.
    for row, data_line in zip(rows, lines[layout_index + 1:]):
        for col, token in zip(columns, data_line.split()):
            try:
                matrix[row][col] = float(token)
            except ValueError:
                # Non-numeric placeholders are kept verbatim.
                matrix[row][col] = token

    return name, matrix


def _parse_matrices(text):
    """Return ``{name: matrix}`` for every record in *text*.

    Records are delimited by ``'//\\n'``.  Chunks that contain only
    whitespace (such as what follows the final delimiter) are skipped.
    """
    matrices = dict()
    for record in text.split('//\n'):
        if not record.strip():
            continue
        name, matrix = _parse_record(record)
        matrices[name] = matrix
    return matrices


def main(data_folder=None, output_folder=None):
    """Parse ``aaindex2`` and ``aaindex3`` and pickle the resulting matrices.

    Args:
        data_folder: directory containing the ``aaindex2`` and ``aaindex3``
            files.  Defaults to ``DATA_FOLDER``.
        output_folder: directory that receives ``index2.p`` and ``index3.p``.
            Defaults to the current working directory.

    Returns:
        ``True`` once both pickle files have been written.

    Raises:
        ValueError: if a record lacks its ``H`` or ``M rows =`` line.

    The number of matrices parsed from each file is printed to stdout.
    """
    if data_folder is None:
        data_folder = DATA_FOLDER

    jobs = (("aaindex2", 'index2.p'), ("aaindex3", 'index3.p'))

    # Read both inputs up front so a missing file aborts before any output
    # is written.
    texts = [
        _read_space_joined(os.path.join(data_folder, source_name))
        for source_name, _ in jobs
    ]

    for (_, pickle_name), text in zip(jobs, texts):
        all_matrices = _parse_matrices(text)
        print(len(all_matrices))

        if output_folder is None:
            target = pickle_name
        else:
            target = os.path.join(output_folder, pickle_name)
        with open(target, 'wb') as handle:
            pickle.dump(all_matrices, handle, protocol=2)

    return True


if __name__ == "__main__":
    main()