diff --git a/scripts/data_extraction.py b/scripts/data_extraction.py index 6fd8ca6..be9fc9a 100755 --- a/scripts/data_extraction.py +++ b/scripts/data_extraction.py @@ -71,7 +71,7 @@ arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', defau arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data') arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + + input') arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + + output') -arg_parser.add_argument('-m', '--make_dirs', help = 'Make dir for input and output', action='store_true') +arg_parser.add_argument('-m', '--make_dirs', help = 'Make dir for Data, input and output', action='store_true') arg_parser.add_argument('--debug', action ='store_true', help = 'Debug Mode') @@ -925,9 +925,9 @@ changes_dict = {} ##BROKENNNN!!!! common_muts gene_LF1['mutation'].head() -common_muts_lower = list((map(lambda x: x.lower(), common_muts))) -common_muts_lower -##BROKENNNN!!!! +#common_muts_lower = list((map(lambda x: x.lower(), common_muts))) +#common_muts_lower + for i in common_muts: #for i in common_muts_lower: #print(i) @@ -992,8 +992,7 @@ print('\n----------------------------------' ambiguous_muts_value_counts.to_csv(outfile_ambig_mut_counts, index = True) #%% FIXME: Add sanity check to make sure you can add value_count checks -#%% Resolving ambiguous muts -# Merging ambiguous muts +#%% Resolving ambiguous muts: Merging ambiguous muts with the revised annotations #================= # Merge ambig muts # with gene_LF1 @@ -1034,7 +1033,7 @@ gene_LF1['mutation_info'].value_counts() # reassign #%% PHEW! Good to go for downstream stuff -#%% Add column: Mutationinformation +#%% Add column: Mutationinformation ==> gene_LF1 # splitting mutation column to get mCSM style muts #===================================================== # Formatting df: read aa dict and pull relevant info @@ -1098,7 +1097,8 @@ for k, v in my_aa_dict.items(): #gene_LF1['position'] = gene_LF1['mutation'].str.extract(r'(\d+)') gene_LF1['position'] = gene_LF1['mutation'].str.extract(pos_regex) -mylen1 = len(gene_LF1.columns) +#mylen1 = len(gene_LF1.columns) +mylen0_v2 = len(gene_LF1.columns) # sanity checks print('checking if 3-letter wt&mut residue extraction worked correctly') @@ -1115,14 +1115,15 @@ else: , '\nmutant-type\n', mut , '\ndim of df:', gene_LF1.shape) -if mylen1 == mylen0 + ncol_mutf_add: +#if mylen1 == mylen0 + ncol_mutf_add: +if mylen0_v2 == mylen0 + ncol_mutf_add: print('PASS: successfully added', ncol_mutf_add, 'cols' , '\nold length:', mylen0 - , '\nnew len:', mylen1) + , '\nnew len:', mylen0_v2) else: print('FAIL: failed to add cols:' , '\nold length:', mylen0 - , '\nnew len:', mylen1) + , '\nnew len:', mylen0_v2) # clear variables del(k, v, wt, mut, lookup_dict) @@ -1137,7 +1138,7 @@ print('Created column: mutationinformation' , '\n=====================================================================\n' , gene_LF1.mutationinformation.head(10)) -#order by position for convenience +# order by position for convenience gene_LF1.dtypes # converting position to numeric @@ -1145,17 +1146,20 @@ gene_LF1['position'] = pd.to_numeric(gene_LF1['position']) # sort by position inplace foo = gene_LF1['position'].value_counts() -foo +foo = foo.sort_index() + gene_LF1.sort_values(by = ['position'], inplace = True) bar = gene_LF1['position'].value_counts() +bar = bar.sort_index() + +if all(foo == bar): + print('PASS: df ordered by position') + print(gene_LF1['position'].head()) +else: + sys.exit('FAIL: df could not be ordered. Check source') + +print('\nDim of gene_LF1:', len(gene_LF1.columns), 'more cols:\n') -# FIXME:Can only compare identically-labeled Series objects -#if (foo == bar).all(): -# print('PASS: df ordered by position') -# print(gene_LF1['position'].head()) -#else: -# print('FAIL: df could not be ordered. Check source') -# sys.exit() #%% Create a copy of mutationinformation column for downstream mergeing gene_LF1['Mut'] = gene_LF1['mutationinformation'] gene_LF1['Mut_copy'] = gene_LF1['mutationinformation'] @@ -1185,8 +1189,12 @@ gene_LF1['pos_count'] = gene_LF1['position'].map(z1) #test_df2 = test_df.loc[test_df['position'] == 10] ############################################################################### +cols_added = ['Mut', 'Mut_copy', 'index', 'index_copy', 'pos_count', 'snp_frequency'] +print('\nAdded', len(cols_added), 'more cols:\n' + , '\nDim of new gene_LF1:', len(gene_LF1.columns)) +mylen1 = len(gene_LF1.columns) # updated my_len1 ############################################################################### -#%% Add column: aa property_water +#%% Add column: aa property_water ==> gene_LF1 #========= # iterate through the dict, create a lookup dict that i.e # lookup_dict = {three_letter_code: aa_prop_water} @@ -1235,7 +1243,7 @@ else: # clear variables del(k, v, wt, mut, lookup_dict) -#%% Add column: aa_prop_polarity +#%% Add column: aa_prop_polarity ==> gene_LF1 #======== # iterate through the dict, create a lookup dict that i.e # lookup_dict = {three_letter_code: aa_prop_polarity} @@ -1283,7 +1291,7 @@ else: # clear variables del(k, v, wt, mut, lookup_dict) -#%% Add column: aa_calcprop +#%% Add column: aa_calcprop ==> gene_LF1 #======== # iterate through the dict, create a lookup dict that i.e # lookup_dict = {three_letter_code: aa_calcprop} @@ -1923,12 +1931,12 @@ print('Finished writing:', outfile_pos , '\nNo. of rows:', len(pos_only_sorted) , '\nNo. of cols:', len(pos_only_sorted.columns) , '\n=============================================================' - , '\n\n\n') + , '\n') del(out_filename_pos) ############################################################################### #%% Quick summary output -print('============================================' +print('\n============================================' , '\nQuick summary output for', drug, 'and' , gene.lower() , '\n============================================' , '\nTotal samples:', total_samples @@ -1942,18 +1950,17 @@ print('============================================' , '\nPercentage of Sus and Res [Revised]', drug, 'samples:\n', gene_LF4['dst_mode'].value_counts(normalize = True)*100 , '\n' - , '\nTotal no. of unique snps:', len(snps_only) - , '\nTotal no. of unique snps:', dr_muts_rev.nunique()+other_muts_rev.nunique() - - , '\nTotal no.of unique dr muts:', dr_muts_rev.nunique() # ADD - , '\nTotal no.of unique other muts:', other_muts_rev.nunique()#ADD + , '\nTotal no. of unique nsSNPs [check1: length of snps_only]:', len(snps_only) - - , '\nTotal no.of unique missense muts:', gene_LF4['mutationinformation'].nunique() + , '\nTotal no.of unique dr muts:' , dr_muts_rev.nunique() + , '\nTotal no.of unique other muts:' , other_muts_rev.nunique() + , '\nTotal no. of unique nsSNPs [check2: dr_muts + other_muts]:', dr_muts_rev.nunique()+other_muts_rev.nunique() + + , '\nTotal no.of unique nSNSPs [check3, gene_LF4]:', gene_LF4['mutationinformation'].nunique() , '\nTotal no.of unique positions associated with missense muts:', gene_LF4['position'].nunique() - , '\nTotal no. of samples with missense muts:', len(gene_LF4) - , '\nTotal no. of unique samples with missense muts:', gene_LF4['id'].nunique() - , '\n') + , '\nTotal no. of samples with nsSNPs:', len(gene_LF4) + , '\nTotal no. of unique sample ids with nsSNPs:', gene_LF4['id'].nunique() + ) if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0: print('\nTotal no.of samples with ambiguous muts:', len(inspect)