saving data extraction with final processing

2022-04-27 12:09:14 +01:00 · 2022-04-27 12:09:14 +01:00 · e419d320ac
commit e419d320ac
parent 3c436f0c27
1 changed files with 42 additions and 35 deletions
--- a/scripts/data_extraction.py
+++ b/scripts/data_extraction.py
@ -71,7 +71,7 @@ arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', defau
 arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data')
 arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + <drug> + input')
 arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + <drug> + output')
-arg_parser.add_argument('-m', '--make_dirs', help = 'Make dir for input and output', action='store_true')
+arg_parser.add_argument('-m', '--make_dirs', help = 'Make dir for Data, input and output', action='store_true')
 arg_parser.add_argument('--debug', action ='store_true', help = 'Debug Mode')
@ -925,9 +925,9 @@ changes_dict = {}
 ##BROKENNNN!!!!
 common_muts
 gene_LF1['mutation'].head()
-common_muts_lower =  list((map(lambda x: x.lower(), common_muts)))
+#common_muts_lower =  list((map(lambda x: x.lower(), common_muts)))
-common_muts_lower
+#common_muts_lower
-##BROKENNNN!!!!
+
 for i in common_muts:
 #for i in common_muts_lower:
    #print(i)
@ -992,8 +992,7 @@ print('\n----------------------------------'
 ambiguous_muts_value_counts.to_csv(outfile_ambig_mut_counts, index = True)
 #%% FIXME: Add sanity check to make sure you can add value_count checks
-#%% Resolving ambiguous muts
+#%% Resolving ambiguous muts: Merging ambiguous muts with the revised annotations
 # Merging ambiguous muts
 #=================
 # Merge ambig muts
 # with gene_LF1
@ -1034,7 +1033,7 @@ gene_LF1['mutation_info'].value_counts()
 # reassign
 #%% PHEW! Good to go for downstream stuff
-#%% Add column: Mutationinformation
+#%% Add column: Mutationinformation ==> gene_LF1
 # splitting mutation column to get mCSM style muts
 #=====================================================
 # Formatting df: read aa dict and pull relevant info
@ -1098,7 +1097,8 @@ for k, v in my_aa_dict.items():
 #gene_LF1['position'] = gene_LF1['mutation'].str.extract(r'(\d+)') 
 gene_LF1['position'] = gene_LF1['mutation'].str.extract(pos_regex) 
-mylen1 = len(gene_LF1.columns)
+#mylen1 = len(gene_LF1.columns)
 mylen0_v2 = len(gene_LF1.columns)
 # sanity checks
 print('checking if 3-letter wt&mut residue extraction worked correctly')
@ -1115,14 +1115,15 @@ else:
          , '\nmutant-type\n', mut
          , '\ndim of df:', gene_LF1.shape)
-if mylen1 ==  mylen0 + ncol_mutf_add:
+#if mylen1 ==  mylen0 + ncol_mutf_add:
 if mylen0_v2 ==  mylen0 + ncol_mutf_add:
    print('PASS: successfully added', ncol_mutf_add, 'cols'
      , '\nold length:', mylen0
-      , '\nnew len:', mylen1)
+      , '\nnew len:', mylen0_v2)
 else:
    print('FAIL: failed to add cols:'
          , '\nold length:', mylen0
-          , '\nnew len:', mylen1)
+          , '\nnew len:', mylen0_v2)
 # clear variables
 del(k, v, wt, mut, lookup_dict)
@ -1137,7 +1138,7 @@ print('Created column: mutationinformation'
 	, '\n=====================================================================\n'
    , gene_LF1.mutationinformation.head(10))
-#order by position for convenience
+# order by position for convenience
 gene_LF1.dtypes
 # converting position to numeric
@ -1145,17 +1146,20 @@ gene_LF1['position'] = pd.to_numeric(gene_LF1['position'])
 # sort by position inplace 
 foo = gene_LF1['position'].value_counts()
-foo
+foo = foo.sort_index()
 gene_LF1.sort_values(by = ['position'], inplace = True)
 bar = gene_LF1['position'].value_counts()
 bar = bar.sort_index()
 if all(foo == bar):
    print('PASS: df ordered by position')
    print(gene_LF1['position'].head())
 else:
    sys.exit('FAIL: df could not be ordered. Check source')
 print('\nDim of gene_LF1:', len(gene_LF1.columns), 'more cols:\n')
 # FIXME:Can only compare identically-labeled Series objects
 #if (foo == bar).all():
 #    print('PASS: df ordered by position')
 #    print(gene_LF1['position'].head())
 #else:
 #    print('FAIL: df could not be ordered. Check source')
 #    sys.exit()
 #%% Create a copy of mutationinformation column for downstream mergeing
 gene_LF1['Mut']      = gene_LF1['mutationinformation'] 
 gene_LF1['Mut_copy'] = gene_LF1['mutationinformation'] 
@ -1185,8 +1189,12 @@ gene_LF1['pos_count'] = gene_LF1['position'].map(z1)
 #test_df2 = test_df.loc[test_df['position'] == 10] 
 ###############################################################################
 cols_added = ['Mut', 'Mut_copy', 'index', 'index_copy', 'pos_count', 'snp_frequency']
 print('\nAdded', len(cols_added), 'more cols:\n'
      , '\nDim of new gene_LF1:', len(gene_LF1.columns))
 mylen1 = len(gene_LF1.columns) # updated my_len1
 ###############################################################################
-#%% Add column: aa property_water
+#%% Add column: aa property_water ==> gene_LF1
 #=========
 # iterate through the dict, create a lookup dict that i.e
 # lookup_dict =  {three_letter_code: aa_prop_water} 
@ -1235,7 +1243,7 @@ else:
 # clear variables
 del(k, v, wt, mut, lookup_dict)
-#%% Add column: aa_prop_polarity
+#%% Add column: aa_prop_polarity ==> gene_LF1
 #========
 # iterate through the dict, create a lookup dict that i.e
 # lookup_dict =  {three_letter_code: aa_prop_polarity} 
@ -1283,7 +1291,7 @@ else:
 # clear variables
 del(k, v, wt, mut, lookup_dict)
-#%% Add column: aa_calcprop
+#%% Add column: aa_calcprop ==> gene_LF1
 #========
 # iterate through the dict, create a lookup dict that i.e
 # lookup_dict =  {three_letter_code: aa_calcprop} 
@ -1923,12 +1931,12 @@ print('Finished writing:', outfile_pos
      , '\nNo. of rows:', len(pos_only_sorted)
      , '\nNo. of cols:', len(pos_only_sorted.columns)
      , '\n============================================================='
-      , '\n\n\n')
+      , '\n')
 del(out_filename_pos)
 ###############################################################################
 #%% Quick summary output
-print('============================================'
+print('\n============================================'
      , '\nQuick summary output for', drug, 'and' , gene.lower()
      , '\n============================================'
      , '\nTotal samples:', total_samples
@ -1942,18 +1950,17 @@ print('============================================'
      , '\nPercentage of Sus and Res [Revised]', drug, 'samples:\n', gene_LF4['dst_mode'].value_counts(normalize = True)*100
 	  , '\n'
-      , '\nTotal no. of unique snps:', len(snps_only)
+      , '\nTotal no. of unique nsSNPs [check1: length of snps_only]:', len(snps_only)
      , '\nTotal no. of unique snps:', dr_muts_rev.nunique()+other_muts_rev.nunique()
-      , '\nTotal no.of unique dr muts:', dr_muts_rev.nunique() # ADD 
+      , '\nTotal no.of unique dr muts:'    , dr_muts_rev.nunique() 
-      , '\nTotal no.of unique other muts:', other_muts_rev.nunique()#ADD 
+      , '\nTotal no.of unique other muts:' , other_muts_rev.nunique()
      , '\nTotal no. of unique nsSNPs [check2: dr_muts + other_muts]:', dr_muts_rev.nunique()+other_muts_rev.nunique()
-      
+      , '\nTotal no.of unique nSNSPs [check3, gene_LF4]:', gene_LF4['mutationinformation'].nunique()
      , '\nTotal no.of unique missense muts:', gene_LF4['mutationinformation'].nunique()
      , '\nTotal no.of unique positions associated with missense muts:', gene_LF4['position'].nunique()
-      , '\nTotal no. of samples with missense muts:', len(gene_LF4)
+      , '\nTotal no. of samples with nsSNPs:', len(gene_LF4)
-      , '\nTotal no. of unique samples with missense muts:', gene_LF4['id'].nunique()  
+      , '\nTotal no. of unique sample ids with nsSNPs:', gene_LF4['id'].nunique()
-      , '\n')
+      )
 if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
    print('\nTotal no.of samples with ambiguous muts:', len(inspect)