updated results summary in the data_extraction.py

2020-11-12 17:05:29 +00:00 · 2020-11-12 17:05:29 +00:00 · ccdd6029be
commit ccdd6029be
parent f9fd74812a
1 changed files with 114 additions and 58 deletions
--- a/scripts/data_extraction.py
+++ b/scripts/data_extraction.py
@ -21,30 +21,22 @@ Created on Tue Aug  6 12:56:03 2019
 # where each row is a separate mutation
 # sample ids AND mutations are NOT unique, but the COMBINATION (sample id + mutation) = unique
 # NOTE
 #drtype is renamed to 'resistance' in the 35k dataset
 # output files: all lower case
-# 0) <gene>_common_ids.csv
+# 1) <gene>_gwas.csv
-# 1) <gene>_ambiguous_muts.csv
+# 2) <gene>_common_ids.csv
-# 2) <gene>_mcsm_snps.csv
+# 3) <gene>_ambiguous_muts.csv
-# 3) <gene>_metadata.csv
+# 4) <gene>_mcsm_snps.csv
-# 4) <gene>_all_muts_msa.csv
+# 5) <gene>_metadata_poscounts.csv
-# 5) <gene>_mutational_positons.csv
+# 6) <gene>_metadata.csv
 # 7) <gene>_all_muts_msa.csv
 # 8) <gene>_mutational_positons.csv
-# FIXME
+#------------
-## Make all cols lowercase
+# NOTE
-## change WildPos: wild_pos
+#-----------
-## Add an extra col: wild_chain_pos
+# drtype is renamed to 'resistance' in the 35k dataset
-## output df: <gene>_linking_df.csv
+# all colnames in the ouput files lowercase
 #containing the following cols
 #1. Mutationinformation
 #2. wild_type
 #3. position
 #4. mutant_type
 #5. chain
 #6. wild_pos
 #7. wild_chain_pos
 #-------------
 # requires
@ -66,9 +58,6 @@ os.getcwd()
 os.chdir(homedir + '/git/LSHTM_analysis/scripts')
 os.getcwd()
 # Requires
 from reference_dict import my_aa_dict # CHECK DIR STRUC THERE!
 from tidy_split import tidy_split
 #=======================================================================
 #%% command line args
 arg_parser = argparse.ArgumentParser()
@ -77,6 +66,7 @@ arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', defau
 arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data')
 arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + <drug> + input')
 arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + <drug> + output')
 arg_parser.add_argument('-m', '--make_dirs', help = 'Make dir for input and output', action='store_true')
 arg_parser.add_argument('--debug', action ='store_true', help = 'Debug Mode')
@ -89,9 +79,10 @@ gene = args.gene
 datadir = args.datadir
 indir   = args.input_dir
 outdir  = args.output_dir
 make_dirs  = args.make_dirs
-#drug = 'pyrazinamide'
+#drug = 'ethambutol'
-#gene = 'pncA'
+#gene = 'embB'
 #%% input and output dirs and files
 #=======
@ -106,19 +97,46 @@ if not indir:
 if not outdir:
    outdir = datadir + '/' + drug + '/output'
 if make_dirs:
    print('make_dirs is turned on, creating data dir:', datadir)
    try: 
        os.makedirs(datadir, exist_ok = True) 
        print("Directory '%s' created successfully" %datadir) 
    except OSError as error: 
        print("Directory '%s' can not be created") 
    print('make_dirs is turned on, creating indir:', indir)
    try: 
        os.makedirs(indir, exist_ok = True) 
        print("Directory '%s' created successfully" %indir) 
    except OSError as error: 
        print("Directory '%s' can not be created") 
    print('make_dirs is turned on, creating outdir:', outdir)
    try: 
        os.makedirs(outdir, exist_ok = True) 
        print("Directory '%s' created successfully" %outdir) 
    except OSError as error: 
        print("Directory '%s' can not be created") 
 # handle missing dirs here
 if not os.path.isdir(datadir):
    print('ERROR: Data directory does not exist:', datadir
-    , '\nPlease create and ensure gwas data is present and then rerun')
+    , '\nPlease create and ensure gwas data is present and then rerun\nelse specify cmd option ---make_dirs')
    sys.exit()
 if not os.path.isdir(indir):
    print('ERROR: Input directory does not exist:', indir
-    , '\nPlease either create or specify indir and rerun')
+    , '\nPlease either create or specify indir and rerun\nelse specify cmd option ---make_dirs')
    sys.exit()
 if not os.path.isdir(outdir):
    print('ERROR: Output directory does not exist:', outdir
-    , '\nPlease create or specify outdir and rerun')
+    , '\nPlease create or specify outdir and rerun\nelse specify cmd option ---make_dirs')
    sys.exit()
 # Requires
 from reference_dict import my_aa_dict # CHECK DIR STRUC THERE!
 from tidy_split import tidy_split
 #=======================================================================    
 gene_match = gene + '_p.'
 print('mut pattern for gene', gene, ':',  gene_match)
@ -230,21 +248,17 @@ print('RESULT: Total samples:', total_samples
 # counts NA per column
 meta_data.isna().sum()
 print('No. of NAs/column:' + '\n', meta_data.isna().sum()
-		, '\n===========================================================')
+		, '\n===========================================================\n')
 #%% Write check file
 check_file = outdir + '/' + gene.lower() + '_gwas.csv'
 meta_data.to_csv(check_file, index = False)
-print('Writing subsetted gwas data'
+print('\n----------------------------------'
      , '\nWriting subsetted gwas data:', gene
      , '\n----------------------------------'
      , '\nFile', check_file
      , '\nDim:', meta_data.shape)
 # glance
 #meta_data.head()
 #total_samples - NA pyrazinamide = ?
 # 19K:  19265-6754 = 12511
 # 33K: 33681 - 23823 = 9858
 # equivalent of table in R
 # drug counts: complete samples for OR calcs
 meta_data[drug].value_counts() 
@ -309,7 +323,7 @@ print('=================================================================')
 del(clean_df, na_count, i, id, wt, id2_dr, count_gene_dr, count_wt)
 #========
-# Second: counting <gene_match> mutations in dr_muts_col column
+# Second: counting <gene_match> mutations in other_muts_col column
 #========
 print('Now counting WT &', nssnp_match, 'muts within the column:', other_muts_col)    
 # drop na and extract a clean df
@ -355,7 +369,7 @@ print('Predicting total no. of rows in the curated df:', dr_gene_count + other_g
 expected_rows = dr_gene_count + other_gene_count
 #del( wt_other, clean_df, i, id, na_count, id2_other, count_gene_other, count_wt)
-del(clean_df, na_count, i, id, wt_other, id2_other, count_gene_other,count_wt )
+del(clean_df, na_count, i, id, wt_other, id2_other, count_gene_other, count_wt )
 #%%
 ############
@ -398,7 +412,7 @@ meta_gene_dr = meta_data_dr.loc[meta_data_dr[dr_muts_col].str.contains(nssnp_mat
 print('gene_snp_match in dr:', len(meta_gene_dr))
 dr_id = meta_gene_dr['id'].unique()
-print('RESULT: No. of samples with dr muts in pncA:', len(dr_id))
+print('RESULT: No. of samples with dr muts in', gene, ':', len(dr_id))
 if len(id_dr) == len(meta_gene_dr):
    print('PASS: lengths match'
@ -448,7 +462,7 @@ meta_gene_other = meta_data_other.loc[meta_data_other[other_muts_col].str.contai
 print('gene_snp_match in other:', len(meta_gene_other))
 other_id = meta_gene_other['id'].unique()
-print('RESULT: No. of samples with other muts:', len(other_id))
+print('RESULT: No. of samples with other muts in', gene, ':', len(other_id))
 if len(id_other) == len(meta_gene_other):
    print('PASS: lengths match'
@ -485,6 +499,7 @@ common_ids2.columns = ['index', 'id2']
 # should be True
 if common_ids['id'].equals(common_ids2['id2']):
    print('PASS: Further cross checks on common ids')
    nu_common_ids = common_ids.nunique()
 else:
    sys.exit('FAIL: Further cross checks on common ids')
@ -496,8 +511,9 @@ print('Expected no. of gene samples:', expected_gene_samples
 #print(outdir)
 out_filename_cid = gene.lower() + '_common_ids.csv'
 outfile_cid =  outdir + '/' + out_filename_cid
-
+print('\n----------------------------------'
-print('Writing file:'
+      '\nWriting file: common_ids'
      '\n----------------------------------'
      , '\nFile:', outfile_cid
      , '\nNo. of rows:', len(common_ids)
      , '\n=============================================================')
@ -517,7 +533,7 @@ print('Extracting nsSNP match:', gene, 'mutations from cols:\n'
 meta_gene_all = meta_data.loc[meta_data[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) | meta_data[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ]
 extracted_gene_samples = meta_gene_all['id'].nunique()
-print('RESULT: actual no. of gene samples extracted:', extracted_gene_samples
+print('RESULT: No. of unique samples with', gene, 'nsSNP muts:', extracted_gene_samples
      , '\n===================================================================')
 # sanity check: length of gene samples
@ -774,7 +790,6 @@ print('Length of dfs pre and post processing...'
      , '\ngene LF data (unique mutation in each row):', len(gene_LF1)
      , '\n=============================================================')
 #%% sanity check for extraction
 # This ought to pass if nsnsp_match happens right at the beginning of creating 'expected_rows'
 print('Verifying whether extraction process worked correctly...')
@ -865,7 +880,11 @@ else:
         , '\n=========================================================')       
 #%% clear variables
-del(id_dr, id_other, meta_data, meta_gene_dr, meta_gene_other, mut_grouped, muts_split, other_WF1, other_df, other_muts_df, other_gene_count, gene_LF0, gene_na)  
+del(id_dr, id_other
    #, meta_data
    , meta_gene_dr, meta_gene_other
    #, mut_grouped
    , muts_split, other_WF1, other_df, other_muts_df, other_gene_count, gene_LF0, gene_na)  
 del(c1, c2, col_to_split1, col_to_split2, comp_gene_samples, dr_WF0, dr_df, dr_muts_df, dr_gene_WF0, dr_gene_count, expected_gene_samples, other_gene_WF1)                     
@ -877,7 +896,9 @@ del(c1, c2, col_to_split1, col_to_split2, comp_gene_samples, dr_WF0, dr_df, dr_m
 out_filename_ambig_muts = gene.lower() + '_ambiguous_muts.csv'
 outfile_ambig_muts = outdir + '/' + out_filename_ambig_muts
-print('Writing file: ambiguous muts'
+print('\n----------------------------------'
      , '\nWriting file: ambiguous muts'
      , '\n----------------------------------'
      , '\nFilename:', outfile_ambig_muts)
 inspect = gene_LF1[gene_LF1['mutation'].isin(common_muts)]
@ -1163,7 +1184,9 @@ else:
 out_filename_mcsmsnps = gene.lower() + '_mcsm_snps.csv'
 outfile_mcsmsnps = outdir + '/' + out_filename_mcsmsnps
-print('Writing file: mCSM style muts'
+print('\n----------------------------------'
      , '\nWriting file: mCSM style muts'
      , '\n----------------------------------'
      , '\nFile:', outfile_mcsmsnps
      , '\nmutation format (SNP): {WT}<POS>{MUT}'
      , '\nNo. of distinct muts:', len(snps_only)
@ -1191,7 +1214,9 @@ metadata_pos.sort_values(by = ['meta_pos_count'], ascending = False, inplace = T
 # where each row has UNIQUE mutations NOT unique sample ids
 out_filename_metadata_poscounts = gene.lower() + '_metadata_poscounts.csv'
 outfile_metadata_poscounts = outdir + '/' + out_filename_metadata_poscounts
-print('Writing file: Metadata poscounts'
+print('\n----------------------------------'
      , 'Writing file: Metadata poscounts'
      , '\n----------------------------------'
      , '\nFile:', outfile_metadata_poscounts
      , '\n============================================================')
@ -1205,9 +1230,12 @@ del(out_filename_metadata_poscounts)
 #%% Write file: gene_metadata (i.e gene_LF1)
 # where each row has UNIQUE mutations NOT unique sample ids
-out_filename_metadata = gene.lower() + '_metadata_raw.csv'
+#out_filename_metadata = gene.lower() + '_metadata_raw.csv' #! rationale?
 out_filename_metadata = gene.lower() + '_metadata.csv'
 outfile_metadata = outdir + '/' + out_filename_metadata
-print('Writing file: LF formatted data'
+print('\n----------------------------------'
      , '\nWriting file: LF formatted data'
      , '\n----------------------------------'
      , '\nFile:', outfile_metadata
      , '\n============================================================')
@ -1248,10 +1276,12 @@ else:
 out_filename_msa = gene.lower() +'_all_muts_msa.csv'
 outfile_msa = outdir + '/' + out_filename_msa
-print('Writing file: mCSM style muts for msa',
+print('\n----------------------------------'
-      '\nFile:', outfile_msa,
+      , '\nWriting file: mCSM style muts for msa'
-      '\nmutation format (SNP): {WT}<POS>{MUT}',
+      , '\n----------------------------------'
-      '\nNo.of lines of msa:', len(all_muts_msa))
+      , '\nFile:', outfile_msa
      , '\nmutation format (SNP): {WT}<POS>{MUT}'
      , '\nNo.of lines of msa:', len(all_muts_msa))
 all_muts_msa_sorted.to_csv(outfile_msa, header = False, index = False)
@ -1278,7 +1308,9 @@ pos_only_sorted = pos_only.sort_values(by = 'position', ascending = True)
 out_filename_pos = gene.lower() + '_mutational_positons.csv'
 outfile_pos = outdir + '/' + out_filename_pos
-print('Writing file: mutational positions'
+print('\n----------------------------------'
      , 'Writing file: mutational positions'
      , '\n----------------------------------'
      , '\nFile:', outfile_pos
      , '\nNo. of distinct positions:', len(pos_only_sorted)
      , '\n=============================================================')
@ -1293,15 +1325,39 @@ print('Finished writing:', outfile_pos
 del(out_filename_pos)
 #%% quick summary output
 #print('============================================'
 #      , '\nQuick summary output for', drug, 'and' , gene.lower()
 #      , '\n============================================'
 #      , '\nTotal no.of unique missense muts:', gene_LF1['mutationinformation'].nunique()
 #      , '\nTotal no.of unique positions associated with missense muts:',gene_LF1['position'].nunique()
 #      , '\nTotal no. of samples with missense muts:', len(gene_LF1)
 #      , '\n============================================================='
 #      , '\n\n\n')
 print('============================================'
-      , '\nQuick summary output for', gene.lower()
+      , '\nQuick summary output for', drug, 'and' , gene.lower()
      , '\n============================================'
      , '\nTotal samples:', total_samples
      , '\nNo. of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts()
 	  , '\n'
      , '\nPercentage of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts(normalize = True)*100
 	  , '\n'
      , '\nTotal no.of unique dr muts:', mut_grouped[dr_muts_col]
      , '\nTotal no.of unique other muts:', mut_grouped[other_muts_col]     
      , '\nTotal no.of unique missense muts:', gene_LF1['mutationinformation'].nunique()
      , '\nTotal no.of unique positions associated with missense muts:',gene_LF1['position'].nunique()
      , '\nTotal no. of samples with missense muts:', len(gene_LF1)
      , '\nTotal no. of unique samples with missense muts:', gene_LF1['id'].nunique()  
      , '\n'
      , '\nTotal no.of samples with common_ids:',  nu_common_ids['id']
      , '\nTotal no.of samples with ambiguous muts:', len(inspect)
      #, '\nTotal no.of unique ambiguous muts:', len(common_muts)
      , '\nTotal no.of unique ambiguous muts:', inspect['mutation'].nunique()
      , '\n============================================================='
      , '\n\n\n')
 #=======================================================================
 print(u'\u2698' * 50,
      '\nEnd of script: Data extraction and writing files'