updated pnca_extraction and AF_OR calcs

2020-03-23 17:36:42 +00:00 · 2020-03-23 17:36:42 +00:00 · b331227023
commit b331227023
parent eb021349fe
4 changed files with 195 additions and 699 deletions
--- a/meta_data_analysis/pnca_data_extraction.py
+++ b/meta_data_analysis/pnca_data_extraction.py
@@ -36,9 +36,10 @@ import numpy as np
 # 1) pnca_ambiguous_muts.csv
 # 2) pnca_mcsm_snps.csv
 # 3) pnca_metadata.csv
-# 4) pnca_comp_snps.csv
-# 5) pnca_all_muts_msa.csv
-# 6) pnca_mutational_positons.csv
+# 4) pnca_comp_snps.csv <---deleted> 
+
+# 4) pnca_all_muts_msa.csv
+# 5) pnca_mutational_positons.csv
 #========================================================
 #%% specify homedir as python doesn't recognise tilde
 homedir = os.path.expanduser('~') 
@@ -52,23 +53,25 @@ os.getcwd()
 from reference_dict import my_aa_dict #CHECK DIR STRUC THERE!
 #========================================================

-#drug = 'pyrazinamide'
+#%% variable assignment: input and output paths & filenames
+
+drug = 'pyrazinamide'
 gene = 'pncA'
 gene_match = gene + '_p.'

-#%% specify variables for input and output paths and filenames
-
 #=======
 # input dir
 #=======
-indir = 'git/Data/pyrazinamide/input/original'
-
+#indir = 'git/Data/pyrazinamide/input/original'
+indir = 'git/Data' + '/' + drug + '/' + 'input/original'
 #=========
 # output dir
 #=========
 # several output files
 # output filenames in respective sections at the time of outputting files
-outdir = 'git/Data/pyrazinamide/output'
+#outdir = 'git/Data/pyrazinamide/output'
+outdir = 'git/Data' + '/' + drug + '/' + 'output'
+
 #%%end of variable assignment for input and output files
 #==============================================================================
 #%% Read files
@@ -77,7 +80,7 @@ in_filename  = 'original_tanushree_data_v2.csv'
 infile = homedir + '/' + indir + '/' + in_filename
 print('Reading input master file:', infile)

-master_data = pd.read_csv(infile, sep = ',')  
+master_data  = pd.read_csv(infile, sep = ',')  

 # column names
 #list(master_data.columns)
@@ -334,6 +337,8 @@ print('Writing file: common ids:\n',

 common_ids.to_csv(outfile0)
 print('======================================================================')
+del(out_filename0)
+

 # clear variables
 del(dr_id, other_id, meta_data_dr, meta_data_other, common_ids, common_mut_ids, common_ids2)
@@ -701,21 +706,6 @@ del(c1, c2, col_to_split1, col_to_split2, comp_pnca_samples, dr_WF0, dr_df, dr_m
 
 #%% end of data extraction and some files writing. Below are some more files writing.

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
 #%%: write file: ambiguous muts
 # uncomment as necessary
 #print(outdir)
@@ -735,6 +725,8 @@ inspect.to_csv(outfile1)
 print('Finished writing:', out_filename1, '\nExpected no. of rows (no. of samples with the ambiguous muts present):', dr_muts.isin(other_muts).sum() + other_muts.isin(dr_muts).sum())
 print('======================================================================')
 del(out_filename1)
+
+
 #%%
 #===========
 # Split 'mutation' column into three:  wild_type, position and
@@ -891,6 +883,8 @@ print('Finished writing:', out_filename2,
      '\nNo. of rows:', len(snps_only) )
 print('======================================================================')
 del(out_filename2)
+
+
 #%% Write file: pnca_metadata (i.e pnca_LF1)
 # where each row has UNIQUE mutations NOT unique sample ids
 out_filename3 = gene.lower() + '_' + 'metadata.csv'
@@ -903,45 +897,10 @@ pnca_LF1.to_csv(outfile3, header = True, index = False)
 print('Finished writing:', out_filename3,
      '\nNo. of rows:', len(pnca_LF1),
      '\nNo. of cols:', len(pnca_LF1.columns) )
-
 print('======================================================================')
+del(out_filename3)

-#%% Write file: comp SNPs (i.e snps without any corresponding 'NA' in the <drug>
-# column to allow OR calcs)

-# remove NA from pyrazinamide cols
-pnca_LF2 = pnca_LF1.dropna(subset=['pyrazinamide'])
-
-print('extracting OR muts by removing NAs from pyrazinamide cols')
-if pnca_LF2.pyrazinamide.isna().sum() > 0:
-    print('FAIL: NAs NOT removed successfully')
-else:
-    print('PASS: NAs removed successfully')
-
-# extracting comp snps only
-comp_snps_only = pd.DataFrame(pnca_LF2['mutation'].unique())
-#print('Total no. of comp snps:', len(comp_snps_only))
-comp_snps_only.head()
-
-# assign column name
-comp_snps_only.columns = ['mutation']
-
-# count how many positions this corresponds to
-comp_pos_only = pd.DataFrame(pnca_LF2['position'].unique())
-#print('Total no. of pos corresponding to comp_snps:', len(comp_pos_only))
- 
-out_filename4 = gene.lower() + '_' + 'comp_snps.csv'
-outfile4 = homedir + '/' + outdir + '/' + out_filename4
-print('Writing file: comp snps to allow OR calcs',
-      '\nFilename:', out_filename4,
-      '\nPath:',  homedir + '/' + outdir,
-      '\nNo. of comp muts:', len(comp_snps_only),
-      '\nNo. of distinct positions for comp muts:', len(comp_pos_only) )
-
-comp_snps_only.to_csv(outfile4, header = True, index = False)
-
-print('Finished writing:', out_filename4,
-      '\nNo. of rows:', len(comp_snps_only) )
 #%% write file: mCSM style but with repitions for MSA and logo plots
 all_muts_msa = pd.DataFrame(pnca_LF1['Mutationinformation']) 
 all_muts_msa.head()
@@ -970,21 +929,22 @@ else:
          '\nDebug please!')
 print('======================================================================')

-out_filename5 = gene.lower() + '_' + 'all_muts_msa.csv'
-outfile5 = homedir + '/' + outdir + '/' + out_filename5
+out_filename4 = gene.lower() + '_' + 'all_muts_msa.csv'
+outfile4 = homedir + '/' + outdir + '/' + out_filename4

 print('Writing file: mCSM style muts for msa',
      '\nmutation format (SNP): {Wt}<POS>{Mut}',
      '\nNo.of lines of msa:', len(all_muts_msa),  
-      '\nFilename:', out_filename5,
+      '\nFilename:', out_filename4,
      '\nPath:',  homedir +'/'+ outdir)

-all_muts_msa_sorted.to_csv(outfile5, header = False, index = False)
+all_muts_msa_sorted.to_csv(outfile4, header = False, index = False)

-print('Finished writing:', out_filename5,
+print('Finished writing:', out_filename4,
      '\nNo. of rows:', len(all_muts_msa) )
 print('======================================================================')
-del(out_filename5)
+del(out_filename4)
+

 #%% write file for mutational positions
 # count how many positions this corresponds to
@@ -999,20 +959,22 @@ pos_only.position.dtype
 # sort by position value
 pos_only_sorted = pos_only.sort_values(by = 'position', ascending = True)

-out_filename6 = gene.lower() + '_' + 'mutational_positons.csv'
-outfile6 = homedir + '/' + outdir + '/' + out_filename6
+out_filename5 = gene.lower() + '_' + 'mutational_positons.csv'
+outfile5 = homedir + '/' + outdir + '/' + out_filename5

 print('Writing file: mutational positions',
      '\nNo. of distinct positions:', len(pos_only_sorted),
-      '\nFilename:', out_filename6,
+      '\nFilename:', out_filename5,
      '\nPath:',  homedir +'/'+ outdir)

-pos_only_sorted.to_csv(outfile6, header = True, index = False)
+pos_only_sorted.to_csv(outfile5, header = True, index = False)

-print('Finished writing:', out_filename6,
+print('Finished writing:', out_filename5,
      '\nNo. of rows:', len(pos_only_sorted) )
 print('======================================================================')
-del(out_filename6)
+del(out_filename5)
+
+
 #%% end of script
 print('======================================================================')
 print(u'\u2698' * 50,