updated script to combine dfs

2020-08-21 13:22:28 +01:00 · 2020-08-21 13:22:28 +01:00 · acd0b8355b
commit acd0b8355b
parent 841d18d10b
3 changed files with 98 additions and 112 deletions
--- a/scripts/af_or_calcs.R
+++ b/scripts/af_or_calcs.R
@ -31,6 +31,7 @@ if(is.null(drug)|is.null(gene)) {
 #options(scipen = 999) #disabling scientific notation in R.
 #========================================================
 #%% variable assignment: input and output paths & filenames
+
 gene_match = paste0(gene,'_p.')
 cat(gene_match)

@ -46,7 +47,7 @@ outdir = paste0(datadir, '/', drug, '/', 'output')
 #===========
 # input file 1: master data
 #in_filename_master  = 'original_tanushree_data_v2.csv' #19K
-in_filename_master  = 'mtb_gwas_meta_v3.csv' #33k
+in_filename_master  = 'mtb_gwas_meta_v6.csv' #35k
 infile_master = paste0(datadir, '/', in_filename_master)
 cat(paste0('Reading infile1: raw data', ' ', infile_master) )

@ -324,7 +325,7 @@ x = sapply(snps,function(m){
 #%%======================================================
 # Writing file with calculated ORs and AFs
 cat(paste0('writing output file: '
-           , '\nFilen: ', outfile_af_or))
+           , '\nFile: ', outfile_af_or))

 write.csv(ors_df, outfile_af_or
          , row.names = F)
--- a/scripts/combining_dfs.py
+++ b/scripts/combining_dfs.py
@ -71,7 +71,7 @@ args = arg_parser.parse_args()
 #%% variable assignment: input and output 
 #drug = 'pyrazinamide'
 #gene = 'pncA'
-#gene_match = gene + '_p.'
+gene_match = gene + '_p.'

 drug    = args.drug
 gene    = args.gene
@ -99,7 +99,7 @@ in_filename_foldx = gene.lower() + '_foldx.csv'
 in_filename_dssp = gene.lower() + '_dssp.csv'
 in_filename_kd = gene.lower() + '_kd.csv'
 in_filename_rd = gene.lower() + '_rd.csv'
-in_filename_snpinfo = 'ns' + gene.lower() + '_snp_info.csv'
+in_filename_snpinfo = 'ns' + gene.lower() + '_snp_info_f.csv'
 in_filename_afor = gene.lower() + '_af_or.csv'
 in_filename_afor_kin = gene.lower() + '_af_or_kinship.csv'

@ -109,19 +109,18 @@ infile_foldx = outdir + '/' + in_filename_foldx
 infile_dssp = outdir + '/' + in_filename_dssp
 infile_kd = outdir + '/' + in_filename_kd
 infile_rd = outdir + '/' + in_filename_rd
-infile_snpinfo = indir + '/' + in_filename_snpinfo 
+infile_snpinfo = outdir + '/' + in_filename_snpinfo 
 infile_afor = outdir + '/' + in_filename_afor
 infile_afor_kin = outdir + '/' + in_filename_afor_kin

-
 print('\nInput path:', indir
-      , '\nOutput path:', outdir
+      , '\nOutput path:', outdir, '\n'
      , '\nInput filename mcsm:', infile_mcsm
-      , '\nInput filename foldx:', infile_foldx
+      , '\nInput filename foldx:', infile_foldx, '\n'
      , '\nInput filename dssp:', infile_dssp
      , '\nInput filename kd:', infile_kd 
-      , '\nInput filename rd', infile_rd
-      , '\nInput filename snp info:', infile_snpinfo
+      , '\nInput filename rd', infile_rd , '\n'
+      , '\nInput filename snp info:', infile_snpinfo, '\n'
      , '\nInput filename af or:', infile_afor
      , '\nInput filename afor kinship:', infile_afor_kin
      , '\n============================================================')
@ -208,99 +207,69 @@ print('\nResult of Fourth merge:', combined_df.shape

 # OR merges: TEDIOUSSSS!!!!

-#%%============================================================================
+
+#%%RRRR
 print('==================================='
-      , '\nFifth merge: afor_df + snpinfo_df'
+      , '\nFifth merge: afor_df + afor_kin_df'
      , '\n===================================')

 # OR combining
 afor_df = pd.read_csv(infile_afor, sep = ',')
 #afor_df.columns = afor_df.columns.str.lower()

-snpinfo_df_all = pd.read_csv(infile_snpinfo, sep = ',') 
-#snpinfo_df_all.columns = snpinfo_df_all.columns.str.lower()
-
-#afor_snpinfo_dfs = combine_dfs_with_checks(afor_df, snpinfo_df_all, my_join = i_join)
-merging_cols_m5 = detect_common_cols(afor_df, snpinfo_df_all)
-afor_snpinfo_dfs = pd.merge(afor_df, snpinfo_df_all, on = merging_cols_m5, how = l_join)
-
-# finding mutations lacking meta data
-foo = afor_df[~afor_df['mutation'].isin(snpinfo_df_all['mutation'])]
-foo1 = afor_df[afor_df['mutation'].isin(snpinfo_df_all['mutation'])]
-
-bar = snpinfo_df_all[~snpinfo_df_all['mutation'].isin(afor_df['mutation'])]
-bar1 = snpinfo_df_all[snpinfo_df_all['mutation'].isin(afor_df['mutation'])]
-
-# checks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-# afor_df
-if afor_df['mutation'].shape[0] == afor_df['mutation'].nunique():
-    print('No duplicate mutations detected in afor_df')
-else:
-    print('Dropping duplicate mutations detected in afor_df')
-    afor_df = afor_df.drop_duplicates(subset = 'mutation', keep = 'first')
-    
-#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-# finding mutations lacking meta data
-# FIXME: should get fixmed with JP's resolved dataset!?
-print('There are', len(afor_df[~afor_df['mutation'].isin(snpinfo_df_all['mutation'])])
-      , 'mutations with various or calculated that have no additional info...STRANGE'
-      , 'Reported to Jody on 14 july 2020 on skype!')
-#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-
-foo = afor_df[~afor_df['mutation'].isin(snpinfo_df_all['mutation'])]
-foo1 = afor_df[afor_df['mutation'].isin(snpinfo_df_all['mutation'])]
-                                                  
-# snpinfo_df_all
-ndups = 0
-if not snpinfo_df_all['mutation'].shape[0] == snpinfo_df_all['mutation'].nunique():
-    ndups = snpinfo_df_all['mutation'].duplicated().sum()
-    print(ndups, 'duplicated muts detected in snpinfo_df_all.'
-          , '\nHowever these may have different nucleotide changes. Checking further...')
-    #expected_nrows = afor_df.shape[0] + ndups    
-cols_to_check = ['mutation', 'mutationinformation', 'ref_allele', 'alt_allele']
-
-if snpinfo_df_all.duplicated(subset = cols_to_check).sum() == 0:
-    print('No *REAL* duplicate muts detected in snpinfo_df_all'
-          , '\nDim of df:', snpinfo_df_all.shape)
-    snpinfo_df_all = snpinfo_df_all.copy()
-else:
-    print(snpinfo_df_all.duplicated(subset = cols_to_check).sum()
-          , ' Actual duplicate mutations detected in snpinfo_df_all')
-    dup_muts = snpinfo_df_all[['mutation', 'mutationinformation']][snpinfo_df_all.duplicated(subset = cols_to_check)]
-    print(len(dup_muts), 'duplicated mutation detected' 
-          , '\nDropping duplicated mutations before merging')
-    snpinfo_df_all = snpinfo_df_all.drop_duplicates(subset = cols_to_check, keep = 'first')
-    print('Dim of df after removing duplicates:', snpinfo_df_all.shape)
-    
-
-if len(afor_snpinfo_dfs) == afor_df.shape[0] + ndups:
-    print('PASS: succesfully combined with left join')
-else:
-    print('FAIL: unsuccessful merge'
-             , '\nDim of df1:', afor_df.shape
-             , '\nDim of df2:', snpinfo_df_all.shape)
-    sys.exit()
-      
-print('\nResult of Fifth merge:', afor_snpinfo_dfs.shape
-      , '\n===================================================================')
-#%%============================================================================   
-print('==================================='
-      , '\nSixth merge: fifth merge + afor_kin_df'
-      , '\nafor_snpinfo_dfs + afor_kin_df'
-      , '\n===================================')
 afor_kin_df = pd.read_csv(infile_afor_kin, sep = ',')
 afor_kin_df.columns = afor_kin_df.columns.str.lower()

-#ors_df = combine_dfs_with_checks(afor_snpinfo_dfs, afor_kin_df, my_join = o_join)
-merging_cols_m6 = detect_common_cols(afor_snpinfo_dfs, afor_kin_df)
-print('Dim of df1:', afor_snpinfo_dfs.shape
-      , '\nDim of df2:', afor_kin_df.shape
-      , '\nNo. of merging_cols:', len(merging_cols_m6))

-ors_df = pd.merge(afor_snpinfo_dfs, afor_kin_df, on = merging_cols_m6, how = o_join)

-# Dropping unncessary columns
-cols_to_drop = ['reference_allele', 'alternate_allele', 'symbol' ]
+merging_cols_m5 = detect_common_cols(afor_df, afor_kin_df)
+
+print('Dim of afor_df:', afor_df.shape
+      , '\nDim of afor_kin_df:', afor_kin_df.shape)
+
+# finding if ALL afor_kin_df muts are present in afor_df
+# i.e all kinship muts should be PRESENT in mycalcs_present
+if len(afor_kin_df[afor_kin_df['mutation'].isin(afor_df['mutation'])]) == afor_kin_df.shape[0]:
+    print('PASS: ALL or_kinship muts are present in my or list')
+else:
+    nf_muts = len(afor_kin_df[~afor_kin_df['mutation'].isin(afor_df['mutation'])])
+    nf_muts_df = afor_kin_df[~afor_kin_df['mutation'].isin(afor_df['mutation'])]    
+    print('FAIL:', nf_muts, 'muts present in afor_kin_df NOT present in my or list'
+          , '\nsee "nf_muts_df" created containing not found(nf) muts')
+    sys.exit()
+
+
+# Now checking how many afor_df muts are NOT present in afor_kin_df    
+common_muts = len(afor_df[afor_df['mutation'].isin(afor_kin_df['mutation'])])  
+extra_muts_myor =   afor_kin_df.shape[0] - common_muts
+
+print('=========================================='
+      , '\nmy or calcs', extra_muts_myor, 'extra mutation\n'
+      , '\n==========================================')
+
+print('Expected cals for merging with outer_join...')
+
+expected_rows = afor_df.shape[0] + extra_muts_myor
+expected_cols = afor_df.shape[1] + afor_kin_df.shape[1] - len(merging_cols_m5)
+
+ors_df = pd.merge(afor_df, afor_kin_df, on = merging_cols_m5, how = o_join)
+
+if ors_df.shape[0] == expected_rows and ors_df.shape[1] == expected_cols:
+    print('PASS: OR dfs successfully combined! PHEWWWW!')
+else:
+    print('FAIL: could not combine OR dfs'
+          , '\nCheck expected rows and cols calculation and join type')
+
+print('Dim of merged ors_df:', ors_df.shape)
+#%%============================================================================
+# formatting ors_df
+
+ors_df.columns
+
+
+# Dropping unncessary columns: already removed in ealier preprocessing
+#cols_to_drop = ['reference_allele', 'alternate_allele', 'symbol' ]
+cols_to_drop = ['n_miss']
 print('Dropping', len(cols_to_drop), 'columns:\n'
      , cols_to_drop)
 ors_df.drop(cols_to_drop, axis = 1, inplace = True)
@ -314,14 +283,16 @@ column_order = ['mutation'
                , 'wild_type'               
                , 'position'
                , 'mutant_type'                              
-                , 'chr_num_allele'
+                #, 'chr_num_allele' #old
                , 'ref_allele'
                , 'alt_allele'
-                , 'mut_info'
+                , 'mut_info_f1'
+                , 'mut_info_f2'
                , 'mut_type'
                , 'gene_id'
-                , 'gene_number'
-                , 'mut_region'                
+                #, 'gene_number' #old
+                , 'gene_name'
+                #, 'mut_region'                
                #, 'reference_allele'
                #, 'alternate_allele'
                , 'chromosome_number'                
@ -347,14 +318,16 @@ column_order = ['mutation'
                , 'zval_logistic'
                , 'logl_h1_kin'
                , 'l_remle_kin'
-                , 'wt_3let'
-                , 'mt_3let'
+                #, 'wt_3let' # old
+                #, 'mt_3let' # old
                #, 'symbol'
-                , 'n_miss']
+                #, 'n_miss'
+                ]

-if len(column_order) == ors_df.shape[1] == len(DataFrame(column_order).isin(ors_df.columns)):
-    print('PASS: Column order generated for all columns in df', len(column_order), 'columns'
-         , '\nApplying column order to df...' )
+if ( (len(column_order) == ors_df.shape[1]) and (DataFrame(column_order).isin(ors_df.columns).all().all()):
+    print('PASS: Column order generated for all:', len(column_order), 'columns'
+          , '\nColumn names match, safe to reorder columns'
+          , '\nApplying column order to df...' )
    ors_df_ordered = ors_df[column_order]
 else:
    print('FAIL: Mismatch in no. of cols to reorder'
@ -366,15 +339,15 @@ print('\nResult of Sixth merge:', ors_df_ordered.shape
      , '\n===================================================================')
 #%%============================================================================ 
 print('==================================='
-      , '\nSeventh merge: Fourth + Sixth merge'
+      , '\nSixth merge: Fourth + Fifth merge'
      , '\ncombined_df + ors_df_ordered'
      , '\n===================================')

 #combined_df_all = combine_dfs_with_checks(combined_df, ors_df_ordered, my_join = i_join)
-merging_cols_m7 = detect_common_cols(combined_df, ors_df_ordered)
+merging_cols_m6 = detect_common_cols(combined_df, ors_df_ordered)
 print('Dim of df1:', combined_df.shape
      , '\nDim of df2:', ors_df_ordered.shape
-      , '\nNo. of merging_cols:', len(merging_cols_m7))
+      , '\nNo. of merging_cols:', len(merging_cols_m6))

 print('Checking mutations in the two dfs:'
      , '\nmuts in df1 but NOT in df2:'
@ -385,13 +358,13 @@ print('Checking mutations in the two dfs:'
 #print('\nNo. of common muts:', np.intersect1d(combined_df['mutationinformation'], ors_df_ordered['mutationinformation']) )

 #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-combined_df_all = pd.merge(combined_df, ors_df, on = merging_cols_m7, how = l_join)
-#combined_df_all.shape
+combined_df_all = pd.merge(combined_df, ors_df, on = merging_cols_m6, how = l_join)
+combined_df_all.shape

 # FIXME: DIM
 # only with left join!
 outdf_expected_rows = len(combined_df)
-outdf_expected_cols = len(combined_df.columns) + len(ors_df_ordered.columns) - len(merging_cols_m7)
+outdf_expected_cols = len(combined_df.columns) + len(ors_df_ordered.columns) - len(merging_cols_m6)

 #if combined_df_all.shape[1] == outdf_expected_cols and combined_df_all.shape[0] == outdf_expected_rows:  
 if combined_df_all.shape[1] == outdf_expected_cols and combined_df_all['mutationinformation'].nunique() == outdf_expected_rows:
@ -426,8 +399,6 @@ combined_df_all.to_csv(outfile_comb, index = False)
 print('\nFinished writing file:'
      , '\nNo. of rows:', combined_df_all.shape[0]
      , '\nNo. of cols:', combined_df_all.shape[1])
-
-
 #=======================================================================
 #%% incase you FIX the the function: combine_dfs_with_checks
 #def main():
--- a/scripts/or_kinship_link.py
+++ b/scripts/or_kinship_link.py
@ -261,6 +261,20 @@ else:
    sys.exit()
    
 del(df_ncols, ncols_add)
+
+#%% now adding mutation style = <gene>_p.abc1cde 
+dfm2_mis['mutation'] = gene.lower() + '_' + dfm2_mis['mut_info_f2'].astype(str)
+# convert to lowercase
+dfm2_mis['mutation'] = dfm2_mis['mutation'].str.lower()
+
+# quick sanity check
+check = dfm2_mis['mutation'].value_counts().value_counts() == dfm2_mis['mut_info_f2'].value_counts().value_counts()
+
+if check.all():
+    print('PASS: added column "mutation" containing mutation format: <gene>_p.abc1cde')
+else:
+    print('FAIL: could not add "mutation" column!')
+    sys.exit()
 #%% Calculating OR from beta coeff
 print('Calculating OR...')
 df_ncols = dfm2_mis.shape[1]
@ -364,7 +378,7 @@ print('Reordering', dfm2_mis.shape[1], 'columns'

 #dfm2_mis.columns

-column_order = [#'mutation',
+column_order = ['mutation',
 'mutationinformation', 
 'wild_type', 
 'position',