diff --git a/scripts/af_or_calcs.R b/scripts/af_or_calcs.R index 5f93d26..f3971f4 100755 --- a/scripts/af_or_calcs.R +++ b/scripts/af_or_calcs.R @@ -31,6 +31,7 @@ if(is.null(drug)|is.null(gene)) { #options(scipen = 999) #disabling scientific notation in R. #======================================================== #%% variable assignment: input and output paths & filenames + gene_match = paste0(gene,'_p.') cat(gene_match) @@ -46,7 +47,7 @@ outdir = paste0(datadir, '/', drug, '/', 'output') #=========== # input file 1: master data #in_filename_master = 'original_tanushree_data_v2.csv' #19K -in_filename_master = 'mtb_gwas_meta_v3.csv' #33k +in_filename_master = 'mtb_gwas_meta_v6.csv' #35k infile_master = paste0(datadir, '/', in_filename_master) cat(paste0('Reading infile1: raw data', ' ', infile_master) ) @@ -324,7 +325,7 @@ x = sapply(snps,function(m){ #%%====================================================== # Writing file with calculated ORs and AFs cat(paste0('writing output file: ' - , '\nFilen: ', outfile_af_or)) + , '\nFile: ', outfile_af_or)) write.csv(ors_df, outfile_af_or , row.names = F) diff --git a/scripts/combining_dfs.py b/scripts/combining_dfs.py index fef2779..ba40b46 100755 --- a/scripts/combining_dfs.py +++ b/scripts/combining_dfs.py @@ -71,7 +71,7 @@ args = arg_parser.parse_args() #%% variable assignment: input and output #drug = 'pyrazinamide' #gene = 'pncA' -#gene_match = gene + '_p.' +gene_match = gene + '_p.' drug = args.drug gene = args.gene @@ -99,7 +99,7 @@ in_filename_foldx = gene.lower() + '_foldx.csv' in_filename_dssp = gene.lower() + '_dssp.csv' in_filename_kd = gene.lower() + '_kd.csv' in_filename_rd = gene.lower() + '_rd.csv' -in_filename_snpinfo = 'ns' + gene.lower() + '_snp_info.csv' +in_filename_snpinfo = 'ns' + gene.lower() + '_snp_info_f.csv' in_filename_afor = gene.lower() + '_af_or.csv' in_filename_afor_kin = gene.lower() + '_af_or_kinship.csv' @@ -109,19 +109,18 @@ infile_foldx = outdir + '/' + in_filename_foldx infile_dssp = outdir + '/' + in_filename_dssp infile_kd = outdir + '/' + in_filename_kd infile_rd = outdir + '/' + in_filename_rd -infile_snpinfo = indir + '/' + in_filename_snpinfo +infile_snpinfo = outdir + '/' + in_filename_snpinfo infile_afor = outdir + '/' + in_filename_afor infile_afor_kin = outdir + '/' + in_filename_afor_kin - print('\nInput path:', indir - , '\nOutput path:', outdir + , '\nOutput path:', outdir, '\n' , '\nInput filename mcsm:', infile_mcsm - , '\nInput filename foldx:', infile_foldx + , '\nInput filename foldx:', infile_foldx, '\n' , '\nInput filename dssp:', infile_dssp - , '\nInput filename kd:', infile_kd - , '\nInput filename rd', infile_rd - , '\nInput filename snp info:', infile_snpinfo + , '\nInput filename kd:', infile_kd + , '\nInput filename rd', infile_rd , '\n' + , '\nInput filename snp info:', infile_snpinfo, '\n' , '\nInput filename af or:', infile_afor , '\nInput filename afor kinship:', infile_afor_kin , '\n============================================================') @@ -208,99 +207,69 @@ print('\nResult of Fourth merge:', combined_df.shape # OR merges: TEDIOUSSSS!!!! -#%%============================================================================ + +#%%RRRR print('===================================' - , '\nFifth merge: afor_df + snpinfo_df' + , '\nFifth merge: afor_df + afor_kin_df' , '\n===================================') # OR combining afor_df = pd.read_csv(infile_afor, sep = ',') #afor_df.columns = afor_df.columns.str.lower() -snpinfo_df_all = pd.read_csv(infile_snpinfo, sep = ',') -#snpinfo_df_all.columns = snpinfo_df_all.columns.str.lower() - -#afor_snpinfo_dfs = combine_dfs_with_checks(afor_df, snpinfo_df_all, my_join = i_join) -merging_cols_m5 = detect_common_cols(afor_df, snpinfo_df_all) -afor_snpinfo_dfs = pd.merge(afor_df, snpinfo_df_all, on = merging_cols_m5, how = l_join) - -# finding mutations lacking meta data -foo = afor_df[~afor_df['mutation'].isin(snpinfo_df_all['mutation'])] -foo1 = afor_df[afor_df['mutation'].isin(snpinfo_df_all['mutation'])] - -bar = snpinfo_df_all[~snpinfo_df_all['mutation'].isin(afor_df['mutation'])] -bar1 = snpinfo_df_all[snpinfo_df_all['mutation'].isin(afor_df['mutation'])] - -# checks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# afor_df -if afor_df['mutation'].shape[0] == afor_df['mutation'].nunique(): - print('No duplicate mutations detected in afor_df') -else: - print('Dropping duplicate mutations detected in afor_df') - afor_df = afor_df.drop_duplicates(subset = 'mutation', keep = 'first') - -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -# finding mutations lacking meta data -# FIXME: should get fixmed with JP's resolved dataset!? -print('There are', len(afor_df[~afor_df['mutation'].isin(snpinfo_df_all['mutation'])]) - , 'mutations with various or calculated that have no additional info...STRANGE' - , 'Reported to Jody on 14 july 2020 on skype!') -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - -foo = afor_df[~afor_df['mutation'].isin(snpinfo_df_all['mutation'])] -foo1 = afor_df[afor_df['mutation'].isin(snpinfo_df_all['mutation'])] - -# snpinfo_df_all -ndups = 0 -if not snpinfo_df_all['mutation'].shape[0] == snpinfo_df_all['mutation'].nunique(): - ndups = snpinfo_df_all['mutation'].duplicated().sum() - print(ndups, 'duplicated muts detected in snpinfo_df_all.' - , '\nHowever these may have different nucleotide changes. Checking further...') - #expected_nrows = afor_df.shape[0] + ndups -cols_to_check = ['mutation', 'mutationinformation', 'ref_allele', 'alt_allele'] - -if snpinfo_df_all.duplicated(subset = cols_to_check).sum() == 0: - print('No *REAL* duplicate muts detected in snpinfo_df_all' - , '\nDim of df:', snpinfo_df_all.shape) - snpinfo_df_all = snpinfo_df_all.copy() -else: - print(snpinfo_df_all.duplicated(subset = cols_to_check).sum() - , ' Actual duplicate mutations detected in snpinfo_df_all') - dup_muts = snpinfo_df_all[['mutation', 'mutationinformation']][snpinfo_df_all.duplicated(subset = cols_to_check)] - print(len(dup_muts), 'duplicated mutation detected' - , '\nDropping duplicated mutations before merging') - snpinfo_df_all = snpinfo_df_all.drop_duplicates(subset = cols_to_check, keep = 'first') - print('Dim of df after removing duplicates:', snpinfo_df_all.shape) - - -if len(afor_snpinfo_dfs) == afor_df.shape[0] + ndups: - print('PASS: succesfully combined with left join') -else: - print('FAIL: unsuccessful merge' - , '\nDim of df1:', afor_df.shape - , '\nDim of df2:', snpinfo_df_all.shape) - sys.exit() - -print('\nResult of Fifth merge:', afor_snpinfo_dfs.shape - , '\n===================================================================') -#%%============================================================================ -print('===================================' - , '\nSixth merge: fifth merge + afor_kin_df' - , '\nafor_snpinfo_dfs + afor_kin_df' - , '\n===================================') afor_kin_df = pd.read_csv(infile_afor_kin, sep = ',') afor_kin_df.columns = afor_kin_df.columns.str.lower() - -#ors_df = combine_dfs_with_checks(afor_snpinfo_dfs, afor_kin_df, my_join = o_join) -merging_cols_m6 = detect_common_cols(afor_snpinfo_dfs, afor_kin_df) -print('Dim of df1:', afor_snpinfo_dfs.shape - , '\nDim of df2:', afor_kin_df.shape - , '\nNo. of merging_cols:', len(merging_cols_m6)) -ors_df = pd.merge(afor_snpinfo_dfs, afor_kin_df, on = merging_cols_m6, how = o_join) -# Dropping unncessary columns -cols_to_drop = ['reference_allele', 'alternate_allele', 'symbol' ] + +merging_cols_m5 = detect_common_cols(afor_df, afor_kin_df) + +print('Dim of afor_df:', afor_df.shape + , '\nDim of afor_kin_df:', afor_kin_df.shape) + +# finding if ALL afor_kin_df muts are present in afor_df +# i.e all kinship muts should be PRESENT in mycalcs_present +if len(afor_kin_df[afor_kin_df['mutation'].isin(afor_df['mutation'])]) == afor_kin_df.shape[0]: + print('PASS: ALL or_kinship muts are present in my or list') +else: + nf_muts = len(afor_kin_df[~afor_kin_df['mutation'].isin(afor_df['mutation'])]) + nf_muts_df = afor_kin_df[~afor_kin_df['mutation'].isin(afor_df['mutation'])] + print('FAIL:', nf_muts, 'muts present in afor_kin_df NOT present in my or list' + , '\nsee "nf_muts_df" created containing not found(nf) muts') + sys.exit() + + +# Now checking how many afor_df muts are NOT present in afor_kin_df +common_muts = len(afor_df[afor_df['mutation'].isin(afor_kin_df['mutation'])]) +extra_muts_myor = afor_kin_df.shape[0] - common_muts + +print('==========================================' + , '\nmy or calcs', extra_muts_myor, 'extra mutation\n' + , '\n==========================================') + +print('Expected cals for merging with outer_join...') + +expected_rows = afor_df.shape[0] + extra_muts_myor +expected_cols = afor_df.shape[1] + afor_kin_df.shape[1] - len(merging_cols_m5) + +ors_df = pd.merge(afor_df, afor_kin_df, on = merging_cols_m5, how = o_join) + +if ors_df.shape[0] == expected_rows and ors_df.shape[1] == expected_cols: + print('PASS: OR dfs successfully combined! PHEWWWW!') +else: + print('FAIL: could not combine OR dfs' + , '\nCheck expected rows and cols calculation and join type') + +print('Dim of merged ors_df:', ors_df.shape) +#%%============================================================================ +# formatting ors_df + +ors_df.columns + + +# Dropping unncessary columns: already removed in ealier preprocessing +#cols_to_drop = ['reference_allele', 'alternate_allele', 'symbol' ] +cols_to_drop = ['n_miss'] print('Dropping', len(cols_to_drop), 'columns:\n' , cols_to_drop) ors_df.drop(cols_to_drop, axis = 1, inplace = True) @@ -314,14 +283,16 @@ column_order = ['mutation' , 'wild_type' , 'position' , 'mutant_type' - , 'chr_num_allele' + #, 'chr_num_allele' #old , 'ref_allele' , 'alt_allele' - , 'mut_info' + , 'mut_info_f1' + , 'mut_info_f2' , 'mut_type' , 'gene_id' - , 'gene_number' - , 'mut_region' + #, 'gene_number' #old + , 'gene_name' + #, 'mut_region' #, 'reference_allele' #, 'alternate_allele' , 'chromosome_number' @@ -347,14 +318,16 @@ column_order = ['mutation' , 'zval_logistic' , 'logl_h1_kin' , 'l_remle_kin' - , 'wt_3let' - , 'mt_3let' + #, 'wt_3let' # old + #, 'mt_3let' # old #, 'symbol' - , 'n_miss'] + #, 'n_miss' + ] -if len(column_order) == ors_df.shape[1] == len(DataFrame(column_order).isin(ors_df.columns)): - print('PASS: Column order generated for all columns in df', len(column_order), 'columns' - , '\nApplying column order to df...' ) +if ( (len(column_order) == ors_df.shape[1]) and (DataFrame(column_order).isin(ors_df.columns).all().all()): + print('PASS: Column order generated for all:', len(column_order), 'columns' + , '\nColumn names match, safe to reorder columns' + , '\nApplying column order to df...' ) ors_df_ordered = ors_df[column_order] else: print('FAIL: Mismatch in no. of cols to reorder' @@ -366,15 +339,15 @@ print('\nResult of Sixth merge:', ors_df_ordered.shape , '\n===================================================================') #%%============================================================================ print('===================================' - , '\nSeventh merge: Fourth + Sixth merge' + , '\nSixth merge: Fourth + Fifth merge' , '\ncombined_df + ors_df_ordered' , '\n===================================') #combined_df_all = combine_dfs_with_checks(combined_df, ors_df_ordered, my_join = i_join) -merging_cols_m7 = detect_common_cols(combined_df, ors_df_ordered) +merging_cols_m6 = detect_common_cols(combined_df, ors_df_ordered) print('Dim of df1:', combined_df.shape , '\nDim of df2:', ors_df_ordered.shape - , '\nNo. of merging_cols:', len(merging_cols_m7)) + , '\nNo. of merging_cols:', len(merging_cols_m6)) print('Checking mutations in the two dfs:' , '\nmuts in df1 but NOT in df2:' @@ -385,13 +358,13 @@ print('Checking mutations in the two dfs:' #print('\nNo. of common muts:', np.intersect1d(combined_df['mutationinformation'], ors_df_ordered['mutationinformation']) ) #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -combined_df_all = pd.merge(combined_df, ors_df, on = merging_cols_m7, how = l_join) -#combined_df_all.shape +combined_df_all = pd.merge(combined_df, ors_df, on = merging_cols_m6, how = l_join) +combined_df_all.shape # FIXME: DIM # only with left join! outdf_expected_rows = len(combined_df) -outdf_expected_cols = len(combined_df.columns) + len(ors_df_ordered.columns) - len(merging_cols_m7) +outdf_expected_cols = len(combined_df.columns) + len(ors_df_ordered.columns) - len(merging_cols_m6) #if combined_df_all.shape[1] == outdf_expected_cols and combined_df_all.shape[0] == outdf_expected_rows: if combined_df_all.shape[1] == outdf_expected_cols and combined_df_all['mutationinformation'].nunique() == outdf_expected_rows: @@ -426,8 +399,6 @@ combined_df_all.to_csv(outfile_comb, index = False) print('\nFinished writing file:' , '\nNo. of rows:', combined_df_all.shape[0] , '\nNo. of cols:', combined_df_all.shape[1]) - - #======================================================================= #%% incase you FIX the the function: combine_dfs_with_checks #def main(): diff --git a/scripts/or_kinship_link.py b/scripts/or_kinship_link.py index 8c6b569..b1a00ce 100755 --- a/scripts/or_kinship_link.py +++ b/scripts/or_kinship_link.py @@ -261,6 +261,20 @@ else: sys.exit() del(df_ncols, ncols_add) + +#%% now adding mutation style = _p.abc1cde +dfm2_mis['mutation'] = gene.lower() + '_' + dfm2_mis['mut_info_f2'].astype(str) +# convert to lowercase +dfm2_mis['mutation'] = dfm2_mis['mutation'].str.lower() + +# quick sanity check +check = dfm2_mis['mutation'].value_counts().value_counts() == dfm2_mis['mut_info_f2'].value_counts().value_counts() + +if check.all(): + print('PASS: added column "mutation" containing mutation format: _p.abc1cde') +else: + print('FAIL: could not add "mutation" column!') + sys.exit() #%% Calculating OR from beta coeff print('Calculating OR...') df_ncols = dfm2_mis.shape[1] @@ -364,7 +378,7 @@ print('Reordering', dfm2_mis.shape[1], 'columns' #dfm2_mis.columns -column_order = [#'mutation', +column_order = ['mutation', 'mutationinformation', 'wild_type', 'position',