updated script to combine dfs

This commit is contained in:
Tanushree Tunstall 2020-08-21 13:22:28 +01:00
parent 841d18d10b
commit acd0b8355b
3 changed files with 98 additions and 112 deletions

View file

@ -31,6 +31,7 @@ if(is.null(drug)|is.null(gene)) {
#options(scipen = 999) #disabling scientific notation in R.
#========================================================
#%% variable assignment: input and output paths & filenames
gene_match = paste0(gene,'_p.')
cat(gene_match)
@ -46,7 +47,7 @@ outdir = paste0(datadir, '/', drug, '/', 'output')
#===========
# input file 1: master data
#in_filename_master = 'original_tanushree_data_v2.csv' #19K
in_filename_master = 'mtb_gwas_meta_v3.csv' #33k
in_filename_master = 'mtb_gwas_meta_v6.csv' #35k
infile_master = paste0(datadir, '/', in_filename_master)
cat(paste0('Reading infile1: raw data', ' ', infile_master) )
@ -324,7 +325,7 @@ x = sapply(snps,function(m){
#%%======================================================
# Writing file with calculated ORs and AFs
cat(paste0('writing output file: '
, '\nFilen: ', outfile_af_or))
, '\nFile: ', outfile_af_or))
write.csv(ors_df, outfile_af_or
, row.names = F)

View file

@ -71,7 +71,7 @@ args = arg_parser.parse_args()
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
#gene_match = gene + '_p.'
gene_match = gene + '_p.'
drug = args.drug
gene = args.gene
@ -99,7 +99,7 @@ in_filename_foldx = gene.lower() + '_foldx.csv'
in_filename_dssp = gene.lower() + '_dssp.csv'
in_filename_kd = gene.lower() + '_kd.csv'
in_filename_rd = gene.lower() + '_rd.csv'
in_filename_snpinfo = 'ns' + gene.lower() + '_snp_info.csv'
in_filename_snpinfo = 'ns' + gene.lower() + '_snp_info_f.csv'
in_filename_afor = gene.lower() + '_af_or.csv'
in_filename_afor_kin = gene.lower() + '_af_or_kinship.csv'
@ -109,19 +109,18 @@ infile_foldx = outdir + '/' + in_filename_foldx
infile_dssp = outdir + '/' + in_filename_dssp
infile_kd = outdir + '/' + in_filename_kd
infile_rd = outdir + '/' + in_filename_rd
infile_snpinfo = indir + '/' + in_filename_snpinfo
infile_snpinfo = outdir + '/' + in_filename_snpinfo
infile_afor = outdir + '/' + in_filename_afor
infile_afor_kin = outdir + '/' + in_filename_afor_kin
print('\nInput path:', indir
, '\nOutput path:', outdir
, '\nOutput path:', outdir, '\n'
, '\nInput filename mcsm:', infile_mcsm
, '\nInput filename foldx:', infile_foldx
, '\nInput filename foldx:', infile_foldx, '\n'
, '\nInput filename dssp:', infile_dssp
, '\nInput filename kd:', infile_kd
, '\nInput filename rd', infile_rd
, '\nInput filename snp info:', infile_snpinfo
, '\nInput filename kd:', infile_kd
, '\nInput filename rd', infile_rd , '\n'
, '\nInput filename snp info:', infile_snpinfo, '\n'
, '\nInput filename af or:', infile_afor
, '\nInput filename afor kinship:', infile_afor_kin
, '\n============================================================')
@ -208,99 +207,69 @@ print('\nResult of Fourth merge:', combined_df.shape
# OR merges: TEDIOUSSSS!!!!
#%%============================================================================
#%%RRRR
print('==================================='
, '\nFifth merge: afor_df + snpinfo_df'
, '\nFifth merge: afor_df + afor_kin_df'
, '\n===================================')
# OR combining
afor_df = pd.read_csv(infile_afor, sep = ',')
#afor_df.columns = afor_df.columns.str.lower()
snpinfo_df_all = pd.read_csv(infile_snpinfo, sep = ',')
#snpinfo_df_all.columns = snpinfo_df_all.columns.str.lower()
#afor_snpinfo_dfs = combine_dfs_with_checks(afor_df, snpinfo_df_all, my_join = i_join)
merging_cols_m5 = detect_common_cols(afor_df, snpinfo_df_all)
afor_snpinfo_dfs = pd.merge(afor_df, snpinfo_df_all, on = merging_cols_m5, how = l_join)
# finding mutations lacking meta data
foo = afor_df[~afor_df['mutation'].isin(snpinfo_df_all['mutation'])]
foo1 = afor_df[afor_df['mutation'].isin(snpinfo_df_all['mutation'])]
bar = snpinfo_df_all[~snpinfo_df_all['mutation'].isin(afor_df['mutation'])]
bar1 = snpinfo_df_all[snpinfo_df_all['mutation'].isin(afor_df['mutation'])]
# checks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# afor_df
if afor_df['mutation'].shape[0] == afor_df['mutation'].nunique():
print('No duplicate mutations detected in afor_df')
else:
print('Dropping duplicate mutations detected in afor_df')
afor_df = afor_df.drop_duplicates(subset = 'mutation', keep = 'first')
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# finding mutations lacking meta data
# FIXME: should get fixmed with JP's resolved dataset!?
print('There are', len(afor_df[~afor_df['mutation'].isin(snpinfo_df_all['mutation'])])
, 'mutations with various or calculated that have no additional info...STRANGE'
, 'Reported to Jody on 14 july 2020 on skype!')
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
foo = afor_df[~afor_df['mutation'].isin(snpinfo_df_all['mutation'])]
foo1 = afor_df[afor_df['mutation'].isin(snpinfo_df_all['mutation'])]
# snpinfo_df_all
ndups = 0
if not snpinfo_df_all['mutation'].shape[0] == snpinfo_df_all['mutation'].nunique():
ndups = snpinfo_df_all['mutation'].duplicated().sum()
print(ndups, 'duplicated muts detected in snpinfo_df_all.'
, '\nHowever these may have different nucleotide changes. Checking further...')
#expected_nrows = afor_df.shape[0] + ndups
cols_to_check = ['mutation', 'mutationinformation', 'ref_allele', 'alt_allele']
if snpinfo_df_all.duplicated(subset = cols_to_check).sum() == 0:
print('No *REAL* duplicate muts detected in snpinfo_df_all'
, '\nDim of df:', snpinfo_df_all.shape)
snpinfo_df_all = snpinfo_df_all.copy()
else:
print(snpinfo_df_all.duplicated(subset = cols_to_check).sum()
, ' Actual duplicate mutations detected in snpinfo_df_all')
dup_muts = snpinfo_df_all[['mutation', 'mutationinformation']][snpinfo_df_all.duplicated(subset = cols_to_check)]
print(len(dup_muts), 'duplicated mutation detected'
, '\nDropping duplicated mutations before merging')
snpinfo_df_all = snpinfo_df_all.drop_duplicates(subset = cols_to_check, keep = 'first')
print('Dim of df after removing duplicates:', snpinfo_df_all.shape)
if len(afor_snpinfo_dfs) == afor_df.shape[0] + ndups:
print('PASS: succesfully combined with left join')
else:
print('FAIL: unsuccessful merge'
, '\nDim of df1:', afor_df.shape
, '\nDim of df2:', snpinfo_df_all.shape)
sys.exit()
print('\nResult of Fifth merge:', afor_snpinfo_dfs.shape
, '\n===================================================================')
#%%============================================================================
print('==================================='
, '\nSixth merge: fifth merge + afor_kin_df'
, '\nafor_snpinfo_dfs + afor_kin_df'
, '\n===================================')
afor_kin_df = pd.read_csv(infile_afor_kin, sep = ',')
afor_kin_df.columns = afor_kin_df.columns.str.lower()
#ors_df = combine_dfs_with_checks(afor_snpinfo_dfs, afor_kin_df, my_join = o_join)
merging_cols_m6 = detect_common_cols(afor_snpinfo_dfs, afor_kin_df)
print('Dim of df1:', afor_snpinfo_dfs.shape
, '\nDim of df2:', afor_kin_df.shape
, '\nNo. of merging_cols:', len(merging_cols_m6))
ors_df = pd.merge(afor_snpinfo_dfs, afor_kin_df, on = merging_cols_m6, how = o_join)
# Dropping unncessary columns
cols_to_drop = ['reference_allele', 'alternate_allele', 'symbol' ]
merging_cols_m5 = detect_common_cols(afor_df, afor_kin_df)
print('Dim of afor_df:', afor_df.shape
, '\nDim of afor_kin_df:', afor_kin_df.shape)
# finding if ALL afor_kin_df muts are present in afor_df
# i.e all kinship muts should be PRESENT in mycalcs_present
if len(afor_kin_df[afor_kin_df['mutation'].isin(afor_df['mutation'])]) == afor_kin_df.shape[0]:
print('PASS: ALL or_kinship muts are present in my or list')
else:
nf_muts = len(afor_kin_df[~afor_kin_df['mutation'].isin(afor_df['mutation'])])
nf_muts_df = afor_kin_df[~afor_kin_df['mutation'].isin(afor_df['mutation'])]
print('FAIL:', nf_muts, 'muts present in afor_kin_df NOT present in my or list'
, '\nsee "nf_muts_df" created containing not found(nf) muts')
sys.exit()
# Now checking how many afor_df muts are NOT present in afor_kin_df
common_muts = len(afor_df[afor_df['mutation'].isin(afor_kin_df['mutation'])])
extra_muts_myor = afor_kin_df.shape[0] - common_muts
print('=========================================='
, '\nmy or calcs', extra_muts_myor, 'extra mutation\n'
, '\n==========================================')
print('Expected cals for merging with outer_join...')
expected_rows = afor_df.shape[0] + extra_muts_myor
expected_cols = afor_df.shape[1] + afor_kin_df.shape[1] - len(merging_cols_m5)
ors_df = pd.merge(afor_df, afor_kin_df, on = merging_cols_m5, how = o_join)
if ors_df.shape[0] == expected_rows and ors_df.shape[1] == expected_cols:
print('PASS: OR dfs successfully combined! PHEWWWW!')
else:
print('FAIL: could not combine OR dfs'
, '\nCheck expected rows and cols calculation and join type')
print('Dim of merged ors_df:', ors_df.shape)
#%%============================================================================
# formatting ors_df
ors_df.columns
# Dropping unncessary columns: already removed in ealier preprocessing
#cols_to_drop = ['reference_allele', 'alternate_allele', 'symbol' ]
cols_to_drop = ['n_miss']
print('Dropping', len(cols_to_drop), 'columns:\n'
, cols_to_drop)
ors_df.drop(cols_to_drop, axis = 1, inplace = True)
@ -314,14 +283,16 @@ column_order = ['mutation'
, 'wild_type'
, 'position'
, 'mutant_type'
, 'chr_num_allele'
#, 'chr_num_allele' #old
, 'ref_allele'
, 'alt_allele'
, 'mut_info'
, 'mut_info_f1'
, 'mut_info_f2'
, 'mut_type'
, 'gene_id'
, 'gene_number'
, 'mut_region'
#, 'gene_number' #old
, 'gene_name'
#, 'mut_region'
#, 'reference_allele'
#, 'alternate_allele'
, 'chromosome_number'
@ -347,14 +318,16 @@ column_order = ['mutation'
, 'zval_logistic'
, 'logl_h1_kin'
, 'l_remle_kin'
, 'wt_3let'
, 'mt_3let'
#, 'wt_3let' # old
#, 'mt_3let' # old
#, 'symbol'
, 'n_miss']
#, 'n_miss'
]
if len(column_order) == ors_df.shape[1] == len(DataFrame(column_order).isin(ors_df.columns)):
print('PASS: Column order generated for all columns in df', len(column_order), 'columns'
, '\nApplying column order to df...' )
if ( (len(column_order) == ors_df.shape[1]) and (DataFrame(column_order).isin(ors_df.columns).all().all()):
print('PASS: Column order generated for all:', len(column_order), 'columns'
, '\nColumn names match, safe to reorder columns'
, '\nApplying column order to df...' )
ors_df_ordered = ors_df[column_order]
else:
print('FAIL: Mismatch in no. of cols to reorder'
@ -366,15 +339,15 @@ print('\nResult of Sixth merge:', ors_df_ordered.shape
, '\n===================================================================')
#%%============================================================================
print('==================================='
, '\nSeventh merge: Fourth + Sixth merge'
, '\nSixth merge: Fourth + Fifth merge'
, '\ncombined_df + ors_df_ordered'
, '\n===================================')
#combined_df_all = combine_dfs_with_checks(combined_df, ors_df_ordered, my_join = i_join)
merging_cols_m7 = detect_common_cols(combined_df, ors_df_ordered)
merging_cols_m6 = detect_common_cols(combined_df, ors_df_ordered)
print('Dim of df1:', combined_df.shape
, '\nDim of df2:', ors_df_ordered.shape
, '\nNo. of merging_cols:', len(merging_cols_m7))
, '\nNo. of merging_cols:', len(merging_cols_m6))
print('Checking mutations in the two dfs:'
, '\nmuts in df1 but NOT in df2:'
@ -385,13 +358,13 @@ print('Checking mutations in the two dfs:'
#print('\nNo. of common muts:', np.intersect1d(combined_df['mutationinformation'], ors_df_ordered['mutationinformation']) )
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
combined_df_all = pd.merge(combined_df, ors_df, on = merging_cols_m7, how = l_join)
#combined_df_all.shape
combined_df_all = pd.merge(combined_df, ors_df, on = merging_cols_m6, how = l_join)
combined_df_all.shape
# FIXME: DIM
# only with left join!
outdf_expected_rows = len(combined_df)
outdf_expected_cols = len(combined_df.columns) + len(ors_df_ordered.columns) - len(merging_cols_m7)
outdf_expected_cols = len(combined_df.columns) + len(ors_df_ordered.columns) - len(merging_cols_m6)
#if combined_df_all.shape[1] == outdf_expected_cols and combined_df_all.shape[0] == outdf_expected_rows:
if combined_df_all.shape[1] == outdf_expected_cols and combined_df_all['mutationinformation'].nunique() == outdf_expected_rows:
@ -426,8 +399,6 @@ combined_df_all.to_csv(outfile_comb, index = False)
print('\nFinished writing file:'
, '\nNo. of rows:', combined_df_all.shape[0]
, '\nNo. of cols:', combined_df_all.shape[1])
#=======================================================================
#%% incase you FIX the the function: combine_dfs_with_checks
#def main():

View file

@ -261,6 +261,20 @@ else:
sys.exit()
del(df_ncols, ncols_add)
#%% now adding mutation style = <gene>_p.abc1cde
dfm2_mis['mutation'] = gene.lower() + '_' + dfm2_mis['mut_info_f2'].astype(str)
# convert to lowercase
dfm2_mis['mutation'] = dfm2_mis['mutation'].str.lower()
# quick sanity check
check = dfm2_mis['mutation'].value_counts().value_counts() == dfm2_mis['mut_info_f2'].value_counts().value_counts()
if check.all():
print('PASS: added column "mutation" containing mutation format: <gene>_p.abc1cde')
else:
print('FAIL: could not add "mutation" column!')
sys.exit()
#%% Calculating OR from beta coeff
print('Calculating OR...')
df_ncols = dfm2_mis.shape[1]
@ -364,7 +378,7 @@ print('Reordering', dfm2_mis.shape[1], 'columns'
#dfm2_mis.columns
column_order = [#'mutation',
column_order = ['mutation',
'mutationinformation',
'wild_type',
'position',