updated script to combine dfs
This commit is contained in:
parent
841d18d10b
commit
acd0b8355b
3 changed files with 98 additions and 112 deletions
|
@ -31,6 +31,7 @@ if(is.null(drug)|is.null(gene)) {
|
|||
#options(scipen = 999) #disabling scientific notation in R.
|
||||
#========================================================
|
||||
#%% variable assignment: input and output paths & filenames
|
||||
|
||||
gene_match = paste0(gene,'_p.')
|
||||
cat(gene_match)
|
||||
|
||||
|
@ -46,7 +47,7 @@ outdir = paste0(datadir, '/', drug, '/', 'output')
|
|||
#===========
|
||||
# input file 1: master data
|
||||
#in_filename_master = 'original_tanushree_data_v2.csv' #19K
|
||||
in_filename_master = 'mtb_gwas_meta_v3.csv' #33k
|
||||
in_filename_master = 'mtb_gwas_meta_v6.csv' #35k
|
||||
infile_master = paste0(datadir, '/', in_filename_master)
|
||||
cat(paste0('Reading infile1: raw data', ' ', infile_master) )
|
||||
|
||||
|
@ -324,7 +325,7 @@ x = sapply(snps,function(m){
|
|||
#%%======================================================
|
||||
# Writing file with calculated ORs and AFs
|
||||
cat(paste0('writing output file: '
|
||||
, '\nFilen: ', outfile_af_or))
|
||||
, '\nFile: ', outfile_af_or))
|
||||
|
||||
write.csv(ors_df, outfile_af_or
|
||||
, row.names = F)
|
||||
|
|
|
@ -71,7 +71,7 @@ args = arg_parser.parse_args()
|
|||
#%% variable assignment: input and output
|
||||
#drug = 'pyrazinamide'
|
||||
#gene = 'pncA'
|
||||
#gene_match = gene + '_p.'
|
||||
gene_match = gene + '_p.'
|
||||
|
||||
drug = args.drug
|
||||
gene = args.gene
|
||||
|
@ -99,7 +99,7 @@ in_filename_foldx = gene.lower() + '_foldx.csv'
|
|||
in_filename_dssp = gene.lower() + '_dssp.csv'
|
||||
in_filename_kd = gene.lower() + '_kd.csv'
|
||||
in_filename_rd = gene.lower() + '_rd.csv'
|
||||
in_filename_snpinfo = 'ns' + gene.lower() + '_snp_info.csv'
|
||||
in_filename_snpinfo = 'ns' + gene.lower() + '_snp_info_f.csv'
|
||||
in_filename_afor = gene.lower() + '_af_or.csv'
|
||||
in_filename_afor_kin = gene.lower() + '_af_or_kinship.csv'
|
||||
|
||||
|
@ -109,19 +109,18 @@ infile_foldx = outdir + '/' + in_filename_foldx
|
|||
infile_dssp = outdir + '/' + in_filename_dssp
|
||||
infile_kd = outdir + '/' + in_filename_kd
|
||||
infile_rd = outdir + '/' + in_filename_rd
|
||||
infile_snpinfo = indir + '/' + in_filename_snpinfo
|
||||
infile_snpinfo = outdir + '/' + in_filename_snpinfo
|
||||
infile_afor = outdir + '/' + in_filename_afor
|
||||
infile_afor_kin = outdir + '/' + in_filename_afor_kin
|
||||
|
||||
|
||||
print('\nInput path:', indir
|
||||
, '\nOutput path:', outdir
|
||||
, '\nOutput path:', outdir, '\n'
|
||||
, '\nInput filename mcsm:', infile_mcsm
|
||||
, '\nInput filename foldx:', infile_foldx
|
||||
, '\nInput filename foldx:', infile_foldx, '\n'
|
||||
, '\nInput filename dssp:', infile_dssp
|
||||
, '\nInput filename kd:', infile_kd
|
||||
, '\nInput filename rd', infile_rd
|
||||
, '\nInput filename snp info:', infile_snpinfo
|
||||
, '\nInput filename rd', infile_rd , '\n'
|
||||
, '\nInput filename snp info:', infile_snpinfo, '\n'
|
||||
, '\nInput filename af or:', infile_afor
|
||||
, '\nInput filename afor kinship:', infile_afor_kin
|
||||
, '\n============================================================')
|
||||
|
@ -208,99 +207,69 @@ print('\nResult of Fourth merge:', combined_df.shape
|
|||
|
||||
# OR merges: TEDIOUSSSS!!!!
|
||||
|
||||
#%%============================================================================
|
||||
|
||||
#%%RRRR
|
||||
print('==================================='
|
||||
, '\nFifth merge: afor_df + snpinfo_df'
|
||||
, '\nFifth merge: afor_df + afor_kin_df'
|
||||
, '\n===================================')
|
||||
|
||||
# OR combining
|
||||
afor_df = pd.read_csv(infile_afor, sep = ',')
|
||||
#afor_df.columns = afor_df.columns.str.lower()
|
||||
|
||||
snpinfo_df_all = pd.read_csv(infile_snpinfo, sep = ',')
|
||||
#snpinfo_df_all.columns = snpinfo_df_all.columns.str.lower()
|
||||
|
||||
#afor_snpinfo_dfs = combine_dfs_with_checks(afor_df, snpinfo_df_all, my_join = i_join)
|
||||
merging_cols_m5 = detect_common_cols(afor_df, snpinfo_df_all)
|
||||
afor_snpinfo_dfs = pd.merge(afor_df, snpinfo_df_all, on = merging_cols_m5, how = l_join)
|
||||
|
||||
# finding mutations lacking meta data
|
||||
foo = afor_df[~afor_df['mutation'].isin(snpinfo_df_all['mutation'])]
|
||||
foo1 = afor_df[afor_df['mutation'].isin(snpinfo_df_all['mutation'])]
|
||||
|
||||
bar = snpinfo_df_all[~snpinfo_df_all['mutation'].isin(afor_df['mutation'])]
|
||||
bar1 = snpinfo_df_all[snpinfo_df_all['mutation'].isin(afor_df['mutation'])]
|
||||
|
||||
# checks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
# afor_df
|
||||
if afor_df['mutation'].shape[0] == afor_df['mutation'].nunique():
|
||||
print('No duplicate mutations detected in afor_df')
|
||||
else:
|
||||
print('Dropping duplicate mutations detected in afor_df')
|
||||
afor_df = afor_df.drop_duplicates(subset = 'mutation', keep = 'first')
|
||||
|
||||
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
# finding mutations lacking meta data
|
||||
# FIXME: should get fixed with JP's resolved dataset!?
|
||||
print('There are', len(afor_df[~afor_df['mutation'].isin(snpinfo_df_all['mutation'])])
|
||||
, 'mutations with various or calculated that have no additional info...STRANGE'
|
||||
, 'Reported to Jody on 14 july 2020 on skype!')
|
||||
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
|
||||
foo = afor_df[~afor_df['mutation'].isin(snpinfo_df_all['mutation'])]
|
||||
foo1 = afor_df[afor_df['mutation'].isin(snpinfo_df_all['mutation'])]
|
||||
|
||||
# snpinfo_df_all
|
||||
ndups = 0
|
||||
if not snpinfo_df_all['mutation'].shape[0] == snpinfo_df_all['mutation'].nunique():
|
||||
ndups = snpinfo_df_all['mutation'].duplicated().sum()
|
||||
print(ndups, 'duplicated muts detected in snpinfo_df_all.'
|
||||
, '\nHowever these may have different nucleotide changes. Checking further...')
|
||||
#expected_nrows = afor_df.shape[0] + ndups
|
||||
cols_to_check = ['mutation', 'mutationinformation', 'ref_allele', 'alt_allele']
|
||||
|
||||
if snpinfo_df_all.duplicated(subset = cols_to_check).sum() == 0:
|
||||
print('No *REAL* duplicate muts detected in snpinfo_df_all'
|
||||
, '\nDim of df:', snpinfo_df_all.shape)
|
||||
snpinfo_df_all = snpinfo_df_all.copy()
|
||||
else:
|
||||
print(snpinfo_df_all.duplicated(subset = cols_to_check).sum()
|
||||
, ' Actual duplicate mutations detected in snpinfo_df_all')
|
||||
dup_muts = snpinfo_df_all[['mutation', 'mutationinformation']][snpinfo_df_all.duplicated(subset = cols_to_check)]
|
||||
print(len(dup_muts), 'duplicated mutation detected'
|
||||
, '\nDropping duplicated mutations before merging')
|
||||
snpinfo_df_all = snpinfo_df_all.drop_duplicates(subset = cols_to_check, keep = 'first')
|
||||
print('Dim of df after removing duplicates:', snpinfo_df_all.shape)
|
||||
|
||||
|
||||
if len(afor_snpinfo_dfs) == afor_df.shape[0] + ndups:
|
||||
print('PASS: succesfully combined with left join')
|
||||
else:
|
||||
print('FAIL: unsuccessful merge'
|
||||
, '\nDim of df1:', afor_df.shape
|
||||
, '\nDim of df2:', snpinfo_df_all.shape)
|
||||
sys.exit()
|
||||
|
||||
print('\nResult of Fifth merge:', afor_snpinfo_dfs.shape
|
||||
, '\n===================================================================')
|
||||
#%%============================================================================
|
||||
print('==================================='
|
||||
, '\nSixth merge: fifth merge + afor_kin_df'
|
||||
, '\nafor_snpinfo_dfs + afor_kin_df'
|
||||
, '\n===================================')
|
||||
afor_kin_df = pd.read_csv(infile_afor_kin, sep = ',')
|
||||
afor_kin_df.columns = afor_kin_df.columns.str.lower()
|
||||
|
||||
#ors_df = combine_dfs_with_checks(afor_snpinfo_dfs, afor_kin_df, my_join = o_join)
|
||||
merging_cols_m6 = detect_common_cols(afor_snpinfo_dfs, afor_kin_df)
|
||||
print('Dim of df1:', afor_snpinfo_dfs.shape
|
||||
, '\nDim of df2:', afor_kin_df.shape
|
||||
, '\nNo. of merging_cols:', len(merging_cols_m6))
|
||||
|
||||
ors_df = pd.merge(afor_snpinfo_dfs, afor_kin_df, on = merging_cols_m6, how = o_join)
|
||||
|
||||
# Dropping unnecessary columns
|
||||
cols_to_drop = ['reference_allele', 'alternate_allele', 'symbol' ]
|
||||
merging_cols_m5 = detect_common_cols(afor_df, afor_kin_df)
|
||||
|
||||
print('Dim of afor_df:', afor_df.shape
|
||||
, '\nDim of afor_kin_df:', afor_kin_df.shape)
|
||||
|
||||
# finding if ALL afor_kin_df muts are present in afor_df
|
||||
# i.e all kinship muts should be PRESENT in mycalcs_present
|
||||
if len(afor_kin_df[afor_kin_df['mutation'].isin(afor_df['mutation'])]) == afor_kin_df.shape[0]:
|
||||
print('PASS: ALL or_kinship muts are present in my or list')
|
||||
else:
|
||||
nf_muts = len(afor_kin_df[~afor_kin_df['mutation'].isin(afor_df['mutation'])])
|
||||
nf_muts_df = afor_kin_df[~afor_kin_df['mutation'].isin(afor_df['mutation'])]
|
||||
print('FAIL:', nf_muts, 'muts present in afor_kin_df NOT present in my or list'
|
||||
, '\nsee "nf_muts_df" created containing not found(nf) muts')
|
||||
sys.exit()
|
||||
|
||||
|
||||
# Now checking how many afor_df muts are NOT present in afor_kin_df
|
||||
common_muts = len(afor_df[afor_df['mutation'].isin(afor_kin_df['mutation'])])
|
||||
extra_muts_myor = afor_kin_df.shape[0] - common_muts
|
||||
|
||||
print('=========================================='
|
||||
, '\nmy or calcs', extra_muts_myor, 'extra mutation\n'
|
||||
, '\n==========================================')
|
||||
|
||||
print('Expected cals for merging with outer_join...')
|
||||
|
||||
expected_rows = afor_df.shape[0] + extra_muts_myor
|
||||
expected_cols = afor_df.shape[1] + afor_kin_df.shape[1] - len(merging_cols_m5)
|
||||
|
||||
ors_df = pd.merge(afor_df, afor_kin_df, on = merging_cols_m5, how = o_join)
|
||||
|
||||
if ors_df.shape[0] == expected_rows and ors_df.shape[1] == expected_cols:
|
||||
print('PASS: OR dfs successfully combined! PHEWWWW!')
|
||||
else:
|
||||
print('FAIL: could not combine OR dfs'
|
||||
, '\nCheck expected rows and cols calculation and join type')
|
||||
|
||||
print('Dim of merged ors_df:', ors_df.shape)
|
||||
#%%============================================================================
|
||||
# formatting ors_df
|
||||
|
||||
ors_df.columns
|
||||
|
||||
|
||||
# Dropping unnecessary columns: already removed in earlier preprocessing
|
||||
#cols_to_drop = ['reference_allele', 'alternate_allele', 'symbol' ]
|
||||
cols_to_drop = ['n_miss']
|
||||
print('Dropping', len(cols_to_drop), 'columns:\n'
|
||||
, cols_to_drop)
|
||||
ors_df.drop(cols_to_drop, axis = 1, inplace = True)
|
||||
|
@ -314,14 +283,16 @@ column_order = ['mutation'
|
|||
, 'wild_type'
|
||||
, 'position'
|
||||
, 'mutant_type'
|
||||
, 'chr_num_allele'
|
||||
#, 'chr_num_allele' #old
|
||||
, 'ref_allele'
|
||||
, 'alt_allele'
|
||||
, 'mut_info'
|
||||
, 'mut_info_f1'
|
||||
, 'mut_info_f2'
|
||||
, 'mut_type'
|
||||
, 'gene_id'
|
||||
, 'gene_number'
|
||||
, 'mut_region'
|
||||
#, 'gene_number' #old
|
||||
, 'gene_name'
|
||||
#, 'mut_region'
|
||||
#, 'reference_allele'
|
||||
#, 'alternate_allele'
|
||||
, 'chromosome_number'
|
||||
|
@ -347,14 +318,16 @@ column_order = ['mutation'
|
|||
, 'zval_logistic'
|
||||
, 'logl_h1_kin'
|
||||
, 'l_remle_kin'
|
||||
, 'wt_3let'
|
||||
, 'mt_3let'
|
||||
#, 'wt_3let' # old
|
||||
#, 'mt_3let' # old
|
||||
#, 'symbol'
|
||||
, 'n_miss']
|
||||
#, 'n_miss'
|
||||
]
|
||||
|
||||
if len(column_order) == ors_df.shape[1] == len(DataFrame(column_order).isin(ors_df.columns)):
|
||||
print('PASS: Column order generated for all columns in df', len(column_order), 'columns'
|
||||
, '\nApplying column order to df...' )
|
||||
if ( (len(column_order) == ors_df.shape[1]) and (DataFrame(column_order).isin(ors_df.columns).all().all()):
|
||||
print('PASS: Column order generated for all:', len(column_order), 'columns'
|
||||
, '\nColumn names match, safe to reorder columns'
|
||||
, '\nApplying column order to df...' )
|
||||
ors_df_ordered = ors_df[column_order]
|
||||
else:
|
||||
print('FAIL: Mismatch in no. of cols to reorder'
|
||||
|
@ -366,15 +339,15 @@ print('\nResult of Sixth merge:', ors_df_ordered.shape
|
|||
, '\n===================================================================')
|
||||
#%%============================================================================
|
||||
print('==================================='
|
||||
, '\nSeventh merge: Fourth + Sixth merge'
|
||||
, '\nSixth merge: Fourth + Fifth merge'
|
||||
, '\ncombined_df + ors_df_ordered'
|
||||
, '\n===================================')
|
||||
|
||||
#combined_df_all = combine_dfs_with_checks(combined_df, ors_df_ordered, my_join = i_join)
|
||||
merging_cols_m7 = detect_common_cols(combined_df, ors_df_ordered)
|
||||
merging_cols_m6 = detect_common_cols(combined_df, ors_df_ordered)
|
||||
print('Dim of df1:', combined_df.shape
|
||||
, '\nDim of df2:', ors_df_ordered.shape
|
||||
, '\nNo. of merging_cols:', len(merging_cols_m7))
|
||||
, '\nNo. of merging_cols:', len(merging_cols_m6))
|
||||
|
||||
print('Checking mutations in the two dfs:'
|
||||
, '\nmuts in df1 but NOT in df2:'
|
||||
|
@ -385,13 +358,13 @@ print('Checking mutations in the two dfs:'
|
|||
#print('\nNo. of common muts:', np.intersect1d(combined_df['mutationinformation'], ors_df_ordered['mutationinformation']) )
|
||||
|
||||
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
combined_df_all = pd.merge(combined_df, ors_df, on = merging_cols_m7, how = l_join)
|
||||
#combined_df_all.shape
|
||||
combined_df_all = pd.merge(combined_df, ors_df, on = merging_cols_m6, how = l_join)
|
||||
combined_df_all.shape
|
||||
|
||||
# FIXME: DIM
|
||||
# only with left join!
|
||||
outdf_expected_rows = len(combined_df)
|
||||
outdf_expected_cols = len(combined_df.columns) + len(ors_df_ordered.columns) - len(merging_cols_m7)
|
||||
outdf_expected_cols = len(combined_df.columns) + len(ors_df_ordered.columns) - len(merging_cols_m6)
|
||||
|
||||
#if combined_df_all.shape[1] == outdf_expected_cols and combined_df_all.shape[0] == outdf_expected_rows:
|
||||
if combined_df_all.shape[1] == outdf_expected_cols and combined_df_all['mutationinformation'].nunique() == outdf_expected_rows:
|
||||
|
@ -426,8 +399,6 @@ combined_df_all.to_csv(outfile_comb, index = False)
|
|||
print('\nFinished writing file:'
|
||||
, '\nNo. of rows:', combined_df_all.shape[0]
|
||||
, '\nNo. of cols:', combined_df_all.shape[1])
|
||||
|
||||
|
||||
#=======================================================================
|
||||
#%% in case you FIX the function: combine_dfs_with_checks
|
||||
#def main():
|
||||
|
|
|
@ -261,6 +261,20 @@ else:
|
|||
sys.exit()
|
||||
|
||||
del(df_ncols, ncols_add)
|
||||
|
||||
#%% now adding mutation style = <gene>_p.abc1cde
|
||||
dfm2_mis['mutation'] = gene.lower() + '_' + dfm2_mis['mut_info_f2'].astype(str)
|
||||
# convert to lowercase
|
||||
dfm2_mis['mutation'] = dfm2_mis['mutation'].str.lower()
|
||||
|
||||
# quick sanity check
|
||||
check = dfm2_mis['mutation'].value_counts().value_counts() == dfm2_mis['mut_info_f2'].value_counts().value_counts()
|
||||
|
||||
if check.all():
|
||||
print('PASS: added column "mutation" containing mutation format: <gene>_p.abc1cde')
|
||||
else:
|
||||
print('FAIL: could not add "mutation" column!')
|
||||
sys.exit()
|
||||
#%% Calculating OR from beta coeff
|
||||
print('Calculating OR...')
|
||||
df_ncols = dfm2_mis.shape[1]
|
||||
|
@ -364,7 +378,7 @@ print('Reordering', dfm2_mis.shape[1], 'columns'
|
|||
|
||||
#dfm2_mis.columns
|
||||
|
||||
column_order = [#'mutation',
|
||||
column_order = ['mutation',
|
||||
'mutationinformation',
|
||||
'wild_type',
|
||||
'position',
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue