#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Aug 6 12:56:03 2019

@author: tanu
'''
# FIXME: change filename 2 (mcsm normalised data) to be consistent like
# (pnca_complex_mcsm_norm.csv): changed manually, but ensure this is done
# in the mcsm pipeline
#=======================================================================
# Task: combine 2 dfs with aa position as linking column
#
# Input: 2 dfs
#        <gene>_complex_mcsm_norm.csv
#        <gene>_foldx.csv
#
# Output: .csv of all 2 dfs combined
#
# useful link
# https://stackoverflow.com/questions/23668427/pandas-three-way-joining-multiple-dataframes-on-columns
#=======================================================================
#%% load packages
import sys
import os
import pandas as pd
import numpy as np
import argparse
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')

# set working dir so the local imports below resolve
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
os.getcwd()

# local imports
from combining_dfs import combine_dfs_with_checks
from combining_dfs import detect_common_cols

#=======================================================================
#%% command line args (hardcoded below for now; uncomment to use argparse)
#arg_parser = argparse.ArgumentParser()
#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide')
#arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive
#args = arg_parser.parse_args()

#=======================================================================
#%% variable assignment: input and output
drug = 'pyrazinamide'
gene = 'pncA'
gene_match = gene + '_p.'
#drug = args.drug
#gene = args.gene

#======
# dirs
#======
datadir = f'{homedir}/git/Data'
indir = f'{datadir}/{drug}/input'
outdir = f'{datadir}/{drug}/output'

#=======
# input
#=======
# per-gene input filenames (all lowercase gene prefix)
gene_lc = gene.lower()
in_filename_mcsm = f'{gene_lc}_complex_mcsm_norm.csv'
in_filename_foldx = f'{gene_lc}_foldx.csv'
in_filename_dssp = f'{gene_lc}_dssp.csv'
in_filename_kd = f'{gene_lc}_kd.csv'
in_filename_rd = f'{gene_lc}_rd.csv'
in_filename_snpinfo = f'ns{gene_lc}_snp_info.csv'
in_filename_afor = f'{gene_lc}_af_or.csv'
in_filename_afor_kin = f'{gene_lc}_af_or_kinship.csv'

# full input paths: snp info lives in indir, everything else in outdir
infile_mcsm = f'{outdir}/{in_filename_mcsm}'
infile_foldx = f'{outdir}/{in_filename_foldx}'
infile_dssp = f'{outdir}/{in_filename_dssp}'
infile_kd = f'{outdir}/{in_filename_kd}'
infile_rd = f'{outdir}/{in_filename_rd}'
infile_snpinfo = f'{indir}/{in_filename_snpinfo}'
infile_afor = f'{outdir}/{in_filename_afor}'
infile_afor_kin = f'{outdir}/{in_filename_afor_kin}'

print('\nInput path:', outdir
      , '\nInput filename mcsm:', infile_mcsm
      , '\nInput filename foldx:', infile_foldx
      , '\nInput filename dssp:', infile_dssp
      , '\nInput filename kd:', infile_kd
      , '\nInput filename rd', infile_rd
      , '\nInput filename snp info:', infile_snpinfo
      , '\nInput filename af or:', infile_afor
      , '\nInput filename afor kinship:', infile_afor_kin
      , '\n============================================================')

#=======
# output
#=======
out_filename_comb = f'{gene_lc}_all_params.csv'
outfile_comb = f'{outdir}/{out_filename_comb}'
print('Output filename:', outfile_comb
      , '\n============================================================')

# join-type shorthands used by the merge pipeline below
o_join = 'outer'
l_join = 'left'
r_join = 'right'
i_join = 'inner'

# end of variable assignment for input and output files
#=======================================================================
# call function to detect common cols
# FIXME: do the OR combining in the end to iron out any problems
# Couldn't run the function combin
#=======================================================================
def main():
    """Driver stub: the combine pipeline currently runs at module level below."""
    print('Reading input files:')

#if __name__ == '__main__':
#    main()
#=======================================================================
#%% end of script
# hardcoded test: run the full merge pipeline at module level

# mcsm: column names lowercased so detect_common_cols finds the shared keys
mcsm_df = pd.read_csv(infile_mcsm, sep = ',')
mcsm_df.columns = mcsm_df.columns.str.lower()
foldx_df = pd.read_csv(infile_foldx, sep = ',')

print('==================================='
      , '\nFirst merge: mcsm + foldx'
      , '\n===================================')
merging_cols_m1 = detect_common_cols(mcsm_df, foldx_df)
mcsm_foldx_dfs = pd.merge(mcsm_df, foldx_df, on = merging_cols_m1, how = 'outer')
ncols_m1 = len(mcsm_foldx_dfs.columns)

print('==================================='
      , '\nSecond merge: dssp + kd'
      , '\n===================================')
dssp_df = pd.read_csv(infile_dssp, sep = ',')
kd_df = pd.read_csv(infile_kd, sep = ',')
rd_df = pd.read_csv(infile_rd, sep = ',')
merging_cols_m2 = detect_common_cols(dssp_df, kd_df)
dssp_kd_dfs = pd.merge(dssp_df, kd_df, on = merging_cols_m2, how = 'outer')

print('==================================='
      , '\nThird merge: dssp_kd_dfs + rd_df'
      , '\n===================================')
# BUG FIX: previously this detected the common cols of (dssp_df, kd_df),
# but the merge below joins dssp_kd_dfs with rd_df — detect cols of those two.
merging_cols_m3 = detect_common_cols(dssp_kd_dfs, rd_df)
dssp_kd_rd_dfs = pd.merge(dssp_kd_dfs, rd_df, on = merging_cols_m3, how = 'outer')
ncols_m3 = len(dssp_kd_rd_dfs.columns)

print('==================================='
      , '\nFourth merge: First merge + Third merge'
      , '\n===================================')
merging_cols_m4 = detect_common_cols(mcsm_foldx_dfs, dssp_kd_rd_dfs)
# expected width of the combined df: both widths minus the shared key cols
combined_df_expected_cols = ncols_m1 + ncols_m3 - len(merging_cols_m4)
combined_df = pd.merge(mcsm_foldx_dfs, dssp_kd_rd_dfs, on = merging_cols_m4, how = 'inner')
if len(combined_df) == len(mcsm_df) and len(combined_df.columns) == combined_df_expected_cols:
    print('PASS: successfully combined 5 dfs'
          , '\nnrows combined_df:', len(combined_df)
          , '\ncols combined_df:', len(combined_df.columns))
else:
    sys.exit('FAIL: check individual df merges')

#%% OR combining
afor_df = pd.read_csv(infile_afor, sep = ',')
afor_df.columns = afor_df.columns.str.lower()
# one row per mutation expected; drop dups to keep the merge keys unique
if afor_df['mutation'].shape[0] == afor_df['mutation'].nunique():
    print('No duplicate muts detected in afor_df')
else:
    print('Dropping duplicate muts detected in afor_df')
    afor_df = afor_df.drop_duplicates(subset = 'mutation', keep = 'first')

snpinfo_df_all = pd.read_csv(infile_snpinfo, sep = ',')
# only the two columns needed to link 'mutation' -> 'mutationinformation'
snpinfo_df = snpinfo_df_all[['mutation', 'mutationinformation']]
if snpinfo_df['mutation'].shape[0] == snpinfo_df['mutation'].nunique():
    print('No duplicate muts detected in snpinfo_df')
else:
    dups = snpinfo_df['mutation'].duplicated().sum()
    print( dups, 'Duplicate muts detected in snpinfo_df'
          , '\nDim:', snpinfo_df.shape)
    print('Dropping duplicate muts')
    snpinfo_df = snpinfo_df.drop_duplicates(subset = 'mutation', keep = 'first')
    print('Dim:', snpinfo_df.shape)

print('==================================='
      , '\nFifth merge: afor_df + snpinfo_df'
      , '\n===================================')
merging_cols_m5 = detect_common_cols(afor_df, snpinfo_df)
afor_snpinfo_dfs = pd.merge(afor_df, snpinfo_df, on = merging_cols_m5, how = 'left')
# left join must preserve every afor_df row
if len(afor_snpinfo_dfs) == afor_df.shape[0]:
    print('PASS: succesfully combined with left join')
else:
    sys.exit('FAIL: unsuccessful merge')

#%%
afor_kin_df = pd.read_csv(infile_afor_kin, sep = ',')
afor_kin_df.columns = afor_kin_df.columns.str.lower()
print('==================================='
      , '\nSixth merge: afor_snpinfo_dfs + afor_kin_df'
      , '\n===================================')
merging_cols_m6 = detect_common_cols(afor_snpinfo_dfs, afor_kin_df)
print('Dim of df1:', afor_snpinfo_dfs.shape
      , '\nDim of df2:', afor_kin_df.shape
      , '\nno. of merging_cols:', len(merging_cols_m6))
ors_df = pd.merge(afor_snpinfo_dfs, afor_kin_df, on = merging_cols_m6, how = 'outer')
print('Dim of ors_df:', ors_df.shape)

#%%
print('==================================='
      , '\nSeventh merge: combined_df + ors_df'
      , '\n===================================')
merging_cols_m7 = detect_common_cols(combined_df, ors_df)
print('Dim of df1:', combined_df.shape
      , '\nDim of df2:', ors_df.shape
      , '\nno. of merging_cols:', len(merging_cols_m7))
# BUG FIX: the labels say "NOT in", but .isin(...).sum() counts the muts that
# ARE in the other df — negate with ~ so the counts match the labels.
print('checking mutations in the two dfs:'
      , '\nmuts in df1 but NOT in df2:'
      , (~combined_df['mutationinformation'].isin(ors_df['mutationinformation'])).sum()
      , 'muts in df2 but NOT in df1:'
      , (~ors_df['mutationinformation'].isin(combined_df['mutationinformation'])).sum())
#print('\nNo. of common muts:', np.intersect1d(combined_df['mutationinformation'], ors_df['mutationinformation']))

#combined_df_all = pd.merge(combined_df, ors_df, on = merging_cols_m7, how = 'outer') # FIXME
combined_df_all = pd.merge(combined_df, ors_df, on = merging_cols_m7, how = 'left')

# left join: rows preserved from combined_df; cols are both widths minus keys
outdf_expected_rows = len(combined_df)
outdf_expected_cols = len(combined_df.columns) + len(ors_df.columns) - len(merging_cols_m7)

print('\nDim of combined_df_all:', combined_df_all.shape)
if combined_df_all.shape[1] == outdf_expected_cols:
    print('combined_df has expected no. of cols')
if combined_df_all.shape[0] == outdf_expected_rows:
    print('combined_df has expected no. of rows')
else:
    print('WARNING: nrows discrepancy noted'
          , '\nFIX IT')
print('thing finished')

#%% write csv
combined_df_all.to_csv(outfile_comb, index = False)