From 943513a338a6a1fe2ee38775527282f5e54bb139 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall
Date: Tue, 7 Jul 2020 16:06:11 +0100
Subject: [PATCH] added script to combine all files into one

---
 scripts/combine_struct_dfs.py | 298 -------------------------
 scripts/combining_FIXME.py    | 177 +++++++++++++++
 scripts/combining_dfs.py      | 405 ++++++++++++++++++++++------------
 scripts/combining_test.py     | 303 -------------------------
 4 files changed, 435 insertions(+), 748 deletions(-)
 delete mode 100755 scripts/combine_struct_dfs.py
 create mode 100755 scripts/combining_FIXME.py
 delete mode 100755 scripts/combining_test.py

diff --git a/scripts/combine_struct_dfs.py b/scripts/combine_struct_dfs.py
deleted file mode 100755
index 5dc21d5..0000000
--- a/scripts/combine_struct_dfs.py
+++ /dev/null
@@ -1,298 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-'''
-Created on Tue Aug 6 12:56:03 2019
-
-@author: tanu
-'''
-# FIXME: change filename 4 (mcsm normalised data)
-# to be consistent like (pnca_complex_mcsm_norm.csv) : changed manually, but ensure this is done in the mcsm pipeline
-#=======================================================================
-# Task: combine 4 dfs with aa position as linking column
-# This is done in 2 steps:
-# merge 1: of 3 dfs (filenames in lowercase)
-# _dssp.csv
-# _kd.csv
-# _rd.csv
-
-# merge 2: of 2 dfs
-# gene.lower() + '_complex_mcsm_norm.csv' (!fix name)
-# output df from merge1
-
-# Input: 3 dfs
-# _dssp.csv
-# _kd.csv
-# _rd.csv
-# gene.lower() + '_complex_mcsm_norm.csv' (!fix name)
-
-# Output: .csv of all 4 dfs combined
-
-# useful link
-# https://stackoverflow.com/questions/23668427/pandas-three-way-joining-multiple-dataframes-on-columns
-#=======================================================================
-#%% load packages
-import sys, os
-import pandas as pd
-#import numpy as np
-import argparse
-#=======================================================================
-#%% specify input and curr dir
-homedir = os.path.expanduser('~')
-
-# set working dir
-os.getcwd()
-os.chdir(homedir + '/git/LSHTM_analysis/scripts')
-os.getcwd()
-#=======================================================================
-#%% command line args
-arg_parser = argparse.ArgumentParser()
-arg_parser.add_argument('-d', '--drug', help='drug name', default = None)
-arg_parser.add_argument('-g', '--gene', help='gene name', default = None) # case sensitive
-args = arg_parser.parse_args()
-#=======================================================================
-#%% variable assignment: input and output
-#drug = 'pyrazinamide'
-#gene = 'pncA'
-#gene_match = gene + '_p.'
- -drug = args.drug -gene = args.gene -#========== -# data dir -#========== -datadir = homedir + '/' + 'git/Data' - -#======= -# input -#======= -indir = datadir + '/' + drug + '/' + 'output' -in_filename1 = gene.lower() + '_dssp.csv' -in_filename2 = gene.lower() + '_kd.csv' -in_filename3 = gene.lower() + '_rd.csv' -#in_filename4 = 'mcsm_complex1_normalised.csv' # FIXNAME -in_filename4 = gene.lower() + '_complex_mcsm_norm.csv' - -infile1 = indir + '/' + in_filename1 -infile2 = indir + '/' + in_filename2 -infile3 = indir + '/' + in_filename3 -infile4 = indir + '/' + in_filename4 - -print('\nInput path:', indir - , '\nInput filename1:', in_filename1 - , '\nInput filename2:', in_filename2 - , '\nInput filename3:', in_filename3 - , '\nInput filename4:', in_filename4 - , '\n===================================================================') - -#======= -# output -#======= -outdir = datadir + '/' + drug + '/' + 'output' -out_filename = gene.lower() + '_mcsm_struct_params.csv' -outfile = outdir + '/' + out_filename -print('Output filename:', out_filename - , '\nOutput path:', outdir - , '\n===================================================================') - -#%% end of variable assignment for input and output files -#======================================================================= -#%% function/methd to combine 4 dfs - -def combine_dfs(dssp_csv, kd_csv, rd_csv, mcsm_csv, out_combined_csv): - """ - Combine 4 dfs - - @param dssp_df: csv file (output from dssp_df.py) - @type dssp_df: string - - @param kd_df: csv file (output from kd_df.py) - @type ks_df: string - - @param rd_df: csv file (output from rd_df.py) - @type rd_df: string - - # FIXME - @param mcsm_df: csv file (output of mcsm pipeline)CHECK} - @type mcsm_df: string - - @param out_combined_csv: csv file output - @type out_combined_csv: string - - @return: none, writes combined df as csv - """ - #======================== - # read input csv files to combine - #======================== - dssp_df = pd.read_csv(dssp_csv, sep = ',') - kd_df = pd.read_csv(kd_csv, sep = ',') - rd_df = pd.read_csv(rd_csv, sep = ',') - mcsm_df = pd.read_csv(mcsm_csv, sep = ',') - - print('Reading input files:' - , '\ndssp file:', dssp_csv - , '\nNo. of rows:', len(dssp_df) - , '\nNo. of cols:', len(dssp_df.columns) - , '\nColumn names:', dssp_df.columns - , '\n===================================================================' - , '\nkd file:', kd_csv - , '\nNo. of rows:', len(kd_df) - , '\nNo. of cols:', len(kd_df.columns) - , '\nColumn names:', kd_df.columns - , '\n===================================================================' - , '\nrd file:', rd_csv - , '\nNo. of rows:', len(rd_df) - , '\nNo. of cols:', len(rd_df.columns) - , '\nColumn names:', rd_df.columns - , '\n===================================================================' - , '\nrd file:', mcsm_csv - , '\nNo. of rows:', len(mcsm_df) - , '\nNo. of cols:', len(mcsm_df.columns) - , '\nColumn names:', mcsm_df.columns - , '\n===================================================================') - - #======================== - # merge 1 (combined_df1) - # concatenating 3dfs: - # dssp_df, kd_df, rd_df - #======================== - print('starting first merge...\n') - - # checking no. of rows - print('Checking if no. 
of rows of the 3 dfs are equal:\n' - , len(dssp_df) == len(kd_df) == len(rd_df) - , '\nReason: fasta files and pdb files vary since not all pos are part of the structure' - , '\n===================================================================') - - # variables for sanity checks - expected_rows_df1 = max(len(dssp_df), len(kd_df), len(rd_df)) - # beware of harcoding! used for sanity check - ndfs = 3 - ncol_merge = 1 - offset = ndfs- ncol_merge - expected_cols_df1 = len(dssp_df.columns) + len(kd_df.columns) + len(rd_df.columns) - offset - - print('Merge 1:' - , '\ncombining 3dfs by commom col: position' - , '\nExpected nrows in combined_df:', expected_rows_df1 - , '\nExpected ncols in combined_df:', expected_cols_df1 - , '\nResetting the common col as the index' - , '\n===================================================================') - - #dssp_df.set_index('position', inplace = True) - #kd_df.set_index('position', inplace = True) - #rd_df.set_index('position', inplace =True) - - #combined_df = pd.concat([dssp_df, kd_df, rd_df], axis = 1, sort = False).reset_index() - #combined_df.rename(columns = {'index':'position'}) - - combined_df1 = pd.concat( - (my_index.set_index('position') for my_index in [dssp_df, kd_df, rd_df]) - , axis = 1, join = 'outer').reset_index() - - # sanity check - print('Checking dimensions of concatenated df1...') - if len(combined_df1) == expected_rows_df1 and len(combined_df1.columns) == expected_cols_df1: - print('PASS: combined df has expected dimensions' - , '\nNo. of rows in combined df:', len(combined_df1) - , '\nNo. of cols in combined df:', len(combined_df1.columns) - , '\n===============================================================') - else: - print('FAIL: combined df does not have expected dimensions' - , '\nNo. of rows in combined df:', len(combined_df1) - , '\nNo. of cols in combined df:', len(combined_df1.columns) - , '\n===============================================================') - - #======================== - # merge 2 (combined_df2) - # concatenating 2dfs: - # mcsm_df, combined_df1 (result of merge1) - # sort the cols - #======================== - print('starting second merge...\n') - - # rename col 'Position' in mcsm_df to lowercase 'position' - # as it matches the combined_df1 colname to perfom merge - - #mcsm_df.columns - #mcsm_df.rename(columns = {'Position':'position'}) # not working! - # copy 'Position' column with the correct colname - print('Firstly, copying \'Position\' col and renaming \'position\' to allow merging' - , '\nNo. of cols before copying: ', len(mcsm_df.columns)) - - mcsm_df['position'] = mcsm_df['Position'] - print('No. of cols after copying: ', len(mcsm_df.columns)) - - # sanity check - if mcsm_df['position'].equals(mcsm_df['Position']): - print('PASS: Copying worked correctly' - , '\ncopied col matches original column' - , '\n===============================================================') - else: - print('FAIL: copied col does not match original column' - , '\n================================================================') - - # variables for sanity checks - expected_rows_df2 = len(mcsm_df) - # beware of harcoding! 
used for sanity check - ndfs = 2 - ncol_merge = 1 - offset = ndfs - ncol_merge - expected_cols_df2 = len(mcsm_df.columns) + len(combined_df1.columns) - offset - - print('Merge 2:' - , '\ncombining 2dfs by commom col: position' - , '\nExpected nrows in combined_df:', expected_rows_df2 - , '\nExpected ncols in combined_df:', expected_cols_df2 - , '\n===================================================================') - - combined_df2 = mcsm_df.merge(combined_df1, on = 'position') - - # sanity check - print('Checking dimensions of concatenated df2...') - if len(combined_df2) == expected_rows_df2 and len(combined_df2.columns) == expected_cols_df2: - print('PASS: combined df2 has expected dimensions' - , '\nNo. of rows in combined df:', len(combined_df2) - , '\nNo. of cols in combined df:', len(combined_df2.columns) - , '\n===============================================================') - else: - print('FAIL: combined df2 does not have expected dimensions' - , '\nNo. of rows in combined df:', len(combined_df2) - , '\nNo. of cols in combined df:', len(combined_df2.columns) - , '\n===============================================================') - - #=============== - # writing file - #=============== - print('Writing file:' - , '\nFilename:', out_combined_csv -# , '\nPath:', outdir - , '\nExpected no. of rows:', len(combined_df2) - , '\nExpected no. of cols:', len(combined_df2.columns) - , '\n=========================================================') - - combined_df2.to_csv(out_combined_csv, header = True, index = False) - -#%% end of function -#======================================================================= -#%% call function -#combine_dfs(infile1, infile2, infile3, infile4, outfile) -#======================================================================= -def main(): - print('Combining 4 dfs:\n' - , in_filename1, '\n' - , in_filename2, '\n' - , in_filename3, '\n' - , in_filename4, '\n' - , 'output csv:', out_filename) - combine_dfs(infile1, infile2, infile3, infile4, outfile) - print('Finished Writing file:' - , '\nFilename:', outfile -## , '\nNo. of rows:', '' -## , '\nNo. 
of cols:', ''
-          , '\n===========================================================')
-
-if __name__ == '__main__':
-    main()
-#%% end of script
-#=======================================================================
-
diff --git a/scripts/combining_FIXME.py b/scripts/combining_FIXME.py
new file mode 100755
index 0000000..bbdd864
--- /dev/null
+++ b/scripts/combining_FIXME.py
@@ -0,0 +1,177 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+'''
+Created on Tue Aug 6 12:56:03 2019
+
+@author: tanu
+'''
+# FIXME: change filename 2 (mcsm normalised data)
+# to be consistent like (pnca_complex_mcsm_norm.csv) : changed manually, but ensure this is done in the mcsm pipeline
+#=======================================================================
+# Task: combine 2 dfs on common cols by detecting them
+# includes sanity checks
+
+#=======================================================================
+#%% load packages
+import sys, os
+import pandas as pd
+import numpy as np
+import re
+#from varname import nameof
+
+#=======================================================================
+#%% function/method to combine dfs
+
+def detect_common_cols (df1, df2):
+    """
+    Detect common cols
+
+    @param df1: df
+    @type df1: pandas df
+
+    @param df2: df
+    @type df2: pandas df
+
+    @return: common cols
+    @type: list
+    """
+    common_cols = np.intersect1d(df1.columns, df2.columns).tolist()
+    print('Length of common_cols:', len(common_cols)
+          , '\nmerging column/s:', common_cols
+          , '\ntype:', type(common_cols)
+          , '\ndtypes in merging columns:\n', df1[common_cols].dtypes)
+
+    return common_cols
+
+#%% Function to combine 2 dfs by detecting common cols and performing
+# sanity checks on the output df
+def combine_dfs_with_checks(df1, df2, my_join = 'outer'):
+    """
+    Combine 2 dfs by finding merging columns automatically
+
+    @param df1: data frame
+    @type df1: pandas df
+
+    @param df2: data frame
+    @type df2: pandas df
+
+    @my_join: join type for merging
+    @type my_join: string
+
+    @return: combined_df
+    @type: pandas df
+    """
+
+    print('Finding common cols and merging cols:'
+          , '\n=========================================================')
+
+    common_cols = np.intersect1d(df1.columns, df2.columns).tolist()
+    print('Length of common_cols:', len(common_cols)
+          , '\nmerging column/s:', common_cols
+          , '\ntype:', type(common_cols))
+
+    #print('\ndtypes in merging columns:\n', df1[common_cols].dtypes)
+
+    print('selecting consistent dtypes for merging (object, i.e. string)')
+    #merging_cols = df1[common_cols].select_dtypes(include = [object]).columns.tolist()
+    #merging_cols = df1[common_cols].select_dtypes(include = ['int64']).columns.tolist()
+    merging_cols = common_cols.copy()
+
+    nmerging_cols = len(merging_cols)
+    print(' length of merging cols:', nmerging_cols
+          , '\nmerging cols:', merging_cols, 'type:', type(merging_cols)
+          , '\n=========================================================')
+
+    #========================
+    # merge 1 (combined_df)
+    # concatenating 2dfs:
+    # df1, df2
+    #========================
+    # checking cross-over of mutations in the two dfs to merge
+    ndiff_1 = df1[merging_cols].squeeze().isin(df2[merging_cols].squeeze()).sum()
+    ndiff1 = df1.shape[0] - ndiff_1
+    print('There are', ndiff1, 'unmatched mutations in left df')
+
+    #missing_mutinfo = df1[~left_df['mutationinformation'].isin(df2['mutationinformation'])]
+    #missing_mutinfo.to_csv('infoless_muts.csv')
+
+    ndiff_2 = df2[merging_cols].squeeze().isin(df1[merging_cols].squeeze()).sum()
+    ndiff2 = df2.shape[0] - ndiff_2
+    print('There are', ndiff2, 'unmatched mutations in right_df')
+
+    #comm_vals = np.intersect1d(df1[merging_cols], df2[merging_cols])
+    #comm_vals_count = len(comm_vals)
+    #print('length of common values:', comm_vals_count, '\ntype:', type(comm_vals_count))
+
+    #========================
+    # merging dfs & sanity checks
+    #========================
+    fail = False
+    print('combining with:', my_join)
+    comb_df = pd.merge(df1, df2, on = merging_cols, how = my_join)
+
+    expected_cols = df1.shape[1] + df2.shape[1] - nmerging_cols
+
+
+    if my_join == 'right':
+        df2_nd = df2.drop_duplicates(merging_cols, keep = 'first')
+        expected_rows = df2_nd.shape[0]
+
+    if my_join == 'left':
+        expected_rows = df1.shape[0]
+
+
+    #if my_join == 'inner':
+    #    expected_rows = comm_vals_count
+
+    #if my_join == 'outer':
+    #    df1_nd = df1.drop_duplicates(merging_cols, keep = 'first')
+    #    df2_nd = df2.drop_duplicates(merging_cols, keep = 'first')
+    #    expected_rows = df1_nd.shape[0] + df2_nd.shape[0] - comm_vals_count
+
+
+    # row-count checks are only meaningful when merging on a single column;
+    # for multi-column merges with inner/outer joins, skip them
+    if my_join in ('inner', 'outer') and len(merging_cols) > 1:
+        #comm_vals = np.intersect1d(df1['mutationinformation'], df2['mutationinformation'])
+        print('length of merging_cols > 1, therefore omitting row checks')
+        combined_df = comb_df.copy()
+        expected_rows = len(combined_df)
+
+    else:
+        comm_vals = np.intersect1d(df1[merging_cols], df2[merging_cols])
+        print('length of merging_cols == 1, calculating expected rows in merged_df')
+        combined_df = comb_df.drop_duplicates(subset = merging_cols, keep = 'first')
+        if my_join == 'inner':
+            expected_rows = len(comm_vals)
+        if my_join == 'outer':
+            df1_nd = df1.drop_duplicates(merging_cols, keep = 'first')
+            df2_nd = df2.drop_duplicates(merging_cols, keep = 'first')
+            expected_rows = df1_nd.shape[0] + df2_nd.shape[0] - len(comm_vals)
+
+    if len(combined_df) == expected_rows and len(combined_df.columns) == expected_cols:
+        print('PASS: successfully combined dfs with:', my_join, 'join')
+    else:
+        print('FAIL: combined_df\'s expected rows and cols not matched')
+        fail = True
+        print('\nExpected no. of rows:', expected_rows
+              , '\nGot:', len(combined_df)
+              , '\nExpected no. of cols:', expected_cols
+              , '\nGot:', len(combined_df.columns))
+    if fail:
+        sys.exit()
+
+    #if clean:
+    #foo = combined_df2.filter(regex = r'.*_x|_y', axis = 1)
+    #print(foo.columns)
+    #print('Detected duplicate cols with suffix: _x _y'
+    #      , '\nDropping duplicate cols and cleaning')
+
+    # drop position col containing suffix '_y' and then rename col without suffix
+    combined_df_clean = combined_df.drop(combined_df.filter(regex = r'.*_y').columns, axis = 1)
+    combined_df_clean.rename(columns = lambda x: re.sub('_x$', '', x), inplace = True)
+
+    return combined_df_clean
+
+#%% end of function
+#=======================================================================
diff --git a/scripts/combining_dfs.py b/scripts/combining_dfs.py
index ce19fb0..764768a 100755
--- a/scripts/combining_dfs.py
+++ b/scripts/combining_dfs.py
@@ -8,169 +8,280 @@ Created on Tue Aug 6 12:56:03 2019
 # FIXME: change filename 2(mcsm normalised data)
 # to be consistent like (pnca_complex_mcsm_norm.csv) : changed manually, but ensure this is done in the mcsm pipeline
 #=======================================================================
-# Task: combine 2 dfs on comm_valson cols by detecting them
-# includes sainity checks
+# Task: combine multiple dfs with mutation/aa position as linking columns
+# Input:
+#       _complex_mcsm_norm.csv
+#       _foldx.csv
+#       _dssp.csv
+#       _kd.csv
+#       _rd.csv
+#       ns<gene>_snp_info.csv
+#       _af_or.csv
+#       _af_or_kinship.csv
+
+# Output: .csv of all the dfs combined
+
+# useful link
+# https://stackoverflow.com/questions/23668427/pandas-three-way-joining-multiple-dataframes-on-columns
 #=======================================================================
 #%% load packages
 import sys, os
 import pandas as pd
 import numpy as np
-import re
 #from varname import nameof
+import argparse
 
-#%% end of variable assignment for input and output files
 #=======================================================================
-#%% function/methd to combine dfs
+#%% specify input and curr dir
+homedir = os.path.expanduser('~')
 
-def detect_common_cols (df1, df2):
-    """
-    Detect comm_valson cols
-
-    @param df1: df
-    @type df1: pandas df
+# set working dir
+os.getcwd()
+os.chdir(homedir + '/git/LSHTM_analysis/scripts')
+os.getcwd()
 
-    @param df2: df
-    @type df2: pandas df
-
-    @return: comm_valson cols
-    @type: list
-    """
-    common_cols = np.intersect1d(df1.columns, df2.columns).tolist()
-    print('Length of comm_cols:', len(common_cols)
-          , '\nmerging column/s:', common_cols
-          , '\ntype:', type(common_cols)
-          , '\ndtypes in merging columns:\n', df1[common_cols].dtypes)
-
-    return common_cols
-
+# FIXME: local imports
+#from combining import combine_dfs_with_checks
+from combining_FIXME import detect_common_cols
+#=======================================================================
+#%% command line args
+#arg_parser = argparse.ArgumentParser()
+#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide')
+#arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive
+#args = arg_parser.parse_args()
+#=======================================================================
+#%% variable assignment: input and output
+drug = 'pyrazinamide'
+gene = 'pncA'
+gene_match = gene + '_p.'
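+# gene_match is the prefix for protein-level mutation ids in the source data;
+# illustrative (assumed) format: 'pncA_p.' would match ids like 'pncA_p.Ala134Gly'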
-def combine_dfs_with_checks(df1, df2, my_join = 'outer'):
-    """
-    Combine 2 dfs by finding merging columns automatically
+#drug = args.drug
+#gene = args.gene
+#======
+# dirs
+#======
+datadir = homedir + '/' + 'git/Data'
+indir = datadir + '/' + drug + '/' + 'input'
+outdir = datadir + '/' + drug + '/' + 'output'
 
-    @param df1: data frame
-    @type df1: pandas df
+#=======
+# input
+#=======
+in_filename_mcsm = gene.lower() + '_complex_mcsm_norm.csv'
+in_filename_foldx = gene.lower() + '_foldx.csv'
+in_filename_dssp = gene.lower() + '_dssp.csv'
+in_filename_kd = gene.lower() + '_kd.csv'
+in_filename_rd = gene.lower() + '_rd.csv'
+in_filename_snpinfo = 'ns' + gene.lower() + '_snp_info.csv'
+in_filename_afor = gene.lower() + '_af_or.csv'
+in_filename_afor_kin = gene.lower() + '_af_or_kinship.csv'
 
-    @param df2: data frame
-    @type df2: pandas df
-
-    @my_join: join type for merging
-    @type my_join: string
-
-    @return: combined_df
-    @type: pandas df
-    """
-
-    print('Finding comm_cols and merging cols:'
-          ,'\n=========================================================')
-    common_cols = np.intersect1d(df1.columns, df2.columns).tolist()
-    print('Length of comm_cols:', len(common_cols)
-          , '\nmerging column/s:', common_cols
-          , '\ntype:', type(common_cols))
-
-    #print('\ndtypes in merging columns:\n', df1[common_cols].dtypes)
-
-    print('selecting consistent dtypes for merging (object i.e string)')
-    #merging_cols = df1[comm_valson_cols].select_dtypes(include = [object]).columns.tolist()
-    #merging_cols = df1[comm_valson_cols].select_dtypes(include = ['int64']).columns.tolist()
-    merging_cols = common_cols.copy()
+infile_mcsm = outdir + '/' + in_filename_mcsm
+infile_foldx = outdir + '/' + in_filename_foldx
+infile_dssp = outdir + '/' + in_filename_dssp
+infile_kd = outdir + '/' + in_filename_kd
+infile_rd = outdir + '/' + in_filename_rd
+infile_snpinfo = indir + '/' + in_filename_snpinfo
+infile_afor = outdir + '/' + in_filename_afor
+infile_afor_kin = outdir + '/' + in_filename_afor_kin
 
-    nmerging_cols = len(merging_cols)
-    print(' length of merging cols:', nmerging_cols
-          , '\nmerging cols:', merging_cols, 'type:', type(merging_cols)
-          , '\n=========================================================')
+
+print('\nInput path:', outdir
+      , '\nInput filename mcsm:', infile_mcsm
+      , '\nInput filename foldx:', infile_foldx
+      , '\nInput filename dssp:', infile_dssp
+      , '\nInput filename kd:', infile_kd
+      , '\nInput filename rd:', infile_rd
+      , '\nInput filename snp info:', infile_snpinfo
+      , '\nInput filename af or:', infile_afor
+      , '\nInput filename afor kinship:', infile_afor_kin
+      , '\n============================================================')
+
+#=======
+# output
+#=======
+out_filename_comb = gene.lower() + '_all_params.csv'
+outfile_comb = outdir + '/' + out_filename_comb
+print('Output filename:', outfile_comb
+      , '\n============================================================')
+
+o_join = 'outer'
+l_join = 'left'
+r_join = 'right'
+i_join = 'inner'
+
+# end of variable assignment for input and output files
+#%%=====================================================================
+mcsm_df = pd.read_csv(infile_mcsm, sep = ',')
+mcsm_df.columns = mcsm_df.columns.str.lower()
+foldx_df = pd.read_csv(infile_foldx, sep = ',')
+
+print('==================================='
+      , '\nFirst merge: mcsm + foldx'
+      , '\n===================================')
+#mcsm_foldx_dfs = combine_dfs_with_checks(mcsm_df, foldx_df, my_join = o_join)
+merging_cols_m1 = detect_common_cols(mcsm_df, foldx_df)
+
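+# a minimal sketch (toy frames, illustration only) of what detect_common_cols()
+# returns, i.e. np.intersect1d over the two column indexes, sorted, as a list:
+#   left = pd.DataFrame(columns = ['mutationinformation', 'duet_stability_change'])
+#   right = pd.DataFrame(columns = ['ddg', 'mutationinformation'])
+#   detect_common_cols(left, right)   # -> ['mutationinformation']
+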
+mcsm_foldx_dfs = pd.merge(mcsm_df, foldx_df, on = merging_cols_m1, how = 'outer')
+ncols_m1 = len(mcsm_foldx_dfs.columns)
+#%%~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+print('==================================='
+      , '\nSecond merge: dssp + kd'
+      , '\n===================================')
+
+dssp_df = pd.read_csv(infile_dssp, sep = ',')
+kd_df = pd.read_csv(infile_kd, sep = ',')
+rd_df = pd.read_csv(infile_rd, sep = ',')
+
+#dssp_kd_dfs = combine_dfs_with_checks(dssp_df, kd_df, my_join = o_join)
+merging_cols_m2 = detect_common_cols(dssp_df, kd_df)
+
+dssp_kd_dfs = pd.merge(dssp_df, kd_df, on = merging_cols_m2, how = 'outer')
+
+print('==================================='
+      , '\nThird merge: dssp_kd_dfs + rd_df'
+      , '\n===================================')
+#dssp_kd_rd_dfs = combine_dfs_with_checks(dssp_kd_dfs, rd_df, my_join = o_join)
+merging_cols_m3 = detect_common_cols(dssp_kd_dfs, rd_df)
+dssp_kd_rd_dfs = pd.merge(dssp_kd_dfs, rd_df, on = merging_cols_m3, how = 'outer')
+
+ncols_m3 = len(dssp_kd_rd_dfs.columns)
+#%%~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+print('==================================='
+      , '\nFourth merge: First merge + Third merge'
+      , '\n===================================')
+#combined_dfs = combine_dfs_with_checks(mcsm_foldx_dfs, dssp_kd_rd_dfs, my_join = i_join) # gives wrong!
+merging_cols_m4 = detect_common_cols(mcsm_foldx_dfs, dssp_kd_rd_dfs)
+combined_df_expected_cols = ncols_m1 + ncols_m3 - len(merging_cols_m4)
+
+combined_df = pd.merge(mcsm_foldx_dfs, dssp_kd_rd_dfs, on = merging_cols_m4, how = 'inner')
+
+
+if len(combined_df) == len(mcsm_df) and len(combined_df.columns) == combined_df_expected_cols:
+    print('PASS: successfully combined 5 dfs'
+          , '\nnrows combined_df:', len(combined_df)
+          , '\ncols combined_df:', len(combined_df.columns))
+else:
+    sys.exit('FAIL: check individual df merges')
+#%%~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+#%%~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#%% OR combining
+afor_df = pd.read_csv(infile_afor, sep = ',')
+afor_df.columns = afor_df.columns.str.lower()
+
+if afor_df['mutation'].shape[0] == afor_df['mutation'].nunique():
+    print('No duplicate muts detected in afor_df')
+else:
+    print('Dropping duplicate muts detected in afor_df')
+    afor_df = afor_df.drop_duplicates(subset = 'mutation', keep = 'first')
+
+
+snpinfo_df_all = pd.read_csv(infile_snpinfo, sep = ',')
+snpinfo_df = snpinfo_df_all[['mutation', 'mutationinformation']]
+
+
+if snpinfo_df['mutation'].shape[0] == snpinfo_df['mutation'].nunique():
+    print('No duplicate muts detected in snpinfo_df')
+else:
+    dups = snpinfo_df['mutation'].duplicated().sum()
+    print(dups, 'duplicate muts detected in snpinfo_df'
+          , '\nDim:', snpinfo_df.shape)
+    print('Dropping duplicate muts')
+    snpinfo_df = snpinfo_df.drop_duplicates(subset = 'mutation', keep = 'first')
+    print('Dim:', snpinfo_df.shape)
+
+print('==================================='
+      , '\nFifth merge: afor_df + snpinfo_df'
+      , '\n===================================')
+
+merging_cols_m5 = detect_common_cols(afor_df, snpinfo_df)
+
+afor_snpinfo_dfs = pd.merge(afor_df, snpinfo_df, on = merging_cols_m5, how = 'left')
+if len(afor_snpinfo_dfs) == afor_df.shape[0]:
+    print('PASS: successfully combined with left join'
+          , '\nDim of df1:', afor_df.shape
+          , '\nDim of df2:', snpinfo_df.shape)
+else:
+    sys.exit('FAIL: unsuccessful merge')
+
+#%%~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
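+# note on the merge sanity checks used above and below (illustrative figures):
+# for a merge on k shared columns, ncols(merged) = ncols(df1) + ncols(df2) - k;
+# a left join keeps exactly len(df1) rows provided df2 has no duplicate keys,
+# e.g. df1 100 x 5, df2 80 x 4, k = 1  ->  merged 100 x 8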
+afor_kin_df = pd.read_csv(infile_afor_kin, sep = ',')
+afor_kin_df.columns = afor_kin_df.columns.str.lower()
+
+print('==================================='
+      , '\nSixth merge: afor_snpinfo_dfs + afor_kin_df'
+      , '\n===================================')
+
+merging_cols_m6 = detect_common_cols(afor_snpinfo_dfs, afor_kin_df)
+
+print('Dim of df1:', afor_snpinfo_dfs.shape
+      , '\nDim of df2:', afor_kin_df.shape
+      , '\nno. of merging_cols:', len(merging_cols_m6))
+
+ors_df = pd.merge(afor_snpinfo_dfs, afor_kin_df, on = merging_cols_m6, how = 'outer')
+
+print('Dim of ors_df:', ors_df.shape)
+
+#%%~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+print('==================================='
+      , '\nSeventh merge: combined_df + ors_df'
+      , '\n===================================')
+
+merging_cols_m7 = detect_common_cols(combined_df, ors_df)
+
+print('Dim of df1:', combined_df.shape
+      , '\nDim of df2:', ors_df.shape
+      , '\nno. of merging_cols:', len(merging_cols_m7))
+
+print('checking mutations in the two dfs:'
+      , '\nmuts in df1 but NOT in df2:'
+      , (~combined_df['mutationinformation'].isin(ors_df['mutationinformation'])).sum()
+      , '\nmuts in df2 but NOT in df1:'
+      , (~ors_df['mutationinformation'].isin(combined_df['mutationinformation'])).sum())
+
+#print('\nNo. of common muts:', np.intersect1d(combined_df['mutationinformation'], ors_df['mutationinformation']) )
+
+#combined_df_all = pd.merge(combined_df, ors_df, on = merging_cols_m7, how = 'outer') # FIXME
+combined_df_all = pd.merge(combined_df, ors_df, on = merging_cols_m7, how = 'left')
+
+outdf_expected_rows = len(combined_df)
+outdf_expected_cols = len(combined_df.columns) + len(ors_df.columns) - len(merging_cols_m7)
+
+print('\nDim of combined_df_all:', combined_df_all.shape
+      , '\nwith join type: left')
+
+if combined_df_all.shape[1] == outdf_expected_cols:
+    print('combined_df has expected no. of cols')
+if combined_df_all.shape[0] == outdf_expected_rows:
+    print('combined_df has expected no. of rows')
+else:
+    print('WARNING: nrows discrepancy noted'
+          , '\nFIX IT')
+print('finished combining all dfs')
+#%%~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# write csv
+
+combined_df_all.to_csv(outfile_comb, index = False)
+
+#=======================================================================
+#%% in case you fix the function: combine_dfs_with_checks
+#def main():
-    #========================
-    # merge 1 (combined_df)
-    # concatenating 2dfs:
-    # df1, df2
-    #========================
-    # checking cross-over of mutations in the two dfs to merge
-    ndiff_1 = df1[merging_cols].squeeze().isin(df2[merging_cols].squeeze()).sum()
-    ndiff1 = df1.shape[0] - ndiff_1
-    print('There are', ndiff1, 'unmatched mutations in left df')
+# print('Reading input files:')
+    #mcsm_df = pd.read_csv(infile_mcsm, sep = ',')
+    #mcsm_df.columns = mcsm_df.columns.str.lower()
+
+    #foldx_df = pd.read_csv(infile_foldx, sep = ',')
+
+    #dssp_df = pd.read_csv(infile_dssp, sep = ',')
+    #dssp_df.columns = dssp_df.columns.str.lower()
+
+    #kd_df = pd.read_csv(infile_kd, sep = ',')
+    #kd_df.columns = kd_df.columns.str.lower()
-    #missing_mutinfo = df1[~left_df['mutationinformation'].isin(df2['mutationinformation'])]
-    #missing_mutinfo.to_csv('infoless_muts.csv')
+    #rd_df = pd.read_csv(infile_rd, sep = ',')
+
+
-    ndiff_2 = df2[merging_cols].squeeze().isin(df1[merging_cols].squeeze()).sum()
-    ndiff2 = df2.shape[0] - ndiff_2
-    print('There are', ndiff2, 'unmatched mutations in right_df')
-
-    #comm_vals = np.intersect1d(df1[merging_cols], df2[merging_cols])
-    #comm_vals_count = len(comm_vals)
-    #print('length of comm_valson values:', comm_vals_count , '\ntype:', type(comm_vals_count))
-
-    #========================
-    # merging dfs & sanity checks
-    #========================
-    fail = False
-    print('combing with:', my_join)
-    comb_df = pd.merge(df1, df2, on = merging_cols, how = my_join)
-
-    expected_cols = df1.shape[1] + df2.shape[1] - nmerging_cols
-
-
-    if my_join == 'right':
-        df2_nd = df2.drop_duplicates(merging_cols, keep = 'first')
-        expected_rows = df2_nd.shape[0]
-
-    if my_join == 'left':
-        expected_rows = df1.shape[0]
-
-
-    #if my_join == 'inner':
-    #    expected_rows = comm_vals_count
-
-    #if my_join == 'outer':
-    #    df1_nd = df1.drop_duplicates(merging_cols, keep = 'first')
-    #    df2_nd = df2.drop_duplicates(merging_cols, keep = 'first')
-    #    expected_rows = df1_nd.shape[0] + df2_nd.shape[0] - comm_vals_count
-
-
-    if my_join == ('inner' or 'outer') and len(merging_cols) > 1:
-        #comm_vals = np.intersect1d(df1['mutationinformation'], df2['mutationinformation'])
-        print('length of merging_cols > 1, therefore omitting row checks')
-        combined_df = comb_df.copy()
-        expected_rows = len(combined_df)
-
-    else:
-        comm_vals = np.intersect1d(df1[merging_cols], df2[merging_cols])
-        print('length of merging_cols == 1, calculating expected rows in merged_df')
-        combined_df = comb_df.drop_duplicates(subset = merging_cols, keep ='first')
-        if my_join == 'inner':
-            expected_rows = len(comm_vals)
-        if my_join == 'outer':
-            df1_nd = df1.drop_duplicates(merging_cols, keep = 'first')
-            df2_nd = df2.drop_duplicates(merging_cols, keep = 'first')
-            expected_rows = df1_nd.shape[0] + df2_nd.shape[0] - len(comm_vals)
-
-    if len(combined_df) == expected_rows and len(combined_df.columns) == expected_cols:
-        print('PASS: successfully combined dfs with:', my_join, 'join')
-    else:
-        print('FAIL: combined_df\'s expected rows and cols not matched')
-        fail = True
-        print('\nExpected no.
of rows:', expected_rows - , '\nGot:', len(combined_df) - , '\nExpected no. of cols:', expected_cols - , '\nGot:', len(combined_df.columns)) - if fail: - sys.exit() - - #if clean: - #foo = combined_df2.filter(regex = r'.*_x|_y', axis = 1) - #print(foo.columns) - #print('Detected duplicate cols with suffix: _x _y' - # , '\Dropping duplicate cols and cleaning') - - # drop position col containing suffix '_y' and then rename col without suffix - combined_df_clean = combined_df.drop(combined_df.filter(regex = r'.*_y').columns, axis = 1) - combined_df_clean.rename(columns=lambda x: re.sub('_x$','', x), inplace = True) - - return combined_df_clean - -#%% end of function -#======================================================================= - +#if __name__ == '__main__': +# main() +#======================================================================= +#%% end of script diff --git a/scripts/combining_test.py b/scripts/combining_test.py deleted file mode 100755 index 19b9c51..0000000 --- a/scripts/combining_test.py +++ /dev/null @@ -1,303 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -''' -Created on Tue Aug 6 12:56:03 2019 - -@author: tanu -''' -# FIXME: change filename 2(mcsm normalised data) -# to be consistent like (pnca_complex_mcsm_norm.csv) : changed manually, but ensure this is done in the mcsm pipeline -#======================================================================= -# Task: combine 2 dfs with aa position as linking column - -# Input: 2 dfs -# _complex_mcsm_norm.csv -# _foldx.csv - -# Output: .csv of all 2 dfs combined - -# useful link -# https://stackoverflow.com/questions/23668427/pandas-three-way-joining-multiple-dataframes-on-columns -#======================================================================= -#%% load packages -import sys, os -import pandas as pd -import numpy as np -#from varname import nameof -import argparse - -#======================================================================= -#%% specify input and curr dir -homedir = os.path.expanduser('~') - -# set working dir -os.getcwd() -os.chdir(homedir + '/git/LSHTM_analysis/scripts') -os.getcwd() - -# local imports -from combining_dfs import combine_dfs_with_checks -from combining_dfs import detect_common_cols -#======================================================================= -#%% command line args -#arg_parser = argparse.ArgumentParser() -#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide') -#arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive -#args = arg_parser.parse_args() -#======================================================================= -#%% variable assignment: input and output -drug = 'pyrazinamide' -gene = 'pncA' -gene_match = gene + '_p.' 
- -#drug = args.drug -#gene = args.gene -#====== -# dirs -#====== -datadir = homedir + '/' + 'git/Data' -indir = datadir + '/' + drug + '/' + 'input' -outdir = datadir + '/' + drug + '/' + 'output' - -#======= -# input -#======= -#in_filename_linking = gene.lower() + '_linking_df.csv' -in_filename_mcsm = gene.lower() + '_complex_mcsm_norm.csv' -in_filename_foldx = gene.lower() + '_foldx.csv' -in_filename_dssp = gene.lower() + '_dssp.csv' -in_filename_kd = gene.lower() + '_kd.csv' -in_filename_rd = gene.lower() + '_rd.csv' -in_filename_snpinfo = 'ns' + gene.lower() + '_snp_info.csv' -in_filename_afor = gene.lower() + '_af_or.csv' -in_filename_afor_kin = gene.lower() + '_af_or_kinship.csv' - - -#infile_linking = outdir + '/' + in_filename_linking -infile_mcsm = outdir + '/' + in_filename_mcsm -infile_foldx = outdir + '/' + in_filename_foldx -infile_dssp = outdir + '/' + in_filename_dssp -infile_kd = outdir + '/' + in_filename_kd -infile_rd = outdir + '/' + in_filename_rd -infile_snpinfo = indir + '/' + in_filename_snpinfo -infile_afor = outdir + '/' + in_filename_afor -infile_afor_kin = outdir + '/' + in_filename_afor_kin - - -print('\nInput path:', outdir - , '\nInput filename mcsm:', infile_mcsm - , '\nInput filename foldx:', infile_foldx - , '\nInput filename dssp:', infile_dssp - , '\nInput filename kd:', infile_kd - , '\nInput filename rd', infile_rd - , '\nInput filename snp info:', infile_snpinfo - , '\nInput filename af or:', infile_afor - , '\nInput filename afor kinship:', infile_afor_kin - , '\n============================================================') - -#======= -# output -#======= -out_filename_comb = gene.lower() + '_all_params.csv' -outfile_comb = outdir + '/' + out_filename_comb -print('Output filename:', outfile_comb - , '\n============================================================') - -o_join = 'outer' -l_join = 'left' -r_join = 'right' -i_join = 'inner' - - -#del(in_filename_dssp, in_filename_foldx) -# end of variable assignment for input and output files - -#======================================================================= -# call function to detect common cols -# FIXME: do the OR combining in the end to iron out any problems -# Couldn't run the function combin -#======================================================================= -def main(): - - print('Reading input files:') - - #dssp_df = pd.read_csv(infile_dssp, sep = ',') - #dssp_df.columns = dssp_df.columns.str.lower() - - #kd_df = pd.read_csv(infile_kd, sep = ',') - #kd_df.columns = kd_df.columns.str.lower() - -# print('Dimension left df:', dssp_df.shape -# , '\nDimension right_df:', kd_df.shape -# , '\njoin type:', o_join -# , '\n=========================================================') - - # detect common cols - #merging_cols = detect_common_cols(dssp_df, kd_df) - #print('Length of common cols:', len(merging_cols) - # , '\nmerging column/s:', merging_cols, 'type:', type(merging_cols) - # , '\ndtypes in merging columns:', dssp_df[merging_cols].dtypes) - - #combined_df1 = combine_dfs_with_checks(dssp_df, kd_df, my_join = o_join) - #print('Dimensions of combined df:', combined_df1.shape - # , '\nsneak peak:', combined_df1.head() - # , '\ndtypes in cols:\n', combined_df1.dtypes) - -#if __name__ == '__main__': -# main() -#======================================================================= -#%% end of script -#hardcoded test - -mcsm_df = pd.read_csv(infile_mcsm, sep = ',') -mcsm_df.columns = mcsm_df.columns.str.lower() -foldx_df = pd.read_csv(infile_foldx , sep = ',') - 
-print('===================================' - , '\nFirst merge: mcsm + foldx' - , '\n===================================') -#mcsm_foldx_dfs = combine_dfs_with_checks(mcsm_df, foldx_df, my_join = o_join) -merging_cols_m1 = detect_common_cols(mcsm_df, foldx_df) - -mcsm_foldx_dfs = pd.merge(mcsm_df, foldx_df, on = merging_cols_m1, how = 'outer') -ncols_m1 = len(mcsm_foldx_dfs.columns) - -print('===================================' - , '\nSecond merge: dssp + kd' - , '\n===================================') - -dssp_df = pd.read_csv(infile_dssp, sep = ',') -kd_df = pd.read_csv(infile_kd, sep = ',') -rd_df = pd.read_csv(infile_rd, sep = ',') - -#dssp_kd_dfs = combine_dfs_with_checks(dssp_df, kd_df, my_join = o_join) -merging_cols_m2 = detect_common_cols(dssp_df, kd_df) - -dssp_kd_dfs = pd.merge(dssp_df, kd_df, on = merging_cols_m2, how = 'outer') - -print('===================================' - , '\nThird merge: dssp_kd_dfs + rd_df' - , '\n===================================') -#dssp_kd_rd_dfs = combine_dfs_with_checks(dssp_kd_dfs, rd_df, my_join = o_join) -merging_cols_m3 = detect_common_cols(dssp_df, kd_df) -dssp_kd_rd_dfs = pd.merge(dssp_kd_dfs, rd_df, on = merging_cols_m3, how = 'outer') - -ncols_m3 = len(dssp_kd_rd_dfs.columns) - -print('===================================' - , '\nFourth merge: First merge + Third merge' - , '\n===================================') -#combined_dfs = combine_dfs_with_checks(mcsm_foldx_dfs, dssp_kd_rd_dfs, my_join = i_join)# gives wrong! -merging_cols_m4 = detect_common_cols(mcsm_foldx_dfs, dssp_kd_rd_dfs) -combined_df_expected_cols = ncols_m1 + ncols_m3 - len(merging_cols_m4) - -combined_df = pd.merge(mcsm_foldx_dfs, dssp_kd_rd_dfs, on = merging_cols_m4, how = 'inner') - - -if len(combined_df) == len(mcsm_df) and len(combined_df.columns) == combined_df_expected_cols: - print('PASS: successfully combined 5 dfs' - , '\nnrows combined_df:', len(combined_df) - , '\ncols combined_df:', len(combined_df.columns)) -else: - sys.exit('FAIL: check individual df merges') - -#%% OR combining -afor_df = pd.read_csv(infile_afor, sep = ',') -afor_df.columns = afor_df.columns.str.lower() - -if afor_df['mutation'].shape[0] == afor_df['mutation'].nunique(): - print('No duplicate muts detected in afor_df') -else: - print('Dropping duplicate muts detected in afor_df') - afor_df = afor_df.drop_duplicates(subset = 'mutation', keep = 'first') - - -snpinfo_df_all = pd.read_csv(infile_snpinfo, sep = ',') -snpinfo_df = snpinfo_df_all[['mutation', 'mutationinformation']] - - -if snpinfo_df['mutation'].shape[0] == snpinfo_df['mutation'].nunique(): - print('No duplicate muts detected in snpinfo_df') -else: - dups = snpinfo_df['mutation'].duplicated().sum() - print( dups, 'Duplicate muts detected in snpinfo_df' - , '\nDim:', snpinfo_df.shape) - print('Dropping duplicate muts') - snpinfo_df = snpinfo_df.drop_duplicates(subset = 'mutation', keep = 'first') - print('Dim:', snpinfo_df.shape) - - -print('===================================' - , '\nFifth merge: afor_df + snpinfo_df' - , '\n===================================') - -merging_cols_m5 = detect_common_cols(afor_df, snpinfo_df) - -afor_snpinfo_dfs = pd.merge(afor_df, snpinfo_df, on = merging_cols_m5, how = 'left') -#afor_df.shape -#snpinfo_df.shape -if len(afor_snpinfo_dfs) == afor_df.shape[0]: - print('PASS: succesfully combined with left join') -else: - sys.exit('FAIL: unsuccessful merge') - -#%% - -afor_kin_df = pd.read_csv(infile_afor_kin, sep = ',') -afor_kin_df.columns = afor_kin_df.columns.str.lower() - 
-print('===================================' - , '\nSixth merge: afor_snpinfo_dfs + afor_kin_df' - , '\n===================================') - -merging_cols_m6 = detect_common_cols(afor_snpinfo_dfs, afor_kin_df) - -print('Dim of df1:', afor_snpinfo_dfs.shape - , '\nDim of df2:', afor_kin_df.shape - , '\nno. of merging_cols:', len(merging_cols_m6)) - -ors_df = pd.merge(afor_snpinfo_dfs, afor_kin_df, on = merging_cols_m6, how = 'outer') - -print('Dim of ors_df:', ors_df.shape) - -#%% - -print('===================================' - , '\nSeventh merge: combined_df + ors_df' - , '\n===================================') - -merging_cols_m7 = detect_common_cols(combined_df, ors_df) - -print('Dim of df1:', combined_df.shape - , '\nDim of df2:', ors_df.shape - , '\nno. of merging_cols:', len(merging_cols_m7)) - -print('checking mutations in the two dfs:' - , '\nmuts in df1 but NOT in df2:' - , combined_df['mutationinformation'].isin(ors_df['mutationinformation']).sum() - , 'muts in df2 but NOT in df1:' - , ors_df['mutationinformation'].isin(combined_df['mutationinformation']).sum()) - -#print('\nNo. of common muts:', np.intersect1d(combined_df['mutationinformation'], ors_df['mutationinformation']) ) - -#combined_df_all = pd.merge(combined_df, ors_df, on = merging_cols_m7, how = 'outer') # FIXME -combined_df_all = pd.merge(combined_df, ors_df, on = merging_cols_m7, how = 'left') - -outdf_expected_rows = len(combined_df) -outdf_expected_cols = len(combined_df.columns) + len(ors_df.columns) - len(merging_cols_m7) - -print('\nDim of combined_df_all:', combined_df_all.shape) - -if combined_df_all.shape[1] == outdf_expected_cols: - print('combined_df has expected no. of cols') -if combined_df_all.shape[0] == outdf_expected_rows: - print('combined_df has expected no. of rows') -else: - print('WARNING: nrows discrepancy noted' - , '\nFIX IT') - - -print ('thing finished') -#%% write csv - -combined_df_all.to_csv(outfile_comb, index = False)
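Illustration (not part of the patch): the new scripts repeat a
detect-then-merge-then-check pattern. Below is a minimal, self-contained sketch
of that pattern with the shape checks factored into one helper; the helper name
and the toy frames are assumptions for illustration, not code from this repo:

    import sys
    import numpy as np
    import pandas as pd

    def merge_with_checks(df1, df2, how = 'left'):
        # merge on every column the two frames share, as detect_common_cols() does
        merging_cols = np.intersect1d(df1.columns, df2.columns).tolist()
        merged = pd.merge(df1, df2, on = merging_cols, how = how)
        # shared cols are not duplicated, so cols add up minus the merge keys
        expected_cols = df1.shape[1] + df2.shape[1] - len(merging_cols)
        if merged.shape[1] != expected_cols:
            sys.exit('FAIL: unexpected no. of cols after ' + how + ' join')
        # a left join on duplicate-free right keys preserves the left row count
        if how == 'left' and not df2.duplicated(merging_cols).any() and len(merged) != len(df1):
            sys.exit('FAIL: left join changed the no. of rows')
        return merged

    # toy usage (hypothetical data)
    left = pd.DataFrame({'mutationinformation': ['A1B', 'C2D'], 'duet': [0.1, -0.2]})
    right = pd.DataFrame({'mutationinformation': ['A1B', 'C2D'], 'ddg': [1.5, 0.3]})
    print(merge_with_checks(left, right).shape)   # (2, 3)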