diff --git a/scripts/combine_dfs.py b/scripts/combine_dfs.py
new file mode 100755
index 0000000..ea5e1e3
--- /dev/null
+++ b/scripts/combine_dfs.py
@@ -0,0 +1,301 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+'''
+Created on Tue Aug 6 12:56:03 2019
+
+@author: tanu
+'''
+# FIXME: change filename 4 (mcsm normalised data)
+# to be consistent like (pnca_complex_mcsm_norm.csv): changed manually, but ensure this is done in the mcsm pipeline
+#=============================================================================
+# Task: combine 4 dfs with aa position as linking column
+# This is done in 2 steps:
+# merge 1: of 3 dfs (filenames in lowercase)
+# _dssp.csv
+# _kd.csv
+# _rd.csv
+
+# merge 2: of 2 dfs
+# pnca_complex_mcsm_norm.csv (!fix name)
+# output df from merge 1
+
+# Input: 4 dfs
+# _dssp.csv
+# _kd.csv
+# _rd.csv
+# pnca_complex_mcsm_norm.csv (!fix name)
+
+# Output: .csv of all 4 dfs combined
+
+# useful link
+# https://stackoverflow.com/questions/23668427/pandas-three-way-joining-multiple-dataframes-on-columns
+#=============================================================================
+#%% load packages
+import sys, os
+import pandas as pd
+#import numpy as np
+import argparse
+#=======================================================================
+#%% specify input and curr dir
+homedir = os.path.expanduser('~')
+
+# set working dir
+os.getcwd()
+os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
+os.getcwd()
+#=======================================================================
+#%% command line args
+arg_parser = argparse.ArgumentParser()
+arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide')
+arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive
+args = arg_parser.parse_args()
+#=======================================================================
+#%% variable assignment: input and output
+#drug = 'pyrazinamide'
+#gene = 'pncA'
+#gene_match = gene + '_p.'
+
+drug = args.drug
+gene = args.gene
+#==========
+# data dir
+#==========
+datadir = homedir + '/' + 'git/Data'
+
+#=======
+# input
+#=======
+indir = datadir + '/' + drug + '/' + 'output'
+in_filename1 = 'pnca_dssp.csv'
+in_filename2 = 'pnca_kd.csv'
+in_filename3 = 'pnca_rd.csv'
+#in_filename4 = 'mcsm_complex1_normalised.csv' # FIXNAME
+in_filename4 = 'pnca_complex_mcsm_norm.csv'
+
+infile1 = indir + '/' + in_filename1
+infile2 = indir + '/' + in_filename2
+infile3 = indir + '/' + in_filename3
+infile4 = indir + '/' + in_filename4
+
+print('\nInput path:', indir
+      , '\nInput filename1:', in_filename1
+      , '\nInput filename2:', in_filename2
+      , '\nInput filename3:', in_filename3
+      , '\nInput filename4:', in_filename4
+      , '\n===================================================================')
+
+#=======
+# output
+#=======
+outdir = datadir + '/' + drug + '/' + 'output'
+out_filename = gene.lower() + '_mcsm_struct_params.csv'
+outfile = outdir + '/' + out_filename
+print('Output filename:', out_filename
+      , '\nOutput path:', outdir
+      , '\n===================================================================')
+
+#%% end of variable assignment for input and output files
+#=======================================================================
+#%% function/method to combine 4 dfs
+
+def combine_dfs(dssp_csv, kd_csv, rd_csv, mcsm_csv, out_combined_csv):
+    """
+    Combine 4 dfs
+
+    @param dssp_csv: csv file (output from dssp_df.py)
+    @type dssp_csv: string
+
+    @param kd_csv: csv file (output from kd_df.py)
+    @type kd_csv: string
+
+    @param rd_csv: csv file (output from rd_df.py)
+    @type rd_csv: string
+
+    # FIXME
+    @param mcsm_csv: csv file (output of mcsm pipeline)
+    @type mcsm_csv: string
+
+    @param out_combined_csv: csv file output
+    @type out_combined_csv: string
+
+    @return: none, writes combined df as csv
+    """
+    #========================
+    # read input csv files to combine
+    #========================
+    dssp_df = pd.read_csv(dssp_csv, sep = ',')
+    kd_df = pd.read_csv(kd_csv, sep = ',')
+    rd_df = pd.read_csv(rd_csv, sep = ',')
+    mcsm_df = pd.read_csv(mcsm_csv, sep = ',')
+
+    print('Reading input files:'
+          , '\ndssp file:', dssp_csv
+          , '\nNo. of rows:', len(dssp_df)
+          , '\nNo. of cols:', len(dssp_df.columns)
+          , '\nColumn names:', dssp_df.columns
+          , '\n==================================================================='
+          , '\nkd file:', kd_csv
+          , '\nNo. of rows:', len(kd_df)
+          , '\nNo. of cols:', len(kd_df.columns)
+          , '\nColumn names:', kd_df.columns
+          , '\n==================================================================='
+          , '\nrd file:', rd_csv
+          , '\nNo. of rows:', len(rd_df)
+          , '\nNo. of cols:', len(rd_df.columns)
+          , '\nColumn names:', rd_df.columns
+          , '\n==================================================================='
+          , '\nmcsm file:', mcsm_csv
+          , '\nNo. of rows:', len(mcsm_df)
+          , '\nNo. of cols:', len(mcsm_df.columns)
+          , '\nColumn names:', mcsm_df.columns
+          , '\n===================================================================')
+
+    #========================
+    # merge 1 (combined_df1)
+    # concatenating 3 dfs:
+    # dssp_df, kd_df, rd_df
+    #========================
+    print('Starting first merge...\n')
+
+    # checking no. of rows
+    print('Checking if no. of rows of the 3 dfs are equal:\n'
+          , len(dssp_df) == len(kd_df) == len(rd_df)
+          , '\nReason: fasta files and pdb files vary since not all pos are part of the structure'
+          , '\n===================================================================')
+
+    # variables for sanity checks
+    expected_rows_df1 = max(len(dssp_df), len(kd_df), len(rd_df))
+    # beware of hardcoding! used for sanity check
+    ndfs = 3
+    ncol_merge = 1
+    offset = ndfs - ncol_merge
+    expected_cols_df1 = len(dssp_df.columns) + len(kd_df.columns) + len(rd_df.columns) - offset
+
+    print('Merge 1:'
+          , '\ncombining 3 dfs by common col: position'
+          , '\nExpected nrows in combined_df:', expected_rows_df1
+          , '\nExpected ncols in combined_df:', expected_cols_df1
+          , '\nResetting the common col as the index'
+          , '\n===================================================================')
+
+    #dssp_df.set_index('position', inplace = True)
+    #kd_df.set_index('position', inplace = True)
+    #rd_df.set_index('position', inplace = True)
+
+    #combined_df = pd.concat([dssp_df, kd_df, rd_df], axis = 1, sort = False).reset_index()
+    #combined_df.rename(columns = {'index':'position'})
+
+    combined_df1 = pd.concat(
+        (my_df.set_index('position') for my_df in [dssp_df, kd_df, rd_df])
+        , axis = 1, join = 'outer').reset_index()
+
+    # sanity check
+    print('Checking dimensions of concatenated df1...')
+    if len(combined_df1) == expected_rows_df1 and len(combined_df1.columns) == expected_cols_df1:
+        print('PASS: combined df has expected dimensions'
+              , '\nNo. of rows in combined df:', len(combined_df1)
+              , '\nNo. of cols in combined df:', len(combined_df1.columns)
+              , '\n===============================================================')
+    else:
+        print('FAIL: combined df does not have expected dimensions'
+              , '\nNo. of rows in combined df:', len(combined_df1)
+              , '\nNo. of cols in combined df:', len(combined_df1.columns)
+              , '\n===============================================================')
+
+    #========================
+    # merge 2 (combined_df2)
+    # concatenating 2 dfs:
+    # mcsm_df, combined_df1 (result of merge 1)
+    # sort the cols
+    #========================
+    print('Starting second merge...\n')
+
+    # rename col 'Position' in mcsm_df to lowercase 'position'
+    # as it matches the combined_df1 colname to perform merge
+
+    #mcsm_df.columns
+    #mcsm_df.rename(columns = {'Position':'position'}) # not working: rename() returns a new df unless assigned or called with inplace = True
+    # copy 'Position' column with the correct colname
+    print('First, copying \'Position\' col as lowercase \'position\' to allow merging'
+          , '\nNo. of cols before copying: ', len(mcsm_df.columns))
+
+    mcsm_df['position'] = mcsm_df['Position']
+    print('No. of cols after copying: ', len(mcsm_df.columns))
+
+    # sanity check
+    if mcsm_df['position'].equals(mcsm_df['Position']):
+        print('PASS: Copying worked correctly'
+              , '\ncopied col matches original column'
+              , '\n===============================================================')
+    else:
+        print('FAIL: copied col does not match original column'
+              , '\n================================================================')
+
+    # variables for sanity checks
+    expected_rows_df2 = len(mcsm_df)
+    # beware of hardcoding! used for sanity check
+    ndfs = 2
+    ncol_merge = 1
+    offset = ndfs - ncol_merge
+    expected_cols_df2 = len(mcsm_df.columns) + len(combined_df1.columns) - offset
+
+    print('Merge 2:'
+          , '\ncombining 2 dfs by common col: position'
+          , '\nExpected nrows in combined_df:', expected_rows_df2
+          , '\nExpected ncols in combined_df:', expected_cols_df2
+          , '\n===================================================================')
+
+    combined_df2 = mcsm_df.merge(combined_df1, on = 'position')
+
+    # sanity check
+    print('Checking dimensions of concatenated df2...')
+    if len(combined_df2) == expected_rows_df2 and len(combined_df2.columns) == expected_cols_df2:
+        print('PASS: combined df2 has expected dimensions'
+              , '\nNo. of rows in combined df:', len(combined_df2)
+              , '\nNo. of cols in combined df:', len(combined_df2.columns)
+              , '\n===============================================================')
+    else:
+        print('FAIL: combined df2 does not have expected dimensions'
+              , '\nNo. of rows in combined df:', len(combined_df2)
+              , '\nNo. of cols in combined df:', len(combined_df2.columns)
+              , '\n===============================================================')
+
+    #===============
+    # writing file
+    #===============
+    print('Writing file:'
+          , '\nFilename:', out_combined_csv
+#          , '\nPath:', outdir
+          , '\nExpected no. of rows:', len(combined_df2)
+          , '\nExpected no. of cols:', len(combined_df2.columns)
+          , '\n=========================================================')
+
+    combined_df2.to_csv(out_combined_csv, header = True, index = False)
+
+#%% end of function
+#=======================================================================
+#%% call function
+#combine_dfs(infile1, infile2, infile3, infile4, outfile)
+#=======================================================================
+def main():
+    print('Combining 4 dfs:\n'
+          , in_filename1, '\n'
+          , in_filename2, '\n'
+          , in_filename3, '\n'
+          , in_filename4, '\n'
+          , 'output csv:', out_filename)
+    combine_dfs(infile1, infile2, infile3, infile4, outfile)
+    print('Finished writing file:'
+          , '\nFilename:', out_filename
+          , '\nPath:', outdir
+##          , '\nNo. of rows:', ''
+##          , '\nNo. of cols:', ''
+          , '\n===========================================================')
+
+if __name__ == '__main__':
+    main()
+#%% end of script
+#=======================================================================
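
For reference, a minimal sketch (not part of the patch above) of the two-step combine that combine_dfs() performs: merge 1 is an outer pd.concat of the three per-position dfs on a shared 'position' index, and merge 2 joins the per-mutation mcsm df back onto that result. The toy values and the column names other than 'position' are illustrative assumptions, not the real columns of the input csvs.

    # minimal sketch of the two-step combine, with toy data
    # (column names other than 'position' are illustrative assumptions)
    import pandas as pd

    dssp_df = pd.DataFrame({'position': [1, 2, 3], 'asa': [0.10, 0.55, 0.30]})
    kd_df   = pd.DataFrame({'position': [1, 2, 3], 'kd_values': [-1.2, 0.4, 2.0]})
    rd_df   = pd.DataFrame({'position': [1, 2],    'rd_values': [5.1, 4.7]})  # fewer positions, as in a real structure
    mcsm_df = pd.DataFrame({'position': [1, 1, 2], 'mutation': ['M1A', 'M1V', 'K2N']})

    # merge 1: outer concat of the per-position dfs on 'position'
    combined_df1 = pd.concat(
        (df.set_index('position') for df in [dssp_df, kd_df, rd_df])
        , axis = 1, join = 'outer').reset_index()

    # merge 2: merge the per-mutation df onto the per-position df
    combined_df2 = mcsm_df.merge(combined_df1, on = 'position')
    print(combined_df2)

Assuming the input csvs are already in ~/git/Data/<drug>/output, the script itself would be run as, e.g.: python combine_dfs.py -d pyrazinamide -g pncA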