#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Aug 6 12:56:03 2019

@author: tanu
'''
# FIXME: change filename 4 (mcsm normalised data) to be consistent,
# like (pnca_complex_mcsm_norm.csv): changed manually, but ensure this
# is done in the mcsm pipeline.
#=======================================================================
# Task: combine 4 dfs with aa position as the linking column.
#
# This is done in 2 steps:
#   merge 1: of 3 dfs (filenames in lowercase)
#       <gene>_dssp.csv
#       <gene>_kd.csv
#       <gene>_rd.csv
#   merge 2: of 2 dfs
#       gene.lower() + '_complex_mcsm_norm.csv' (!fix name)
#       output df from merge 1
#
# Input: the 4 csv files listed above
# Output: .csv of all 4 dfs combined
#
# useful link:
# https://stackoverflow.com/questions/23668427/pandas-three-way-joining-multiple-dataframes-on-columns
#=======================================================================
#%% load packages
import sys, os
import pandas as pd
#import numpy as np
import argparse
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')

# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
os.getcwd()
#=======================================================================
#%% command line args
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide')
arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive

# test values, kept for reference:
#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'TESTDRUG')
#arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = 'testGene') # case sensitive
args = arg_parser.parse_args()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
#gene_match = gene + '_p.'
drug = args.drug
gene = args.gene

#==========
# data dir
#==========
datadir = homedir + '/' + 'git/Data'

#=======
# input
#=======
indir = datadir + '/' + drug + '/' + 'output'
in_filename1 = gene.lower() + '_dssp.csv'
in_filename2 = gene.lower() + '_kd.csv'
in_filename3 = gene.lower() + '_rd.csv'
#in_filename4 = 'mcsm_complex1_normalised.csv' # FIXNAME
in_filename4 = gene.lower() + '_complex_mcsm_norm.csv'

infile1 = indir + '/' + in_filename1
infile2 = indir + '/' + in_filename2
infile3 = indir + '/' + in_filename3
infile4 = indir + '/' + in_filename4

print('\nInput path:', indir
      , '\nInput filename1:', in_filename1
      , '\nInput filename2:', in_filename2
      , '\nInput filename3:', in_filename3
      , '\nInput filename4:', in_filename4
      , '\n===================================================================')

#=======
# output
#=======
outdir = datadir + '/' + drug + '/' + 'output'
out_filename = gene.lower() + '_mcsm_struct_params.csv'
outfile = outdir + '/' + out_filename

print('Output filename:', out_filename
      , '\nOutput path:', outdir
      , '\n===================================================================')
#%% end of variable assignment for input and output files
#=======================================================================
#%% function/method to combine 4 dfs


def combine_dfs(dssp_csv, kd_csv, rd_csv, mcsm_csv, out_combined_csv):
    """
    Combine 4 dfs on the common 'position' column and write them as one csv.

    @param dssp_csv: csv file (output from dssp_df.py)
    @type dssp_csv: string

    @param kd_csv: csv file (output from kd_df.py)
    @type kd_csv: string

    @param rd_csv: csv file (output from rd_df.py)
    @type rd_csv: string

    @param mcsm_csv: csv file (output of the mcsm pipeline)
    @type mcsm_csv: string

    @param out_combined_csv: csv file output
    @type out_combined_csv: string

    @return: none, writes combined df as csv
    """
    #========================
    # read input csv files to combine
    #========================
    dssp_df = pd.read_csv(dssp_csv, sep = ',')
    kd_df = pd.read_csv(kd_csv, sep = ',')
    rd_df = pd.read_csv(rd_csv, sep = ',')
    mcsm_df = pd.read_csv(mcsm_csv, sep = ',')

    print('Reading input files:'
          , '\ndssp file:', dssp_csv
          , '\nNo. of rows:', len(dssp_df)
          , '\nNo. of cols:', len(dssp_df.columns)
          , '\nColumn names:', dssp_df.columns
          , '\n==================================================================='
          , '\nkd file:', kd_csv
          , '\nNo. of rows:', len(kd_df)
          , '\nNo. of cols:', len(kd_df.columns)
          , '\nColumn names:', kd_df.columns
          , '\n==================================================================='
          , '\nrd file:', rd_csv
          , '\nNo. of rows:', len(rd_df)
          , '\nNo. of cols:', len(rd_df.columns)
          , '\nColumn names:', rd_df.columns
          , '\n==================================================================='
          # BUG FIX: this label previously said 'rd file' (copy-paste error)
          , '\nmcsm file:', mcsm_csv
          , '\nNo. of rows:', len(mcsm_df)
          , '\nNo. of cols:', len(mcsm_df.columns)
          , '\nColumn names:', mcsm_df.columns
          , '\n===================================================================')

    #========================
    # merge 1 (combined_df1)
    # concatenating 3 dfs:
    # dssp_df, kd_df, rd_df
    #========================
    print('starting first merge...\n')

    # checking no. of rows
    print('Checking if no. of rows of the 3 dfs are equal:\n'
          , len(dssp_df) == len(kd_df) == len(rd_df)
          , '\nReason: fasta files and pdb files vary since not all pos are part of the structure'
          , '\n===================================================================')

    # variables for sanity checks
    # beware of hardcoding! used for sanity check
    expected_rows_df1 = max(len(dssp_df), len(kd_df), len(rd_df))
    ndfs = 3
    ncol_merge = 1
    offset = ndfs - ncol_merge
    # each df contributes the shared 'position' col once, hence the offset
    expected_cols_df1 = len(dssp_df.columns) + len(kd_df.columns) + len(rd_df.columns) - offset

    print('Merge 1:'
          , '\ncombining 3dfs by common col: position'
          , '\nExpected nrows in combined_df:', expected_rows_df1
          , '\nExpected ncols in combined_df:', expected_cols_df1
          , '\nResetting the common col as the index'
          , '\n===================================================================')

    # outer join on 'position' so positions missing from any one df are kept
    combined_df1 = pd.concat(
        (df.set_index('position') for df in [dssp_df, kd_df, rd_df])
        , axis = 1, join = 'outer').reset_index()

    # sanity check
    print('Checking dimensions of concatenated df1...')
    if len(combined_df1) == expected_rows_df1 and len(combined_df1.columns) == expected_cols_df1:
        print('PASS: combined df has expected dimensions'
              , '\nNo. of rows in combined df:', len(combined_df1)
              , '\nNo. of cols in combined df:', len(combined_df1.columns)
              , '\n===============================================================')
    else:
        print('FAIL: combined df does not have expected dimensions'
              , '\nNo. of rows in combined df:', len(combined_df1)
              , '\nNo. of cols in combined df:', len(combined_df1.columns)
              , '\n===============================================================')

    #========================
    # merge 2 (combined_df2)
    # concatenating 2 dfs:
    # mcsm_df, combined_df1 (result of merge1)
    #========================
    print('starting second merge...\n')

    # mcsm_df uses 'Position' (capitalised); combined_df1 uses 'position'.
    # NOTE: a bare mcsm_df.rename(columns = {'Position':'position'}) does
    # nothing unless the result is assigned (or inplace = True is passed),
    # which is why the earlier rename attempt "did not work". The column is
    # copied instead, which also keeps the original 'Position' col in the
    # output for backward compatibility.
    print('Firstly, copying \'Position\' col and renaming \'position\' to allow merging'
          , '\nNo. of cols before copying: ', len(mcsm_df.columns))
    mcsm_df['position'] = mcsm_df['Position']
    print('No. of cols after copying: ', len(mcsm_df.columns))

    # sanity check
    if mcsm_df['position'].equals(mcsm_df['Position']):
        print('PASS: Copying worked correctly'
              , '\ncopied col matches original column'
              , '\n===============================================================')
    else:
        print('FAIL: copied col does not match original column'
              , '\n================================================================')

    # variables for sanity checks
    # beware of hardcoding! used for sanity check
    expected_rows_df2 = len(mcsm_df)
    ndfs = 2
    ncol_merge = 1
    offset = ndfs - ncol_merge
    expected_cols_df2 = len(mcsm_df.columns) + len(combined_df1.columns) - offset

    print('Merge 2:'
          , '\ncombining 2dfs by common col: position'
          , '\nExpected nrows in combined_df:', expected_rows_df2
          , '\nExpected ncols in combined_df:', expected_cols_df2
          , '\n===================================================================')

    combined_df2 = mcsm_df.merge(combined_df1, on = 'position')

    # sanity check
    print('Checking dimensions of concatenated df2...')
    if len(combined_df2) == expected_rows_df2 and len(combined_df2.columns) == expected_cols_df2:
        print('PASS: combined df2 has expected dimensions'
              , '\nNo. of rows in combined df:', len(combined_df2)
              , '\nNo. of cols in combined df:', len(combined_df2.columns)
              , '\n===============================================================')
    else:
        print('FAIL: combined df2 does not have expected dimensions'
              , '\nNo. of rows in combined df:', len(combined_df2)
              , '\nNo. of cols in combined df:', len(combined_df2.columns)
              , '\n===============================================================')

    #===============
    # writing file
    #===============
    print('Writing file:'
          , '\nFilename:', out_combined_csv
#          , '\nPath:', outdir
          , '\nExpected no. of rows:', len(combined_df2)
          , '\nExpected no. of cols:', len(combined_df2.columns)
          , '\n=========================================================')

    combined_df2.to_csv(out_combined_csv, header = True, index = False)
#%% end of function
#=======================================================================
#%% call function
#combine_dfs(infile1, infile2, infile3, infile4, outfile)
#=======================================================================


def main():
    """Combine the 4 per-gene csv files and write the merged csv."""
    print('Combining 4 dfs:\n'
          , in_filename1, '\n'
          , in_filename2, '\n'
          , in_filename3, '\n'
          , in_filename4, '\n'
          , 'output csv:', out_filename)
    combine_dfs(infile1, infile2, infile3, infile4, outfile)
    print('Finished Writing file:'
          , '\nFilename:', out_filename
          , '\nPath:', outdir
##          , '\nNo. of rows:', ''
##          , '\nNo. of cols:', ''
          , '\n===========================================================')


if __name__ == '__main__':
    main()
#%% end of script
#=======================================================================