#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Aug 6 12:56:03 2019

@author: tanu
'''
# FIXME: change filename 2 (mcsm normalised data)
# to be consistent like (pnca_complex_mcsm_norm.csv) : changed manually,
# but ensure this is done in the mcsm pipeline
#=======================================================================
# Task: combine 2 dfs with aa position as linking column

# Input: 2 dfs
#   <gene>_complex_mcsm_norm.csv
#   <gene>_foldx.csv
#   (<gene> is the lower-cased --gene argument)

# Output: <gene>_mcsm_foldx.csv, the 2 dfs combined

# useful link
# https://stackoverflow.com/questions/23668427/pandas-three-way-joining-multiple-dataframes-on-columns
#=======================================================================
#%% load packages
import sys
import os
import pandas as pd
import numpy as np
#from varname import nameof
import argparse
from combining import combine_stability_dfs
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')

# set working dir
# NOTE(review): this runs at import time and after `combining` has already
# been imported, so it does not influence that import.
os.chdir(os.path.join(homedir, 'git', 'LSHTM_analysis', 'scripts'))
#=======================================================================
#%% command line args
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help='drug name', default='pyrazinamide')
arg_parser.add_argument('-g', '--gene', help='gene name', default='pncA')  # case sensitive
args = arg_parser.parse_args()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
#gene_match = gene + '_p.'
drug = args.drug
gene = args.gene

#======
# dirs
#======
datadir = os.path.join(homedir, 'git', 'Data')
indir = os.path.join(datadir, drug, 'input')  # currently unused, kept for reference
outdir = os.path.join(datadir, drug, 'output')

#=======
# input
#=======
# Both inputs are read from outdir (not indir): they are looked up in the
# drug's 'output' directory.
in_filename_mcsm = gene.lower() + '_complex_mcsm_norm.csv'
in_filename_foldx = gene.lower() + '_foldx.csv'

infile_mcsm = os.path.join(outdir, in_filename_mcsm)
infile_foldx = os.path.join(outdir, in_filename_foldx)

print('\nInput path:', outdir
      , '\nInput filename1:', in_filename_mcsm
      , '\nInput filename2:', in_filename_foldx
      , '\n============================================================')

#=======
# output
#=======
out_filename_comb = gene.lower() + '_mcsm_foldx.csv'
outfile_comb = os.path.join(outdir, out_filename_comb)

print('Output filename:', outfile_comb
      , '\n============================================================')

# join type passed on to combine_stability_dfs
# alternatives previously tried here: 'left', 'right', 'inner'
my_join_type = 'outer'

# end of variable assignment for input and output files
#%% call function
#=======================================================================
# combine_stability_dfs(infile_mcsm, infile_foldx, my_join = my_join_type)
#=======================================================================
def main():
    """Combine the mCSM and FoldX input files and write the result as csv.

    Passes the two input file paths and the join type to
    combine_stability_dfs, prints a preview of what it returns, and writes
    that result to outfile_comb without the index column.

    Reads the module-level names infile_mcsm, infile_foldx, my_join_type,
    in_filename_mcsm, in_filename_foldx and outfile_comb.
    """
    # Announce the work before starting it, so this message precedes any
    # output or error produced by combine_stability_dfs itself.
    print('Combining 2 dfs...'
          , '\nArguments to function combine_stability_dfs:'
          , '\ndf1:', in_filename_mcsm
          , '\ndf2:', in_filename_foldx
          , '\njoin_type:', my_join_type)

    # presumably returns a pandas DataFrame (head/to_csv/shape are used
    # below) -- verify against combining.py
    combined_df = combine_stability_dfs(infile_mcsm, infile_foldx, my_join = my_join_type)

    print('combined df sneak peak:\n'
          , combined_df.head())

    print('Writing output...')
    combined_df.to_csv(outfile_comb, index = False)
    print('Finished writing output file'
          , '\nOutput file:', outfile_comb
          , '\nDimensions:', combined_df.shape)


if __name__ == '__main__':
    main()
#=======================================================================
#%% end of script