diff --git a/scripts/combine_afs_ors.py b/scripts/combine_afs_ors.py
index 2fb0f8f..02a5243 100755
--- a/scripts/combine_afs_ors.py
+++ b/scripts/combine_afs_ors.py
@@ -128,7 +128,8 @@ print(merging_cols)
 nmerging_cols = len(merging_cols)
 print(' length of merging cols:', nmerging_cols
       , '\nmerging cols:', merging_cols, 'type:', type(merging_cols))
-
+
+# https://stackoverflow.com/questions/22720739/pandas-left-outer-join-results-in-table-larger-than-left-table
 # drop duplicates else the expected rows don't match
 print('Checking for duplicates in common col:', common_cols
       , '\nNo of duplicates:'
diff --git a/scripts/combining.py b/scripts/combining.py
new file mode 100755
index 0000000..8bab131
--- /dev/null
+++ b/scripts/combining.py
@@ -0,0 +1,146 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+'''
+Created on Tue Aug 6 12:56:03 2019
+
+@author: tanu
+'''
+# FIXME: change filename 2 (mcsm normalised data)
+# to be consistent like (pnca_complex_mcsm_norm.csv): changed manually, but ensure this is done in the mcsm pipeline
+#=======================================================================
+# Task: combine 2 dfs with aa position as linking column
+
+# Input: 2 dfs
+# _complex_mcsm_norm.csv
+# _foldx.csv
+
+# Output: .csv of the 2 dfs combined
+
+# useful link
+# https://stackoverflow.com/questions/23668427/pandas-three-way-joining-multiple-dataframes-on-columns
+#=======================================================================
+#%% load packages
+import sys, os
+import pandas as pd
+import numpy as np
+#from varname import nameof
+
+#%% end of package imports
+#=======================================================================
+#%% function/method to combine 2 dfs
+
+#def combine_stability_dfs(mcsm_df, foldx_df, out_combined_df):
+def combine_stability_dfs(mcsm_df, foldx_df, my_join = 'outer'):
+    """
+    Combine 2 dfs
+
+    @param mcsm_df: csv file (output from mcsm pipeline)
+    @type mcsm_df: string
+
+    @param foldx_df: csv file (output from runFoldx.py)
+    @type foldx_df: string
+
+    @param my_join: join type for pd.merge ('outer', 'inner', 'left' or 'right')
+    @type my_join: string
+
+    @return: combined df (duplicates on the merging cols dropped)
+    """
+    #========================
+    # read input csv files to combine
+    #========================
+    print('Reading input files:')
+
+    left_df = pd.read_csv(mcsm_df, sep = ',')
+    left_df.columns = left_df.columns.str.lower()
+
+    right_df = pd.read_csv(foldx_df, sep = ',')
+    right_df.columns = right_df.columns.str.lower()
+
+    print('Dimension left df:', left_df.shape
+          , '\nDimension right_df:', right_df.shape
+#          , '\njoin type:', join_type
+          , '\n=========================================================')
+
+    print('Finding common cols and merging cols:'
+          , '\n=========================================================')
+
+    common_cols = np.intersect1d(left_df.columns, right_df.columns).tolist()
+    print('Length of common cols:', len(common_cols)
+          , '\ncommon column/s:', common_cols, 'type:', type(common_cols))
+
+    print('Selecting consistent dtypes for merging (object i.e. string)')
+    merging_cols = left_df[common_cols].select_dtypes(include = [object]).columns.tolist()
+    nmerging_cols = len(merging_cols)
+    print('Length of merging cols:', nmerging_cols
+          , '\nmerging cols:', merging_cols, 'type:', type(merging_cols)
+          , '\n=========================================================')
+
+    #========================
+    # merge 1 (combined_df)
+    # concatenating 2 dfs:
+    # mcsm_df, foldx_df
+    #========================
+    # checking cross-over of mutations in the two dfs to merge
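+    # ndiff_1/ndiff_2 count how many merge keys of one df are present in the
+    # other; the remainders (ndiff1, ndiff2) are the unmatched mutations on
+    # each side, while comm holds the keys shared by both dfs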
+    #ndiff1 = left_df.shape[0] - left_df['mutationinformation'].isin(right_df['mutationinformation']).sum()
+    ndiff_1 = left_df[merging_cols].squeeze().isin(right_df[merging_cols].squeeze()).sum()
+    print('ndiff_1:', ndiff_1)
+
+    ndiff1 = left_df.shape[0] - ndiff_1
+    #print('There are', ndiff1, 'unmatched mutations in left df')
+
+    #missing_mutinfo = left_df[~left_df['mutationinformation'].isin(right_df['mutationinformation'])]
+    #missing_mutinfo.to_csv('infoless_muts.csv')
+
+    #ndiff2 = right_df.shape[0] - right_df['mutationinformation'].isin(left_df['mutationinformation']).sum()
+    ndiff_2 = right_df[merging_cols].squeeze().isin(left_df[merging_cols].squeeze()).sum()
+    print('ndiff_2:', ndiff_2)
+
+    ndiff2 = right_df.shape[0] - ndiff_2
+    #print('There are', ndiff2, 'unmatched mutations in right_df')
+
+    comm = np.intersect1d(left_df[merging_cols], right_df[merging_cols])
+    comm_count = len(comm)
+    print('inner:', comm, '\nlength:', comm_count, '\ntype:', type(comm_count))
+
+    #========================
+    # sanity checks for join type
+    #========================
+    fail = False
+    print('Combining with:', my_join, 'join')
+    combined_df = pd.merge(left_df, right_df, on = merging_cols, how = my_join)
+    combined_df1 = combined_df.drop_duplicates(subset = merging_cols, keep = 'first')
+
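+    # expected row counts assume the merging cols uniquely identify a row in
+    # each df (duplicates are dropped above):
+    #   inner        -> number of keys common to both dfs (comm_count)
+    #   outer        -> taken here as max(left rows, right rows), i.e. assuming
+    #                   the smaller df's keys are a subset of the larger df's
+    #   left / right -> row count of the respective df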
+    if my_join == 'inner':
+        #expected_rows = left_df.shape[0] - ndiff1
+        expected_rows = comm_count
+
+    if my_join == 'outer':
+        #expected_rows = right_df.shape[0] + ndiff1
+        expected_rows = max(left_df.shape[0], right_df.shape[0])
+
+    if my_join == 'right':
+        expected_rows = right_df.shape[0]
+
+    if my_join == 'left':
+        expected_rows = left_df.shape[0]
+
+    expected_cols = left_df.shape[1] + right_df.shape[1] - nmerging_cols
+
+    if len(combined_df1) == expected_rows and len(combined_df1.columns) == expected_cols:
+        print('PASS: successfully combined dfs with:', my_join, 'join')
+    else:
+        print('FAIL: combined_df\'s expected rows and cols not matched')
+        fail = True
+        print('\nExpected no. of rows:', expected_rows
+              , '\nGot:', len(combined_df1)
+              , '\nExpected no. of cols:', expected_cols
+              , '\nGot:', len(combined_df1.columns))
+    if fail:
+        sys.exit()
+
+    return combined_df1
+
+#%% end of function
+#=======================================================================
+
diff --git a/scripts/combining_mcsm_foldx.py b/scripts/combining_mcsm_foldx.py
new file mode 100755
index 0000000..434ab92
--- /dev/null
+++ b/scripts/combining_mcsm_foldx.py
@@ -0,0 +1,112 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+'''
+Created on Tue Aug 6 12:56:03 2019
+
+@author: tanu
+'''
+# FIXME: change filename 2 (mcsm normalised data)
+# to be consistent like (pnca_complex_mcsm_norm.csv): changed manually, but ensure this is done in the mcsm pipeline
+#=======================================================================
+# Task: combine 2 dfs with aa position as linking column
+
+# Input: 2 dfs
+# _complex_mcsm_norm.csv
+# _foldx.csv
+
+# Output: .csv of the 2 dfs combined
+
+# useful link
+# https://stackoverflow.com/questions/23668427/pandas-three-way-joining-multiple-dataframes-on-columns
+#=======================================================================
+#%% load packages
+import sys, os
+import pandas as pd
+import numpy as np
+#from varname import nameof
+import argparse
+from combining import combine_stability_dfs
+#=======================================================================
+#%% specify input and curr dir
+homedir = os.path.expanduser('~')
+
+# set working dir
+os.getcwd()
+os.chdir(homedir + '/git/LSHTM_analysis/scripts')
+os.getcwd()
+#=======================================================================
+#%% command line args
+arg_parser = argparse.ArgumentParser()
+arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide')
+arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive
+args = arg_parser.parse_args()
+#=======================================================================
+#%% variable assignment: input and output
+#drug = 'pyrazinamide'
+#gene = 'pncA'
+#gene_match = gene + '_p.'
+
+drug = args.drug
+gene = args.gene
+#======
+# dirs
+#======
+datadir = homedir + '/' + 'git/Data'
+indir = datadir + '/' + drug + '/' + 'output'
+outdir = datadir + '/' + drug + '/' + 'output'
+
+#=======
+# input
+#=======
+in_filename_mcsm = gene.lower() + '_complex_mcsm_norm.csv'
+in_filename_foldx = gene.lower() + '_foldx.csv'
+
+infile_mcsm = indir + '/' + in_filename_mcsm
+infile_foldx = indir + '/' + in_filename_foldx
+
+print('\nInput path:', indir
+      , '\nInput filename1:', in_filename_mcsm
+      , '\nInput filename2:', in_filename_foldx
+      , '\n============================================================')
+
+#=======
+# output
+#=======
+out_filename_comb = gene.lower() + '_mcsm_foldx.csv'
+outfile_comb = outdir + '/' + out_filename_comb
+print('Output filename:', outfile_comb
+      , '\n============================================================')
+
+my_join_type = 'outer'
+#my_join_type = 'left'
+#my_join_type = 'right'
+#my_join_type = 'inner'
+
+# end of variable assignment for input and output files
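+
+# example invocation (using the argparse defaults above):
+#   ./combining_mcsm_foldx.py --drug pyrazinamide --gene pncA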
+#%% call function
+#=======================================================================
+#combine_stability_dfs(mcsm_df, foldx_df, outfile)
+#=======================================================================
+def main():
+
+    combined_df = combine_stability_dfs(infile_mcsm, infile_foldx, my_join = my_join_type)
+    print('Combining 2 dfs...'
+          , '\nArguments to function combine_stability_dfs:'
+          , '\ndf1:', in_filename_mcsm
+          , '\ndf2:', in_filename_foldx
+          , '\njoin_type:', my_join_type
+          , '\ncombined df sneak peek:\n'
+          , combined_df.head())
+
+    print('Writing output...')
+
+    combined_df.to_csv(outfile_comb, index = False)
+
+    print('Finished writing output file'
+          , '\nOutput file:', outfile_comb
+          , '\nDimensions:', combined_df.shape)
+
+if __name__ == '__main__':
+    main()
+#=======================================================================
+#%% end of script
\ No newline at end of file
diff --git a/scripts/reference_dict.py b/scripts/reference_dict.py
old mode 100644
new mode 100755
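A minimal sketch (outside the patch) of how the new combine_stability_dfs() could be exercised on toy inputs; the toy csv names and the 'duet_score'/'foldx_score' columns are illustrative assumptions, with 'mutationinformation' as the shared string key the merge is expected to pick up.

import pandas as pd
from combining import combine_stability_dfs

# two tiny csv inputs sharing only the 'mutationinformation' key column
pd.DataFrame({'mutationinformation': ['A1B', 'C2D'],
              'duet_score': [0.1, 0.2]}).to_csv('toy_mcsm.csv', index = False)
pd.DataFrame({'mutationinformation': ['A1B', 'C2D'],
              'foldx_score': [1.5, 2.5]}).to_csv('toy_foldx.csv', index = False)

# outer join on the shared key; the built-in sanity check should print PASS
toy_combined = combine_stability_dfs('toy_mcsm.csv', 'toy_foldx.csv', my_join = 'outer')
print(toy_combined)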