#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Aug 6 12:56:03 2019

@author: tanu
'''
# FIXME: change filename 2 (mcsm normalised data) to be consistent, like
# pnca_complex_mcsm_norm.csv. It was changed manually; make sure the mcsm
# pipeline does this as well.
#=======================================================================
# Task: combine dfs using their common column(s) as the link
#
# Originally written for 2 dfs (<gene>_complex_mcsm_norm.csv and
# <gene>_foldx.csv) with a combined .csv as output. Those inputs and the
# output file are commented out below; the active inputs are the dssp, kd,
# snp info, af/or and af/or kinship files.
#
# useful link
# https://stackoverflow.com/questions/23668427/pandas-three-way-joining-multiple-dataframes-on-columns
#=======================================================================
#%% load packages
import sys
import os
import pandas as pd
import numpy as np
#from varname import nameof
import argparse
from combining import combine_stability_dfs, detect_common_cols
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')

# set working dir (the getcwd() results are discarded; they only matter
# when the cell is evaluated interactively)
os.getcwd()
os.chdir('/'.join([homedir, 'git/LSHTM_analysis/scripts']))
os.getcwd()
#=======================================================================
#%% command line args (disabled: drug and gene are hard-coded below)
#arg_parser = argparse.ArgumentParser()
#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide')
#arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive
#args = arg_parser.parse_args()
#=======================================================================
#%% variable assignment: input and output
drug = 'pyrazinamide'
gene = 'pncA'
gene_match = '{}_p.'.format(gene)
#drug = args.drug
#gene = args.gene

#======
# dirs
#======
datadir = '/'.join([homedir, 'git/Data'])
indir = '/'.join([datadir, drug, 'input'])
outdir = '/'.join([datadir, drug, 'output'])

#=======
# input
#=======
# every input filename is keyed on the lower-cased gene name
_gene_lc = gene.lower()

#in_filename_linking = gene.lower() + '_linking_df.csv'
#in_filename_mcsm = gene.lower() + '_complex_mcsm_norm.csv'
#in_filename_foldx = gene.lower() + '_foldx.csv'
in_filename_dssp = _gene_lc + '_dssp.csv'
in_filename_kd = _gene_lc + '_kd.csv'
#in_filename_rd = gene.lower() + '_rd.csv'
in_filename_snpinfo = 'ns' + _gene_lc + '_snp_info.csv'
in_filename_afor = _gene_lc + '_af_or.csv'
in_filename_afor_kin = _gene_lc + '_af_or_kinship.csv'

# NOTE: only the snp info file is read from indir; every other input is
# read from outdir.
#infile_linking = outdir + '/' + in_filename_linking
#infile_mcsm = outdir + '/' + in_filename_mcsm
#infile_foldx = outdir + '/' + in_filename_foldx
infile_dssp = '/'.join([outdir, in_filename_dssp])
infile_kd = '/'.join([outdir, in_filename_kd])
#infile_rd = outdir + '/' + in_filename_rd
infile_snpinfo = '/'.join([indir, in_filename_snpinfo])
infile_afor = '/'.join([outdir, in_filename_afor])
infile_afor_kin = '/'.join([outdir, in_filename_afor_kin])

print(
    '\nInput path:', outdir
    # , '\nInput filename1:', infile_mcsm
    # , '\nInput filename2:', infile_foldx
    , '\nInput filename2:', infile_dssp
    , '\nInput filename2:', infile_kd
    # , '\nInput filename2:', infile_rd
    , '\nInput filename snp info:', infile_snpinfo
    , '\nInput filename af or:', infile_afor
    , '\nInput filename afor kinship:', infile_afor_kin
    , '\n============================================================'
)

#=======
# output
#=======
# (disabled: nothing is written to disk at the moment)
#out_filename_comb = gene.lower() + '_struct_params_TEST.csv'
#outfile_comb = outdir + '/' + out_filename_comb
#print('Output filename:', outfile_comb
#      , '\n============================================================')

# join types passed on to the merge calls
o_join = 'outer'
l_join = 'left'
r_join = 'right'
i_join = 'inner'

#del(in_filename_dssp, in_filename_foldx)
# end of variable assignment for input and output files
#=======================================================================
# call function to detect common cols
#=======================================================================
def _read_csv_lowercase(csv_path):
    '''Read a comma-separated file and lower-case all its column labels.'''
    frame = pd.read_csv(csv_path, sep=',')
    frame.columns = frame.columns.str.lower()
    return frame


def main():
    '''
    Read the af/or, snp info and af/or kinship files and combine them.

    The af/or and snp info frames are combined with a left join through
    combine_stability_dfs(). The kinship frame is then read and only
    passed to detect_common_cols(); the outer join that would combine it
    is commented out. Nothing is returned or written to disk.
    '''
    print('Reading input files:')

    # --- dssp + kd combination (disabled) ---
    #dssp_df = pd.read_csv(infile_dssp, sep = ',')
    #dssp_df.columns = dssp_df.columns.str.lower()
    #kd_df = pd.read_csv(infile_kd, sep = ',')
    #kd_df.columns = kd_df.columns.str.lower()
    #print('Dimension left df:', dssp_df.shape
    #      , '\nDimension right_df:', kd_df.shape
    #      , '\njoin type:', o_join
    #      , '\n=========================================================')
    #merging_cols = detect_common_cols(dssp_df, kd_df)
    #print('Length of common cols:', len(merging_cols)
    #      , '\nmerging column/s:', merging_cols, 'type:', type(merging_cols)
    #      , '\ndtypes in merging columns:', dssp_df[merging_cols].dtypes)
    #combined_df1 = combine_stability_dfs(dssp_df, kd_df, my_join = o_join)
    #print('Dimensions of combined df:', combined_df1.shape
    #      , '\nsneak peak:', combined_df1.head()
    #      , '\ndtypes in cols:\n', combined_df1.dtypes)

    #=========================================================================
    # af/or + snp info
    af_or = _read_csv_lowercase(infile_afor)
    snp_info = _read_csv_lowercase(infile_snpinfo)
    #print('Dimension df1:', afor_df.shape
    #      , '\nDimension df2:', snpinfo_df.shape
    #      , '\njoin type:', l_join
    #      , '\n=========================================================')

    # detect common cols; the result is not used further here
    # NOTE(review): presumably the labels shared by both frames -- confirm
    # against the combining module
    shared_cols = detect_common_cols(af_or, snp_info)
    #print('Length of common cols:', len(merging_cols)
    #      , '\nmerging column/s:', merging_cols, 'type:', type(merging_cols)
    #      , '\ndtypes in merging columns:', snpinfo_df[merging_cols].dtypes)

    af_or_snp = combine_stability_dfs(af_or, snp_info, my_join=l_join)
    #print('Dimensions of combined df:', comb_afor_snpinfo.shape
    #      , '\nsneak peak:', comb_afor_snpinfo.head()
    #      , '\ndtypes in cols:\n', comb_afor_snpinfo.dtypes)

    #=========================================================================
    # (af/or + snp info) + af/or kinship
    af_or_kin = _read_csv_lowercase(infile_afor_kin)

    # detect common cols; again the result is not used further
    shared_cols = detect_common_cols(af_or_snp, af_or_kin)

    #comb2 = combine_stability_dfs(comb_afor_snpinfo, afor_kin_df, my_join = o_join)
    #print('Dimensions of combined df:', comb2.shape
    #      , '\nsneak peak:', comb2.head()
    #      , '\ndtypes in cols:\n', comb2.dtype)


if __name__ == '__main__':
    main()
#=======================================================================
#%% end of script

# hardcoded test
# NOTE: everything below is module-level code placed after the __main__
# guard, so it also runs when this file is imported. It re-reads the input
# files and, unlike main(), does not lower-case the column labels.
dssp_df = pd.read_csv(infile_dssp, sep=',')    # read but not used below
kd_df = pd.read_csv(infile_kd, sep=',')        # read but not used below
afor_df = pd.read_csv(infile_afor, sep=',')
snpinfo_df = pd.read_csv(infile_snpinfo, sep=',')
afor_kin_df = pd.read_csv(infile_afor_kin, sep=',')

# hard-coded list of columns used as the key for the second merge
merging_cols = [
    'alt_allele',
    'chr_num_allele',
    'chromosome_number',
    'gene_id',
    'gene_number',
    'mut_info',
    'mut_region',
    'mut_type',
    'mutant_type',
    'mutationinformation',
    'position',
    'ref_allele',
    'wild_type',
]

print('doing thing')

# first merge: af/or with snp info on the single 'mutation' column
comb_afor_snpinfo = pd.merge(
    afor_df,
    snpinfo_df,
    on='mutation',
    how='inner',
)

# second merge: add the kinship frame using the hard-coded key columns
comb2 = pd.merge(
    comb_afor_snpinfo,
    afor_kin_df,
    on=merging_cols,
    how=i_join,
)

# keep the first row for each distinct combination of key columns
comb3 = comb2.drop_duplicates(subset=merging_cols, keep='first')

# unique 'mutationinformation' values present in both frames
common = np.intersect1d(
    comb_afor_snpinfo['mutationinformation'],
    afor_kin_df['mutationinformation'],
)

print(
    'comb3 dim:', comb3.shape
    , '\ncomb2 dim:', comb2.shape
    , '\ndim of df1:', comb_afor_snpinfo.shape
    , '\ndim of df2:', afor_kin_df.shape
    , '\ncommon vals:', len(common)
)

print('expected:\n')

# same two frames through the project helper, with an outer join
bar = combine_stability_dfs(comb_afor_snpinfo, afor_kin_df, my_join=o_join)
print('XXXXXX\n:', bar.shape)

#bar = np.intersect1d(comb_afor_snpinfo[merging_cols[0]], afor_kin_df[merging_cols[0]])
#print('common values:',len(bar))
#comb2 = combine_stability_dfs(comb_afor_snpinfo, afor_kin_df, my_join = o_join)

print('thing finished')