#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Aug 6 12:56:03 2019

@author: tanu
'''
#=======================================================================
# Task: combining all dfs to a single one

# Input: 8 dfs
#1) <gene>.lower()'_complex_mcsm_norm.csv'
#2) <gene>.lower()_foldx.csv'
#3) <gene>.lower()_dssp.csv'
#4) <gene>.lower()_kd.csv'
#5) <gene>.lower()_rd.csv'
#6) 'ns' + <gene>.lower()_snp_info.csv'
#7) <gene>.lower()_af_or.csv'
#8) <gene>.lower() _af_or_kinship.csv

# combining order
#Merge1 = 1 + 2
#Merge2 = 3 + 4
#Merge3 = Merge2 + 5
#Merge4 = Merge1 + Merge3
#Merge5 = 6 + 7
#Merge6 = Merge5 + 8
#Merge7 = Merge4 + Merge6

# Output: single csv of all 8 dfs combined

# NOTE(review): the code below only performs Merge1..Merge4 (plus a deepddg
# merge between Merge1 and Merge2) and writes
# <gene>.lower() + '_comb_stab_struc_params.csv'. The snp_info / af_or /
# af_or_kinship paths (6-8) are built and printed but never read here, and
# 'outfile_comb' is never written. Confirm whether Merge5..Merge7 live
# elsewhere.

# useful link
# https://stackoverflow.com/questions/23668427/pandas-three-way-joining-multiple-dataframes-on-columns
#=======================================================================
#%% load packages
import sys, os
import pandas as pd
from pandas import DataFrame
import numpy as np
#from varname import nameof
import argparse
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')

# set working dir: the local imports below are resolved relative to the
# scripts directory, so the chdir must happen before them.
os.getcwd()
os.chdir(os.path.join(homedir, 'git', 'LSHTM_analysis', 'scripts'))
os.getcwd()

# FIXME: local imports
#from combining import combine_dfs_with_checks
from combining_FIXME import detect_common_cols
from reference_dict import oneletter_aa_dict
from reference_dict import low_3letter_dict
from aa_code import get_aa_3lower
from aa_code import get_aa_1upper

# REGEX: as required
# mcsm_regex = r'^([A-Za-z]{1})([0-9]+)([A-Za-z]{1})$'
# mcsm_wt = mcsm_df['mutationinformation'].str.extract(mcsm_regex)[0]
# mcsm_mut = mcsm_df['mutationinformation'].str.extract(mcsm_regex)[2]
# gwas_regex = r'^([A-Za-z]{3})([0-9]+)([A-Za-z]{3})$'
# gwas_wt = mcsm_df['mutation'].str.extract(gwas_regex)[0]
# gwas_pos = mcsm_df['mutation'].str.extract(gwas_regex)[1]
# gwas_mut = mcsm_df['mutation'].str.extract(gwas_regex)[2]
#=======================================================================
#%% command line args: case sensitive
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '')
arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data')
arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + + input')
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + + output')
arg_parser.add_argument('--debug', action ='store_true', help = 'Debug Mode')

args = arg_parser.parse_args()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
drug = args.drug
gene = args.gene

datadir = args.datadir
indir = args.input_dir
outdir = args.output_dir

gene_match = gene + '_p.'
print('mut pattern for gene', gene, ':', gene_match)

# !"Redundant, now that improvements have been made!
# See section "REGEX"
# nssnp_match = gene_match +'[A-Za-z]{3}[0-9]+[A-Za-z]{3}'
# print('nsSNP for gene', gene, ':', nssnp_match)
# wt_regex = gene_match.lower()+'([A-Za-z]{3})'
# print('wt regex:', wt_regex)
# mut_regex = r'[0-9]+(\w{3})$'
# print('mt regex:', mut_regex)
# pos_regex = r'([0-9]+)'
# print('position regex:', pos_regex)
#%%=======================================================================
#==============
# directories
#==============
# Paths are assembled with os.path.join so that user-supplied directories
# work with or without a trailing separator (plain string concatenation
# silently produced paths such as '/some/outgene_foldx.csv').
if not datadir:
    datadir = os.path.join(homedir, 'git', 'Data')

if not indir:
    indir = os.path.join(datadir, drug, 'input')

if not outdir:
    outdir = os.path.join(datadir, drug, 'output')

#=======
# input
#=======
#in_filename_mcsm = gene.lower() + '_complex_mcsm_norm.csv'
in_filename_mcsm = gene.lower() + '_complex_mcsm_norm_SAM.csv' # gidb
in_filename_foldx = gene.lower() + '_foldx.csv'
in_filename_dssp = gene.lower() + '_dssp.csv'
in_filename_kd = gene.lower() + '_kd.csv'
in_filename_rd = gene.lower() + '_rd.csv'
in_filename_deepddg = gene.lower() + '_complex_ddg_results.txt' # change to decent filename and put it in the correct dir
in_filename_snpinfo = 'ns' + gene.lower() + '_snp_info_f.csv' # gwas f info
in_filename_afor = gene.lower() + '_af_or.csv'
in_filename_afor_kin = gene.lower() + '_af_or_kinship.csv'

# NOTE: every input is read from the *output* directory; 'indir' is only
# reported below and not used to locate any file in this script.
infile_mcsm = os.path.join(outdir, in_filename_mcsm)
infile_foldx = os.path.join(outdir, in_filename_foldx)
infile_dssp = os.path.join(outdir, in_filename_dssp)
infile_kd = os.path.join(outdir, in_filename_kd)
infile_rd = os.path.join(outdir, in_filename_rd)
infile_deepddg = os.path.join(outdir, 'deep_ddg', in_filename_deepddg)
infile_snpinfo = os.path.join(outdir, in_filename_snpinfo)
infile_afor = os.path.join(outdir, in_filename_afor)
infile_afor_kin = os.path.join(outdir, in_filename_afor_kin)

print('\nInput path:', indir
      , '\nOutput path:', outdir, '\n'
      , '\nInput filename mcsm:', infile_mcsm
      , '\nInput filename foldx:', infile_foldx, '\n'
      , '\nInput filename dssp:', infile_dssp
      , '\nInput filename kd:', infile_kd
      , '\nInput filename rd', infile_rd
      # the deepddg file is read further down, so report it as well
      , '\nInput filename deepddg:', infile_deepddg
      , '\n'
      , '\nInput filename snp info:', infile_snpinfo, '\n'
      , '\nInput filename af or:', infile_afor
      , '\nInput filename afor kinship:', infile_afor_kin
      , '\n============================================================')

#=======
# output
#=======
out_filename_comb = gene.lower() + '_all_params.csv'
outfile_comb = os.path.join(outdir, out_filename_comb)
print('Output filename:', outfile_comb
      , '\n===================================================================')

# join types passed to pd.merge(how = ...)
o_join = 'outer'
l_join = 'left'
r_join = 'right'
i_join = 'inner'

# end of variable assignment for input and output files
#%%============================================================================
print('==================================='
      , '\nFirst merge: mcsm + foldx'
      , '\n===================================')

mcsm_df = pd.read_csv(infile_mcsm, sep = ',')

# add 3 lowercase aa code for wt and mutant
# (return value is not captured; presumably the helper adds the columns to
# mcsm_df in place -- verify against aa_code.get_aa_3lower)
get_aa_3lower(df = mcsm_df
              , wt_colname = 'wild_type'
              , mut_colname = 'mutant_type'
              , col_wt = 'wt_aa_3lower'
              , col_mut = 'mut_aa_3lower')

#mcsm_df.columns = mcsm_df.columns.str.lower()
foldx_df = pd.read_csv(infile_foldx , sep = ',')

#mcsm_foldx_dfs = combine_dfs_with_checks(mcsm_df, foldx_df, my_join = o_join)
merging_cols_m1 = detect_common_cols(mcsm_df, foldx_df)
mcsm_foldx_dfs = pd.merge(mcsm_df, foldx_df, on = merging_cols_m1, how = o_join)

ncols_m1 = len(mcsm_foldx_dfs.columns)

print('\n\nResult of first merge:', mcsm_foldx_dfs.shape
      , '\n===================================================================')

# interactive sanity checks: results are discarded when run as a script
mcsm_foldx_dfs[merging_cols_m1].apply(len)
mcsm_foldx_dfs[merging_cols_m1].apply(len) == len(mcsm_foldx_dfs)
#%%
print('==================================='
      , '\nSecond merge: mcsm_foldx_dfs + deepddg'
      , '\n===================================')

deepddg_df = pd.read_csv(infile_deepddg, sep = ' ')
deepddg_df.columns

deepddg_df.rename(columns = {'#chain' : 'chain_id'
                             , 'WT' : 'wild_type_deepddg'
                             , 'ResID' : 'position'
                             , 'Mut' : 'mutant_type_deepddg'}
                  , inplace = True)

# build the merge key: <wild type><position><mutant>
deepddg_df['mutationinformation'] = (deepddg_df['wild_type_deepddg']
                                     + deepddg_df['position'].map(str)
                                     + deepddg_df['mutant_type_deepddg'])

# add deepddg outcome column: <0 --> Destabilising, otherwise (>= 0) --> Stabilising
deepddg_df['deepddg_outcome'] = np.where(deepddg_df['deepddg'] < 0, 'Destabilising', 'Stabilising')
deepddg_df['deepddg_outcome'].value_counts()

# drop extra columns to allow clean merging
deepddg_short_df = deepddg_df.drop(['chain_id', 'wild_type_deepddg', 'position', 'mutant_type_deepddg'], axis = 1)

# rearrange columns
deepddg_short_df.columns
deepddg_short_df = deepddg_short_df[["mutationinformation", "deepddg", "deepddg_outcome"]]

# left join: keep every mcsm/foldx row, deepddg values where available
mcsm_foldx_deepddg_dfs = pd.merge(mcsm_foldx_dfs, deepddg_short_df, on = 'mutationinformation', how = l_join)
mcsm_foldx_deepddg_dfs['deepddg_outcome'].value_counts()

ncols_deepddg_merge = len(mcsm_foldx_deepddg_dfs.columns)
#%%============================================================================
print('==================================='
      , '\nSecond merge: dssp + kd'
      , '\n===================================')

dssp_df = pd.read_csv(infile_dssp, sep = ',')
kd_df = pd.read_csv(infile_kd, sep = ',')
rd_df = pd.read_csv(infile_rd, sep = ',')

#dssp_kd_dfs = combine_dfs_with_checks(dssp_df, kd_df, my_join = o_join)
merging_cols_m2 = detect_common_cols(dssp_df, kd_df)
dssp_kd_dfs = pd.merge(dssp_df, kd_df, on = merging_cols_m2, how = o_join)

print('\n\nResult of second merge:', dssp_kd_dfs.shape
      , '\n===================================================================')
#%%============================================================================
print('==================================='
      , '\nThird merge: second merge + rd_df'
      , '\ndssp_kd_dfs + rd_df'
      , '\n===================================')

#dssp_kd_rd_dfs = combine_dfs_with_checks(dssp_kd_dfs, rd_df, my_join = o_join)
merging_cols_m3 = detect_common_cols(dssp_kd_dfs, rd_df)
dssp_kd_rd_dfs = pd.merge(dssp_kd_dfs, rd_df, on = merging_cols_m3, how = o_join)

ncols_m3 = len(dssp_kd_rd_dfs.columns)

print('\n\nResult of Third merge:', dssp_kd_rd_dfs.shape
      , '\n===================================================================')

# interactive sanity checks: results are discarded when run as a script
dssp_kd_rd_dfs[merging_cols_m3].apply(len)
dssp_kd_rd_dfs[merging_cols_m3].apply(len) == len(dssp_kd_rd_dfs)
#%%============================================================================
print('======================================='
      , '\nFourth merge: First merge + Third merge'
      , '\nmcsm_foldx_dfs + dssp_kd_rd_dfs'
      , '\n=======================================')

#combined_df = combine_dfs_with_checks(mcsm_foldx_dfs, dssp_kd_rd_dfs, my_join = i_join)
#merging_cols_m4 = detect_common_cols(mcsm_foldx_dfs, dssp_kd_rd_dfs)
#combined_df = pd.merge(mcsm_foldx_dfs, dssp_kd_rd_dfs, on = merging_cols_m4, how = i_join)
#combined_df_expected_cols = ncols_m1 + ncols_m3 - len(merging_cols_m4)

# with deepddg values
merging_cols_m4 = detect_common_cols(mcsm_foldx_deepddg_dfs, dssp_kd_rd_dfs)
combined_df = pd.merge(mcsm_foldx_deepddg_dfs, dssp_kd_rd_dfs, on = merging_cols_m4, how = i_join)

# shared key columns appear once in the merged frame, hence the subtraction
combined_df_expected_cols = ncols_deepddg_merge + ncols_m3 - len(merging_cols_m4)

# the merge must neither gain/lose rows relative to mcsm_df nor produce an
# unexpected number of columns; abort otherwise
if len(combined_df) == len(mcsm_df) and len(combined_df.columns) == combined_df_expected_cols:
    print('PASS: successfully combined 5 dfs'
          , '\nNo. of rows combined_df:', len(combined_df)
          , '\nNo. of cols combined_df:', len(combined_df.columns))
else:
    sys.exit('FAIL: check individual df merges')

print('\nResult of Fourth merge:', combined_df.shape
      , '\n===================================================================')

# interactive sanity checks: results are discarded when run as a script
combined_df[merging_cols_m4].apply(len)
combined_df[merging_cols_m4].apply(len) == len(combined_df)
#%%============================================================================
# Format the combined df columns
combined_df_colnames = combined_df.columns

# check redundant columns (results discarded when run as a script)
combined_df['chain'].equals(combined_df['chain_id'])
combined_df['wild_type'].equals(combined_df['wild_type_kd']) # has nan
combined_df['wild_type'].equals(combined_df['wild_type_dssp'])

#sanity check
foo = combined_df[['wild_type', 'wild_type_kd', 'wt_3letter_caps', 'wt_aa_3lower', 'mut_aa_3lower']]

# Drop cols
cols_to_drop = ['chain_id', 'wild_type_kd', 'wild_type_dssp', 'wt_3letter_caps']
combined_df_clean = combined_df.drop(cols_to_drop, axis = 1)

del(foo)
#%%============================================================================
# Output columns
out_filename_stab_struc = gene.lower() + '_comb_stab_struc_params.csv'
outfile_stab_struc = os.path.join(outdir, out_filename_stab_struc)
print('Output filename:', outfile_stab_struc
      , '\n===================================================================')

# write csv
# NOTE(review): 'combined_df' (with the redundant columns still present) is
# what gets written; 'combined_df_clean' above is computed but unused.
# Left unchanged because downstream readers may depend on the current
# column set -- confirm which frame is intended.
print('Writing file: combined stability and structural parameters')
combined_df.to_csv(outfile_stab_struc, index = False)
print('\nFinished writing file:'
      , '\nNo. of rows:', combined_df.shape[0]
      , '\nNo. of cols:', combined_df.shape[1])
#%% end of script