From 69e2567ffc0649f566baf28742d794d8bbeddd59 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall
Date: Thu, 26 Mar 2020 17:14:20 +0000
Subject: [PATCH] added script to combined dfs of structural params like kd, dssp & rd

---
 meta_data_analysis/combine_dfs.py | 152 ++++++++++++++++++++++++++++++
 1 file changed, 152 insertions(+)
 create mode 100755 meta_data_analysis/combine_dfs.py

diff --git a/meta_data_analysis/combine_dfs.py b/meta_data_analysis/combine_dfs.py
new file mode 100755
index 0000000..f298324
--- /dev/null
+++ b/meta_data_analysis/combine_dfs.py
@@ -0,0 +1,152 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+'''
+Created on Tue Aug 6 12:56:03 2019
+
+@author: tanu
+'''
+#=============================================================================
+# Task: combine 3 dfs side by side, using the 'position' column as the linking column
+
+# Input: 3 dfs
+# pnca_dssp.csv
+# pnca_kd.csv
+# pnca_rd.csv
+
+# Output: pnca_struct_params.csv containing the columns of all 3 dfs, one row per position
+#=============================================================================
+#%% load packages
+import sys, os
+import pandas as pd
+#import numpy as np
+#=============================================================================
+#%% specify input and curr dir
+homedir = os.path.expanduser('~')
+
+# set working dir
+os.getcwd()
+os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
+os.getcwd()
+#=============================================================================
+#%% variable assignment: input and output
+drug = 'pyrazinamide'
+gene = 'pncA'
+gene_match = gene + '_p.' # not used further in this script
+
+#==========
+# data dir
+#==========
+#indir = 'git/Data/pyrazinamide/input/original'
+datadir = homedir + '/' + 'git/Data'
+
+#=======
+# input
+#=======
+#indir = 'git/Data/pyrazinamide/input/original'
+indir = datadir + '/' + drug + '/' + 'output'
+in_filename1 = 'pnca_dssp.csv'
+in_filename2 = 'pnca_kd.csv'
+in_filename3 = 'pnca_rd.csv'
+
+infile1 = indir + '/' + in_filename1
+infile2 = indir + '/' + in_filename2
+infile3 = indir + '/' + in_filename3
+
+print('\nInput path:', indir
+      , '\nInput filename1:', in_filename1
+      , '\nInput filename2:', in_filename2
+      , '\nInput filename3:', in_filename3
+      , '\n===================================================================')
+
+#=======
+# output
+#=======
+outdir = datadir + '/' + drug + '/' + 'output'
+out_filename = gene.lower() + '_struct_params.csv'
+outfile = outdir + '/' + out_filename
+print('Output filename:', out_filename
+      , '\nOutput path:', outdir
+      , '\n===================================================================')
+
+#%% end of variable assignment for input and output files
+#=======================================================================
+#%% Read input file
+dssp_df = pd.read_csv(infile1, sep = ',')
+kd_df = pd.read_csv(infile2, sep = ',')
+rd_df = pd.read_csv(infile3, sep = ',')
+
+print('Reading input files:'
+      , '\ndssp file:', infile1
+      , '\nNo. of rows:', len(dssp_df)
+      , '\nNo. of cols:', len(dssp_df.columns)
+      , '\nColumn names:', dssp_df.columns
+      , '\n==================================================================='
+      , '\nkd file:', infile2
+      , '\nNo. of rows:', len(kd_df)
+      , '\nNo. of cols:', len(kd_df.columns)
+      , '\nColumn names:', kd_df.columns
+      , '\n==================================================================='
+      , '\nrd file:', infile3
+      , '\nNo. of rows:', len(rd_df)
+      , '\nNo. of cols:', len(rd_df.columns)
+      , '\nColumn names:', rd_df.columns
+      , '\n===================================================================')
+
+#========================
+# checking no. of rows
+#========================
+print('Checking if no. of rows of the 3 dfs are equal:\n'
+      , len(dssp_df) == len(kd_df) == len(rd_df)
+      , '\nReason: fasta files and pdb files vary since not all pos are part of the structure'
+      , '\n===================================================================')
+
+#===================
+# concatenating dfs
+#===================
+# expected nrows assumes every position in the smaller dfs is also present in the largest df
+expected_rows = max(len(dssp_df), len(kd_df), len(rd_df))
+
+# beware of hardcoding! used for sanity check
+offset = 2 # 'position' is in each of the 3 dfs but appears only once in combined_df (3 - 1)
+expected_cols = len(dssp_df.columns) + len(kd_df.columns) + len(rd_df.columns) - offset
+
+print('Combining dfs by commom col: position'
+      , '\nExpected nrows in combined_df:', expected_rows
+      , '\nExpected ncols in combined_df:', expected_cols
+      , '\nResetting the common col as the index'
+      , '\n===================================================================')
+
+dssp_df.set_index('position',inplace = True)
+kd_df.set_index('position',inplace = True)
+rd_df.set_index('position',inplace =True)
+# axis = 1 aligns rows on the index (position) with an outer join; missing positions become NaN
+combined_df = pd.concat([dssp_df, kd_df, rd_df], axis = 1, sort = False).reset_index()
+combined_df = combined_df.rename(columns = {'index':'position'}) # rename() returns a new df; no-op if col is already 'position'
+
+print('Checking dimensions of concatenated df...')
+if len(combined_df) == expected_rows and len(combined_df.columns) == expected_cols:
+    print('PASS: combined df has expected dimensions'
+          , '\nNo. of rows in combined df:', len(combined_df)
+          , '\nNo. of cols in combined df:', len(combined_df.columns)
+          , '\n===============================================================')
+else:
+    print('FAIL: combined df has not expected dimensions'
+          , '\nNo. of rows in combined df:', len(combined_df)
+          , '\nNo. of cols in combined df:', len(combined_df.columns)
+          , '\n===============================================================')
+
+#%% write file
+print('Writing file:'
+      , '\nFilename:', out_filename
+      , '\nPath:', outdir
+      , '\n===================================================================')
+
+combined_df.to_csv(outfile, header = True, index = False)
+
+print('Finished writing:', out_filename
+      , '\nNo. of rows:', len(combined_df)
+      , '\nNo. of cols:', len(combined_df.columns)
+      , '\n===================================================================')
+
+#%% end of script
+#==============================================================================
\ No newline at end of file