#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Aug 6 12:56:03 2019

@author: tanu
'''
#=============================================================================
# Task: combine 3 dfs with aa position as the linking column
#
# Input: 3 dfs
#   pnca_dssp.csv
#   pnca_kd.csv
#   pnca_rd.csv
#
# Output: single .csv with the 3 dfs combined
#=============================================================================
#%% load packages
import sys, os
import pandas as pd
#import numpy as np
#=============================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')

# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
os.getcwd()
#=============================================================================
#%% variable assignment: input and output
drug = 'pyrazinamide'
gene = 'pncA'
gene_match = gene + '_p.'

#==========
# data dir
#==========
#indir = 'git/Data/pyrazinamide/input/original'
datadir = homedir + '/' + 'git/Data'

#=======
# input
#=======
#indir = 'git/Data/pyrazinamide/input/original'
indir = datadir + '/' + drug + '/' + 'output'
in_filename1 = 'pnca_dssp.csv'
in_filename2 = 'pnca_kd.csv'
in_filename3 = 'pnca_rd.csv'

infile1 = indir + '/' + in_filename1
infile2 = indir + '/' + in_filename2
infile3 = indir + '/' + in_filename3

print('\nInput path:', indir
      , '\nInput filename1:', in_filename1
      , '\nInput filename2:', in_filename2
      , '\nInput filename3:', in_filename3
      , '\n===================================================================')

#=======
# output
#=======
outdir = datadir + '/' + drug + '/' + 'output'
out_filename = gene.lower() + '_struct_params.csv'
outfile = outdir + '/' + out_filename

print('Output filename:', out_filename
      , '\nOutput path:', outdir
      , '\n===================================================================')
#%% end of variable assignment for input and output files
#=======================================================================
#%% Read input files
dssp_df = pd.read_csv(infile1, sep = ',')
kd_df = pd.read_csv(infile2, sep = ',')
rd_df = pd.read_csv(infile3, sep = ',')

print('Reading input files:'
      , '\ndssp file:', infile1
      , '\nNo. of rows:', len(dssp_df)
      , '\nNo. of cols:', len(dssp_df.columns)
      , '\nColumn names:', dssp_df.columns
      , '\n==================================================================='
      , '\nkd file:', infile2
      , '\nNo. of rows:', len(kd_df)
      , '\nNo. of cols:', len(kd_df.columns)
      , '\nColumn names:', kd_df.columns
      , '\n==================================================================='
      , '\nrd file:', infile3
      , '\nNo. of rows:', len(rd_df)
      , '\nNo. of cols:', len(rd_df.columns)
      , '\nColumn names:', rd_df.columns
      , '\n===================================================================')

#========================
# checking no. of rows
#========================
print('Checking if no. of rows of the 3 dfs are equal:\n'
      , len(dssp_df) == len(kd_df) == len(rd_df)
      , '\nReason: fasta and pdb files differ, since not all positions are part of the structure'
      , '\n===================================================================')

#===================
# concatenating dfs
#===================
# expected dimensions of the combined df, used for a sanity check below
expected_rows = max(len(dssp_df), len(kd_df), len(rd_df)) # beware of hardcoding!
offset = 2 # the common 'position' col appears in all 3 dfs, so subtract the 2 duplicates
expected_cols = len(dssp_df.columns) + len(kd_df.columns) + len(rd_df.columns) - offset

print('Combining dfs by common col: position'
      , '\nExpected nrows in combined_df:', expected_rows
      , '\nExpected ncols in combined_df:', expected_cols
      , '\nSetting the common col as the index'
      , '\n===================================================================')

# align the three dfs on the shared 'position' column before concatenating
dssp_df.set_index('position', inplace = True)
kd_df.set_index('position', inplace = True)
rd_df.set_index('position', inplace = True)

combined_df = pd.concat([dssp_df, kd_df, rd_df], axis = 1, sort = False).reset_index()
# defensive rename in case reset_index() leaves the col named 'index';
# note: rename() without inplace/assignment would be a no-op
combined_df.rename(columns = {'index': 'position'}, inplace = True)

print('Checking dimensions of concatenated df...')
if len(combined_df) == expected_rows and len(combined_df.columns) == expected_cols:
    print('PASS: combined df has expected dimensions'
          , '\nNo. of rows in combined df:', len(combined_df)
          , '\nNo. of cols in combined df:', len(combined_df.columns)
          , '\n===============================================================')
else:
    print('FAIL: combined df does not have the expected dimensions'
          , '\nNo. of rows in combined df:', len(combined_df)
          , '\nNo. of cols in combined df:', len(combined_df.columns)
          , '\n===============================================================')

#%% write file
print('Writing file:'
      , '\nFilename:', out_filename
      , '\nPath:', outdir
      , '\n===================================================================')

combined_df.to_csv(outfile, header = True, index = False)

print('Finished writing:', out_filename
      , '\nNo. of rows:', len(combined_df)
      , '\nNo. of cols:', len(combined_df.columns)
      , '\n===================================================================')
#%% end of script
#==============================================================================