#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Aug 6 12:56:03 2019

@author: tanu
'''
# FIXME: change filename 4 (mcsm normalised data)
# to be consistent like (pnca_mcsm_norm.csv)
#=============================================================================
# Task: combine 4 dfs with aa position as linking column
# This is done in 2 steps:
# merge 1: of 3 dfs
#       pnca_dssp.csv
#       pnca_kd.csv
#       pnca_rd.csv
# merge 2: of 2 dfs
#       mcsm_complex1_normalised.csv (!fix name)
#       output df from merge 1
# Input: 4 dfs
#       pnca_dssp.csv
#       pnca_kd.csv
#       pnca_rd.csv
#       mcsm_complex1_normalised.csv (!fix name)
# Output: .csv of all 4 dfs combined

# useful link
# https://stackoverflow.com/questions/23668427/pandas-three-way-joining-multiple-dataframes-on-columns
#=============================================================================
#%% load packages
import sys
import os

import pandas as pd
#=============================================================================
#%% helpers
def check_dimensions(df, expected_rows, expected_cols, label):
    '''
    Print a PASS/FAIL report comparing the shape of df with the expected shape.

    df            : DataFrame whose dimensions are being checked
    expected_rows : expected number of rows
    expected_cols : expected number of columns
    label         : name used for df in the PASS/FAIL headline

    Returns True when both dimensions match, False otherwise. A mismatch is
    only reported; it does not stop the script.
    '''
    dims_ok = len(df) == expected_rows and len(df.columns) == expected_cols
    if dims_ok:
        headline = 'PASS: ' + label + ' has expected dimensions'
    else:
        headline = 'FAIL: ' + label + ' does not have expected dimensions'
    print(headline,
          '\nNo. of rows in combined df:', len(df),
          '\nNo. of cols in combined df:', len(df.columns),
          '\n===============================================================')
    return dims_ok

#=============================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')

# set working dir
os.chdir(os.path.join(homedir, 'git', 'LSHTM_analysis', 'meta_data_analysis'))
#=============================================================================
#%% variable assignment: input and output
drug = 'pyrazinamide'
gene = 'pncA'
gene_match = gene + '_p.'

#==========
# data dir
#==========
datadir = os.path.join(homedir, 'git', 'Data')

#=======
# input
#=======
indir = os.path.join(datadir, drug, 'output')
in_filename1 = 'pnca_dssp.csv'
in_filename2 = 'pnca_kd.csv'
in_filename3 = 'pnca_rd.csv'
in_filename4 = 'mcsm_complex1_normalised.csv'  # FIXNAME

infile1 = os.path.join(indir, in_filename1)
infile2 = os.path.join(indir, in_filename2)
infile3 = os.path.join(indir, in_filename3)
infile4 = os.path.join(indir, in_filename4)

print('\nInput path:', indir,
      '\nInput filename1:', in_filename1,
      '\nInput filename2:', in_filename2,
      '\nInput filename3:', in_filename3,
      '\nInput filename4:', in_filename4,
      '\n===================================================================')

#=======
# output
#=======
outdir = os.path.join(datadir, drug, 'output')
out_filename = gene.lower() + '_mcsm_struct_params.csv'
outfile = os.path.join(outdir, out_filename)

print('Output filename:', out_filename,
      '\nOutput path:', outdir,
      '\n===================================================================')
#%% end of variable assignment for input and output files
#=======================================================================
#%% Read input file
dssp_df = pd.read_csv(infile1, sep = ',')
kd_df = pd.read_csv(infile2, sep = ',')
rd_df = pd.read_csv(infile3, sep = ',')
mcsm_df = pd.read_csv(infile4, sep = ',')

# the fourth entry is the mcsm file (it used to be mislabelled as 'rd file')
print('Reading input files:',
      '\ndssp file:', infile1,
      '\nNo. of rows:', len(dssp_df),
      '\nNo. of cols:', len(dssp_df.columns),
      '\nColumn names:', dssp_df.columns,
      '\n===================================================================',
      '\nkd file:', infile2,
      '\nNo. of rows:', len(kd_df),
      '\nNo. of cols:', len(kd_df.columns),
      '\nColumn names:', kd_df.columns,
      '\n===================================================================',
      '\nrd file:', infile3,
      '\nNo. of rows:', len(rd_df),
      '\nNo. of cols:', len(rd_df.columns),
      '\nColumn names:', rd_df.columns,
      '\n===================================================================',
      '\nmcsm file:', infile4,
      '\nNo. of rows:', len(mcsm_df),
      '\nNo. of cols:', len(mcsm_df.columns),
      '\nColumn names:', mcsm_df.columns,
      '\n===================================================================')

#%% Begin combining dfs
#===================
# concatenating df1 (3dfs): dssp_df + kd_df + rd_df
#===================
print('starting first merge...\n')

struct_dfs = [dssp_df, kd_df, rd_df]

# checking no. of rows
print('Checking if no. of rows of the 3 dfs are equal:\n',
      len(dssp_df) == len(kd_df) == len(rd_df),
      '\nReason: fasta files and pdb files vary since not all pos are part of the structure',
      '\n===================================================================')

# variables for sanity checks
# An outer join on the index keeps the union of all positions, so the expected
# row count is the number of distinct positions across the three dfs. This
# equals the size of the largest df only when the position sets are nested.
expected_rows_df1 = pd.concat(
    [struct_df['position'] for struct_df in struct_dfs]).nunique(dropna = False)

# derived from the list of dfs rather than hardcoded; used for sanity check
ndfs = len(struct_dfs)
ncol_merge = 1
# each df contributes its own copy of the merge col; only one survives
offset = ndfs - ncol_merge
expected_cols_df1 = sum(len(struct_df.columns) for struct_df in struct_dfs) - offset

print('Merge 1:',
      '\ncombining 3dfs by commom col: position',
      '\nExpected nrows in combined_df:', expected_rows_df1,
      '\nExpected ncols in combined_df:', expected_cols_df1,
      '\nResetting the common col as the index',
      '\n===================================================================')

# index every df by position, align them side by side, then turn the
# position index back into an ordinary column
combined_df1 = pd.concat(
    [struct_df.set_index('position') for struct_df in struct_dfs],
    axis = 1, join = 'outer').reset_index()

# sanity check
print('Checking dimensions of concatenated df1...')
check_dimensions(combined_df1, expected_rows_df1, expected_cols_df1, 'combined df')

#===================
# concatenating df2 (2dfs): mcsm_df + combined_df1
#===================
print('starting second merge...\n')

# mcsm_df names its position col 'Position'; combined_df1 uses lowercase
# 'position'. A bare mcsm_df.rename(columns = {...}) call has no visible effect
# because rename returns a new DataFrame instead of modifying mcsm_df.
# The col is copied under the lowercase name instead, so the output keeps
# both 'Position' and 'position'.
print('Firstly, copying \'Position\' col and renaming \'position\' to allow merging',
      '\nNo. of cols before copying: ', len(mcsm_df.columns))

mcsm_df['position'] = mcsm_df['Position']
print('No. of cols after copying: ', len(mcsm_df.columns))

# sanity check
if mcsm_df['position'].equals(mcsm_df['Position']):
    print('PASS: Copying worked correctly',
          '\ncopied col matches original column',
          '\n===============================================================')
else:
    print('FAIL: copied col does not match original column',
          '\n================================================================')

# variables for sanity checks
# every mcsm row is expected to find its position in combined_df1
expected_rows_df2 = len(mcsm_df)

# beware of hardcoding! used for sanity check
ndfs = 2
ncol_merge = 1
offset = ndfs - ncol_merge
expected_cols_df2 = len(mcsm_df.columns) + len(combined_df1.columns) - offset

print('Merge 2:',
      '\ncombining 2dfs by commom col: position',
      '\nExpected nrows in combined_df:', expected_rows_df2,
      '\nExpected ncols in combined_df:', expected_cols_df2,
      '\n===================================================================')

# default (inner) join: mcsm rows whose position is missing from combined_df1
# are dropped, which the dimension check below reports as a FAIL
combined_df2 = mcsm_df.merge(combined_df1, on = 'position')

# sanity check
print('Checking dimensions of concatenated df2...')
check_dimensions(combined_df2, expected_rows_df2, expected_cols_df2, 'combined df2')

#%% write file
print('Writing file:',
      '\nFilename:', out_filename,
      '\nPath:', outdir,
      '\n===================================================================')

combined_df2.to_csv(outfile, header = True, index = False)

print('Finished writing:', out_filename,
      '\nNo. of rows:', len(combined_df2),
      '\nNo. of cols:', len(combined_df2.columns),
      '\n===================================================================')
#%% end of script
#==============================================================================