From 82e96fcdbadf46ec697247adedb3a756fa6c9ff4 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Fri, 27 Mar 2020 12:39:02 +0000 Subject: [PATCH] combining mcsm and struct params --- meta_data_analysis/combine_dfs.py | 150 +++++++++++++++++++++++------- 1 file changed, 118 insertions(+), 32 deletions(-) diff --git a/meta_data_analysis/combine_dfs.py b/meta_data_analysis/combine_dfs.py index f298324..745505b 100755 --- a/meta_data_analysis/combine_dfs.py +++ b/meta_data_analysis/combine_dfs.py @@ -5,15 +5,30 @@ Created on Tue Aug 6 12:56:03 2019 @author: tanu ''' +# FIXME: change filename 4 (mcsm normalised data) +# to be consistent like (pnca_mcsm_norm.csv) #============================================================================= -# Task: combine 3 dfs with aa position as linking column +# Task: combine 4 dfs with aa position as linking column +# This is done in 2 steps: +# merge 1: of 3 dfs +# pnca_dssp.csv +# pnca_kd.csv +# pnca_rd.csv + +# merge 2: of 2 dfs +# mcsm_complex1_normalised.csv (!fix name) +# output df from merge1 # Input: 3 dfs # pnca_dssp.csv # pnca_kd.csv # pnca_rd.csv +# mcsm_complex1_normalised.csv (!fix name) -# Output: .csv with 3) +# Output: .csv of all 4 dfs combined + +# useful link +#https://stackoverflow.com/questions/23668427/pandas-three-way-joining-multiple-dataframes-on-columns #============================================================================= #%% load packages import sys, os @@ -47,22 +62,25 @@ indir = datadir + '/' + drug + '/' + 'output' in_filename1 = 'pnca_dssp.csv' in_filename2 = 'pnca_kd.csv' in_filename3 = 'pnca_rd.csv' +in_filename4 = 'mcsm_complex1_normalised.csv' # FIXNAME infile1 = indir + '/' + in_filename1 infile2 = indir + '/' + in_filename2 infile3 = indir + '/' + in_filename3 +infile4 = indir + '/' + in_filename4 print('\nInput path:', indir , '\nInput filename1:', in_filename1 , '\nInput filename2:', in_filename2 , '\nInput filename3:', in_filename3 + , '\nInput filename4:', in_filename4 , 
'\n===================================================================') #======= # output #======= outdir = datadir + '/' + drug + '/' + 'output' -out_filename = gene.lower() + '_struct_params.csv' +out_filename = gene.lower() + '_mcsm_struct_params.csv' outfile = outdir + '/' + out_filename print('Output filename:', out_filename , '\nOutput path:', outdir @@ -73,7 +91,8 @@ print('Output filename:', out_filename #%% Read input file dssp_df = pd.read_csv(infile1, sep = ',') kd_df = pd.read_csv(infile2, sep = ',') -rd_df = pd.read_csv(infile3, sep = ',') +rd_df = pd.read_csv(infile3, sep = ',') +mcsm_df = pd.read_csv(infile4, sep = ',') print('Reading input files:' , '\ndssp file:', infile1 @@ -90,49 +109,116 @@ print('Reading input files:' , '\nNo. of rows:', len(rd_df) , '\nNo. of cols:', len(rd_df.columns) , '\nColumn names:', rd_df.columns + , '\n===================================================================' + , '\nrd file:', infile4 + , '\nNo. of rows:', len(mcsm_df) + , '\nNo. of cols:', len(mcsm_df.columns) + , '\nColumn names:', mcsm_df.columns , '\n===================================================================') +#%% Begin combining dfs +#=================== +# concatenating df1 (3dfs): dssp_df + kd_df+ rd_df +#=================== +print('starting first merge...\n') -#======================== # checking no. of rows -#======================== print('Checking if no. of rows of the 3 dfs are equal:\n' , len(dssp_df) == len(kd_df) == len(rd_df) , '\nReason: fasta files and pdb files vary since not all pos are part of the structure' , '\n===================================================================') -#=================== -# concatenating dfs -#=================== - -expected_rows = max(len(dssp_df), len(kd_df), len(rd_df)) - +# variables for sanity checks +expected_rows_df1 = max(len(dssp_df), len(kd_df), len(rd_df)) # beware of harcoding! 
used for sanity check -offset = 2 # 1 common col in each of the three dfs-1 -expected_cols = len(dssp_df.columns) + len(kd_df.columns) + len(rd_df.columns) - offset +ndfs = 3 +ncol_merge = 1 +offset = ndfs- ncol_merge +expected_cols_df1 = len(dssp_df.columns) + len(kd_df.columns) + len(rd_df.columns) - offset -print('Combining dfs by commom col: position' - , '\nExpected nrows in combined_df:', expected_rows - , '\nExpected ncols in combined_df:', expected_cols +print('Merge 1:' + , '\ncombining 3dfs by commom col: position' + , '\nExpected nrows in combined_df:', expected_rows_df1 + , '\nExpected ncols in combined_df:', expected_cols_df1 , '\nResetting the common col as the index' , '\n===================================================================') -dssp_df.set_index('position',inplace = True) -kd_df.set_index('position',inplace = True) -rd_df.set_index('position',inplace =True) +#dssp_df.set_index('position', inplace = True) +#kd_df.set_index('position', inplace = True) +#rd_df.set_index('position', inplace =True) -combined_df = pd.concat([dssp_df, kd_df, rd_df], axis = 1, sort = False).reset_index() -combined_df.rename(columns = {'index':'position'}) +#combined_df = pd.concat([dssp_df, kd_df, rd_df], axis = 1, sort = False).reset_index() +#combined_df.rename(columns = {'index':'position'}) -print('Checking dimensions of concatenated df...') -if len(combined_df) == expected_rows and len(combined_df.columns) == expected_cols: +combined_df1 = pd.concat( + (my_index.set_index('position') for my_index in [dssp_df, kd_df, rd_df]) + , axis = 1, join = 'outer').reset_index() + +# sanity check +print('Checking dimensions of concatenated df1...') +if len(combined_df1) == expected_rows_df1 and len(combined_df1.columns) == expected_cols_df1: print('PASS: combined df has expected dimensions' - , '\nNo. of rows in combined df:', len(combined_df) - , '\nNo. of cols in combined df:', len(combined_df.columns) + , '\nNo. 
of rows in combined df:', len(combined_df1) + , '\nNo. of cols in combined df:', len(combined_df1.columns) , '\n===============================================================') else: - print('FAIL: combined df has not expected dimensions' - , '\nNo. of rows in combined df:', len(combined_df) - , '\nNo. of cols in combined df:', len(combined_df.columns) + print('FAIL: combined df does not have expected dimensions' + , '\nNo. of rows in combined df:', len(combined_df1) + , '\nNo. of cols in combined df:', len(combined_df1.columns) + , '\n===============================================================') + +#=================== +# concatenating df2 (2dfs): mcsm_df + combined_df1 +# sort sorts the cols +#=================== +print('starting second merge...\n') + +# rename col 'Position' in mcsm_df to lowercase 'position' +# as it matches the combined_df1 colname to perform merge +#mcsm_df.columns +#mcsm_df.rename(columns = {'Position':'position'}) # no effect: rename() returns a new df; assign the result or pass inplace=True +# copy 'Position' column with the correct colname +print('Firstly, copying \'Position\' col and renaming \'position\' to allow merging' + , '\nNo. of cols before copying: ', len(mcsm_df.columns)) + +mcsm_df['position'] = mcsm_df['Position'] +print('No. of cols after copying: ', len(mcsm_df.columns)) + +# sanity check +if mcsm_df['position'].equals(mcsm_df['Position']): + print('PASS: Copying worked correctly' + , '\ncopied col matches original column' + , '\n===============================================================') +else: + print('FAIL: copied col does not match original column' + , '\n================================================================') + +# variables for sanity checks +expected_rows_df2 = len(mcsm_df) +# beware of hardcoding! 
used for sanity check +ndfs = 2 +ncol_merge = 1 +offset = ndfs - ncol_merge +expected_cols_df2 = len(mcsm_df.columns) + len(combined_df1.columns) - offset + +print('Merge 2:' + , '\ncombining 2dfs by commom col: position' + , '\nExpected nrows in combined_df:', expected_rows_df2 + , '\nExpected ncols in combined_df:', expected_cols_df2 + , '\n===================================================================') + +combined_df2 = mcsm_df.merge(combined_df1, on = 'position') + +# sanity check +print('Checking dimensions of concatenated df2...') +if len(combined_df2) == expected_rows_df2 and len(combined_df2.columns) == expected_cols_df2: + print('PASS: combined df2 has expected dimensions' + , '\nNo. of rows in combined df:', len(combined_df2) + , '\nNo. of cols in combined df:', len(combined_df2.columns) + , '\n===============================================================') +else: + print('FAIL: combined df2 does not have expected dimensions' + , '\nNo. of rows in combined df:', len(combined_df2) + , '\nNo. of cols in combined df:', len(combined_df2.columns) , '\n===============================================================') #%% write file @@ -141,11 +227,11 @@ print('Writing file:' , '\nPath:', outdir , '\n===================================================================') -combined_df.to_csv(outfile, header = True, index = False) +combined_df2.to_csv(outfile, header = True, index = False) print('Finished writing:', out_filename - , '\nNo. of rows:', len(combined_df) - , '\nNo. of cols:', len(combined_df.columns) + , '\nNo. of rows:', len(combined_df2) + , '\nNo. of cols:', len(combined_df2.columns) , '\n===================================================================') #%% end of script