#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Aug 6 12:56:03 2019

@author: tanu
'''
# FIXME: change filename 2(mcsm normalised data)
# to be consistent like (pnca_complex_mcsm_norm.csv) : changed manually, but ensure this is done in the mcsm pipeline
#=======================================================================
# Task: combine 2 dfs using their shared string columns as linking column/s

# Input: 2 dfs
# _complex_mcsm_norm.csv
# _foldx.csv

# Output: combined df (returned to the caller, not written to disk)

# useful link
# https://stackoverflow.com/questions/23668427/pandas-three-way-joining-multiple-dataframes-on-columns
#=======================================================================
#%% load packages
import os
import sys

import numpy as np
import pandas as pd
#from varname import nameof
#%% end of variable assignment for input and output files
#=======================================================================
#%% function/method to combine 2 dfs

# Join types for which an expected row count can be derived.
_SUPPORTED_JOINS = ('inner', 'outer', 'left', 'right')

# Name of the helper column recording which input/s a key was found in.
_KEY_ORIGIN = '_key_origin'

# pandas >= 1.0 has a dedicated string dtype; older versions only use object.
_STRING_DTYPE = getattr(pd, 'StringDtype', ())


def _is_text_dtype(dtype):
    """Return True when dtype stores strings (object or pandas string dtype)."""
    return dtype == object or isinstance(dtype, _STRING_DTYPE)


def _key_overlap(left_df, right_df, key_cols):
    """Return one row per distinct key, tagged left_only/right_only/both."""
    left_keys = left_df[key_cols].drop_duplicates()
    right_keys = right_df[key_cols].drop_duplicates()
    return pd.merge(left_keys, right_keys, on=key_cols, how='outer',
                    indicator=_KEY_ORIGIN)


def _rows_with_shared_key(df, overlap, key_cols):
    """Count the rows of df whose key also occurs in the other data frame."""
    # overlap holds every key exactly once, so this left merge keeps one
    # output row per row of df.
    tagged = df[key_cols].merge(overlap, on=key_cols, how='left')
    return int((tagged[_KEY_ORIGIN] == 'both').sum())


def combine_stability_dfs(mcsm_df, foldx_df, my_join = 'outer'):
    """
    Combine 2 dfs on the string columns they have in common

    Both inputs are read with pd.read_csv and their column names are
    lower-cased. The merging columns are the columns present in both inputs
    whose dtype in the first input is object/string. The merged result is
    de-duplicated on the merging columns (first occurrence kept) and its
    shape is checked against the shape expected for the requested join.

    @param mcsm_df: csv file (output from mcsm pipeline)
    @type mcsm_df: string (anything accepted by pd.read_csv)

    @param foldx_df: csv file (output from runFoldx.py)
    @type foldx_df: string (anything accepted by pd.read_csv)

    @param my_join: join type, one of 'inner', 'outer', 'left', 'right'
    @type my_join: string

    @return: combined df with one row per distinct merging key
    @rtype: pandas DataFrame

    @raise ValueError: unsupported join type, or no column to merge on
    @raise SystemExit: with status 1 when the combined df does not have the
        expected number of rows and columns
    """
    # Fail early: previously an unknown join type surfaced as a NameError
    # on expected_rows (or a pandas error) after the files had been read.
    if my_join not in _SUPPORTED_JOINS:
        raise ValueError('Unsupported join type: %r (expected one of %s)'
                         % (my_join, ', '.join(_SUPPORTED_JOINS)))

    #========================
    # read input csv files to combine
    #========================
    print('Reading input files:')

    left_df = pd.read_csv(mcsm_df, sep = ',')
    left_df.columns = left_df.columns.str.lower()

    right_df = pd.read_csv(foldx_df, sep = ',')
    right_df.columns = right_df.columns.str.lower()

    print('Dimension left df:', left_df.shape
          , '\nDimesnion right_df:', right_df.shape
          , '\n=========================================================')

    print('Finding common cols and merging cols:'
          ,'\n=========================================================')

    # sorted for a deterministic column order
    common_cols = sorted(set(left_df.columns) & set(right_df.columns))
    print('Length of common cols:', len(common_cols)
          , '\ncommon column/s:', common_cols, 'type:', type(common_cols))

    print('selecting consistent dtypes for merging (object i.e string)')
    merging_cols = [col for col in common_cols
                    if _is_text_dtype(left_df[col].dtype)]
    nmerging_cols = len(merging_cols)
    print(' length of merging cols:', nmerging_cols
          , '\nmerging cols:', merging_cols, 'type:', type(merging_cols)
          , '\n=========================================================')

    if not merging_cols:
        raise ValueError('No common object (string) dtype columns to merge on')

    #========================
    # cross-over of keys (mutations) in the two dfs to merge
    #========================
    # Work on whole key rows so that several merging columns, duplicated
    # keys and missing values are all treated the way pd.merge treats them.
    overlap = _key_overlap(left_df, right_df, merging_cols)
    origin = overlap[_KEY_ORIGIN]
    n_both = int((origin == 'both').sum())
    n_left_only = int((origin == 'left_only').sum())
    n_right_only = int((origin == 'right_only').sum())

    ndiff_1 = _rows_with_shared_key(left_df, overlap, merging_cols)
    print('ndiff_1:', ndiff_1)

    ndiff_2 = _rows_with_shared_key(right_df, overlap, merging_cols)
    print('ndiff_2:', ndiff_2)

    comm = overlap.loc[origin == 'both', merging_cols].values
    if nmerging_cols == 1:
        # flat array of key values when there is a single key column
        comm = comm.ravel()
    comm_count = n_both
    print('inner:', comm, '\nlength:', comm_count
          , '\ntype:', type(comm_count))

    #========================
    # merge (combined_df)
    # combining 2dfs:
    # mcsm_df, foldx_df
    #========================
    print('combing with:', my_join)
    combined_df = pd.merge(left_df, right_df, on = merging_cols, how = my_join)
    combined_df1 = combined_df.drop_duplicates(subset = merging_cols, keep ='first')

    #========================
    # sanity checks for join type
    #========================
    # The result is de-duplicated on the merging columns, so the expected
    # row count is the number of distinct keys the join keeps. For an outer
    # join that is the union of both key sets, not the larger input.
    expected_rows = {
        'inner': n_both,
        'outer': n_both + n_left_only + n_right_only,
        'left': n_both + n_left_only,
        'right': n_both + n_right_only,
    }[my_join]

    # merging columns appear once; all other columns are kept from both dfs
    expected_cols = left_df.shape[1] + right_df.shape[1] - nmerging_cols

    if len(combined_df1) == expected_rows and len(combined_df1.columns) == expected_cols:
        print('PASS: successfully combined dfs with:', my_join, 'join')
    else:
        print('FAIL: combined_df\'s expected rows and cols not matched')
        print('\nExpected no. of rows:', expected_rows
              , '\nGot:', len(combined_df1)
              , '\nExpected no. of cols:', expected_cols
              , '\nGot:', len(combined_df1.columns))
        # non-zero status so that a calling pipeline can detect the failure
        sys.exit(1)

    return combined_df1
#%% end of function
#=======================================================================