#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Aug 6 12:56:03 2019
@author: tanu
'''
# FIXME: change filename 2 (mcsm normalised data)
# to be consistent like (pnca_complex_mcsm_norm.csv): changed manually,
# but ensure this is done in the mcsm pipeline
#=======================================================================
# Task: combine 2 dfs on common cols by detecting them
# includes sanity checks
#=======================================================================
#%% load packages
import os
import re
import sys

import numpy as np
import pandas as pd

#%% function/method to combine dfs


def detect_common_cols(df1, df2):
    """
    Detect columns common to two dataframes.

    @param df1: df
    @type df1: pandas df

    @param df2: df
    @type df2: pandas df

    @return: sorted list of column names present in both df1 and df2
    @type: list
    """
    common_cols = np.intersect1d(df1.columns, df2.columns).tolist()
    print('Length of comm_cols:', len(common_cols)
          , '\nmerging column/s:', common_cols
          , '\ntype:', type(common_cols)
          , '\ndtypes in merging columns:\n', df1[common_cols].dtypes)
    return common_cols


def combine_dfs_with_checks(df1, df2, my_join='outer'):
    """
    Combine 2 dfs by finding the merging columns automatically and
    sanity-check the merged result (expected row and column counts).
    Exits the process via sys.exit() if the check fails.

    @param df1: data frame
    @type df1: pandas df

    @param df2: data frame
    @type df2: pandas df

    @param my_join: join type for merging ('outer', 'inner', 'left', 'right')
    @type my_join: string

    @return: combined df, with duplicated '_y' columns dropped and the
             '_x' suffix stripped from the surviving duplicates
    @type: pandas df
    """
    print('Finding common cols and merging cols:'
          , '\n=========================================================')
    common_cols = np.intersect1d(df1.columns, df2.columns).tolist()
    print('Length of comm_cols:', len(common_cols)
          , '\nmerging column/s:', common_cols
          , '\ntype:', type(common_cols))

    print('selecting consistent dtypes for merging (object i.e string)')
    merging_cols = common_cols.copy()
    nmerging_cols = len(merging_cols)
    print(' length of merging cols:', nmerging_cols
          , '\nmerging cols:', merging_cols, 'type:', type(merging_cols)
          , '\n=========================================================')

    #========================
    # merge 1 (combined_df)
    # concatenating 2 dfs: df1, df2
    #========================
    # checking cross-over of mutations in the two dfs to merge
    # (informational only; with >1 merging col these are per-column Series)
    ndiff_1 = df1[merging_cols].squeeze().isin(df2[merging_cols].squeeze()).sum()
    ndiff1 = df1.shape[0] - ndiff_1
    print('There are', ndiff1, 'unmatched mutations in left df')

    ndiff_2 = df2[merging_cols].squeeze().isin(df1[merging_cols].squeeze()).sum()
    ndiff2 = df2.shape[0] - ndiff_2
    print('There are', ndiff2, 'unmatched mutations in right_df')

    #========================
    # merging dfs & sanity checks
    #========================
    fail = False
    print('combining with:', my_join)
    comb_df = pd.merge(df1, df2, on=merging_cols, how=my_join)

    # each merging column appears once in the merged frame
    expected_cols = df1.shape[1] + df2.shape[1] - nmerging_cols

    if my_join == 'right':
        df2_nd = df2.drop_duplicates(merging_cols, keep='first')
        expected_rows = df2_nd.shape[0]
    if my_join == 'left':
        expected_rows = df1.shape[0]

    # BUGFIX: original condition was `my_join == ('inner' or 'outer')`,
    # which evaluates to `my_join == 'inner'`; an 'outer' join on multiple
    # merging columns fell into the single-column row arithmetic below
    # (np.intersect1d flattens multi-column frames), so the sanity check
    # always failed and the process exited.
    if my_join in ('inner', 'outer') and len(merging_cols) > 1:
        print('length of merging_cols > 1, therefore omitting row checks')
        combined_df = comb_df.copy()
        expected_rows = len(combined_df)
    else:
        comm_vals = np.intersect1d(df1[merging_cols], df2[merging_cols])
        print('length of merging_cols == 1, calculating expected rows in merged_df')
        combined_df = comb_df.drop_duplicates(subset=merging_cols, keep='first')
        if my_join == 'inner':
            expected_rows = len(comm_vals)
        if my_join == 'outer':
            df1_nd = df1.drop_duplicates(merging_cols, keep='first')
            df2_nd = df2.drop_duplicates(merging_cols, keep='first')
            expected_rows = df1_nd.shape[0] + df2_nd.shape[0] - len(comm_vals)

    if len(combined_df) == expected_rows and len(combined_df.columns) == expected_cols:
        print('PASS: successfully combined dfs with:', my_join, 'join')
    else:
        print('FAIL: combined_df\'s expected rows and cols not matched')
        fail = True
        print('\nExpected no. of rows:', expected_rows
              , '\nGot:', len(combined_df)
              , '\nExpected no. of cols:', expected_cols
              , '\nGot:', len(combined_df.columns))
    if fail:
        sys.exit()

    # drop position cols duplicated by the merge (suffix '_y'),
    # then rename the surviving '_x' cols back to their bare names
    combined_df_clean = combined_df.drop(
        combined_df.filter(regex=r'.*_y').columns, axis=1)
    combined_df_clean.rename(columns=lambda x: re.sub('_x$', '', x),
                             inplace=True)
    return combined_df_clean
#%% end of function
#=======================================================================