#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Aug 6 12:56:03 2019

@author: tanu
'''
# FIXME: change filename 4 (mcsm normalised data) to be consistent, like
# (pnca_complex_mcsm_norm.csv). It was changed manually; make sure the same
# is done in the mcsm pipeline.
#=======================================================================
# Task: combine 2 dfs with aa position as linking column
# This is done in 2 steps:
#   merge 1:
# useful link
# https://stackoverflow.com/questions/23668427/pandas-three-way-joining-multiple-dataframes-on-columns
#=======================================================================
#%% load packages
import argparse
import os
import sys

import numpy as np
import pandas as pd
#=======================================================================
#%% home dir and working dir
homedir = os.path.expanduser('~')

# Switch to the scripts dir before the local import below
# (presumably so that reference_dict can be found -- confirm).
os.chdir(f'{homedir}/git/LSHTM_analysis/scripts')

# local import
#from reference_dict import my_aa_dict # CHECK DIR STRUC THERE!
from reference_dict import low_3letter_dict
#=======================================================================
#%% command line args
# Currently disabled: drug and gene are hard-coded in the next section.
#arg_parser = argparse.ArgumentParser()
#arg_parser.add_argument('-d', '--drug', help = 'drug name', default = 'pyrazinamide')
#arg_parser.add_argument('-g', '--gene', help = 'gene name', default = 'pncA') # case sensitive
#args = arg_parser.parse_args()
#=======================================================================
#%% variable assignment: input and output
drug = 'pyrazinamide'
gene = 'pncA'
gene_match = f'{gene}_p.'

# cmd variables (use these instead once the arg parser above is enabled)
#drug = args.drug
#gene = args.gene
#gene_match = gene + '_p.'
#==========
# dirs
#==========
datadir = f'{homedir}/git/Data'
indir = f'{datadir}/{drug}/input'
outdir = f'{datadir}/{drug}/output'

#=======
# input
#=======
# File names are built from the lower-cased gene name.
in_filename_snpinfo = f'ns{gene.lower()}_snp_info.csv'
in_filename_afor = f'{gene.lower()}_af_or.csv'
in_filename_afor_kin = f'{gene.lower()}_af_or_kinship.csv'

# snp_info is read from the input dir; both af_or files from the output dir.
infile0 = f'{indir}/{in_filename_snpinfo}'
infile1 = f'{outdir}/{in_filename_afor}'
infile2 = f'{outdir}/{in_filename_afor_kin}'
print(
    'Input file0:', infile0,
    '\nInput file1:', infile1,
    '\nInput file2:', infile2,
    '\n===================================================================',
)

#=======
# output
#=======
out_filename = f'{gene.lower()}_metadata_afs_ors.csv'
outfile = f'{outdir}/{out_filename}'
print(
    'Output file:', outfile,
    '\n===================================================================',
)

# Only the full paths (infile0..2, outfile) are needed from here on.
del in_filename_afor, in_filename_afor_kin, datadir, indir, outdir
#%% end of variable assignment for input and output files
#=======================================================================
#%% format mutations
# mut_format: gene.abc1cde | 1A>1B
#========================
# read input csv files to combine
#========================
snpinfo_df = pd.read_csv(infile0)
snpinfo_nrows, snpinfo_ncols = snpinfo_df.shape
print(
    'No. of rows in', infile0, ':', snpinfo_nrows,
    '\nNo. of cols in', infile0, ':', snpinfo_ncols,
)

afor_df = pd.read_csv(infile1)
afor_nrows, afor_ncols = afor_df.shape
print(
    'No. of rows in', infile1, ':', afor_nrows,
    '\nNo. of cols in', infile1, ':', afor_ncols,
)

afor_kin_df = pd.read_csv(infile2)
afor_kin_nrows, afor_kin_ncols = afor_kin_df.shape
print(
    'No. of rows in', infile2, ':', afor_kin_nrows,
    '\nNo. of cols in', infile2, ':', afor_kin_ncols,
)

#%% Process afor_df
# 1) pull all snp_info so you have ref_allele, etc.,
#    i.e. merge afor_df (left) with snpinfo_df (right).
# Work on copies so the frames read from file stay untouched.
left_df = afor_df.copy()
left_df_nrows, left_df_ncols = left_df.shape

right_df = snpinfo_df.copy()
right_df_nrows, right_df_ncols = right_df.shape

# Candidate merge keys: column names present in both frames.
common_cols = np.intersect1d(left_df.columns, right_df.columns).tolist()
print(
    'Length of common cols:', len(common_cols),
    '\ncommon column/s:', common_cols, 'type:', type(common_cols),
)

# Only the shared columns that are object dtype in left_df are merged on.
# https://stackoverflow.com/questions/44639772/python-pandas-column-dtype-object-causing-merge-to-fail-with-dtypewarning-colu
print('selecting consistent dtypes for merging (object i.e string)')
merging_cols = left_df[common_cols].select_dtypes(include=[object]).columns.tolist()
print(merging_cols)
nmerging_cols = len(merging_cols)
print(
    ' length of merging cols:', nmerging_cols,
    '\nmerging cols:', merging_cols, 'type:', type(merging_cols),
)

# Drop duplicated right-hand rows, else the expected row counts of the
# merge do not match.
# NOTE(review): duplicates are judged on common_cols, whereas the merge uses
# merging_cols -- confirm the two lists are always identical.
print(
    'Checking for duplicates in common col:', common_cols,
    '\nNo of duplicates:',
    right_df.duplicated(common_cols).sum(),
    '\noriginal length:', right_df_nrows,
)
right_df = right_df.drop_duplicates(subset=common_cols)
right_df_nrows = len(right_df)
print('\nrevised length:', right_df_nrows)

# Cross-over of mutations between the two dfs to merge.
# Rows of afor_df whose mutation has no entry in snpinfo_df:
missing_mutinfo = afor_df[~afor_df['mutation'].isin(snpinfo_df['mutation'])]
#len(missing_mutinfo.duplicated(common_cols))
#missing_mutinfo.to_csv('infoless_muts.csv')
ndiff1 = (~afor_df['mutation'].isin(snpinfo_df['mutation'])).sum()
print(
    'There are', ndiff1, 'mutations with OR, but no snp_info',
    '\nExtracting and writing out file',
)

# Rows of snpinfo_df whose mutation has no entry in afor_df:
ndiff2 = (~snpinfo_df['mutation'].isin(afor_df['mutation'])).sum()
print('There are', ndiff2, 'mutations that do not have OR, but have snp_info')

# Define join type
#%% merge left_df (afor) with right_df (snp_info) and validate the result
# Join type: pick exactly one.
#my_join = 'inner'
#my_join = 'outer'
#my_join = 'right'
my_join = 'left'
fail = False

# The merge is performed once; its shape is then checked against the shape
# expected for the chosen join type.
print('combing with join:', my_join)
combined_df1 = pd.merge(left_df, right_df, on=merging_cols, how=my_join)
print('nrows:', len(combined_df1),
      '\nshape:', combined_df1.shape)

# see if you want an extra clause here!
# Expected no. of rows per join type (row counts noted in an earlier run
# are given in brackets):
#   inner (252): left_df rows - ndiff1
#   outer (331): right_df rows + ndiff1
#   right (290): right_df rows
#   left  (293): left_df rows
# NOTE(review): these counts assume right_df has no duplicated merge keys --
# confirm that de-duplicating on common_cols guarantees this for merging_cols.
expected_rows_by_join = {
    'inner': left_df.shape[0] - ndiff1,
    'outer': right_df.shape[0] + ndiff1,
    'right': right_df.shape[0],
    'left': left_df.shape[0],
}
expected_rows = expected_rows_by_join[my_join]

# Each merge key appears once in the result instead of once per frame.
expected_cols = left_df.shape[1] + right_df.shape[1] - nmerging_cols

if len(combined_df1) == expected_rows and len(combined_df1.columns) == expected_cols:
    print('PASS: successfully combined dfs with:', my_join, 'join')
else:
    print("FAIL: combined_df's expected rows and cols not matched")
    fail = True

# Reported on PASS and FAIL alike, so a failure can be diagnosed before exit.
print('\nExpected no. of rows:', expected_rows,
      '\nGot:', len(combined_df1),
      '\nExpected no. of cols:', expected_cols,
      '\nGot:', len(combined_df1.columns))

if fail:
    # Non-zero status so that callers can detect the failure; a bare
    # sys.exit() would report success (status 0).
    sys.exit(1)

# update nrows and ncols
# afor_info_df was read here without ever being assigned (NameError); it is
# taken to be the merged af/or + snp_info frame produced above.
afor_info_df = combined_df1.copy()
afor_info_nrows = len(afor_info_df)
afor_info_ncols = len(afor_info_df.columns)
#%%