#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Aug 6 12:56:03 2019

@author: tanu
'''
# FIXME: change filename 4 (mcsm normalised data)
# to be consistent like (pnca_complex_mcsm_norm.csv): changed manually, but ensure this is done in the mcsm pipeline
#=======================================================================
# Task: combine 2 dfs with aa position as linking column
# This is done in 2 steps:
# merge 1:
# useful link
# https://stackoverflow.com/questions/23668427/pandas-three-way-joining-multiple-dataframes-on-columns
#=======================================================================
#%% load packages
import sys, os
import re
import pandas as pd
import numpy as np
import argparse
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')

# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
os.getcwd()

# local import
#from reference_dict import my_aa_dict # CHECK DIR STRUC THERE!
from reference_dict import low_3letter_dict
#=======================================================================
#%% command line args
#arg_parser = argparse.ArgumentParser()
#arg_parser.add_argument('-d', '--drug', help = 'drug name', default = 'pyrazinamide')
#arg_parser.add_argument('-g', '--gene', help = 'gene name', default = 'pncA') # case sensitive
#args = arg_parser.parse_args()
#=======================================================================
#%% variable assignment: input and output
drug = 'pyrazinamide'
gene = 'pncA'
gene_match = gene + '_p.'

# cmd variables
#drug = args.drug
#gene = args.gene
#gene_match = gene + '_p.'

#==========
# dir
#==========
datadir = homedir + '/' + 'git/Data'
indir = datadir + '/' + drug + '/' + 'input'
outdir = datadir + '/' + drug + '/' + 'output'

#=======
# input
#=======
in_filename_snpinfo = 'ns' + gene.lower() + '_snp_info.csv'
in_filename_afor = gene.lower() + '_af_or.csv'
in_filename_afor_kin = gene.lower() + '_af_or_kinship.csv'

infile0 = indir + '/' + in_filename_snpinfo
infile1 = outdir + '/' + in_filename_afor
infile2 = outdir + '/' + in_filename_afor_kin

print('Input file0:', infile0
      , '\nInput file1:', infile1
      , '\nInput file2:', infile2
      , '\n=============================================================')

#=======
# output
#=======
out_filename = gene.lower() + '_metadata_afs_ors.csv'
outfile = outdir + '/' + out_filename
print('Output file:', outfile
      , '\n=============================================================')

del(in_filename_afor, in_filename_afor_kin, datadir, indir, outdir)
#%% end of variable assignment for input and output files
#=======================================================================
#%% format mutations
# mut_format: gene.abc1cde | 1A>1B

#========================
# read input csv files to combine
#========================
snpinfo_df = pd.read_csv(infile0, sep = ',')
#snpinfo_ncols = len(snpinfo_df.columns)
#snpinfo.shape[0] = len(snpinfo_df)
print('No. of rows in', infile0, ':', snpinfo_df.shape[0]
      , '\nNo. of cols in', infile0, ':', snpinfo_df.shape[1])

afor_df = pd.read_csv(infile1, sep = ',')
#afor_ncols = len(afor_df.columns)
#afor.shape[0] = len(afor_df)
print('No. of rows in', infile1, ':', afor_df.shape[0]
      , '\nNo. of cols in', infile1, ':', afor_df.shape[1])

afor_kin_df = pd.read_csv(infile2, sep = ',')
#afor_kin.shape[0] = len(afor_kin_df)
#afor_kin_ncols = len(afor_kin_df.columns)
print('No. of rows in', infile2, ':', afor_kin_df.shape[0]
      , '\nNo. of cols in', infile2, ':', afor_kin_df.shape[1])
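#-----------------------------------------------------------------------
# Illustrative sketch (not called by this pipeline): the three reads above
# all follow the same read-then-report pattern, which could be factored out
# roughly as below. The function name is hypothetical and only an example;
# the pipeline keeps the explicit reads above.
def read_csv_with_report(csv_path):
    '''Read a csv with pandas and print its dimensions (example helper).'''
    df = pd.read_csv(csv_path, sep = ',')
    print('No. of rows in', csv_path, ':', df.shape[0]
          , '\nNo. of cols in', csv_path, ':', df.shape[1])
    return df
# example (not run): snpinfo_df = read_csv_with_report(infile0)
#-----------------------------------------------------------------------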
#%% Process afor_df
# 1) pull all snp_info so you have ref_allele, etc.
#    i.e. merge afor_df and snpinfo_df

# find merging column
left_df = afor_df.copy()
right_df = snpinfo_df.copy()

common_cols = np.intersect1d(left_df.columns, right_df.columns).tolist()
print('Length of common cols:', len(common_cols)
      , '\ncommon column/s:', common_cols, 'type:', type(common_cols))

#https://stackoverflow.com/questions/44639772/python-pandas-column-dtype-object-causing-merge-to-fail-with-dtypewarning-colu
print('selecting consistent dtypes for merging (object i.e. string)')
merging_cols = left_df[common_cols].select_dtypes(include = [object]).columns.tolist()
print(merging_cols)
nmerging_cols = len(merging_cols)
print('length of merging cols:', nmerging_cols
      , '\nmerging cols:', merging_cols, 'type:', type(merging_cols))

#https://stackoverflow.com/questions/22720739/pandas-left-outer-join-results-in-table-larger-than-left-table
# drop duplicates else the expected rows don't match
print('Checking for duplicates in common col:', common_cols
      , '\nNo. of duplicates:'
      , len(right_df[right_df.duplicated(common_cols)])
      , '\noriginal length:', right_df.shape[0])
right_df = right_df[~right_df.duplicated(common_cols)]
print('\nrevised length:', right_df.shape[0])

# checking cross-over of mutations in the two dfs to merge
ndiff1 = left_df.shape[0] - left_df['mutation'].isin(right_df['mutation']).sum()
print('There are', ndiff1, 'mutations with OR, but no snp_info'
      , '\nExtracting and writing out file')
missing_mutinfo = left_df[~left_df['mutation'].isin(right_df['mutation'])]
#missing_mutinfo.to_csv('infoless_muts.csv')

ndiff2 = right_df.shape[0] - right_df['mutation'].isin(left_df['mutation']).sum()
print('There are', ndiff2, 'mutations that do not have OR, but have snp_info')

#%% Define join type and merge
#my_join = 'inner'
#my_join = 'outer'
#my_join = 'right'
my_join = 'left'
fail = False
print('combining with join:', my_join)
combined_df1 = pd.merge(left_df, right_df, on = merging_cols, how = my_join)
print('\nshape:', combined_df1.shape)

# expected rows for this gene (pncA):
# inner = 252 (left_df.shape[0] - ndiff1)
# outer = 331 (right_df.shape[0] + ndiff1)
# right = 290 (right_df.shape[0])
# left  = 293 (left_df.shape[0])
if my_join == 'inner':
    expected_rows = left_df.shape[0] - ndiff1
elif my_join == 'outer':
    expected_rows = right_df.shape[0] + ndiff1
elif my_join == 'right':
    expected_rows = right_df.shape[0]
elif my_join == 'left':
    expected_rows = left_df.shape[0]

expected_cols = left_df.shape[1] + right_df.shape[1] - nmerging_cols

if len(combined_df1) == expected_rows and len(combined_df1.columns) == expected_cols:
    print('PASS: successfully combined dfs with:', my_join, 'join')
else:
    print('FAIL: combined_df\'s expected rows and cols not matched')
    fail = True
    print('\nExpected no. of rows:', expected_rows
          , '\nGot:', len(combined_df1)
          , '\nExpected no. of cols:', expected_cols
          , '\nGot:', len(combined_df1.columns))

if fail:
    sys.exit()

# delete variables
del(left_df, right_df, common_cols, merging_cols, nmerging_cols, my_join, ndiff1, ndiff2, missing_mutinfo
    , expected_rows, expected_cols, fail)
del(afor_df, snpinfo_df)
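#-----------------------------------------------------------------------
# Illustrative sketch (not called by this pipeline): the expected-shape
# arithmetic used in the sanity check above, and again for the second merge
# below, written as a single function. It assumes the merge keys are unique
# in both frames after de-duplication, which is what makes the row counts
# exact. The function name is hypothetical.
def expected_merge_shape(left, right, keys, how):
    '''Return (expected_rows, expected_cols) for a pd.merge on unique keys.'''
    n_matched = pd.merge(left[keys].drop_duplicates(), right[keys].drop_duplicates()
                         , on = keys, how = 'inner').shape[0]
    n_left_only = left.shape[0] - n_matched
    n_right_only = right.shape[0] - n_matched
    expected_rows = {'inner': n_matched
                     , 'left': left.shape[0]
                     , 'right': right.shape[0]
                     , 'outer': left.shape[0] + n_right_only}[how]
    expected_cols = left.shape[1] + right.shape[1] - len(keys)
    return expected_rows, expected_cols
# example (not run): expected_merge_shape(afor_df, snpinfo_df, ['mutation'], 'left')
#-----------------------------------------------------------------------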
#=======================================================================
#%% Second merge: combined_df1 and afor_kin_df
left_df = combined_df1.copy()
right_df = afor_kin_df.copy()

common_cols = np.intersect1d(left_df.columns, right_df.columns).tolist()
print('Length of common cols:', len(common_cols)
      , '\ncommon column/s:', common_cols, 'type:', type(common_cols))

#https://stackoverflow.com/questions/44639772/python-pandas-column-dtype-object-causing-merge-to-fail-with-dtypewarning-colu
print('selecting consistent dtypes for merging (object i.e. string)')
#FIXME
#merging_cols = left_df[common_cols].select_dtypes(include = [object]).columns.tolist()
merging_cols = ['wild_type', 'mutant_type', 'mutationinformation']
print(merging_cols)
nmerging_cols = len(merging_cols)
print('length of merging cols:', nmerging_cols
      , '\nmerging cols:', merging_cols, 'type:', type(merging_cols))

ndiff1 = left_df.shape[0] - left_df['mutationinformation'].isin(right_df['mutationinformation']).sum()
print('There are', ndiff1, 'mutations with OR, but not in OR kinship'
      , '\nExtracting and writing out file')
missing_mutinfo = left_df[~left_df['mutationinformation'].isin(right_df['mutationinformation'])]
#missing_mutinfo.to_csv('infoless_muts.csv')

ndiff2 = right_df.shape[0] - right_df['mutationinformation'].isin(left_df['mutationinformation']).sum()
print('There are', ndiff2, 'mutations that do not have OR, but have OR kinship')

my_join = 'outer'
fail = False
print('combining with join:', my_join)
combined_df2 = pd.merge(left_df, right_df, on = merging_cols, how = my_join)

if my_join == 'inner':
    expected_rows = left_df.shape[0] - ndiff1
elif my_join == 'outer':
    expected_rows = right_df.shape[0] + ndiff1
elif my_join == 'right':
    expected_rows = right_df.shape[0]
elif my_join == 'left':
    expected_rows = left_df.shape[0]

expected_cols = left_df.shape[1] + right_df.shape[1] - nmerging_cols

if len(combined_df2) == expected_rows and len(combined_df2.columns) == expected_cols:
    print('PASS: successfully combined dfs with:', my_join, 'join')
else:
    print('FAIL: combined_df\'s expected rows and cols not matched')
    fail = True
    print('\nExpected no. of rows:', expected_rows
          , '\nGot:', len(combined_df2)
          , '\nExpected no. of cols:', expected_cols
          , '\nGot:', len(combined_df2.columns))

if fail:
    sys.exit()
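#-----------------------------------------------------------------------
# Illustrative sketch (not part of the pipeline): pandas appends the default
# suffixes '_x' (left) and '_y' (right) to overlapping columns that are not
# merge keys, which is why the next section looks for them. The frames and
# column values here are made up purely for demonstration.
def _suffix_demo():
    left  = pd.DataFrame({'mutationinformation': ['A1B'], 'position': [1]})
    right = pd.DataFrame({'mutationinformation': ['A1B'], 'position': [1]})
    merged = pd.merge(left, right, on = 'mutationinformation', how = 'outer')
    # merged.columns -> ['mutationinformation', 'position_x', 'position_y']
    return merged
#-----------------------------------------------------------------------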
#%% check duplicate cols: ones containing suffix '_x' or '_y'
# should only be position
foo = combined_df2.filter(regex = r'_x$|_y$', axis = 1)
print(foo.columns) # should only be position

# drop position col containing suffix '_y' and then rename col without suffix
combined_or_df = combined_df2.drop(combined_df2.filter(regex = r'_y$').columns, axis = 1)
#combined_or_df['position_x'].head()

# renaming columns
#combined_or_df.rename(columns = {'position_x': 'position'}, inplace = True)
#combined_or_df['position'].head()

#recheck
#foo = combined_or_df.filter(regex = r'_x$|_y$', axis = 1)
#print(foo.columns) # should be empty

# remove '_x' from some cols
# helper (currently unused): alternative to the rename lambda below
def clean_colnames(colname):
    if re.search('_x$', colname):
        return re.sub('_x$', '', colname)
    else:
        return colname

#https://stackoverflow.com/questions/26500156/renaming-column-in-dataframe-for-pandas-using-regular-expression
combined_or_df.columns
combined_or_df.rename(columns = lambda x: re.sub('_x$', '', x), inplace = True)
combined_or_df.columns

#FIXME: this should be 0 when you run the 35k dataset
combined_or_df['chromosome_number'].isna().sum()

#%% rearranging columns
print('Dim of df pre-formatting:', combined_or_df.shape)
print(combined_or_df.columns, '\nshape:', combined_or_df.shape)

# removing unnecessary column
combined_or_df = combined_or_df.drop(['symbol'], axis = 1)
print(combined_or_df.columns, '\nshape:', combined_or_df.shape)

#%% reorder columns
#https://stackoverflow.com/questions/13148429/how-to-change-the-order-of-dataframe-columns
# setting column's order
output_df = combined_or_df[['mutation', 'mutationinformation'
                            , 'wild_type', 'position', 'mutant_type'
                            , 'chr_num_allele', 'ref_allele', 'alt_allele'
                            , 'mut_info', 'mut_type', 'gene_id', 'gene_number', 'mut_region'
                            , 'reference_allele', 'alternate_allele', 'chromosome_number'
                            , 'af', 'af_kin', 'or_kin'
                            , 'or_logistic', 'or_mychisq', 'est_chisq', 'or_fisher'
                            , 'ci_low_logistic', 'ci_hi_logistic', 'ci_low_fisher', 'ci_hi_fisher'
                            , 'pwald_kin', 'pval_logistic', 'pval_fisher', 'pval_chisq'
                            , 'beta_logistic', 'beta_kin', 'se_logistic', 'se_kin'
                            , 'zval_logistic', 'logl_H1_kin', 'l_remle_kin'
                            , 'wt_3let', 'mt_3let'
                            , 'n_diff', 'tot_diff', 'n_miss']]

# sanity check after rearranging
if combined_or_df.shape == output_df.shape and set(combined_or_df.columns) == set(output_df.columns):
    print('PASS: Successfully formatted df with rearranged columns')
else:
    sys.exit('FAIL: something went wrong when rearranging columns!')

#%% write file
print('\n====================================================================='
      , '\nWriting output file:\n', outfile
      , '\nNo. of rows:', len(output_df)
      , '\nNo. of cols:', len(output_df.columns))
output_df.to_csv(outfile, index = False)
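#-----------------------------------------------------------------------
# Illustrative sketch (not called here): one way to confirm the written file
# round-trips with the expected dimensions. The function name is hypothetical
# and this is only a suggestion, not part of the pipeline.
def verify_written_csv(csv_path, expected_shape):
    '''Re-read a written csv and check it matches the expected (rows, cols).'''
    written = pd.read_csv(csv_path)
    if written.shape == expected_shape:
        print('PASS: output file matches expected shape:', expected_shape)
    else:
        print('FAIL: output file shape', written.shape, '!= expected', expected_shape)
# example (not run): verify_written_csv(outfile, output_df.shape)
#-----------------------------------------------------------------------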