#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Aug 6 12:56:03 2019

@author: tanu
'''
# FIXME: change filename 4 (mcsm normalised data)
# to be consistent like (pnca_complex_mcsm_norm.csv): changed manually, but ensure this is done in the mcsm pipeline
#=======================================================================
# Task: combine 2 dfs with aa position as linking column
# This is done in 2 steps:
# merge 1:
# useful link
# https://stackoverflow.com/questions/23668427/pandas-three-way-joining-multiple-dataframes-on-columns
#=======================================================================
#%% load packages
import sys, os
import pandas as pd
import numpy as np
import argparse
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')

# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
os.getcwd()

# local import
from reference_dict import my_aa_dict # CHECK DIR STRUC THERE!
#=======================================================================
#%% command line args
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help = 'drug name', default = 'pyrazinamide')
arg_parser.add_argument('-g', '--gene', help = 'gene name', default = 'pncA') # case sensitive
args = arg_parser.parse_args()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
#gene_match = gene + '_p.'

# cmd variables
drug = args.drug
gene = args.gene
gene_match = gene + '_p.'

#==========
# dir
#==========
datadir = homedir + '/' + 'git/Data'
outdir = datadir + '/' + drug + '/' + 'output'

#=======
# input
#=======
in_filename_afor = gene.lower() + '_af_or.csv'
in_filename_afor_kin = gene.lower() + '_af_or_kinship.csv'

infile1 = outdir + '/' + in_filename_afor
infile2 = outdir + '/' + in_filename_afor_kin
print('Input file1:', infile1
      , '\nInput file2:', infile2
      , '\n===================================================================')

#=======
# output
#=======
out_filename = gene.lower() + '_metadata_afs_ors.csv'
outfile = outdir + '/' + out_filename
print('Output file:', outfile
      , '\n===================================================================')

del(in_filename_afor, in_filename_afor_kin, datadir, outdir)
#%% end of variable assignment for input and output files
#=======================================================================
#%% format mutations
# mut_format: gene.abc1cde | 1A>1B

#========================
# read input csv files to combine
#========================
afor_df = pd.read_csv(infile1, sep = ',')
afor_df_ncols = len(afor_df.columns)
afor_df_nrows = len(afor_df)
print('No. of rows in', infile1, ':', afor_df_nrows
      , '\nNo. of cols in', infile1, ':', afor_df_ncols)

afor_kin_df = pd.read_csv(infile2, sep = ',')
afor_kin_df_nrows = len(afor_kin_df)
afor_kin_df_ncols = len(afor_kin_df.columns)
print('No. of rows in', infile2, ':', afor_kin_df_nrows
      , '\nNo. of cols in', infile2, ':', afor_kin_df_ncols)
#=======
# Iterate through my_aa_dict to create a lookup dict, i.e.
# lookup_dict = {three_letter_code: one_letter_code},
# i.e. the key is the three-letter code and the value is the one-letter code you want a column for.
# Then use this to perform the mapping separately for the wild-type and mutant cols.
# The three-letter code is extracted from the dataframe using a string match and then
# converted to a pandas Series, since map() only works on a pandas Series.
#=======
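# A minimal illustrative sketch (not part of the pipeline) of the extract-and-map idea
# described above, using made-up values; the real keys/values come from my_aa_dict and
# the real strings come from the 'mutation' column:
#   toy = pd.Series(['pnca_p.Trp68Gly', 'pnca_p.Ala102Val'])
#   toy.str.extract(r'pnca_p.(\w{3})').squeeze()                                # Series(['Trp', 'Ala'])
#   toy.str.extract(r'pnca_p.(\w{3})').squeeze().map({'Trp': 'W', 'Ala': 'A'})  # Series(['W', 'A'])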
gene_regex = gene_match.lower() + r'(\w{3})'
print('gene regex being used:', gene_regex)

# initialise a sub dict that is the lookup dict for three-letter code to one-letter code
lookup_dict = dict()
for k, v in my_aa_dict.items():
    lookup_dict[k] = v['one_letter_code']

# adding three more cols: wild_type, mutant_type and position
#wt = gene_LF1['mutation'].str.extract('gene_p.(\w{3})').squeeze() # converts to a series that map works on
wt = afor_df['mutation'].str.extract(gene_regex).squeeze() # converts to a series that map works on
afor_df['wild_type'] = wt.map(lookup_dict)

mut = afor_df['mutation'].str.extract(r'\d+(\w{3})$').squeeze()
afor_df['mutant_type'] = mut.map(lookup_dict)

# extract position info from the mutation column separately using a string match
afor_df['position'] = afor_df['mutation'].str.extract(r'(\d+)')

# combine the wild_type + position + mutant_type columns to generate
# mutationinformation (matches the mCSM output field)
# Remember to use .map(str) for int col types to allow string concatenation
afor_df['mutationinformation'] = afor_df['wild_type'] + afor_df['position'].map(str) + afor_df['mutant_type']
print('Created column: mutationinformation'
      , '\n====================================================================='
      , afor_df['mutationinformation'].head(10))

# sanity check
ncols_add = 4 # beware of hardcoding (3 cols for mcsm-style mut + 1 for concatenating them all)
if len(afor_df.columns) == afor_df_ncols + ncols_add:
    print('PASS: successfully added', ncols_add, 'cols'
          , '\nold length:', afor_df_ncols
          , '\nnew length:', len(afor_df.columns))
    afor_df_ncols = len(afor_df.columns) # update afor_df_ncols after adding cols
else:
    print('FAIL: failed to add cols:'
          , '\nExpected cols:', afor_df_ncols + ncols_add
          , '\nGot:', len(afor_df.columns))
    sys.exit()
#%% Detect mutation format to see if you apply this func
# FIXME
#afor_df.iloc[[0]].str.match('pnca_')
#afor_df.dtypes
#foo = afor_df.loc[:, afor_df.dtypes == object]
genomic_mut_regex = gene_match.lower() + r'\w{3}\d+\w{3}'
print('genomic mutation regex being used:', genomic_mut_regex)
# rows where any column matches the genomic mutation format
afor_df[afor_df.apply(lambda col: col.astype(str).str.match(genomic_mut_regex)).any(axis = 1)]
#%% Finding common col to merge on
# Define merging columns: multiple cols have been used for the merge, else the common cols
# get suffixes '_x' and '_y' attached
# also, couldn't include 'position' in merging_cols since the data types don't match
merging_cols = ['wild_type', 'mutant_type', 'mutationinformation']
ncommon_cols = len(merging_cols)

# checking cross-over of mutations in the two dfs to merge
ndiff1 = afor_kin_df_nrows - afor_df['mutationinformation'].isin(afor_kin_df['mutationinformation']).sum()
print('ndiff1:', ndiff1)

ndiff2 = afor_kin_df_nrows - afor_kin_df['mutationinformation'].isin(afor_df['mutationinformation']).sum()
print('ndiff2:', ndiff2)
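# A minimal illustrative sketch (not part of the pipeline) of the isin() cross-over
# check above, on made-up mutation strings:
#   s_af  = pd.Series(['A1B', 'C2D', 'E3F'])    # stands in for afor_df['mutationinformation']
#   s_kin = pd.Series(['A1B', 'E3F'])           # stands in for afor_kin_df['mutationinformation']
#   len(s_kin) - s_kin.isin(s_af).sum()         # ndiff2-style count: 0 means s_kin is a complete subset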
#%% combining dfs
# Define join type
#my_join = 'inner'
#my_join = 'right'
#my_join = 'left'
my_join = 'outer'

fail = False
# sanity check: how many muts from afor_kin_df are in afor_df?
# They should be a complete subset.
if ndiff2 == 0:
    print('PASS: all muts in afor_kin_df are present in afor_df'
          , '\nProceeding with combining the dfs...')
    combined_df = pd.merge(afor_df, afor_kin_df, on = merging_cols, how = my_join)

    if my_join in ('outer', 'left'):
        print('combining with:', my_join)
        expected_rows = afor_df_nrows + ndiff1

    if my_join in ('inner', 'right'):
        print('combining with:', my_join)
        expected_rows = afor_kin_df_nrows

    expected_cols = afor_df_ncols + afor_kin_df_ncols - ncommon_cols

    if len(combined_df) == expected_rows and len(combined_df.columns) == expected_cols:
        print('PASS: successfully combined dfs with:', my_join, 'join')
    else:
        print('FAIL: combined_df\'s expected rows and cols not matched')
        fail = True # BAD practice! just a placeholder to avoid code duplication
        print('\nExpected no. of rows:', expected_rows
              , '\nGot:', len(combined_df)
              , '\nExpected no. of cols:', expected_cols
              , '\nGot:', len(combined_df.columns))
    if fail:
        sys.exit('ERROR: combined_df may be incorrectly combined')
else:
    print('FAIL: numbers mismatch, mutations present in afor_kin_df but not in afor_df')
    sys.exit('ERROR: Not all mutations in the kinship df are present in the df with the other ORs')
#%% check duplicate cols: ones containing suffix '_x' or '_y'
# should only be position
foo = combined_df.filter(regex = r'.*_x|_y', axis = 1)
print(foo.columns) # should only be position

# drop the position col containing suffix '_y' and then rename the col without the suffix
combined_or_df = combined_df.drop(combined_df.filter(regex = r'.*_y').columns, axis = 1)
combined_or_df['position_x'].head()

# renaming columns
combined_or_df.rename(columns = {'position_x': 'position'}, inplace = True)
combined_or_df['position'].head()

# recheck
foo = combined_or_df.filter(regex = r'.*_x|_y', axis = 1)
print(foo.columns) # should now be empty
#%% rearranging columns
print('Dim of df before reordering:', combined_or_df.shape)
print(combined_or_df.columns)
#%% reorder columns
#https://stackoverflow.com/questions/13148429/how-to-change-the-order-of-dataframe-columns
# setting the column order
output_df = combined_or_df[['mutation', 'mutationinformation',
                            'wild_type', 'position', 'mutant_type',
                            'chr_num_allele', 'ref_allele', 'alt_allele',
                            'mut_info', 'mut_type', 'gene_id', 'gene_number', 'mut_region',
                            'reference_allele', 'alternate_allele', 'chromosome_number',
                            'af', 'af_kin', 'or_kin',
                            'or_logistic', 'or_mychisq', 'est_chisq', 'or_fisher',
                            'ci_low_logistic', 'ci_hi_logistic',
                            'ci_low_fisher', 'ci_hi_fisher',
                            'pwald_kin', 'pval_logistic', 'pval_fisher', 'pval_chisq',
                            'beta_logistic', 'beta_kin',
                            'se_logistic', 'se_kin',
                            'zval_logistic',
                            'logl_H1_kin', 'l_remle_kin',
                            'n_diff', 'tot_diff', 'n_miss']]

# sanity check after rearranging
if combined_or_df.shape == output_df.shape and set(combined_or_df.columns) == set(output_df.columns):
    print('PASS: Successfully formatted df with rearranged columns')
else:
    sys.exit('FAIL: something went wrong when rearranging columns!')
#%% write file
print('\n====================================================================='
      , '\nWriting output file:\n', outfile
      , '\nNo. of rows:', len(output_df)
      , '\nNo. of cols:', len(output_df.columns))
output_df.to_csv(outfile, index = False)
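#%% example invocation
# The script filename below is hypothetical; substitute whatever this file is saved as.
#   python combine_af_or.py --drug pyrazinamide --gene pncA
# or, using the short flags defined above:
#   python combine_af_or.py -d pyrazinamide -g pncA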