#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Created on Wed Jun 10 11:13:49 2020 @author: tanu """ #======================================================================= #%% useful links #https://chrisalbon.com/python/data_wrangling/pandas_join_merge_dataframe/ #https://kanoki.org/2019/11/12/how-to-use-regex-in-pandas/ #https://stackoverflow.com/questions/40348541/pandas-diff-with-string #======================================================================= import os, sys import pandas as pd import numpy as np import re import argparse homedir = os.path.expanduser('~') os.chdir(homedir + '/git/LSHTM_analysis/scripts') # local import from find_missense import find_missense #======================================================================= #%% command line args arg_parser = argparse.ArgumentParser() arg_parser.add_argument('-d', '--drug', help = 'drug name', default = None) arg_parser.add_argument('-g', '--gene', help = 'gene name (case sensitive)', default = None) arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data') arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + + input') arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + + output') # FIXME: remove defaults arg_parser.add_argument('-sc', '--start_coord', help = 'start of coding region (cds) of gene', default = None, type = int) # pnca cds arg_parser.add_argument('-ec', '--end_coord', help = 'end of coding region (cds) of gene', default = None, type = int) # pnca cds arg_parser.add_argument('--debug', action ='store_true', help = 'Debug Mode') args = arg_parser.parse_args() #======================================================================= #%% variables #gene = 'pncA' #drug = 'pyrazinamide' #start_cds = 2288681 #end_cds = 2289241 #%%===================================================================== # Command line options gene = args.gene drug = args.drug gene_match = gene + '_p.' datadir = args.datadir indir = args.input_dir outdir = args.output_dir start_cds = args.start_coord end_cds = args.end_coord #%%======================================================================= #============== # directories #============== if not datadir: datadir = homedir + '/' + 'git/Data' if not indir: indir = datadir + '/' + drug + '/input' if not outdir: outdir = datadir + '/' + drug + '/output' #======= # input #======= gene_info_filename = 'ns'+ gene.lower()+ '_snp_info.txt' #gene_info_filename = 'ns'+ gene.lower()+ '_snp_info.csv' gene_info = indir + '/' + gene_info_filename print('gene info file: ', gene_info , '\n============================================================') in_filename_or = 'ns'+ gene.lower()+ '_assoc.txt' gene_or = indir + '/' + in_filename_or print('gene OR file: ', gene_or , '\n============================================================') #======= # output #======= gene_or_filename = gene.lower() + '_af_or_kinship.csv' # other one is called AFandOR outfile_or_kin = outdir + '/' + gene_or_filename print('Output file: ', outfile_or_kin , '\n============================================================') #%% read files: preformatted using bash # or file: '...assoc.txt' # FIXME: call bash script from here or_df = pd.read_csv(gene_or, sep = '\t', header = 0, index_col = False) # 182, 12 (without filtering for missense muts, it was 212 i.e we 30 muts weren't missense) or_df.head() or_df.columns #%% snp_info file: master and gene specific ones # gene info info_df2 = pd.read_csv(gene_info, sep = '\t', header = 0) #447, 10 #info_df2 = pd.read_csv(gene_info, sep = ',', header = 0) #447 10 mis_mut_cover = (info_df2['chromosome_number'].nunique()/info_df2['chromosome_number'].count()) * 100 print('*****RESULT*****' , '\nPercentage of missense mut in pncA:', mis_mut_cover , '\n*****RESULT*****') # v6: 61.07 # v4: 65.7% #%% Extracting ref allele and alt allele as single letters # info_df has some of these params as more than a single letter, which means that # when you try to merge ONLY using chromosome_number, then it messes up... and is WRONG. # Hence the merge needs to be performed on a unique set of attributes which in our case # would be chromosome_number, ref_allele and alt_allele df_ncols = len(or_df.columns) print('Dim of df:',or_df.shape , '\nExtracting missense muts as single letters from: find_missense()') find_missense(or_df, ref_allele_column = 'ref_allele1', alt_allele_column = 'alt_allele0') print('Dim of revised df:', or_df.shape, ' after extraction of missense muts') # FIXME: import this from function ncols_added_func = 4 if or_df.shape[1] == df_ncols + ncols_added_func: print('PASS: Succesfuly extracted ref and alt alleles for missense muts') else: print('FAIL: No. of cols mismatch' ,'\nOriginal length:', df_ncols , '\nExpected no. of cols:', df_ncols + ncols_added_func , '\nGot:', or_df.shape[1] , '\nCheck hardcoded value of ncols_add?') if (or_df['tot_diff'] == 1).sum() == len(or_df) and (or_df['n_diff'] == 1).sum() == len(or_df) and or_df['n_diff'].equals(or_df['tot_diff']): print('PASS: missene muts correctly extracted from source') else: print('FAIL: n_diff and tot_diff differ, check source data') sys.exit() del(df_ncols, ncols_added_func) #%% check dtypes before merging #or_df.dtypes or_df.info() #info_df2.dtypes info_df2.info() #%% perform merge: or_df and snp_info print('Preparing dfs for merging... Finding common cols to merge') # find common columns #merging_cols = ['chromosome_number', 'ref_allele', 'alt_allele'] merging_cols = or_df.columns[or_df.columns.isin(info_df2.columns)].to_list() print('No. of common cols identified:', len(merging_cols) , '\nColumns to merge on:', merging_cols , '\nChecking dtypes in merging_cols...' , '\n=================================================') # make sure chromosome_number dtypes are consisent or_df[merging_cols].dtypes == info_df2[merging_cols].dtypes # info_df2 contains multiple chromosome number in the column, so it is not # possible to convert this to int. Therefore, converting to string in or_df column if not (or_df[merging_cols].dtypes == info_df2[merging_cols].dtypes).all(): print('Data types not same, converting chromsome_number to str in or_df') or_df['chromosome_number'] = or_df['chromosome_number'].astype(str) print('Checking after converting dtype in or_df') if (or_df[merging_cols].dtypes == info_df2[merging_cols].dtypes).all(): print('PASS: dfs ready to merge..') else: print('FAIL: unable to make dtypes consistent required for merging! Check dtypes') sys.exit() # %% sanity check and perform merge #my_join = 'inner' #my_join = 'outer' my_join = 'left' #my_join = 'right' expected_cols = or_df.shape[1] + info_df2.shape[1] - len(merging_cols) print('Merging 2 dfs: or_df and info_df' , '\nJoin type:', my_join , '\nColumns to merge on:', merging_cols , '\nExpected cols after merging:', expected_cols , '\n=================================================') dfm2 = pd.merge(or_df, info_df2, on = merging_cols, how = my_join, indicator = True) dfm2['_merge'].value_counts() expected_cols = expected_cols + 1 # due to indicator = T # count no. of nan in 'mut_type'. ideally should be 0 if not dfm2['mut_type'].isna().sum() > 0: print('Good merging, no NA detected') else: print('OR detected without metadata' , '\nNo. of NA in dfm2:', dfm2['mut_type'].isna().sum() , '\nWriting these to output file to check later with jody!') dfm2_missing_info = dfm2[dfm2['mut_type'].isna()] missing_or_metadata = "or_kinship_missing_metadata.csv" outfile_missing_or_metadata = outdir + '/' + str(dfm2['mut_type'].isna().sum()) + '_' + missing_or_metadata print('\noutput file:', outfile_missing_or_metadata) dfm2_missing_info.to_csv(outfile_missing_or_metadata, index = False) print('Finsihed writing file' , '\nDim:', dfm2_missing_info.shape) #PENDING Jody's reply # !!!!!!!! # drop nan from dfm2_mis as these are not useful print('Dropping NAs before further processing...') dfm2_mis = dfm2[dfm2['mut_type'].notnull()] # !!!!!!!! #%% extract mut info into three cols df_ncols = len(dfm2_mis.columns) print('Dim of df to add cols to:', dfm2_mis.shape) # column names already present, wrap this in a if and perform sanity check ncols_add = 0 if not 'wild_type' in dfm2_mis.columns: print('Extracting and adding column: wild_type' , '\n===============================================================') dfm2_mis['wild_type'] = dfm2_mis['mut_info_f1'].str.extract('(\w{1})>') ncols_add+=1 if not 'position' in dfm2_mis.columns: print('Extracting and adding column: position' , '\n===============================================================') dfm2_mis['position'] = dfm2_mis['mut_info_f1'].str.extract('(\d+)') #dfm2_mis['position'] = dfm2_mis[:,'mut_info_f1'].str.extract('(\d+)') ncols_add+=1 if not 'mutant_type' in dfm2_mis.columns: print('Extracting and adding column: mutant_type' , '\n================================================================') dfm2_mis['mutant_type'] = dfm2_mis['mut_info_f1'].str.extract('>\d+(\w{1})') ncols_add+=1 if not 'mutationinformation' in dfm2_mis.columns: print('combining to create column: mutationinformation' , '\n===============================================================') dfm2_mis['mutationinformation'] = dfm2_mis['wild_type'] + dfm2_mis['position'] + dfm2_mis['mutant_type'] ncols_add+=1 print('No. of cols added:', ncols_add) if len(dfm2_mis.columns) == df_ncols + ncols_add: print('PASS: mcsm style muts added to df' , '\n===============================================================') else: print('FAIL: No. of cols mismatch' ,'\nOriginal length:', df_ncols , '\nExpected no. of cols:', df_ncols + ncols_add , '\nGot:', len(dfm2_mis.columns)) sys.exit() del(df_ncols, ncols_add) #%% now adding mutation style = _p.abc1cde dfm2_mis['mutation'] = gene.lower() + '_' + dfm2_mis['mut_info_f2'].astype(str) # convert to lowercase dfm2_mis['mutation'] = dfm2_mis['mutation'].str.lower() # quick sanity check check = dfm2_mis['mutation'].value_counts().value_counts() == dfm2_mis['mut_info_f2'].value_counts().value_counts() if check.all(): print('PASS: added column "mutation" containing mutation format: _p.abc1cde') else: print('FAIL: could not add "mutation" column!') sys.exit() #%% Calculating OR from beta coeff print('Calculating OR...') df_ncols = dfm2_mis.shape[1] print('No. of cols pre-formatting data:', df_ncols , '\n===================================================================') #1) Add column: OR for kinship calculated from beta coef ncols_add = 0 if not 'or_kin' in dfm2_mis.columns: #dfm2_mis['or_kin'] = np.exp(dfm2_mis['beta']) # gives copy warning dfm2_mis.loc[:,'or_kin'] = np.exp(dfm2_mis.loc[:,'beta']) print(dfm2_mis['or_kin'].head()) ncols_add+=1 print('Calculating OR from beta coeff by exponent function and adding column:' , '\nNo. of cols added:', ncols_add , '\n', dfm2_mis['beta'].head() , '\nNo. of cols after adding OR_kin:', len(dfm2_mis.columns) , '\n===================================================================') if dfm2_mis.shape[1] == df_ncols + ncols_add: print('PASS: Dimension of df match' , '\nDim of df:', dfm2_mis.shape , '\n================================================================') else: print('FAIL: Dim mismatch' , '\nOriginal no. of cols:', df_ncols , '\nExpected no. of cols:', df_ncols + ncols_add , '\nGot:', dfm2_mis.shape[1]) sys.exit() #2) rename columns to reflect that it is coming from kinship matrix adjustment dfm2_mis.rename(columns = {'af': 'af_kin' , 'beta': 'beta_kin' , 'p_wald': 'pwald_kin' , 'se': 'se_kin', 'logl_H1': 'logl_H1_kin' , 'l_remle': 'l_remle_kin' , 'ref_allele1': 'reference_allele'}, inplace = True) del(df_ncols, ncols_add) #%%==============================!!!!!!!======================================= # FIXME: should be at source # checking tot_diff column #print((dfm2_mis['tot_diff']==1).all()) and remove these cols #%%==============================!!!!!!!======================================= #3) drop some not required cols (including duplicate if you want) #3a) drop duplicate columns dfm2_mis2 = dfm2_mis.T.drop_duplicates().T #changes dtypes in cols, only used for sanity check dup_cols = set(dfm2_mis.columns).difference(dfm2_mis2.columns) print('Total no of duplicate columns:', len(dup_cols) , '\nDuplicate columns identified:', dup_cols , '\n===================================================================') #dup_cols = {'alt_allele0', 'ps'} # didn't want to remove tot_diff #print('removing duplicate columns: kept one of the dup_cols i.e tot_diff') df_ncols = dfm2_mis.shape[1] print('Removing', len(dup_cols), 'duplicate columns:', dup_cols , '\nOriginal dim:', dfm2_mis.shape) dfm2_mis.drop(list(dup_cols), axis = 1, inplace = True) if dfm2_mis.shape[1] == df_ncols - len(dup_cols): print('PASS: Dimensions match' , '\nDim:', dfm2_mis.shape , '\nRemoved', len(dup_cols), 'columns from' , df_ncols , '\n===============================================================') else: print('FAIL: Dimensions mismatch' , '\nOriginal no. of cols:', df_ncols , '\nNo. of cols to drop:', len(dup_cols) , '\nExpected:', df_ncols - len(dup_cols) , '\nGot:', dfm2_mis.shape[1]) sys.exit() del(df_ncols) #3b) other not useful columns print('Dropping other redundant or unnecessary columns...') cols_to_drop = ['chromosome_text', 'n_diff', 'chr', '_merge' , 'mut_region' , 'reference_allele', 'alternate_allele'] df_ncols = dfm2_mis.shape[1] dfm2_mis.drop(cols_to_drop, axis = 1, inplace = True) #dfm2_mis.rename(columns = {'ref_allele1': 'reference_allele'}, inplace = True) if dfm2_mis.shape[1] == df_ncols - len(cols_to_drop): print('PASS:', len(cols_to_drop), 'columns successfully dropped' , '\nDim:', dfm2_mis.shape , '\nRemoved', len(cols_to_drop), 'columns from', df_ncols , '\nDim after dropping', len(cols_to_drop), 'columns:', dfm2_mis.shape , '\n===========================================') else: print('FAIL: Dimensions mismatch' , '\nOriginal no. of cols:', df_ncols , '\nExpected:', df_ncols - len(cols_to_drop) , '\nGot:', dfm2_mis.shape[1]) sys.exit() del(df_ncols) #%%===================================================================== #4) reorder columnn print('Reordering', dfm2_mis.shape[1], 'columns' , '\n===============================================') #dfm2_mis.columns column_order = ['mutation', 'mutationinformation', 'wild_type', 'position', 'mutant_type', #'chr_num_allele', 'ref_allele', 'alt_allele', 'mut_info_f1', 'mut_info_f2', 'mut_type', 'gene_id', 'gene_name', 'chromosome_number', #'afs 'af_kin', 'or_kin', # 'ors_logistic', # 'ors_chi_cus', # 'ors_fisher', 'pwald_kin', # 'pvals_logistic', # 'pvals_fisher', # 'ci_lb_fisher', # 'ci_ub_fisher' , 'beta_kin', 'se_kin', 'logl_H1_kin', 'l_remle_kin', # 'stat_chi', # 'pvals_chi', # 'n_diff', # 'tot_diff', 'n_miss', #'wt_3let', #'mt_3let' ] if (len(column_order) == dfm2_mis.shape[1] and pd.DataFrame(column_order).isin(dfm2_mis.columns).all()).all(): print('PASS: Column order generated for', len(column_order), 'columns' , '\nColumn names match to perform reordering' , '\nApplying column order to df...' ) orkin_linked = dfm2_mis[column_order] else: print('FAIL: Mismatch in no. of cols to reorder' , '\nNo. of cols in df to reorder:', dfm2_mis.shape[1] , '\nOrder generated for:', len(column_order), 'columns' , '\n', dfm2_mis.shape[1], 'should match', len(column_order)) sys.exit() # sanity check after reassigning columns if orkin_linked.shape == dfm2_mis.shape and set(orkin_linked.columns) == set(dfm2_mis.columns): print('PASS: Successfully formatted df with rearranged columns') else: sys.exit('FAIL: something went wrong when rearranging columns!') # converting position and chromosome number to numeric orkin_linked.dtypes #orkin_linked['chromosome_number'] = pd.to_numeric(orkin_linked['chromosome_number']) orkin_linked[['chromosome_number', 'position']] = orkin_linked[['chromosome_number', 'position']].apply(pd.to_numeric) orkin_linked.dtypes # write frequency of position counts orkin_pos = pd.DataFrame(orkin_linked['position']) z = orkin_pos['position'].value_counts() z1 = z.to_dict() orkin_pos['or_pos_count'] = orkin_pos['position'].map(z1) orkin_pos['or_pos_count'].value_counts() orkin_linked['position'] foo = orkin_linked['position'].value_counts() # order df by position orkin_linked_o = orkin_linked.sort_values(by = ['position']) bar = orkin_linked_o['position'].value_counts() if (foo == bar).all(): print('PASS: df reorderd by position for output' , '\nWriting output file') else: print('FAIL: could not reorder by position') sys.exit() #%% write file print('\n=====================================================================' , '\nWriting output file:\n', outfile_or_kin , '\nNo. of rows:', len(dfm2_mis) , '\nNo. of cols:', len(dfm2_mis.columns)) orkin_linked_o.to_csv(outfile_or_kin, index = False) #%% diff b/w allele0 and 1: or_df #https://stackoverflow.com/questions/40348541/pandas-diff-with-string #df = or_df.iloc[[5, 15, 17, 19, 34]] #df[['alt_allele0','ref_allele1']].ne(df[['alt_allele0','ref_allele1']].shift()).any(axis=1).astype(int) #df[['alt_allele0','ref_allele1']].ne(df[['alt_allele0','ref_allele1']].shift()).any(axis=1).astype(int)