#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Created on Wed Jun 10 11:13:49 2020 @author: tanu """ #======================================================================= #%% useful links #https://chrisalbon.com/python/data_wrangling/pandas_join_merge_dataframe/ #https://kanoki.org/2019/11/12/how-to-use-regex-in-pandas/ #https://stackoverflow.com/questions/40348541/pandas-diff-with-string #======================================================================= #%% specify dirs import os, sys import pandas as pd import numpy as np import re import argparse homedir = os.path.expanduser('~') os.chdir(homedir + '/git/LSHTM_analysis/scripts') # local import from find_missense import find_missense #======================================================================= #%% command line args arg_parser = argparse.ArgumentParser() arg_parser.add_argument('-d', '--drug', help = 'drug name', default = 'pyrazinamide') arg_parser.add_argument('-g', '--gene', help = 'gene name (case sensitive)', default = 'pncA') # case sensitive arg_parser.add_argument('-s', '--start_coord', help = 'start of coding region (cds) of gene', default = 2288681) # pnca cds arg_parser.add_argument('-e', '--end_coord', help = 'end of coding region (cds) of gene', default = 2289241) # pnca cds args = arg_parser.parse_args() #======================================================================= #%% variables #gene = 'pncA' #drug = 'pyrazinamide' #start_cds = 2288681 #end_cds = 2289241 # cmd variables gene = args.gene drug = args.drug start_cds = args.start_coord end_cds = args.end_coord #======================================================================= #%% input and output dirs and files #======= # data dir #======= datadir = homedir + '/' + 'git/Data' indir = datadir + '/' + drug + '/input' outdir = datadir + '/' + drug + '/output' #======= # input #======= info_filename = 'snp_info.txt' snp_info = datadir + '/' + info_filename print('Info file: ', snp_info , '\n============================================================') gene_info_filename = 'ns'+ gene.lower()+ '_snp_info.txt' gene_info = indir + '/' + gene_info_filename print('gene info file: ', gene_info , '\n============================================================') in_filename_or = 'ns'+ gene.lower()+ '_assoc.txt' gene_or = indir + '/' + in_filename_or print('gene OR file: ', gene_or , '\n============================================================') #======= # output #======= gene_or_filename = gene.lower() + '_af_or_kinship.csv' # other one is called AFandOR outfile_or_kin = outdir + '/' + gene_or_filename print('Output file: ', outfile_or_kin , '\n============================================================') #%% read files: preformatted using bash # or file: '...assoc.txt' or_df = pd.read_csv(gene_or, sep = '\t', header = 0, index_col = False) # 182, 12 (without filtering for missense muts, it was 212 i.e we 30 muts weren't missense) or_df.head() or_df.columns #%% snp_info file: master and gene specific ones # gene info #info_df2 = pd.read_csv('nssnp_info_pnca.txt', sep = '\t', header = 0) #303, 10 info_df2 = pd.read_csv(gene_info, sep = '\t', header = 0) #303, 10 mis_mut_cover = (info_df2['chromosome_number'].nunique()/info_df2['chromosome_number'].count()) * 100 print('*****RESULT*****' , '\nPercentage of MISsense mut in pncA:', mis_mut_cover , '\n*****RESULT*****') #65.7% # large file #info_df = pd.read_csv('snp_info.txt', sep = '\t', header = None) #12010 info_df = pd.read_csv(snp_info, sep = '\t') #12010 #info_df.columns = ['chromosome_number', 'ref_allele', 'alt_allele', 'snp_info'] #12009, 4 info_df['chromosome_number'].nunique() #10257 mut_cover = (info_df['chromosome_number'].nunique()/info_df['chromosome_number'].count()) * 100 print('*****RESULT*****' ,'\nPercentage of mutations in pncA:', mut_cover , '\n*****RESULT*****') #85.4% # extract unique chr position numbers genomic_pos = info_df['chromosome_number'].unique() genomic_pos_df = pd.DataFrame(genomic_pos, columns = ['chr_pos']) genomic_pos_df.dtypes genomic_pos_min = info_df['chromosome_number'].min() genomic_pos_max = info_df['chromosome_number'].max() # genomic coord for pnca coding region cds_len = (end_cds-start_cds) + 1 pred_prot_len = (cds_len/3) - 1 # mindblowing: difference b/w bitwise (&) and 'and' # DO NOT want &: is this bit set to '1' in both variables? Is this what you want? #if (genomic_pos_min <= start_cds) & (genomic_pos_max >= end_cds): print('*****RESULT*****' , '\nlength of coding region:', cds_len, 'bp' , '\npredicted protein length:', pred_prot_len, 'aa' , '\n*****RESULT*****') if genomic_pos_min <= start_cds and genomic_pos_max >= end_cds: print ('PASS: coding region for gene included in snp_info.txt') else: print('FAIL: coding region for gene not included in info file snp_info.txt') sys.exit('ERROR: coding region of gene not included in the info file') #%% Extracting ref allele and alt allele as single letters # info_df has some of these params as more than a single letter, which means that # when you try to merge ONLY using chromosome_number, then it messes up... and is WRONG. # Hence the merge needs to be performed on a unique set of attributes which in our case # would be chromosome_number, ref_allele and alt_allele #FIXME: Turn to a function orig_len = len(or_df.columns) #find_missense(or_df, 'ref_allele1', 'alt_allele0') find_missense(or_df, ref_allele_column = 'ref_allele1', alt_allele_column = 'alt_allele0') ncols_add = 4 if len(or_df.columns) == orig_len + ncols_add: print('PASS: Succesfuly extracted ref and alt alleles for missense muts') else: print('FAIL: No. of cols mismatch' ,'\noriginal length:', orig_len , '\nExpected no. of cols:', orig_len + ncols_add , '\nGot no. of cols:', len(or_df.columns)) sys.exit() del(orig_len, ncols_add) #%% TRY MERGE # check dtypes or_df.dtypes info_df.dtypes #or_df.info() # pandas documentation where it mentions: "Pandas uses the object dtype for storing strings" # check how many unique chr_num in info_df are in or_df genomic_pos_df['chr_pos'].isin(or_df['chromosome_number']).sum() #144 # check how many chr_num in or_df are in info_df: should be ALL of them or_df['chromosome_number'].isin(genomic_pos_df['chr_pos']).sum() #182 # sanity check 2 if or_df['chromosome_number'].isin(genomic_pos_df['chr_pos']).sum() == len(or_df): print('PASS: all genomic locs in or_df have meta datain info.txt') else: sys.exit('FAIL: some genomic locs or_df chr number DO NOT have meta data in snp_info.txt') #%% Perform merge #my_join = 'inner' #my_join = 'outer' my_join = 'left' #my_join = 'right' #dfm1 = pd.merge(or_df, info_df, on ='chromosome_number', how = my_join, indicator = True) # not unique! dfm1 = pd.merge(or_df, info_df, on = ['chromosome_number', 'ref_allele', 'alt_allele'], how = my_join, indicator = True) dfm1['_merge'].value_counts() # count no. of missense mutations ONLY dfm1.snp_info.str.count(r'(missense.*)').sum() dfm2 = pd.merge(or_df, info_df2, on = ['chromosome_number', 'ref_allele', 'alt_allele'], how = my_join, indicator = True) dfm2['_merge'].value_counts() # count no. of nan dfm2['mut_type'].isna().sum() # drop nan dfm2_mis = dfm2[dfm2['mut_type'].notnull()] #%% sanity check # count no. of missense muts if len(dfm1) - dfm1.snp_info.str.count(r'(missense.*)').sum() == dfm2['mut_type'].isna().sum(): print('PASSED: numbers cross checked' , '\nTotal no. of missense mutations:', dfm1.snp_info.str.count(r'(missense.*)').sum() , '\nNo. of mutations falsely assumed to be missense:', len(dfm1) - dfm1.snp_info.str.count(r'(missense.*)').sum()) # two ways to filter to get only missense muts test = dfm1[dfm1['snp_info'].str.count('missense.*')>0] dfm1_mis = dfm1[dfm1['snp_info'].str.match('(missense.*)') == True] test.equals(dfm1_mis) # drop nan dfm2_mis = dfm2[dfm2['mut_type'].notnull()] if dfm1_mis[['chromosome_number', 'ref_allele', 'alt_allele']].equals(dfm2_mis[['chromosome_number', 'ref_allele', 'alt_allele']]): print('PASS: Further cross checks successful') else: print('FAIL: Second cross check unsuccessfull. Debug please!') sys.exit() #%% extract mut info into three cols orig_len = len(dfm2_mis.columns) dfm2_mis['wild_type'] = dfm2_mis['mut_info'].str.extract('(\w{1})>') dfm2_mis['position'] = dfm2_mis['mut_info'].str.extract('(\d+)') dfm2_mis['mutant_type'] = dfm2_mis['mut_info'].str.extract('>\d+(\w{1})') dfm2_mis['mutationinformation'] = dfm2_mis['wild_type'] + dfm2_mis['position'] + dfm2_mis['mutant_type'] # sanity check ncols_add = 4 if len(dfm2_mis.columns) == orig_len + ncols_add: print('PASS: Succesfully extracted and added mutationinformation(mcsm style)') else: print('FAIL: No. of cols mismatch' ,'\noriginal length:', orig_len , '\nExpected no. of cols:', orig_len + ncols_add , '\nGot no. of cols:', len(dfm2_mis.columns)) sys.exit() #%% formatting data for output print('no of cols preformatting data:', len(dfm2_mis.columns)) #1) Add column: OR for kinship calculated from beta coeff print('converting beta coeff to OR by exponent function\n:' , dfm2_mis['beta'].head()) dfm2_mis['or_kin'] = np.exp(dfm2_mis['beta']) print(dfm2_mis['or_kin'].head()) #2) rename af column dfm2_mis.rename(columns = {'af': 'af_kin' , 'beta': 'beta_kin' , 'p_wald': 'pwald_kin' , 'se': 'se_kin', 'logl_H1': 'logl_H1_kin' , 'l_remle': 'l_remle_kin'}, inplace = True) #3) drop some not required cols (including duplicate if you want) #3a) drop duplicate columns dfm2_mis2 = dfm2_mis.T.drop_duplicates().T #changes dtypes in cols, so not used dup_cols = set(dfm2_mis.columns).difference(dfm2_mis2.columns) print('Duplicate columns identified:', dup_cols) dup_cols = {'alt_allele0', 'ps'} # didn't want to remove tot_diff print('removing duplicate columns: kept one of the dup_cols i.e tot_diff') dfm2_mis.drop(list(dup_cols), axis = 1, inplace = True) print(dfm2_mis.columns) #3b) other not useful columns dfm2_mis.drop(['chromosome_text', 'chr', 'symbol', '_merge', ], axis = 1, inplace = True) dfm2_mis.rename(columns = {'ref_allele1': 'reference_allele'}, inplace = True) print(dfm2_mis.columns) #4) reorder columns orkin_linked = dfm2_mis[['mutationinformation', 'wild_type', 'position', 'mutant_type', 'chr_num_allele', 'ref_allele', 'alt_allele', 'mut_info', 'mut_type', 'gene_id', 'gene_number', 'mut_region', 'reference_allele', 'alternate_allele', 'chromosome_number', #'afs 'af_kin', 'or_kin', # 'ors_logistic', # 'ors_chi_cus', # 'ors_fisher', 'pwald_kin', # 'pvals_logistic', # 'pvals_fisher', # 'ci_lb_fisher', # 'ci_ub_fisher' , 'beta_kin', 'se_kin', 'logl_H1_kin', 'l_remle_kin', # 'stat_chi', # 'pvals_chi', 'n_diff', 'tot_diff', 'n_miss']] # sanity check after reassigning columns if orkin_linked.shape == dfm2_mis.shape and set(orkin_linked.columns) == set(dfm2_mis.columns): print('PASS: Successfully formatted df with rearranged columns') else: sys.exit('FAIL: something went wrong when rearranging columns!') #%% write file print('\n=====================================================================' , '\nWriting output file:\n', outfile_or_kin , '\nNo.of rows:', len(dfm2_mis) , '\nNo. of cols:', len(dfm2_mis.columns)) orkin_linked.to_csv(outfile_or_kin, index = False) #%% diff b/w allele0 and 1: or_df #https://stackoverflow.com/questions/40348541/pandas-diff-with-string #df = or_df.iloc[[5, 15, 17, 19, 34]] #df[['alt_allele0','ref_allele1']].ne(df[['alt_allele0','ref_allele1']].shift()).any(axis=1).astype(int) #df[['alt_allele0','ref_allele1']].ne(df[['alt_allele0','ref_allele1']].shift()).any(axis=1).astype(int)