diff --git a/scripts/find_missense.py b/scripts/find_missense.py new file mode 100755 index 0000000..ae52065 --- /dev/null +++ b/scripts/find_missense.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python3 + +import pandas as pd + +DEBUG = False +#%% +#def find_missense(test_df, ref_allele1, alt_allele0): +def find_missense(test_df, ref_allele_column, alt_allele_column, n_diff_colname = 'n_diff', tot_diff_colname = 'tot_diff', ref_a_colname = 'ref_allele', alt_a_colname = 'alt_allele'): + + + """Find mismatches in pairwise comparison of strings b/w col_a and col_b + + Case insensitive, converts strings to uppercase before comparison + + @test_df: df containing columns to compare + @type: pandas df + + @ref_allele_column: column containing ref allele str + @type: str (converts to uppercase) + + @alt_allele_column: column containing alt_allele str + @type: str (converts to uppercase) + + @n_diff_colname: user defined colname for no. of char diff b/w ref_allele_str and alt_allele_str + @type: str + + @tot_diff_colname: user defined colname abs diff to indicate if strings are of equal length + @type: str + + @ref_a_colname: user defined colname containing extracted referece allele + @type: str + + @alt_a_colname: user defined colname containing extracted alt allele + @type: str + + returns df: with 4 columns. If column names clash, the function column + name will override original column + @rtype: pandas df + """ + for ind, val in test_df.iterrows(): + if DEBUG: + print('index:', ind, 'value:', val + , '\n============================================================') + ref_a = val[ref_allele_column].upper() + alt_a = val[alt_allele_column].upper() + if DEBUG: + print('ref_allele_string:', ref_a, 'alt_allele_string:', alt_a) + difference = sum(1 for e in zip(ref_a, alt_a) if e[0] != e[1]) + test_df.at[ind, n_diff_colname] = difference # adding column + tot_difference = difference + abs(len(ref_a) - len(alt_a)) + test_df.at[ind, tot_diff_colname] = tot_difference # adding column + if difference != tot_difference: + print('WARNING: lengths of ref_allele and alt_allele differ at index:', ind + , '\nNon-missense muts detected') + + # Now finding the mismatched char + ref_aln = '' + alt_aln = '' + if ref_a == alt_a: + ##test_df.at[ind, 'ref_allele'] = 'no_change' # adding column + ##test_df.at[ind, 'alt_allele'] = 'no_change' # adding column + test_df.at[ind, ref_a_colname] = 'no_change' # adding column + test_df.at[ind, alt_a_colname] = 'no_change' # adding column + elif len(ref_a) == len(alt_a) and len(ref_a) > 0: + print('ref:', ref_a, 'alt:', alt_a) + for n in range(len(ref_a)): + if ref_a[n] != alt_a[n]: + ref_aln += ref_a[n] + alt_aln += alt_a[n] + ##test_df.at[ind, 'ref_allele'] = ref_aln + ##test_df.at[ind, 'alt_allele'] = alt_aln + test_df.at[ind, ref_a_colname] = ref_aln + test_df.at[ind, alt_a_colname] = alt_aln + print('ref:', ref_aln) + print('alt:', alt_aln) + else: + ##test_df.at[ind, 'ref_allele'] = 'ERROR_Not_nsSNP' + ##test_df.at[ind, 'alt_allele'] = 'ERROR_Not_nsSNP' + test_df.at[ind, ref_a_colname] = 'ERROR_Not_nsSNP' + test_df.at[ind, alt_a_colname] = 'ERROR_Not_nsSNP' + + return test_df +#======================================== +# a representative example +#eg_df = pd.read_csv('pnca_assoc.txt', sep = '\t', nrows = 10, header = 0, index_col = False) +#eg_df = pd.DataFrame(eg_df) +#eg_df = {'chromosome_number': [2288719, 2288766, 2288775, 2288779, 2288827, 1111111, 2222222], +# 'ref_allele1': ['Tc', 'AG', 'AGCACCCTG', 'CCCTGGTGGCC', 'CACA', 'AA', 'CAT'], +# 'alt_allele0': ['CC', 'CA', 'GGCACCCTGZ','TCCTGGTGGCCAAD', 'TACA', 'AA', 'TCZ']} + +#def main(): + #find_missense(eg_df, ref_allele1 = 'ref_allele', alt_allele0 = 'alt_allele') +# find_missense(test_df = eg_df, ref_allele_column = 'ref_allele1', alt_allele_column = 'alt_allele0') +# print(eg_df) + +#if __name__ == '__main__': +# main() diff --git a/scripts/or_kinship_link.py b/scripts/or_kinship_link.py new file mode 100755 index 0000000..de1d93b --- /dev/null +++ b/scripts/or_kinship_link.py @@ -0,0 +1,399 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Wed Jun 10 11:13:49 2020 + +@author: tanu +""" +#======================================================================= +#%% useful links +#https://chrisalbon.com/python/data_wrangling/pandas_join_merge_dataframe/ +#https://kanoki.org/2019/11/12/how-to-use-regex-in-pandas/ +#https://stackoverflow.com/questions/40348541/pandas-diff-with-string +#======================================================================= +#%% specify dirs +import os, sys +import pandas as pd +import numpy as np +import re +import argparse + +homedir = os.path.expanduser('~') +os.chdir(homedir + '/git/LSHTM_analysis/scripts') +# local import +from find_missense import find_missense +#======================================================================= +#%% command line args +arg_parser = argparse.ArgumentParser() + +arg_parser.add_argument('-d', '--drug', help = 'drug name', default = None) +arg_parser.add_argument('-g', '--gene', help = 'gene name (case sensitive)', default = None) + +arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data') +arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + + input') +arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + + output') + +# FIXME: remove defaults +arg_parser.add_argument('-s', '--start_coord', help = 'start of coding region (cds) of gene', default = 2288681) # pnca cds +arg_parser.add_argument('-e', '--end_coord', help = 'end of coding region (cds) of gene', default = 2289241) # pnca cds + +arg_parser.add_argument('--debug', action ='store_true', help = 'Debug Mode') + +args = arg_parser.parse_args() +#======================================================================= +#%% variables +#gene = 'pncA' +#drug = 'pyrazinamide' +#start_cds = 2288681 +#end_cds = 2289241 + +# cmd variables +gene = args.gene +drug = args.drug +gene_match = gene + '_p.' + +datadir = args.datadir +indir = args.input_dir +outdir = args.output_dir + +start_cds = args.start_coord +end_cds = args.end_coord + +#======================================================================= +#%% input and output dirs and files +#============== +# directories +#============== +if not datadir: + datadir = homedir + '/' + 'git/Data' + +if not indir: + indir = datadir + '/' + drug + '/input' + +if not outdir: + outdir = datadir + '/' + drug + '/output' + +#======= +# input +#======= +info_filename = 'snp_info.txt' +snp_info = datadir + '/' + info_filename +print('Info file: ', snp_info + , '\n============================================================') + +#gene_info_filename = 'ns'+ gene.lower()+ '_snp_info.txt' # without headers +gene_info_filename = 'ns'+ gene.lower()+ '_snp_info.csv' +gene_info = indir + '/' + gene_info_filename +print('gene info file: ', gene_info + , '\n============================================================') + + +in_filename_or = 'ns'+ gene.lower()+ '_assoc.txt' +gene_or = indir + '/' + in_filename_or +print('gene OR file: ', gene_or + , '\n============================================================') + +#======= +# output +#======= +gene_or_filename = gene.lower() + '_af_or_kinship.csv' # other one is called AFandOR +outfile_or_kin = outdir + '/' + gene_or_filename +print('Output file: ', outfile_or_kin + , '\n============================================================') + +#%% read files: preformatted using bash +# or file: '...assoc.txt' +# FIXME: call bash script from here + +or_df = pd.read_csv(gene_or, sep = '\t', header = 0, index_col = False) # 182, 12 (without filtering for missense muts, it was 212 i.e we 30 muts weren't missense) +or_df.head() +or_df.columns +#%% snp_info file: master and gene specific ones + +# gene info +#info_df2 = pd.read_csv('nssnp_info_pnca.txt', sep = '\t', header = 0) #303, 10 +info_df2 = pd.read_csv(gene_info, sep = ',', header = 0) #303, 10 +mis_mut_cover = (info_df2['chromosome_number'].nunique()/info_df2['chromosome_number'].count()) * 100 +print('*****RESULT*****' + , '\nPercentage of missense mut in pncA:', mis_mut_cover + , '\n*****RESULT*****') #65.7% + +# large file +#info_df = pd.read_csv('snp_info.txt', sep = '\t', header = None) #12010 +info_df = pd.read_csv(snp_info, sep = '\t') #12010 +info_df.columns +#info_df.columns = ['chromosome_number', 'ref_allele', 'alt_allele', 'snp_info'] #12009, 4 + +info_df['chromosome_number'].nunique() #10257 +mut_cover = (info_df['chromosome_number'].nunique()/info_df['chromosome_number'].count()) * 100 +print('*****RESULT*****' + ,'\nPercentage of mutations in pncA:', mut_cover + , '\n*****RESULT*****') #85.4% + +# extract unique chr position numbers +genomic_pos = info_df['chromosome_number'].unique() +genomic_pos_df = pd.DataFrame(genomic_pos, columns = ['chr_pos']) +genomic_pos_df.dtypes + +genomic_pos_min = info_df['chromosome_number'].min() +genomic_pos_max = info_df['chromosome_number'].max() + +# genomic coord for pnca coding region +cds_len = (end_cds-start_cds) + 1 +pred_prot_len = (cds_len/3) - 1 + +# mindblowing: difference b/w bitwise (&) and 'and' +# DO NOT want &: is this bit set to '1' in both variables? Is this what you want? +#if (genomic_pos_min <= start_cds) & (genomic_pos_max >= end_cds): +print('*****RESULT*****' + , '\nlength of coding region:', cds_len, 'bp' + , '\npredicted protein length:', pred_prot_len, 'aa' + , '\n*****RESULT*****') + +if genomic_pos_min <= start_cds and genomic_pos_max >= end_cds: + print ('PASS: coding region for gene included in snp_info.txt') +else: + sys.exit('FAIL: coding region for gene not included in info file snp_info.txt') + +#%% Extracting ref allele and alt allele as single letters +# info_df has some of these params as more than a single letter, which means that +# when you try to merge ONLY using chromosome_number, then it messes up... and is WRONG. +# Hence the merge needs to be performed on a unique set of attributes which in our case +# would be chromosome_number, ref_allele and alt_allele + +orig_len = len(or_df.columns) + +#find_missense(or_df, 'ref_allele1', 'alt_allele0') +# adds columns to df passed +find_missense(or_df, ref_allele_column = 'ref_allele1', alt_allele_column = 'alt_allele0') + +ncols_add = 4 +if len(or_df.columns) == orig_len + ncols_add: + print('PASS: Succesfuly extracted ref and alt alleles for missense muts') +else: + print('FAIL: No. of cols mismatch' + ,'\noriginal length:', orig_len + , '\nExpected no. of cols:', orig_len + ncols_add + , '\nGot no. of cols:', len(or_df.columns) + , '\nCheck hardcoded value of ncols_add?') + sys.exit() + +del(orig_len, ncols_add) + +#%% TRY MERGE +# check dtypes +or_df.dtypes +info_df.dtypes +#or_df.info() + +# pandas documentation where it mentions: "Pandas uses the object dtype for storing strings" +# check how many unique chr_num in info_df are in or_df +genomic_pos_df['chr_pos'].isin(or_df['chromosome_number']).sum() #144 + +# check how many chr_num in or_df are in info_df: should be ALL of them +or_df['chromosome_number'].isin(genomic_pos_df['chr_pos']).sum() #182 + +# sanity check 2 +if or_df['chromosome_number'].isin(genomic_pos_df['chr_pos']).sum() == len(or_df): + print('PASS: all genomic locs in or_df have meta datain info.txt') +else: + sys.exit('FAIL: some genomic locs or_df chr number DO NOT have meta data in snp_info.txt') +#%% Perform merge + +#my_join = 'inner' +#my_join = 'outer' +my_join = 'left' +#my_join = 'right' + +#dfm1 = pd.merge(or_df, info_df, on ='chromosome_number', how = my_join, indicator = True) # not unique! +dfm1 = pd.merge(or_df, info_df, on = ['chromosome_number', 'ref_allele', 'alt_allele'], how = my_join, indicator = True) +dfm1['_merge'].value_counts() + +# count no. of missense mutations ONLY +dfm1.snp_info.str.count(r'(missense.*)').sum() + +dfm2 = pd.merge(or_df, info_df2, on = ['chromosome_number', 'ref_allele', 'alt_allele'], how = my_join, indicator = True) +dfm2['_merge'].value_counts() + +print(or_df.columns, info_df2.columns) + +# count no. of nan +print('No. of NA in dfm2:', dfm2['mut_type'].isna().sum()) + +# drop nan +#dfm2_mis = dfm2[dfm2['mut_type'].notnull()] + +#%% sanity check +# count no. of missense muts +if len(dfm1) - dfm1.snp_info.str.count(r'(missense.*)').sum() == dfm2['mut_type'].isna().sum(): + print('PASSED: numbers cross checked' + , '\nTotal no. of missense mutations:', dfm1.snp_info.str.count(r'(missense.*)').sum() + , '\nNo. of mutations falsely assumed to be missense:', len(dfm1) - dfm1.snp_info.str.count(r'(missense.*)').sum()) + +# two ways to filter to get only missense muts +test = dfm1[dfm1['snp_info'].str.count('missense.*')>0] +dfm1_mis = dfm1[dfm1['snp_info'].str.match('(missense.*)') == True] +test.equals(dfm1_mis) + +# drop nan from dfm2 +dfm2_mis = dfm2[dfm2['mut_type'].notnull()] + +if dfm1_mis[['chromosome_number', 'ref_allele', 'alt_allele']].equals(dfm2_mis[['chromosome_number', 'ref_allele', 'alt_allele']]): + print('PASS: Further cross checks successful') +else: + sys.exit('FAIL: Second cross check unsuccessfull!') + +#%% extract mut info into three cols +orig_len = len(dfm2_mis.columns) +print('Dim of df to add cols to:', dfm2_mis.shape) + +# column names already present, wrap this in a if and perform sanity check +ncols_add = 0 +if not 'wild_type' in dfm2_mis.columns: + print('Extracting and adding column: wild_type' + , '\n======================================') + dfm2_mis['wild_type'] = dfm2_mis['mut_info'].str.extract('(\w{1})>') + ncols_add+=1 + +if not 'position' in dfm2_mis.columns: + print('Extracting and adding column: position' + , '\n======================================') + dfm2_mis['position'] = dfm2_mis['mut_info'].str.extract('(\d+)') + ncols_add+=1 + +if not 'mutant_type' in dfm2_mis.columns: + print('Extracting and adding column: mutant_type' + , '\n======================================') + dfm2_mis['mutant_type'] = dfm2_mis['mut_info'].str.extract('>\d+(\w{1})') + ncols_add+=1 + +if not 'mutationinformation' in dfm2_mis.columns: + print('combining to create column: mutationinformation' + , '\n======================================') + dfm2_mis['mutationinformation'] = dfm2_mis['wild_type'] + dfm2_mis['position'] + dfm2_mis['mutant_type'] + ncols_add+=1 + +print('No. of cols added:', ncols_add) + + +if len(dfm2_mis.columns) == orig_len + ncols_add: + print('PASS: mcsm style muts present in df' + , '\n======================================') +else: + print('FAIL: No. of cols mismatch' + ,'\nOriginal length:', orig_len + , '\nExpected no. of cols:', orig_len + ncols_add + , '\nGot:', len(dfm2_mis.columns)) + sys.exit() + +del(ncols_add) +#%% formatting data for output +print('no of cols pre-formatting data:', len(dfm2_mis.columns) + , '\n======================================') +#1) Add column: OR for kinship calculated from beta coeff +print('converting beta coeff to OR by exponent function\n:' + , dfm2_mis['beta'].head() + , '\n======================================') +dfm2_mis['or_kin'] = np.exp(dfm2_mis['beta']) +print(dfm2_mis['or_kin'].head()) +print('No. of cols after adidng OR kinship:', len(dfm2_mis.columns)) + +#2) rename af column +dfm2_mis.rename(columns = {'af': 'af_kin' + , 'beta': 'beta_kin' + , 'p_wald': 'pwald_kin' + , 'se': 'se_kin', 'logl_H1': 'logl_H1_kin' + , 'l_remle': 'l_remle_kin' + , 'ref_allele1': 'reference_allele'}, inplace = True) + +# FIXME: should be at source +# checking tot_diff column +#print((dfm2_mis['tot_diff']==1).all()) and remove these cols + +#3) drop some not required cols (including duplicate if you want) + #3a) drop duplicate columns +dfm2_mis2 = dfm2_mis.T.drop_duplicates().T #changes dtypes in cols, so not used +dup_cols = set(dfm2_mis.columns).difference(dfm2_mis2.columns) +print('Total no of duplciate columns:', len(dup_cols) + , '\nDuplicate columns identified:', dup_cols) +#dup_cols = {'alt_allele0', 'ps'} # didn't want to remove tot_diff + +del(dfm2_mis2) + +#print('removing duplicate columns: kept one of the dup_cols i.e tot_diff') +print('Removing duplicate columns' + , '\nOriginal dim:', dfm2_mis.shape) +dfm2_mis.drop(list(dup_cols), axis = 1, inplace = True) +print('\nRevised dim:', dfm2_mis.shape, 'after removing', len(dup_cols), 'columns') + + #3b) other not useful columns +cols_to_drop = ['chromosome_text', 'n_diff', 'chr', 'symbol', '_merge' ] + +dfm2_mis.drop(cols_to_drop, axis = 1, inplace = True) +#dfm2_mis.rename(columns = {'ref_allele1': 'reference_allele'}, inplace = True) + +print('Dim after dropping', len(cols_to_drop), 'columns:', dfm2_mis.shape) +#print(dfm2_mis.columns) + +#4) reorder columns +#orkin_linked = dfm2_mis.copy() + +orkin_linked = dfm2_mis[['mutation', + 'mutationinformation', + 'wild_type', + 'position', + 'mutant_type', + 'chr_num_allele', + 'ref_allele', + 'alt_allele', + 'mut_info', + 'mut_type', + 'gene_id', + 'gene_number', + 'mut_region', + 'reference_allele', + 'alternate_allele', + 'chromosome_number', + #'afs + 'af_kin', + 'or_kin', +# 'ors_logistic', +# 'ors_chi_cus', +# 'ors_fisher', + 'pwald_kin', +# 'pvals_logistic', +# 'pvals_fisher', +# 'ci_lb_fisher', +# 'ci_ub_fisher' , + 'beta_kin', + 'se_kin', + 'logl_H1_kin', + 'l_remle_kin', +# 'stat_chi', +# 'pvals_chi', +# 'n_diff', +# 'tot_diff', + 'n_miss', + 'wt_3let', + 'mt_3let']] + + +# sanity check after reassigning columns +if orkin_linked.shape == dfm2_mis.shape and set(orkin_linked.columns) == set(dfm2_mis.columns): + print('PASS: Successfully formatted df with rearranged columns') +else: + sys.exit('FAIL: something went wrong when rearranging columns!') + +#%% write file +print('\n=====================================================================' + , '\nWriting output file:\n', outfile_or_kin + , '\nNo.of rows:', len(dfm2_mis) + , '\nNo. of cols:', len(dfm2_mis.columns)) +orkin_linked.to_csv(outfile_or_kin, index = False) + +#%% diff b/w allele0 and 1: or_df +#https://stackoverflow.com/questions/40348541/pandas-diff-with-string +#df = or_df.iloc[[5, 15, 17, 19, 34]] +#df[['alt_allele0','ref_allele1']].ne(df[['alt_allele0','ref_allele1']].shift()).any(axis=1).astype(int) +#df[['alt_allele0','ref_allele1']].ne(df[['alt_allele0','ref_allele1']].shift()).any(axis=1).astype(int) +