diff --git a/scripts/AF_and_OR_calcs.R b/scripts/af_or_calcs.R similarity index 100% rename from scripts/AF_and_OR_calcs.R rename to scripts/af_or_calcs.R diff --git a/scripts/af_or_calcs_scratch.R b/scripts/af_or_calcs_scratch.R new file mode 100644 index 0000000..170532e --- /dev/null +++ b/scripts/af_or_calcs_scratch.R @@ -0,0 +1,385 @@ +######################################################### +# TASK: To compare OR from master data +# chisq, fisher test and logistic and adjusted logistic +######################################################### +getwd() +setwd('~/git/LSHTM_analysis/scripts') +getwd() + +#install.packages("logistf") +library(logistf) +######################################################### +#%% variable assignment: input and output paths & filenames +drug = 'pyrazinamide' +gene = 'pncA' +gene_match = paste0(gene,'_p.') +cat(gene_match) + +#=========== +# input and output dirs +#=========== +datadir = paste0('~/git/Data') +indir = paste0(datadir, '/', drug, '/', 'input') +outdir = paste0(datadir, '/', drug, '/', 'output') + +#=========== +# input and output files +#=========== +in_filename = 'original_tanushree_data_v2.csv' +#in_filename = 'mtb_gwas_v3.csv' +infile = paste0(datadir, '/', in_filename) +cat(paste0('Reading infile1: raw data', ' ', infile) ) + +# infile2: _gene associated meta data file to extract valid snps and add calcs to. +# This is outfile3 from data_extraction.py +in_filename_metadata = paste0(tolower(gene), '_metadata.csv') +infile_metadata = paste0(outdir, '/', in_filename_metadata) +cat(paste0('Reading infile2: gene associated metadata:', infile_metadata)) + +#=========== +# output +#=========== +out_filename = paste0(tolower(gene),'_', 'meta_data_with_AF_OR.csv') +outfile = paste0(outdir, '/', out_filename) +cat(paste0('Output file with full path:', outfile)) +#%% end of variable assignment for input and output files + +######################################################### +# 1: Read master/raw data stored in Data/ +##################################################### + +#=============== +# Step 1: read raw data (all remove entries with NA in pza column) +#=============== +raw_data_all = read.csv(infile, stringsAsFactors = F) + +# building cols to extract +dr_muts_col = paste0('dr_mutations_', drug) +other_muts_col = paste0('other_mutations_', drug) + +cat('Extracting columns based on variables:\n' + , drug + , '\n' + , dr_muts_col + , '\n' + , other_muts_col + , '\n===============================================================') + +raw_data = raw_data_all[,c("id" + , drug + , dr_muts_col + , other_muts_col)] +rm(raw_data_all) + +rm(indir, in_filename, infile) + +#=========== +# 1a: exclude na +#=========== +raw_data = raw_data[!is.na(raw_data[[drug]]),] + +total_samples = length(unique(raw_data$id)) +cat(paste0('Total samples without NA in', ' ', drug, 'is:', total_samples)) + +# sanity check: should be true +is.numeric(total_samples) + +#=========== +# 1b: combine the two mutation columns +#=========== +#raw_data$all_mutations_pyrazinamide = paste(raw_data$dr_mutations_pyrazinamide, raw_data$other_mutations_pyrazinamide) + +all_muts_colname = paste0('all_mutations_', drug) +raw_data[[all_muts_colname]] = paste(raw_data[[dr_muts_col]], raw_data[[other_muts_col]]) +head(raw_data[[all_muts_colname]]) + +#=========== +# 1c: create yet another column that contains all the mutations but in lower case +#=========== +head(raw_data[[all_muts_colname]]) +raw_data$all_muts_gene = tolower(raw_data[[all_muts_colname]]) +head(raw_data$all_muts_gene) + +# sanity checks +#table(grepl("gene_p",raw_data$all_muts_gene)) +cat(paste0('converting gene match:', gene_match, ' ', 'to lowercase')) +gene_match = tolower(gene_match) + +table(grepl(gene_match,raw_data$all_muts_gene)) + +# sanity check +if(sum(table(grepl(gene_match, raw_data$all_muts_gene))) == total_samples){ + cat('PASS: Total no. of samples match') +} else{ + cat('FAIL: No. of samples mismatch') +} + +######################################################### +# 2: Read valid snps for which OR +# can be calculated +######################################################### +cat(paste0('Reading metadata infile:', infile_metadata)) + +gene_metadata = read.csv(infile_metadata + #, file.choose() + , stringsAsFactors = F + , header = T) + + +# clear variables +rm(in_filename_metadata, infile_metadata) + +# count na in pyrazinamide column +tot_pza_na = sum(is.na(gene_metadata$pyrazinamide)) +expected_rows = nrow(gene_metadata) - tot_pza_na + +# drop na from the pyrazinamide colum +gene_snps_or = gene_metadata[!is.na(gene_metadata[[drug]]),] + +# sanity check +if(nrow(gene_snps_or) == expected_rows){ + cat('PASS: no. of rows match with expected_rows') +} else{ + cat('FAIL: nrows mismatch.') +} + +# extract unique snps to iterate over for AF and OR calcs +gene_snps_unique = unique(gene_snps_or$mutation) + +cat(paste0('Total no. of distinct comp snps to perform OR calcs: ', length(gene_snps_unique))) + +#===================================== +#OR calcs using the following 4 +#1) chisq.test +#2) fisher +#3) modified chisq.test +#4) logistic +#5) adjusted logistic? +#6) kinship (separate script) + +#====================================== + +################# modified chisq OR +# Define OR function +#x = as.numeric(mut) +#y = dst +my_chisq_or = function(x,y){ + tab = as.matrix(table(x,y)) + a = tab[2,2] + if (a==0){ a<-0.5} + b = tab[2,1] + if (b==0){ b<-0.5} + c = tab[1,2] + if (c==0){ c<-0.5} + d = tab[1,1] + if (d==0){ d<-0.5} + (a/b)/(c/d) + +} + +#======================== +# TEST WITH ONE + +i = "pnca_p.trp68gly" +i = "pnca_p.gln10pro" +i = "pnca_p.leu159arg" + +# IV +table(grepl(i,raw_data$all_muts_gene)) +mut = grepl(i,raw_data$all_muts_gene) + +# DV +#dst = raw_data$pyrazinamide +dst = raw_data[[drug]] # or raw_data[,drug] + +# 2X2 table +table(mut, dst) + +# CV +#c = raw_data$id[mut] +c = raw_data$id[grepl(i,raw_data$all_muts_gene)] +#sid = grepl(raw_data$id[mut], raw_data$id) # warning +#argument 'pattern' has length > 1 and only the first element will be used +#grepl(raw_data$id=="ERR2512440", raw_data$id) + +sid = grepl(paste(c,collapse="|"), raw_data$id) +table(sid) + +# 3X2 table +table(mut, dst, sid) + +#============================ +# compare OR +chisq.test(table(mut,dst)) +fisher.test(table(mut, dst)) +fisher.test(table(mut, dst))$p.value +fisher.test(table(mut, dst))$estimate +my_chisq_or(mut,dst) + +# logistic or +summary(model<-glm(dst ~ mut, family = binomial)) +or_logistic = exp(summary(model)$coefficients[2,1]); print(or_logistic) +pval_logistic = summary(model)$coefficients[2,4]; print(pval_logistic) + +# adjusted logistic or +summary(model2<-glm(dst ~ mut + sid, family = binomial)) +or_logistic2 = exp(summary(model2)$coefficients[2,1]); print(or_logistic2) +pval_logistic2 = summary(model2)$coefficients[2,4]; print(pval_logistic2) + +#========================= + +ors = sapply(gene_snps_unique,function(m){ + mut = grepl(m,raw_data$all_muts_gene) + my_chisq_or(mut,dst) +}) + +ors + +pvals = sapply(gene_snps_unique,function(m){ + mut = grepl(m,raw_data$all_muts_gene) + fisher.test(mut,dst)$p.value +}) + +pvals + +afs = sapply(gene_snps_unique,function(m){ + mut = grepl(m,raw_data$all_muts_gene) + mean(mut) +}) + +afs + +# logistic +logistic_ors = sapply(gene_snps_unique,function(m){ + mut = grepl(m,raw_data$all_muts_gene) + model<-glm(dst ~ mut, family = binomial) + or_logistic = exp(summary(model)$coefficients[2,1]) + #pval_logistic = summary(model)$coefficients[2,4] +}) +logistic_ors + + +# logistic adj # Doesn't seem to make a difference +logistic_ors2 = sapply(gene_snps_unique,function(m){ + mut = grepl(m,raw_data$all_muts_gene) + c = raw_data$id[mut] + sid = grepl(paste(c,collapse="|"), raw_data$id) + model2<-glm(dst ~ mut + sid, family = binomial) + or_logistic2 = exp(summary(model2)$coefficients[2,1]) + #pval_logistic2 = summary(model2)$coefficients[2,4] +}) + +logistic_ors2 + +or_logistic2; pval_logistic2 + + +head(logistic_ors) +#==================================== + +# logistic +summary(model<-glm(dst ~ mut + , family = binomial + #, control = glm.control(maxit = 1) + #, options(warn = 1) + )) +or_logistic_maxit = exp(summary(model)$coefficients[2,1]); print(or_logistic_maxit) +pval_logistic_maxit = summary(model)$coefficients[2,4]; print(pval_logistic_maxit) + + +##################### +# iterate: subset +##################### + +snps_test = c("pnca_p.trp68gly", "pnca_p.leu4ser", "pnca_p.leu159arg","pnca_p.his57arg" ) + +data = snps_test[1:2] + +data + + + + +################# start loop +for (i in data){ + + print(i) + + # IV + #mut<-as.numeric(grepl(i,raw_data$all_muts_gene)) + mut = grepl(i,raw_data$all_muts_gene) + table(mut) + + # DV + #dst<-as.numeric(raw_data[[drug]]) + dst = raw_data[[drug]] + + # table + print(table(dst, mut)) + + + #===================== + # logistic regression, glm.control(maxit = n) + #https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression + #===================== + #n = 1 + summary(model<-glm(dst ~ mut + , family = binomial + #, control = glm.control(maxit = 1) + #, options(warn = 1) + )) + or_logistic_maxit = exp(summary(model)$coefficients[2,1]); print(or_logistic_maxit) + pval_logistic_maxit = summary(model)$coefficients[2,4]; print(pval_logistic_maxit) + + #===================== + # fishers test + #===================== + #attributes(fisher.test(table(dst, mut))) + or_fisher = fisher.test(table(dst, mut))$estimate + or_fisher = or_fisher[[1]]; or_fisher + + pval_fisher = fisher.test(table(dst, mut))$p.value ; pval_fisher + + #===================== + # chi square + #===================== + #chisq.test(y = dst, x = mut) + #attributes(chisq.test(table(dst, mut))) + est_chisq = chisq.test(table(dst, mut))$statistic + est_chisq = est_chisq[[1]]; est_chisq + + pval_chisq = chisq.test(table(dst, mut))$p.value; pval_chisq + + # all output + writeLines(c(paste0("mutation:", i) + , paste0("=========================") + , paste0("OR_logistic_maxit:", or_logistic_maxit,"--->", "P-val_logistic_maxit:", pval_logistic_maxit ) + , paste0("OR_fisher:", or_fisher, "--->","P-val_fisher:", pval_fisher ) + , paste0("Chi_sq_estimate:", est_chisq, "--->","P-val_chisq:", pval_chisq))) + + +} + + +i = "gene_p.leu159arg" + +mut<-as.numeric(grepl(i,raw_data$all_muts_pza)) +# DV +dst<-as.numeric(raw_data$pyrazinamide) +# tablehttps://mail.google.com/mail/?tab=rm&ogbl +table(dst, mut) + + #===================== + # fishers test + #===================== + #attributes(fisher.test(table(dst, mut))) + or_fisher = fisher.test(table(dst, mut))$estimate + or_fisher = or_fisher[[1]]; or_fisher + + pval_fisher = fisher.test(table(dst, mut))$p.value ; pval_fisher + + + # https://stats.stackexchange.com/questions/259635/what-is-the-difference-using-a-fishers-exact-test-vs-a-logistic-regression-for + exact2x2(table(dst, mut),tsmethod="central") + + \ No newline at end of file diff --git a/scripts/or_kinship_link.py b/scripts/or_kinship_link.py new file mode 100755 index 0000000..560023f --- /dev/null +++ b/scripts/or_kinship_link.py @@ -0,0 +1,261 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Wed Jun 10 11:13:49 2020 + +@author: tanu +""" +#%% useful links + +#https://chrisalbon.com/python/data_wrangling/pandas_join_merge_dataframe/ +#https://kanoki.org/2019/11/12/how-to-use-regex-in-pandas/ +#https://stackoverflow.com/questions/40348541/pandas-diff-with-string +#%% +import os, sys +import pandas as pd +#import numpy as np +import re +from find_missense import find_missense +import argparse + +#%% +# homedir +homedir = os.path.expanduser('~') +#os.chdir(homedir + '/git/Misc/jody_pza') +#======================================================================= +#%% command line args +arg_parser = argparse.ArgumentParser() +#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide') +#arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive +arg_parser.add_argument('-d', '--drug', help = 'drug name', default = None) +arg_parser.add_argument('-g', '--gene', help = 'gene name (case sensitive)', default = None) # case sensitive + +#arg_parser.add_argument('-p', '--outpath', help = 'output path', default = outpath) +#arg_parser.add_argument('-o', '--outfile', help = 'output filename', default = outfile_or_kin) + +arg_parser.add_argument('-s', '--start_coord', help = 'start of coding region (cds) of gene', default = 2288681) # pnca cds +arg_parser.add_argument('-e', '--end_coord', help = 'end of coding region (cds) of gene', default = 2289241) # pnca cds + +args = arg_parser.parse_args() +#======================================================================= +#%% variables +#or_file +#info_file +#short_info_file +#gene = 'pncA' +#drug = 'pyrazinamide' +start_cds = args.start_coord +end_cds = args.end_coord + +gene = args.gene +drug = args.drug +#======================================================================= +#%% input and output dirs and files +#======= +# data dir +#======= +datadir = homedir + '/' + 'git/Data' +indir = datadir + '/' + drug + '/input' +outdir = datadir + '/' + drug + '/output' + +#======= +# input +#======= + +info_filename = 'snp_info.txt' +snp_info = datadir + '/' + info_filename +print('Info file: ', snp_info + , '\n============================================================') + + +gene_info_filename = 'ns'+ gene.lower()+ '_snp_info.txt' +gene_info = indir + '/' + gene_info_filename +print('gene info file: ', gene_info + , '\n============================================================') + + +in_filename_or = 'ns'+ gene.lower()+ '_assoc.txt' +gene_or = indir + '/' + in_filename_or +print('gene OR file: ', gene_or + , '\n============================================================') + +#======= +# output +#======= +gene_or_filename = gene.lower() + '_AF_OR_kinship.csv' # other one is called AFandOR +outfile_or_kin = outdir + '/' + gene_or_filename +print('Output file: ', outfile_or_kin + , '\n============================================================') + +#%% read files: preformatted using bash +# or file: '...assoc.txt' +or_df = pd.read_csv(gene_or, sep = '\t', header = 0, index_col = False) # 182, 12 (without filtering for missense muts, it was 212 i.e we 30 muts weren't missense) +or_df.head() +or_df.columns +#%% snp_info file: master and gene specific ones + +# gene info +#info_df2 = pd.read_csv('nssnp_info_pnca.txt', sep = '\t', header = 0) #303, 10 +info_df2 = pd.read_csv(gene_info, sep = '\t', header = 0) #303, 10 +mis_mut_cover = (info_df2['chromosome_number'].nunique()/info_df2['chromosome_number'].count()) * 100 +print('*****RESULT*****' + , '\nPercentage of missense mut in pncA:', mis_mut_cover + , '\n*****RESULT*****') #65.7% + +# large file +#info_df = pd.read_csv('snp_info.txt', sep = '\t', header = None) #12010 +info_df = pd.read_csv(snp_info, sep = '\t') #12010 +#info_df.columns = ['chromosome_number', 'ref_allele', 'alt_allele', 'snp_info'] #12009, 4 + +info_df['chromosome_number'].nunique() #10257 +mut_cover = (info_df['chromosome_number'].nunique()/info_df['chromosome_number'].count()) * 100 +print('*****RESULT*****' + ,'\nPercentage of mutations in pncA:', mut_cover + , '\n*****RESULT*****') #85.4% + +# extract unique chr position numbers +genomic_pos = info_df['chromosome_number'].unique() +genomic_pos_df = pd.DataFrame(genomic_pos, columns = ['chr_pos']) +genomic_pos_df.dtypes + +genomic_pos_min = info_df['chromosome_number'].min() +genomic_pos_max = info_df['chromosome_number'].max() + +# genomic coord for pnca coding region +#start_cds = 2288681 +#end_cds = 2289241 +cds_len = (end_cds-start_cds) + 1 +pred_prot_len = (cds_len/3) - 1 + +# mindblowing: difference b/w bitwise (&) and 'and' +# DO NOT want &: is this bit set to '1' in both variables? Is this what you want? +#if (genomic_pos_min <= start_cds) & (genomic_pos_max >= end_cds): +print('*****RESULT*****' + , '\nlength of coding region:', cds_len, 'bp' + , '\npredicted protein length:', pred_prot_len, 'aa' + , '\n*****RESULT*****') + +if genomic_pos_min <= start_cds and genomic_pos_max >= end_cds: + print ('PASS: coding region for gene included in snp_info.txt') +else: + print('FAIL: coding region for gene not included in info file snp_info.txt') + +#%% Extracting ref allele and alt allele as single letters +# info_df has some of these params as more than a single letter, which means that +# when you try to merge ONLY using chromosome_number, then it messes up... and is WRONG. +# Hence the merge needs to be performed on a unique set of attributes which in our case +# would be chromosome_number, ref_allele and alt_allele + +#FIXME: Turn to a function +orig_len = len(or_df.columns) + +#find_missense(or_df, 'ref_allele1', 'alt_allele0') +find_missense(or_df, ref_allele_column = 'ref_allele1', alt_allele_column = 'alt_allele0') + +ncols_add = 4 +if len(or_df.columns) == orig_len + ncols_add: + print('PASS: Succesfuly extracted ref and alt alleles for missense muts') +else: + print('FAIL: No. of cols mismatch' + ,'\noriginal length:', orig_len + , '\nExpected no. of cols:', orig_len + ncols_add + , '\nGot no. of cols:', len(or_df.columns)) + sys.exit() +del(orig_len, ncols_add) + +#%% TRY MERGE +# check dtypes +or_df.dtypes +info_df.dtypes +or_df.info() + +# pandas documentation where it mentions: "Pandas uses the object dtype for storing strings" +# check how many unique chr_num in info_df are in or_df +genomic_pos_df['chr_pos'].isin(or_df['chromosome_number']).sum() #144 + +# check how many chr_num in or_df are in info_df: should be ALL of them +or_df['chromosome_number'].isin(genomic_pos_df['chr_pos']).sum() #182 + +# sanity check 2 +if or_df['chromosome_number'].isin(genomic_pos_df['chr_pos']).sum() == len(or_df): + print('PASS: all genomic locs in or_df have meta datain info.txt') +else: + print('FAIL: some genomic locs or_df chr number DO NOT have meta data in snp_info.txt') + +#%% Perform merge + +#join_type = 'inner' +#join_type = 'outer' +join_type = 'left' +#join_type = 'right' + +#dfm1 = pd.merge(or_df, info_df, on ='chromosome_number', how = join_type, indicator = True) # not unique! +dfm1 = pd.merge(or_df, info_df, on = ['chromosome_number', 'ref_allele', 'alt_allele'], how = join_type, indicator = True) +dfm1['_merge'].value_counts() + +# count no. of missense mutations ONLY +dfm1.snp_info.str.count(r'(missense.*)').sum() + +dfm2 = pd.merge(or_df, info_df2, on = ['chromosome_number', 'ref_allele', 'alt_allele'], how = join_type, indicator = True) +dfm2['_merge'].value_counts() + +# count no. of nan +dfm2['mut_type'].isna().sum() + +# drop nan +dfm2_mis = dfm2[dfm2['mut_type'].notnull()] + +#%% sanity check +# count no. of missense muts +if len(dfm1) - dfm1.snp_info.str.count(r'(missense.*)').sum() == dfm2['mut_type'].isna().sum(): + print('PASSED: numbers cross checked' + , '\nTotal no. of missense mutations:', dfm1.snp_info.str.count(r'(missense.*)').sum() + , '\nNo. of mutations falsely assumed to be missense:', len(dfm1) - dfm1.snp_info.str.count(r'(missense.*)').sum()) + +# two ways to filter to get only missense muts +test = dfm1[dfm1['snp_info'].str.count('missense.*')>0] +dfm1_mis = dfm1[dfm1['snp_info'].str.match('(missense.*)') == True] +test.equals(dfm1_mis) + +# drop nan +dfm2_mis = dfm2[dfm2['mut_type'].notnull()] + +if dfm1_mis[['chromosome_number', 'ref_allele', 'alt_allele']].equals(dfm2_mis[['chromosome_number', 'ref_allele', 'alt_allele']]): + print('PASS: Further cross checks successful') +else: + print('FAIL: Second cross check unsuccessfull. Debug please!') + sys.exit() + +orig_len = len(dfm2_mis.columns) + +#%% extract mut info into three cols +dfm2_mis['wild_type'] = dfm2_mis['mut_info'].str.extract('(\w{1})>') +dfm2_mis['position'] = dfm2_mis['mut_info'].str.extract('(\d+)') +dfm2_mis['mutant_type'] = dfm2_mis['mut_info'].str.extract('>\d+(\w{1})') + +dfm2_mis['mutationinformation'] = dfm2_mis['wild_type'] + dfm2_mis['position'] + dfm2_mis['mutant_type'] + +# sanity check +ncols_add = 4 +if len(dfm2_mis.columns) == orig_len + ncols_add: + print('PASS: Succesfully extracted and added mutationinformation(mcsm style)') +else: + print('FAIL: No. of cols mismatch' + ,'\noriginal length:', orig_len + , '\nExpected no. of cols:', orig_len + ncols_add + , '\nGot no. of cols:', len(dfm2_mis.columns)) + sys.exit() + + +#%% write file +print('Writing output file:\n', outfile_or_kin + , '\nNo.of rows:', len(dfm2_mis) + , '\nNo. of cols:', len(dfm2_mis.columns)) +dfm2_mis.to_csv(outfile_or_kin, index = False) + +#%% diff b/w allele0 and 1: or_df +#https://stackoverflow.com/questions/40348541/pandas-diff-with-string +#df = or_df.iloc[[5, 15, 17, 19, 34]] +#df[['alt_allele0','ref_allele1']].ne(df[['alt_allele0','ref_allele1']].shift()).any(axis=1).astype(int) +#df[['alt_allele0','ref_allele1']].ne(df[['alt_allele0','ref_allele1']].shift()).any(axis=1).astype(int) +