renamed files & added or kinship link file

2020-06-19 10:33:26 +01:00 · 2020-06-19 10:33:26 +01:00 · 07258120de
commit 07258120de
parent c36197d75e
3 changed files with 646 additions and 0 deletions
--- a/scripts/af_or_calcs_scratch.R
+++ b/scripts/af_or_calcs_scratch.R
@ -0,0 +1,385 @@
+#########################################################
+# TASK: To compare OR from master data
+# chisq, fisher test and logistic and adjusted logistic
+#########################################################
+# Show the working directory, move to the scripts folder, then show it again.
+getwd()
+setwd("~/git/LSHTM_analysis/scripts")
+getwd()
+
+# logistf has to be installed beforehand: install.packages("logistf")
+library(logistf)
+#########################################################
+#%% variable assignment: input and output paths & filenames
+drug <- "pyrazinamide"
+gene <- "pncA"
+# prefix used to recognise mutations of this gene in the mutation strings
+gene_match <- paste0(gene, "_p.")
+cat(gene_match)
+
+#===========
+# input and output dirs
+#===========
+datadir <- "~/git/Data"
+indir <- paste(datadir, drug, "input", sep = "/")
+outdir <- paste(datadir, drug, "output", sep = "/")
+
+#===========
+# input and output files
+#===========
+# infile1: raw master data, read straight from datadir
+# (alternative file name: 'mtb_gwas_v3.csv')
+in_filename <- "original_tanushree_data_v2.csv"
+infile <- paste(datadir, in_filename, sep = "/")
+cat("Reading infile1: raw data ", infile, sep = "")
+
+# infile2: gene associated metadata file, used to extract the valid snps and
+# to add the calcs to. This is outfile3 from data_extraction.py
+in_filename_metadata <- paste0(tolower(gene), "_metadata.csv")
+infile_metadata <- paste(outdir, in_filename_metadata, sep = "/")
+cat("Reading infile2: gene associated metadata:", infile_metadata, sep = "")
+
+#===========
+# output
+#===========
+out_filename <- paste0(tolower(gene), "_", "meta_data_with_AF_OR.csv")
+outfile <- paste(outdir, out_filename, sep = "/")
+cat("Output file with full path:", outfile, sep = "")
+#%% end of variable assignment for input and output files
+
+#########################################################
+# 1: Read master/raw data stored in Data/
+#####################################################
+
+#===============
+# Step 1: read raw data and keep only the id, drug and mutation columns
+# (rows with NA in the drug column are removed in step 1a)
+#===============
+raw_data_all <- read.csv(infile, stringsAsFactors = FALSE)
+
+# names of the drug-specific mutation columns to extract
+dr_muts_col <- paste0("dr_mutations_", drug)
+other_muts_col <- paste0("other_mutations_", drug)
+
+cat("Extracting columns based on variables:\n", drug, "\n", dr_muts_col, "\n",
+    other_muts_col,
+    "\n===============================================================")
+
+raw_data <- raw_data_all[, c("id", drug, dr_muts_col, other_muts_col)]
+
+# the full table and the input path variables are not needed any more
+rm(raw_data_all)
+rm(indir, in_filename, infile)
+
+#===========
+# 1a: exclude na
+#===========
+has_dst <- !is.na(raw_data[[drug]])
+raw_data <- raw_data[has_dst, ]
+rm(has_dst)
+
+total_samples <- length(unique(raw_data$id))
+cat(paste0("Total samples without NA in", " ", drug, "is:", total_samples))
+
+# sanity check: should be true
+is.numeric(total_samples)
+
+#===========
+# 1b: combine the two mutation columns
+#===========
+# column name is built from the drug, e.g. all_mutations_pyrazinamide
+all_muts_colname <- paste0("all_mutations_", drug)
+raw_data[[all_muts_colname]] <- paste(raw_data[[dr_muts_col]],
+                                      raw_data[[other_muts_col]])
+head(raw_data[[all_muts_colname]])
+
+#===========
+# 1c: create yet another column that contains all the mutations but in lower case
+#===========
+head(raw_data[[all_muts_colname]])
+raw_data$all_muts_gene <- tolower(raw_data[[all_muts_colname]])
+head(raw_data$all_muts_gene)
+
+# sanity checks: the match prefix has to be lower case as well
+cat(paste0("converting gene match:", gene_match, " ", "to lowercase"))
+gene_match <- tolower(gene_match)
+
+table(grepl(gene_match, raw_data$all_muts_gene))
+
+# sanity check: every row is counted as either match or no match
+cat(
+  if (sum(table(grepl(gene_match, raw_data$all_muts_gene))) == total_samples) {
+    "PASS: Total no. of samples match"
+  } else {
+    "FAIL: No. of samples mismatch"
+  }
+)
+
+#########################################################
+# 2: Read valid snps for which OR 
+# can be calculated
+#########################################################
+cat(paste0('Reading metadata infile:', infile_metadata))
+
+# gene associated metadata: one row per sample/mutation entry, with a
+# 'mutation' column and a column named after the drug
+gene_metadata <- read.csv(infile_metadata
+                          #, file.choose()
+                          , stringsAsFactors = FALSE
+                          , header = TRUE)
+
+
+# clear variables
+rm(in_filename_metadata, infile_metadata)
+
+# count na in the drug column; the column is looked up through 'drug' (not
+# hard-coded) so the count always refers to the same column that is filtered
+# below
+tot_pza_na <- sum(is.na(gene_metadata[[drug]]))
+expected_rows <- nrow(gene_metadata) - tot_pza_na
+
+# drop na from the drug column
+gene_snps_or <- gene_metadata[!is.na(gene_metadata[[drug]]),]
+
+# sanity check
+if (nrow(gene_snps_or) == expected_rows) {
+  cat('PASS: no. of rows match with expected_rows')
+} else {
+  cat('FAIL: nrows mismatch.')
+}
+
+# extract unique snps to iterate over for AF and OR calcs
+gene_snps_unique <- unique(gene_snps_or$mutation)
+
+cat(paste0('Total no. of distinct comp snps to perform OR calcs: ', length(gene_snps_unique)))
+
+#=====================================
+#OR calcs using the following methods
+#1) chisq.test
+#2) fisher
+#3) modified chisq.test
+#4) logistic
+#5) adjusted logistic?
+#6) kinship (separate script)
+
+#======================================
+
+################# modified chisq OR
+# Define OR function
+#x = as.numeric(mut)
+#y = dst
+# Odds ratio from the 2x2 cross-tabulation of x by y, with every empty cell
+# replaced by 0.5 so that the ratio stays finite.
+#
+# x: exposure indicator (logical, or numeric 0/1), e.g. mutation present
+# y: outcome indicator (logical, or numeric 0/1), e.g. dst result
+# Returns (a/b)/(c/d), where a = count(x high, y high), b = count(x high, y low),
+# c = count(x low, y high), d = count(x low, y low).
+#
+# Logical and 0/1 inputs are tabulated with both levels fixed, so a variable
+# that only takes one value (e.g. a mutation that no sample carries) still
+# yields a 2x2 table instead of a 'subscript out of bounds' error.
+my_chisq_or = function(x,y){
+  # turn an indicator into a factor that always has both of its levels
+  as_binary <- function(v) {
+    if (is.logical(v)) {
+      factor(v, levels = c(FALSE, TRUE))
+    } else if (is.numeric(v) && all(v %in% c(0, 1, NA))) {
+      factor(v, levels = c(0, 1))
+    } else {
+      # anything else is tabulated on its observed values, as before
+      factor(v)
+    }
+  }
+
+  tab <- as.matrix(table(as_binary(x), as_binary(y)))
+  if (any(dim(tab) < 2)) {
+    stop("my_chisq_or: x and y must each have two levels", call. = FALSE)
+  }
+
+  # cells in the order a, b, c, d; empty cells become 0.5
+  cells <- c(tab[2,2], tab[2,1], tab[1,2], tab[1,1])
+  cells[cells == 0] <- 0.5
+
+  (cells[1]/cells[2])/(cells[3]/cells[4])
+}
+
+#========================
+# TEST WITH ONE
+
+# mutation to test; swap in one of the alternatives to try another snp
+#i <- "pnca_p.trp68gly"
+#i <- "pnca_p.gln10pro"
+i <- "pnca_p.leu159arg"
+
+# IV: TRUE for every row whose lower-cased mutation string matches i
+table(grepl(i,raw_data$all_muts_gene))
+mut <- grepl(i,raw_data$all_muts_gene)
+
+# DV: value of the drug column for every row
+#dst = raw_data$pyrazinamide
+dst <- raw_data[[drug]] # or raw_data[,drug]
+
+# 2X2 table
+table(mut, dst)
+
+# CV: flag the rows whose id belongs to a sample carrying the mutation.
+# Ids are matched exactly with %in%. Matching them with grepl() on a pasted
+# 'id1|id2|...' pattern flags every row when no sample carries the mutation
+# (the pattern is then empty) and also matches ids by substring.
+# NOTE(review): unless ids repeat across rows, sid is identical to mut --
+# confirm that this is the intended covariate.
+ids_with_mut <- raw_data$id[mut]
+sid <- raw_data$id %in% ids_with_mut
+table(sid)
+
+# three-way table: mut by dst, split by sid
+table(mut, dst, sid)
+
+#============================
+# compare OR
+chisq.test(table(mut,dst))
+fisher.test(table(mut, dst))
+fisher.test(table(mut, dst))$p.value
+fisher.test(table(mut, dst))$estimate
+my_chisq_or(mut,dst)
+
+# logistic or: row 2 of the coefficient table is the mut term,
+# column 1 the estimate (log odds) and column 4 the p-value
+model <- glm(dst ~ mut, family = binomial)
+summary(model)
+or_logistic <- exp(summary(model)$coefficients[2,1]); print(or_logistic)
+pval_logistic <- summary(model)$coefficients[2,4]; print(pval_logistic)
+
+# adjusted logistic or: same read-out with sid added as covariate
+model2 <- glm(dst ~ mut + sid, family = binomial)
+summary(model2)
+or_logistic2 <- exp(summary(model2)$coefficients[2,1]); print(or_logistic2)
+pval_logistic2 <- summary(model2)$coefficients[2,4]; print(pval_logistic2)
+
+#=========================
+
+# Per-snp statistics. Each call returns a numeric vector named by mutation;
+# vapply() is used so that the result is always a numeric vector, whatever
+# the number of snps.
+
+# modified chisq OR
+ors <- vapply(gene_snps_unique, function(m){
+  mut <- grepl(m,raw_data$all_muts_gene)
+  my_chisq_or(mut,dst)
+}, numeric(1))
+
+ors
+
+# fisher test p-value
+pvals <- vapply(gene_snps_unique, function(m){
+  mut <- grepl(m,raw_data$all_muts_gene)
+  fisher.test(mut,dst)$p.value
+}, numeric(1))
+
+pvals
+
+# allele frequency: fraction of rows whose mutation string matches the snp
+afs <- vapply(gene_snps_unique, function(m){
+  mut <- grepl(m,raw_data$all_muts_gene)
+  mean(mut)
+}, numeric(1))
+
+afs
+
+# logistic
+logistic_ors <- vapply(gene_snps_unique, function(m){
+  mut <- grepl(m,raw_data$all_muts_gene)
+  model <- glm(dst ~ mut, family = binomial)
+  # row 2 = mut term, column 1 = estimate; column 4 would be the p-value
+  exp(summary(model)$coefficients[2,1])
+}, numeric(1))
+logistic_ors
+
+
+# logistic adj # Doesn't seem to make a difference
+# NOTE(review): unless ids repeat across rows, sid is identical to mut, which
+# would explain why the adjustment changes nothing -- confirm.
+logistic_ors2 <- vapply(gene_snps_unique, function(m){
+  mut <- grepl(m,raw_data$all_muts_gene)
+  # exact id matching; a pasted 'id1|id2|...' regex flags every row when no
+  # sample carries the snp (empty pattern) and matches ids by substring
+  ids_with_mut <- raw_data$id[mut]
+  sid <- raw_data$id %in% ids_with_mut
+  model2 <- glm(dst ~ mut + sid, family = binomial)
+  exp(summary(model2)$coefficients[2,1])
+}, numeric(1))
+
+logistic_ors2
+
+# values from the single-snp test, for comparison
+or_logistic2; pval_logistic2
+
+
+head(logistic_ors)
+#====================================
+
+# logistic: plain fit for the current mut/dst
+# (variants left disabled: control = glm.control(maxit = 1), options(warn = 1))
+model <- glm(dst ~ mut, family = binomial)
+summary(model)
+
+# row 2 of the coefficient table is the mut term:
+# column 1 = estimate (log odds), column 4 = p-value
+or_logistic_maxit <- exp(coef(summary(model))[2, 1])
+print(or_logistic_maxit)
+pval_logistic_maxit <- coef(summary(model))[2, 4]
+print(pval_logistic_maxit)
+
+
+#####################
+# iterate: subset
+#####################
+
+snps_test <- c("pnca_p.trp68gly", "pnca_p.leu4ser",
+               "pnca_p.leu159arg", "pnca_p.his57arg")
+
+# only the first two snps are used for the trial loop
+data <- snps_test[c(1, 2)]
+
+data
+
+
+
+
+################# start loop
+# DV: does not depend on the snp, so it is set once before the loop
+#dst<-as.numeric(raw_data[[drug]])
+dst <- raw_data[[drug]]
+
+# For every snp in 'data': print the 2x2 table, then the logistic regression,
+# fisher and chi square results. Each test is run once and its result object
+# reused; bare expressions are not auto-printed inside a loop, so all values
+# are reported through print()/writeLines().
+for (i in data){
+
+  print(i)
+
+  # IV: TRUE for every row whose lower-cased mutation string matches i
+  #mut<-as.numeric(grepl(i,raw_data$all_muts_gene))
+  mut <- grepl(i,raw_data$all_muts_gene)
+
+  # table
+  print(table(dst, mut))
+
+
+  #=====================
+  # logistic regression (default glm control; glm.control(maxit = n) was
+  # tried for perfect separation and is left disabled)
+  #https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression
+  #=====================
+  model <- glm(dst ~ mut
+               , family = binomial
+               #, control = glm.control(maxit = 1)
+               #, options(warn = 1)
+               )
+  # row 2 = mut term; column 1 = estimate (log odds), column 4 = p-value
+  logit_coefs <- summary(model)$coefficients
+  or_logistic_maxit <- exp(logit_coefs[2,1]); print(or_logistic_maxit)
+  pval_logistic_maxit <- logit_coefs[2,4]; print(pval_logistic_maxit)
+
+  #=====================
+  # fishers test
+  #=====================
+  #attributes(fisher.test(table(dst, mut)))
+  fisher_res <- fisher.test(table(dst, mut))
+  or_fisher <- fisher_res$estimate[[1]]
+  pval_fisher <- fisher_res$p.value
+
+  #=====================
+  # chi square
+  #=====================
+  #chisq.test(y = dst, x = mut)
+  #attributes(chisq.test(table(dst, mut)))
+  chisq_res <- chisq.test(table(dst, mut))
+  est_chisq <- chisq_res$statistic[[1]]
+  pval_chisq <- chisq_res$p.value
+
+  # all output
+  writeLines(c(paste0("mutation:", i)
+               , paste0("=========================")
+               , paste0("OR_logistic_maxit:", or_logistic_maxit,"--->", "P-val_logistic_maxit:", pval_logistic_maxit )
+               , paste0("OR_fisher:", or_fisher, "--->","P-val_fisher:", pval_fisher )
+               , paste0("Chi_sq_estimate:", est_chisq, "--->","P-val_chisq:", pval_chisq)))
+
+
+}
+
+
+# Single snp check with numeric (0/1) coding of IV and DV.
+# The pattern is built from the lower-cased gene_match prefix so that it can
+# match the lower-cased mutation strings in all_muts_gene.
+i <- paste0(gene_match, "leu159arg")
+
+# IV: read from all_muts_gene, the lower-case mutation column created above
+mut <- as.numeric(grepl(i,raw_data$all_muts_gene))
+# DV: drug column looked up through 'drug', as in the rest of the script
+dst <- as.numeric(raw_data[[drug]])
+# table
+table(dst, mut)
+
+#=====================
+# fishers test
+#=====================
+#attributes(fisher.test(table(dst, mut)))
+fisher_res <- fisher.test(table(dst, mut))
+or_fisher <- fisher_res$estimate[[1]]; or_fisher
+
+pval_fisher <- fisher_res$p.value ; pval_fisher
+
+
+# https://stats.stackexchange.com/questions/259635/what-is-the-difference-using-a-fishers-exact-test-vs-a-logistic-regression-for
+# exact2x2 is not attached by this script, so the call is namespace-qualified
+# (the exact2x2 package has to be installed)
+exact2x2::exact2x2(table(dst, mut),tsmethod="central")
+  
+