import commit

2020-01-08 16:15:33 +00:00 · 2020-01-08 16:15:33 +00:00 · bccfe68192
commit bccfe68192
39 changed files with 6837 additions and 0 deletions
--- a/meta_data_analysis/pnca_AF_and_OR_calcs.R
+++ b/meta_data_analysis/pnca_AF_and_OR_calcs.R
@ -0,0 +1,241 @@
+# Show the working directory, switch to the analysis folder, then show it
+# again so the change is visible in the console output.
+getwd()
+setwd("/git/github/git/LSHTM_analysis/meta_data_analysis")
+getwd()
+
+#===============
+# Step 1: read GWAS raw data stored in Data_original/
+#===============
+# The input CSV is picked interactively through a file dialog.
+infile <- read.csv(file.choose(), stringsAsFactors = FALSE)
+
+# Keep only the sample id, the pyrazinamide column and the two mutation columns.
+raw_data <- infile[, c("id",
+                       "pyrazinamide",
+                       "dr_mutations_pyrazinamide",
+                       "other_mutations_pyrazinamide")]
+
+#####
+# 1a: exclude na
+#####
+# Drop every row whose pyrazinamide value is missing.
+raw_data <- raw_data[!is.na(raw_data[["pyrazinamide"]]), ]
+
+# Number of distinct sample ids left after the NA filter.
+total_samples <- length(unique(raw_data[["id"]]))
+print(total_samples)
+
+# sanity check: should be TRUE
+is.numeric(total_samples)
+
+#####
+# 1b: combine the two mutation columns
+#####
+# paste() joins the two columns with a single space.
+# NOTE(review): an NA in either column is pasted as the literal text "NA" --
+# confirm that this is intended.
+raw_data$all_mutations_pyrazinamide <- paste(raw_data$dr_mutations_pyrazinamide
+                                             , raw_data$other_mutations_pyrazinamide)
+head(raw_data$all_mutations_pyrazinamide)
+
+#####
+# 1c: create yet another column that contains all the mutations but in lower case
+#####
+raw_data$all_muts_pnca <- tolower(raw_data$all_mutations_pyrazinamide)
+
+# sanity checks
+table(grepl("pnca_p", raw_data$all_muts_pnca, fixed = TRUE))
+
+# sanity check: should be TRUE
+# (the table counts every row, so this holds only when each row has its own id)
+sum(table(grepl("pnca_p", raw_data$all_muts_pnca, fixed = TRUE))) == total_samples
+
+# set up variables: can be used for logistic regression as well
+# Mutation names are literal strings, so they are matched with fixed = TRUE;
+# as a regular expression the "." would match any character.
+i <- "pnca_p.ala134gly" # has a NA, should NOT exist
+table(grepl(i, raw_data$all_muts_pnca, fixed = TRUE))
+
+i <- "pnca_p.trp68gly"
+table(grepl(i, raw_data$all_muts_pnca, fixed = TRUE))
+
+# mut: TRUE for every sample whose mutation string contains i
+# dst: the pyrazinamide column of the same samples
+mut <- grepl(i, raw_data$all_muts_pnca, fixed = TRUE)
+dst <- raw_data$pyrazinamide
+table(mut, dst)
+
+#chisq.test(table(mut,dst))
+#fisher.test(table(mut, dst))
+#table(mut)
+
+###### read list of muts to calculate OR for (fname3 from pnca_data_extraction.py)
+# The file is chosen interactively; its first row is used as the header.
+pnca_snps_or <- read.csv(file.choose(), header = TRUE, stringsAsFactors = FALSE)
+
+# extract unique snps to iterate over for AF and OR calcs
+# total no of unique snps
+# AF and OR calculations
+
+pnca_snps_unique <- unique(pnca_snps_or[["mutation"]])
+
+# Define OR function
+# x and y are global copies of the current mut (as 0/1) and dst vectors; the
+# function defined next takes its own x and y arguments.
+x <- as.numeric(mut)
+y <- dst
+# Odds ratio from the 2x2 cross-tabulation of x (rows) against y (columns).
+#
+# With the levels of x and y in sorted order, the cells are
+#   a = tab[2, 2], b = tab[2, 1], c = tab[1, 2], d = tab[1, 1]
+# and the result is (a / b) / (c / d). Any cell that is 0 is replaced by 0.5
+# before dividing so the ratio stays finite.
+#
+# Args:
+#   x: vector with two distinct values (e.g. logical mutation presence).
+#   y: vector of the same length with two distinct values.
+# Returns:
+#   A single numeric odds ratio, or NA_real_ when x or y has fewer than two
+#   distinct values (the 2x2 table does not exist in that case).
+or <- function(x, y) {
+  tab <- table(x, y)
+
+  # table() only creates rows/columns for values that actually occur, so a
+  # constant x or y yields a table without a second row/column; indexing
+  # tab[2, 2] would then fail with "subscript out of bounds".
+  if (nrow(tab) < 2 || ncol(tab) < 2) {
+    return(NA_real_)
+  }
+
+  # Only the first two levels of each variable enter the calculation.
+  if (nrow(tab) > 2 || ncol(tab) > 2) {
+    warning("or(): x or y has more than two distinct values; "
+            , "only the first two levels of each are used"
+            , call. = FALSE)
+  }
+
+  cells <- c(a = tab[2, 2], b = tab[2, 1], c = tab[1, 2], d = tab[1, 1])
+  cells[cells == 0] <- 0.5
+
+  (cells[["a"]] / cells[["b"]]) / (cells[["c"]] / cells[["d"]])
+}
+
+dst <- raw_data$pyrazinamide
+
+# For every unique SNP: flag the samples whose lower-cased mutation string
+# contains the SNP name, then summarise that flag against dst.
+# - fixed = TRUE matches the SNP name literally; as a regular expression the
+#   "." in names like "pnca_p.trp68gly" would match any character.
+# - vapply(..., numeric(1)) always returns a numeric vector named by SNP,
+#   whereas sapply() returns an empty list when there are no SNPs.
+
+# odds ratio per SNP
+ors <- vapply(pnca_snps_unique, function(m) {
+  mut <- grepl(m, raw_data$all_muts_pnca, fixed = TRUE)
+  or(mut, dst)
+}, numeric(1))
+
+ors
+
+# Fisher's exact test p-value per SNP
+pvals <- vapply(pnca_snps_unique, function(m) {
+  mut <- grepl(m, raw_data$all_muts_pnca, fixed = TRUE)
+  fisher.test(mut, dst)$p.value
+}, numeric(1))
+
+pvals
+
+# fraction of samples carrying each SNP
+afs <- vapply(pnca_snps_unique, function(m) {
+  mut <- grepl(m, raw_data$all_muts_pnca, fixed = TRUE)
+  mean(mut)
+}, numeric(1))
+
+afs
+
+# check ..hmmm
+afs['pnca_p.trp68gly']
+afs['pnca_p.gln10pro']
+afs['pnca_p.leu4ser']
+
+#plot(density(log(ors)))
+#plot(-log10(pvals))
+#hist(log(ors)
+#     ,breaks = 100
+#     )
+
+# subset df cols to add to the calc param df
+# Keep the three descriptive columns and a single row per mutation (the first
+# occurrence is the one retained).
+pnca_snps_cols <- pnca_snps_or[c("mutation_info", "mutation", "Mutationinformation")]
+pnca_snps_cols <- pnca_snps_cols[!duplicated(pnca_snps_cols[["mutation"]]), ]
+
+# Use the mutation name as row name so the two tables can be joined on it.
+rownames(pnca_snps_cols) <- pnca_snps_cols[["mutation"]]
+head(rownames(pnca_snps_cols))
+#snps_with_AF_and_OR
+
+# combine
+# One row per SNP with its OR, p-value and AF.
+comb_AF_and_OR <- data.frame(ors, pvals, afs)
+head(rownames(comb_AF_and_OR))
+
+# sanity checks: should be the same
+dim(comb_AF_and_OR)
+dim(pnca_snps_cols)
+table(rownames(comb_AF_and_OR) %in% rownames(pnca_snps_cols))
+
+table(rownames(pnca_snps_cols) %in% rownames(comb_AF_and_OR))
+
+# merge the above two df whose dim you checked
+# Joined on row names; all.x is left at its default, so only SNPs present in
+# both tables are kept.
+snps_with_AF_and_OR <- merge(comb_AF_and_OR, pnca_snps_cols, by = "row.names")
+#                            , all.x = T
+
+#rm(pnca_snps_cols, pnca_snps_or, raw_data)
+
+#===============
+# Step 3: Read data file where you will add the calculated OR
+# Note: this is the big file with one-many relationship between snps and lineages
+# i.e fname4 from 'pnca_extraction.py'
+#===============
+# Chosen interactively; the first CSV column becomes the row names, so it is
+# not part of the data columns.
+my_data <- read.csv(file.choose(), row.names = 1, stringsAsFactors = FALSE)
+
+head(my_data)
+length(unique(my_data[["id"]]))
+
+# check if first col is 'id': should be TRUE
+names(my_data)[1] == "id"
+
+# sanity check
+head(my_data[["mutation"]])
+
+# FILES TO MERGE:
+# snps_with_AF_and_OR: per-SNP table containing OR, p-value and AF
+# my_data = big meta data file
+# linking column: mutation
+
+head(my_data)
+
+# Left join: all.x = TRUE keeps every row of the meta data, including rows
+# whose mutation has no entry in snps_with_AF_and_OR.
+merged_df <- merge(
+  my_data,             # big file
+  snps_with_AF_and_OR, # small (afor file)
+  by = "mutation",
+  all.x = TRUE
+)
+
+# sanity check: the OR carried into merged_df for Q10P must equal the OR that
+# was calculated for pnca_p.gln10pro. Both values are printed and the
+# comparison is reported as an explicit PASS, or as a FAIL warning.
+expected_or <- comb_AF_and_OR[rownames(comb_AF_and_OR) == "pnca_p.gln10pro",]$ors
+observed_or <- merged_df[which(merged_df$Mutationinformation.x == "Q10P"),]$ors
+expected_or
+observed_or
+
+if (length(expected_or) == 1 && length(observed_or) > 0 &&
+    isTRUE(all.equal(observed_or, rep(expected_or, length(observed_or))))) {
+  cat("PASS: OR for Q10P in merged_df matches comb_AF_and_OR\n")
+} else {
+  warning("FAIL: OR for Q10P in merged_df does not match comb_AF_and_OR"
+          , call. = FALSE)
+}
+
+merged_df[merged_df$Mutationinformation.x == "Q10P",]
+
+# sanity check: very important!
+colnames(merged_df)
+
+# Rows of my_data without a match in snps_with_AF_and_OR have NA in the .y
+# columns (the merge used all.x = TRUE), so == yields NA for them.
+# useNA = "ifany" shows those rows instead of dropping them from the counts.
+table(merged_df$mutation_info.x == merged_df$mutation_info.y, useNA = "ifany")
+
+#FIXME: what happened to other 7 and FALSE
+# NOTE(review): the rows missing from the earlier counts are presumably the NA
+# comparisons described above -- confirm with the NA count printed here.
+table(merged_df$Mutationinformation.x == merged_df$Mutationinformation.y, useNA = "ifany")
+
+# problem
+# identical() is FALSE as soon as a single element differs, which includes
+# every unmatched row where the .y value is NA.
+identical(merged_df$Mutationinformation.x, merged_df$Mutationinformation.y)
+
+#merged_df[merged_df$Mutationinformation.x != merged_df$Mutationinformation.y,]
+
+#throw away the y because that is a smaller df
+d1 <- which(colnames(merged_df) == "mutation_info.y") #21
+d2 <- which(colnames(merged_df) == "Mutationinformation.y") #22
+
+# Drop the .y columns by name. With positions, merged_df[-c(d1, d2)] selects
+# zero columns when neither column exists, because -integer(0) is an empty
+# index.
+y_cols <- c("mutation_info.y", "Mutationinformation.y")
+merged_df2 <- merged_df[!(colnames(merged_df) %in% y_cols)] #3093 20
+colnames(merged_df2)
+
+# rename cols
+colnames(merged_df2)[colnames(merged_df2)== "mutation_info.x"] <- "mutation_info"
+colnames(merged_df2)[colnames(merged_df2)== "Mutationinformation.x"] <- "Mutationinformation"
+
+colnames(merged_df2)
+
+# should be 0
+sum(is.na(merged_df2$Mutationinformation))
+
+# count na in each column
+na_count <- sapply(merged_df2, function(column) sum(is.na(column)))
+na_count
+# only some or and Af should be NA
+#Row.names           ors               pvals               afs 
+#81                  81                81                  81 
+
+
+# Give the calculated columns their final names.
+names(merged_df2)[names(merged_df2) %in% "ors"] <- "OR"
+names(merged_df2)[names(merged_df2) %in% "afs"] <- "AF"
+names(merged_df2)[names(merged_df2) %in% "pvals"] <- "pvalue"
+
+names(merged_df2)
+
+# add log OR and neglog pvalue
+# natural log of the odds ratio
+merged_df2[["logor"]] <- log(merged_df2[["OR"]])
+is.numeric(merged_df2[["logor"]])
+
+# negative base-10 log of the p-value
+merged_df2[["neglog10pvalue"]] <- -log10(merged_df2[["pvalue"]])
+is.numeric(merged_df2[["neglog10pvalue"]])
+
+# write file out
+#write.csv(merged_df, "../Data/meta_data_with_AFandOR_JP_TT.csv")
+
+# define output variables
+# out_dir is relative to the working directory and already ends in "/", so
+# the file name is appended directly.
+drug <- 'pyrazinamide'
+out_dir <- paste0("../mcsm_analysis/", drug, "/Data/")
+outFile <- "meta_data_with_AFandOR.csv"
+# Must be out_dir: no object called `outdir` exists, so using that name stops
+# the script with "object 'outdir' not found" before anything is written.
+output_filename <- paste0(out_dir, outFile)
+
+# Write the merged table without the row-name column.
+write.csv(merged_df2, output_filename
+          , row.names = FALSE)