moved mut_landscape files to plotting

2022-07-31 12:01:32 +01:00 · 2022-07-31 12:01:32 +01:00 · 4de1e800ec
commit 4de1e800ec
parent d6a63eed21
2 changed files with 191 additions and 0 deletions
--- a/scripts/plotting/mut_landscape.R
+++ b/scripts/plotting/mut_landscape.R
@ -0,0 +1,155 @@
+source("~/git/LSHTM_analysis/config/alr.R")
+source("~/git/LSHTM_analysis/config/embb.R")
+source("~/git/LSHTM_analysis/config/gid.R")
+source("~/git/LSHTM_analysis/config/katg.R")
+source("~/git/LSHTM_analysis/config/pnca.R")
+source("~/git/LSHTM_analysis/config/rpob.R")
+
+#================================
+# output files
+# In total: 6 files are written
+#================================
+outdir_images = paste0("~/git/Writing/thesis/images/results/", tolower(gene))
+
+# mutational positions: all
+outfile_mutpos = paste0(outdir_images, "/", tolower(gene), "_mutpos_all.txt")
+outfile_meta1 =  paste0(outdir_images, "/", tolower(gene), "_mutpos_cu.txt")
+
+# mutational positions with sensitivity: S, R and common
+outfile_mutpos_S = paste0(outdir_images, "/", tolower(gene), "_mutpos_S.txt")
+outfile_mutpos_R = paste0(outdir_images, "/", tolower(gene), "_mutpos_R.txt")
+outfile_mutpos_common = paste0(outdir_images, "/", tolower(gene), "_mutpos_common.txt")
+outfile_meta2 =  paste0(outdir_images, "/", tolower(gene), "_mutpos_annot_cu.txt")
+
+#=============
+# Input
+#=============
+df3_filename = paste0("/home/tanu/git/Data/", drug, "/output/", tolower(gene), "_merged_df3.csv")
+df3 = read.csv(df3_filename)
+
+# Determine for each gene
+if (tolower(gene) == "embb"){
+  chain_suffix = ".B"
+} else{
+  chain_suffix = ".A"
+}
+
+# mut_info checks
+table(df3$mutation_info)
+table(df3$mutation_info_orig)
+table(df3$mutation_info_labels_orig)
+
+# used in plots and analyses
+table(df3$mutation_info_labels) # different, and matches dst_mode
+table(df3$dst_mode)
+
+# create column based on dst mode with different colname
+table(is.na(df3$dst))
+table(is.na(df3$dst_mode))
+
+#===============
+# Create column: sensitivity mapped to dst_mode
+#===============
+df3$sensitivity = ifelse(df3$dst_mode == 1, "R", "S")
+table(df3$sensitivity)
+
+length(unique((df3$mutationinformation)))
+all_colnames = as.data.frame(colnames(df3))
+
+############################################################
+cols_to_extract = c("mutationinformation"
+                    , "wild_type"
+                    , "chain"
+                    , "mutant_type"
+                    , "position"
+                    , "dst_mode"
+                    , "mutation_info_labels_orig"
+                    , "mutation_info_labels"
+                    , "sensitivity")
+
+df3_plot = df3[, cols_to_extract]
+# create pos_chain column: allows easier colouring in chimera
+df3_plot$pos_chain = paste(df3_plot$position, df3_plot$chain, sep = ".")
+pos_cu = length(unique(df3_plot$position))
+
+#===========================
+# positions with mutations 
+#===========================
+#df3_all_mut_pos = df3_plot[, c("mutationinformation", "position", "chain")]
+#df3_all_mut_pos$pos_chain = paste(df3_all_mut_pos$position, df3_all_mut_pos$chain, sep = ".")
+
+df3_all_mut_pos = df3_plot[, c("position", "pos_chain")]
+gene_mut_pos_u = unique(df3_all_mut_pos$pos_chain)
+class(gene_mut_pos_u)
+paste(gene_mut_pos_u, collapse=',')
+
+if (length(gene_mut_pos_u) == pos_cu){
+  cat("\nPASS: all mutation positions extracted"
+      , "\nWriting file:", outfile_mutpos)
+} else{
+  stop("\nAbort: mutation position count mismatch")
+}
+
+write.table(paste(gene_mut_pos_u, collapse=',')
+            , outfile_mutpos
+            , row.names = F
+            , col.names = F)
+
+write.table(paste("Count of positions with mutations in gene"
+                  , tolower(gene), ":", pos_cu)
+            , outfile_meta1
+            , row.names = F
+            , col.names = F)
+#========================================
+# positions with sensitivity annotations
+#========================================
+df3_muts_annot = df3_plot[, c("mutationinformation", "position", "pos_chain", "sensitivity")]
+
+# aggregrate position count by sensitivity
+result <- aggregate(sensitivity ~ position, data = df3_muts_annot, paste, collapse = "")
+
+sensitive_pos = result$position[grep("(^S+$)", result$sensitivity)]
+sensitive_pos = paste0(sensitive_pos, chain_suffix)
+
+resistant_pos = result$position[grep("(^R+$)", result$sensitivity)]
+resistant_pos = paste0(resistant_pos, chain_suffix)
+
+common_pos    = result$position[grep("SR|RS" , result$sensitivity)]
+common_pos    = paste0(common_pos, chain_suffix)
+
+if (tolower(gene)!= "alr"){
+  length_check = length(sensitive_pos) + length(resistant_pos) + length(common_pos)
+  cpl          =  length(common_pos)
+}else{
+  length_check = length(sensitive_pos) + length(resistant_pos)
+  cpl          = 0
+}
+
+if (length_check == pos_cu){
+  cat("\nPASS: position with mutational sensitivity extracted"
+      , "\nWriting files: sensitive, resistant and common position counts" )
+} else{
+  stop("\nAbort: position with mutational sensitivity count mismatch")
+}
+
+write.table(paste(sensitive_pos, collapse = ',')
+            , outfile_mutpos_S
+            , row.names = F, col.names = F)
+
+write.table(paste(resistant_pos, collapse = ',')
+            , outfile_mutpos_R
+            , row.names = F, col.names = F)
+
+write.table(paste(common_pos, collapse = ',')
+            , outfile_mutpos_common
+            , row.names = F, col.names = F)
+
+write.table(paste("Count of positions with mutations in gene:"
+                  , tolower(gene)
+                  , "\nTotal mutational positions:", pos_cu
+                  , "\nsensitive:", length(sensitive_pos)
+                  , "\nresistant:", length(resistant_pos)
+                  , "\ncommon:"   , cpl)
+            , outfile_meta2
+            , row.names = F
+            , col.names = F)
--- a/scripts/plotting/mut_landscape_5uhc_rpob.R
+++ b/scripts/plotting/mut_landscape_5uhc_rpob.R
@ -0,0 +1,191 @@
+source("~/git/LSHTM_analysis/config/rpob.R")
+
+#================================
+# output files
+# In total: 6 files are written
+#================================
+outdir_images = paste0("~/git/Writing/thesis/images/results/", tolower(gene))
+
+# mutational positions: all
+outfile_mutpos = paste0(outdir_images, "/5uhc_", tolower(gene), "_mutpos_all.txt")
+outfile_meta1 =  paste0(outdir_images, "/5uhc_", tolower(gene), "_mutpos_cu.txt")
+
+# mutational positions with sensitivity: S, R and common
+outfile_mutpos_S = paste0(outdir_images, "/5uhc_", tolower(gene), "_mutpos_S.txt")
+outfile_mutpos_R = paste0(outdir_images, "/5uhc_", tolower(gene), "_mutpos_R.txt")
+outfile_mutpos_common = paste0(outdir_images, "/5uhc_", tolower(gene), "_mutpos_common.txt")
+outfile_meta2 =  paste0(outdir_images, "/5uhc_", tolower(gene), "_mutpos_annot_cu.txt")
+
+#=============
+# Input
+#=============
+df3_filename = paste0("/home/tanu/git/Data/", drug, "/output/", tolower(gene), "_merged_df3.csv")
+df3 = read.csv(df3_filename)
+
+chain_suffix = ".C"
+
+# mut_info checks
+table(df3$mutation_info)
+table(df3$mutation_info_orig)
+table(df3$mutation_info_labels_orig)
+
+# used in plots and analyses
+table(df3$mutation_info_labels) # different, and matches dst_mode
+table(df3$dst_mode)
+
+# create column based on dst mode with different colname
+table(is.na(df3$dst))
+table(is.na(df3$dst_mode))
+
+#===============
+# Create column: sensitivity mapped to dst_mode
+#===============
+df3$sensitivity = ifelse(df3$dst_mode == 1, "R", "S")
+table(df3$sensitivity)
+
+length(unique((df3$mutationinformation)))
+all_colnames = as.data.frame(colnames(df3))
+
+############################################################
+cols_to_extract = c("mutationinformation"
+                    , "wild_type"
+                    , "chain"
+                    , "mutant_type"
+                    , "position"
+                    , "X5uhc_position"
+                    , "X5uhc_offset"
+                    , "dst_mode"
+                    , "mutation_info_labels_orig"
+                    , "mutation_info_labels"
+                    , "sensitivity")
+
+df3_plot = df3[, cols_to_extract]
+
+# use the x5uhc_position column
+
+# create pos_chain column: allows easier colouring in chimera
+df3_plot$pos_chain = paste(df3_plot$X5uhc_position, chain_suffix, sep = ".")
+pos_cu = length(unique(df3_plot$X5uhc_position))
+X5uhc_pos = unique(df3_plot$X5uhc_position)
+X5uhc_pos = paste0(X5uhc_pos, chain_suffix)
+
+#===========================
+# positions with mutations 
+#===========================
+df3_all_mut_pos = df3_plot[, c("X5uhc_position", "pos_chain")]
+gene_mut_pos_u = unique(df3_all_mut_pos$pos_chain)
+class(gene_mut_pos_u)
+paste(gene_mut_pos_u, collapse=',')
+
+if (length(gene_mut_pos_u) == pos_cu){
+  cat("\nPASS: all mutation positions extracted"
+      , "\nWriting file:", outfile_mutpos)
+} else{
+  stop("\nAbort: mutation position count mismatch")
+}
+
+write.table(paste(gene_mut_pos_u, collapse=',')
+            , outfile_mutpos
+            , row.names = F
+            , col.names = F)
+
+write.table(paste("Count of positions with mutations in gene"
+                  , tolower(gene), ":", pos_cu)
+            , outfile_meta1
+            , row.names = F
+            , col.names = F)
+#========================================
+# positions with sensitivity annotations
+#========================================
+df3_muts_annot = df3_plot[, c("mutationinformation", "X5uhc_position", "pos_chain", "sensitivity")]
+
+# aggregrate position count by sensitivity
+result <- aggregate(sensitivity ~ X5uhc_position, data = df3_muts_annot, paste, collapse = "")
+
+sensitive_pos = result$X5uhc_position[grep("(^S+$)", result$sensitivity)]
+sensitive_pos = paste0(sensitive_pos, chain_suffix)
+
+resistant_pos = result$X5uhc_position[grep("(^R+$)", result$sensitivity)]
+resistant_pos = paste0(resistant_pos, chain_suffix)
+
+common_pos    = result$X5uhc_position[grep("SR|RS" , result$sensitivity)]
+common_pos    = paste0(common_pos, chain_suffix)
+
+if (tolower(gene)!= "alr"){
+  length_check = length(sensitive_pos) + length(resistant_pos) + length(common_pos)
+  cpl          =  length(common_pos)
+}else{
+  length_check = length(sensitive_pos) + length(resistant_pos)
+  cpl          = 0
+}
+
+if (length_check == pos_cu){
+  cat("\nPASS: position with mutational sensitivity extracted"
+      , "\nWriting files: sensitive, resistant and common position counts" )
+} else{
+  stop("\nAbort: position with mutational sensitivity count mismatch")
+}
+# spl handling for rpob 5uhc
+revised_gene_mut_pos_u = c(sensitive_pos, resistant_pos, common_pos)
+revised_pos_cu = length(unique(revised_gene_mut_pos_u))
+if (length(revised_gene_mut_pos_u) == revised_pos_cu){
+  cat("\nPASS: all mutation positions extracted"
+      , "\nWriting file:", outfile_mutpos)
+} else{
+  stop("\nAbort: mutation position count mismatch")
+}
+
+write.table(paste(revised_gene_mut_pos_u, collapse=',')
+            , outfile_mutpos
+            , row.names = F
+            , col.names = F)
+
+write.table(paste("Count of positions with mutations in gene"
+                  , tolower(gene), ":", revised_pos_cu)
+            , outfile_meta1
+            , row.names = F
+            , col.names = F)
+
+# mut_annot
+write.table(paste(sensitive_pos, collapse = ',')
+            , outfile_mutpos_S
+            , row.names = F, col.names = F)
+
+write.table(paste(resistant_pos, collapse = ',')
+            , outfile_mutpos_R
+            , row.names = F, col.names = F)
+
+write.table(paste(common_pos, collapse = ',')
+            , outfile_mutpos_common
+            , row.names = F, col.names = F)
+
+write.table(paste("Count of positions with mutations in gene:"
+                  , tolower(gene)
+                  , "\nTotal mutational positions:", revised_pos_cu
+                  , "\nsensitive:", length(sensitive_pos)
+                  , "\nresistant:", length(resistant_pos)
+                  , "\ncommon:"   , cpl)
+            , outfile_meta2
+            , row.names = F
+            , col.names = F)
+
+#Quick check to find out the discrepancy
+revised_gene_mut_pos_u
+gene_mut_pos_u
+
+library("qpcR")  
+foo <- data.frame(qpcR:::cbind.na(gene_mut_pos_u, revised_gene_mut_pos_u))
+
+
+table(!gene_mut_pos_u%in%revised_gene_mut_pos_u)
+table(!revised_gene_mut_pos_u%in%gene_mut_pos_u)
+
+X5uhc_pos
+#table(!gene_mut_pos_u%in%X5uhc_pos)
+table(X5uhc_pos%in%gene_mut_pos_u)
+
+X5uhc_pos[!X5uhc_pos%in%gene_mut_pos_u]
+X5uhc_pos[!gene_mut_pos_u%in%X5uhc_pos]
+
+#TODO: NOTE
+#D1148G (i.e D1154) is NOT Present in 5UHC