generated ggpairs plots finally

2022-08-15 19:05:22 +01:00 · 2022-08-15 19:05:22 +01:00 · a3e5283a9b
commit a3e5283a9b
parent b68841b337
11 changed files with 657 additions and 939 deletions
--- a/scripts/plotting/LINEAGE2.R
+++ b/scripts/plotting/LINEAGE2.R
@ -4,22 +4,41 @@ library("ggforce")
 #install.packages("gginference")
 library(gginference)
 library(ggpubr)
-library(svglite)
 ##################################################
 #%% read data
 # DOME: read data using gene and drug combination
 # gene must be lowercase
 # tolower(gene)
-#################################################
+############################################################
 #gene="pncA"
 #drug="pyrazinamide"

 #lineage_filename=paste0(tolower(gene),"_merged_df2.csv")
 #lineage_data_path="~/git/Data/pyrazinamide/output"

-df2 = read.csv(paste0(lineage_data_path,"/",lineage_filename))
+#=============
+# Data: Input
+#==============
+#source("~/git/LSHTM_analysis/config/alr.R")
+#source("~/git/LSHTM_analysis/config/embb.R")
+# source("~/git/LSHTM_analysis/config/gid.R")
+source("~/git/LSHTM_analysis/config/katg.R")
+#source("~/git/LSHTM_analysis/config/pnca.R")
+#source("~/git/LSHTM_analysis/config/rpob.R")

-foo = as.data.frame(colnames(df2))
+source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
+source("~/git/LSHTM_analysis/scripts/plotting/plotting_colnames.R")
+
+
+#=======
+# output
+#=======
+outdir_images = paste0("~/git/Writing/thesis/images/results/", tolower(gene), "/")
+cat("plots will output to:", outdir_images)
+
+###########################################################
+class(merged_df2)
+foo = as.data.frame(colnames(merged_df2))

 cols_to_subset = c('mutationinformation'
                   , 'snp_frequency'
@ -36,7 +55,7 @@ cols_to_subset = c('mutationinformation'

 #cols_to_subset%in%foo

-my_df = df2[ ,cols_to_subset]
+my_df = merged_df2[ ,cols_to_subset]

 # r24p_embb = df_embb[df_embb$mutationinformation == "R24P",]
 # #tm = c("A102P", "M1T")
@ -73,10 +92,9 @@ table(my_df2$lineage)

 sel_lineages2 = c("L1", "L2", "L3", "L4")
 my_df2 = my_df2[my_df2$lineage%in%sel_lineages2,]
-table(my_df2$lineage)
-
 sum(table(my_df2$lineage)) == nrow(my_df2)
 table(my_df2$lineage)
+table(my_df2$lineage, my_df2$sensitivity)

 # %%
 # str(my_df2)
@ -85,6 +103,7 @@ table(my_df2$lineage)

 #%% get only muts which belong to > 1 lineage and have different sensitivity classifications
 muts = unique(my_df2$mutationinformation)
+cat ("Total unique muts in L1-L4", tolower(gene), ":", length(muts))
 #-----------------------------------------------
 # step 0 : get muts with more than one lineage
 #-----------------------------------------------
@ -100,7 +119,6 @@ for (i in muts) {
 }
 cat("\nGot:", length(lin_muts), "mutations belonging to >1 lineage with differing drug sensitivities")

-
 #-----------------------------------------------
 # step 1 : get other muts that do not have this
 #-----------------------------------------------
@ -111,7 +129,6 @@ cat("\nGot:", length(consist_muts), "mutations that are consistent")
 # step 2: subset these muts for plotting
 #-----------------------------------------------
 plot_df = my_df2[my_df2$mutationinformation%in%lin_muts,]
-
 cat("\nnrow of plot_df:", nrow(plot_df))

 #-----------------------------------------------
@ -125,7 +142,9 @@ for (i in lin_muts) {
  s_tab = table(s_mut$lineage, s_mut$sens2)
  #print(s_tab)
  #ft_pvalue_i = round(fisher.test(s_tab)$p.value, 3)
-  ft_pvalue_i = fisher.test(s_tab)$p.value
+  ft_pvalue_i = fisher.test(s_tab
+                            #, workspace=2e9
+                            , simulate.p.value=TRUE,B=1e7)$p.value
  #print(ft_pvalue_i)
  plot_df$pval[plot_df$mutationinformation == i] <- ft_pvalue_i
  #print(s_tab)
@ -155,8 +174,6 @@ plot_df
 head(plot_df)
 table(plot_df$pvalR<0.05)

-
-
 # format p value
 # TODO: add case statement for correct pvalue formatting
 #plot_df$pvalF = ifelse(plot_df$pval <= 0.0001, paste0(round(plot_df$pval, 3), "**** "), plot_df$pval )
@ -233,6 +250,7 @@ cat("\nGot:", sig_muts, "mutations that are significant")
 plot_df_ns = plot_df2[plot_df2$pvalR>0.05,]
 ns_muts = length(unique(plot_df_ns$mutationinformation))
 cat("\nGot:", ns_muts, "mutations that are NOT significant")
+
 p_title = gene
 ts = 8
 gls = 3
@ -244,7 +262,7 @@ gls = 3
 #3) Add *: Extend yaxis for each plot to allow geom_label to have space (or see
 # if this self resolving with facet_wrap_paginate())
 #================================================
-#svg(paste0(outdir_images, "embb_linDS.svg"), width = 6, height = 10 ) # old-school square 4:3 CRT shape 1.3:1
+#svg(paste0(outdir_images, tolower(gene), "_linDS.svg"), width = 6, height = 10 ) # old-school square 4:3 CRT shape 1.3:1
 ds_s =  ggplot(plot_df_sig, aes(x = lineage
                         , fill = sens2)) + 
      geom_bar(stat = 'count') +
@ -280,7 +298,7 @@ ds_s =  ggplot(plot_df_sig, aes(x = lineage
 ###################################
 #ns muts

-#svg(paste0(outdir_images, "embb_linDS_ns.svg"), width =10 , height = 8) # old-school square 4:3 CRT shape 1.3:1
+#svg(paste0(outdir_images, tolower(gene), "_linDS_ns.svg"), width =10 , height = 8) # old-school square 4:3 CRT shape 1.3:1
 ds_ns =  ggplot(plot_df_ns, aes(x = lineage
                         , fill = sens2)) + 
    geom_bar(stat = 'count') +
@ -309,31 +327,57 @@ ds_ns =  ggplot(plot_df_ns, aes(x = lineage
    labs(title = paste0(p_title, ": sensitivity by lineage")
         , y = 'Sample Count')
 #dev.off()
+#####################################################################
+#===================
+# Combine output
+#====================

-
-# svg(paste0(outdir_images, "embb_linDS_CL.svg")
+# svg(paste0(outdir_images, tolower(gene), "_linDS_CL.svg")
 #     , width = 11
 #     , height = 8 ) 
-png(paste0(outdir_images, "embb_linDS_CL.png")
-    , width = 11.75
+png(paste0(outdir_images, tolower(gene), "_linDS_CL2.png")
+    , width = 11.75*1.15
    , height = 8, units = "in", res = 300 ) 

 cowplot::plot_grid(ds_s, ds_ns
                   , ncol = 2
-                   ,rel_widths = c(1,2)
+                   #, align = "hv"
+                   , rel_widths = c(1,2.5)
                   , labels = "AUTO")

 dev.off()

+########################################################################
+#==================
+# Summary output
+#==================
+cat ("Total unique muts in ALL samples for", tolower(gene), ":", length(unique(merged_df2$mutationinformation)))
+other_lin_muts = unique(merged_df2$mutationinformation)[!unique(merged_df2$mutationinformation)%in%unique(my_df2$mutationinformation)]

+cat ("Total unique muts NOT in L1-L4:", length(other_lin_muts))
+cat("These are:\n", other_lin_muts)
+other_lin_muts_df = merged_df2[merged_df2$mutationinformation%in%other_lin_muts,]

+if ( length(unique(other_lin_muts_df$mutationinformation)) == length(other_lin_muts)) {
+  cat("\nPASS: other lin muts extracted")
+}else{
+  stop("\nAbort: other lin muts numbers mismatch")
+}

+table(other_lin_muts_df$mutationinformation, other_lin_muts_df$lineage)

+cat("\n==============================================\n")
+cat ("Total samples L1-L4:", nrow(my_df2))
+table(my_df2$lineage)
+table(my_df2$lineage, my_df2$sensitivity)

+cat ("Total unique muts in L1-L4", tolower(gene), ":", length(muts))
+cat("\nGot:", length(lin_muts), "mutations belonging to >1 lineage with differing drug sensitivities")

+cat("\nGot:", sig_muts, "mutations that are significant"
+    , "\nThese are:", unique(plot_df_sig$mutationinformation))

-#geom_text(aes(label = paste0("p=",pvalF), x = 2.5, ypos_label+1))# +
+cat("\nGot:", ns_muts, "mutations that are NOT significant"
+    , "\nThese are:", unique(plot_df_ns$mutationinformation))

-  #geom_segment(aes(x = 1, y = ypos_label+0.5, xend = 4, yend = ypos_label+0.5))
-  #geom_hline(data = lin_muts_dfM, aes(yintercept=ypos_label+0.5))
-  #geom_bracket(data=lin_muts_dfM, aes(xmin = 1, xmax = 4, y.position = ypos_label+0.5, label=''))
+cat("\n==============================================\n")