From 69a0da0a5957937bae64ba6242fa68b31fc79e34 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Sun, 4 Sep 2022 21:36:49 +0100
Subject: [PATCH] added script for lineage diff sensitivities

---
 .../embb/lineage_diff_sensitivities.R         | 166 ++++++++++++++++++
 .../gid/lineage_diff_sensitivities.R          | 134 ++++++++++++++
 .../katg/lineage_diff_sensitivities.R         | 133 ++++++++++++++
 .../rpob/lineage_diff_sensitivities.R         | 133 ++++++++++++++
 4 files changed, 566 insertions(+)
 create mode 100644 scripts/plotting/plotting_thesis/embb/lineage_diff_sensitivities.R
 create mode 100644 scripts/plotting/plotting_thesis/gid/lineage_diff_sensitivities.R
 create mode 100644 scripts/plotting/plotting_thesis/katg/lineage_diff_sensitivities.R
 create mode 100644 scripts/plotting/plotting_thesis/rpob/lineage_diff_sensitivities.R

diff --git a/scripts/plotting/plotting_thesis/embb/lineage_diff_sensitivities.R b/scripts/plotting/plotting_thesis/embb/lineage_diff_sensitivities.R
new file mode 100644
index 0000000..d7e1833
--- /dev/null
+++ b/scripts/plotting/plotting_thesis/embb/lineage_diff_sensitivities.R
@@ -0,0 +1,166 @@
+#=============
+# Data: Input
+#==============
+#source("~/git/LSHTM_analysis/config/embb.R")
+source("~/git/LSHTM_analysis/config/gid.R")
+
+source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
+
+
+# Now we need to make a column that fill na  in dst with value of dst_mode
+df2 = merged_df2
+#table(df2$dst2)
+
+df2$dst2 = ifelse(is.na(df2$dst), df2$dst_mode, df2$dst)
+df2$sens2 = ifelse(df2$dst2 == 1, "R", "S")
+table(df2$sens2)
+
+
+all_snps    = unique(df2$mutationinformation)
+all_snps_n  = length(all_snps); all_snps_n
+
+all_samples_id = unique(df2$id)  # different to nrows
+all_samples_id_n = length(all_samples_id); all_samples_id_n # different to nrows
+
+sel_lineage   = c("L1", "L2", "L3", "L4")
+df2_lin = df2[df2$lineage%in%sel_lineage,]
+sel_lin_snps    = unique(df2_lin$mutationinformation)
+sel_lin_snps_n   = length(sel_lin_snps); sel_lin_snps_n
+
+sel_lin_samples_id = unique(df2_lin$id)
+sel_lin_samples_id_n = length(sel_lin_samples_id);sel_lin_samples_id_n
+
+# are the snps that are not in L1-L4 unique to L5-L7
+left_snps = all_snps[!all_snps%in%sel_lin_snps]
+left_snps_n = length(left_snps); left_snps_n
+
+if (all_snps_n == sel_lin_snps_n+left_snps_n){
+  cat("PASS: left snps extracted for gene", tolower(gene))
+}else{
+  stop("Abort: left snps count mismatch")
+}
+
+left_snps
+lef_snps_df = df2[df2$mutationinformation%in%left_snps,]
+table(lef_snps_df$lineage)
+
+##################################
+# selected lineage plos
+cols_to_subset = c("mutationinformation"
+                  , "lineage"
+                  , "dst2"
+                  , "sens2")
+
+
+#-----------------------------------------------
+# step 0: Subset a smaller df
+#-----------------------------------------------
+plot_df_gene = df2_lin[, cols_to_subset]
+
+#-----------------------------------------------
+# step 1: Select muts for each target
+#-----------------------------------------------
+# embb
+#sel_mutsP = c("D354N", "Y319D", "Y319D", "A962P", "S651N", "A201S")
+# gid
+sel_mutsP = c("P75R", "A19G", "A133P", "R154W", "R118L") #G30D)
+# katg
+#sel_mutsP = c("")
+# rpob
+#sel_mutsP = c("")
+#-----------------------------------------------
+# step 2: Subset data with just those genes
+#-----------------------------------------------
+plot_df_gene = plot_df_gene[plot_df_gene$mutationinformation%in%sel_mutsP,]
+cat("\nnrow of plot_df:", nrow(plot_df_gene))
+table(plot_df_gene$sens2, plot_df_gene$lineage, plot_df_gene$mutationinformation)
+#-----------------------------------------------
+# step 3: Assign to plot_df
+#-----------------------------------------------
+
+plot_df = plot_df_gene
+
+#-----------------------------------------------
+# step 4: Add p-value
+#-----------------------------------------------
+plot_df$pval = NULL
+for (i in sel_mutsP) {
+  #print (i)
+  s_mut = plot_df[plot_df$mutationinformation == i,]
+  #print(s_mut)
+  s_tab = table(s_mut$lineage, s_mut$sens2)
+  #print(s_tab)
+  #ft_pvalue_i = round(fisher.test(s_tab)$p.value, 3)
+  ft_pvalue_i = fisher.test(s_tab
+                            #, workspace=2e9
+                            #, simulate.p.value=TRUE,B=1e7
+                            )$p.value
+  #print(ft_pvalue_i)
+  plot_df$pval[plot_df$mutationinformation == i] <- ft_pvalue_i
+  #print(s_tab)
+}
+
+plot_df$pvalR = round(plot_df$pval, 3)
+
+# round P-values
+plot_df$pvalRF = plot_df$pvalR
+plot_df = dplyr::mutate(plot_df
+                        , pvalRF = case_when(pvalRF == 0.05 ~ "P ."
+                                             , pvalRF <=0.0001 ~ 'P ****'
+                                             , pvalRF <=0.001 ~ 'P ***'
+                                             , pvalRF <=0.01 ~ 'P **'
+                                             , pvalRF <0.05 ~ 'P *'
+                                             , TRUE ~ 'ns'))
+
+plot_df
+
+head(plot_df)
+table(plot_df$pvalR<0.05)
+
+#-----------------------------------------------
+# step 5: Plot 
+#-----------------------------------------------
+p_title = gene
+ts = 8
+gls = 3
+DSplot = ggplot(plot_df, aes(x = lineage,
+                    fill = sens2)) + 
+  geom_bar(stat = 'count') +
+  scale_fill_manual(name = ""
+                    # name = leg_title
+                    , values = c("red", "blue")
+                    #, labels = levels(sens2))
+  )+
+  facet_wrap(~mutationinformation
+             , scales = 'free_y'
+             #, ncol = 3
+             , nrow = 1
+  ) +
+  theme(legend.position = "top"
+        , plot.title = element_text(hjust = 0.5, size=15,face = "italic")
+        #, plot.title = element_blank()
+        
+        , strip.text = element_text(size=ts+2)
+        , axis.text.x = element_text(size=ts)
+        , axis.text.y = element_text(size=ts)
+        , axis.title.y = element_text(size=ts)
+        , legend.title = element_blank()
+        , axis.title.x = element_blank()
+  )+
+  labs(title = paste0(p_title
+                      #, ": sensitivity by lineage"
+                      ),
+       y = 'Sample Count') #+
+  #geom_text(aes(label = pvalRF, x = 2.5, y =  ypos_label+0.75))
+ # geom_blank(aes(y = ypos_label+1.25)) +
+#  geom_label(aes(label = pvalRF, x = 2.5, ypos_label+0.75), fill="white", size =gls)
+
+#========
+# Outplot
+#========
+outdir_lin = "/home/pub/Work/LSHTM/Thesis_Plots/"
+png(paste0(outdir_lin, tolower(gene), "_linDS_selected.png")
+    , width = 8
+    , height = 3, units = "in", res = 300 ) 
+DSplot
+dev.off()
diff --git a/scripts/plotting/plotting_thesis/gid/lineage_diff_sensitivities.R b/scripts/plotting/plotting_thesis/gid/lineage_diff_sensitivities.R
new file mode 100644
index 0000000..5e192df
--- /dev/null
+++ b/scripts/plotting/plotting_thesis/gid/lineage_diff_sensitivities.R
@@ -0,0 +1,134 @@
+#=============
+# Data: Input
+#==============
+#source("~/git/LSHTM_analysis/config/embb.R")
+source("~/git/LSHTM_analysis/config/gid.R")
+
+source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
+
+
+# Now we need to make a column that fill na  in dst with value of dst_mode
+df2 = merged_df2
+#table(df2$dst2)
+
+df2$dst2 = ifelse(is.na(df2$dst), df2$dst_mode, df2$dst)
+df2$sens2 = ifelse(df2$dst2 == 1, "R", "S")
+table(df2$sens2)
+
+
+all_snps    = unique(df2$mutationinformation)
+all_snps_n  = length(all_snps); all_snps_n
+
+all_samples_id = unique(df2$id)  # different to nrows
+all_samples_id_n = length(all_samples_id); all_samples_id_n # different to nrows
+
+sel_lineage   = c("L1", "L2", "L3", "L4")
+df2_lin = df2[df2$lineage%in%sel_lineage,]
+sel_lin_snps    = unique(df2_lin$mutationinformation)
+sel_lin_snps_n   = length(sel_lin_snps); sel_lin_snps_n
+
+sel_lin_samples_id = unique(df2_lin$id)
+sel_lin_samples_id_n = length(sel_lin_samples_id);sel_lin_samples_id_n
+
+# are the snps that are not in L1-L4 unique to L5-L7
+left_snps = all_snps[!all_snps%in%sel_lin_snps]
+left_snps_n = length(left_snps); left_snps_n
+
+if (all_snps_n == sel_lin_snps_n+left_snps_n){
+  cat("PASS: left snps extracted for gene", tolower(gene))
+}else{
+  stop("Abort: left snps count mismatch")
+}
+
+left_snps
+lef_snps_df = df2[df2$mutationinformation%in%left_snps,]
+table(lef_snps_df$lineage)
+
+##################################
+# selected lineage plos
+cols_to_subset = c("mutationinformation"
+                  , "lineage"
+                  , "dst2"
+                  , "sens2")
+
+
+#-----------------------------------------------
+# step 0: Subset a smaller df
+#-----------------------------------------------
+plot_df_gene = df2_lin[, cols_to_subset]
+
+#-----------------------------------------------
+# step 1: Select muts for each target
+#-----------------------------------------------
+# embb
+#sel_mutsP = c("D354N", "Y319D", "Y319D", "A962P", "S651N", "A201S")
+# gid
+sel_mutsP = c("P75R", "A19G", "A133P", "R154W", "R118L") #G30D)
+# katg
+#sel_mutsP = c("")
+# rpob
+#sel_mutsP = c("")
+#-----------------------------------------------
+# step 2: Subset data with just those genes
+#-----------------------------------------------
+plot_df_gene = plot_df_gene[plot_df_gene$mutationinformation%in%sel_mutsP,]
+cat("\nnrow of plot_df:", nrow(plot_df_gene))
+table(plot_df_gene$sens2, plot_df_gene$lineage, plot_df_gene$mutationinformation)
+#-----------------------------------------------
+# step 3: Assign to plot_df
+#-----------------------------------------------
+
+plot_df = plot_df_gene
+
+#-----------------------------------------------
+# step 4: Add p-value
+#-----------------------------------------------
+
+
+#-----------------------------------------------
+# step 5: Plot 
+#-----------------------------------------------
+p_title = gene
+ts = 8
+gls = 3
+DSplot = ggplot(plot_df, aes(x = lineage,
+                    fill = sens2)) + 
+  geom_bar(stat = 'count') +
+  scale_fill_manual(name = ""
+                    # name = leg_title
+                    , values = c("red", "blue")
+                    #, labels = levels(sens2))
+  )+
+  facet_wrap(~mutationinformation
+             , scales = 'free_y'
+             #, ncol = 3
+             , nrow = 1
+  ) +
+  theme(legend.position = "top"
+        , plot.title = element_text(hjust = 0.5, size=15,face = "italic")
+        #, plot.title = element_blank()
+        
+        , strip.text = element_text(size=ts+2)
+        , axis.text.x = element_text(size=ts)
+        , axis.text.y = element_text(size=ts)
+        , axis.title.y = element_text(size=ts)
+        , legend.title = element_blank()
+        , axis.title.x = element_blank()
+  )+
+  labs(title = paste0(p_title
+                      #, ": sensitivity by lineage"
+                      ),
+       y = 'Sample Count') #+
+  #geom_text(aes(label = pvalRF, x = 2.5, y =  ypos_label+0.75))
+ # geom_blank(aes(y = ypos_label+1.25)) +
+#  geom_label(aes(label = pvalRF, x = 2.5, ypos_label+0.75), fill="white", size =gls)
+
+#========
+# Outplot
+#========
+outdir_lin = "/home/pub/Work/LSHTM/Thesis_Plots/"
+png(paste0(outdir_lin, tolower(gene), "_linDS_selected.png")
+    , width = 8
+    , height = 3, units = "in", res = 300 ) 
+DSplot
+dev.off()
diff --git a/scripts/plotting/plotting_thesis/katg/lineage_diff_sensitivities.R b/scripts/plotting/plotting_thesis/katg/lineage_diff_sensitivities.R
new file mode 100644
index 0000000..46c9802
--- /dev/null
+++ b/scripts/plotting/plotting_thesis/katg/lineage_diff_sensitivities.R
@@ -0,0 +1,133 @@
+#=============
+# Data: Input
+#==============
+source("~/git/LSHTM_analysis/config/katg.R")
+source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
+
+
+# Now we need to make a column that fill na  in dst with value of dst_mode
+df2 = merged_df2
+#table(df2$dst2)
+
+df2$dst2 = ifelse(is.na(df2$dst), df2$dst_mode, df2$dst)
+df2$sens2 = ifelse(df2$dst2 == 1, "R", "S")
+table(df2$sens2)
+
+
+all_snps    = unique(df2$mutationinformation)
+all_snps_n  = length(all_snps); all_snps_n
+
+all_samples_id = unique(df2$id)  # different to nrows
+all_samples_id_n = length(all_samples_id); all_samples_id_n # different to nrows
+
+sel_lineage   = c("L1", "L2", "L3", "L4")
+df2_lin = df2[df2$lineage%in%sel_lineage,]
+sel_lin_snps    = unique(df2_lin$mutationinformation)
+sel_lin_snps_n   = length(sel_lin_snps); sel_lin_snps_n
+
+sel_lin_samples_id = unique(df2_lin$id)
+sel_lin_samples_id_n = length(sel_lin_samples_id);sel_lin_samples_id_n
+
+# are the snps that are not in L1-L4 unique to L5-L7
+left_snps = all_snps[!all_snps%in%sel_lin_snps]
+left_snps_n = length(left_snps); left_snps_n
+
+if (all_snps_n == sel_lin_snps_n+left_snps_n){
+  cat("PASS: left snps extracted for gene", tolower(gene))
+}else{
+  stop("Abort: left snps count mismatch")
+}
+
+left_snps
+lef_snps_df = df2[df2$mutationinformation%in%left_snps,]
+table(lef_snps_df$lineage)
+
+##################################
+#-----------------------------------------------
+# step 0: Select muts for each target
+#-----------------------------------------------
+# embb
+#sel_mutsP = c("D354N", "Y319D", "Y319D", "A962P", "S651N", "A201S")
+# gid
+# sel_mutsP = c("P75R", "A19G", "A133P", "R154W", "R118L") #G30D)
+# katg
+sel_mutsP = c("M257V", "G490S", "H116F", "H97N", "R249C", "W300R", "V320L", "S383A") #R463L
+# rpob
+#sel_mutsP = c("")
+
+# selected lineage plos
+cols_to_subset = c("mutationinformation"
+                  , "lineage"
+                  , "dst2"
+                  , "sens2")
+
+
+#-----------------------------------------------
+# step 1: Subset a smaller df
+#-----------------------------------------------
+plot_df_gene = df2_lin[, cols_to_subset]
+
+#-----------------------------------------------
+# step 2: Subset data with just those genes
+#-----------------------------------------------
+plot_df_gene = plot_df_gene[plot_df_gene$mutationinformation%in%sel_mutsP,]
+cat("\nnrow of plot_df:", nrow(plot_df_gene))
+table(plot_df_gene$sens2, plot_df_gene$lineage, plot_df_gene$mutationinformation)
+#-----------------------------------------------
+# step 3: Assign to plot_df
+#-----------------------------------------------
+
+plot_df = plot_df_gene
+
+#-----------------------------------------------
+# step 4: Add p-value
+#-----------------------------------------------
+
+
+#-----------------------------------------------
+# step 5: Plot 
+#-----------------------------------------------
+p_title = gene
+ts = 8
+gls = 3
+DSplot = ggplot(plot_df, aes(x = lineage,
+                    fill = sens2)) + 
+  geom_bar(stat = 'count') +
+  scale_fill_manual(name = ""
+                    # name = leg_title
+                    , values = c("red", "blue")
+                    #, labels = levels(sens2))
+  )+
+  facet_wrap(~mutationinformation
+             , scales = 'free_y'
+             #, ncol = 3
+             , nrow = 2
+  ) +
+  theme(legend.position = "top"
+        , plot.title = element_text(hjust = 0.5, size=15,face = "italic")
+        #, plot.title = element_blank()
+        
+        , strip.text = element_text(size=ts+2)
+        , axis.text.x = element_text(size=ts)
+        , axis.text.y = element_text(size=ts)
+        , axis.title.y = element_text(size=ts)
+        , legend.title = element_blank()
+        , axis.title.x = element_blank()
+  )+
+  labs(title = paste0(p_title
+                      #, ": sensitivity by lineage"
+                      ),
+       y = 'Sample Count') #+
+  #geom_text(aes(label = pvalRF, x = 2.5, y =  ypos_label+0.75))
+ # geom_blank(aes(y = ypos_label+1.25)) +
+#  geom_label(aes(label = pvalRF, x = 2.5, ypos_label+0.75), fill="white", size =gls)
+
+#========
+# Outplot
+#========
+outdir_lin = "/home/pub/Work/LSHTM/Thesis_Plots/"
+png(paste0(outdir_lin, tolower(gene), "_linDS_selected.png")
+    , width = 8
+    , height = 4, units = "in", res = 300 ) 
+DSplot
+dev.off()
diff --git a/scripts/plotting/plotting_thesis/rpob/lineage_diff_sensitivities.R b/scripts/plotting/plotting_thesis/rpob/lineage_diff_sensitivities.R
new file mode 100644
index 0000000..37b6bea
--- /dev/null
+++ b/scripts/plotting/plotting_thesis/rpob/lineage_diff_sensitivities.R
@@ -0,0 +1,133 @@
+#=============
+# Data: Input
+#==============
+source("~/git/LSHTM_analysis/config/rpob.R")
+source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
+
+
+# Now we need to make a column that fill na  in dst with value of dst_mode
+df2 = merged_df2
+#table(df2$dst2)
+
+df2$dst2 = ifelse(is.na(df2$dst), df2$dst_mode, df2$dst)
+df2$sens2 = ifelse(df2$dst2 == 1, "R", "S")
+table(df2$sens2)
+
+all_snps    = unique(df2$mutationinformation)
+all_snps_n  = length(all_snps); all_snps_n
+
+all_samples_id = unique(df2$id)  # different to nrows
+all_samples_id_n = length(all_samples_id); all_samples_id_n # different to nrows
+
+sel_lineage   = c("L1", "L2", "L3", "L4")
+df2_lin = df2[df2$lineage%in%sel_lineage,]
+sel_lin_snps    = unique(df2_lin$mutationinformation)
+sel_lin_snps_n   = length(sel_lin_snps); sel_lin_snps_n
+
+sel_lin_samples_id = unique(df2_lin$id)
+sel_lin_samples_id_n = length(sel_lin_samples_id);sel_lin_samples_id_n
+
+# are the snps that are not in L1-L4 unique to L5-L7
+left_snps = all_snps[!all_snps%in%sel_lin_snps]
+left_snps_n = length(left_snps); left_snps_n
+
+if (all_snps_n == sel_lin_snps_n+left_snps_n){
+  cat("PASS: left snps extracted for gene", tolower(gene))
+}else{
+  stop("Abort: left snps count mismatch")
+}
+
+left_snps
+lef_snps_df = df2[df2$mutationinformation%in%left_snps,]
+table(lef_snps_df$lineage)
+
+##################################
+#-----------------------------------------------
+# step 0: Select muts for each target
+#-----------------------------------------------
+# embb
+#sel_mutsP = c("D354N", "Y319D", "Y319D", "A962P", "S651N", "A201S")
+# gid
+# sel_mutsP = c("P75R", "A19G", "A133P", "R154W", "R118L") #G30D)
+# katg
+#sel_mutsP = c("M257V", "G490S", "H116F", "H97N", "R249C", "W300R", "V320L", "S383A") #R463L
+# rpob
+sel_mutsP = c("E807Q", "H674Q", "I696L", "P439S", "Q287E", "S188A",
+              "S195R", "S239R", "S431T", "T756A", "V243I", "V385L", "V403A", "V517K")
+#K799Q, I491F
+# selected lineage plos
+cols_to_subset = c("mutationinformation"
+                  , "lineage"
+                  , "dst2"
+                  , "sens2")
+
+
+#-----------------------------------------------
+# step 1: Subset a smaller df
+#-----------------------------------------------
+plot_df_gene = df2_lin[, cols_to_subset]
+
+#-----------------------------------------------
+# step 2: Subset data with just those genes
+#-----------------------------------------------
+plot_df_gene = plot_df_gene[plot_df_gene$mutationinformation%in%sel_mutsP,]
+cat("\nnrow of plot_df:", nrow(plot_df_gene))
+table(plot_df_gene$sens2, plot_df_gene$lineage, plot_df_gene$mutationinformation)
+#-----------------------------------------------
+# step 3: Assign to plot_df
+#-----------------------------------------------
+
+plot_df = plot_df_gene
+
+#-----------------------------------------------
+# step 4: Add p-value
+#-----------------------------------------------
+
+
+#-----------------------------------------------
+# step 5: Plot 
+#-----------------------------------------------
+p_title = gene
+ts = 8
+gls = 3
+DSplot = ggplot(plot_df, aes(x = lineage,
+                    fill = sens2)) + 
+  geom_bar(stat = 'count') +
+  scale_fill_manual(name = ""
+                    # name = leg_title
+                    , values = c("red", "blue")
+                    #, labels = levels(sens2))
+  )+
+  facet_wrap(~mutationinformation
+             , scales = 'free_y'
+             #, ncol = 3
+             , nrow = 2
+  ) +
+  theme(legend.position = "top"
+        , plot.title = element_text(hjust = 0.5, size=15,face = "italic")
+        #, plot.title = element_blank()
+        
+        , strip.text = element_text(size=ts+2)
+        , axis.text.x = element_text(size=ts)
+        , axis.text.y = element_text(size=ts)
+        , axis.title.y = element_text(size=ts)
+        , legend.title = element_blank()
+        , axis.title.x = element_blank()
+  )+
+  labs(title = paste0(p_title
+                      #, ": sensitivity by lineage"
+                      ),
+       y = 'Sample Count') #+
+  #geom_text(aes(label = pvalRF, x = 2.5, y =  ypos_label+0.75))
+ # geom_blank(aes(y = ypos_label+1.25)) +
+#  geom_label(aes(label = pvalRF, x = 2.5, ypos_label+0.75), fill="white", size =gls)
+
+#========
+# Outplot
+#========
+outdir_lin = "/home/pub/Work/LSHTM/Thesis_Plots/"
+png(paste0(outdir_lin, tolower(gene), "_linDS_selected.png")
+    , width = 8
+    , height = 4, units = "in", res = 300 ) 
+DSplot
+dev.off()