generated lineage dist plots combined. needs tweaking

2020-09-09 12:53:53 +01:00 · 2020-09-09 12:53:53 +01:00 · 98cd8a74e8
commit 98cd8a74e8
parent af7c55b713
3 changed files with 191 additions and 119 deletions
--- a/scripts/plotting/combining_dfs_plotting.R
+++ b/scripts/plotting/combining_dfs_plotting.R
@ -48,6 +48,8 @@ cat("Variables imported:"
    , "\ndrug:", drug
    , "\ngene:", gene
    , "\ngene_match:", gene_match
+    , "\nAngstrom symbol:", angstroms_symbol
+    , "\nNo. of duplicated muts:", dup_muts_nu
    , "\ndr_muts_col:", dr_muts_col
    , "\nother_muts_col:", other_muts_col
    , "\ndrtype_col:", resistance_col)
--- a/scripts/plotting/lineage_basic_barplot.R
+++ b/scripts/plotting/lineage_basic_barplot.R
@ -30,13 +30,15 @@ source("combining_dfs_plotting.R")
 # 9) my_df_u
 # 10) my_df_u_lig

-cat(paste0("Directories imported:"
+cat("Directories imported:"
+    , "\n===================="
    , "\ndatadir:", datadir
    , "\nindir:", indir
    , "\noutdir:", outdir
-           , "\nplotdir:", plotdir))
+    , "\nplotdir:", plotdir)

-cat(paste0("Variables imported:"
+cat("Variables imported:"
+    , "\n====================="
    , "\ndrug:", drug
    , "\ngene:", gene
    , "\ngene_match:", gene_match
@ -44,7 +46,11 @@ cat(paste0("Variables imported:"
    , "\nNo. of duplicated muts:", dup_muts_nu
    , "\nNA count for ORs:", na_count
    , "\nNA count in df2:", na_count_df2
-           , "\nNA count in df3:", na_count_df3))       
+    , "\nNA count in df3:", na_count_df3
+    , "\ndr_muts_col:", dr_muts_col
+    , "\nother_muts_col:", other_muts_col
+    , "\ndrtype_col:", resistance_col)
+

 #===========
 # input
@ -67,7 +73,6 @@ plot_basic_bp_lineage  =  paste0(plotdir,"/", basic_bp_lineage)
 #================
 # REASSIGNMENT as necessary
 my_df  = merged_df2
-#my_df  = merged_df2_comp

 # clear excess variable
 rm(merged_df2_comp, merged_df3, merged_df3_comp)
--- a/scripts/plotting/lineage_dist_PS.R
+++ b/scripts/plotting/lineage_dist_PS.R
@ -3,36 +3,55 @@ getwd()
 setwd("~/git/LSHTM_analysis/scripts/plotting/")
 getwd()

-########################################################################
-# 				Installing and loading required packages 			   #
-########################################################################
+#########################################################
+# TASK: Lineage dist plots: ggridges

-source("../Header_TT.R")
-#source("../barplot_colour_function.R")
-#require(data.table)
+# Output: 2 SVGs for PS stability

-########################################################################
-#		 Read file: call script for combining df for PS			   	   #
-########################################################################
+# 1) all muts
+# 2) dr_muts

-source("combining_two_df.R")
+##########################################################
+# 				Installing and loading required packages 			 
+##########################################################

-#---------------------- PAY ATTENTION
-# the above changes the working dir
-#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
-#---------------------- PAY ATTENTION
+source("combining_dfs_plotting.R")
+# PS combined: 
+# 1) merged_df2
+# 2) merged_df2_comp
+# 3) merged_df3
+# 4) merged_df3_comp

-#==========================
-# This will return:
+# LIG combined: 
+# 5) merged_df2_lig
+# 6) merged_df2_comp_lig
+# 7) merged_df3_lig
+# 8) merged_df3_comp_lig

-# df with NA for <drug>
-# merged_df2
-# merged_df3
+# 9) my_df_u
+# 10) my_df_u_lig
+
+cat("Directories imported:"
+    , "\n===================="
+    , "\ndatadir:", datadir
+    , "\nindir:", indir
+    , "\noutdir:", outdir
+    , "\nplotdir:", plotdir)
+
+cat("Variables imported:"
+    , "\n====================="
+    , "\ndrug:", drug
+    , "\ngene:", gene
+    , "\ngene_match:", gene_match
+    , "\nAngstrom symbol:", angstroms_symbol
+    , "\nNo. of duplicated muts:", dup_muts_nu
+    , "\nNA count for ORs:", na_count
+    , "\nNA count in df2:", na_count_df2
+    , "\nNA count in df3:", na_count_df3
+    , "\ndr_muts_col:", dr_muts_col
+    , "\nother_muts_col:", other_muts_col
+    , "\ndrtype_col:", resistance_col)

-# df without NA for <drug>
-# merged_df2_comp
-# merged_df3_comp
-#===========================

 ###########################
 # Data for plots
@ -43,13 +62,8 @@ source("combining_two_df.R")
 # we lose some muts and at this level, we should use
 # as much info as available, hence use df with NA
 ###########################
-
-# uncomment as necessary
-#%%%%%%%%%%%%%%%%%%%%%%%%
 # REASSIGNMENT
 my_df  = merged_df2
-#my_df  = merged_df2_comp
-#%%%%%%%%%%%%%%%%%%%%%%%%

 # delete variables not required
 rm(my_df_u, merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
@ -63,7 +77,7 @@ is.factor(my_df$lineage)
 my_df$lineage = as.factor(my_df$lineage)
 is.factor(my_df$lineage)

-table(my_df$mutation_info); str(my_df$mutation_info)
+table(my_df$mutation_info)

 # subset df with dr muts only
 my_df_dr = subset(my_df, mutation_info == "dr_mutations_pyrazinamide") 
@ -74,59 +88,32 @@ table(my_df_dr$mutation_info)
 ########################################################################

 #==========================
-# Run two times: 
-# uncomment as necessary
-# 1) for all muts
-# 2) for dr_muts
-#===========================
-
-#%%%%%%%%%%%%%%%%%%%%%%%%
-# REASSIGNMENT
-
-#================
-# for ALL muts
-#================
-plot_df = my_df  
-my_plot_name = 'lineage_dist_PS.svg'
-
-plot_lineage_duet  =  paste0(plotdir,"/", my_plot_name)
-
-#my_plot_name = 'lineage_dist_PS_comp.svg'
-
-#================
-# for dr muts ONLY
-#================
-plot_df = my_df_dr 
-#my_plot_name = 'lineage_dist_dr_PS.svg'
-#my_plot_name = 'lineage_dist_dr_PS_comp.svg'
-my_plot_name = 'lineage_dist_drug_muts_PS.svg'
-
-plot_lineage_duet  =  paste0(plotdir,"/", my_plot_name)
-
-#%%%%%%%%%%%%%%%%%%%%%%%%
-
-#==========================
-# Plot: Lineage Distribution
+# Plot 1: ALL Muts
 # x = mcsm_values, y = dist
 # fill = stability
 #============================

+my_plot_name = 'lineage_dist_PS.svg'
+
+plot_lineage_duet  =  paste0(plotdir,"/", my_plot_name)
+
 #===================
 # Data for plots
 #===================
-table(plot_df$lineage); str(plot_df$lineage)
+table(my_df$lineage); str(my_df$lineage)

 # subset only lineages1-4
 sel_lineages = c("lineage1"
                 , "lineage2"
                 , "lineage3"
-                 , "lineage4")
+                 , "lineage4"
                 #, "lineage5"
                 #, "lineage6"
-                 #, "lineage7")
+                 #, "lineage7"
+                 )

 # uncomment as necessary
-df_lin = subset(plot_df, subset = lineage %in% sel_lineages )
+df_lin = subset(my_df, subset = lineage %in% sel_lineages )
 table(df_lin$lineage)

 # refactor
@ -139,16 +126,8 @@ table(df_lin$lineage)#{RESULT: No of samples within lineage}
 length(unique(df_lin$mutationinformation))#{Result: No. of unique mutations the 4 lineages contribute to}

 length(df_lin$mutationinformation)
-# sanity checks
-# FIXME
-r1 = 2:7 # when merged_df2 used: because there is missing lineages 
-if(sum(table(plot_df$lineage)[r1]) == nrow(df_lin)) {
-  print ("sanity check passed: numbers match")
-} else{
-  print("Error!: check your numbers")
-} 

-u2 = unique(plot_df$mutationinformation)
+u2 = unique(my_df$mutationinformation)
 u = unique(df_lin$mutationinformation)
 check = u2[!u2%in%u]; print(check) #{Muts not present within selected lineages}

@ -173,11 +152,101 @@ names(my_labels) = c('lineage1', 'lineage2', 'lineage3', 'lineage4'
                    # , 'lineage5', 'lineage6', 'lineage7'
                     )
 # check plot name
-my_plot_name
+plot_lineage_duet

 # output svg
-svg(plot_lineage_duet)
-printFile = ggplot(df, aes(x = duet_scaled
+#svg(plot_lineage_duet)
+p1 = ggplot(df, aes(x = duet_scaled
+                            , y = duet_outcome))+
+  
+  #printFile=geom_density_ridges_gradient(
+  geom_density_ridges_gradient(aes(fill = ..x..)
+                                , scale = 3
+                                , size = 0.3 ) +
+  facet_wrap( ~lineage
+              , scales = "free"
+             #, switch = 'x'
+              , labeller = labeller(lineage = my_labels) ) +
+  coord_cartesian( xlim = c(-1, 1)) +
+  scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4")
+                        , name = "DUET" ) + 
+  theme(axis.text.x = element_text(size = my_ats
+                                     , angle = 90
+                                     , hjust = 1
+                                     , vjust = 0.4)
+
+         , axis.text.y = element_blank()
+         , axis.title.x = element_blank()
+         , axis.title.y = element_blank()
+         , axis.ticks.y = element_blank()
+         , plot.title = element_blank()
+         , strip.text = element_text(size = my_als)
+         , legend.text = element_text(size = my_als-5)
+         , legend.title = element_text(size = my_als)
+) 
+
+print(p1)
+#dev.off()
+
+#######################################################################
+# lineage distribution plot for dr_muts
+#######################################################################
+
+#==========================
+# Plot 2: dr muts ONLY
+# x = mcsm_values, y = dist
+# fill = stability
+#============================
+
+my_plot_name_dr = 'lineage_dist_dr_muts_PS.svg'
+
+plot_lineage_dr_duet  =  paste0(plotdir,"/", my_plot_name_dr)
+
+#===================
+# Data for plots
+#===================
+table(my_df_dr$lineage); str(my_df_dr$lineage)
+
+# uncomment as necessary
+df_lin_dr = subset(my_df_dr, subset = lineage %in% sel_lineages)
+table(df_lin_dr$lineage)
+
+# refactor
+df_lin_dr$lineage = factor(df_lin_dr$lineage)
+
+sum(table(df_lin_dr$lineage)) #{RESULT: Total number of samples for lineage}
+
+table(df_lin_dr$lineage)#{RESULT: No of samples within lineage}
+
+length(unique(df_lin_dr$mutationinformation))#{Result: No. of unique mutations the 4 lineages contribute to}
+
+length(df_lin_dr$mutationinformation)
+
+u2 = unique(my_df_dr$mutationinformation)
+u = unique(df_lin_dr$mutationinformation)
+check = u2[!u2%in%u]; print(check) #{Muts not present within selected lineages}
+
+#%%%%%%%%%%%%%%%%%%%%%%%%%
+# REASSIGNMENT
+df_dr <- df_lin_dr
+#%%%%%%%%%%%%%%%%%%%%%%%%%
+
+rm(df_lin_dr)
+
+#******************
+# generate distribution plot of lineages
+#******************
+# 2 : ggridges (good!)
+my_ats = 15 # axis text size
+my_als = 20 # axis label size
+
+
+# check plot name
+plot_lineage_dr_duet
+
+# output svg
+#svg(plot_lineage_dr_duet)
+p2 = ggplot(df_dr, aes(x = duet_scaled
                           , y = duet_outcome))+
  
  #printFile=geom_density_ridges_gradient(
@ -198,35 +267,31 @@ printFile = ggplot(df, aes(x = duet_scaled
                                   , angle = 90
                                   , hjust = 1
                                   , vjust = 0.4)
-#                  , axis.text.y = element_text(size = my_ats
-#                                                , angle = 0
-#                                                , hjust = 1
-#                                                , vjust = 0)
        , axis.text.y = element_blank()
        , axis.title.x = element_blank()
        , axis.title.y = element_blank()
        , axis.ticks.y = element_blank()
        , plot.title = element_blank()
        , strip.text = element_text(size = my_als)
-         , legend.text = element_text(size = 10)
-         , legend.title = element_text(size = my_als)
-#                  , legend.position = c(0.3, 0.8)
-#                  , legend.key.height = unit(1, 'mm')
+        #, legend.text = element_text(size = 10)
+        #, legend.title = element_text(size = my_als)
+        , legend.position = "none"
  ) 

+print(p2)
+#dev.off()
+#====================
+#combine
+#=======
+# output
+#=======
+lineage_dist_combined = "lineage_dist_combined_PS.svg"
+plot_lineage_dist_combined  =  paste0(plotdir,"/", lineage_dist_combined)
+
+svg(plot_lineage_dist_combined, width = 16, height = 12)
+
+printFile = cowplot::plot_grid(p1, p2
+                               , label_size = my_als+10)
+
 print(printFile)
 dev.off()
-
-#=!=!=!=!=!=!=!
-# COMMENT: Not much differences in the distributions
-# when using merged_df2 or merged_df2_comp.
-# Also, the lineage differences disappear when looking at all muts
-# The pattern we are interested in is possibly only for dr_mutations
-#=!=!=!=!=!=!=!
-#===================================================
-
-# COMPARING DISTRIBUTIONS: KS test 
-# run: "../KS_test_PS.R"
-
-
-