generated lineage dist plots combined. needs tweaking

2020-09-09 12:53:53 +01:00 · 2020-09-09 12:53:53 +01:00 · 19a984f228
commit 19a984f228
parent 31b98fb3d3
3 changed files with 191 additions and 119 deletions
--- a/scripts/plotting/combining_dfs_plotting.R
+++ b/scripts/plotting/combining_dfs_plotting.R
@ -48,6 +48,8 @@ cat("Variables imported:"
    , "\ndrug:", drug
    , "\ngene:", gene
    , "\ngene_match:", gene_match
    , "\nAngstrom symbol:", angstroms_symbol
    , "\nNo. of duplicated muts:", dup_muts_nu
    , "\ndr_muts_col:", dr_muts_col
    , "\nother_muts_col:", other_muts_col
    , "\ndrtype_col:", resistance_col)
--- a/scripts/plotting/lineage_basic_barplot.R
+++ b/scripts/plotting/lineage_basic_barplot.R
@ -30,13 +30,15 @@ source("combining_dfs_plotting.R")
 # 9) my_df_u
 # 10) my_df_u_lig
-cat(paste0("Directories imported:"
+cat("Directories imported:"
    , "\n===================="
    , "\ndatadir:", datadir
    , "\nindir:", indir
    , "\noutdir:", outdir
-           , "\nplotdir:", plotdir))
+    , "\nplotdir:", plotdir)
-cat(paste0("Variables imported:"
+cat("Variables imported:"
    , "\n====================="
    , "\ndrug:", drug
    , "\ngene:", gene
    , "\ngene_match:", gene_match
@ -44,7 +46,11 @@ cat(paste0("Variables imported:"
    , "\nNo. of duplicated muts:", dup_muts_nu
    , "\nNA count for ORs:", na_count
    , "\nNA count in df2:", na_count_df2
-           , "\nNA count in df3:", na_count_df3))       
+    , "\nNA count in df3:", na_count_df3
    , "\ndr_muts_col:", dr_muts_col
    , "\nother_muts_col:", other_muts_col
    , "\ndrtype_col:", resistance_col)
 #===========
 # input
@ -67,7 +73,6 @@ plot_basic_bp_lineage  =  paste0(plotdir,"/", basic_bp_lineage)
 #================
 # REASSIGNMENT as necessary
 my_df  = merged_df2
 #my_df  = merged_df2_comp
 # clear excess variable
 rm(merged_df2_comp, merged_df3, merged_df3_comp)
--- a/scripts/plotting/lineage_dist_PS.R
+++ b/scripts/plotting/lineage_dist_PS.R
@ -3,36 +3,55 @@ getwd()
 setwd("~/git/LSHTM_analysis/scripts/plotting/")
 getwd()
-########################################################################
+#########################################################
-# 				Installing and loading required packages 			   #
+# TASK: Lineage dist plots: ggridges
 ########################################################################
-source("../Header_TT.R")
+# Output: 2 SVGs for PS stability
 #source("../barplot_colour_function.R")
 #require(data.table)
-########################################################################
+# 1) all muts
-#		 Read file: call script for combining df for PS			   	   #
+# 2) dr_muts
 ########################################################################
-source("combining_two_df.R")
+##########################################################
 # 				Installing and loading required packages 			 
 ##########################################################
-#---------------------- PAY ATTENTION
+source("combining_dfs_plotting.R")
-# the above changes the working dir
+# PS combined: 
-#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+# 1) merged_df2
-#---------------------- PAY ATTENTION
+# 2) merged_df2_comp
 # 3) merged_df3
 # 4) merged_df3_comp
-#==========================
+# LIG combined: 
-# This will return:
+# 5) merged_df2_lig
 # 6) merged_df2_comp_lig
 # 7) merged_df3_lig
 # 8) merged_df3_comp_lig
-# df with NA for <drug>
+# 9) my_df_u
-# merged_df2
+# 10) my_df_u_lig
-# merged_df3
+
 cat("Directories imported:"
    , "\n===================="
    , "\ndatadir:", datadir
    , "\nindir:", indir
    , "\noutdir:", outdir
    , "\nplotdir:", plotdir)
 cat("Variables imported:"
    , "\n====================="
    , "\ndrug:", drug
    , "\ngene:", gene
    , "\ngene_match:", gene_match
    , "\nAngstrom symbol:", angstroms_symbol
    , "\nNo. of duplicated muts:", dup_muts_nu
    , "\nNA count for ORs:", na_count
    , "\nNA count in df2:", na_count_df2
    , "\nNA count in df3:", na_count_df3
    , "\ndr_muts_col:", dr_muts_col
    , "\nother_muts_col:", other_muts_col
    , "\ndrtype_col:", resistance_col)
 # df without NA for <drug>
 # merged_df2_comp
 # merged_df3_comp
 #===========================
 ###########################
 # Data for plots
@ -43,13 +62,8 @@ source("combining_two_df.R")
 # we lose some muts and at this level, we should use
 # as much info as available, hence use df with NA
 ###########################
 # uncomment as necessary
 #%%%%%%%%%%%%%%%%%%%%%%%%
 # REASSIGNMENT
 my_df  = merged_df2
 #my_df  = merged_df2_comp
 #%%%%%%%%%%%%%%%%%%%%%%%%
 # delete variables not required
 rm(my_df_u, merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
@ -63,7 +77,7 @@ is.factor(my_df$lineage)
 my_df$lineage = as.factor(my_df$lineage)
 is.factor(my_df$lineage)
-table(my_df$mutation_info); str(my_df$mutation_info)
+table(my_df$mutation_info)
 # subset df with dr muts only
 my_df_dr = subset(my_df, mutation_info == "dr_mutations_pyrazinamide") 
@ -74,59 +88,32 @@ table(my_df_dr$mutation_info)
 ########################################################################
 #==========================
-# Run two times: 
+# Plot 1: ALL Muts
 # uncomment as necessary
 # 1) for all muts
 # 2) for dr_muts
 #===========================
 #%%%%%%%%%%%%%%%%%%%%%%%%
 # REASSIGNMENT
 #================
 # for ALL muts
 #================
 plot_df = my_df  
 my_plot_name = 'lineage_dist_PS.svg'
 plot_lineage_duet  =  paste0(plotdir,"/", my_plot_name)
 #my_plot_name = 'lineage_dist_PS_comp.svg'
 #================
 # for dr muts ONLY
 #================
 plot_df = my_df_dr 
 #my_plot_name = 'lineage_dist_dr_PS.svg'
 #my_plot_name = 'lineage_dist_dr_PS_comp.svg'
 my_plot_name = 'lineage_dist_drug_muts_PS.svg'
 plot_lineage_duet  =  paste0(plotdir,"/", my_plot_name)
 #%%%%%%%%%%%%%%%%%%%%%%%%
 #==========================
 # Plot: Lineage Distribution
 # x = mcsm_values, y = dist
 # fill = stability
 #============================
 my_plot_name = 'lineage_dist_PS.svg'
 plot_lineage_duet  =  paste0(plotdir,"/", my_plot_name)
 #===================
 # Data for plots
 #===================
-table(plot_df$lineage); str(plot_df$lineage)
+table(my_df$lineage); str(my_df$lineage)
 # subset only lineages1-4
 sel_lineages = c("lineage1"
                 , "lineage2"
                 , "lineage3"
-                 , "lineage4")
+                 , "lineage4"
                 #, "lineage5"
                 #, "lineage6"
-                 #, "lineage7")
+                 #, "lineage7"
                 )
 # uncomment as necessary
-df_lin = subset(plot_df, subset = lineage %in% sel_lineages )
+df_lin = subset(my_df, subset = lineage %in% sel_lineages )
 table(df_lin$lineage)
 # refactor
@ -139,16 +126,8 @@ table(df_lin$lineage)#{RESULT: No of samples within lineage}
 length(unique(df_lin$mutationinformation))#{Result: No. of unique mutations the 4 lineages contribute to}
 length(df_lin$mutationinformation)
 # sanity checks
 # FIXME
 r1 = 2:7 # when merged_df2 used: because there is missing lineages 
 if(sum(table(plot_df$lineage)[r1]) == nrow(df_lin)) {
  print ("sanity check passed: numbers match")
 } else{
  print("Error!: check your numbers")
 } 
-u2 = unique(plot_df$mutationinformation)
+u2 = unique(my_df$mutationinformation)
 u = unique(df_lin$mutationinformation)
 check = u2[!u2%in%u]; print(check) #{Muts not present within selected lineages}
@ -173,11 +152,11 @@ names(my_labels) = c('lineage1', 'lineage2', 'lineage3', 'lineage4'
                    # , 'lineage5', 'lineage6', 'lineage7'
                     )
 # check plot name
-my_plot_name
+plot_lineage_duet
 # output svg
-svg(plot_lineage_duet)
+#svg(plot_lineage_duet)
-printFile = ggplot(df, aes(x = duet_scaled
+p1 = ggplot(df, aes(x = duet_scaled
                            , y = duet_outcome))+
  #printFile=geom_density_ridges_gradient(
@ -186,47 +165,133 @@ printFile = ggplot(df, aes(x = duet_scaled
                                , size = 0.3 ) +
  facet_wrap( ~lineage
              , scales = "free"
-#             , switch = 'x'
+             #, switch = 'x'
              , labeller = labeller(lineage = my_labels) ) +
-  coord_cartesian( xlim = c(-1, 1)
+  coord_cartesian( xlim = c(-1, 1)) +
 #                   , ylim = c(0, 6)
 #                   , clip = "off" 
 ) +
  scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4")
                        , name = "DUET" ) + 
  theme(axis.text.x = element_text(size = my_ats
                                     , angle = 90
                                     , hjust = 1
                                     , vjust = 0.4)
-#                  , axis.text.y = element_text(size = my_ats
+
 #                                                , angle = 0
 #                                                , hjust = 1
 #                                                , vjust = 0)
         , axis.text.y = element_blank()
         , axis.title.x = element_blank()
         , axis.title.y = element_blank()
         , axis.ticks.y = element_blank()
         , plot.title = element_blank()
         , strip.text = element_text(size = my_als)
-         , legend.text = element_text(size = 10)
+         , legend.text = element_text(size = my_als-5)
         , legend.title = element_text(size = my_als)
-#                  , legend.position = c(0.3, 0.8)
+) 
-#                  , legend.key.height = unit(1, 'mm')
+
 print(p1)
 #dev.off()
 #######################################################################
 # lineage distribution plot for dr_muts
 #######################################################################
 #==========================
 # Plot 2: dr muts ONLY
 # x = mcsm_values, y = dist
 # fill = stability
 #============================
 my_plot_name_dr = 'lineage_dist_dr_muts_PS.svg'
 plot_lineage_dr_duet  =  paste0(plotdir,"/", my_plot_name_dr)
 #===================
 # Data for plots
 #===================
 table(my_df_dr$lineage); str(my_df_dr$lineage)
 # uncomment as necessary
 df_lin_dr = subset(my_df_dr, subset = lineage %in% sel_lineages)
 table(df_lin_dr$lineage)
 # refactor
 df_lin_dr$lineage = factor(df_lin_dr$lineage)
 sum(table(df_lin_dr$lineage)) #{RESULT: Total number of samples for lineage}
 table(df_lin_dr$lineage)#{RESULT: No of samples within lineage}
 length(unique(df_lin_dr$mutationinformation))#{Result: No. of unique mutations the 4 lineages contribute to}
 length(df_lin_dr$mutationinformation)
 u2 = unique(my_df_dr$mutationinformation)
 u = unique(df_lin_dr$mutationinformation)
 check = u2[!u2%in%u]; print(check) #{Muts not present within selected lineages}
 #%%%%%%%%%%%%%%%%%%%%%%%%%
 # REASSIGNMENT
 df_dr <- df_lin_dr
 #%%%%%%%%%%%%%%%%%%%%%%%%%
 rm(df_lin_dr)
 #******************
 # generate distribution plot of lineages
 #******************
 # 2 : ggridges (good!)
 my_ats = 15 # axis text size
 my_als = 20 # axis label size
 # check plot name
 plot_lineage_dr_duet
 # output svg
 #svg(plot_lineage_dr_duet)
 p2 = ggplot(df_dr, aes(x = duet_scaled
                           , y = duet_outcome))+
  #printFile=geom_density_ridges_gradient(
  geom_density_ridges_gradient(aes(fill = ..x..)
                               , scale = 3
                               , size = 0.3 ) +
  facet_wrap( ~lineage
              , scales = "free"
              #, switch = 'x'
              , labeller = labeller(lineage = my_labels) ) +
  coord_cartesian( xlim = c(-1, 1)
                   #, ylim = c(0, 6)
                   #, clip = "off" 
  ) +
  scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4")
                       , name = "DUET" ) + 
  theme(axis.text.x = element_text(size = my_ats
                                   , angle = 90
                                   , hjust = 1
                                   , vjust = 0.4)
        , axis.text.y = element_blank()
        , axis.title.x = element_blank()
        , axis.title.y = element_blank()
        , axis.ticks.y = element_blank()
        , plot.title = element_blank()
        , strip.text = element_text(size = my_als)
        #, legend.text = element_text(size = 10)
        #, legend.title = element_text(size = my_als)
        , legend.position = "none"
  ) 
 print(p2)
 #dev.off()
 #====================
 #combine
 #=======
 # output
 #=======
 lineage_dist_combined = "lineage_dist_combined_PS.svg"
 plot_lineage_dist_combined  =  paste0(plotdir,"/", lineage_dist_combined)
 svg(plot_lineage_dist_combined, width = 16, height = 12)
 printFile = cowplot::plot_grid(p1, p2
                               , label_size = my_als+10)
 print(printFile)
 dev.off()
 #=!=!=!=!=!=!=!
 # COMMENT: Not much differences in the distributions
 # when using merged_df2 or merged_df2_comp.
 # Also, the lineage differences disappear when looking at all muts
 # The pattern we are interested in is possibly only for dr_mutations
 #=!=!=!=!=!=!=!
 #===================================================
 # COMPARING DISTRIBUTIONS: KS test 
 # run: "../KS_test_PS.R"