script to plot lineage dist plots

2020-09-04 22:40:49 +01:00 · 2020-09-04 22:40:49 +01:00 · dd1158a66c
commit dd1158a66c
parent 645868ea27
1 changed files with 232 additions and 0 deletions
--- a/scripts/plotting/lineage_dist_PS.R
+++ b/scripts/plotting/lineage_dist_PS.R
@ -0,0 +1,232 @@
 #!/usr/bin/env Rscript       
 getwd()
 setwd("~/git/LSHTM_analysis/scripts/plotting/")
 getwd()
 ########################################################################
 # 				Installing and loading required packages 			   #
 ########################################################################
 source("../Header_TT.R")
 #source("../barplot_colour_function.R")
 #require(data.table)
 ########################################################################
 #		 Read file: call script for combining df for PS			   	   #
 ########################################################################
 source("combining_two_df.R")
 #---------------------- PAY ATTENTION
 # the above changes the working dir
 #[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
 #---------------------- PAY ATTENTION
 #==========================
 # This will return:
 # df with NA for <drug>
 # merged_df2
 # merged_df3
 # df without NA for <drug>
 # merged_df2_comp
 # merged_df3_comp
 #===========================
 ###########################
 # Data for plots
 # you need merged_df2 or merged_df2_comp
 # since this is one-many relationship 
 # i.e the same SNP can belong to multiple lineages
 # using the _comp dataset means
 # we lose some muts and at this level, we should use
 # as much info as available, hence use df with NA
 ###########################
 # uncomment as necessary
 #%%%%%%%%%%%%%%%%%%%%%%%%
 # REASSIGNMENT
 my_df  = merged_df2
 #my_df  = merged_df2_comp
 #%%%%%%%%%%%%%%%%%%%%%%%%
 # delete variables not required
 rm(my_df_u, merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
 # quick checks
 colnames(my_df)
 str(my_df)
 # Ensure correct data type in columns to plot: need to be factor
 is.factor(my_df$lineage)
 my_df$lineage = as.factor(my_df$lineage)
 is.factor(my_df$lineage)
 table(my_df$mutation_info); str(my_df$mutation_info)
 # subset df with dr muts only
 my_df_dr = subset(my_df, mutation_info == "dr_mutations_pyrazinamide") 
 table(my_df_dr$mutation_info)
 ########################################################################
 #               end of data extraction and cleaning for plots          #
 ########################################################################
 #==========================
 # Run two times: 
 # uncomment as necessary
 # 1) for all muts
 # 2) for dr_muts
 #===========================
 #%%%%%%%%%%%%%%%%%%%%%%%%
 # REASSIGNMENT
 #================
 # for ALL muts
 #================
 plot_df = my_df  
 my_plot_name = 'lineage_dist_PS.svg'
 plot_lineage_duet  =  paste0(plotdir,"/", my_plot_name)
 #my_plot_name = 'lineage_dist_PS_comp.svg'
 #================
 # for dr muts ONLY
 #================
 plot_df = my_df_dr 
 #my_plot_name = 'lineage_dist_dr_PS.svg'
 #my_plot_name = 'lineage_dist_dr_PS_comp.svg'
 my_plot_name = 'lineage_dist_drug_muts_PS.svg'
 plot_lineage_duet  =  paste0(plotdir,"/", my_plot_name)
 #%%%%%%%%%%%%%%%%%%%%%%%%
 #==========================
 # Plot: Lineage Distribution
 # x = mcsm_values, y = dist
 # fill = stability
 #============================
 #===================
 # Data for plots
 #===================
 table(plot_df$lineage); str(plot_df$lineage)
 # subset only lineages1-4
 sel_lineages = c("lineage1"
                 , "lineage2"
                 , "lineage3"
                 , "lineage4")
                 #, "lineage5"
                 #, "lineage6"
                 #, "lineage7")
 # uncomment as necessary
 df_lin = subset(plot_df, subset = lineage %in% sel_lineages )
 table(df_lin$lineage)
 # refactor
 df_lin$lineage = factor(df_lin$lineage)
 sum(table(df_lin$lineage)) #{RESULT: Total number of samples for lineage}
 table(df_lin$lineage)#{RESULT: No of samples within lineage}
 length(unique(df_lin$mutationinformation))#{Result: No. of unique mutations the 4 lineages contribute to}
 length(df_lin$mutationinformation)
 # sanity checks
 # FIXME
 r1 = 2:7 # when merged_df2 used: because there is missing lineages 
 if(sum(table(plot_df$lineage)[r1]) == nrow(df_lin)) {
  print ("sanity check passed: numbers match")
 } else{
  print("Error!: check your numbers")
 } 
 u2 = unique(plot_df$mutationinformation)
 u = unique(df_lin$mutationinformation)
 check = u2[!u2%in%u]; print(check) #{Muts not present within selected lineages}
 #%%%%%%%%%%%%%%%%%%%%%%%%%
 # REASSIGNMENT
 df <- df_lin
 #%%%%%%%%%%%%%%%%%%%%%%%%%
 rm(df_lin)
 #******************
 # generate distribution plot of lineages
 #******************
 # 2 : ggridges (good!)
 my_ats = 15 # axis text size
 my_als = 20 # axis label size
 my_labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4'
              #, 'Lineage 5', 'Lineage 6', 'Lineage 7'
              )
 names(my_labels) = c('lineage1', 'lineage2', 'lineage3', 'lineage4'
                    # , 'lineage5', 'lineage6', 'lineage7'
                     )
 # check plot name
 my_plot_name
 # output svg
 svg(plot_lineage_duet)
 printFile = ggplot(df, aes(x = duet_scaled
                            , y = duet_outcome))+
  #printFile=geom_density_ridges_gradient(
  geom_density_ridges_gradient(aes(fill = ..x..)
                                , scale = 3
                                , size = 0.3 ) +
  facet_wrap( ~lineage
              , scales = "free"
 #             , switch = 'x'
              , labeller = labeller(lineage = my_labels) ) +
  coord_cartesian( xlim = c(-1, 1)
 #                   , ylim = c(0, 6)
 #                   , clip = "off" 
 ) +
  scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4")
                        , name = "DUET" ) + 
  theme(axis.text.x = element_text(size = my_ats
                                     , angle = 90
                                     , hjust = 1
                                     , vjust = 0.4)
 #                  , axis.text.y = element_text(size = my_ats
 #                                                , angle = 0
 #                                                , hjust = 1
 #                                                , vjust = 0)
         , axis.text.y = element_blank()
         , axis.title.x = element_blank()
         , axis.title.y = element_blank()
         , axis.ticks.y = element_blank()
         , plot.title = element_blank()
         , strip.text = element_text(size = my_als)
         , legend.text = element_text(size = 10)
         , legend.title = element_text(size = my_als)
 #                  , legend.position = c(0.3, 0.8)
 #                  , legend.key.height = unit(1, 'mm')
  ) 
 print(printFile)
 dev.off()
 #=!=!=!=!=!=!=!
 # COMMENT: Not much differences in the distributions
 # when using merged_df2 or merged_df2_comp.
 # Also, the lineage differences disappear when looking at all muts
 # The pattern we are interested in is possibly only for dr_mutations
 #=!=!=!=!=!=!=!
 #===================================================
 # COMPARING DISTRIBUTIONS: KS test 
 # run: "../KS_test_PS.R"