script to plot lineage dist plots

2020-09-04 22:40:49 +01:00 · 2020-09-04 22:40:49 +01:00 · dd1158a66c
commit dd1158a66c
parent 645868ea27
1 changed files with 232 additions and 0 deletions
--- a/scripts/plotting/lineage_dist_PS.R
+++ b/scripts/plotting/lineage_dist_PS.R
@ -0,0 +1,232 @@
+#!/usr/bin/env Rscript
+getwd()   # printed when run via Rscript: the directory the script was launched from
+setwd("~/git/LSHTM_analysis/scripts/plotting/")   # hard-coded; the relative source() paths below resolve from here
+getwd()   # printed again to confirm the change
+
+########################################################################
+# Load required packages                                               #
+########################################################################
+# Header_TT.R is not visible here; presumably it attaches ggplot2 and ggridges, which the plot further down calls -- TODO confirm
+source("../Header_TT.R")
+#source("../barplot_colour_function.R")
+#require(data.table)
+
+########################################################################
+# Read data: run the script that combines the two data frames for PS   #
+########################################################################
+# presumably defines merged_df2/merged_df3 (and the *_comp variants) plus plotdir, all used below -- verify against combining_two_df.R
+source("combining_two_df.R")
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA for <drug>
+# merged_df2
+# merged_df3
+
+# df without NA for <drug>
+# merged_df2_comp
+# merged_df3_comp
+#===========================
+
+###########################
+# Data for plots
+# you need merged_df2 or merged_df2_comp
+# since this is a one-to-many relationship,
+# i.e. the same SNP can belong to multiple lineages.
+# Using the _comp dataset means
+# we lose some muts, and at this level we should use
+# as much info as available, hence use the df with NA
+###########################
+
+# pick the data frame to plot from; swap in the commented line for the complete-cases version
+#%%%%%%%%%%%%%%%%%%%%%%%%
+# REASSIGNMENT
+my_df <- merged_df2
+#my_df <- merged_df2_comp
+#%%%%%%%%%%%%%%%%%%%%%%%%
+
+# drop the objects that are no longer needed
+rm(list = c("my_df_u", "merged_df2", "merged_df2_comp", "merged_df3", "merged_df3_comp"))
+
+# quick look at the columns and their types
+print(colnames(my_df))
+str(my_df)
+
+# lineage has to be a factor for plotting: report, convert, report again
+print(is.factor(my_df[["lineage"]]))
+my_df[["lineage"]] <- as.factor(my_df[["lineage"]])
+print(is.factor(my_df[["lineage"]]))
+# counts per mutation class, then the structure of that column
+print(table(my_df[["mutation_info"]]))
+str(my_df[["mutation_info"]])
+# keep only the rows labelled as pyrazinamide drug-resistance mutations
+my_df_dr <- subset(my_df, mutation_info == "dr_mutations_pyrazinamide")
+print(table(my_df_dr[["mutation_info"]]))
+
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#==========================
+# Run two times, flipping dr_muts_only:
+# 1) FALSE -> all muts
+# 2) TRUE  -> dr muts only (default)
+# Exactly one branch runs, so neither setting can silently
+# overwrite the other.
+#===========================
+
+#%%%%%%%%%%%%%%%%%%%%%%%%
+# REASSIGNMENT
+dr_muts_only <- TRUE
+
+if (dr_muts_only) {
+  #================
+  # for dr muts ONLY
+  #================
+  plot_df <- my_df_dr
+  #my_plot_name = 'lineage_dist_dr_PS.svg'
+  #my_plot_name = 'lineage_dist_dr_PS_comp.svg'
+  my_plot_name <- 'lineage_dist_drug_muts_PS.svg'
+} else {
+  #================
+  # for ALL muts
+  #================
+  plot_df <- my_df
+  #my_plot_name = 'lineage_dist_PS_comp.svg'
+  my_plot_name <- 'lineage_dist_PS.svg'
+}
+# output path of the svg; plotdir must already exist (it is not assigned in this script)
+plot_lineage_duet <- paste0(plotdir, "/", my_plot_name)
+#%%%%%%%%%%%%%%%%%%%%%%%%
+
+#==========================
+# Plot: Lineage Distribution
+# x = duet_scaled, y = duet_outcome
+# fill = x value, one facet per lineage
+#============================
+
+#===================
+# Data for plots
+#===================
+# lineage counts and structure before any filtering
+print(table(plot_df[["lineage"]]))
+str(plot_df[["lineage"]])
+
+# lineages kept for the plot (1-4); widen 1:4 to bring in lineage5-7
+sel_lineages <- paste0("lineage", 1:4)
+
+# rows that belong to the selected lineages
+df_lin <- subset(plot_df, subset = lineage %in% sel_lineages)
+print(table(df_lin[["lineage"]]))
+
+# rebuild the factor so that levels with no rows left are dropped
+df_lin[["lineage"]] <- factor(df_lin[["lineage"]])
+
+#{RESULT: Total number of samples for lineage}
+print(sum(table(df_lin[["lineage"]])))
+
+#{RESULT: No of samples within lineage}
+print(table(df_lin[["lineage"]]))
+
+#{Result: No. of unique mutations the 4 lineages contribute to}
+print(length(unique(df_lin$mutationinformation)))
+
+# number of mutation entries, repeats included
+print(length(df_lin$mutationinformation))
+# sanity check: df_lin rows must equal plot_df's count for the selected lineages,
+# matched by level name (a positional index such as 2:7 depends on level order)
+lineage_counts <- table(plot_df$lineage)
+r1 <- names(lineage_counts) %in% sel_lineages
+if (sum(lineage_counts[r1]) == nrow(df_lin)) {
+  print ("sanity check passed: numbers match")
+} else {
+  print("Error!: check your numbers")
+}
+u2 <- unique(plot_df$mutationinformation) # muts before the lineage filter
+u <- unique(df_lin$mutationinformation)   # muts after the lineage filter
+check <- u2[!u2 %in% u]; print(check) #{Muts not present within selected lineages}
+
+#%%%%%%%%%%%%%%%%%%%%%%%%%
+# REASSIGNMENT: plot from df, then drop the intermediate copy
+df <- df_lin
+rm(df_lin)
+#%%%%%%%%%%%%%%%%%%%%%%%%%
+
+#******************
+# distribution plot of lineages, drawn with ggridges
+#******************
+my_ats <- 15 # axis text size
+my_als <- 20 # axis label size
+
+# facet strip titles, keyed by the values of the lineage column
+# (widen 1:4 to 1:7 to label lineage5-7 as well)
+my_labels <- setNames(paste("Lineage", 1:4), paste0("lineage", 1:4))
+
+# check plot name
+print(my_plot_name)
+
+# open the svg device on the output path
+svg(plot_lineage_duet)
+
+printFile <- ggplot(df, aes(x = duet_scaled, y = duet_outcome)) +
+  # ridgelines filled along a gradient of the x value
+  geom_density_ridges_gradient(
+    aes(fill = ..x..),
+    scale = 3,
+    size = 0.3
+  ) +
+  # one panel per lineage, each with its own axis ranges
+  facet_wrap(
+    ~lineage,
+    scales = "free",
+    # switch = 'x',
+    labeller = labeller(lineage = my_labels)
+  ) +
+  # restrict the visible x range to [-1, 1]
+  coord_cartesian(
+    xlim = c(-1, 1)
+    # , ylim = c(0, 6)
+    # , clip = "off"
+  ) +
+  scale_fill_gradientn(
+    colours = c("#f8766d", "white", "#00bfc4"),
+    name = "DUET"
+  ) +
+  theme(
+    axis.text.x = element_text(size = my_ats, angle = 90, hjust = 1, vjust = 0.4),
+    # axis.text.y = element_text(size = my_ats, angle = 0, hjust = 1, vjust = 0),
+    axis.text.y = element_blank(),
+    axis.title.x = element_blank(),
+    axis.title.y = element_blank(),
+    axis.ticks.y = element_blank(),
+    plot.title = element_blank(),
+    strip.text = element_text(size = my_als),
+    legend.text = element_text(size = 10),
+    legend.title = element_text(size = my_als)
+    # , legend.position = c(0.3, 0.8)
+    # , legend.key.height = unit(1, 'mm')
+  )
+
+# draw the plot into the open device, then close the device
+print(printFile)
+dev.off()
+
+#=!=!=!=!=!=!=!
+# COMMENT: Not much difference in the distributions
+# when using merged_df2 or merged_df2_comp.
+# Also, the lineage differences disappear when looking at all muts.
+# The pattern we are interested in is possibly only for dr_mutations.
+#=!=!=!=!=!=!=!
+#===================================================
+
+# COMPARING DISTRIBUTIONS: KS test 
+# run: "../KS_test_PS.R"
+
+
+