import commit

2020-01-08 16:15:33 +00:00 · 2020-01-08 16:15:33 +00:00 · bccfe68192
commit bccfe68192
39 changed files with 6837 additions and 0 deletions
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R
@ -0,0 +1,212 @@
+getwd() # working directory before the switch
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") # thinkpad (hard-coded, machine-specific path)
+getwd() # working directory after the switch
+# NOTE: the "../" paths given to source() below are resolved against the directory set above
+########################################################################
+#              Installing and loading required packages               #
+########################################################################
+# Header_TT.R sits one directory above this one; presumably it installs/loads the packages used here -- confirm in that script
+source("../Header_TT.R")
+#source("barplot_colour_function.R")
+#require(data.table)
+
+########################################################################
+#         Read file: call script for combining df for PS              #
+########################################################################
+# expected to create merged_df2 / merged_df3 and their *_comp variants, which the code further down reads
+source("../combining_two_df.R")
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#===========================
+
+###########################
+# Data for plots
+# you need merged_df2 or merged_df2_comp
+# since this is a one-to-many relationship,
+# i.e. the same SNP can belong to multiple lineages
+###########################
+
+# uncomment as necessary
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+my_df <- merged_df2         # data frame that still contains NA
+#my_df <- merged_df2_comp   # alternative: data frame without NA
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# the merged data frames are not needed any more once my_df is set
+rm(list = c("merged_df2", "merged_df2_comp", "merged_df3", "merged_df3_comp"))
+
+# inspect what was picked up: column names and structure
+colnames(my_df)
+str(my_df)
+
+# lineage is a column to plot and therefore has to be a factor:
+# check, convert, check again
+is.factor(my_df$lineage)
+my_df$lineage <- as.factor(my_df$lineage)
+is.factor(my_df$lineage)
+
+# number of rows per mutation_info category
+table(my_df$mutation_info)
+
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#==========================
+# Plot: Lineage Distribution
+# x = mcsm_values, y = dist
+# fill = stability
+#============================
+
+#===================
+# Data for plots
+#===================
+
+# subset only lineages1-4
+# lineages kept for the distribution plots and the tests further down
+sel_lineages <- c("lineage1"
+                  , "lineage2"
+                  , "lineage3"
+                  , "lineage4")
+
+# keep only the rows that belong to one of the selected lineages
+df_lin <- subset(my_df, subset = lineage %in% sel_lineages )
+
+# refactor: drop the levels that no longer occur after subsetting
+df_lin$lineage <- factor(df_lin$lineage)
+
+table(df_lin$lineage) #{RESULT: No of samples within lineage}
+#lineage1 lineage2 lineage3 lineage4 
+#104     1293      264     1311 
+
+# when merged_df2_comp is used
+#lineage1 lineage2 lineage3 lineage4 
+#99     1275      263     1255
+
+length(unique(df_lin$Mutationinformation))
+#{Result: No. of unique mutations the 4 lineages contribute to}
+
+# sanity check: the per-lineage counts in the full data must add up to the
+# number of rows in the subset.
+# The counts are selected by level NAME, not by position: a fixed position
+# range such as 2:5 is only right when exactly one extra level (the missing
+# lineages in merged_df2) sorts ahead of lineage1, and compares the wrong
+# counts as soon as the input has a different set of levels.
+lineage_counts <- table(my_df$lineage)
+n_expected <- sum(lineage_counts[names(lineage_counts) %in% sel_lineages])
+
+if (n_expected == nrow(df_lin)) {
+  print("sanity check passed: numbers match")
+} else {
+  print("Error!: check your numbers")
+}
+
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+df <- df_lin
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+rm(df_lin)
+
+#******************
+# generate distribution plot of lineages
+#******************
+# basic: could improve this!
+library(plotly)    # ggplotly()
+library(ggridges)  # loaded here, next to plotly
+
+# one density curve per DUET outcome, one panel per lineage,
+# each panel with its own axis ranges
+g <- ggplot(data = df, mapping = aes(x = ratioDUET)) +
+  geom_density(mapping = aes(fill = DUET_outcome), alpha = 0.5) +
+  facet_wrap(~ lineage, scales = "free") +
+  ggtitle("Kernel Density estimates of Protein stability by lineage")
+
+# hand the ggplot object to plotly
+ggplotly(g)
+
+# 2 : ggridges (good!)
+
+# text sizes picked up by theme() in the ridge plot
+my_ats <- 15 # axis text size
+my_als <- 20 # axis label size
+
+# facet strip labels, keyed by the lineage level they stand for
+fooNames <- c(lineage1 = 'Lineage 1'
+              , lineage2 = 'Lineage 2'
+              , lineage3 = 'Lineage 3'
+              , lineage4 = 'Lineage 4')
+
+# the svg is written with a relative file name, so switch to the
+# output dir for plots first
+getwd()
+setwd("~/git/Data/pyrazinamide/output/plots")
+getwd()
+
+# Ridge plot of ratioDUET for each DUET outcome, one panel per lineage,
+# written to lineage_dist_PS.svg in the current working directory.
+# The plot object is assembled BEFORE the svg device is opened, and the
+# device is closed in a `finally` clause, so an error while building or
+# rendering the plot cannot leave the device open.
+printFile = ggplot( df, aes(x = ratioDUET
+                            , y = DUET_outcome) )+
+  
+  #printFile=geom_density_ridges_gradient(
+  # ridges are filled according to their x position
+  geom_density_ridges_gradient( aes(fill = ..x..)
+                                , scale = 3
+                                , size = 0.3 ) +
+  # one panel per lineage, strips relabelled through fooNames
+  facet_wrap( ~lineage
+              , scales = "free"
+#             , switch = 'x'
+              , labeller = labeller(lineage = fooNames) ) +
+  # show the x range -1 to 1 only
+  coord_cartesian( xlim = c(-1, 1)
+#                  , ylim = c(0, 6)
+#                  , clip = "off" 
+                ) +
+  scale_fill_gradientn( colours = c("#f8766d", "white", "#00bfc4")
+                        , name = "DUET" ) + 
+  # y axis text, ticks and titles as well as the plot title are blanked
+  theme( axis.text.x = element_text( size = my_ats
+                                     , angle = 90
+                                     , hjust = 1
+                                     , vjust = 0.4)
+#         , axis.text.y = element_text( size = my_ats
+#                                       , angle = 0
+#                                       , hjust = 1
+#                                       , vjust = 0)
+         , axis.text.y = element_blank()
+         , axis.title.x = element_blank()
+         , axis.title.y = element_blank()
+         , axis.ticks.y = element_blank()
+         , plot.title = element_blank()
+         , strip.text = element_text(size=my_als)
+         , legend.text = element_text(size=10)
+         , legend.title = element_text(size=my_als)
+#         , legend.position = c(0.3, 0.8)
+#         , legend.key.height = unit(1, 'mm')
+        ) 
+
+svg('lineage_dist_PS.svg')
+# finally: the device is closed whether or not print() succeeds
+# invisible(): keeps the value returned by tryCatch() from being printed
+# a second time at top level
+invisible( tryCatch( print(printFile)
+                     , finally = dev.off() ) )
+
+#=!=!=!=!=!=!
+# COMMENT: When you look at all mutations, the lineage differences disappear...
+# The pattern we are interested in is possibly only for dr_mutations
+#=!=!=!=!=!=!
+#===================================================
+
+# COMPARING DISTRIBUTIONS
+head(df$lineage)
+# from here on lineage is compared as a plain character vector
+df$lineage <- as.character(df$lineage)
+
+# ratioDUET values of each lineage
+lin1 <- df$ratioDUET[df$lineage == "lineage1"]
+lin2 <- df$ratioDUET[df$lineage == "lineage2"]
+lin3 <- df$ratioDUET[df$lineage == "lineage3"]
+lin4 <- df$ratioDUET[df$lineage == "lineage4"]
+
+# two-sample ks test for every pair of lineages
+# lineage 1 against the rest
+ks.test(x = lin1, y = lin2)
+ks.test(x = lin1, y = lin3)
+ks.test(x = lin1, y = lin4)
+
+# lineage 2 against the remaining two
+ks.test(x = lin2, y = lin3)
+ks.test(x = lin2, y = lin4)
+
+# lineage 3 against lineage 4
+ks.test(x = lin3, y = lin4)
+
+
+