graphs for PS lineage dist for all and dr muts

2020-01-22 10:12:09 +00:00 · 2020-01-22 10:12:09 +00:00 · 4de4549430
commit 4de4549430
parent 3c20be5615
4 changed files with 93 additions and 567 deletions
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R
@ -1,5 +1,5 @@
 getwd()
-setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") # thinkpad
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
 getwd()

 ########################################################################
@ -24,11 +24,11 @@ source("../combining_two_df.R")
 #==========================
 # This will return:

-# df with NA:
+# df with NA for pyrazinamide:
 # merged_df2
 # merged_df3

-# df without NA:
+# df without NA for pyrazinamide:
 # merged_df2_comp
 # merged_df3_comp
 #===========================
@ -38,14 +38,17 @@ source("../combining_two_df.R")
 # you need merged_df2 or merged_df2_comp
 # since this is one-many relationship 
 # i.e the same SNP can belong to multiple lineages
+# using the _comp dataset means we lose some muts;
+# at this level we should use as much info as
+# available, hence use the df with NA
 ###########################

 # uncomment as necessary
-#<<<<<<<<<<<<<<<<<<<<<<<<<
+#!!!!!!!!!!!!!!!!!!!!!!!
 # REASSIGNMENT
 my_df  = merged_df2
 #my_df  = merged_df2_comp
-#<<<<<<<<<<<<<<<<<<<<<<<<<
+#!!!!!!!!!!!!!!!!!!!!!!!

 # delete variables not required
 rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
@ -59,12 +62,39 @@ is.factor(my_df$lineage)
 my_df$lineage = as.factor(my_df$lineage)
 is.factor(my_df$lineage)

-table(my_df$mutation_info)
+table(my_df$mutation_info); str(my_df$mutation_info)
+
+# subset df with dr muts only
+my_df_dr = subset(my_df, mutation_info == "dr_mutations_pyrazinamide") 

 ########################################################################
 #               end of data extraction and cleaning for plots          #
 ########################################################################

+#==========================
+# Data for plot: assign as 
+# necessary
+#===========================
+
+# uncomment as necessary
+#!!!!!!!!!!!!!!!!!!!!!!!
+# REASSIGNMENT
+
+#==================
+# data for ALL muts
+#==================
+plot_df = my_df  
+my_plot_name = 'lineage_dist_PS.svg'
+#my_plot_name = 'lineage_dist_PS_comp.svg'
+
+#=======================
+# data for dr_muts ONLY
+#=======================
+#plot_df = my_df_dr 
+#my_plot_name = 'lineage_dist_dr_PS.svg'
+#my_plot_name = 'lineage_dist_dr_PS_comp.svg'
+#!!!!!!!!!!!!!!!!!!!!!!!
+
 #==========================
 # Plot: Lineage Distribution
 # x = mcsm_values, y = dist
@ -74,6 +104,7 @@ table(my_df$mutation_info)
 #===================
 # Data for plots
 #===================
+table(plot_df$lineage); str(plot_df$lineage)

 # subset only lineages1-4
 sel_lineages = c("lineage1"
@ -82,34 +113,29 @@ sel_lineages = c("lineage1"
                 , "lineage4")

 # uncomment as necessary
-df_lin = subset(my_df, subset = lineage %in% sel_lineages )
+df_lin = subset(plot_df, subset = lineage %in% sel_lineages )

 # refactor
 df_lin$lineage = factor(df_lin$lineage)

 table(df_lin$lineage) #{RESULT: No of samples within lineage}
 #lineage1 lineage2 lineage3 lineage4 
-#104     1293      264     1311 
-
-# when merged_df2_comp is used
-#lineage1 lineage2 lineage3 lineage4 
-#99     1275      263     1255

 length(unique(df_lin$Mutationinformation))
 #{Result: No. of unique mutations the 4 lineages contribute to}

 # sanity checks
 r1 = 2:5 # when merged_df2 used: because there is missing lineages 
-if(sum(table(my_df$lineage)[r1]) == nrow(df_lin)) {
+if(sum(table(plot_df$lineage)[r1]) == nrow(df_lin)) {
  print ("sanity check passed: numbers match")
 } else{
  print("Error!: check your numbers")
 } 

-#<<<<<<<<<<<<<<<<<<<<<<<<<
+#!!!!!!!!!!!!!!!!!!!!!!!!! 
 # REASSIGNMENT
 df <- df_lin
-#<<<<<<<<<<<<<<<<<<<<<<<<<
+#!!!!!!!!!!!!!!!!!!!!!!!!!

 rm(df_lin)

@ -117,8 +143,8 @@ rm(df_lin)
 # generate distribution plot of lineages
 #******************
 # basic: could improve this!
-library(plotly)
-library(ggridges)
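+#library(plotly)   # NOTE(review): ggplotly() is still called below; confirm plotly is loaded elsewhere
+#library(ggridges) # NOTE(review): geom_density_ridges_gradient() is still used below; confirm ggridges is loaded elsewhere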

 g <- ggplot(df, aes(x = ratioDUET)) + 
  geom_density(aes(fill = DUET_outcome)
@ -129,64 +155,68 @@ g <- ggplot(df, aes(x = ratioDUET)) +
 ggplotly(g)

 # 2 : ggridges (good!)
-
 my_ats = 15 # axis text size
 my_als = 20 # axis label size

-fooNames=c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
-names(fooNames)=c('lineage1', 'lineage2', 'lineage3', 'lineage4')
+my_labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
+names(my_labels) = c('lineage1', 'lineage2', 'lineage3', 'lineage4')

 # set output dir for plots
 getwd()
 setwd("~/git/Data/pyrazinamide/output/plots")
 getwd()

-svg('lineage_dist_PS.svg')
+# check plot name
+my_plot_name

-printFile = ggplot( df, aes(x = ratioDUET
-                            , y = DUET_outcome) )+
+# output svg
+svg(my_plot_name)
+printFile = ggplot(df, aes(x = ratioDUET
+                            , y = DUET_outcome))+
  
  #printFile=geom_density_ridges_gradient(
-  geom_density_ridges_gradient( aes(fill = ..x..)
+  geom_density_ridges_gradient(aes(fill = ..x..)
                                , scale = 3
                                , size = 0.3 ) +
  facet_wrap( ~lineage
              , scales = "free"
 #             , switch = 'x'
-              , labeller = labeller(lineage = fooNames) ) +
+              , labeller = labeller(lineage = my_labels) ) +
  coord_cartesian( xlim = c(-1, 1)
-#                  , ylim = c(0, 6)
-#                  , clip = "off" 
-                ) +
-  scale_fill_gradientn( colours = c("#f8766d", "white", "#00bfc4")
+#                   , ylim = c(0, 6)
+#                   , clip = "off" 
+) +
+  scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4")
                        , name = "DUET" ) + 
-  theme( axis.text.x = element_text( size = my_ats
+  theme(axis.text.x = element_text(size = my_ats
                                     , angle = 90
                                     , hjust = 1
                                     , vjust = 0.4)
-#         , axis.text.y = element_text( size = my_ats
-#                                       , angle = 0
-#                                       , hjust = 1
-#                                       , vjust = 0)
+#                  , axis.text.y = element_text(size = my_ats
+#                                                , angle = 0
+#                                                , hjust = 1
+#                                                , vjust = 0)
         , axis.text.y = element_blank()
         , axis.title.x = element_blank()
         , axis.title.y = element_blank()
         , axis.ticks.y = element_blank()
         , plot.title = element_blank()
-         , strip.text = element_text(size=my_als)
-         , legend.text = element_text(size=10)
-         , legend.title = element_text(size=my_als)
-#         , legend.position = c(0.3, 0.8)
-#         , legend.key.height = unit(1, 'mm')
-        ) 
+         , strip.text = element_text(size = my_als)
+         , legend.text = element_text(size = 10)
+         , legend.title = element_text(size = my_als)
+#                  , legend.position = c(0.3, 0.8)
+#                  , legend.key.height = unit(1, 'mm')
+  ) 

 print(printFile)
 dev.off()

-#=!=!=!=!=!=!
-# COMMENT: When you look at all mutations, the lineage differences disappear...
+#=!=!=!=!=!=!=!
+# COMMENT: Not much difference in the distributions
+# when using merged_df2 or merged_df2_comp.
+# Also, the lineage differences disappear when looking at all muts.
 # The pattern we are interested in is possibly only for dr_mutations
-#=!=!=!=!=!=!
+#=!=!=!=!=!=!=!
 #===================================================

 # COMPARING DISTRIBUTIONS