LSHTM_analysis/scripts/plotting/lineage_dist_combined_PS.R

#!/usr/bin/env Rscript
#########################################################
# TASK: Lineage dist plots: ggridges

# Output: 2 SVGs for PS stability

# 1) all muts
# 2) dr_muts

##########################################################
# 				Installing and loading required packages
##########################################################
getwd()
setwd("~/git/LSHTM_analysis/scripts/plotting/")
getwd()

source("Header_TT.R")
library(ggridges)
source("combining_dfs_plotting.R")
# PS combined:
# 1) merged_df2
# 2) merged_df2_comp
# 3) merged_df3
# 4) merged_df3_comp

# LIG combined:
# 5) merged_df2_lig
# 6) merged_df2_comp_lig
# 7) merged_df3_lig
# 8) merged_df3_comp_lig

# 9) my_df_u
# 10) my_df_u_lig

cat("Directories imported:"
    , "\n===================="
    , "\ndatadir:", datadir
    , "\nindir:", indir
    , "\noutdir:", outdir
    , "\nplotdir:", plotdir)

cat("Variables imported:"
    , "\n====================="
    , "\ndrug:", drug
    , "\ngene:", gene
    , "\ngene_match:", gene_match
    , "\nAngstrom symbol:", angstroms_symbol
    , "\nNo. of duplicated muts:", dup_muts_nu
    , "\nNA count for ORs:", na_count
    , "\nNA count in df2:", na_count_df2
    , "\nNA count in df3:", na_count_df3
    , "\ndr_muts_col:", dr_muts_col
    , "\nother_muts_col:", other_muts_col
    , "\ndrtype_col:", resistance_col)

#=======
# output
#=======
lineage_dist_combined = "lineage_dist_combined_PS.svg"
plot_lineage_dist_combined  =  paste0(plotdir,"/", lineage_dist_combined)
#========================================================================

###########################
# Data for plots
# you need merged_df2 or merged_df2_comp
# since this is one-many relationship
# i.e the same SNP can belong to multiple lineages
# using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available, hence use df with NA
###########################
# REASSIGNMENT
my_df  = merged_df2

# delete variables not required
rm(my_df_u, merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)

# quick checks
colnames(my_df)
str(my_df)

# Ensure correct data type in columns to plot: need to be factor
is.factor(my_df$lineage)
my_df$lineage = as.factor(my_df$lineage)
is.factor(my_df$lineage)

table(my_df$mutation_info)

# subset df with dr muts only
my_df_dr = subset(my_df, mutation_info == "dr_mutations_pyrazinamide")
table(my_df_dr$mutation_info)

########################################################################
#               end of data extraction and cleaning for plots          #
########################################################################

#==========================
# Plot 1: ALL Muts
# x = mcsm_values, y = dist
# fill = stability
#============================

my_plot_name = 'lineage_dist_PS.svg'

plot_lineage_duet  =  paste0(plotdir,"/", my_plot_name)

#===================
# Data for plots
#===================
table(my_df$lineage); str(my_df$lineage)

# subset only lineages1-4
sel_lineages = c("lineage1"
                 , "lineage2"
                 , "lineage3"
                 , "lineage4"
                 #, "lineage5"
                 #, "lineage6"
                 #, "lineage7"
                 )

# uncomment as necessary
df_lin = subset(my_df, subset = lineage %in% sel_lineages )
table(df_lin$lineage)

# refactor
df_lin$lineage = factor(df_lin$lineage)

sum(table(df_lin$lineage)) #{RESULT: Total number of samples for lineage}

table(df_lin$lineage)#{RESULT: No of samples within lineage}

length(unique(df_lin$mutationinformation))#{Result: No. of unique mutations the 4 lineages contribute to}

length(df_lin$mutationinformation)

u2 = unique(my_df$mutationinformation)
u = unique(df_lin$mutationinformation)
check = u2[!u2%in%u]; print(check) #{Muts not present within selected lineages}

#%%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
df <- df_lin
#%%%%%%%%%%%%%%%%%%%%%%%%%

rm(df_lin)

#******************
# generate distribution plot of lineages
#******************
# 2 : ggridges (good!)
my_ats = 15 # axis text size
my_als = 20 # axis label size

my_labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4'
              #, 'Lineage 5', 'Lineage 6', 'Lineage 7'
              )
names(my_labels) = c('lineage1', 'lineage2', 'lineage3', 'lineage4'
                    # , 'lineage5', 'lineage6', 'lineage7'
                     )
# check plot name
plot_lineage_duet

# output svg
#svg(plot_lineage_duet)
p1 = ggplot(df, aes(x = duet_scaled
                            , y = duet_outcome))+

  #printFile=geom_density_ridges_gradient(
  geom_density_ridges_gradient(aes(fill = ..x..)
                               #, jittered_points = TRUE
                                , scale = 3
                                , size = 0.3 ) +
  facet_wrap( ~lineage
              , scales = "free"
             #, switch = 'x'
              , labeller = labeller(lineage = my_labels) ) +
  coord_cartesian( xlim = c(-1, 1)) +
  scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4")
                        , name = "DUET" ) +
  theme(axis.text.x = element_text(size = my_ats
                                     , angle = 90
                                     , hjust = 1
                                     , vjust = 0.4)

         , axis.text.y = element_blank()
         , axis.title.x = element_blank()
         , axis.title.y = element_blank()
         , axis.ticks.y = element_blank()
         , plot.title = element_blank()
         , strip.text = element_text(size = my_als)
         , legend.text = element_text(size = my_als-5)
         , legend.title = element_text(size = my_als)
)

print(p1)
#dev.off()

#######################################################################
# lineage distribution plot for dr_muts
#######################################################################

#==========================
# Plot 2: dr muts ONLY
# x = mcsm_values, y = dist
# fill = stability
#============================

my_plot_name_dr = 'lineage_dist_dr_muts_PS.svg'

plot_lineage_dr_duet  =  paste0(plotdir,"/", my_plot_name_dr)

#===================
# Data for plots
#===================
table(my_df_dr$lineage); str(my_df_dr$lineage)

# uncomment as necessary
df_lin_dr = subset(my_df_dr, subset = lineage %in% sel_lineages)
table(df_lin_dr$lineage)

# refactor
df_lin_dr$lineage = factor(df_lin_dr$lineage)

sum(table(df_lin_dr$lineage)) #{RESULT: Total number of samples for lineage}

table(df_lin_dr$lineage)#{RESULT: No of samples within lineage}

length(unique(df_lin_dr$mutationinformation))#{Result: No. of unique mutations the 4 lineages contribute to}

length(df_lin_dr$mutationinformation)

u2 = unique(my_df_dr$mutationinformation)
u = unique(df_lin_dr$mutationinformation)
check = u2[!u2%in%u]; print(check) #{Muts not present within selected lineages}

#%%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
df_dr <- df_lin_dr
#%%%%%%%%%%%%%%%%%%%%%%%%%

rm(df_lin_dr)

#******************
# generate distribution plot of lineages
#******************
# 2 : ggridges (good!)
my_ats = 15 # axis text size
my_als = 20 # axis label size


# check plot name
plot_lineage_dr_duet

# output svg
#svg(plot_lineage_dr_duet)
p2 = ggplot(df_dr, aes(x = duet_scaled
                       , y = duet_outcome))+

  geom_density_ridges_gradient(aes(fill = ..x..)
                               #, jittered_points = TRUE
                               , scale = 3
                               , size = 0.3) +
  #geom_point(aes(size = or_mychisq))+
  facet_wrap( ~lineage
              , scales = "free"
              #, switch = 'x'
              , labeller = labeller(lineage = my_labels) ) +
  coord_cartesian( xlim = c(-1, 1)
                   #, ylim = c(0, 6)
                   #, clip = "off"
  ) +
  scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4")
                       , name = "DUET" ) +
  theme(axis.text.x = element_text(size = my_ats
                                   , angle = 90
                                   , hjust = 1
                                   , vjust = 0.4)
        , axis.text.y = element_blank()
        , axis.title.x = element_blank()
        , axis.title.y = element_blank()
        , axis.ticks.y = element_blank()
        , plot.title = element_blank()
        , strip.text = element_text(size = my_als)
        , legend.text = element_text(size = 10)
        , legend.title = element_text(size = my_als)
        #, legend.position = "none"
  )

print(p2)
#dev.off()
########################################################################
#==============
# combine plot
#===============

svg(plot_lineage_dist_combined, width = 12, height = 6)

printFile = cowplot::plot_grid(p1, p2
                               , label_size = my_als+10)

print(printFile)
dev.off()