added function for generating lineage barplots and also test script along wiadding script for processing data and added it to get_plotting_dfs.R

This commit is contained in:
Tanushree Tunstall 2021-09-06 19:50:50 +01:00
parent 605eb54526
commit 869fca7f94
6 changed files with 470 additions and 5 deletions

View file

@ -0,0 +1,172 @@
########################################
# Lineage and within SNP count barplot
########################################
lin_count_bp <- function( lf_data
, x_categ = ""
, y_count = ""
, bar_fill_categ = ""
, display_label_col = ""
, bar_stat_stype = "identity"
, x_lab_angle = 90
, d_lab_size = 5
, d_lab_hjust = 0.5
, d_lab_vjust = 0.5
, d_lab_col = "black"
, my_xats = 20 # x axis text size
, my_yats = 20 # y axis text size
, my_xals = 22 # x axis label size
, my_yals = 22 # y axis label size
, my_lls = 22 # legend label size
, bar_col_labels = c("Mutations", "Total Samples")
, bar_col_values = c("grey50", "gray75")
, bar_leg_name = ""
, leg_location = "top"
, y_log10 = FALSE
, y_scale_percent = FALSE
, y_label = c("Count", "SNP diversity")
) {
g = ggplot(lf_data
, aes( x = factor( eval(parse(text = x_categ)), ordered = T )
, y = eval(parse(text = y_count))
, fill = eval(parse(text = bar_fill_categ)) ) )
OutPlot = g + geom_bar( stat = bar_stat_stype
, position = position_stack(reverse = TRUE)
#, alpha = 1
#, colour = "grey75"
) +
theme(axis.text.x = element_text(size = my_xats
, angle = x_lab_angle)
, axis.text.y = element_text(size = my_yats
, angle = 90
, hjust = 1
, vjust = 0)
, axis.title.x = element_text(size = my_xals
, colour = "black")
, axis.title.y = element_text(size = my_yals
, colour = "black")
, legend.position = leg_location
, legend.text = element_text(size = my_lls)) +
geom_label(aes(label = eval(parse(text = display_label_col)))
, size = d_lab_size
, hjust = d_lab_hjust
, vjust = d_lab_vjust
, colour = d_lab_col
, show.legend = FALSE
#, check_overlap = TRUE
, position = position_stack(reverse = T)) +
scale_fill_manual(values = bar_col_values
, name = bar_leg_name
, labels = bar_col_labels) +
labs(title = ""
, x = ""
, y = y_label
, colour = "black")
if (y_log10){
OutPlot = OutPlot +
scale_y_continuous(trans = "log10"
, labels = trans_format("log10", math_format(10^.x) ) )
}
if (y_scale_percent){
OutPlot = OutPlot +
#scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
scale_y_continuous(labels = scales::percent) +
labs(title = ""
, x = ""
, y = y_label
, colour = "black")
}
return(OutPlot)
}
############################
# Lineage diversity barplot
############################
# lin_diversity_bp <- function( wf_data
# , x_categ = "sel_lineages"
# , y_count = "snp_diversity"
# , bar_stat_stype = "identity"
# , display_label_col = "snp_diversity_f"
# , x_lab_angle = 90
# , d_lab_size = 5
# , d_lab_hjust = 0.5
# , d_lab_vjust = 0.5
# , d_lab_col = "black"
# , my_xats = 20 # x axis text size
# , my_yats = 20 # y axis text size
# , my_xals = 22 # x axis label size
# , my_yals = 22 # y axis label size
# , my_lls = 22 # legend label size
# , bar_leg_name = ""
# , leg_location = "top"
# , y_scale_percent = TRUE
# , y_label = "SNP diversity" )
#
# {
# g = ggplot(wf_data
# , aes( x = factor( eval(parse(text = x_categ)), ordered = T )
# , y = eval(parse(text = y_count)) ) )
#
# OutPlot = g + geom_bar( stat = bar_stat_stype
# , position = position_stack(reverse = TRUE)
# ) +
#
# theme(axis.text.x = element_text(size = my_xats
# , angle = x_lab_angle)
# , axis.text.y = element_text(size = my_yats
# , angle = 90
# , hjust = 1
# , vjust = 0)
# , axis.title.x = element_text(size = my_xals
# , colour = "black")
# , axis.title.y = element_text(size = my_yals
# , colour = "black")
# , legend.position = leg_location
# , legend.text = element_text(size = my_lls)) +
#
# geom_label(aes(label = eval(parse(text = display_label_col)))
# , size = d_lab_size
# , hjust = d_lab_hjust
# , vjust = d_lab_vjust
# , colour = d_lab_col
# , show.legend = FALSE
# #, check_overlap = TRUE
# , position = position_stack(reverse = T))
# # return(OutPlot)
#
# if (y_scale_percent){
#
# OutPlot = OutPlot +
# scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
# labs(title = ""
# , x = ""
# , y = y_label
# , colour = "black")
#
# return(OutPlot)
# }
# return(OutPlot)
# }
# ggp <- ggplot(bar_sel, aes(sel_lineages, snp_diversity)) +
# geom_bar(stat = "identity")
# ggp + scale_y_continuous(labels = scales::percent_format(accuracy = 1)
# #, limits = c(0,1)
# , breaks = seq(0, 30, 5)
# )

View file

@ -0,0 +1,111 @@
setwd("~/git/LSHTM_analysis/scripts/plotting")
#source ('get_plotting_dfs.R')
source("../functions/bp_lineage.R")
#########################################
# Lineage and SNP count: lineage lf data
#########################################
# Relevel factors so that x-axis categ appear as you want
lin_lf_plot = lin_lf
lin_lf_plot
is.factor(lin_lf_plot$sel_lineages_f)
lin_lf_plot$sel_lineages_f = factor(lin_lf_plot$sel_lineages_f, c(""
, "L1"
, "L2"
, "L3"
, "L4"
, "L5"
, "L6"
, "L7"
, "LBOV"
, "L1;L2"
, "L1;L3"
, "L1;L4"
, "L2;L3"
, "L2;L3;L4"
, "L2;L4"
, "L2;L6"
, "L2;LBOV"
, "L3;L4"
, "L4;L6"
, "L4;L7"))
levels(lin_lf_plot$sel_lineages_f)
lin_count_bp(lin_lf_plot
, x_categ = "sel_lineages_f"
, y_count = "p_count"
, bar_fill_categ = "count_categ"
, display_label_col = "p_count"
, bar_stat_stype = "identity"
, x_lab_angle = 90
, my_xats = 20
, bar_col_labels = c("Mutations", "Total Samples")
, bar_col_values = c("grey50", "gray75")
, y_log10 = T
, y_label = "Count"
, y_scale_percent = F)
###############################################
# Lineage SNP diversity count: lineage wf data
###############################################
# Relevel factors so that x-axis categ appear as you want
lin_wf_plot = lin_wf
is.factor(lin_wf_plot$sel_lineages_f)
lin_wf_plot$sel_lineages_f = factor(lin_wf_plot$sel_lineages_f, c(""
, "L1"
, "L2"
, "L3"
, "L4"
, "L5"
, "L6"
, "L7"
, "LBOV"
, "L1;L2"
, "L1;L3"
, "L1;L4"
, "L2;L3"
, "L2;L3;L4"
, "L2;L4"
, "L2;L6"
, "L2;LBOV"
, "L3;L4"
, "L4;L6"
, "L4;L7"))
levels(lin_wf_plot$sel_lineages_f)
#==========
# Plot
#==========
lin_count_bp(lin_wf_plot
, x_categ = "sel_lineages_f"
, y_count = "snp_diversity"
, display_label_col = "snp_diversity_f"
, bar_stat_stype = "identity"
, x_lab_angle = 90
, my_xats = 20
, y_scale_percent = T
, y_label = "SNP diversity"
)
, x_categ = "sel_lineages_f"
, y_count = "p_count"
, bar_fill_categ = "count_categ"
, display_label_col = "p_count"
, bar_stat_stype = "identity"
, x_lab_angle = 90
, my_xats = 15
, bar_col_labels = c("Mutations", "Total Samples")
, bar_col_values = c("grey50", "gray75")
, y_log10 = T
, y_scale_percent = F

View file

@ -437,13 +437,19 @@ if (nrow(corr_ps_df3) == nrow(merged_df3) && nrow(merged_df3_comp) == check1) {
, "\nGot: ", check1)
}
rm(foo)
####################################################################
# Data for DM OM Plots: Long format dfs
####################################################################
source("other_plots_data.R")
####################################################################
# Data for Lineage barplots: WF and LF dfs
####################################################################
source("lineage_bp_data.R")
########################################################################
# End of script
########################################################################

View file

@ -0,0 +1,173 @@
#!/usr/bin/env Rscript
#########################################################
# TASK: Script to format data for lineage barplots:
# WF and LF data with lineage sample, and snp counts
# sourced by get_plotting_dfs.R
#########################################################
# working dir and loading libraries
# getwd()
# setwd("~/git/LSHTM_analysis/scripts/plotting")
# getwd()
# make cmd
# globals
# drug = "streptomycin"
# gene = "gid"
# source("get_plotting_dfs.R")
#=======================================================================
#################################################
# Get data with lineage count, and snp diversity
#################################################
table(merged_df2$lineage)
if (table(merged_df2$lineage == "")[[2]]) {
cat("\nMissing samples with lineage classification:", table(merged_df2$lineage == "")[[2]])
}
##################################
# WF data: lineages with
# snp count
# total_samples
# snp diversity (perc)
##################################
sel_lineages = levels(as.factor(merged_df2$lineage))
lin_wf = data.frame(sel_lineages) #4, 1
total_snps_u = NULL
total_samples = NULL
for (i in sel_lineages){
#print(i)
curr_total = length(unique(merged_df2$id)[merged_df2$lineage==i])
#print(curr_total)
total_samples = c(total_samples, curr_total)
print(total_samples)
foo = merged_df2[merged_df2$lineage==i,]
print(paste0(i, "=======\n"))
print(length(unique(foo$mutationinformation)))
curr_count = length(unique(foo$mutationinformation))
total_snps_u = c(total_snps_u, curr_count)
}
lin_wf
# Add these counts as columns to the df
lin_wf$num_snps_u = total_snps_u
lin_wf$total_samples = total_samples
# Add SNP diversity
lin_wf$snp_diversity = lin_wf$num_snps_u/lin_wf$total_samples
lin_wf
#=====================
# Add some formatting
#=====================
# SNP diversity
lin_wf$snp_diversity_f = round( (lin_wf$snp_diversity * 100), digits = 0)
lin_wf
# Lineage names
lin_wf$sel_lineages_f = gsub("lineage", "L", lin_wf$sel_lineages)
lin_wf
# # Lineage names
# lin_wf = lin_wf %>%
# mutate(ordering_category = case_when(
# sel_lineages_f == "" ~ 0
# , sel_lineages_f == "L1" ~ 1
# , sel_lineages_f == "L2" ~ 2
# , sel_lineages_f == "L3" ~ 3
# , sel_lineages_f == "L4" ~ 4
# , sel_lineages_f == "L5" ~ 5
# , sel_lineages_f == "L6" ~ 6
# , sel_lineages_f == "L7" ~ 7
# , sel_lineages_f == "LBOV" ~ 8
#
# , sel_lineages_f == "L1;L2" ~ 9
# , sel_lineages_f == "L1;L3" ~ 10
# , sel_lineages_f == "L1;L4" ~ 11
#
# , sel_lineages_f == "L2;L3" ~ 12
# , sel_lineages_f == "L2;L3;L4" ~ 13
# , sel_lineages_f == "L2;L4" ~ 14
# , sel_lineages_f == "L2;L6" ~ 15
# , sel_lineages_f == "L2;LBOV" ~ 16
#
# , sel_lineages_f == "L3;L4" ~ 17
#
# , sel_lineages_f == "L4;L6" ~ 18
# , sel_lineages_f == "L4;L7" ~ 19
#
# , FALSE ~ -1)
# )
##################################
# LF data: lineages with
# snp count
# total_samples
# snp diversity (perc)
##################################
names(lin_wf)
tot_cols = ncol(lin_wf)
pivot_cols = c("sel_lineages", "sel_lineages_f", "snp_diversity", "snp_diversity_f")
pivot_cols_n = length(pivot_cols)
expected_rows = nrow(lin_wf) * ( length(lin_wf) - pivot_cols_n )
lin_lf <- gather(lin_wf
, count_categ
, p_count
, num_snps_u:total_samples
, factor_key = TRUE)
lin_lf
# quick checks
if ( nrow(lin_lf) == expected_rows ){
cat("\nPASS: Lineage LF data created"
, "\nnrow: ", nrow(lin_lf)
, "\nncol: ", ncol(lin_lf))
} else {
cat("\nFAIL: numbers mismatch"
, "\nExpected nrow: ", expected_rows)
}
#######################################
# #=====================
# # Add some formatting
# #=====================
# lin_lf$sel_lineages_f = gsub("lineage", "L", lin_lf$sel_lineages)
# lin_lf
# lin_lf = lin_lf %>%
# mutate(ordering_category = case_when(
# sel_lineages_f == "" ~ 0
# , sel_lineages_f == "L1" ~ 1
# , sel_lineages_f == "L2" ~ 2
# , sel_lineages_f == "L3" ~ 3
# , sel_lineages_f == "L4" ~ 4
# , sel_lineages_f == "L5" ~ 5
# , sel_lineages_f == "L6" ~ 6
# , sel_lineages_f == "L7" ~ 7
# , sel_lineages_f == "LBOV" ~ 8
#
# , sel_lineages_f == "L1;L2" ~ 9
# , sel_lineages_f == "L1;L3" ~ 10
# , sel_lineages_f == "L1;L4" ~ 11
#
# , sel_lineages_f == "L2;L3" ~ 12
# , sel_lineages_f == "L2;L3;L4" ~ 13
# , sel_lineages_f == "L2;L4" ~ 14
# , sel_lineages_f == "L2;L6" ~ 15
# , sel_lineages_f == "L2;LBOV" ~ 16
#
# , sel_lineages_f == "L3;L4" ~ 17
#
# , sel_lineages_f == "L4;L6" ~ 18
# , sel_lineages_f == "L4;L7" ~ 19
#
# , FALSE ~ -1)
# )

View file

@ -1,7 +1,8 @@
#!/usr/bin/env Rscript
#########################################################
# TASK: producing boxplots for dr and other muts
# TASK: Script to format data for dm om plots:
# generating LF data
# sourced by get_plotting_dfs.R
#########################################################
# working dir and loading libraries
# getwd()