added function for generating lineage barplots and also test script along wiadding script for processing data and added it to get_plotting_dfs.R
This commit is contained in:
parent
343f867eed
commit
7ba6868b60
6 changed files with 470 additions and 5 deletions
172
scripts/functions/bp_lineage.R
Normal file
172
scripts/functions/bp_lineage.R
Normal file
|
@ -0,0 +1,172 @@
|
|||
########################################
|
||||
# Lineage and within SNP count barplot
|
||||
########################################
|
||||
|
||||
lin_count_bp <- function( lf_data
|
||||
, x_categ = ""
|
||||
, y_count = ""
|
||||
, bar_fill_categ = ""
|
||||
, display_label_col = ""
|
||||
, bar_stat_stype = "identity"
|
||||
, x_lab_angle = 90
|
||||
, d_lab_size = 5
|
||||
, d_lab_hjust = 0.5
|
||||
, d_lab_vjust = 0.5
|
||||
, d_lab_col = "black"
|
||||
, my_xats = 20 # x axis text size
|
||||
, my_yats = 20 # y axis text size
|
||||
, my_xals = 22 # x axis label size
|
||||
, my_yals = 22 # y axis label size
|
||||
, my_lls = 22 # legend label size
|
||||
, bar_col_labels = c("Mutations", "Total Samples")
|
||||
, bar_col_values = c("grey50", "gray75")
|
||||
, bar_leg_name = ""
|
||||
, leg_location = "top"
|
||||
, y_log10 = FALSE
|
||||
, y_scale_percent = FALSE
|
||||
, y_label = c("Count", "SNP diversity")
|
||||
) {
|
||||
g = ggplot(lf_data
|
||||
, aes( x = factor( eval(parse(text = x_categ)), ordered = T )
|
||||
, y = eval(parse(text = y_count))
|
||||
, fill = eval(parse(text = bar_fill_categ)) ) )
|
||||
|
||||
OutPlot = g + geom_bar( stat = bar_stat_stype
|
||||
, position = position_stack(reverse = TRUE)
|
||||
#, alpha = 1
|
||||
#, colour = "grey75"
|
||||
) +
|
||||
theme(axis.text.x = element_text(size = my_xats
|
||||
, angle = x_lab_angle)
|
||||
, axis.text.y = element_text(size = my_yats
|
||||
, angle = 90
|
||||
, hjust = 1
|
||||
, vjust = 0)
|
||||
, axis.title.x = element_text(size = my_xals
|
||||
, colour = "black")
|
||||
, axis.title.y = element_text(size = my_yals
|
||||
, colour = "black")
|
||||
, legend.position = leg_location
|
||||
, legend.text = element_text(size = my_lls)) +
|
||||
|
||||
geom_label(aes(label = eval(parse(text = display_label_col)))
|
||||
, size = d_lab_size
|
||||
, hjust = d_lab_hjust
|
||||
, vjust = d_lab_vjust
|
||||
, colour = d_lab_col
|
||||
, show.legend = FALSE
|
||||
#, check_overlap = TRUE
|
||||
, position = position_stack(reverse = T)) +
|
||||
|
||||
scale_fill_manual(values = bar_col_values
|
||||
, name = bar_leg_name
|
||||
, labels = bar_col_labels) +
|
||||
labs(title = ""
|
||||
, x = ""
|
||||
, y = y_label
|
||||
, colour = "black")
|
||||
|
||||
|
||||
if (y_log10){
|
||||
|
||||
OutPlot = OutPlot +
|
||||
scale_y_continuous(trans = "log10"
|
||||
, labels = trans_format("log10", math_format(10^.x) ) )
|
||||
|
||||
}
|
||||
|
||||
if (y_scale_percent){
|
||||
|
||||
OutPlot = OutPlot +
|
||||
#scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
|
||||
scale_y_continuous(labels = scales::percent) +
|
||||
|
||||
labs(title = ""
|
||||
, x = ""
|
||||
, y = y_label
|
||||
, colour = "black")
|
||||
}
|
||||
|
||||
return(OutPlot)
|
||||
}
|
||||
|
||||
############################
|
||||
# Lineage diversity barplot
|
||||
############################
|
||||
# lin_diversity_bp <- function( wf_data
|
||||
# , x_categ = "sel_lineages"
|
||||
# , y_count = "snp_diversity"
|
||||
# , bar_stat_stype = "identity"
|
||||
# , display_label_col = "snp_diversity_f"
|
||||
# , x_lab_angle = 90
|
||||
# , d_lab_size = 5
|
||||
# , d_lab_hjust = 0.5
|
||||
# , d_lab_vjust = 0.5
|
||||
# , d_lab_col = "black"
|
||||
# , my_xats = 20 # x axis text size
|
||||
# , my_yats = 20 # y axis text size
|
||||
# , my_xals = 22 # x axis label size
|
||||
# , my_yals = 22 # y axis label size
|
||||
# , my_lls = 22 # legend label size
|
||||
# , bar_leg_name = ""
|
||||
# , leg_location = "top"
|
||||
# , y_scale_percent = TRUE
|
||||
# , y_label = "SNP diversity" )
|
||||
#
|
||||
# {
|
||||
# g = ggplot(wf_data
|
||||
# , aes( x = factor( eval(parse(text = x_categ)), ordered = T )
|
||||
# , y = eval(parse(text = y_count)) ) )
|
||||
#
|
||||
# OutPlot = g + geom_bar( stat = bar_stat_stype
|
||||
# , position = position_stack(reverse = TRUE)
|
||||
# ) +
|
||||
#
|
||||
# theme(axis.text.x = element_text(size = my_xats
|
||||
# , angle = x_lab_angle)
|
||||
# , axis.text.y = element_text(size = my_yats
|
||||
# , angle = 90
|
||||
# , hjust = 1
|
||||
# , vjust = 0)
|
||||
# , axis.title.x = element_text(size = my_xals
|
||||
# , colour = "black")
|
||||
# , axis.title.y = element_text(size = my_yals
|
||||
# , colour = "black")
|
||||
# , legend.position = leg_location
|
||||
# , legend.text = element_text(size = my_lls)) +
|
||||
#
|
||||
# geom_label(aes(label = eval(parse(text = display_label_col)))
|
||||
# , size = d_lab_size
|
||||
# , hjust = d_lab_hjust
|
||||
# , vjust = d_lab_vjust
|
||||
# , colour = d_lab_col
|
||||
# , show.legend = FALSE
|
||||
# #, check_overlap = TRUE
|
||||
# , position = position_stack(reverse = T))
|
||||
# # return(OutPlot)
|
||||
#
|
||||
# if (y_scale_percent){
|
||||
#
|
||||
# OutPlot = OutPlot +
|
||||
# scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
|
||||
# labs(title = ""
|
||||
# , x = ""
|
||||
# , y = y_label
|
||||
# , colour = "black")
|
||||
#
|
||||
# return(OutPlot)
|
||||
# }
|
||||
|
||||
# return(OutPlot)
|
||||
|
||||
# }
|
||||
|
||||
|
||||
|
||||
|
||||
# ggp <- ggplot(bar_sel, aes(sel_lineages, snp_diversity)) +
|
||||
# geom_bar(stat = "identity")
|
||||
# ggp + scale_y_continuous(labels = scales::percent_format(accuracy = 1)
|
||||
# #, limits = c(0,1)
|
||||
# , breaks = seq(0, 30, 5)
|
||||
# )
|
111
scripts/functions/test_bp_lineage.R
Normal file
111
scripts/functions/test_bp_lineage.R
Normal file
|
@ -0,0 +1,111 @@
|
|||
setwd("~/git/LSHTM_analysis/scripts/plotting")
|
||||
|
||||
#source ('get_plotting_dfs.R')
|
||||
source("../functions/bp_lineage.R")
|
||||
|
||||
#########################################
|
||||
# Lineage and SNP count: lineage lf data
|
||||
#########################################
|
||||
# Relevel factors so that x-axis categ appear as you want
|
||||
lin_lf_plot = lin_lf
|
||||
lin_lf_plot
|
||||
is.factor(lin_lf_plot$sel_lineages_f)
|
||||
|
||||
lin_lf_plot$sel_lineages_f = factor(lin_lf_plot$sel_lineages_f, c(""
|
||||
, "L1"
|
||||
, "L2"
|
||||
, "L3"
|
||||
, "L4"
|
||||
, "L5"
|
||||
, "L6"
|
||||
, "L7"
|
||||
, "LBOV"
|
||||
, "L1;L2"
|
||||
, "L1;L3"
|
||||
, "L1;L4"
|
||||
, "L2;L3"
|
||||
, "L2;L3;L4"
|
||||
, "L2;L4"
|
||||
, "L2;L6"
|
||||
, "L2;LBOV"
|
||||
, "L3;L4"
|
||||
, "L4;L6"
|
||||
, "L4;L7"))
|
||||
|
||||
levels(lin_lf_plot$sel_lineages_f)
|
||||
|
||||
lin_count_bp(lin_lf_plot
|
||||
, x_categ = "sel_lineages_f"
|
||||
, y_count = "p_count"
|
||||
, bar_fill_categ = "count_categ"
|
||||
, display_label_col = "p_count"
|
||||
, bar_stat_stype = "identity"
|
||||
, x_lab_angle = 90
|
||||
, my_xats = 20
|
||||
, bar_col_labels = c("Mutations", "Total Samples")
|
||||
, bar_col_values = c("grey50", "gray75")
|
||||
, y_log10 = T
|
||||
, y_label = "Count"
|
||||
, y_scale_percent = F)
|
||||
|
||||
###############################################
|
||||
# Lineage SNP diversity count: lineage wf data
|
||||
###############################################
|
||||
# Relevel factors so that x-axis categ appear as you want
|
||||
lin_wf_plot = lin_wf
|
||||
is.factor(lin_wf_plot$sel_lineages_f)
|
||||
|
||||
lin_wf_plot$sel_lineages_f = factor(lin_wf_plot$sel_lineages_f, c(""
|
||||
, "L1"
|
||||
, "L2"
|
||||
, "L3"
|
||||
, "L4"
|
||||
, "L5"
|
||||
, "L6"
|
||||
, "L7"
|
||||
, "LBOV"
|
||||
, "L1;L2"
|
||||
, "L1;L3"
|
||||
, "L1;L4"
|
||||
, "L2;L3"
|
||||
, "L2;L3;L4"
|
||||
, "L2;L4"
|
||||
, "L2;L6"
|
||||
, "L2;LBOV"
|
||||
, "L3;L4"
|
||||
, "L4;L6"
|
||||
, "L4;L7"))
|
||||
|
||||
levels(lin_wf_plot$sel_lineages_f)
|
||||
|
||||
#==========
|
||||
# Plot
|
||||
#==========
|
||||
lin_count_bp(lin_wf_plot
|
||||
, x_categ = "sel_lineages_f"
|
||||
, y_count = "snp_diversity"
|
||||
, display_label_col = "snp_diversity_f"
|
||||
|
||||
, bar_stat_stype = "identity"
|
||||
, x_lab_angle = 90
|
||||
, my_xats = 20
|
||||
, y_scale_percent = T
|
||||
, y_label = "SNP diversity"
|
||||
|
||||
|
||||
)
|
||||
|
||||
|
||||
|
||||
|
||||
, x_categ = "sel_lineages_f"
|
||||
, y_count = "p_count"
|
||||
, bar_fill_categ = "count_categ"
|
||||
, display_label_col = "p_count"
|
||||
, bar_stat_stype = "identity"
|
||||
, x_lab_angle = 90
|
||||
, my_xats = 15
|
||||
, bar_col_labels = c("Mutations", "Total Samples")
|
||||
, bar_col_values = c("grey50", "gray75")
|
||||
, y_log10 = T
|
||||
, y_scale_percent = F
|
|
@ -437,13 +437,19 @@ if (nrow(corr_ps_df3) == nrow(merged_df3) && nrow(merged_df3_comp) == check1) {
|
|||
, "\nGot: ", check1)
|
||||
}
|
||||
|
||||
|
||||
rm(foo)
|
||||
####################################################################
|
||||
# Data for DM OM Plots: Long format dfs
|
||||
####################################################################
|
||||
|
||||
source("other_plots_data.R")
|
||||
|
||||
####################################################################
|
||||
# Data for Lineage barplots: WF and LF dfs
|
||||
####################################################################
|
||||
|
||||
source("lineage_bp_data.R")
|
||||
|
||||
########################################################################
|
||||
# End of script
|
||||
########################################################################
|
||||
|
|
173
scripts/plotting/lineage_bp_data.R
Normal file
173
scripts/plotting/lineage_bp_data.R
Normal file
|
@ -0,0 +1,173 @@
|
|||
#!/usr/bin/env Rscript
|
||||
#########################################################
|
||||
# TASK: Script to format data for lineage barplots:
|
||||
# WF and LF data with lineage sample, and snp counts
|
||||
# sourced by get_plotting_dfs.R
|
||||
#########################################################
|
||||
# working dir and loading libraries
|
||||
# getwd()
|
||||
# setwd("~/git/LSHTM_analysis/scripts/plotting")
|
||||
# getwd()
|
||||
|
||||
# make cmd
|
||||
# globals
|
||||
# drug = "streptomycin"
|
||||
# gene = "gid"
|
||||
|
||||
# source("get_plotting_dfs.R")
|
||||
#=======================================================================
|
||||
#################################################
|
||||
# Get data with lineage count, and snp diversity
|
||||
#################################################
|
||||
table(merged_df2$lineage)
|
||||
|
||||
if (table(merged_df2$lineage == "")[[2]]) {
|
||||
|
||||
cat("\nMissing samples with lineage classification:", table(merged_df2$lineage == "")[[2]])
|
||||
|
||||
}
|
||||
|
||||
##################################
|
||||
# WF data: lineages with
|
||||
# snp count
|
||||
# total_samples
|
||||
# snp diversity (perc)
|
||||
##################################
|
||||
sel_lineages = levels(as.factor(merged_df2$lineage))
|
||||
|
||||
lin_wf = data.frame(sel_lineages) #4, 1
|
||||
total_snps_u = NULL
|
||||
total_samples = NULL
|
||||
|
||||
for (i in sel_lineages){
|
||||
#print(i)
|
||||
curr_total = length(unique(merged_df2$id)[merged_df2$lineage==i])
|
||||
#print(curr_total)
|
||||
total_samples = c(total_samples, curr_total)
|
||||
print(total_samples)
|
||||
|
||||
foo = merged_df2[merged_df2$lineage==i,]
|
||||
print(paste0(i, "=======\n"))
|
||||
print(length(unique(foo$mutationinformation)))
|
||||
curr_count = length(unique(foo$mutationinformation))
|
||||
|
||||
total_snps_u = c(total_snps_u, curr_count)
|
||||
}
|
||||
lin_wf
|
||||
|
||||
# Add these counts as columns to the df
|
||||
lin_wf$num_snps_u = total_snps_u
|
||||
lin_wf$total_samples = total_samples
|
||||
|
||||
# Add SNP diversity
|
||||
lin_wf$snp_diversity = lin_wf$num_snps_u/lin_wf$total_samples
|
||||
lin_wf
|
||||
|
||||
#=====================
|
||||
# Add some formatting
|
||||
#=====================
|
||||
# SNP diversity
|
||||
lin_wf$snp_diversity_f = round( (lin_wf$snp_diversity * 100), digits = 0)
|
||||
lin_wf
|
||||
|
||||
# Lineage names
|
||||
lin_wf$sel_lineages_f = gsub("lineage", "L", lin_wf$sel_lineages)
|
||||
lin_wf
|
||||
|
||||
# # Lineage names
|
||||
# lin_wf = lin_wf %>%
|
||||
# mutate(ordering_category = case_when(
|
||||
# sel_lineages_f == "" ~ 0
|
||||
# , sel_lineages_f == "L1" ~ 1
|
||||
# , sel_lineages_f == "L2" ~ 2
|
||||
# , sel_lineages_f == "L3" ~ 3
|
||||
# , sel_lineages_f == "L4" ~ 4
|
||||
# , sel_lineages_f == "L5" ~ 5
|
||||
# , sel_lineages_f == "L6" ~ 6
|
||||
# , sel_lineages_f == "L7" ~ 7
|
||||
# , sel_lineages_f == "LBOV" ~ 8
|
||||
#
|
||||
# , sel_lineages_f == "L1;L2" ~ 9
|
||||
# , sel_lineages_f == "L1;L3" ~ 10
|
||||
# , sel_lineages_f == "L1;L4" ~ 11
|
||||
#
|
||||
# , sel_lineages_f == "L2;L3" ~ 12
|
||||
# , sel_lineages_f == "L2;L3;L4" ~ 13
|
||||
# , sel_lineages_f == "L2;L4" ~ 14
|
||||
# , sel_lineages_f == "L2;L6" ~ 15
|
||||
# , sel_lineages_f == "L2;LBOV" ~ 16
|
||||
#
|
||||
# , sel_lineages_f == "L3;L4" ~ 17
|
||||
#
|
||||
# , sel_lineages_f == "L4;L6" ~ 18
|
||||
# , sel_lineages_f == "L4;L7" ~ 19
|
||||
#
|
||||
# , FALSE ~ -1)
|
||||
# )
|
||||
|
||||
##################################
|
||||
# LF data: lineages with
|
||||
# snp count
|
||||
# total_samples
|
||||
# snp diversity (perc)
|
||||
##################################
|
||||
names(lin_wf)
|
||||
tot_cols = ncol(lin_wf)
|
||||
pivot_cols = c("sel_lineages", "sel_lineages_f", "snp_diversity", "snp_diversity_f")
|
||||
pivot_cols_n = length(pivot_cols)
|
||||
|
||||
expected_rows = nrow(lin_wf) * ( length(lin_wf) - pivot_cols_n )
|
||||
|
||||
lin_lf <- gather(lin_wf
|
||||
, count_categ
|
||||
, p_count
|
||||
, num_snps_u:total_samples
|
||||
, factor_key = TRUE)
|
||||
lin_lf
|
||||
|
||||
# quick checks
|
||||
if ( nrow(lin_lf) == expected_rows ){
|
||||
cat("\nPASS: Lineage LF data created"
|
||||
, "\nnrow: ", nrow(lin_lf)
|
||||
, "\nncol: ", ncol(lin_lf))
|
||||
} else {
|
||||
cat("\nFAIL: numbers mismatch"
|
||||
, "\nExpected nrow: ", expected_rows)
|
||||
}
|
||||
#######################################
|
||||
# #=====================
|
||||
# # Add some formatting
|
||||
# #=====================
|
||||
# lin_lf$sel_lineages_f = gsub("lineage", "L", lin_lf$sel_lineages)
|
||||
# lin_lf
|
||||
|
||||
|
||||
# lin_lf = lin_lf %>%
|
||||
# mutate(ordering_category = case_when(
|
||||
# sel_lineages_f == "" ~ 0
|
||||
# , sel_lineages_f == "L1" ~ 1
|
||||
# , sel_lineages_f == "L2" ~ 2
|
||||
# , sel_lineages_f == "L3" ~ 3
|
||||
# , sel_lineages_f == "L4" ~ 4
|
||||
# , sel_lineages_f == "L5" ~ 5
|
||||
# , sel_lineages_f == "L6" ~ 6
|
||||
# , sel_lineages_f == "L7" ~ 7
|
||||
# , sel_lineages_f == "LBOV" ~ 8
|
||||
#
|
||||
# , sel_lineages_f == "L1;L2" ~ 9
|
||||
# , sel_lineages_f == "L1;L3" ~ 10
|
||||
# , sel_lineages_f == "L1;L4" ~ 11
|
||||
#
|
||||
# , sel_lineages_f == "L2;L3" ~ 12
|
||||
# , sel_lineages_f == "L2;L3;L4" ~ 13
|
||||
# , sel_lineages_f == "L2;L4" ~ 14
|
||||
# , sel_lineages_f == "L2;L6" ~ 15
|
||||
# , sel_lineages_f == "L2;LBOV" ~ 16
|
||||
#
|
||||
# , sel_lineages_f == "L3;L4" ~ 17
|
||||
#
|
||||
# , sel_lineages_f == "L4;L6" ~ 18
|
||||
# , sel_lineages_f == "L4;L7" ~ 19
|
||||
#
|
||||
# , FALSE ~ -1)
|
||||
# )
|
|
@ -1,7 +1,8 @@
|
|||
#!/usr/bin/env Rscript
|
||||
#########################################################
|
||||
# TASK: producing boxplots for dr and other muts
|
||||
|
||||
# TASK: Script to format data for dm om plots:
|
||||
# generating LF data
|
||||
# sourced by get_plotting_dfs.R
|
||||
#########################################################
|
||||
# working dir and loading libraries
|
||||
# getwd()
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue