diff --git a/dynamut/split_csv.sh b/dynamut/split_csv.sh index 18103c6..17c1a03 100755 --- a/dynamut/split_csv.sh +++ b/dynamut/split_csv.sh @@ -12,10 +12,12 @@ CHUNK=$3 mkdir -p ${OUTDIR}/${CHUNK} cd ${OUTDIR}/${CHUNK} +# makes the 2 dirs, hence ../.. split ../../${INFILE} -l ${CHUNK} -d snp_batch_ -#split ${INFILE} -l ${CHUNK} -d snp_batch_ # use case #~/git/LSHTM_analysis/dynamut/split_csv.sh gid_mcsm_formatted_snps.csv snp_batches 50 #~/git/LSHTM_analysis/dynamut/split_csv.sh embb_mcsm_formatted_snps.csv snp_batches 50 -~/git/LSHTM_analysis/dynamut/split_csv.sh pnca_mcsm_formatted_snps.csv snp_batches 50 +#~/git/LSHTM_analysis/dynamut/split_csv.sh pnca_mcsm_formatted_snps.csv snp_batches 50 + +# add .txt to the files diff --git a/scripts/functions/bp_lineage.R b/scripts/functions/bp_lineage.R new file mode 100644 index 0000000..86eb9f1 --- /dev/null +++ b/scripts/functions/bp_lineage.R @@ -0,0 +1,172 @@ +######################################## +# Lineage and within SNP count barplot +######################################## + +lin_count_bp <- function( lf_data + , x_categ = "" + , y_count = "" + , bar_fill_categ = "" + , display_label_col = "" + , bar_stat_stype = "identity" + , x_lab_angle = 90 + , d_lab_size = 5 + , d_lab_hjust = 0.5 + , d_lab_vjust = 0.5 + , d_lab_col = "black" + , my_xats = 20 # x axis text size + , my_yats = 20 # y axis text size + , my_xals = 22 # x axis label size + , my_yals = 22 # y axis label size + , my_lls = 22 # legend label size + , bar_col_labels = c("Mutations", "Total Samples") + , bar_col_values = c("grey50", "gray75") + , bar_leg_name = "" + , leg_location = "top" + , y_log10 = FALSE + , y_scale_percent = FALSE + , y_label = c("Count", "SNP diversity") + ) { + g = ggplot(lf_data + , aes( x = factor( eval(parse(text = x_categ)), ordered = T ) + , y = eval(parse(text = y_count)) + , fill = eval(parse(text = bar_fill_categ)) ) ) + + OutPlot = g + geom_bar( stat = bar_stat_stype + , position = position_stack(reverse = TRUE) + #, alpha = 1 + #, colour = "grey75" + ) + + theme(axis.text.x = element_text(size = my_xats + , angle = x_lab_angle) + , axis.text.y = element_text(size = my_yats + , angle = 90 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = my_xals + , colour = "black") + , axis.title.y = element_text(size = my_yals + , colour = "black") + , legend.position = leg_location + , legend.text = element_text(size = my_lls)) + + + geom_label(aes(label = eval(parse(text = display_label_col))) + , size = d_lab_size + , hjust = d_lab_hjust + , vjust = d_lab_vjust + , colour = d_lab_col + , show.legend = FALSE + #, check_overlap = TRUE + , position = position_stack(reverse = T)) + + + scale_fill_manual(values = bar_col_values + , name = bar_leg_name + , labels = bar_col_labels) + + labs(title = "" + , x = "" + , y = y_label + , colour = "black") + + + if (y_log10){ + + OutPlot = OutPlot + + scale_y_continuous(trans = "log10" + , labels = trans_format("log10", math_format(10^.x) ) ) + + } + + if (y_scale_percent){ + + OutPlot = OutPlot + + #scale_y_continuous(labels = scales::percent_format(accuracy = 1)) + + scale_y_continuous(labels = scales::percent) + + + labs(title = "" + , x = "" + , y = y_label + , colour = "black") + } + + return(OutPlot) +} + +############################ +# Lineage diversity barplot +############################ +# lin_diversity_bp <- function( wf_data +# , x_categ = "sel_lineages" +# , y_count = "snp_diversity" +# , bar_stat_stype = "identity" +# , display_label_col = "snp_diversity_f" +# , x_lab_angle = 90 +# , d_lab_size = 5 +# , d_lab_hjust = 0.5 +# , d_lab_vjust = 0.5 +# , d_lab_col = "black" +# , my_xats = 20 # x axis text size +# , my_yats = 20 # y axis text size +# , my_xals = 22 # x axis label size +# , my_yals = 22 # y axis label size +# , my_lls = 22 # legend label size +# , bar_leg_name = "" +# , leg_location = "top" +# , y_scale_percent = TRUE +# , y_label = "SNP diversity" ) +# +# { +# g = ggplot(wf_data +# , aes( x = factor( eval(parse(text = x_categ)), ordered = T ) +# , y = eval(parse(text = y_count)) ) ) +# +# OutPlot = g + geom_bar( stat = bar_stat_stype +# , position = position_stack(reverse = TRUE) +# ) + +# +# theme(axis.text.x = element_text(size = my_xats +# , angle = x_lab_angle) +# , axis.text.y = element_text(size = my_yats +# , angle = 90 +# , hjust = 1 +# , vjust = 0) +# , axis.title.x = element_text(size = my_xals +# , colour = "black") +# , axis.title.y = element_text(size = my_yals +# , colour = "black") +# , legend.position = leg_location +# , legend.text = element_text(size = my_lls)) + +# +# geom_label(aes(label = eval(parse(text = display_label_col))) +# , size = d_lab_size +# , hjust = d_lab_hjust +# , vjust = d_lab_vjust +# , colour = d_lab_col +# , show.legend = FALSE +# #, check_overlap = TRUE +# , position = position_stack(reverse = T)) +# # return(OutPlot) +# +# if (y_scale_percent){ +# +# OutPlot = OutPlot + +# scale_y_continuous(labels = scales::percent_format(accuracy = 1)) + +# labs(title = "" +# , x = "" +# , y = y_label +# , colour = "black") +# +# return(OutPlot) +# } + +# return(OutPlot) + +# } + + + + +# ggp <- ggplot(bar_sel, aes(sel_lineages, snp_diversity)) + +# geom_bar(stat = "identity") +# ggp + scale_y_continuous(labels = scales::percent_format(accuracy = 1) +# #, limits = c(0,1) +# , breaks = seq(0, 30, 5) +# ) diff --git a/scripts/functions/test_bp_lineage.R b/scripts/functions/test_bp_lineage.R new file mode 100644 index 0000000..6742429 --- /dev/null +++ b/scripts/functions/test_bp_lineage.R @@ -0,0 +1,111 @@ +setwd("~/git/LSHTM_analysis/scripts/plotting") + +#source ('get_plotting_dfs.R') +source("../functions/bp_lineage.R") + +######################################### +# Lineage and SNP count: lineage lf data +######################################### +# Relevel factors so that x-axis categ appear as you want +lin_lf_plot = lin_lf +lin_lf_plot +is.factor(lin_lf_plot$sel_lineages_f) + +lin_lf_plot$sel_lineages_f = factor(lin_lf_plot$sel_lineages_f, c("" + , "L1" + , "L2" + , "L3" + , "L4" + , "L5" + , "L6" + , "L7" + , "LBOV" + , "L1;L2" + , "L1;L3" + , "L1;L4" + , "L2;L3" + , "L2;L3;L4" + , "L2;L4" + , "L2;L6" + , "L2;LBOV" + , "L3;L4" + , "L4;L6" + , "L4;L7")) + +levels(lin_lf_plot$sel_lineages_f) + +lin_count_bp(lin_lf_plot + , x_categ = "sel_lineages_f" + , y_count = "p_count" + , bar_fill_categ = "count_categ" + , display_label_col = "p_count" + , bar_stat_stype = "identity" + , x_lab_angle = 90 + , my_xats = 20 + , bar_col_labels = c("Mutations", "Total Samples") + , bar_col_values = c("grey50", "gray75") + , y_log10 = T + , y_label = "Count" + , y_scale_percent = F) + +############################################### +# Lineage SNP diversity count: lineage wf data +############################################### +# Relevel factors so that x-axis categ appear as you want +lin_wf_plot = lin_wf +is.factor(lin_wf_plot$sel_lineages_f) + +lin_wf_plot$sel_lineages_f = factor(lin_wf_plot$sel_lineages_f, c("" + , "L1" + , "L2" + , "L3" + , "L4" + , "L5" + , "L6" + , "L7" + , "LBOV" + , "L1;L2" + , "L1;L3" + , "L1;L4" + , "L2;L3" + , "L2;L3;L4" + , "L2;L4" + , "L2;L6" + , "L2;LBOV" + , "L3;L4" + , "L4;L6" + , "L4;L7")) + +levels(lin_wf_plot$sel_lineages_f) + +#========== +# Plot +#========== +lin_count_bp(lin_wf_plot + , x_categ = "sel_lineages_f" + , y_count = "snp_diversity" + , display_label_col = "snp_diversity_f" + + , bar_stat_stype = "identity" + , x_lab_angle = 90 + , my_xats = 20 + , y_scale_percent = T + , y_label = "SNP diversity" + + +) + + + + +, x_categ = "sel_lineages_f" +, y_count = "p_count" +, bar_fill_categ = "count_categ" +, display_label_col = "p_count" +, bar_stat_stype = "identity" +, x_lab_angle = 90 +, my_xats = 15 +, bar_col_labels = c("Mutations", "Total Samples") +, bar_col_values = c("grey50", "gray75") +, y_log10 = T +, y_scale_percent = F \ No newline at end of file diff --git a/scripts/plotting/get_plotting_dfs.R b/scripts/plotting/get_plotting_dfs.R index 2fc1c19..89b477c 100644 --- a/scripts/plotting/get_plotting_dfs.R +++ b/scripts/plotting/get_plotting_dfs.R @@ -437,13 +437,19 @@ if (nrow(corr_ps_df3) == nrow(merged_df3) && nrow(merged_df3_comp) == check1) { , "\nGot: ", check1) } - rm(foo) #################################################################### # Data for DM OM Plots: Long format dfs #################################################################### + source("other_plots_data.R") +#################################################################### +# Data for Lineage barplots: WF and LF dfs +#################################################################### + +source("lineage_bp_data.R") + ######################################################################## # End of script ######################################################################## diff --git a/scripts/plotting/lineage_bp_data.R b/scripts/plotting/lineage_bp_data.R new file mode 100644 index 0000000..2cdfbe8 --- /dev/null +++ b/scripts/plotting/lineage_bp_data.R @@ -0,0 +1,173 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: Script to format data for lineage barplots: +# WF and LF data with lineage sample, and snp counts +# sourced by get_plotting_dfs.R +######################################################### +# working dir and loading libraries +# getwd() +# setwd("~/git/LSHTM_analysis/scripts/plotting") +# getwd() + +# make cmd +# globals +# drug = "streptomycin" +# gene = "gid" + +# source("get_plotting_dfs.R") +#======================================================================= +################################################# +# Get data with lineage count, and snp diversity +################################################# +table(merged_df2$lineage) + +if (table(merged_df2$lineage == "")[[2]]) { + +cat("\nMissing samples with lineage classification:", table(merged_df2$lineage == "")[[2]]) + +} + +################################## +# WF data: lineages with +# snp count +# total_samples +# snp diversity (perc) +################################## +sel_lineages = levels(as.factor(merged_df2$lineage)) + +lin_wf = data.frame(sel_lineages) #4, 1 +total_snps_u = NULL +total_samples = NULL + +for (i in sel_lineages){ + #print(i) + curr_total = length(unique(merged_df2$id)[merged_df2$lineage==i]) + #print(curr_total) + total_samples = c(total_samples, curr_total) + print(total_samples) + + foo = merged_df2[merged_df2$lineage==i,] + print(paste0(i, "=======\n")) + print(length(unique(foo$mutationinformation))) + curr_count = length(unique(foo$mutationinformation)) + + total_snps_u = c(total_snps_u, curr_count) +} +lin_wf + +# Add these counts as columns to the df +lin_wf$num_snps_u = total_snps_u +lin_wf$total_samples = total_samples + +# Add SNP diversity +lin_wf$snp_diversity = lin_wf$num_snps_u/lin_wf$total_samples +lin_wf + +#===================== +# Add some formatting +#===================== +# SNP diversity +lin_wf$snp_diversity_f = round( (lin_wf$snp_diversity * 100), digits = 0) +lin_wf + +# Lineage names +lin_wf$sel_lineages_f = gsub("lineage", "L", lin_wf$sel_lineages) +lin_wf + +# # Lineage names +# lin_wf = lin_wf %>% +# mutate(ordering_category = case_when( +# sel_lineages_f == "" ~ 0 +# , sel_lineages_f == "L1" ~ 1 +# , sel_lineages_f == "L2" ~ 2 +# , sel_lineages_f == "L3" ~ 3 +# , sel_lineages_f == "L4" ~ 4 +# , sel_lineages_f == "L5" ~ 5 +# , sel_lineages_f == "L6" ~ 6 +# , sel_lineages_f == "L7" ~ 7 +# , sel_lineages_f == "LBOV" ~ 8 +# +# , sel_lineages_f == "L1;L2" ~ 9 +# , sel_lineages_f == "L1;L3" ~ 10 +# , sel_lineages_f == "L1;L4" ~ 11 +# +# , sel_lineages_f == "L2;L3" ~ 12 +# , sel_lineages_f == "L2;L3;L4" ~ 13 +# , sel_lineages_f == "L2;L4" ~ 14 +# , sel_lineages_f == "L2;L6" ~ 15 +# , sel_lineages_f == "L2;LBOV" ~ 16 +# +# , sel_lineages_f == "L3;L4" ~ 17 +# +# , sel_lineages_f == "L4;L6" ~ 18 +# , sel_lineages_f == "L4;L7" ~ 19 +# +# , FALSE ~ -1) +# ) + +################################## +# LF data: lineages with +# snp count +# total_samples +# snp diversity (perc) +################################## +names(lin_wf) +tot_cols = ncol(lin_wf) +pivot_cols = c("sel_lineages", "sel_lineages_f", "snp_diversity", "snp_diversity_f") +pivot_cols_n = length(pivot_cols) + +expected_rows = nrow(lin_wf) * ( length(lin_wf) - pivot_cols_n ) + +lin_lf <- gather(lin_wf + , count_categ + , p_count + , num_snps_u:total_samples + , factor_key = TRUE) +lin_lf + +# quick checks +if ( nrow(lin_lf) == expected_rows ){ + cat("\nPASS: Lineage LF data created" + , "\nnrow: ", nrow(lin_lf) + , "\nncol: ", ncol(lin_lf)) +} else { + cat("\nFAIL: numbers mismatch" + , "\nExpected nrow: ", expected_rows) +} +####################################### +# #===================== +# # Add some formatting +# #===================== +# lin_lf$sel_lineages_f = gsub("lineage", "L", lin_lf$sel_lineages) +# lin_lf + + +# lin_lf = lin_lf %>% +# mutate(ordering_category = case_when( +# sel_lineages_f == "" ~ 0 +# , sel_lineages_f == "L1" ~ 1 +# , sel_lineages_f == "L2" ~ 2 +# , sel_lineages_f == "L3" ~ 3 +# , sel_lineages_f == "L4" ~ 4 +# , sel_lineages_f == "L5" ~ 5 +# , sel_lineages_f == "L6" ~ 6 +# , sel_lineages_f == "L7" ~ 7 +# , sel_lineages_f == "LBOV" ~ 8 +# +# , sel_lineages_f == "L1;L2" ~ 9 +# , sel_lineages_f == "L1;L3" ~ 10 +# , sel_lineages_f == "L1;L4" ~ 11 +# +# , sel_lineages_f == "L2;L3" ~ 12 +# , sel_lineages_f == "L2;L3;L4" ~ 13 +# , sel_lineages_f == "L2;L4" ~ 14 +# , sel_lineages_f == "L2;L6" ~ 15 +# , sel_lineages_f == "L2;LBOV" ~ 16 +# +# , sel_lineages_f == "L3;L4" ~ 17 +# +# , sel_lineages_f == "L4;L6" ~ 18 +# , sel_lineages_f == "L4;L7" ~ 19 +# +# , FALSE ~ -1) +# ) diff --git a/scripts/plotting/other_plots_data.R b/scripts/plotting/other_plots_data.R index 8eb2020..a55303b 100644 --- a/scripts/plotting/other_plots_data.R +++ b/scripts/plotting/other_plots_data.R @@ -1,7 +1,8 @@ #!/usr/bin/env Rscript ######################################################### -# TASK: producing boxplots for dr and other muts - +# TASK: Script to format data for dm om plots: +# generating LF data +# sourced by get_plotting_dfs.R ######################################################### # working dir and loading libraries # getwd()