Added function for generating lineage barplots and its test script, along with a script for processing the lineage data, which is now sourced from get_plotting_dfs.R

2021-09-06 19:50:50 +01:00 · 2021-09-06 19:50:50 +01:00 · 869fca7f94
commit 869fca7f94
parent 605eb54526
6 changed files with 470 additions and 5 deletions
--- a/scripts/plotting/get_plotting_dfs.R
+++ b/scripts/plotting/get_plotting_dfs.R
@ -437,13 +437,19 @@ if (nrow(corr_ps_df3) == nrow(merged_df3) && nrow(merged_df3_comp) == check1) {
      , "\nGot: ", check1)
 }

-
 rm(foo)
 ####################################################################
 #                        Data for DM OM Plots: Long format dfs
 ####################################################################
+
 source("other_plots_data.R")

+####################################################################
+#                  Data for Lineage barplots: WF and LF dfs
+####################################################################
+
+source("lineage_bp_data.R")
+
 ########################################################################
 #                           End of script
 ########################################################################
--- a/scripts/plotting/lineage_bp_data.R
+++ b/scripts/plotting/lineage_bp_data.R
@ -0,0 +1,173 @@
+#!/usr/bin/env Rscript  
+#########################################################
+# TASK: Script to format data for lineage barplots:
+# WF and LF data with lineage sample, and snp counts
+# sourced by get_plotting_dfs.R
+#########################################################
+# working dir and loading libraries
+# getwd()
+# setwd("~/git/LSHTM_analysis/scripts/plotting")
+# getwd()
+
+# make cmd
+# globals
+# drug = "streptomycin"
+# gene = "gid"
+
+# source("get_plotting_dfs.R")
+#=======================================================================
+#################################################
+# Get data with lineage count, and snp diversity
+#################################################
+# Tabulate lineage labels (only shown when run interactively or with echo on)
+table(merged_df2$lineage)
+
+# Number of rows whose lineage is the empty string.
+# sum() yields 0 when there are none, whereas table(...)[[2]] fails with
+# "subscript out of bounds" unless both FALSE and TRUE cells are present.
+# na.rm = TRUE: NA lineages are not counted, as table() ignored them too.
+n_missing_lineage = sum(merged_df2$lineage == "", na.rm = TRUE)
+
+if (n_missing_lineage > 0) {
+  cat("\nMissing samples with lineage classification:", n_missing_lineage)
+}
+
+##################################
+# WF data: lineages with 
+# snp count
+# total_samples
+# snp diversity (perc)
+##################################
+# Distinct lineage labels in factor-level order (NA is never a level)
+sel_lineages = levels(as.factor(merged_df2$lineage))
+
+# Wide-format df: one row per lineage
+lin_wf = data.frame(sel_lineages)
+
+# Per-lineage counters, preallocated rather than grown with c() in the loop
+total_snps_u = integer(length(sel_lineages))
+total_samples = integer(length(sel_lineages))
+
+for (k in seq_along(sel_lineages)) {
+  # which() discards NA comparisons, so rows with a missing lineage are not
+  # dragged in as all-NA rows (which would add a spurious NA "mutation")
+  lin_rows = merged_df2[which(merged_df2$lineage == sel_lineages[k]), ]
+
+  # Unique sample ids within this lineage: subset first, then de-duplicate.
+  # Indexing unique(merged_df2$id) with a row-length mask instead returns one
+  # element per matching row, i.e. a row count rather than a sample count.
+  total_samples[k] = length(unique(lin_rows$id))
+
+  # Unique mutations observed within this lineage
+  total_snps_u[k] = length(unique(lin_rows$mutationinformation))
+
+  cat("\nLineage:", sel_lineages[k]
+      , "\nTotal samples:", total_samples[k]
+      , "\nUnique SNPs:", total_snps_u[k], "\n")
+}
+
+# Add these counts as columns to the df
+lin_wf$num_snps_u = total_snps_u
+lin_wf$total_samples = total_samples
+
+# Add SNP diversity: unique SNPs per sample within each lineage
+lin_wf$snp_diversity = lin_wf$num_snps_u/lin_wf$total_samples
+lin_wf
+
+#=====================
+# Add some formatting
+#=====================
+# SNP diversity expressed as a whole-number percentage
+diversity_pct = 100 * lin_wf[["snp_diversity"]]
+lin_wf[["snp_diversity_f"]] = round(diversity_pct)
+lin_wf
+
+# Short lineage labels: every "lineage" substring becomes "L"
+lin_wf[["sel_lineages_f"]] = gsub(pattern = "lineage"
+                                  , replacement = "L"
+                                  , x = lin_wf[["sel_lineages"]])
+lin_wf
+
+# # Lineage names
+# lin_wf = lin_wf %>%
+#   mutate(ordering_category = case_when(
+#     sel_lineages_f   == ""    ~ 0
+#     , sel_lineages_f == "L1"   ~ 1
+#     , sel_lineages_f == "L2"   ~ 2
+#     , sel_lineages_f == "L3"   ~ 3
+#     , sel_lineages_f == "L4"   ~ 4
+#     , sel_lineages_f == "L5"   ~ 5
+#     , sel_lineages_f == "L6"   ~ 6
+#     , sel_lineages_f == "L7"   ~ 7
+#     , sel_lineages_f == "LBOV" ~ 8
+#     
+#     , sel_lineages_f == "L1;L2" ~ 9
+#     , sel_lineages_f == "L1;L3" ~ 10
+#     , sel_lineages_f == "L1;L4" ~ 11
+#     
+#     , sel_lineages_f == "L2;L3"    ~ 12
+#     , sel_lineages_f == "L2;L3;L4" ~ 13
+#     , sel_lineages_f == "L2;L4"    ~ 14
+#     , sel_lineages_f == "L2;L6"    ~ 15
+#     , sel_lineages_f == "L2;LBOV"  ~ 16
+#     
+#     , sel_lineages_f == "L3;L4" ~ 17
+#     
+#     , sel_lineages_f == "L4;L6" ~ 18
+#     , sel_lineages_f == "L4;L7" ~ 19
+#     
+#     , FALSE ~ -1)
+#   )
+
+##################################
+# LF data: lineages with 
+# snp count
+# total_samples
+# snp diversity (perc)
+##################################
+# Columns available in the wide-format df
+names(lin_wf)
+tot_cols = ncol(lin_wf)
+
+# Columns that are NOT gathered; they are repeated on every long-format row
+pivot_cols = c("sel_lineages", "sel_lineages_f", "snp_diversity", "snp_diversity_f")
+pivot_cols_n = length(pivot_cols)
+
+# Every gathered (count) column contributes one row per lineage
+expected_rows = nrow(lin_wf) * (tot_cols - pivot_cols_n)
+
+# Stack the count columns into a key column (count_categ) and a value column
+# (p_count). The range num_snps_u:total_samples is positional, so it relies on
+# those two columns being adjacent in lin_wf.
+# factor_key = TRUE stores the key as a factor in original column order.
+lin_lf <- gather(lin_wf
+                 , count_categ
+                 , p_count
+                 , num_snps_u:total_samples
+                 , factor_key = TRUE)
+lin_lf
+
+# quick checks: report both expected and actual row counts on failure
+if ( nrow(lin_lf) == expected_rows ){
+  cat("\nPASS: Lineage LF data created"
+      , "\nnrow: ", nrow(lin_lf)
+      , "\nncol: ", ncol(lin_lf))
+} else {
+  cat("\nFAIL: numbers mismatch"
+      , "\nExpected nrow: ", expected_rows
+      , "\nGot: ", nrow(lin_lf))
+}
+#######################################
+# #=====================
+# # Add some formatting
+# #=====================
+# lin_lf$sel_lineages_f = gsub("lineage", "L", lin_lf$sel_lineages)
+# lin_lf
+
+
+# lin_lf = lin_lf %>%
+#   mutate(ordering_category = case_when(
+#      sel_lineages_f   == ""    ~ 0
+#     , sel_lineages_f == "L1"   ~ 1
+#     , sel_lineages_f == "L2"   ~ 2
+#     , sel_lineages_f == "L3"   ~ 3
+#     , sel_lineages_f == "L4"   ~ 4
+#     , sel_lineages_f == "L5"   ~ 5
+#     , sel_lineages_f == "L6"   ~ 6
+#     , sel_lineages_f == "L7"   ~ 7
+#     , sel_lineages_f == "LBOV" ~ 8
+# 
+#     , sel_lineages_f == "L1;L2" ~ 9
+#     , sel_lineages_f == "L1;L3" ~ 10
+#     , sel_lineages_f == "L1;L4" ~ 11
+# 
+#     , sel_lineages_f == "L2;L3"    ~ 12
+#     , sel_lineages_f == "L2;L3;L4" ~ 13
+#     , sel_lineages_f == "L2;L4"    ~ 14
+#     , sel_lineages_f == "L2;L6"    ~ 15
+#     , sel_lineages_f == "L2;LBOV"  ~ 16
+# 
+#     , sel_lineages_f == "L3;L4" ~ 17
+# 
+#     , sel_lineages_f == "L4;L6" ~ 18
+#     , sel_lineages_f == "L4;L7" ~ 19
+# 
+#     , FALSE ~ -1)
+#   )
--- a/scripts/plotting/other_plots_data.R
+++ b/scripts/plotting/other_plots_data.R
@ -1,7 +1,8 @@
 #!/usr/bin/env Rscript  
 #########################################################
-# TASK: producing boxplots for dr and other muts
-
+# TASK: Script to format data for dm om plots: 
+# generating LF data
+# sourced by get_plotting_dfs.R
 #########################################################
 # working dir and loading libraries
 # getwd()