LSHTM_analysis/scripts/plotting/lineage_bp_data.R

173 lines
4.7 KiB
R

#!/usr/bin/env Rscript
#########################################################
# TASK: Script to format data for lineage barplots:
# WF and LF data with lineage sample, and snp counts
# sourced by get_plotting_dfs.R
#########################################################
# working dir and loading libraries
# getwd()
# setwd("~/git/LSHTM_analysis/scripts/plotting")
# getwd()
# make cmd
# globals
# drug = "streptomycin"
# gene = "gid"
# source("get_plotting_dfs.R")
#=======================================================================
#################################################
# Get data with lineage count, and snp diversity
#################################################
table(merged_df2$lineage)
if (table(merged_df2$lineage == "")[[2]]) {
cat("\nMissing samples with lineage classification:", table(merged_df2$lineage == "")[[2]])
}
##################################
# WF data: lineages with
# snp count
# total_samples
# snp diversity (perc)
##################################
sel_lineages = levels(as.factor(merged_df2$lineage))
lin_wf = data.frame(sel_lineages) #4, 1
total_snps_u = NULL
total_samples = NULL
for (i in sel_lineages){
#print(i)
curr_total = length(unique(merged_df2$id)[merged_df2$lineage==i])
#print(curr_total)
total_samples = c(total_samples, curr_total)
print(total_samples)
foo = merged_df2[merged_df2$lineage==i,]
print(paste0(i, "=======\n"))
print(length(unique(foo$mutationinformation)))
curr_count = length(unique(foo$mutationinformation))
total_snps_u = c(total_snps_u, curr_count)
}
lin_wf
# Add these counts as columns to the df
lin_wf$num_snps_u = total_snps_u
lin_wf$total_samples = total_samples
# Add SNP diversity
lin_wf$snp_diversity = lin_wf$num_snps_u/lin_wf$total_samples
lin_wf
#=====================
# Add some formatting
#=====================
# SNP diversity
lin_wf$snp_diversity_f = round( (lin_wf$snp_diversity * 100), digits = 0)
lin_wf
# Lineage names
lin_wf$sel_lineages_f = gsub("lineage", "L", lin_wf$sel_lineages)
lin_wf
# # Lineage names
# lin_wf = lin_wf %>%
# mutate(ordering_category = case_when(
# sel_lineages_f == "" ~ 0
# , sel_lineages_f == "L1" ~ 1
# , sel_lineages_f == "L2" ~ 2
# , sel_lineages_f == "L3" ~ 3
# , sel_lineages_f == "L4" ~ 4
# , sel_lineages_f == "L5" ~ 5
# , sel_lineages_f == "L6" ~ 6
# , sel_lineages_f == "L7" ~ 7
# , sel_lineages_f == "LBOV" ~ 8
#
# , sel_lineages_f == "L1;L2" ~ 9
# , sel_lineages_f == "L1;L3" ~ 10
# , sel_lineages_f == "L1;L4" ~ 11
#
# , sel_lineages_f == "L2;L3" ~ 12
# , sel_lineages_f == "L2;L3;L4" ~ 13
# , sel_lineages_f == "L2;L4" ~ 14
# , sel_lineages_f == "L2;L6" ~ 15
# , sel_lineages_f == "L2;LBOV" ~ 16
#
# , sel_lineages_f == "L3;L4" ~ 17
#
# , sel_lineages_f == "L4;L6" ~ 18
# , sel_lineages_f == "L4;L7" ~ 19
#
# , FALSE ~ -1)
# )
##################################
# LF data: lineages with
# snp count
# total_samples
# snp diversity (perc)
##################################
names(lin_wf)
tot_cols = ncol(lin_wf)
pivot_cols = c("sel_lineages", "sel_lineages_f", "snp_diversity", "snp_diversity_f")
pivot_cols_n = length(pivot_cols)
expected_rows = nrow(lin_wf) * ( length(lin_wf) - pivot_cols_n )
lin_lf <- gather(lin_wf
, count_categ
, p_count
, num_snps_u:total_samples
, factor_key = TRUE)
lin_lf
# quick checks
if ( nrow(lin_lf) == expected_rows ){
cat("\nPASS: Lineage LF data created"
, "\nnrow: ", nrow(lin_lf)
, "\nncol: ", ncol(lin_lf))
} else {
cat("\nFAIL: numbers mismatch"
, "\nExpected nrow: ", expected_rows)
}
#######################################
# #=====================
# # Add some formatting
# #=====================
# lin_lf$sel_lineages_f = gsub("lineage", "L", lin_lf$sel_lineages)
# lin_lf
# lin_lf = lin_lf %>%
# mutate(ordering_category = case_when(
# sel_lineages_f == "" ~ 0
# , sel_lineages_f == "L1" ~ 1
# , sel_lineages_f == "L2" ~ 2
# , sel_lineages_f == "L3" ~ 3
# , sel_lineages_f == "L4" ~ 4
# , sel_lineages_f == "L5" ~ 5
# , sel_lineages_f == "L6" ~ 6
# , sel_lineages_f == "L7" ~ 7
# , sel_lineages_f == "LBOV" ~ 8
#
# , sel_lineages_f == "L1;L2" ~ 9
# , sel_lineages_f == "L1;L3" ~ 10
# , sel_lineages_f == "L1;L4" ~ 11
#
# , sel_lineages_f == "L2;L3" ~ 12
# , sel_lineages_f == "L2;L3;L4" ~ 13
# , sel_lineages_f == "L2;L4" ~ 14
# , sel_lineages_f == "L2;L6" ~ 15
# , sel_lineages_f == "L2;LBOV" ~ 16
#
# , sel_lineages_f == "L3;L4" ~ 17
#
# , sel_lineages_f == "L4;L6" ~ 18
# , sel_lineages_f == "L4;L7" ~ 19
#
# , FALSE ~ -1)
# )