added lineage_dist.R nad renamed lineage_bp_data file to lineage_data

2021-09-09 16:14:14 +01:00 · 2021-09-09 16:14:14 +01:00 · 66d45a8b73
commit 66d45a8b73
parent cdab8f0414
2 changed files with 224 additions and 0 deletions
--- a/scripts/functions/lineage_dist.R
+++ b/scripts/functions/lineage_dist.R
@ -0,0 +1,69 @@
 ###############################
 # TASK: function to plot lineage
 # dist plots with or without facet
 # think about color palette
 # for stability
 ##############################
 #n_colours = length(unique(lin_dist_plot$duet_scaled))
 #my_palette <- colorRampPalette(c(mcsm_red2, mcsm_red1, mcsm_mid, mcsm_blue1, mcsm_blue2))(n = n_colours+1)
 lineage_distP <- function(plotdf
                          , x_axis = "duet_scaled"
                          , y_axis = "lineage_labels"
                          , x_lab = "DUET"
                          , with_facet = F
                          , facet_wrap_var = ""
                          , fill_categ = "mutation_info_labels"
                          , fill_categ_cols = c("#E69F00", "#999999")
                          , my_ats = 15 # axis text size
                          , my_als = 20 # axis label size
                          , my_leg_ts = 16
                          , my_leg_title = 16
                          , my_strip_ts = 20
                          , leg_pos = c(0.8, 0.9)
                          , leg_pos_wf = c("top", "left", "bottom", "right")
                          , leg_dir_wf = c("horizontal", "vertical")
                          , leg_label = "")
 {
 LinDistP = ggplot(plotdf, aes_string(x = x_axis
                              , y = y_axis))+
  geom_density_ridges(aes_string(fill = fill_categ)
                      , scale = 3
                      , size = 0.3
                      , alpha = 0.8) +
  scale_x_continuous(expand = c(0.01, 0.01)) +
  #coord_cartesian( xlim = c(-1, 1)) +
  scale_fill_manual(values = fill_categ_cols) +
  theme(axis.text.x = element_text(size = my_ats
                                   , angle = 90
                                   , hjust = 1
                                   , vjust = 0.4)
        , axis.text.y = element_text(size = my_ats)
        , axis.title.x = element_text(size = my_ats)
        , axis.title.y = element_blank()
        , strip.text = element_text(size = my_strip_ts)
        , legend.text = element_text(size = my_leg_ts)
        , legend.title = element_text(size = my_leg_title)
        , legend.position = c(0.8, 0.9)) +
  labs(x = x_lab
       , fill = leg_label)
 if (with_facet){
  # used reformulate or make as formula
  #fwv = reformulate(facet_wrap_var)
  fwv = as.formula(paste0("~", facet_wrap_var))
  LinDistP = LinDistP + 
    facet_wrap(fwv) + 
    theme(legend.position = leg_pos_wf
          , legend.direction = leg_dir_wf)
 }
 return(LinDistP)
 }
--- a/scripts/plotting/lineage_data.R
+++ b/scripts/plotting/lineage_data.R
@ -0,0 +1,155 @@
 #!/usr/bin/env Rscript  
 #########################################################
 # TASK: Script to format data for lineage barplots:
 # WF and LF data with lineage sample, and snp counts
 # sourced by get_plotting_dfs.R
 #########################################################
 # working dir and loading libraries
 # getwd()
 # setwd("~/git/LSHTM_analysis/scripts/plotting")
 # getwd()
 # make cmd
 # globals
 # drug = "streptomycin"
 # gene = "gid"
 # source("get_plotting_dfs.R")
 #=======================================================================
 #################################################
 # Get data with lineage count, and snp diversity
 #################################################
 table(merged_df2$lineage)
 if (table(merged_df2$lineage == "")[[2]]) {
 cat("\nMissing samples with lineage classification:", table(merged_df2$lineage == "")[[2]])
 }
 table(merged_df2$lineage_labels)
 class(merged_df2$lineage_labels); nlevels(merged_df2$lineage_labels)
 ##################################
 # WF data: lineages with 
 # snp count
 # total_samples
 # snp diversity (perc)
 ##################################
 sel_lineages = levels(merged_df2$lineage_labels)
 lin_wf = data.frame(sel_lineages) #4, 1
 total_snps_u = NULL
 total_samples = NULL
 for (i in sel_lineages){
  #print(i)
  curr_total = length(unique(merged_df2$id)[merged_df2$lineage_labels==i])
  #print(curr_total)
  total_samples = c(total_samples, curr_total)
  print(total_samples)
  foo = merged_df2[merged_df2$lineage_labels==i,]
  print(paste0(i, "=======\n"))
  print(length(unique(foo$mutationinformation)))
  curr_count = length(unique(foo$mutationinformation))
  total_snps_u = c(total_snps_u, curr_count)
 }
 lin_wf
 # Add these counts as columns to the df
 lin_wf$num_snps_u = total_snps_u
 lin_wf$total_samples = total_samples
 lin_wf
 # Add SNP diversity
 lin_wf$snp_diversity = lin_wf$num_snps_u/lin_wf$total_samples
 lin_wf
 #=====================
 # Add some formatting
 #=====================
 # SNP diversity 
 lin_wf$snp_diversity_f = round( (lin_wf$snp_diversity * 100), digits = 0)
 lin_wf$snp_diversity_f = paste0(lin_wf$snp_diversity_f, "%")
 lin_wf$sel_lineages
 # Important: Check factors so that x-axis categ appear as you want
 lin_wf$sel_lineages =  factor(lin_wf$sel_lineages, c("L1"
                                                     , "L2"
                                                     , "L3"
                                                     , "L4"
                                                     , "L5"
                                                     , "L6"
                                                     , "L7"
                                                     , "LBOV"
                                                     , "L1;L2"
                                                     , "L1;L3"
                                                     , "L1;L4"
                                                     , "L2;L3"
                                                     , "L2;L3;L4"
                                                     , "L2;L4"  
                                                     , "L2;L6"
                                                     , "L2;LBOV"  
                                                     , "L3;L4" 
                                                     , "L4;L6"
                                                     , "L4;L7"
                                                     , ""))
 levels(lin_wf$sel_lineages)
 ##################################
 # LF data: lineages with 
 # snp count
 # total_samples
 # snp diversity (perc)
 ##################################
 names(lin_wf)
 tot_cols = ncol(lin_wf)
 pivot_cols = c("sel_lineages", "snp_diversity", "snp_diversity_f")
 pivot_cols_n = length(pivot_cols)
 expected_rows =  nrow(lin_wf) * ( length(lin_wf) - pivot_cols_n )
 lin_lf <- gather(lin_wf
                 , count_categ
                 , p_count
                 , num_snps_u:total_samples
                 , factor_key = TRUE)
 lin_lf
 # quick checks
 if ( nrow(lin_lf )  ==  expected_rows ){
  cat("\nPASS: Lineage LF data created"
      , "\nnrow: ", nrow(lin_lf)
      , "\nncol: ", ncol(lin_lf))
 } else {
  cat("\nFAIL: numbers mismatch"
      , "\nExpected nrow: ", expected_rows)
 }
 # Important: Relevel factors so that x-axis categ appear as you want
 lin_lf$sel_lineages =  factor(lin_lf$sel_lineages, c("L1"
                                                     , "L2"
                                                     , "L3"
                                                     , "L4"
                                                     , "L5"
                                                     , "L6"
                                                     , "L7"
                                                     , "LBOV"
                                                     , "L1;L2"
                                                     , "L1;L3"
                                                     , "L1;L4"
                                                     , "L2;L3"
                                                     , "L2;L3;L4"
                                                     , "L2;L4"  
                                                     , "L2;L6"
                                                     , "L2;LBOV"  
                                                     , "L3;L4" 
                                                     , "L4;L6"
                                                     , "L4;L7"
                                                     , ""))
 levels(lin_lf$sel_lineages)