#!/usr/bin/env Rscript  
#########################################################
# TASK: Script to format data for lineage barplots:
# WF and LF data with lineage sample, and snp counts
# sourced by get_plotting_dfs.R
#########################################################
# working dir and loading libraries
# getwd()
# setwd("~/git/LSHTM_analysis/scripts/plotting")
# getwd()

# make cmd
# globals
# drug = "streptomycin"
# gene = "gid"

# source("get_plotting_dfs.R")
#=======================================================================
#################################################
# Get data with lineage count, and snp diversity
#################################################
table(merged_df2$lineage)

if (table(merged_df2$lineage == "")[[2]]) {

cat("\nMissing samples with lineage classification:", table(merged_df2$lineage == "")[[2]])
  
}

table(merged_df2$lineage_labels)
class(merged_df2$lineage_labels); nlevels(merged_df2$lineage_labels)

##################################
# WF data: lineages with 
# snp count
# total_samples
# snp diversity (perc)
##################################
sel_lineages = levels(merged_df2$lineage_labels)

lin_wf = data.frame(sel_lineages) #4, 1
total_snps_u = NULL
total_samples = NULL

for (i in sel_lineages){
  #print(i)
  curr_total = length(unique(merged_df2$id)[merged_df2$lineage_labels==i])
  #print(curr_total)
  total_samples = c(total_samples, curr_total)
  print(total_samples)

  foo = merged_df2[merged_df2$lineage_labels==i,]
  print(paste0(i, "=======\n"))
  print(length(unique(foo$mutationinformation)))
  curr_count = length(unique(foo$mutationinformation))
  
  total_snps_u = c(total_snps_u, curr_count)
}
lin_wf

# Add these counts as columns to the df
lin_wf$num_snps_u = total_snps_u
lin_wf$total_samples = total_samples
lin_wf

# Add SNP diversity
lin_wf$snp_diversity = lin_wf$num_snps_u/lin_wf$total_samples
lin_wf

#=====================
# Add some formatting
#=====================
# SNP diversity 
lin_wf$snp_diversity_f = round( (lin_wf$snp_diversity * 100), digits = 0)
lin_wf$snp_diversity_f = paste0(lin_wf$snp_diversity_f, "%")

lin_wf$sel_lineages

# Important: Check factors so that x-axis categ appear as you want
lin_wf$sel_lineages =  factor(lin_wf$sel_lineages, c("L1"
                                                     , "L2"
                                                     , "L3"
                                                     , "L4"
                                                     , "L5"
                                                     , "L6"
                                                     , "L7"
                                                     , "LBOV"
                                                     , "L1;L2"
                                                     , "L1;L3"
                                                     , "L1;L4"
                                                     , "L2;L3"
                                                     , "L2;L3;L4"
                                                     , "L2;L4"  
                                                     , "L2;L6"
                                                     , "L2;LBOV"  
                                                     , "L3;L4" 
                                                     , "L4;L6"
                                                     , "L4;L7"
                                                     , ""))

levels(lin_wf$sel_lineages)

##################################
# LF data: lineages with 
# snp count
# total_samples
# snp diversity (perc)
##################################
names(lin_wf)
tot_cols = ncol(lin_wf)
pivot_cols = c("sel_lineages", "snp_diversity", "snp_diversity_f")
pivot_cols_n = length(pivot_cols)

expected_rows =  nrow(lin_wf) * ( length(lin_wf) - pivot_cols_n )
  
lin_lf <- gather(lin_wf
                 , count_categ
                 , p_count
                 , num_snps_u:total_samples
                 , factor_key = TRUE)
lin_lf

# quick checks
if ( nrow(lin_lf )  ==  expected_rows ){
  cat("\nPASS: Lineage LF data created"
      , "\nnrow: ", nrow(lin_lf)
      , "\nncol: ", ncol(lin_lf))
} else {
  cat("\nFAIL: numbers mismatch"
      , "\nExpected nrow: ", expected_rows)
}

# Important: Relevel factors so that x-axis categ appear as you want
lin_lf$sel_lineages =  factor(lin_lf$sel_lineages, c("L1"
                                                     , "L2"
                                                     , "L3"
                                                     , "L4"
                                                     , "L5"
                                                     , "L6"
                                                     , "L7"
                                                     , "LBOV"
                                                     , "L1;L2"
                                                     , "L1;L3"
                                                     , "L1;L4"
                                                     , "L2;L3"
                                                     , "L2;L3;L4"
                                                     , "L2;L4"  
                                                     , "L2;L6"
                                                     , "L2;LBOV"  
                                                     , "L3;L4" 
                                                     , "L4;L6"
                                                     , "L4;L7"
                                                     , ""))

levels(lin_lf$sel_lineages)