diff --git a/dynamut/split_csv.sh b/dynamut/split_csv.sh
index 18103c6..17c1a03 100755
--- a/dynamut/split_csv.sh
+++ b/dynamut/split_csv.sh
@@ -12,10 +12,12 @@ CHUNK=$3
 mkdir -p ${OUTDIR}/${CHUNK}
 cd ${OUTDIR}/${CHUNK}
 
+# makes the 2 dirs, hence ../..
 split ../../${INFILE} -l ${CHUNK} -d snp_batch_
-#split ${INFILE} -l ${CHUNK} -d snp_batch_
 
 # use case
 #~/git/LSHTM_analysis/dynamut/split_csv.sh gid_mcsm_formatted_snps.csv snp_batches 50
 #~/git/LSHTM_analysis/dynamut/split_csv.sh embb_mcsm_formatted_snps.csv snp_batches 50
-~/git/LSHTM_analysis/dynamut/split_csv.sh pnca_mcsm_formatted_snps.csv snp_batches 50
+#~/git/LSHTM_analysis/dynamut/split_csv.sh pnca_mcsm_formatted_snps.csv snp_batches 50
+
+# add .txt to the files
diff --git a/scripts/functions/bp_lineage.R b/scripts/functions/bp_lineage.R
new file mode 100644
index 0000000..86eb9f1
--- /dev/null
+++ b/scripts/functions/bp_lineage.R
@@ -0,0 +1,172 @@
+########################################
+# Lineage and within SNP count barplot
+########################################
+
+lin_count_bp <- function( lf_data
+                          , x_categ = ""
+                          , y_count = ""
+                          , bar_fill_categ = ""
+                          , display_label_col = ""
+                          , bar_stat_stype = "identity"
+                          , x_lab_angle = 90
+                          , d_lab_size = 5
+                          , d_lab_hjust = 0.5
+                          , d_lab_vjust = 0.5
+                          , d_lab_col = "black"
+                          , my_xats = 20 # x axis text size
+                          , my_yats = 20 # y axis text size
+                          , my_xals = 22 # x axis label size
+                          , my_yals = 22 # y axis label size
+                          , my_lls   = 22 # legend label size
+                          , bar_col_labels = c("Mutations", "Total Samples")
+                          , bar_col_values = c("grey50", "gray75")
+                          , bar_leg_name = ""
+                          , leg_location = "top"
+                          , y_log10 = FALSE
+                          , y_scale_percent = FALSE
+                          , y_label = c("Count", "SNP diversity")
+                          ) {
+  g = ggplot(lf_data
+             , aes(  x    = factor( eval(parse(text = x_categ)), ordered = T )
+                     , y    = eval(parse(text = y_count))
+                     , fill = eval(parse(text = bar_fill_categ)) ) )
+  
+  OutPlot = g + geom_bar( stat          = bar_stat_stype
+                          , position    = position_stack(reverse = TRUE)
+                          #, alpha    = 1
+                          #, colour   = "grey75"
+  ) + 
+    theme(axis.text.x     = element_text(size = my_xats
+                                         , angle = x_lab_angle)
+          , axis.text.y   = element_text(size = my_yats
+                                         , angle = 90
+                                         , hjust = 1
+                                         , vjust = 0)
+          , axis.title.x = element_text(size     = my_xals
+                                        , colour = "black")
+          , axis.title.y = element_text(size     = my_yals
+                                        , colour = "black")
+          , legend.position = leg_location
+          , legend.text = element_text(size = my_lls)) + 
+    
+    geom_label(aes(label = eval(parse(text = display_label_col)))
+               , size    = d_lab_size
+               , hjust   = d_lab_hjust
+               , vjust   = d_lab_vjust
+               , colour  = d_lab_col
+               , show.legend = FALSE
+               #, check_overlap = TRUE
+               , position = position_stack(reverse = T)) + 
+
+    scale_fill_manual(values   = bar_col_values
+                      , name   = bar_leg_name
+                      , labels = bar_col_labels) +
+    labs(title    = ""
+         , x      = ""
+         , y      = y_label
+         , colour = "black")
+    
+  
+  if (y_log10){
+    
+   OutPlot = OutPlot + 
+     scale_y_continuous(trans = "log10"
+                        , labels = trans_format("log10", math_format(10^.x) ) )
+
+  }
+  
+  if (y_scale_percent){
+    
+    OutPlot = OutPlot +
+      #scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
+      scale_y_continuous(labels = scales::percent) +
+      
+      labs(title    = ""
+           , x      = ""
+           , y      = y_label
+           , colour = "black")
+  }
+  
+  return(OutPlot)
+}
+
+############################
+# Lineage diversity barplot
+############################
+# lin_diversity_bp <- function( wf_data
+#                              , x_categ = "sel_lineages"
+#                              , y_count = "snp_diversity"
+#                              , bar_stat_stype = "identity"
+#                              , display_label_col = "snp_diversity_f"
+#                              , x_lab_angle = 90
+#                              , d_lab_size = 5
+#                              , d_lab_hjust = 0.5
+#                              , d_lab_vjust = 0.5
+#                              , d_lab_col = "black"
+#                              , my_xats = 20 # x axis text size
+#                              , my_yats = 20 # y axis text size
+#                              , my_xals = 22 # x axis label size
+#                              , my_yals = 22 # y axis label size
+#                              , my_lls  = 22 # legend label size
+#                              , bar_leg_name = ""
+#                              , leg_location = "top"
+#                              , y_scale_percent = TRUE
+#                              , y_label = "SNP diversity" )
+#   
+#                              {
+#   g = ggplot(wf_data
+#              , aes(  x    = factor( eval(parse(text = x_categ)), ordered = T )
+#                    , y    = eval(parse(text = y_count)) ) )
+#   
+#   OutPlot = g + geom_bar( stat     = bar_stat_stype
+#                      , position    = position_stack(reverse = TRUE)
+#                      ) + 
+#     
+#     theme(axis.text.x     = element_text(size = my_xats
+#                                          , angle = x_lab_angle)
+#           , axis.text.y   = element_text(size = my_yats
+#                                          , angle = 90
+#                                          , hjust = 1
+#                                          , vjust = 0)
+#           , axis.title.x = element_text(size     = my_xals
+#                                         , colour = "black")
+#           , axis.title.y = element_text(size     = my_yals
+#                                         , colour = "black")
+#           , legend.position = leg_location
+#           , legend.text = element_text(size = my_lls)) + 
+#     
+#     geom_label(aes(label = eval(parse(text = display_label_col)))
+#                , size    = d_lab_size
+#                , hjust   = d_lab_hjust
+#                , vjust   = d_lab_vjust
+#                , colour  = d_lab_col
+#                , show.legend = FALSE
+#                #, check_overlap = TRUE
+#                , position = position_stack(reverse = T))
+# #  return(OutPlot)
+#   
+#   if (y_scale_percent){
+# 
+#     OutPlot = OutPlot + 
+#       scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
+#       labs(title = ""
+#            , x   = ""
+#            , y   = y_label
+#            , colour = "black")
+#     
+#     return(OutPlot) 
+#     }
+     
+#  return(OutPlot)
+  
+#  }
+  
+  
+
+
+# ggp <- ggplot(bar_sel, aes(sel_lineages, snp_diversity)) +
+#   geom_bar(stat = "identity")
+# ggp +  scale_y_continuous(labels = scales::percent_format(accuracy = 1)
+#                           #, limits = c(0,1)
+#                           , breaks = seq(0, 30, 5)
+# )    
diff --git a/scripts/functions/test_bp_lineage.R b/scripts/functions/test_bp_lineage.R
new file mode 100644
index 0000000..6742429
--- /dev/null
+++ b/scripts/functions/test_bp_lineage.R
@@ -0,0 +1,111 @@
+setwd("~/git/LSHTM_analysis/scripts/plotting")
+
+#source ('get_plotting_dfs.R')
+source("../functions/bp_lineage.R")
+
+#########################################
+# Lineage and SNP count: lineage lf data
+#########################################
+# Relevel factors so that x-axis categ appear as you want
+lin_lf_plot = lin_lf
+lin_lf_plot
+is.factor(lin_lf_plot$sel_lineages_f)
+
+lin_lf_plot$sel_lineages_f =  factor(lin_lf_plot$sel_lineages_f, c(""
+                                                                   , "L1"
+                                                                   , "L2"
+                                                                   , "L3"
+                                                                   , "L4"
+                                                                   , "L5"
+                                                                   , "L6"
+                                                                   , "L7"
+                                                                   , "LBOV"
+                                                                   , "L1;L2"
+                                                                   , "L1;L3"
+                                                                   , "L1;L4"
+                                                                   , "L2;L3"
+                                                                   , "L2;L3;L4"
+                                                                   , "L2;L4"  
+                                                                   , "L2;L6"
+                                                                   , "L2;LBOV"  
+                                                                   , "L3;L4" 
+                                                                   , "L4;L6"
+                                                                   , "L4;L7"))
+
+levels(lin_lf_plot$sel_lineages_f)
+
+lin_count_bp(lin_lf_plot
+             , x_categ = "sel_lineages_f"
+             , y_count = "p_count"
+             , bar_fill_categ = "count_categ"
+             , display_label_col = "p_count"
+             , bar_stat_stype = "identity"
+             , x_lab_angle = 90
+             , my_xats = 20
+             , bar_col_labels = c("Mutations", "Total Samples")
+             , bar_col_values = c("grey50", "gray75")
+             , y_log10 = T
+             , y_label = "Count"
+             , y_scale_percent = F)
+
+###############################################
+# Lineage SNP diversity count: lineage wf data
+###############################################
+# Relevel factors so that x-axis categ appear as you want
+lin_wf_plot = lin_wf
+is.factor(lin_wf_plot$sel_lineages_f)
+
+lin_wf_plot$sel_lineages_f =  factor(lin_wf_plot$sel_lineages_f, c(""
+                                                         , "L1"
+                                                         , "L2"
+                                                         , "L3"
+                                                         , "L4"
+                                                         , "L5"
+                                                         , "L6"
+                                                         , "L7"
+                                                         , "LBOV"
+                                                         , "L1;L2"
+                                                         , "L1;L3"
+                                                         , "L1;L4"
+                                                         , "L2;L3"
+                                                         , "L2;L3;L4"
+                                                         , "L2;L4"  
+                                                         , "L2;L6"
+                                                         , "L2;LBOV"  
+                                                         , "L3;L4" 
+                                                         , "L4;L6"
+                                                         , "L4;L7"))
+
+levels(lin_wf_plot$sel_lineages_f)
+
+#==========
+# Plot
+#==========
+lin_count_bp(lin_wf_plot
+                 , x_categ = "sel_lineages_f"
+                 , y_count = "snp_diversity"
+                 , display_label_col = "snp_diversity_f"
+
+                 , bar_stat_stype = "identity"
+                 , x_lab_angle = 90
+                 , my_xats = 20
+                 , y_scale_percent = T
+                 , y_label = "SNP diversity"
+             
+
+)
+
+
+
+
+, x_categ = "sel_lineages_f"
+, y_count = "p_count"
+, bar_fill_categ = "count_categ"
+, display_label_col = "p_count"
+, bar_stat_stype = "identity"
+, x_lab_angle = 90
+, my_xats = 15
+, bar_col_labels = c("Mutations", "Total Samples")
+, bar_col_values = c("grey50", "gray75")
+, y_log10 = T
+, y_scale_percent = F
\ No newline at end of file
diff --git a/scripts/plotting/get_plotting_dfs.R b/scripts/plotting/get_plotting_dfs.R
index 2fc1c19..89b477c 100644
--- a/scripts/plotting/get_plotting_dfs.R
+++ b/scripts/plotting/get_plotting_dfs.R
@@ -437,13 +437,19 @@ if (nrow(corr_ps_df3) == nrow(merged_df3) && nrow(merged_df3_comp) == check1) {
       , "\nGot: ", check1)
 }
 
-
 rm(foo)
 ####################################################################
 #                        Data for DM OM Plots: Long format dfs
 ####################################################################
+
 source("other_plots_data.R")
 
+####################################################################
+#                  Data for Lineage barplots: WF and LF dfs
+####################################################################
+
+source("lineage_bp_data.R")
+
 ########################################################################
 #                           End of script
 ########################################################################
diff --git a/scripts/plotting/lineage_bp_data.R b/scripts/plotting/lineage_bp_data.R
new file mode 100644
index 0000000..2cdfbe8
--- /dev/null
+++ b/scripts/plotting/lineage_bp_data.R
@@ -0,0 +1,173 @@
+#!/usr/bin/env Rscript  
+#########################################################
+# TASK: Script to format data for lineage barplots:
+# WF and LF data with lineage sample, and snp counts
+# sourced by get_plotting_dfs.R
+#########################################################
+# working dir and loading libraries
+# getwd()
+# setwd("~/git/LSHTM_analysis/scripts/plotting")
+# getwd()
+
+# make cmd
+# globals
+# drug = "streptomycin"
+# gene = "gid"
+
+# source("get_plotting_dfs.R")
+#=======================================================================
+#################################################
+# Get data with lineage count, and snp diversity
+#################################################
+table(merged_df2$lineage)
+
+if (table(merged_df2$lineage == "")[[2]]) {
+
+cat("\nMissing samples with lineage classification:", table(merged_df2$lineage == "")[[2]])
+  
+}
+
+##################################
+# WF data: lineages with 
+# snp count
+# total_samples
+# snp diversity (perc)
+##################################
+sel_lineages = levels(as.factor(merged_df2$lineage))
+
+lin_wf = data.frame(sel_lineages) #4, 1
+total_snps_u = NULL
+total_samples = NULL
+
+for (i in sel_lineages){
+  #print(i)
+  curr_total = length(unique(merged_df2$id)[merged_df2$lineage==i])
+  #print(curr_total)
+  total_samples = c(total_samples, curr_total)
+  print(total_samples)
+
+  foo = merged_df2[merged_df2$lineage==i,]
+  print(paste0(i, "=======\n"))
+  print(length(unique(foo$mutationinformation)))
+  curr_count = length(unique(foo$mutationinformation))
+  
+  total_snps_u = c(total_snps_u, curr_count)
+}
+lin_wf
+
+# Add these counts as columns to the df
+lin_wf$num_snps_u = total_snps_u
+lin_wf$total_samples = total_samples
+
+# Add SNP diversity
+lin_wf$snp_diversity = lin_wf$num_snps_u/lin_wf$total_samples
+lin_wf
+
+#=====================
+# Add some formatting
+#=====================
+# SNP diversity 
+lin_wf$snp_diversity_f = round( (lin_wf$snp_diversity * 100), digits = 0)
+lin_wf
+
+# Lineage names
+lin_wf$sel_lineages_f = gsub("lineage", "L", lin_wf$sel_lineages)
+lin_wf
+
+# # Lineage names
+# lin_wf = lin_wf %>%
+#   mutate(ordering_category = case_when(
+#     sel_lineages_f   == ""    ~ 0
+#     , sel_lineages_f == "L1"   ~ 1
+#     , sel_lineages_f == "L2"   ~ 2
+#     , sel_lineages_f == "L3"   ~ 3
+#     , sel_lineages_f == "L4"   ~ 4
+#     , sel_lineages_f == "L5"   ~ 5
+#     , sel_lineages_f == "L6"   ~ 6
+#     , sel_lineages_f == "L7"   ~ 7
+#     , sel_lineages_f == "LBOV" ~ 8
+#     
+#     , sel_lineages_f == "L1;L2" ~ 9
+#     , sel_lineages_f == "L1;L3" ~ 10
+#     , sel_lineages_f == "L1;L4" ~ 11
+#     
+#     , sel_lineages_f == "L2;L3"    ~ 12
+#     , sel_lineages_f == "L2;L3;L4" ~ 13
+#     , sel_lineages_f == "L2;L4"    ~ 14
+#     , sel_lineages_f == "L2;L6"    ~ 15
+#     , sel_lineages_f == "L2;LBOV"  ~ 16
+#     
+#     , sel_lineages_f == "L3;L4" ~ 17
+#     
+#     , sel_lineages_f == "L4;L6" ~ 18
+#     , sel_lineages_f == "L4;L7" ~ 19
+#     
+#     , FALSE ~ -1)
+#   )
+
+##################################
+# LF data: lineages with 
+# snp count
+# total_samples
+# snp diversity (perc)
+##################################
+names(lin_wf)
+tot_cols = ncol(lin_wf)
+pivot_cols = c("sel_lineages", "sel_lineages_f", "snp_diversity", "snp_diversity_f")
+pivot_cols_n = length(pivot_cols)
+
+expected_rows =  nrow(lin_wf) * ( length(lin_wf) - pivot_cols_n )
+  
+lin_lf <- gather(lin_wf
+                 , count_categ
+                 , p_count
+                 , num_snps_u:total_samples
+                 , factor_key = TRUE)
+lin_lf
+
+# quick checks
+if ( nrow(lin_lf)  ==  expected_rows ){
+  cat("\nPASS: Lineage LF data created"
+      , "\nnrow: ", nrow(lin_lf)
+      , "\nncol: ", ncol(lin_lf))
+} else {
+  cat("\nFAIL: numbers mismatch"
+      , "\nExpected nrow: ", expected_rows)
+}
+#######################################
+# #=====================
+# # Add some formatting
+# #=====================
+# lin_lf$sel_lineages_f = gsub("lineage", "L", lin_lf$sel_lineages)
+# lin_lf
+
+
+# lin_lf = lin_lf %>%
+#   mutate(ordering_category = case_when(
+#      sel_lineages_f   == ""    ~ 0
+#     , sel_lineages_f == "L1"   ~ 1
+#     , sel_lineages_f == "L2"   ~ 2
+#     , sel_lineages_f == "L3"   ~ 3
+#     , sel_lineages_f == "L4"   ~ 4
+#     , sel_lineages_f == "L5"   ~ 5
+#     , sel_lineages_f == "L6"   ~ 6
+#     , sel_lineages_f == "L7"   ~ 7
+#     , sel_lineages_f == "LBOV" ~ 8
+# 
+#     , sel_lineages_f == "L1;L2" ~ 9
+#     , sel_lineages_f == "L1;L3" ~ 10
+#     , sel_lineages_f == "L1;L4" ~ 11
+# 
+#     , sel_lineages_f == "L2;L3"    ~ 12
+#     , sel_lineages_f == "L2;L3;L4" ~ 13
+#     , sel_lineages_f == "L2;L4"    ~ 14
+#     , sel_lineages_f == "L2;L6"    ~ 15
+#     , sel_lineages_f == "L2;LBOV"  ~ 16
+# 
+#     , sel_lineages_f == "L3;L4" ~ 17
+# 
+#     , sel_lineages_f == "L4;L6" ~ 18
+#     , sel_lineages_f == "L4;L7" ~ 19
+# 
+#     , FALSE ~ -1)
+#   )
diff --git a/scripts/plotting/other_plots_data.R b/scripts/plotting/other_plots_data.R
index 8eb2020..a55303b 100644
--- a/scripts/plotting/other_plots_data.R
+++ b/scripts/plotting/other_plots_data.R
@@ -1,7 +1,8 @@
 #!/usr/bin/env Rscript  
 #########################################################
-# TASK: producing boxplots for dr and other muts
-
+# TASK: Script to format data for dm om plots: 
+# generating LF data
+# sourced by get_plotting_dfs.R
 #########################################################
 # working dir and loading libraries
 # getwd()