added ks_test_all_PS.R, ks_test_dr_PS.R, ks_test_dr_others_PS.R

2020-09-21 17:46:22 +01:00 · 2020-09-21 17:46:22 +01:00 · 2297617af2
commit 2297617af2
parent be8fa7e639
4 changed files with 722 additions and 211 deletions
--- a/scripts/ks_test_PS.R
+++ b/scripts/ks_test_PS.R
@ -1,211 +0,0 @@
-#!/usr/bin/env Rscript
-#########################################################
-# TASK: KS test for PS/DUET lineage distributions
-#=======================================================================
-#=======================================================================
-# working dir and loading libraries
-getwd()
-setwd("~/git/LSHTM_analysis/scripts/")
-getwd()
-
-#source("/plotting/Header_TT.R")
-#source("../barplot_colour_function.R")
-#require(data.table)
-source("plotting/combining_dfs_plotting.R")
-# should return the following dfs, directories and variables
-
-# PS combined: 
-# 1) merged_df2
-# 2) merged_df2_comp
-# 3) merged_df3
-# 4) merged_df3_comp
-
-# LIG combined: 
-# 5) merged_df2_lig
-# 6) merged_df2_comp_lig
-# 7) merged_df3_lig
-# 8) merged_df3_comp_lig
-
-# 9) my_df_u
-# 10) my_df_u_lig
-
-cat(paste0("Directories imported:"
-           , "\ndatadir:", datadir
-           , "\nindir:", indir
-           , "\noutdir:", outdir
-           , "\nplotdir:", plotdir))
-
-cat(paste0("Variables imported:"
-           , "\ndrug:", drug
-           , "\ngene:", gene
-           , "\ngene_match:", gene_match
-           , "\nAngstrom symbol:", angstroms_symbol
-           , "\nNo. of duplicated muts:", dup_muts_nu
-           , "\nNA count for ORs:", na_count
-           , "\nNA count in df2:", na_count_df2
-           , "\nNA count in df3:", na_count_df3))       
-
-###########################
-# Data for  stats
-# you need merged_df2 or merged_df2_comp
-# since this is one-many relationship 
-# i.e the same SNP can belong to multiple lineages
-# using the _comp dataset means
-# we lose some muts and at this level, we should use
-# as much info as available, hence use df with NA
-###########################
-
-# REASSIGNMENT
-my_df  = merged_df2
-
-# delete variables not required
-rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
-
-# quick checks
-colnames(my_df)
-str(my_df)
-
-# Ensure correct data type in columns to plot: need to be factor
-is.factor(my_df$lineage)
-my_df$lineage = as.factor(my_df$lineage)
-is.factor(my_df$lineage)
-
-table(my_df$mutation_info); str(my_df$mutation_info)
-
-# subset df with dr muts only
-my_df_dr = subset(my_df, mutation_info == "dr_mutations_pyrazinamide") 
-table(my_df_dr$mutation_info)
-
-# stats for all muts and dr_muts
-# 1) for all muts
-# 2) for dr_muts
-#===========================
-table(my_df$lineage); str(my_df$lineage)
-table(my_df_dr$lineage); str(my_df_dr$lineage)
-
-# subset only lineages1-4
-sel_lineages = c("lineage1"
-                 , "lineage2"
-                 , "lineage3"
-                 , "lineage4")
-
-# subset and refactor: all muts
-df_lin = subset(my_df, subset = lineage %in% sel_lineages)
-df_lin$lineage = factor(df_lin$lineage)
-
-# subset and refactor: dr muts
-df_lin_dr = subset(my_df_dr, subset = lineage %in% sel_lineages)
-df_lin_dr$lineage = factor(df_lin_dr$lineage)
-
-
-#{RESULT: No of samples within lineage}
-table(df_lin$lineage) 
-table(df_lin_dr$lineage)
-
-#{Result: No. of unique mutations the 4 lineages contribute to}
-length(unique(df_lin$mutationinformation))
-length(unique(df_lin_dr$mutationinformation))
-
-
-# COMPARING DISTRIBUTIONS
-#================
-# ALL mutations
-#=================
-head(df_lin$lineage)
-df_lin$lineage = as.character(df_lin$lineage)
-
-lin1 = df_lin[df_lin$lineage == "lineage1",]$duet_scaled
-lin2 = df_lin[df_lin$lineage == "lineage2",]$duet_scaled
-lin3 = df_lin[df_lin$lineage == "lineage3",]$duet_scaled
-lin4 = df_lin[df_lin$lineage == "lineage4",]$duet_scaled
-
-# ks test
-lin12 = ks.test(lin1,lin2) 
-lin12_df = as.data.frame(cbind(lin12$data.name, lin12$p.value))
-  
-lin13 = ks.test(lin1,lin3) 
-lin13_df = as.data.frame(cbind(lin13$data.name, lin13$p.value))
-
-lin14 = ks.test(lin1,lin4) 
-lin14_df = as.data.frame(cbind(lin14$data.name, lin14$p.value))
-
-lin23 = ks.test(lin2,lin3)
-lin23_df = as.data.frame(cbind(lin23$data.name, lin23$p.value))
-
-lin24 = ks.test(lin2,lin4)  
-lin24_df = as.data.frame(cbind(lin24$data.name, lin24$p.value))
-
-lin34 = ks.test(lin3,lin4)
-lin34_df = as.data.frame(cbind(lin34$data.name, lin34$p.value))
-
-ks_results_all = rbind(lin12_df 
-                   , lin13_df
-                   , lin14_df
-                   , lin23_df
-                   , lin24_df
-                   , lin34_df)
-
-#p-value < 2.2e-16
-rm(lin12, lin12_df
-   , lin13, lin13_df
-   , lin14, lin14_df
-   , lin23, lin23_df
-   , lin24, lin24_df
-   , lin34, lin34_df)
-
-#================
-# DRUG mutations
-#=================
-head(df_lin_dr$lineage)
-df_lin_dr$lineage = as.character(df_lin_dr$lineage)
-
-lin1_dr = df_lin_dr[df_lin_dr$lineage == "lineage1",]$duet_scaled
-lin2_dr = df_lin_dr[df_lin_dr$lineage == "lineage2",]$duet_scaled
-lin3_dr = df_lin_dr[df_lin_dr$lineage == "lineage3",]$duet_scaled
-lin4_dr = df_lin_dr[df_lin_dr$lineage == "lineage4",]$duet_scaled
-
-# ks test: dr muts
-lin12_dr = ks.test(lin1_dr,lin2_dr) 
-lin12_df_dr = as.data.frame(cbind(lin12_dr$data.name, lin12_dr$p.value))
-
-lin13_dr = ks.test(lin1_dr,lin3_dr) 
-lin13_df_dr = as.data.frame(cbind(lin13_dr$data.name, lin13_dr$p.value))
-
-lin14_dr = ks.test(lin1_dr,lin4_dr) 
-lin14_df_dr = as.data.frame(cbind(lin14_dr$data.name, lin14_dr$p.value))
-
-lin23_dr = ks.test(lin2_dr,lin3_dr)
-lin23_df_dr = as.data.frame(cbind(lin23_dr$data.name, lin23_dr$p.value))
-
-lin24_dr = ks.test(lin2_dr,lin4_dr)  
-lin24_df_dr = as.data.frame(cbind(lin24_dr$data.name, lin24_dr$p.value))
-
-lin34_dr = ks.test(lin3_dr,lin4_dr)
-lin34_df_dr = as.data.frame(cbind(lin34_dr$data.name, lin34_dr$p.value))
-
-ks_results_dr = rbind(lin12_df_dr 
-                      , lin13_df_dr
-                      , lin14_df_dr
-                      , lin23_df_dr
-                      , lin24_df_dr
-                      , lin34_df_dr)
-
-ks_results_combined = cbind(ks_results_all, ks_results_dr)
-
-my_colnames = c("Lineage_comparisons"
-                , paste0("All_mutations n=", nrow(df_lin))
-                , paste0("Drug_associated_mutations n=", nrow(df_lin_dr)))
-my_colnames
-
-# select the output columns
-ks_results_combined_f = ks_results_combined[,c(1,2,4)]
-
-colnames(ks_results_combined_f) = my_colnames
-ks_results_combined_f 
-
-#=============
-# write output file
-#=============
-ks_results = paste0(outdir,"/results/ks_results.csv")
-write.csv(ks_results_combined_f, ks_results, row.names = F)
-
--- a/scripts/ks_test_all_PS.R
+++ b/scripts/ks_test_all_PS.R
@ -0,0 +1,280 @@
+#!/usr/bin/env Rscript
+#########################################################
+# TASK: KS test for PS/DUET lineage distributions
+#=======================================================================
+#=======================================================================
+# working dir and loading libraries
+getwd()
+setwd("~/git/LSHTM_analysis/scripts/")
+getwd()
+
+#source("/plotting/Header_TT.R")
+#source("../barplot_colour_function.R")
+#require(data.table)
+source("plotting/combining_dfs_plotting.R")
+# should return the following dfs, directories and variables
+
+# PS combined: 
+# 1) merged_df2
+# 2) merged_df2_comp
+# 3) merged_df3
+# 4) merged_df3_comp
+
+# LIG combined: 
+# 5) merged_df2_lig
+# 6) merged_df2_comp_lig
+# 7) merged_df3_lig
+# 8) merged_df3_comp_lig
+
+# 9) my_df_u
+# 10) my_df_u_lig
+
+cat(paste0("Directories imported:"
+           , "\ndatadir:", datadir
+           , "\nindir:", indir
+           , "\noutdir:", outdir
+           , "\nplotdir:", plotdir))
+
+cat(paste0("Variables imported:"
+           , "\ndrug:", drug
+           , "\ngene:", gene
+           , "\ngene_match:", gene_match
+           , "\nAngstrom symbol:", angstroms_symbol
+           , "\nNo. of duplicated muts:", dup_muts_nu
+           , "\nNA count for ORs:", na_count
+           , "\nNA count in df2:", na_count_df2
+           , "\nNA count in df3:", na_count_df3))       
+
+
+#=============
+# Output
+#=============
+# ks test by lineage
+ks_lineage = paste0(outdir, "/KS_lineage_all_muts.csv")
+
+###########################
+# Data for  stats
+# you need merged_df2 or merged_df2_comp
+# since this is one-many relationship 
+# i.e the same SNP can belong to multiple lineages
+# using the _comp dataset means
+# we lose some muts and at this level, we should use
+# as much info as available, hence use df with NA
+###########################
+
+# REASSIGNMENT
+my_df  = merged_df2
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+rm(merged_df2_lig, merged_df2_comp_lig, merged_df3_lig, merged_df3_comp_lig, my_df_u, my_df_u_lig)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+# Ensure correct data type in columns to plot: need to be factor
+#is.factor(my_df$lineage)
+#my_df$lineage = as.factor(my_df$lineage)
+#is.factor(my_df$lineage)
+########################################################################
+table(my_df$lineage); str(my_df$lineage)
+
+# subset only lineages1-4
+sel_lineages = c("lineage1"
+                 , "lineage2"
+                 , "lineage3"
+                 , "lineage4")
+
+# subset selected lineages
+df_lin = subset(my_df, subset = lineage %in% sel_lineages)
+
+#==============
+# dr_muts_col
+#==============
+#table(df_lin$mutation_info); str(df_lin$mutation_info)
+
+# subset df with dr muts only
+#df_lin_dr = subset(df_lin, mutation_info == dr_muts_col) 
+#table(df_lin_dr$mutation_info)
+
+#==============
+# other_muts_col
+#==============
+#df_lin_other = subset(df_lin, mutation_info == other_muts_col) 
+#table(df_lin_other$mutation_info)
+
+#=======================================================================
+# individual: CHECKS
+lin1 = df_lin[df_lin$lineage == "lineage1",]$duet_scaled
+lin2 = df_lin[df_lin$lineage == "lineage2",]$duet_scaled
+lin3 = df_lin[df_lin$lineage == "lineage3",]$duet_scaled
+lin4 = df_lin[df_lin$lineage == "lineage4",]$duet_scaled
+
+ks.test(lin1, lin4)
+
+
+ks.test(df_lin$duet_scaled[df_lin$lineage == "lineage1"]
+        , df_lin$duet_scaled[df_lin$lineage == "lineage2"])
+
+#=======================================================================
+my_lineages = levels(factor(df_lin$lineage)); my_lineages
+#=======================================================================
+# Loop
+#=====================
+# Lineage 1 comparisons
+#=====================
+my_lin1 = "lineage1"
+#my_lineages_comp_l1 = c("lineage2", "lineage3", "lineage4")
+my_lineages_comp_l1 = my_lineages[-match(my_lin1, my_lineages)]
+
+ks_df_l1 = data.frame()
+
+for (i in my_lineages_comp_l1){
+   cat(i)
+   l1_df = data.frame(group = NA, method = NA, ks_statistic = NA, ks_pvalue = NA)
+   
+   lineage_comp = paste0(my_lin1, " vs ", i)
+   ks_method = ks.test(df_lin$duet_scaled[df_lin$lineage == my_lin1]
+          , df_lin$duet_scaled[df_lin$lineage == i])$method
+   
+   ks_statistic = ks.test(df_lin$duet_scaled[df_lin$lineage == my_lin1]
+          , df_lin$duet_scaled[df_lin$lineage == i])$statistic
+   
+   ks_pvalue = ks.test(df_lin$duet_scaled[df_lin$lineage == my_lin1]
+          , df_lin$duet_scaled[df_lin$lineage == i])$p.value
+   
+   # print(c(lineage_comp, ks_method, ks_statistic[[1]], ks_pval))
+   l1_df$group = lineage_comp
+   l1_df$method = ks_method
+   l1_df$ks_statistic = ks_statistic[[1]]
+   l1_df$ks_pvalue = ks_pvalue
+   
+   print(l1_df)
+   ks_df_l1 = rbind(ks_df_l1,l1_df)
+   
+}
+#####################################################################
+
+#=====================
+# Lineage 2 comparisons
+#=====================
+my_lin2 = "lineage2"
+#my_lineages_comp_l2 = c("lineage1", lineage3", "lineage4")
+my_lineages_comp_l2 = my_lineages[-match(my_lin2, my_lineages)]
+
+ks_df_l2 = data.frame()
+
+for (i in my_lineages_comp_l2){
+   cat(i)
+   l2_df = data.frame(group = NA, method = NA, ks_statistic = NA, ks_pvalue = NA)
+   
+   lineage_comp = paste0(my_lin2, " vs ", i)
+   ks_method = ks.test(df_lin$duet_scaled[df_lin$lineage == my_lin2]
+                       , df_lin$duet_scaled[df_lin$lineage == i])$method
+   
+   ks_statistic = ks.test(df_lin$duet_scaled[df_lin$lineage == my_lin2]
+                          , df_lin$duet_scaled[df_lin$lineage == i])$statistic
+   
+   ks_pvalue = ks.test(df_lin$duet_scaled[df_lin$lineage == my_lin2]
+                       , df_lin$duet_scaled[df_lin$lineage == i])$p.value
+   
+   # print(c(lineage_comp, ks_method, ks_statistic[[1]], ks_pval))
+   l2_df$group = lineage_comp
+   l2_df$method = ks_method
+   l2_df$ks_statistic = ks_statistic[[1]]
+   l2_df$ks_pvalue = ks_pvalue
+   
+   print(l2_df)
+   ks_df_l2 = rbind(ks_df_l2, l2_df)
+   
+}
+
+#=====================
+# Lineage 3 comparisons
+#=====================
+my_lin3 = "lineage3"
+#my_lineages_comp_l3 = c("lineage1", lineage2", "lineage4")
+my_lineages_comp_l3 = my_lineages[-match(my_lin3, my_lineages)]
+
+ks_df_l3 = data.frame()
+
+for (i in my_lineages_comp_l3){
+   cat(i)
+   l3_df = data.frame(group = NA, method = NA, ks_statistic = NA, ks_pvalue = NA)
+   
+   lineage_comp = paste0(my_lin3, " vs ", i)
+   ks_method = ks.test(df_lin$duet_scaled[df_lin$lineage == my_lin3]
+                       , df_lin$duet_scaled[df_lin$lineage == i])$method
+   
+   ks_statistic = ks.test(df_lin$duet_scaled[df_lin$lineage == my_lin3]
+                          , df_lin$duet_scaled[df_lin$lineage == i])$statistic
+   
+   ks_pvalue = ks.test(df_lin$duet_scaled[df_lin$lineage == my_lin3]
+                       , df_lin$duet_scaled[df_lin$lineage == i])$p.value
+   
+   # print(c(lineage_comp, ks_method, ks_statistic[[1]], ks_pval))
+   l3_df$group = lineage_comp
+   l3_df$method = ks_method
+   l3_df$ks_statistic = ks_statistic[[1]]
+   l3_df$ks_pvalue = ks_pvalue
+   
+   print(l3_df)
+   ks_df_l3 = rbind(ks_df_l3, l3_df)
+   
+}
+####################################################################
+# combine all three ks_dfs
+n_dfs = 3
+if ( all.equal(nrow(ks_df_l1), nrow(ks_df_l2), nrow(ks_df_l3)) && 
+     all.equal(ncol(ks_df_l1), ncol(ks_df_l2), ncol(ks_df_l3)) ){
+   cat("\nPASS: Calculating expected rows and cols for sanity checks on combined_dfs")
+   expected_rows = nrow(ks_df_l1) * n_dfs
+   expected_cols = ncol(ks_df_l1)
+   ks_df_combined = rbind(ks_df_l1, ks_df_l2, ks_df_l3)
+   if ( nrow(ks_df_combined) == expected_rows && ncol(ks_df_combined) == expected_cols ){
+      cat("\nPASS: combined df successfully created"
+          , "\nnrow combined_df:", nrow(ks_df_combined)
+          , "\nncol combined_df:", ncol(ks_df_combined))
+   }
+   else{
+      cat("\nFAIL: Dim mismatch"
+          , "\nExpected rows:", expected_rows
+          , "\nGot:", nrow(ks_df_combined)
+          , "\nExpected cols:", expected_cols
+          , "\nGot:", ncol(ks_df_combined))
+   }
+}else{
+   cat("\nFAIL: Could not generate expected_rows and/or expected_cols"
+       , "\nCheck hardcoded value of n_dfs")
+}
+
+#=======================================================================
+# formatting
+#=======================================================================
+# add total_n number
+ks_df_combined$total_samples_analysed = nrow(df_lin)
+
+# adding pvalue_signif
+ks_df_combined$pvalue_signif = ks_df_combined$ks_pvalue
+str(ks_df_combined$pvalue_signif)
+
+ks_df_combined = dplyr::mutate(ks_df_combined
+                         , pvalue_signif = case_when(pvalue_signif == 0.05 ~ "."
+                                                     , pvalue_signif <=0.0001 ~ '****'
+                                                     , pvalue_signif <=0.001 ~ '***'
+                                                     , pvalue_signif <=0.01 ~ '**'
+                                                     , pvalue_signif <0.05 ~ '*'
+                                                     , TRUE ~ 'ns'))
+
+# Remove duplicates
+rows_to_remove = c("lineage2 vs lineage1", "lineage3 vs lineage1", "lineage3 vs lineage2")
+
+ks_df_combined_f = ks_df_combined[-match(rows_to_remove, ks_df_combined$group),]
+
+#=======================================================================
+#******************
+# write output file: KS test
+#******************
+cat("Output of KS test bt lineage:", ks_lineage)
+write.csv(ks_df_combined_f, ks_lineage, row.names = FALSE)
--- a/scripts/ks_test_dr_PS.R
+++ b/scripts/ks_test_dr_PS.R
@ -0,0 +1,272 @@
+#!/usr/bin/env Rscript
+#########################################################
+# TASK: KS test for PS/DUET lineage distributions
+#=======================================================================
+#=======================================================================
+# working dir and loading libraries
+getwd()
+setwd("~/git/LSHTM_analysis/scripts/")
+getwd()
+
+#source("/plotting/Header_TT.R")
+#source("../barplot_colour_function.R")
+#require(data.table)
+source("plotting/combining_dfs_plotting.R")
+# should return the following dfs, directories and variables
+
+# PS combined: 
+# 1) merged_df2
+# 2) merged_df2_comp
+# 3) merged_df3
+# 4) merged_df3_comp
+
+# LIG combined: 
+# 5) merged_df2_lig
+# 6) merged_df2_comp_lig
+# 7) merged_df3_lig
+# 8) merged_df3_comp_lig
+
+# 9) my_df_u
+# 10) my_df_u_lig
+
+cat(paste0("Directories imported:"
+           , "\ndatadir:", datadir
+           , "\nindir:", indir
+           , "\noutdir:", outdir
+           , "\nplotdir:", plotdir))
+
+cat(paste0("Variables imported:"
+           , "\ndrug:", drug
+           , "\ngene:", gene
+           , "\ngene_match:", gene_match
+           , "\nAngstrom symbol:", angstroms_symbol
+           , "\nNo. of duplicated muts:", dup_muts_nu
+           , "\nNA count for ORs:", na_count
+           , "\nNA count in df2:", na_count_df2
+           , "\nNA count in df3:", na_count_df3))       
+
+
+#=============
+# Output
+#=============
+# ks test by lineage
+ks_lineage_drug_muts = paste0(outdir, "/KS_lineage_drug_muts.csv")
+
+###########################
+# Data for  stats
+# you need merged_df2 or merged_df2_comp
+# since this is one-many relationship 
+# i.e the same SNP can belong to multiple lineages
+# using the _comp dataset means
+# we lose some muts and at this level, we should use
+# as much info as available, hence use df with NA
+###########################
+
+# REASSIGNMENT
+my_df  = merged_df2
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+rm(merged_df2_lig, merged_df2_comp_lig, merged_df3_lig, merged_df3_comp_lig, my_df_u, my_df_u_lig)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+# Ensure correct data type in columns to plot: need to be factor
+#is.factor(my_df$lineage)
+#my_df$lineage = as.factor(my_df$lineage)
+#is.factor(my_df$lineage)
+########################################################################
+table(my_df$lineage); str(my_df$lineage)
+
+# subset only lineages1-4
+sel_lineages = c("lineage1"
+                 , "lineage2"
+                 , "lineage3"
+                 , "lineage4")
+
+# subset selected lineages
+df_lin = subset(my_df, subset = lineage %in% sel_lineages)
+
+#==============
+# dr_muts_col
+#==============
+table(df_lin$mutation_info); str(df_lin$mutation_info)
+
+# subset df with dr muts only
+df_lin_dr = subset(df_lin, mutation_info == dr_muts_col) 
+table(df_lin_dr$mutation_info)
+
+#==============
+# other_muts_col
+#==============
+#df_lin_other = subset(df_lin, mutation_info == other_muts_col) 
+#table(df_lin_other$mutation_info)
+
+#=======================================================================
+# individual: CHECKS
+ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == "lineage1"]
+        , df_lin_dr$duet_scaled[df_lin_dr$lineage == "lineage2"])
+
+#=======================================================================
+my_lineages = levels(factor(df_lin_dr$lineage)); my_lineages
+#=======================================================================
+# Loop
+#=====================
+# Lineage 1 comparisons
+#=====================
+my_lin1 = "lineage1"
+#my_lineages_comp_l1 = c("lineage2", "lineage3", "lineage4")
+my_lineages_comp_l1 = my_lineages[-match(my_lin1, my_lineages)]
+
+ks_df_l1 = data.frame()
+
+for (i in my_lineages_comp_l1){
+   cat(i)
+   l1_df = data.frame(group = NA, method = NA, ks_statistic = NA, ks_pvalue = NA)
+   
+   lineage_comp = paste0(my_lin1, " vs ", i)
+   ks_method = ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == my_lin1]
+                       , df_lin_dr$duet_scaled[df_lin_dr$lineage == i])$method
+   
+   ks_statistic = ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == my_lin1]
+                          , df_lin_dr$duet_scaled[df_lin_dr$lineage == i])$statistic
+   
+   ks_pvalue = ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == my_lin1]
+                       , df_lin_dr$duet_scaled[df_lin_dr$lineage == i])$p.value
+   
+   # print(c(lineage_comp, ks_method, ks_statistic[[1]], ks_pval))
+   l1_df$group = lineage_comp
+   l1_df$method = ks_method
+   l1_df$ks_statistic = ks_statistic[[1]]
+   l1_df$ks_pvalue = ks_pvalue
+   
+   print(l1_df)
+   ks_df_l1 = rbind(ks_df_l1,l1_df)
+   
+}
+#####################################################################
+
+#=====================
+# Lineage 2 comparisons
+#=====================
+my_lin2 = "lineage2"
+#my_lineages_comp_l2 = c("lineage1", lineage3", "lineage4")
+my_lineages_comp_l2 = my_lineages[-match(my_lin2, my_lineages)]
+
+ks_df_l2 = data.frame()
+
+for (i in my_lineages_comp_l2){
+   cat(i)
+   l2_df = data.frame(group = NA, method = NA, ks_statistic = NA, ks_pvalue = NA)
+   
+   lineage_comp = paste0(my_lin2, " vs ", i)
+   ks_method = ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == my_lin2]
+                       , df_lin_dr$duet_scaled[df_lin_dr$lineage == i])$method
+   
+   ks_statistic = ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == my_lin2]
+                          , df_lin_dr$duet_scaled[df_lin_dr$lineage == i])$statistic
+   
+   ks_pvalue = ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == my_lin2]
+                       , df_lin_dr$duet_scaled[df_lin_dr$lineage == i])$p.value
+   
+   # print(c(lineage_comp, ks_method, ks_statistic[[1]], ks_pval))
+   l2_df$group = lineage_comp
+   l2_df$method = ks_method
+   l2_df$ks_statistic = ks_statistic[[1]]
+   l2_df$ks_pvalue = ks_pvalue
+   
+   print(l2_df)
+   ks_df_l2 = rbind(ks_df_l2, l2_df)
+   
+}
+
+#=====================
+# Lineage 3 comparisons
+#=====================
+my_lin3 = "lineage3"
+#my_lineages_comp_l3 = c("lineage1", lineage2", "lineage4")
+my_lineages_comp_l3 = my_lineages[-match(my_lin3, my_lineages)]
+
+ks_df_l3 = data.frame()
+
+for (i in my_lineages_comp_l3){
+   cat(i)
+   l3_df = data.frame(group = NA, method = NA, ks_statistic = NA, ks_pvalue = NA)
+   
+   lineage_comp = paste0(my_lin3, " vs ", i)
+   ks_method = ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == my_lin3]
+                       , df_lin_dr$duet_scaled[df_lin_dr$lineage == i])$method
+   
+   ks_statistic = ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == my_lin3]
+                          , df_lin_dr$duet_scaled[df_lin_dr$lineage == i])$statistic
+   
+   ks_pvalue = ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == my_lin3]
+                       , df_lin_dr$duet_scaled[df_lin_dr$lineage == i])$p.value
+   
+   # print(c(lineage_comp, ks_method, ks_statistic[[1]], ks_pval))
+   l3_df$group = lineage_comp
+   l3_df$method = ks_method
+   l3_df$ks_statistic = ks_statistic[[1]]
+   l3_df$ks_pvalue = ks_pvalue
+   
+   print(l3_df)
+   ks_df_l3 = rbind(ks_df_l3, l3_df)
+   
+}
+####################################################################
+# combine all three ks_dfs
+n_dfs = 3
+if ( all.equal(nrow(ks_df_l1), nrow(ks_df_l2), nrow(ks_df_l3)) && 
+     all.equal(ncol(ks_df_l1), ncol(ks_df_l2), ncol(ks_df_l3)) ){
+   cat("\nPASS: Calculating expected rows and cols for sanity checks on combined_dfs")
+   expected_rows = nrow(ks_df_l1) * n_dfs
+   expected_cols = ncol(ks_df_l1)
+   ks_df_combined = rbind(ks_df_l1, ks_df_l2, ks_df_l3)
+   if ( nrow(ks_df_combined) == expected_rows && ncol(ks_df_combined) == expected_cols ){
+      cat("\nPASS: combined df successfully created"
+          , "\nnrow combined_df:", nrow(ks_df_combined)
+          , "\nncol combined_df:", ncol(ks_df_combined))
+   }
+   else{
+      cat("\nFAIL: Dim mismatch"
+          , "\nExpected rows:", expected_rows
+          , "\nGot:", nrow(ks_df_combined)
+          , "\nExpected cols:", expected_cols
+          , "\nGot:", ncol(ks_df_combined))
+   }
+}else{
+   cat("\nFAIL: Could not generate expected_rows and/or expected_cols"
+       , "\nCheck hardcoded value of n_dfs")
+}
+
+#=======================================================================
+# formatting
+#=======================================================================
+# add total_n number
+ks_df_combined$n_drug_muts = nrow(df_lin_dr)
+
+# adding pvalue_signif
+ks_df_combined$pvalue_signif = ks_df_combined$ks_pvalue
+str(ks_df_combined$pvalue_signif)
+
+ks_df_combined = dplyr::mutate(ks_df_combined
+                               , pvalue_signif = case_when(pvalue_signif == 0.05 ~ "."
+                                                           , pvalue_signif <=0.0001 ~ '****'
+                                                           , pvalue_signif <=0.001 ~ '***'
+                                                           , pvalue_signif <=0.01 ~ '**'
+                                                           , pvalue_signif <0.05 ~ '*'
+                                                           , TRUE ~ 'ns'))
+
+# Remove duplicates
+rows_to_remove = c("lineage2 vs lineage1", "lineage3 vs lineage1", "lineage3 vs lineage2")
+
+ks_df_combined_f = ks_df_combined[-match(rows_to_remove, ks_df_combined$group),]
+
+#=======================================================================
+#******************
+# write output file: KS test
+#******************
+cat("Output of KS test by lineage for dr_muts:", ks_lineage_drug_muts)
+write.csv(ks_df_combined_f, ks_lineage_drug_muts, row.names = FALSE)
--- a/scripts/ks_test_dr_others_PS.R
+++ b/scripts/ks_test_dr_others_PS.R
@ -0,0 +1,170 @@
+#!/usr/bin/env Rscript
+#########################################################
+# TASK: KS test for PS/DUET lineage distributions
+#=======================================================================
+#=======================================================================
+# working dir and loading libraries
+getwd()
+setwd("~/git/LSHTM_analysis/scripts/")
+getwd()
+
+#source("/plotting/Header_TT.R")
+#source("../barplot_colour_function.R")
+#require(data.table)
+source("plotting/combining_dfs_plotting.R")
+# should return the following dfs, directories and variables
+
+# PS combined: 
+# 1) merged_df2
+# 2) merged_df2_comp
+# 3) merged_df3
+# 4) merged_df3_comp
+
+# LIG combined: 
+# 5) merged_df2_lig
+# 6) merged_df2_comp_lig
+# 7) merged_df3_lig
+# 8) merged_df3_comp_lig
+
+# 9) my_df_u
+# 10) my_df_u_lig
+
+cat(paste0("Directories imported:"
+           , "\ndatadir:", datadir
+           , "\nindir:", indir
+           , "\noutdir:", outdir
+           , "\nplotdir:", plotdir))
+
+cat(paste0("Variables imported:"
+           , "\ndrug:", drug
+           , "\ngene:", gene
+           , "\ngene_match:", gene_match
+           , "\nAngstrom symbol:", angstroms_symbol
+           , "\nNo. of duplicated muts:", dup_muts_nu
+           , "\nNA count for ORs:", na_count
+           , "\nNA count in df2:", na_count_df2
+           , "\nNA count in df3:", na_count_df3))       
+
+
+#=============
+# Output
+#=============
+# ks test by lineage
+ks_dr_others_lineage = paste0(outdir, "/KS_lineage_dr_vs_others.csv")
+
+###########################
+# Data for  stats
+# you need merged_df2 or merged_df2_comp
+# since this is one-many relationship 
+# i.e the same SNP can belong to multiple lineages
+# using the _comp dataset means
+# we lose some muts and at this level, we should use
+# as much info as available, hence use df with NA
+###########################
+
+# REASSIGNMENT
+my_df  = merged_df2
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+rm(merged_df2_lig, merged_df2_comp_lig, merged_df3_lig, merged_df3_comp_lig, my_df_u, my_df_u_lig)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+# Ensure correct data type in columns to plot: need to be factor
+#is.factor(my_df$lineage)
+#my_df$lineage = as.factor(my_df$lineage)
+#is.factor(my_df$lineage)
+########################################################################
+table(my_df$lineage); str(my_df$lineage)
+
+# subset only lineages1-4
+sel_lineages = c("lineage1"
+                 , "lineage2"
+                 , "lineage3"
+                 , "lineage4")
+
+# subset selected lineages
+df_lin = subset(my_df, subset = lineage %in% sel_lineages)
+
+#==============
+# dr_muts_col
+#==============
+table(df_lin$mutation_info); str(df_lin$mutation_info)
+
+# subset df with dr muts only
+df_lin_dr = subset(df_lin, mutation_info == dr_muts_col) 
+table(df_lin_dr$mutation_info)
+
+#==============
+# other_muts_col
+#==============
+df_lin_other = subset(df_lin, mutation_info == other_muts_col) 
+table(df_lin_other$mutation_info)
+
+#=======================================================================
+# individual: CHECKS
+ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == "lineage1"]
+        , df_lin_other$duet_scaled[df_lin_other$lineage == "lineage1"])
+
+#=======================================================================
+my_lineages = levels(factor(df_lin$lineage)); my_lineages
+
+ks_df = data.frame()
+
+for (i in my_lineages){
+   cat(i)
+   df = data.frame(lineage = NA, method = NA, ks_statistic = NA, ks_pvalue = NA)
+   
+   lineage_comp = i
+   ks_method = ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == i]
+                       , df_lin_other$duet_scaled[df_lin_other$lineage == i])$method
+   
+   ks_statistic = ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == i]
+                          , df_lin_other$duet_scaled[df_lin_other$lineage == i])$statistic
+   
+   ks_pvalue = ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == i]
+                       , df_lin_other$duet_scaled[df_lin_other$lineage == i])$p.value
+   
+   # print(c(lineage_comp, ks_method, ks_statistic[[1]], ks_pval))
+   df$lineage = lineage_comp
+   df$method = ks_method
+   df$ks_statistic = ks_statistic[[1]]
+   df$ks_pvalue = ks_pvalue
+   
+   print(df)
+   ks_df = rbind(ks_df,df)
+   
+}
+
+#=======================================================================
+# formatting
+#=======================================================================
+# add total_n number
+ks_df$n_drug = nrow(df_lin_dr)
+ks_df$n_others = nrow(df_lin_other)
+
+# add group
+ks_df$group = "Drug vs Others"
+
+str(ks_df)
+ks_df$pvalue_signif = ks_df$ks_pvalue
+str(ks_df$pvalue_signif)
+
+# adding pvalue_signif
+ks_df = dplyr::mutate(ks_df, pvalue_signif = case_when(pvalue_signif == 0.05 ~ "."
+                                                      , pvalue_signif <=0.0001 ~ '****'
+                                                      , pvalue_signif <=0.001 ~ '***'
+                                                      , pvalue_signif <=0.01 ~ '**'
+                                                      , pvalue_signif <0.05 ~ '*'
+                                                      , TRUE ~ 'ns'))
+
+
+#=======================================================================
+#******************
+# write output file: KS test
+#******************
+cat("Output of KS test bt lineage:", ks_dr_others_lineage )
+write.csv(ks_df, ks_dr_others_lineage, row.names = FALSE)