diff --git a/mcsm_analysis/pyrazinamide/scripts/KS_test_PS.R b/mcsm_analysis/pyrazinamide/scripts/KS_test_PS.R
new file mode 100644
index 0000000..5a827c8
--- /dev/null
+++ b/mcsm_analysis/pyrazinamide/scripts/KS_test_PS.R
@@ -0,0 +1,157 @@
+getwd() # show the working directory before it is changed
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") # hard-coded location; the relative source() paths below resolve against it
+getwd() # show the working directory after the change
+
+########################################################################
+# Installing and loading required packages                             #
+########################################################################
+
+source("../Header_TT.R") # shared header script, one directory above plotting/
+#source("../barplot_colour_function.R")
+#require(data.table)
+
+########################################################################
+# Read file: call script for combining df for PS                       #
+########################################################################
+
+source("../combining_two_df.R") # NOTE(review): expected to create merged_df2/merged_df3 and their _comp variants - confirm in that script
+
+#---------------------- PAY ATTENTION
+# sourcing the script above changes the working dir to:
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA for pyrazinamide:
+# merged_df2
+# merged_df3
+
+# df without NA for pyrazinamide:
+# merged_df2_comp
+# merged_df3_comp
+#===========================
+
+###########################
+# Data for plots
+# you need merged_df2 or merged_df2_comp
+# since this is a one-to-many relationship,
+# i.e. the same SNP can belong to multiple lineages.
+# Using the _comp dataset means
+# we lose some muts, and at this level we should use
+# as much info as available; hence use the df with NA
+###########################
+
+# choose the working data frame: swap the active line below as necessary
+
+#%%%%%%%%%%%%%%%%%%%%%%%%%
+# REASSIGNMENT
+my_df <- merged_df2
+#my_df <- merged_df2_comp
+#%%%%%%%%%%%%%%%%%%%%%%%%%
+
+# drop the merged inputs now that my_df holds the chosen copy
+rm(list = c("merged_df2", "merged_df2_comp", "merged_df3", "merged_df3_comp"))
+
+# quick look at the column names and structure of the chosen data frame
+names(my_df)
+str(my_df)
+
+# the lineage column needs to be a factor: report its type, coerce, report again
+is.factor(my_df$lineage)
+my_df$lineage <- as.factor(my_df$lineage)
+is.factor(my_df$lineage)
+
+table(my_df$mutation_info); str(my_df$mutation_info)
+
+# keep only the rows labelled as pyrazinamide dr mutations
+my_df_dr <- subset(x = my_df, subset = mutation_info == "dr_mutations_pyrazinamide")
+table(my_df_dr$mutation_info)
+
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#==========================
+# Run this script twice,
+# switching the active assignment below:
+# 1) for all muts
+# 2) for dr_muts
+#===========================
+#%%%%%%%%%%%%%%%%%%%%%%%%
+# REASSIGNMENT
+
+#================
+# for ALL muts
+#================
+#plot_df <- my_df
+
+#================
+# for dr muts ONLY
+#================
+plot_df <- my_df_dr
+
+#%%%%%%%%%%%%%%%%%%%%%%%%
+#============================
+# Plot: Lineage Distribution
+# x = mcsm_values, y = dist
+# fill = stability
+#============================
+
+table(plot_df$lineage); str(plot_df$lineage)
+
+# lineages to keep: "lineage1" ... "lineage4"
+sel_lineages <- paste0("lineage", 1:4)
+
+# keep only the rows whose lineage is one of the selected ones
+# (a missing lineage never matches %in%, so those rows are dropped as well)
+df_lin <- subset(x = plot_df, subset = lineage %in% sel_lineages)
+
+# rebuild the factor so that levels with no remaining rows are dropped
+df_lin$lineage <- factor(df_lin$lineage)
+
+# {RESULT: No of samples within lineage}
+# table header: lineage1 lineage2 lineage3 lineage4
+table(df_lin$lineage)
+
+# {Result: No. of unique mutations the 4 lineages contribute to}
+# counted as the distinct values in the Mutationinformation column
+length(unique(df_lin$Mutationinformation))
+
+# sanity check: rows kept in df_lin must equal the summed counts of the selected lineages.
+# Counts are looked up by lineage name rather than by table position (previously 2:5, valid
+# only when the first level is the blank/missing lineage); absent names give NA and are ignored.
+if (sum(table(plot_df$lineage)[sel_lineages], na.rm = TRUE) == nrow(df_lin)) {
+  print("sanity check passed: numbers match")
+} else {
+  print("Error!: check your numbers")
+}
+
+#%%%%%%%%%%%%%%%%%%%%%%%%%%
+# REASSIGNMENT
+df <- df_lin
+#%%%%%%%%%%%%%%%%%%%%%%%%%%
+# the data now lives in df, so the old name can go
+rm(list = "df_lin")
+
+# COMPARING DISTRIBUTIONS
+head(df$lineage)
+df$lineage <- as.character(df$lineage)
+# ratioDUET values split out into one vector per lineage
+lin1 <- df$ratioDUET[df$lineage == "lineage1"]
+lin2 <- df$ratioDUET[df$lineage == "lineage2"]
+lin3 <- df$ratioDUET[df$lineage == "lineage3"]
+lin4 <- df$ratioDUET[df$lineage == "lineage4"]
+
+# two-sample ks test on every pair of lineages (six tests; no multiple-comparison adjustment is applied)
+ks.test(lin1, lin2)
+ks.test(lin1, lin3)
+ks.test(lin1, lin4)
+
+ks.test(lin2, lin3)
+ks.test(lin2, lin4)
+
+ks.test(lin3, lin4)
+
+
+