From 648be02665dbc9102707393a0cdf3944ec784ac7 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Mon, 7 Sep 2020 15:27:53 +0100
Subject: [PATCH] ks test script added

---
 scripts/ks_test_PS.R               | 211 +++++++++++++++++++++++++++++
 scripts/plotting/autoviz.py        |  45 ------
 scripts/plotting/combined_or.R     |   1 -
 scripts/plotting/lineage_count.txt |  71 ----------
 4 files changed, 211 insertions(+), 117 deletions(-)
 create mode 100644 scripts/ks_test_PS.R
 delete mode 100644 scripts/plotting/autoviz.py
 delete mode 100644 scripts/plotting/lineage_count.txt

diff --git a/scripts/ks_test_PS.R b/scripts/ks_test_PS.R
new file mode 100644
index 0000000..bc11957
--- /dev/null
+++ b/scripts/ks_test_PS.R
@@ -0,0 +1,211 @@
+#!/usr/bin/env Rscript
+#########################################################
+# TASK: KS test for PS/DUET lineage distributions
+#=======================================================================
+#=======================================================================
+# working dir and loading libraries
+getwd()
+setwd("~/git/LSHTM_analysis/scripts/")
+getwd()
+
+#source("/plotting/Header_TT.R")
+#source("../barplot_colour_function.R")
+#require(data.table)
+source("plotting/combining_dfs_plotting.R")
+# should return the following dfs, directories and variables
+
+# PS combined: 
+# 1) merged_df2
+# 2) merged_df2_comp
+# 3) merged_df3
+# 4) merged_df3_comp
+
+# LIG combined: 
+# 5) merged_df2_lig
+# 6) merged_df2_comp_lig
+# 7) merged_df3_lig
+# 8) merged_df3_comp_lig
+
+# 9) my_df_u
+# 10) my_df_u_lig
+
+cat(paste0("Directories imported:"
+           , "\ndatadir:", datadir
+           , "\nindir:", indir
+           , "\noutdir:", outdir
+           , "\nplotdir:", plotdir))
+
+cat(paste0("Variables imported:"
+           , "\ndrug:", drug
+           , "\ngene:", gene
+           , "\ngene_match:", gene_match
+           , "\nAngstrom symbol:", angstroms_symbol
+           , "\nNo. of duplicated muts:", dup_muts_nu
+           , "\nNA count for ORs:", na_count
+           , "\nNA count in df2:", na_count_df2
+           , "\nNA count in df3:", na_count_df3))       
+
+###########################
+# Data for  stats
+# you need merged_df2 or merged_df2_comp
+# since this is one-many relationship 
+# i.e the same SNP can belong to multiple lineages
+# using the _comp dataset means
+# we lose some muts and at this level, we should use
+# as much info as available, hence use df with NA
+###########################
+
+# REASSIGNMENT
+my_df  = merged_df2
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+# Ensure correct data type in columns to plot: need to be factor
+is.factor(my_df$lineage)
+my_df$lineage = as.factor(my_df$lineage)
+is.factor(my_df$lineage)
+
+table(my_df$mutation_info); str(my_df$mutation_info)
+
+# subset df with dr muts only
+my_df_dr = subset(my_df, mutation_info == "dr_mutations_pyrazinamide") 
+table(my_df_dr$mutation_info)
+
+# stats for all muts and dr_muts
+# 1) for all muts
+# 2) for dr_muts
+#===========================
+table(my_df$lineage); str(my_df$lineage)
+table(my_df_dr$lineage); str(my_df_dr$lineage)
+
+# subset only lineages1-4
+sel_lineages = c("lineage1"
+                 , "lineage2"
+                 , "lineage3"
+                 , "lineage4")
+
+# subset and refactor: all muts
+df_lin = subset(my_df, subset = lineage %in% sel_lineages)
+df_lin$lineage = factor(df_lin$lineage)
+
+# subset and refactor: dr muts
+df_lin_dr = subset(my_df_dr, subset = lineage %in% sel_lineages)
+df_lin_dr$lineage = factor(df_lin_dr$lineage)
+
+
+#{RESULT: No of samples within lineage}
+table(df_lin$lineage) 
+table(df_lin_dr$lineage)
+
+#{Result: No. of unique mutations the 4 lineages contribute to}
+length(unique(df_lin$mutationinformation))
+length(unique(df_lin_dr$mutationinformation))
+
+
+# COMPARING DISTRIBUTIONS
+#================
+# ALL mutations
+#=================
+head(df_lin$lineage)
+df_lin$lineage = as.character(df_lin$lineage)
+
+lin1 = df_lin[df_lin$lineage == "lineage1",]$duet_scaled
+lin2 = df_lin[df_lin$lineage == "lineage2",]$duet_scaled
+lin3 = df_lin[df_lin$lineage == "lineage3",]$duet_scaled
+lin4 = df_lin[df_lin$lineage == "lineage4",]$duet_scaled
+
+# ks test
+lin12 = ks.test(lin1,lin2) 
+lin12_df = as.data.frame(cbind(lin12$data.name, lin12$p.value))
+  
+lin13 = ks.test(lin1,lin3) 
+lin13_df = as.data.frame(cbind(lin13$data.name, lin13$p.value))
+
+lin14 = ks.test(lin1,lin4) 
+lin14_df = as.data.frame(cbind(lin14$data.name, lin14$p.value))
+
+lin23 = ks.test(lin2,lin3)
+lin23_df = as.data.frame(cbind(lin23$data.name, lin23$p.value))
+
+lin24 = ks.test(lin2,lin4)  
+lin24_df = as.data.frame(cbind(lin24$data.name, lin24$p.value))
+
+lin34 = ks.test(lin3,lin4)
+lin34_df = as.data.frame(cbind(lin34$data.name, lin34$p.value))
+
+ks_results_all = rbind(lin12_df 
+                   , lin13_df
+                   , lin14_df
+                   , lin23_df
+                   , lin24_df
+                   , lin34_df)
+
+#p-value < 2.2e-16
+rm(lin12, lin12_df
+   , lin13, lin13_df
+   , lin14, lin14_df
+   , lin23, lin23_df
+   , lin24, lin24_df
+   , lin34, lin34_df)
+
+#================
+# DRUG mutations
+#=================
+head(df_lin_dr$lineage)
+df_lin_dr$lineage = as.character(df_lin_dr$lineage)
+
+lin1_dr = df_lin_dr[df_lin_dr$lineage == "lineage1",]$duet_scaled
+lin2_dr = df_lin_dr[df_lin_dr$lineage == "lineage2",]$duet_scaled
+lin3_dr = df_lin_dr[df_lin_dr$lineage == "lineage3",]$duet_scaled
+lin4_dr = df_lin_dr[df_lin_dr$lineage == "lineage4",]$duet_scaled
+
+# ks test: dr muts
+lin12_dr = ks.test(lin1_dr,lin2_dr) 
+lin12_df_dr = as.data.frame(cbind(lin12_dr$data.name, lin12_dr$p.value))
+
+lin13_dr = ks.test(lin1_dr,lin3_dr) 
+lin13_df_dr = as.data.frame(cbind(lin13_dr$data.name, lin13_dr$p.value))
+
+lin14_dr = ks.test(lin1_dr,lin4_dr) 
+lin14_df_dr = as.data.frame(cbind(lin14_dr$data.name, lin14_dr$p.value))
+
+lin23_dr = ks.test(lin2_dr,lin3_dr)
+lin23_df_dr = as.data.frame(cbind(lin23_dr$data.name, lin23_dr$p.value))
+
+lin24_dr = ks.test(lin2_dr,lin4_dr)  
+lin24_df_dr = as.data.frame(cbind(lin24_dr$data.name, lin24_dr$p.value))
+
+lin34_dr = ks.test(lin3_dr,lin4_dr)
+lin34_df_dr = as.data.frame(cbind(lin34_dr$data.name, lin34_dr$p.value))
+
+ks_results_dr = rbind(lin12_df_dr 
+                      , lin13_df_dr
+                      , lin14_df_dr
+                      , lin23_df_dr
+                      , lin24_df_dr
+                      , lin34_df_dr)
+
+ks_results_combined = cbind(ks_results_all, ks_results_dr)
+
+my_colnames = c("Lineage_comparisons"
+                , paste0("All_mutations n=", nrow(df_lin))
+                , paste0("Drug_associated_mutations n=", nrow(df_lin_dr)))
+my_colnames
+
+# select the output columns
+ks_results_combined_f = ks_results_combined[,c(1,2,4)]
+
+colnames(ks_results_combined_f) = my_colnames
+ks_results_combined_f 
+
+#=============
+# write output file
+#=============
+ks_results = paste0(outdir,"/results/ks_results.csv")
+write.csv(ks_results_combined_f, ks_results, row.names = F)
+
diff --git a/scripts/plotting/autoviz.py b/scripts/plotting/autoviz.py
deleted file mode 100644
index 8dc6405..0000000
--- a/scripts/plotting/autoviz.py
+++ /dev/null
@@ -1,45 +0,0 @@
-#!/usr/bin/env python3
-#=======================================================================
-#%% useful links
-#https://towardsdatascience.com/autoviz-automatically-visualize-any-dataset-ba2691a8b55a
-#https://pypi.org/project/autoviz/
-#=======================================================================
-import os, sys
-import pandas as pd
-import numpy as np
-import re
-import argparse
-from autoviz.AutoViz_Class import AutoViz_Class
-
-homedir = os.path.expanduser('~')
-os.chdir(homedir + '/git/LSHTM_analysis/scripts')
-#%%============================================================================
-# variables
-gene = 'pncA'
-drug = 'pyrazinamide'
-
-#%%============================================================================
-#==============
-# directories
-#==============
-datadir = homedir + '/' + 'git/Data'
-    
-indir = datadir + '/' + drug + '/input'
-    
-outdir = datadir + '/' + drug + '/output'
-
-#=======
-# input
-#=======
-in_filename_plotting = 'car_design.csv'
-in_filename_plotting = gene.lower() + '_all_params.csv'
-infile_plotting = outdir + '/' + in_filename_plotting
-print('plotting file: ', infile_plotting
-      , '\n============================================================')
-#=======================================================================
-plotting_df = pd.read_csv(infile_plotting, sep = ',')
-#Instantiate the AutoViz class
-AV = AutoViz_Class()
-df = AV.AutoViz(infile_plotting)
-#df2 = AV.AutoViz(plotting_df)
-plotting_df.columns[~plotting_df.columns.isin(df.columns)]
diff --git a/scripts/plotting/combined_or.R b/scripts/plotting/combined_or.R
index 4bf2b80..1c167a8 100644
--- a/scripts/plotting/combined_or.R
+++ b/scripts/plotting/combined_or.R
@@ -1,4 +1,3 @@
-
 #!/usr/bin/env Rscript
 #########################################################
 # TASK: Basic lineage barplot showing numbers
diff --git a/scripts/plotting/lineage_count.txt b/scripts/plotting/lineage_count.txt
deleted file mode 100644
index fe6a87d..0000000
--- a/scripts/plotting/lineage_count.txt
+++ /dev/null
@@ -1,71 +0,0 @@
-#=============
-# merged_df2
-#=============
-----------------
-# no. of samples
-----------------
-                  Var1 Freq
-1                         8
-2             lineage1  144
-3    lineage1;lineage2    3
-4    lineage1;lineage4    4
-5             lineage2 1886
-6    lineage2;lineage4   19
-7             lineage3  190
-8    lineage3;lineage4   11
-9             lineage4 2213
-10   lineage4;lineage6    1
-11   lineage4;lineage7    1
-12 lineage4;lineageBOV    1
-13            lineage5   31
-14            lineage6    9
-15            lineage7    3
-16          lineageBOV  392
-
-----------------
-# no. of nsSNPs
-----------------
-
-  sel_lineages num_snps_u total_samples
-1     lineage1         74           144
-2     lineage2        277          1886
-3     lineage3        104           190
-4     lineage4        311          2213
-5     lineage5         18            31
-6     lineage6          8             9
-7     lineage7          1             3
-
-
-#=============
-# merged_df2_comp
-#=============
-----------------
-# no. of samples
-----------------
-
-                  Var1 Freq
-1                         3
-2             lineage1  108
-3    lineage1;lineage2    2
-4    lineage1;lineage4    2
-5             lineage2 1497
-6    lineage2;lineage4   13
-7             lineage3  154
-8    lineage3;lineage4    3
-9             lineage4 1846
-10 lineage4;lineageBOV    1
-11            lineage5   12
-12            lineage6    2
-13          lineageBOV  36
-
-----------------
-# no. of nsSNPs
-----------------
-  sel_lineages num_snps_u total_samples
-1     lineage1         42           108
-2     lineage2        141          1497
-3     lineage3         75           154
-4     lineage4        148          1846
-5     lineage5          9            12
-6     lineage6          2             2
-7     lineage7          0             0