added ks_test_all_PS.R, ks_test_dr_PS.R, ks_test_dr_others_PS.R

2020-09-21 17:46:22 +01:00 · 2020-09-21 17:46:22 +01:00 · 2297617af2
commit 2297617af2
parent be8fa7e639
4 changed files with 722 additions and 211 deletions
--- a/scripts/ks_test_dr_others_PS.R
+++ b/scripts/ks_test_dr_others_PS.R
@ -0,0 +1,170 @@
+#!/usr/bin/env Rscript
+#########################################################
+# TASK: KS test for PS/DUET lineage distributions
+#=======================================================================
+#=======================================================================
+# working dir and loading libraries
+getwd()
+setwd("~/git/LSHTM_analysis/scripts/")
+getwd()
+
+#source("/plotting/Header_TT.R")
+#source("../barplot_colour_function.R")
+#require(data.table)
+source("plotting/combining_dfs_plotting.R")
+# should return the following dfs, directories and variables
+
+# PS combined: 
+# 1) merged_df2
+# 2) merged_df2_comp
+# 3) merged_df3
+# 4) merged_df3_comp
+
+# LIG combined: 
+# 5) merged_df2_lig
+# 6) merged_df2_comp_lig
+# 7) merged_df3_lig
+# 8) merged_df3_comp_lig
+
+# 9) my_df_u
+# 10) my_df_u_lig
+
+cat(paste0("Directories imported:"
+           , "\ndatadir:", datadir
+           , "\nindir:", indir
+           , "\noutdir:", outdir
+           , "\nplotdir:", plotdir))
+
+cat(paste0("Variables imported:"
+           , "\ndrug:", drug
+           , "\ngene:", gene
+           , "\ngene_match:", gene_match
+           , "\nAngstrom symbol:", angstroms_symbol
+           , "\nNo. of duplicated muts:", dup_muts_nu
+           , "\nNA count for ORs:", na_count
+           , "\nNA count in df2:", na_count_df2
+           , "\nNA count in df3:", na_count_df3))       
+
+
+#=============
+# Output
+#=============
+# ks test by lineage
+ks_dr_others_lineage = paste0(outdir, "/KS_lineage_dr_vs_others.csv")
+
+###########################
+# Data for  stats
+# you need merged_df2 or merged_df2_comp
+# since this is one-many relationship 
+# i.e the same SNP can belong to multiple lineages
+# using the _comp dataset means
+# we lose some muts and at this level, we should use
+# as much info as available, hence use df with NA
+###########################
+
+# REASSIGNMENT
+my_df  = merged_df2
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+rm(merged_df2_lig, merged_df2_comp_lig, merged_df3_lig, merged_df3_comp_lig, my_df_u, my_df_u_lig)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+# Ensure correct data type in columns to plot: need to be factor
+#is.factor(my_df$lineage)
+#my_df$lineage = as.factor(my_df$lineage)
+#is.factor(my_df$lineage)
+########################################################################
+table(my_df$lineage); str(my_df$lineage)
+
+# subset only lineages1-4
+sel_lineages = c("lineage1"
+                 , "lineage2"
+                 , "lineage3"
+                 , "lineage4")
+
+# subset selected lineages
+df_lin = subset(my_df, subset = lineage %in% sel_lineages)
+
+#==============
+# dr_muts_col
+#==============
+table(df_lin$mutation_info); str(df_lin$mutation_info)
+
+# subset df with dr muts only
+df_lin_dr = subset(df_lin, mutation_info == dr_muts_col) 
+table(df_lin_dr$mutation_info)
+
+#==============
+# other_muts_col
+#==============
+df_lin_other = subset(df_lin, mutation_info == other_muts_col) 
+table(df_lin_other$mutation_info)
+
+#=======================================================================
+# individual: CHECKS
+ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == "lineage1"]
+        , df_lin_other$duet_scaled[df_lin_other$lineage == "lineage1"])
+
+#=======================================================================
+my_lineages = levels(factor(df_lin$lineage)); my_lineages
+
+ks_df = data.frame()
+
+for (i in my_lineages){
+   cat(i)
+   df = data.frame(lineage = NA, method = NA, ks_statistic = NA, ks_pvalue = NA)
+   
+   lineage_comp = i
+   ks_method = ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == i]
+                       , df_lin_other$duet_scaled[df_lin_other$lineage == i])$method
+   
+   ks_statistic = ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == i]
+                          , df_lin_other$duet_scaled[df_lin_other$lineage == i])$statistic
+   
+   ks_pvalue = ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == i]
+                       , df_lin_other$duet_scaled[df_lin_other$lineage == i])$p.value
+   
+   # print(c(lineage_comp, ks_method, ks_statistic[[1]], ks_pval))
+   df$lineage = lineage_comp
+   df$method = ks_method
+   df$ks_statistic = ks_statistic[[1]]
+   df$ks_pvalue = ks_pvalue
+   
+   print(df)
+   ks_df = rbind(ks_df,df)
+   
+}
+
+#=======================================================================
+# formatting
+#=======================================================================
+# add total_n number
+ks_df$n_drug = nrow(df_lin_dr)
+ks_df$n_others = nrow(df_lin_other)
+
+# add group
+ks_df$group = "Drug vs Others"
+
+str(ks_df)
+ks_df$pvalue_signif = ks_df$ks_pvalue
+str(ks_df$pvalue_signif)
+
+# adding pvalue_signif
+ks_df = dplyr::mutate(ks_df, pvalue_signif = case_when(pvalue_signif == 0.05 ~ "."
+                                                      , pvalue_signif <=0.0001 ~ '****'
+                                                      , pvalue_signif <=0.001 ~ '***'
+                                                      , pvalue_signif <=0.01 ~ '**'
+                                                      , pvalue_signif <0.05 ~ '*'
+                                                      , TRUE ~ 'ns'))
+
+
+#=======================================================================
+#******************
+# write output file: KS test
+#******************
+cat("Output of KS test bt lineage:", ks_dr_others_lineage )
+write.csv(ks_df, ks_dr_others_lineage, row.names = FALSE)