LSHTM_analysis/scripts/ks_test_dr_others_PS.R

170 lines
5.2 KiB
R
Executable file

#!/usr/bin/env Rscript
#########################################################
# TASK: KS test for PS/DUET lineage distributions
#=======================================================================
#=======================================================================
# working dir and loading libraries
getwd()
setwd("~/git/LSHTM_analysis/scripts/")
getwd()
#source("/plotting/Header_TT.R")
#source("../barplot_colour_function.R")
#require(data.table)
source("plotting/combining_dfs_plotting.R")
# should return the following dfs, directories and variables
# PS combined:
# 1) merged_df2
# 2) merged_df2_comp
# 3) merged_df3
# 4) merged_df3_comp
# LIG combined:
# 5) merged_df2_lig
# 6) merged_df2_comp_lig
# 7) merged_df3_lig
# 8) merged_df3_comp_lig
# 9) my_df_u
# 10) my_df_u_lig
cat(paste0("Directories imported:"
, "\ndatadir:", datadir
, "\nindir:", indir
, "\noutdir:", outdir
, "\nplotdir:", plotdir))
cat(paste0("Variables imported:"
, "\ndrug:", drug
, "\ngene:", gene
, "\ngene_match:", gene_match
, "\nAngstrom symbol:", angstroms_symbol
, "\nNo. of duplicated muts:", dup_muts_nu
, "\nNA count for ORs:", na_count
, "\nNA count in df2:", na_count_df2
, "\nNA count in df3:", na_count_df3))
#=============
# Output
#=============
# ks test by lineage
ks_dr_others_lineage = paste0(outdir, "/KS_lineage_dr_vs_others.csv")
###########################
# Data for stats
# you need merged_df2 or merged_df2_comp
# since this is one-many relationship
# i.e the same SNP can belong to multiple lineages
# using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available, hence use df with NA
###########################
# REASSIGNMENT
my_df = merged_df2
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
rm(merged_df2_lig, merged_df2_comp_lig, merged_df3_lig, merged_df3_comp_lig, my_df_u, my_df_u_lig)
# quick checks
colnames(my_df)
str(my_df)
# Ensure correct data type in columns to plot: need to be factor
#is.factor(my_df$lineage)
#my_df$lineage = as.factor(my_df$lineage)
#is.factor(my_df$lineage)
########################################################################
table(my_df$lineage); str(my_df$lineage)
# subset only lineages1-4
sel_lineages = c("lineage1"
, "lineage2"
, "lineage3"
, "lineage4")
# subset selected lineages
df_lin = subset(my_df, subset = lineage %in% sel_lineages)
#==============
# dr_muts_col
#==============
table(df_lin$mutation_info); str(df_lin$mutation_info)
# subset df with dr muts only
df_lin_dr = subset(df_lin, mutation_info == dr_muts_col)
table(df_lin_dr$mutation_info)
#==============
# other_muts_col
#==============
df_lin_other = subset(df_lin, mutation_info == other_muts_col)
table(df_lin_other$mutation_info)
#=======================================================================
# individual: CHECKS
ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == "lineage1"]
, df_lin_other$duet_scaled[df_lin_other$lineage == "lineage1"])
#=======================================================================
my_lineages = levels(factor(df_lin$lineage)); my_lineages
ks_df = data.frame()
for (i in my_lineages){
cat(i)
df = data.frame(lineage = NA, method = NA, ks_statistic = NA, ks_pvalue = NA)
lineage_comp = i
ks_method = ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == i]
, df_lin_other$duet_scaled[df_lin_other$lineage == i])$method
ks_statistic = ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == i]
, df_lin_other$duet_scaled[df_lin_other$lineage == i])$statistic
ks_pvalue = ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == i]
, df_lin_other$duet_scaled[df_lin_other$lineage == i])$p.value
# print(c(lineage_comp, ks_method, ks_statistic[[1]], ks_pval))
df$lineage = lineage_comp
df$method = ks_method
df$ks_statistic = ks_statistic[[1]]
df$ks_pvalue = ks_pvalue
print(df)
ks_df = rbind(ks_df,df)
}
#=======================================================================
# formatting
#=======================================================================
# add total_n number
ks_df$n_drug = nrow(df_lin_dr)
ks_df$n_others = nrow(df_lin_other)
# add group
ks_df$group = "Drug vs Others"
str(ks_df)
ks_df$pvalue_signif = ks_df$ks_pvalue
str(ks_df$pvalue_signif)
# adding pvalue_signif
ks_df = dplyr::mutate(ks_df, pvalue_signif = case_when(pvalue_signif == 0.05 ~ "."
, pvalue_signif <=0.0001 ~ '****'
, pvalue_signif <=0.001 ~ '***'
, pvalue_signif <=0.01 ~ '**'
, pvalue_signif <0.05 ~ '*'
, TRUE ~ 'ns'))
#=======================================================================
#******************
# write output file: KS test
#******************
cat("Output of KS test bt lineage:", ks_dr_others_lineage )
write.csv(ks_df, ks_dr_others_lineage, row.names = FALSE)