170 lines
5.3 KiB
R
Executable file
170 lines
5.3 KiB
R
Executable file
#!/usr/bin/env Rscript
|
|
#########################################################
|
|
# TASK: KS test for PS/DUET lineage distributions
|
|
#=======================================================================
|
|
#=======================================================================
|
|
# working dir and loading libraries
|
|
getwd()
|
|
setwd("~/git/LSHTM_analysis/scripts/")
|
|
getwd()
|
|
|
|
#source("~/git/LSHTM_analysis/scripts/Header_TT.R")
|
|
#source("../barplot_colour_function.R")
|
|
#require(data.table)
|
|
source("plotting/combining_dfs_plotting.R")
|
|
# should return the following dfs, directories and variables
|
|
|
|
# PS combined:
|
|
# 1) merged_df2
|
|
# 2) merged_df2_comp
|
|
# 3) merged_df3
|
|
# 4) merged_df3_comp
|
|
|
|
# LIG combined:
|
|
# 5) merged_df2_lig
|
|
# 6) merged_df2_comp_lig
|
|
# 7) merged_df3_lig
|
|
# 8) merged_df3_comp_lig
|
|
|
|
# 9) my_df_u
|
|
# 10) my_df_u_lig
|
|
|
|
cat(paste0("Directories imported:"
|
|
, "\ndatadir:", datadir
|
|
, "\nindir:", indir
|
|
, "\noutdir:", outdir
|
|
, "\nplotdir:", plotdir))
|
|
|
|
cat(paste0("Variables imported:"
|
|
, "\ndrug:", drug
|
|
, "\ngene:", gene
|
|
, "\ngene_match:", gene_match
|
|
, "\nAngstrom symbol:", angstroms_symbol
|
|
, "\nNo. of duplicated muts:", dup_muts_nu
|
|
, "\nNA count for ORs:", na_count
|
|
, "\nNA count in df2:", na_count_df2
|
|
, "\nNA count in df3:", na_count_df3))
|
|
|
|
|
|
#=============
|
|
# Output
|
|
#=============
|
|
# ks test by lineage
|
|
ks_dr_others_lineage = paste0(outdir, "/KS_lineage_dr_vs_others.csv")
|
|
|
|
###########################
|
|
# Data for stats
|
|
# you need merged_df2 or merged_df2_comp
|
|
# since this is one-many relationship
|
|
# i.e the same SNP can belong to multiple lineages
|
|
# using the _comp dataset means
|
|
# we lose some muts and at this level, we should use
|
|
# as much info as available, hence use df with NA
|
|
###########################
|
|
|
|
# REASSIGNMENT
|
|
my_df = merged_df2
|
|
|
|
# delete variables not required
|
|
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
|
rm(merged_df2_lig, merged_df2_comp_lig, merged_df3_lig, merged_df3_comp_lig, my_df_u, my_df_u_lig)
|
|
|
|
# quick checks
|
|
colnames(my_df)
|
|
str(my_df)
|
|
|
|
# Ensure correct data type in columns to plot: need to be factor
|
|
#is.factor(my_df$lineage)
|
|
#my_df$lineage = as.factor(my_df$lineage)
|
|
#is.factor(my_df$lineage)
|
|
########################################################################
|
|
table(my_df$lineage); str(my_df$lineage)
|
|
|
|
# subset only lineages1-4
|
|
sel_lineages = c("lineage1"
|
|
, "lineage2"
|
|
, "lineage3"
|
|
, "lineage4")
|
|
|
|
# subset selected lineages
|
|
df_lin = subset(my_df, subset = lineage %in% sel_lineages)
|
|
|
|
#==============
|
|
# dr_muts_col
|
|
#==============
|
|
table(df_lin$mutation_info); str(df_lin$mutation_info)
|
|
|
|
# subset df with dr muts only
|
|
df_lin_dr = subset(df_lin, mutation_info == dr_muts_col)
|
|
table(df_lin_dr$mutation_info)
|
|
|
|
#==============
|
|
# other_muts_col
|
|
#==============
|
|
df_lin_other = subset(df_lin, mutation_info == other_muts_col)
|
|
table(df_lin_other$mutation_info)
|
|
|
|
#=======================================================================
|
|
# individual: CHECKS
|
|
ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == "lineage1"]
|
|
, df_lin_other$duet_scaled[df_lin_other$lineage == "lineage1"])
|
|
|
|
#=======================================================================
|
|
my_lineages = levels(factor(df_lin$lineage)); my_lineages
|
|
|
|
ks_df = data.frame()
|
|
|
|
for (i in my_lineages){
|
|
cat(i)
|
|
df = data.frame(lineage = NA, method = NA, ks_statistic = NA, ks_pvalue = NA)
|
|
|
|
lineage_comp = i
|
|
ks_method = ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == i]
|
|
, df_lin_other$duet_scaled[df_lin_other$lineage == i])$method
|
|
|
|
ks_statistic = ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == i]
|
|
, df_lin_other$duet_scaled[df_lin_other$lineage == i])$statistic
|
|
|
|
ks_pvalue = ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == i]
|
|
, df_lin_other$duet_scaled[df_lin_other$lineage == i])$p.value
|
|
|
|
# print(c(lineage_comp, ks_method, ks_statistic[[1]], ks_pval))
|
|
df$lineage = lineage_comp
|
|
df$method = ks_method
|
|
df$ks_statistic = ks_statistic[[1]]
|
|
df$ks_pvalue = ks_pvalue
|
|
|
|
print(df)
|
|
ks_df = rbind(ks_df,df)
|
|
|
|
}
|
|
|
|
#=======================================================================
|
|
# formatting
|
|
#=======================================================================
|
|
# add total_n number
|
|
ks_df$n_drug = nrow(df_lin_dr)
|
|
ks_df$n_others = nrow(df_lin_other)
|
|
|
|
# add group
|
|
ks_df$group = "Drug vs Others"
|
|
|
|
str(ks_df)
|
|
ks_df$pvalue_signif = ks_df$ks_pvalue
|
|
str(ks_df$pvalue_signif)
|
|
|
|
# adding pvalue_signif
|
|
ks_df = dplyr::mutate(ks_df, pvalue_signif = case_when(pvalue_signif == 0.05 ~ "."
|
|
, pvalue_signif <=0.0001 ~ '****'
|
|
, pvalue_signif <=0.001 ~ '***'
|
|
, pvalue_signif <=0.01 ~ '**'
|
|
, pvalue_signif <0.05 ~ '*'
|
|
, TRUE ~ 'ns'))
|
|
|
|
|
|
#=======================================================================
|
|
#******************
|
|
# write output file: KS test
|
|
#******************
|
|
cat("Output of KS test bt lineage:", ks_dr_others_lineage )
|
|
write.csv(ks_df, ks_dr_others_lineage, row.names = FALSE)
|