added ks_test_all_PS.R, ks_test_dr_PS.R, ks_test_dr_others_PS.R
This commit is contained in:
parent
be8fa7e639
commit
2297617af2
4 changed files with 722 additions and 211 deletions
170
scripts/ks_test_dr_others_PS.R
Executable file
170
scripts/ks_test_dr_others_PS.R
Executable file
|
@ -0,0 +1,170 @@
|
|||
#!/usr/bin/env Rscript
|
||||
#########################################################
|
||||
# TASK: KS test for PS/DUET lineage distributions
|
||||
#=======================================================================
|
||||
#=======================================================================
|
||||
# working dir and loading libraries
|
||||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/scripts/")
|
||||
getwd()
|
||||
|
||||
#source("/plotting/Header_TT.R")
|
||||
#source("../barplot_colour_function.R")
|
||||
#require(data.table)
|
||||
source("plotting/combining_dfs_plotting.R")
|
||||
# should return the following dfs, directories and variables
|
||||
|
||||
# PS combined:
|
||||
# 1) merged_df2
|
||||
# 2) merged_df2_comp
|
||||
# 3) merged_df3
|
||||
# 4) merged_df3_comp
|
||||
|
||||
# LIG combined:
|
||||
# 5) merged_df2_lig
|
||||
# 6) merged_df2_comp_lig
|
||||
# 7) merged_df3_lig
|
||||
# 8) merged_df3_comp_lig
|
||||
|
||||
# 9) my_df_u
|
||||
# 10) my_df_u_lig
|
||||
|
||||
cat(paste0("Directories imported:"
|
||||
, "\ndatadir:", datadir
|
||||
, "\nindir:", indir
|
||||
, "\noutdir:", outdir
|
||||
, "\nplotdir:", plotdir))
|
||||
|
||||
cat(paste0("Variables imported:"
|
||||
, "\ndrug:", drug
|
||||
, "\ngene:", gene
|
||||
, "\ngene_match:", gene_match
|
||||
, "\nAngstrom symbol:", angstroms_symbol
|
||||
, "\nNo. of duplicated muts:", dup_muts_nu
|
||||
, "\nNA count for ORs:", na_count
|
||||
, "\nNA count in df2:", na_count_df2
|
||||
, "\nNA count in df3:", na_count_df3))
|
||||
|
||||
|
||||
#=============
|
||||
# Output
|
||||
#=============
|
||||
# ks test by lineage
|
||||
ks_dr_others_lineage = paste0(outdir, "/KS_lineage_dr_vs_others.csv")
|
||||
|
||||
###########################
|
||||
# Data for stats
|
||||
# you need merged_df2 or merged_df2_comp
|
||||
# since this is one-many relationship
|
||||
# i.e the same SNP can belong to multiple lineages
|
||||
# using the _comp dataset means
|
||||
# we lose some muts and at this level, we should use
|
||||
# as much info as available, hence use df with NA
|
||||
###########################
|
||||
|
||||
# REASSIGNMENT
|
||||
my_df = merged_df2
|
||||
|
||||
# delete variables not required
|
||||
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
||||
rm(merged_df2_lig, merged_df2_comp_lig, merged_df3_lig, merged_df3_comp_lig, my_df_u, my_df_u_lig)
|
||||
|
||||
# quick checks
|
||||
colnames(my_df)
|
||||
str(my_df)
|
||||
|
||||
# Ensure correct data type in columns to plot: need to be factor
|
||||
#is.factor(my_df$lineage)
|
||||
#my_df$lineage = as.factor(my_df$lineage)
|
||||
#is.factor(my_df$lineage)
|
||||
########################################################################
|
||||
table(my_df$lineage); str(my_df$lineage)
|
||||
|
||||
# subset only lineages1-4
|
||||
sel_lineages = c("lineage1"
|
||||
, "lineage2"
|
||||
, "lineage3"
|
||||
, "lineage4")
|
||||
|
||||
# subset selected lineages
|
||||
df_lin = subset(my_df, subset = lineage %in% sel_lineages)
|
||||
|
||||
#==============
|
||||
# dr_muts_col
|
||||
#==============
|
||||
table(df_lin$mutation_info); str(df_lin$mutation_info)
|
||||
|
||||
# subset df with dr muts only
|
||||
df_lin_dr = subset(df_lin, mutation_info == dr_muts_col)
|
||||
table(df_lin_dr$mutation_info)
|
||||
|
||||
#==============
|
||||
# other_muts_col
|
||||
#==============
|
||||
df_lin_other = subset(df_lin, mutation_info == other_muts_col)
|
||||
table(df_lin_other$mutation_info)
|
||||
|
||||
#=======================================================================
|
||||
# individual: CHECKS
|
||||
ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == "lineage1"]
|
||||
, df_lin_other$duet_scaled[df_lin_other$lineage == "lineage1"])
|
||||
|
||||
#=======================================================================
|
||||
my_lineages = levels(factor(df_lin$lineage)); my_lineages
|
||||
|
||||
ks_df = data.frame()
|
||||
|
||||
for (i in my_lineages){
|
||||
cat(i)
|
||||
df = data.frame(lineage = NA, method = NA, ks_statistic = NA, ks_pvalue = NA)
|
||||
|
||||
lineage_comp = i
|
||||
ks_method = ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == i]
|
||||
, df_lin_other$duet_scaled[df_lin_other$lineage == i])$method
|
||||
|
||||
ks_statistic = ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == i]
|
||||
, df_lin_other$duet_scaled[df_lin_other$lineage == i])$statistic
|
||||
|
||||
ks_pvalue = ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == i]
|
||||
, df_lin_other$duet_scaled[df_lin_other$lineage == i])$p.value
|
||||
|
||||
# print(c(lineage_comp, ks_method, ks_statistic[[1]], ks_pval))
|
||||
df$lineage = lineage_comp
|
||||
df$method = ks_method
|
||||
df$ks_statistic = ks_statistic[[1]]
|
||||
df$ks_pvalue = ks_pvalue
|
||||
|
||||
print(df)
|
||||
ks_df = rbind(ks_df,df)
|
||||
|
||||
}
|
||||
|
||||
#=======================================================================
|
||||
# formatting
|
||||
#=======================================================================
|
||||
# add total_n number
|
||||
ks_df$n_drug = nrow(df_lin_dr)
|
||||
ks_df$n_others = nrow(df_lin_other)
|
||||
|
||||
# add group
|
||||
ks_df$group = "Drug vs Others"
|
||||
|
||||
str(ks_df)
|
||||
ks_df$pvalue_signif = ks_df$ks_pvalue
|
||||
str(ks_df$pvalue_signif)
|
||||
|
||||
# adding pvalue_signif
|
||||
ks_df = dplyr::mutate(ks_df, pvalue_signif = case_when(pvalue_signif == 0.05 ~ "."
|
||||
, pvalue_signif <=0.0001 ~ '****'
|
||||
, pvalue_signif <=0.001 ~ '***'
|
||||
, pvalue_signif <=0.01 ~ '**'
|
||||
, pvalue_signif <0.05 ~ '*'
|
||||
, TRUE ~ 'ns'))
|
||||
|
||||
|
||||
#=======================================================================
|
||||
#******************
|
||||
# write output file: KS test
|
||||
#******************
|
||||
cat("Output of KS test bt lineage:", ks_dr_others_lineage )
|
||||
write.csv(ks_df, ks_dr_others_lineage, row.names = FALSE)
|
Loading…
Add table
Add a link
Reference in a new issue