diff --git a/scripts/ks_test_PS.R b/scripts/ks_test_PS.R deleted file mode 100644 index bc11957..0000000 --- a/scripts/ks_test_PS.R +++ /dev/null @@ -1,211 +0,0 @@ -#!/usr/bin/env Rscript -######################################################### -# TASK: KS test for PS/DUET lineage distributions -#======================================================================= -#======================================================================= -# working dir and loading libraries -getwd() -setwd("~/git/LSHTM_analysis/scripts/") -getwd() - -#source("/plotting/Header_TT.R") -#source("../barplot_colour_function.R") -#require(data.table) -source("plotting/combining_dfs_plotting.R") -# should return the following dfs, directories and variables - -# PS combined: -# 1) merged_df2 -# 2) merged_df2_comp -# 3) merged_df3 -# 4) merged_df3_comp - -# LIG combined: -# 5) merged_df2_lig -# 6) merged_df2_comp_lig -# 7) merged_df3_lig -# 8) merged_df3_comp_lig - -# 9) my_df_u -# 10) my_df_u_lig - -cat(paste0("Directories imported:" - , "\ndatadir:", datadir - , "\nindir:", indir - , "\noutdir:", outdir - , "\nplotdir:", plotdir)) - -cat(paste0("Variables imported:" - , "\ndrug:", drug - , "\ngene:", gene - , "\ngene_match:", gene_match - , "\nAngstrom symbol:", angstroms_symbol - , "\nNo. of duplicated muts:", dup_muts_nu - , "\nNA count for ORs:", na_count - , "\nNA count in df2:", na_count_df2 - , "\nNA count in df3:", na_count_df3)) - -########################### -# Data for stats -# you need merged_df2 or merged_df2_comp -# since this is one-many relationship -# i.e the same SNP can belong to multiple lineages -# using the _comp dataset means -# we lose some muts and at this level, we should use -# as much info as available, hence use df with NA -########################### - -# REASSIGNMENT -my_df = merged_df2 - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -# Ensure correct data type in columns to plot: need to be factor -is.factor(my_df$lineage) -my_df$lineage = as.factor(my_df$lineage) -is.factor(my_df$lineage) - -table(my_df$mutation_info); str(my_df$mutation_info) - -# subset df with dr muts only -my_df_dr = subset(my_df, mutation_info == "dr_mutations_pyrazinamide") -table(my_df_dr$mutation_info) - -# stats for all muts and dr_muts -# 1) for all muts -# 2) for dr_muts -#=========================== -table(my_df$lineage); str(my_df$lineage) -table(my_df_dr$lineage); str(my_df_dr$lineage) - -# subset only lineages1-4 -sel_lineages = c("lineage1" - , "lineage2" - , "lineage3" - , "lineage4") - -# subset and refactor: all muts -df_lin = subset(my_df, subset = lineage %in% sel_lineages) -df_lin$lineage = factor(df_lin$lineage) - -# subset and refactor: dr muts -df_lin_dr = subset(my_df_dr, subset = lineage %in% sel_lineages) -df_lin_dr$lineage = factor(df_lin_dr$lineage) - - -#{RESULT: No of samples within lineage} -table(df_lin$lineage) -table(df_lin_dr$lineage) - -#{Result: No. of unique mutations the 4 lineages contribute to} -length(unique(df_lin$mutationinformation)) -length(unique(df_lin_dr$mutationinformation)) - - -# COMPARING DISTRIBUTIONS -#================ -# ALL mutations -#================= -head(df_lin$lineage) -df_lin$lineage = as.character(df_lin$lineage) - -lin1 = df_lin[df_lin$lineage == "lineage1",]$duet_scaled -lin2 = df_lin[df_lin$lineage == "lineage2",]$duet_scaled -lin3 = df_lin[df_lin$lineage == "lineage3",]$duet_scaled -lin4 = df_lin[df_lin$lineage == "lineage4",]$duet_scaled - -# ks test -lin12 = ks.test(lin1,lin2) -lin12_df = as.data.frame(cbind(lin12$data.name, lin12$p.value)) - -lin13 = ks.test(lin1,lin3) -lin13_df = as.data.frame(cbind(lin13$data.name, lin13$p.value)) - -lin14 = ks.test(lin1,lin4) -lin14_df = as.data.frame(cbind(lin14$data.name, lin14$p.value)) - -lin23 = ks.test(lin2,lin3) -lin23_df = as.data.frame(cbind(lin23$data.name, lin23$p.value)) - -lin24 = ks.test(lin2,lin4) -lin24_df = as.data.frame(cbind(lin24$data.name, lin24$p.value)) - -lin34 = ks.test(lin3,lin4) -lin34_df = as.data.frame(cbind(lin34$data.name, lin34$p.value)) - -ks_results_all = rbind(lin12_df - , lin13_df - , lin14_df - , lin23_df - , lin24_df - , lin34_df) - -#p-value < 2.2e-16 -rm(lin12, lin12_df - , lin13, lin13_df - , lin14, lin14_df - , lin23, lin23_df - , lin24, lin24_df - , lin34, lin34_df) - -#================ -# DRUG mutations -#================= -head(df_lin_dr$lineage) -df_lin_dr$lineage = as.character(df_lin_dr$lineage) - -lin1_dr = df_lin_dr[df_lin_dr$lineage == "lineage1",]$duet_scaled -lin2_dr = df_lin_dr[df_lin_dr$lineage == "lineage2",]$duet_scaled -lin3_dr = df_lin_dr[df_lin_dr$lineage == "lineage3",]$duet_scaled -lin4_dr = df_lin_dr[df_lin_dr$lineage == "lineage4",]$duet_scaled - -# ks test: dr muts -lin12_dr = ks.test(lin1_dr,lin2_dr) -lin12_df_dr = as.data.frame(cbind(lin12_dr$data.name, lin12_dr$p.value)) - -lin13_dr = ks.test(lin1_dr,lin3_dr) -lin13_df_dr = as.data.frame(cbind(lin13_dr$data.name, lin13_dr$p.value)) - -lin14_dr = ks.test(lin1_dr,lin4_dr) -lin14_df_dr = as.data.frame(cbind(lin14_dr$data.name, lin14_dr$p.value)) - -lin23_dr = ks.test(lin2_dr,lin3_dr) -lin23_df_dr = as.data.frame(cbind(lin23_dr$data.name, lin23_dr$p.value)) - -lin24_dr = ks.test(lin2_dr,lin4_dr) -lin24_df_dr = as.data.frame(cbind(lin24_dr$data.name, lin24_dr$p.value)) - -lin34_dr = ks.test(lin3_dr,lin4_dr) -lin34_df_dr = as.data.frame(cbind(lin34_dr$data.name, lin34_dr$p.value)) - -ks_results_dr = rbind(lin12_df_dr - , lin13_df_dr - , lin14_df_dr - , lin23_df_dr - , lin24_df_dr - , lin34_df_dr) - -ks_results_combined = cbind(ks_results_all, ks_results_dr) - -my_colnames = c("Lineage_comparisons" - , paste0("All_mutations n=", nrow(df_lin)) - , paste0("Drug_associated_mutations n=", nrow(df_lin_dr))) -my_colnames - -# select the output columns -ks_results_combined_f = ks_results_combined[,c(1,2,4)] - -colnames(ks_results_combined_f) = my_colnames -ks_results_combined_f - -#============= -# write output file -#============= -ks_results = paste0(outdir,"/results/ks_results.csv") -write.csv(ks_results_combined_f, ks_results, row.names = F) - diff --git a/scripts/ks_test_all_PS.R b/scripts/ks_test_all_PS.R new file mode 100755 index 0000000..54530ed --- /dev/null +++ b/scripts/ks_test_all_PS.R @@ -0,0 +1,280 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: KS test for PS/DUET lineage distributions +#======================================================================= +#======================================================================= +# working dir and loading libraries +getwd() +setwd("~/git/LSHTM_analysis/scripts/") +getwd() + +#source("/plotting/Header_TT.R") +#source("../barplot_colour_function.R") +#require(data.table) +source("plotting/combining_dfs_plotting.R") +# should return the following dfs, directories and variables + +# PS combined: +# 1) merged_df2 +# 2) merged_df2_comp +# 3) merged_df3 +# 4) merged_df3_comp + +# LIG combined: +# 5) merged_df2_lig +# 6) merged_df2_comp_lig +# 7) merged_df3_lig +# 8) merged_df3_comp_lig + +# 9) my_df_u +# 10) my_df_u_lig + +cat(paste0("Directories imported:" + , "\ndatadir:", datadir + , "\nindir:", indir + , "\noutdir:", outdir + , "\nplotdir:", plotdir)) + +cat(paste0("Variables imported:" + , "\ndrug:", drug + , "\ngene:", gene + , "\ngene_match:", gene_match + , "\nAngstrom symbol:", angstroms_symbol + , "\nNo. of duplicated muts:", dup_muts_nu + , "\nNA count for ORs:", na_count + , "\nNA count in df2:", na_count_df2 + , "\nNA count in df3:", na_count_df3)) + + +#============= +# Output +#============= +# ks test by lineage +ks_lineage = paste0(outdir, "/KS_lineage_all_muts.csv") + +########################### +# Data for stats +# you need merged_df2 or merged_df2_comp +# since this is one-many relationship +# i.e the same SNP can belong to multiple lineages +# using the _comp dataset means +# we lose some muts and at this level, we should use +# as much info as available, hence use df with NA +########################### + +# REASSIGNMENT +my_df = merged_df2 + +# delete variables not required +rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) +rm(merged_df2_lig, merged_df2_comp_lig, merged_df3_lig, merged_df3_comp_lig, my_df_u, my_df_u_lig) + +# quick checks +colnames(my_df) +str(my_df) + +# Ensure correct data type in columns to plot: need to be factor +#is.factor(my_df$lineage) +#my_df$lineage = as.factor(my_df$lineage) +#is.factor(my_df$lineage) +######################################################################## +table(my_df$lineage); str(my_df$lineage) + +# subset only lineages1-4 +sel_lineages = c("lineage1" + , "lineage2" + , "lineage3" + , "lineage4") + +# subset selected lineages +df_lin = subset(my_df, subset = lineage %in% sel_lineages) + +#============== +# dr_muts_col +#============== +#table(df_lin$mutation_info); str(df_lin$mutation_info) + +# subset df with dr muts only +#df_lin_dr = subset(df_lin, mutation_info == dr_muts_col) +#table(df_lin_dr$mutation_info) + +#============== +# other_muts_col +#============== +#df_lin_other = subset(df_lin, mutation_info == other_muts_col) +#table(df_lin_other$mutation_info) + +#======================================================================= +# individual: CHECKS +lin1 = df_lin[df_lin$lineage == "lineage1",]$duet_scaled +lin2 = df_lin[df_lin$lineage == "lineage2",]$duet_scaled +lin3 = df_lin[df_lin$lineage == "lineage3",]$duet_scaled +lin4 = df_lin[df_lin$lineage == "lineage4",]$duet_scaled + +ks.test(lin1, lin4) + + +ks.test(df_lin$duet_scaled[df_lin$lineage == "lineage1"] + , df_lin$duet_scaled[df_lin$lineage == "lineage2"]) + +#======================================================================= +my_lineages = levels(factor(df_lin$lineage)); my_lineages +#======================================================================= +# Loop +#===================== +# Lineage 1 comparisons +#===================== +my_lin1 = "lineage1" +#my_lineages_comp_l1 = c("lineage2", "lineage3", "lineage4") +my_lineages_comp_l1 = my_lineages[-match(my_lin1, my_lineages)] + +ks_df_l1 = data.frame() + +for (i in my_lineages_comp_l1){ + cat(i) + l1_df = data.frame(group = NA, method = NA, ks_statistic = NA, ks_pvalue = NA) + + lineage_comp = paste0(my_lin1, " vs ", i) + ks_method = ks.test(df_lin$duet_scaled[df_lin$lineage == my_lin1] + , df_lin$duet_scaled[df_lin$lineage == i])$method + + ks_statistic = ks.test(df_lin$duet_scaled[df_lin$lineage == my_lin1] + , df_lin$duet_scaled[df_lin$lineage == i])$statistic + + ks_pvalue = ks.test(df_lin$duet_scaled[df_lin$lineage == my_lin1] + , df_lin$duet_scaled[df_lin$lineage == i])$p.value + + # print(c(lineage_comp, ks_method, ks_statistic[[1]], ks_pval)) + l1_df$group = lineage_comp + l1_df$method = ks_method + l1_df$ks_statistic = ks_statistic[[1]] + l1_df$ks_pvalue = ks_pvalue + + print(l1_df) + ks_df_l1 = rbind(ks_df_l1,l1_df) + +} +##################################################################### + +#===================== +# Lineage 2 comparisons +#===================== +my_lin2 = "lineage2" +#my_lineages_comp_l2 = c("lineage1", lineage3", "lineage4") +my_lineages_comp_l2 = my_lineages[-match(my_lin2, my_lineages)] + +ks_df_l2 = data.frame() + +for (i in my_lineages_comp_l2){ + cat(i) + l2_df = data.frame(group = NA, method = NA, ks_statistic = NA, ks_pvalue = NA) + + lineage_comp = paste0(my_lin2, " vs ", i) + ks_method = ks.test(df_lin$duet_scaled[df_lin$lineage == my_lin2] + , df_lin$duet_scaled[df_lin$lineage == i])$method + + ks_statistic = ks.test(df_lin$duet_scaled[df_lin$lineage == my_lin2] + , df_lin$duet_scaled[df_lin$lineage == i])$statistic + + ks_pvalue = ks.test(df_lin$duet_scaled[df_lin$lineage == my_lin2] + , df_lin$duet_scaled[df_lin$lineage == i])$p.value + + # print(c(lineage_comp, ks_method, ks_statistic[[1]], ks_pval)) + l2_df$group = lineage_comp + l2_df$method = ks_method + l2_df$ks_statistic = ks_statistic[[1]] + l2_df$ks_pvalue = ks_pvalue + + print(l2_df) + ks_df_l2 = rbind(ks_df_l2, l2_df) + +} + +#===================== +# Lineage 3 comparisons +#===================== +my_lin3 = "lineage3" +#my_lineages_comp_l3 = c("lineage1", lineage2", "lineage4") +my_lineages_comp_l3 = my_lineages[-match(my_lin3, my_lineages)] + +ks_df_l3 = data.frame() + +for (i in my_lineages_comp_l3){ + cat(i) + l3_df = data.frame(group = NA, method = NA, ks_statistic = NA, ks_pvalue = NA) + + lineage_comp = paste0(my_lin3, " vs ", i) + ks_method = ks.test(df_lin$duet_scaled[df_lin$lineage == my_lin3] + , df_lin$duet_scaled[df_lin$lineage == i])$method + + ks_statistic = ks.test(df_lin$duet_scaled[df_lin$lineage == my_lin3] + , df_lin$duet_scaled[df_lin$lineage == i])$statistic + + ks_pvalue = ks.test(df_lin$duet_scaled[df_lin$lineage == my_lin3] + , df_lin$duet_scaled[df_lin$lineage == i])$p.value + + # print(c(lineage_comp, ks_method, ks_statistic[[1]], ks_pval)) + l3_df$group = lineage_comp + l3_df$method = ks_method + l3_df$ks_statistic = ks_statistic[[1]] + l3_df$ks_pvalue = ks_pvalue + + print(l3_df) + ks_df_l3 = rbind(ks_df_l3, l3_df) + +} +#################################################################### +# combine all three ks_dfs +n_dfs = 3 +if ( all.equal(nrow(ks_df_l1), nrow(ks_df_l2), nrow(ks_df_l3)) && + all.equal(ncol(ks_df_l1), ncol(ks_df_l2), ncol(ks_df_l3)) ){ + cat("\nPASS: Calculating expected rows and cols for sanity checks on combined_dfs") + expected_rows = nrow(ks_df_l1) * n_dfs + expected_cols = ncol(ks_df_l1) + ks_df_combined = rbind(ks_df_l1, ks_df_l2, ks_df_l3) + if ( nrow(ks_df_combined) == expected_rows && ncol(ks_df_combined) == expected_cols ){ + cat("\nPASS: combined df successfully created" + , "\nnrow combined_df:", nrow(ks_df_combined) + , "\nncol combined_df:", ncol(ks_df_combined)) + } + else{ + cat("\nFAIL: Dim mismatch" + , "\nExpected rows:", expected_rows + , "\nGot:", nrow(ks_df_combined) + , "\nExpected cols:", expected_cols + , "\nGot:", ncol(ks_df_combined)) + } +}else{ + cat("\nFAIL: Could not generate expected_rows and/or expected_cols" + , "\nCheck hardcoded value of n_dfs") +} + +#======================================================================= +# formatting +#======================================================================= +# add total_n number +ks_df_combined$total_samples_analysed = nrow(df_lin) + +# adding pvalue_signif +ks_df_combined$pvalue_signif = ks_df_combined$ks_pvalue +str(ks_df_combined$pvalue_signif) + +ks_df_combined = dplyr::mutate(ks_df_combined + , pvalue_signif = case_when(pvalue_signif == 0.05 ~ "." + , pvalue_signif <=0.0001 ~ '****' + , pvalue_signif <=0.001 ~ '***' + , pvalue_signif <=0.01 ~ '**' + , pvalue_signif <0.05 ~ '*' + , TRUE ~ 'ns')) + +# Remove duplicates +rows_to_remove = c("lineage2 vs lineage1", "lineage3 vs lineage1", "lineage3 vs lineage2") + +ks_df_combined_f = ks_df_combined[-match(rows_to_remove, ks_df_combined$group),] + +#======================================================================= +#****************** +# write output file: KS test +#****************** +cat("Output of KS test bt lineage:", ks_lineage) +write.csv(ks_df_combined_f, ks_lineage, row.names = FALSE) diff --git a/scripts/ks_test_dr_PS.R b/scripts/ks_test_dr_PS.R new file mode 100755 index 0000000..759a328 --- /dev/null +++ b/scripts/ks_test_dr_PS.R @@ -0,0 +1,272 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: KS test for PS/DUET lineage distributions +#======================================================================= +#======================================================================= +# working dir and loading libraries +getwd() +setwd("~/git/LSHTM_analysis/scripts/") +getwd() + +#source("/plotting/Header_TT.R") +#source("../barplot_colour_function.R") +#require(data.table) +source("plotting/combining_dfs_plotting.R") +# should return the following dfs, directories and variables + +# PS combined: +# 1) merged_df2 +# 2) merged_df2_comp +# 3) merged_df3 +# 4) merged_df3_comp + +# LIG combined: +# 5) merged_df2_lig +# 6) merged_df2_comp_lig +# 7) merged_df3_lig +# 8) merged_df3_comp_lig + +# 9) my_df_u +# 10) my_df_u_lig + +cat(paste0("Directories imported:" + , "\ndatadir:", datadir + , "\nindir:", indir + , "\noutdir:", outdir + , "\nplotdir:", plotdir)) + +cat(paste0("Variables imported:" + , "\ndrug:", drug + , "\ngene:", gene + , "\ngene_match:", gene_match + , "\nAngstrom symbol:", angstroms_symbol + , "\nNo. of duplicated muts:", dup_muts_nu + , "\nNA count for ORs:", na_count + , "\nNA count in df2:", na_count_df2 + , "\nNA count in df3:", na_count_df3)) + + +#============= +# Output +#============= +# ks test by lineage +ks_lineage_drug_muts = paste0(outdir, "/KS_lineage_drug_muts.csv") + +########################### +# Data for stats +# you need merged_df2 or merged_df2_comp +# since this is one-many relationship +# i.e the same SNP can belong to multiple lineages +# using the _comp dataset means +# we lose some muts and at this level, we should use +# as much info as available, hence use df with NA +########################### + +# REASSIGNMENT +my_df = merged_df2 + +# delete variables not required +rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) +rm(merged_df2_lig, merged_df2_comp_lig, merged_df3_lig, merged_df3_comp_lig, my_df_u, my_df_u_lig) + +# quick checks +colnames(my_df) +str(my_df) + +# Ensure correct data type in columns to plot: need to be factor +#is.factor(my_df$lineage) +#my_df$lineage = as.factor(my_df$lineage) +#is.factor(my_df$lineage) +######################################################################## +table(my_df$lineage); str(my_df$lineage) + +# subset only lineages1-4 +sel_lineages = c("lineage1" + , "lineage2" + , "lineage3" + , "lineage4") + +# subset selected lineages +df_lin = subset(my_df, subset = lineage %in% sel_lineages) + +#============== +# dr_muts_col +#============== +table(df_lin$mutation_info); str(df_lin$mutation_info) + +# subset df with dr muts only +df_lin_dr = subset(df_lin, mutation_info == dr_muts_col) +table(df_lin_dr$mutation_info) + +#============== +# other_muts_col +#============== +#df_lin_other = subset(df_lin, mutation_info == other_muts_col) +#table(df_lin_other$mutation_info) + +#======================================================================= +# individual: CHECKS +ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == "lineage1"] + , df_lin_dr$duet_scaled[df_lin_dr$lineage == "lineage2"]) + +#======================================================================= +my_lineages = levels(factor(df_lin_dr$lineage)); my_lineages +#======================================================================= +# Loop +#===================== +# Lineage 1 comparisons +#===================== +my_lin1 = "lineage1" +#my_lineages_comp_l1 = c("lineage2", "lineage3", "lineage4") +my_lineages_comp_l1 = my_lineages[-match(my_lin1, my_lineages)] + +ks_df_l1 = data.frame() + +for (i in my_lineages_comp_l1){ + cat(i) + l1_df = data.frame(group = NA, method = NA, ks_statistic = NA, ks_pvalue = NA) + + lineage_comp = paste0(my_lin1, " vs ", i) + ks_method = ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == my_lin1] + , df_lin_dr$duet_scaled[df_lin_dr$lineage == i])$method + + ks_statistic = ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == my_lin1] + , df_lin_dr$duet_scaled[df_lin_dr$lineage == i])$statistic + + ks_pvalue = ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == my_lin1] + , df_lin_dr$duet_scaled[df_lin_dr$lineage == i])$p.value + + # print(c(lineage_comp, ks_method, ks_statistic[[1]], ks_pval)) + l1_df$group = lineage_comp + l1_df$method = ks_method + l1_df$ks_statistic = ks_statistic[[1]] + l1_df$ks_pvalue = ks_pvalue + + print(l1_df) + ks_df_l1 = rbind(ks_df_l1,l1_df) + +} +##################################################################### + +#===================== +# Lineage 2 comparisons +#===================== +my_lin2 = "lineage2" +#my_lineages_comp_l2 = c("lineage1", lineage3", "lineage4") +my_lineages_comp_l2 = my_lineages[-match(my_lin2, my_lineages)] + +ks_df_l2 = data.frame() + +for (i in my_lineages_comp_l2){ + cat(i) + l2_df = data.frame(group = NA, method = NA, ks_statistic = NA, ks_pvalue = NA) + + lineage_comp = paste0(my_lin2, " vs ", i) + ks_method = ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == my_lin2] + , df_lin_dr$duet_scaled[df_lin_dr$lineage == i])$method + + ks_statistic = ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == my_lin2] + , df_lin_dr$duet_scaled[df_lin_dr$lineage == i])$statistic + + ks_pvalue = ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == my_lin2] + , df_lin_dr$duet_scaled[df_lin_dr$lineage == i])$p.value + + # print(c(lineage_comp, ks_method, ks_statistic[[1]], ks_pval)) + l2_df$group = lineage_comp + l2_df$method = ks_method + l2_df$ks_statistic = ks_statistic[[1]] + l2_df$ks_pvalue = ks_pvalue + + print(l2_df) + ks_df_l2 = rbind(ks_df_l2, l2_df) + +} + +#===================== +# Lineage 3 comparisons +#===================== +my_lin3 = "lineage3" +#my_lineages_comp_l3 = c("lineage1", lineage2", "lineage4") +my_lineages_comp_l3 = my_lineages[-match(my_lin3, my_lineages)] + +ks_df_l3 = data.frame() + +for (i in my_lineages_comp_l3){ + cat(i) + l3_df = data.frame(group = NA, method = NA, ks_statistic = NA, ks_pvalue = NA) + + lineage_comp = paste0(my_lin3, " vs ", i) + ks_method = ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == my_lin3] + , df_lin_dr$duet_scaled[df_lin_dr$lineage == i])$method + + ks_statistic = ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == my_lin3] + , df_lin_dr$duet_scaled[df_lin_dr$lineage == i])$statistic + + ks_pvalue = ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == my_lin3] + , df_lin_dr$duet_scaled[df_lin_dr$lineage == i])$p.value + + # print(c(lineage_comp, ks_method, ks_statistic[[1]], ks_pval)) + l3_df$group = lineage_comp + l3_df$method = ks_method + l3_df$ks_statistic = ks_statistic[[1]] + l3_df$ks_pvalue = ks_pvalue + + print(l3_df) + ks_df_l3 = rbind(ks_df_l3, l3_df) + +} +#################################################################### +# combine all three ks_dfs +n_dfs = 3 +if ( all.equal(nrow(ks_df_l1), nrow(ks_df_l2), nrow(ks_df_l3)) && + all.equal(ncol(ks_df_l1), ncol(ks_df_l2), ncol(ks_df_l3)) ){ + cat("\nPASS: Calculating expected rows and cols for sanity checks on combined_dfs") + expected_rows = nrow(ks_df_l1) * n_dfs + expected_cols = ncol(ks_df_l1) + ks_df_combined = rbind(ks_df_l1, ks_df_l2, ks_df_l3) + if ( nrow(ks_df_combined) == expected_rows && ncol(ks_df_combined) == expected_cols ){ + cat("\nPASS: combined df successfully created" + , "\nnrow combined_df:", nrow(ks_df_combined) + , "\nncol combined_df:", ncol(ks_df_combined)) + } + else{ + cat("\nFAIL: Dim mismatch" + , "\nExpected rows:", expected_rows + , "\nGot:", nrow(ks_df_combined) + , "\nExpected cols:", expected_cols + , "\nGot:", ncol(ks_df_combined)) + } +}else{ + cat("\nFAIL: Could not generate expected_rows and/or expected_cols" + , "\nCheck hardcoded value of n_dfs") +} + +#======================================================================= +# formatting +#======================================================================= +# add total_n number +ks_df_combined$n_drug_muts = nrow(df_lin_dr) + +# adding pvalue_signif +ks_df_combined$pvalue_signif = ks_df_combined$ks_pvalue +str(ks_df_combined$pvalue_signif) + +ks_df_combined = dplyr::mutate(ks_df_combined + , pvalue_signif = case_when(pvalue_signif == 0.05 ~ "." + , pvalue_signif <=0.0001 ~ '****' + , pvalue_signif <=0.001 ~ '***' + , pvalue_signif <=0.01 ~ '**' + , pvalue_signif <0.05 ~ '*' + , TRUE ~ 'ns')) + +# Remove duplicates +rows_to_remove = c("lineage2 vs lineage1", "lineage3 vs lineage1", "lineage3 vs lineage2") + +ks_df_combined_f = ks_df_combined[-match(rows_to_remove, ks_df_combined$group),] + +#======================================================================= +#****************** +# write output file: KS test +#****************** +cat("Output of KS test by lineage for dr_muts:", ks_lineage_drug_muts) +write.csv(ks_df_combined_f, ks_lineage_drug_muts, row.names = FALSE) diff --git a/scripts/ks_test_dr_others_PS.R b/scripts/ks_test_dr_others_PS.R new file mode 100755 index 0000000..4575c5e --- /dev/null +++ b/scripts/ks_test_dr_others_PS.R @@ -0,0 +1,170 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: KS test for PS/DUET lineage distributions +#======================================================================= +#======================================================================= +# working dir and loading libraries +getwd() +setwd("~/git/LSHTM_analysis/scripts/") +getwd() + +#source("/plotting/Header_TT.R") +#source("../barplot_colour_function.R") +#require(data.table) +source("plotting/combining_dfs_plotting.R") +# should return the following dfs, directories and variables + +# PS combined: +# 1) merged_df2 +# 2) merged_df2_comp +# 3) merged_df3 +# 4) merged_df3_comp + +# LIG combined: +# 5) merged_df2_lig +# 6) merged_df2_comp_lig +# 7) merged_df3_lig +# 8) merged_df3_comp_lig + +# 9) my_df_u +# 10) my_df_u_lig + +cat(paste0("Directories imported:" + , "\ndatadir:", datadir + , "\nindir:", indir + , "\noutdir:", outdir + , "\nplotdir:", plotdir)) + +cat(paste0("Variables imported:" + , "\ndrug:", drug + , "\ngene:", gene + , "\ngene_match:", gene_match + , "\nAngstrom symbol:", angstroms_symbol + , "\nNo. of duplicated muts:", dup_muts_nu + , "\nNA count for ORs:", na_count + , "\nNA count in df2:", na_count_df2 + , "\nNA count in df3:", na_count_df3)) + + +#============= +# Output +#============= +# ks test by lineage +ks_dr_others_lineage = paste0(outdir, "/KS_lineage_dr_vs_others.csv") + +########################### +# Data for stats +# you need merged_df2 or merged_df2_comp +# since this is one-many relationship +# i.e the same SNP can belong to multiple lineages +# using the _comp dataset means +# we lose some muts and at this level, we should use +# as much info as available, hence use df with NA +########################### + +# REASSIGNMENT +my_df = merged_df2 + +# delete variables not required +rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) +rm(merged_df2_lig, merged_df2_comp_lig, merged_df3_lig, merged_df3_comp_lig, my_df_u, my_df_u_lig) + +# quick checks +colnames(my_df) +str(my_df) + +# Ensure correct data type in columns to plot: need to be factor +#is.factor(my_df$lineage) +#my_df$lineage = as.factor(my_df$lineage) +#is.factor(my_df$lineage) +######################################################################## +table(my_df$lineage); str(my_df$lineage) + +# subset only lineages1-4 +sel_lineages = c("lineage1" + , "lineage2" + , "lineage3" + , "lineage4") + +# subset selected lineages +df_lin = subset(my_df, subset = lineage %in% sel_lineages) + +#============== +# dr_muts_col +#============== +table(df_lin$mutation_info); str(df_lin$mutation_info) + +# subset df with dr muts only +df_lin_dr = subset(df_lin, mutation_info == dr_muts_col) +table(df_lin_dr$mutation_info) + +#============== +# other_muts_col +#============== +df_lin_other = subset(df_lin, mutation_info == other_muts_col) +table(df_lin_other$mutation_info) + +#======================================================================= +# individual: CHECKS +ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == "lineage1"] + , df_lin_other$duet_scaled[df_lin_other$lineage == "lineage1"]) + +#======================================================================= +my_lineages = levels(factor(df_lin$lineage)); my_lineages + +ks_df = data.frame() + +for (i in my_lineages){ + cat(i) + df = data.frame(lineage = NA, method = NA, ks_statistic = NA, ks_pvalue = NA) + + lineage_comp = i + ks_method = ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == i] + , df_lin_other$duet_scaled[df_lin_other$lineage == i])$method + + ks_statistic = ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == i] + , df_lin_other$duet_scaled[df_lin_other$lineage == i])$statistic + + ks_pvalue = ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == i] + , df_lin_other$duet_scaled[df_lin_other$lineage == i])$p.value + + # print(c(lineage_comp, ks_method, ks_statistic[[1]], ks_pval)) + df$lineage = lineage_comp + df$method = ks_method + df$ks_statistic = ks_statistic[[1]] + df$ks_pvalue = ks_pvalue + + print(df) + ks_df = rbind(ks_df,df) + +} + +#======================================================================= +# formatting +#======================================================================= +# add total_n number +ks_df$n_drug = nrow(df_lin_dr) +ks_df$n_others = nrow(df_lin_other) + +# add group +ks_df$group = "Drug vs Others" + +str(ks_df) +ks_df$pvalue_signif = ks_df$ks_pvalue +str(ks_df$pvalue_signif) + +# adding pvalue_signif +ks_df = dplyr::mutate(ks_df, pvalue_signif = case_when(pvalue_signif == 0.05 ~ "." + , pvalue_signif <=0.0001 ~ '****' + , pvalue_signif <=0.001 ~ '***' + , pvalue_signif <=0.01 ~ '**' + , pvalue_signif <0.05 ~ '*' + , TRUE ~ 'ns')) + + +#======================================================================= +#****************** +# write output file: KS test +#****************** +cat("Output of KS test bt lineage:", ks_dr_others_lineage ) +write.csv(ks_df, ks_dr_others_lineage, row.names = FALSE)