#!/usr/bin/env Rscript ######################################################### # TASK: KS test for PS/DUET lineage distributions #======================================================================= #======================================================================= # working dir and loading libraries getwd() setwd("~/git/LSHTM_analysis/scripts/") getwd() #source("/plotting/Header_TT.R") #source("../barplot_colour_function.R") #require(data.table) source("plotting/combining_dfs_plotting.R") # should return the following dfs, directories and variables # PS combined: # 1) merged_df2 # 2) merged_df2_comp # 3) merged_df3 # 4) merged_df3_comp # LIG combined: # 5) merged_df2_lig # 6) merged_df2_comp_lig # 7) merged_df3_lig # 8) merged_df3_comp_lig # 9) my_df_u # 10) my_df_u_lig cat(paste0("Directories imported:" , "\ndatadir:", datadir , "\nindir:", indir , "\noutdir:", outdir , "\nplotdir:", plotdir)) cat(paste0("Variables imported:" , "\ndrug:", drug , "\ngene:", gene , "\ngene_match:", gene_match , "\nAngstrom symbol:", angstroms_symbol , "\nNo. of duplicated muts:", dup_muts_nu , "\nNA count for ORs:", na_count , "\nNA count in df2:", na_count_df2 , "\nNA count in df3:", na_count_df3)) #============= # Output #============= # ks test by lineage ks_lineage_drug_muts = paste0(outdir, "/KS_lineage_drug_muts.csv") ########################### # Data for stats # you need merged_df2 or merged_df2_comp # since this is one-many relationship # i.e the same SNP can belong to multiple lineages # using the _comp dataset means # we lose some muts and at this level, we should use # as much info as available, hence use df with NA ########################### # REASSIGNMENT my_df = merged_df2 # delete variables not required rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) rm(merged_df2_lig, merged_df2_comp_lig, merged_df3_lig, merged_df3_comp_lig, my_df_u, my_df_u_lig) # quick checks colnames(my_df) str(my_df) # Ensure correct data type in columns to plot: need to be factor #is.factor(my_df$lineage) #my_df$lineage = as.factor(my_df$lineage) #is.factor(my_df$lineage) ######################################################################## table(my_df$lineage); str(my_df$lineage) # subset only lineages1-4 sel_lineages = c("lineage1" , "lineage2" , "lineage3" , "lineage4") # subset selected lineages df_lin = subset(my_df, subset = lineage %in% sel_lineages) #============== # dr_muts_col #============== table(df_lin$mutation_info); str(df_lin$mutation_info) # subset df with dr muts only df_lin_dr = subset(df_lin, mutation_info == dr_muts_col) table(df_lin_dr$mutation_info) #============== # other_muts_col #============== #df_lin_other = subset(df_lin, mutation_info == other_muts_col) #table(df_lin_other$mutation_info) #======================================================================= # individual: CHECKS ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == "lineage1"] , df_lin_dr$duet_scaled[df_lin_dr$lineage == "lineage2"]) #======================================================================= my_lineages = levels(factor(df_lin_dr$lineage)); my_lineages #======================================================================= # Loop #===================== # Lineage 1 comparisons #===================== my_lin1 = "lineage1" #my_lineages_comp_l1 = c("lineage2", "lineage3", "lineage4") my_lineages_comp_l1 = my_lineages[-match(my_lin1, my_lineages)] ks_df_l1 = data.frame() for (i in my_lineages_comp_l1){ cat(i) l1_df = data.frame(group = NA, method = NA, ks_statistic = NA, ks_pvalue = NA) lineage_comp = paste0(my_lin1, " vs ", i) ks_method = ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == my_lin1] , df_lin_dr$duet_scaled[df_lin_dr$lineage == i])$method ks_statistic = ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == my_lin1] , df_lin_dr$duet_scaled[df_lin_dr$lineage == i])$statistic ks_pvalue = ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == my_lin1] , df_lin_dr$duet_scaled[df_lin_dr$lineage == i])$p.value # print(c(lineage_comp, ks_method, ks_statistic[[1]], ks_pval)) l1_df$group = lineage_comp l1_df$method = ks_method l1_df$ks_statistic = ks_statistic[[1]] l1_df$ks_pvalue = ks_pvalue print(l1_df) ks_df_l1 = rbind(ks_df_l1,l1_df) } ##################################################################### #===================== # Lineage 2 comparisons #===================== my_lin2 = "lineage2" #my_lineages_comp_l2 = c("lineage1", lineage3", "lineage4") my_lineages_comp_l2 = my_lineages[-match(my_lin2, my_lineages)] ks_df_l2 = data.frame() for (i in my_lineages_comp_l2){ cat(i) l2_df = data.frame(group = NA, method = NA, ks_statistic = NA, ks_pvalue = NA) lineage_comp = paste0(my_lin2, " vs ", i) ks_method = ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == my_lin2] , df_lin_dr$duet_scaled[df_lin_dr$lineage == i])$method ks_statistic = ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == my_lin2] , df_lin_dr$duet_scaled[df_lin_dr$lineage == i])$statistic ks_pvalue = ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == my_lin2] , df_lin_dr$duet_scaled[df_lin_dr$lineage == i])$p.value # print(c(lineage_comp, ks_method, ks_statistic[[1]], ks_pval)) l2_df$group = lineage_comp l2_df$method = ks_method l2_df$ks_statistic = ks_statistic[[1]] l2_df$ks_pvalue = ks_pvalue print(l2_df) ks_df_l2 = rbind(ks_df_l2, l2_df) } #===================== # Lineage 3 comparisons #===================== my_lin3 = "lineage3" #my_lineages_comp_l3 = c("lineage1", lineage2", "lineage4") my_lineages_comp_l3 = my_lineages[-match(my_lin3, my_lineages)] ks_df_l3 = data.frame() for (i in my_lineages_comp_l3){ cat(i) l3_df = data.frame(group = NA, method = NA, ks_statistic = NA, ks_pvalue = NA) lineage_comp = paste0(my_lin3, " vs ", i) ks_method = ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == my_lin3] , df_lin_dr$duet_scaled[df_lin_dr$lineage == i])$method ks_statistic = ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == my_lin3] , df_lin_dr$duet_scaled[df_lin_dr$lineage == i])$statistic ks_pvalue = ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == my_lin3] , df_lin_dr$duet_scaled[df_lin_dr$lineage == i])$p.value # print(c(lineage_comp, ks_method, ks_statistic[[1]], ks_pval)) l3_df$group = lineage_comp l3_df$method = ks_method l3_df$ks_statistic = ks_statistic[[1]] l3_df$ks_pvalue = ks_pvalue print(l3_df) ks_df_l3 = rbind(ks_df_l3, l3_df) } #################################################################### # combine all three ks_dfs n_dfs = 3 if ( all.equal(nrow(ks_df_l1), nrow(ks_df_l2), nrow(ks_df_l3)) && all.equal(ncol(ks_df_l1), ncol(ks_df_l2), ncol(ks_df_l3)) ){ cat("\nPASS: Calculating expected rows and cols for sanity checks on combined_dfs") expected_rows = nrow(ks_df_l1) * n_dfs expected_cols = ncol(ks_df_l1) ks_df_combined = rbind(ks_df_l1, ks_df_l2, ks_df_l3) if ( nrow(ks_df_combined) == expected_rows && ncol(ks_df_combined) == expected_cols ){ cat("\nPASS: combined df successfully created" , "\nnrow combined_df:", nrow(ks_df_combined) , "\nncol combined_df:", ncol(ks_df_combined)) } else{ cat("\nFAIL: Dim mismatch" , "\nExpected rows:", expected_rows , "\nGot:", nrow(ks_df_combined) , "\nExpected cols:", expected_cols , "\nGot:", ncol(ks_df_combined)) } }else{ cat("\nFAIL: Could not generate expected_rows and/or expected_cols" , "\nCheck hardcoded value of n_dfs") } #======================================================================= # formatting #======================================================================= # add total_n number ks_df_combined$n_drug_muts = nrow(df_lin_dr) # adding pvalue_signif ks_df_combined$pvalue_signif = ks_df_combined$ks_pvalue str(ks_df_combined$pvalue_signif) ks_df_combined = dplyr::mutate(ks_df_combined , pvalue_signif = case_when(pvalue_signif == 0.05 ~ "." , pvalue_signif <=0.0001 ~ '****' , pvalue_signif <=0.001 ~ '***' , pvalue_signif <=0.01 ~ '**' , pvalue_signif <0.05 ~ '*' , TRUE ~ 'ns')) # Remove duplicates rows_to_remove = c("lineage2 vs lineage1", "lineage3 vs lineage1", "lineage3 vs lineage2") ks_df_combined_f = ks_df_combined[-match(rows_to_remove, ks_df_combined$group),] #======================================================================= #****************** # write output file: KS test #****************** cat("Output of KS test by lineage for dr_muts:", ks_lineage_drug_muts) write.csv(ks_df_combined_f, ks_lineage_drug_muts, row.names = FALSE)