#!/usr/bin/env Rscript ######################################################### # TASK: KS test for PS/DUET lineage distributions #======================================================================= #======================================================================= # working dir and loading libraries getwd() setwd("~/git/LSHTM_analysis/scripts/") getwd() #source("~/git/LSHTM_analysis/scripts/Header_TT.R") #source("../barplot_colour_function.R") #require(data.table) source("plotting/combining_dfs_plotting.R") # should return the following dfs, directories and variables # PS combined: # 1) merged_df2 # 2) merged_df2_comp # 3) merged_df3 # 4) merged_df3_comp # LIG combined: # 5) merged_df2_lig # 6) merged_df2_comp_lig # 7) merged_df3_lig # 8) merged_df3_comp_lig # 9) my_df_u # 10) my_df_u_lig cat(paste0("Directories imported:" , "\ndatadir:", datadir , "\nindir:", indir , "\noutdir:", outdir , "\nplotdir:", plotdir)) cat(paste0("Variables imported:" , "\ndrug:", drug , "\ngene:", gene , "\ngene_match:", gene_match , "\nAngstrom symbol:", angstroms_symbol , "\nNo. of duplicated muts:", dup_muts_nu , "\nNA count for ORs:", na_count , "\nNA count in df2:", na_count_df2 , "\nNA count in df3:", na_count_df3)) #============= # Output #============= # ks test by lineage ks_dr_others_lineage = paste0(outdir, "/KS_lineage_dr_vs_others.csv") ########################### # Data for stats # you need merged_df2 or merged_df2_comp # since this is one-many relationship # i.e the same SNP can belong to multiple lineages # using the _comp dataset means # we lose some muts and at this level, we should use # as much info as available, hence use df with NA ########################### # REASSIGNMENT my_df = merged_df2 # delete variables not required rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) rm(merged_df2_lig, merged_df2_comp_lig, merged_df3_lig, merged_df3_comp_lig, my_df_u, my_df_u_lig) # quick checks colnames(my_df) str(my_df) # Ensure correct data type in columns to plot: need to be factor #is.factor(my_df$lineage) #my_df$lineage = as.factor(my_df$lineage) #is.factor(my_df$lineage) ######################################################################## table(my_df$lineage); str(my_df$lineage) # subset only lineages1-4 sel_lineages = c("lineage1" , "lineage2" , "lineage3" , "lineage4") # subset selected lineages df_lin = subset(my_df, subset = lineage %in% sel_lineages) #============== # dr_muts_col #============== table(df_lin$mutation_info); str(df_lin$mutation_info) # subset df with dr muts only df_lin_dr = subset(df_lin, mutation_info == dr_muts_col) table(df_lin_dr$mutation_info) #============== # other_muts_col #============== df_lin_other = subset(df_lin, mutation_info == other_muts_col) table(df_lin_other$mutation_info) #======================================================================= # individual: CHECKS ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == "lineage1"] , df_lin_other$duet_scaled[df_lin_other$lineage == "lineage1"]) #======================================================================= my_lineages = levels(factor(df_lin$lineage)); my_lineages ks_df = data.frame() for (i in my_lineages){ cat(i) df = data.frame(lineage = NA, method = NA, ks_statistic = NA, ks_pvalue = NA) lineage_comp = i ks_method = ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == i] , df_lin_other$duet_scaled[df_lin_other$lineage == i])$method ks_statistic = ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == i] , df_lin_other$duet_scaled[df_lin_other$lineage == i])$statistic ks_pvalue = ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == i] , df_lin_other$duet_scaled[df_lin_other$lineage == i])$p.value # print(c(lineage_comp, ks_method, ks_statistic[[1]], ks_pval)) df$lineage = lineage_comp df$method = ks_method df$ks_statistic = ks_statistic[[1]] df$ks_pvalue = ks_pvalue print(df) ks_df = rbind(ks_df,df) } #======================================================================= # formatting #======================================================================= # add total_n number ks_df$n_drug = nrow(df_lin_dr) ks_df$n_others = nrow(df_lin_other) # add group ks_df$group = "Drug vs Others" str(ks_df) ks_df$pvalue_signif = ks_df$ks_pvalue str(ks_df$pvalue_signif) # adding pvalue_signif ks_df = dplyr::mutate(ks_df, pvalue_signif = case_when(pvalue_signif == 0.05 ~ "." , pvalue_signif <=0.0001 ~ '****' , pvalue_signif <=0.001 ~ '***' , pvalue_signif <=0.01 ~ '**' , pvalue_signif <0.05 ~ '*' , TRUE ~ 'ns')) #======================================================================= #****************** # write output file: KS test #****************** cat("Output of KS test bt lineage:", ks_dr_others_lineage ) write.csv(ks_df, ks_dr_others_lineage, row.names = FALSE)