#!/usr/bin/env Rscript ######################################################### # TASK: KS test for PS/DUET lineage distributions #======================================================================= #======================================================================= # working dir and loading libraries getwd() setwd("~/git/LSHTM_analysis/scripts/") getwd() #source("/plotting/Header_TT.R") #source("../barplot_colour_function.R") #require(data.table) source("plotting/combining_dfs_plotting.R") # should return the following dfs, directories and variables # PS combined: # 1) merged_df2 # 2) merged_df2_comp # 3) merged_df3 # 4) merged_df3_comp # LIG combined: # 5) merged_df2_lig # 6) merged_df2_comp_lig # 7) merged_df3_lig # 8) merged_df3_comp_lig # 9) my_df_u # 10) my_df_u_lig cat(paste0("Directories imported:" , "\ndatadir:", datadir , "\nindir:", indir , "\noutdir:", outdir , "\nplotdir:", plotdir)) cat(paste0("Variables imported:" , "\ndrug:", drug , "\ngene:", gene , "\ngene_match:", gene_match , "\nAngstrom symbol:", angstroms_symbol , "\nNo. of duplicated muts:", dup_muts_nu , "\nNA count for ORs:", na_count , "\nNA count in df2:", na_count_df2 , "\nNA count in df3:", na_count_df3)) ########################### # Data for stats # you need merged_df2 or merged_df2_comp # since this is one-many relationship # i.e the same SNP can belong to multiple lineages # using the _comp dataset means # we lose some muts and at this level, we should use # as much info as available, hence use df with NA ########################### # REASSIGNMENT my_df = merged_df2 # delete variables not required rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) # quick checks colnames(my_df) str(my_df) # Ensure correct data type in columns to plot: need to be factor is.factor(my_df$lineage) my_df$lineage = as.factor(my_df$lineage) is.factor(my_df$lineage) table(my_df$mutation_info); str(my_df$mutation_info) # subset df with dr muts only my_df_dr = subset(my_df, mutation_info == "dr_mutations_pyrazinamide") table(my_df_dr$mutation_info) # stats for all muts and dr_muts # 1) for all muts # 2) for dr_muts #=========================== table(my_df$lineage); str(my_df$lineage) table(my_df_dr$lineage); str(my_df_dr$lineage) # subset only lineages1-4 sel_lineages = c("lineage1" , "lineage2" , "lineage3" , "lineage4") # subset and refactor: all muts df_lin = subset(my_df, subset = lineage %in% sel_lineages) df_lin$lineage = factor(df_lin$lineage) # subset and refactor: dr muts df_lin_dr = subset(my_df_dr, subset = lineage %in% sel_lineages) df_lin_dr$lineage = factor(df_lin_dr$lineage) #{RESULT: No of samples within lineage} table(df_lin$lineage) table(df_lin_dr$lineage) #{Result: No. of unique mutations the 4 lineages contribute to} length(unique(df_lin$mutationinformation)) length(unique(df_lin_dr$mutationinformation)) # COMPARING DISTRIBUTIONS #================ # ALL mutations #================= head(df_lin$lineage) df_lin$lineage = as.character(df_lin$lineage) lin1 = df_lin[df_lin$lineage == "lineage1",]$duet_scaled lin2 = df_lin[df_lin$lineage == "lineage2",]$duet_scaled lin3 = df_lin[df_lin$lineage == "lineage3",]$duet_scaled lin4 = df_lin[df_lin$lineage == "lineage4",]$duet_scaled # ks test lin12 = ks.test(lin1,lin2) lin12_df = as.data.frame(cbind(lin12$data.name, lin12$p.value)) lin13 = ks.test(lin1,lin3) lin13_df = as.data.frame(cbind(lin13$data.name, lin13$p.value)) lin14 = ks.test(lin1,lin4) lin14_df = as.data.frame(cbind(lin14$data.name, lin14$p.value)) lin23 = ks.test(lin2,lin3) lin23_df = as.data.frame(cbind(lin23$data.name, lin23$p.value)) lin24 = ks.test(lin2,lin4) lin24_df = as.data.frame(cbind(lin24$data.name, lin24$p.value)) lin34 = ks.test(lin3,lin4) lin34_df = as.data.frame(cbind(lin34$data.name, lin34$p.value)) ks_results_all = rbind(lin12_df , lin13_df , lin14_df , lin23_df , lin24_df , lin34_df) #p-value < 2.2e-16 rm(lin12, lin12_df , lin13, lin13_df , lin14, lin14_df , lin23, lin23_df , lin24, lin24_df , lin34, lin34_df) #================ # DRUG mutations #================= head(df_lin_dr$lineage) df_lin_dr$lineage = as.character(df_lin_dr$lineage) lin1_dr = df_lin_dr[df_lin_dr$lineage == "lineage1",]$duet_scaled lin2_dr = df_lin_dr[df_lin_dr$lineage == "lineage2",]$duet_scaled lin3_dr = df_lin_dr[df_lin_dr$lineage == "lineage3",]$duet_scaled lin4_dr = df_lin_dr[df_lin_dr$lineage == "lineage4",]$duet_scaled # ks test: dr muts lin12_dr = ks.test(lin1_dr,lin2_dr) lin12_df_dr = as.data.frame(cbind(lin12_dr$data.name, lin12_dr$p.value)) lin13_dr = ks.test(lin1_dr,lin3_dr) lin13_df_dr = as.data.frame(cbind(lin13_dr$data.name, lin13_dr$p.value)) lin14_dr = ks.test(lin1_dr,lin4_dr) lin14_df_dr = as.data.frame(cbind(lin14_dr$data.name, lin14_dr$p.value)) lin23_dr = ks.test(lin2_dr,lin3_dr) lin23_df_dr = as.data.frame(cbind(lin23_dr$data.name, lin23_dr$p.value)) lin24_dr = ks.test(lin2_dr,lin4_dr) lin24_df_dr = as.data.frame(cbind(lin24_dr$data.name, lin24_dr$p.value)) lin34_dr = ks.test(lin3_dr,lin4_dr) lin34_df_dr = as.data.frame(cbind(lin34_dr$data.name, lin34_dr$p.value)) ks_results_dr = rbind(lin12_df_dr , lin13_df_dr , lin14_df_dr , lin23_df_dr , lin24_df_dr , lin34_df_dr) ks_results_combined = cbind(ks_results_all, ks_results_dr) my_colnames = c("Lineage_comparisons" , paste0("All_mutations n=", nrow(df_lin)) , paste0("Drug_associated_mutations n=", nrow(df_lin_dr))) my_colnames # select the output columns ks_results_combined_f = ks_results_combined[,c(1,2,4)] colnames(ks_results_combined_f) = my_colnames ks_results_combined_f #============= # write output file #============= ks_results = paste0(outdir,"/results/ks_results.csv") write.csv(ks_results_combined_f, ks_results, row.names = F)