#!/usr/bin/env Rscript source("~/git/LSHTM_analysis/config/embb.R") source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") #======= # output #======= outdir_images = paste0("~/git/Writing/thesis/images/results/", tolower(gene), "/") outdir_stats = paste0(outdir_images,"stats/") ################################################################### # FIXME: ADD distance to NA when SP replies geneL_normal = c("pnca") geneL_na = c("gid", "rpob") geneL_ppi2 = c("alr", "embb", "katg", "rpob") # LigDist_colname # from globals used # ppi2Dist_colname #from globals used # naDist_colname #from globals used common_cols = c("mutationinformation" , "X5uhc_position" , "X5uhc_offset" , "position" , "dst_mode" , "mutation_info_labels" , "sensitivity", dist_columns ) ######################################## categ_cols_to_factor = grep( "_outcome|_info", colnames(merged_df3) ) fact_cols = colnames(merged_df3)[categ_cols_to_factor] if (any(lapply(merged_df3[, fact_cols], class) == "character")){ cat("\nChanging", length(categ_cols_to_factor), "cols to factor") merged_df3[, fact_cols] <- lapply(merged_df3[, fact_cols], as.factor) if (all(lapply(merged_df3[, fact_cols], class) == "factor")){ cat("\nSuccessful: cols changed to factor") } }else{ cat("\nRequested cols aready factors") } cat("\ncols changed to factor are:\n", colnames(merged_df3)[categ_cols_to_factor] ) #################################### # merged_df3: NECESSARY pre-processing ################################### #df3 = merged_df3 plot_cols = c("mutationinformation", "mutation_info_labels", "position", "dst_mode" , all_cols) all_cols = c(common_cols , all_stability_cols , all_affinity_cols , all_conserv_cols) # counting foo = merged_df3[, c("mutationinformation" , "wild_pos" , "position" , "sensitivity" , "avg_lig_affinity" , "avg_lig_affinity_scaled" , "avg_lig_affinity_outcome" , "ligand_distance" , "ligand_affinity_change" , "affinity_scaled" , "ligand_outcome" , "consurf_colour_rev" , "consurf_outcome")] table(foo$consurf_outcome) foo2 = foo[foo$ligand_distance<10,] table(foo2$ligand_outcome) ############################# # wide plots SAV # DRUG length(aa_pos_drug); aa_pos_drug drug = foo[foo$position%in%aa_pos_drug,] drug$wild_pos #CA ############################################### # OR ############################################### # OR plot df3_or = merged_df3 df3_or$maf_percent = df3_or$maf*100 bar = df3_or[, c("mutationinformation" , "wild_pos" , "position" , "sensitivity" , drug , affinity_dist_colnames , "maf_percent" , "or_mychisq" , "pval_fisher" #, "pval_chisq" , "neglog_pval_fisher" , "log10_or_mychisq")] # bar$p_adj_bonferroni = p.adjust(bar$pval_fisher, method = "bonferroni") # bar$signif_bon = bar$p_adj_bonferroni # bar = dplyr::mutate(bar # , signif_bon = case_when(signif_bon == 0.05 ~ "." # , signif_bon <=0.0001 ~ '****' # , signif_bon <=0.001 ~ '***' # , signif_bon <=0.01 ~ '**' # , signif_bon <0.05 ~ '*' # , TRUE ~ 'ns')) bar$p_adj_fdr = p.adjust(bar$pval_fisher, method = "BH") bar$signif_fdr = bar$p_adj_fdr bar = dplyr::mutate(bar , signif_fdr = case_when(signif_fdr == 0.05 ~ "." , signif_fdr <=0.0001 ~ '****' , signif_fdr <=0.001 ~ '***' , signif_fdr <=0.01 ~ '**' , signif_fdr <0.05 ~ '*' , TRUE ~ 'ns')) # sort df bar = bar[order(bar$or_mychisq, decreasing = T), ] bar = bar[, c("mutationinformation" , "wild_pos" , "position" , "sensitivity" , drug , affinity_dist_colnames , "maf_percent" , "or_mychisq" #, "pval_fisher" #, "pval_chisq" #, "neglog_pval_fisher" #, "log10_or_mychisq" #, "signif_bon" , "p_adj_fdr" , "signif_fdr")] table(bar$sensitivity) str(bar) sen = bar[bar$or_mychisq<1,] sen = na.omit(sen) res = bar[bar$or_mychisq>1,] res = na.omit(res) # comp bar_or = bar[!is.na(bar$or_mychisq),] table(bar_or$sensitivity) sen1 = bar_or[bar_or$or_mychisq<1,] # sen and res ~OR res1 = bar_or[bar_or$or_mychisq>1,] # sen and res ~OR # sanity check if (nrow(bar_or) == nrow(sen1) + nrow(res1) ){ cat("\nPASS: df with or successfully sourced" , "\nCalculating % of muts with OR>1") }else{ stop("Abort: df with or numbers mimatch") } # percent for OR muts pc_orR = nrow(res1)/(nrow(sen1) + nrow(res1)); pc_orR cat("\nNo.of DST muts:", nrow(bar_or) , "\nNo of DST (R):", table(bar_or$sensitivity)[[1]] , "\nNo of DST (S):", table(bar_or$sensitivity)[[2]] , "\nNumber of R muts with OR >1 (n = ", nrow(res1),")" , "\nPercentage of muts with OR>1 i.e resistant:" , pc_orR *100 ) table(bar_or$sensitivity) # muts with highest OR head(bar_or$mutationinformation, 10) # sort df bar_or = bar_or[order(bar_or$or_mychisq , bar_or$ligand_distance # bar_or$nca_dist , bar_or$interface_dist , decreasing = T), ] nrow(bar_or) bar_or$drug_site = ifelse(bar_or$position%in%aa_pos_drug, "drug", "no") table(bar_or$drug_site) bar_or$heme_site = ifelse(bar_or$position%in%aa_pos_hem, "heme", "no") table(bar_or$heme_site) top10_or = bar_or[1:10,] top10_or write.csv(bar_or, paste0(outdir_stats, "katg_OR_10.csv")) # are these active sites top10_or$position[top10_or$position%in%active_aa_pos] # maf bar_maf = bar_or[order(bar_or$maf_percent , bar_or$ligand_distance # bar_or$nca_dist , bar_or$interface_dist , decreasing = T), ] head(bar_maf) ######################################################### # closest most sig bar_or_lig = bar_or[bar_or$ligand_distance<10,] bar_or_lig = bar_or_lig[order(bar_or_lig$ligand_distance, -bar_or_lig$or_mychisq), ] table(bar_or_lig$signif_fdr) bar_or_ppi = bar_or[bar_or$interface_dist<10,] bar_or_ppi = bar_or_ppi[order(bar_or_ppi$interface_dist, -bar_or_ppi$or_mychisq), ] table(bar_or_ppi$signif_fdr)