diff --git a/scripts/plotting/plotting_thesis/ORandSNP_results.R b/scripts/plotting/plotting_thesis/ORandSNP_results.R new file mode 100644 index 0000000..127c441 --- /dev/null +++ b/scripts/plotting/plotting_thesis/ORandSNP_results.R @@ -0,0 +1,251 @@ +#!/usr/bin/env Rscript +#source("~/git/LSHTM_analysis/config/alr.R") +source("~/git/LSHTM_analysis/config/embb.R") +#source("~/git/LSHTM_analysis/config/katg.R") +#source("~/git/LSHTM_analysis/config/gid.R") +#source("~/git/LSHTM_analysis/config/pnca.R") +#source("~/git/LSHTM_analysis/config/rpob.R") + +# get plottting dfs +source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") + +source("~/git/LSHTM_analysis/scripts/plotting/plotting_colnames.R") +#======= +# output +#======= +outdir_images = paste0("~/git/Writing/thesis/images/results/", tolower(gene), "/") +################################################################### +# FIXME: ADD distance to NA when SP replies + +geneL_normal = c("pnca") +geneL_na = c("gid", "rpob") +geneL_ppi2 = c("alr", "embb", "katg", "rpob") + +# LigDist_colname # from globals used +# ppi2Dist_colname #from globals used +# naDist_colname #from globals used + +common_cols = c("mutationinformation" + , "X5uhc_position" + , "X5uhc_offset" + , "position" + , "dst_mode" + , "mutation_info_labels" + , "sensitivity", dist_columns ) + + +######################################## +categ_cols_to_factor = grep( "_outcome|_info", colnames(merged_df3) ) +fact_cols = colnames(merged_df3)[categ_cols_to_factor] + +if (any(lapply(merged_df3[, fact_cols], class) == "character")){ + cat("\nChanging", length(categ_cols_to_factor), "cols to factor") + merged_df3[, fact_cols] <- lapply(merged_df3[, fact_cols], as.factor) + if (all(lapply(merged_df3[, fact_cols], class) == "factor")){ + cat("\nSuccessful: cols changed to factor") + } +}else{ + cat("\nRequested cols aready factors") +} + +cat("\ncols changed to factor are:\n", colnames(merged_df3)[categ_cols_to_factor] ) + +#################################### +# merged_df3: NECESSARY pre-processing +################################### +#df3 = merged_df3 +plot_cols = c("mutationinformation", "mutation_info_labels", "position", "dst_mode" + , all_cols) + +all_cols = c(common_cols + , all_stability_cols + , all_affinity_cols + , all_conserv_cols) + + +# counting +foo = merged_df3[, c("mutationinformation" + , "wild_pos" + , "position" + , "sensitivity" + , "avg_lig_affinity" + , "avg_lig_affinity_scaled" + , "avg_lig_affinity_outcome" + , "ligand_distance" + , "ligand_affinity_change" + , "affinity_scaled" + , "ligand_outcome" + , "consurf_colour_rev" + , "consurf_outcome")] + +table(foo$consurf_outcome) + +foo2 = foo[foo$ligand_distance<10,] + +table(foo2$ligand_outcome) + +############################# +# wide plots SNP +# DRUG +length(aa_pos_drug); aa_pos_drug +drug = foo[foo$position%in%aa_pos_drug,] +drug$wild_pos + +length(unique(drug$position)); unique(drug$position) +table(drug$position) + +drug$mutationinformation[drug$position==306] +drug$mutationinformation[drug$position==303] + +#CA +length(aa_pos_ca); aa_pos_ca +ca = foo[foo$position%in%aa_pos_ca,] +ca$position; length(unique(ca$position)) +table(ca$position) + +# DSL +length(aa_pos_dsl); aa_pos_dsl +dsl= foo[foo$position%in%aa_pos_dsl,] +dsl$position; length(unique(dsl$position)) +table(dsl$position) + +dsl$mutationinformation[dsl$position==330] +dsl$mutationinformation[dsl$position==438] +dsl$mutationinformation[dsl$position==439] +dsl$mutationinformation[dsl$position==510] + + + +# CDL +length(aa_pos_cdl); aa_pos_cdl +cdl= foo[foo$position%in%aa_pos_cdl,] +length(unique(cdl$position)); cdl$position; +table(cdl$position) + +cdl$mutationinformation[cdl$position==456] +cdl$mutationinformation[cdl$position==521] +cdl$mutationinformation[cdl$position==554] +cdl$mutationinformation[cdl$position==568] +cdl$mutationinformation[cdl$position==575] +cdl$mutationinformation[cdl$position==580] +cdl$mutationinformation[cdl$position==658] +cdl$mutationinformation[cdl$position==665] + +############################################### +# OR plot + +bar = merged_df3[, c("mutationinformation" + , "wild_pos" + , "position" + , "sensitivity" + , affinity_dist_colnames + , "or_mychisq" + , "pval_fisher" + #, "pval_chisq" + , "neglog_pval_fisher" + , "log10_or_mychisq")] + +# bar$p_adj_bonferroni = p.adjust(bar$pval_fisher, method = "bonferroni") +# bar$signif_bon = bar$p_adj_bonferroni +# bar = dplyr::mutate(bar +# , signif_bon = case_when(signif_bon == 0.05 ~ "." +# , signif_bon <=0.0001 ~ '****' +# , signif_bon <=0.001 ~ '***' +# , signif_bon <=0.01 ~ '**' +# , signif_bon <0.05 ~ '*' +# , TRUE ~ 'ns')) + +bar$p_adj_fdr = p.adjust(bar$pval_fisher, method = "BH") +bar$signif_fdr = bar$p_adj_fdr +bar = dplyr::mutate(bar + , signif_fdr = case_when(signif_fdr == 0.05 ~ "." + , signif_fdr <=0.0001 ~ '****' + , signif_fdr <=0.001 ~ '***' + , signif_fdr <=0.01 ~ '**' + , signif_bon <0.05 ~ '*' + , TRUE ~ 'ns')) + +# sort df +bar = bar[order(bar$or_mychisq, decreasing = T), ] +bar = bar[, c("mutationinformation" + , "wild_pos" + , "position" + , "sensitivity" + , affinity_dist_colnames + , "or_mychisq" + #, "pval_fisher" + #, "pval_chisq" + #, "neglog_pval_fisher" + #, "log10_or_mychisq" + #, "signif_bon" + , "p_adj_fdr" + , "signif_fdr")] + +table(bar$sensitivity) + +table(bar$or_mychisq>1&bar$signif_fdr) # sen and res ~ OR + +str(bar) +sen = bar[bar$or_mychisq<1,] +sen = na.omit(sen) + +res = bar[bar$or_mychisq>1,] +res = na.omit(res) + +# comp +bar_or = bar[!is.na(bar$or_mychisq),] +table(bar_or$sensitivity) + +sen1 = bar_or[bar_or$or_mychisq<1,] # sen and res ~OR +res1 = bar_or[bar_or$or_mychisq>1,] # sen and res ~OR + +# sanity check +if (nrow(bar_or) == nrow(sen1) + nrow(res1) ){ + cat("\nPASS: df with or successfully sourced" + , "\nCalculating % of muts with OR>1") +}else{ + stop("Abort: df with or numbers mimatch") +} + +# percent for OR muts +pc_orR = nrow(res1)/(nrow(sen1) + nrow(res1)); pc_orR +cat("\nPercentage of muts with OR>1 i.e resistant:" + , pc_orR *100 ) + +# muts with highest OR +head(bar_or$mutationinformation, 10) + +# sort df +bar_or = bar_or[order(bar_or$or_mychisq + , bar_or$ligand_distance + , bar_or$interface_dist + , decreasing = T), ] + +bar_or$drug_site = ifelse(bar_or$position%in%aa_pos_drug, "drug", "no") +table(bar_or$drug_site) + +bar_or$dsl_site = ifelse(bar_or$position%in%aa_pos_dsl, "dsl", "no") +table(bar_or$dsl_site) + +bar_or$ca_site = ifelse(bar_or$position%in%aa_pos_ca, "ca", "no") +table(bar_or$ca_site) + +bar_or$cdl_site = ifelse(bar_or$position%in%aa_pos_cdl, "cdl", "no") +table(bar_or$cdl_site) + + +top10_or = bar_or[1:10,] + +# are these active sites +top10_or$position[top10_or$position%in%active_aa_pos] + + +# clostest most sig +bar_or_lig = bar_or[bar_or$ligand_distance<10,] +bar_or_lig = bar_or_lig[order(bar_or_lig$ligand_distance, -bar_or_lig$or_mychisq), ] +table(bar_or_lig$signif_fdr) + + +bar_or_ppi = bar_or[bar_or$interface_dist<10,] +bar_or_ppi = bar_or_ppi[order(bar_or_ppi$interface_dist, -bar_or_ppi$or_mychisq), ] +table(bar_or_ppi$signif_fdr)