diff --git a/scripts/plotting/plotting_thesis/embb/embb_ORandSNP_results.R b/scripts/plotting/plotting_thesis/embb/embb_ORandSNP_results.R new file mode 100644 index 0000000..4716963 --- /dev/null +++ b/scripts/plotting/plotting_thesis/embb/embb_ORandSNP_results.R @@ -0,0 +1,223 @@ +#!/usr/bin/env Rscript +#source("~/git/LSHTM_analysis/config/katg.R") +#source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") + +#======= +# output +#======= +outdir_images = paste0("~/git/Writing/thesis/images/results/", tolower(gene), "/") +outdir_stats = paste0(outdir_images,"stats/") + +################################################################### +# FIXME: ADD distance to NA when SP replies + +geneL_normal = c("pnca") +geneL_na = c("gid", "rpob") +geneL_ppi2 = c("alr", "embb", "katg", "rpob") + +# LigDist_colname # from globals used +# ppi2Dist_colname #from globals used +# naDist_colname #from globals used + +common_cols = c("mutationinformation" + , "X5uhc_position" + , "X5uhc_offset" + , "position" + , "dst_mode" + , "mutation_info_labels" + , "sensitivity", dist_columns ) + + +######################################## +categ_cols_to_factor = grep( "_outcome|_info", colnames(merged_df3) ) +fact_cols = colnames(merged_df3)[categ_cols_to_factor] + +if (any(lapply(merged_df3[, fact_cols], class) == "character")){ + cat("\nChanging", length(categ_cols_to_factor), "cols to factor") + merged_df3[, fact_cols] <- lapply(merged_df3[, fact_cols], as.factor) + if (all(lapply(merged_df3[, fact_cols], class) == "factor")){ + cat("\nSuccessful: cols changed to factor") + } +}else{ + cat("\nRequested cols aready factors") +} + +cat("\ncols changed to factor are:\n", colnames(merged_df3)[categ_cols_to_factor] ) + +#################################### +# merged_df3: NECESSARY pre-processing +################################### +#df3 = merged_df3 +plot_cols = c("mutationinformation", "mutation_info_labels", "position", "dst_mode" + , all_cols) + +all_cols = c(common_cols + , all_stability_cols + , all_affinity_cols + , all_conserv_cols) + + +# counting +foo = merged_df3[, c("mutationinformation" + , "wild_pos" + , "position" + , "sensitivity" + , "avg_lig_affinity" + , "avg_lig_affinity_scaled" + , "avg_lig_affinity_outcome" + , "ligand_distance" + , "ligand_affinity_change" + , "affinity_scaled" + , "ligand_outcome" + , "consurf_colour_rev" + , "consurf_outcome")] + +table(foo$consurf_outcome) +foo2 = foo[foo$ligand_distance<10,] + +table(foo2$ligand_outcome) + +############################# +# wide plots SNP +# DRUG +length(aa_pos_drug); aa_pos_drug +drug = foo[foo$position%in%aa_pos_drug,] +drug$wild_pos +#CA +############################################### +# OR +############################################### +# OR plot +df3_or = merged_df3 +df3_or$maf_percent = df3_or$maf*100 + +bar = df3_or[, c("mutationinformation" + , "wild_pos" + , "position" + , "sensitivity" + , drug + , affinity_dist_colnames + , "maf_percent" + , "or_mychisq" + , "pval_fisher" + #, "pval_chisq" + , "neglog_pval_fisher" + , "log10_or_mychisq")] + +# bar$p_adj_bonferroni = p.adjust(bar$pval_fisher, method = "bonferroni") +# bar$signif_bon = bar$p_adj_bonferroni +# bar = dplyr::mutate(bar +# , signif_bon = case_when(signif_bon == 0.05 ~ "." +# , signif_bon <=0.0001 ~ '****' +# , signif_bon <=0.001 ~ '***' +# , signif_bon <=0.01 ~ '**' +# , signif_bon <0.05 ~ '*' +# , TRUE ~ 'ns')) + +bar$p_adj_fdr = p.adjust(bar$pval_fisher, method = "BH") +bar$signif_fdr = bar$p_adj_fdr +bar = dplyr::mutate(bar + , signif_fdr = case_when(signif_fdr == 0.05 ~ "." + , signif_fdr <=0.0001 ~ '****' + , signif_fdr <=0.001 ~ '***' + , signif_fdr <=0.01 ~ '**' + , signif_fdr <0.05 ~ '*' + , TRUE ~ 'ns')) + +# sort df +bar = bar[order(bar$or_mychisq, decreasing = T), ] +bar = bar[, c("mutationinformation" + , "wild_pos" + , "position" + , "sensitivity" + , drug + , affinity_dist_colnames + , "maf_percent" + , "or_mychisq" + #, "pval_fisher" + #, "pval_chisq" + #, "neglog_pval_fisher" + #, "log10_or_mychisq" + #, "signif_bon" + , "p_adj_fdr" + , "signif_fdr")] + +table(bar$sensitivity) + +str(bar) +sen = bar[bar$or_mychisq<1,] +sen = na.omit(sen) + +res = bar[bar$or_mychisq>1,] +res = na.omit(res) + +# comp +bar_or = bar[!is.na(bar$or_mychisq),] +table(bar_or$sensitivity) + +sen1 = bar_or[bar_or$or_mychisq<1,] # sen and res ~OR +res1 = bar_or[bar_or$or_mychisq>1,] # sen and res ~OR + +# sanity check +if (nrow(bar_or) == nrow(sen1) + nrow(res1) ){ + cat("\nPASS: df with or successfully sourced" + , "\nCalculating % of muts with OR>1") +}else{ + stop("Abort: df with or numbers mimatch") +} + +# percent for OR muts +pc_orR = nrow(res1)/(nrow(sen1) + nrow(res1)); pc_orR + +cat("\nNo.of DST muts:", nrow(bar_or) + , "\nNo of DST (R):", table(bar_or$sensitivity)[[1]] + , "\nNo of DST (S):", table(bar_or$sensitivity)[[2]] + , "\nNumber of R muts with OR >1 (n = ", nrow(res1),")" + , "\nPercentage of muts with OR>1 i.e resistant:" , pc_orR *100 ) + +table(bar_or$sensitivity) + +# muts with highest OR +head(bar_or$mutationinformation, 10) + +# sort df +bar_or = bar_or[order(bar_or$or_mychisq + , bar_or$ligand_distance + # bar_or$nca_dist + , bar_or$interface_dist + , decreasing = T), ] + +nrow(bar_or) +bar_or$drug_site = ifelse(bar_or$position%in%aa_pos_drug, "drug", "no") +table(bar_or$drug_site) + +bar_or$heme_site = ifelse(bar_or$position%in%aa_pos_hem, "heme", "no") +table(bar_or$heme_site) + +top10_or = bar_or[1:10,] +top10_or + +write.csv(bar_or, paste0(outdir_stats, "katg_OR_10.csv")) + +# are these active sites +top10_or$position[top10_or$position%in%active_aa_pos] + + +# maf +bar_maf = bar_or[order(bar_or$maf_percent + , bar_or$ligand_distance + # bar_or$nca_dist + , bar_or$interface_dist + , decreasing = T), ] + +head(bar_maf) +######################################################### +# closest most sig +bar_or_lig = bar_or[bar_or$ligand_distance<10,] +bar_or_lig = bar_or_lig[order(bar_or_lig$ligand_distance, -bar_or_lig$or_mychisq), ] +table(bar_or_lig$signif_fdr) + + +bar_or_ppi = bar_or[bar_or$interface_dist<10,] +bar_or_ppi = bar_or_ppi[order(bar_or_ppi$interface_dist, -bar_or_ppi$or_mychisq), ] +table(bar_or_ppi$signif_fdr) diff --git a/scripts/plotting/plotting_thesis/embb/embb_appendix_tables.R b/scripts/plotting/plotting_thesis/embb/embb_appendix_tables.R new file mode 100644 index 0000000..bf1e0f6 --- /dev/null +++ b/scripts/plotting/plotting_thesis/embb/embb_appendix_tables.R @@ -0,0 +1,460 @@ +#!/usr/bin/env Rscript +#source("~/git/LSHTM_analysis/config/katg.R") +#source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") + +#======= +# output +#======= +outdir_images = paste0("~/git/Writing/thesis/images/results/", tolower(gene), "/") +outdir_stats = paste0(outdir_images,"stats/") +cat("\nOutput dir for stats:", outdir_stats) +################################################################### +geneL_normal = c("pnca") +#geneL_na = c("gid", "rpob") +geneL_na_v2 = c("gid") +geneL_ppi2 = c("alr", "embb", "katg", "rpob") +geneL_both = c("rpob") + +if (tolower(gene)%in%geneL_na_v2) { + gene_colnames = c("mcsm_na_affinity", "mcsm_na_outcome") +} + +if (tolower(gene)%in%geneL_ppi2) { + gene_colnames = c("mcsm_ppi2_affinity", "mcsm_ppi2_outcome") +} + +#from plotting_globals() +LigDist_colname +ppi2Dist_colname +naDist_colname + +delta_symbol #delta_symbol = "\u0394"; delta_symbol +angstroms_symbol + +cat("\nAffinity Distance colnames:", length(affinity_dist_colnames) + , "\nThese are:", affinity_dist_colnames) +#=========== +# Data used +#=========== +df3 = merged_df3 + +cols_to_output = c("position" + , "sensitivity" + , "mutationinformation" + , affinity_dist_colnames[1] + , "ligand_affinity_change" + , "ligand_outcome" + , "mmcsm_lig" + , "mmcsm_lig_outcome" + , affinity_dist_colnames[2] + # #, affinity_dist_colnames[3] + # , "mcsm_na_affinity" + # , "mcsm_na_outcome" + # #, "mcsm_nca_affinity" + # #, "mcsm_nca_outcome" + , gene_colnames + , "maf" + , "or_mychisq" + , "pval_fisher") + +cols_to_output +df3_output = df3[, cols_to_output] +colnames(df3_output) +cat("\nSelecting columns:", length(colnames(df3_output))) +#=============================================== +# Add COLS and rounding: adjusted P-values + MAF +#============================================== +#----------------------------- +# adjusted P-values +#----------------------------- +# add cols: p_adj_fdr and signif_fdr +df3_output$p_adj_fdr = p.adjust(df3_output$pval_fisher, method = "fdr") +df3_output$signif_fdr = df3_output$p_adj_fdr +df3_output = dplyr::mutate(df3_output + , signif_fdr = case_when(signif_fdr == 0.05 ~ "." + , signif_fdr <=0.0001 ~ '****' + , signif_fdr <=0.001 ~ '***' + , signif_fdr <=0.01 ~ '**' + , signif_fdr <0.05 ~ '*' + , TRUE ~ 'ns')) +# rounding +df3_output$or_mychisq = round(df3_output$or_mychisq,2) +df3_output$p_adj_fdr = round(df3_output$p_adj_fdr,2) +head(df3_output) + +#---------- +# MAF (%) +#---------- +# add col maf_percent +df3_output$maf_percent = df3_output$maf*100 + +# rounding +df3_output$maf_percent = round(df3_output$maf_percent,2) +head(df3_output$af); head(df3_output$maf);head(df3_output$maf_percent) + +#---------- +# P-value +#---------- +df3_output$pval_fisher = round(df3_output$pval_fisher,2) + +class(df3_output) +head(df3_output) + +#################################### +# Appendix: ligand affinity +#################################### +df_lig = df3_output[df3_output[[LigDist_colname]]