added katg and rpob files

2022-08-29 23:27:37 +01:00 · 2022-08-29 23:27:37 +01:00 · f39bbdcce7
commit f39bbdcce7
parent 7c2e4b898e
9 changed files with 3145 additions and 0 deletions
--- a/scripts/plotting/plotting_thesis/rpob/rpob_ORandSNP_results.R
+++ b/scripts/plotting/plotting_thesis/rpob/rpob_ORandSNP_results.R
@ -0,0 +1,226 @@
+#!/usr/bin/env Rscript
+#source("~/git/LSHTM_analysis/config/katg.R")
+#source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
+
+#=======
+# output
+#=======
+outdir_images = paste0("~/git/Writing/thesis/images/results/", tolower(gene), "/")
+outdir_stats = paste0(outdir_images,"stats/") 
+
+###################################################################
+# FIXME: ADD distance to NA when SP replies
+
+geneL_normal  = c("pnca")
+geneL_na      = c("gid", "rpob")
+geneL_ppi2    = c("alr", "embb", "katg", "rpob")
+
+# LigDist_colname # from globals used
+# ppi2Dist_colname #from globals used 
+# naDist_colname #from globals used
+
+dist_columns = c(LigDist_colname, ppi2Dist_colname, naDist_colname )
+
+common_cols  = c("mutationinformation"
+                 , "X5uhc_position"
+                 , "X5uhc_offset"
+                 , "position"
+                 , "dst_mode"
+                 , "mutation_info_labels"
+                 , "sensitivity", dist_columns )
+
+########################################
+categ_cols_to_factor = grep( "_outcome|_info", colnames(merged_df3) )
+fact_cols = colnames(merged_df3)[categ_cols_to_factor]
+
+if (any(lapply(merged_df3[, fact_cols], class) == "character")){
+  cat("\nChanging", length(categ_cols_to_factor), "cols to factor")
+  merged_df3[, fact_cols] <- lapply(merged_df3[, fact_cols], as.factor)
+  if (all(lapply(merged_df3[, fact_cols], class) == "factor")){
+    cat("\nSuccessful: cols changed to factor")
+  }
+}else{
+  cat("\nRequested cols aready factors")
+}
+
+cat("\ncols changed to factor are:\n", colnames(merged_df3)[categ_cols_to_factor] )
+
+####################################
+# merged_df3: NECESSARY pre-processing
+###################################
+#df3 = merged_df3
+all_cols = c(common_cols
+             , all_stability_cols
+             , all_affinity_cols
+             , all_conserv_cols)
+
+plot_cols = c("mutationinformation", "mutation_info_labels", "position", "dst_mode"
+              #, all_cols
+              )
+
+# counting
+foo = merged_df3[, c("mutationinformation"
+                     , "wild_pos"
+                     , "position"
+                     , "sensitivity"
+                     , "avg_lig_affinity"
+                     , "avg_lig_affinity_scaled"
+                     , "avg_lig_affinity_outcome"
+                     , "ligand_distance"
+                     , "ligand_affinity_change"
+                     , "affinity_scaled"
+                     , "ligand_outcome"
+                     , "consurf_colour_rev"
+                     , "consurf_outcome")]
+
+table(foo$consurf_outcome)
+foo2 = foo[foo$ligand_distance<10,]
+
+table(foo2$ligand_outcome)
+
+#############################
+# wide plots SNP
+# DRUG
+length(aa_pos_drug); aa_pos_drug
+drug = foo[foo$position%in%aa_pos_drug,]
+drug$wild_pos
+#CA
+###############################################
+#                     OR
+###############################################
+# OR plot
+df3_or = merged_df3
+df3_or$maf_percent = df3_or$maf*100
+
+bar = df3_or[, c("mutationinformation"
+                     , "wild_pos"
+                     , "position"
+                     , "sensitivity"
+                     , drug
+                     , affinity_dist_colnames
+                     , "maf_percent"
+                     , "or_mychisq"
+                     , "pval_fisher"
+                     #, "pval_chisq"
+                     , "neglog_pval_fisher"
+                     , "log10_or_mychisq")]
+
+# bar$p_adj_bonferroni = p.adjust(bar$pval_fisher, method = "bonferroni")
+# bar$signif_bon = bar$p_adj_bonferroni
+# bar = dplyr::mutate(bar
+#                     , signif_bon = case_when(signif_bon == 0.05 ~ "."
+#                                              , signif_bon <=0.0001 ~ '****'
+#                                              , signif_bon <=0.001 ~ '***'
+#                                              , signif_bon <=0.01 ~ '**'
+#                                              , signif_bon <0.05 ~ '*'
+#                                              , TRUE ~ 'ns'))
+
+bar$p_adj_fdr = p.adjust(bar$pval_fisher, method = "BH")
+bar$signif_fdr = bar$p_adj_fdr
+bar = dplyr::mutate(bar
+                    , signif_fdr = case_when(signif_fdr == 0.05 ~ "."
+                                             , signif_fdr <=0.0001 ~ '****'
+                                             , signif_fdr <=0.001 ~ '***'
+                                             , signif_fdr <=0.01 ~ '**'
+                                             , signif_fdr <0.05 ~ '*'
+                                             , TRUE ~ 'ns'))
+
+# sort df
+bar = bar[order(bar$or_mychisq, decreasing = T), ]
+bar = bar[, c("mutationinformation"
+               , "wild_pos"
+               , "position"
+               , "sensitivity"
+               , drug
+               , affinity_dist_colnames
+               , "maf_percent"
+               , "or_mychisq"
+               #, "pval_fisher"
+               #, "pval_chisq"
+               #, "neglog_pval_fisher"
+               #, "log10_or_mychisq"
+               #, "signif_bon"
+               , "p_adj_fdr"
+               , "signif_fdr")]
+
+table(bar$sensitivity)
+
+str(bar)
+sen = bar[bar$or_mychisq<1,]
+sen = na.omit(sen)
+
+res = bar[bar$or_mychisq>1,]
+res = na.omit(res)
+
+# comp 
+bar_or = bar[!is.na(bar$or_mychisq),]
+table(bar_or$sensitivity)
+      
+sen1 = bar_or[bar_or$or_mychisq<1,] # sen and res ~OR
+res1 = bar_or[bar_or$or_mychisq>1,] # sen and res ~OR
+
+# sanity check
+if (nrow(bar_or) == nrow(sen1) + nrow(res1) ){
+  cat("\nPASS: df with or successfully sourced"
+      , "\nCalculating % of muts with OR>1")
+}else{
+  stop("Abort: df with or numbers mimatch")
+}
+
+# percent for OR muts
+pc_orR = nrow(res1)/(nrow(sen1) + nrow(res1)); pc_orR
+
+cat("\nNo.of DST muts:", nrow(bar_or)
+    , "\nNo of DST (R):", table(bar_or$sensitivity)[[1]]
+    , "\nNo of DST (S):", table(bar_or$sensitivity)[[2]]
+    , "\nNumber of R muts with OR >1 (n = ", nrow(res1),")"
+    , "\nPercentage of muts with OR>1 i.e resistant:" , pc_orR *100 )
+
+table(bar_or$sensitivity)
+
+# muts with highest OR
+head(bar_or$mutationinformation, 10)
+
+# sort df
+bar_or = bar_or[order(bar_or$or_mychisq
+                      , bar_or$ligand_distance
+                      , bar_or$nca_distance
+                      , bar_or$interface_dist
+                      , decreasing = T), ]
+
+nrow(bar_or)
+bar_or$drug_site = ifelse(bar_or$position%in%aa_pos_drug, "drug", "no")
+table(bar_or$drug_site)
+
+top10_or = bar_or[1:10,]
+top10_or
+
+write.csv(bar_or, paste0(outdir_stats, "rpob_OR_10.csv"))
+
+# are these active sites
+top10_or$position[top10_or$position%in%active_aa_pos]
+
+
+# most frequent
+bar_maf = bar_or[order(bar_or$maf_percent
+                      , bar_or$ligand_distance
+                      , bar_or$nca_distance
+                      , bar_or$interface_dist
+                      , decreasing = T), ]
+bar_maf$drug_site = ifelse(bar_maf$position%in%aa_pos_drug, "drug", "no")
+table(bar_maf$drug_site)
+bar_maf[1:10,]
+#########################################################
+# closest most sig
+bar_or_lig = bar_or[bar_or$ligand_distance<10,]
+bar_or_lig = bar_or_lig[order(bar_or_lig$ligand_distance, -bar_or_lig$or_mychisq), ]
+table(bar_or_lig$signif_fdr)
+
+
+bar_or_nca = bar_or[bar_or$nca_distance<10,]
+bar_or_nca = bar_or_nca[order(bar_or_nca$nca_distance, -bar_or_ppi$or_mychisq), ]
+table(bar_or_nca$signif_fdr)
+
+bar_or_ppi = bar_or[bar_or$interface_dist<10,]
+bar_or_ppi = bar_or_ppi[order(bar_or_ppi$interface_dist, -bar_or_ppi$or_mychisq), ]
+table(bar_or_ppi$signif_fdr)