From b302daaa60ca029b36233e3bf9d49264bf970903 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Thu, 11 Aug 2022 20:56:34 +0100 Subject: [PATCH] rearranged corr plot cols and also added example for ggpairs --- scripts/functions/corr_plot_data.R | 14 +- .../plotting_thesis/corr_plots_thesis.R | 261 +++++++++++------- .../plotting/plotting_thesis/preformatting.R | 1 + .../structure_figures/AFFINITY_TEST.R | 138 --------- .../mcsm_affinity_data_only.R | 241 ---------------- .../mcsm_mean_stability_ensemble.R | 190 ++----------- .../replaceBfactor_pdb_stability.R | 63 +++-- 7 files changed, 227 insertions(+), 681 deletions(-) delete mode 100644 scripts/plotting/structure_figures/AFFINITY_TEST.R delete mode 100644 scripts/plotting/structure_figures/mcsm_affinity_data_only.R diff --git a/scripts/functions/corr_plot_data.R b/scripts/functions/corr_plot_data.R index e6aff38..05595c8 100644 --- a/scripts/functions/corr_plot_data.R +++ b/scripts/functions/corr_plot_data.R @@ -15,6 +15,10 @@ corr_data_extract <- function(df , extract_scaled_cols = F){ if ( missing(colnames_to_extract) || missing(colnames_display_key) ){ + + df$maf2 = log10(df$maf) # can't see otherwise + sum(is.na(df$maf2)) + cat("\n==========================================" , "\nCORR PLOTS data: ALL params" , "\n=========================================") @@ -30,20 +34,22 @@ corr_data_extract <- function(df common_colnames = c(drug, "dst_mode" , "duet_stability_change" , "ddg_foldx" , "deepddg" , "ddg_dynamut2" , "asa" , "rsa" , "kd_values" , "rd_values" - , "maf" , "log10_or_mychisq" , "neglog_pval_fisher" + # previously maf + , "maf2" , "log10_or_mychisq" , "neglog_pval_fisher" , LigDist_colname , "consurf_score" , "snap2_score" , "provean_score" - , "ligand_affinity_change" + , "ligand_affinity_change", "mmcsm_lig" #, "ddg_dynamut", "ddg_encom", "dds_encom", "ddg_mcsm", "ddg_sdm", "ddg_duet" ) display_common_colnames = c( drug, "dst_mode" , "DUET" , "FoldX" , "DeepDDG", "Dynamut2" , "ASA" , "RSA" , "KD" , "RD" - , "MAF" , "Log(OR)" , "-Log(P)" + # previously MAF + , "Log(MAF)" , "Log(OR)" , "-Log(P)" , "Lig-Dist" , "ConSurf" , "SNAP2" , "PROVEAN" - , "mCSM-lig" + , "mCSM-lig", "mmCSM-lig" # , "Dynamut" , "ENCoM-DDG" , "mCSM" , "SDM" , "DUET-d" , "ENCoM-DDS" ) diff --git a/scripts/plotting/plotting_thesis/corr_plots_thesis.R b/scripts/plotting/plotting_thesis/corr_plots_thesis.R index 348b8cb..cd3f9e8 100644 --- a/scripts/plotting/plotting_thesis/corr_plots_thesis.R +++ b/scripts/plotting/plotting_thesis/corr_plots_thesis.R @@ -8,6 +8,7 @@ source("~/git/LSHTM_analysis/config/embb.R") # get plottting dfs source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") +source("~/git/LSHTM_analysis/scripts/plotting/plotting_colnames.R") #################################################### #======= # output @@ -23,55 +24,126 @@ corr_plotdf = corr_data_extract(merged_df3 , drug = drug , extract_scaled_cols = F) colnames(corr_plotdf) -colnames(corr_df_m3_f) -corr_plotdf = corr_df_m3_f #for downstream code + +if (all(colnames(corr_df_m3_f) == colnames(corr_plotdf))){ + cat("PASS: corr plot colnames match for dashboard") +}else{ + stop("Abort: corr plot colnames DO NOT match for dashboard") +} + +#corr_plotdf = corr_df_m3_f #for downstream code + +aff_dist_cols = colnames(corr_plotdf)[grep("Dist", colnames(corr_plotdf))] +aff_dist_cols + + +static_cols = c("Log(MAF)" + , "Log(OR)" + #, "-Log(P)" +) + #================ # stability #================ -corr_ps_colnames = c("DUET" +#affinity_dist_colnames# lIg DIst and ppi Di +corr_ps_colnames = c(static_cols + , "DUET" , "FoldX" , "DeepDDG" , "Dynamut2" - , "MAF" - , "Log(OR)" - , "-Log(P)" - #, "ligand_distance" - , "dst_mode" - , drug) + , aff_dist_cols + , "dst_mode") -corr_ps_colnames%in%colnames(corr_plotdf) +if (all(corr_ps_colnames%in%colnames(corr_plotdf))){ + cat("PASS: all colnames exist for correlation") +}else{ + stop("Abort: all colnames DO NOT exist for correlation") +} corr_df_ps = corr_plotdf[, corr_ps_colnames] complete_obs_ps = nrow(corr_df_ps) - sum(is.na(corr_df_ps$`Log(OR)`)) cat("\nComplete muts for Conservation for", gene, ":", complete_obs_ps) color_coln = which(colnames(corr_df_ps) == "dst_mode") -end = which(colnames(corr_df_ps) == drug) -ncol_omit = 2 -corr_end = end-ncol_omit +#end = which(colnames(corr_df_ps) == drug) +#ncol_omit = 2 +#corr_end = end-ncol_omit +corr_end = color_coln-1 #------------------------ # Output: stability corrP #------------------------ corr_psP = paste0(outdir_images - ,tolower(gene) - ,"_corr_stability.svg" ) + ,tolower(gene) + ,"_corr_stability.svg" ) cat("Corr plot stability with coloured dots:", corr_psP) svg(corr_psP, width = 15, height = 15) my_corr_pairs(corr_data_all = corr_df_ps - , corr_cols = colnames(corr_df_ps[1:corr_end]) - , corr_method = "spearman" - , colour_categ_col = colnames(corr_df_ps[color_coln]) #"dst_mode" - , categ_colour = c("red", "blue") - , density_show = F - , hist_col = "coral4" - , dot_size = 1.6 - , ats = 1.5 - , corr_lab_size = 3 - , corr_value_size = 1) + , corr_cols = colnames(corr_df_ps[1:corr_end]) + , corr_method = "spearman" + , colour_categ_col = colnames(corr_df_ps[color_coln]) #"dst_mode" + , categ_colour = c("red", "blue") + , density_show = F + , hist_col = "coral4" + , dot_size = 1.6 + , ats = 1.5 + , corr_lab_size = 3 + , corr_value_size = 1) dev.off() +#=============== +# CONSERVATION +#============== +corr_conservation_cols = c( static_cols + , "ConSurf" + , "SNAP2" + , "PROVEAN" + , aff_dist_cols + , "dst_mode" + , drug) + +if (all(corr_conservation_cols%in%colnames(corr_plotdf))){ + cat("PASS: all colnames exist for ConSurf-correlation") +}else{ + stop("Abort: all colnames DO NOT exist for ConSurf-correlation") +} + +corr_df_cons = corr_plotdf[, corr_conservation_cols] +complete_obs_cons = nrow(corr_df_cons) - sum(is.na(corr_df_cons$`Log(OR)`)) +cat("\nComplete muts for Conservation for", gene, ":", complete_obs_cons) + +color_coln = which(colnames(corr_df_cons) == "dst_mode") +# end = which(colnames(corr_df_cons) == drug) +# ncol_omit = 2 +# corr_end = end-ncol_omit +corr_end = color_coln-1 + + +#--------------------------- +# Output: Conservation corrP +#---------------------------- +corr_consP = paste0(outdir_images + ,tolower(gene) + ,"_corr_conservation.svg" ) + +cat("Corr plot conservation coloured dots:", corr_consP) +svg(corr_consP, width = 10, height = 10) + +my_corr_pairs(corr_data_all = corr_df_cons + , corr_cols = colnames(corr_df_cons[1:corr_end]) + , corr_method = "spearman" + , colour_categ_col = colnames(corr_df_cons[color_coln]) #"dst_mode" + , categ_colour = c("red", "blue") + , density_show = F + , hist_col = "coral4" + , dot_size =1.1 + , ats = 1.5 + , corr_lab_size = 2.15 + , corr_value_size = 1) + +dev.off() + ##################################################### #DistCutOff = 10 #LigDist_colname # = "ligand_distance" # from globals @@ -82,31 +154,36 @@ dev.off() #================ # ligand affinity #================ -corr_lig_colnames = c("mCSM-lig" - , "MAF" - , "Log(OR)" - , "-Log(P)" - , "Lig-Dist" - , "dst_mode" - , drug) +corr_df_lig = corr_plotdf[corr_plotdf["Lig-Dist"]% - dplyr::group_by(position) %>% - biggest = max(abs(a2[gene_aff_cols])) - a2$max_es = max(abs(a2[gene_aff_cols])) - a2$effect = names(a2[gene_aff_cols])[which(abs(a2[gene_aff_cols]) == biggest, arr.ind=T)[, "col"]] - effect_name = unique(a2$effect) - - #get index of value of max effect - ind = (which(abs(a2[effect_name]) == biggest, arr.ind=T)) - a2[effect_name][ind] - # extract sign - a2$effect_dir = sign(a2[effect_name][ind]) -################################# -df2_short = df2[df2$position%in%c(167, 423, 427),] - -for (i in unique(df2_short$position) ){ - #print(i) - #print(paste0("\nNo. of unique positions:", length(unique(df2$position))) ) - #cat(length(unique(df2$position))) - a2 = df2_short[df2_short$position==i,] - biggest = max(abs(a2[gene_aff_cols])) - a2$max_es = max(abs(a2[gene_aff_cols])) - a2$effect = names(a2[gene_aff_cols])[which(abs(a2[gene_aff_cols]) == biggest, arr.ind=T)[, "col"]] - effect_name = unique(a2$effect) - - #get index of value of max effect - ind = (which(abs(a2[effect_name]) == biggest, arr.ind=T)) - a2[effect_name][ind] - # extract sign - a2$effect_sign = sign(a2[effect_name][ind]) -} - -#======================== -df2_short = df3[df3$position%in%c(167, 423, 427),] -df2_short = df3[df3$position%in%c(170, 167, 493, 453, 435, 433, 480, 456, 445),] -df2_short = df3[df3$position%in%c(435, 480),] -df2_short = df3[df3$position%in%c(435, 480),] - -give_col=function(x,y,df=df2_short){ - df[df$position==x,y] -} - -for (i in unique(df2_short$position) ){ - #print(i) - #print(paste0("\nNo. of unique positions:", length(unique(df2$position))) ) - #cat(length(unique(df2$position))) - #df2_short[df2_short$position==i,gene_aff_cols] - - biggest = max(abs(give_col(i,gene_aff_cols))) - - df2_short[df2_short$position==i,'abs_max_effect'] = biggest - df2_short[df2_short$position==i,'effect_type']= names( - give_col(i,gene_aff_cols)[which( - abs( - give_col(i,gene_aff_cols) - ) == biggest, arr.ind=T - )[, "col"]]) - - effect_name = df2_short[df2_short$position==i,'effect_type'][1] # pick first one in case we have multiple exact values - - # get index/rowname for value of max effect, and then use it to get the original sign - # here - #df2_short[df2_short$position==i,c(effect_name)] - #which(abs(df2_short[df2_short$position==i,c('position',effect_name)][effect_name])==biggest, arr.ind=T) - ind = rownames(which(abs(df2_short[df2_short$position==i,c('position',effect_name)][effect_name])== biggest, arr.ind=T)) - df2_short[df2_short$position==i,'effect_sign'] = sign(df2_short[effect_name][ind,]) -} - -df2_short$effect_type = sub("\\.[0-9]+", "", df2_short$effect_type) # cull duplicate effect types that happen when there are exact duplicate values \ No newline at end of file diff --git a/scripts/plotting/structure_figures/mcsm_affinity_data_only.R b/scripts/plotting/structure_figures/mcsm_affinity_data_only.R deleted file mode 100644 index f962935..0000000 --- a/scripts/plotting/structure_figures/mcsm_affinity_data_only.R +++ /dev/null @@ -1,241 +0,0 @@ -#source("~/git/LSHTM_analysis/config/pnca.R") -#source("~/git/LSHTM_analysis/config/alr.R") -#source("~/git/LSHTM_analysis/config/gid.R") -#source("~/git/LSHTM_analysis/config/embb.R") -#source("~/git/LSHTM_analysis/config/katg.R") -#source("~/git/LSHTM_analysis/config/rpob.R") - -source("/home/tanu/git/LSHTM_analysis/my_header.R") -######################################################### -# TASK: Generate averaged affinity values -# across all affinity tools for a given structure -# as applicable... -######################################################### - -#======= -# output -#======= -outdir_images = paste0("~/git/Writing/thesis/images/results/", tolower(gene)) - -#OutFile1 -outfile_mean_aff = paste0(outdir_images, "/", tolower(gene) - , "_mean_affinity_all.csv") -print(paste0("Output file:", outfile_mean_aff)) - -#OutFile2 -outfile_mean_aff_priorty = paste0(outdir_images, "/", tolower(gene) - , "_mean_affinity_priority.csv") -print(paste0("Output file:", outfile_mean_aff_priorty)) - -#%%=============================================================== - -#============= -# Input -#============= -df3_filename = paste0("/home/tanu/git/Data/", drug, "/output/", tolower(gene), "_merged_df3.csv") -df3 = read.csv(df3_filename) -length(df3$mutationinformation) - -# mut_info checks -table(df3$mutation_info) -table(df3$mutation_info_orig) -table(df3$mutation_info_labels_orig) - -# used in plots and analyses -table(df3$mutation_info_labels) # different, and matches dst_mode -table(df3$dst_mode) - -# create column based on dst mode with different colname -table(is.na(df3$dst)) -table(is.na(df3$dst_mode)) - -#=============== -# Create column: sensitivity mapped to dst_mode -#=============== -df3$sensitivity = ifelse(df3$dst_mode == 1, "R", "S") -table(df3$sensitivity) - -length(unique((df3$mutationinformation))) -all_colnames = as.data.frame(colnames(df3)) - -# FIXME: ADD distance to NA when SP replies -dist_columns = c("ligand_distance", "interface_dist") -DistCutOff = 10 -common_cols = c("mutationinformation" - , "X5uhc_position" - , "X5uhc_offset" - , "position" - , "dst_mode" - , "mutation_info_labels" - , "sensitivity", dist_columns ) - -all_colnames$`colnames(df3)`[grep("scaled", all_colnames$`colnames(df3)`)] -all_colnames$`colnames(df3)`[grep("outcome", all_colnames$`colnames(df3)`)] - -#=================== -# stability cols -#=================== -raw_cols_stability = c("duet_stability_change" - , "deepddg" - , "ddg_dynamut2" - , "ddg_foldx") - -scaled_cols_stability = c("duet_scaled" - , "deepddg_scaled" - , "ddg_dynamut2_scaled" - , "foldx_scaled") - -outcome_cols_stability = c("duet_outcome" - , "deepddg_outcome" - , "ddg_dynamut2_outcome" - , "foldx_outcome") - -#=================== -# affinity cols -#=================== -raw_cols_affinity = c("ligand_affinity_change" - , "mmcsm_lig" - , "mcsm_ppi2_affinity" - , "mcsm_na_affinity") - -scaled_cols_affinity = c("affinity_scaled" - , "mmcsm_lig_scaled" - , "mcsm_ppi2_scaled" - , "mcsm_na_scaled" ) - -outcome_cols_affinity = c( "ligand_outcome" - , "mmcsm_lig_outcome" - , "mcsm_ppi2_outcome" - , "mcsm_na_outcome") - -#=================== -# conservation cols -#=================== -# raw_cols_conservation = c("consurf_score" -# , "snap2_score" -# , "provean_score") -# -# scaled_cols_conservation = c("consurf_scaled" -# , "snap2_scaled" -# , "provean_scaled") -# -# # CANNOT strictly be used, as categories are not identical with conssurf missing altogether -# outcome_cols_conservation = c("provean_outcome" -# , "snap2_outcome" -# #consurf outcome doesn't exist -# ) - -gene_aff_cols = colnames(df3)[colnames(df3)%in%scaled_cols_affinity] -gene_stab_cols = colnames(df3)[colnames(df3)%in%scaled_cols_stability] -gene_common_cols = colnames(df3)[colnames(df3)%in%common_cols] - -sel_cols = c(gene_common_cols - , gene_stab_cols - , gene_aff_cols) - -######################################### -#df3_plot = df3[, cols_to_extract] -df3_plot = df3[, sel_cols] - -###################### -#FILTERING HMMMM! -#all dist <10 -#for embb this results in 2 muts -###################### -df3_affinity_filtered = df3_plot[ (df3_plot$ligand_distance<10 | df3_plot$interface_dist <10),] -df3_affinity_filtered = df3_plot[ (df3_plot$ligand_distance<10 & df3_plot$interface_dist <10),] - -c0u = unique(df3_affinity_filtered$position) -length(c0u) - -#df = df3_affinity_filtered -########################################## -#NO FILTERING: annotate the whole df -df = df3_plot -sum(is.na(df)) -df2 = na.omit(df) -c0u = unique(df2$position) -length(c0u) - -# reassign orig -my_df_raw = df3 - -# now subset -df3 = df2 -####################################################### -#================= -# affinity effect -#================= -give_col=function(x,y,df=df3){ - df[df$position==x,y] -} - -for (i in unique(df3$position) ){ - #print(i) - biggest = max(abs(give_col(i,gene_aff_cols))) - - df3[df3$position==i,'abs_max_effect'] = biggest - df3[df3$position==i,'effect_type']= names( - give_col(i,gene_aff_cols)[which( - abs( - give_col(i,gene_aff_cols) - ) == biggest, arr.ind=T - )[, "col"]]) - - # effect_name = unique(df3[df3$position==i,'effect_type']) - effect_name = df3[df3$position==i,'effect_type'][1] # pick first one in case we have multiple exact values - - ind = rownames(which(abs(df3[df3$position==i,c('position',effect_name)][effect_name])== biggest, arr.ind=T)) - df3[df3$position==i,'effect_sign'] = sign(df3[effect_name][ind,]) -} - -df3$effect_type = sub("\\.[0-9]+", "", df3$effect_type) # cull duplicate effect types that happen when there are exact duplicate values -df3U = df3[!duplicated(df3[c('position')]), ] -table(df3U$effect_type) -######################################################### -#%% consider stability as well -df4 = df2 - -#================= -# stability + affinity effect -#================= -effect_cols = c(gene_aff_cols, gene_stab_cols) - -give_col=function(x,y,df=df4){ - df[df$position==x,y] -} - -for (i in unique(df4$position) ){ - #print(i) - biggest = max(abs(give_col(i,effect_cols))) - - df4[df4$position==i,'abs_max_effect'] = biggest - df4[df4$position==i,'effect_type']= names( - give_col(i,effect_cols)[which( - abs( - give_col(i,effect_cols) - ) == biggest, arr.ind=T - )[, "col"]]) - - # effect_name = unique(df4[df4$position==i,'effect_type']) - effect_name = df4[df4$position==i,'effect_type'][1] # pick first one in case we have multiple exact values - - ind = rownames(which(abs(df4[df4$position==i,c('position',effect_name)][effect_name])== biggest, arr.ind=T)) - df4[df4$position==i,'effect_sign'] = sign(df4[effect_name][ind,]) -} - -df4$effect_type = sub("\\.[0-9]+", "", df4$effect_type) # cull duplicate effect types that happen when there are exact duplicate values -df4U = df4[!duplicated(df4[c('position')]), ] -table(df4U$effect_type) - -#%%============================================================ -# output -write.csv(combined_df, outfile_mean_ens_st_aff - , row.names = F) -cat("Finished writing file:\n" - , outfile_mean_ens_st_aff - , "\nNo. of rows:", nrow(combined_df) - , "\nNo. of cols:", ncol(combined_df)) - -# end of script -#=============================================================== diff --git a/scripts/plotting/structure_figures/mcsm_mean_stability_ensemble.R b/scripts/plotting/structure_figures/mcsm_mean_stability_ensemble.R index aeb5d0a..26beac4 100644 --- a/scripts/plotting/structure_figures/mcsm_mean_stability_ensemble.R +++ b/scripts/plotting/structure_figures/mcsm_mean_stability_ensemble.R @@ -1,14 +1,13 @@ #source("~/git/LSHTM_analysis/config/pnca.R") #source("~/git/LSHTM_analysis/config/alr.R") #source("~/git/LSHTM_analysis/config/gid.R") -#source("~/git/LSHTM_analysis/config/embb.R") +source("~/git/LSHTM_analysis/config/embb.R") #source("~/git/LSHTM_analysis/config/katg.R") #source("~/git/LSHTM_analysis/config/rpob.R") -source("/home/tanu/git/LSHTM_analysis/my_header.R") ######################################################### -# TASK: Generate averaged stability values -# across all stability tools +# TASK: Generate averaged stability values by position +# calculated across all stability tools # for a given structure ######################################################### @@ -23,190 +22,53 @@ print(paste0("Output file:", outfile_mean_ens_st_aff)) #%%=============================================================== #============= -# Input +# Input: merged_df3 #============= -df3_filename = paste0("/home/tanu/git/Data/", drug, "/output/", tolower(gene), "_merged_df3.csv") -df3 = read.csv(df3_filename) +source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") +#merged_df3= paste0("/home/tanu/git/Data/", drug, "/output/", tolower(gene), "_merged_df3.csv") + +cols_to_extract_ms = c("mutationinformation", "position", "avg_stability_scaled") + +df3 = merged_df3[, cols_to_extract_ms] length(df3$mutationinformation) -# mut_info checks -table(df3$mutation_info) -table(df3$mutation_info_orig) -table(df3$mutation_info_labels_orig) - -# used in plots and analyses -table(df3$mutation_info_labels) # different, and matches dst_mode -table(df3$dst_mode) - -# create column based on dst mode with different colname -table(is.na(df3$dst)) -table(is.na(df3$dst_mode)) - -#=============== -# Create column: sensitivity mapped to dst_mode -#=============== -df3$sensitivity = ifelse(df3$dst_mode == 1, "R", "S") -table(df3$sensitivity) - -length(unique((df3$mutationinformation))) -all_colnames = as.data.frame(colnames(df3)) -common_cols = c("mutationinformation" - , "position" - , "dst_mode" - , "mutation_info_labels" - , "sensitivity" - , "ligand_distance" - , "interface_dist") - -all_colnames$`colnames(df3)`[grep("scaled", all_colnames$`colnames(df3)`)] -all_colnames$`colnames(df3)`[grep("outcome", all_colnames$`colnames(df3)`)] - -#=================== -# stability cols -#=================== -raw_cols_stability = c("duet_stability_change" - , "deepddg" - , "ddg_dynamut2" - , "ddg_foldx") - -scaled_cols_stability = c("duet_scaled" - , "deepddg_scaled" - , "ddg_dynamut2_scaled" - , "foldx_scaled") - -outcome_cols_stability = c("duet_outcome" - , "deepddg_outcome" - , "ddg_dynamut2_outcome" - , "foldx_outcome") - -#=================== -# affinity cols -#=================== -raw_cols_affinity = c("ligand_affinity_change" - , "mmcsm_lig" - , "mcsm_ppi2_affinity" - , "mcsm_na_affinity") - -scaled_cols_affinity = c("affinity_scaled" - , "mmcsm_lig_scaled" - , "mcsm_ppi2_scaled" - , "mcsm_na_scaled" ) - -outcome_cols_affinity = c( "ligand_outcome" - , "mmcsm_lig_outcome" - , "mcsm_ppi2_outcome" - , "mcsm_na_outcome") - -#=================== -# conservation cols -#=================== -# raw_cols_conservation = c("consurf_score" -# , "snap2_score" -# , "provean_score") -# -# scaled_cols_conservation = c("consurf_scaled" -# , "snap2_scaled" -# , "provean_scaled") -# -# # CANNOT strictly be used, as categories are not identical with conssurf missing altogether -# outcome_cols_conservation = c("provean_outcome" -# , "snap2_outcome" -# #consurf outcome doesn't exist -# ) - -########################################################### -cols_to_consider = colnames(df3)[colnames(df3)%in%c(common_cols - , raw_cols_stability - , scaled_cols_stability - , outcome_cols_stability - , raw_cols_affinity - , scaled_cols_affinity - , outcome_cols_affinity)] - -cols_to_extract = cols_to_consider[cols_to_consider%in%c(common_cols - , outcome_cols_stability)] -############################################################## -##################### -# Ensemble stability: outcome_cols_stability -##################### -# extract outcome cols and map numeric values to the categories -# Destabilising == 0, and stabilising == 1, so rescaling can let -1 be destabilising -df3_plot = df3[, cols_to_extract] - -# assign numeric values to outcome -df3_plot[, outcome_cols_stability] <- sapply(df3_plot[, outcome_cols_stability] - , function(x){ifelse(x == "Destabilising", 0, 1)}) -table(df3$duet_outcome) -table(df3_plot$duet_outcome) -#===================================== -# Stability (4 cols): average the scores -# across predictors ==> average by -# position ==> scale b/w -1 and 1 - -# column to average: ens_stability -#===================================== -cols_to_average = which(colnames(df3_plot)%in%outcome_cols_stability) - -# ensemble average across predictors -df3_plot$ens_stability = rowMeans(df3_plot[,cols_to_average]) - -head(df3_plot$position); head(df3_plot$mutationinformation) -head(df3_plot$ens_stability) -table(df3_plot$ens_stability) - # ensemble average of predictors by position -mean_ens_stability_by_position <- df3_plot %>% +avg_stability_by_position <- df3 %>% dplyr::group_by(position) %>% - dplyr::summarize(avg_ens_stability = mean(ens_stability)) + dplyr::summarize(avg_stability_scaled_pos = mean(avg_stability_scaled)) -# REscale b/w -1 and 1 -#en_stab_min = min(mean_ens_stability_by_position['avg_ens_stability']) -#en_stab_max = max(mean_ens_stability_by_position['avg_ens_stability']) +min(avg_stability_by_position$avg_stability_scaled_pos) +max(avg_stability_by_position$avg_stability_scaled_pos) -# scale the average stability value between -1 and 1 -# mean_ens_by_position['averaged_stability3_scaled'] = lapply(mean_ens_by_position['averaged_stability3'] -# , function(x) ifelse(x < 0, x/abs(en3_min), x/en3_max)) - -mean_ens_stability_by_position['avg_ens_stability_scaled'] = lapply(mean_ens_stability_by_position['avg_ens_stability'] +avg_stability_by_position['avg_stability_scaled_pos_scaled'] = lapply(avg_stability_by_position['avg_stability_scaled_pos'] , function(x) { - scales::rescale(x, to = c(-1,1) + scales::rescale_mid(x, to = c(-1,1) #, from = c(en_stab_min,en_stab_max)) + , mid = 0 , from = c(0,1)) }) cat(paste0('Average stability scores:\n' - , head(mean_ens_stability_by_position['avg_ens_stability']) + , head(avg_stability_by_position['avg_stability_scaled_pos']) , '\n---------------------------------------------------------------' , '\nAverage stability scaled scores:\n' - , head(mean_ens_stability_by_position['avg_ens_stability_scaled']))) + , head(avg_stability_by_position['avg_stability_scaled_pos_scaled']) + )) + +all(avg_stability_by_position['avg_stability_scaled_pos'] == avg_stability_by_position['avg_stability_scaled_pos_scaled']) # convert to a data frame -mean_ens_stability_by_position = as.data.frame(mean_ens_stability_by_position) +avg_stability_by_position = as.data.frame(avg_stability_by_position) -#FIXME: sanity checks -# TODO: predetermine the bounds -# l_bound_ens = min(mean_ens_stability_by_position['avg_ens_stability_scaled']) -# u_bound_ens = max(mean_ens_stability_by_position['avg_ens_stability_scaled']) -# -# if ( (l_bound_ens == -1) && (u_bound_ens == 1) ){ -# cat(paste0("PASS: ensemble stability scores averaged by position and then scaled" -# , "\nmin ensemble averaged stability: ", l_bound_ens -# , "\nmax ensemble averaged stability: ", u_bound_ens)) -# }else{ -# cat(paste0("FAIL: avergaed duet scores could not be scaled b/w -1 and 1" -# , "\nmin ensemble averaged stability: ", l_bound_ens -# , "\nmax ensemble averaged stability: ", u_bound_ens)) -# quit() -# } ################################################################## # output #write.csv(combined_df, outfile_mean_ens_st_aff -write.csv(mean_ens_stability_by_position +write.csv(avg_stability_by_position , outfile_mean_ens_st_aff , row.names = F) cat("Finished writing file:\n" , outfile_mean_ens_st_aff - , "\nNo. of rows:", nrow(mean_ens_stability_by_position) - , "\nNo. of cols:", ncol(mean_ens_stability_by_position)) + , "\nNo. of rows:", nrow(avg_stability_by_position) + , "\nNo. of cols:", ncol(avg_stability_by_position)) # end of script #=============================================================== diff --git a/scripts/plotting/structure_figures/replaceBfactor_pdb_stability.R b/scripts/plotting/structure_figures/replaceBfactor_pdb_stability.R index e3be30f..fa85162 100644 --- a/scripts/plotting/structure_figures/replaceBfactor_pdb_stability.R +++ b/scripts/plotting/structure_figures/replaceBfactor_pdb_stability.R @@ -1,5 +1,11 @@ #!/usr/bin/env Rscript - + +#source("~/git/LSHTM_analysis/config/alr.R") +source("~/git/LSHTM_analysis/config/embb.R") +#source("~/git/LSHTM_analysis/config/katg.R") +#source("~/git/LSHTM_analysis/config/gid.R") +#source("~/git/LSHTM_analysis/config/pnca.R") +#source("~/git/LSHTM_analysis/config/rpob.R") ######################################################### # TASK: Replace B-factors in the pdb file with the mean # normalised stability values. @@ -22,25 +28,26 @@ cat(c(getwd(),"\n")) library(bio3d) require("getopt", quietly = TRUE) # cmd parse arguments #======================================================== -#drug = "pyrazinamide" -#gene = "pncA" +#drug = "" +#gene = "" -# command line args -spec = matrix(c( - "drug" , "d", 1, "character", - "gene" , "g", 1, "character" -), byrow = TRUE, ncol = 4) - -opt = getopt(spec) - -drug = opt$drug -gene = opt$gene - -if(is.null(drug)|is.null(gene)) { - stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)") -} +# # command line args +# spec = matrix(c( +# "drug" , "d", 1, "character", +# "gene" , "g", 1, "character" +# ), byrow = TRUE, ncol = 4) +# +# opt = getopt(spec) +# +# drug = opt$drug +# gene = opt$gene +# +# if(is.null(drug)|is.null(gene)) { +# stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)") +# } #======================================================== -gene_match = paste0(gene,"_p.") +cat(gene) +gene_match = paste0(gene,"_p."); cat(gene_match) cat(gene_match) #============= @@ -64,7 +71,6 @@ cat(paste0("Input file:", infile_pdb) ) in_filename_mean_stability = paste0(tolower(gene), "_mean_ens_stability.csv") infile_mean_stability = paste0(outdir_plots, "/", in_filename_mean_stability) - cat(paste0("Input file:", infile_mean_stability) ) #======= @@ -150,12 +156,12 @@ plot(density(df_duet$b) #============= #hist(my_df$averaged_duet -hist(my_df$avg_ens_stability_scaled +hist(my_df$avg_stability_scaled_pos_scaled , xlab = "" , main = "mean stability values") #plot(density(my_df$averaged_duet) -plot(density(my_df$avg_ens_stability_scaled) +plot(density(my_df$avg_stability_scaled_pos_scaled) , xlab = "" , main = "mean stability values") @@ -178,7 +184,7 @@ plot(density(my_df$avg_ens_stability_scaled) # this makes all the B-factor values in the non-matched positions as NA #df_duet$b = my_df$averaged_duet_scaled[match(df_duet$resno, my_df$position)] -df_duet$b = my_df$avg_ens_stability_scaled[match(df_duet$resno, my_df$position)] +df_duet$b = my_df$avg_stability_scaled_pos_scaled[match(df_duet$resno, my_df$position)] #========= # step 2_P1 @@ -194,8 +200,8 @@ na_rep = 2 df_duet$b[is.na(df_duet$b)] = na_rep # # sanity check: should be 0 and True -# # duet and lig -# if ( (sum(df_duet$b == na_rep) == b_na_duet) && (sum(df_lig$b == na_rep) == b_na_lig) ) { +# # duet +# if ( (sum(df_duet$b == na_rep) == b_na_duet) { # print ("PASS: NA's replaced with 0s successfully in df_duet and df_lig") # } else { # print("FAIL: NA replacement in df_duet NOT successful") @@ -205,20 +211,13 @@ df_duet$b[is.na(df_duet$b)] = na_rep # max(df_duet$b); min(df_duet$b) # # # sanity checks: should be True -# if( (max(df_duet$b) == max(my_df$avg_ens_stability_scaled)) & (min(df_duet$b) == min(my_df$avg_ens_stability_scaled)) ){ +# if( (max(df_duet$b) == max(my_df$avg_stability_scaled_pos_scaled)) & (min(df_duet$b) == min(my_df$avg_stability_scaled_pos_scaled)) ){ # print("PASS: B-factors replaced correctly in df_duet") # } else { # print ("FAIL: To replace B-factors in df_duet") # quit() # } -# if( (max(df_lig$b) == max(my_df$avg_ens_affinity_scaled)) & (min(df_lig$b) == min(my_df$avg_ens_affinity_scaled)) ){ -# print("PASS: B-factors replaced correctly in df_lig") -# } else { -# print ("FAIL: To replace B-factors in df_lig") -# quit() -# } - #========= # step 3_P1 #=========