From 6d9412d23266ed1833629c84133e8b7b8c0eafd5 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Thu, 26 Aug 2021 16:35:46 +0100 Subject: [PATCH 01/51] playing with dm_om (other)plots data and graph on gid branch --- scripts/functions/plotting_globals.R | 3 +- scripts/plotting/get_plotting_dfs.R | 203 +------ scripts/plotting/other_plots_combined.R | 13 +- scripts/plotting/other_plots_data.R | 693 ++++++++++++++++-------- 4 files changed, 502 insertions(+), 410 deletions(-) diff --git a/scripts/functions/plotting_globals.R b/scripts/functions/plotting_globals.R index cfd2848..c28047e 100644 --- a/scripts/functions/plotting_globals.R +++ b/scripts/functions/plotting_globals.R @@ -32,7 +32,8 @@ import_dirs <- function(drug_name, gene_name) { #=============================== # mcsm ligand distance cut off #=============================== -#mcsm_lig_cutoff <<- 10 +LigDist_colname <<- "ligand_distance" +LigDist_cutoff <<- 10 #================== # Angstroms symbol diff --git a/scripts/plotting/get_plotting_dfs.R b/scripts/plotting/get_plotting_dfs.R index a9e78e9..2dae471 100644 --- a/scripts/plotting/get_plotting_dfs.R +++ b/scripts/plotting/get_plotting_dfs.R @@ -25,8 +25,8 @@ source("../functions/bp_subcolours.R") # variables for lig #==================== -LigDist_colname = "ligand_distance" -LigDist_cutoff = 10 +#LigDist_colname = "ligand_distance" +#LigDist_cutoff = 10 #=========== # input @@ -54,10 +54,15 @@ pd_df = plotting_data(mcsm_df , lig_dist_colname = LigDist_colname , lig_dist_cutoff = LigDist_cutoff) -my_df = pd_df[[1]] -my_df_u = pd_df[[2]] # this forms one of the input for combining_dfs_plotting() -my_df_u_lig = pd_df[[3]] -dup_muts = pd_df[[4]] +my_df = pd_df[[1]] +my_df_u = pd_df[[2]] # this forms one of the input for combining_dfs_plotting() + +max_ang <- round(max(my_df_u[LigDist_colname])) +min_ang <- round(min(my_df_u[LigDist_colname])) + +cat("\nLigand distance cut off, colname:", LigDist_colname + , "\nThe max distance", gene, "structure df" , ":", max_ang, "\u212b" + , "\nThe min distance", gene, "structure df" , ":", min_ang, "\u212b") #-------------------------------- # call: combining_dfs_plotting() @@ -81,14 +86,22 @@ all_plot_dfs = combining_dfs_plotting(my_df_u , lig_dist_colname = LigDist_colname , lig_dist_cutoff = LigDist_cutoff) -merged_df2 = all_plot_dfs[[1]] -merged_df3 = all_plot_dfs[[2]] -merged_df2_comp = all_plot_dfs[[3]] -merged_df3_comp = all_plot_dfs[[4]] -merged_df2_lig = all_plot_dfs[[5]] -merged_df3_lig = all_plot_dfs[[6]] -merged_df2_comp_lig = all_plot_dfs[[7]] -merged_df3_comp_lig = all_plot_dfs[[8]] +merged_df2 = all_plot_dfs[[1]] +merged_df3 = all_plot_dfs[[2]] +#====================================================================== +# read other files +infilename_dynamut = paste0("~/git/Data/", drug, "/output/dynamut_results/", gene + , "_complex_dynamut_norm.csv") + +infilename_dynamut2 = paste0("~/git/Data/", drug, "/output/dynamut_results/dynamut2/", gene + , "_complex_dynamut2_norm.csv") + +infilename_mcsm_na = paste0("~/git/Data/", drug, "/output/mcsm_na_results/", gene + , "_complex_mcsm_na_norm.csv") + +dynamut_df = read.csv(infilename_dynamut) +dynamut2_df = read.csv(infilename_dynamut2) +mcsm_na_df = read.csv(infilename_mcsm_na) #################################################################### # Data for subcols barplot (~heatmpa) @@ -168,61 +181,6 @@ subcolsR_ps <- ColourPalleteMulti(subcols_df_ps, "duet_outcome", "my_grp_r") print(paste0("Colour palette generated for my_grp: ", length(subcols_ps), " colours")) print(paste0("Colour palette generated for my_grp_r: ", length(subcolsR_ps), " colours")) -#======================= -# Data for sub colours -# barplot: LIG -#======================= -cat("\nNo. of cols to select:", length(cols_to_select)) - -subcols_df_lig = merged_df3_lig[, cols_to_select] - -cat("\nNo of unique positions for LIG:" - , length(unique(subcols_df_lig$position))) - -# should be a factor -if (is.factor(subcols_df_lig$ligand_outcome)){ - cat("\nLigand_outcome is factor") - table(subcols_df_lig$ligand_outcome) -}else{ - cat("\nConverting ligand_outcome to factor") - subcols_df_lig$ligand_outcome = as.factor(subcols_df_lig$ligand_outcome) - table(subcols_df_lig$ligand_outcome) -} - -# should be -1 and 1 -min(subcols_df_lig$affinity_scaled) -max(subcols_df_lig$affinity_scaled) - -tapply(subcols_df_lig$affinity_scaled, subcols_df_lig$ligand_outcome, min) -tapply(subcols_df_lig$affinity_scaled, subcols_df_lig$ligand_outcome, max) - -# check unique values in normalised data -cat("\nNo. of unique values in affinity scaled, no rounding:" - , length(unique(subcols_df_lig$affinity_scaled))) - -# No rounding -my_grp_lig = subcols_df_lig$affinity_scaled; length(my_grp_lig) - -# Add rounding is to be used -n = 3 -subcols_df_lig$affinity_scaledR = round(subcols_df_lig$affinity_scaled, n) - -cat("\nNo. of unique values in duet scaled", n, "places rounding:" - , length(unique(subcols_df_lig$affinity_scaledR))) - -my_grp_lig_r = subcols_df_lig$affinity_scaledR # rounding - -# Add grp cols -subcols_df_lig$group_lig <- paste0(subcols_df_lig$ligand_outcome, "_", my_grp_lig, sep = "") -subcols_df_lig$group_ligR <- paste0(subcols_df_lig$ligand_outcome, "_", my_grp_lig_r, sep = "") - -# Call the function to create the palette based on the group defined above -subcols_lig <- ColourPalleteMulti(subcols_df_lig, "ligand_outcome", "my_grp_lig") -subcolsR_lig <- ColourPalleteMulti(subcols_df_lig, "ligand_outcome", "my_grp_lig_r") - -print(paste0("Colour palette generated for my_grp: ", length(subcols_lig), " colours")) -print(paste0("Colour palette generated for my_grp_r: ", length(subcolsR_lig), " colours")) - #################################################################### # Data for logoplots #################################################################### @@ -472,113 +430,6 @@ if (nrow(corr_ps_df3) == nrow(merged_df3) && nrow(merged_df3_comp) == check1) { , "\nGot: ", check1) } -#================================= -# Data for Correlation plots: LIG -#================================= -cat("\n==========================================" - , "\nCORR PLOTS data: LIG" - , "\n===========================================") - -df_lig = merged_df2_lig - -table(df_lig$ligand_outcome) - -#-------------------- -# adding log cols : NEW UNCOMMENT -#-------------------- -#df_lig$log10_or_mychisq = log10(df_lig$or_mychisq) -#df_lig$neglog_pval_fisher = -log10(df_lig$pval_fisher) - -##df_lig$log10_or_kin = log10(df_lig$or_kin) -##df_lig$neglog_pwald_kin = -log10(df_lig$pwald_kin) - -#---------------------------- -# columns for corr plots:PS -#---------------------------- -# subset data to generate pairwise correlations -cols_to_select = c("mutationinformation" - , "affinity_scaled" - #, "mutation_info_labels" - , "asa" - , "rsa" - , "rd_values" - , "kd_values" - , "log10_or_mychisq" - , "neglog_pval_fisher" - ##, "or_kin" - ##, "neglog_pwald_kin" - , "af" - ##, "af_kin" - , "ligand_outcome" - , drug) - -corr_data_lig = df_lig[, cols_to_select] - -dim(corr_data_lig) - -#-------------------------------------- -# assign nice colnames (for display) -#-------------------------------------- -my_corr_colnames = c("Mutation" - , "Ligand Affinity" - #, "Mutation class" - , "ASA" - , "RSA" - , "RD" - , "KD" - , "Log (OR)" - , "-Log (P)" - ##, "Adjusted (OR)" - ##, "-Log (P wald)" - , "MAF" - ##, "MAF_kin" - , "ligand_outcome" - , drug) - -length(my_corr_colnames) - -colnames(corr_data_lig) -colnames(corr_data_lig) <- my_corr_colnames -colnames(corr_data_lig) - -start = 1 -end = which(colnames(corr_data_lig) == drug); end # should be the last column -offset = 1 - -#============================= -# Corr data for plots: LIG -# big_df lig: ~ merged_df2_lig -#============================== -#corr_lig_df2 = corr_data_lig[start:(end-offset)] # without drug -corr_lig_df2 = corr_data_lig[start:end] -head(corr_lig_df2) - -#============================= -# Corr data for plots: LIG -# short_df lig: ~ merged_df3_lig -#============================== -corr_lig_df3 = corr_lig_df2[!duplicated(corr_lig_df2$Mutation),] - -na_or_lig = sum(is.na(corr_lig_df3$`Log (OR)`)) -check1_lig = nrow(corr_lig_df3) - na_or_lig - -if (nrow(corr_lig_df3) == nrow(merged_df3_lig) && nrow(merged_df3_comp_lig) == check1_lig) { - cat( "\nPASS: No. of rows for corr_lig_df3 match" - , "\nPASS: No. of OR values checked: " , check1_lig) -} else { - cat("\nFAIL: Numbers mismatch:" - , "\nExpected nrows: ", nrow(merged_df3_lig) - , "\nGot: ", nrow(corr_ps_df3_lig) - , "\nExpected OR values: ", nrow(merged_df3_comp_lig) - , "\nGot: ", check1_lig) -} - -# remove unnecessary columns -identical(corr_data_lig, corr_lig_df2) -identical(corr_data_ps, corr_ps_df2) - -#rm(df_ps, df_lig, corr_data_ps, corr_data_lig) - ######################################################################## # End of script ######################################################################## diff --git a/scripts/plotting/other_plots_combined.R b/scripts/plotting/other_plots_combined.R index d927808..3047f38 100644 --- a/scripts/plotting/other_plots_combined.R +++ b/scripts/plotting/other_plots_combined.R @@ -35,7 +35,7 @@ plot_dr_other_combined_labelled = paste0(plotdir,"/", dr_other_combined_labell #my_comparisons <- list( c(dr_muts_col, other_muts_col) ) my_comparisons <- list( c("DM", "OM") ) -my_ats = 22# axis text size +my_ats = 22 # axis text size my_als = 20 # axis label size my_fls = 20 # facet label size my_pts = 22 # plot title size @@ -45,12 +45,15 @@ my_pts = 22 # plot title size #=========== # Plot1: PS #=========== -my_stat_ps = compare_means(param_value~mutation_info, group.by = "param_type" - , data = df_lf_ps, paired = FALSE, p.adjust.method = "BH") +# my_stat_ps = compare_means(param_value~mutation_info +# , group.by = "param_type" +# , data = df_lf_ps +# , paired = FALSE +# , p.adjust.method = "BH") y_value = "param_value" -p1 = ggplot(df_lf_ps, aes(x = mutation_info +p1 = ggplot(lf_duet, aes(x = mutation_info , y = eval(parse(text=y_value)) )) + facet_wrap(~ param_type , nrow = 1 @@ -61,7 +64,7 @@ p1 = ggplot(df_lf_ps, aes(x = mutation_info geom_point(position = position_jitterdodge(dodge.width=0.01) , alpha = 0.5 , show.legend = FALSE - , aes(colour = factor(duet_outcome))) + + , aes(colour = duet_outcome)) + theme(axis.text.x = element_text(size = my_ats) , axis.text.y = element_text(size = my_ats , angle = 0 diff --git a/scripts/plotting/other_plots_data.R b/scripts/plotting/other_plots_data.R index df5c1e3..8fc9e00 100644 --- a/scripts/plotting/other_plots_data.R +++ b/scripts/plotting/other_plots_data.R @@ -5,21 +5,18 @@ ######################################################### #======================================================================= # working dir and loading libraries -getwd() -setwd("~/git/LSHTM_analysis/scripts/plotting") -getwd() +# getwd() +# setwd("~/git/LSHTM_analysis/scripts/plotting") +# getwd() -#source("Header_TT.R") -library(ggplot2) -library(data.table) -library(dplyr) -library(tidyverse) -source("combining_dfs_plotting.R") - -rm(merged_df2, merged_df2_comp, merged_df2_lig, merged_df2_comp_lig - , merged_df3_comp, merged_df3_comp_lig - , my_df_u, my_df_u_lig) +# make cmd +# globals +# drug = "streptomycin" +# gene = "gid" +#source("get_plotting_dfs.R") +#======================================================================= +# MOVE TO COMBINE or singular file for deepddg cols_to_select = c("mutation", "mutationinformation" , "wild_type", "position", "mutant_type" @@ -27,275 +24,515 @@ cols_to_select = c("mutation", "mutationinformation" merged_df3_short = merged_df3[, cols_to_select] -# write merged_df3 to generate structural figure -write.csv(merged_df3_short, "merged_df3_short.csv") +infilename_mcsm_f_snps <- paste0("~/git/Data/", drug, "/output/", gene + , "_mcsm_formatted_snps.csv") +mcsm_f_snps<- read.csv(infilename_mcsm_f_snps, header = F) +names(mcsm_f_snps) <- "mutationinformation" + +# write merged_df3 to generate structural figure on chimera +#write.csv(merged_df3_short, "merged_df3_short.csv") #======================================================================== -#%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT: PS -#%%%%%%%%%%%%%%%%%%%% -df_ps = merged_df3 +# MOVE TO COMBINE or singular file for deepddg #============================ -# adding foldx scaled values +# adding deepddg scaled values # scale data b/w -1 and 1 #============================ -n = which(colnames(df_ps) == "ddg"); n +n = which(colnames(merged_df3) == "deepddg"); n -my_min = min(df_ps[,n]); my_min -my_max = max(df_ps[,n]); my_max +my_min = min(merged_df3[,n]); my_min +my_max = max(merged_df3[,n]); my_max -df_ps$foldx_scaled = ifelse(df_ps[,n] < 0 - , df_ps[,n]/abs(my_min) - , df_ps[,n]/my_max) +merged_df3$deepddg_scaled = ifelse(merged_df3[,n] < 0 + , merged_df3[,n]/abs(my_min) + , merged_df3[,n]/my_max) # sanity check -my_min = min(df_ps$foldx_scaled); my_min -my_max = max(df_ps$foldx_scaled); my_max +my_min = min(merged_df3$deepddg_scaled); my_min +my_max = max(merged_df3$deepddg_scaled); my_max if (my_min == -1 && my_max == 1){ - cat("PASS: foldx ddg successfully scaled b/w -1 and 1" - , "\nProceeding with assigning foldx outcome category") + cat("PASS: DeepDDG successfully scaled b/w -1 and 1" + #, "\nProceeding with assigning deep outcome category") + , "\n") }else{ - cat("FAIL: could not scale foldx ddg values" + cat("FAIL: could not scale DeepDDG ddg values" , "Aborting!") } -#================================ -# adding foldx outcome category -# ddg<0 = "Stabilising" (-ve) -#================================= +#======================================================================== +# cols to select -c1 = table(df_ps$ddg < 0) -df_ps$foldx_outcome = ifelse(df_ps$ddg < 0, "Stabilising", "Destabilising") -c2 = table(df_ps$ddg < 0) +cols_mcsm_df <- merged_df3[, c("mutationinformation", "mutation" + , "mutation_info", "position" + , LigDist_colname + , "duet_stability_change", "duet_scaled", "duet_outcome" + , "ligand_affinity_change", "affinity_scaled", "ligand_outcome" + , "ddg_foldx", "foldx_scaled", "foldx_outcome" + , "deepddg", "deepddg_scaled", "deepddg_outcome" + , "asa", "rsa" + , "rd_values", "kd_values" + , "log10_or_mychisq", "neglog_pval_fisher", "af")] -if ( all(c1 == c2) ){ - cat("PASS: foldx outcome successfully created") -}else{ - cat("FAIL: foldx outcome could not be created. Aborting!") - exit() +cols_mcsm_na_df <- mcsm_na_df[, c("mutationinformation" + , "mcsm_na_affinity", "mcsm_na_scaled" + , "mcsm_na_outcome")] +# entire dynamut_df + +cols_dynamut2_df <- dynamut2_df[, c("mutationinformation" + , "ddg_dynamut2", "ddg_dynamut2_scaled" + , "ddg_dynamut2_outcome")] + +n_comb_cols = length(cols_mcsm_df) + length(cols_mcsm_na_df) + + length(dynamut_df) + length(cols_dynamut2_df); n_comb_cols + +i1<- intersect(names(cols_mcsm_df), names(cols_mcsm_na_df)) +i2<- intersect(names(dynamut_df), names(cols_dynamut2_df)) +merging_cols <- intersect(i1, i2) +cat("\nmerging_cols:", merging_cols) + +if (merging_cols == "mutationinformation") { + cat("\nStage 1: Found common col between dfs, checking values in it...") + c1 <- all(mcsm_f_snps[[merging_cols]]%in%cols_mcsm_df[[merging_cols]]) + c2 <- all(mcsm_f_snps[[merging_cols]]%in%cols_mcsm_na_df[[merging_cols]]) + c3 <- all(mcsm_f_snps[[merging_cols]]%in%dynamut_df[[merging_cols]]) + c4 <- all(mcsm_f_snps[[merging_cols]]%in%cols_dynamut2_df[[merging_cols]]) + cols_check <- c(c1, c2, c3, c4) + expected_cols = n_comb_cols - ( length(cols_check) - 1) + if (all(cols_check)){ + cat("\nStage 2:Proceeding with merging dfs:\n") + comb_df <- Reduce(inner_join, list(cols_mcsm_df + , cols_mcsm_na_df + , dynamut_df + , cols_dynamut2_df)) + comb_df_s = arrange(comb_df, position) + + # if ( nrow(comb_df_s) == nrow(mcsm_f_snps) && ncol(comb_df_s) == expected_cols) { + # cat("\Stage3, PASS: dfs merged sucessfully" + # , "\nnrow of merged_df: ", nrow(comb_df_s) + # , "\nncol of merged_df:", ncol(comb_df_s)) + # } + + } } +names(comb_df_s) #======================================================================= -# name tidying -df_ps$mutation_info = as.factor(df_ps$mutation_info) -df_ps$duet_outcome = as.factor(df_ps$duet_outcome) -df_ps$foldx_outcome = as.factor(df_ps$foldx_outcome) -df_ps$ligand_outcome = as.factor(df_ps$ligand_outcome) +fact_cols = colnames(comb_df_s)[grepl( "_outcome|_info", colnames(comb_df_s) )] +fact_cols +lapply(comb_df_s[, fact_cols], class) +comb_df_s[,fact_cols] <- lapply(comb_df_s[,cols],as.factor) -# check -table(df_ps$mutation_info) +if (any(lapply(comb_df_s[, fact_cols], class) == "character")){ + cat("\nChanging cols to factor") + comb_df_s[, fact_cols] <- lapply(comb_df_s[, fact_cols],as.factor) + if (all(lapply(comb_df_s[, fact_cols], class) == "factor")){ + cat("\nSuccessful: cols changed to factor") + } +} +lapply(comb_df_s[, fact_cols], class) + +#======================================================================= +table(comb_df_s$mutation_info) # further checks to make sure dr and other muts are indeed unique -dr_muts = df_ps[df_ps$mutation_info == dr_muts_col,] +dr_muts = comb_df_s[comb_df_s$mutation_info == dr_muts_col,] dr_muts_names = unique(dr_muts$mutation) -other_muts = df_ps[df_ps$mutation_info == other_muts_col,] +other_muts = comb_df_s[comb_df_s$mutation_info == other_muts_col,] other_muts_names = unique(other_muts$mutation) if ( table(dr_muts_names%in%other_muts_names)[[1]] == length(dr_muts_names) && table(other_muts_names%in%dr_muts_names)[[1]] == length(other_muts_names) ){ cat("PASS: dr and other muts are indeed unique") }else{ - cat("FAIL: dr adn others muts are NOT unique!") + cat("FAIL: dr and others muts are NOT unique!") quit() } +# pretty display names i.e. labels to reduce major code duplication later +foo_cnames = data.frame(colnames(comb_df_s)) +names(foo_cnames) <- "old_name" -#%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT: LIG -#%%%%%%%%%%%%%%%%%%%% +stability_suffix <- paste0(delta_symbol, delta_symbol, "G") +flexibility_suffix <- paste0(delta_symbol, delta_symbol, "S") -df_lig = merged_df3_lig +lig_dn = paste0("Ligand distance (", angstroms_symbol, ")"); lig_dn +duet_dn = paste0("DUET ", stability_suffix); duet_dn +foldx_dn = paste0("FoldX ", stability_suffix); foldx_dn +deepddg_dn = paste0("Deepddg " , stability_suffix); deepddg_dn +mcsm_na_dn = paste0("mCSM-NA affinity ", stability_suffix); mcsm_na_dn +dynamut_dn = paste0("Dynamut ", stability_suffix); dynamut_dn +dynamut2_dn = paste0("Dynamut2 " , stability_suffix); dynamut2_dn +encom_ddg_dn = paste0("EnCOM " , stability_suffix); encom_ddg_dn +encom_dds_dn = paste0("EnCOM " , flexibility_suffix ); encom_dds_dn +sdm_dn = paste0("SDM " , stability_suffix); sdm_dn +mcsm_dn = paste0("mCSM " , stability_suffix ); mcsm_dn -# name tidying -df_lig$mutation_info = as.factor(df_lig$mutation_info) -df_lig$duet_outcome = as.factor(df_lig$duet_outcome) -#df_lig$ligand_outcome = as.factor(df_lig$ligand_outcome) - -# check -table(df_lig$mutation_info) - -#======================================================================== -#=========== -# Data: ps -#=========== -# keep similar dtypes cols together -cols_to_select_ps = c("mutationinformation", "mutation", "position", "mutation_info" - , "duet_outcome" +# Change colnames of some columns using datatable +comb_df_sl = comb_df_s +names(comb_df_sl) +setnames(comb_df_sl + , old = c("asa", "rsa", "rd_values", "kd_values" + , "log10_or_mychisq", "neglog_pval_fisher", "af" + , LigDist_colname , "duet_scaled" - , "ligand_distance" - , "asa" - , "rsa" - , "rd_values" - , "kd_values") + , "foldx_scaled" + , "deepddg_scaled" + , "mcsm_na_scaled" + , "ddg_dynamut_scaled" + , "ddg_dynamut2_scaled" + , "ddg_encom_scaled" + , "dds_encom_scaled" + , "ddg_sdm" + , "ddg_mcsm") + + , new = c("ASA", "RSA", "RD", "KD" + , "Log10 (OR)", "-Log (P)", "MAF" + , lig_dn + , duet_dn + , foldx_dn + , deepddg_dn + , mcsm_na_dn + , dynamut_dn + , dynamut2_dn + , encom_ddg_dn + , encom_dds_dn + , sdm_dn + , mcsm_dn) + ) -df_wf_ps = df_ps[, cols_to_select_ps] +foo_cnames <- cbind(foo_cnames, colnames(comb_df_sl)) -pivot_cols_ps = cols_to_select_ps[1:5]; pivot_cols_ps +# some more pretty labels +table(comb_df_sl$mutation_info) -expected_rows_lf_ps = nrow(df_wf_ps) * (length(df_wf_ps) - length(pivot_cols_ps)) -expected_rows_lf_ps +levels(comb_df_sl$mutation_info)[levels(comb_df_sl$mutation_info)==dr_muts_col] <- "DM" +levels(comb_df_sl$mutation_info)[levels(comb_df_sl$mutation_info)==other_muts_col] <- "OM" + +table(comb_df_sl$mutation_info) + +####################################################################### +#====================== +# Selecting dfs +# with appropriate cols +#======================= +static_cols_start = c("mutationinformation" + , "position" + , "mutation" + , "mutation_info") + +static_cols_end = c(lig_dn + , "ASA" + , "RSA" + , "RD" + , "KD") + +# ordering is important! + +######################################################################### +#============== +# DUET: LF +#============== +cols_to_select_duet = c(static_cols_start, c("duet_outcome", duet_dn), static_cols_end) +wf_duet = comb_df_sl[, cols_to_select_duet] + +#pivot_cols_ps = cols_to_select_ps[1:5]; pivot_cols_ps +pivot_cols_duet = cols_to_select_duet[1: (length(static_cols_start) + 1)]; pivot_cols_duet + +expected_rows_lf = nrow(wf_duet) * (length(wf_duet) - length(pivot_cols_duet)) +expected_rows_lf # LF data: duet -df_lf_ps = gather(df_wf_ps, param_type, param_value, duet_scaled:kd_values, factor_key=TRUE) +lf_duet = gather(wf_duet + , key = param_type + , value = param_value + , all_of(duet_dn):tail(static_cols_end,1) + , factor_key = TRUE) -if (nrow(df_lf_ps) == expected_rows_lf_ps){ - cat("PASS: long format data created for duet") +if (nrow(lf_duet) == expected_rows_lf){ + cat("\nPASS: long format data created for ", duet_dn) }else{ - cat("FAIL: long format data could not be created for duet") - exit() + cat("\nFAIL: long format data could not be created for duet") + quit() } -str(df_wf_ps) -str(df_lf_ps) - -# assign pretty labels: param_type -levels(df_lf_ps$param_type); table(df_lf_ps$param_type) - -ligand_dist_colname = paste0("Distance to ligand (", angstroms_symbol, ")") -ligand_dist_colname - -duet_stability_name = paste0(delta_symbol, delta_symbol, "G") -duet_stability_name - -#levels(df_lf_ps$param_type)[levels(df_lf_ps$param_type)=="duet_scaled"] <- "Stability" -levels(df_lf_ps$param_type)[levels(df_lf_ps$param_type)=="duet_scaled"] <- duet_stability_name -#levels(df_lf_ps$param_type)[levels(df_lf_ps$param_type)=="ligand_distance"] <- "Ligand Distance" -levels(df_lf_ps$param_type)[levels(df_lf_ps$param_type)=="ligand_distance"] <- ligand_dist_colname -levels(df_lf_ps$param_type)[levels(df_lf_ps$param_type)=="asa"] <- "ASA" -levels(df_lf_ps$param_type)[levels(df_lf_ps$param_type)=="rsa"] <- "RSA" -levels(df_lf_ps$param_type)[levels(df_lf_ps$param_type)=="rd_values"] <- "RD" -levels(df_lf_ps$param_type)[levels(df_lf_ps$param_type)=="kd_values"] <- "KD" -# check -levels(df_lf_ps$param_type); table(df_lf_ps$param_type) - -# assign pretty labels: mutation_info -levels(df_lf_ps$mutation_info); table(df_lf_ps$mutation_info) -sum(table(df_lf_ps$mutation_info)) == nrow(df_lf_ps) - -levels(df_lf_ps$mutation_info)[levels(df_lf_ps$mutation_info)==dr_muts_col] <- "DM" -levels(df_lf_ps$mutation_info)[levels(df_lf_ps$mutation_info)==other_muts_col] <- "OM" -# check -levels(df_lf_ps$mutation_info); table(df_lf_ps$mutation_info) - ############################################################################ +#============== +# FoldX: LF +#============== +cols_to_select_foldx= c(static_cols_start, c("foldx_outcome", foldx_dn), static_cols_end) +wf_foldx = comb_df_sl[, cols_to_select_foldx] -#=========== -# LF data: LIG -#=========== -# keep similar dtypes cols together -cols_to_select_lig = c("mutationinformation", "mutation", "position", "mutation_info" - , "ligand_outcome" - - , "affinity_scaled" - #, "ligand_distance" - , "asa" - , "rsa" - , "rd_values" - , "kd_values") +pivot_cols_foldx = cols_to_select_foldx[1: (length(static_cols_start) + 1)]; pivot_cols_foldx -df_wf_lig = df_lig[, cols_to_select_lig] +expected_rows_lf = nrow(wf_foldx) * (length(wf_foldx) - length(pivot_cols_foldx)) +expected_rows_lf -pivot_cols_lig = cols_to_select_lig[1:5]; pivot_cols_lig +# LF data: duet +print("TESTXXXXXXXXXXXXXXXXXXXXX---------------------->>>>") +lf_foldx <<- gather(wf_foldx + , key = param_type + , value = param_value + , all_of(foldx_dn):tail(static_cols_end,1) + , factor_key = TRUE) -expected_rows_lf_lig = nrow(df_wf_lig) * (length(df_wf_lig) - length(pivot_cols_lig)) -expected_rows_lf_lig - -# LF data: foldx -df_lf_lig = gather(df_wf_lig, param_type, param_value, affinity_scaled:kd_values, factor_key=TRUE) - -if (nrow(df_lf_lig) == expected_rows_lf_lig){ - cat("PASS: long format data created for foldx") +if (nrow(lf_foldx) == expected_rows_lf){ + cat("\nPASS: long format data created for ", foldx_dn) }else{ - cat("FAIL: long format data could not be created for foldx") - exit() + cat("\nFAIL: long format data could not be created for duet") + quit() } -# assign pretty labels: param_type -levels(df_lf_lig$param_type); table(df_lf_lig$param_type) - -levels(df_lf_lig$param_type)[levels(df_lf_lig$param_type)=="affinity_scaled"] <- "Ligand Affinity" -#levels(df_lf_lig$param_type)[levels(df_lf_lig$param_type)=="ligand_distance"] <- "Ligand Distance" -levels(df_lf_lig$param_type)[levels(df_lf_lig$param_type)=="asa"] <- "ASA" -levels(df_lf_lig$param_type)[levels(df_lf_lig$param_type)=="rsa"] <- "RSA" -levels(df_lf_lig$param_type)[levels(df_lf_lig$param_type)=="rd_values"] <- "RD" -levels(df_lf_lig$param_type)[levels(df_lf_lig$param_type)=="kd_values"] <- "KD" -#check -levels(df_lf_lig$param_type); table(df_lf_lig$param_type) - -# assign pretty labels: mutation_info -levels(df_lf_lig$mutation_info); table(df_lf_lig$mutation_info) -sum(table(df_lf_lig$mutation_info)) == nrow(df_lf_lig) - -levels(df_lf_lig$mutation_info)[levels(df_lf_lig$mutation_info)==dr_muts_col] <- "DM" -levels(df_lf_lig$mutation_info)[levels(df_lf_lig$mutation_info)==other_muts_col] <- "OM" -# check -levels(df_lf_lig$mutation_info); table(df_lf_lig$mutation_info) - -############################################################################# -#=========== -# Data: foldx -#=========== -# keep similar dtypes cols together -cols_to_select_foldx = c("mutationinformation", "mutation", "position", "mutation_info" - , "foldx_outcome" - - , "foldx_scaled") - #, "ligand_distance" - #, "asa" - #, "rsa" - #, "rd_values" - #, "kd_values") - - -df_wf_foldx = df_ps[, cols_to_select_foldx] - -pivot_cols_foldx = cols_to_select_foldx[1:5]; pivot_cols_foldx - -expected_rows_lf_foldx = nrow(df_wf_foldx) * (length(df_wf_foldx) - length(pivot_cols_foldx)) -expected_rows_lf_foldx - -# LF data: foldx -df_lf_foldx = gather(df_wf_foldx, param_type, param_value, foldx_scaled, factor_key=TRUE) - -if (nrow(df_lf_foldx) == expected_rows_lf_foldx){ - cat("PASS: long format data created for foldx") -}else{ - cat("FAIL: long format data could not be created for foldx") - exit() -} - -foldx_stability_name = paste0(delta_symbol, delta_symbol, "G") -foldx_stability_name - -# assign pretty labels: param type -levels(df_lf_foldx$param_type); table(df_lf_foldx$param_type) - -#levels(df_lf_foldx$param_type)[levels(df_lf_foldx$param_type)=="foldx_scaled"] <- "Stability" -levels(df_lf_foldx$param_type)[levels(df_lf_foldx$param_type)=="foldx_scaled"] <- foldx_stability_name -#levels(df_lf_foldx$param_type)[levels(df_lf_foldx$param_type)=="ligand_distance"] <- "Ligand Distance" -#levels(df_lf_foldx$param_type)[levels(df_lf_foldx$param_type)=="asa"] <- "ASA" -#levels(df_lf_foldx$param_type)[levels(df_lf_foldx$param_type)=="rsa"] <- "RSA" -#levels(df_lf_foldx$param_type)[levels(df_lf_foldx$param_type)=="rd_values"] <- "RD" -#levels(df_lf_foldx$param_type)[levels(df_lf_foldx$param_type)=="kd_values"] <- "KD" -# check -levels(df_lf_foldx$param_type); table(df_lf_foldx$param_type) - -# assign pretty labels: mutation_info -levels(df_lf_foldx$mutation_info); table(df_lf_foldx$mutation_info) -sum(table(df_lf_foldx$mutation_info)) == nrow(df_lf_foldx) - -levels(df_lf_foldx$mutation_info)[levels(df_lf_foldx$mutation_info)==dr_muts_col] <- "DM" -levels(df_lf_foldx$mutation_info)[levels(df_lf_foldx$mutation_info)==other_muts_col] <- "OM" -# check -levels(df_lf_foldx$mutation_info); table(df_lf_foldx$mutation_info) - ############################################################################ +#============== +# Deepddg: LF +#============== +cols_to_select_deepddg = c(static_cols_start, c("deepddg_outcome", deepddg_dn), static_cols_end) +wf_deepddg = comb_df_sl[, cols_to_select_deepddg] -# clear excess variables -rm(cols_to_select_ps, cols_to_select_foldx, cols_to_select_lig - , pivot_cols_ps, pivot_cols_foldx, pivot_cols_lig - , expected_rows_lf_ps, expected_rows_lf_foldx, expected_rows_lf_lig - , my_max, my_min, na_count, na_count_df2, na_count_df3, dup_muts_nu - , c1, c2, n) +pivot_cols_deepddg = cols_to_select_deepddg[1: (length(static_cols_start) + 1)]; pivot_cols_deepddg + +expected_rows_lf = nrow(wf_deepddg) * (length(wf_deepddg) - length(pivot_cols_deepddg)) +expected_rows_lf + +# LF data: duet +lf_deepddg = gather(wf_deepddg + , key = param_type + , value = param_value + , all_of(deepddg_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_deepddg) == expected_rows_lf){ + cat("\nPASS: long format data created for ", deepddg_dn) +}else{ + cat("\nFAIL: long format data could not be created for duet") + quit() +} + +############################################################################ +#============== +# mCSM-NA: LF +#============== +cols_to_select_mcsm_na = c(static_cols_start, c("mcsm_na_outcome", mcsm_na_dn), static_cols_end) +wf_mcsm_na = comb_df_sl[, cols_to_select_mcsm_na] + +pivot_cols_mcsm_na = cols_to_select_mcsm_na[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm_na + +expected_rows_lf = nrow(wf_mcsm_na) * (length(wf_mcsm_na) - length(pivot_cols_mcsm_na)) +expected_rows_lf + +# LF data: duet +lf_mcsm_na = gather(wf_mcsm_na + , key = param_type + , value = param_value + , all_of(mcsm_na_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_mcsm_na) == expected_rows_lf){ + cat("\nPASS: long format data created for ", mcsm_na_dn) +}else{ + cat("\nFAIL: long format data could not be created for duet") + quit() +} + +############################################################################ +#============== +# Dynamut: LF +#============== +cols_to_select_dynamut = c(static_cols_start, c("ddg_dynamut_outcome", dynamut_dn), static_cols_end) +wf_dynamut = comb_df_sl[, cols_to_select_dynamut] + +pivot_cols_dynamut = cols_to_select_dynamut[1: (length(static_cols_start) + 1)]; pivot_cols_dynamut + +expected_rows_lf = nrow(wf_dynamut) * (length(wf_dynamut) - length(pivot_cols_dynamut)) +expected_rows_lf + +# LF data: duet +lf_dynamut = gather(wf_dynamut + , key = param_type + , value = param_value + , all_of(dynamut_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_dynamut) == expected_rows_lf){ + cat("\nPASS: long format data created for ", dynamut_dn) +}else{ + cat("\nFAIL: long format data could not be created for duet") + quit() +} + +############################################################################ +#============== +# Dynamut2: LF +#============== +cols_to_select_dynamut2 = c(static_cols_start, c("ddg_dynamut2_outcome", dynamut2_dn), static_cols_end) + +wf_dynamut2 = comb_df_sl[, cols_to_select_dynamut2] + +pivot_cols_dynamut2 = cols_to_select_dynamut2[1: (length(static_cols_start) + 1)]; pivot_cols_dynamut2 + +expected_rows_lf = nrow(wf_dynamut2) * (length(wf_dynamut2) - length(pivot_cols_dynamut2)) +expected_rows_lf + +# LF data: duet +lf_dynamut2 = gather(wf_dynamut2 + , key = param_type + , value = param_value + , all_of(dynamut2_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_dynamut2) == expected_rows_lf){ + cat("\nPASS: long format data created for ", dynamut2_dn) +}else{ + cat("\nFAIL: long format data could not be created for duet") + quit() +} + +############################################################################ +#============== +# EnCOM ddg: LF +#============== +cols_to_select_encomddg = c(static_cols_start, c("ddg_encom_outcome", encom_ddg_dn), static_cols_end) +wf_encomddg = comb_df_sl[, cols_to_select_encomddg] + +pivot_cols_encomddg = cols_to_select_encomddg[1: (length(static_cols_start) + 1)]; pivot_cols_encomddg + +expected_rows_lf = nrow(wf_encomddg ) * (length(wf_encomddg ) - length(pivot_cols_encomddg)) +expected_rows_lf + +# LF data: encomddg +lf_encomddg = gather(wf_encomddg + , key = param_type + , value = param_value + , all_of(encom_ddg_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_encomddg) == expected_rows_lf){ + cat("\nPASS: long format data created for ", encom_ddg_dn) +}else{ + cat("\nFAIL: long format data could not be created for duet") + quit() +} +############################################################################ +#============== +# EnCOM dds: LF +#============== +cols_to_select_encomdds = c(static_cols_start, c("dds_encom_outcome", encom_dds_dn), static_cols_end) +wf_encomdds = comb_df_sl[, cols_to_select_encomdds] + +pivot_cols_encomdds = cols_to_select_encomdds[1: (length(static_cols_start) + 1)]; pivot_cols_encomdds + +expected_rows_lf = nrow(wf_encomdds) * (length(wf_encomdds) - length(pivot_cols_encomdds)) +expected_rows_lf + +# LF data: encomddg +lf_encomdds = gather(wf_encomdds + , key = param_type + , value = param_value + , all_of(encom_dds_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_encomdds) == expected_rows_lf){ + cat("\nPASS: long format data created for", encom_dds_dn) +}else{ + cat("\nFAIL: long format data could not be created for duet") + quit() +} + +############################################################################ +#============== +# SDM: LF +#============== +cols_to_select_sdm = c(static_cols_start, c("ddg_sdm_outcome", sdm_dn), static_cols_end) +wf_sdm = comb_df_sl[, cols_to_select_sdm] + +pivot_cols_sdm = cols_to_select_sdm[1: (length(static_cols_start) + 1)]; pivot_cols_sdm + +expected_rows_lf = nrow(wf_sdm) * (length(wf_sdm) - length(pivot_cols_sdm)) +expected_rows_lf + +# LF data: encomddg +lf_sdm = gather(wf_sdm + , key = param_type + , value = param_value + , all_of(sdm_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_sdm) == expected_rows_lf){ + cat("\nPASS: long format data created for", sdm_dn) +}else{ + cat("\nFAIL: long format data could not be created for duet") + quit() +} + +############################################################################ +#============== +# mCSM: LF +#============== +cols_to_select_mcsm = c(static_cols_start, c("ddg_mcsm_outcome", mcsm_dn), static_cols_end) +wf_mcsm = comb_df_sl[, cols_to_select_mcsm] + +pivot_cols_mcsm = cols_to_select_mcsm[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm + +expected_rows_lf = nrow(wf_mcsm) * (length(wf_mcsm) - length(pivot_cols_mcsm)) +expected_rows_lf + +# LF data: encomddg +lf_mcsm = gather(wf_mcsm + , key = param_type + , value = param_value + , all_of(mcsm_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_mcsm) == expected_rows_lf){ + cat("\nPASS: long format data created for", mcsm_dn) +}else{ + cat("\nFAIL: long format data could not be created for duet") + quit() +} +############################################################################ +# # clear excess variables +# rm(all_plot_dfs +# , cols_dynamut2_df +# , cols_mcsm_df +# , cols_mcsm_na_df +# , comb_df +# , corr_data_ps +# , corr_ps_df3 +# , df_lf_ps +# , foo +# , foo_cnames +# , gene_metadata +# , logo_data +# , logo_data_or_mult +# , logo_data_plot +# , logo_data_plot_logor +# , logo_data_plot_or +# , my_data_snp +# , my_df +# , my_df_u +# , ols_mcsm_df +# , other_muts +# , pd_df +# , subcols_df_ps +# , tab_mt +# , wide_df_logor +# , wide_df_logor_m +# , wide_df_or +# , wide_df_or_mult +# , wt) +# +# +# rm(c3, c4, check1 +# , cols_check +# , cols_to_select +# , cols_to_select_deepddg +# , cols_to_select_duet +# , cols_to_select_dynamut +# , cols_to_select_dynamut2 +# , cols_to_select_encomddg +# , cols_to_select_encomdds +# , cols_to_select_mcsm +# , cols_to_select_mcsm_na +# , cols_to_select_sdm) From 6e01ef22c00070ef820817b2f71c0709129f81a6 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Thu, 26 Aug 2021 16:37:56 +0100 Subject: [PATCH 02/51] added stat_bp_stability.R which needs to be a function for generating stat plots --- scripts/functions/stat_bp_stability.R | 51 +++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 scripts/functions/stat_bp_stability.R diff --git a/scripts/functions/stat_bp_stability.R b/scripts/functions/stat_bp_stability.R new file mode 100644 index 0000000..a34b66f --- /dev/null +++ b/scripts/functions/stat_bp_stability.R @@ -0,0 +1,51 @@ +my_stat_ps = compare_means(param_value~mutation_info, group.by = "param_type" + , data = df_lf_ps, paired = FALSE, p.adjust.method = "BH") + +y_value = "param_value" + +stat_bp_mut <- function(df + , x_bp_cateog = "mutation_info" + , y_var = "param_value" + , facet_var = "param_type" + , scales = "free_y" + , title = "" + , col_categ = "duet_outcome" + , grp_comp = "my_comparisons" + , stat_method = "wilcox.test" + , my_paired = FALSE + #, stat_label = "p.format") + , stat_label = "p.signif" ) + +p1 = ggplot(df_lf_ps, aes(x = mutation_info + , y = eval(parse(text = y_value)) )) + + facet_wrap(~ param_type + , nrow = 1 + , scales = "free_y") + + geom_boxplot(fill = "white", outlier.colour = NA + #, position = position_dodge(width = 0.9) + , width = 0.2) + + geom_point(position = position_jitterdodge(dodge.width=0.01) + , alpha = 0.5 + , show.legend = FALSE + , aes(colour = factor(duet_outcome))) + + theme(axis.text.x = element_text(size = my_ats) + , axis.text.y = element_text(size = my_ats + , angle = 0 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = my_ats) + , axis.title.y = element_text(size = my_ats) + , plot.title = element_text(size = my_pts , hjust = 0.5, colour = "black", face = "bold") + , strip.background = element_rect(fill = "khaki2") + , strip.text.x = element_text(size = my_fls, colour = "black") + , legend.title = element_text(color = "black", size = my_als) + , legend.text = element_text(size = my_ats) + , legend.direction = "vertical") + + labs(title = "DUET" + , x = "" + , y = "")+ + stat_compare_means(comparisons = my_comparisons + , method = "wilcox.test" + , paired = FALSE + #, label = "p.format") + , label = "p.signif") \ No newline at end of file From da9bb677060a468a8fe7bcbd996bff5a5183d6eb Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Fri, 27 Aug 2021 13:01:52 +0100 Subject: [PATCH 03/51] added function for stats from lf data --- scripts/functions/lf_unpaired_stats.R | 21 +++++ scripts/functions/stat_bp_stability.R | 57 +++++++++++++- scripts/functions/test_lf_unpaired_stats.R | 17 ++++ scripts/plotting/other_plots_data.R | 92 +++++++++++----------- 4 files changed, 137 insertions(+), 50 deletions(-) create mode 100644 scripts/functions/lf_unpaired_stats.R create mode 100644 scripts/functions/test_lf_unpaired_stats.R diff --git a/scripts/functions/lf_unpaired_stats.R b/scripts/functions/lf_unpaired_stats.R new file mode 100644 index 0000000..28a8ad0 --- /dev/null +++ b/scripts/functions/lf_unpaired_stats.R @@ -0,0 +1,21 @@ +library(ggpubr) +################################################################### + +lf_unpaired_stats <- function(lf_data + , lf_stat_value = "param_value" + , lf_stat_group = "mutation_info" + , lf_col_statvars = "param_type" + , my_paired = FALSE + , stat_adj = "none"){ + + stat_formula = as.formula(paste0(lf_stat_value, "~", lf_stat_group)) + + my_stat_df = compare_means(stat_formula + , group.by = lf_col_statvars + , data = lf_data + , paired = my_paired + , p.adjust.method = stat_adj) + + + return(my_stat_df) +} \ No newline at end of file diff --git a/scripts/functions/stat_bp_stability.R b/scripts/functions/stat_bp_stability.R index a34b66f..8ca4a7f 100644 --- a/scripts/functions/stat_bp_stability.R +++ b/scripts/functions/stat_bp_stability.R @@ -1,8 +1,49 @@ -my_stat_ps = compare_means(param_value~mutation_info, group.by = "param_type" - , data = df_lf_ps, paired = FALSE, p.adjust.method = "BH") +library(ggpubr) +################################################################### + +my_unpaired_stats <- function(lf_data + , lf_stat_value = "param_value" + , lf_stat_group = "mutation_info" + , lf_col_statvars = "param_type" + , my_paired = FALSE + , stat_adj = "none"){ + + stat_formula = as.formula(paste0(lf_stat_value, "~", lf_stat_group)) + + my_stat_df = compare_means(stat_formula + , group.by = lf_col_statvars + , data = lf_data + , paired = my_paired + , p.adjust.method = stat_adj) + + + return(my_stat_df) +} + +##################### +# call stat function +##################### +stat_results_df <- my_unpaired_stats(lf_data = lf_duet + , lf_stat_value = "param_value" + , lf_stat_group = "mutation_info" + , lf_col_statvars = "param_type" + , my_paired = FALSE + , stat_adj = "none" +) y_value = "param_value" +################################# +my_comparisons <- list( c("DM", "OM") ) + +my_ats = 22 # axis text size +my_als = 20 # axis label size +my_fls = 20 # facet label size +my_pts = 22 # plot title size + +#################################### + + stat_bp_mut <- function(df , x_bp_cateog = "mutation_info" , y_var = "param_value" @@ -16,7 +57,12 @@ stat_bp_mut <- function(df #, stat_label = "p.format") , stat_label = "p.signif" ) -p1 = ggplot(df_lf_ps, aes(x = mutation_info + +############################# +y_value = "param_value" + + +p1 = ggplot(lf_duet, aes(x = mutation_info , y = eval(parse(text = y_value)) )) + facet_wrap(~ param_type , nrow = 1 @@ -48,4 +94,7 @@ p1 = ggplot(df_lf_ps, aes(x = mutation_info , method = "wilcox.test" , paired = FALSE #, label = "p.format") - , label = "p.signif") \ No newline at end of file + , label = "p.signif") + +p1 + diff --git a/scripts/functions/test_lf_unpaired_stats.R b/scripts/functions/test_lf_unpaired_stats.R new file mode 100644 index 0000000..9ec4aac --- /dev/null +++ b/scripts/functions/test_lf_unpaired_stats.R @@ -0,0 +1,17 @@ +setwd("~/git/LSHTM_analysis/scripts/functions") +source("lf_unpaired_stats.R") + +# run other_plots_data.R +# to get the df you want to test this function + + +##################### +# call stat function +##################### +stat_results_df <- lf_unpaired_stats(lf_data = lf_duet + , lf_stat_value = "param_value" + , lf_stat_group = "mutation_info" + , lf_col_statvars = "param_type" + , my_paired = FALSE + , stat_adj = "none" +) \ No newline at end of file diff --git a/scripts/plotting/other_plots_data.R b/scripts/plotting/other_plots_data.R index 8fc9e00..d2229b8 100644 --- a/scripts/plotting/other_plots_data.R +++ b/scripts/plotting/other_plots_data.R @@ -6,7 +6,7 @@ #======================================================================= # working dir and loading libraries # getwd() -# setwd("~/git/LSHTM_analysis/scripts/plotting") +setwd("~/git/LSHTM_analysis/scripts/plotting") # getwd() # make cmd @@ -14,7 +14,7 @@ # drug = "streptomycin" # gene = "gid" -#source("get_plotting_dfs.R") +source("get_plotting_dfs.R") #======================================================================= # MOVE TO COMBINE or singular file for deepddg @@ -492,47 +492,47 @@ if (nrow(lf_mcsm) == expected_rows_lf){ quit() } ############################################################################ -# # clear excess variables -# rm(all_plot_dfs -# , cols_dynamut2_df -# , cols_mcsm_df -# , cols_mcsm_na_df -# , comb_df -# , corr_data_ps -# , corr_ps_df3 -# , df_lf_ps -# , foo -# , foo_cnames -# , gene_metadata -# , logo_data -# , logo_data_or_mult -# , logo_data_plot -# , logo_data_plot_logor -# , logo_data_plot_or -# , my_data_snp -# , my_df -# , my_df_u -# , ols_mcsm_df -# , other_muts -# , pd_df -# , subcols_df_ps -# , tab_mt -# , wide_df_logor -# , wide_df_logor_m -# , wide_df_or -# , wide_df_or_mult -# , wt) -# -# -# rm(c3, c4, check1 -# , cols_check -# , cols_to_select -# , cols_to_select_deepddg -# , cols_to_select_duet -# , cols_to_select_dynamut -# , cols_to_select_dynamut2 -# , cols_to_select_encomddg -# , cols_to_select_encomdds -# , cols_to_select_mcsm -# , cols_to_select_mcsm_na -# , cols_to_select_sdm) +# clear excess variables +rm(all_plot_dfs + , cols_dynamut2_df + , cols_mcsm_df + , cols_mcsm_na_df + , comb_df + , corr_data_ps + , corr_ps_df3 + , df_lf_ps + , foo + , foo_cnames + , gene_metadata + , logo_data + , logo_data_or_mult + , logo_data_plot + , logo_data_plot_logor + , logo_data_plot_or + , my_data_snp + , my_df + , my_df_u + , ols_mcsm_df + , other_muts + , pd_df + , subcols_df_ps + , tab_mt + , wide_df_logor + , wide_df_logor_m + , wide_df_or + , wide_df_or_mult + , wt) + + +rm(c3, c4, check1 + , cols_check + , cols_to_select + , cols_to_select_deepddg + , cols_to_select_duet + , cols_to_select_dynamut + , cols_to_select_dynamut2 + , cols_to_select_encomddg + , cols_to_select_encomdds + , cols_to_select_mcsm + , cols_to_select_mcsm_na + , cols_to_select_sdm) From edb409baef92d835d73e0c4ae9533290c0702d39 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Fri, 27 Aug 2021 13:03:39 +0100 Subject: [PATCH 04/51] renamed dm_om barplot function scriptto lf_bp_stability.R --- scripts/functions/{stat_bp_stability.R => lf_bp_stability.R} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename scripts/functions/{stat_bp_stability.R => lf_bp_stability.R} (100%) diff --git a/scripts/functions/stat_bp_stability.R b/scripts/functions/lf_bp_stability.R similarity index 100% rename from scripts/functions/stat_bp_stability.R rename to scripts/functions/lf_bp_stability.R From 826d3c72b724a27739bf5225beb145ec7eadbe72 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Fri, 27 Aug 2021 14:05:00 +0100 Subject: [PATCH 05/51] added functions for bp with stat and tested them --- scripts/functions/lf_bp_stability.R | 100 --------------------- scripts/functions/lf_bp_with_stats.R | 68 ++++++++++++++ scripts/functions/test_lf_bp_with_stats.R | 28 ++++++ scripts/functions/test_lf_unpaired_stats.R | 12 +-- 4 files changed, 103 insertions(+), 105 deletions(-) delete mode 100644 scripts/functions/lf_bp_stability.R create mode 100644 scripts/functions/lf_bp_with_stats.R create mode 100644 scripts/functions/test_lf_bp_with_stats.R diff --git a/scripts/functions/lf_bp_stability.R b/scripts/functions/lf_bp_stability.R deleted file mode 100644 index 8ca4a7f..0000000 --- a/scripts/functions/lf_bp_stability.R +++ /dev/null @@ -1,100 +0,0 @@ -library(ggpubr) -################################################################### - -my_unpaired_stats <- function(lf_data - , lf_stat_value = "param_value" - , lf_stat_group = "mutation_info" - , lf_col_statvars = "param_type" - , my_paired = FALSE - , stat_adj = "none"){ - - stat_formula = as.formula(paste0(lf_stat_value, "~", lf_stat_group)) - - my_stat_df = compare_means(stat_formula - , group.by = lf_col_statvars - , data = lf_data - , paired = my_paired - , p.adjust.method = stat_adj) - - - return(my_stat_df) -} - -##################### -# call stat function -##################### -stat_results_df <- my_unpaired_stats(lf_data = lf_duet - , lf_stat_value = "param_value" - , lf_stat_group = "mutation_info" - , lf_col_statvars = "param_type" - , my_paired = FALSE - , stat_adj = "none" -) - -y_value = "param_value" - -################################# -my_comparisons <- list( c("DM", "OM") ) - -my_ats = 22 # axis text size -my_als = 20 # axis label size -my_fls = 20 # facet label size -my_pts = 22 # plot title size - -#################################### - - -stat_bp_mut <- function(df - , x_bp_cateog = "mutation_info" - , y_var = "param_value" - , facet_var = "param_type" - , scales = "free_y" - , title = "" - , col_categ = "duet_outcome" - , grp_comp = "my_comparisons" - , stat_method = "wilcox.test" - , my_paired = FALSE - #, stat_label = "p.format") - , stat_label = "p.signif" ) - - -############################# -y_value = "param_value" - - -p1 = ggplot(lf_duet, aes(x = mutation_info - , y = eval(parse(text = y_value)) )) + - facet_wrap(~ param_type - , nrow = 1 - , scales = "free_y") + - geom_boxplot(fill = "white", outlier.colour = NA - #, position = position_dodge(width = 0.9) - , width = 0.2) + - geom_point(position = position_jitterdodge(dodge.width=0.01) - , alpha = 0.5 - , show.legend = FALSE - , aes(colour = factor(duet_outcome))) + - theme(axis.text.x = element_text(size = my_ats) - , axis.text.y = element_text(size = my_ats - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = my_ats) - , axis.title.y = element_text(size = my_ats) - , plot.title = element_text(size = my_pts , hjust = 0.5, colour = "black", face = "bold") - , strip.background = element_rect(fill = "khaki2") - , strip.text.x = element_text(size = my_fls, colour = "black") - , legend.title = element_text(color = "black", size = my_als) - , legend.text = element_text(size = my_ats) - , legend.direction = "vertical") + - labs(title = "DUET" - , x = "" - , y = "")+ - stat_compare_means(comparisons = my_comparisons - , method = "wilcox.test" - , paired = FALSE - #, label = "p.format") - , label = "p.signif") - -p1 - diff --git a/scripts/functions/lf_bp_with_stats.R b/scripts/functions/lf_bp_with_stats.R new file mode 100644 index 0000000..336c270 --- /dev/null +++ b/scripts/functions/lf_bp_with_stats.R @@ -0,0 +1,68 @@ +library(ggpubr) +################################################################### + +#################################### +lf_bp_with_stats <- function(lf_df + , x_grp = "mutation_info" + , y_var = "param_value" + , facet_var = "param_type" + , n_facet_row = 1 + , y_scales = "free_y" + , p_title = "" + , colour_categ = "" + , colour_bp_strip = "khaki2" + , stat_grp_comp = c("DM", "OM") + , stat_method = "wilcox.test" + , my_paired = FALSE + #, stat_label = "p.format") + , stat_label = c("p.format", "p.signif") + , my_ats = 22 # axis text size + , my_als = 20 # axis label size + , my_fls = 20 # facet label size + , my_pts = 22 # plot title size +) { + my_comparisonsL <- list( stat_grp_comp ) + + bp_statP <- ggplot(lf_df, aes(x = eval(parse(text = x_grp)) + , y = eval(parse(text = y_var)) )) + + + facet_wrap(~ eval(parse(text = facet_var)) + , nrow = n_facet_row + , scales = y_scales) + + + geom_boxplot(fill = "white", outlier.colour = NA + #, position = position_dodge(width = 0.9) + , width = 0.2) + + + geom_point(position = position_jitterdodge(dodge.width = 0.01) + , alpha = 0.5 + , show.legend = FALSE + , aes(colour = factor(eval(parse(text = colour_categ))) )) + + + theme(axis.text.x = element_text(size = my_ats) + , axis.text.y = element_text(size = my_ats + , angle = 0 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = my_ats) + , axis.title.y = element_text(size = my_ats) + , plot.title = element_text(size = my_pts , hjust = 0.5, colour = "black", face = "bold") + , strip.background = element_rect(fill = colour_bp_strip) + , strip.text.x = element_text(size = my_fls, colour = "black") + , legend.title = element_text(color = "black", size = my_als) + , legend.text = element_text(size = my_ats) + , legend.direction = "vertical") + + + labs(title = p_title + , x = "" + , y = "")+ + + stat_compare_means(comparisons = my_comparisonsL + , method = stat_method + , paired = my_paired + #, label = "p.format") + , label = stat_label[1]) + + return(bp_statP) + +} diff --git a/scripts/functions/test_lf_bp_with_stats.R b/scripts/functions/test_lf_bp_with_stats.R new file mode 100644 index 0000000..4cfff9d --- /dev/null +++ b/scripts/functions/test_lf_bp_with_stats.R @@ -0,0 +1,28 @@ +setwd("~/git/LSHTM_analysis/scripts/plotting/") + +source("../functions/lf_bp_with_stats.R") + +###################### +# call function +###################### +# Note: Data +# run other_plots_data.R +# to get the long format data to test this function + +lf_bp_with_stats(lf_df = lf_dynamut2 + , x_grp = "mutation_info" + , y_var = "param_value" + , facet_var = "param_type" + , n_facet_row = 1 + , y_scales = "free_y" + , p_title = "Dynamut2" + , colour_categ = "ddg_dynamut2_outcome" + , stat_grp_comp = c("DM", "OM") + , stat_method = "wilcox.test" + , my_paired = FALSE + #, stat_label = "p.format") + , stat_label = c("p.format", "p.signif") + , my_ats = 22 # axis text size + , my_als = 20 # axis label size + , my_fls = 20 # facet label size + , my_pts = 22 )# plot title size diff --git a/scripts/functions/test_lf_unpaired_stats.R b/scripts/functions/test_lf_unpaired_stats.R index 9ec4aac..a1ff9d1 100644 --- a/scripts/functions/test_lf_unpaired_stats.R +++ b/scripts/functions/test_lf_unpaired_stats.R @@ -1,13 +1,15 @@ setwd("~/git/LSHTM_analysis/scripts/functions") source("lf_unpaired_stats.R") +##################### +# call stat function() +# a useful way to check stats +# for any lf data +##################### +# Note: Data # run other_plots_data.R -# to get the df you want to test this function +# to get the long format data to test this function - -##################### -# call stat function -##################### stat_results_df <- lf_unpaired_stats(lf_data = lf_duet , lf_stat_value = "param_value" , lf_stat_group = "mutation_info" From fcb4b85747c50ef5318f40254f3852d5d20cf21f Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Thu, 2 Sep 2021 12:50:24 +0100 Subject: [PATCH 06/51] modified bp with option for adding stats and boxplplots. Moved old one to redundant --- scripts/functions/lf_bp.R | 193 ++++++++++++++++++ .../{ => redundant}/lf_bp_with_stats.R | 55 +++-- .../redundant/test_lf_bp_with_stats.R | 83 ++++++++ scripts/functions/test_lf_bp.R | 55 +++++ scripts/functions/test_lf_bp_with_stats.R | 28 --- scripts/plotting/Header_TT.R | 67 +++--- scripts/plotting/get_plotting_dfs.R | 25 ++- scripts/plotting/other_plots_data.R | 39 ++-- 8 files changed, 443 insertions(+), 102 deletions(-) create mode 100644 scripts/functions/lf_bp.R rename scripts/functions/{ => redundant}/lf_bp_with_stats.R (54%) create mode 100644 scripts/functions/redundant/test_lf_bp_with_stats.R create mode 100644 scripts/functions/test_lf_bp.R delete mode 100644 scripts/functions/test_lf_bp_with_stats.R diff --git a/scripts/functions/lf_bp.R b/scripts/functions/lf_bp.R new file mode 100644 index 0000000..4677548 --- /dev/null +++ b/scripts/functions/lf_bp.R @@ -0,0 +1,193 @@ +############################# +# Barplots: ggplot +# stats +/- +# violin +/- +# barplot +/ +# beeswarm +############################# + +lf_bp <- function(lf_df + , p_title = "" + , colour_categ = "" + , x_grp = "mutation_info" + , y_var = "param_value" + , facet_var = "param_type" + , n_facet_row = 1 + , y_scales = "free_y" + , colour_bp_strip = "khaki2" + , dot_size = 3 + , dot_transparency = 0.3 + , violin_quantiles = c(0.25, 0.5, 0.75) # can be NULL + , my_ats = 22 # axis text size + , my_als = 20 # axis label size + , my_fls = 20 # facet label size + , my_pts = 22 # plot title size) + , make_boxplot = FALSE + , bp_width = c("auto", 0.5) + , add_stats = FALSE + , stat_grp_comp = c("DM", "OM") + , stat_method = "wilcox.test" + , my_paired = FALSE + , stat_label = c("p.format", "p.signif") ){ + + p1 <- ggplot(lf_df, aes(x = eval(parse(text = x_grp)) + , y = eval(parse(text = y_var)) )) + + + facet_wrap(~ eval(parse(text = facet_var)) + , nrow = n_facet_row + , scales = y_scales) + + + geom_violin(trim = T + , scale = "width" + #, position = position_dodge(width = 0.9) + , draw_quantiles = violin_quantiles) + + if (make_boxplot){ + + if (bp_width == "auto"){ + bp_width = 0.5/length(unique(lf_df[[x_grp]])) + cat("\nAutomatically calculated boxplot width, using bp_width:\n", bp_width, "\n") + }else{ + cat("\nBoxplot width value provided, using:", bp_width, "\n") + bp_width = bp_width} + + p2 = p1 + geom_boxplot(fill = "white" + , outlier.colour = NA + #, position = position_dodge(width = 0.9) + , width = bp_width) + + geom_beeswarm(priority = "density" + #, shape = 21 + , size = dot_size + , alpha = dot_transparency + , show.legend = FALSE + , cex = 0.8 + , aes(colour = factor(eval(parse(text = colour_categ))) )) + + } else { + # ggbeeswarm (better than geom_point) + p2 = p1 + geom_beeswarm(priority = "density" + #, shape = 21 + , size = dot_size + , alpha = dot_transparency + , show.legend = FALSE + , cex = 0.8 + , aes(colour = factor(eval(parse(text = colour_categ))) )) + } + + # Add foramtting to graph + OutPlot = p2 + theme(axis.text.x = element_text(size = my_ats) + , axis.text.y = element_text(size = my_ats + , angle = 0 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = my_ats) + , axis.title.y = element_text(size = my_ats) + , plot.title = element_text(size = my_pts + , hjust = 0.5 + , colour = "black" + , face = "bold") + , strip.background = element_rect(fill = colour_bp_strip) + , strip.text.x = element_text(size = my_fls + , colour = "black") + , legend.title = element_text(color = "black" + , size = my_als) + , legend.text = element_text(size = my_ats) + , legend.direction = "vertical") + + + labs(title = p_title + , x = "" + , y = "") + + if (add_stats){ + my_comparisonsL <- list( stat_grp_comp ) + + OutPlot = OutPlot + stat_compare_means(comparisons = my_comparisonsL + , method = stat_method + , paired = my_paired + , label = stat_label[1]) + } + return(OutPlot) +} + +############################# +# Barplot NO stats: plotly +# violin +/- +# barplot +/ +# beeswarm + +# TODO: plot_ly() +############################# +lf_bp_plotly <- function(lf_df + , p_title = "" + , colour_categ = "" + , x_grp = mutation_info + , y_var = param_value + , facet_var = param_type + , n_facet_row = 1 + , y_scales = "free_y" + , colour_bp_strip = "khaki2" + , dot_size = 3 + , dot_transparency = 0.3 + , violin_quantiles = c(0.25, 0.5, 0.75) # can be NULL + , my_ats = 20 # axis text size + , my_als = 18 # axis label size + , my_fls = 18 # facet label size + , my_pts = 22 # plot title size) + #, make_boxplot = FALSE + , bp_width = c("auto", 0.5) + #, add_stats = FALSE + #, stat_grp_comp = c("DM", "OM") + #, stat_method = "wilcox.test" + #, my_paired = FALSE + #, stat_label = c("p.format", "p.signif") + ){ + + OutPlotly = ggplot(lf_df, aes(x = eval(parse(text = x_grp)) + , y = eval(parse(text = y_var)) + , label1 = x_grp + , label2 = y_var + , lable3 = colour_categ) ) + + + facet_wrap(~ eval(parse(text = facet_var)) + , nrow = n_facet_row + , scales = y_scales) + + + geom_violin(trim = T + , scale = "width" + , draw_quantiles = violin_quantiles) + + + geom_beeswarm(priority = "density" + , size = dot_size + , alpha = dot_transparency + , show.legend = FALSE + , cex = 0.8 + , aes(colour = factor(eval(parse(text = colour_categ) ) ) ) ) + + theme(axis.text.x = element_text(size = my_ats) + , axis.text.y = element_text(size = my_ats + , angle = 0 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = my_ats) + , axis.title.y = element_text(size = my_ats) + , plot.title = element_text(size = my_pts + , hjust = 0.5 + , colour = "black" + , face = "bold") + , strip.background = element_rect(fill = colour_bp_strip) + , strip.text.x = element_text(size = my_fls + , colour = "black") + , legend.title = element_text(color = "black" + , size = my_als) + , legend.text = element_text(size = my_ats) + , legend.position = "none")+ + + labs(title = p_title + , x = "" + , y = "") + + OutPlotly = ggplotly(OutPlotly + #, tooltip = c("label") + ) + return(OutPlotly) + +} diff --git a/scripts/functions/lf_bp_with_stats.R b/scripts/functions/redundant/lf_bp_with_stats.R similarity index 54% rename from scripts/functions/lf_bp_with_stats.R rename to scripts/functions/redundant/lf_bp_with_stats.R index 336c270..22533d5 100644 --- a/scripts/functions/lf_bp_with_stats.R +++ b/scripts/functions/redundant/lf_bp_with_stats.R @@ -14,13 +14,23 @@ lf_bp_with_stats <- function(lf_df , stat_grp_comp = c("DM", "OM") , stat_method = "wilcox.test" , my_paired = FALSE - #, stat_label = "p.format") + , bp_width = c("auto", 0.5) + , dot_size = 3 + , dot_transparency = 0.3 , stat_label = c("p.format", "p.signif") , my_ats = 22 # axis text size , my_als = 20 # axis label size , my_fls = 20 # facet label size , my_pts = 22 # plot title size ) { + if (bp_width == "auto"){ + bp_width = 0.5/length(unique(lf_df[[x_grp]])) + cat("\nAutomatically calculated boxplot width, using bp_width:\n", bp_width, "\n") + }else{ + cat("\nBoxplot width value provided, using:", bp_width, "\n") + bp_width = bp_width + } + my_comparisonsL <- list( stat_grp_comp ) bp_statP <- ggplot(lf_df, aes(x = eval(parse(text = x_grp)) @@ -30,15 +40,30 @@ lf_bp_with_stats <- function(lf_df , nrow = n_facet_row , scales = y_scales) + - geom_boxplot(fill = "white", outlier.colour = NA - #, position = position_dodge(width = 0.9) - , width = 0.2) + - - geom_point(position = position_jitterdodge(dodge.width = 0.01) - , alpha = 0.5 - , show.legend = FALSE - , aes(colour = factor(eval(parse(text = colour_categ))) )) + + geom_violin(trim = T + , scale = "width" + #, position = position_dodge(width = 0.9) + , draw_quantiles = c(0.25, 0.5, 0.75)) + + # geom_boxplot(fill = "white" + # , outlier.colour = NA + # #, position = position_dodge(width = 0.9) + # , width = bp_width) + + + # geom_point(position = position_jitterdodge(dodge.width = 0.5) + # , alpha = 0.5 + # , show.legend = FALSE + # , aes(colour = factor(eval(parse(text = colour_categ))) )) + + + # ggbeeswarm (better than geom_point) + geom_beeswarm(priority = "density" + #, shape = 21 + , size = dot_size + , alpha = dot_transparency + , show.legend = FALSE + , cex = 0.8 + , aes(colour = factor(eval(parse(text = colour_categ))) )) + + theme(axis.text.x = element_text(size = my_ats) , axis.text.y = element_text(size = my_ats , angle = 0 @@ -46,10 +71,15 @@ lf_bp_with_stats <- function(lf_df , vjust = 0) , axis.title.x = element_text(size = my_ats) , axis.title.y = element_text(size = my_ats) - , plot.title = element_text(size = my_pts , hjust = 0.5, colour = "black", face = "bold") + , plot.title = element_text(size = my_pts + , hjust = 0.5 + , colour = "black" + , face = "bold") , strip.background = element_rect(fill = colour_bp_strip) - , strip.text.x = element_text(size = my_fls, colour = "black") - , legend.title = element_text(color = "black", size = my_als) + , strip.text.x = element_text(size = my_fls + , colour = "black") + , legend.title = element_text(color = "black" + , size = my_als) , legend.text = element_text(size = my_ats) , legend.direction = "vertical") + @@ -60,7 +90,6 @@ lf_bp_with_stats <- function(lf_df stat_compare_means(comparisons = my_comparisonsL , method = stat_method , paired = my_paired - #, label = "p.format") , label = stat_label[1]) return(bp_statP) diff --git a/scripts/functions/redundant/test_lf_bp_with_stats.R b/scripts/functions/redundant/test_lf_bp_with_stats.R new file mode 100644 index 0000000..51654a1 --- /dev/null +++ b/scripts/functions/redundant/test_lf_bp_with_stats.R @@ -0,0 +1,83 @@ +setwd("~/git/LSHTM_analysis/scripts/plotting/") + +source("../functions/lf_bp_with_stats.R") +source("../functions/lf_bp.R") + +###################### +# Make plot +###################### +# Note: Data +# run other_plots_data.R +# to get the long format data to test this function + +lf_bp(lf_df = lf_dynamut2 + , p_title = "Dynamut2" + , colour_categ = "ddg_dynamut2_outcome" + , x_grp = "mutation_info" + , y_var = "param_value" + , facet_var = "param_type" + , n_facet_row = 1 + , y_scales = "free_y" + , colour_bp_strip = "khaki2" + , dot_size = 3 + , dot_transparency = 0.3 + , violin_quantiles = c(0.25, 0.5, 0.75) + , my_ats = 22 # axis text size + , my_als = 20 # axis label size + , my_fls = 20 # facet label size + , my_pts = 22 # plot title size + , make_boxplot = F + , bp_width = "auto" + , add_stats = T + , stat_grp_comp = c("DM", "OM") + , stat_method = "wilcox.test" + , my_paired = FALSE + , stat_label = c("p.format", "p.signif") ) + +# foo = lf_dynamut2 %>% +# group_by(mutation_info, param_type) %>% +# summarise( Mean = mean(param_value, na.rm = T) +# , SD = sd(param_value, na.rm = T) +# , Median = median(param_value, na.rm = T) +# , IQR = IQR(param_value, na.rm = T) ) + +# Quick tests +plotdata_sel = subset(lf_dynamut2 + , lf_dynamut2$param_type == "ASA") + +plot_sum = plotdata_sel %>% + group_by(mutation_info, param_type) %>% + summarise(n = n() + , Mean = mean(param_value, na.rm = T) + , SD = sd(param_value, na.rm = T) + , Min = min(param_value, na.rm = T) + , Q1 = quantile(param_value, na.rm = T, 0.25) + , Median = median(param_value, na.rm = T) + , Q3 = quantile(param_value, na.rm = T, 0.75) + , Max = max(param_value, na.rm = T) ) %>% + rename('Mutation Class' = mutation_info + , Parameter = param_type) +plot_sum = as.data.frame(plot_sum, row.names = NULL) +plot_sum + +bar = compare_means(param_value ~ mutation_info + , group.by = "param_type" + , data = plotdata_sel + , paired = FALSE + , p.adjust.method = "BH") +bar2 = bar[c("param_type" + , "group1" + , "group2" + , "p.format" + , "p.signif" + , "p.adj")] %>% + rename(Parameter = param_type + , Group1 = group1 + , Group2 = group2 + , "P-value" = p.format + , "P-sig" = p.signif + , "P-adj" = p.adj) +bar2 = data.frame(bar2); bar2 + +library(Hmisc) +describe(lf_dynamut2) diff --git a/scripts/functions/test_lf_bp.R b/scripts/functions/test_lf_bp.R new file mode 100644 index 0000000..42b78bf --- /dev/null +++ b/scripts/functions/test_lf_bp.R @@ -0,0 +1,55 @@ +setwd("~/git/LSHTM_analysis/scripts/plotting/") +source("Header_TT.R") +source("../functions/lf_bp.R") +# ================================================ +# Data: run get_plotting_data.R +# to get the long format data to test this function +# drug = "streptomycin" +# gene = "gid" +# source("get_plotting_dfs.R") +# ================================================== + +###################### +# Make plot: ggplot +###################### +lf_bp(lf_df = lf_dynamut2 + , p_title = "Dynamut2" + , colour_categ = "ddg_dynamut2_outcome" + , x_grp = "mutation_info" + , y_var = "param_value" + , facet_var = "param_type" + , n_facet_row = 1 + , y_scales = "free_y" + , colour_bp_strip = "khaki2" + , dot_size = 3 + , dot_transparency = 0.3 + , violin_quantiles = c(0.25, 0.5, 0.75) + , my_ats = 22 # axis text size + , my_als = 20 # axis label size + , my_fls = 20 # facet label size + , my_pts = 22 # plot title size + , make_boxplot = F + , bp_width = "auto" + , add_stats = T + , stat_grp_comp = c("DM", "OM") + , stat_method = "wilcox.test" + , my_paired = FALSE + , stat_label = c("p.format", "p.signif") ) + +###################### +# Make plot: plotly +###################### +# FIXME: This labels are not working as I want! +# lf_bp_plotly(lf_df = lf_deepddg +# , p_title = "DeepDDG" +# , colour_categ = "deepddg_outcome" +# , x_grp = "mutation_info" +# , y_var = "param_value" +# , facet_var = "param_type" +# , n_facet_row = 1 +# , y_scales = "free_y" +# , colour_bp_strip = "khaki2" +# , dot_size = 3 +# , dot_transparency = 0.3 +# , violin_quantiles = c(0.25, 0.5, 0.75) +# ) diff --git a/scripts/functions/test_lf_bp_with_stats.R b/scripts/functions/test_lf_bp_with_stats.R deleted file mode 100644 index 4cfff9d..0000000 --- a/scripts/functions/test_lf_bp_with_stats.R +++ /dev/null @@ -1,28 +0,0 @@ -setwd("~/git/LSHTM_analysis/scripts/plotting/") - -source("../functions/lf_bp_with_stats.R") - -###################### -# call function -###################### -# Note: Data -# run other_plots_data.R -# to get the long format data to test this function - -lf_bp_with_stats(lf_df = lf_dynamut2 - , x_grp = "mutation_info" - , y_var = "param_value" - , facet_var = "param_type" - , n_facet_row = 1 - , y_scales = "free_y" - , p_title = "Dynamut2" - , colour_categ = "ddg_dynamut2_outcome" - , stat_grp_comp = c("DM", "OM") - , stat_method = "wilcox.test" - , my_paired = FALSE - #, stat_label = "p.format") - , stat_label = c("p.format", "p.signif") - , my_ats = 22 # axis text size - , my_als = 20 # axis label size - , my_fls = 20 # facet label size - , my_pts = 22 )# plot title size diff --git a/scripts/plotting/Header_TT.R b/scripts/plotting/Header_TT.R index 199031b..e4593d0 100755 --- a/scripts/plotting/Header_TT.R +++ b/scripts/plotting/Header_TT.R @@ -3,12 +3,6 @@ ######################################################### #lib_loc = "/usr/local/lib/R/site-library") -#if (!require("gplots")) { -# install.packages("gplots", dependencies = TRUE) -# library(gplots) -#} -require(extrafont) - require("getopt", quietly = TRUE) # cmd parse arguments if (!require("tidyverse")) { @@ -16,9 +10,23 @@ if (!require("tidyverse")) { library(tidyverse) } -if (!require("ggplot2")) { - install.packages("ggplot2", dependencies = TRUE) - library(ggplot2) +# if (!require("ggplot2")) { +# install.packages("ggplot2", dependencies = TRUE) +# library(ggplot2) +# } + +# if (!require ("dplyr")){ +# install.packages("dplyr") +# library(dplyr) +# } + +# Install +#if(!require(devtools)) install.packages("devtools") +#devtools::install_github("kassambara/ggcorrplot") + +if (!require ("ggbeeswarm")){ + install.packages("ggbeeswarm") + library(ggbeeswarm) } if (!require("plotly")) { @@ -101,11 +109,6 @@ if (!require ("psych")){ library(psych) } -if (!require ("dplyr")){ - install.packages("dplyr") - library(dplyr) -} - if (!require ("compare")){ install.packages("compare") library(compare) @@ -116,31 +119,25 @@ if (!require ("arsenal")){ library(arsenal) } +if(!require(ggseqlogo)){ + install.packages("ggseqlogo") + library(ggseqlogo) +} -#if (!requireNamespace("BiocManager", quietly = TRUE)) -# install.packages("BiocManager") - -#BiocManager::install("Logolas") -library("Logolas") - -#install.packages("ggseqlogo") -library(ggseqlogo) - - -####TIDYVERSE -# Install -#if(!require(devtools)) install.packages("devtools") -#devtools::install_github("kassambara/ggcorrplot") - -library(ggcorrplot) - - -###for PDB files -#install.packages("bio3d") +# for PDB files if(!require(bio3d)){ install.packages("bio3d") library(bio3d) } -#install.packages("protr") library(protr) +if(!require(protr)){ + install.packages("protr") + library(protr) +} + +#if (!requireNamespace("BiocManager", quietly = TRUE)) +# install.packages("BiocManager") + +#BiocManager::install("Logolas") +library("Logolas") \ No newline at end of file diff --git a/scripts/plotting/get_plotting_dfs.R b/scripts/plotting/get_plotting_dfs.R index 2dae471..2fc1c19 100644 --- a/scripts/plotting/get_plotting_dfs.R +++ b/scripts/plotting/get_plotting_dfs.R @@ -86,8 +86,10 @@ all_plot_dfs = combining_dfs_plotting(my_df_u , lig_dist_colname = LigDist_colname , lig_dist_cutoff = LigDist_cutoff) -merged_df2 = all_plot_dfs[[1]] -merged_df3 = all_plot_dfs[[2]] +merged_df2 = all_plot_dfs[[1]] +merged_df3 = all_plot_dfs[[2]] +merged_df2_comp = all_plot_dfs[[3]] +merged_df3_comp = all_plot_dfs[[4]] #====================================================================== # read other files infilename_dynamut = paste0("~/git/Data/", drug, "/output/dynamut_results/", gene @@ -98,10 +100,15 @@ infilename_dynamut2 = paste0("~/git/Data/", drug, "/output/dynamut_results/dyna infilename_mcsm_na = paste0("~/git/Data/", drug, "/output/mcsm_na_results/", gene , "_complex_mcsm_na_norm.csv") - + +infilename_mcsm_f_snps <- paste0("~/git/Data/", drug, "/output/", gene + , "_mcsm_formatted_snps.csv") + dynamut_df = read.csv(infilename_dynamut) dynamut2_df = read.csv(infilename_dynamut2) mcsm_na_df = read.csv(infilename_mcsm_na) +mcsm_f_snps = read.csv(infilename_mcsm_f_snps, header = F) +names(mcsm_f_snps) = "mutationinformation" #################################################################### # Data for subcols barplot (~heatmpa) @@ -430,11 +437,17 @@ if (nrow(corr_ps_df3) == nrow(merged_df3) && nrow(merged_df3_comp) == check1) { , "\nGot: ", check1) } + +rm(foo) +#################################################################### +# Data for DM OM Plots: Long format dfs +#################################################################### +source("other_plots_data.R") + ######################################################################## # End of script ######################################################################## -rm(foo) -cat("\n===================================================\n" +cat("\n######################################################\n" , "\nSuccessful: get_plotting_dfs.R worked!" - , "\n====================================================") \ No newline at end of file + , "\n###################################################\n") diff --git a/scripts/plotting/other_plots_data.R b/scripts/plotting/other_plots_data.R index d2229b8..8eb2020 100644 --- a/scripts/plotting/other_plots_data.R +++ b/scripts/plotting/other_plots_data.R @@ -3,10 +3,9 @@ # TASK: producing boxplots for dr and other muts ######################################################### -#======================================================================= # working dir and loading libraries # getwd() -setwd("~/git/LSHTM_analysis/scripts/plotting") +# setwd("~/git/LSHTM_analysis/scripts/plotting") # getwd() # make cmd @@ -14,21 +13,21 @@ setwd("~/git/LSHTM_analysis/scripts/plotting") # drug = "streptomycin" # gene = "gid" -source("get_plotting_dfs.R") +# source("get_plotting_dfs.R") #======================================================================= # MOVE TO COMBINE or singular file for deepddg +# +# cols_to_select = c("mutation", "mutationinformation" +# , "wild_type", "position", "mutant_type" +# , "mutation_info") +# +# merged_df3_short = merged_df3[, cols_to_select] -cols_to_select = c("mutation", "mutationinformation" - , "wild_type", "position", "mutant_type" - , "mutation_info") - -merged_df3_short = merged_df3[, cols_to_select] - -infilename_mcsm_f_snps <- paste0("~/git/Data/", drug, "/output/", gene - , "_mcsm_formatted_snps.csv") - -mcsm_f_snps<- read.csv(infilename_mcsm_f_snps, header = F) -names(mcsm_f_snps) <- "mutationinformation" +# infilename_mcsm_f_snps <- paste0("~/git/Data/", drug, "/output/", gene +# , "_mcsm_formatted_snps.csv") +# +# mcsm_f_snps<- read.csv(infilename_mcsm_f_snps, header = F) +# names(mcsm_f_snps) <- "mutationinformation" # write merged_df3 to generate structural figure on chimera #write.csv(merged_df3_short, "merged_df3_short.csv") @@ -52,11 +51,11 @@ my_min = min(merged_df3$deepddg_scaled); my_min my_max = max(merged_df3$deepddg_scaled); my_max if (my_min == -1 && my_max == 1){ - cat("PASS: DeepDDG successfully scaled b/w -1 and 1" + cat("\nPASS: DeepDDG successfully scaled b/w -1 and 1" #, "\nProceeding with assigning deep outcome category") , "\n") }else{ - cat("FAIL: could not scale DeepDDG ddg values" + cat("\nFAIL: could not scale DeepDDG ddg values" , "Aborting!") } @@ -100,7 +99,7 @@ if (merging_cols == "mutationinformation") { cols_check <- c(c1, c2, c3, c4) expected_cols = n_comb_cols - ( length(cols_check) - 1) if (all(cols_check)){ - cat("\nStage 2:Proceeding with merging dfs:\n") + cat("\nStage 2: Proceeding with merging dfs:\n") comb_df <- Reduce(inner_join, list(cols_mcsm_df , cols_mcsm_na_df , dynamut_df @@ -115,12 +114,13 @@ if (merging_cols == "mutationinformation") { } } -names(comb_df_s) +#names(comb_df_s) +cat("\n!!!IT GOT TO HERE!!!!") #======================================================================= fact_cols = colnames(comb_df_s)[grepl( "_outcome|_info", colnames(comb_df_s) )] fact_cols lapply(comb_df_s[, fact_cols], class) -comb_df_s[,fact_cols] <- lapply(comb_df_s[,cols],as.factor) +comb_df_s[, fact_cols] <- lapply(comb_df_s[, fact_cols], as.factor) if (any(lapply(comb_df_s[, fact_cols], class) == "character")){ cat("\nChanging cols to factor") @@ -512,7 +512,6 @@ rm(all_plot_dfs , my_data_snp , my_df , my_df_u - , ols_mcsm_df , other_muts , pd_df , subcols_df_ps From a981580b7a70bc33cddee5e42addb799835163c2 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Thu, 2 Sep 2021 12:51:31 +0100 Subject: [PATCH 07/51] separated get_plotting_dfs_with_lig.R --- scripts/plotting/get_plotting_dfs_with_lig.R | 589 +++++++++++++++++++ 1 file changed, 589 insertions(+) create mode 100644 scripts/plotting/get_plotting_dfs_with_lig.R diff --git a/scripts/plotting/get_plotting_dfs_with_lig.R b/scripts/plotting/get_plotting_dfs_with_lig.R new file mode 100644 index 0000000..f17e997 --- /dev/null +++ b/scripts/plotting/get_plotting_dfs_with_lig.R @@ -0,0 +1,589 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: Get formatted data for plots +#======================================================================= +# working dir and loading libraries +getwd() +setwd("~/git/LSHTM_analysis/scripts/plotting") +getwd() + +source("Header_TT.R") +source("../functions/my_pairs_panel.R") # with lower panel turned off +source("../functions/plotting_globals.R") +source("../functions/plotting_data.R") +source("../functions/combining_dfs_plotting.R") +source("../functions/bp_subcolours.R") + +#******************** +# cmd args passed +# in from other scripts +# to call this +#******************** +#drug = 'streptomycin' +#gene = 'gid' +#==================== +# variables for lig +#==================== + +LigDist_colname = "ligand_distance" +LigDist_cutoff = 10 + +#=========== +# input +#=========== +#--------------------- +# call: import_dirs() +#--------------------- +import_dirs(drug, gene) + +#--------------------------- +# call: plotting_data() +#--------------------------- +if (!exists("infile_params") && exists("gene")){ +#if (!is.character(infile_params) && exists("gene")){ # when running as cmd + #in_filename_params = paste0(tolower(gene), "_all_params.csv") #for pncA + in_filename_params = paste0(tolower(gene), "_comb_afor.csv") # part combined for gid + infile_params = paste0(outdir, "/", in_filename_params) + cat("\nInput file for mcsm comb data not specified, assuming filename: ", infile_params, "\n") +} + +# Input 1: read _comb_afor.csv +cat("\nReading mcsm combined data file: ", infile_params) +mcsm_df = read.csv(infile_params, header = T) +pd_df = plotting_data(mcsm_df + , lig_dist_colname = LigDist_colname + , lig_dist_cutoff = LigDist_cutoff) + +my_df = pd_df[[1]] +my_df_u = pd_df[[2]] # this forms one of the input for combining_dfs_plotting() +my_df_u_lig = pd_df[[3]] +dup_muts = pd_df[[4]] + +#-------------------------------- +# call: combining_dfs_plotting() +#-------------------------------- +if (!exists("infile_metadata") && exists("gene")){ +#if (!is.character(infile_metadata) && exists("gene")){ # when running as cmd + in_filename_metadata = paste0(tolower(gene), "_metadata.csv") # part combined for gid + infile_metadata = paste0(outdir, "/", in_filename_metadata) + cat("\nInput file for gene metadata not specified, assuming filename: ", infile_metadata, "\n") +} + +# Input 2: read _meta data.csv +cat("\nReading meta data file: ", infile_metadata) + +gene_metadata <- read.csv(infile_metadata + , stringsAsFactors = F + , header = T) + +all_plot_dfs = combining_dfs_plotting(my_df_u + , gene_metadata + , lig_dist_colname = LigDist_colname + , lig_dist_cutoff = LigDist_cutoff) + +merged_df2 = all_plot_dfs[[1]] +merged_df3 = all_plot_dfs[[2]] +merged_df2_comp = all_plot_dfs[[3]] +merged_df3_comp = all_plot_dfs[[4]] +merged_df2_lig = all_plot_dfs[[5]] +merged_df3_lig = all_plot_dfs[[6]] +merged_df2_comp_lig = all_plot_dfs[[7]] +merged_df3_comp_lig = all_plot_dfs[[8]] + +#################################################################### +# Data for subcols barplot (~heatmap) +#################################################################### +# can include: mutation, or_kin, pwald, af_kin +cols_to_select = c("mutationinformation", "drtype" + , "wild_type" + , "position" + , "mutant_type" + , "chain", "ligand_id", "ligand_distance" + , "duet_stability_change", "duet_outcome", "duet_scaled" + , "ligand_affinity_change", "ligand_outcome", "affinity_scaled" + , "ddg_foldx", "foldx_scaled", "foldx_outcome" + , "deepddg", "deepddg_outcome" # comment out as not available for pnca + , "asa", "rsa", "rd_values", "kd_values" + , "af", "or_mychisq", "pval_fisher" + , "or_fisher", "or_logistic", "pval_logistic" + , "wt_prop_water", "mut_prop_water", "wt_prop_polarity", "mut_prop_polarity" + , "wt_calcprop", "mut_calcprop") + +#======================= +# Data for sub colours +# barplot: PS +#======================= + +cat("\nNo. of cols to select:", length(cols_to_select)) + +subcols_df_ps = merged_df3[, cols_to_select] + +cat("\nNo of unique positions for ps:" + , length(unique(subcols_df_ps$position))) + +# add count_pos col that counts the no. of nsSNPS at a position +setDT(subcols_df_ps)[, pos_count := .N, by = .(position)] + +# should be a factor +if (is.factor(subcols_df_ps$duet_outcome)){ + cat("\nDuet_outcome is factor") + table(subcols_df_ps$duet_outcome) +}else{ + cat("\nConverting duet_outcome to factor") + subcols_df_ps$duet_outcome = as.factor(subcols_df_ps$duet_outcome) + table(subcols_df_ps$duet_outcome) +} + +# should be -1 and 1 +min(subcols_df_ps$duet_scaled) +max(subcols_df_ps$duet_scaled) + +tapply(subcols_df_ps$duet_scaled, subcols_df_ps$duet_outcome, min) +tapply(subcols_df_ps$duet_scaled, subcols_df_ps$duet_outcome, max) + +# check unique values in normalised data +cat("\nNo. of unique values in duet scaled, no rounding:" + , length(unique(subcols_df_ps$duet_scaled))) + +# No rounding +my_grp = subcols_df_ps$duet_scaled; length(my_grp) + +# Add rounding is to be used +n = 3 +subcols_df_ps$duet_scaledR = round(subcols_df_ps$duet_scaled, n) + +cat("\nNo. of unique values in duet scaled", n, "places rounding:" + , length(unique(subcols_df_ps$duet_scaledR))) + +my_grp_r = subcols_df_ps$duet_scaledR # rounding + +# Add grp cols +subcols_df_ps$group <- paste0(subcols_df_ps$duet_outcome, "_", my_grp, sep = "") +subcols_df_ps$groupR <- paste0(subcols_df_ps$duet_outcome, "_", my_grp_r, sep = "") + +# Call the function to create the palette based on the group defined above +subcols_ps <- ColourPalleteMulti(subcols_df_ps, "duet_outcome", "my_grp") +subcolsR_ps <- ColourPalleteMulti(subcols_df_ps, "duet_outcome", "my_grp_r") + +print(paste0("Colour palette generated for my_grp: ", length(subcols_ps), " colours")) +print(paste0("Colour palette generated for my_grp_r: ", length(subcolsR_ps), " colours")) + +#======================= +# Data for sub colours +# barplot: LIG +#======================= +cat("\nNo. of cols to select:", length(cols_to_select)) + +subcols_df_lig = merged_df3_lig[, cols_to_select] + +cat("\nNo of unique positions for LIG:" + , length(unique(subcols_df_lig$position))) + +# should be a factor +if (is.factor(subcols_df_lig$ligand_outcome)){ + cat("\nLigand_outcome is factor") + table(subcols_df_lig$ligand_outcome) +}else{ + cat("\nConverting ligand_outcome to factor") + subcols_df_lig$ligand_outcome = as.factor(subcols_df_lig$ligand_outcome) + table(subcols_df_lig$ligand_outcome) +} + +# should be -1 and 1 +min(subcols_df_lig$affinity_scaled) +max(subcols_df_lig$affinity_scaled) + +tapply(subcols_df_lig$affinity_scaled, subcols_df_lig$ligand_outcome, min) +tapply(subcols_df_lig$affinity_scaled, subcols_df_lig$ligand_outcome, max) + +# check unique values in normalised data +cat("\nNo. of unique values in affinity scaled, no rounding:" + , length(unique(subcols_df_lig$affinity_scaled))) + +# No rounding +my_grp_lig = subcols_df_lig$affinity_scaled; length(my_grp_lig) + +# Add rounding is to be used +n = 3 +subcols_df_lig$affinity_scaledR = round(subcols_df_lig$affinity_scaled, n) + +cat("\nNo. of unique values in duet scaled", n, "places rounding:" + , length(unique(subcols_df_lig$affinity_scaledR))) + +my_grp_lig_r = subcols_df_lig$affinity_scaledR # rounding + +# Add grp cols +subcols_df_lig$group_lig <- paste0(subcols_df_lig$ligand_outcome, "_", my_grp_lig, sep = "") +subcols_df_lig$group_ligR <- paste0(subcols_df_lig$ligand_outcome, "_", my_grp_lig_r, sep = "") + +# Call the function to create the palette based on the group defined above +subcols_lig <- ColourPalleteMulti(subcols_df_lig, "ligand_outcome", "my_grp_lig") +subcolsR_lig <- ColourPalleteMulti(subcols_df_lig, "ligand_outcome", "my_grp_lig_r") + +print(paste0("Colour palette generated for my_grp: ", length(subcols_lig), " colours")) +print(paste0("Colour palette generated for my_grp_r: ", length(subcolsR_lig), " colours")) + +#################################################################### +# Data for logoplots +#################################################################### +#------------------------- +# choose df for logoplot +#------------------------- +logo_data = merged_df3 +#logo_data = merged_df3_comp + +# quick checks +colnames(logo_data) +str(logo_data) + +c1 = unique(logo_data$position) +nrow(logo_data) +cat("No. of rows in my_data:", nrow(logo_data) + , "\nDistinct positions corresponding to snps:", length(c1) + , "\n===========================================================") +#======================================================================= +#================== +# logo data: OR +#================== +foo = logo_data[, c("position" + , "mutant_type","duet_scaled", "or_mychisq" + , "mut_prop_polarity", "mut_prop_water")] + +logo_data$log10or = log10(logo_data$or_mychisq) +logo_data_plot = logo_data[, c("position" + , "mutant_type", "or_mychisq", "log10or")] + +logo_data_plot_or = logo_data[, c("position", "mutant_type", "or_mychisq")] +wide_df_or <- logo_data_plot_or %>% spread(position, or_mychisq, fill = 0.0) + +wide_df_or = as.matrix(wide_df_or) +rownames(wide_df_or) = wide_df_or[,1] +dim(wide_df_or) +wide_df_or = wide_df_or[,-1] +str(wide_df_or) + +position_or = as.numeric(colnames(wide_df_or)) + +#================== +# logo data: logOR +#================== +logo_data_plot_logor = logo_data[, c("position", "mutant_type", "log10or")] +wide_df_logor <- logo_data_plot_logor %>% spread(position, log10or, fill = 0.0) + +wide_df_logor = as.matrix(wide_df_logor) + +rownames(wide_df_logor) = wide_df_logor[,1] +wide_df_logor = subset(wide_df_logor, select = -c(1) ) +colnames(wide_df_logor) +wide_df_logor_m = data.matrix(wide_df_logor) + +rownames(wide_df_logor_m) +colnames(wide_df_logor_m) + +position_logor = as.numeric(colnames(wide_df_logor_m)) + +#=============================== +# logo data: multiple nsSNPs (>1) +#================================= +#require(data.table) + +# get freq count of positions so you can subset freq<1 +setDT(logo_data)[, mut_pos_occurrence := .N, by = .(position)] + +table(logo_data$position) +table(logo_data$mut_pos_occurrence) + +max_mut = max(table(logo_data$position)) + +# extract freq_pos > 1 +my_data_snp = logo_data[logo_data$mut_pos_occurrence!=1,] +u = unique(my_data_snp$position) +max_mult_mut = max(table(my_data_snp$position)) + +if (nrow(my_data_snp) == nrow(logo_data) - table(logo_data$mut_pos_occurrence)[[1]] ){ + + cat("PASS: positions with multiple muts extracted" + , "\nNo. of mutations:", nrow(my_data_snp) + , "\nNo. of positions:", length(u) + , "\nMax no. of muts at any position", max_mult_mut) +}else{ + cat("FAIL: positions with multiple muts could NOT be extracted" + , "\nExpected:",nrow(logo_data) - table(logo_data$mut_pos_occurrence)[[1]] + , "\nGot:", nrow(my_data_snp) ) +} + +cat("\nNo. of sites with only 1 mutations:", table(logo_data$mut_pos_occurrence)[[1]]) + +#-------------------------------------- +# matrix for_mychisq mutant type +# frequency of mutant type by position +#--------------------------------------- +table(my_data_snp$mutant_type, my_data_snp$position) +tab_mt = table(my_data_snp$mutant_type, my_data_snp$position) +class(tab_mt) + +# unclass to convert to matrix +tab_mt = unclass(tab_mt) +tab_mt = as.matrix(tab_mt, rownames = T) + +# should be TRUE +is.matrix(tab_mt) + +rownames(tab_mt) #aa +colnames(tab_mt) #pos + +#------------------------------------- +# matrix for wild type +# frequency of wild type by position +#------------------------------------- +tab_wt = table(my_data_snp$wild_type, my_data_snp$position); tab_wt +tab_wt = unclass(tab_wt) + +# remove wt duplicates +wt = my_data_snp[, c("position", "wild_type")] +wt = wt[!duplicated(wt),] + +tab_wt = table(wt$wild_type, wt$position); tab_wt # should all be 1 + +rownames(tab_wt) +rownames(tab_wt) + +identical(colnames(tab_mt), colnames(tab_wt)) +identical(ncol(tab_mt), ncol(tab_wt)) + +#---------------------------------- +# logo data OR: multiple nsSNPs (>1) +#---------------------------------- +logo_data_or_mult = my_data_snp[, c("position", "mutant_type", "or_mychisq")] +#wide_df_or <- logo_data_or %>% spread(position, or_mychisq, fill = 0.0) +wide_df_or_mult <- logo_data_or_mult %>% spread(position, or_mychisq, fill = NA) + +wide_df_or_mult = as.matrix(wide_df_or_mult) +rownames(wide_df_or_mult) = wide_df_or_mult[,1] +wide_df_or_mult = wide_df_or_mult[,-1] +str(wide_df_or_mult) + +position_or_mult = as.numeric(colnames(wide_df_or_mult)) + +#################################################################### +# Data for Corrplots +#################################################################### +cat("\n==========================================" + , "\nCORR PLOTS data: PS" + , "\n===========================================") + +df_ps = merged_df2 + +#-------------------- +# adding log cols : NEW UNCOMMENT +#-------------------- +#df_ps$log10_or_mychisq = log10(df_ps$or_mychisq) +#df_ps$neglog_pval_fisher = -log10(df_ps$pval_fisher) + +##df_ps$log10_or_kin = log10(df_ps$or_kin) +##df_ps$neglog_pwald_kin = -log10(df_ps$pwald_kin) + +#df_ps$mutation_info_labels = ifelse(df_ps$mutation_info == dr_muts_col, 1, 0) + +#---------------------------- +# columns for corr plots:PS +#---------------------------- +# subset data to generate pairwise correlations +cols_to_select = c("mutationinformation" + , "duet_scaled" + , "foldx_scaled" + #, "mutation_info_labels" + , "asa" + , "rsa" + , "rd_values" + , "kd_values" + , "log10_or_mychisq" + , "neglog_pval_fisher" + ##, "or_kin" + ##, "neglog_pwald_kin" + , "af" + ##, "af_kin" + , "duet_outcome" + , drug) + +corr_data_ps = df_ps[cols_to_select] + +dim(corr_data_ps) + +#-------------------------------------- +# assign nice colnames (for display) +#-------------------------------------- +my_corr_colnames = c("Mutation" + , "DUET" + , "FoldX" + #, "Mutation class" + , "ASA" + , "RSA" + , "RD" + , "KD" + , "Log (OR)" + , "-Log (P)" + ##, "Adjusted (OR)" + ##, "-Log (P wald)" + , "MAF" + ##, "AF_kin" + , "duet_outcome" + , drug) + +length(my_corr_colnames) + +colnames(corr_data_ps) +colnames(corr_data_ps) <- my_corr_colnames +colnames(corr_data_ps) + +start = 1 +end = which(colnames(corr_data_ps) == drug); end # should be the last column +offset = 1 + +#=========================== +# Corr data for plots: PS +# big_df ps: ~ merged_df2 +#=========================== + +#corr_ps_df2 = corr_data_ps[start:(end-offset)] # without drug +corr_ps_df2 = corr_data_ps[start:end] +head(corr_ps_df2) + +#=========================== +# Corr data for plots: PS +# short_df ps: ~merged_df3 +#=========================== +corr_ps_df3 = corr_ps_df2[!duplicated(corr_ps_df2$Mutation),] + +na_or = sum(is.na(corr_ps_df3$`Log (OR)`)) +check1 = nrow(corr_ps_df3) - na_or + +##na_adj_or = sum(is.na(corr_ps_df3$`adjusted (OR)`)) +##check2 = nrow(corr_ps_df3) - na_adj_or + +if (nrow(corr_ps_df3) == nrow(merged_df3) && nrow(merged_df3_comp) == check1) { + cat( "\nPASS: No. of rows for corr_ps_df3 match" + , "\nPASS: No. of OR values checked: " , check1) +} else { + cat("\nFAIL: Numbers mismatch:" + , "\nExpected nrows: ", nrow(merged_df3) + , "\nGot: ", nrow(corr_ps_df3) + , "\nExpected OR values: ", nrow(merged_df3_comp) + , "\nGot: ", check1) +} + +#================================= +# Data for Correlation plots: LIG +#================================= +cat("\n==========================================" + , "\nCORR PLOTS data: LIG" + , "\n===========================================") + +df_lig = merged_df2_lig + +table(df_lig$ligand_outcome) + +#-------------------- +# adding log cols : NEW UNCOMMENT +#-------------------- +#df_lig$log10_or_mychisq = log10(df_lig$or_mychisq) +#df_lig$neglog_pval_fisher = -log10(df_lig$pval_fisher) + +##df_lig$log10_or_kin = log10(df_lig$or_kin) +##df_lig$neglog_pwald_kin = -log10(df_lig$pwald_kin) + +#---------------------------- +# columns for corr plots:PS +#---------------------------- +# subset data to generate pairwise correlations +cols_to_select = c("mutationinformation" + , "affinity_scaled" + #, "mutation_info_labels" + , "asa" + , "rsa" + , "rd_values" + , "kd_values" + , "log10_or_mychisq" + , "neglog_pval_fisher" + ##, "or_kin" + ##, "neglog_pwald_kin" + , "af" + ##, "af_kin" + , "ligand_outcome" + , drug) + +corr_data_lig = df_lig[, cols_to_select] + +dim(corr_data_lig) + +#-------------------------------------- +# assign nice colnames (for display) +#-------------------------------------- +my_corr_colnames = c("Mutation" + , "Ligand Affinity" + #, "Mutation class" + , "ASA" + , "RSA" + , "RD" + , "KD" + , "Log (OR)" + , "-Log (P)" + ##, "Adjusted (OR)" + ##, "-Log (P wald)" + , "MAF" + ##, "MAF_kin" + , "ligand_outcome" + , drug) + +length(my_corr_colnames) + +colnames(corr_data_lig) +colnames(corr_data_lig) <- my_corr_colnames +colnames(corr_data_lig) + +start = 1 +end = which(colnames(corr_data_lig) == drug); end # should be the last column +offset = 1 + +#============================= +# Corr data for plots: LIG +# big_df lig: ~ merged_df2_lig +#============================== +#corr_lig_df2 = corr_data_lig[start:(end-offset)] # without drug +corr_lig_df2 = corr_data_lig[start:end] +head(corr_lig_df2) + +#============================= +# Corr data for plots: LIG +# short_df lig: ~ merged_df3_lig +#============================== +corr_lig_df3 = corr_lig_df2[!duplicated(corr_lig_df2$Mutation),] + +na_or_lig = sum(is.na(corr_lig_df3$`Log (OR)`)) +check1_lig = nrow(corr_lig_df3) - na_or_lig + +if (nrow(corr_lig_df3) == nrow(merged_df3_lig) && nrow(merged_df3_comp_lig) == check1_lig) { + cat( "\nPASS: No. of rows for corr_lig_df3 match" + , "\nPASS: No. of OR values checked: " , check1_lig) +} else { + cat("\nFAIL: Numbers mismatch:" + , "\nExpected nrows: ", nrow(merged_df3_lig) + , "\nGot: ", nrow(corr_ps_df3_lig) + , "\nExpected OR values: ", nrow(merged_df3_comp_lig) + , "\nGot: ", check1_lig) +} + +# remove unnecessary columns +identical(corr_data_lig, corr_lig_df2) +identical(corr_data_ps, corr_ps_df2) + +#rm(df_ps, df_lig, corr_data_ps, corr_data_lig) + +######################################################################## +# End of script +######################################################################## +rm(foo) + +cat("\n===================================================\n" + , "\nSuccessful: get_plotting_dfs.R worked!" + , "\n====================================================") \ No newline at end of file From 605eb54526205738a8bc2e2d4b6d993c4573b007 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Thu, 2 Sep 2021 17:40:24 +0100 Subject: [PATCH 08/51] saving work for the day --- dynamut/split_csv.sh | 4 +++- scripts/functions/lf_bp.R | 8 +++++--- scripts/functions/test_lf_bp.R | 9 ++++++--- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/dynamut/split_csv.sh b/dynamut/split_csv.sh index b5f15f1..18103c6 100755 --- a/dynamut/split_csv.sh +++ b/dynamut/split_csv.sh @@ -1,6 +1,6 @@ #!/bin/bash -# FIXME: This is written for expediency to kickstart running dynamut and mcsm-NA +# FIXME: This is written for expediency to kickstart running dynamut, mcsm-PPI2 (batch pf 50) and mCSM-NA (batch of 20) # Usage: ~/git/LSHTM_analysis/dynamut/split_csv.sh # copy your snp file to split into the dynamut dir @@ -13,7 +13,9 @@ mkdir -p ${OUTDIR}/${CHUNK} cd ${OUTDIR}/${CHUNK} split ../../${INFILE} -l ${CHUNK} -d snp_batch_ +#split ${INFILE} -l ${CHUNK} -d snp_batch_ # use case #~/git/LSHTM_analysis/dynamut/split_csv.sh gid_mcsm_formatted_snps.csv snp_batches 50 #~/git/LSHTM_analysis/dynamut/split_csv.sh embb_mcsm_formatted_snps.csv snp_batches 50 +~/git/LSHTM_analysis/dynamut/split_csv.sh pnca_mcsm_formatted_snps.csv snp_batches 50 diff --git a/scripts/functions/lf_bp.R b/scripts/functions/lf_bp.R index 4677548..608247d 100644 --- a/scripts/functions/lf_bp.R +++ b/scripts/functions/lf_bp.R @@ -24,7 +24,7 @@ lf_bp <- function(lf_df , my_pts = 22 # plot title size) , make_boxplot = FALSE , bp_width = c("auto", 0.5) - , add_stats = FALSE + , add_stats = TRUE , stat_grp_comp = c("DM", "OM") , stat_method = "wilcox.test" , my_paired = FALSE @@ -104,8 +104,10 @@ lf_bp <- function(lf_df OutPlot = OutPlot + stat_compare_means(comparisons = my_comparisonsL , method = stat_method , paired = my_paired - , label = stat_label[1]) - } + , label = stat_label[2]) + return(OutPlot) + } + return(OutPlot) } diff --git a/scripts/functions/test_lf_bp.R b/scripts/functions/test_lf_bp.R index 42b78bf..f3d2327 100644 --- a/scripts/functions/test_lf_bp.R +++ b/scripts/functions/test_lf_bp.R @@ -12,9 +12,9 @@ source("../functions/lf_bp.R") ###################### # Make plot: ggplot ###################### -lf_bp(lf_df = lf_dynamut2 - , p_title = "Dynamut2" - , colour_categ = "ddg_dynamut2_outcome" +lf_bp(lf_df = lf_encomddg + , p_title = "ENCoM-DDG" + , colour_categ = "ddg_encom_outcome" , x_grp = "mutation_info" , y_var = "param_value" , facet_var = "param_type" @@ -36,6 +36,9 @@ lf_bp(lf_df = lf_dynamut2 , my_paired = FALSE , stat_label = c("p.format", "p.signif") ) +#wilcox.test(wf_encomdds$`EnCOM ΔΔS`[wf_encomdds$mutation_info == "DM"] +# , wf_encomdds$`EnCOM ΔΔS`[wf_encomdds$mutation_info == "OM"]) + ###################### # Make plot: plotly ###################### From 869fca7f945a823c67248d381726058bcec45c92 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Mon, 6 Sep 2021 19:50:50 +0100 Subject: [PATCH 09/51] added function for generating lineage barplots and also test script along wiadding script for processing data and added it to get_plotting_dfs.R --- dynamut/split_csv.sh | 6 +- scripts/functions/bp_lineage.R | 172 +++++++++++++++++++++++++++ scripts/functions/test_bp_lineage.R | 111 ++++++++++++++++++ scripts/plotting/get_plotting_dfs.R | 8 +- scripts/plotting/lineage_bp_data.R | 173 ++++++++++++++++++++++++++++ scripts/plotting/other_plots_data.R | 5 +- 6 files changed, 470 insertions(+), 5 deletions(-) create mode 100644 scripts/functions/bp_lineage.R create mode 100644 scripts/functions/test_bp_lineage.R create mode 100644 scripts/plotting/lineage_bp_data.R diff --git a/dynamut/split_csv.sh b/dynamut/split_csv.sh index 18103c6..17c1a03 100755 --- a/dynamut/split_csv.sh +++ b/dynamut/split_csv.sh @@ -12,10 +12,12 @@ CHUNK=$3 mkdir -p ${OUTDIR}/${CHUNK} cd ${OUTDIR}/${CHUNK} +# makes the 2 dirs, hence ../.. split ../../${INFILE} -l ${CHUNK} -d snp_batch_ -#split ${INFILE} -l ${CHUNK} -d snp_batch_ # use case #~/git/LSHTM_analysis/dynamut/split_csv.sh gid_mcsm_formatted_snps.csv snp_batches 50 #~/git/LSHTM_analysis/dynamut/split_csv.sh embb_mcsm_formatted_snps.csv snp_batches 50 -~/git/LSHTM_analysis/dynamut/split_csv.sh pnca_mcsm_formatted_snps.csv snp_batches 50 +#~/git/LSHTM_analysis/dynamut/split_csv.sh pnca_mcsm_formatted_snps.csv snp_batches 50 + +# add .txt to the files diff --git a/scripts/functions/bp_lineage.R b/scripts/functions/bp_lineage.R new file mode 100644 index 0000000..86eb9f1 --- /dev/null +++ b/scripts/functions/bp_lineage.R @@ -0,0 +1,172 @@ +######################################## +# Lineage and within SNP count barplot +######################################## + +lin_count_bp <- function( lf_data + , x_categ = "" + , y_count = "" + , bar_fill_categ = "" + , display_label_col = "" + , bar_stat_stype = "identity" + , x_lab_angle = 90 + , d_lab_size = 5 + , d_lab_hjust = 0.5 + , d_lab_vjust = 0.5 + , d_lab_col = "black" + , my_xats = 20 # x axis text size + , my_yats = 20 # y axis text size + , my_xals = 22 # x axis label size + , my_yals = 22 # y axis label size + , my_lls = 22 # legend label size + , bar_col_labels = c("Mutations", "Total Samples") + , bar_col_values = c("grey50", "gray75") + , bar_leg_name = "" + , leg_location = "top" + , y_log10 = FALSE + , y_scale_percent = FALSE + , y_label = c("Count", "SNP diversity") + ) { + g = ggplot(lf_data + , aes( x = factor( eval(parse(text = x_categ)), ordered = T ) + , y = eval(parse(text = y_count)) + , fill = eval(parse(text = bar_fill_categ)) ) ) + + OutPlot = g + geom_bar( stat = bar_stat_stype + , position = position_stack(reverse = TRUE) + #, alpha = 1 + #, colour = "grey75" + ) + + theme(axis.text.x = element_text(size = my_xats + , angle = x_lab_angle) + , axis.text.y = element_text(size = my_yats + , angle = 90 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = my_xals + , colour = "black") + , axis.title.y = element_text(size = my_yals + , colour = "black") + , legend.position = leg_location + , legend.text = element_text(size = my_lls)) + + + geom_label(aes(label = eval(parse(text = display_label_col))) + , size = d_lab_size + , hjust = d_lab_hjust + , vjust = d_lab_vjust + , colour = d_lab_col + , show.legend = FALSE + #, check_overlap = TRUE + , position = position_stack(reverse = T)) + + + scale_fill_manual(values = bar_col_values + , name = bar_leg_name + , labels = bar_col_labels) + + labs(title = "" + , x = "" + , y = y_label + , colour = "black") + + + if (y_log10){ + + OutPlot = OutPlot + + scale_y_continuous(trans = "log10" + , labels = trans_format("log10", math_format(10^.x) ) ) + + } + + if (y_scale_percent){ + + OutPlot = OutPlot + + #scale_y_continuous(labels = scales::percent_format(accuracy = 1)) + + scale_y_continuous(labels = scales::percent) + + + labs(title = "" + , x = "" + , y = y_label + , colour = "black") + } + + return(OutPlot) +} + +############################ +# Lineage diversity barplot +############################ +# lin_diversity_bp <- function( wf_data +# , x_categ = "sel_lineages" +# , y_count = "snp_diversity" +# , bar_stat_stype = "identity" +# , display_label_col = "snp_diversity_f" +# , x_lab_angle = 90 +# , d_lab_size = 5 +# , d_lab_hjust = 0.5 +# , d_lab_vjust = 0.5 +# , d_lab_col = "black" +# , my_xats = 20 # x axis text size +# , my_yats = 20 # y axis text size +# , my_xals = 22 # x axis label size +# , my_yals = 22 # y axis label size +# , my_lls = 22 # legend label size +# , bar_leg_name = "" +# , leg_location = "top" +# , y_scale_percent = TRUE +# , y_label = "SNP diversity" ) +# +# { +# g = ggplot(wf_data +# , aes( x = factor( eval(parse(text = x_categ)), ordered = T ) +# , y = eval(parse(text = y_count)) ) ) +# +# OutPlot = g + geom_bar( stat = bar_stat_stype +# , position = position_stack(reverse = TRUE) +# ) + +# +# theme(axis.text.x = element_text(size = my_xats +# , angle = x_lab_angle) +# , axis.text.y = element_text(size = my_yats +# , angle = 90 +# , hjust = 1 +# , vjust = 0) +# , axis.title.x = element_text(size = my_xals +# , colour = "black") +# , axis.title.y = element_text(size = my_yals +# , colour = "black") +# , legend.position = leg_location +# , legend.text = element_text(size = my_lls)) + +# +# geom_label(aes(label = eval(parse(text = display_label_col))) +# , size = d_lab_size +# , hjust = d_lab_hjust +# , vjust = d_lab_vjust +# , colour = d_lab_col +# , show.legend = FALSE +# #, check_overlap = TRUE +# , position = position_stack(reverse = T)) +# # return(OutPlot) +# +# if (y_scale_percent){ +# +# OutPlot = OutPlot + +# scale_y_continuous(labels = scales::percent_format(accuracy = 1)) + +# labs(title = "" +# , x = "" +# , y = y_label +# , colour = "black") +# +# return(OutPlot) +# } + +# return(OutPlot) + +# } + + + + +# ggp <- ggplot(bar_sel, aes(sel_lineages, snp_diversity)) + +# geom_bar(stat = "identity") +# ggp + scale_y_continuous(labels = scales::percent_format(accuracy = 1) +# #, limits = c(0,1) +# , breaks = seq(0, 30, 5) +# ) diff --git a/scripts/functions/test_bp_lineage.R b/scripts/functions/test_bp_lineage.R new file mode 100644 index 0000000..6742429 --- /dev/null +++ b/scripts/functions/test_bp_lineage.R @@ -0,0 +1,111 @@ +setwd("~/git/LSHTM_analysis/scripts/plotting") + +#source ('get_plotting_dfs.R') +source("../functions/bp_lineage.R") + +######################################### +# Lineage and SNP count: lineage lf data +######################################### +# Relevel factors so that x-axis categ appear as you want +lin_lf_plot = lin_lf +lin_lf_plot +is.factor(lin_lf_plot$sel_lineages_f) + +lin_lf_plot$sel_lineages_f = factor(lin_lf_plot$sel_lineages_f, c("" + , "L1" + , "L2" + , "L3" + , "L4" + , "L5" + , "L6" + , "L7" + , "LBOV" + , "L1;L2" + , "L1;L3" + , "L1;L4" + , "L2;L3" + , "L2;L3;L4" + , "L2;L4" + , "L2;L6" + , "L2;LBOV" + , "L3;L4" + , "L4;L6" + , "L4;L7")) + +levels(lin_lf_plot$sel_lineages_f) + +lin_count_bp(lin_lf_plot + , x_categ = "sel_lineages_f" + , y_count = "p_count" + , bar_fill_categ = "count_categ" + , display_label_col = "p_count" + , bar_stat_stype = "identity" + , x_lab_angle = 90 + , my_xats = 20 + , bar_col_labels = c("Mutations", "Total Samples") + , bar_col_values = c("grey50", "gray75") + , y_log10 = T + , y_label = "Count" + , y_scale_percent = F) + +############################################### +# Lineage SNP diversity count: lineage wf data +############################################### +# Relevel factors so that x-axis categ appear as you want +lin_wf_plot = lin_wf +is.factor(lin_wf_plot$sel_lineages_f) + +lin_wf_plot$sel_lineages_f = factor(lin_wf_plot$sel_lineages_f, c("" + , "L1" + , "L2" + , "L3" + , "L4" + , "L5" + , "L6" + , "L7" + , "LBOV" + , "L1;L2" + , "L1;L3" + , "L1;L4" + , "L2;L3" + , "L2;L3;L4" + , "L2;L4" + , "L2;L6" + , "L2;LBOV" + , "L3;L4" + , "L4;L6" + , "L4;L7")) + +levels(lin_wf_plot$sel_lineages_f) + +#========== +# Plot +#========== +lin_count_bp(lin_wf_plot + , x_categ = "sel_lineages_f" + , y_count = "snp_diversity" + , display_label_col = "snp_diversity_f" + + , bar_stat_stype = "identity" + , x_lab_angle = 90 + , my_xats = 20 + , y_scale_percent = T + , y_label = "SNP diversity" + + +) + + + + +, x_categ = "sel_lineages_f" +, y_count = "p_count" +, bar_fill_categ = "count_categ" +, display_label_col = "p_count" +, bar_stat_stype = "identity" +, x_lab_angle = 90 +, my_xats = 15 +, bar_col_labels = c("Mutations", "Total Samples") +, bar_col_values = c("grey50", "gray75") +, y_log10 = T +, y_scale_percent = F \ No newline at end of file diff --git a/scripts/plotting/get_plotting_dfs.R b/scripts/plotting/get_plotting_dfs.R index 2fc1c19..89b477c 100644 --- a/scripts/plotting/get_plotting_dfs.R +++ b/scripts/plotting/get_plotting_dfs.R @@ -437,13 +437,19 @@ if (nrow(corr_ps_df3) == nrow(merged_df3) && nrow(merged_df3_comp) == check1) { , "\nGot: ", check1) } - rm(foo) #################################################################### # Data for DM OM Plots: Long format dfs #################################################################### + source("other_plots_data.R") +#################################################################### +# Data for Lineage barplots: WF and LF dfs +#################################################################### + +source("lineage_bp_data.R") + ######################################################################## # End of script ######################################################################## diff --git a/scripts/plotting/lineage_bp_data.R b/scripts/plotting/lineage_bp_data.R new file mode 100644 index 0000000..2cdfbe8 --- /dev/null +++ b/scripts/plotting/lineage_bp_data.R @@ -0,0 +1,173 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: Script to format data for lineage barplots: +# WF and LF data with lineage sample, and snp counts +# sourced by get_plotting_dfs.R +######################################################### +# working dir and loading libraries +# getwd() +# setwd("~/git/LSHTM_analysis/scripts/plotting") +# getwd() + +# make cmd +# globals +# drug = "streptomycin" +# gene = "gid" + +# source("get_plotting_dfs.R") +#======================================================================= +################################################# +# Get data with lineage count, and snp diversity +################################################# +table(merged_df2$lineage) + +if (table(merged_df2$lineage == "")[[2]]) { + +cat("\nMissing samples with lineage classification:", table(merged_df2$lineage == "")[[2]]) + +} + +################################## +# WF data: lineages with +# snp count +# total_samples +# snp diversity (perc) +################################## +sel_lineages = levels(as.factor(merged_df2$lineage)) + +lin_wf = data.frame(sel_lineages) #4, 1 +total_snps_u = NULL +total_samples = NULL + +for (i in sel_lineages){ + #print(i) + curr_total = length(unique(merged_df2$id)[merged_df2$lineage==i]) + #print(curr_total) + total_samples = c(total_samples, curr_total) + print(total_samples) + + foo = merged_df2[merged_df2$lineage==i,] + print(paste0(i, "=======\n")) + print(length(unique(foo$mutationinformation))) + curr_count = length(unique(foo$mutationinformation)) + + total_snps_u = c(total_snps_u, curr_count) +} +lin_wf + +# Add these counts as columns to the df +lin_wf$num_snps_u = total_snps_u +lin_wf$total_samples = total_samples + +# Add SNP diversity +lin_wf$snp_diversity = lin_wf$num_snps_u/lin_wf$total_samples +lin_wf + +#===================== +# Add some formatting +#===================== +# SNP diversity +lin_wf$snp_diversity_f = round( (lin_wf$snp_diversity * 100), digits = 0) +lin_wf + +# Lineage names +lin_wf$sel_lineages_f = gsub("lineage", "L", lin_wf$sel_lineages) +lin_wf + +# # Lineage names +# lin_wf = lin_wf %>% +# mutate(ordering_category = case_when( +# sel_lineages_f == "" ~ 0 +# , sel_lineages_f == "L1" ~ 1 +# , sel_lineages_f == "L2" ~ 2 +# , sel_lineages_f == "L3" ~ 3 +# , sel_lineages_f == "L4" ~ 4 +# , sel_lineages_f == "L5" ~ 5 +# , sel_lineages_f == "L6" ~ 6 +# , sel_lineages_f == "L7" ~ 7 +# , sel_lineages_f == "LBOV" ~ 8 +# +# , sel_lineages_f == "L1;L2" ~ 9 +# , sel_lineages_f == "L1;L3" ~ 10 +# , sel_lineages_f == "L1;L4" ~ 11 +# +# , sel_lineages_f == "L2;L3" ~ 12 +# , sel_lineages_f == "L2;L3;L4" ~ 13 +# , sel_lineages_f == "L2;L4" ~ 14 +# , sel_lineages_f == "L2;L6" ~ 15 +# , sel_lineages_f == "L2;LBOV" ~ 16 +# +# , sel_lineages_f == "L3;L4" ~ 17 +# +# , sel_lineages_f == "L4;L6" ~ 18 +# , sel_lineages_f == "L4;L7" ~ 19 +# +# , FALSE ~ -1) +# ) + +################################## +# LF data: lineages with +# snp count +# total_samples +# snp diversity (perc) +################################## +names(lin_wf) +tot_cols = ncol(lin_wf) +pivot_cols = c("sel_lineages", "sel_lineages_f", "snp_diversity", "snp_diversity_f") +pivot_cols_n = length(pivot_cols) + +expected_rows = nrow(lin_wf) * ( length(lin_wf) - pivot_cols_n ) + +lin_lf <- gather(lin_wf + , count_categ + , p_count + , num_snps_u:total_samples + , factor_key = TRUE) +lin_lf + +# quick checks +if ( nrow(lin_lf) == expected_rows ){ + cat("\nPASS: Lineage LF data created" + , "\nnrow: ", nrow(lin_lf) + , "\nncol: ", ncol(lin_lf)) +} else { + cat("\nFAIL: numbers mismatch" + , "\nExpected nrow: ", expected_rows) +} +####################################### +# #===================== +# # Add some formatting +# #===================== +# lin_lf$sel_lineages_f = gsub("lineage", "L", lin_lf$sel_lineages) +# lin_lf + + +# lin_lf = lin_lf %>% +# mutate(ordering_category = case_when( +# sel_lineages_f == "" ~ 0 +# , sel_lineages_f == "L1" ~ 1 +# , sel_lineages_f == "L2" ~ 2 +# , sel_lineages_f == "L3" ~ 3 +# , sel_lineages_f == "L4" ~ 4 +# , sel_lineages_f == "L5" ~ 5 +# , sel_lineages_f == "L6" ~ 6 +# , sel_lineages_f == "L7" ~ 7 +# , sel_lineages_f == "LBOV" ~ 8 +# +# , sel_lineages_f == "L1;L2" ~ 9 +# , sel_lineages_f == "L1;L3" ~ 10 +# , sel_lineages_f == "L1;L4" ~ 11 +# +# , sel_lineages_f == "L2;L3" ~ 12 +# , sel_lineages_f == "L2;L3;L4" ~ 13 +# , sel_lineages_f == "L2;L4" ~ 14 +# , sel_lineages_f == "L2;L6" ~ 15 +# , sel_lineages_f == "L2;LBOV" ~ 16 +# +# , sel_lineages_f == "L3;L4" ~ 17 +# +# , sel_lineages_f == "L4;L6" ~ 18 +# , sel_lineages_f == "L4;L7" ~ 19 +# +# , FALSE ~ -1) +# ) diff --git a/scripts/plotting/other_plots_data.R b/scripts/plotting/other_plots_data.R index 8eb2020..a55303b 100644 --- a/scripts/plotting/other_plots_data.R +++ b/scripts/plotting/other_plots_data.R @@ -1,7 +1,8 @@ #!/usr/bin/env Rscript ######################################################### -# TASK: producing boxplots for dr and other muts - +# TASK: Script to format data for dm om plots: +# generating LF data +# sourced by get_plotting_dfs.R ######################################################### # working dir and loading libraries # getwd() From 50b89cdcd7afd3875bfe95de89f33a8d71897d1f Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Mon, 6 Sep 2021 19:52:56 +0100 Subject: [PATCH 10/51] one function with tuned params to generate count and diversity barplot --- scripts/functions/bp_lineage.R | 85 ++-------------------------------- 1 file changed, 3 insertions(+), 82 deletions(-) diff --git a/scripts/functions/bp_lineage.R b/scripts/functions/bp_lineage.R index 86eb9f1..6b7090c 100644 --- a/scripts/functions/bp_lineage.R +++ b/scripts/functions/bp_lineage.R @@ -1,5 +1,7 @@ ######################################## -# Lineage and within SNP count barplot +# Lineage barplot +# Lineage and nsSNP count barplot +# Lineage Diversity barplot ######################################## lin_count_bp <- function( lf_data @@ -89,84 +91,3 @@ lin_count_bp <- function( lf_data return(OutPlot) } - -############################ -# Lineage diversity barplot -############################ -# lin_diversity_bp <- function( wf_data -# , x_categ = "sel_lineages" -# , y_count = "snp_diversity" -# , bar_stat_stype = "identity" -# , display_label_col = "snp_diversity_f" -# , x_lab_angle = 90 -# , d_lab_size = 5 -# , d_lab_hjust = 0.5 -# , d_lab_vjust = 0.5 -# , d_lab_col = "black" -# , my_xats = 20 # x axis text size -# , my_yats = 20 # y axis text size -# , my_xals = 22 # x axis label size -# , my_yals = 22 # y axis label size -# , my_lls = 22 # legend label size -# , bar_leg_name = "" -# , leg_location = "top" -# , y_scale_percent = TRUE -# , y_label = "SNP diversity" ) -# -# { -# g = ggplot(wf_data -# , aes( x = factor( eval(parse(text = x_categ)), ordered = T ) -# , y = eval(parse(text = y_count)) ) ) -# -# OutPlot = g + geom_bar( stat = bar_stat_stype -# , position = position_stack(reverse = TRUE) -# ) + -# -# theme(axis.text.x = element_text(size = my_xats -# , angle = x_lab_angle) -# , axis.text.y = element_text(size = my_yats -# , angle = 90 -# , hjust = 1 -# , vjust = 0) -# , axis.title.x = element_text(size = my_xals -# , colour = "black") -# , axis.title.y = element_text(size = my_yals -# , colour = "black") -# , legend.position = leg_location -# , legend.text = element_text(size = my_lls)) + -# -# geom_label(aes(label = eval(parse(text = display_label_col))) -# , size = d_lab_size -# , hjust = d_lab_hjust -# , vjust = d_lab_vjust -# , colour = d_lab_col -# , show.legend = FALSE -# #, check_overlap = TRUE -# , position = position_stack(reverse = T)) -# # return(OutPlot) -# -# if (y_scale_percent){ -# -# OutPlot = OutPlot + -# scale_y_continuous(labels = scales::percent_format(accuracy = 1)) + -# labs(title = "" -# , x = "" -# , y = y_label -# , colour = "black") -# -# return(OutPlot) -# } - -# return(OutPlot) - -# } - - - - -# ggp <- ggplot(bar_sel, aes(sel_lineages, snp_diversity)) + -# geom_bar(stat = "identity") -# ggp + scale_y_continuous(labels = scales::percent_format(accuracy = 1) -# #, limits = c(0,1) -# , breaks = seq(0, 30, 5) -# ) From 3cee341170b2c30a1fb74b5503cc9aa6b21cc38d Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Tue, 7 Sep 2021 09:27:47 +0100 Subject: [PATCH 11/51] replaced old lineage barplot with count and diversity combined plots sourced from function --- scripts/functions/bp_lineage.R | 12 +- scripts/functions/test_bp_lineage.R | 107 +++------ scripts/plotting/basic_barplots_combined.R | 2 +- scripts/plotting/lineage_basic_barplot.R | 214 ------------------ .../lineage_basic_barplots_combined.R | 128 +++++++++++ scripts/plotting/lineage_bp_data.R | 112 ++++----- scripts/plotting/running_plotting_scripts.txt | 8 + 7 files changed, 217 insertions(+), 366 deletions(-) delete mode 100644 scripts/plotting/lineage_basic_barplot.R create mode 100644 scripts/plotting/lineage_basic_barplots_combined.R diff --git a/scripts/functions/bp_lineage.R b/scripts/functions/bp_lineage.R index 6b7090c..ad43386 100644 --- a/scripts/functions/bp_lineage.R +++ b/scripts/functions/bp_lineage.R @@ -20,8 +20,8 @@ lin_count_bp <- function( lf_data , my_xals = 22 # x axis label size , my_yals = 22 # y axis label size , my_lls = 22 # legend label size - , bar_col_labels = c("Mutations", "Total Samples") - , bar_col_values = c("grey50", "gray75") + , bar_col_labels = "" + , bar_col_values = "" , bar_leg_name = "" , leg_location = "top" , y_log10 = FALSE @@ -68,20 +68,18 @@ lin_count_bp <- function( lf_data , y = y_label , colour = "black") - if (y_log10){ OutPlot = OutPlot + scale_y_continuous(trans = "log10" , labels = trans_format("log10", math_format(10^.x) ) ) - - } + } if (y_scale_percent){ OutPlot = OutPlot + - #scale_y_continuous(labels = scales::percent_format(accuracy = 1)) + - scale_y_continuous(labels = scales::percent) + + scale_y_continuous(labels = scales::percent_format(accuracy = 1)) + + #scale_y_continuous(labels = scales::percent) + labs(title = "" , x = "" diff --git a/scripts/functions/test_bp_lineage.R b/scripts/functions/test_bp_lineage.R index 6742429..876f237 100644 --- a/scripts/functions/test_bp_lineage.R +++ b/scripts/functions/test_bp_lineage.R @@ -1,39 +1,25 @@ setwd("~/git/LSHTM_analysis/scripts/plotting") -#source ('get_plotting_dfs.R') +source ('get_plotting_dfs.R') source("../functions/bp_lineage.R") ######################################### # Lineage and SNP count: lineage lf data ######################################### -# Relevel factors so that x-axis categ appear as you want -lin_lf_plot = lin_lf -lin_lf_plot -is.factor(lin_lf_plot$sel_lineages_f) - -lin_lf_plot$sel_lineages_f = factor(lin_lf_plot$sel_lineages_f, c("" - , "L1" - , "L2" - , "L3" - , "L4" - , "L5" - , "L6" - , "L7" - , "LBOV" - , "L1;L2" - , "L1;L3" - , "L1;L4" - , "L2;L3" - , "L2;L3;L4" - , "L2;L4" - , "L2;L6" - , "L2;LBOV" - , "L3;L4" - , "L4;L6" - , "L4;L7")) +#========================= +# Data: All lineages or +# selected few +#========================= +sel_lineages = levels(lin_lf$sel_lineages_f) +sel_lineages +lin_lf_plot = lin_lf[lin_lf$sel_lineages_f%in%sel_lineages,] +# drop unused factor levels +lin_lf_plot$sel_lineages_f = factor(lin_lf_plot$sel_lineages_f) levels(lin_lf_plot$sel_lineages_f) - +#========================= +# Lineage count plot +#========================= lin_count_bp(lin_lf_plot , x_categ = "sel_lineages_f" , y_count = "p_count" @@ -44,68 +30,33 @@ lin_count_bp(lin_lf_plot , my_xats = 20 , bar_col_labels = c("Mutations", "Total Samples") , bar_col_values = c("grey50", "gray75") - , y_log10 = T - , y_label = "Count" - , y_scale_percent = F) + , y_scale_percent = F # T for diversity + , y_log10 = F + , y_label = "Count") ############################################### # Lineage SNP diversity count: lineage wf data ############################################### -# Relevel factors so that x-axis categ appear as you want -lin_wf_plot = lin_wf -is.factor(lin_wf_plot$sel_lineages_f) - -lin_wf_plot$sel_lineages_f = factor(lin_wf_plot$sel_lineages_f, c("" - , "L1" - , "L2" - , "L3" - , "L4" - , "L5" - , "L6" - , "L7" - , "LBOV" - , "L1;L2" - , "L1;L3" - , "L1;L4" - , "L2;L3" - , "L2;L3;L4" - , "L2;L4" - , "L2;L6" - , "L2;LBOV" - , "L3;L4" - , "L4;L6" - , "L4;L7")) +#========================= +# Data: All lineages or +# selected few +#========================= +sel_lineages = levels(lin_wf$sel_lineages_f) +sel_lineages +lin_wf_plot = lin_wf[lin_wf$sel_lineages_f%in%sel_lineages,] +# drop unused factor levels +lin_wf_plot$sel_lineages_f = factor(lin_wf_plot$sel_lineages_f) levels(lin_wf_plot$sel_lineages_f) - -#========== -# Plot -#========== +#========================= +# Lineage Diversity plot +#========================= lin_count_bp(lin_wf_plot , x_categ = "sel_lineages_f" , y_count = "snp_diversity" , display_label_col = "snp_diversity_f" - , bar_stat_stype = "identity" , x_lab_angle = 90 , my_xats = 20 , y_scale_percent = T - , y_label = "SNP diversity" - - -) - - - - -, x_categ = "sel_lineages_f" -, y_count = "p_count" -, bar_fill_categ = "count_categ" -, display_label_col = "p_count" -, bar_stat_stype = "identity" -, x_lab_angle = 90 -, my_xats = 15 -, bar_col_labels = c("Mutations", "Total Samples") -, bar_col_values = c("grey50", "gray75") -, y_log10 = T -, y_scale_percent = F \ No newline at end of file + , y_label = "SNP diversity") diff --git a/scripts/plotting/basic_barplots_combined.R b/scripts/plotting/basic_barplots_combined.R index 7fee2d7..2643b0d 100644 --- a/scripts/plotting/basic_barplots_combined.R +++ b/scripts/plotting/basic_barplots_combined.R @@ -23,7 +23,7 @@ plot_basic_bp_combined_labelled = paste0(plotdir,"/", basic_bp_combined_labell #======================================================================= #======= -# combin DUET and Ligand affinity plots +# combine DUET and Ligand affinity plots #======= svg(plot_basic_bp_combined_labelled , width = 12, height = 12 ) diff --git a/scripts/plotting/lineage_basic_barplot.R b/scripts/plotting/lineage_basic_barplot.R deleted file mode 100644 index e4503d1..0000000 --- a/scripts/plotting/lineage_basic_barplot.R +++ /dev/null @@ -1,214 +0,0 @@ -#!/usr/bin/env Rscript -getwd() -setwd("~/git/LSHTM_analysis/scripts/plotting/") -getwd() -######################################################### -# TASK: Basic lineage barplot showing numbers - -# Output: Basic barplot with lineage samples and mut count - -########################################################## -# Installing and loading required packages -########################################################## -source("Header_TT.R") -require(data.table) -source("combining_dfs_plotting.R") -# should return the following dfs, directories and variables - -# PS combined: -# 1) merged_df2 -# 2) merged_df2_comp -# 3) merged_df3 -# 4) merged_df3_comp - -# LIG combined: -# 5) merged_df2_lig -# 6) merged_df2_comp_lig -# 7) merged_df3_lig -# 8) merged_df3_comp_lig - -# 9) my_df_u -# 10) my_df_u_lig - -cat("Directories imported:" - , "\n====================" - , "\ndatadir:", datadir - , "\nindir:", indir - , "\noutdir:", outdir - , "\nplotdir:", plotdir) - -cat("Variables imported:" - , "\n=====================" - , "\ndrug:", drug - , "\ngene:", gene - , "\ngene_match:", gene_match - , "\nAngstrom symbol:", angstroms_symbol - , "\nNo. of duplicated muts:", dup_muts_nu - , "\nNA count for ORs:", na_count - , "\nNA count in df2:", na_count_df2 - , "\nNA count in df3:", na_count_df3 - , "\ndr_muts_col:", dr_muts_col - , "\nother_muts_col:", other_muts_col - , "\ndrtype_col:", resistance_col) - - -#=========== -# input -#=========== -# output of combining_dfs_plotting.R - -#======= -# output -#======= -# plot 1 -basic_bp_lineage = "basic_lineage_barplot.svg" -plot_basic_bp_lineage = paste0(plotdir,"/", basic_bp_lineage) - -#======================================================================= -#================ -# Data for plots: -# you need merged_df2, comprehensive one -# since this has one-many relationship -# i.e the same SNP can belong to multiple lineages -#================ -# REASSIGNMENT as necessary -my_df = merged_df2 - -# clear excess variable -rm(merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -# Ensure correct data type in columns to plot: need to be factor -is.factor(my_df$lineage) -my_df$lineage = as.factor(my_df$lineage) -is.factor(my_df$lineage) - -#========================== -# Plot: Lineage barplot -# x = lineage y = No. of samples -# col = Lineage -# fill = lineage -#============================ -table(my_df$lineage) -as.data.frame(table(my_df$lineage)) - -#============= -# Data for plots -#============= -# REASSIGNMENT -df <- my_df - -rm(my_df) - -# get freq count of positions so you can subset freq<1 -#setDT(df)[, lineage_count := .N, by = .(lineage)] - -#****************** -# generate plot: barplot of mutation by lineage -#****************** -sel_lineages = c("lineage1" - , "lineage2" - , "lineage3" - , "lineage4" - #, "lineage5" - #, "lineage6" - #, "lineage7" - ) - -df_lin = subset(df, subset = lineage %in% sel_lineages) - -# Create df with lineage inform & no. of unique mutations -# per lineage and total samples within lineage -# this is essentially barplot with two y axis - -bar = bar = as.data.frame(sel_lineages) #4, 1 -total_snps_u = NULL -total_samples = NULL - -for (i in sel_lineages){ - #print(i) - curr_total = length(unique(df$id)[df$lineage==i]) - total_samples = c(total_samples, curr_total) - print(total_samples) - - foo = df[df$lineage==i,] - print(paste0(i, "=======")) - print(length(unique(foo$mutationinformation))) - curr_count = length(unique(foo$mutationinformation)) - - total_snps_u = c(total_snps_u, curr_count) -} - -print(total_snps_u) -bar$num_snps_u = total_snps_u -bar$total_samples = total_samples -bar - -#***************** -# generate plot: lineage barplot with two y-axis -#https://stackoverflow.com/questions/13035295/overlay-bar-graphs-in-ggplot2 -#***************** - -y1 = bar$num_snps_u -y2 = bar$total_samples -x = sel_lineages - -to_plot = data.frame(x = x - , y1 = y1 - , y2 = y2) -to_plot - -# FIXME later: will be depricated! -melted = melt(to_plot, id = "x") -melted - - -svg(plot_basic_bp_lineage) - -my_ats = 20 # axis text size -my_als = 22 # axis label size - -g = ggplot(melted, aes(x = x - , y = value - , fill = variable)) - -printFile = g + geom_bar(stat = "identity" - , position = position_stack(reverse = TRUE) - , alpha=.75 - , colour='grey75') + - theme(axis.text.x = element_text(size = my_ats) - , axis.text.y = element_text(size = my_ats - #, angle = 30 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = my_als - , colour = 'black') - , axis.title.y = element_text(size = my_als - , colour = 'black') - , legend.position = "top" - , legend.text = element_text(size = my_als)) + - #geom_text() + - geom_label(aes(label = value) - , size = 5 - , hjust = 0.5 - , vjust = 0.5 - , colour = 'black' - , show.legend = FALSE - #, check_overlap = TRUE - , position = position_stack(reverse = T)) + - labs(title = '' - , x = '' - , y = "Number" - , fill = 'Variable' - , colour = 'black') + - scale_fill_manual(values = c('grey50', 'gray75') - , name='' - , labels=c('Mutations', 'Total Samples')) + - scale_x_discrete(breaks = c('lineage1', 'lineage2', 'lineage3', 'lineage4') - , labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')) - -print(printFile) -dev.off() diff --git a/scripts/plotting/lineage_basic_barplots_combined.R b/scripts/plotting/lineage_basic_barplots_combined.R new file mode 100644 index 0000000..4b63587 --- /dev/null +++ b/scripts/plotting/lineage_basic_barplots_combined.R @@ -0,0 +1,128 @@ +#!/usr/bin/env Rscript +getwd() +setwd("~/git/LSHTM_analysis/scripts/plotting/") +getwd() +######################################################### +# TASK: Basic lineage barplot showing numbers + +# Output: Basic barplot with lineage samples and mut count + +########################################################## +# Installing and loading required packages +########################################################## +source("Header_TT.R") +source("../functions/bp_lineage.R") + +#=========== +# input +#=========== +source ('get_plotting_dfs.R') + +cat("Directories imported:" + , "\n====================" + , "\ndatadir:", datadir + , "\nindir:", indir + , "\noutdir:", outdir + , "\nplotdir:", plotdir) + +cat("Variables imported:" + , "\n=====================" + , "\ndrug:", drug + , "\ngene:", gene + , "\ngene_match:", gene_match + , "\nAngstrom symbol:", angstroms_symbol + #, "\nNo. of duplicated muts:", dup_muts_nu + , "\ndr_muts_col:", dr_muts_col + , "\nother_muts_col:", other_muts_col + , "\ndrtype_col:", resistance_col) + +#======= +# output +#======= +# plot 1 +basic_bp_lineage_cl = "basic_lineage_barplots_combined.svg" +plot_basic_bp_lineage_cl = paste0(plotdir,"/", basic_bp_lineage_cl) +plot_basic_bp_lineage_cl +################################################################# +#============================= +# PLOT 1: Lineage count plot: +# LF data +#============================= +#------------------------ +# Data: All lineages or +# selected few +#------------------------ +sel_lineages = levels(lin_lf$sel_lineages_f)[1:4] +sel_lineages +lin_lf_plot = lin_lf[lin_lf$sel_lineages_f%in%sel_lineages,] +str(lin_lf_plot) + +# drop unused factor levels +lin_lf_plot$sel_lineages_f = factor(lin_lf_plot$sel_lineages_f) +levels(lin_lf_plot$sel_lineages_f) +str(lin_lf_plot) + +#------------------------ +# plot from my function: +#------------------------ +lin_countP = lin_count_bp(lin_lf_plot + , x_categ = "sel_lineages_f" + , y_count = "p_count" + , bar_fill_categ = "count_categ" + , display_label_col = "p_count" + , bar_stat_stype = "identity" + , x_lab_angle = 90 + , my_xats = 20 + , bar_col_labels = c("Mutations", "Total Samples") + , bar_col_values = c("grey50", "gray75") + , y_scale_percent = F # T for diversity + , y_log10 = F + , y_label = "Count") +lin_countP +#================================ +# PLOT 2: Lineage Diversity plot +# WF data +#================================ +#------------------------ +# Data: All lineages or +# selected few +#------------------------ +sel_lineages = levels(lin_wf$sel_lineages_f)[1:4] +sel_lineages +lin_wf_plot = lin_wf[lin_wf$sel_lineages_f%in%sel_lineages,] +str(lin_wf_plot) + +# drop unused factor levels +lin_wf_plot$sel_lineages_f = factor(lin_wf_plot$sel_lineages_f) +levels(lin_wf_plot$sel_lineages_f) +str(lin_wf_plot) + +#------------------------ +# plot from my function: +#------------------------ +lin_diversityP = lin_count_bp(lin_wf_plot + , x_categ = "sel_lineages_f" + , y_count = "snp_diversity" + , display_label_col = "snp_diversity_f" + , bar_stat_stype = "identity" + , x_lab_angle = 90 + , my_xats = 20 + , y_scale_percent = T + , y_label = "SNP diversity") + +lin_diversityP +#########################################################################333 +#================================ +# Combine plots +#================================ + +svg(plot_basic_bp_lineage_cl , width = 8, height = 15 ) + +lineage_bp_combined = cowplot::plot_grid(lin_countP, lin_diversityP + #, labels = c("(a)", "(b)", "(c)", "(d)") + , nrow = 2 + , labels = "AUTO" + , label_size = 25) + +lineage_bp_combined +dev.off() diff --git a/scripts/plotting/lineage_bp_data.R b/scripts/plotting/lineage_bp_data.R index 2cdfbe8..2d51ab0 100644 --- a/scripts/plotting/lineage_bp_data.R +++ b/scripts/plotting/lineage_bp_data.R @@ -68,42 +68,35 @@ lin_wf #===================== # SNP diversity lin_wf$snp_diversity_f = round( (lin_wf$snp_diversity * 100), digits = 0) -lin_wf +lin_wf$snp_diversity_f = paste0(lin_wf$snp_diversity_f, "%") # Lineage names lin_wf$sel_lineages_f = gsub("lineage", "L", lin_wf$sel_lineages) lin_wf -# # Lineage names -# lin_wf = lin_wf %>% -# mutate(ordering_category = case_when( -# sel_lineages_f == "" ~ 0 -# , sel_lineages_f == "L1" ~ 1 -# , sel_lineages_f == "L2" ~ 2 -# , sel_lineages_f == "L3" ~ 3 -# , sel_lineages_f == "L4" ~ 4 -# , sel_lineages_f == "L5" ~ 5 -# , sel_lineages_f == "L6" ~ 6 -# , sel_lineages_f == "L7" ~ 7 -# , sel_lineages_f == "LBOV" ~ 8 -# -# , sel_lineages_f == "L1;L2" ~ 9 -# , sel_lineages_f == "L1;L3" ~ 10 -# , sel_lineages_f == "L1;L4" ~ 11 -# -# , sel_lineages_f == "L2;L3" ~ 12 -# , sel_lineages_f == "L2;L3;L4" ~ 13 -# , sel_lineages_f == "L2;L4" ~ 14 -# , sel_lineages_f == "L2;L6" ~ 15 -# , sel_lineages_f == "L2;LBOV" ~ 16 -# -# , sel_lineages_f == "L3;L4" ~ 17 -# -# , sel_lineages_f == "L4;L6" ~ 18 -# , sel_lineages_f == "L4;L7" ~ 19 -# -# , FALSE ~ -1) -# ) +# Important: Relevel factors so that x-axis categ appear as you want +lin_wf$sel_lineages_f = factor(lin_wf$sel_lineages_f, c("L1" + , "L2" + , "L3" + , "L4" + , "L5" + , "L6" + , "L7" + , "LBOV" + , "L1;L2" + , "L1;L3" + , "L1;L4" + , "L2;L3" + , "L2;L3;L4" + , "L2;L4" + , "L2;L6" + , "L2;LBOV" + , "L3;L4" + , "L4;L6" + , "L4;L7" + , "")) + +levels(lin_wf$sel_lineages_f) ################################## # LF data: lineages with @@ -134,40 +127,27 @@ if ( nrow(lin_lf) == expected_rows ){ cat("\nFAIL: numbers mismatch" , "\nExpected nrow: ", expected_rows) } -####################################### -# #===================== -# # Add some formatting -# #===================== -# lin_lf$sel_lineages_f = gsub("lineage", "L", lin_lf$sel_lineages) -# lin_lf +# Important: Relevel factors so that x-axis categ appear as you want +lin_lf$sel_lineages_f = factor(lin_lf$sel_lineages_f, c("L1" + , "L2" + , "L3" + , "L4" + , "L5" + , "L6" + , "L7" + , "LBOV" + , "L1;L2" + , "L1;L3" + , "L1;L4" + , "L2;L3" + , "L2;L3;L4" + , "L2;L4" + , "L2;L6" + , "L2;LBOV" + , "L3;L4" + , "L4;L6" + , "L4;L7" + , "")) -# lin_lf = lin_lf %>% -# mutate(ordering_category = case_when( -# sel_lineages_f == "" ~ 0 -# , sel_lineages_f == "L1" ~ 1 -# , sel_lineages_f == "L2" ~ 2 -# , sel_lineages_f == "L3" ~ 3 -# , sel_lineages_f == "L4" ~ 4 -# , sel_lineages_f == "L5" ~ 5 -# , sel_lineages_f == "L6" ~ 6 -# , sel_lineages_f == "L7" ~ 7 -# , sel_lineages_f == "LBOV" ~ 8 -# -# , sel_lineages_f == "L1;L2" ~ 9 -# , sel_lineages_f == "L1;L3" ~ 10 -# , sel_lineages_f == "L1;L4" ~ 11 -# -# , sel_lineages_f == "L2;L3" ~ 12 -# , sel_lineages_f == "L2;L3;L4" ~ 13 -# , sel_lineages_f == "L2;L4" ~ 14 -# , sel_lineages_f == "L2;L6" ~ 15 -# , sel_lineages_f == "L2;LBOV" ~ 16 -# -# , sel_lineages_f == "L3;L4" ~ 17 -# -# , sel_lineages_f == "L4;L6" ~ 18 -# , sel_lineages_f == "L4;L7" ~ 19 -# -# , FALSE ~ -1) -# ) +levels(lin_lf$sel_lineages_f) diff --git a/scripts/plotting/running_plotting_scripts.txt b/scripts/plotting/running_plotting_scripts.txt index 8d7ba0c..332293e 100644 --- a/scripts/plotting/running_plotting_scripts.txt +++ b/scripts/plotting/running_plotting_scripts.txt @@ -112,6 +112,14 @@ note: - fa flag has default if not supplied - fb flag has default if not supplied + +#=================== +# Add LINEAGE ONE +#=================== +# Lineage_bp.R +creates Count and Diversity plot + + ######################################################################## # TODO Delete: dirs.R From c9519b3b56194aad06ce8394292cce3a166444d6 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Tue, 7 Sep 2021 10:52:26 +0100 Subject: [PATCH 12/51] moved old lineage_basic_barplot.R to redundant --- .../redundant/lineage_basic_barplot.R | 214 +++++++++++++ .../redundant/other_plots_data_SAFEGUARD.R | 301 ++++++++++++++++++ 2 files changed, 515 insertions(+) create mode 100644 scripts/plotting/redundant/lineage_basic_barplot.R create mode 100644 scripts/plotting/redundant/other_plots_data_SAFEGUARD.R diff --git a/scripts/plotting/redundant/lineage_basic_barplot.R b/scripts/plotting/redundant/lineage_basic_barplot.R new file mode 100644 index 0000000..e4503d1 --- /dev/null +++ b/scripts/plotting/redundant/lineage_basic_barplot.R @@ -0,0 +1,214 @@ +#!/usr/bin/env Rscript +getwd() +setwd("~/git/LSHTM_analysis/scripts/plotting/") +getwd() +######################################################### +# TASK: Basic lineage barplot showing numbers + +# Output: Basic barplot with lineage samples and mut count + +########################################################## +# Installing and loading required packages +########################################################## +source("Header_TT.R") +require(data.table) +source("combining_dfs_plotting.R") +# should return the following dfs, directories and variables + +# PS combined: +# 1) merged_df2 +# 2) merged_df2_comp +# 3) merged_df3 +# 4) merged_df3_comp + +# LIG combined: +# 5) merged_df2_lig +# 6) merged_df2_comp_lig +# 7) merged_df3_lig +# 8) merged_df3_comp_lig + +# 9) my_df_u +# 10) my_df_u_lig + +cat("Directories imported:" + , "\n====================" + , "\ndatadir:", datadir + , "\nindir:", indir + , "\noutdir:", outdir + , "\nplotdir:", plotdir) + +cat("Variables imported:" + , "\n=====================" + , "\ndrug:", drug + , "\ngene:", gene + , "\ngene_match:", gene_match + , "\nAngstrom symbol:", angstroms_symbol + , "\nNo. of duplicated muts:", dup_muts_nu + , "\nNA count for ORs:", na_count + , "\nNA count in df2:", na_count_df2 + , "\nNA count in df3:", na_count_df3 + , "\ndr_muts_col:", dr_muts_col + , "\nother_muts_col:", other_muts_col + , "\ndrtype_col:", resistance_col) + + +#=========== +# input +#=========== +# output of combining_dfs_plotting.R + +#======= +# output +#======= +# plot 1 +basic_bp_lineage = "basic_lineage_barplot.svg" +plot_basic_bp_lineage = paste0(plotdir,"/", basic_bp_lineage) + +#======================================================================= +#================ +# Data for plots: +# you need merged_df2, comprehensive one +# since this has one-many relationship +# i.e the same SNP can belong to multiple lineages +#================ +# REASSIGNMENT as necessary +my_df = merged_df2 + +# clear excess variable +rm(merged_df2_comp, merged_df3, merged_df3_comp) + +# quick checks +colnames(my_df) +str(my_df) + +# Ensure correct data type in columns to plot: need to be factor +is.factor(my_df$lineage) +my_df$lineage = as.factor(my_df$lineage) +is.factor(my_df$lineage) + +#========================== +# Plot: Lineage barplot +# x = lineage y = No. of samples +# col = Lineage +# fill = lineage +#============================ +table(my_df$lineage) +as.data.frame(table(my_df$lineage)) + +#============= +# Data for plots +#============= +# REASSIGNMENT +df <- my_df + +rm(my_df) + +# get freq count of positions so you can subset freq<1 +#setDT(df)[, lineage_count := .N, by = .(lineage)] + +#****************** +# generate plot: barplot of mutation by lineage +#****************** +sel_lineages = c("lineage1" + , "lineage2" + , "lineage3" + , "lineage4" + #, "lineage5" + #, "lineage6" + #, "lineage7" + ) + +df_lin = subset(df, subset = lineage %in% sel_lineages) + +# Create df with lineage inform & no. of unique mutations +# per lineage and total samples within lineage +# this is essentially barplot with two y axis + +bar = bar = as.data.frame(sel_lineages) #4, 1 +total_snps_u = NULL +total_samples = NULL + +for (i in sel_lineages){ + #print(i) + curr_total = length(unique(df$id)[df$lineage==i]) + total_samples = c(total_samples, curr_total) + print(total_samples) + + foo = df[df$lineage==i,] + print(paste0(i, "=======")) + print(length(unique(foo$mutationinformation))) + curr_count = length(unique(foo$mutationinformation)) + + total_snps_u = c(total_snps_u, curr_count) +} + +print(total_snps_u) +bar$num_snps_u = total_snps_u +bar$total_samples = total_samples +bar + +#***************** +# generate plot: lineage barplot with two y-axis +#https://stackoverflow.com/questions/13035295/overlay-bar-graphs-in-ggplot2 +#***************** + +y1 = bar$num_snps_u +y2 = bar$total_samples +x = sel_lineages + +to_plot = data.frame(x = x + , y1 = y1 + , y2 = y2) +to_plot + +# FIXME later: will be depricated! +melted = melt(to_plot, id = "x") +melted + + +svg(plot_basic_bp_lineage) + +my_ats = 20 # axis text size +my_als = 22 # axis label size + +g = ggplot(melted, aes(x = x + , y = value + , fill = variable)) + +printFile = g + geom_bar(stat = "identity" + , position = position_stack(reverse = TRUE) + , alpha=.75 + , colour='grey75') + + theme(axis.text.x = element_text(size = my_ats) + , axis.text.y = element_text(size = my_ats + #, angle = 30 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = my_als + , colour = 'black') + , axis.title.y = element_text(size = my_als + , colour = 'black') + , legend.position = "top" + , legend.text = element_text(size = my_als)) + + #geom_text() + + geom_label(aes(label = value) + , size = 5 + , hjust = 0.5 + , vjust = 0.5 + , colour = 'black' + , show.legend = FALSE + #, check_overlap = TRUE + , position = position_stack(reverse = T)) + + labs(title = '' + , x = '' + , y = "Number" + , fill = 'Variable' + , colour = 'black') + + scale_fill_manual(values = c('grey50', 'gray75') + , name='' + , labels=c('Mutations', 'Total Samples')) + + scale_x_discrete(breaks = c('lineage1', 'lineage2', 'lineage3', 'lineage4') + , labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')) + +print(printFile) +dev.off() diff --git a/scripts/plotting/redundant/other_plots_data_SAFEGUARD.R b/scripts/plotting/redundant/other_plots_data_SAFEGUARD.R new file mode 100644 index 0000000..df5c1e3 --- /dev/null +++ b/scripts/plotting/redundant/other_plots_data_SAFEGUARD.R @@ -0,0 +1,301 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: producing boxplots for dr and other muts + +######################################################### +#======================================================================= +# working dir and loading libraries +getwd() +setwd("~/git/LSHTM_analysis/scripts/plotting") +getwd() + +#source("Header_TT.R") +library(ggplot2) +library(data.table) +library(dplyr) +library(tidyverse) +source("combining_dfs_plotting.R") + +rm(merged_df2, merged_df2_comp, merged_df2_lig, merged_df2_comp_lig + , merged_df3_comp, merged_df3_comp_lig + , my_df_u, my_df_u_lig) + + +cols_to_select = c("mutation", "mutationinformation" + , "wild_type", "position", "mutant_type" + , "mutation_info") + +merged_df3_short = merged_df3[, cols_to_select] + +# write merged_df3 to generate structural figure +write.csv(merged_df3_short, "merged_df3_short.csv") + +#======================================================================== +#%%%%%%%%%%%%%%%%%%% +# REASSIGNMENT: PS +#%%%%%%%%%%%%%%%%%%%% +df_ps = merged_df3 + +#============================ +# adding foldx scaled values +# scale data b/w -1 and 1 +#============================ +n = which(colnames(df_ps) == "ddg"); n + +my_min = min(df_ps[,n]); my_min +my_max = max(df_ps[,n]); my_max + +df_ps$foldx_scaled = ifelse(df_ps[,n] < 0 + , df_ps[,n]/abs(my_min) + , df_ps[,n]/my_max) +# sanity check +my_min = min(df_ps$foldx_scaled); my_min +my_max = max(df_ps$foldx_scaled); my_max + +if (my_min == -1 && my_max == 1){ + cat("PASS: foldx ddg successfully scaled b/w -1 and 1" + , "\nProceeding with assigning foldx outcome category") +}else{ + cat("FAIL: could not scale foldx ddg values" + , "Aborting!") +} + +#================================ +# adding foldx outcome category +# ddg<0 = "Stabilising" (-ve) +#================================= + +c1 = table(df_ps$ddg < 0) +df_ps$foldx_outcome = ifelse(df_ps$ddg < 0, "Stabilising", "Destabilising") +c2 = table(df_ps$ddg < 0) + +if ( all(c1 == c2) ){ + cat("PASS: foldx outcome successfully created") +}else{ + cat("FAIL: foldx outcome could not be created. Aborting!") + exit() +} +#======================================================================= +# name tidying +df_ps$mutation_info = as.factor(df_ps$mutation_info) +df_ps$duet_outcome = as.factor(df_ps$duet_outcome) +df_ps$foldx_outcome = as.factor(df_ps$foldx_outcome) +df_ps$ligand_outcome = as.factor(df_ps$ligand_outcome) + +# check +table(df_ps$mutation_info) + + # further checks to make sure dr and other muts are indeed unique +dr_muts = df_ps[df_ps$mutation_info == dr_muts_col,] +dr_muts_names = unique(dr_muts$mutation) + +other_muts = df_ps[df_ps$mutation_info == other_muts_col,] +other_muts_names = unique(other_muts$mutation) + +if ( table(dr_muts_names%in%other_muts_names)[[1]] == length(dr_muts_names) && + table(other_muts_names%in%dr_muts_names)[[1]] == length(other_muts_names) ){ + cat("PASS: dr and other muts are indeed unique") +}else{ + cat("FAIL: dr adn others muts are NOT unique!") + quit() +} + + +#%%%%%%%%%%%%%%%%%%% +# REASSIGNMENT: LIG +#%%%%%%%%%%%%%%%%%%%% + +df_lig = merged_df3_lig + +# name tidying +df_lig$mutation_info = as.factor(df_lig$mutation_info) +df_lig$duet_outcome = as.factor(df_lig$duet_outcome) +#df_lig$ligand_outcome = as.factor(df_lig$ligand_outcome) + +# check +table(df_lig$mutation_info) + +#======================================================================== +#=========== +# Data: ps +#=========== +# keep similar dtypes cols together +cols_to_select_ps = c("mutationinformation", "mutation", "position", "mutation_info" + , "duet_outcome" + + , "duet_scaled" + , "ligand_distance" + , "asa" + , "rsa" + , "rd_values" + , "kd_values") + +df_wf_ps = df_ps[, cols_to_select_ps] + +pivot_cols_ps = cols_to_select_ps[1:5]; pivot_cols_ps + +expected_rows_lf_ps = nrow(df_wf_ps) * (length(df_wf_ps) - length(pivot_cols_ps)) +expected_rows_lf_ps + +# LF data: duet +df_lf_ps = gather(df_wf_ps, param_type, param_value, duet_scaled:kd_values, factor_key=TRUE) + +if (nrow(df_lf_ps) == expected_rows_lf_ps){ + cat("PASS: long format data created for duet") +}else{ + cat("FAIL: long format data could not be created for duet") + exit() +} + +str(df_wf_ps) +str(df_lf_ps) + +# assign pretty labels: param_type +levels(df_lf_ps$param_type); table(df_lf_ps$param_type) + +ligand_dist_colname = paste0("Distance to ligand (", angstroms_symbol, ")") +ligand_dist_colname + +duet_stability_name = paste0(delta_symbol, delta_symbol, "G") +duet_stability_name + +#levels(df_lf_ps$param_type)[levels(df_lf_ps$param_type)=="duet_scaled"] <- "Stability" +levels(df_lf_ps$param_type)[levels(df_lf_ps$param_type)=="duet_scaled"] <- duet_stability_name +#levels(df_lf_ps$param_type)[levels(df_lf_ps$param_type)=="ligand_distance"] <- "Ligand Distance" +levels(df_lf_ps$param_type)[levels(df_lf_ps$param_type)=="ligand_distance"] <- ligand_dist_colname +levels(df_lf_ps$param_type)[levels(df_lf_ps$param_type)=="asa"] <- "ASA" +levels(df_lf_ps$param_type)[levels(df_lf_ps$param_type)=="rsa"] <- "RSA" +levels(df_lf_ps$param_type)[levels(df_lf_ps$param_type)=="rd_values"] <- "RD" +levels(df_lf_ps$param_type)[levels(df_lf_ps$param_type)=="kd_values"] <- "KD" +# check +levels(df_lf_ps$param_type); table(df_lf_ps$param_type) + +# assign pretty labels: mutation_info +levels(df_lf_ps$mutation_info); table(df_lf_ps$mutation_info) +sum(table(df_lf_ps$mutation_info)) == nrow(df_lf_ps) + +levels(df_lf_ps$mutation_info)[levels(df_lf_ps$mutation_info)==dr_muts_col] <- "DM" +levels(df_lf_ps$mutation_info)[levels(df_lf_ps$mutation_info)==other_muts_col] <- "OM" +# check +levels(df_lf_ps$mutation_info); table(df_lf_ps$mutation_info) + +############################################################################ + +#=========== +# LF data: LIG +#=========== +# keep similar dtypes cols together +cols_to_select_lig = c("mutationinformation", "mutation", "position", "mutation_info" + , "ligand_outcome" + + , "affinity_scaled" + #, "ligand_distance" + , "asa" + , "rsa" + , "rd_values" + , "kd_values") + +df_wf_lig = df_lig[, cols_to_select_lig] + +pivot_cols_lig = cols_to_select_lig[1:5]; pivot_cols_lig + +expected_rows_lf_lig = nrow(df_wf_lig) * (length(df_wf_lig) - length(pivot_cols_lig)) +expected_rows_lf_lig + +# LF data: foldx +df_lf_lig = gather(df_wf_lig, param_type, param_value, affinity_scaled:kd_values, factor_key=TRUE) + +if (nrow(df_lf_lig) == expected_rows_lf_lig){ + cat("PASS: long format data created for foldx") +}else{ + cat("FAIL: long format data could not be created for foldx") + exit() +} + +# assign pretty labels: param_type +levels(df_lf_lig$param_type); table(df_lf_lig$param_type) + +levels(df_lf_lig$param_type)[levels(df_lf_lig$param_type)=="affinity_scaled"] <- "Ligand Affinity" +#levels(df_lf_lig$param_type)[levels(df_lf_lig$param_type)=="ligand_distance"] <- "Ligand Distance" +levels(df_lf_lig$param_type)[levels(df_lf_lig$param_type)=="asa"] <- "ASA" +levels(df_lf_lig$param_type)[levels(df_lf_lig$param_type)=="rsa"] <- "RSA" +levels(df_lf_lig$param_type)[levels(df_lf_lig$param_type)=="rd_values"] <- "RD" +levels(df_lf_lig$param_type)[levels(df_lf_lig$param_type)=="kd_values"] <- "KD" +#check +levels(df_lf_lig$param_type); table(df_lf_lig$param_type) + +# assign pretty labels: mutation_info +levels(df_lf_lig$mutation_info); table(df_lf_lig$mutation_info) +sum(table(df_lf_lig$mutation_info)) == nrow(df_lf_lig) + +levels(df_lf_lig$mutation_info)[levels(df_lf_lig$mutation_info)==dr_muts_col] <- "DM" +levels(df_lf_lig$mutation_info)[levels(df_lf_lig$mutation_info)==other_muts_col] <- "OM" +# check +levels(df_lf_lig$mutation_info); table(df_lf_lig$mutation_info) + +############################################################################# +#=========== +# Data: foldx +#=========== +# keep similar dtypes cols together +cols_to_select_foldx = c("mutationinformation", "mutation", "position", "mutation_info" + , "foldx_outcome" + + , "foldx_scaled") + #, "ligand_distance" + #, "asa" + #, "rsa" + #, "rd_values" + #, "kd_values") + + +df_wf_foldx = df_ps[, cols_to_select_foldx] + +pivot_cols_foldx = cols_to_select_foldx[1:5]; pivot_cols_foldx + +expected_rows_lf_foldx = nrow(df_wf_foldx) * (length(df_wf_foldx) - length(pivot_cols_foldx)) +expected_rows_lf_foldx + +# LF data: foldx +df_lf_foldx = gather(df_wf_foldx, param_type, param_value, foldx_scaled, factor_key=TRUE) + +if (nrow(df_lf_foldx) == expected_rows_lf_foldx){ + cat("PASS: long format data created for foldx") +}else{ + cat("FAIL: long format data could not be created for foldx") + exit() +} + +foldx_stability_name = paste0(delta_symbol, delta_symbol, "G") +foldx_stability_name + +# assign pretty labels: param type +levels(df_lf_foldx$param_type); table(df_lf_foldx$param_type) + +#levels(df_lf_foldx$param_type)[levels(df_lf_foldx$param_type)=="foldx_scaled"] <- "Stability" +levels(df_lf_foldx$param_type)[levels(df_lf_foldx$param_type)=="foldx_scaled"] <- foldx_stability_name +#levels(df_lf_foldx$param_type)[levels(df_lf_foldx$param_type)=="ligand_distance"] <- "Ligand Distance" +#levels(df_lf_foldx$param_type)[levels(df_lf_foldx$param_type)=="asa"] <- "ASA" +#levels(df_lf_foldx$param_type)[levels(df_lf_foldx$param_type)=="rsa"] <- "RSA" +#levels(df_lf_foldx$param_type)[levels(df_lf_foldx$param_type)=="rd_values"] <- "RD" +#levels(df_lf_foldx$param_type)[levels(df_lf_foldx$param_type)=="kd_values"] <- "KD" +# check +levels(df_lf_foldx$param_type); table(df_lf_foldx$param_type) + +# assign pretty labels: mutation_info +levels(df_lf_foldx$mutation_info); table(df_lf_foldx$mutation_info) +sum(table(df_lf_foldx$mutation_info)) == nrow(df_lf_foldx) + +levels(df_lf_foldx$mutation_info)[levels(df_lf_foldx$mutation_info)==dr_muts_col] <- "DM" +levels(df_lf_foldx$mutation_info)[levels(df_lf_foldx$mutation_info)==other_muts_col] <- "OM" +# check +levels(df_lf_foldx$mutation_info); table(df_lf_foldx$mutation_info) + +############################################################################ + +# clear excess variables +rm(cols_to_select_ps, cols_to_select_foldx, cols_to_select_lig + , pivot_cols_ps, pivot_cols_foldx, pivot_cols_lig + , expected_rows_lf_ps, expected_rows_lf_foldx, expected_rows_lf_lig + , my_max, my_min, na_count, na_count_df2, na_count_df3, dup_muts_nu + , c1, c2, n) From 686fd0cd808df34a3ac728f0bdbbd62828f51a46 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Tue, 7 Sep 2021 11:16:41 +0100 Subject: [PATCH 13/51] updated running_plotting_scripts.R --- scripts/plotting/barplots_subcolours.R | 2 +- scripts/plotting/basic_barplots_combined.R | 0 scripts/plotting/corr_adjusted_PS_LIG.R | 0 scripts/plotting/dirs.R | 0 scripts/plotting/dist_plots_check.R | 0 scripts/plotting/extreme_muts.R | 0 scripts/plotting/get_plotting_dfs.R | 8 +++--- scripts/plotting/get_plotting_dfs_with_lig.R | 0 scripts/plotting/ggcorr_all_PS_LIG.R | 0 scripts/plotting/hist_af_or_base.R | 0 scripts/plotting/hist_af_or_combined.R | 0 scripts/plotting/legend_adjustment.R | 0 .../lineage_basic_barplots_combined.R | 24 ++++++++++++++++- scripts/plotting/lineage_bp_data.R | 0 scripts/plotting/lineage_dist_combined_PS.R | 0 .../plotting/lineage_dist_dm_om_combined_PS.R | 0 scripts/plotting/opp_mcsm_muts.R | 0 scripts/plotting/or_plots_combined.R | 0 scripts/plotting/other_plots_combined.R | 0 scripts/plotting/other_plots_data.R | 0 scripts/plotting/output_tables.R | 0 scripts/plotting/ps_plots_combined.R | 0 scripts/plotting/resolving_ambiguous_muts.R | 0 scripts/plotting/running_plotting_scripts.txt | 26 +++++++++++++++---- 24 files changed, 49 insertions(+), 11 deletions(-) mode change 100644 => 100755 scripts/plotting/basic_barplots_combined.R mode change 100644 => 100755 scripts/plotting/corr_adjusted_PS_LIG.R mode change 100644 => 100755 scripts/plotting/dirs.R mode change 100644 => 100755 scripts/plotting/dist_plots_check.R mode change 100644 => 100755 scripts/plotting/extreme_muts.R mode change 100644 => 100755 scripts/plotting/get_plotting_dfs.R mode change 100644 => 100755 scripts/plotting/get_plotting_dfs_with_lig.R mode change 100644 => 100755 scripts/plotting/ggcorr_all_PS_LIG.R mode change 100644 => 100755 scripts/plotting/hist_af_or_base.R mode change 100644 => 100755 scripts/plotting/hist_af_or_combined.R mode change 100644 => 100755 scripts/plotting/legend_adjustment.R mode change 100644 => 100755 scripts/plotting/lineage_basic_barplots_combined.R mode change 100644 => 100755 scripts/plotting/lineage_bp_data.R mode change 100644 => 100755 scripts/plotting/lineage_dist_combined_PS.R mode change 100644 => 100755 scripts/plotting/lineage_dist_dm_om_combined_PS.R mode change 100644 => 100755 scripts/plotting/opp_mcsm_muts.R mode change 100644 => 100755 scripts/plotting/or_plots_combined.R mode change 100644 => 100755 scripts/plotting/other_plots_combined.R mode change 100644 => 100755 scripts/plotting/other_plots_data.R mode change 100644 => 100755 scripts/plotting/output_tables.R mode change 100644 => 100755 scripts/plotting/ps_plots_combined.R mode change 100644 => 100755 scripts/plotting/resolving_ambiguous_muts.R diff --git a/scripts/plotting/barplots_subcolours.R b/scripts/plotting/barplots_subcolours.R index 4e4806a..1f98bec 100755 --- a/scripts/plotting/barplots_subcolours.R +++ b/scripts/plotting/barplots_subcolours.R @@ -124,4 +124,4 @@ print(outPlot_bp_lig) dev.off() ######################################################################= # End of script -######################################################################= \ No newline at end of file +######################################################################= diff --git a/scripts/plotting/basic_barplots_combined.R b/scripts/plotting/basic_barplots_combined.R old mode 100644 new mode 100755 diff --git a/scripts/plotting/corr_adjusted_PS_LIG.R b/scripts/plotting/corr_adjusted_PS_LIG.R old mode 100644 new mode 100755 diff --git a/scripts/plotting/dirs.R b/scripts/plotting/dirs.R old mode 100644 new mode 100755 diff --git a/scripts/plotting/dist_plots_check.R b/scripts/plotting/dist_plots_check.R old mode 100644 new mode 100755 diff --git a/scripts/plotting/extreme_muts.R b/scripts/plotting/extreme_muts.R old mode 100644 new mode 100755 diff --git a/scripts/plotting/get_plotting_dfs.R b/scripts/plotting/get_plotting_dfs.R old mode 100644 new mode 100755 index 89b477c..5876d8d --- a/scripts/plotting/get_plotting_dfs.R +++ b/scripts/plotting/get_plotting_dfs.R @@ -39,8 +39,8 @@ import_dirs(drug, gene) #--------------------------- # call: plotting_data() #--------------------------- -if (!exists("infile_params") && exists("gene")){ -#if (!is.character(infile_params) && exists("gene")){ # when running as cmd +#if (!exists("infile_params") && exists("gene")){ +if (!is.character(infile_params) && exists("gene")){ # when running as cmd #in_filename_params = paste0(tolower(gene), "_all_params.csv") #for pncA in_filename_params = paste0(tolower(gene), "_comb_afor.csv") # part combined for gid infile_params = paste0(outdir, "/", in_filename_params) @@ -67,8 +67,8 @@ cat("\nLigand distance cut off, colname:", LigDist_colname #-------------------------------- # call: combining_dfs_plotting() #-------------------------------- -if (!exists("infile_metadata") && exists("gene")){ -#if (!is.character(infile_metadata) && exists("gene")){ # when running as cmd +#if (!exists("infile_metadata") && exists("gene")){ +if (!is.character(infile_metadata) && exists("gene")){ # when running as cmd in_filename_metadata = paste0(tolower(gene), "_metadata.csv") # part combined for gid infile_metadata = paste0(outdir, "/", in_filename_metadata) cat("\nInput file for gene metadata not specified, assuming filename: ", infile_metadata, "\n") diff --git a/scripts/plotting/get_plotting_dfs_with_lig.R b/scripts/plotting/get_plotting_dfs_with_lig.R old mode 100644 new mode 100755 diff --git a/scripts/plotting/ggcorr_all_PS_LIG.R b/scripts/plotting/ggcorr_all_PS_LIG.R old mode 100644 new mode 100755 diff --git a/scripts/plotting/hist_af_or_base.R b/scripts/plotting/hist_af_or_base.R old mode 100644 new mode 100755 diff --git a/scripts/plotting/hist_af_or_combined.R b/scripts/plotting/hist_af_or_combined.R old mode 100644 new mode 100755 diff --git a/scripts/plotting/legend_adjustment.R b/scripts/plotting/legend_adjustment.R old mode 100644 new mode 100755 diff --git a/scripts/plotting/lineage_basic_barplots_combined.R b/scripts/plotting/lineage_basic_barplots_combined.R old mode 100644 new mode 100755 index 4b63587..94bcd4a --- a/scripts/plotting/lineage_basic_barplots_combined.R +++ b/scripts/plotting/lineage_basic_barplots_combined.R @@ -3,9 +3,10 @@ getwd() setwd("~/git/LSHTM_analysis/scripts/plotting/") getwd() ######################################################### -# TASK: Basic lineage barplot showing numbers +# TASK: Basic lineage barplots showing numbers # Output: Basic barplot with lineage samples and mut count +# + SNP diversity ########################################################## # Installing and loading required packages @@ -16,6 +17,27 @@ source("../functions/bp_lineage.R") #=========== # input #=========== +#drug = 'streptomycin' +#gene = 'gid' + +spec = matrix(c( + "drug" , "d", 1, "character", + "gene" , "g", 1, "character", + "data_file1" , "fa", 2, "character", + "data_file2" , "fb", 2, "character" +), byrow = TRUE, ncol = 4) + +opt = getopt(spec) + +drug = opt$drug +gene = opt$gene +infile_params = opt$data_file1 +infile_metadata = opt$data_file2 + +if(is.null(drug)|is.null(gene)) { + stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)") +} + source ('get_plotting_dfs.R') cat("Directories imported:" diff --git a/scripts/plotting/lineage_bp_data.R b/scripts/plotting/lineage_bp_data.R old mode 100644 new mode 100755 diff --git a/scripts/plotting/lineage_dist_combined_PS.R b/scripts/plotting/lineage_dist_combined_PS.R old mode 100644 new mode 100755 diff --git a/scripts/plotting/lineage_dist_dm_om_combined_PS.R b/scripts/plotting/lineage_dist_dm_om_combined_PS.R old mode 100644 new mode 100755 diff --git a/scripts/plotting/opp_mcsm_muts.R b/scripts/plotting/opp_mcsm_muts.R old mode 100644 new mode 100755 diff --git a/scripts/plotting/or_plots_combined.R b/scripts/plotting/or_plots_combined.R old mode 100644 new mode 100755 diff --git a/scripts/plotting/other_plots_combined.R b/scripts/plotting/other_plots_combined.R old mode 100644 new mode 100755 diff --git a/scripts/plotting/other_plots_data.R b/scripts/plotting/other_plots_data.R old mode 100644 new mode 100755 diff --git a/scripts/plotting/output_tables.R b/scripts/plotting/output_tables.R old mode 100644 new mode 100755 diff --git a/scripts/plotting/ps_plots_combined.R b/scripts/plotting/ps_plots_combined.R old mode 100644 new mode 100755 diff --git a/scripts/plotting/resolving_ambiguous_muts.R b/scripts/plotting/resolving_ambiguous_muts.R old mode 100644 new mode 100755 diff --git a/scripts/plotting/running_plotting_scripts.txt b/scripts/plotting/running_plotting_scripts.txt index 332293e..74f67de 100644 --- a/scripts/plotting/running_plotting_scripts.txt +++ b/scripts/plotting/running_plotting_scripts.txt @@ -113,12 +113,28 @@ note: - fb flag has default if not supplied -#=================== -# Add LINEAGE ONE -#=================== -# Lineage_bp.R -creates Count and Diversity plot +#==================================== +# lineage_basic_barplots_combined.R +#==================================== +#----------------------------------------------------------------------- +./lineage_basic_barplots_combined.R-d streptomycin -g gid +#----------------------------------------------------------------------- +It replaces + ## lineage_basic_barplot.R +These have been moved to redundant/ + + +sources: + ## get_plotting_dfs.R + ## functions//bp_lineage.R" + + outputs: 1 svg in the plotdir + ## basic_lineage_barplots_combined.svg + +note: + - fa flag has default if not supplied + - fb flag has default if not supplied ######################################################################## # TODO From 2ee66c770bbf36cc571e5037895f4c8e09c74e58 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Tue, 7 Sep 2021 11:18:10 +0100 Subject: [PATCH 14/51] updated notes --- scripts/plotting/running_plotting_scripts.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/plotting/running_plotting_scripts.txt b/scripts/plotting/running_plotting_scripts.txt index 74f67de..369bc4b 100644 --- a/scripts/plotting/running_plotting_scripts.txt +++ b/scripts/plotting/running_plotting_scripts.txt @@ -120,7 +120,7 @@ note: ./lineage_basic_barplots_combined.R-d streptomycin -g gid #----------------------------------------------------------------------- -It replaces +It replaces (and has an added diversity plot) ## lineage_basic_barplot.R These have been moved to redundant/ From 03031d2eb6ddd2d27cc142375bc3be90d2f735da Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Thu, 9 Sep 2021 13:12:07 +0100 Subject: [PATCH 15/51] moved all test scripts for functions to tests/ --- .../functions/{ => tests}/test_aa_prop_bp.R | 0 .../functions/{ => tests}/test_af_or_calcs.R | 0 scripts/functions/{ => tests}/test_bp.R | 0 .../functions/{ => tests}/test_bp_lineage.R | 0 .../{ => tests}/test_combining_dfs_plotting.R | 0 scripts/functions/{ => tests}/test_lf_bp.R | 0 .../{ => tests}/test_lf_unpaired_stats.R | 0 scripts/functions/tests/test_lineage_dist.R | 32 ++ .../{ => tests}/test_plotting_data.R | 0 scripts/plotting/Header_TT.R | 40 +- scripts/plotting/get_plotting_dfs.R | 8 +- .../lineage_basic_barplots_combined.R | 39 +- scripts/plotting/lineage_bp_data.R | 129 +++--- scripts/plotting/lineage_dist_combined_PS.R | 303 -------------- .../plotting/lineage_dist_dm_om_combined_PS.R | 387 ------------------ 15 files changed, 162 insertions(+), 776 deletions(-) rename scripts/functions/{ => tests}/test_aa_prop_bp.R (100%) rename scripts/functions/{ => tests}/test_af_or_calcs.R (100%) rename scripts/functions/{ => tests}/test_bp.R (100%) rename scripts/functions/{ => tests}/test_bp_lineage.R (100%) rename scripts/functions/{ => tests}/test_combining_dfs_plotting.R (100%) rename scripts/functions/{ => tests}/test_lf_bp.R (100%) rename scripts/functions/{ => tests}/test_lf_unpaired_stats.R (100%) create mode 100644 scripts/functions/tests/test_lineage_dist.R rename scripts/functions/{ => tests}/test_plotting_data.R (100%) delete mode 100755 scripts/plotting/lineage_dist_combined_PS.R delete mode 100755 scripts/plotting/lineage_dist_dm_om_combined_PS.R diff --git a/scripts/functions/test_aa_prop_bp.R b/scripts/functions/tests/test_aa_prop_bp.R similarity index 100% rename from scripts/functions/test_aa_prop_bp.R rename to scripts/functions/tests/test_aa_prop_bp.R diff --git a/scripts/functions/test_af_or_calcs.R b/scripts/functions/tests/test_af_or_calcs.R similarity index 100% rename from scripts/functions/test_af_or_calcs.R rename to scripts/functions/tests/test_af_or_calcs.R diff --git a/scripts/functions/test_bp.R b/scripts/functions/tests/test_bp.R similarity index 100% rename from scripts/functions/test_bp.R rename to scripts/functions/tests/test_bp.R diff --git a/scripts/functions/test_bp_lineage.R b/scripts/functions/tests/test_bp_lineage.R similarity index 100% rename from scripts/functions/test_bp_lineage.R rename to scripts/functions/tests/test_bp_lineage.R diff --git a/scripts/functions/test_combining_dfs_plotting.R b/scripts/functions/tests/test_combining_dfs_plotting.R similarity index 100% rename from scripts/functions/test_combining_dfs_plotting.R rename to scripts/functions/tests/test_combining_dfs_plotting.R diff --git a/scripts/functions/test_lf_bp.R b/scripts/functions/tests/test_lf_bp.R similarity index 100% rename from scripts/functions/test_lf_bp.R rename to scripts/functions/tests/test_lf_bp.R diff --git a/scripts/functions/test_lf_unpaired_stats.R b/scripts/functions/tests/test_lf_unpaired_stats.R similarity index 100% rename from scripts/functions/test_lf_unpaired_stats.R rename to scripts/functions/tests/test_lf_unpaired_stats.R diff --git a/scripts/functions/tests/test_lineage_dist.R b/scripts/functions/tests/test_lineage_dist.R new file mode 100644 index 0000000..1f40d16 --- /dev/null +++ b/scripts/functions/tests/test_lineage_dist.R @@ -0,0 +1,32 @@ +############################### +# TEST function lineage_dist.R +# to plot lineage +# dist plots with or without facet +############################## +getwd() +setwd("~/git/LSHTM_analysis/scripts/plotting/") +getwd() + +source("Header_TT.R") + +source("get_plotting_dfs.R") + +cat("cols imported:" + , mcsm_red2, mcsm_red1, mcsm_mid, mcsm_blue1, mcsm_blue2) + + +############################################################# + +lineage_distP(lin_dist_plot + , with_facet = F + , leg_label = "Mutation Class" +) + +lineage_distP(lin_dist_plot + , with_facet = T + , facet_wrap_var = "mutation_info_labels" + , leg_label = "Mutation Class" + , leg_pos_wf = "none" + , leg_dir_wf = "horizontal" + +) \ No newline at end of file diff --git a/scripts/functions/test_plotting_data.R b/scripts/functions/tests/test_plotting_data.R similarity index 100% rename from scripts/functions/test_plotting_data.R rename to scripts/functions/tests/test_plotting_data.R diff --git a/scripts/plotting/Header_TT.R b/scripts/plotting/Header_TT.R index e4593d0..47599d3 100755 --- a/scripts/plotting/Header_TT.R +++ b/scripts/plotting/Header_TT.R @@ -1,8 +1,12 @@ ######################################################### -### A) Installing and loading required packages +# A) Installing and loading required packages +# B) My functions +######################################################### + ######################################################### #lib_loc = "/usr/local/lib/R/site-library") + require("getopt", quietly = TRUE) # cmd parse arguments if (!require("tidyverse")) { @@ -10,6 +14,21 @@ if (!require("tidyverse")) { library(tidyverse) } +if (!require("shiny")) { + install.packages("shiny", dependencies = TRUE) + library(shiny) +} + +if (!require("gridExtra")) { + install.packages("gridExtra", dependencies = TRUE) + library(gridExtra) +} + +if (!require("ggridges")) { + install.packages("ggridges", dependencies = TRUE) + library(ggridges) +} + # if (!require("ggplot2")) { # install.packages("ggplot2", dependencies = TRUE) # library(ggplot2) @@ -20,6 +39,11 @@ if (!require("tidyverse")) { # library(dplyr) # } +if (!require ("plyr")){ + install.packages("plyr") + library(plyr) + } + # Install #if(!require(devtools)) install.packages("devtools") #devtools::install_github("kassambara/ggcorrplot") @@ -140,4 +164,16 @@ if(!require(protr)){ # install.packages("BiocManager") #BiocManager::install("Logolas") -library("Logolas") \ No newline at end of file +library("Logolas") + + +#################################### +# Load all my functions: +# only works if tidyverse is loaded +# hence included it here! +#################################### + +func_path = "~/git/LSHTM_analysis/scripts/functions/" +source_files <- list.files(func_path, "\\.R$") # locate all .R files +map(paste0(func_path, source_files), source) # source all your R scripts! + diff --git a/scripts/plotting/get_plotting_dfs.R b/scripts/plotting/get_plotting_dfs.R index 5876d8d..89b477c 100755 --- a/scripts/plotting/get_plotting_dfs.R +++ b/scripts/plotting/get_plotting_dfs.R @@ -39,8 +39,8 @@ import_dirs(drug, gene) #--------------------------- # call: plotting_data() #--------------------------- -#if (!exists("infile_params") && exists("gene")){ -if (!is.character(infile_params) && exists("gene")){ # when running as cmd +if (!exists("infile_params") && exists("gene")){ +#if (!is.character(infile_params) && exists("gene")){ # when running as cmd #in_filename_params = paste0(tolower(gene), "_all_params.csv") #for pncA in_filename_params = paste0(tolower(gene), "_comb_afor.csv") # part combined for gid infile_params = paste0(outdir, "/", in_filename_params) @@ -67,8 +67,8 @@ cat("\nLigand distance cut off, colname:", LigDist_colname #-------------------------------- # call: combining_dfs_plotting() #-------------------------------- -#if (!exists("infile_metadata") && exists("gene")){ -if (!is.character(infile_metadata) && exists("gene")){ # when running as cmd +if (!exists("infile_metadata") && exists("gene")){ +#if (!is.character(infile_metadata) && exists("gene")){ # when running as cmd in_filename_metadata = paste0(tolower(gene), "_metadata.csv") # part combined for gid infile_metadata = paste0(outdir, "/", in_filename_metadata) cat("\nInput file for gene metadata not specified, assuming filename: ", infile_metadata, "\n") diff --git a/scripts/plotting/lineage_basic_barplots_combined.R b/scripts/plotting/lineage_basic_barplots_combined.R index 94bcd4a..b6f25e6 100755 --- a/scripts/plotting/lineage_basic_barplots_combined.R +++ b/scripts/plotting/lineage_basic_barplots_combined.R @@ -12,7 +12,6 @@ getwd() # Installing and loading required packages ########################################################## source("Header_TT.R") -source("../functions/bp_lineage.R") #=========== # input @@ -40,24 +39,6 @@ if(is.null(drug)|is.null(gene)) { source ('get_plotting_dfs.R') -cat("Directories imported:" - , "\n====================" - , "\ndatadir:", datadir - , "\nindir:", indir - , "\noutdir:", outdir - , "\nplotdir:", plotdir) - -cat("Variables imported:" - , "\n=====================" - , "\ndrug:", drug - , "\ngene:", gene - , "\ngene_match:", gene_match - , "\nAngstrom symbol:", angstroms_symbol - #, "\nNo. of duplicated muts:", dup_muts_nu - , "\ndr_muts_col:", dr_muts_col - , "\nother_muts_col:", other_muts_col - , "\ndrtype_col:", resistance_col) - #======= # output #======= @@ -74,21 +55,21 @@ plot_basic_bp_lineage_cl # Data: All lineages or # selected few #------------------------ -sel_lineages = levels(lin_lf$sel_lineages_f)[1:4] +sel_lineages = levels(lin_lf$sel_lineages)[1:4] sel_lineages -lin_lf_plot = lin_lf[lin_lf$sel_lineages_f%in%sel_lineages,] +lin_lf_plot = lin_lf[lin_lf$sel_lineages%in%sel_lineages,] str(lin_lf_plot) # drop unused factor levels -lin_lf_plot$sel_lineages_f = factor(lin_lf_plot$sel_lineages_f) -levels(lin_lf_plot$sel_lineages_f) +lin_lf_plot$sel_lineages = factor(lin_lf_plot$sel_lineages) +levels(lin_lf_plot$sel_lineages) str(lin_lf_plot) #------------------------ # plot from my function: #------------------------ lin_countP = lin_count_bp(lin_lf_plot - , x_categ = "sel_lineages_f" + , x_categ = "sel_lineages" , y_count = "p_count" , bar_fill_categ = "count_categ" , display_label_col = "p_count" @@ -109,21 +90,21 @@ lin_countP # Data: All lineages or # selected few #------------------------ -sel_lineages = levels(lin_wf$sel_lineages_f)[1:4] +sel_lineages = levels(lin_wf$sel_lineages)[1:4] sel_lineages -lin_wf_plot = lin_wf[lin_wf$sel_lineages_f%in%sel_lineages,] +lin_wf_plot = lin_wf[lin_wf$sel_lineages%in%sel_lineages,] str(lin_wf_plot) # drop unused factor levels -lin_wf_plot$sel_lineages_f = factor(lin_wf_plot$sel_lineages_f) -levels(lin_wf_plot$sel_lineages_f) +lin_wf_plot$sel_lineages = factor(lin_wf_plot$sel_lineages) +levels(lin_wf_plot$sel_lineages) str(lin_wf_plot) #------------------------ # plot from my function: #------------------------ lin_diversityP = lin_count_bp(lin_wf_plot - , x_categ = "sel_lineages_f" + , x_categ = "sel_lineages" , y_count = "snp_diversity" , display_label_col = "snp_diversity_f" , bar_stat_stype = "identity" diff --git a/scripts/plotting/lineage_bp_data.R b/scripts/plotting/lineage_bp_data.R index 2d51ab0..e9ab929 100755 --- a/scripts/plotting/lineage_bp_data.R +++ b/scripts/plotting/lineage_bp_data.R @@ -27,13 +27,44 @@ cat("\nMissing samples with lineage classification:", table(merged_df2$lineage = } +# Add pretty lineage labels and mut_info_labels +class(merged_df2$lineage); table(merged_df2$lineage) +merged_df2$lineage_labels = gsub("lineage", "L", merged_df2$lineage) +table(merged_df2$lineage_labels) + +class(merged_df2$lineage_labels) + +merged_df2$lineage_labels = factor(merged_df2$lineage_labels, c("L1" + , "L2" + , "L3" + , "L4" + , "L5" + , "L6" + , "L7" + , "LBOV" + , "L1;L2" + , "L1;L3" + , "L1;L4" + , "L2;L3" + , "L2;L3;L4" + , "L2;L4" + , "L2;L6" + , "L2;LBOV" + , "L3;L4" + , "L4;L6" + , "L4;L7" + , "")) + +class(merged_df2$lineage_labels); nlevels(merged_df2$lineage_labels) + + ################################## # WF data: lineages with # snp count # total_samples # snp diversity (perc) ################################## -sel_lineages = levels(as.factor(merged_df2$lineage)) +sel_lineages = levels(merged_df2$lineage_labels) lin_wf = data.frame(sel_lineages) #4, 1 total_snps_u = NULL @@ -41,12 +72,12 @@ total_samples = NULL for (i in sel_lineages){ #print(i) - curr_total = length(unique(merged_df2$id)[merged_df2$lineage==i]) + curr_total = length(unique(merged_df2$id)[merged_df2$lineage_labels==i]) #print(curr_total) total_samples = c(total_samples, curr_total) print(total_samples) - foo = merged_df2[merged_df2$lineage==i,] + foo = merged_df2[merged_df2$lineage_labels==i,] print(paste0(i, "=======\n")) print(length(unique(foo$mutationinformation))) curr_count = length(unique(foo$mutationinformation)) @@ -70,33 +101,29 @@ lin_wf lin_wf$snp_diversity_f = round( (lin_wf$snp_diversity * 100), digits = 0) lin_wf$snp_diversity_f = paste0(lin_wf$snp_diversity_f, "%") -# Lineage names -lin_wf$sel_lineages_f = gsub("lineage", "L", lin_wf$sel_lineages) -lin_wf +# Important: Check factors so that x-axis categ appear as you want +lin_wf$sel_lineages = factor(lin_wf$sel_lineages, c("L1" + , "L2" + , "L3" + , "L4" + , "L5" + , "L6" + , "L7" + , "LBOV" + , "L1;L2" + , "L1;L3" + , "L1;L4" + , "L2;L3" + , "L2;L3;L4" + , "L2;L4" + , "L2;L6" + , "L2;LBOV" + , "L3;L4" + , "L4;L6" + , "L4;L7" + , "")) -# Important: Relevel factors so that x-axis categ appear as you want -lin_wf$sel_lineages_f = factor(lin_wf$sel_lineages_f, c("L1" - , "L2" - , "L3" - , "L4" - , "L5" - , "L6" - , "L7" - , "LBOV" - , "L1;L2" - , "L1;L3" - , "L1;L4" - , "L2;L3" - , "L2;L3;L4" - , "L2;L4" - , "L2;L6" - , "L2;LBOV" - , "L3;L4" - , "L4;L6" - , "L4;L7" - , "")) - -levels(lin_wf$sel_lineages_f) +levels(lin_wf$sel_lineages) ################################## # LF data: lineages with @@ -106,7 +133,7 @@ levels(lin_wf$sel_lineages_f) ################################## names(lin_wf) tot_cols = ncol(lin_wf) -pivot_cols = c("sel_lineages", "sel_lineages_f", "snp_diversity", "snp_diversity_f") +pivot_cols = c("sel_lineages", "snp_diversity", "snp_diversity_f") pivot_cols_n = length(pivot_cols) expected_rows = nrow(lin_wf) * ( length(lin_wf) - pivot_cols_n ) @@ -129,25 +156,25 @@ if ( nrow(lin_lf) == expected_rows ){ } # Important: Relevel factors so that x-axis categ appear as you want -lin_lf$sel_lineages_f = factor(lin_lf$sel_lineages_f, c("L1" - , "L2" - , "L3" - , "L4" - , "L5" - , "L6" - , "L7" - , "LBOV" - , "L1;L2" - , "L1;L3" - , "L1;L4" - , "L2;L3" - , "L2;L3;L4" - , "L2;L4" - , "L2;L6" - , "L2;LBOV" - , "L3;L4" - , "L4;L6" - , "L4;L7" - , "")) +lin_lf$sel_lineages = factor(lin_lf$sel_lineages, c("L1" + , "L2" + , "L3" + , "L4" + , "L5" + , "L6" + , "L7" + , "LBOV" + , "L1;L2" + , "L1;L3" + , "L1;L4" + , "L2;L3" + , "L2;L3;L4" + , "L2;L4" + , "L2;L6" + , "L2;LBOV" + , "L3;L4" + , "L4;L6" + , "L4;L7" + , "")) -levels(lin_lf$sel_lineages_f) +levels(lin_lf$sel_lineages) diff --git a/scripts/plotting/lineage_dist_combined_PS.R b/scripts/plotting/lineage_dist_combined_PS.R deleted file mode 100755 index bf1c75b..0000000 --- a/scripts/plotting/lineage_dist_combined_PS.R +++ /dev/null @@ -1,303 +0,0 @@ -#!/usr/bin/env Rscript -######################################################### -# TASK: Lineage dist plots: ggridges - -# Output: 2 SVGs for PS stability - -# 1) all muts -# 2) dr_muts - -########################################################## -# Installing and loading required packages -########################################################## -getwd() -setwd("~/git/LSHTM_analysis/scripts/plotting/") -getwd() - -source("Header_TT.R") -library(ggridges) -source("combining_dfs_plotting.R") -# PS combined: -# 1) merged_df2 -# 2) merged_df2_comp -# 3) merged_df3 -# 4) merged_df3_comp - -# LIG combined: -# 5) merged_df2_lig -# 6) merged_df2_comp_lig -# 7) merged_df3_lig -# 8) merged_df3_comp_lig - -# 9) my_df_u -# 10) my_df_u_lig - -cat("Directories imported:" - , "\n====================" - , "\ndatadir:", datadir - , "\nindir:", indir - , "\noutdir:", outdir - , "\nplotdir:", plotdir) - -cat("Variables imported:" - , "\n=====================" - , "\ndrug:", drug - , "\ngene:", gene - , "\ngene_match:", gene_match - , "\nAngstrom symbol:", angstroms_symbol - , "\nNo. of duplicated muts:", dup_muts_nu - , "\nNA count for ORs:", na_count - , "\nNA count in df2:", na_count_df2 - , "\nNA count in df3:", na_count_df3 - , "\ndr_muts_col:", dr_muts_col - , "\nother_muts_col:", other_muts_col - , "\ndrtype_col:", resistance_col) - -#======= -# output -#======= -lineage_dist_combined = "lineage_dist_combined_PS.svg" -plot_lineage_dist_combined = paste0(plotdir,"/", lineage_dist_combined) -#======================================================================== - -########################### -# Data for plots -# you need merged_df2 or merged_df2_comp -# since this is one-many relationship -# i.e the same SNP can belong to multiple lineages -# using the _comp dataset means -# we lose some muts and at this level, we should use -# as much info as available, hence use df with NA -########################### -# REASSIGNMENT -my_df = merged_df2 - -# delete variables not required -rm(my_df_u, merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -# Ensure correct data type in columns to plot: need to be factor -is.factor(my_df$lineage) -my_df$lineage = as.factor(my_df$lineage) -is.factor(my_df$lineage) - -table(my_df$mutation_info) - -# subset df with dr muts only -my_df_dr = subset(my_df, mutation_info == "dr_mutations_pyrazinamide") -table(my_df_dr$mutation_info) - -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -#========================== -# Plot 1: ALL Muts -# x = mcsm_values, y = dist -# fill = stability -#============================ - -my_plot_name = 'lineage_dist_PS.svg' - -plot_lineage_duet = paste0(plotdir,"/", my_plot_name) - -#=================== -# Data for plots -#=================== -table(my_df$lineage); str(my_df$lineage) - -# subset only lineages1-4 -sel_lineages = c("lineage1" - , "lineage2" - , "lineage3" - , "lineage4" - #, "lineage5" - #, "lineage6" - #, "lineage7" - ) - -# uncomment as necessary -df_lin = subset(my_df, subset = lineage %in% sel_lineages ) -table(df_lin$lineage) - -# refactor -df_lin$lineage = factor(df_lin$lineage) - -sum(table(df_lin$lineage)) #{RESULT: Total number of samples for lineage} - -table(df_lin$lineage)#{RESULT: No of samples within lineage} - -length(unique(df_lin$mutationinformation))#{Result: No. of unique mutations the 4 lineages contribute to} - -length(df_lin$mutationinformation) - -u2 = unique(my_df$mutationinformation) -u = unique(df_lin$mutationinformation) -check = u2[!u2%in%u]; print(check) #{Muts not present within selected lineages} - -#%%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -df <- df_lin -#%%%%%%%%%%%%%%%%%%%%%%%%% - -rm(df_lin) - -#****************** -# generate distribution plot of lineages -#****************** -# 2 : ggridges (good!) -my_ats = 15 # axis text size -my_als = 20 # axis label size - -my_labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4' - #, 'Lineage 5', 'Lineage 6', 'Lineage 7' - ) -names(my_labels) = c('lineage1', 'lineage2', 'lineage3', 'lineage4' - # , 'lineage5', 'lineage6', 'lineage7' - ) -# check plot name -plot_lineage_duet - -# output svg -#svg(plot_lineage_duet) -p1 = ggplot(df, aes(x = duet_scaled - , y = duet_outcome))+ - - #printFile=geom_density_ridges_gradient( - geom_density_ridges_gradient(aes(fill = ..x..) - #, jittered_points = TRUE - , scale = 3 - , size = 0.3 ) + - facet_wrap( ~lineage - , scales = "free" - #, switch = 'x' - , labeller = labeller(lineage = my_labels) ) + - coord_cartesian( xlim = c(-1, 1)) + - scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4") - , name = "DUET" ) + - theme(axis.text.x = element_text(size = my_ats - , angle = 90 - , hjust = 1 - , vjust = 0.4) - - , axis.text.y = element_blank() - , axis.title.x = element_blank() - , axis.title.y = element_blank() - , axis.ticks.y = element_blank() - , plot.title = element_blank() - , strip.text = element_text(size = my_als) - , legend.text = element_text(size = my_als-5) - , legend.title = element_text(size = my_als) -) - -print(p1) -#dev.off() - -####################################################################### -# lineage distribution plot for dr_muts -####################################################################### - -#========================== -# Plot 2: dr muts ONLY -# x = mcsm_values, y = dist -# fill = stability -#============================ - -my_plot_name_dr = 'lineage_dist_dr_muts_PS.svg' - -plot_lineage_dr_duet = paste0(plotdir,"/", my_plot_name_dr) - -#=================== -# Data for plots -#=================== -table(my_df_dr$lineage); str(my_df_dr$lineage) - -# uncomment as necessary -df_lin_dr = subset(my_df_dr, subset = lineage %in% sel_lineages) -table(df_lin_dr$lineage) - -# refactor -df_lin_dr$lineage = factor(df_lin_dr$lineage) - -sum(table(df_lin_dr$lineage)) #{RESULT: Total number of samples for lineage} - -table(df_lin_dr$lineage)#{RESULT: No of samples within lineage} - -length(unique(df_lin_dr$mutationinformation))#{Result: No. of unique mutations the 4 lineages contribute to} - -length(df_lin_dr$mutationinformation) - -u2 = unique(my_df_dr$mutationinformation) -u = unique(df_lin_dr$mutationinformation) -check = u2[!u2%in%u]; print(check) #{Muts not present within selected lineages} - -#%%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -df_dr <- df_lin_dr -#%%%%%%%%%%%%%%%%%%%%%%%%% - -rm(df_lin_dr) - -#****************** -# generate distribution plot of lineages -#****************** -# 2 : ggridges (good!) -my_ats = 15 # axis text size -my_als = 20 # axis label size - - -# check plot name -plot_lineage_dr_duet - -# output svg -#svg(plot_lineage_dr_duet) -p2 = ggplot(df_dr, aes(x = duet_scaled - , y = duet_outcome))+ - - geom_density_ridges_gradient(aes(fill = ..x..) - #, jittered_points = TRUE - , scale = 3 - , size = 0.3) + - #geom_point(aes(size = or_mychisq))+ - facet_wrap( ~lineage - , scales = "free" - #, switch = 'x' - , labeller = labeller(lineage = my_labels) ) + - coord_cartesian( xlim = c(-1, 1) - #, ylim = c(0, 6) - #, clip = "off" - ) + - scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4") - , name = "DUET" ) + - theme(axis.text.x = element_text(size = my_ats - , angle = 90 - , hjust = 1 - , vjust = 0.4) - , axis.text.y = element_blank() - , axis.title.x = element_blank() - , axis.title.y = element_blank() - , axis.ticks.y = element_blank() - , plot.title = element_blank() - , strip.text = element_text(size = my_als) - , legend.text = element_text(size = 10) - , legend.title = element_text(size = my_als) - #, legend.position = "none" - ) - -print(p2) -#dev.off() -######################################################################## -#============== -# combine plot -#=============== - -svg(plot_lineage_dist_combined, width = 12, height = 6) - -printFile = cowplot::plot_grid(p1, p2 - , label_size = my_als+10) - -print(printFile) -dev.off() diff --git a/scripts/plotting/lineage_dist_dm_om_combined_PS.R b/scripts/plotting/lineage_dist_dm_om_combined_PS.R deleted file mode 100755 index 07912ac..0000000 --- a/scripts/plotting/lineage_dist_dm_om_combined_PS.R +++ /dev/null @@ -1,387 +0,0 @@ -#!/usr/bin/env Rscript -######################################################### -# TASK: Lineage dist plots: ggridges - -# Output: 2 SVGs for PS stability - -# 1) all muts -# 2) dr_muts - -########################################################## -# Installing and loading required packages -########################################################## -getwd() -setwd("~/git/LSHTM_analysis/scripts/plotting/") -getwd() - -source("Header_TT.R") -library(ggridges) -library(plyr) -source("combining_dfs_plotting.R") -# PS combined: -# 1) merged_df2 -# 2) merged_df2_comp -# 3) merged_df3 -# 4) merged_df3_comp - -# LIG combined: -# 5) merged_df2_lig -# 6) merged_df2_comp_lig -# 7) merged_df3_lig -# 8) merged_df3_comp_lig - -# 9) my_df_u -# 10) my_df_u_lig - -cat("Directories imported:" - , "\n====================" - , "\ndatadir:", datadir - , "\nindir:", indir - , "\noutdir:", outdir - , "\nplotdir:", plotdir) - -cat("Variables imported:" - , "\n=====================" - , "\ndrug:", drug - , "\ngene:", gene - , "\ngene_match:", gene_match - , "\nAngstrom symbol:", angstroms_symbol - , "\nNo. of duplicated muts:", dup_muts_nu - , "\nNA count for ORs:", na_count - , "\nNA count in df2:", na_count_df2 - , "\nNA count in df3:", na_count_df3 - , "\ndr_muts_col:", dr_muts_col - , "\nother_muts_col:", other_muts_col - , "\ndrtype_col:", resistance_col) - -cat("cols imported:" - , mcsm_red2, mcsm_red1, mcsm_mid, mcsm_blue1, mcsm_blue2) - -#======= -# output -#======= -lineage_dist_combined_dm_om = "lineage_dist_combined_dm_om_PS.svg" -plot_lineage_dist_combined_dm_om = paste0(plotdir,"/", lineage_dist_combined_dm_om) - -lineage_dist_combined_dm_om_L = "lineage_dist_combined_dm_om_PS_labelled.svg" -plot_lineage_dist_combined_dm_om_L = paste0(plotdir,"/", lineage_dist_combined_dm_om_L) - -#======================================================================== - -########################### -# Data for plots -# you need merged_df2 or merged_df2_comp -# since this is one-many relationship -# i.e the same SNP can belong to multiple lineages -# using the _comp dataset means -# we lose some muts and at this level, we should use -# as much info as available, hence use df with NA -########################### -# REASSIGNMENT -my_df = merged_df2 - -# delete variables not required -rm(my_df_u, merged_df2, merged_df2_comp, merged_df3, merged_df3_comp - , merged_df2_lig, merged_df2_comp_lig, merged_df3_lig, merged_df3_comp_lig) - -# quick checks -colnames(my_df) -str(my_df) - -table(my_df$mutation_info) - -#=================== -# Data for plots -#=================== -table(my_df$lineage); str(my_df$lineage) - -# select lineages 1-4 -sel_lineages = c("lineage1" - , "lineage2" - , "lineage3" - , "lineage4") - #, "lineage5" - #, "lineage6" - #, "lineage7") - -# works nicely with facet wrap using labeller, but not otherwise -#my_labels = c('Lineage 1' -# , 'Lineage 2' -# , 'Lineage 3' -# , 'Lineage 4') -# #, 'Lineage 5' -# #, 'Lineage 6' -# #, 'Lineage 7') - -#names(my_labels) = c('lineage1' -# , 'lineage2' -# , 'lineage3' -# , 'lineage4') -# #, 'lineage5' -# #, 'lineage6' -# #, 'lineage7') - -#========================== -# subset selected lineages -#========================== -df_lin = subset(my_df, subset = lineage %in% sel_lineages) -table(df_lin$lineage) - -#{RESULT: Total number of samples for lineage} -sum(table(df_lin$lineage)) - -#{RESULT: No of samples within lineage} -table(df_lin$lineage) - -#{Result: No. of unique mutations the 4 lineages contribute to} -length(unique(df_lin$mutationinformation)) - -u2 = unique(my_df$mutationinformation) -u = unique(df_lin$mutationinformation) - -#{Result:Muts not present within selected lineages} -check = u2[!u2%in%u]; print(check) - -# workaround to make labels appear nicely for in otherwise cases -#================== -# lineage: labels -# from "plyr" -#================== -#{Result:No of samples in selected lineages} -table(df_lin$lineage) - -df_lin$lineage_labels = mapvalues(df_lin$lineage - , from = c("lineage1","lineage2", "lineage3", "lineage4") - , to = c("Lineage 1", "Lineage 2", "Lineage 3", "Lineage 4")) -table(df_lin$lineage_labels) - -table(df_lin$lineage_labels) == table(df_lin$lineage) - -#======================== -# mutation_info: labels -#======================== -#{Result:No of DM and OM muts in selected lineages} -table(df_lin$mutation_info) - -df_lin$mutation_info_labels = ifelse(df_lin$mutation_info == dr_muts_col, "DM", "OM") -table(df_lin$mutation_info_labels) - -table(df_lin$mutation_info) == table(df_lin$mutation_info_labels) - - -#======================== -# duet_outcome: labels -#======================== -#{Result: No. of D and S mutations in selected lineages} -table(df_lin$duet_outcome) - -df_lin$duet_outcome_labels = ifelse(df_lin$duet_outcome == "Destabilising", "D", "S") -table(df_lin$duet_outcome_labels) - -table(df_lin$duet_outcome) == table(df_lin$duet_outcome_labels) - - -#======================= -# subset dr muts only -#======================= -#my_df_dr = subset(df_lin, mutation_info == dr_muts_col) -#table(my_df_dr$mutation_info) -#table(my_df_dr$lineage) - -#========================= -# subset other muts only -#========================= -#my_df_other = subset(df_lin, mutation_info == other_muts_col) -#table(my_df_other$mutation_info) -#table(my_df_other$lineage) - -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -#========================== -# Distribution plots -#============================ - -#%%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -df <- df_lin -#%%%%%%%%%%%%%%%%%%%%%%%%% - -rm(df_lin) - -#****************** -# generate distribution plot of lineages -#****************** -# 2 : ggridges (good!) -my_ats = 15 # axis text size -my_als = 20 # axis label size -n_colours = length(unique(df$duet_scaled)) -my_palette <- colorRampPalette(c(mcsm_red2, mcsm_red1, mcsm_mid, mcsm_blue1, mcsm_blue2))(n = n_colours+1) - -#======================================= -# Plot 1: lineage dist: geom_density_ridges_gradient (allows aesthetics to vary along ridgeline, no alpha setting!) -# else same as geom_density_ridges) -# x = duet_scaled -# y = duet_outcome -# fill = duet_scaled -# Facet: Lineage -#======================================= -# output individual svg -#plot_lineage_dist_duet_f paste0(plotdir,"/", "lineage_dist_duet_f.svg") -#plot_lineage_dist_duet_f -#svg(plot_lineage_dist_duet_f) - -p1 = ggplot(df, aes(x = duet_scaled - , y = duet_outcome))+ - geom_density_ridges_gradient(aes(fill = ..x..) - #, jittered_points = TRUE - , scale = 3 - , size = 0.3 ) + - facet_wrap( ~lineage_labels - # , scales = "free" - # , labeller = labeller(lineage = my_labels) - ) + - coord_cartesian( xlim = c(-1, 1)) + - scale_fill_gradientn(colours = my_palette - , name = "DUET" - #, breaks = c(-1, 0, 1) - #, labels = c(-1,0,1) - #, limits = c(-1,1) - ) + - theme(axis.text.x = element_text(size = my_ats - , angle = 90 - , hjust = 1 - , vjust = 0.4) - #, axis.text.y = element_blank() - , axis.text.y = element_text(size = my_ats) - , axis.title.x = element_text(size = my_ats) - , axis.title.y = element_blank() - , axis.ticks.y = element_blank() - , plot.title = element_blank() - , strip.text = element_text(size = my_als) - , legend.text = element_text(size = my_als-10) - #, legend.title = element_text(size = my_als-6) - , legend.title = element_blank() - , legend.position = c(-0.08, 0.41) - #, legend.direction = "horizontal" - #, legend.position = "left" -)+ - labs(x = "DUET") - -p1 - - -#p1_with_legend = p1 + guides(fill = guide_colourbar(label = FALSE)) - -#======================================= -# Plot 2: lineage dist: geom_density_ridges, allows alpha to be set -# x = duet_scaled -# y = lineage_labels -# fill = mutation_info -# NO FACET -#======================================= -# output svg -#plot_lineage_dist_duet_dm_om = paste0(plotdir,"/", "lineage_dist_duet_dm_om.svg") -#plot_lineage_dist_duet_dm_om -#svg(plot_lineage_dist_duet_dm_om) - -p2 = ggplot(df, aes(x = duet_scaled - , y = lineage_labels))+ - geom_density_ridges(aes(fill = factor(mutation_info_labels)) - , scale = 3 - , size = 0.3 - , alpha = 0.8) + - coord_cartesian( xlim = c(-1, 1)) + - scale_fill_manual(values = c("#E69F00", "#999999")) + - theme(axis.text.x = element_text(size = my_ats - , angle = 90 - , hjust = 1 - , vjust = 0.4) - , axis.text.y = element_text(size = my_ats) - , axis.title.x = element_text(size = my_ats) - , axis.title.y = element_blank() - , axis.ticks.y = element_blank() - , plot.title = element_blank() - , strip.text = element_text(size = my_als) - , legend.text = element_text(size = my_als-4) - , legend.title = element_text(size = my_als-4) - , legend.position = c(0.8, 0.9)) + - labs(x = "DUET" - , fill = "Mutation class") # legend title - -p2 - -#======================================= -# Plot 3: lineage dist: geom_density_ridges_gradient (allows aesthetics to vary along ridgeline, no alpha setting!) -# else same as geom_density_ridges) -# x = duet_scaled -# y = lineage_labels -# fill = duet_scaled -# NO FACET (nf) -#======================================= -# output individual svg -#plot_lineage_dist_duet_nf = paste0(plotdir,"/", "lineage_dist_duet_nf.svg") -#plot_lineage_dist_duet_nf -#svg(plot_lineage_dist_duet_nf) - -p3 = ggplot(df, aes(x = duet_scaled - , y = lineage_labels))+ - geom_density_ridges_gradient(aes(fill = ..x..) - #, jittered_points = TRUE - , scale = 3 - , size = 0.3 ) + - coord_cartesian( xlim = c(-1, 1)) + - scale_fill_gradientn(colours = my_palette, name = "DUET") + - theme(axis.text.x = element_text(size = my_ats - , angle = 90 - , hjust = 1 - , vjust = 0.4) - - , axis.text.y = element_text(size = my_ats) - , axis.title.x = element_text(size = my_ats) - , axis.title.y = element_blank() - , axis.ticks.y = element_blank() - , plot.title = element_blank() - , strip.text = element_text(size = my_als) - , legend.text = element_text(size = my_als-10) - , legend.title = element_text(size = my_als-3) - , legend.position = c(0.8, 0.8)) + - #, legend.direction = "horizontal")+ - #, legend.position = "top")+ - labs(x = "DUET") - -p3 - -######################################################################## -#============== -# combine plots -#=============== -# 1) without labels -plot_lineage_dist_combined_dm_om -svg(plot_lineage_dist_combined_dm_om, width = 12, height = 6) - -OutPlot1 = cowplot::plot_grid(p1, p2 - , rel_widths = c(0.5/2, 0.5/2)) - -print(OutPlot1) -dev.off() - - -# 2) with labels -plot_lineage_dist_combined_dm_om_L -svg(plot_lineage_dist_combined_dm_om_L, width = 12, height = 6) - -OutPlot2 = cowplot::plot_grid(p1, p2 - #, labels = c("(a)", "(b)") - , labels = "AUTO" - #, label_x = -0.045, label_y = 0.92 - #, hjust = -0.7, vjust = -0.5 - #, align = "h" - , rel_widths = c(0.5/2, 0.5/2) - , label_size = my_als) - -print(OutPlot2) -dev.off() - -############################################################################## From b7d50fbbcd15d0a78b6cfa04b27520759e9e8e47 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Thu, 9 Sep 2021 16:10:11 +0100 Subject: [PATCH 16/51] added lineage_labels and mutation_info_labels to combinig_dfs_plotting --- scripts/functions/combining_dfs_plotting.R | 34 ++++ scripts/functions/tests/test_lineage_dist.R | 3 +- .../lineage_basic_barplots_combined.R | 8 +- scripts/plotting/lineage_bp_data.R | 180 ------------------ 4 files changed, 38 insertions(+), 187 deletions(-) delete mode 100755 scripts/plotting/lineage_bp_data.R diff --git a/scripts/functions/combining_dfs_plotting.R b/scripts/functions/combining_dfs_plotting.R index 18e0374..848face 100644 --- a/scripts/functions/combining_dfs_plotting.R +++ b/scripts/functions/combining_dfs_plotting.R @@ -152,6 +152,40 @@ combining_dfs_plotting <- function( my_df_u unique(meta_muts_u[! meta_muts_u %in% merged_muts_u]) quit() } + + # Quick formatting: pretty labels + #----------------------- + # mutation_info_labels + #----------------------- + merged_df2$mutation_info_labels = ifelse(merged_df2$mutation_info == dr_muts_col + , "DM", "OM") + merged_df2$mutation_info_labels = factor(merged_df2$mutation_info_labels) + #----------------------- + # lineage labels + #----------------------- + merged_df2$lineage_labels = gsub("lineage", "L", merged_df2$lineage) + + merged_df2$lineage_labels = factor(merged_df2$lineage_labels, c("L1" + , "L2" + , "L3" + , "L4" + , "L5" + , "L6" + , "L7" + , "LBOV" + , "L1;L2" + , "L1;L3" + , "L1;L4" + , "L2;L3" + , "L2;L3;L4" + , "L2;L4" + , "L2;L6" + , "L2;LBOV" + , "L3;L4" + , "L4;L6" + , "L4;L7" + , "")) + #================================================================= # Merge 2: merged_df3 diff --git a/scripts/functions/tests/test_lineage_dist.R b/scripts/functions/tests/test_lineage_dist.R index 1f40d16..eeeebe5 100644 --- a/scripts/functions/tests/test_lineage_dist.R +++ b/scripts/functions/tests/test_lineage_dist.R @@ -16,12 +16,13 @@ cat("cols imported:" ############################################################# - +# without facet lineage_distP(lin_dist_plot , with_facet = F , leg_label = "Mutation Class" ) +# without facet lineage_distP(lin_dist_plot , with_facet = T , facet_wrap_var = "mutation_info_labels" diff --git a/scripts/plotting/lineage_basic_barplots_combined.R b/scripts/plotting/lineage_basic_barplots_combined.R index b6f25e6..837e57b 100755 --- a/scripts/plotting/lineage_basic_barplots_combined.R +++ b/scripts/plotting/lineage_basic_barplots_combined.R @@ -55,9 +55,7 @@ plot_basic_bp_lineage_cl # Data: All lineages or # selected few #------------------------ -sel_lineages = levels(lin_lf$sel_lineages)[1:4] -sel_lineages -lin_lf_plot = lin_lf[lin_lf$sel_lineages%in%sel_lineages,] +lin_lf_plot = lin_lf[lin_lf$sel_lineages%in%c("L1", "L2", "L3", "L4"),] str(lin_lf_plot) # drop unused factor levels @@ -90,9 +88,7 @@ lin_countP # Data: All lineages or # selected few #------------------------ -sel_lineages = levels(lin_wf$sel_lineages)[1:4] -sel_lineages -lin_wf_plot = lin_wf[lin_wf$sel_lineages%in%sel_lineages,] +lin_wf_plot = lin_wf[lin_wf$sel_lineages%in%c("L1", "L2", "L3", "L4"),] str(lin_wf_plot) # drop unused factor levels diff --git a/scripts/plotting/lineage_bp_data.R b/scripts/plotting/lineage_bp_data.R deleted file mode 100755 index e9ab929..0000000 --- a/scripts/plotting/lineage_bp_data.R +++ /dev/null @@ -1,180 +0,0 @@ -#!/usr/bin/env Rscript -######################################################### -# TASK: Script to format data for lineage barplots: -# WF and LF data with lineage sample, and snp counts -# sourced by get_plotting_dfs.R -######################################################### -# working dir and loading libraries -# getwd() -# setwd("~/git/LSHTM_analysis/scripts/plotting") -# getwd() - -# make cmd -# globals -# drug = "streptomycin" -# gene = "gid" - -# source("get_plotting_dfs.R") -#======================================================================= -################################################# -# Get data with lineage count, and snp diversity -################################################# -table(merged_df2$lineage) - -if (table(merged_df2$lineage == "")[[2]]) { - -cat("\nMissing samples with lineage classification:", table(merged_df2$lineage == "")[[2]]) - -} - -# Add pretty lineage labels and mut_info_labels -class(merged_df2$lineage); table(merged_df2$lineage) -merged_df2$lineage_labels = gsub("lineage", "L", merged_df2$lineage) -table(merged_df2$lineage_labels) - -class(merged_df2$lineage_labels) - -merged_df2$lineage_labels = factor(merged_df2$lineage_labels, c("L1" - , "L2" - , "L3" - , "L4" - , "L5" - , "L6" - , "L7" - , "LBOV" - , "L1;L2" - , "L1;L3" - , "L1;L4" - , "L2;L3" - , "L2;L3;L4" - , "L2;L4" - , "L2;L6" - , "L2;LBOV" - , "L3;L4" - , "L4;L6" - , "L4;L7" - , "")) - -class(merged_df2$lineage_labels); nlevels(merged_df2$lineage_labels) - - -################################## -# WF data: lineages with -# snp count -# total_samples -# snp diversity (perc) -################################## -sel_lineages = levels(merged_df2$lineage_labels) - -lin_wf = data.frame(sel_lineages) #4, 1 -total_snps_u = NULL -total_samples = NULL - -for (i in sel_lineages){ - #print(i) - curr_total = length(unique(merged_df2$id)[merged_df2$lineage_labels==i]) - #print(curr_total) - total_samples = c(total_samples, curr_total) - print(total_samples) - - foo = merged_df2[merged_df2$lineage_labels==i,] - print(paste0(i, "=======\n")) - print(length(unique(foo$mutationinformation))) - curr_count = length(unique(foo$mutationinformation)) - - total_snps_u = c(total_snps_u, curr_count) -} -lin_wf - -# Add these counts as columns to the df -lin_wf$num_snps_u = total_snps_u -lin_wf$total_samples = total_samples - -# Add SNP diversity -lin_wf$snp_diversity = lin_wf$num_snps_u/lin_wf$total_samples -lin_wf - -#===================== -# Add some formatting -#===================== -# SNP diversity -lin_wf$snp_diversity_f = round( (lin_wf$snp_diversity * 100), digits = 0) -lin_wf$snp_diversity_f = paste0(lin_wf$snp_diversity_f, "%") - -# Important: Check factors so that x-axis categ appear as you want -lin_wf$sel_lineages = factor(lin_wf$sel_lineages, c("L1" - , "L2" - , "L3" - , "L4" - , "L5" - , "L6" - , "L7" - , "LBOV" - , "L1;L2" - , "L1;L3" - , "L1;L4" - , "L2;L3" - , "L2;L3;L4" - , "L2;L4" - , "L2;L6" - , "L2;LBOV" - , "L3;L4" - , "L4;L6" - , "L4;L7" - , "")) - -levels(lin_wf$sel_lineages) - -################################## -# LF data: lineages with -# snp count -# total_samples -# snp diversity (perc) -################################## -names(lin_wf) -tot_cols = ncol(lin_wf) -pivot_cols = c("sel_lineages", "snp_diversity", "snp_diversity_f") -pivot_cols_n = length(pivot_cols) - -expected_rows = nrow(lin_wf) * ( length(lin_wf) - pivot_cols_n ) - -lin_lf <- gather(lin_wf - , count_categ - , p_count - , num_snps_u:total_samples - , factor_key = TRUE) -lin_lf - -# quick checks -if ( nrow(lin_lf) == expected_rows ){ - cat("\nPASS: Lineage LF data created" - , "\nnrow: ", nrow(lin_lf) - , "\nncol: ", ncol(lin_lf)) -} else { - cat("\nFAIL: numbers mismatch" - , "\nExpected nrow: ", expected_rows) -} - -# Important: Relevel factors so that x-axis categ appear as you want -lin_lf$sel_lineages = factor(lin_lf$sel_lineages, c("L1" - , "L2" - , "L3" - , "L4" - , "L5" - , "L6" - , "L7" - , "LBOV" - , "L1;L2" - , "L1;L3" - , "L1;L4" - , "L2;L3" - , "L2;L3;L4" - , "L2;L4" - , "L2;L6" - , "L2;LBOV" - , "L3;L4" - , "L4;L6" - , "L4;L7" - , "")) - -levels(lin_lf$sel_lineages) From 93038fa17c677ace0321efad2025c608121602b4 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Thu, 9 Sep 2021 16:14:14 +0100 Subject: [PATCH 17/51] added lineage_dist.R nad renamed lineage_bp_data file to lineage_data --- scripts/functions/lineage_dist.R | 69 ++++++++++++++ scripts/plotting/lineage_data.R | 155 +++++++++++++++++++++++++++++++ 2 files changed, 224 insertions(+) create mode 100644 scripts/functions/lineage_dist.R create mode 100755 scripts/plotting/lineage_data.R diff --git a/scripts/functions/lineage_dist.R b/scripts/functions/lineage_dist.R new file mode 100644 index 0000000..aee1b62 --- /dev/null +++ b/scripts/functions/lineage_dist.R @@ -0,0 +1,69 @@ +############################### +# TASK: function to plot lineage +# dist plots with or without facet +# think about color palette +# for stability +############################## + +#n_colours = length(unique(lin_dist_plot$duet_scaled)) +#my_palette <- colorRampPalette(c(mcsm_red2, mcsm_red1, mcsm_mid, mcsm_blue1, mcsm_blue2))(n = n_colours+1) + + +lineage_distP <- function(plotdf + , x_axis = "duet_scaled" + , y_axis = "lineage_labels" + , x_lab = "DUET" + , with_facet = F + , facet_wrap_var = "" + , fill_categ = "mutation_info_labels" + , fill_categ_cols = c("#E69F00", "#999999") + , my_ats = 15 # axis text size + , my_als = 20 # axis label size + , my_leg_ts = 16 + , my_leg_title = 16 + , my_strip_ts = 20 + , leg_pos = c(0.8, 0.9) + , leg_pos_wf = c("top", "left", "bottom", "right") + , leg_dir_wf = c("horizontal", "vertical") + , leg_label = "") + +{ + +LinDistP = ggplot(plotdf, aes_string(x = x_axis + , y = y_axis))+ + + geom_density_ridges(aes_string(fill = fill_categ) + , scale = 3 + , size = 0.3 + , alpha = 0.8) + + scale_x_continuous(expand = c(0.01, 0.01)) + + #coord_cartesian( xlim = c(-1, 1)) + + scale_fill_manual(values = fill_categ_cols) + + theme(axis.text.x = element_text(size = my_ats + , angle = 90 + , hjust = 1 + , vjust = 0.4) + , axis.text.y = element_text(size = my_ats) + , axis.title.x = element_text(size = my_ats) + , axis.title.y = element_blank() + , strip.text = element_text(size = my_strip_ts) + , legend.text = element_text(size = my_leg_ts) + , legend.title = element_text(size = my_leg_title) + , legend.position = c(0.8, 0.9)) + + labs(x = x_lab + , fill = leg_label) + +if (with_facet){ + + # used reformulate or make as formula + #fwv = reformulate(facet_wrap_var) + fwv = as.formula(paste0("~", facet_wrap_var)) + + LinDistP = LinDistP + + facet_wrap(fwv) + + theme(legend.position = leg_pos_wf + , legend.direction = leg_dir_wf) +} + +return(LinDistP) +} diff --git a/scripts/plotting/lineage_data.R b/scripts/plotting/lineage_data.R new file mode 100755 index 0000000..29a6348 --- /dev/null +++ b/scripts/plotting/lineage_data.R @@ -0,0 +1,155 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: Script to format data for lineage barplots: +# WF and LF data with lineage sample, and snp counts +# sourced by get_plotting_dfs.R +######################################################### +# working dir and loading libraries +# getwd() +# setwd("~/git/LSHTM_analysis/scripts/plotting") +# getwd() + +# make cmd +# globals +# drug = "streptomycin" +# gene = "gid" + +# source("get_plotting_dfs.R") +#======================================================================= +################################################# +# Get data with lineage count, and snp diversity +################################################# +table(merged_df2$lineage) + +if (table(merged_df2$lineage == "")[[2]]) { + +cat("\nMissing samples with lineage classification:", table(merged_df2$lineage == "")[[2]]) + +} + +table(merged_df2$lineage_labels) +class(merged_df2$lineage_labels); nlevels(merged_df2$lineage_labels) + +################################## +# WF data: lineages with +# snp count +# total_samples +# snp diversity (perc) +################################## +sel_lineages = levels(merged_df2$lineage_labels) + +lin_wf = data.frame(sel_lineages) #4, 1 +total_snps_u = NULL +total_samples = NULL + +for (i in sel_lineages){ + #print(i) + curr_total = length(unique(merged_df2$id)[merged_df2$lineage_labels==i]) + #print(curr_total) + total_samples = c(total_samples, curr_total) + print(total_samples) + + foo = merged_df2[merged_df2$lineage_labels==i,] + print(paste0(i, "=======\n")) + print(length(unique(foo$mutationinformation))) + curr_count = length(unique(foo$mutationinformation)) + + total_snps_u = c(total_snps_u, curr_count) +} +lin_wf + +# Add these counts as columns to the df +lin_wf$num_snps_u = total_snps_u +lin_wf$total_samples = total_samples +lin_wf + +# Add SNP diversity +lin_wf$snp_diversity = lin_wf$num_snps_u/lin_wf$total_samples +lin_wf + +#===================== +# Add some formatting +#===================== +# SNP diversity +lin_wf$snp_diversity_f = round( (lin_wf$snp_diversity * 100), digits = 0) +lin_wf$snp_diversity_f = paste0(lin_wf$snp_diversity_f, "%") + +lin_wf$sel_lineages + +# Important: Check factors so that x-axis categ appear as you want +lin_wf$sel_lineages = factor(lin_wf$sel_lineages, c("L1" + , "L2" + , "L3" + , "L4" + , "L5" + , "L6" + , "L7" + , "LBOV" + , "L1;L2" + , "L1;L3" + , "L1;L4" + , "L2;L3" + , "L2;L3;L4" + , "L2;L4" + , "L2;L6" + , "L2;LBOV" + , "L3;L4" + , "L4;L6" + , "L4;L7" + , "")) + +levels(lin_wf$sel_lineages) + +################################## +# LF data: lineages with +# snp count +# total_samples +# snp diversity (perc) +################################## +names(lin_wf) +tot_cols = ncol(lin_wf) +pivot_cols = c("sel_lineages", "snp_diversity", "snp_diversity_f") +pivot_cols_n = length(pivot_cols) + +expected_rows = nrow(lin_wf) * ( length(lin_wf) - pivot_cols_n ) + +lin_lf <- gather(lin_wf + , count_categ + , p_count + , num_snps_u:total_samples + , factor_key = TRUE) +lin_lf + +# quick checks +if ( nrow(lin_lf ) == expected_rows ){ + cat("\nPASS: Lineage LF data created" + , "\nnrow: ", nrow(lin_lf) + , "\nncol: ", ncol(lin_lf)) +} else { + cat("\nFAIL: numbers mismatch" + , "\nExpected nrow: ", expected_rows) +} + +# Important: Relevel factors so that x-axis categ appear as you want +lin_lf$sel_lineages = factor(lin_lf$sel_lineages, c("L1" + , "L2" + , "L3" + , "L4" + , "L5" + , "L6" + , "L7" + , "LBOV" + , "L1;L2" + , "L1;L3" + , "L1;L4" + , "L2;L3" + , "L2;L3;L4" + , "L2;L4" + , "L2;L6" + , "L2;LBOV" + , "L3;L4" + , "L4;L6" + , "L4;L7" + , "")) + +levels(lin_lf$sel_lineages) From 2bd85f70212f08c9be3e65293270bbe0e3f84079 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Thu, 9 Sep 2021 16:15:07 +0100 Subject: [PATCH 18/51] added lineage_dist_plots.R --- scripts/plotting/lineage_dist_plots.R | 114 ++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100644 scripts/plotting/lineage_dist_plots.R diff --git a/scripts/plotting/lineage_dist_plots.R b/scripts/plotting/lineage_dist_plots.R new file mode 100644 index 0000000..a425f37 --- /dev/null +++ b/scripts/plotting/lineage_dist_plots.R @@ -0,0 +1,114 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: Lineage dist plots: ggridges + +# Output: 1 or 2 SVGs for PS stability + +########################################################## +# Installing and loading required packages +########################################################## +getwd() +setwd("~/git/LSHTM_analysis/scripts/plotting/") +getwd() + +source("Header_TT.R") # also loads all my functions + +#=========== +# input +#=========== +#drug = "streptomycin" +#gene = "gid" +source("get_plotting_dfs.R") + +spec = matrix(c( + "drug" , "d", 1, "character", + "gene" , "g", 1, "character", + "data_file1" , "fa", 2, "character", + "data_file2" , "fb", 2, "character" +), byrow = TRUE, ncol = 4) + +opt = getopt(spec) + +drug = opt$drug +gene = opt$gene +infile_params = opt$data_file1 +infile_metadata = opt$data_file2 + +if(is.null(drug)|is.null(gene)) { + stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)") +} + +#======= +# output +#======= +lineage_dist_dm_om_ps = "lineage_dist_dm_om_PS.svg" +plot_lineage_dist_dm_om_ps = paste0(plotdir,"/", lineage_dist_dm_om_ps) +#======================================================================== + +########################### +# Data for plots +# you need merged_df2 or merged_df2_comp +# since this is one-many relationship +# i.e the same SNP can belong to multiple lineages +# using the _comp dataset means +# we lose some muts and at this level, we should use +# as much info as available, hence use df with NA +########################### + +#=================== +# Data for plots +#=================== +# quick checks +table(merged_df2$mutation_info_labels); levels(merged_df2$lineage_labels) +table(merged_df2$lineage_labels); levels(merged_df2$mutation_info_labels) + +lin_dist_plot = merged_df2[merged_df2$lineage_labels%in%c("L1", "L2", "L3", "L4"),] +table(lin_dist_plot$lineage_labels); nlevels(lin_dist_plot$lineage_labels) + +# refactor +lin_dist_plot$lineage_labels = factor(lin_dist_plot$lineage_labels) +nlevels(lin_dist_plot$lineage_labels) + +#----------------------------------------------------------------------- +# IMPORTANT RESULTS to put inside table or text for interactive plots + +sum(table(lin_dist_plot$lineage_labels)) #{RESULT: Total number of samples for lineage} + +table(lin_dist_plot$lineage_labels)#{RESULT: No of samples within lineage} + +length(unique(lin_dist_plot$mutationinformation))#{Result: No. of unique mutations selected lineages contribute to} +length(lin_dist_plot$mutationinformation) + +u2 = unique(merged_df2$mutationinformation) +u = unique(lin_dist_plot$mutationinformation) +check = u2[!u2%in%u]; print(check) #{Muts not present within selected lineages} +#----------------------------------------------------------------------- +# without facet +linP_dm_om = lineage_distP(lin_dist_plot + , with_facet = F + , x_axis = "deepddg" + , y_axis = "lineage_labels" + , x_lab = "DeepDDG" + , leg_label = "Mutation Class" +) +linP_dm_om + +# with facet +linP_dm_om_facet = lineage_distP(lin_dist_plot + , with_facet = T + , facet_wrap_var = "mutation_info_labels" + , leg_label = "Mutation Class" + , leg_pos_wf = "none" + , leg_dir_wf = "horizontal" + +) +linP_dm_om_facet + +#================= +# output plot: +# without facet +#================= +svg(plot_lineage_dist_dm_om_ps) +linP_dm_om + +dev.off() From dda5d1ea9386e8c6405eb29cd4ddd7cef58f1dfd Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Thu, 9 Sep 2021 16:16:18 +0100 Subject: [PATCH 19/51] moved old lineage_dist plot scripts to redundant --- .../redundant/lineage_dist_combined_PS.R | 303 +++++++++++++++++ .../lineage_dist_dm_om_combined_PS.R | 309 ++++++++++++++++++ 2 files changed, 612 insertions(+) create mode 100755 scripts/plotting/redundant/lineage_dist_combined_PS.R create mode 100755 scripts/plotting/redundant/lineage_dist_dm_om_combined_PS.R diff --git a/scripts/plotting/redundant/lineage_dist_combined_PS.R b/scripts/plotting/redundant/lineage_dist_combined_PS.R new file mode 100755 index 0000000..bf1c75b --- /dev/null +++ b/scripts/plotting/redundant/lineage_dist_combined_PS.R @@ -0,0 +1,303 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: Lineage dist plots: ggridges + +# Output: 2 SVGs for PS stability + +# 1) all muts +# 2) dr_muts + +########################################################## +# Installing and loading required packages +########################################################## +getwd() +setwd("~/git/LSHTM_analysis/scripts/plotting/") +getwd() + +source("Header_TT.R") +library(ggridges) +source("combining_dfs_plotting.R") +# PS combined: +# 1) merged_df2 +# 2) merged_df2_comp +# 3) merged_df3 +# 4) merged_df3_comp + +# LIG combined: +# 5) merged_df2_lig +# 6) merged_df2_comp_lig +# 7) merged_df3_lig +# 8) merged_df3_comp_lig + +# 9) my_df_u +# 10) my_df_u_lig + +cat("Directories imported:" + , "\n====================" + , "\ndatadir:", datadir + , "\nindir:", indir + , "\noutdir:", outdir + , "\nplotdir:", plotdir) + +cat("Variables imported:" + , "\n=====================" + , "\ndrug:", drug + , "\ngene:", gene + , "\ngene_match:", gene_match + , "\nAngstrom symbol:", angstroms_symbol + , "\nNo. of duplicated muts:", dup_muts_nu + , "\nNA count for ORs:", na_count + , "\nNA count in df2:", na_count_df2 + , "\nNA count in df3:", na_count_df3 + , "\ndr_muts_col:", dr_muts_col + , "\nother_muts_col:", other_muts_col + , "\ndrtype_col:", resistance_col) + +#======= +# output +#======= +lineage_dist_combined = "lineage_dist_combined_PS.svg" +plot_lineage_dist_combined = paste0(plotdir,"/", lineage_dist_combined) +#======================================================================== + +########################### +# Data for plots +# you need merged_df2 or merged_df2_comp +# since this is one-many relationship +# i.e the same SNP can belong to multiple lineages +# using the _comp dataset means +# we lose some muts and at this level, we should use +# as much info as available, hence use df with NA +########################### +# REASSIGNMENT +my_df = merged_df2 + +# delete variables not required +rm(my_df_u, merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) + +# quick checks +colnames(my_df) +str(my_df) + +# Ensure correct data type in columns to plot: need to be factor +is.factor(my_df$lineage) +my_df$lineage = as.factor(my_df$lineage) +is.factor(my_df$lineage) + +table(my_df$mutation_info) + +# subset df with dr muts only +my_df_dr = subset(my_df, mutation_info == "dr_mutations_pyrazinamide") +table(my_df_dr$mutation_info) + +######################################################################## +# end of data extraction and cleaning for plots # +######################################################################## + +#========================== +# Plot 1: ALL Muts +# x = mcsm_values, y = dist +# fill = stability +#============================ + +my_plot_name = 'lineage_dist_PS.svg' + +plot_lineage_duet = paste0(plotdir,"/", my_plot_name) + +#=================== +# Data for plots +#=================== +table(my_df$lineage); str(my_df$lineage) + +# subset only lineages1-4 +sel_lineages = c("lineage1" + , "lineage2" + , "lineage3" + , "lineage4" + #, "lineage5" + #, "lineage6" + #, "lineage7" + ) + +# uncomment as necessary +df_lin = subset(my_df, subset = lineage %in% sel_lineages ) +table(df_lin$lineage) + +# refactor +df_lin$lineage = factor(df_lin$lineage) + +sum(table(df_lin$lineage)) #{RESULT: Total number of samples for lineage} + +table(df_lin$lineage)#{RESULT: No of samples within lineage} + +length(unique(df_lin$mutationinformation))#{Result: No. of unique mutations the 4 lineages contribute to} + +length(df_lin$mutationinformation) + +u2 = unique(my_df$mutationinformation) +u = unique(df_lin$mutationinformation) +check = u2[!u2%in%u]; print(check) #{Muts not present within selected lineages} + +#%%%%%%%%%%%%%%%%%%%%%%%%% +# REASSIGNMENT +df <- df_lin +#%%%%%%%%%%%%%%%%%%%%%%%%% + +rm(df_lin) + +#****************** +# generate distribution plot of lineages +#****************** +# 2 : ggridges (good!) +my_ats = 15 # axis text size +my_als = 20 # axis label size + +my_labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4' + #, 'Lineage 5', 'Lineage 6', 'Lineage 7' + ) +names(my_labels) = c('lineage1', 'lineage2', 'lineage3', 'lineage4' + # , 'lineage5', 'lineage6', 'lineage7' + ) +# check plot name +plot_lineage_duet + +# output svg +#svg(plot_lineage_duet) +p1 = ggplot(df, aes(x = duet_scaled + , y = duet_outcome))+ + + #printFile=geom_density_ridges_gradient( + geom_density_ridges_gradient(aes(fill = ..x..) + #, jittered_points = TRUE + , scale = 3 + , size = 0.3 ) + + facet_wrap( ~lineage + , scales = "free" + #, switch = 'x' + , labeller = labeller(lineage = my_labels) ) + + coord_cartesian( xlim = c(-1, 1)) + + scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4") + , name = "DUET" ) + + theme(axis.text.x = element_text(size = my_ats + , angle = 90 + , hjust = 1 + , vjust = 0.4) + + , axis.text.y = element_blank() + , axis.title.x = element_blank() + , axis.title.y = element_blank() + , axis.ticks.y = element_blank() + , plot.title = element_blank() + , strip.text = element_text(size = my_als) + , legend.text = element_text(size = my_als-5) + , legend.title = element_text(size = my_als) +) + +print(p1) +#dev.off() + +####################################################################### +# lineage distribution plot for dr_muts +####################################################################### + +#========================== +# Plot 2: dr muts ONLY +# x = mcsm_values, y = dist +# fill = stability +#============================ + +my_plot_name_dr = 'lineage_dist_dr_muts_PS.svg' + +plot_lineage_dr_duet = paste0(plotdir,"/", my_plot_name_dr) + +#=================== +# Data for plots +#=================== +table(my_df_dr$lineage); str(my_df_dr$lineage) + +# uncomment as necessary +df_lin_dr = subset(my_df_dr, subset = lineage %in% sel_lineages) +table(df_lin_dr$lineage) + +# refactor +df_lin_dr$lineage = factor(df_lin_dr$lineage) + +sum(table(df_lin_dr$lineage)) #{RESULT: Total number of samples for lineage} + +table(df_lin_dr$lineage)#{RESULT: No of samples within lineage} + +length(unique(df_lin_dr$mutationinformation))#{Result: No. of unique mutations the 4 lineages contribute to} + +length(df_lin_dr$mutationinformation) + +u2 = unique(my_df_dr$mutationinformation) +u = unique(df_lin_dr$mutationinformation) +check = u2[!u2%in%u]; print(check) #{Muts not present within selected lineages} + +#%%%%%%%%%%%%%%%%%%%%%%%%% +# REASSIGNMENT +df_dr <- df_lin_dr +#%%%%%%%%%%%%%%%%%%%%%%%%% + +rm(df_lin_dr) + +#****************** +# generate distribution plot of lineages +#****************** +# 2 : ggridges (good!) +my_ats = 15 # axis text size +my_als = 20 # axis label size + + +# check plot name +plot_lineage_dr_duet + +# output svg +#svg(plot_lineage_dr_duet) +p2 = ggplot(df_dr, aes(x = duet_scaled + , y = duet_outcome))+ + + geom_density_ridges_gradient(aes(fill = ..x..) + #, jittered_points = TRUE + , scale = 3 + , size = 0.3) + + #geom_point(aes(size = or_mychisq))+ + facet_wrap( ~lineage + , scales = "free" + #, switch = 'x' + , labeller = labeller(lineage = my_labels) ) + + coord_cartesian( xlim = c(-1, 1) + #, ylim = c(0, 6) + #, clip = "off" + ) + + scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4") + , name = "DUET" ) + + theme(axis.text.x = element_text(size = my_ats + , angle = 90 + , hjust = 1 + , vjust = 0.4) + , axis.text.y = element_blank() + , axis.title.x = element_blank() + , axis.title.y = element_blank() + , axis.ticks.y = element_blank() + , plot.title = element_blank() + , strip.text = element_text(size = my_als) + , legend.text = element_text(size = 10) + , legend.title = element_text(size = my_als) + #, legend.position = "none" + ) + +print(p2) +#dev.off() +######################################################################## +#============== +# combine plot +#=============== + +svg(plot_lineage_dist_combined, width = 12, height = 6) + +printFile = cowplot::plot_grid(p1, p2 + , label_size = my_als+10) + +print(printFile) +dev.off() diff --git a/scripts/plotting/redundant/lineage_dist_dm_om_combined_PS.R b/scripts/plotting/redundant/lineage_dist_dm_om_combined_PS.R new file mode 100755 index 0000000..3875382 --- /dev/null +++ b/scripts/plotting/redundant/lineage_dist_dm_om_combined_PS.R @@ -0,0 +1,309 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: Lineage dist plots: ggridges + +# Output: 2 SVGs for PS stability + +# 1) all muts +# 2) dr_muts + +########################################################## +# Installing and loading required packages +########################################################## +getwd() +setwd("~/git/LSHTM_analysis/scripts/plotting/") +getwd() + +source("Header_TT.R") + +source("get_plotting_dfs.R") + +cat("cols imported:" + , mcsm_red2, mcsm_red1, mcsm_mid, mcsm_blue1, mcsm_blue2) + +#======= +# output +#======= +lineage_dist_combined_dm_om = "lineage_dist_combined_dm_om_PS.svg" +plot_lineage_dist_combined_dm_om = paste0(plotdir,"/", lineage_dist_combined_dm_om) + +lineage_dist_combined_dm_om_L = "lineage_dist_combined_dm_om_PS_labelled.svg" +plot_lineage_dist_combined_dm_om_L = paste0(plotdir,"/", lineage_dist_combined_dm_om_L) + +#======================================================================== + +########################### +# Data for plots +# you need merged_df2 or merged_df2_comp +# since this is one-many relationship +# i.e the same SNP can belong to multiple lineages +# using the _comp dataset means +# we lose some muts and at this level, we should use +# as much info as available, hence use df with NA +########################### + +#=================== +# Data for plots +#=================== +lin_dist_plot = merged_df2[merged_df2$lineage%in%c("lineage1", "lineage2", "lineage3", "lineage4"),] +table(lin_dist_plot$lineage) + +#{RESULT: Total number of samples for lineage} +sum(table(lin_dist_plot$lineage)) + +#{RESULT: No of samples within lineage} +table(lin_dist_plot$lineage) + +#{Result: No. of unique mutations the 4 lineages contribute to} +length(unique(lin_dist_plot$mutationinformation)) + +u2 = unique(lin_dist_plot$mutationinformation) +u = unique(lin_dist_plot$mutationinformation) + +#{Result:Muts not present within selected lineages} +check = u2[!u2%in%u]; print(check) + +# workaround to make labels appear nicely for in otherwise cases +#================== +# lineage: labels +# from "plyr" +#================== +#{Result:No of samples in selected lineages} +table(lin_dist_plot$lineage) + +lin_dist_plot$lineage_labels = mapvalues(lin_dist_plot$lineage + , from = c("lineage1","lineage2", "lineage3", "lineage4") + , to = c("Lineage 1", "Lineage 2", "Lineage 3", "Lineage 4")) +table(lin_dist_plot$lineage_labels) + +table(lin_dist_plot$lineage_labels) == table(lin_dist_plot$lineage) + +#======================== +# mutation_info: labels +#======================== +#{Result:No of DM and OM muts in selected lineages} +table(lin_dist_plot$mutation_info) + +lin_dist_plot$mutation_info_labels = ifelse(lin_dist_plot$mutation_info == dr_muts_col + , "DM", "OM") +table(lin_dist_plot$mutation_info_labels) + +table(lin_dist_plot$mutation_info) == table(lin_dist_plot$mutation_info_labels) + +#======================== +# duet_outcome: labels +#======================== +#{Result: No. of D and S mutations in selected lineages} +table(lin_dist_plot$duet_outcome) + +lin_dist_plot$duet_outcome_labels = ifelse(lin_dist_plot$duet_outcome == "Destabilising" + , "D", "S") +table(lin_dist_plot$duet_outcome_labels) + +table(lin_dist_plot$duet_outcome) == table(lin_dist_plot$duet_outcome_labels) + + +#======================= +# subset dr muts only +#======================= +#my_df_dr = subset(df_lin, mutation_info == dr_muts_col) +#table(my_df_dr$mutation_info) +#table(my_df_dr$lineage) + +#========================= +# subset other muts only +#========================= +#my_df_other = subset(df_lin, mutation_info == other_muts_col) +#table(my_df_other$mutation_info) +#table(my_df_other$lineage) + +######################################################################## +# end of data extraction and cleaning for plots # +######################################################################## +#****************** +# generate distribution plot of lineages +#****************** +# 2 : ggridges (good!) +my_ats = 15 # axis text size +my_als = 20 # axis label size +n_colours = length(unique(lin_dist_plot$duet_scaled)) + +my_palette <- colorRampPalette(c(mcsm_red2, mcsm_red1, mcsm_mid, mcsm_blue1, mcsm_blue2))(n = n_colours+1) + +#======================================= +# Plot 1: lineage dist: geom_density_ridges_gradient (allows aesthetics to vary along ridgeline, no alpha setting!) +# else same as geom_density_ridges) +# x = duet_scaled +# y = duet_outcome +# fill = duet_scaled +# Facet: Lineage +#======================================= +# output individual svg +#plot_lineage_dist_duet_f paste0(plotdir,"/", "lineage_dist_duet_f.svg") +#plot_lineage_dist_duet_f +#svg(plot_lineage_dist_duet_f) + +p1 = ggplot(lin_dist_plot, aes(x = duet_scaled + #, y = duet_outcome + , y = mutation_info_labels + ))+ + geom_density_ridges_gradient(aes(fill = ..x..) + #, jittered_points = TRUE + , scale = 3 + , size = 0.3 ) + + facet_wrap( ~lineage_labels + #~mutation_info_labels + # ~mutation_info_labels + # , scales = "free" + # , labeller = labeller(lineage = my_labels) + ) + + #coord_cartesian( xlim = c(-1, 1)) + + scale_x_continuous(expand = c(0.01, 0)) + + + scale_fill_gradientn(colours = my_palette + , name = "DUET" + #, breaks = c(-1, 0, 1) + #, labels = c(-1,0,1) + #, limits = c(-1,1) + ) + + theme(axis.text.x = element_text(size = my_ats + , angle = 90 + , hjust = 1 + , vjust = 0.4) + #, axis.text.y = element_blank() + , axis.text.y = element_text(size = my_ats) + , axis.title.x = element_text(size = my_ats) + , axis.title.y = element_blank() + , axis.ticks.y = element_blank() + , plot.title = element_blank() + , strip.text = element_text(size = my_als) + , legend.text = element_text(size = my_als-10) + #, legend.title = element_text(size = my_als-6) + , legend.title = element_blank() + , legend.position = c(-0.08, 0.41) + , legend.direction = "horizontal" + , legend.position = "top" +)+ + labs(x = "DUET") + +p1 + + +#p1_with_legend = p1 + guides(fill = guide_colourbar(label = FALSE)) + +#======================================= +# Plot 2: lineage dist: geom_density_ridges, allows alpha to be set +# x = duet_scaled +# y = lineage_labels +# fill = mutation_info +# NO FACET +#======================================= +# output svg +#plot_lineage_dist_duet_dm_om = paste0(plotdir,"/", "lineage_dist_duet_dm_om.svg") +#plot_lineage_dist_duet_dm_om +#svg(plot_lineage_dist_duet_dm_om) + +p2 = ggplot(lin_dist_plot, aes(x = duet_scaled + , y = lineage_labels))+ + geom_density_ridges(aes(fill = factor(mutation_info_labels)) + , scale = 3 + , size = 0.3 + , alpha = 0.8) + + scale_x_continuous(expand = c(0.01, 0)) + + #coord_cartesian( xlim = c(-1, 1)) + + scale_fill_manual(values = c("#E69F00", "#999999")) + + theme(axis.text.x = element_text(size = my_ats + , angle = 90 + , hjust = 1 + , vjust = 0.4) + , axis.text.y = element_text(size = my_ats) + , axis.title.x = element_text(size = my_ats) + , axis.title.y = element_blank() + , axis.ticks.y = element_blank() + , plot.title = element_blank() + , strip.text = element_text(size = my_als) + , legend.text = element_text(size = my_als-4) + , legend.title = element_text(size = my_als-4) + , legend.position = c(0.8, 0.9)) + + labs(x = "DUET" + , fill = "Mutation class") # legend title + +p2 + +#======================================= +# Plot 3: lineage dist: geom_density_ridges_gradient (allows aesthetics to vary along ridgeline, no alpha setting!) +# else same as geom_density_ridges) +# x = duet_scaled +# y = lineage_labels +# fill = duet_scaled +# NO FACET (nf) +#======================================= +# output individual svg +#plot_lineage_dist_duet_nf = paste0(plotdir,"/", "lineage_dist_duet_nf.svg") +#plot_lineage_dist_duet_nf +#svg(plot_lineage_dist_duet_nf) + +p3 = ggplot(lin_dist_plot, aes(x = duet_scaled + , y = lineage_labels))+ + # geom_density_ridges_gradient(aes(fill = ..x..) + # #, jittered_points = TRUE + # , scale = 3 + # , size = 0.3 ) + + geom_density_ridges()+ + #facet_wrap (~mutation_info_labels) + + #coord_cartesian( xlim = c(-1, 1)) + + scale_x_continuous(expand = c(0.01, 0)) + + + #scale_fill_gradientn(colours = my_palette, name = "DUET") + + theme(axis.text.x = element_text(size = my_ats + , angle = 90 + , hjust = 1 + , vjust = 0.4) + + , axis.text.y = element_text(size = my_ats) + , axis.title.x = element_text(size = my_ats) + , axis.title.y = element_blank() + , axis.ticks.y = element_blank() + , plot.title = element_blank() + , strip.text = element_text(size = my_als) + , legend.text = element_text(size = my_als-10) + , legend.title = element_text(size = my_als-3) + , legend.position = c(0.8, 0.8)) + + #, legend.direction = "horizontal")+ + #, legend.position = "top")+ + labs(x = "DUET") + +p3 + +######################################################################## +#============== +# combine plots +#=============== +# 1) without labels +plot_lineage_dist_combined_dm_om +svg(plot_lineage_dist_combined_dm_om, width = 12, height = 6) + +OutPlot1 = cowplot::plot_grid(p1, p2 + , rel_widths = c(0.5/2, 0.5/2)) + +print(OutPlot1) +dev.off() + + +# 2) with labels +plot_lineage_dist_combined_dm_om_L +svg(plot_lineage_dist_combined_dm_om_L, width = 12, height = 6) + +OutPlot2 = cowplot::plot_grid(p1, p2 + #, labels = c("(a)", "(b)") + , labels = "AUTO" + #, label_x = -0.045, label_y = 0.92 + #, hjust = -0.7, vjust = -0.5 + #, align = "h" + , rel_widths = c(0.5/2, 0.5/2) + , label_size = my_als) + +print(OutPlot2) +dev.off() + +############################################################################## From 4ba4ff602e6235298cddb9408fb13a59bc1e437e Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Fri, 10 Sep 2021 16:58:36 +0100 Subject: [PATCH 20/51] added foldx_scaled and deepddg_scaled values added to combine_df.py and also used that script to merge all the dfs so that merged_df2 and merged_df3 are infact what we need for downstream processing --- scripts/combining_dfs.py | 240 +++++++++--- scripts/plotting/get_plotting_dfs.R | 452 ++++++---------------- scripts/plotting/lineage_data.R | 30 +- scripts/plotting/lineage_dist_plots.R | 71 +++- scripts/plotting/other_plots_data.R | 538 -------------------------- 5 files changed, 354 insertions(+), 977 deletions(-) delete mode 100755 scripts/plotting/other_plots_data.R diff --git a/scripts/combining_dfs.py b/scripts/combining_dfs.py index 634af18..4e2781e 100755 --- a/scripts/combining_dfs.py +++ b/scripts/combining_dfs.py @@ -41,6 +41,7 @@ import pandas as pd from pandas import DataFrame import numpy as np import argparse +from functools import reduce #======================================================================= #%% specify input and curr dir homedir = os.path.expanduser('~') @@ -92,19 +93,6 @@ outdir = args.output_dir gene_match = gene + '_p.' print('mut pattern for gene', gene, ':', gene_match) -# !"Redundant, now that improvements have been made! -# See section "REGEX" -# nssnp_match = gene_match +'[A-Za-z]{3}[0-9]+[A-Za-z]{3}' -# print('nsSNP for gene', gene, ':', nssnp_match) - -# wt_regex = gene_match.lower()+'([A-Za-z]{3})' -# print('wt regex:', wt_regex) - -# mut_regex = r'[0-9]+(\w{3})$' -# print('mt regex:', mut_regex) - -# pos_regex = r'([0-9]+)' -# print('position regex:', pos_regex) #%%======================================================================= #============== # directories @@ -122,49 +110,52 @@ if not outdir: # input #======= #in_filename_mcsm = gene.lower() + '_complex_mcsm_norm.csv' -in_filename_mcsm = gene.lower() + '_complex_mcsm_norm_SAM.csv' # gidb -in_filename_foldx = gene.lower() + '_foldx.csv' -in_filename_deepddg = gene.lower() + '_ni_deepddg.csv' # change to decent filename and put it in the correct dir - -in_filename_dssp = gene.lower() + '_dssp.csv' -in_filename_kd = gene.lower() + '_kd.csv' -in_filename_rd = gene.lower() + '_rd.csv' - +in_filename_mcsm = gene.lower() + '_complex_mcsm_norm_SAM.csv' # gidb +in_filename_foldx = gene.lower() + '_foldx.csv' +in_filename_deepddg = gene.lower() + '_ni_deepddg.csv' # change to decent filename and put it in the correct dir +in_filename_dssp = gene.lower() + '_dssp.csv' +in_filename_kd = gene.lower() + '_kd.csv' +in_filename_rd = gene.lower() + '_rd.csv' #in_filename_snpinfo = 'ns' + gene.lower() + '_snp_info_f.csv' # gwas f info -in_filename_afor = gene.lower() + '_af_or.csv' +in_filename_afor = gene.lower() + '_af_or.csv' #in_filename_afor_kin = gene.lower() + '_af_or_kinship.csv' +infilename_dynamut = gene.lower() + '_complex_dynamut_norm.csv' +infilename_dynamut2 = gene.lower() + '_complex_dynamut2_norm.csv' +infilename_mcsm_na = gene.lower() + '_complex_mcsm_na_norm.csv' +infilename_mcsm_f_snps = gene.lower() + '_mcsm_formatted_snps.csv' -infile_mcsm = outdir + in_filename_mcsm -infile_foldx = outdir + in_filename_foldx +infile_mcsm = outdir + in_filename_mcsm +infile_foldx = outdir + in_filename_foldx infile_deepddg = outdir + in_filename_deepddg +infile_dssp = outdir + in_filename_dssp +infile_kd = outdir + in_filename_kd +infile_rd = outdir + in_filename_rd +#infile_snpinfo = outdir + in_filename_snpinfo +infile_afor = outdir + in_filename_afor +#infile_afor_kin = outdir + in_filename_afor_kin +infile_dynamut = outdir + 'dynamut_results/' + infilename_dynamut +infile_dynamut2 = outdir + 'dynamut_results/dynamut2/' + infilename_dynamut2 +infile_mcsm_na = outdir + 'mcsm_na_results/' + infilename_mcsm_na +infile_mcsm_f_snps = outdir + infilename_mcsm_f_snps -infile_dssp = outdir + in_filename_dssp -infile_kd = outdir + in_filename_kd -infile_rd = outdir + in_filename_rd - -#infile_snpinfo = outdir + '/' + in_filename_snpinfo -infile_afor = outdir + '/' + in_filename_afor -#infile_afor_kin = outdir + '/' + in_filename_afor_kin - -print('\nInput path:', indir - , '\nOutput path:', outdir, '\n' - , '\nInput filename mcsm:', infile_mcsm - , '\nInput filename foldx:', infile_foldx, '\n' - , '\nInput filename deepddg', infile_deepddg , '\n' - , '\nInput filename dssp:', infile_dssp - , '\nInput filename kd:', infile_kd - , '\nInput filename rd', infile_rd - - #, '\nInput filename snp info:', infile_snpinfo, '\n' - , '\nInput filename af or:', infile_afor - #, '\nInput filename afor kinship:', infile_afor_kin - , '\n============================================================') +# read csv +mcsm_df = pd.read_csv(infile_mcsm, sep = ',') +foldx_df = pd.read_csv(infile_foldx , sep = ',') +deepddg_df = pd.read_csv(infile_deepddg, sep = ',') +dssp_df = pd.read_csv(infile_dssp, sep = ',') +kd_df = pd.read_csv(infile_kd, sep = ',') +rd_df = pd.read_csv(infile_rd, sep = ',') +afor_df = pd.read_csv(infile_afor, sep = ',') +dynamut_df = pd.read_csv(infile_dynamut, sep = ',') +dynamut2_df = pd.read_csv(infile_dynamut2, sep = ',') +mcsm_na_df = pd.read_csv(infile_mcsm_na, sep = ',') +mcsm_f_snps = pd.read_csv(infile_mcsm_f_snps, sep = ',', names = ['mutationinformation'], header = None) #======= # output #======= out_filename_comb = gene.lower() + '_all_params.csv' -outfile_comb = outdir + '/' + out_filename_comb +outfile_comb = outdir + out_filename_comb print('Output filename:', outfile_comb , '\n===================================================================') @@ -174,12 +165,101 @@ r_join = 'right' i_join = 'inner' # end of variable assignment for input and output files -#%%============================================================================ +#%%############################################################################ +#===================== +# some preprocessing +#===================== +#------------- +# FoldX +#------------- +foldx_df.shape +#======================= +# scale foldx values +#======================= + +# Rescale values in Foldx_change col b/w -1 and 1 so negative numbers +# stay neg and pos numbers stay positive +foldx_min = foldx_df['ddg'].min() +foldx_max = foldx_df['ddg'].max() +foldx_min +foldx_max + +foldx_scale = lambda x : x/abs(foldx_min) if x < 0 else (x/foldx_max if x >= 0 else 'failed') + +foldx_df['foldx_scaled'] = foldx_df['ddg'].apply(foldx_scale) +print('Raw foldx scores:\n', foldx_df['ddg'] + , '\n---------------------------------------------------------------' + , '\nScaled foldx scores:\n', foldx_df['foldx_scaled']) + +# additional check added +fsmi = foldx_df['foldx_scaled'].min() +fsma = foldx_df['foldx_scaled'].max() + +c = foldx_df[foldx_df['ddg']>=0].count() +foldx_pos = c.get(key = 'ddg') + +c2 = foldx_df[foldx_df['foldx_scaled']>=0].count() +foldx_pos2 = c2.get(key = 'foldx_scaled') + +if foldx_pos == foldx_pos2 and fsmi == -1 and fsma == 1: + print('\nPASS: Foldx values scaled correctly b/w -1 and 1') +else: + print('\nFAIL: Foldx values scaled numbers MISmatch' + , '\nExpected number:', foldx_pos + , '\nGot:', foldx_pos2 + , '\n======================================================') + +# rename ddg column to ddg_foldx +foldx_df['ddg'] +foldx_df = foldx_df.rename(columns = {'ddg':'ddg_foldx'}) +foldx_df['ddg_foldx'] + +#------------- +# Deepddg +#------------- +deepddg_df.shape + +#======================= +# scale Deepddg values +#======================= + +# Rescale values in deepddg_change col b/w -1 and 1 so negative numbers +# stay neg and pos numbers stay positive +deepddg_min = deepddg_df['deepddg'].min() +deepddg_max = deepddg_df['deepddg'].max() + +deepddg_scale = lambda x : x/abs(deepddg_min) if x < 0 else (x/deepddg_max if x >= 0 else 'failed') + +deepddg_df['deepddg_scaled'] = deepddg_df['deepddg'].apply(deepddg_scale) +print('Raw deepddg scores:\n', deepddg_df['deepddg'] + , '\n---------------------------------------------------------------' + , '\nScaled deepddg scores:\n', deepddg_df['deepddg_scaled']) + +# additional check added +dsmi = deepddg_df['deepddg_scaled'].min() +dsma = deepddg_df['deepddg_scaled'].max() + +c = deepddg_df[deepddg_df['deepddg']>=0].count() +deepddg_pos = c.get(key = 'deepddg') + +c2 = deepddg_df[deepddg_df['deepddg_scaled']>=0].count() +deepddg_pos2 = c2.get(key = 'deepddg_scaled') + +if deepddg_pos == deepddg_pos2 and dsmi == -1 and dsma == 1: + print('\nPASS: deepddg values scaled correctly b/w -1 and 1') +else: + print('\nFAIL: deepddg values scaled numbers MISmatch' + , '\nExpected number:', deepddg_pos + , '\nGot:', deepddg_pos2 + , '\n======================================================') +#%%============================================================================= +# Now merges begin +#%%============================================================================= print('===================================' , '\nFirst merge: mcsm + foldx' , '\n===================================') -mcsm_df = pd.read_csv(infile_mcsm, sep = ',') +mcsm_df.shape # add 3 lowercase aa code for wt and mutant get_aa_3lower(df = mcsm_df @@ -189,7 +269,7 @@ get_aa_3lower(df = mcsm_df , col_mut = 'mut_aa_3lower') #mcsm_df.columns = mcsm_df.columns.str.lower() -foldx_df = pd.read_csv(infile_foldx , sep = ',') +# foldx_df.shape #mcsm_foldx_dfs = combine_dfs_with_checks(mcsm_df, foldx_df, my_join = o_join) merging_cols_m1 = detect_common_cols(mcsm_df, foldx_df) @@ -205,8 +285,8 @@ print('===================================' , '\nSecond merge: mcsm_foldx_dfs + deepddg' , '\n===================================') -deepddg_df = pd.read_csv(infile_deepddg, sep = ',') -deepddg_df.columns +#deepddg_df = pd.read_csv(infile_deepddg, sep = ',') +#deepddg_df.columns # merge with mcsm_foldx_dfs and deepddg_df mcsm_foldx_deepddg_dfs = pd.merge(mcsm_foldx_dfs, deepddg_df, on = 'mutationinformation', how = l_join) @@ -218,9 +298,9 @@ print('===================================' , '\Third merge: dssp + kd' , '\n===================================') -dssp_df = pd.read_csv(infile_dssp, sep = ',') -kd_df = pd.read_csv(infile_kd, sep = ',') -rd_df = pd.read_csv(infile_rd, sep = ',') +dssp_df.shape +kd_df.shape +rd_df.shape #dssp_kd_dfs = combine_dfs_with_checks(dssp_df, kd_df, my_join = o_join) merging_cols_m2 = detect_common_cols(dssp_df, kd_df) @@ -308,8 +388,8 @@ print('\n=======================================' , '\ncombined_df_clean + afor_df ' , '\n=======================================') -afor_df = pd.read_csv(infile_afor, sep = ',') afor_cols = afor_df.columns +afor_df.shape # create a mapping from the gwas mutation column i.e _abcXXXrst #---------------------- @@ -360,16 +440,60 @@ else: sys.exit('\nFAIL: merge unsuccessful for af and or') #%%============================================================================ -# Output columns +# Output columns: when dynamut, dynamut2 and others weren't being combined out_filename_comb_afor = gene.lower() + '_comb_afor.csv' outfile_comb_afor = outdir + '/' + out_filename_comb_afor print('Output filename:', outfile_comb_afor , '\n===================================================================') -# write csv +# # write csv print('Writing file: combined stability and afor') combined_stab_afor.to_csv(outfile_comb_afor, index = False) print('\nFinished writing file:' , '\nNo. of rows:', combined_stab_afor.shape[0] , '\nNo. of cols:', combined_stab_afor.shape[1]) -#%% end of script +#%%============================================================================ +# combine dynamut, dynamut2, and mcsm_na +dfs_list = [dynamut_df, dynamut2_df, mcsm_na_df] + +dfs_merged = reduce(lambda left,right: pd.merge(left + , right + , on = ['mutationinformation'] + , how = 'inner') + , dfs_list) +# drop excess columns +drop_cols = detect_common_cols(dfs_merged, combined_stab_afor) +drop_cols.remove('mutationinformation') + +dfs_merged_clean = dfs_merged.drop(drop_cols, axis = 1) +merging_cols_m6 = detect_common_cols(dfs_merged_clean, combined_stab_afor) + +len(dfs_merged_clean.columns) +len(combined_stab_afor.columns) + +combined_all_params = pd.merge(combined_stab_afor + , dfs_merged_clean + , on = merging_cols_m6 + , how = i_join) + +expected_ncols = len(dfs_merged_clean.columns) + len(combined_stab_afor.columns) - len(merging_cols_m6) +expected_nrows = len(combined_stab_afor) + +if len(combined_all_params.columns) == expected_ncols and len(combined_all_params) == expected_nrows: + print('\nPASS: All dfs combined') +else: + print('\nFAIL:lengths mismatch' + , '\nExpected ncols:', expected_ncols + , '\nGot:', len(dfs_merged_clean.columns) + , '\nExpected nrows:', expected_nrows + , '\nGot:', len(dfs_merged_clean) ) + +#%% Done for gid on 10/09/2021 +# write csv +print('Writing file: all params') +combined_all_params.to_csv(outfile_comb, index = False) + +print('\nFinished writing file:' + , '\nNo. of rows:', combined_all_params.shape[0] + , '\nNo. of cols:', combined_all_params.shape[1]) +#%% end of script \ No newline at end of file diff --git a/scripts/plotting/get_plotting_dfs.R b/scripts/plotting/get_plotting_dfs.R index 89b477c..f1a7620 100755 --- a/scripts/plotting/get_plotting_dfs.R +++ b/scripts/plotting/get_plotting_dfs.R @@ -8,11 +8,11 @@ setwd("~/git/LSHTM_analysis/scripts/plotting") getwd() source("Header_TT.R") -source("../functions/my_pairs_panel.R") # with lower panel turned off -source("../functions/plotting_globals.R") -source("../functions/plotting_data.R") -source("../functions/combining_dfs_plotting.R") -source("../functions/bp_subcolours.R") +# source("../functions/my_pairs_panel.R") # with lower panel turned off +# source("../functions/plotting_globals.R") +# source("../functions/plotting_data.R") +# source("../functions/combining_dfs_plotting.R") +# source("../functions/bp_subcolours.R") #******************** # cmd args passed @@ -41,8 +41,8 @@ import_dirs(drug, gene) #--------------------------- if (!exists("infile_params") && exists("gene")){ #if (!is.character(infile_params) && exists("gene")){ # when running as cmd - #in_filename_params = paste0(tolower(gene), "_all_params.csv") #for pncA - in_filename_params = paste0(tolower(gene), "_comb_afor.csv") # part combined for gid + in_filename_params = paste0(tolower(gene), "_all_params.csv") #for pncA (and for gid finally) 10/09/21 + #in_filename_params = paste0(tolower(gene), "_comb_afor.csv") # part combined for gid infile_params = paste0(outdir, "/", in_filename_params) cat("\nInput file for mcsm comb data not specified, assuming filename: ", infile_params, "\n") } @@ -91,369 +91,139 @@ merged_df3 = all_plot_dfs[[2]] merged_df2_comp = all_plot_dfs[[3]] merged_df3_comp = all_plot_dfs[[4]] #====================================================================== -# read other files -infilename_dynamut = paste0("~/git/Data/", drug, "/output/dynamut_results/", gene - , "_complex_dynamut_norm.csv") +#TODO: Think! MOVE TO COMBINE or singular file for deepddg -infilename_dynamut2 = paste0("~/git/Data/", drug, "/output/dynamut_results/dynamut2/", gene - , "_complex_dynamut2_norm.csv") +#============================ +# adding deepddg scaled values +# scale data b/w -1 and 1 +#============================ +n = which(colnames(merged_df3) == "deepddg"); n -infilename_mcsm_na = paste0("~/git/Data/", drug, "/output/mcsm_na_results/", gene - , "_complex_mcsm_na_norm.csv") - -infilename_mcsm_f_snps <- paste0("~/git/Data/", drug, "/output/", gene - , "_mcsm_formatted_snps.csv") - -dynamut_df = read.csv(infilename_dynamut) -dynamut2_df = read.csv(infilename_dynamut2) -mcsm_na_df = read.csv(infilename_mcsm_na) -mcsm_f_snps = read.csv(infilename_mcsm_f_snps, header = F) -names(mcsm_f_snps) = "mutationinformation" +my_min = min(merged_df3[,n]); my_min +my_max = max(merged_df3[,n]); my_max -#################################################################### -# Data for subcols barplot (~heatmpa) -#################################################################### -# can include: mutation, or_kin, pwald, af_kin -cols_to_select = c("mutationinformation", "drtype" - , "wild_type" - , "position" - , "mutant_type" - , "chain", "ligand_id", "ligand_distance" - , "duet_stability_change", "duet_outcome", "duet_scaled" - , "ligand_affinity_change", "ligand_outcome", "affinity_scaled" - , "ddg_foldx", "foldx_scaled", "foldx_outcome" - , "deepddg", "deepddg_outcome" # comment out as not available for pnca - , "asa", "rsa", "rd_values", "kd_values" - , "af", "or_mychisq", "pval_fisher" - , "or_fisher", "or_logistic", "pval_logistic" - , "wt_prop_water", "mut_prop_water", "wt_prop_polarity", "mut_prop_polarity" - , "wt_calcprop", "mut_calcprop") +merged_df3$deepddg_scaled = ifelse(merged_df3[,n] < 0 + , merged_df3[,n]/abs(my_min) + , merged_df3[,n]/my_max) +# sanity check +my_min = min(merged_df3$deepddg_scaled); my_min +my_max = max(merged_df3$deepddg_scaled); my_max -#======================= -# Data for sub colours -# barplot: PS -#======================= - -cat("\nNo. of cols to select:", length(cols_to_select)) - -subcols_df_ps = merged_df3[, cols_to_select] - -cat("\nNo of unique positions for ps:" - , length(unique(subcols_df_ps$position))) - -# add count_pos col that counts the no. of nsSNPS at a position -setDT(subcols_df_ps)[, pos_count := .N, by = .(position)] - -# should be a factor -if (is.factor(subcols_df_ps$duet_outcome)){ - cat("\nDuet_outcome is factor") - table(subcols_df_ps$duet_outcome) +if (my_min == -1 && my_max == 1){ + cat("\nPASS: DeepDDG successfully scaled b/w -1 and 1" + #, "\nProceeding with assigning deep outcome category") + , "\n") }else{ - cat("\nConverting duet_outcome to factor") - subcols_df_ps$duet_outcome = as.factor(subcols_df_ps$duet_outcome) - table(subcols_df_ps$duet_outcome) + cat("\nFAIL: could not scale DeepDDG ddg values" + , "Aborting!") } -# should be -1 and 1 -min(subcols_df_ps$duet_scaled) -max(subcols_df_ps$duet_scaled) -tapply(subcols_df_ps$duet_scaled, subcols_df_ps$duet_outcome, min) -tapply(subcols_df_ps$duet_scaled, subcols_df_ps$duet_outcome, max) +#################################################################### +# Data for combining other dfs +#################################################################### -# check unique values in normalised data -cat("\nNo. of unique values in duet scaled, no rounding:" - , length(unique(subcols_df_ps$duet_scaled))) +source("other_dfs_data.R") -# No rounding -my_grp = subcols_df_ps$duet_scaled; length(my_grp) +#################################################################### +# Data for subcols barplot (~heatmap) +#################################################################### -# Add rounding is to be used -n = 3 -subcols_df_ps$duet_scaledR = round(subcols_df_ps$duet_scaled, n) - -cat("\nNo. of unique values in duet scaled", n, "places rounding:" - , length(unique(subcols_df_ps$duet_scaledR))) - -my_grp_r = subcols_df_ps$duet_scaledR # rounding - -# Add grp cols -subcols_df_ps$group <- paste0(subcols_df_ps$duet_outcome, "_", my_grp, sep = "") -subcols_df_ps$groupR <- paste0(subcols_df_ps$duet_outcome, "_", my_grp_r, sep = "") - -# Call the function to create the palette based on the group defined above -subcols_ps <- ColourPalleteMulti(subcols_df_ps, "duet_outcome", "my_grp") -subcolsR_ps <- ColourPalleteMulti(subcols_df_ps, "duet_outcome", "my_grp_r") - -print(paste0("Colour palette generated for my_grp: ", length(subcols_ps), " colours")) -print(paste0("Colour palette generated for my_grp_r: ", length(subcolsR_ps), " colours")) +source("coloured_bp_data.R") #################################################################### # Data for logoplots #################################################################### -#------------------------- -# choose df for logoplot -#------------------------- -logo_data = merged_df3 -#logo_data = merged_df3_comp -# quick checks -colnames(logo_data) -str(logo_data) +source("logo_data.R") -c1 = unique(logo_data$position) -nrow(logo_data) -cat("No. of rows in my_data:", nrow(logo_data) - , "\nDistinct positions corresponding to snps:", length(c1) - , "\n===========================================================") -#======================================================================= -#================== -# logo data: OR -#================== -foo = logo_data[, c("position" - , "mutant_type","duet_scaled", "or_mychisq" - , "mut_prop_polarity", "mut_prop_water")] +s1 = c("\nSuccessfully sourced logo_data.R") +cat(s1) -logo_data$log10or = log10(logo_data$or_mychisq) -logo_data_plot = logo_data[, c("position" - , "mutant_type", "or_mychisq", "log10or")] - -logo_data_plot_or = logo_data[, c("position", "mutant_type", "or_mychisq")] -wide_df_or <- logo_data_plot_or %>% spread(position, or_mychisq, fill = 0.0) - -wide_df_or = as.matrix(wide_df_or) -rownames(wide_df_or) = wide_df_or[,1] -dim(wide_df_or) -wide_df_or = wide_df_or[,-1] -str(wide_df_or) - -position_or = as.numeric(colnames(wide_df_or)) - -#================== -# logo data: logOR -#================== -logo_data_plot_logor = logo_data[, c("position", "mutant_type", "log10or")] -wide_df_logor <- logo_data_plot_logor %>% spread(position, log10or, fill = 0.0) - -wide_df_logor = as.matrix(wide_df_logor) - -rownames(wide_df_logor) = wide_df_logor[,1] -wide_df_logor = subset(wide_df_logor, select = -c(1) ) -colnames(wide_df_logor) -wide_df_logor_m = data.matrix(wide_df_logor) - -rownames(wide_df_logor_m) -colnames(wide_df_logor_m) - -position_logor = as.numeric(colnames(wide_df_logor_m)) - -#=============================== -# logo data: multiple nsSNPs (>1) -#================================= -#require(data.table) - -# get freq count of positions so you can subset freq<1 -setDT(logo_data)[, mut_pos_occurrence := .N, by = .(position)] - -table(logo_data$position) -table(logo_data$mut_pos_occurrence) - -max_mut = max(table(logo_data$position)) - -# extract freq_pos > 1 -my_data_snp = logo_data[logo_data$mut_pos_occurrence!=1,] -u = unique(my_data_snp$position) -max_mult_mut = max(table(my_data_snp$position)) - -if (nrow(my_data_snp) == nrow(logo_data) - table(logo_data$mut_pos_occurrence)[[1]] ){ - - cat("PASS: positions with multiple muts extracted" - , "\nNo. of mutations:", nrow(my_data_snp) - , "\nNo. of positions:", length(u) - , "\nMax no. of muts at any position", max_mult_mut) -}else{ - cat("FAIL: positions with multiple muts could NOT be extracted" - , "\nExpected:",nrow(logo_data) - table(logo_data$mut_pos_occurrence)[[1]] - , "\nGot:", nrow(my_data_snp) ) -} - -cat("\nNo. of sites with only 1 mutations:", table(logo_data$mut_pos_occurrence)[[1]]) - -#-------------------------------------- -# matrix for_mychisq mutant type -# frequency of mutant type by position -#--------------------------------------- -table(my_data_snp$mutant_type, my_data_snp$position) -tab_mt = table(my_data_snp$mutant_type, my_data_snp$position) -class(tab_mt) - -# unclass to convert to matrix -tab_mt = unclass(tab_mt) -tab_mt = as.matrix(tab_mt, rownames = T) - -# should be TRUE -is.matrix(tab_mt) - -rownames(tab_mt) #aa -colnames(tab_mt) #pos - -#------------------------------------- -# matrix for wild type -# frequency of wild type by position -#------------------------------------- -tab_wt = table(my_data_snp$wild_type, my_data_snp$position); tab_wt -tab_wt = unclass(tab_wt) - -# remove wt duplicates -wt = my_data_snp[, c("position", "wild_type")] -wt = wt[!duplicated(wt),] - -tab_wt = table(wt$wild_type, wt$position); tab_wt # should all be 1 - -rownames(tab_wt) -rownames(tab_wt) - -identical(colnames(tab_mt), colnames(tab_wt)) -identical(ncol(tab_mt), ncol(tab_wt)) - -#---------------------------------- -# logo data OR: multiple nsSNPs (>1) -#---------------------------------- -logo_data_or_mult = my_data_snp[, c("position", "mutant_type", "or_mychisq")] -#wide_df_or <- logo_data_or %>% spread(position, or_mychisq, fill = 0.0) -wide_df_or_mult <- logo_data_or_mult %>% spread(position, or_mychisq, fill = NA) - -wide_df_or_mult = as.matrix(wide_df_or_mult) -rownames(wide_df_or_mult) = wide_df_or_mult[,1] -wide_df_or_mult = wide_df_or_mult[,-1] -str(wide_df_or_mult) - -position_or_mult = as.numeric(colnames(wide_df_or_mult)) - -#################################################################### -# Data for Corrplots -#################################################################### -cat("\n==========================================" - , "\nCORR PLOTS data: PS" - , "\n===========================================") - -df_ps = merged_df2 - -#-------------------- -# adding log cols : NEW UNCOMMENT -#-------------------- -#df_ps$log10_or_mychisq = log10(df_ps$or_mychisq) -#df_ps$neglog_pval_fisher = -log10(df_ps$pval_fisher) - -##df_ps$log10_or_kin = log10(df_ps$or_kin) -##df_ps$neglog_pwald_kin = -log10(df_ps$pwald_kin) - -#df_ps$mutation_info_labels = ifelse(df_ps$mutation_info == dr_muts_col, 1, 0) - -#---------------------------- -# columns for corr plots:PS -#---------------------------- -# subset data to generate pairwise correlations -cols_to_select = c("mutationinformation" - , "duet_scaled" - , "foldx_scaled" - #, "mutation_info_labels" - , "asa" - , "rsa" - , "rd_values" - , "kd_values" - , "log10_or_mychisq" - , "neglog_pval_fisher" - ##, "or_kin" - ##, "neglog_pwald_kin" - , "af" - ##, "af_kin" - , "duet_outcome" - , drug) - -corr_data_ps = df_ps[cols_to_select] - -dim(corr_data_ps) - -#-------------------------------------- -# assign nice colnames (for display) -#-------------------------------------- -my_corr_colnames = c("Mutation" - , "DUET" - , "FoldX" - #, "Mutation class" - , "ASA" - , "RSA" - , "RD" - , "KD" - , "Log (OR)" - , "-Log (P)" - ##, "Adjusted (OR)" - ##, "-Log (P wald)" - , "MAF" - ##, "AF_kin" - , "duet_outcome" - , drug) - -length(my_corr_colnames) - -colnames(corr_data_ps) -colnames(corr_data_ps) <- my_corr_colnames -colnames(corr_data_ps) - -start = 1 -end = which(colnames(corr_data_ps) == drug); end # should be the last column -offset = 1 - -#=========================== -# Corr data for plots: PS -# big_df ps: ~ merged_df2 -#=========================== - -#corr_ps_df2 = corr_data_ps[start:(end-offset)] # without drug -corr_ps_df2 = corr_data_ps[start:end] -head(corr_ps_df2) - -#=========================== -# Corr data for plots: PS -# short_df ps: ~merged_df3 -#=========================== -corr_ps_df3 = corr_ps_df2[!duplicated(corr_ps_df2$Mutation),] - -na_or = sum(is.na(corr_ps_df3$`Log (OR)`)) -check1 = nrow(corr_ps_df3) - na_or - -##na_adj_or = sum(is.na(corr_ps_df3$`adjusted (OR)`)) -##check2 = nrow(corr_ps_df3) - na_adj_or - -if (nrow(corr_ps_df3) == nrow(merged_df3) && nrow(merged_df3_comp) == check1) { - cat( "\nPASS: No. of rows for corr_ps_df3 match" - , "\nPASS: No. of OR values checked: " , check1) -} else { - cat("\nFAIL: Numbers mismatch:" - , "\nExpected nrows: ", nrow(merged_df3) - , "\nGot: ", nrow(corr_ps_df3) - , "\nExpected OR values: ", nrow(merged_df3_comp) - , "\nGot: ", check1) -} - -rm(foo) #################################################################### # Data for DM OM Plots: Long format dfs #################################################################### -source("other_plots_data.R") +#source("other_plots_data.R") + +source("dm_om_data.R") + +s2 = c("\nSuccessfully sourced other_plots_data.R") +cat(s2) #################################################################### # Data for Lineage barplots: WF and LF dfs #################################################################### -source("lineage_bp_data.R") +source("lineage_data.R") + +s3 = c("\nSuccessfully sourced lineage_data.R") +cat(s3) + +#################################################################### +# Data for corr plots: +#################################################################### +# make sure the above script works because merged_df2_combined is needed +source("corr_data.R") + +s4 = c("\nSuccessfully sourced corr_data.R") +cat(s4) ######################################################################## # End of script ######################################################################## +if ( all( length(s1), length(s2), length(s3), length(s4) ) >0 ){ + cat( + "\n##################################################" + , "\nSuccessful: get_plotting_dfs.R worked!" + , "\n###################################################\n") +} else { + cat( + "\n#################################################" + , "\nFAIL: get_plotting_dfs.R didn't complete fully!Please check" + , "\n###################################################\n" ) + } + +######################################################################## +# clear excess variables +rm(c1, c2, c3, c4, check1 + , curr_count, curr_total + , cols_check + , cols_to_select + , cols_to_select_deepddg + , cols_to_select_duet + , cols_to_select_dynamut + , cols_to_select_dynamut2 + , cols_to_select_encomddg + , cols_to_select_encomdds + , cols_to_select_mcsm + , cols_to_select_mcsm_na + , cols_to_select_sdm + , infile_metadata + , infile_params + #, infilename_dynamut + #, infilename_dynamut2 + #, infilename_mcsm_f_snps + #, infilename_mcsm_na + ) -cat("\n######################################################\n" - , "\nSuccessful: get_plotting_dfs.R worked!" - , "\n###################################################\n") +rm(pivot_cols +, pivot_cols_deepddg +, pivot_cols_duet +, pivot_cols_dynamut +, pivot_cols_dynamut2 +, pivot_cols_encomddg +, pivot_cols_encomdds +, pivot_cols_foldx +, pivot_cols_mcsm +, pivot_cols_mcsm_na +, pivot_cols_n +, pivot_cols_sdm) + +rm(expected_cols +, expected_ncols +, expected_rows +, expected_rows_lf +, fact_cols) + + diff --git a/scripts/plotting/lineage_data.R b/scripts/plotting/lineage_data.R index 29a6348..9549863 100755 --- a/scripts/plotting/lineage_data.R +++ b/scripts/plotting/lineage_data.R @@ -4,21 +4,10 @@ # WF and LF data with lineage sample, and snp counts # sourced by get_plotting_dfs.R ######################################################### -# working dir and loading libraries -# getwd() -# setwd("~/git/LSHTM_analysis/scripts/plotting") -# getwd() -# make cmd -# globals -# drug = "streptomycin" -# gene = "gid" - -# source("get_plotting_dfs.R") -#======================================================================= -################################################# +#================================================= # Get data with lineage count, and snp diversity -################################################# +#================================================= table(merged_df2$lineage) if (table(merged_df2$lineage == "")[[2]]) { @@ -30,12 +19,12 @@ cat("\nMissing samples with lineage classification:", table(merged_df2$lineage = table(merged_df2$lineage_labels) class(merged_df2$lineage_labels); nlevels(merged_df2$lineage_labels) -################################## +#========================================== # WF data: lineages with # snp count # total_samples # snp diversity (perc) -################################## +#========================================== sel_lineages = levels(merged_df2$lineage_labels) lin_wf = data.frame(sel_lineages) #4, 1 @@ -67,9 +56,9 @@ lin_wf lin_wf$snp_diversity = lin_wf$num_snps_u/lin_wf$total_samples lin_wf -#===================== +#---------------------- # Add some formatting -#===================== +#---------------------- # SNP diversity lin_wf$snp_diversity_f = round( (lin_wf$snp_diversity * 100), digits = 0) lin_wf$snp_diversity_f = paste0(lin_wf$snp_diversity_f, "%") @@ -100,12 +89,12 @@ lin_wf$sel_lineages = factor(lin_wf$sel_lineages, c("L1" levels(lin_wf$sel_lineages) -################################## +#================================= # LF data: lineages with # snp count # total_samples # snp diversity (perc) -################################## +#================================= names(lin_wf) tot_cols = ncol(lin_wf) pivot_cols = c("sel_lineages", "snp_diversity", "snp_diversity_f") @@ -153,3 +142,6 @@ lin_lf$sel_lineages = factor(lin_lf$sel_lineages, c("L1" , "")) levels(lin_lf$sel_lineages) + +################################################################ + diff --git a/scripts/plotting/lineage_dist_plots.R b/scripts/plotting/lineage_dist_plots.R index a425f37..cd1563d 100644 --- a/scripts/plotting/lineage_dist_plots.R +++ b/scripts/plotting/lineage_dist_plots.R @@ -16,9 +16,9 @@ source("Header_TT.R") # also loads all my functions #=========== # input #=========== -#drug = "streptomycin" -#gene = "gid" -source("get_plotting_dfs.R") +drug = "streptomycin" +gene = "gid" +#source("get_plotting_dfs.R") spec = matrix(c( "drug" , "d", 1, "character", @@ -47,7 +47,7 @@ plot_lineage_dist_dm_om_ps = paste0(plotdir,"/", lineage_dist_dm_om_ps) ########################### # Data for plots -# you need merged_df2 or merged_df2_comp +# you need merged_df2_combined or merged_df2_combined_comp # since this is one-many relationship # i.e the same SNP can belong to multiple lineages # using the _comp dataset means @@ -59,10 +59,12 @@ plot_lineage_dist_dm_om_ps = paste0(plotdir,"/", lineage_dist_dm_om_ps) # Data for plots #=================== # quick checks -table(merged_df2$mutation_info_labels); levels(merged_df2$lineage_labels) -table(merged_df2$lineage_labels); levels(merged_df2$mutation_info_labels) +table(merged_df2_combined$mutation_info_labels); levels(merged_df2_combined$lineage_labels) +table(merged_df2_combined$lineage_labels); levels(merged_df2_combined$mutation_info_labels) -lin_dist_plot = merged_df2[merged_df2$lineage_labels%in%c("L1", "L2", "L3", "L4"),] +sel_lineages = c("L1", "L2", "L3", "L4") + +lin_dist_plot = merged_df2_combined[merged_df2_combined$lineage_labels%in%sel_lineages,] table(lin_dist_plot$lineage_labels); nlevels(lin_dist_plot$lineage_labels) # refactor @@ -79,29 +81,55 @@ table(lin_dist_plot$lineage_labels)#{RESULT: No of samples within lineage} length(unique(lin_dist_plot$mutationinformation))#{Result: No. of unique mutations selected lineages contribute to} length(lin_dist_plot$mutationinformation) -u2 = unique(merged_df2$mutationinformation) +u2 = unique(merged_df2_combined$mutationinformation) u = unique(lin_dist_plot$mutationinformation) check = u2[!u2%in%u]; print(check) #{Muts not present within selected lineages} #----------------------------------------------------------------------- -# without facet + +my_x_and_t = c("duet_scaled", "mCSM-DUET") +my_x_and_t = c("foldx_scaled", "FoldX") +#my_x_and_t = c("deepddg_scaled", "DeepDDG") + +my_x_and_t = c("ddg_dynamut2_scaled", "Dynamut2") +my_x_and_t = c("ddg_dynamut_scaled", "Dynamut") + +my_x_and_t = c("ddg_mcsm_scaled", "mCSM") +my_x_and_t = c("ddg_sdm_scaled", "SDM") +my_x_and_t = c("ddg_duet_scaled", "DUET-d") + +my_x_and_t = c("ddg_encom_scaled", "EnCOM-Stability") +my_x_and_t = c("dds_encom_scaled", "EnCOM-Flexibility") + +my_x_and_t = c("mcsm_na_scaled", "mCSM-NA") + +# TO DO +my_x_and_t = c("affinity_scaled", "mCSM-Lig") #ligdist< 10 + +#===================== +# Plot: without facet +#===================== + linP_dm_om = lineage_distP(lin_dist_plot - , with_facet = F - , x_axis = "deepddg" + , x_axis = my_x_and_t[1] + , x_lab = my_x_and_t[2] , y_axis = "lineage_labels" - , x_lab = "DeepDDG" , leg_label = "Mutation Class" -) + , with_facet = F) linP_dm_om -# with facet +#===================== +# Plot: with facet +#===================== + linP_dm_om_facet = lineage_distP(lin_dist_plot - , with_facet = T - , facet_wrap_var = "mutation_info_labels" - , leg_label = "Mutation Class" - , leg_pos_wf = "none" - , leg_dir_wf = "horizontal" - -) + , x_axis = my_x_and_t[1] + , x_lab = my_x_and_t[2] + , y_axis = "lineage_labels" + , with_facet = T + , facet_wrap_var = "mutation_info_labels" + , leg_label = "Mutation Class" + , leg_pos_wf = "none" + , leg_dir_wf = "horizontal") linP_dm_om_facet #================= @@ -109,6 +137,7 @@ linP_dm_om_facet # without facet #================= svg(plot_lineage_dist_dm_om_ps) + linP_dm_om dev.off() diff --git a/scripts/plotting/other_plots_data.R b/scripts/plotting/other_plots_data.R deleted file mode 100755 index a55303b..0000000 --- a/scripts/plotting/other_plots_data.R +++ /dev/null @@ -1,538 +0,0 @@ -#!/usr/bin/env Rscript -######################################################### -# TASK: Script to format data for dm om plots: -# generating LF data -# sourced by get_plotting_dfs.R -######################################################### -# working dir and loading libraries -# getwd() -# setwd("~/git/LSHTM_analysis/scripts/plotting") -# getwd() - -# make cmd -# globals -# drug = "streptomycin" -# gene = "gid" - -# source("get_plotting_dfs.R") -#======================================================================= -# MOVE TO COMBINE or singular file for deepddg -# -# cols_to_select = c("mutation", "mutationinformation" -# , "wild_type", "position", "mutant_type" -# , "mutation_info") -# -# merged_df3_short = merged_df3[, cols_to_select] - -# infilename_mcsm_f_snps <- paste0("~/git/Data/", drug, "/output/", gene -# , "_mcsm_formatted_snps.csv") -# -# mcsm_f_snps<- read.csv(infilename_mcsm_f_snps, header = F) -# names(mcsm_f_snps) <- "mutationinformation" - -# write merged_df3 to generate structural figure on chimera -#write.csv(merged_df3_short, "merged_df3_short.csv") -#======================================================================== -# MOVE TO COMBINE or singular file for deepddg - -#============================ -# adding deepddg scaled values -# scale data b/w -1 and 1 -#============================ -n = which(colnames(merged_df3) == "deepddg"); n - -my_min = min(merged_df3[,n]); my_min -my_max = max(merged_df3[,n]); my_max - -merged_df3$deepddg_scaled = ifelse(merged_df3[,n] < 0 - , merged_df3[,n]/abs(my_min) - , merged_df3[,n]/my_max) -# sanity check -my_min = min(merged_df3$deepddg_scaled); my_min -my_max = max(merged_df3$deepddg_scaled); my_max - -if (my_min == -1 && my_max == 1){ - cat("\nPASS: DeepDDG successfully scaled b/w -1 and 1" - #, "\nProceeding with assigning deep outcome category") - , "\n") -}else{ - cat("\nFAIL: could not scale DeepDDG ddg values" - , "Aborting!") -} - -#======================================================================== -# cols to select - -cols_mcsm_df <- merged_df3[, c("mutationinformation", "mutation" - , "mutation_info", "position" - , LigDist_colname - , "duet_stability_change", "duet_scaled", "duet_outcome" - , "ligand_affinity_change", "affinity_scaled", "ligand_outcome" - , "ddg_foldx", "foldx_scaled", "foldx_outcome" - , "deepddg", "deepddg_scaled", "deepddg_outcome" - , "asa", "rsa" - , "rd_values", "kd_values" - , "log10_or_mychisq", "neglog_pval_fisher", "af")] - -cols_mcsm_na_df <- mcsm_na_df[, c("mutationinformation" - , "mcsm_na_affinity", "mcsm_na_scaled" - , "mcsm_na_outcome")] -# entire dynamut_df - -cols_dynamut2_df <- dynamut2_df[, c("mutationinformation" - , "ddg_dynamut2", "ddg_dynamut2_scaled" - , "ddg_dynamut2_outcome")] - -n_comb_cols = length(cols_mcsm_df) + length(cols_mcsm_na_df) + - length(dynamut_df) + length(cols_dynamut2_df); n_comb_cols - -i1<- intersect(names(cols_mcsm_df), names(cols_mcsm_na_df)) -i2<- intersect(names(dynamut_df), names(cols_dynamut2_df)) -merging_cols <- intersect(i1, i2) -cat("\nmerging_cols:", merging_cols) - -if (merging_cols == "mutationinformation") { - cat("\nStage 1: Found common col between dfs, checking values in it...") - c1 <- all(mcsm_f_snps[[merging_cols]]%in%cols_mcsm_df[[merging_cols]]) - c2 <- all(mcsm_f_snps[[merging_cols]]%in%cols_mcsm_na_df[[merging_cols]]) - c3 <- all(mcsm_f_snps[[merging_cols]]%in%dynamut_df[[merging_cols]]) - c4 <- all(mcsm_f_snps[[merging_cols]]%in%cols_dynamut2_df[[merging_cols]]) - cols_check <- c(c1, c2, c3, c4) - expected_cols = n_comb_cols - ( length(cols_check) - 1) - if (all(cols_check)){ - cat("\nStage 2: Proceeding with merging dfs:\n") - comb_df <- Reduce(inner_join, list(cols_mcsm_df - , cols_mcsm_na_df - , dynamut_df - , cols_dynamut2_df)) - comb_df_s = arrange(comb_df, position) - - # if ( nrow(comb_df_s) == nrow(mcsm_f_snps) && ncol(comb_df_s) == expected_cols) { - # cat("\Stage3, PASS: dfs merged sucessfully" - # , "\nnrow of merged_df: ", nrow(comb_df_s) - # , "\nncol of merged_df:", ncol(comb_df_s)) - # } - - } -} -#names(comb_df_s) -cat("\n!!!IT GOT TO HERE!!!!") -#======================================================================= -fact_cols = colnames(comb_df_s)[grepl( "_outcome|_info", colnames(comb_df_s) )] -fact_cols -lapply(comb_df_s[, fact_cols], class) -comb_df_s[, fact_cols] <- lapply(comb_df_s[, fact_cols], as.factor) - -if (any(lapply(comb_df_s[, fact_cols], class) == "character")){ - cat("\nChanging cols to factor") - comb_df_s[, fact_cols] <- lapply(comb_df_s[, fact_cols],as.factor) - if (all(lapply(comb_df_s[, fact_cols], class) == "factor")){ - cat("\nSuccessful: cols changed to factor") - } -} -lapply(comb_df_s[, fact_cols], class) - -#======================================================================= -table(comb_df_s$mutation_info) - - # further checks to make sure dr and other muts are indeed unique -dr_muts = comb_df_s[comb_df_s$mutation_info == dr_muts_col,] -dr_muts_names = unique(dr_muts$mutation) - -other_muts = comb_df_s[comb_df_s$mutation_info == other_muts_col,] -other_muts_names = unique(other_muts$mutation) - -if ( table(dr_muts_names%in%other_muts_names)[[1]] == length(dr_muts_names) && - table(other_muts_names%in%dr_muts_names)[[1]] == length(other_muts_names) ){ - cat("PASS: dr and other muts are indeed unique") -}else{ - cat("FAIL: dr and others muts are NOT unique!") - quit() -} - -# pretty display names i.e. labels to reduce major code duplication later -foo_cnames = data.frame(colnames(comb_df_s)) -names(foo_cnames) <- "old_name" - -stability_suffix <- paste0(delta_symbol, delta_symbol, "G") -flexibility_suffix <- paste0(delta_symbol, delta_symbol, "S") - -lig_dn = paste0("Ligand distance (", angstroms_symbol, ")"); lig_dn -duet_dn = paste0("DUET ", stability_suffix); duet_dn -foldx_dn = paste0("FoldX ", stability_suffix); foldx_dn -deepddg_dn = paste0("Deepddg " , stability_suffix); deepddg_dn -mcsm_na_dn = paste0("mCSM-NA affinity ", stability_suffix); mcsm_na_dn -dynamut_dn = paste0("Dynamut ", stability_suffix); dynamut_dn -dynamut2_dn = paste0("Dynamut2 " , stability_suffix); dynamut2_dn -encom_ddg_dn = paste0("EnCOM " , stability_suffix); encom_ddg_dn -encom_dds_dn = paste0("EnCOM " , flexibility_suffix ); encom_dds_dn -sdm_dn = paste0("SDM " , stability_suffix); sdm_dn -mcsm_dn = paste0("mCSM " , stability_suffix ); mcsm_dn - -# Change colnames of some columns using datatable -comb_df_sl = comb_df_s -names(comb_df_sl) - -setnames(comb_df_sl - , old = c("asa", "rsa", "rd_values", "kd_values" - , "log10_or_mychisq", "neglog_pval_fisher", "af" - , LigDist_colname - , "duet_scaled" - , "foldx_scaled" - , "deepddg_scaled" - , "mcsm_na_scaled" - , "ddg_dynamut_scaled" - , "ddg_dynamut2_scaled" - , "ddg_encom_scaled" - , "dds_encom_scaled" - , "ddg_sdm" - , "ddg_mcsm") - - , new = c("ASA", "RSA", "RD", "KD" - , "Log10 (OR)", "-Log (P)", "MAF" - , lig_dn - , duet_dn - , foldx_dn - , deepddg_dn - , mcsm_na_dn - , dynamut_dn - , dynamut2_dn - , encom_ddg_dn - , encom_dds_dn - , sdm_dn - , mcsm_dn) - ) - -foo_cnames <- cbind(foo_cnames, colnames(comb_df_sl)) - -# some more pretty labels -table(comb_df_sl$mutation_info) - -levels(comb_df_sl$mutation_info)[levels(comb_df_sl$mutation_info)==dr_muts_col] <- "DM" -levels(comb_df_sl$mutation_info)[levels(comb_df_sl$mutation_info)==other_muts_col] <- "OM" - -table(comb_df_sl$mutation_info) - -####################################################################### -#====================== -# Selecting dfs -# with appropriate cols -#======================= -static_cols_start = c("mutationinformation" - , "position" - , "mutation" - , "mutation_info") - -static_cols_end = c(lig_dn - , "ASA" - , "RSA" - , "RD" - , "KD") - -# ordering is important! - -######################################################################### -#============== -# DUET: LF -#============== -cols_to_select_duet = c(static_cols_start, c("duet_outcome", duet_dn), static_cols_end) -wf_duet = comb_df_sl[, cols_to_select_duet] - -#pivot_cols_ps = cols_to_select_ps[1:5]; pivot_cols_ps -pivot_cols_duet = cols_to_select_duet[1: (length(static_cols_start) + 1)]; pivot_cols_duet - -expected_rows_lf = nrow(wf_duet) * (length(wf_duet) - length(pivot_cols_duet)) -expected_rows_lf - -# LF data: duet -lf_duet = gather(wf_duet - , key = param_type - , value = param_value - , all_of(duet_dn):tail(static_cols_end,1) - , factor_key = TRUE) - -if (nrow(lf_duet) == expected_rows_lf){ - cat("\nPASS: long format data created for ", duet_dn) -}else{ - cat("\nFAIL: long format data could not be created for duet") - quit() -} - -############################################################################ -#============== -# FoldX: LF -#============== -cols_to_select_foldx= c(static_cols_start, c("foldx_outcome", foldx_dn), static_cols_end) -wf_foldx = comb_df_sl[, cols_to_select_foldx] - -pivot_cols_foldx = cols_to_select_foldx[1: (length(static_cols_start) + 1)]; pivot_cols_foldx - -expected_rows_lf = nrow(wf_foldx) * (length(wf_foldx) - length(pivot_cols_foldx)) -expected_rows_lf - -# LF data: duet -print("TESTXXXXXXXXXXXXXXXXXXXXX---------------------->>>>") -lf_foldx <<- gather(wf_foldx - , key = param_type - , value = param_value - , all_of(foldx_dn):tail(static_cols_end,1) - , factor_key = TRUE) - -if (nrow(lf_foldx) == expected_rows_lf){ - cat("\nPASS: long format data created for ", foldx_dn) -}else{ - cat("\nFAIL: long format data could not be created for duet") - quit() -} - -############################################################################ -#============== -# Deepddg: LF -#============== -cols_to_select_deepddg = c(static_cols_start, c("deepddg_outcome", deepddg_dn), static_cols_end) -wf_deepddg = comb_df_sl[, cols_to_select_deepddg] - -pivot_cols_deepddg = cols_to_select_deepddg[1: (length(static_cols_start) + 1)]; pivot_cols_deepddg - -expected_rows_lf = nrow(wf_deepddg) * (length(wf_deepddg) - length(pivot_cols_deepddg)) -expected_rows_lf - -# LF data: duet -lf_deepddg = gather(wf_deepddg - , key = param_type - , value = param_value - , all_of(deepddg_dn):tail(static_cols_end,1) - , factor_key = TRUE) - -if (nrow(lf_deepddg) == expected_rows_lf){ - cat("\nPASS: long format data created for ", deepddg_dn) -}else{ - cat("\nFAIL: long format data could not be created for duet") - quit() -} - -############################################################################ -#============== -# mCSM-NA: LF -#============== -cols_to_select_mcsm_na = c(static_cols_start, c("mcsm_na_outcome", mcsm_na_dn), static_cols_end) -wf_mcsm_na = comb_df_sl[, cols_to_select_mcsm_na] - -pivot_cols_mcsm_na = cols_to_select_mcsm_na[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm_na - -expected_rows_lf = nrow(wf_mcsm_na) * (length(wf_mcsm_na) - length(pivot_cols_mcsm_na)) -expected_rows_lf - -# LF data: duet -lf_mcsm_na = gather(wf_mcsm_na - , key = param_type - , value = param_value - , all_of(mcsm_na_dn):tail(static_cols_end,1) - , factor_key = TRUE) - -if (nrow(lf_mcsm_na) == expected_rows_lf){ - cat("\nPASS: long format data created for ", mcsm_na_dn) -}else{ - cat("\nFAIL: long format data could not be created for duet") - quit() -} - -############################################################################ -#============== -# Dynamut: LF -#============== -cols_to_select_dynamut = c(static_cols_start, c("ddg_dynamut_outcome", dynamut_dn), static_cols_end) -wf_dynamut = comb_df_sl[, cols_to_select_dynamut] - -pivot_cols_dynamut = cols_to_select_dynamut[1: (length(static_cols_start) + 1)]; pivot_cols_dynamut - -expected_rows_lf = nrow(wf_dynamut) * (length(wf_dynamut) - length(pivot_cols_dynamut)) -expected_rows_lf - -# LF data: duet -lf_dynamut = gather(wf_dynamut - , key = param_type - , value = param_value - , all_of(dynamut_dn):tail(static_cols_end,1) - , factor_key = TRUE) - -if (nrow(lf_dynamut) == expected_rows_lf){ - cat("\nPASS: long format data created for ", dynamut_dn) -}else{ - cat("\nFAIL: long format data could not be created for duet") - quit() -} - -############################################################################ -#============== -# Dynamut2: LF -#============== -cols_to_select_dynamut2 = c(static_cols_start, c("ddg_dynamut2_outcome", dynamut2_dn), static_cols_end) - -wf_dynamut2 = comb_df_sl[, cols_to_select_dynamut2] - -pivot_cols_dynamut2 = cols_to_select_dynamut2[1: (length(static_cols_start) + 1)]; pivot_cols_dynamut2 - -expected_rows_lf = nrow(wf_dynamut2) * (length(wf_dynamut2) - length(pivot_cols_dynamut2)) -expected_rows_lf - -# LF data: duet -lf_dynamut2 = gather(wf_dynamut2 - , key = param_type - , value = param_value - , all_of(dynamut2_dn):tail(static_cols_end,1) - , factor_key = TRUE) - -if (nrow(lf_dynamut2) == expected_rows_lf){ - cat("\nPASS: long format data created for ", dynamut2_dn) -}else{ - cat("\nFAIL: long format data could not be created for duet") - quit() -} - -############################################################################ -#============== -# EnCOM ddg: LF -#============== -cols_to_select_encomddg = c(static_cols_start, c("ddg_encom_outcome", encom_ddg_dn), static_cols_end) -wf_encomddg = comb_df_sl[, cols_to_select_encomddg] - -pivot_cols_encomddg = cols_to_select_encomddg[1: (length(static_cols_start) + 1)]; pivot_cols_encomddg - -expected_rows_lf = nrow(wf_encomddg ) * (length(wf_encomddg ) - length(pivot_cols_encomddg)) -expected_rows_lf - -# LF data: encomddg -lf_encomddg = gather(wf_encomddg - , key = param_type - , value = param_value - , all_of(encom_ddg_dn):tail(static_cols_end,1) - , factor_key = TRUE) - -if (nrow(lf_encomddg) == expected_rows_lf){ - cat("\nPASS: long format data created for ", encom_ddg_dn) -}else{ - cat("\nFAIL: long format data could not be created for duet") - quit() -} -############################################################################ -#============== -# EnCOM dds: LF -#============== -cols_to_select_encomdds = c(static_cols_start, c("dds_encom_outcome", encom_dds_dn), static_cols_end) -wf_encomdds = comb_df_sl[, cols_to_select_encomdds] - -pivot_cols_encomdds = cols_to_select_encomdds[1: (length(static_cols_start) + 1)]; pivot_cols_encomdds - -expected_rows_lf = nrow(wf_encomdds) * (length(wf_encomdds) - length(pivot_cols_encomdds)) -expected_rows_lf - -# LF data: encomddg -lf_encomdds = gather(wf_encomdds - , key = param_type - , value = param_value - , all_of(encom_dds_dn):tail(static_cols_end,1) - , factor_key = TRUE) - -if (nrow(lf_encomdds) == expected_rows_lf){ - cat("\nPASS: long format data created for", encom_dds_dn) -}else{ - cat("\nFAIL: long format data could not be created for duet") - quit() -} - -############################################################################ -#============== -# SDM: LF -#============== -cols_to_select_sdm = c(static_cols_start, c("ddg_sdm_outcome", sdm_dn), static_cols_end) -wf_sdm = comb_df_sl[, cols_to_select_sdm] - -pivot_cols_sdm = cols_to_select_sdm[1: (length(static_cols_start) + 1)]; pivot_cols_sdm - -expected_rows_lf = nrow(wf_sdm) * (length(wf_sdm) - length(pivot_cols_sdm)) -expected_rows_lf - -# LF data: encomddg -lf_sdm = gather(wf_sdm - , key = param_type - , value = param_value - , all_of(sdm_dn):tail(static_cols_end,1) - , factor_key = TRUE) - -if (nrow(lf_sdm) == expected_rows_lf){ - cat("\nPASS: long format data created for", sdm_dn) -}else{ - cat("\nFAIL: long format data could not be created for duet") - quit() -} - -############################################################################ -#============== -# mCSM: LF -#============== -cols_to_select_mcsm = c(static_cols_start, c("ddg_mcsm_outcome", mcsm_dn), static_cols_end) -wf_mcsm = comb_df_sl[, cols_to_select_mcsm] - -pivot_cols_mcsm = cols_to_select_mcsm[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm - -expected_rows_lf = nrow(wf_mcsm) * (length(wf_mcsm) - length(pivot_cols_mcsm)) -expected_rows_lf - -# LF data: encomddg -lf_mcsm = gather(wf_mcsm - , key = param_type - , value = param_value - , all_of(mcsm_dn):tail(static_cols_end,1) - , factor_key = TRUE) - -if (nrow(lf_mcsm) == expected_rows_lf){ - cat("\nPASS: long format data created for", mcsm_dn) -}else{ - cat("\nFAIL: long format data could not be created for duet") - quit() -} -############################################################################ -# clear excess variables -rm(all_plot_dfs - , cols_dynamut2_df - , cols_mcsm_df - , cols_mcsm_na_df - , comb_df - , corr_data_ps - , corr_ps_df3 - , df_lf_ps - , foo - , foo_cnames - , gene_metadata - , logo_data - , logo_data_or_mult - , logo_data_plot - , logo_data_plot_logor - , logo_data_plot_or - , my_data_snp - , my_df - , my_df_u - , other_muts - , pd_df - , subcols_df_ps - , tab_mt - , wide_df_logor - , wide_df_logor_m - , wide_df_or - , wide_df_or_mult - , wt) - - -rm(c3, c4, check1 - , cols_check - , cols_to_select - , cols_to_select_deepddg - , cols_to_select_duet - , cols_to_select_dynamut - , cols_to_select_dynamut2 - , cols_to_select_encomddg - , cols_to_select_encomdds - , cols_to_select_mcsm - , cols_to_select_mcsm_na - , cols_to_select_sdm) From 5c8a9e8f0013f0970cbf2d607dec3126ecadbc93 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Fri, 10 Sep 2021 18:16:41 +0100 Subject: [PATCH 21/51] sorted combining_dfs.py with all other data files and tidied up get_plotting_dfs.R --- scripts/combining_dfs.py | 71 ++++++++++++++++++++-------- scripts/functions/plotting_data.R | 72 +++++++++++++++-------------- scripts/plotting/get_plotting_dfs.R | 44 +++++++++--------- 3 files changed, 111 insertions(+), 76 deletions(-) diff --git a/scripts/combining_dfs.py b/scripts/combining_dfs.py index 4e2781e..faa9677 100755 --- a/scripts/combining_dfs.py +++ b/scripts/combining_dfs.py @@ -169,25 +169,31 @@ i_join = 'inner' #===================== # some preprocessing #===================== -#------------- + +#=========== # FoldX -#------------- +#=========== foldx_df.shape -#======================= + +#---------------------- # scale foldx values -#======================= +#---------------------- +# rename ddg column to ddg_foldx +foldx_df['ddg'] +foldx_df = foldx_df.rename(columns = {'ddg':'ddg_foldx'}) +foldx_df['ddg_foldx'] # Rescale values in Foldx_change col b/w -1 and 1 so negative numbers # stay neg and pos numbers stay positive -foldx_min = foldx_df['ddg'].min() -foldx_max = foldx_df['ddg'].max() +foldx_min = foldx_df['ddg_foldx'].min() +foldx_max = foldx_df['ddg_foldx'].max() foldx_min foldx_max foldx_scale = lambda x : x/abs(foldx_min) if x < 0 else (x/foldx_max if x >= 0 else 'failed') -foldx_df['foldx_scaled'] = foldx_df['ddg'].apply(foldx_scale) -print('Raw foldx scores:\n', foldx_df['ddg'] +foldx_df['foldx_scaled'] = foldx_df['ddg_foldx'].apply(foldx_scale) +print('Raw foldx scores:\n', foldx_df['ddg_foldx'] , '\n---------------------------------------------------------------' , '\nScaled foldx scores:\n', foldx_df['foldx_scaled']) @@ -195,8 +201,8 @@ print('Raw foldx scores:\n', foldx_df['ddg'] fsmi = foldx_df['foldx_scaled'].min() fsma = foldx_df['foldx_scaled'].max() -c = foldx_df[foldx_df['ddg']>=0].count() -foldx_pos = c.get(key = 'ddg') +c = foldx_df[foldx_df['ddg_foldx']>=0].count() +foldx_pos = c.get(key = 'ddg_foldx') c2 = foldx_df[foldx_df['foldx_scaled']>=0].count() foldx_pos2 = c2.get(key = 'foldx_scaled') @@ -209,20 +215,30 @@ else: , '\nGot:', foldx_pos2 , '\n======================================================') -# rename ddg column to ddg_foldx -foldx_df['ddg'] -foldx_df = foldx_df.rename(columns = {'ddg':'ddg_foldx'}) -foldx_df['ddg_foldx'] +#------------------------- +# foldx outcome category +#-------------------------- +foldx_df['foldx_outcome'] = foldx_df['ddg_foldx'].apply(lambda x: 'Stabilising' if x >= 0 else 'Destabilising') +foldx_df[foldx_df['ddg_foldx']>=0].count() +foc = foldx_df['foldx_outcome'].value_counts() -#------------- +if foc['Stabilising'] == foldx_pos and foc['Stabilising'] == foldx_pos2: + print('\nPASS: Foldx outcome category created') +else: + print('\nFAIL: Foldx outcome category could NOT be created' + , '\nExpected number:', foldx_pos + , '\nGot:', foc[0] + , '\n======================================================') + sys.exit() + +#======================= # Deepddg -#------------- +#======================= deepddg_df.shape -#======================= +#------------------------- # scale Deepddg values -#======================= - +#------------------------- # Rescale values in deepddg_change col b/w -1 and 1 so negative numbers # stay neg and pos numbers stay positive deepddg_min = deepddg_df['deepddg'].min() @@ -252,6 +268,23 @@ else: , '\nExpected number:', deepddg_pos , '\nGot:', deepddg_pos2 , '\n======================================================') + +#-------------------------- +# Deepddg outcome category +#-------------------------- +deepddg_df['deepddg_outcome'] = deepddg_df['deepddg'].apply(lambda x: 'Stabilising' if x >= 0 else 'Destabilising') +deepddg_df[deepddg_df['deepddg']>=0].count() +doc = deepddg_df['deepddg_outcome'].value_counts() + +if doc['Stabilising'] == deepddg_pos and doc['Stabilising'] == deepddg_pos2: + print('\nPASS: Deepddg outcome category created') +else: + print('\nFAIL: Deepddg outcome category could NOT be created' + , '\nExpected number:', deepddg_pos + , '\nGot:', doc[0] + , '\n======================================================') + sys.exit() + #%%============================================================================= # Now merges begin #%%============================================================================= diff --git a/scripts/functions/plotting_data.R b/scripts/functions/plotting_data.R index ddda207..5744faa 100755 --- a/scripts/functions/plotting_data.R +++ b/scripts/functions/plotting_data.R @@ -16,7 +16,9 @@ library(dplyr) ## my_df_u_lig ## dup_muts #======================================================== -plotting_data <- function(df, lig_dist_colname = 'ligand_distance', lig_dist_cutoff = 10) { +plotting_data <- function(df + , lig_dist_colname = 'ligand_distance' + , lig_dist_cutoff = 10) { my_df = data.frame() my_df_u = data.frame() my_df_u_lig = data.frame() @@ -38,51 +40,51 @@ cat("\nInput dimensions:", dim(df)) #================================== #------------------------------ -# adding foldx scaled values -# scale data b/w -1 and 1 -#------------------------------ -n = which(colnames(df) == "ddg"); n - -my_min = min(df[,n]); my_min -my_max = max(df[,n]); my_max - -df$foldx_scaled = ifelse(df[,n] < 0 - , df[,n]/abs(my_min) - , df[,n]/my_max) -# sanity check -my_min = min(df$foldx_scaled); my_min -my_max = max(df$foldx_scaled); my_max - -if (my_min == -1 && my_max == 1){ - cat("\nPASS: foldx ddg successfully scaled b/w -1 and 1" - , "\nProceeding with assigning foldx outcome category") -}else{ - cat("\nFAIL: could not scale foldx ddg values" - , "Aborting!\n") -} +# # adding foldx scaled values +# # scale data b/w -1 and 1 +# #------------------------------ +# n = which(colnames(df) == "ddg"); n +# +# my_min = min(df[,n]); my_min +# my_max = max(df[,n]); my_max +# +# df$foldx_scaled = ifelse(df[,n] < 0 +# , df[,n]/abs(my_min) +# , df[,n]/my_max) +# # sanity check +# my_min = min(df$foldx_scaled); my_min +# my_max = max(df$foldx_scaled); my_max +# +# if (my_min == -1 && my_max == 1){ +# cat("\nPASS: foldx ddg successfully scaled b/w -1 and 1" +# , "\nProceeding with assigning foldx outcome category") +# }else{ +# cat("\nFAIL: could not scale foldx ddg values" +# , "Aborting!\n") +# } #------------------------------ # adding foldx outcome category # ddg<0 = "Stabilising" (-ve) #------------------------------ -c1 = table(df$ddg < 0) -df$foldx_outcome = ifelse(df$ddg < 0, "Stabilising", "Destabilising") -c2 = table(df$ddg < 0) - -if ( all(c1 == c2) ){ - cat("\nPASS: foldx outcome successfully created") -}else{ - cat("\nFAIL: foldx outcome could not be created. Aborting!\n") - exit() -} +# c1 = table(df$ddg < 0) +# df$foldx_outcome = ifelse(df$ddg < 0, "Stabilising", "Destabilising") +# c2 = table(df$ddg < 0) +# +# if ( all(c1 == c2) ){ +# cat("\nPASS: foldx outcome successfully created") +# }else{ +# cat("\nFAIL: foldx outcome could not be created. Aborting!\n") +# exit() +# } #------------------------------ # renaming foldx column from # "ddg" --> "ddg_foldx" #------------------------------ -# change name to foldx -colnames(df)[n] <- "ddg_foldx" +# # change name to foldx +# colnames(df)[n] <- "ddg_foldx" #================================== # extract unique mutation entries diff --git a/scripts/plotting/get_plotting_dfs.R b/scripts/plotting/get_plotting_dfs.R index f1a7620..c1ce5b2 100755 --- a/scripts/plotting/get_plotting_dfs.R +++ b/scripts/plotting/get_plotting_dfs.R @@ -97,33 +97,33 @@ merged_df3_comp = all_plot_dfs[[4]] # adding deepddg scaled values # scale data b/w -1 and 1 #============================ -n = which(colnames(merged_df3) == "deepddg"); n - -my_min = min(merged_df3[,n]); my_min -my_max = max(merged_df3[,n]); my_max - -merged_df3$deepddg_scaled = ifelse(merged_df3[,n] < 0 - , merged_df3[,n]/abs(my_min) - , merged_df3[,n]/my_max) -# sanity check -my_min = min(merged_df3$deepddg_scaled); my_min -my_max = max(merged_df3$deepddg_scaled); my_max - -if (my_min == -1 && my_max == 1){ - cat("\nPASS: DeepDDG successfully scaled b/w -1 and 1" - #, "\nProceeding with assigning deep outcome category") - , "\n") -}else{ - cat("\nFAIL: could not scale DeepDDG ddg values" - , "Aborting!") -} - +# n = which(colnames(merged_df3) == "deepddg"); n +# +# my_min = min(merged_df3[,n]); my_min +# my_max = max(merged_df3[,n]); my_max +# +# merged_df3$deepddg_scaled = ifelse(merged_df3[,n] < 0 +# , merged_df3[,n]/abs(my_min) +# , merged_df3[,n]/my_max) +# # sanity check +# my_min = min(merged_df3$deepddg_scaled); my_min +# my_max = max(merged_df3$deepddg_scaled); my_max +# +# if (my_min == -1 && my_max == 1){ +# cat("\nPASS: DeepDDG successfully scaled b/w -1 and 1" +# #, "\nProceeding with assigning deep outcome category") +# , "\n") +# }else{ +# cat("\nFAIL: could not scale DeepDDG ddg values" +# , "Aborting!") +# } +# #################################################################### # Data for combining other dfs #################################################################### -source("other_dfs_data.R") +#source("other_dfs_data.R") #################################################################### # Data for subcols barplot (~heatmap) From 3ddbee8c90428d9203d73c34fcd342d3b995f631 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Fri, 10 Sep 2021 18:19:01 +0100 Subject: [PATCH 22/51] finally moved foldx_outcome and deepddg_outcome calcs to combine_dfs.py in python script i.e cleaned source data --- scripts/functions/plotting_data.R | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/functions/plotting_data.R b/scripts/functions/plotting_data.R index 5744faa..fa4e9c1 100755 --- a/scripts/functions/plotting_data.R +++ b/scripts/functions/plotting_data.R @@ -37,6 +37,7 @@ cat("\nInput dimensions:", dim(df)) # This will enable to always have these variables available # when calling for plots +# included this now in combine_dfs.py!!!! finallyS #================================== #------------------------------ From 27f0b15d4c162efccc0b7fa3a9eb4dd297c32b12 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Fri, 10 Sep 2021 18:19:56 +0100 Subject: [PATCH 23/51] tidied script plotting_data.R by removing superceded code --- scripts/functions/plotting_data.R | 56 ------------------------------- 1 file changed, 56 deletions(-) diff --git a/scripts/functions/plotting_data.R b/scripts/functions/plotting_data.R index fa4e9c1..faaebca 100755 --- a/scripts/functions/plotting_data.R +++ b/scripts/functions/plotting_data.R @@ -31,62 +31,6 @@ dup_muts = data.frame() cat("\nInput dimensions:", dim(df)) -#================================== -# add foldx outcome category -# and foldx scaled values - -# This will enable to always have these variables available -# when calling for plots -# included this now in combine_dfs.py!!!! finallyS -#================================== - -#------------------------------ -# # adding foldx scaled values -# # scale data b/w -1 and 1 -# #------------------------------ -# n = which(colnames(df) == "ddg"); n -# -# my_min = min(df[,n]); my_min -# my_max = max(df[,n]); my_max -# -# df$foldx_scaled = ifelse(df[,n] < 0 -# , df[,n]/abs(my_min) -# , df[,n]/my_max) -# # sanity check -# my_min = min(df$foldx_scaled); my_min -# my_max = max(df$foldx_scaled); my_max -# -# if (my_min == -1 && my_max == 1){ -# cat("\nPASS: foldx ddg successfully scaled b/w -1 and 1" -# , "\nProceeding with assigning foldx outcome category") -# }else{ -# cat("\nFAIL: could not scale foldx ddg values" -# , "Aborting!\n") -# } - -#------------------------------ -# adding foldx outcome category -# ddg<0 = "Stabilising" (-ve) -#------------------------------ -# c1 = table(df$ddg < 0) -# df$foldx_outcome = ifelse(df$ddg < 0, "Stabilising", "Destabilising") -# c2 = table(df$ddg < 0) -# -# if ( all(c1 == c2) ){ -# cat("\nPASS: foldx outcome successfully created") -# }else{ -# cat("\nFAIL: foldx outcome could not be created. Aborting!\n") -# exit() -# } - -#------------------------------ -# renaming foldx column from -# "ddg" --> "ddg_foldx" -#------------------------------ - -# # change name to foldx -# colnames(df)[n] <- "ddg_foldx" - #================================== # extract unique mutation entries #================================== From 3f3fe89a6b36bee4df1e0d89863aa0285e6ce309 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Fri, 10 Sep 2021 18:20:45 +0100 Subject: [PATCH 24/51] added shorter scripts for each different processing for plots to make it wasire to read code --- scripts/plotting/coloured_bp_data.R | 80 +++ scripts/plotting/corr_data.R | 67 +++ scripts/plotting/dm_om_data.R | 416 ++++++++++++++++ scripts/plotting/logo_data.R | 142 ++++++ scripts/plotting/redundant/other_dfs_data.R | 117 +++++ scripts/plotting/redundant/other_plots_data.R | 470 ++++++++++++++++++ 6 files changed, 1292 insertions(+) create mode 100644 scripts/plotting/coloured_bp_data.R create mode 100644 scripts/plotting/corr_data.R create mode 100644 scripts/plotting/dm_om_data.R create mode 100644 scripts/plotting/logo_data.R create mode 100644 scripts/plotting/redundant/other_dfs_data.R create mode 100755 scripts/plotting/redundant/other_plots_data.R diff --git a/scripts/plotting/coloured_bp_data.R b/scripts/plotting/coloured_bp_data.R new file mode 100644 index 0000000..a1f0964 --- /dev/null +++ b/scripts/plotting/coloured_bp_data.R @@ -0,0 +1,80 @@ +#!/usr/bin/env Rscript +################################################################# +# TASK: Script to add bp colours ~ barplot heatmap +################################################################# + +my_df = merged_df3 + +cols_to_select = c("mutationinformation", "drtype" + , "wild_type" + , "position" + , "mutant_type" + , "chain", "ligand_id", "ligand_distance" + , "duet_stability_change", "duet_outcome", "duet_scaled" + , "ligand_affinity_change", "ligand_outcome", "affinity_scaled" + , "ddg_foldx", "foldx_scaled", "foldx_outcome" + , "deepddg", "deepddg_outcome" # comment out as not available for pnca + , "asa", "rsa", "rd_values", "kd_values" + , "af", "or_mychisq", "pval_fisher" + , "or_fisher", "or_logistic", "pval_logistic" + , "wt_prop_water", "mut_prop_water", "wt_prop_polarity", "mut_prop_polarity" + , "wt_calcprop", "mut_calcprop") + +#======================= +# Data for sub colours +# barplot: PS +#======================= + +cat("\nNo. of cols to select:", length(cols_to_select)) + +subcols_df_ps = my_df[, cols_to_select] + +cat("\nNo of unique positions for ps:" + , length(unique(subcols_df_ps$position))) + +# add count_pos col that counts the no. of nsSNPS at a position +setDT(subcols_df_ps)[, pos_count := .N, by = .(position)] + +# should be a factor +if (is.factor(subcols_df_ps$duet_outcome)){ + cat("\nDuet_outcome is factor") + table(subcols_df_ps$duet_outcome) +}else{ + cat("\nConverting duet_outcome to factor") + subcols_df_ps$duet_outcome = as.factor(subcols_df_ps$duet_outcome) + table(subcols_df_ps$duet_outcome) +} + +# should be -1 and 1 +min(subcols_df_ps$duet_scaled) +max(subcols_df_ps$duet_scaled) + +tapply(subcols_df_ps$duet_scaled, subcols_df_ps$duet_outcome, min) +tapply(subcols_df_ps$duet_scaled, subcols_df_ps$duet_outcome, max) + +# check unique values in normalised data +cat("\nNo. of unique values in duet scaled, no rounding:" + , length(unique(subcols_df_ps$duet_scaled))) + +# No rounding +my_grp = subcols_df_ps$duet_scaled; length(my_grp) + +# Add rounding is to be used +n = 3 +subcols_df_ps$duet_scaledR = round(subcols_df_ps$duet_scaled, n) + +cat("\nNo. of unique values in duet scaled", n, "places rounding:" + , length(unique(subcols_df_ps$duet_scaledR))) + +my_grp_r = subcols_df_ps$duet_scaledR # rounding + +# Add grp cols +subcols_df_ps$group <- paste0(subcols_df_ps$duet_outcome, "_", my_grp, sep = "") +subcols_df_ps$groupR <- paste0(subcols_df_ps$duet_outcome, "_", my_grp_r, sep = "") + +# Call the function to create the palette based on the group defined above +subcols_ps <- ColourPalleteMulti(subcols_df_ps, "duet_outcome", "my_grp") +subcolsR_ps <- ColourPalleteMulti(subcols_df_ps, "duet_outcome", "my_grp_r") + +cat("Colour palette generated for my_grp: ", length(subcols_ps), " colours") +cat("Colour palette generated for my_grp_r: ", length(subcolsR_ps), " colours") diff --git a/scripts/plotting/corr_data.R b/scripts/plotting/corr_data.R new file mode 100644 index 0000000..d33efc5 --- /dev/null +++ b/scripts/plotting/corr_data.R @@ -0,0 +1,67 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: Script to format data for corr plots +######################################################### + +#================================================= +# Data for Corrplots +#================================================= +cat("\n==========================================" + , "\nCORR PLOTS data: ALL params" + , "\n=========================================") + +# use data +#merged_df2 + +#---------------------------- +# columns for corr plots:PS +#---------------------------- +# NOTE: you can add mcsm_ppi column as well, and it will only select what it can find! +big_df_colnames = data.frame(names(merged_df2)) + +corr_cols_select <- c("mutationinformation", drug, "mutation_info_labels" + , "duet_stability_change", "ligand_affinity_change", "ddg_foldx", "asa", "rsa" + , "rd_values", "kd_values", "log10_or_mychisq", "neglog_pval_fisher","af" + , "deepddg", "ddg_dynamut", "ddg_dynamut2", "mcsm_na_affinity" + , "ddg_encom", "dds_encom", "ddg_mcsm", "ddg_sdm", "ddg_duet", "ligand_distance") + +#=========================== +# Corr data for plots: PS +# big_df ps: ~ merged_df2 +#=========================== + +corr_df_m2 = merged_df2[,colnames(merged_df2)%in%corr_cols_select] + +#=========================== +# Corr data for plots: PS +# short_df ps: ~merged_df3 +#=========================== + +corr_df_m3 = corr_df_m2[!duplicated(corr_df_m2$mutationinformation),] + +na_or = sum(is.na(corr_df_m3$log10_or_mychisq)) +check1 = nrow(corr_df_m3) - na_or; check1 + +if (nrow(corr_df_m3) == nrow(merged_df3) && nrow(merged_df3_comp) == check1) { + cat( "\nPASS: No. of rows for corr_df_m3 match" + , "\nPASS: No. of OR values checked: " , check1) +} else { + cat("\nFAIL: Numbers mismatch:" + , "\nExpected nrows: ", nrow(merged_df3) + , "\nGot: ", nrow(corr_df_m3) + , "\nExpected OR values: ", nrow(merged_df3_comp) + , "\nGot: ", check1) +} + +cat("\nCorr Data created:" +, "\n===================================" +, "\ncorr_df_m2: created from merged_df2" +, "\n===================================" +, "\nnrows:", nrow(corr_df_m2) +, "\nncols:", ncol(corr_df_m2) +, "\n===================================" +, "\ncorr_df_m3: created from merged_df3" +, "\n===================================" +, "\nnrows:", nrow(corr_df_m3) +, "\nncols:", ncol(corr_df_m3) +) diff --git a/scripts/plotting/dm_om_data.R b/scripts/plotting/dm_om_data.R new file mode 100644 index 0000000..4bd82e7 --- /dev/null +++ b/scripts/plotting/dm_om_data.R @@ -0,0 +1,416 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: Script to format data for dm om plots: +# generating LF data +# sourced by get_plotting_dfs.R +######################################################### +##======================================================================== +# cols to select: +# THINK: whu + +comb_df <- merged_df3[, c("mutationinformation", "mutation" + , "mutation_info","mutation_info_labels" + , "position" + , LigDist_colname + , "duet_stability_change", "duet_scaled", "duet_outcome" + , "ligand_affinity_change", "affinity_scaled", "ligand_outcome" + , "ddg_foldx", "foldx_scaled", "foldx_outcome" + , "deepddg", "deepddg_scaled", "deepddg_outcome" + , "asa", "rsa" + , "rd_values", "kd_values" + , "log10_or_mychisq", "neglog_pval_fisher", "af" + , "mcsm_na_affinity", "mcsm_na_scaled", "mcsm_na_outcome" + , "ddg_dynamut", "ddg_dynamut_scaled","ddg_dynamut_outcome" + , "ddg_encom", "ddg_encom_scaled", "ddg_encom_outcome" + , "dds_encom", "dds_encom_scaled", "dds_encom_outcome" + , "ddg_mcsm", "ddg_mcsm_scaled", "ddg_mcsm_outcome" + , "ddg_sdm", "ddg_sdm_scaled", "ddg_sdm_outcome" + , "ddg_duet", "ddg_duet_scaled", "ddg_duet_outcome" + , "ddg_dynamut2","ddg_dynamut2_scaled", "ddg_dynamut2_outcome")] + + +comb_df_s = arrange(comb_df, position) + +#======================================================================= +fact_cols = colnames(comb_df_s)[grepl( "_outcome|_info", colnames(comb_df_s) )] +fact_cols +lapply(comb_df_s[, fact_cols], class) +comb_df_s[, fact_cols] <- lapply(comb_df_s[, fact_cols], as.factor) + +if (any(lapply(comb_df_s[, fact_cols], class) == "character")){ + cat("\nChanging cols to factor") + comb_df_s[, fact_cols] <- lapply(comb_df_s[, fact_cols],as.factor) + if (all(lapply(comb_df_s[, fact_cols], class) == "factor")){ + cat("\nSuccessful: cols changed to factor") + } +} +lapply(comb_df_s[, fact_cols], class) + +#======================================================================= +table(comb_df_s$mutation_info) + + # further checks to make sure dr and other muts are indeed unique +dr_muts = comb_df_s[comb_df_s$mutation_info == dr_muts_col,] +dr_muts_names = unique(dr_muts$mutation) + +other_muts = comb_df_s[comb_df_s$mutation_info == other_muts_col,] +other_muts_names = unique(other_muts$mutation) + +if ( table(dr_muts_names%in%other_muts_names)[[1]] == length(dr_muts_names) && + table(other_muts_names%in%dr_muts_names)[[1]] == length(other_muts_names) ){ + cat("PASS: dr and other muts are indeed unique") +}else{ + cat("FAIL: dr and others muts are NOT unique!") + quit() +} + +# pretty display names i.e. labels to reduce major code duplication later +foo_cnames = data.frame(colnames(comb_df_s)) +names(foo_cnames) <- "old_name" + +stability_suffix <- paste0(delta_symbol, delta_symbol, "G") +flexibility_suffix <- paste0(delta_symbol, delta_symbol, "S") + +lig_dn = paste0("Ligand distance (", angstroms_symbol, ")"); lig_dn +duet_dn = paste0("DUET ", stability_suffix); duet_dn +foldx_dn = paste0("FoldX ", stability_suffix); foldx_dn +deepddg_dn = paste0("Deepddg " , stability_suffix); deepddg_dn +mcsm_na_dn = paste0("mCSM-NA affinity ", stability_suffix); mcsm_na_dn +dynamut_dn = paste0("Dynamut ", stability_suffix); dynamut_dn +dynamut2_dn = paste0("Dynamut2 " , stability_suffix); dynamut2_dn +encom_ddg_dn = paste0("EnCOM " , stability_suffix); encom_ddg_dn +encom_dds_dn = paste0("EnCOM " , flexibility_suffix ); encom_dds_dn +sdm_dn = paste0("SDM " , stability_suffix); sdm_dn +mcsm_dn = paste0("mCSM " , stability_suffix ); mcsm_dn + +# Change colnames of some columns using datatable +comb_df_sl = comb_df_s +names(comb_df_sl) + +setnames(comb_df_sl + , old = c("asa", "rsa", "rd_values", "kd_values" + , "log10_or_mychisq", "neglog_pval_fisher", "af" + , LigDist_colname + , "duet_scaled" + , "foldx_scaled" + , "deepddg_scaled" + , "mcsm_na_scaled" + , "ddg_dynamut_scaled" + , "ddg_dynamut2_scaled" + , "ddg_encom_scaled" + , "dds_encom_scaled" + , "ddg_sdm" + , "ddg_mcsm") + + , new = c("ASA", "RSA", "RD", "KD" + , "Log10 (OR)", "-Log (P)", "MAF" + , lig_dn + , duet_dn + , foldx_dn + , deepddg_dn + , mcsm_na_dn + , dynamut_dn + , dynamut2_dn + , encom_ddg_dn + , encom_dds_dn + , sdm_dn + , mcsm_dn) + ) + +foo_cnames <- cbind(foo_cnames, colnames(comb_df_sl)) + +# some more pretty labels +table(comb_df_sl$mutation_info) + +levels(comb_df_sl$mutation_info)[levels(comb_df_sl$mutation_info)==dr_muts_col] <- "DM" +levels(comb_df_sl$mutation_info)[levels(comb_df_sl$mutation_info)==other_muts_col] <- "OM" + +table(comb_df_sl$mutation_info) + +####################################################################### +#====================== +# Selecting dfs +# with appropriate cols +#======================= +static_cols_start = c("mutationinformation" + , "position" + , "mutation" + , "mutation_info") + +static_cols_end = c(lig_dn + , "ASA" + , "RSA" + , "RD" + , "KD") + +# ordering is important! + +######################################################################### +#============== +# DUET: LF +#============== +cols_to_select_duet = c(static_cols_start, c("duet_outcome", duet_dn), static_cols_end) +wf_duet = comb_df_sl[, cols_to_select_duet] + +#pivot_cols_ps = cols_to_select_ps[1:5]; pivot_cols_ps +pivot_cols_duet = cols_to_select_duet[1: (length(static_cols_start) + 1)]; pivot_cols_duet + +expected_rows_lf = nrow(wf_duet) * (length(wf_duet) - length(pivot_cols_duet)) +expected_rows_lf + +# LF data: duet +lf_duet = gather(wf_duet + , key = param_type + , value = param_value + , all_of(duet_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_duet) == expected_rows_lf){ + cat("\nPASS: long format data created for ", duet_dn) +}else{ + cat("\nFAIL: long format data could not be created for duet") + quit() +} + +############################################################################ +#============== +# FoldX: LF +#============== +cols_to_select_foldx= c(static_cols_start, c("foldx_outcome", foldx_dn), static_cols_end) +wf_foldx = comb_df_sl[, cols_to_select_foldx] + +pivot_cols_foldx = cols_to_select_foldx[1: (length(static_cols_start) + 1)]; pivot_cols_foldx + +expected_rows_lf = nrow(wf_foldx) * (length(wf_foldx) - length(pivot_cols_foldx)) +expected_rows_lf + +# LF data: Foldx +lf_foldx <<- gather(wf_foldx + , key = param_type + , value = param_value + , all_of(foldx_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_foldx) == expected_rows_lf){ + cat("\nPASS: long format data created for ", foldx_dn) +}else{ + cat("\nFAIL: long format data could not be created for duet") + quit() +} + +############################################################################ +#============== +# Deepddg: LF +#============== +cols_to_select_deepddg = c(static_cols_start, c("deepddg_outcome", deepddg_dn), static_cols_end) +wf_deepddg = comb_df_sl[, cols_to_select_deepddg] + +pivot_cols_deepddg = cols_to_select_deepddg[1: (length(static_cols_start) + 1)]; pivot_cols_deepddg + +expected_rows_lf = nrow(wf_deepddg) * (length(wf_deepddg) - length(pivot_cols_deepddg)) +expected_rows_lf + +# LF data: Deepddg +lf_deepddg = gather(wf_deepddg + , key = param_type + , value = param_value + , all_of(deepddg_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_deepddg) == expected_rows_lf){ + cat("\nPASS: long format data created for ", deepddg_dn) +}else{ + cat("\nFAIL: long format data could not be created for duet") + quit() +} + +############################################################################ +#============== +# mCSM-NA: LF +#============== +cols_to_select_mcsm_na = c(static_cols_start, c("mcsm_na_outcome", mcsm_na_dn), static_cols_end) +wf_mcsm_na = comb_df_sl[, cols_to_select_mcsm_na] + +pivot_cols_mcsm_na = cols_to_select_mcsm_na[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm_na + +expected_rows_lf = nrow(wf_mcsm_na) * (length(wf_mcsm_na) - length(pivot_cols_mcsm_na)) +expected_rows_lf + +# LF data: mcsm_na +lf_mcsm_na = gather(wf_mcsm_na + , key = param_type + , value = param_value + , all_of(mcsm_na_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_mcsm_na) == expected_rows_lf){ + cat("\nPASS: long format data created for ", mcsm_na_dn) +}else{ + cat("\nFAIL: long format data could not be created for duet") + quit() +} + +############################################################################ +#============== +# Dynamut: LF +#============== +cols_to_select_dynamut = c(static_cols_start, c("ddg_dynamut_outcome", dynamut_dn), static_cols_end) +wf_dynamut = comb_df_sl[, cols_to_select_dynamut] + +pivot_cols_dynamut = cols_to_select_dynamut[1: (length(static_cols_start) + 1)]; pivot_cols_dynamut + +expected_rows_lf = nrow(wf_dynamut) * (length(wf_dynamut) - length(pivot_cols_dynamut)) +expected_rows_lf + +# LF data: dynamut +lf_dynamut = gather(wf_dynamut + , key = param_type + , value = param_value + , all_of(dynamut_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_dynamut) == expected_rows_lf){ + cat("\nPASS: long format data created for ", dynamut_dn) +}else{ + cat("\nFAIL: long format data could not be created for duet") + quit() +} + +############################################################################ +#============== +# Dynamut2: LF +#============== +cols_to_select_dynamut2 = c(static_cols_start, c("ddg_dynamut2_outcome", dynamut2_dn), static_cols_end) + +wf_dynamut2 = comb_df_sl[, cols_to_select_dynamut2] + +pivot_cols_dynamut2 = cols_to_select_dynamut2[1: (length(static_cols_start) + 1)]; pivot_cols_dynamut2 + +expected_rows_lf = nrow(wf_dynamut2) * (length(wf_dynamut2) - length(pivot_cols_dynamut2)) +expected_rows_lf + +# LF data: dynamut2 +lf_dynamut2 = gather(wf_dynamut2 + , key = param_type + , value = param_value + , all_of(dynamut2_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_dynamut2) == expected_rows_lf){ + cat("\nPASS: long format data created for ", dynamut2_dn) +}else{ + cat("\nFAIL: long format data could not be created for duet") + quit() +} + +############################################################################ +#============== +# EnCOM ddg: LF +#============== +cols_to_select_encomddg = c(static_cols_start, c("ddg_encom_outcome", encom_ddg_dn), static_cols_end) +wf_encomddg = comb_df_sl[, cols_to_select_encomddg] + +pivot_cols_encomddg = cols_to_select_encomddg[1: (length(static_cols_start) + 1)]; pivot_cols_encomddg + +expected_rows_lf = nrow(wf_encomddg ) * (length(wf_encomddg ) - length(pivot_cols_encomddg)) +expected_rows_lf + +# LF data: encomddg +lf_encomddg = gather(wf_encomddg + , key = param_type + , value = param_value + , all_of(encom_ddg_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_encomddg) == expected_rows_lf){ + cat("\nPASS: long format data created for ", encom_ddg_dn) +}else{ + cat("\nFAIL: long format data could not be created for duet") + quit() +} +############################################################################ +#============== +# EnCOM dds: LF +#============== +cols_to_select_encomdds = c(static_cols_start, c("dds_encom_outcome", encom_dds_dn), static_cols_end) +wf_encomdds = comb_df_sl[, cols_to_select_encomdds] + +pivot_cols_encomdds = cols_to_select_encomdds[1: (length(static_cols_start) + 1)]; pivot_cols_encomdds + +expected_rows_lf = nrow(wf_encomdds) * (length(wf_encomdds) - length(pivot_cols_encomdds)) +expected_rows_lf + +# LF data: encomdds +lf_encomdds = gather(wf_encomdds + , key = param_type + , value = param_value + , all_of(encom_dds_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_encomdds) == expected_rows_lf){ + cat("\nPASS: long format data created for", encom_dds_dn) +}else{ + cat("\nFAIL: long format data could not be created for duet") + quit() +} + +############################################################################ +#============== +# SDM: LF +#============== +cols_to_select_sdm = c(static_cols_start, c("ddg_sdm_outcome", sdm_dn), static_cols_end) +wf_sdm = comb_df_sl[, cols_to_select_sdm] + +pivot_cols_sdm = cols_to_select_sdm[1: (length(static_cols_start) + 1)]; pivot_cols_sdm + +expected_rows_lf = nrow(wf_sdm) * (length(wf_sdm) - length(pivot_cols_sdm)) +expected_rows_lf + +# LF data: sdm +lf_sdm = gather(wf_sdm + , key = param_type + , value = param_value + , all_of(sdm_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_sdm) == expected_rows_lf){ + cat("\nPASS: long format data created for", sdm_dn) +}else{ + cat("\nFAIL: long format data could not be created for duet") + quit() +} + +############################################################################ +#============== +# mCSM: LF +#============== +cols_to_select_mcsm = c(static_cols_start, c("ddg_mcsm_outcome", mcsm_dn), static_cols_end) +wf_mcsm = comb_df_sl[, cols_to_select_mcsm] + +pivot_cols_mcsm = cols_to_select_mcsm[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm + +expected_rows_lf = nrow(wf_mcsm) * (length(wf_mcsm) - length(pivot_cols_mcsm)) +expected_rows_lf + +# LF data: mcsm +lf_mcsm = gather(wf_mcsm + , key = param_type + , value = param_value + , all_of(mcsm_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_mcsm) == expected_rows_lf){ + cat("\nPASS: long format data created for", mcsm_dn) +}else{ + cat("\nFAIL: long format data could not be created for duet") + quit() +} + +#========================== +# Duet-d(from Dynamut): LF +#=========================== + +#Not created, redundant and chaos! + +############################################################################ + diff --git a/scripts/plotting/logo_data.R b/scripts/plotting/logo_data.R new file mode 100644 index 0000000..7eaf1b6 --- /dev/null +++ b/scripts/plotting/logo_data.R @@ -0,0 +1,142 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: Script to format data for Logo_plots +######################################################### +#------------------------- +# choose df for logoplot +#------------------------- +logo_data = merged_df3 +#logo_data = merged_df3_comp + +# quick checks +colnames(logo_data) +str(logo_data) + +c1 = unique(logo_data$position) +nrow(logo_data) +cat("No. of rows in my_data:", nrow(logo_data) + , "\nDistinct positions corresponding to snps:", length(c1) + , "\n===========================================================") +#======================================================================= +#================== +# logo data: OR +#================== +foo = logo_data[, c("position" + , "mutant_type","duet_scaled", "or_mychisq" + , "mut_prop_polarity", "mut_prop_water")] + +logo_data$log10or = log10(logo_data$or_mychisq) +logo_data_plot = logo_data[, c("position" + , "mutant_type", "or_mychisq", "log10or")] + +logo_data_plot_or = logo_data[, c("position", "mutant_type", "or_mychisq")] +wide_df_or = logo_data_plot_or %>% spread(position, or_mychisq, fill = 0.0) + +wide_df_or = as.matrix(wide_df_or) +rownames(wide_df_or) = wide_df_or[,1] +dim(wide_df_or) +wide_df_or = wide_df_or[,-1] +str(wide_df_or) + +position_or = as.numeric(colnames(wide_df_or)) + +#================== +# logo data: logOR +#================== +logo_data_plot_logor = logo_data[, c("position", "mutant_type", "log10or")] +wide_df_logor <- logo_data_plot_logor %>% spread(position, log10or, fill = 0.0) + +wide_df_logor = as.matrix(wide_df_logor) + +rownames(wide_df_logor) = wide_df_logor[,1] +wide_df_logor = subset(wide_df_logor, select = -c(1) ) +colnames(wide_df_logor) +wide_df_logor_m = data.matrix(wide_df_logor) + +rownames(wide_df_logor_m) +colnames(wide_df_logor_m) + +position_logor = as.numeric(colnames(wide_df_logor_m)) + +#=============================== +# logo data: multiple nsSNPs (>1) +#================================= +#require(data.table) + +# get freq count of positions so you can subset freq<1 +setDT(logo_data)[, mut_pos_occurrence := .N, by = .(position)] + +table(logo_data$position) +table(logo_data$mut_pos_occurrence) + +max_mut = max(table(logo_data$position)) + +# extract freq_pos > 1 +my_data_snp = logo_data[logo_data$mut_pos_occurrence!=1,] +u = unique(my_data_snp$position) +max_mult_mut = max(table(my_data_snp$position)) + +if (nrow(my_data_snp) == nrow(logo_data) - table(logo_data$mut_pos_occurrence)[[1]] ){ + + cat("PASS: positions with multiple muts extracted" + , "\nNo. of mutations:", nrow(my_data_snp) + , "\nNo. of positions:", length(u) + , "\nMax no. of muts at any position", max_mult_mut) +}else{ + cat("FAIL: positions with multiple muts could NOT be extracted" + , "\nExpected:",nrow(logo_data) - table(logo_data$mut_pos_occurrence)[[1]] + , "\nGot:", nrow(my_data_snp) ) +} + +cat("\nNo. of sites with only 1 mutations:", table(logo_data$mut_pos_occurrence)[[1]]) + +#-------------------------------------- +# matrix for_mychisq mutant type +# frequency of mutant type by position +#--------------------------------------- +table(my_data_snp$mutant_type, my_data_snp$position) +tab_mt = table(my_data_snp$mutant_type, my_data_snp$position) +class(tab_mt) + +# unclass to convert to matrix +tab_mt = unclass(tab_mt) +tab_mt = as.matrix(tab_mt, rownames = T) + +# should be TRUE +is.matrix(tab_mt) + +rownames(tab_mt) #aa +colnames(tab_mt) #pos + +#------------------------------------- +# matrix for wild type +# frequency of wild type by position +#------------------------------------- +tab_wt = table(my_data_snp$wild_type, my_data_snp$position); tab_wt +tab_wt = unclass(tab_wt) + +# remove wt duplicates +wt = my_data_snp[, c("position", "wild_type")] +wt = wt[!duplicated(wt),] + +tab_wt = table(wt$wild_type, wt$position); tab_wt # should all be 1 + +rownames(tab_wt) +rownames(tab_wt) + +identical(colnames(tab_mt), colnames(tab_wt)) +identical(ncol(tab_mt), ncol(tab_wt)) + +#---------------------------------- +# logo data OR: multiple nsSNPs (>1) +#---------------------------------- +logo_data_or_mult = my_data_snp[, c("position", "mutant_type", "or_mychisq")] +#wide_df_or = logo_data_or %>% spread(position, or_mychisq, fill = 0.0) +wide_df_or_mult = logo_data_or_mult %>% spread(position, or_mychisq, fill = NA) + +wide_df_or_mult = as.matrix(wide_df_or_mult) +rownames(wide_df_or_mult) = wide_df_or_mult[,1] +wide_df_or_mult = wide_df_or_mult[,-1] +str(wide_df_or_mult) + +position_or_mult = as.numeric(colnames(wide_df_or_mult)) diff --git a/scripts/plotting/redundant/other_dfs_data.R b/scripts/plotting/redundant/other_dfs_data.R new file mode 100644 index 0000000..97b0567 --- /dev/null +++ b/scripts/plotting/redundant/other_dfs_data.R @@ -0,0 +1,117 @@ +#!/usr/bin/env Rscript + +# Didn't end up using it: sorted it at the source +# .py script to combine all dfs to output all_params + +################################################################# +# TASK: Script to add all other dfs to merged_df2 and merged_df3 + +################################################################# +# Combine other dfs: +# dynamut_df, dynamut2_df, mcsm_na_df, +# perhaps : deepddg and mcsm ppi (for embb) +################################################################ +# read other files +infilename_dynamut = paste0("~/git/Data/", drug, "/output/dynamut_results/", gene + , "_complex_dynamut_norm.csv") + +infilename_dynamut2 = paste0("~/git/Data/", drug, "/output/dynamut_results/dynamut2/", gene + , "_complex_dynamut2_norm.csv") + +infilename_mcsm_na = paste0("~/git/Data/", drug, "/output/mcsm_na_results/", gene + , "_complex_mcsm_na_norm.csv") + +infilename_mcsm_f_snps <- paste0("~/git/Data/", drug, "/output/", gene + , "_mcsm_formatted_snps.csv") + +dynamut_df = read.csv(infilename_dynamut) +dynamut2_df = read.csv(infilename_dynamut2) +mcsm_na_df = read.csv(infilename_mcsm_na) +mcsm_f_snps = read.csv(infilename_mcsm_f_snps, header = F) +names(mcsm_f_snps) = "mutationinformation" + +#================================= +# check with intersect to find the common col, but use +c1 = length(intersect(names(dynamut_df), names(dynamut2_df))) +c2 = length(intersect(names(dynamut2_df), names(mcsm_na_df))) + +if (c1 == 1 && c2 == 1) { + n_common = 1 +}else{ + cat("\nMore than one common col found, inspect before merging!") +} + +# mutationinformation column to be on the safe side +# delete chain from dynamut2_df +#dynamut2_df = subset(dynamut2_df, select = -chain) + +# quick checks +lapply(list(dynamut_df + , dynamut2_df + , mcsm_na_df), ncol) + +lapply(list(dynamut_df + , dynamut2_df + , mcsm_na_df), colnames) + +lapply(list(dynamut_df + , dynamut2_df + , mcsm_na_df), nrow) + +ncols_comb = lapply(list(dynamut_df + , dynamut2_df + , mcsm_na_df), ncol) + +#--------------------------------- +# Combine 1: all other params dfs +#--------------------------------- +combined_dfs = Reduce(inner_join, list(dynamut_df + , dynamut2_df + , mcsm_na_df)) +# Reduce("+", ncols_comb) + +#----------------------------------------- +# Combine 2: combine1 result + merged_df2 +#----------------------------------------- +drop_cols = intersect(names(combined_dfs), names(merged_df2)) +drop_cols = drop_cols + +drop_cols = drop_cols[! drop_cols %in% c("mutationinformation")] + +combined_dfs_f = combined_dfs[, !colnames(combined_dfs)%in%drop_cols] + +nrow(combined_dfs_f); nrow(merged_df2) +ncol(combined_dfs_f); ncol(merged_df2) + +#----------------------------------------- +# Combined merged_df2 +#----------------------------------------- +merged_df2_combined = merge(merged_df2 + , combined_dfs_f + , by = "mutationinformation" +) + +expected_ncols = ncol(combined_dfs_f)+ ncol(merged_df2) - 1 + +if ( nrow(merged_df2_combined) == nrow(merged_df2) && ncol(merged_df2_combined) == expected_ncols ){ + + cat("\nPASS: merged_df2 combined with other parameters dfs." + , "\nUse this for lineage distribution plots") +}else{ + + cat("\nFAIL: merged_df2 didn't combine successfully with other parameters dfs") + quit() + +} + +rm(combined_dfs, combined_dfs_f) + +#================================ +# combined data +# short_df ps: ~ merged_df3 +# TODO: later integrate properly +#================================ +#----------------------------------------- +# Combined merged_df2 +#----------------------------------------- +merged_df3_combined = merged_df2_combined[!duplicated(merged_df2_combined$mutationinformation),] diff --git a/scripts/plotting/redundant/other_plots_data.R b/scripts/plotting/redundant/other_plots_data.R new file mode 100755 index 0000000..61a508f --- /dev/null +++ b/scripts/plotting/redundant/other_plots_data.R @@ -0,0 +1,470 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: Script to format data for dm om plots: +# generating LF data +# sourced by get_plotting_dfs.R +######################################################### +# working dir and loading libraries +# getwd() +# setwd("~/git/LSHTM_analysis/scripts/plotting") +# getwd() + +# make cmd +# globals +# drug = "streptomycin" +# gene = "gid" + +# source("get_plotting_dfs.R") +#======================================================================= +# MOVE TO COMBINE or singular file for deepddg +# +# cols_to_select = c("mutation", "mutationinformation" +# , "wild_type", "position", "mutant_type" +# , "mutation_info") +# +# merged_df3_short = merged_df3[, cols_to_select] + +# infilename_mcsm_f_snps <- paste0("~/git/Data/", drug, "/output/", gene +# , "_mcsm_formatted_snps.csv") +# +# mcsm_f_snps<- read.csv(infilename_mcsm_f_snps, header = F) +# names(mcsm_f_snps) <- "mutationinformation" + +# write merged_df3 to generate structural figure on chimera +#write.csv(merged_df3_short, "merged_df3_short.csv") +#======================================================================== + +#======================================================================== +# cols to select + +cols_mcsm_df <- merged_df3[, c("mutationinformation", "mutation" + , "mutation_info", "position" + , LigDist_colname + , "duet_stability_change", "duet_scaled", "duet_outcome" + , "ligand_affinity_change", "affinity_scaled", "ligand_outcome" + , "ddg_foldx", "foldx_scaled", "foldx_outcome" + , "deepddg", "deepddg_scaled", "deepddg_outcome" + , "asa", "rsa" + , "rd_values", "kd_values" + , "log10_or_mychisq", "neglog_pval_fisher", "af")] + +cols_mcsm_na_df <- mcsm_na_df[, c("mutationinformation" + , "mcsm_na_affinity", "mcsm_na_scaled" + , "mcsm_na_outcome")] +# entire dynamut_df + +cols_dynamut2_df <- dynamut2_df[, c("mutationinformation" + , "ddg_dynamut2", "ddg_dynamut2_scaled" + , "ddg_dynamut2_outcome")] + +n_comb_cols = length(cols_mcsm_df) + length(cols_mcsm_na_df) + + length(dynamut_df) + length(cols_dynamut2_df); n_comb_cols + +i1<- intersect(names(cols_mcsm_df), names(cols_mcsm_na_df)) +i2<- intersect(names(dynamut_df), names(cols_dynamut2_df)) +merging_cols <- intersect(i1, i2) +cat("\nmerging_cols:", merging_cols) + +if (merging_cols == "mutationinformation") { + cat("\nStage 1: Found common col between dfs, checking values in it...") + c1 <- all(mcsm_f_snps[[merging_cols]]%in%cols_mcsm_df[[merging_cols]]) + c2 <- all(mcsm_f_snps[[merging_cols]]%in%cols_mcsm_na_df[[merging_cols]]) + c3 <- all(mcsm_f_snps[[merging_cols]]%in%dynamut_df[[merging_cols]]) + c4 <- all(mcsm_f_snps[[merging_cols]]%in%cols_dynamut2_df[[merging_cols]]) + cols_check <- c(c1, c2, c3, c4) + expected_cols = n_comb_cols - ( length(cols_check) - 1) + if (all(cols_check)){ + cat("\nStage 2: Proceeding with merging dfs:\n") + comb_df <- Reduce(inner_join, list(cols_mcsm_df + , cols_mcsm_na_df + , dynamut_df + , cols_dynamut2_df)) + comb_df_s = arrange(comb_df, position) + + # if ( nrow(comb_df_s) == nrow(mcsm_f_snps) && ncol(comb_df_s) == expected_cols) { + # cat("\Stage3, PASS: dfs merged sucessfully" + # , "\nnrow of merged_df: ", nrow(comb_df_s) + # , "\nncol of merged_df:", ncol(comb_df_s)) + # } + + } +} +#names(comb_df_s) +cat("\n!!!IT GOT TO HERE!!!!") +#======================================================================= +fact_cols = colnames(comb_df_s)[grepl( "_outcome|_info", colnames(comb_df_s) )] +fact_cols +lapply(comb_df_s[, fact_cols], class) +comb_df_s[, fact_cols] <- lapply(comb_df_s[, fact_cols], as.factor) + +if (any(lapply(comb_df_s[, fact_cols], class) == "character")){ + cat("\nChanging cols to factor") + comb_df_s[, fact_cols] <- lapply(comb_df_s[, fact_cols],as.factor) + if (all(lapply(comb_df_s[, fact_cols], class) == "factor")){ + cat("\nSuccessful: cols changed to factor") + } +} +lapply(comb_df_s[, fact_cols], class) + +#======================================================================= +table(comb_df_s$mutation_info) + + # further checks to make sure dr and other muts are indeed unique +dr_muts = comb_df_s[comb_df_s$mutation_info == dr_muts_col,] +dr_muts_names = unique(dr_muts$mutation) + +other_muts = comb_df_s[comb_df_s$mutation_info == other_muts_col,] +other_muts_names = unique(other_muts$mutation) + +if ( table(dr_muts_names%in%other_muts_names)[[1]] == length(dr_muts_names) && + table(other_muts_names%in%dr_muts_names)[[1]] == length(other_muts_names) ){ + cat("PASS: dr and other muts are indeed unique") +}else{ + cat("FAIL: dr and others muts are NOT unique!") + quit() +} + +# pretty display names i.e. labels to reduce major code duplication later +foo_cnames = data.frame(colnames(comb_df_s)) +names(foo_cnames) <- "old_name" + +stability_suffix <- paste0(delta_symbol, delta_symbol, "G") +flexibility_suffix <- paste0(delta_symbol, delta_symbol, "S") + +lig_dn = paste0("Ligand distance (", angstroms_symbol, ")"); lig_dn +duet_dn = paste0("DUET ", stability_suffix); duet_dn +foldx_dn = paste0("FoldX ", stability_suffix); foldx_dn +deepddg_dn = paste0("Deepddg " , stability_suffix); deepddg_dn +mcsm_na_dn = paste0("mCSM-NA affinity ", stability_suffix); mcsm_na_dn +dynamut_dn = paste0("Dynamut ", stability_suffix); dynamut_dn +dynamut2_dn = paste0("Dynamut2 " , stability_suffix); dynamut2_dn +encom_ddg_dn = paste0("EnCOM " , stability_suffix); encom_ddg_dn +encom_dds_dn = paste0("EnCOM " , flexibility_suffix ); encom_dds_dn +sdm_dn = paste0("SDM " , stability_suffix); sdm_dn +mcsm_dn = paste0("mCSM " , stability_suffix ); mcsm_dn + +# Change colnames of some columns using datatable +comb_df_sl = comb_df_s +names(comb_df_sl) + +setnames(comb_df_sl + , old = c("asa", "rsa", "rd_values", "kd_values" + , "log10_or_mychisq", "neglog_pval_fisher", "af" + , LigDist_colname + , "duet_scaled" + , "foldx_scaled" + , "deepddg_scaled" + , "mcsm_na_scaled" + , "ddg_dynamut_scaled" + , "ddg_dynamut2_scaled" + , "ddg_encom_scaled" + , "dds_encom_scaled" + , "ddg_sdm" + , "ddg_mcsm") + + , new = c("ASA", "RSA", "RD", "KD" + , "Log10 (OR)", "-Log (P)", "MAF" + , lig_dn + , duet_dn + , foldx_dn + , deepddg_dn + , mcsm_na_dn + , dynamut_dn + , dynamut2_dn + , encom_ddg_dn + , encom_dds_dn + , sdm_dn + , mcsm_dn) + ) + +foo_cnames <- cbind(foo_cnames, colnames(comb_df_sl)) + +# some more pretty labels +table(comb_df_sl$mutation_info) + +levels(comb_df_sl$mutation_info)[levels(comb_df_sl$mutation_info)==dr_muts_col] <- "DM" +levels(comb_df_sl$mutation_info)[levels(comb_df_sl$mutation_info)==other_muts_col] <- "OM" + +table(comb_df_sl$mutation_info) + +####################################################################### +#====================== +# Selecting dfs +# with appropriate cols +#======================= +static_cols_start = c("mutationinformation" + , "position" + , "mutation" + , "mutation_info") + +static_cols_end = c(lig_dn + , "ASA" + , "RSA" + , "RD" + , "KD") + +# ordering is important! + +######################################################################### +#============== +# DUET: LF +#============== +cols_to_select_duet = c(static_cols_start, c("duet_outcome", duet_dn), static_cols_end) +wf_duet = comb_df_sl[, cols_to_select_duet] + +#pivot_cols_ps = cols_to_select_ps[1:5]; pivot_cols_ps +pivot_cols_duet = cols_to_select_duet[1: (length(static_cols_start) + 1)]; pivot_cols_duet + +expected_rows_lf = nrow(wf_duet) * (length(wf_duet) - length(pivot_cols_duet)) +expected_rows_lf + +# LF data: duet +lf_duet = gather(wf_duet + , key = param_type + , value = param_value + , all_of(duet_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_duet) == expected_rows_lf){ + cat("\nPASS: long format data created for ", duet_dn) +}else{ + cat("\nFAIL: long format data could not be created for duet") + quit() +} + +############################################################################ +#============== +# FoldX: LF +#============== +cols_to_select_foldx= c(static_cols_start, c("foldx_outcome", foldx_dn), static_cols_end) +wf_foldx = comb_df_sl[, cols_to_select_foldx] + +pivot_cols_foldx = cols_to_select_foldx[1: (length(static_cols_start) + 1)]; pivot_cols_foldx + +expected_rows_lf = nrow(wf_foldx) * (length(wf_foldx) - length(pivot_cols_foldx)) +expected_rows_lf + +# LF data: duet +print("TESTXXXXXXXXXXXXXXXXXXXXX---------------------->>>>") +lf_foldx <<- gather(wf_foldx + , key = param_type + , value = param_value + , all_of(foldx_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_foldx) == expected_rows_lf){ + cat("\nPASS: long format data created for ", foldx_dn) +}else{ + cat("\nFAIL: long format data could not be created for duet") + quit() +} + +############################################################################ +#============== +# Deepddg: LF +#============== +cols_to_select_deepddg = c(static_cols_start, c("deepddg_outcome", deepddg_dn), static_cols_end) +wf_deepddg = comb_df_sl[, cols_to_select_deepddg] + +pivot_cols_deepddg = cols_to_select_deepddg[1: (length(static_cols_start) + 1)]; pivot_cols_deepddg + +expected_rows_lf = nrow(wf_deepddg) * (length(wf_deepddg) - length(pivot_cols_deepddg)) +expected_rows_lf + +# LF data: duet +lf_deepddg = gather(wf_deepddg + , key = param_type + , value = param_value + , all_of(deepddg_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_deepddg) == expected_rows_lf){ + cat("\nPASS: long format data created for ", deepddg_dn) +}else{ + cat("\nFAIL: long format data could not be created for duet") + quit() +} + +############################################################################ +#============== +# mCSM-NA: LF +#============== +cols_to_select_mcsm_na = c(static_cols_start, c("mcsm_na_outcome", mcsm_na_dn), static_cols_end) +wf_mcsm_na = comb_df_sl[, cols_to_select_mcsm_na] + +pivot_cols_mcsm_na = cols_to_select_mcsm_na[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm_na + +expected_rows_lf = nrow(wf_mcsm_na) * (length(wf_mcsm_na) - length(pivot_cols_mcsm_na)) +expected_rows_lf + +# LF data: duet +lf_mcsm_na = gather(wf_mcsm_na + , key = param_type + , value = param_value + , all_of(mcsm_na_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_mcsm_na) == expected_rows_lf){ + cat("\nPASS: long format data created for ", mcsm_na_dn) +}else{ + cat("\nFAIL: long format data could not be created for duet") + quit() +} + +############################################################################ +#============== +# Dynamut: LF +#============== +cols_to_select_dynamut = c(static_cols_start, c("ddg_dynamut_outcome", dynamut_dn), static_cols_end) +wf_dynamut = comb_df_sl[, cols_to_select_dynamut] + +pivot_cols_dynamut = cols_to_select_dynamut[1: (length(static_cols_start) + 1)]; pivot_cols_dynamut + +expected_rows_lf = nrow(wf_dynamut) * (length(wf_dynamut) - length(pivot_cols_dynamut)) +expected_rows_lf + +# LF data: duet +lf_dynamut = gather(wf_dynamut + , key = param_type + , value = param_value + , all_of(dynamut_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_dynamut) == expected_rows_lf){ + cat("\nPASS: long format data created for ", dynamut_dn) +}else{ + cat("\nFAIL: long format data could not be created for duet") + quit() +} + +############################################################################ +#============== +# Dynamut2: LF +#============== +cols_to_select_dynamut2 = c(static_cols_start, c("ddg_dynamut2_outcome", dynamut2_dn), static_cols_end) + +wf_dynamut2 = comb_df_sl[, cols_to_select_dynamut2] + +pivot_cols_dynamut2 = cols_to_select_dynamut2[1: (length(static_cols_start) + 1)]; pivot_cols_dynamut2 + +expected_rows_lf = nrow(wf_dynamut2) * (length(wf_dynamut2) - length(pivot_cols_dynamut2)) +expected_rows_lf + +# LF data: duet +lf_dynamut2 = gather(wf_dynamut2 + , key = param_type + , value = param_value + , all_of(dynamut2_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_dynamut2) == expected_rows_lf){ + cat("\nPASS: long format data created for ", dynamut2_dn) +}else{ + cat("\nFAIL: long format data could not be created for duet") + quit() +} + +############################################################################ +#============== +# EnCOM ddg: LF +#============== +cols_to_select_encomddg = c(static_cols_start, c("ddg_encom_outcome", encom_ddg_dn), static_cols_end) +wf_encomddg = comb_df_sl[, cols_to_select_encomddg] + +pivot_cols_encomddg = cols_to_select_encomddg[1: (length(static_cols_start) + 1)]; pivot_cols_encomddg + +expected_rows_lf = nrow(wf_encomddg ) * (length(wf_encomddg ) - length(pivot_cols_encomddg)) +expected_rows_lf + +# LF data: encomddg +lf_encomddg = gather(wf_encomddg + , key = param_type + , value = param_value + , all_of(encom_ddg_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_encomddg) == expected_rows_lf){ + cat("\nPASS: long format data created for ", encom_ddg_dn) +}else{ + cat("\nFAIL: long format data could not be created for duet") + quit() +} +############################################################################ +#============== +# EnCOM dds: LF +#============== +cols_to_select_encomdds = c(static_cols_start, c("dds_encom_outcome", encom_dds_dn), static_cols_end) +wf_encomdds = comb_df_sl[, cols_to_select_encomdds] + +pivot_cols_encomdds = cols_to_select_encomdds[1: (length(static_cols_start) + 1)]; pivot_cols_encomdds + +expected_rows_lf = nrow(wf_encomdds) * (length(wf_encomdds) - length(pivot_cols_encomdds)) +expected_rows_lf + +# LF data: encomddg +lf_encomdds = gather(wf_encomdds + , key = param_type + , value = param_value + , all_of(encom_dds_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_encomdds) == expected_rows_lf){ + cat("\nPASS: long format data created for", encom_dds_dn) +}else{ + cat("\nFAIL: long format data could not be created for duet") + quit() +} + +############################################################################ +#============== +# SDM: LF +#============== +cols_to_select_sdm = c(static_cols_start, c("ddg_sdm_outcome", sdm_dn), static_cols_end) +wf_sdm = comb_df_sl[, cols_to_select_sdm] + +pivot_cols_sdm = cols_to_select_sdm[1: (length(static_cols_start) + 1)]; pivot_cols_sdm + +expected_rows_lf = nrow(wf_sdm) * (length(wf_sdm) - length(pivot_cols_sdm)) +expected_rows_lf + +# LF data: encomddg +lf_sdm = gather(wf_sdm + , key = param_type + , value = param_value + , all_of(sdm_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_sdm) == expected_rows_lf){ + cat("\nPASS: long format data created for", sdm_dn) +}else{ + cat("\nFAIL: long format data could not be created for duet") + quit() +} + +############################################################################ +#============== +# mCSM: LF +#============== +cols_to_select_mcsm = c(static_cols_start, c("ddg_mcsm_outcome", mcsm_dn), static_cols_end) +wf_mcsm = comb_df_sl[, cols_to_select_mcsm] + +pivot_cols_mcsm = cols_to_select_mcsm[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm + +expected_rows_lf = nrow(wf_mcsm) * (length(wf_mcsm) - length(pivot_cols_mcsm)) +expected_rows_lf + +# LF data: encomddg +lf_mcsm = gather(wf_mcsm + , key = param_type + , value = param_value + , all_of(mcsm_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_mcsm) == expected_rows_lf){ + cat("\nPASS: long format data created for", mcsm_dn) +}else{ + cat("\nFAIL: long format data could not be created for duet") + quit() +} +############################################################################ + From 996d67b423bc4ee5033f53c78ca2f8be93a52ae0 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Mon, 13 Sep 2021 10:24:41 +0100 Subject: [PATCH 25/51] added pretty colnames to corr_data.R --- scripts/plotting/corr_data.R | 53 ++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/scripts/plotting/corr_data.R b/scripts/plotting/corr_data.R index d33efc5..3120763 100644 --- a/scripts/plotting/corr_data.R +++ b/scripts/plotting/corr_data.R @@ -32,6 +32,32 @@ corr_cols_select <- c("mutationinformation", drug, "mutation_info_labels" corr_df_m2 = merged_df2[,colnames(merged_df2)%in%corr_cols_select] +#----------------------- +# formatting: some cols +# Add pretty colnames +#----------------------- +corr_df_m2_f <- corr_df_m2 %>% + rename( + DUET = duet_stability_change + , 'mCSM-lig' = ligand_affinity_change + , FoldX = ddg_foldx + , DeepDDG = deepddg + , ASA = asa + , RSA = rsa + , KD = kd_values + , RD = rd_values + , MAF = af + , 'Log (OR)' = log10_or_mychisq + , '-Log (P)' = neglog_pval_fisher + , Dynamut = ddg_dynamut + , 'ENCoM-DDG'= ddg_encom + , mCSM = ddg_mcsm + , SDM = ddg_sdm + , 'DUET-d' = ddg_duet + , 'ENCoM-DDS'= dds_encom + , Dynamut2 = ddg_dynamut2 + , 'mCSM-NA' = mcsm_na_affinity ) + #=========================== # Corr data for plots: PS # short_df ps: ~merged_df3 @@ -53,6 +79,33 @@ if (nrow(corr_df_m3) == nrow(merged_df3) && nrow(merged_df3_comp) == check1) { , "\nGot: ", check1) } +#----------------------- +# formatting: some cols +# Add pretty colnames +#----------------------- +corr_df_m3_f <- corr_df_m3 %>% + rename( + DUET = duet_stability_change + , 'mCSM-lig' = ligand_affinity_change + , FoldX = ddg_foldx + , DeepDDG = deepddg + , ASA = asa + , RSA = rsa + , KD = kd_values + , RD = rd_values + , MAF = af + , 'Log (OR)' = log10_or_mychisq + , '-Log (P)' = neglog_pval_fisher + , Dynamut = ddg_dynamut + , 'ENCoM-DDG'= ddg_encom + , mCSM = ddg_mcsm + , SDM = ddg_sdm + , 'DUET-d' = ddg_duet + , 'ENCoM-DDS'= dds_encom + , Dynamut2 = ddg_dynamut2 + , 'mCSM-NA' = mcsm_na_affinity ) + +######################################################################## cat("\nCorr Data created:" , "\n===================================" , "\ncorr_df_m2: created from merged_df2" From b98977336cb1fec4223563188a539de4bf79ef85 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Tue, 14 Sep 2021 15:36:05 +0100 Subject: [PATCH 26/51] updated my_pairs_panel.R to make the dots coloured --- scripts/functions/my_pairs_panel.R | 44 ++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/scripts/functions/my_pairs_panel.R b/scripts/functions/my_pairs_panel.R index 0c73192..eb0268a 100644 --- a/scripts/functions/my_pairs_panel.R +++ b/scripts/functions/my_pairs_panel.R @@ -1,24 +1,40 @@ -my_corr_pairs <- function (corr_data){ +my_corr_pairs <- function (corr_data_all + , corr_data_range = 1:length(corr_data_all) + , corr_method = "spearman" # other options: "pearson" or "kendall" + , colour_categ_col = "mutation_info_labels" + , categ_colour = c("#E69F00", "#999999") + , density_show = F + , hist_col = "coral4" + , dot_size = 1.6 + , ats = 1 + , corr_lab_size = 1 + , corr_value_size = 1) + { - OutPlot_corr = pairs.panels(corr_data - , method = "spearman" # correlation method - , hist.col = "grey" ##00AFBB - , density = TRUE # show density plots - , ellipses = F # show correlation ellipses + corr_data_df = corr_data_all[corr_data_range] + my_bg = categ_colour[corr_data_all[[colour_categ_col]] ] + + OutPlot_corr = pairs.panels(corr_data_df + , method = corr_method + , hist.col = hist_col + , density = density_show + , ellipses = F + , smooth = F , stars = T , rug = F , breaks = "Sturges" , show.points = T - #, bg = c("#f8766d", "#00bfc4")[unclass(factor(corr_ps$duet_outcome))] # foldx colours are reveresed - #, pch = 21 # for bg - , jitter = T + #, bg = c("#f8766d", "#00bfc4")[unclass(factor(corr_data$duet_outcome))] # foldx colours are reveresed + , bg = my_bg + , pch = 21 , alpha = 1 - , cex = 1.8 - , cex.axis = 2 - , cex.labels = 3.5 - , cex.cor = 1 - , smooth = F) + , cex = dot_size + , cex.axis = ats + , cex.labels = corr_lab_size + , cex.cor = corr_value_size + ) return(OutPlot_corr) + #return (my_bg) } From bf432cd054485d63d9b1f9a722ef5f0893b8af2c Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Tue, 14 Sep 2021 18:20:12 +0100 Subject: [PATCH 27/51] more updates to pairs_panels to take colnames for plotting --- scripts/functions/my_pairs_panel.R | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/scripts/functions/my_pairs_panel.R b/scripts/functions/my_pairs_panel.R index eb0268a..8968111 100644 --- a/scripts/functions/my_pairs_panel.R +++ b/scripts/functions/my_pairs_panel.R @@ -1,17 +1,17 @@ my_corr_pairs <- function (corr_data_all - , corr_data_range = 1:length(corr_data_all) + , corr_cols = colnames(corr_data_all) , corr_method = "spearman" # other options: "pearson" or "kendall" , colour_categ_col = "mutation_info_labels" , categ_colour = c("#E69F00", "#999999") , density_show = F , hist_col = "coral4" , dot_size = 1.6 - , ats = 1 - , corr_lab_size = 1 + , ats = 1.5 + , corr_lab_size = 3 , corr_value_size = 1) { - corr_data_df = corr_data_all[corr_data_range] + corr_data_df = corr_data_all[corr_cols] my_bg = categ_colour[corr_data_all[[colour_categ_col]] ] OutPlot_corr = pairs.panels(corr_data_df @@ -38,6 +38,14 @@ my_corr_pairs <- function (corr_data_all } +c_plot <- my_corr_pairs(corrplot_df + + , dot_size = 1.6 + , ats = 1.5 + , corr_lab_size = 1.5 + , corr_value_size = 1) + + ###################################################################### my_pp = function (x, smooth = TRUE, scale = FALSE, density = TRUE, ellipses = TRUE, digits = 2, method = "pearson", pch = 20, lm = FALSE, cor = TRUE, From 449af7acf424dae644e3d40bce3be963066b65f1 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Wed, 15 Sep 2021 15:46:42 +0100 Subject: [PATCH 28/51] fixed pos_count cals in function by specifying dplyr and changed summarize to summarise --- scripts/functions/my_pairs_panel.R | 8 -------- scripts/functions/position_count_bp.R | 21 ++++++++++++++------- scripts/plotting/Header_TT.R | 11 ++++++++++- 3 files changed, 24 insertions(+), 16 deletions(-) diff --git a/scripts/functions/my_pairs_panel.R b/scripts/functions/my_pairs_panel.R index 8968111..808417f 100644 --- a/scripts/functions/my_pairs_panel.R +++ b/scripts/functions/my_pairs_panel.R @@ -38,14 +38,6 @@ my_corr_pairs <- function (corr_data_all } -c_plot <- my_corr_pairs(corrplot_df - - , dot_size = 1.6 - , ats = 1.5 - , corr_lab_size = 1.5 - , corr_value_size = 1) - - ###################################################################### my_pp = function (x, smooth = TRUE, scale = FALSE, density = TRUE, ellipses = TRUE, digits = 2, method = "pearson", pch = 20, lm = FALSE, cor = TRUE, diff --git a/scripts/functions/position_count_bp.R b/scripts/functions/position_count_bp.R index ce0767c..2907b4b 100755 --- a/scripts/functions/position_count_bp.R +++ b/scripts/functions/position_count_bp.R @@ -42,7 +42,9 @@ site_snp_count_bp <- function (plotdf , "\nNo. of cols:", ncol(plotdf) , "\nNow adding column: frequency of mutational positions")) - # adding snpcount for each position + #------------------------------------------- + # adding column: snpcount for each position + #------------------------------------------- setDT(plotdf)[, pos_count := .N, by = .(eval(parse(text = df_colname)))] cat("\nCumulative nssnp count\n" @@ -64,15 +66,20 @@ site_snp_count_bp <- function (plotdf cat(paste0("\nrevised df dimensions:" , "\nNo. of rows:", nrow(plotdf) , "\nNo. of cols:", ncol(plotdf))) - + + #------------------------------------------------------ + # creating df: average count of snpcount for each position + # created in earlier step + #------------------------------------------------------- # use group by on pos_count snpsBYpos_df <- plotdf %>% - group_by(eval(parse(text = df_colname))) %>% - summarize(snpsBYpos = mean(pos_count)) - - cat("\nnssnp count\n" - , table(snpsBYpos_df$snpsBYpos)) + dplyr::group_by(eval(parse(text = df_colname))) %>% + dplyr::summarise(snpsBYpos = mean(pos_count)) # changed from summarize! + cat("\nnssnp count per position\n" + , table(snpsBYpos_df$snpsBYpos) + , "\n") + # calculating total no. of sites associated with nsSNPs tot_sites = sum(table(snpsBYpos_df$snpsBYpos)) diff --git a/scripts/plotting/Header_TT.R b/scripts/plotting/Header_TT.R index 47599d3..2fa892c 100755 --- a/scripts/plotting/Header_TT.R +++ b/scripts/plotting/Header_TT.R @@ -6,7 +6,6 @@ ######################################################### #lib_loc = "/usr/local/lib/R/site-library") - require("getopt", quietly = TRUE) # cmd parse arguments if (!require("tidyverse")) { @@ -19,6 +18,11 @@ if (!require("shiny")) { library(shiny) } +if (!require("shinyBS")) { + install.packages("shinyBS", dependencies = TRUE) + library(shinyBS) +} + if (!require("gridExtra")) { install.packages("gridExtra", dependencies = TRUE) library(gridExtra) @@ -39,6 +43,11 @@ if (!require("ggridges")) { # library(dplyr) # } +if (!require ("DT")){ + install.packages("DT") + library(DT) +} + if (!require ("plyr")){ install.packages("plyr") library(plyr) From 7550efbd4c45c852af4ebd3fd56fa50e86641188 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Wed, 15 Sep 2021 19:29:09 +0100 Subject: [PATCH 29/51] added wideplot subcols generation within bp_subcolours.R to make it easier to call the whole thing as a function and use merged_df3 to generate plot without having to separately generate special data for it. Tested with real data on different stability params --- scripts/functions/bp_subcolours.R | 81 ++++++++++++++++++++++++++++- scripts/plotting/get_plotting_dfs.R | 2 +- 2 files changed, 80 insertions(+), 3 deletions(-) diff --git a/scripts/functions/bp_subcolours.R b/scripts/functions/bp_subcolours.R index a3cc403..3db4079 100755 --- a/scripts/functions/bp_subcolours.R +++ b/scripts/functions/bp_subcolours.R @@ -3,7 +3,7 @@ # LINK: https://stackoverflow.com/questions/49818271/stacked-barplot-with-colour-gradients-for-each-bar ######################################################### -ColourPalleteMulti <- function(df, group, subgroup){ +ColourPalleteMulti = function(df, group, subgroup){ # Find how many colour categories to create and the number of colours in each categories <- aggregate(as.formula(paste(subgroup, group, sep="~" )) @@ -24,4 +24,81 @@ ColourPalleteMulti <- function(df, group, subgroup){ , category.end[i]))(categories[i,2])})) return(colours) } -######################################################### \ No newline at end of file +######################################################################### + +bp_stability_hmap <- function(plotdf = merged_df3 + , xvar_colname = "position" + #, bar_col_colname = "group" + , stability_colname = "duet_scaled" + , stability_outcome_colname = "duet_outcome" + , p_title = "" # "Protein stability (DUET)" + , my_xaxls = 12 # x-axis label size + , my_yaxls = 20 # y-axis label size + , my_xaxts = 18 # x-axis text size + , my_yaxts = 20 # y-axis text size + , my_pts = 20 # plot-title size + , my_xlab = "Position" + , my_ylab = "No. of nsSNPs" +) +{ + + # order the df by position and ensure it is a factor + plotdf = plotdf[order(plotdf[[xvar_colname]]), ] + plotdf[[xvar_colname]] = factor(plotdf[[xvar_colname]]) + + #cat("\nSneak peak:\n") + head(data.frame( plotdf[[xvar_colname]], plotdf[[stability_colname]] ) ) + + # stability values isolated to help with generating column called: 'group' + my_grp = plotdf[[stability_colname]] + cat( "\nLength of nsSNPs:", length(my_grp) + , "\nLength of unique values for nsSNPs:", length(unique(my_grp)) ) + + # Add col: 'group' + plotdf$group = paste0(plotdf[[stability_outcome_colname]], "_", my_grp, sep = "") + + # check unique values in normalised data + cat("\nNo. of unique values in", stability_colname, "no rounding:" + , length(unique(plotdf[[stability_colname]]))) + + # Call the function to create the palette based on the group defined above + #subcols_ps + subcols_bp_hmap = ColourPalleteMulti(plotdf, stability_outcome_colname, stability_colname) + + cat("\nNo. of sub colours generated:", length(subcols_bp_hmap)) + + #------------------------------- + # Generate the subcols barplot + #------------------------------- + + #g = ggplot(plotdf, aes(x = factor(position, ordered = T))) + g = ggplot(plotdf, aes_string(x = xvar_colname + # , ordered = T) + )) + + + OutWidePlot = g + geom_bar(aes(fill = group) + , colour = "grey") + + + scale_fill_manual( values = subcols_bp_hmap + , guide = "none") + + + theme( axis.text.x = element_text(size = my_xaxls + , angle = 90 + , hjust = 1 + , vjust = 0.4) + , axis.text.y = element_text(size = my_yaxls + , angle = 0 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = my_xaxts) + , axis.title.y = element_text(size = my_yaxts ) + , plot.title = element_text(size = my_pts + , hjust = 0.5)) + + + labs(title = p_title + , x = my_xlab + , y = my_ylab) + + return(OutWidePlot) +} diff --git a/scripts/plotting/get_plotting_dfs.R b/scripts/plotting/get_plotting_dfs.R index c1ce5b2..a50b0a9 100755 --- a/scripts/plotting/get_plotting_dfs.R +++ b/scripts/plotting/get_plotting_dfs.R @@ -129,7 +129,7 @@ merged_df3_comp = all_plot_dfs[[4]] # Data for subcols barplot (~heatmap) #################################################################### -source("coloured_bp_data.R") +#source("coloured_bp_data.R") #################################################################### # Data for logoplots From 2ac5ec410eb78d2ceeb5ea690596bba354e716ed Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Wed, 15 Sep 2021 19:33:52 +0100 Subject: [PATCH 30/51] added test_bp_subcolours.R --- scripts/functions/tests/test_bp_subcolours.R | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 scripts/functions/tests/test_bp_subcolours.R diff --git a/scripts/functions/tests/test_bp_subcolours.R b/scripts/functions/tests/test_bp_subcolours.R new file mode 100644 index 0000000..83fca05 --- /dev/null +++ b/scripts/functions/tests/test_bp_subcolours.R @@ -0,0 +1,16 @@ +#!/usr/bin/env Rscript +source("~/git/Misc/shiny/myshiny/gid_data.R") +setwd("~/git/LSHTM_analysis/scripts/functions/") +source("bp_subcolours.R") + + +bp_stability_hmap(plotdf = merged_df3 + , stability_colname = "deepddg_scaled" + , stability_outcome_colname = "deepddg_outcome" + , p_title = "DeepDDG" ) + + +bp_stability_hmap(plotdf = merged_df3 + , stability_colname = "foldx_scaled" + , stability_outcome_colname = "foldx_outcome" + , p_title = "FoldX" ) From f0e66b2f7b3e489e13a65cad15046fb292856717 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Wed, 15 Sep 2021 19:34:24 +0100 Subject: [PATCH 31/51] added the scratch script as _v2 to play while repurposing bp_subcolours.R --- .../functions/redundant/bp_subcolours_v2.R | 104 ++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 scripts/functions/redundant/bp_subcolours_v2.R diff --git a/scripts/functions/redundant/bp_subcolours_v2.R b/scripts/functions/redundant/bp_subcolours_v2.R new file mode 100644 index 0000000..a049ba2 --- /dev/null +++ b/scripts/functions/redundant/bp_subcolours_v2.R @@ -0,0 +1,104 @@ +######################################################### +# 1b: Define function: coloured barplot by subgroup +# LINK: https://stackoverflow.com/questions/49818271/stacked-barplot-with-colour-gradients-for-each-bar +######################################################### + +ColourPalleteMulti = function(df, group, subgroup){ + + # Find how many colour categories to create and the number of colours in each + categories <- aggregate(as.formula(paste(subgroup, group, sep="~" )) + , df + , function(x) length(unique(x))) + # return(categories) } + + category.start <- (scales::hue_pal(l = 100)(nrow(categories))) # Set the top of the colour pallete + + category.end <- (scales::hue_pal(l = 40)(nrow(categories))) # set the bottom + + #return(category.start); return(category.end)} + + # Build Colour pallette + colours <- unlist(lapply(1:nrow(categories), + function(i){ + colorRampPalette(colors = c(category.start[i] + , category.end[i]))(categories[i,2])})) + return(colours) +} +######################################################################### + +bp_stability_hmap <- function(plotdf = merged_df3 + , xvar_colname = "position" + #, bar_col_colname = "group" + , stability_colname = "duet_scaled" + , stability_outcome_colname = "duet_outcome" + , p_title = "" # "Protein stability (DUET)" + , my_xaxls = 12 # x-axis label size + , my_yaxls = 20 # y-axis label size + , my_xaxts = 18 # x-axis text size + , my_yaxts = 20 # y-axis text size + , my_pts = 20 # plot-title size + , my_xlab = "Position" + , my_ylab = "No. of nsSNPs" + ) +{ + + # order the df by position and ensure it is a factor + plotdf = plotdf[order(plotdf[[xvar_colname]]), ] + plotdf[[xvar_colname]] = factor(plotdf[[xvar_colname]]) + + #cat("\nSneak peak:\n") + head(data.frame( plotdf[[xvar_colname]], plotdf[[stability_colname]] ) ) + + # stability values isolated to help with generating column called: 'group' + my_grp = plotdf[[stability_colname]] + cat( "\nLength of nsSNPs:", length(my_grp) + , "\nLength of unique values for nsSNPs:", length(unique(my_grp)) ) + + # Add col: 'group' + plotdf$group = paste0(plotdf[[stability_outcome_colname]], "_", my_grp, sep = "") + + # check unique values in normalised data + cat("\nNo. of unique values in", stability_colname, "no rounding:" + , length(unique(plotdf[[stability_colname]]))) + + # Call the function to create the palette based on the group defined above + #subcols_ps + subcols_bp_hmap = ColourPalleteMulti(plotdf, stability_outcome_colname, stability_colname) + + cat("\nNo. of sub colours generated:", length(subcols_bp_hmap)) + + #------------------------------- + # Generate the subcols barplot + #------------------------------- + + #g = ggplot(plotdf, aes(x = factor(position, ordered = T))) + g = ggplot(plotdf, aes_string(x = xvar_colname + # , ordered = T) + )) + + + OutWidePlot = g + geom_bar(aes(fill = group) + , colour = "grey") + + + scale_fill_manual( values = subcols_bp_hmap + , guide = "none") + + + theme( axis.text.x = element_text(size = my_xaxls + , angle = 90 + , hjust = 1 + , vjust = 0.4) + , axis.text.y = element_text(size = my_yaxls + , angle = 0 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = my_xaxts) + , axis.title.y = element_text(size = my_yaxts ) + , plot.title = element_text(size = my_pts + , hjust = 0.5)) + + + labs(title = p_title + , x = my_xlab + , y = my_ylab) + + return(OutWidePlot) +} \ No newline at end of file From 96e6e8db5da59c6da0aabe70cf9679cf7c4d7aa1 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Wed, 15 Sep 2021 19:37:39 +0100 Subject: [PATCH 32/51] saving work and tidying script --- scripts/functions/bp_subcolours.R | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/scripts/functions/bp_subcolours.R b/scripts/functions/bp_subcolours.R index 3db4079..91a8914 100755 --- a/scripts/functions/bp_subcolours.R +++ b/scripts/functions/bp_subcolours.R @@ -26,11 +26,18 @@ ColourPalleteMulti = function(df, group, subgroup){ } ######################################################################### +######################## +# Generate bp with +# colour palette derived +# from the data using +# above function +######################### + bp_stability_hmap <- function(plotdf = merged_df3 , xvar_colname = "position" #, bar_col_colname = "group" - , stability_colname = "duet_scaled" - , stability_outcome_colname = "duet_outcome" + , stability_colname = "" + , stability_outcome_colname = "" , p_title = "" # "Protein stability (DUET)" , my_xaxls = 12 # x-axis label size , my_yaxls = 20 # y-axis label size From 1d16c6848ec6dafd81ab26064aac7389e21dc533 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Wed, 15 Sep 2021 19:42:08 +0100 Subject: [PATCH 33/51] moved coloured_bp_data.R to redundant in light of updated function and reflected this in notes withing get_plotting_dfs.R --- scripts/plotting/get_plotting_dfs.R | 5 +++++ scripts/plotting/{ => redundant}/coloured_bp_data.R | 0 2 files changed, 5 insertions(+) rename scripts/plotting/{ => redundant}/coloured_bp_data.R (100%) diff --git a/scripts/plotting/get_plotting_dfs.R b/scripts/plotting/get_plotting_dfs.R index a50b0a9..ec67a49 100755 --- a/scripts/plotting/get_plotting_dfs.R +++ b/scripts/plotting/get_plotting_dfs.R @@ -124,12 +124,17 @@ merged_df3_comp = all_plot_dfs[[4]] #################################################################### #source("other_dfs_data.R") +# Fixed this at source i.e python script +# Moved: "other_dfs_data.R" to redundant/ #################################################################### # Data for subcols barplot (~heatmap) #################################################################### #source("coloured_bp_data.R") +# Repurposed function so that params can be passed instead to generate +# data required for plotting. +# Moved "coloured_bp_data.R" to redundant/ #################################################################### # Data for logoplots diff --git a/scripts/plotting/coloured_bp_data.R b/scripts/plotting/redundant/coloured_bp_data.R similarity index 100% rename from scripts/plotting/coloured_bp_data.R rename to scripts/plotting/redundant/coloured_bp_data.R From 746889b07538a23dff34761f54097752d5986ade Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Wed, 15 Sep 2021 19:48:56 +0100 Subject: [PATCH 34/51] saving work for the day after massive repurpose --- scripts/plotting/get_plotting_dfs.R | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/scripts/plotting/get_plotting_dfs.R b/scripts/plotting/get_plotting_dfs.R index ec67a49..d5d1535 100755 --- a/scripts/plotting/get_plotting_dfs.R +++ b/scripts/plotting/get_plotting_dfs.R @@ -1,26 +1,21 @@ #!/usr/bin/env Rscript + ######################################################### # TASK: Get formatted data for plots -#======================================================================= +######################################################### # working dir and loading libraries getwd() setwd("~/git/LSHTM_analysis/scripts/plotting") getwd() source("Header_TT.R") -# source("../functions/my_pairs_panel.R") # with lower panel turned off -# source("../functions/plotting_globals.R") -# source("../functions/plotting_data.R") -# source("../functions/combining_dfs_plotting.R") -# source("../functions/bp_subcolours.R") #******************** # cmd args passed # in from other scripts # to call this #******************** -#drug = 'streptomycin' -#gene = 'gid' + #==================== # variables for lig #==================== From 56600ac3f8bc9cbe0bdb98a6407665f36e75707f Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Thu, 16 Sep 2021 10:05:28 +0100 Subject: [PATCH 35/51] added config/ with drug gene names --- config/gid.R | 2 ++ scripts/functions/tests/test_bp_subcolours.R | 4 ++++ 2 files changed, 6 insertions(+) create mode 100644 config/gid.R diff --git a/config/gid.R b/config/gid.R new file mode 100644 index 0000000..226af91 --- /dev/null +++ b/config/gid.R @@ -0,0 +1,2 @@ +gene = "gid" +drug = "streptomycin" diff --git a/scripts/functions/tests/test_bp_subcolours.R b/scripts/functions/tests/test_bp_subcolours.R index 83fca05..2156e49 100644 --- a/scripts/functions/tests/test_bp_subcolours.R +++ b/scripts/functions/tests/test_bp_subcolours.R @@ -9,6 +9,10 @@ bp_stability_hmap(plotdf = merged_df3 , stability_outcome_colname = "deepddg_outcome" , p_title = "DeepDDG" ) +bp_stability_hmap(plotdf = merged_df3 + , stability_colname = "ddg_dynamut2_scaled" + , stability_outcome_colname = "ddg_dynamut2_outcome" + , p_title = "Dynamut2" ) bp_stability_hmap(plotdf = merged_df3 , stability_colname = "foldx_scaled" From cb5d7aa5ab7bcca24f365e67d218afbc4384990d Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Thu, 16 Sep 2021 10:59:55 +0100 Subject: [PATCH 36/51] corrected foldx_outcome classification in combining_dfs.py as positive are Destabilising and neg as Stabilising --- scripts/combining_dfs.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/scripts/combining_dfs.py b/scripts/combining_dfs.py index faa9677..9331edd 100755 --- a/scripts/combining_dfs.py +++ b/scripts/combining_dfs.py @@ -190,6 +190,10 @@ foldx_max = foldx_df['ddg_foldx'].max() foldx_min foldx_max +# quick check +len(foldx_df.loc[foldx_df['ddg_foldx'] >= 0]) +len(foldx_df.loc[foldx_df['ddg_foldx'] < 0]) + foldx_scale = lambda x : x/abs(foldx_min) if x < 0 else (x/foldx_max if x >= 0 else 'failed') foldx_df['foldx_scaled'] = foldx_df['ddg_foldx'].apply(foldx_scale) @@ -216,13 +220,16 @@ else: , '\n======================================================') #------------------------- -# foldx outcome category +# foldx outcome category: +# Remember, its inverse +# +ve: Destabilising +# -ve: Stabilising #-------------------------- -foldx_df['foldx_outcome'] = foldx_df['ddg_foldx'].apply(lambda x: 'Stabilising' if x >= 0 else 'Destabilising') +foldx_df['foldx_outcome'] = foldx_df['ddg_foldx'].apply(lambda x: 'Destabilising' if x >= 0 else 'Stabilising') foldx_df[foldx_df['ddg_foldx']>=0].count() foc = foldx_df['foldx_outcome'].value_counts() -if foc['Stabilising'] == foldx_pos and foc['Stabilising'] == foldx_pos2: +if foc['Destabilising'] == foldx_pos and foc['Destabilising'] == foldx_pos2: print('\nPASS: Foldx outcome category created') else: print('\nFAIL: Foldx outcome category could NOT be created' From e8734b1c4be4b9d99184b23567337703a0a90d79 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Thu, 16 Sep 2021 12:43:36 +0100 Subject: [PATCH 37/51] sorted merged_df2 and consequently others by position in combining_dfs_plotting.R --- scripts/functions/combining_dfs_plotting.R | 8 ++- scripts/functions/tests/test_bp_subcolours.R | 70 ++++++++++++++++++-- 2 files changed, 71 insertions(+), 7 deletions(-) diff --git a/scripts/functions/combining_dfs_plotting.R b/scripts/functions/combining_dfs_plotting.R index 848face..107c114 100644 --- a/scripts/functions/combining_dfs_plotting.R +++ b/scripts/functions/combining_dfs_plotting.R @@ -153,7 +153,13 @@ combining_dfs_plotting <- function( my_df_u quit() } - # Quick formatting: pretty labels + # Quick formatting: ordering df and pretty labels + + #------------------------------ + # sorting by column: position + #------------------------------ + merged_df2 = merged_df2[order(merged_df2$position), ] + #----------------------- # mutation_info_labels #----------------------- diff --git a/scripts/functions/tests/test_bp_subcolours.R b/scripts/functions/tests/test_bp_subcolours.R index 2156e49..8866ffe 100644 --- a/scripts/functions/tests/test_bp_subcolours.R +++ b/scripts/functions/tests/test_bp_subcolours.R @@ -1,20 +1,78 @@ #!/usr/bin/env Rscript -source("~/git/Misc/shiny/myshiny/gid_data.R") -setwd("~/git/LSHTM_analysis/scripts/functions/") -source("bp_subcolours.R") +#source("~/git/Misc/shiny/myshiny/gid_data.R") +source("~/git/LSHTM_analysis/config/gid.R") +source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") +source("~/git/LSHTM_analysis/scripts/functions/bp_subcolours.R") +# p1 +bp_stability_hmap(plotdf = merged_df3 + , stability_colname = "duet_scaled" + , stability_outcome_colname = "duet_outcome" + , p_title = "DUET" ) + +# p2 +bp_stability_hmap(plotdf = merged_df3 + , stability_colname = "foldx_scaled" + , stability_outcome_colname = "foldx_outcome" + , p_title = "FoldX" ) + +# p3 bp_stability_hmap(plotdf = merged_df3 , stability_colname = "deepddg_scaled" , stability_outcome_colname = "deepddg_outcome" , p_title = "DeepDDG" ) +# p4 bp_stability_hmap(plotdf = merged_df3 , stability_colname = "ddg_dynamut2_scaled" , stability_outcome_colname = "ddg_dynamut2_outcome" , p_title = "Dynamut2" ) +# p5 bp_stability_hmap(plotdf = merged_df3 - , stability_colname = "foldx_scaled" - , stability_outcome_colname = "foldx_outcome" - , p_title = "FoldX" ) + , stability_colname = "mcsm_na_scaled" + , stability_outcome_colname = "mcsm_na_outcome" + , p_title = "mCSM-NA" ) + +# p6 +bp_stability_hmap(plotdf = merged_df3 + , stability_colname = "ddg_dynamut_scaled" + , stability_outcome_colname = "ddg_dynamut_outcome" + , p_title = "Dynamut" ) + +# p7 +bp_stability_hmap(plotdf = merged_df3 + , stability_colname = "ddg_mcsm_scaled" + , stability_outcome_colname = "ddg_mcsm_outcome" + , p_title = "mCSM" ) + +# p8 +bp_stability_hmap(plotdf = merged_df3 + , stability_colname = "ddg_duet_scaled" + , stability_outcome_colname = "ddg_duet_outcome" + , p_title = "DUET-d" ) + +# p9 +bp_stability_hmap(plotdf = merged_df3 + , stability_colname = "ddg_sdm_scaled" + , stability_outcome_colname = "ddg_sdm_outcome" + , p_title = "SDM" ) + +# p10 +bp_stability_hmap(plotdf = merged_df3 + , stability_colname = "ddg_encom_scaled" + , stability_outcome_colname = "ddg_encom_outcome" + , p_title = "ENCoM-Stability" ) + +# p11 +bp_stability_hmap(plotdf = merged_df3 + , stability_colname = "dds_encom_scaled" + , stability_outcome_colname = "dds_encom_outcome" + , p_title = "ENCoM-Flexibility" ) + +# p12 +bp_stability_hmap(plotdf = merged_df3 + , stability_colname = "affinity_scaled" + , stability_outcome_colname = "ligand_outcome" + , p_title = "mCSM-lig" ) From 51aa3217928b17159a10a6ef5bf5cb33d396f8d8 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Thu, 16 Sep 2021 12:44:42 +0100 Subject: [PATCH 38/51] sorting out bp_subcolours in interaction --- .../functions/tests/test_bp_subcolours_i.R | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 scripts/functions/tests/test_bp_subcolours_i.R diff --git a/scripts/functions/tests/test_bp_subcolours_i.R b/scripts/functions/tests/test_bp_subcolours_i.R new file mode 100644 index 0000000..d8c1b42 --- /dev/null +++ b/scripts/functions/tests/test_bp_subcolours_i.R @@ -0,0 +1,59 @@ +#!/usr/bin/env Rscript +#source("~/git/Misc/shiny/myshiny/gid_data.R") + +source("~/git/LSHTM_analysis/config/gid.R") +source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") +source("~/git/LSHTM_analysis/scripts/functions/bp_subcolours.R") + +# p1 +bp_stability_hmap(plotdf = merged_df3 + , stability_colname = "duet_scaled" + , stability_outcome_colname = "duet_outcome" + , p_title = "DUET" ) + +# p2 +bp_stability_hmap(plotdf = merged_df3 + , stability_colname = "foldx_scaled" + , stability_outcome_colname = "foldx_outcome" + , p_title = "FoldX" ) + +# p3 +bp_stability_hmap(plotdf = merged_df3 + , stability_colname = "deepddg_scaled" + , stability_outcome_colname = "deepddg_outcome" + , p_title = "DeepDDG" ) + +################################################## + +merged_df3_f = merged_df3 + +setDT(merged_df3_f)[, pos_count := .N, by = position] + +################################################## +ui <- basicPage( + plotOutput("plot1", click = "plot_click"), + verbatimTextOutput("info") +) + +server <- function(input, output) { + output$plot1 <- renderPlot({ + + #plot(mtcars$wt, mtcars$mpg) + bp_stability_hmap(plotdf = merged_df3_f + , xvar_colname = "position" + , stability_colname = "foldx_scaled" + , stability_outcome_colname = "foldx_outcome" + , p_title = "FoldX" ) + + }) + + output$info <- renderPrint({ + # With base graphics, need to tell it what the x and y variables are. + nearPoints(merged_df3_f, input$plot_click + , xvar = "position" + , yvar = "pos_count" + ) + }) +} + +shinyApp(ui, server) \ No newline at end of file From e2d7a6567e5e0cb196deb6291d6d57fe5af513df Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Thu, 16 Sep 2021 18:59:02 +0100 Subject: [PATCH 39/51] minor bug fixes to allow i_graps for stability to render correctly --- scripts/functions/stability_count_bp.R | 35 ++++++++++++++------------ 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/scripts/functions/stability_count_bp.R b/scripts/functions/stability_count_bp.R index 8f7d9ed..e5ed684 100644 --- a/scripts/functions/stability_count_bp.R +++ b/scripts/functions/stability_count_bp.R @@ -15,39 +15,42 @@ theme_set(theme_grey()) ## ...opt args #========================================================== stability_count_bp <- function(plotdf - , df_colname - , leg_title = "Legend title" - , axis_text_size = 25 - , axis_label_size = 22 - , leg_text_size = 20 - , leg_title_size = 22 + , df_colname = "" + , leg_title = "Legend Title" + , ats = 25 # axis text size + , als = 22 # axis label size + , lts = 20 # legend text size + , ltis = 22 # label title size + , geom_ls = 10 # geom_label size , yaxis_title = "Number of nsSNPs" , bp_plot_title = "" , label_categories = c("Destabilising", "Stabilising") , title_colour = "chocolate4" , subtitle_text = NULL - , subtitle_size = 20 + , sts = 20 , subtitle_colour = "pink" #, leg_position = c(0.73,0.8) # within plot area , leg_position = "top"){ - OutPlot_count = ggplot(plotdf, aes(x = eval(parse(text = df_colname)))) + +# OutPlot_count = ggplot(plotdf, aes(x = eval(parse(text = df_colname)))) + + OutPlot_count = ggplot(plotdf, aes_string(x = df_colname)) + geom_bar(aes(fill = eval(parse(text = df_colname))), show.legend = TRUE) + geom_label(stat = "count" , aes(label = ..count..) , color = "black" , show.legend = FALSE - , size = 10) + + , size = geom_ls) + theme(axis.text.x = element_blank() , axis.title.x = element_blank() - , axis.title.y = element_text(size = axis_label_size) - , axis.text.y = element_text(size = axis_text_size) + , axis.title.y = element_text(size = als) + , axis.text.y = element_text(size = ats) , legend.position = leg_position - , legend.text = element_text(size = leg_text_size) - , legend.title = element_text(size = leg_title_size) - , plot.title = element_text(size = axis_label_size - , colour = title_colour) - , plot.subtitle = element_text(size = subtitle_size + , legend.text = element_text(size = lts) + , legend.title = element_text(size = ltis) + , plot.title = element_text(size = als + , colour = title_colour + , hjust = 0.5) + , plot.subtitle = element_text(size = sts , hjust = 0.5 , colour = subtitle_colour)) + labs(title = bp_plot_title From e115c3636c61c2f34f9dd414f92d8a6e762c3d95 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Fri, 17 Sep 2021 13:33:19 +0100 Subject: [PATCH 40/51] fixed lf_bp function with aes_string and reformulate --- scripts/functions/lf_bp.R | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/scripts/functions/lf_bp.R b/scripts/functions/lf_bp.R index 608247d..71b0472 100644 --- a/scripts/functions/lf_bp.R +++ b/scripts/functions/lf_bp.R @@ -29,11 +29,17 @@ lf_bp <- function(lf_df , stat_method = "wilcox.test" , my_paired = FALSE , stat_label = c("p.format", "p.signif") ){ - - p1 <- ggplot(lf_df, aes(x = eval(parse(text = x_grp)) - , y = eval(parse(text = y_var)) )) + + + fwv = as.formula(paste0("~", facet_var)) + p1 <- ggplot(lf_df, aes_string(x = x_grp, y = y_var)) + - facet_wrap(~ eval(parse(text = facet_var)) + #fwv = eval(parse(text = facet_var)) + # facet_wrap(~ fwv + # , nrow = n_facet_row + # , scales = y_scales) + + # + # fwv = as.formula(paste0("~", facet_var)) + facet_wrap( fwv , nrow = n_facet_row , scales = y_scales) + @@ -73,7 +79,7 @@ lf_bp <- function(lf_df , cex = 0.8 , aes(colour = factor(eval(parse(text = colour_categ))) )) } - + # Add foramtting to graph OutPlot = p2 + theme(axis.text.x = element_text(size = my_ats) , axis.text.y = element_text(size = my_ats From 5cd6c300a709aaefad2a6d397a1a905067868854 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Fri, 17 Sep 2021 13:35:48 +0100 Subject: [PATCH 41/51] saving minor update to function fix --- scripts/functions/lf_bp.R | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/scripts/functions/lf_bp.R b/scripts/functions/lf_bp.R index 71b0472..675658e 100644 --- a/scripts/functions/lf_bp.R +++ b/scripts/functions/lf_bp.R @@ -31,14 +31,10 @@ lf_bp <- function(lf_df , stat_label = c("p.format", "p.signif") ){ fwv = as.formula(paste0("~", facet_var)) + #fwv = reformulate(facet_var) + p1 <- ggplot(lf_df, aes_string(x = x_grp, y = y_var)) + - #fwv = eval(parse(text = facet_var)) - # facet_wrap(~ fwv - # , nrow = n_facet_row - # , scales = y_scales) + - # - # fwv = as.formula(paste0("~", facet_var)) facet_wrap( fwv , nrow = n_facet_row , scales = y_scales) + From daa3556ede6414c03ab1fd0009800026daaceeb4 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Mon, 20 Sep 2021 16:12:45 +0100 Subject: [PATCH 42/51] split csv for isoniazid --- dynamut/split_csv.sh | 1 + mcsm/run_mcsm.py | 2 +- my_header.R | 107 ++++++++++++++++++++++++++++++++----------- 3 files changed, 82 insertions(+), 28 deletions(-) diff --git a/dynamut/split_csv.sh b/dynamut/split_csv.sh index 17c1a03..1f7a793 100755 --- a/dynamut/split_csv.sh +++ b/dynamut/split_csv.sh @@ -19,5 +19,6 @@ split ../../${INFILE} -l ${CHUNK} -d snp_batch_ #~/git/LSHTM_analysis/dynamut/split_csv.sh gid_mcsm_formatted_snps.csv snp_batches 50 #~/git/LSHTM_analysis/dynamut/split_csv.sh embb_mcsm_formatted_snps.csv snp_batches 50 #~/git/LSHTM_analysis/dynamut/split_csv.sh pnca_mcsm_formatted_snps.csv snp_batches 50 +#~/git/LSHTM_analysis/dynamut/split_csv.sh katg_mcsm_formatted_snps.csv snp_batches 50 #Date: 20/09/2021 # add .txt to the files diff --git a/mcsm/run_mcsm.py b/mcsm/run_mcsm.py index 7e38543..da621f5 100755 --- a/mcsm/run_mcsm.py +++ b/mcsm/run_mcsm.py @@ -104,7 +104,7 @@ if mutation_filename: in_filename_snps = mutation_filename else: in_filename_snps = gene.lower() + '_mcsm_formatted_snps.csv' - + infile_snps = outdir + '/' + in_filename_snps #======= diff --git a/my_header.R b/my_header.R index 5009f29..2fa892c 100644 --- a/my_header.R +++ b/my_header.R @@ -1,21 +1,31 @@ ######################################################### -### A) Installing and loading required packages +# A) Installing and loading required packages +# B) My functions +######################################################### + ######################################################### #lib_loc = "/usr/local/lib/R/site-library") -#if (!require("gplots")) { -# install.packages("gplots", dependencies = TRUE) -# library(gplots) -#} +require("getopt", quietly = TRUE) # cmd parse arguments -#if (!require("tidyverse")) { -# install.packages("tidyverse", dependencies = TRUE) -# library(tidyverse) -#} +if (!require("tidyverse")) { + install.packages("tidyverse", dependencies = TRUE) + library(tidyverse) +} -if (!require("ggplot2")) { - install.packages("ggplot2", dependencies = TRUE) - library(ggplot2) +if (!require("shiny")) { + install.packages("shiny", dependencies = TRUE) + library(shiny) +} + +if (!require("shinyBS")) { + install.packages("shinyBS", dependencies = TRUE) + library(shinyBS) +} + +if (!require("gridExtra")) { + install.packages("gridExtra", dependencies = TRUE) + library(gridExtra) } if (!require("ggridges")) { @@ -23,6 +33,35 @@ if (!require("ggridges")) { library(ggridges) } +# if (!require("ggplot2")) { +# install.packages("ggplot2", dependencies = TRUE) +# library(ggplot2) +# } + +# if (!require ("dplyr")){ +# install.packages("dplyr") +# library(dplyr) +# } + +if (!require ("DT")){ + install.packages("DT") + library(DT) +} + +if (!require ("plyr")){ + install.packages("plyr") + library(plyr) + } + +# Install +#if(!require(devtools)) install.packages("devtools") +#devtools::install_github("kassambara/ggcorrplot") + +if (!require ("ggbeeswarm")){ + install.packages("ggbeeswarm") + library(ggbeeswarm) +} + if (!require("plotly")) { install.packages("plotly", dependencies = TRUE) library(plotly) @@ -103,11 +142,6 @@ if (!require ("psych")){ library(psych) } -if (!require ("dplyr")){ - install.packages("dplyr") - library(dplyr) -} - if (!require ("compare")){ install.packages("compare") library(compare) @@ -118,18 +152,37 @@ if (!require ("arsenal")){ library(arsenal) } +if(!require(ggseqlogo)){ + install.packages("ggseqlogo") + library(ggseqlogo) +} -####TIDYVERSE -# Install -#if(!require(devtools)) install.packages("devtools") -#devtools::install_github("kassambara/ggcorrplot") - -#library(ggcorrplot) - - -###for PDB files -#install.packages("bio3d") +# for PDB files if(!require(bio3d)){ install.packages("bio3d") library(bio3d) } + +library(protr) +if(!require(protr)){ + install.packages("protr") + library(protr) +} + +#if (!requireNamespace("BiocManager", quietly = TRUE)) +# install.packages("BiocManager") + +#BiocManager::install("Logolas") +library("Logolas") + + +#################################### +# Load all my functions: +# only works if tidyverse is loaded +# hence included it here! +#################################### + +func_path = "~/git/LSHTM_analysis/scripts/functions/" +source_files <- list.files(func_path, "\\.R$") # locate all .R files +map(paste0(func_path, source_files), source) # source all your R scripts! + From d443ecea6b6471e4fcfcc31c771d4857dce58af0 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Mon, 20 Sep 2021 16:13:15 +0100 Subject: [PATCH 43/51] added separate script for splitting csv after adding chain ID. saves lots of post processing --- dynamut/split_csv_chain.sh | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100755 dynamut/split_csv_chain.sh diff --git a/dynamut/split_csv_chain.sh b/dynamut/split_csv_chain.sh new file mode 100755 index 0000000..2526b3f --- /dev/null +++ b/dynamut/split_csv_chain.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +# FIXME: This is written for expediency to kickstart running dynamut, mcsm-PPI2 (batch pf 50) and mCSM-NA (batch of 20) + +# Usage: ~/git/LSHTM_analysis/dynamut/split_csv.sh +# copy your snp file to split into the dynamut dir +# use sed to add chain ID to snp file and then split to avoid post processing + +INFILE=$1 +OUTDIR=$2 +CHUNK=$3 + +mkdir -p ${OUTDIR}/${CHUNK}/chain_added +cd ${OUTDIR}/${CHUNK}/chain_added + +# makes the 2 dirs, hence ../.. +split ../../../${INFILE} -l ${CHUNK} -d snp_batch_ + +# use case +#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh katg_mcsm_formatted_snps.csv snp_batches 50 #Date: 20/09/2021 + +# add .txt to the files From 93a91518e1f46980fb89eb4fd5cca98547d36d50 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Wed, 29 Sep 2021 18:24:06 +0100 Subject: [PATCH 44/51] fix runFoldx so that it looks for a missing rotabase.txt in the process_dir and also print the foldx command that will be run --- foldx/runFoldx.py | 70 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 52 insertions(+), 18 deletions(-) diff --git a/foldx/runFoldx.py b/foldx/runFoldx.py index 8d9358b..12e00c9 100755 --- a/foldx/runFoldx.py +++ b/foldx/runFoldx.py @@ -41,7 +41,7 @@ arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By arg_parser.add_argument('-p', '--process_dir', help = 'Temp processing dir for running foldX. By default, it assmes homedir + + processing. Make sure it is somewhere with LOTS of storage as it writes all output!') #FIXME arg_parser.add_argument('-P', '--pdb_file', help = 'PDB File to process. By default, it assmumes a file called _complex.pdb in input_dir') -arg_parser.add_argument('-m', '--mutation_file', help = 'Mutation list. By default, assumes a file called _mcsm_snps.csv exists') +arg_parser.add_argument('-m', '--mutation_file', help = 'Mutation list. By default, assumes a file called _mcsm_formatted_snps.csv exists') # FIXME: Doesn't work with 2 chains yet! arg_parser.add_argument('-c1', '--chain1', help = 'Chain1 ID', default = 'A') # case sensitive @@ -148,6 +148,16 @@ print('Arguments being passed:' , '\noutput file:', outfile_foldx , '\n=============================================================') + +# make sure rotabase.txt exists in the process_dir +rotabase_file = process_dir + '/' + 'rotabase.txt' + +if Path(rotabase_file).is_file(): + print(f'rotabase file: {rotabase_file} exists') +else: + print(f'ERROR: rotabase file: {rotabase_file} does not exist. Please download it and put it in {process_dir}') + sys.exit() + #### Delay for 10 seconds to check the params #### print('Sleeping for 10 seconds to give you time to cancel') time.sleep(10) @@ -235,6 +245,13 @@ def main(): nmuts = len(mutlist) print(nmuts) print(mutlist) + print('start') + #subprocess.check_output(['bash','repairPDB.sh', pdbname, process_dir]) + print('\033[95mSTAGE: repair PDB\033[0m') + print('EXECUTING: repairPDB.sh %s %s %s' % (indir, actual_pdb_filename, process_dir)) + #subprocess.check_output(['bash','repairPDB.sh', indir, actual_pdb_filename, process_dir]) + # once you decide to use the function + # repairPDB(pdbname) print('start') # some common parameters for foldX @@ -242,61 +259,74 @@ def main(): print('\033[95mSTAGE: repair PDB (foldx subprocess) \033[0m') print('Running foldx RepairPDB for WT') - subprocess.call(['foldx' + + fold_RepairDB = ['foldx' , '--command=RepairPDB' , foldx_common - , '--pdb-dir=' + os.path.dirname(pdb_filename) +# , '--pdb-dir=' + os.path.dirname(pdb_filename) + , '--pdb-dir=' + indir , '--pdb=' + actual_pdb_filename , 'outPDB=true' - , '--output-dir=' + process_dir]) + , '--output-dir=' + process_dir] + print('CMD:', fold_RepairDB) + subprocess.call(fold_RepairDB) print('\033[95mCOMPLETED STAGE: repair PDB\033[0m') print('\n==========================================================') print('\033[95mSTAGE: Foldx commands BM, PN and SD (foldx subprocess) for WT\033[0m') print('Running foldx BuildModel for WT') - subprocess.call(['foldx' + + foldx_BuildModel = ['foldx' , '--command=BuildModel' , foldx_common , '--pdb-dir=' + process_dir , '--pdb=' + pdbname + '_Repair.pdb' - , '--mutant-file="individual_list_' + pdbname +'.txt"' + , '--mutant-file=' + process_dir + '/' + 'individual_list_' + pdbname +'.txt' , 'outPDB=true' , '--numberOfRuns=1' - , '--output-dir=' + process_dir], cwd=process_dir) + , '--output-dir=' + process_dir] + print('CMD:', foldx_BuildModel) + subprocess.call( foldx_BuildModel, cwd=process_dir) print('Running foldx PrintNetworks for WT') - subprocess.call(['foldx' + foldx_PrintNetworks = ['foldx' , '--command=PrintNetworks' , '--pdb-dir=' + process_dir , '--pdb=' + pdbname + '_Repair.pdb' , '--water=PREDICT' , '--vdwDesign=1' - , '--output-dir=' + process_dir], cwd=process_dir) + , '--output-dir=' + process_dir] + print('CMD:', foldx_PrintNetworks) + subprocess.call(foldx_PrintNetworks, cwd=process_dir) print('Running foldx SequenceDetail for WT') - subprocess.call(['foldx' + foldx_SequenceDetail = ['foldx' , '--command=SequenceDetail' , '--pdb-dir=' + process_dir , '--pdb=' + pdbname + '_Repair.pdb' , '--water=PREDICT' , '--vdwDesign=1' - , '--output-dir=' + process_dir], cwd=process_dir) + , '--output-dir=' + process_dir] + print('CMD:', foldx_SequenceDetail) + subprocess.call(foldx_SequenceDetail , cwd=process_dir) + print('\033[95mCOMPLETED STAGE: Foldx commands BM, PN and SD\033[0m') print('\n==========================================================') - print('\033[95mSTAGE: Print Networks (foldx subprocess) for MT\033[0m') for n in range(1,nmuts+1): print('\033[95mNETWORK:\033[0m', n) print('Running foldx PrintNetworks for mutation', n) - subprocess.call(['foldx' + foldx_PrintNetworksMT = ['foldx' , '--command=PrintNetworks' , '--pdb-dir=' + process_dir , '--pdb=' + pdbname + '_Repair_' + str(n) + '.pdb' , '--water=PREDICT' , '--vdwDesign=1' - , '--output-dir=' + process_dir], cwd=process_dir) + , '--output-dir=' + process_dir] + print('CMD:', foldx_PrintNetworksMT) + subprocess.call( foldx_PrintNetworksMT , cwd=process_dir) print('\033[95mCOMPLETED STAGE: Print Networks (foldx subprocess) for MT\033[0m') print('\n==========================================================') @@ -323,14 +353,16 @@ def main(): print('\033[95mSTAGE: Running foldx AnalyseComplex (foldx subprocess) for WT\033[0m') chain1=chainA chain2=chainB - subprocess.call(['foldx' + foldx_AnalyseComplex = ['foldx' , '--command=AnalyseComplex' , '--pdb-dir=' + process_dir , '--pdb=' + pdbname + '_Repair.pdb' , '--analyseComplexChains=' + chain1 + ',' + chain2 , '--water=PREDICT' , '--vdwDesign=1' - , '--output-dir=' + process_dir], cwd=process_dir) + , '--output-dir=' + process_dir] + print('CMD:',foldx_AnalyseComplex) + subprocess.call(foldx_AnalyseComplex, cwd=process_dir) # FIXME why would we ever need to do this?!? Cargo-culted from runcomplex.sh ac_source = process_dir + '/Summary_' + pdbname + '_Repair_AC.fxout' @@ -340,14 +372,16 @@ def main(): for n in range(1,nmuts+1): print('\033[95mSTAGE: Running foldx AnalyseComplex (foldx subprocess) for mutation:\033[0m', n) - subprocess.call(['foldx' + foldx_AnalyseComplex = ['foldx' , '--command=AnalyseComplex' , '--pdb-dir=' + process_dir , '--pdb=' + pdbname + '_Repair_' + str(n) + '.pdb' , '--analyseComplexChains=' + chain1 + ',' + chain2 , '--water=PREDICT' , '--vdwDesign=1' - , '--output-dir=' + process_dir], cwd=process_dir) + , '--output-dir=' + process_dir] + print('CMD:', foldx_AnalyseComplex) + subprocess.call( foldx_AnalyseComplex , cwd=process_dir) # FIXME why would we ever need to do this?!? Cargo-culted from runcomplex.sh ac_mut_source = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) +'_AC.fxout' From af227f9864a96143e9886f84631dafc9eec43875 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Thu, 30 Sep 2021 13:35:33 +0100 Subject: [PATCH 45/51] moved deepddg_format.py from ind output dir to scripts --- scripts/deepddg_format.py | 141 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) create mode 100644 scripts/deepddg_format.py diff --git a/scripts/deepddg_format.py b/scripts/deepddg_format.py new file mode 100644 index 0000000..aab0769 --- /dev/null +++ b/scripts/deepddg_format.py @@ -0,0 +1,141 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +''' +Created on Tue Aug 6 12:56:03 2019 + +@author: tanu +''' +#======================================================================= +# Task: format deep ddg df to allow easy merging + +# Input: 2 dfs +#1) .lower()'_mcsm_formatted_snps.csv' +#2) .lower()_complex_ddg_results.csv' +#======================================================================= +#%% load packages +import sys, os +import pandas as pd +from pandas import DataFrame +import numpy as np +#from varname import nameof +import argparse +#======================================================================= +#%% specify input and curr dir +homedir = os.path.expanduser('~') + +# set working dir +os.getcwd() +os.chdir(homedir + '/git/LSHTM_analysis/scripts') +os.getcwd() +#=======================================================================#%% command line args: case sensitive +arg_parser = argparse.ArgumentParser() +arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '') +arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '') + +arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data') +arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + + input') +arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + + output') + +arg_parser.add_argument('--debug', action ='store_true', help = 'Debug Mode') + +args = arg_parser.parse_args() +#======================================================================= +#%% variable assignment: input and output +drug = args.drug +gene = args.gene +datadir = args.datadir +indir = args.input_dir +outdir = args.output_dir +#%%======================================================================= +#============== +# directories +#============== +if not datadir: + datadir = homedir + '/git/Data/' + +if not indir: + indir = datadir + drug + '/input/' + +if not outdir: + outdir = datadir + drug + '/output/' + +#======= +# input +#======= +in_filename_mcsm_snps = gene.lower() + '_mcsm_formatted_snps.csv' +infile_mcsm_snps = outdir + in_filename_mcsm_snps + +in_filename_deepddg = gene.lower() + '_complex_ddg_results.csv' +infile_deepddg = outdir + 'deep_ddg/' + in_filename_deepddg + +print('\nInput path:', indir + , '\nOutput path:', outdir, '\n' + , '\nInput filename mcsm snps', infile_mcsm_snps , '\n' + , '\nInput filename deepddg', infile_deepddg , '\n' + , '\n============================================================') + +#======= +# output +#======= +#out_filename_deepddg = gene.lower() + '_complex_ni_deepddg.txt' +out_filename_deepddg = gene.lower() + '_complex_ni_deepddg.csv' +outfile_deepddg_f = outdir + out_filename_deepddg + +print('Output filename:', outfile_deepddg_f + , '\n===================================================================') +# end of variable assignment for input and output files +#%%============================================================================ +print('===================================' + , '\nmcsm muts' + , '\n===================================') + +mcsm_muts_df = pd.read_csv(infile_mcsm_snps , header = None, sep = ',', names = ['mutationinformation']) +mcsm_muts_df.columns + +#%%============================================================================ +print('===================================' + , '\nDeep ddg' + , '\n===================================') + +deepddg_df = pd.read_csv(infile_deepddg, sep = ',') +deepddg_df.columns + +deepddg_df.rename(columns = {'#chain' : 'chain_id' + , 'WT' : 'wild_type_deepddg' + , 'ResID' : 'position' + , 'Mut' : 'mutant_type_deepddg'} + , inplace = True) +deepddg_df.columns +deepddg_df['mutationinformation'] = deepddg_df['wild_type_deepddg'] + deepddg_df['position'].map(str) + deepddg_df['mutant_type_deepddg'] +deepddg_df.columns + +# add deepddg outcome column: <0--> Destabilising, >0 --> Stabilising +deepddg_df['deepddg_outcome'] = np.where(deepddg_df['deepddg'] < 0, 'Destabilising', 'Stabilising') +deepddg_df['deepddg_outcome'].value_counts() + +# should be identical in count ot Destabilising and stabilising respectively +len(deepddg_df.loc[deepddg_df['deepddg'] < 0]) +len(deepddg_df.loc[deepddg_df['deepddg'] >= 0]) + +# drop extra columns to allow clean merging +deepddg_short_df = deepddg_df.drop(['chain_id', 'wild_type_deepddg', 'position', 'mutant_type_deepddg'], axis = 1) + +# rearrange columns +deepddg_short_df.columns +deepddg_short_df = deepddg_short_df[["mutationinformation", "deepddg", "deepddg_outcome"]] + +#%% combine with mcsm snps +deepddg_mcsm_muts_dfs = pd.merge(deepddg_short_df + , mcsm_muts_df + , on = 'mutationinformation' + , how = 'right') +deepddg_mcsm_muts_dfs ['deepddg_outcome'].value_counts() + +#%%============================================================================ +# write csv +print('Writing file: formatted deepddg and only mcsm muts') +deepddg_mcsm_muts_dfs.to_csv(outfile_deepddg_f, index = False) +print('\nFinished writing file:' + , '\nNo. of rows:', deepddg_mcsm_muts_dfs.shape[0] + , '\nNo. of cols:', deepddg_mcsm_muts_dfs.shape[1]) +#%% end of script From 98325d763f4a0d0396e3b428b7bbbd233d776d32 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Thu, 30 Sep 2021 13:37:17 +0100 Subject: [PATCH 46/51] fixed output filename in deepddg_format.py --- scripts/deepddg_format.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/deepddg_format.py b/scripts/deepddg_format.py index aab0769..98c2ee1 100644 --- a/scripts/deepddg_format.py +++ b/scripts/deepddg_format.py @@ -77,8 +77,8 @@ print('\nInput path:', indir #======= # output #======= -#out_filename_deepddg = gene.lower() + '_complex_ni_deepddg.txt' -out_filename_deepddg = gene.lower() + '_complex_ni_deepddg.csv' +#out_filename_deepddg = gene.lower() + '_ni_deepddg.txt' +out_filename_deepddg = gene.lower() + '_ni_deepddg.csv' outfile_deepddg_f = outdir + out_filename_deepddg print('Output filename:', outfile_deepddg_f From 675b222181f6aa2fca9d5fa01cce6fa6415948bf Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Mon, 18 Oct 2021 13:52:29 +0100 Subject: [PATCH 47/51] added cmd option for dynamut2 formatting results --- dynamut/format_results_dynamut.py | 0 dynamut/format_results_dynamut2.py | 0 dynamut/run_format_results_dynamut.py | 49 +++++++++++++++++++++++---- dynamut/run_get_results_dynamut.py | 6 ++-- dynamut/split_csv_chain.sh | 21 ++++++++++-- scripts/data_extraction.py | 5 +-- scripts/deepddg_format.py | 0 7 files changed, 65 insertions(+), 16 deletions(-) mode change 100644 => 100755 dynamut/format_results_dynamut.py mode change 100644 => 100755 dynamut/format_results_dynamut2.py mode change 100644 => 100755 dynamut/run_format_results_dynamut.py mode change 100644 => 100755 scripts/deepddg_format.py diff --git a/dynamut/format_results_dynamut.py b/dynamut/format_results_dynamut.py old mode 100644 new mode 100755 diff --git a/dynamut/format_results_dynamut2.py b/dynamut/format_results_dynamut2.py old mode 100644 new mode 100755 diff --git a/dynamut/run_format_results_dynamut.py b/dynamut/run_format_results_dynamut.py old mode 100644 new mode 100755 index 02af524..dd9f7fb --- a/dynamut/run_format_results_dynamut.py +++ b/dynamut/run_format_results_dynamut.py @@ -20,8 +20,45 @@ from format_results_dynamut2 import * # variables # TODO: add cmd line args -gene = 'gid' -drug = 'streptomycin' +#gene = +#drug = + +#%% command line args +arg_parser = argparse.ArgumentParser() +arg_parser.add_argument('-d', '--drug', help='drug name (case sensitive)', default = None) +arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = None) +arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data') +arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + + input') +arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + + output') +#arg_parser.add_argument('-m', '--make_dirs', help = 'Make dir for input and output', action='store_true') # should be handled elsewhere! + +arg_parser.add_argument('--debug', action ='store_true', help = 'Debug Mode') + +args = arg_parser.parse_args() +#======================================================================= +#%% variable assignment: input and output paths & filenames +drug = args.drug +gene = args.gene +datadir = args.datadir +indir = args.input_dir +outdir = args.output_dir +#make_dirs = args.make_dirs + +#%% input and output dirs and files +#======= +# dirs +#======= +if not datadir: + datadir = homedir + '/' + 'git/Data' + +if not indir: + indir = datadir + '/' + drug + '/input' + +if not outdir: + outdir = datadir + '/' + drug + '/output' + +#%%===================================================================== + datadir = homedir + '/git/Data' indir = datadir + '/' + drug + '/input' outdir = datadir + '/' + drug + '/output' @@ -29,12 +66,12 @@ outdir_dynamut = outdir + '/dynamut_results/' outdir_dynamut2 = outdir + '/dynamut_results/dynamut2/' # Input file -infile_dynamut = outdir_dynamut + gene + '_dynamut_all_output_clean.csv' +#infile_dynamut = outdir_dynamut + gene + '_dynamut_all_output_clean.csv' infile_dynamut2 = outdir_dynamut2 + gene + '_dynamut2_output_combined_clean.csv' # Formatted output filename -outfile_dynamut_f = outdir_dynamut2 + gene + '_complex_dynamut_norm.csv' -outfile_dynamut2_f = outdir_dynamut2 + gene + '_complex_dynamut2_norm.csv' +#outfile_dynamut_f = outdir_dynamut2 + gene + '_dynamut_norm.csv' +outfile_dynamut2_f = outdir_dynamut2 + gene + '_dynamut2_norm.csv' #=============================== # CALL: format_results_dynamut @@ -69,4 +106,4 @@ print('Finished writing file:' , '\nExpected no. of cols:', len(dynamut2_df_f.columns) , '\n=============================================================') -#%%##################################################################### \ No newline at end of file +#%%##################################################################### diff --git a/dynamut/run_get_results_dynamut.py b/dynamut/run_get_results_dynamut.py index e9e82ef..029e934 100755 --- a/dynamut/run_get_results_dynamut.py +++ b/dynamut/run_get_results_dynamut.py @@ -17,8 +17,8 @@ my_host = 'http://biosig.unimelb.edu.au' #headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"} # TODO: add cmd line args -#gene = 'gid' -drug = 'streptomycin' +# gene = +# drug = datadir = homedir + '/git/Data/' indir = datadir + drug + '/input/' outdir = datadir + drug + '/output/' @@ -41,4 +41,4 @@ get_results(url_file = my_url_file , output_dir = outdir , outfile_suffix = my_suffix) -######################################################################## \ No newline at end of file +######################################################################## diff --git a/dynamut/split_csv_chain.sh b/dynamut/split_csv_chain.sh index 2526b3f..ac60faa 100755 --- a/dynamut/split_csv_chain.sh +++ b/dynamut/split_csv_chain.sh @@ -13,10 +13,25 @@ CHUNK=$3 mkdir -p ${OUTDIR}/${CHUNK}/chain_added cd ${OUTDIR}/${CHUNK}/chain_added -# makes the 2 dirs, hence ../.. +# makes the 3 dirs, hence ../.. split ../../../${INFILE} -l ${CHUNK} -d snp_batch_ -# use case -#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh katg_mcsm_formatted_snps.csv snp_batches 50 #Date: 20/09/2021 +######################################################################## +# use cases +# Date: 20/09/2021 +# sed -e 's/^/A /g' katg_mcsm_formatted_snps.csv > katg_mcsm_formatted_snps_chain.csv +#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh katg_mcsm_formatted_snps_chain.csv snp_batches 50 +# Date: 01/10/2021 +# sed -e 's/^/A /g' rpob_mcsm_formatted_snps.csv > rpob_mcsm_formatted_snps_chain.csv +#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh rpob_mcsm_formatted_snps_chain.csv snp_batches 50 + +# Date: 02/10/2021 +# sed -e 's/^/A /g' alr_mcsm_formatted_snps.csv > alr_mcsm_formatted_snps_chain.csv +#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh alr_mcsm_formatted_snps_chain.csv snp_batches 50 + +# Date: 05/10/2021 +#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh alr_mcsm_formatted_snps_chain.csv snp_batches 20 + # add .txt to the files +######################################################################## diff --git a/scripts/data_extraction.py b/scripts/data_extraction.py index 5582632..31f8a27 100755 --- a/scripts/data_extraction.py +++ b/scripts/data_extraction.py @@ -81,9 +81,6 @@ indir = args.input_dir outdir = args.output_dir make_dirs = args.make_dirs -#drug = 'streptomycin' -#gene = 'gid' - #%% input and output dirs and files #======= # dirs @@ -1373,4 +1370,4 @@ if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0: print(u'\u2698' * 50, '\nEnd of script: Data extraction and writing files' '\n' + u'\u2698' * 50 ) -#%% end of script \ No newline at end of file +#%% end of script diff --git a/scripts/deepddg_format.py b/scripts/deepddg_format.py old mode 100644 new mode 100755 From ba21188bd2f79421170f8e44e336375ac94ca220 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Mon, 18 Oct 2021 13:58:06 +0100 Subject: [PATCH 48/51] added notes --- dynamut/notes.txt | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 dynamut/notes.txt diff --git a/dynamut/notes.txt b/dynamut/notes.txt new file mode 100644 index 0000000..97e6d02 --- /dev/null +++ b/dynamut/notes.txt @@ -0,0 +1,11 @@ +Dynamut was painfully run for gid, part manually, part programatically! + +However, it was decided to ditch that and only run Dynamut2 for future targets + +Dynamut2 was run through the website in batches of 50 for +katG: 17 batches (00..16) +rpoB: 23 batches (00..22) +alr: 6 batches (00..05) + +However, the use of API was made for rpoB batches (09-22) from 13 Oct 2021 +as jobs started to flake and fail through the website! From 873fd3a121a5e23bed34ad06630287dc67c7e694 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Tue, 19 Oct 2021 11:12:34 +0100 Subject: [PATCH 49/51] added gene.lower to dynamut2 format result script --- dynamut/run_format_results_dynamut.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dynamut/run_format_results_dynamut.py b/dynamut/run_format_results_dynamut.py index dd9f7fb..cb6fe70 100755 --- a/dynamut/run_format_results_dynamut.py +++ b/dynamut/run_format_results_dynamut.py @@ -66,12 +66,12 @@ outdir_dynamut = outdir + '/dynamut_results/' outdir_dynamut2 = outdir + '/dynamut_results/dynamut2/' # Input file -#infile_dynamut = outdir_dynamut + gene + '_dynamut_all_output_clean.csv' -infile_dynamut2 = outdir_dynamut2 + gene + '_dynamut2_output_combined_clean.csv' +#infile_dynamut = outdir_dynamut + gene.lower() + '_dynamut_all_output_clean.csv' +infile_dynamut2 = outdir_dynamut2 + gene.lower() + '_dynamut2_output_combined_clean.csv' # Formatted output filename #outfile_dynamut_f = outdir_dynamut2 + gene + '_dynamut_norm.csv' -outfile_dynamut2_f = outdir_dynamut2 + gene + '_dynamut2_norm.csv' +outfile_dynamut2_f = outdir_dynamut2 + gene.lower() + '_dynamut2_norm.csv' #=============================== # CALL: format_results_dynamut From 057291a56147f4684d783ec51316af72f20684bd Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Thu, 28 Oct 2021 10:41:43 +0100 Subject: [PATCH 50/51] much development --- mcsm_na/format_results_mcsm_na.py | 9 +- scripts/combining_dfs.py | 331 +++++++++++++++++++++++------- scripts/data_extraction.py | 1 - scripts/deepddg_format.py | 12 +- scripts/rd_df.py | 2 - 5 files changed, 266 insertions(+), 89 deletions(-) diff --git a/mcsm_na/format_results_mcsm_na.py b/mcsm_na/format_results_mcsm_na.py index 95cd9e8..335301c 100644 --- a/mcsm_na/format_results_mcsm_na.py +++ b/mcsm_na/format_results_mcsm_na.py @@ -51,7 +51,7 @@ def format_mcsm_na_output(mcsm_na_output_tsv): print('Assigning meaningful colnames' , '\n=======================================================') my_colnames_dict = {'PDB_FILE': 'pdb_file' # relevant info from this col will be extracted and the column discarded - , 'CHAIN': 'chain' # {wild_type}{mutant_type} + , 'CHAIN': 'chain' , 'WILD_RES': 'wild_type' # one letter amino acid code , 'RES_POS': 'position' # number , 'MUT_RES': 'mutant_type' # one letter amino acid code @@ -65,8 +65,8 @@ def format_mcsm_na_output(mcsm_na_output_tsv): ############# # create mutationinformation column ############# - mcsm_na_data['mutationinformation'] = mcsm_na_data['wild_type'] + mcsm_na_data.position.map(str) + mcsm_na_data['mutant_type'] - + #mcsm_na_data['mutationinformation'] = mcsm_na_data['wild_type'] + mcsm_na_data.position.map(str) + mcsm_na_data['mutant_type'] + mcsm_na_data['mutationinformation'] = mcsm_na_data.loc[:,'wild_type'] + mcsm_na_data.loc[:,'position'].astype(int).apply(str) + mcsm_na_data.loc[:,'mutant_type'] #%%===================================================================== ############# # Create col: mcsm_na_outcome @@ -131,5 +131,4 @@ def format_mcsm_na_output(mcsm_na_output_tsv): , 'chain' , 'pdb_file']] return(mcsm_na_dataf) -#%%##################################################################### - +#%%##################################################################### \ No newline at end of file diff --git a/scripts/combining_dfs.py b/scripts/combining_dfs.py index 9331edd..53361c7 100755 --- a/scripts/combining_dfs.py +++ b/scripts/combining_dfs.py @@ -34,6 +34,11 @@ Created on Tue Aug 6 12:56:03 2019 # Output: single csv of all 8 dfs combined # useful link # https://stackoverflow.com/questions/23668427/pandas-three-way-joining-multiple-dataframes-on-columns + +#%% FIXME: let the script proceed even if files don't exist! +# i.e example below +# '/home/tanu/git/Data/ethambutol/output/dynamut_results/embb_complex_dynamut_norm.csv' + #======================================================================= #%% load packages import sys, os @@ -48,7 +53,7 @@ homedir = os.path.expanduser('~') # set working dir os.getcwd() -os.chdir(homedir + '/git/LSHTM_analysis/scripts') +#os.chdir(homedir + '/git/LSHTM_analysis/scripts') os.getcwd() # FIXME: local imports @@ -109,47 +114,81 @@ if not outdir: #======= # input #======= -#in_filename_mcsm = gene.lower() + '_complex_mcsm_norm.csv' -in_filename_mcsm = gene.lower() + '_complex_mcsm_norm_SAM.csv' # gidb -in_filename_foldx = gene.lower() + '_foldx.csv' -in_filename_deepddg = gene.lower() + '_ni_deepddg.csv' # change to decent filename and put it in the correct dir -in_filename_dssp = gene.lower() + '_dssp.csv' -in_filename_kd = gene.lower() + '_kd.csv' -in_filename_rd = gene.lower() + '_rd.csv' -#in_filename_snpinfo = 'ns' + gene.lower() + '_snp_info_f.csv' # gwas f info -in_filename_afor = gene.lower() + '_af_or.csv' -#in_filename_afor_kin = gene.lower() + '_af_or_kinship.csv' -infilename_dynamut = gene.lower() + '_complex_dynamut_norm.csv' -infilename_dynamut2 = gene.lower() + '_complex_dynamut2_norm.csv' -infilename_mcsm_na = gene.lower() + '_complex_mcsm_na_norm.csv' +gene_list_normal = ["pnca", "katg", "rpob", "alr"] + +if gene.lower() == "gid": + print("\nReading mCSM file for gene:", gene) + in_filename_mcsm = gene.lower() + '_complex_mcsm_norm_SAM.csv' +if gene.lower() == "embb": + print("\nReading mCSM file for gene:", gene) + in_filename_mcsm = gene.lower() + '_complex_mcsm_norm1.csv' +if gene.lower() in gene_list_normal: + print("\nReading mCSM file for gene:", gene) + in_filename_mcsm = gene.lower() + '_complex_mcsm_norm.csv' + +infile_mcsm = outdir + in_filename_mcsm +mcsm_df = pd.read_csv(infile_mcsm, sep = ',') + +in_filename_foldx = gene.lower() + '_foldx.csv' +infile_foldx = outdir + in_filename_foldx +foldx_df = pd.read_csv(infile_foldx , sep = ',') + +in_filename_deepddg = gene.lower() + '_ni_deepddg.csv' # change to decent filename and put it in the correct dir +infile_deepddg = outdir + in_filename_deepddg +deepddg_df = pd.read_csv(infile_deepddg, sep = ',') + +in_filename_dssp = gene.lower() + '_dssp.csv' +infile_dssp = outdir + in_filename_dssp +dssp_df = pd.read_csv(infile_dssp, sep = ',') + +in_filename_kd = gene.lower() + '_kd.csv' +infile_kd = outdir + in_filename_kd +kd_df = pd.read_csv(infile_kd, sep = ',') + +in_filename_rd = gene.lower() + '_rd.csv' +infile_rd = outdir + in_filename_rd +rd_df = pd.read_csv(infile_rd, sep = ',') + +#in_filename_snpinfo = 'ns' + gene.lower() + '_snp_info_f.csv' # gwas f info +#infile_snpinfo = outdir + in_filename_snpinfo + +in_filename_afor = gene.lower() + '_af_or.csv' +infile_afor = outdir + in_filename_afor +afor_df = pd.read_csv(infile_afor, sep = ',') + +#in_filename_afor_kin = gene.lower() + '_af_or_kinship.csv' +#infile_afor_kin = outdir + in_filename_afor_kin + +infilename_dynamut2 = gene.lower() + '_dynamut2_norm.csv' +infile_dynamut2 = outdir + 'dynamut_results/dynamut2/' + infilename_dynamut2 +dynamut2_df = pd.read_csv(infile_dynamut2, sep = ',') + +#------------------------------------------------------------ +# ONLY:for gene pnca and gid: End logic should pick this up! +geneL_dy_na = ["pnca", "gid"] +#if gene.lower() == "pnca" or "gid" : +if gene.lower() in geneL_dy_na : + print("\nGene:", gene.lower() + , "\nReading Dynamut and mCSM_na files") + infilename_dynamut = gene.lower() + '_dynamut_norm.csv' # gid + infile_dynamut = outdir + 'dynamut_results/' + infilename_dynamut + dynamut_df = pd.read_csv(infile_dynamut, sep = ',') + + infilename_mcsm_na = gene.lower() + '_complex_mcsm_na_norm.csv' # gid + infile_mcsm_na = outdir + 'mcsm_na_results/' + infilename_mcsm_na + mcsm_na_df = pd.read_csv(infile_mcsm_na, sep = ',') + +# ONLY:for gene embb and alr: End logic should pick this up! +geneL_ppi2 = ["embb", "alr"] +#if gene.lower() == "embb" or "alr": +if gene.lower() in "embb" or "alr": + infilename_mcsm_ppi2 = gene.lower() + '_complex_mcsm_ppi2_norm.csv' + infile_mcsm_ppi2 = outdir + 'mcsm_ppi2/' + infilename_mcsm_ppi2 + mcsm_ppi2_df = pd.read_csv(infile_mcsm_ppi2, sep = ',') +#-------------------------------------------------------------- infilename_mcsm_f_snps = gene.lower() + '_mcsm_formatted_snps.csv' - -infile_mcsm = outdir + in_filename_mcsm -infile_foldx = outdir + in_filename_foldx -infile_deepddg = outdir + in_filename_deepddg -infile_dssp = outdir + in_filename_dssp -infile_kd = outdir + in_filename_kd -infile_rd = outdir + in_filename_rd -#infile_snpinfo = outdir + in_filename_snpinfo -infile_afor = outdir + in_filename_afor -#infile_afor_kin = outdir + in_filename_afor_kin -infile_dynamut = outdir + 'dynamut_results/' + infilename_dynamut -infile_dynamut2 = outdir + 'dynamut_results/dynamut2/' + infilename_dynamut2 -infile_mcsm_na = outdir + 'mcsm_na_results/' + infilename_mcsm_na -infile_mcsm_f_snps = outdir + infilename_mcsm_f_snps - -# read csv -mcsm_df = pd.read_csv(infile_mcsm, sep = ',') -foldx_df = pd.read_csv(infile_foldx , sep = ',') -deepddg_df = pd.read_csv(infile_deepddg, sep = ',') -dssp_df = pd.read_csv(infile_dssp, sep = ',') -kd_df = pd.read_csv(infile_kd, sep = ',') -rd_df = pd.read_csv(infile_rd, sep = ',') -afor_df = pd.read_csv(infile_afor, sep = ',') -dynamut_df = pd.read_csv(infile_dynamut, sep = ',') -dynamut2_df = pd.read_csv(infile_dynamut2, sep = ',') -mcsm_na_df = pd.read_csv(infile_mcsm_na, sep = ',') -mcsm_f_snps = pd.read_csv(infile_mcsm_f_snps, sep = ',', names = ['mutationinformation'], header = None) +infile_mcsm_f_snps = outdir + infilename_mcsm_f_snps +mcsm_f_snps = pd.read_csv(infile_mcsm_f_snps, sep = ',', names = ['mutationinformation'], header = None) #======= # output @@ -158,12 +197,6 @@ out_filename_comb = gene.lower() + '_all_params.csv' outfile_comb = outdir + out_filename_comb print('Output filename:', outfile_comb , '\n===================================================================') - -o_join = 'outer' -l_join = 'left' -r_join = 'right' -i_join = 'inner' - # end of variable assignment for input and output files #%%############################################################################ #===================== @@ -292,6 +325,44 @@ else: , '\n======================================================') sys.exit() +#-------------------------- +# check if >1 chain +#-------------------------- +deepddg_df.loc[:,'chain_id'].value_counts() + +if len(deepddg_df.loc[:,'chain_id'].value_counts()) > 1: + print("\nChains detected: >1" + , "\nGene:", gene + , "\nChains:", deepddg_df.loc[:,'chain_id'].value_counts().index) + +#-------------------------- +# subset chain +#-------------------------- +if gene.lower() == "embb": + sel_chain = "B" +else: + sel_chain = "A" + +deepddg_df = deepddg_df[deepddg_df['chain_id'] == sel_chain] + +#-------------------------- +# Check for duplicates +#-------------------------- +if len(deepddg_df['mutationinformation'].duplicated().value_counts())> 1: + print("\nFAIL: Duplicates detected in DeepDDG infile" + , "\nNo. of duplicates:" + , deepddg_df['mutationinformation'].duplicated().value_counts()[1] + , "\nformat deepDDG infile before proceeding") + sys.exit() +else: + print("\nPASS: No duplicates detected in DeepDDG infile") + +#-------------------------- +# Drop chain id col as other targets don't have itCheck for duplicates +#-------------------------- +col_to_drop = ['chain_id'] +deepddg_df = deepddg_df.drop(col_to_drop, axis = 1) + #%%============================================================================= # Now merges begin #%%============================================================================= @@ -311,28 +382,83 @@ get_aa_3lower(df = mcsm_df #mcsm_df.columns = mcsm_df.columns.str.lower() # foldx_df.shape -#mcsm_foldx_dfs = combine_dfs_with_checks(mcsm_df, foldx_df, my_join = o_join) +#mcsm_foldx_dfs = combine_dfs_with_checks(mcsm_df, foldx_df, my_join = "outer") merging_cols_m1 = detect_common_cols(mcsm_df, foldx_df) -mcsm_foldx_dfs = pd.merge(mcsm_df, foldx_df, on = merging_cols_m1, how = o_join) +mcsm_foldx_dfs = pd.merge(mcsm_df + , foldx_df + , on = merging_cols_m1 + , how = "outer") ncols_m1 = len(mcsm_foldx_dfs.columns) print('\n\nResult of first merge:', mcsm_foldx_dfs.shape , '\n===================================================================') mcsm_foldx_dfs[merging_cols_m1].apply(len) mcsm_foldx_dfs[merging_cols_m1].apply(len) == len(mcsm_foldx_dfs) + +#%% for embB and any other targets where mCSM-lig hasn't run for +# get the empty cells to be full of meaningful info +if mcsm_foldx_dfs.loc[:,'wild_type': 'mut_aa_3lower'].isnull().values.any(): + print ("NAs detected in mcsm cols after merge") + + ############################## + # Extract relevant col values + # code to one + ############################## + + # wt_reg = r'(^[A-Z]{1})' + # print('wild_type:', wt_reg) + + # mut_reg = r'[0-9]+(\w{1})$' + # print('mut type:', mut_reg) + mcsm_foldx_dfs['wild_type'] = mcsm_foldx_dfs.loc[:,'mutationinformation'].str.extract(r'(^[A-Z]{1})') + mcsm_foldx_dfs['position'] = mcsm_foldx_dfs.loc[:,'mutationinformation'].str.extract(r'([0-9]+)') + mcsm_foldx_dfs['mutant_type'] = mcsm_foldx_dfs.loc[:,'mutationinformation'].str.extract(r'[0-9]+([A-Z]{1})$') + + # BEWARE: Bit of logic trap i.e if nan comes first + # in chain column, then nan will be populated! + #df['foo'] = df['chain'].unique()[0] + mcsm_foldx_dfs['chain'] = np.where(mcsm_foldx_dfs[['chain']].isnull().all(axis=1) + , mcsm_foldx_dfs['chain'].unique()[0] + , mcsm_foldx_dfs['chain']) + + mcsm_foldx_dfs['ligand_id'] = np.where(mcsm_foldx_dfs[['ligand_id']].isnull().all(axis=1) + , mcsm_foldx_dfs['ligand_id'].unique()[0] + , mcsm_foldx_dfs['ligand_id']) + #-------------------------------------------------------------------------- + + mcsm_foldx_dfs['wild_pos'] = mcsm_foldx_dfs.loc[:,'wild_type'] + mcsm_foldx_dfs.loc[:,'position'].astype(int).apply(str) + mcsm_foldx_dfs['wild_chain_pos'] = mcsm_foldx_dfs.loc[:,'wild_type'] + mcsm_foldx_dfs.loc[:,'chain'] + mcsm_foldx_dfs.loc[:,'position'].astype(int).apply(str) + + ############# + # Map 1 letter + # code to 3Upper + ############# + # initialise a sub dict that is lookup dict for + # 3-LETTER aa code to 1-LETTER aa code + lookup_dict = dict() + for k, v in oneletter_aa_dict.items(): + lookup_dict[k] = v['three_letter_code_lower'] + wt = mcsm_foldx_dfs['wild_type'].squeeze() # converts to a series that map works on + mcsm_foldx_dfs['wt_aa_3lower'] = wt.map(lookup_dict) + mut = mcsm_foldx_dfs['mutant_type'].squeeze() + mcsm_foldx_dfs['mut_aa_3lower'] = mut.map(lookup_dict) + #%% print('===================================' , '\nSecond merge: mcsm_foldx_dfs + deepddg' , '\n===================================') -#deepddg_df = pd.read_csv(infile_deepddg, sep = ',') -#deepddg_df.columns - # merge with mcsm_foldx_dfs and deepddg_df -mcsm_foldx_deepddg_dfs = pd.merge(mcsm_foldx_dfs, deepddg_df, on = 'mutationinformation', how = l_join) +mcsm_foldx_deepddg_dfs = pd.merge(mcsm_foldx_dfs + , deepddg_df + , on = 'mutationinformation' + , how = "left") mcsm_foldx_deepddg_dfs['deepddg_outcome'].value_counts() ncols_deepddg_merge = len(mcsm_foldx_deepddg_dfs.columns) + +mcsm_foldx_deepddg_dfs['position'] = mcsm_foldx_deepddg_dfs['position'].astype('int64') + #%%============================================================================ print('===================================' , '\Third merge: dssp + kd' @@ -342,9 +468,12 @@ dssp_df.shape kd_df.shape rd_df.shape -#dssp_kd_dfs = combine_dfs_with_checks(dssp_df, kd_df, my_join = o_join) +#dssp_kd_dfs = combine_dfs_with_checks(dssp_df, kd_df, my_join = "outer") merging_cols_m2 = detect_common_cols(dssp_df, kd_df) -dssp_kd_dfs = pd.merge(dssp_df, kd_df, on = merging_cols_m2, how = o_join) +dssp_kd_dfs = pd.merge(dssp_df + , kd_df + , on = merging_cols_m2 + , how = "outer") print('\n\nResult of third merge:', dssp_kd_dfs.shape , '\n===================================================================') @@ -353,10 +482,12 @@ print('===================================' , '\nFourth merge: third merge + rd_df' , '\ndssp_kd_dfs + rd_df' , '\n===================================') -#dssp_kd_rd_dfs = combine_dfs_with_checks(dssp_kd_dfs, rd_df, my_join = o_join) +#dssp_kd_rd_dfs = combine_dfs_with_checks(dssp_kd_dfs, rd_df, my_join = "outer") merging_cols_m3 = detect_common_cols(dssp_kd_dfs, rd_df) -dssp_kd_rd_dfs = pd.merge(dssp_kd_dfs, rd_df, on = merging_cols_m3 - , how = o_join) +dssp_kd_rd_dfs = pd.merge(dssp_kd_dfs + , rd_df + , on = merging_cols_m3 + , how = "outer") ncols_m3 = len(dssp_kd_rd_dfs.columns) @@ -369,24 +500,41 @@ print('=======================================' , '\nFifth merge: Second merge + fourth merge' , '\nmcsm_foldx_dfs + dssp_kd_rd_dfs' , '\n=======================================') -#combined_df = combine_dfs_with_checks(mcsm_foldx_dfs, dssp_kd_rd_dfs, my_join = i_join) + +#combined_df = combine_dfs_with_checks(mcsm_foldx_dfs, dssp_kd_rd_dfs, my_join = "inner") #merging_cols_m4 = detect_common_cols(mcsm_foldx_dfs, dssp_kd_rd_dfs) -#combined_df = pd.merge(mcsm_foldx_dfs, dssp_kd_rd_dfs, on = merging_cols_m4, how = i_join) +#combined_df = pd.merge(mcsm_foldx_dfs, dssp_kd_rd_dfs, on = merging_cols_m4, how = "inner") #combined_df_expected_cols = ncols_m1 + ncols_m3 - len(merging_cols_m4) # with deepddg values merging_cols_m4 = detect_common_cols(mcsm_foldx_deepddg_dfs, dssp_kd_rd_dfs) -combined_df = pd.merge(mcsm_foldx_deepddg_dfs, dssp_kd_rd_dfs, on = merging_cols_m4, how = i_join) +combined_df = pd.merge(mcsm_foldx_deepddg_dfs + , dssp_kd_rd_dfs + , on = merging_cols_m4 + , how = "inner") combined_df_expected_cols = ncols_deepddg_merge + ncols_m3 - len(merging_cols_m4) -if len(combined_df) == len(mcsm_df) and len(combined_df.columns) == combined_df_expected_cols: - print('PASS: successfully combined 5 dfs' - , '\nNo. of rows combined_df:', len(combined_df) - , '\nNo. of cols combined_df:', len(combined_df.columns)) -else: - sys.exit('FAIL: check individual df merges') - +# FIXME: check logic, doesn't effect anything else! +if not gene == "embB": + print("\nGene is:", gene) + if len(combined_df) == len(mcsm_df) and len(combined_df.columns) == combined_df_expected_cols: + print('PASS: successfully combined 5 dfs' + , '\nNo. of rows combined_df:', len(combined_df) + , '\nNo. of cols combined_df:', len(combined_df.columns)) + else: + #sys.exit('FAIL: check individual df merges') + print("\nGene is:", gene + , "\ncombined_df length:", len(combined_df) + , "\nmcsm_df_length:", len(mcsm_df) + ) + if len(combined_df.columns) == combined_df_expected_cols: + print('PASS: successfully combined 5 dfs' + , '\nNo. of rows combined_df:', len(combined_df) + , '\nNo. of cols combined_df:', len(combined_df.columns)) + else: + sys.exit('FAIL: check individual merges') + print('\nResult of Fourth merge:', combined_df.shape , '\n===================================================================') @@ -401,7 +549,7 @@ combined_df['chain'].equals(combined_df['chain_id']) combined_df['wild_type'].equals(combined_df['wild_type_kd']) # has nan combined_df['wild_type'].equals(combined_df['wild_type_dssp']) -#sanity check +# sanity check foo = combined_df[['wild_type', 'wild_type_kd', 'wt_3letter_caps', 'wt_aa_3lower', 'mut_aa_3lower']] # Drop cols @@ -455,7 +603,11 @@ afor_df = afor_df.drop(['position'], axis = 1) afor_cols = afor_df.columns # merge -combined_stab_afor = pd.merge(combined_df_clean, afor_df, on = merging_cols_m5, how = l_join) +combined_stab_afor = pd.merge(combined_df_clean + , afor_df + , on = merging_cols_m5 + , how = "left") + comb_afor_df_cols = combined_stab_afor.columns comb_afor_expected_cols = len(combined_df_clean.columns) + len(afor_df.columns) - len(merging_cols_m5) @@ -467,18 +619,26 @@ if len(combined_stab_afor) == len(combined_df_clean) and len(combined_stab_afor. else: sys.exit('\nFAIL: check individual df merges') -print('\n\nResult of Fourth merge:', combined_stab_afor.shape +print('\n\nResult of Fifth merge:', combined_stab_afor.shape , '\n===================================================================') combined_stab_afor[merging_cols_m5].apply(len) combined_stab_afor[merging_cols_m5].apply(len) == len(combined_stab_afor) -if len(combined_stab_afor) - combined_stab_afor['mutation'].isna().sum() == len(afor_df): - print('\nPASS: Merge successful for af and or' - , '\nNo. of nsSNPs with valid ORs: ', len(afor_df)) -else: - sys.exit('\nFAIL: merge unsuccessful for af and or') +if (len(combined_stab_afor) - combined_stab_afor['mutation'].isna().sum()) == len(afor_df): + print('\nPASS: Merge successful for af and or with matched numbers') +if len(combined_stab_afor) - combined_stab_afor['mutation'].isna().sum() == len(afor_df)-len(afor_df[~afor_df['mutation'].isin(combined_stab_afor['mutation'])]): + print("\nMismatched numbers, OR df has extra snps not found in mcsm df" + , "\nNo. of nsSNPs with valid ORs:", len(afor_df) + , "\nNo. of mcsm nsSNPs: ", len(combined_df_clean) + , "\nNo. of OR nsSNPs not in mCSM df:" + , len(afor_df[~afor_df['mutation'].isin(combined_stab_afor['mutation'])]) + , "\nWriting these mutations to file:") + orsnps_notmcsm = afor_df[~afor_df['mutation'].isin(combined_stab_afor['mutation'])] +else: + sys.exit('\nFAIL: merge unsuccessful for af and or') + #%%============================================================================ # Output columns: when dynamut, dynamut2 and others weren't being combined out_filename_comb_afor = gene.lower() + '_comb_afor.csv' @@ -486,7 +646,7 @@ outfile_comb_afor = outdir + '/' + out_filename_comb_afor print('Output filename:', outfile_comb_afor , '\n===================================================================') -# # write csv +# write csv print('Writing file: combined stability and afor') combined_stab_afor.to_csv(outfile_comb_afor, index = False) print('\nFinished writing file:' @@ -494,7 +654,20 @@ print('\nFinished writing file:' , '\nNo. of cols:', combined_stab_afor.shape[1]) #%%============================================================================ # combine dynamut, dynamut2, and mcsm_na -dfs_list = [dynamut_df, dynamut2_df, mcsm_na_df] +#dfs_list = [dynamut_df, dynamut2_df, mcsm_na_df] # gid + +if gene.lower() == "pnca": + dfs_list = [dynamut_df, dynamut2_df] +if gene.lower() == "gid": + dfs_list = [dynamut_df, dynamut2_df, mcsm_na_df] +if gene.lower() == "embb": + dfs_list = [dynamut2_df, mcsm_ppi2_df] +if gene.lower() == "katg": + dfs_list = [dynamut2_df] +if gene.lower() == "rpob": + dfs_list = [dynamut2_df] +if gene.lower() == "alr": + dfs_list = [dynamut2_df, mcsm_ppi2_df] dfs_merged = reduce(lambda left,right: pd.merge(left , right @@ -514,7 +687,7 @@ len(combined_stab_afor.columns) combined_all_params = pd.merge(combined_stab_afor , dfs_merged_clean , on = merging_cols_m6 - , how = i_join) + , how = "inner") expected_ncols = len(dfs_merged_clean.columns) + len(combined_stab_afor.columns) - len(merging_cols_m6) expected_nrows = len(combined_stab_afor) diff --git a/scripts/data_extraction.py b/scripts/data_extraction.py index 31f8a27..aac7cdb 100755 --- a/scripts/data_extraction.py +++ b/scripts/data_extraction.py @@ -70,7 +70,6 @@ arg_parser.add_argument('-m', '--make_dirs', help = 'Make dir for input and outp arg_parser.add_argument('--debug', action ='store_true', help = 'Debug Mode') - args = arg_parser.parse_args() #======================================================================= #%% variable assignment: input and output paths & filenames diff --git a/scripts/deepddg_format.py b/scripts/deepddg_format.py index 98c2ee1..20b2dcb 100755 --- a/scripts/deepddg_format.py +++ b/scripts/deepddg_format.py @@ -117,12 +117,20 @@ deepddg_df['deepddg_outcome'].value_counts() len(deepddg_df.loc[deepddg_df['deepddg'] < 0]) len(deepddg_df.loc[deepddg_df['deepddg'] >= 0]) +#---------------------------------------------- # drop extra columns to allow clean merging -deepddg_short_df = deepddg_df.drop(['chain_id', 'wild_type_deepddg', 'position', 'mutant_type_deepddg'], axis = 1) +#---------------------------------------------- +#deepddg_short_df = deepddg_df.drop(['chain_id', 'wild_type_deepddg', 'position', 'mutant_type_deepddg'], axis = 1) + +#---------------------------------------------- +# embb (where gene-target has > 1 chain) +# include chain else the numbering will be messed up! +#---------------------------------------------- +deepddg_short_df = deepddg_df.drop(['wild_type_deepddg', 'position', 'mutant_type_deepddg'], axis = 1) # rearrange columns deepddg_short_df.columns -deepddg_short_df = deepddg_short_df[["mutationinformation", "deepddg", "deepddg_outcome"]] +deepddg_short_df = deepddg_short_df[["chain_id", "mutationinformation", "deepddg", "deepddg_outcome"]] #%% combine with mcsm snps deepddg_mcsm_muts_dfs = pd.merge(deepddg_short_df diff --git a/scripts/rd_df.py b/scripts/rd_df.py index 7eab903..102530d 100755 --- a/scripts/rd_df.py +++ b/scripts/rd_df.py @@ -45,8 +45,6 @@ arg_parser.add_argument('--debug', action='store_true', help = 'Debug Mode') args = arg_parser.parse_args() #======================================================================= #%% variable assignment: input and output -#drug = 'pyrazinamide' -#gene = 'pncA' drug = args.drug gene = args.gene gene_match = gene + '_p.' From 9cfb32afb87cc747656d34d3125eb2c6291ad610 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Thu, 28 Oct 2021 12:43:44 +0100 Subject: [PATCH 51/51] pretending that we added the CLI arguments --- mcsm_na/run_format_results_mcsm_na.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mcsm_na/run_format_results_mcsm_na.py b/mcsm_na/run_format_results_mcsm_na.py index cb7b4ca..d990368 100644 --- a/mcsm_na/run_format_results_mcsm_na.py +++ b/mcsm_na/run_format_results_mcsm_na.py @@ -14,7 +14,7 @@ from format_results_mcsm_na import * # variables # TODO: add cmd line args - +# Imagine we've done the work gene = 'gid' drug = 'streptomycin' datadir = homedir + '/git/Data'