diff --git a/scripts/plotting/corr_data.R b/scripts/plotting/corr_data.R deleted file mode 100644 index 861e5da..0000000 --- a/scripts/plotting/corr_data.R +++ /dev/null @@ -1,168 +0,0 @@ -#!/usr/bin/env Rscript -######################################################### -# TASK: Script to format data for corr plots -######################################################### -#library(dplyr) - -#================================================= -# Data for Corrplots -#================================================= -cat("\n==========================================" - , "\nCORR PLOTS data: ALL params" - , "\n=========================================") - -# use data -#merged_df2 -geneL_normal = c("pnca") -geneL_na_dy = c("gid") -geneL_na = c("rpob") -geneL_ppi2 = c("alr", "embb", "katg", "rpob") - -#---------------------------- -# columns for corr plots:PS -#---------------------------- -# NOTE: you can add mcsm_ppi column as well, and it will only select what it can find! -big_df_colnames = data.frame(names(merged_df2)) - -core_cols = c("mutationinformation", drug, "mutation_info_labels" - , "duet_stability_change", "ligand_affinity_change", "ddg_foldx", "asa", "rsa" - , "rd_values", "kd_values", "log10_or_mychisq", "neglog_pval_fisher","af" - , "deepddg" , "ddg_dynamut2" - , "consurf_score" - #, "consurf_scaled" - , "snap2_score" - #, "snap2_scaled", "snap2_accuracy_pc" - , "ligand_distance") - -if (tolower(gene)%in%geneL_normal){ - corr_cols_select = core_cols -} -if (tolower(gene)%in%geneL_na_dy){ - additional_cols = c("mcsm_na_affinity" - , "ddg_dynamut" - , "ddg_encom", "dds_encom" - , "ddg_mcsm", "ddg_sdm" - , "ddg_duet" - #, "mcsm_na_scaled" - #, "ddg_dynamut_scaled" - #, "ddg_encom_scaled", "dds_encom_scaled" - #, "ddg_mcsm_scaled", "ddg_sdm_scaled" - #, "ddg_duet_scaled" - ) - - corr_cols_select = c(core_cols, additional_cols) - -} - -if (tolower(gene)%in%geneL_na){ - additional_cols = c("mcsm_na_affinity" - #, "mcsm_na_scaled" - ) - - corr_cols_select = c(core_cols, additional_cols) - -} - -if (tolower(gene)%in%geneL_ppi2){ - additional_cols = c("mcsm_ppi2_affinity") - corr_cols_select = c(core_cols, additional_cols) -} - -# corr_cols_select <- c("mutationinformation", drug, "mutation_info_labels" -# , "duet_stability_change", "ligand_affinity_change", "ddg_foldx", "asa", "rsa" -# , "rd_values", "kd_values", "log10_or_mychisq", "neglog_pval_fisher","af" -# , "deepddg", "ddg_dynamut", "ddg_dynamut2", "mcsm_na_affinity" -# , "ddg_encom", "dds_encom", "ddg_mcsm", "ddg_sdm", "ddg_duet", "ligand_distance") - -#=========================== -# Corr data for plots: PS -# big_df ps: ~ merged_df2 -#=========================== - -corr_df_m2 = merged_df2[,colnames(merged_df2)%in%corr_cols_select] - -#----------------------- -# formatting: some cols -# Add pretty colnames -#----------------------- -corr_df_m2_f <- corr_df_m2 %>% dplyr::rename( - 'DUET' = duet_stability_change - , 'mCSM-lig' = ligand_affinity_change - , FoldX = ddg_foldx - , DeepDDG = deepddg - , ASA = asa - , RSA = rsa - , KD = kd_values - , RD = rd_values - , MAF = af - , 'Log (OR)' = log10_or_mychisq - , '-Log (P)' = neglog_pval_fisher - , Dynamut = ddg_dynamut - , 'ENCoM-DDG'= ddg_encom - , mCSM = ddg_mcsm - , SDM = ddg_sdm - , 'DUET-d' = ddg_duet - , 'ENCoM-DDS'= dds_encom - , Dynamut2 = ddg_dynamut2 - , 'mCSM-NA' = mcsm_na_affinity ) - -#=========================== -# Corr data for plots: PS -# short_df ps: ~merged_df3 -#=========================== - -corr_df_m3 = corr_df_m2[!duplicated(corr_df_m2$mutationinformation),] - -na_or = sum(is.na(corr_df_m3$log10_or_mychisq)) -check1 = nrow(corr_df_m3) - na_or; check1 - -if (nrow(corr_df_m3) == nrow(merged_df3) && nrow(merged_df3_comp) == check1) { - cat( "\nPASS: No. of rows for corr_df_m3 match" - , "\nPASS: No. of OR values checked: " , check1) -} else { - cat("\nFAIL: Numbers mismatch:" - , "\nExpected nrows: ", nrow(merged_df3) - , "\nGot: ", nrow(corr_df_m3) - , "\nExpected OR values: ", nrow(merged_df3_comp) - , "\nGot: ", check1) -} - -#----------------------- -# formatting: some cols -# Add pretty colnames -#----------------------- -corr_df_m3_f <- corr_df_m3 %>% - rename( - DUET = duet_stability_change - , 'mCSM-lig' = ligand_affinity_change - , FoldX = ddg_foldx - , DeepDDG = deepddg - , ASA = asa - , RSA = rsa - , KD = kd_values - , RD = rd_values - , MAF = af - , 'Log (OR)' = log10_or_mychisq - , '-Log (P)' = neglog_pval_fisher - , Dynamut = ddg_dynamut - , 'ENCoM-DDG'= ddg_encom - , mCSM = ddg_mcsm - , SDM = ddg_sdm - , 'DUET-d' = ddg_duet - , 'ENCoM-DDS'= dds_encom - , Dynamut2 = ddg_dynamut2 - , 'mCSM-NA' = mcsm_na_affinity ) - -######################################################################## -cat("\nCorr Data created:" -, "\n===================================" -, "\ncorr_df_m2: created from merged_df2" -, "\n===================================" -, "\nnrows:", nrow(corr_df_m2) -, "\nncols:", ncol(corr_df_m2) -, "\n===================================" -, "\ncorr_df_m3: created from merged_df3" -, "\n===================================" -, "\nnrows:", nrow(corr_df_m3) -, "\nncols:", ncol(corr_df_m3) -) diff --git a/scripts/plotting/redundant/corr_data.R b/scripts/plotting/redundant/corr_data.R index 8bdb929..861e5da 100644 --- a/scripts/plotting/redundant/corr_data.R +++ b/scripts/plotting/redundant/corr_data.R @@ -1,263 +1,168 @@ -#!/usr/bin/env Rscript +#!/usr/bin/env Rscript ######################################################### -# TASK: Prepare for correlation data +# TASK: Script to format data for corr plots +######################################################### +#library(dplyr) -#======================================================================= -# working dir and loading libraries -getwd() -setwd("~/git/LSHTM_analysis/scripts/plotting") -getwd() +#================================================= +# Data for Corrplots +#================================================= +cat("\n==========================================" + , "\nCORR PLOTS data: ALL params" + , "\n=========================================") -#source("~/git/LSHTM_analysis/scripts/Header_TT.R") -source("../functions/my_pairs_panel.R") # with lower panel turned off -source("../functions/plotting_globals.R") -source("../functions/plotting_data.R") -source("../functions/combining_dfs_plotting.R") -########################################################### -#=========== -# input -#=========== -#--------------------- -# call: import_dirs() -#--------------------- -import_dirs(drug, gene) +# use data +#merged_df2 +geneL_normal = c("pnca") +geneL_na_dy = c("gid") +geneL_na = c("rpob") +geneL_ppi2 = c("alr", "embb", "katg", "rpob") -#--------------------------- -# call: plotting_data() -#--------------------------- -#if (!exists("infile_params") && exists("gene")){ -if (!is.character(infile_params) && exists("gene")){ # when running as cmd - #in_filename_params = paste0(tolower(gene), "_all_params.csv") - in_filename_params = paste0(tolower(gene), "_comb_afor.csv") # part combined for gid - infile_params = paste0(outdir, "/", in_filename_params) - cat("\nInput file for mcsm comb data not specified, assuming filename: ", infile_params, "\n") +#---------------------------- +# columns for corr plots:PS +#---------------------------- +# NOTE: you can add mcsm_ppi column as well, and it will only select what it can find! +big_df_colnames = data.frame(names(merged_df2)) + +core_cols = c("mutationinformation", drug, "mutation_info_labels" + , "duet_stability_change", "ligand_affinity_change", "ddg_foldx", "asa", "rsa" + , "rd_values", "kd_values", "log10_or_mychisq", "neglog_pval_fisher","af" + , "deepddg" , "ddg_dynamut2" + , "consurf_score" + #, "consurf_scaled" + , "snap2_score" + #, "snap2_scaled", "snap2_accuracy_pc" + , "ligand_distance") + +if (tolower(gene)%in%geneL_normal){ + corr_cols_select = core_cols +} +if (tolower(gene)%in%geneL_na_dy){ + additional_cols = c("mcsm_na_affinity" + , "ddg_dynamut" + , "ddg_encom", "dds_encom" + , "ddg_mcsm", "ddg_sdm" + , "ddg_duet" + #, "mcsm_na_scaled" + #, "ddg_dynamut_scaled" + #, "ddg_encom_scaled", "dds_encom_scaled" + #, "ddg_mcsm_scaled", "ddg_sdm_scaled" + #, "ddg_duet_scaled" + ) + + corr_cols_select = c(core_cols, additional_cols) + } -# Input 1: read _comb_afor.csv -cat("\nReading mcsm combined data file: ", infile_params) -mcsm_df = read.csv(infile_params, header = T) -pd_df = plotting_data(mcsm_df) -my_df_u = pd_df[[1]] # this forms one of the input for combining_dfs_plotting() - -#-------------------------------- -# call: combining_dfs_plotting() -#-------------------------------- -#if (!exists("infile_metadata") && exists("gene")){ -if (!is.character(infile_metadata) && exists("gene")){ # when running as cmd - in_filename_metadata = paste0(tolower(gene), "_metadata.csv") # part combined for gid - infile_metadata = paste0(outdir, "/", in_filename_metadata) - cat("\nInput file for gene metadata not specified, assuming filename: ", infile_metadata, "\n") +if (tolower(gene)%in%geneL_na){ + additional_cols = c("mcsm_na_affinity" + #, "mcsm_na_scaled" + ) + + corr_cols_select = c(core_cols, additional_cols) + } -# Input 2: read _meta data.csv -cat("\nReading meta data file: ", infile_metadata) +if (tolower(gene)%in%geneL_ppi2){ + additional_cols = c("mcsm_ppi2_affinity") + corr_cols_select = c(core_cols, additional_cols) +} -gene_metadata <- read.csv(infile_metadata - , stringsAsFactors = F - , header = T) +# corr_cols_select <- c("mutationinformation", drug, "mutation_info_labels" +# , "duet_stability_change", "ligand_affinity_change", "ddg_foldx", "asa", "rsa" +# , "rd_values", "kd_values", "log10_or_mychisq", "neglog_pval_fisher","af" +# , "deepddg", "ddg_dynamut", "ddg_dynamut2", "mcsm_na_affinity" +# , "ddg_encom", "dds_encom", "ddg_mcsm", "ddg_sdm", "ddg_duet", "ligand_distance") -all_plot_dfs = combining_dfs_plotting(my_df_u - , gene_metadata - , lig_dist_colname = 'ligand_distance' - , lig_dist_cutoff = 10) +#=========================== +# Corr data for plots: PS +# big_df ps: ~ merged_df2 +#=========================== -cat(paste0("Directories imported:" - , "\ndatadir:", datadir - , "\nindir:", indir - , "\noutdir:", outdir - , "\nplotdir:", plotdir)) +corr_df_m2 = merged_df2[,colnames(merged_df2)%in%corr_cols_select] -cat(paste0("Variables imported:" - , "\ndrug:", drug - , "\ngene:", gene - , "\ngene_match:", gene_match - , "\nAngstrom symbol:", angstroms_symbol - , "\nNo. of duplicated muts:", dup_muts_nu - , "\nNA count for ORs:", na_count - , "\nNA count in df2:", na_count_df2 - , "\nNA count in df3:", na_count_df3)) +#----------------------- +# formatting: some cols +# Add pretty colnames +#----------------------- +corr_df_m2_f <- corr_df_m2 %>% dplyr::rename( + 'DUET' = duet_stability_change + , 'mCSM-lig' = ligand_affinity_change + , FoldX = ddg_foldx + , DeepDDG = deepddg + , ASA = asa + , RSA = rsa + , KD = kd_values + , RD = rd_values + , MAF = af + , 'Log (OR)' = log10_or_mychisq + , '-Log (P)' = neglog_pval_fisher + , Dynamut = ddg_dynamut + , 'ENCoM-DDG'= ddg_encom + , mCSM = ddg_mcsm + , SDM = ddg_sdm + , 'DUET-d' = ddg_duet + , 'ENCoM-DDS'= dds_encom + , Dynamut2 = ddg_dynamut2 + , 'mCSM-NA' = mcsm_na_affinity ) -#======= -# output -#======= -# corr_ps_df2 -# corr_lig_df2 +#=========================== +# Corr data for plots: PS +# short_df ps: ~merged_df3 +#=========================== -#################################################################### -# end of loading libraries and functions -#################################################################### +corr_df_m3 = corr_df_m2[!duplicated(corr_df_m2$mutationinformation),] -#%%%%%%%%%%%%%%%%%%%%%%%%% -#df_ps = merged_df3 -df_ps = merged_df2 +na_or = sum(is.na(corr_df_m3$log10_or_mychisq)) +check1 = nrow(corr_df_m3) - na_or; check1 -#df_lig = merged_df3_lig -df_lig = merged_df2_lig - -#%%%%%%%%%%%%%%%%%%%%%%%%% +if (nrow(corr_df_m3) == nrow(merged_df3) && nrow(merged_df3_comp) == check1) { + cat( "\nPASS: No. of rows for corr_df_m3 match" + , "\nPASS: No. of OR values checked: " , check1) +} else { + cat("\nFAIL: Numbers mismatch:" + , "\nExpected nrows: ", nrow(merged_df3) + , "\nGot: ", nrow(corr_df_m3) + , "\nExpected OR values: ", nrow(merged_df3_comp) + , "\nGot: ", check1) +} +#----------------------- +# formatting: some cols +# Add pretty colnames +#----------------------- +corr_df_m3_f <- corr_df_m3 %>% + rename( + DUET = duet_stability_change + , 'mCSM-lig' = ligand_affinity_change + , FoldX = ddg_foldx + , DeepDDG = deepddg + , ASA = asa + , RSA = rsa + , KD = kd_values + , RD = rd_values + , MAF = af + , 'Log (OR)' = log10_or_mychisq + , '-Log (P)' = neglog_pval_fisher + , Dynamut = ddg_dynamut + , 'ENCoM-DDG'= ddg_encom + , mCSM = ddg_mcsm + , SDM = ddg_sdm + , 'DUET-d' = ddg_duet + , 'ENCoM-DDS'= dds_encom + , Dynamut2 = ddg_dynamut2 + , 'mCSM-NA' = mcsm_na_affinity ) ######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -#====================== -# adding log cols -#====================== -df_ps$log10_or_mychisq = log10(df_ps$or_mychisq) -df_ps$neglog_pval_fisher = -log10(df_ps$pval_fisher) - -df_ps$log10_or_kin = log10(df_ps$or_kin) -df_ps$neglog_pwald_kin = -log10(df_ps$pwald_kin) - -#df_ps$mutation_info_labels = ifelse(df_ps$mutation_info == dr_muts_col, 1, 0) - -#=============================== -# Data for Correlation plots:PS -#=============================== -# subset data to generate pairwise correlations -cols_to_select = c("mutationinformation" - , "duet_scaled" - , "foldx_scaled" - #, "mutation_info_labels" - , "asa" - , "rsa" - , "rd_values" - , "kd_values" - , "log10_or_mychisq" - , "neglog_pval_fisher" - , "or_kin" - , "neglog_pwald_kin" - , "af" - #, "af_kin" - , "duet_outcome" - , drug) - -corr_data_ps = df_ps[cols_to_select] - -dim(corr_data_ps) - -# assign nice colnames (for display) -my_corr_colnames = c("Mutation" - , "DUET" - , "Foldx" - #, "Mutation class" - , "ASA" - , "RSA" - , "RD" - , "KD" - , "Log (OR)" - , "-Log (P)" - , "Adjusted (OR)" - , "-Log (P wald)" - , "AF" - , "AF_kin" - , "duet_outcome" - , drug) - -length(my_corr_colnames) - -colnames(corr_data_ps) -colnames(corr_data_ps) <- my_corr_colnames -colnames(corr_data_ps) - -start = 1 -end = which(colnames(corr_data_ps) == drug); end # should be the last column -offset = 1 - -#corr_ps_df2 = corr_data_ps[start:(end-offset)] # without drug -corr_ps_df2 = corr_data_ps[start:end] -head(corr_ps_df2) - -#-------------------------- -# short_df ps: merged_df3 -#-------------------------- -corr_ps_df3 = corr_ps_df2[!duplicated(corr_ps_df2$Mutation),] - -na_or = sum(is.na(corr_ps_df3$`Log (OR)`)) -check1 = nrow(corr_ps_df3) - na_or - -na_adj_or = sum(is.na(corr_ps_df3$`adjusted (OR)`)) -check2 = nrow(corr_ps_df3) - na_adj_or - -#if ( nrow(corr_ps_df3) == nrow(merged_df3) ) { -# cat( "PASS: No. of rows for corr_ps_df3 match" ) -#}if ( nrow(merged_df3_comp) == check1 ){ -# cat( "PASS: No. of OR values checked" ) -#} - -################################################################################################ -#================================= -# Data for Correlation plots: LIG -#================================= -table(df_lig$ligand_outcome) - -df_lig$log10_or_mychisq = log10(df_lig$or_mychisq) -df_lig$neglog_pval_fisher = -log10(df_lig$pval_fisher) - -df_lig$log10_or_kin = log10(df_lig$or_kin) -df_lig$neglog_pwald_kin = -log10(df_lig$pwald_kin) - -# subset data to generate pairwise correlations -cols_to_select = c("mutationinformation" - , "affinity_scaled" - #, "mutation_info_labels" - , "asa" - , "rsa" - , "rd_values" - , "kd_values" - , "log10_or_mychisq" - , "neglog_pval_fisher" - , "or_kin" - , "neglog_pwald_kin" - , "af" - , "af_kin" - , "ligand_outcome" - , drug) - -corr_data_lig = df_lig[, cols_to_select] - - -dim(corr_data_lig) - -# assign nice colnames (for display) -my_corr_colnames = c("Mutation" - , "Ligand Affinity" - #, "Mutation class" - , "ASA" - , "RSA" - , "RD" - , "KD" - , "Log (OR)" - , "-Log (P)" - , "Adjusted (OR)" - , "-Log (P wald)" - , "AF" - , "AF_kin" - , "ligand_outcome" - , drug) - -length(my_corr_colnames) - -colnames(corr_data_lig) -colnames(corr_data_lig) <- my_corr_colnames -colnames(corr_data_lig) - -start = 1 -end = which(colnames(corr_data_lig) == drug); end # should be the last column -offset = 1 - -#corr_lig_df2 = corr_data_lig[start:(end-offset)] # without drug -corr_lig_df2 = corr_data_lig[start:end] -head(corr_lig_df2) - -#----------------- -# short_df lig: merged_df3_lig -#----------------- - -corr_lig_df3 = corr_lig_df2[!duplicated(corr_lig_df2$Mutation),] - -####################################################### -rm(merged_df2, merged_df2_lig, merged_df3, merged_df3_lig - , merged_df2_comp , merged_df3_comp, merged_df2_comp_lig, merged_df3_comp_lig - , corr_data_ps, corr_data_lig) \ No newline at end of file +cat("\nCorr Data created:" +, "\n===================================" +, "\ncorr_df_m2: created from merged_df2" +, "\n===================================" +, "\nnrows:", nrow(corr_df_m2) +, "\nncols:", ncol(corr_df_m2) +, "\n===================================" +, "\ncorr_df_m3: created from merged_df3" +, "\n===================================" +, "\nnrows:", nrow(corr_df_m3) +, "\nncols:", ncol(corr_df_m3) +)