diff --git a/config/gid.R b/config/gid.R new file mode 100644 index 0000000..226af91 --- /dev/null +++ b/config/gid.R @@ -0,0 +1,2 @@ +gene = "gid" +drug = "streptomycin" diff --git a/dynamut/format_results_dynamut.py b/dynamut/format_results_dynamut.py old mode 100644 new mode 100755 diff --git a/dynamut/format_results_dynamut2.py b/dynamut/format_results_dynamut2.py old mode 100644 new mode 100755 diff --git a/dynamut/notes.txt b/dynamut/notes.txt new file mode 100644 index 0000000..97e6d02 --- /dev/null +++ b/dynamut/notes.txt @@ -0,0 +1,11 @@ +Dynamut was painfully run for gid, part manually, part programatically! + +However, it was decided to ditch that and only run Dynamut2 for future targets + +Dynamut2 was run through the website in batches of 50 for +katG: 17 batches (00..16) +rpoB: 23 batches (00..22) +alr: 6 batches (00..05) + +However, the use of API was made for rpoB batches (09-22) from 13 Oct 2021 +as jobs started to flake and fail through the website! diff --git a/dynamut/run_format_results_dynamut.py b/dynamut/run_format_results_dynamut.py old mode 100644 new mode 100755 index 02af524..cb6fe70 --- a/dynamut/run_format_results_dynamut.py +++ b/dynamut/run_format_results_dynamut.py @@ -20,8 +20,45 @@ from format_results_dynamut2 import * # variables # TODO: add cmd line args -gene = 'gid' -drug = 'streptomycin' +#gene = +#drug = + +#%% command line args +arg_parser = argparse.ArgumentParser() +arg_parser.add_argument('-d', '--drug', help='drug name (case sensitive)', default = None) +arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = None) +arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data') +arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + + input') +arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + + output') +#arg_parser.add_argument('-m', '--make_dirs', help = 'Make dir for input and output', action='store_true') # should be handled elsewhere! + +arg_parser.add_argument('--debug', action ='store_true', help = 'Debug Mode') + +args = arg_parser.parse_args() +#======================================================================= +#%% variable assignment: input and output paths & filenames +drug = args.drug +gene = args.gene +datadir = args.datadir +indir = args.input_dir +outdir = args.output_dir +#make_dirs = args.make_dirs + +#%% input and output dirs and files +#======= +# dirs +#======= +if not datadir: + datadir = homedir + '/' + 'git/Data' + +if not indir: + indir = datadir + '/' + drug + '/input' + +if not outdir: + outdir = datadir + '/' + drug + '/output' + +#%%===================================================================== + datadir = homedir + '/git/Data' indir = datadir + '/' + drug + '/input' outdir = datadir + '/' + drug + '/output' @@ -29,12 +66,12 @@ outdir_dynamut = outdir + '/dynamut_results/' outdir_dynamut2 = outdir + '/dynamut_results/dynamut2/' # Input file -infile_dynamut = outdir_dynamut + gene + '_dynamut_all_output_clean.csv' -infile_dynamut2 = outdir_dynamut2 + gene + '_dynamut2_output_combined_clean.csv' +#infile_dynamut = outdir_dynamut + gene.lower() + '_dynamut_all_output_clean.csv' +infile_dynamut2 = outdir_dynamut2 + gene.lower() + '_dynamut2_output_combined_clean.csv' # Formatted output filename -outfile_dynamut_f = outdir_dynamut2 + gene + '_complex_dynamut_norm.csv' -outfile_dynamut2_f = outdir_dynamut2 + gene + '_complex_dynamut2_norm.csv' +#outfile_dynamut_f = outdir_dynamut2 + gene + '_dynamut_norm.csv' +outfile_dynamut2_f = outdir_dynamut2 + gene.lower() + '_dynamut2_norm.csv' #=============================== # CALL: format_results_dynamut @@ -69,4 +106,4 @@ print('Finished writing file:' , '\nExpected no. of cols:', len(dynamut2_df_f.columns) , '\n=============================================================') -#%%##################################################################### \ No newline at end of file +#%%##################################################################### diff --git a/dynamut/run_get_results_dynamut.py b/dynamut/run_get_results_dynamut.py index e9e82ef..029e934 100755 --- a/dynamut/run_get_results_dynamut.py +++ b/dynamut/run_get_results_dynamut.py @@ -17,8 +17,8 @@ my_host = 'http://biosig.unimelb.edu.au' #headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"} # TODO: add cmd line args -#gene = 'gid' -drug = 'streptomycin' +# gene = +# drug = datadir = homedir + '/git/Data/' indir = datadir + drug + '/input/' outdir = datadir + drug + '/output/' @@ -41,4 +41,4 @@ get_results(url_file = my_url_file , output_dir = outdir , outfile_suffix = my_suffix) -######################################################################## \ No newline at end of file +######################################################################## diff --git a/dynamut/split_csv.sh b/dynamut/split_csv.sh index b5f15f1..1f7a793 100755 --- a/dynamut/split_csv.sh +++ b/dynamut/split_csv.sh @@ -1,6 +1,6 @@ #!/bin/bash -# FIXME: This is written for expediency to kickstart running dynamut and mcsm-NA +# FIXME: This is written for expediency to kickstart running dynamut, mcsm-PPI2 (batch pf 50) and mCSM-NA (batch of 20) # Usage: ~/git/LSHTM_analysis/dynamut/split_csv.sh # copy your snp file to split into the dynamut dir @@ -12,8 +12,13 @@ CHUNK=$3 mkdir -p ${OUTDIR}/${CHUNK} cd ${OUTDIR}/${CHUNK} +# makes the 2 dirs, hence ../.. split ../../${INFILE} -l ${CHUNK} -d snp_batch_ # use case #~/git/LSHTM_analysis/dynamut/split_csv.sh gid_mcsm_formatted_snps.csv snp_batches 50 #~/git/LSHTM_analysis/dynamut/split_csv.sh embb_mcsm_formatted_snps.csv snp_batches 50 +#~/git/LSHTM_analysis/dynamut/split_csv.sh pnca_mcsm_formatted_snps.csv snp_batches 50 +#~/git/LSHTM_analysis/dynamut/split_csv.sh katg_mcsm_formatted_snps.csv snp_batches 50 #Date: 20/09/2021 + +# add .txt to the files diff --git a/dynamut/split_csv_chain.sh b/dynamut/split_csv_chain.sh new file mode 100755 index 0000000..ac60faa --- /dev/null +++ b/dynamut/split_csv_chain.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +# FIXME: This is written for expediency to kickstart running dynamut, mcsm-PPI2 (batch pf 50) and mCSM-NA (batch of 20) + +# Usage: ~/git/LSHTM_analysis/dynamut/split_csv.sh +# copy your snp file to split into the dynamut dir +# use sed to add chain ID to snp file and then split to avoid post processing + +INFILE=$1 +OUTDIR=$2 +CHUNK=$3 + +mkdir -p ${OUTDIR}/${CHUNK}/chain_added +cd ${OUTDIR}/${CHUNK}/chain_added + +# makes the 3 dirs, hence ../.. +split ../../../${INFILE} -l ${CHUNK} -d snp_batch_ + +######################################################################## +# use cases +# Date: 20/09/2021 +# sed -e 's/^/A /g' katg_mcsm_formatted_snps.csv > katg_mcsm_formatted_snps_chain.csv +#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh katg_mcsm_formatted_snps_chain.csv snp_batches 50 + +# Date: 01/10/2021 +# sed -e 's/^/A /g' rpob_mcsm_formatted_snps.csv > rpob_mcsm_formatted_snps_chain.csv +#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh rpob_mcsm_formatted_snps_chain.csv snp_batches 50 + +# Date: 02/10/2021 +# sed -e 's/^/A /g' alr_mcsm_formatted_snps.csv > alr_mcsm_formatted_snps_chain.csv +#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh alr_mcsm_formatted_snps_chain.csv snp_batches 50 + +# Date: 05/10/2021 +#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh alr_mcsm_formatted_snps_chain.csv snp_batches 20 + +# add .txt to the files +######################################################################## diff --git a/foldx/runFoldx.py b/foldx/runFoldx.py index 8d9358b..12e00c9 100755 --- a/foldx/runFoldx.py +++ b/foldx/runFoldx.py @@ -41,7 +41,7 @@ arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By arg_parser.add_argument('-p', '--process_dir', help = 'Temp processing dir for running foldX. By default, it assmes homedir + + processing. Make sure it is somewhere with LOTS of storage as it writes all output!') #FIXME arg_parser.add_argument('-P', '--pdb_file', help = 'PDB File to process. By default, it assmumes a file called _complex.pdb in input_dir') -arg_parser.add_argument('-m', '--mutation_file', help = 'Mutation list. By default, assumes a file called _mcsm_snps.csv exists') +arg_parser.add_argument('-m', '--mutation_file', help = 'Mutation list. By default, assumes a file called _mcsm_formatted_snps.csv exists') # FIXME: Doesn't work with 2 chains yet! arg_parser.add_argument('-c1', '--chain1', help = 'Chain1 ID', default = 'A') # case sensitive @@ -148,6 +148,16 @@ print('Arguments being passed:' , '\noutput file:', outfile_foldx , '\n=============================================================') + +# make sure rotabase.txt exists in the process_dir +rotabase_file = process_dir + '/' + 'rotabase.txt' + +if Path(rotabase_file).is_file(): + print(f'rotabase file: {rotabase_file} exists') +else: + print(f'ERROR: rotabase file: {rotabase_file} does not exist. Please download it and put it in {process_dir}') + sys.exit() + #### Delay for 10 seconds to check the params #### print('Sleeping for 10 seconds to give you time to cancel') time.sleep(10) @@ -235,6 +245,13 @@ def main(): nmuts = len(mutlist) print(nmuts) print(mutlist) + print('start') + #subprocess.check_output(['bash','repairPDB.sh', pdbname, process_dir]) + print('\033[95mSTAGE: repair PDB\033[0m') + print('EXECUTING: repairPDB.sh %s %s %s' % (indir, actual_pdb_filename, process_dir)) + #subprocess.check_output(['bash','repairPDB.sh', indir, actual_pdb_filename, process_dir]) + # once you decide to use the function + # repairPDB(pdbname) print('start') # some common parameters for foldX @@ -242,61 +259,74 @@ def main(): print('\033[95mSTAGE: repair PDB (foldx subprocess) \033[0m') print('Running foldx RepairPDB for WT') - subprocess.call(['foldx' + + fold_RepairDB = ['foldx' , '--command=RepairPDB' , foldx_common - , '--pdb-dir=' + os.path.dirname(pdb_filename) +# , '--pdb-dir=' + os.path.dirname(pdb_filename) + , '--pdb-dir=' + indir , '--pdb=' + actual_pdb_filename , 'outPDB=true' - , '--output-dir=' + process_dir]) + , '--output-dir=' + process_dir] + print('CMD:', fold_RepairDB) + subprocess.call(fold_RepairDB) print('\033[95mCOMPLETED STAGE: repair PDB\033[0m') print('\n==========================================================') print('\033[95mSTAGE: Foldx commands BM, PN and SD (foldx subprocess) for WT\033[0m') print('Running foldx BuildModel for WT') - subprocess.call(['foldx' + + foldx_BuildModel = ['foldx' , '--command=BuildModel' , foldx_common , '--pdb-dir=' + process_dir , '--pdb=' + pdbname + '_Repair.pdb' - , '--mutant-file="individual_list_' + pdbname +'.txt"' + , '--mutant-file=' + process_dir + '/' + 'individual_list_' + pdbname +'.txt' , 'outPDB=true' , '--numberOfRuns=1' - , '--output-dir=' + process_dir], cwd=process_dir) + , '--output-dir=' + process_dir] + print('CMD:', foldx_BuildModel) + subprocess.call( foldx_BuildModel, cwd=process_dir) print('Running foldx PrintNetworks for WT') - subprocess.call(['foldx' + foldx_PrintNetworks = ['foldx' , '--command=PrintNetworks' , '--pdb-dir=' + process_dir , '--pdb=' + pdbname + '_Repair.pdb' , '--water=PREDICT' , '--vdwDesign=1' - , '--output-dir=' + process_dir], cwd=process_dir) + , '--output-dir=' + process_dir] + print('CMD:', foldx_PrintNetworks) + subprocess.call(foldx_PrintNetworks, cwd=process_dir) print('Running foldx SequenceDetail for WT') - subprocess.call(['foldx' + foldx_SequenceDetail = ['foldx' , '--command=SequenceDetail' , '--pdb-dir=' + process_dir , '--pdb=' + pdbname + '_Repair.pdb' , '--water=PREDICT' , '--vdwDesign=1' - , '--output-dir=' + process_dir], cwd=process_dir) + , '--output-dir=' + process_dir] + print('CMD:', foldx_SequenceDetail) + subprocess.call(foldx_SequenceDetail , cwd=process_dir) + print('\033[95mCOMPLETED STAGE: Foldx commands BM, PN and SD\033[0m') print('\n==========================================================') - print('\033[95mSTAGE: Print Networks (foldx subprocess) for MT\033[0m') for n in range(1,nmuts+1): print('\033[95mNETWORK:\033[0m', n) print('Running foldx PrintNetworks for mutation', n) - subprocess.call(['foldx' + foldx_PrintNetworksMT = ['foldx' , '--command=PrintNetworks' , '--pdb-dir=' + process_dir , '--pdb=' + pdbname + '_Repair_' + str(n) + '.pdb' , '--water=PREDICT' , '--vdwDesign=1' - , '--output-dir=' + process_dir], cwd=process_dir) + , '--output-dir=' + process_dir] + print('CMD:', foldx_PrintNetworksMT) + subprocess.call( foldx_PrintNetworksMT , cwd=process_dir) print('\033[95mCOMPLETED STAGE: Print Networks (foldx subprocess) for MT\033[0m') print('\n==========================================================') @@ -323,14 +353,16 @@ def main(): print('\033[95mSTAGE: Running foldx AnalyseComplex (foldx subprocess) for WT\033[0m') chain1=chainA chain2=chainB - subprocess.call(['foldx' + foldx_AnalyseComplex = ['foldx' , '--command=AnalyseComplex' , '--pdb-dir=' + process_dir , '--pdb=' + pdbname + '_Repair.pdb' , '--analyseComplexChains=' + chain1 + ',' + chain2 , '--water=PREDICT' , '--vdwDesign=1' - , '--output-dir=' + process_dir], cwd=process_dir) + , '--output-dir=' + process_dir] + print('CMD:',foldx_AnalyseComplex) + subprocess.call(foldx_AnalyseComplex, cwd=process_dir) # FIXME why would we ever need to do this?!? Cargo-culted from runcomplex.sh ac_source = process_dir + '/Summary_' + pdbname + '_Repair_AC.fxout' @@ -340,14 +372,16 @@ def main(): for n in range(1,nmuts+1): print('\033[95mSTAGE: Running foldx AnalyseComplex (foldx subprocess) for mutation:\033[0m', n) - subprocess.call(['foldx' + foldx_AnalyseComplex = ['foldx' , '--command=AnalyseComplex' , '--pdb-dir=' + process_dir , '--pdb=' + pdbname + '_Repair_' + str(n) + '.pdb' , '--analyseComplexChains=' + chain1 + ',' + chain2 , '--water=PREDICT' , '--vdwDesign=1' - , '--output-dir=' + process_dir], cwd=process_dir) + , '--output-dir=' + process_dir] + print('CMD:', foldx_AnalyseComplex) + subprocess.call( foldx_AnalyseComplex , cwd=process_dir) # FIXME why would we ever need to do this?!? Cargo-culted from runcomplex.sh ac_mut_source = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) +'_AC.fxout' diff --git a/mcsm/run_mcsm.py b/mcsm/run_mcsm.py index 7e38543..da621f5 100755 --- a/mcsm/run_mcsm.py +++ b/mcsm/run_mcsm.py @@ -104,7 +104,7 @@ if mutation_filename: in_filename_snps = mutation_filename else: in_filename_snps = gene.lower() + '_mcsm_formatted_snps.csv' - + infile_snps = outdir + '/' + in_filename_snps #======= diff --git a/mcsm_na/format_results_mcsm_na.py b/mcsm_na/format_results_mcsm_na.py index 95cd9e8..335301c 100644 --- a/mcsm_na/format_results_mcsm_na.py +++ b/mcsm_na/format_results_mcsm_na.py @@ -51,7 +51,7 @@ def format_mcsm_na_output(mcsm_na_output_tsv): print('Assigning meaningful colnames' , '\n=======================================================') my_colnames_dict = {'PDB_FILE': 'pdb_file' # relevant info from this col will be extracted and the column discarded - , 'CHAIN': 'chain' # {wild_type}{mutant_type} + , 'CHAIN': 'chain' , 'WILD_RES': 'wild_type' # one letter amino acid code , 'RES_POS': 'position' # number , 'MUT_RES': 'mutant_type' # one letter amino acid code @@ -65,8 +65,8 @@ def format_mcsm_na_output(mcsm_na_output_tsv): ############# # create mutationinformation column ############# - mcsm_na_data['mutationinformation'] = mcsm_na_data['wild_type'] + mcsm_na_data.position.map(str) + mcsm_na_data['mutant_type'] - + #mcsm_na_data['mutationinformation'] = mcsm_na_data['wild_type'] + mcsm_na_data.position.map(str) + mcsm_na_data['mutant_type'] + mcsm_na_data['mutationinformation'] = mcsm_na_data.loc[:,'wild_type'] + mcsm_na_data.loc[:,'position'].astype(int).apply(str) + mcsm_na_data.loc[:,'mutant_type'] #%%===================================================================== ############# # Create col: mcsm_na_outcome @@ -131,5 +131,4 @@ def format_mcsm_na_output(mcsm_na_output_tsv): , 'chain' , 'pdb_file']] return(mcsm_na_dataf) -#%%##################################################################### - +#%%##################################################################### \ No newline at end of file diff --git a/my_header.R b/my_header.R index 5009f29..2fa892c 100644 --- a/my_header.R +++ b/my_header.R @@ -1,21 +1,31 @@ ######################################################### -### A) Installing and loading required packages +# A) Installing and loading required packages +# B) My functions +######################################################### + ######################################################### #lib_loc = "/usr/local/lib/R/site-library") -#if (!require("gplots")) { -# install.packages("gplots", dependencies = TRUE) -# library(gplots) -#} +require("getopt", quietly = TRUE) # cmd parse arguments -#if (!require("tidyverse")) { -# install.packages("tidyverse", dependencies = TRUE) -# library(tidyverse) -#} +if (!require("tidyverse")) { + install.packages("tidyverse", dependencies = TRUE) + library(tidyverse) +} -if (!require("ggplot2")) { - install.packages("ggplot2", dependencies = TRUE) - library(ggplot2) +if (!require("shiny")) { + install.packages("shiny", dependencies = TRUE) + library(shiny) +} + +if (!require("shinyBS")) { + install.packages("shinyBS", dependencies = TRUE) + library(shinyBS) +} + +if (!require("gridExtra")) { + install.packages("gridExtra", dependencies = TRUE) + library(gridExtra) } if (!require("ggridges")) { @@ -23,6 +33,35 @@ if (!require("ggridges")) { library(ggridges) } +# if (!require("ggplot2")) { +# install.packages("ggplot2", dependencies = TRUE) +# library(ggplot2) +# } + +# if (!require ("dplyr")){ +# install.packages("dplyr") +# library(dplyr) +# } + +if (!require ("DT")){ + install.packages("DT") + library(DT) +} + +if (!require ("plyr")){ + install.packages("plyr") + library(plyr) + } + +# Install +#if(!require(devtools)) install.packages("devtools") +#devtools::install_github("kassambara/ggcorrplot") + +if (!require ("ggbeeswarm")){ + install.packages("ggbeeswarm") + library(ggbeeswarm) +} + if (!require("plotly")) { install.packages("plotly", dependencies = TRUE) library(plotly) @@ -103,11 +142,6 @@ if (!require ("psych")){ library(psych) } -if (!require ("dplyr")){ - install.packages("dplyr") - library(dplyr) -} - if (!require ("compare")){ install.packages("compare") library(compare) @@ -118,18 +152,37 @@ if (!require ("arsenal")){ library(arsenal) } +if(!require(ggseqlogo)){ + install.packages("ggseqlogo") + library(ggseqlogo) +} -####TIDYVERSE -# Install -#if(!require(devtools)) install.packages("devtools") -#devtools::install_github("kassambara/ggcorrplot") - -#library(ggcorrplot) - - -###for PDB files -#install.packages("bio3d") +# for PDB files if(!require(bio3d)){ install.packages("bio3d") library(bio3d) } + +library(protr) +if(!require(protr)){ + install.packages("protr") + library(protr) +} + +#if (!requireNamespace("BiocManager", quietly = TRUE)) +# install.packages("BiocManager") + +#BiocManager::install("Logolas") +library("Logolas") + + +#################################### +# Load all my functions: +# only works if tidyverse is loaded +# hence included it here! +#################################### + +func_path = "~/git/LSHTM_analysis/scripts/functions/" +source_files <- list.files(func_path, "\\.R$") # locate all .R files +map(paste0(func_path, source_files), source) # source all your R scripts! + diff --git a/scripts/functions/bp_lineage.R b/scripts/functions/bp_lineage.R new file mode 100644 index 0000000..ad43386 --- /dev/null +++ b/scripts/functions/bp_lineage.R @@ -0,0 +1,91 @@ +######################################## +# Lineage barplot +# Lineage and nsSNP count barplot +# Lineage Diversity barplot +######################################## + +lin_count_bp <- function( lf_data + , x_categ = "" + , y_count = "" + , bar_fill_categ = "" + , display_label_col = "" + , bar_stat_stype = "identity" + , x_lab_angle = 90 + , d_lab_size = 5 + , d_lab_hjust = 0.5 + , d_lab_vjust = 0.5 + , d_lab_col = "black" + , my_xats = 20 # x axis text size + , my_yats = 20 # y axis text size + , my_xals = 22 # x axis label size + , my_yals = 22 # y axis label size + , my_lls = 22 # legend label size + , bar_col_labels = "" + , bar_col_values = "" + , bar_leg_name = "" + , leg_location = "top" + , y_log10 = FALSE + , y_scale_percent = FALSE + , y_label = c("Count", "SNP diversity") + ) { + g = ggplot(lf_data + , aes( x = factor( eval(parse(text = x_categ)), ordered = T ) + , y = eval(parse(text = y_count)) + , fill = eval(parse(text = bar_fill_categ)) ) ) + + OutPlot = g + geom_bar( stat = bar_stat_stype + , position = position_stack(reverse = TRUE) + #, alpha = 1 + #, colour = "grey75" + ) + + theme(axis.text.x = element_text(size = my_xats + , angle = x_lab_angle) + , axis.text.y = element_text(size = my_yats + , angle = 90 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = my_xals + , colour = "black") + , axis.title.y = element_text(size = my_yals + , colour = "black") + , legend.position = leg_location + , legend.text = element_text(size = my_lls)) + + + geom_label(aes(label = eval(parse(text = display_label_col))) + , size = d_lab_size + , hjust = d_lab_hjust + , vjust = d_lab_vjust + , colour = d_lab_col + , show.legend = FALSE + #, check_overlap = TRUE + , position = position_stack(reverse = T)) + + + scale_fill_manual(values = bar_col_values + , name = bar_leg_name + , labels = bar_col_labels) + + labs(title = "" + , x = "" + , y = y_label + , colour = "black") + + if (y_log10){ + + OutPlot = OutPlot + + scale_y_continuous(trans = "log10" + , labels = trans_format("log10", math_format(10^.x) ) ) + } + + if (y_scale_percent){ + + OutPlot = OutPlot + + scale_y_continuous(labels = scales::percent_format(accuracy = 1)) + + #scale_y_continuous(labels = scales::percent) + + + labs(title = "" + , x = "" + , y = y_label + , colour = "black") + } + + return(OutPlot) +} diff --git a/scripts/functions/bp_subcolours.R b/scripts/functions/bp_subcolours.R index a3cc403..91a8914 100755 --- a/scripts/functions/bp_subcolours.R +++ b/scripts/functions/bp_subcolours.R @@ -3,7 +3,7 @@ # LINK: https://stackoverflow.com/questions/49818271/stacked-barplot-with-colour-gradients-for-each-bar ######################################################### -ColourPalleteMulti <- function(df, group, subgroup){ +ColourPalleteMulti = function(df, group, subgroup){ # Find how many colour categories to create and the number of colours in each categories <- aggregate(as.formula(paste(subgroup, group, sep="~" )) @@ -24,4 +24,88 @@ ColourPalleteMulti <- function(df, group, subgroup){ , category.end[i]))(categories[i,2])})) return(colours) } -######################################################### \ No newline at end of file +######################################################################### + +######################## +# Generate bp with +# colour palette derived +# from the data using +# above function +######################### + +bp_stability_hmap <- function(plotdf = merged_df3 + , xvar_colname = "position" + #, bar_col_colname = "group" + , stability_colname = "" + , stability_outcome_colname = "" + , p_title = "" # "Protein stability (DUET)" + , my_xaxls = 12 # x-axis label size + , my_yaxls = 20 # y-axis label size + , my_xaxts = 18 # x-axis text size + , my_yaxts = 20 # y-axis text size + , my_pts = 20 # plot-title size + , my_xlab = "Position" + , my_ylab = "No. of nsSNPs" +) +{ + + # order the df by position and ensure it is a factor + plotdf = plotdf[order(plotdf[[xvar_colname]]), ] + plotdf[[xvar_colname]] = factor(plotdf[[xvar_colname]]) + + #cat("\nSneak peak:\n") + head(data.frame( plotdf[[xvar_colname]], plotdf[[stability_colname]] ) ) + + # stability values isolated to help with generating column called: 'group' + my_grp = plotdf[[stability_colname]] + cat( "\nLength of nsSNPs:", length(my_grp) + , "\nLength of unique values for nsSNPs:", length(unique(my_grp)) ) + + # Add col: 'group' + plotdf$group = paste0(plotdf[[stability_outcome_colname]], "_", my_grp, sep = "") + + # check unique values in normalised data + cat("\nNo. of unique values in", stability_colname, "no rounding:" + , length(unique(plotdf[[stability_colname]]))) + + # Call the function to create the palette based on the group defined above + #subcols_ps + subcols_bp_hmap = ColourPalleteMulti(plotdf, stability_outcome_colname, stability_colname) + + cat("\nNo. of sub colours generated:", length(subcols_bp_hmap)) + + #------------------------------- + # Generate the subcols barplot + #------------------------------- + + #g = ggplot(plotdf, aes(x = factor(position, ordered = T))) + g = ggplot(plotdf, aes_string(x = xvar_colname + # , ordered = T) + )) + + + OutWidePlot = g + geom_bar(aes(fill = group) + , colour = "grey") + + + scale_fill_manual( values = subcols_bp_hmap + , guide = "none") + + + theme( axis.text.x = element_text(size = my_xaxls + , angle = 90 + , hjust = 1 + , vjust = 0.4) + , axis.text.y = element_text(size = my_yaxls + , angle = 0 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = my_xaxts) + , axis.title.y = element_text(size = my_yaxts ) + , plot.title = element_text(size = my_pts + , hjust = 0.5)) + + + labs(title = p_title + , x = my_xlab + , y = my_ylab) + + return(OutWidePlot) +} diff --git a/scripts/functions/combining_dfs_plotting.R b/scripts/functions/combining_dfs_plotting.R index 18e0374..107c114 100644 --- a/scripts/functions/combining_dfs_plotting.R +++ b/scripts/functions/combining_dfs_plotting.R @@ -152,6 +152,46 @@ combining_dfs_plotting <- function( my_df_u unique(meta_muts_u[! meta_muts_u %in% merged_muts_u]) quit() } + + # Quick formatting: ordering df and pretty labels + + #------------------------------ + # sorting by column: position + #------------------------------ + merged_df2 = merged_df2[order(merged_df2$position), ] + + #----------------------- + # mutation_info_labels + #----------------------- + merged_df2$mutation_info_labels = ifelse(merged_df2$mutation_info == dr_muts_col + , "DM", "OM") + merged_df2$mutation_info_labels = factor(merged_df2$mutation_info_labels) + #----------------------- + # lineage labels + #----------------------- + merged_df2$lineage_labels = gsub("lineage", "L", merged_df2$lineage) + + merged_df2$lineage_labels = factor(merged_df2$lineage_labels, c("L1" + , "L2" + , "L3" + , "L4" + , "L5" + , "L6" + , "L7" + , "LBOV" + , "L1;L2" + , "L1;L3" + , "L1;L4" + , "L2;L3" + , "L2;L3;L4" + , "L2;L4" + , "L2;L6" + , "L2;LBOV" + , "L3;L4" + , "L4;L6" + , "L4;L7" + , "")) + #================================================================= # Merge 2: merged_df3 diff --git a/scripts/functions/lf_bp.R b/scripts/functions/lf_bp.R new file mode 100644 index 0000000..675658e --- /dev/null +++ b/scripts/functions/lf_bp.R @@ -0,0 +1,197 @@ +############################# +# Barplots: ggplot +# stats +/- +# violin +/- +# barplot +/ +# beeswarm +############################# + +lf_bp <- function(lf_df + , p_title = "" + , colour_categ = "" + , x_grp = "mutation_info" + , y_var = "param_value" + , facet_var = "param_type" + , n_facet_row = 1 + , y_scales = "free_y" + , colour_bp_strip = "khaki2" + , dot_size = 3 + , dot_transparency = 0.3 + , violin_quantiles = c(0.25, 0.5, 0.75) # can be NULL + , my_ats = 22 # axis text size + , my_als = 20 # axis label size + , my_fls = 20 # facet label size + , my_pts = 22 # plot title size) + , make_boxplot = FALSE + , bp_width = c("auto", 0.5) + , add_stats = TRUE + , stat_grp_comp = c("DM", "OM") + , stat_method = "wilcox.test" + , my_paired = FALSE + , stat_label = c("p.format", "p.signif") ){ + + fwv = as.formula(paste0("~", facet_var)) + #fwv = reformulate(facet_var) + + p1 <- ggplot(lf_df, aes_string(x = x_grp, y = y_var)) + + + facet_wrap( fwv + , nrow = n_facet_row + , scales = y_scales) + + + geom_violin(trim = T + , scale = "width" + #, position = position_dodge(width = 0.9) + , draw_quantiles = violin_quantiles) + + if (make_boxplot){ + + if (bp_width == "auto"){ + bp_width = 0.5/length(unique(lf_df[[x_grp]])) + cat("\nAutomatically calculated boxplot width, using bp_width:\n", bp_width, "\n") + }else{ + cat("\nBoxplot width value provided, using:", bp_width, "\n") + bp_width = bp_width} + + p2 = p1 + geom_boxplot(fill = "white" + , outlier.colour = NA + #, position = position_dodge(width = 0.9) + , width = bp_width) + + geom_beeswarm(priority = "density" + #, shape = 21 + , size = dot_size + , alpha = dot_transparency + , show.legend = FALSE + , cex = 0.8 + , aes(colour = factor(eval(parse(text = colour_categ))) )) + + } else { + # ggbeeswarm (better than geom_point) + p2 = p1 + geom_beeswarm(priority = "density" + #, shape = 21 + , size = dot_size + , alpha = dot_transparency + , show.legend = FALSE + , cex = 0.8 + , aes(colour = factor(eval(parse(text = colour_categ))) )) + } + + # Add foramtting to graph + OutPlot = p2 + theme(axis.text.x = element_text(size = my_ats) + , axis.text.y = element_text(size = my_ats + , angle = 0 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = my_ats) + , axis.title.y = element_text(size = my_ats) + , plot.title = element_text(size = my_pts + , hjust = 0.5 + , colour = "black" + , face = "bold") + , strip.background = element_rect(fill = colour_bp_strip) + , strip.text.x = element_text(size = my_fls + , colour = "black") + , legend.title = element_text(color = "black" + , size = my_als) + , legend.text = element_text(size = my_ats) + , legend.direction = "vertical") + + + labs(title = p_title + , x = "" + , y = "") + + if (add_stats){ + my_comparisonsL <- list( stat_grp_comp ) + + OutPlot = OutPlot + stat_compare_means(comparisons = my_comparisonsL + , method = stat_method + , paired = my_paired + , label = stat_label[2]) + return(OutPlot) + } + + return(OutPlot) +} + +############################# +# Barplot NO stats: plotly +# violin +/- +# barplot +/ +# beeswarm + +# TODO: plot_ly() +############################# +lf_bp_plotly <- function(lf_df + , p_title = "" + , colour_categ = "" + , x_grp = mutation_info + , y_var = param_value + , facet_var = param_type + , n_facet_row = 1 + , y_scales = "free_y" + , colour_bp_strip = "khaki2" + , dot_size = 3 + , dot_transparency = 0.3 + , violin_quantiles = c(0.25, 0.5, 0.75) # can be NULL + , my_ats = 20 # axis text size + , my_als = 18 # axis label size + , my_fls = 18 # facet label size + , my_pts = 22 # plot title size) + #, make_boxplot = FALSE + , bp_width = c("auto", 0.5) + #, add_stats = FALSE + #, stat_grp_comp = c("DM", "OM") + #, stat_method = "wilcox.test" + #, my_paired = FALSE + #, stat_label = c("p.format", "p.signif") + ){ + + OutPlotly = ggplot(lf_df, aes(x = eval(parse(text = x_grp)) + , y = eval(parse(text = y_var)) + , label1 = x_grp + , label2 = y_var + , lable3 = colour_categ) ) + + + facet_wrap(~ eval(parse(text = facet_var)) + , nrow = n_facet_row + , scales = y_scales) + + + geom_violin(trim = T + , scale = "width" + , draw_quantiles = violin_quantiles) + + + geom_beeswarm(priority = "density" + , size = dot_size + , alpha = dot_transparency + , show.legend = FALSE + , cex = 0.8 + , aes(colour = factor(eval(parse(text = colour_categ) ) ) ) ) + + theme(axis.text.x = element_text(size = my_ats) + , axis.text.y = element_text(size = my_ats + , angle = 0 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = my_ats) + , axis.title.y = element_text(size = my_ats) + , plot.title = element_text(size = my_pts + , hjust = 0.5 + , colour = "black" + , face = "bold") + , strip.background = element_rect(fill = colour_bp_strip) + , strip.text.x = element_text(size = my_fls + , colour = "black") + , legend.title = element_text(color = "black" + , size = my_als) + , legend.text = element_text(size = my_ats) + , legend.position = "none")+ + + labs(title = p_title + , x = "" + , y = "") + + OutPlotly = ggplotly(OutPlotly + #, tooltip = c("label") + ) + return(OutPlotly) + +} diff --git a/scripts/functions/lf_unpaired_stats.R b/scripts/functions/lf_unpaired_stats.R new file mode 100644 index 0000000..28a8ad0 --- /dev/null +++ b/scripts/functions/lf_unpaired_stats.R @@ -0,0 +1,21 @@ +library(ggpubr) +################################################################### + +lf_unpaired_stats <- function(lf_data + , lf_stat_value = "param_value" + , lf_stat_group = "mutation_info" + , lf_col_statvars = "param_type" + , my_paired = FALSE + , stat_adj = "none"){ + + stat_formula = as.formula(paste0(lf_stat_value, "~", lf_stat_group)) + + my_stat_df = compare_means(stat_formula + , group.by = lf_col_statvars + , data = lf_data + , paired = my_paired + , p.adjust.method = stat_adj) + + + return(my_stat_df) +} \ No newline at end of file diff --git a/scripts/functions/lineage_dist.R b/scripts/functions/lineage_dist.R new file mode 100644 index 0000000..aee1b62 --- /dev/null +++ b/scripts/functions/lineage_dist.R @@ -0,0 +1,69 @@ +############################### +# TASK: function to plot lineage +# dist plots with or without facet +# think about color palette +# for stability +############################## + +#n_colours = length(unique(lin_dist_plot$duet_scaled)) +#my_palette <- colorRampPalette(c(mcsm_red2, mcsm_red1, mcsm_mid, mcsm_blue1, mcsm_blue2))(n = n_colours+1) + + +lineage_distP <- function(plotdf + , x_axis = "duet_scaled" + , y_axis = "lineage_labels" + , x_lab = "DUET" + , with_facet = F + , facet_wrap_var = "" + , fill_categ = "mutation_info_labels" + , fill_categ_cols = c("#E69F00", "#999999") + , my_ats = 15 # axis text size + , my_als = 20 # axis label size + , my_leg_ts = 16 + , my_leg_title = 16 + , my_strip_ts = 20 + , leg_pos = c(0.8, 0.9) + , leg_pos_wf = c("top", "left", "bottom", "right") + , leg_dir_wf = c("horizontal", "vertical") + , leg_label = "") + +{ + +LinDistP = ggplot(plotdf, aes_string(x = x_axis + , y = y_axis))+ + + geom_density_ridges(aes_string(fill = fill_categ) + , scale = 3 + , size = 0.3 + , alpha = 0.8) + + scale_x_continuous(expand = c(0.01, 0.01)) + + #coord_cartesian( xlim = c(-1, 1)) + + scale_fill_manual(values = fill_categ_cols) + + theme(axis.text.x = element_text(size = my_ats + , angle = 90 + , hjust = 1 + , vjust = 0.4) + , axis.text.y = element_text(size = my_ats) + , axis.title.x = element_text(size = my_ats) + , axis.title.y = element_blank() + , strip.text = element_text(size = my_strip_ts) + , legend.text = element_text(size = my_leg_ts) + , legend.title = element_text(size = my_leg_title) + , legend.position = c(0.8, 0.9)) + + labs(x = x_lab + , fill = leg_label) + +if (with_facet){ + + # used reformulate or make as formula + #fwv = reformulate(facet_wrap_var) + fwv = as.formula(paste0("~", facet_wrap_var)) + + LinDistP = LinDistP + + facet_wrap(fwv) + + theme(legend.position = leg_pos_wf + , legend.direction = leg_dir_wf) +} + +return(LinDistP) +} diff --git a/scripts/functions/my_pairs_panel.R b/scripts/functions/my_pairs_panel.R index 0c73192..808417f 100644 --- a/scripts/functions/my_pairs_panel.R +++ b/scripts/functions/my_pairs_panel.R @@ -1,24 +1,40 @@ -my_corr_pairs <- function (corr_data){ +my_corr_pairs <- function (corr_data_all + , corr_cols = colnames(corr_data_all) + , corr_method = "spearman" # other options: "pearson" or "kendall" + , colour_categ_col = "mutation_info_labels" + , categ_colour = c("#E69F00", "#999999") + , density_show = F + , hist_col = "coral4" + , dot_size = 1.6 + , ats = 1.5 + , corr_lab_size = 3 + , corr_value_size = 1) + { - OutPlot_corr = pairs.panels(corr_data - , method = "spearman" # correlation method - , hist.col = "grey" ##00AFBB - , density = TRUE # show density plots - , ellipses = F # show correlation ellipses + corr_data_df = corr_data_all[corr_cols] + my_bg = categ_colour[corr_data_all[[colour_categ_col]] ] + + OutPlot_corr = pairs.panels(corr_data_df + , method = corr_method + , hist.col = hist_col + , density = density_show + , ellipses = F + , smooth = F , stars = T , rug = F , breaks = "Sturges" , show.points = T - #, bg = c("#f8766d", "#00bfc4")[unclass(factor(corr_ps$duet_outcome))] # foldx colours are reveresed - #, pch = 21 # for bg - , jitter = T + #, bg = c("#f8766d", "#00bfc4")[unclass(factor(corr_data$duet_outcome))] # foldx colours are reveresed + , bg = my_bg + , pch = 21 , alpha = 1 - , cex = 1.8 - , cex.axis = 2 - , cex.labels = 3.5 - , cex.cor = 1 - , smooth = F) + , cex = dot_size + , cex.axis = ats + , cex.labels = corr_lab_size + , cex.cor = corr_value_size + ) return(OutPlot_corr) + #return (my_bg) } diff --git a/scripts/functions/plotting_data.R b/scripts/functions/plotting_data.R index ddda207..faaebca 100755 --- a/scripts/functions/plotting_data.R +++ b/scripts/functions/plotting_data.R @@ -16,7 +16,9 @@ library(dplyr) ## my_df_u_lig ## dup_muts #======================================================== -plotting_data <- function(df, lig_dist_colname = 'ligand_distance', lig_dist_cutoff = 10) { +plotting_data <- function(df + , lig_dist_colname = 'ligand_distance' + , lig_dist_cutoff = 10) { my_df = data.frame() my_df_u = data.frame() my_df_u_lig = data.frame() @@ -29,61 +31,6 @@ dup_muts = data.frame() cat("\nInput dimensions:", dim(df)) -#================================== -# add foldx outcome category -# and foldx scaled values - -# This will enable to always have these variables available -# when calling for plots -#================================== - -#------------------------------ -# adding foldx scaled values -# scale data b/w -1 and 1 -#------------------------------ -n = which(colnames(df) == "ddg"); n - -my_min = min(df[,n]); my_min -my_max = max(df[,n]); my_max - -df$foldx_scaled = ifelse(df[,n] < 0 - , df[,n]/abs(my_min) - , df[,n]/my_max) -# sanity check -my_min = min(df$foldx_scaled); my_min -my_max = max(df$foldx_scaled); my_max - -if (my_min == -1 && my_max == 1){ - cat("\nPASS: foldx ddg successfully scaled b/w -1 and 1" - , "\nProceeding with assigning foldx outcome category") -}else{ - cat("\nFAIL: could not scale foldx ddg values" - , "Aborting!\n") -} - -#------------------------------ -# adding foldx outcome category -# ddg<0 = "Stabilising" (-ve) -#------------------------------ -c1 = table(df$ddg < 0) -df$foldx_outcome = ifelse(df$ddg < 0, "Stabilising", "Destabilising") -c2 = table(df$ddg < 0) - -if ( all(c1 == c2) ){ - cat("\nPASS: foldx outcome successfully created") -}else{ - cat("\nFAIL: foldx outcome could not be created. Aborting!\n") - exit() -} - -#------------------------------ -# renaming foldx column from -# "ddg" --> "ddg_foldx" -#------------------------------ - -# change name to foldx -colnames(df)[n] <- "ddg_foldx" - #================================== # extract unique mutation entries #================================== diff --git a/scripts/functions/plotting_globals.R b/scripts/functions/plotting_globals.R index cfd2848..c28047e 100644 --- a/scripts/functions/plotting_globals.R +++ b/scripts/functions/plotting_globals.R @@ -32,7 +32,8 @@ import_dirs <- function(drug_name, gene_name) { #=============================== # mcsm ligand distance cut off #=============================== -#mcsm_lig_cutoff <<- 10 +LigDist_colname <<- "ligand_distance" +LigDist_cutoff <<- 10 #================== # Angstroms symbol diff --git a/scripts/functions/position_count_bp.R b/scripts/functions/position_count_bp.R index ce0767c..2907b4b 100755 --- a/scripts/functions/position_count_bp.R +++ b/scripts/functions/position_count_bp.R @@ -42,7 +42,9 @@ site_snp_count_bp <- function (plotdf , "\nNo. of cols:", ncol(plotdf) , "\nNow adding column: frequency of mutational positions")) - # adding snpcount for each position + #------------------------------------------- + # adding column: snpcount for each position + #------------------------------------------- setDT(plotdf)[, pos_count := .N, by = .(eval(parse(text = df_colname)))] cat("\nCumulative nssnp count\n" @@ -64,15 +66,20 @@ site_snp_count_bp <- function (plotdf cat(paste0("\nrevised df dimensions:" , "\nNo. of rows:", nrow(plotdf) , "\nNo. of cols:", ncol(plotdf))) - + + #------------------------------------------------------ + # creating df: average count of snpcount for each position + # created in earlier step + #------------------------------------------------------- # use group by on pos_count snpsBYpos_df <- plotdf %>% - group_by(eval(parse(text = df_colname))) %>% - summarize(snpsBYpos = mean(pos_count)) - - cat("\nnssnp count\n" - , table(snpsBYpos_df$snpsBYpos)) + dplyr::group_by(eval(parse(text = df_colname))) %>% + dplyr::summarise(snpsBYpos = mean(pos_count)) # changed from summarize! + cat("\nnssnp count per position\n" + , table(snpsBYpos_df$snpsBYpos) + , "\n") + # calculating total no. of sites associated with nsSNPs tot_sites = sum(table(snpsBYpos_df$snpsBYpos)) diff --git a/scripts/functions/redundant/bp_subcolours_v2.R b/scripts/functions/redundant/bp_subcolours_v2.R new file mode 100644 index 0000000..a049ba2 --- /dev/null +++ b/scripts/functions/redundant/bp_subcolours_v2.R @@ -0,0 +1,104 @@ +######################################################### +# 1b: Define function: coloured barplot by subgroup +# LINK: https://stackoverflow.com/questions/49818271/stacked-barplot-with-colour-gradients-for-each-bar +######################################################### + +ColourPalleteMulti = function(df, group, subgroup){ + + # Find how many colour categories to create and the number of colours in each + categories <- aggregate(as.formula(paste(subgroup, group, sep="~" )) + , df + , function(x) length(unique(x))) + # return(categories) } + + category.start <- (scales::hue_pal(l = 100)(nrow(categories))) # Set the top of the colour pallete + + category.end <- (scales::hue_pal(l = 40)(nrow(categories))) # set the bottom + + #return(category.start); return(category.end)} + + # Build Colour pallette + colours <- unlist(lapply(1:nrow(categories), + function(i){ + colorRampPalette(colors = c(category.start[i] + , category.end[i]))(categories[i,2])})) + return(colours) +} +######################################################################### + +bp_stability_hmap <- function(plotdf = merged_df3 + , xvar_colname = "position" + #, bar_col_colname = "group" + , stability_colname = "duet_scaled" + , stability_outcome_colname = "duet_outcome" + , p_title = "" # "Protein stability (DUET)" + , my_xaxls = 12 # x-axis label size + , my_yaxls = 20 # y-axis label size + , my_xaxts = 18 # x-axis text size + , my_yaxts = 20 # y-axis text size + , my_pts = 20 # plot-title size + , my_xlab = "Position" + , my_ylab = "No. of nsSNPs" + ) +{ + + # order the df by position and ensure it is a factor + plotdf = plotdf[order(plotdf[[xvar_colname]]), ] + plotdf[[xvar_colname]] = factor(plotdf[[xvar_colname]]) + + #cat("\nSneak peak:\n") + head(data.frame( plotdf[[xvar_colname]], plotdf[[stability_colname]] ) ) + + # stability values isolated to help with generating column called: 'group' + my_grp = plotdf[[stability_colname]] + cat( "\nLength of nsSNPs:", length(my_grp) + , "\nLength of unique values for nsSNPs:", length(unique(my_grp)) ) + + # Add col: 'group' + plotdf$group = paste0(plotdf[[stability_outcome_colname]], "_", my_grp, sep = "") + + # check unique values in normalised data + cat("\nNo. of unique values in", stability_colname, "no rounding:" + , length(unique(plotdf[[stability_colname]]))) + + # Call the function to create the palette based on the group defined above + #subcols_ps + subcols_bp_hmap = ColourPalleteMulti(plotdf, stability_outcome_colname, stability_colname) + + cat("\nNo. of sub colours generated:", length(subcols_bp_hmap)) + + #------------------------------- + # Generate the subcols barplot + #------------------------------- + + #g = ggplot(plotdf, aes(x = factor(position, ordered = T))) + g = ggplot(plotdf, aes_string(x = xvar_colname + # , ordered = T) + )) + + + OutWidePlot = g + geom_bar(aes(fill = group) + , colour = "grey") + + + scale_fill_manual( values = subcols_bp_hmap + , guide = "none") + + + theme( axis.text.x = element_text(size = my_xaxls + , angle = 90 + , hjust = 1 + , vjust = 0.4) + , axis.text.y = element_text(size = my_yaxls + , angle = 0 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = my_xaxts) + , axis.title.y = element_text(size = my_yaxts ) + , plot.title = element_text(size = my_pts + , hjust = 0.5)) + + + labs(title = p_title + , x = my_xlab + , y = my_ylab) + + return(OutWidePlot) +} \ No newline at end of file diff --git a/scripts/functions/redundant/lf_bp_with_stats.R b/scripts/functions/redundant/lf_bp_with_stats.R new file mode 100644 index 0000000..22533d5 --- /dev/null +++ b/scripts/functions/redundant/lf_bp_with_stats.R @@ -0,0 +1,97 @@ +library(ggpubr) +################################################################### + +#################################### +lf_bp_with_stats <- function(lf_df + , x_grp = "mutation_info" + , y_var = "param_value" + , facet_var = "param_type" + , n_facet_row = 1 + , y_scales = "free_y" + , p_title = "" + , colour_categ = "" + , colour_bp_strip = "khaki2" + , stat_grp_comp = c("DM", "OM") + , stat_method = "wilcox.test" + , my_paired = FALSE + , bp_width = c("auto", 0.5) + , dot_size = 3 + , dot_transparency = 0.3 + , stat_label = c("p.format", "p.signif") + , my_ats = 22 # axis text size + , my_als = 20 # axis label size + , my_fls = 20 # facet label size + , my_pts = 22 # plot title size +) { + if (bp_width == "auto"){ + bp_width = 0.5/length(unique(lf_df[[x_grp]])) + cat("\nAutomatically calculated boxplot width, using bp_width:\n", bp_width, "\n") + }else{ + cat("\nBoxplot width value provided, using:", bp_width, "\n") + bp_width = bp_width + } + + my_comparisonsL <- list( stat_grp_comp ) + + bp_statP <- ggplot(lf_df, aes(x = eval(parse(text = x_grp)) + , y = eval(parse(text = y_var)) )) + + + facet_wrap(~ eval(parse(text = facet_var)) + , nrow = n_facet_row + , scales = y_scales) + + + geom_violin(trim = T + , scale = "width" + #, position = position_dodge(width = 0.9) + , draw_quantiles = c(0.25, 0.5, 0.75)) + + + # geom_boxplot(fill = "white" + # , outlier.colour = NA + # #, position = position_dodge(width = 0.9) + # , width = bp_width) + + + # geom_point(position = position_jitterdodge(dodge.width = 0.5) + # , alpha = 0.5 + # , show.legend = FALSE + # , aes(colour = factor(eval(parse(text = colour_categ))) )) + + + # ggbeeswarm (better than geom_point) + geom_beeswarm(priority = "density" + #, shape = 21 + , size = dot_size + , alpha = dot_transparency + , show.legend = FALSE + , cex = 0.8 + , aes(colour = factor(eval(parse(text = colour_categ))) )) + + + theme(axis.text.x = element_text(size = my_ats) + , axis.text.y = element_text(size = my_ats + , angle = 0 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = my_ats) + , axis.title.y = element_text(size = my_ats) + , plot.title = element_text(size = my_pts + , hjust = 0.5 + , colour = "black" + , face = "bold") + , strip.background = element_rect(fill = colour_bp_strip) + , strip.text.x = element_text(size = my_fls + , colour = "black") + , legend.title = element_text(color = "black" + , size = my_als) + , legend.text = element_text(size = my_ats) + , legend.direction = "vertical") + + + labs(title = p_title + , x = "" + , y = "")+ + + stat_compare_means(comparisons = my_comparisonsL + , method = stat_method + , paired = my_paired + , label = stat_label[1]) + + return(bp_statP) + +} diff --git a/scripts/functions/redundant/test_lf_bp_with_stats.R b/scripts/functions/redundant/test_lf_bp_with_stats.R new file mode 100644 index 0000000..51654a1 --- /dev/null +++ b/scripts/functions/redundant/test_lf_bp_with_stats.R @@ -0,0 +1,83 @@ +setwd("~/git/LSHTM_analysis/scripts/plotting/") + +source("../functions/lf_bp_with_stats.R") +source("../functions/lf_bp.R") + +###################### +# Make plot +###################### +# Note: Data +# run other_plots_data.R +# to get the long format data to test this function + +lf_bp(lf_df = lf_dynamut2 + , p_title = "Dynamut2" + , colour_categ = "ddg_dynamut2_outcome" + , x_grp = "mutation_info" + , y_var = "param_value" + , facet_var = "param_type" + , n_facet_row = 1 + , y_scales = "free_y" + , colour_bp_strip = "khaki2" + , dot_size = 3 + , dot_transparency = 0.3 + , violin_quantiles = c(0.25, 0.5, 0.75) + , my_ats = 22 # axis text size + , my_als = 20 # axis label size + , my_fls = 20 # facet label size + , my_pts = 22 # plot title size + , make_boxplot = F + , bp_width = "auto" + , add_stats = T + , stat_grp_comp = c("DM", "OM") + , stat_method = "wilcox.test" + , my_paired = FALSE + , stat_label = c("p.format", "p.signif") ) + +# foo = lf_dynamut2 %>% +# group_by(mutation_info, param_type) %>% +# summarise( Mean = mean(param_value, na.rm = T) +# , SD = sd(param_value, na.rm = T) +# , Median = median(param_value, na.rm = T) +# , IQR = IQR(param_value, na.rm = T) ) + +# Quick tests +plotdata_sel = subset(lf_dynamut2 + , lf_dynamut2$param_type == "ASA") + +plot_sum = plotdata_sel %>% + group_by(mutation_info, param_type) %>% + summarise(n = n() + , Mean = mean(param_value, na.rm = T) + , SD = sd(param_value, na.rm = T) + , Min = min(param_value, na.rm = T) + , Q1 = quantile(param_value, na.rm = T, 0.25) + , Median = median(param_value, na.rm = T) + , Q3 = quantile(param_value, na.rm = T, 0.75) + , Max = max(param_value, na.rm = T) ) %>% + rename('Mutation Class' = mutation_info + , Parameter = param_type) +plot_sum = as.data.frame(plot_sum, row.names = NULL) +plot_sum + +bar = compare_means(param_value ~ mutation_info + , group.by = "param_type" + , data = plotdata_sel + , paired = FALSE + , p.adjust.method = "BH") +bar2 = bar[c("param_type" + , "group1" + , "group2" + , "p.format" + , "p.signif" + , "p.adj")] %>% + rename(Parameter = param_type + , Group1 = group1 + , Group2 = group2 + , "P-value" = p.format + , "P-sig" = p.signif + , "P-adj" = p.adj) +bar2 = data.frame(bar2); bar2 + +library(Hmisc) +describe(lf_dynamut2) diff --git a/scripts/functions/stability_count_bp.R b/scripts/functions/stability_count_bp.R index 8f7d9ed..e5ed684 100644 --- a/scripts/functions/stability_count_bp.R +++ b/scripts/functions/stability_count_bp.R @@ -15,39 +15,42 @@ theme_set(theme_grey()) ## ...opt args #========================================================== stability_count_bp <- function(plotdf - , df_colname - , leg_title = "Legend title" - , axis_text_size = 25 - , axis_label_size = 22 - , leg_text_size = 20 - , leg_title_size = 22 + , df_colname = "" + , leg_title = "Legend Title" + , ats = 25 # axis text size + , als = 22 # axis label size + , lts = 20 # legend text size + , ltis = 22 # label title size + , geom_ls = 10 # geom_label size , yaxis_title = "Number of nsSNPs" , bp_plot_title = "" , label_categories = c("Destabilising", "Stabilising") , title_colour = "chocolate4" , subtitle_text = NULL - , subtitle_size = 20 + , sts = 20 , subtitle_colour = "pink" #, leg_position = c(0.73,0.8) # within plot area , leg_position = "top"){ - OutPlot_count = ggplot(plotdf, aes(x = eval(parse(text = df_colname)))) + +# OutPlot_count = ggplot(plotdf, aes(x = eval(parse(text = df_colname)))) + + OutPlot_count = ggplot(plotdf, aes_string(x = df_colname)) + geom_bar(aes(fill = eval(parse(text = df_colname))), show.legend = TRUE) + geom_label(stat = "count" , aes(label = ..count..) , color = "black" , show.legend = FALSE - , size = 10) + + , size = geom_ls) + theme(axis.text.x = element_blank() , axis.title.x = element_blank() - , axis.title.y = element_text(size = axis_label_size) - , axis.text.y = element_text(size = axis_text_size) + , axis.title.y = element_text(size = als) + , axis.text.y = element_text(size = ats) , legend.position = leg_position - , legend.text = element_text(size = leg_text_size) - , legend.title = element_text(size = leg_title_size) - , plot.title = element_text(size = axis_label_size - , colour = title_colour) - , plot.subtitle = element_text(size = subtitle_size + , legend.text = element_text(size = lts) + , legend.title = element_text(size = ltis) + , plot.title = element_text(size = als + , colour = title_colour + , hjust = 0.5) + , plot.subtitle = element_text(size = sts , hjust = 0.5 , colour = subtitle_colour)) + labs(title = bp_plot_title diff --git a/scripts/functions/test_aa_prop_bp.R b/scripts/functions/tests/test_aa_prop_bp.R similarity index 100% rename from scripts/functions/test_aa_prop_bp.R rename to scripts/functions/tests/test_aa_prop_bp.R diff --git a/scripts/functions/test_af_or_calcs.R b/scripts/functions/tests/test_af_or_calcs.R similarity index 100% rename from scripts/functions/test_af_or_calcs.R rename to scripts/functions/tests/test_af_or_calcs.R diff --git a/scripts/functions/test_bp.R b/scripts/functions/tests/test_bp.R similarity index 100% rename from scripts/functions/test_bp.R rename to scripts/functions/tests/test_bp.R diff --git a/scripts/functions/tests/test_bp_lineage.R b/scripts/functions/tests/test_bp_lineage.R new file mode 100644 index 0000000..876f237 --- /dev/null +++ b/scripts/functions/tests/test_bp_lineage.R @@ -0,0 +1,62 @@ +setwd("~/git/LSHTM_analysis/scripts/plotting") + +source ('get_plotting_dfs.R') +source("../functions/bp_lineage.R") + +######################################### +# Lineage and SNP count: lineage lf data +######################################### +#========================= +# Data: All lineages or +# selected few +#========================= +sel_lineages = levels(lin_lf$sel_lineages_f) +sel_lineages +lin_lf_plot = lin_lf[lin_lf$sel_lineages_f%in%sel_lineages,] + +# drop unused factor levels +lin_lf_plot$sel_lineages_f = factor(lin_lf_plot$sel_lineages_f) +levels(lin_lf_plot$sel_lineages_f) +#========================= +# Lineage count plot +#========================= +lin_count_bp(lin_lf_plot + , x_categ = "sel_lineages_f" + , y_count = "p_count" + , bar_fill_categ = "count_categ" + , display_label_col = "p_count" + , bar_stat_stype = "identity" + , x_lab_angle = 90 + , my_xats = 20 + , bar_col_labels = c("Mutations", "Total Samples") + , bar_col_values = c("grey50", "gray75") + , y_scale_percent = F # T for diversity + , y_log10 = F + , y_label = "Count") + +############################################### +# Lineage SNP diversity count: lineage wf data +############################################### +#========================= +# Data: All lineages or +# selected few +#========================= +sel_lineages = levels(lin_wf$sel_lineages_f) +sel_lineages +lin_wf_plot = lin_wf[lin_wf$sel_lineages_f%in%sel_lineages,] + +# drop unused factor levels +lin_wf_plot$sel_lineages_f = factor(lin_wf_plot$sel_lineages_f) +levels(lin_wf_plot$sel_lineages_f) +#========================= +# Lineage Diversity plot +#========================= +lin_count_bp(lin_wf_plot + , x_categ = "sel_lineages_f" + , y_count = "snp_diversity" + , display_label_col = "snp_diversity_f" + , bar_stat_stype = "identity" + , x_lab_angle = 90 + , my_xats = 20 + , y_scale_percent = T + , y_label = "SNP diversity") diff --git a/scripts/functions/tests/test_bp_subcolours.R b/scripts/functions/tests/test_bp_subcolours.R new file mode 100644 index 0000000..8866ffe --- /dev/null +++ b/scripts/functions/tests/test_bp_subcolours.R @@ -0,0 +1,78 @@ +#!/usr/bin/env Rscript +#source("~/git/Misc/shiny/myshiny/gid_data.R") + +source("~/git/LSHTM_analysis/config/gid.R") +source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") +source("~/git/LSHTM_analysis/scripts/functions/bp_subcolours.R") + +# p1 +bp_stability_hmap(plotdf = merged_df3 + , stability_colname = "duet_scaled" + , stability_outcome_colname = "duet_outcome" + , p_title = "DUET" ) + +# p2 +bp_stability_hmap(plotdf = merged_df3 + , stability_colname = "foldx_scaled" + , stability_outcome_colname = "foldx_outcome" + , p_title = "FoldX" ) + +# p3 +bp_stability_hmap(plotdf = merged_df3 + , stability_colname = "deepddg_scaled" + , stability_outcome_colname = "deepddg_outcome" + , p_title = "DeepDDG" ) + +# p4 +bp_stability_hmap(plotdf = merged_df3 + , stability_colname = "ddg_dynamut2_scaled" + , stability_outcome_colname = "ddg_dynamut2_outcome" + , p_title = "Dynamut2" ) + +# p5 +bp_stability_hmap(plotdf = merged_df3 + , stability_colname = "mcsm_na_scaled" + , stability_outcome_colname = "mcsm_na_outcome" + , p_title = "mCSM-NA" ) + +# p6 +bp_stability_hmap(plotdf = merged_df3 + , stability_colname = "ddg_dynamut_scaled" + , stability_outcome_colname = "ddg_dynamut_outcome" + , p_title = "Dynamut" ) + +# p7 +bp_stability_hmap(plotdf = merged_df3 + , stability_colname = "ddg_mcsm_scaled" + , stability_outcome_colname = "ddg_mcsm_outcome" + , p_title = "mCSM" ) + +# p8 +bp_stability_hmap(plotdf = merged_df3 + , stability_colname = "ddg_duet_scaled" + , stability_outcome_colname = "ddg_duet_outcome" + , p_title = "DUET-d" ) + +# p9 +bp_stability_hmap(plotdf = merged_df3 + , stability_colname = "ddg_sdm_scaled" + , stability_outcome_colname = "ddg_sdm_outcome" + , p_title = "SDM" ) + +# p10 +bp_stability_hmap(plotdf = merged_df3 + , stability_colname = "ddg_encom_scaled" + , stability_outcome_colname = "ddg_encom_outcome" + , p_title = "ENCoM-Stability" ) + +# p11 +bp_stability_hmap(plotdf = merged_df3 + , stability_colname = "dds_encom_scaled" + , stability_outcome_colname = "dds_encom_outcome" + , p_title = "ENCoM-Flexibility" ) + +# p12 +bp_stability_hmap(plotdf = merged_df3 + , stability_colname = "affinity_scaled" + , stability_outcome_colname = "ligand_outcome" + , p_title = "mCSM-lig" ) diff --git a/scripts/functions/tests/test_bp_subcolours_i.R b/scripts/functions/tests/test_bp_subcolours_i.R new file mode 100644 index 0000000..d8c1b42 --- /dev/null +++ b/scripts/functions/tests/test_bp_subcolours_i.R @@ -0,0 +1,59 @@ +#!/usr/bin/env Rscript +#source("~/git/Misc/shiny/myshiny/gid_data.R") + +source("~/git/LSHTM_analysis/config/gid.R") +source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") +source("~/git/LSHTM_analysis/scripts/functions/bp_subcolours.R") + +# p1 +bp_stability_hmap(plotdf = merged_df3 + , stability_colname = "duet_scaled" + , stability_outcome_colname = "duet_outcome" + , p_title = "DUET" ) + +# p2 +bp_stability_hmap(plotdf = merged_df3 + , stability_colname = "foldx_scaled" + , stability_outcome_colname = "foldx_outcome" + , p_title = "FoldX" ) + +# p3 +bp_stability_hmap(plotdf = merged_df3 + , stability_colname = "deepddg_scaled" + , stability_outcome_colname = "deepddg_outcome" + , p_title = "DeepDDG" ) + +################################################## + +merged_df3_f = merged_df3 + +setDT(merged_df3_f)[, pos_count := .N, by = position] + +################################################## +ui <- basicPage( + plotOutput("plot1", click = "plot_click"), + verbatimTextOutput("info") +) + +server <- function(input, output) { + output$plot1 <- renderPlot({ + + #plot(mtcars$wt, mtcars$mpg) + bp_stability_hmap(plotdf = merged_df3_f + , xvar_colname = "position" + , stability_colname = "foldx_scaled" + , stability_outcome_colname = "foldx_outcome" + , p_title = "FoldX" ) + + }) + + output$info <- renderPrint({ + # With base graphics, need to tell it what the x and y variables are. + nearPoints(merged_df3_f, input$plot_click + , xvar = "position" + , yvar = "pos_count" + ) + }) +} + +shinyApp(ui, server) \ No newline at end of file diff --git a/scripts/functions/test_combining_dfs_plotting.R b/scripts/functions/tests/test_combining_dfs_plotting.R similarity index 100% rename from scripts/functions/test_combining_dfs_plotting.R rename to scripts/functions/tests/test_combining_dfs_plotting.R diff --git a/scripts/functions/tests/test_lf_bp.R b/scripts/functions/tests/test_lf_bp.R new file mode 100644 index 0000000..f3d2327 --- /dev/null +++ b/scripts/functions/tests/test_lf_bp.R @@ -0,0 +1,58 @@ +setwd("~/git/LSHTM_analysis/scripts/plotting/") +source("Header_TT.R") +source("../functions/lf_bp.R") +# ================================================ +# Data: run get_plotting_data.R +# to get the long format data to test this function +# drug = "streptomycin" +# gene = "gid" +# source("get_plotting_dfs.R") +# ================================================== + +###################### +# Make plot: ggplot +###################### +lf_bp(lf_df = lf_encomddg + , p_title = "ENCoM-DDG" + , colour_categ = "ddg_encom_outcome" + , x_grp = "mutation_info" + , y_var = "param_value" + , facet_var = "param_type" + , n_facet_row = 1 + , y_scales = "free_y" + , colour_bp_strip = "khaki2" + , dot_size = 3 + , dot_transparency = 0.3 + , violin_quantiles = c(0.25, 0.5, 0.75) + , my_ats = 22 # axis text size + , my_als = 20 # axis label size + , my_fls = 20 # facet label size + , my_pts = 22 # plot title size + , make_boxplot = F + , bp_width = "auto" + , add_stats = T + , stat_grp_comp = c("DM", "OM") + , stat_method = "wilcox.test" + , my_paired = FALSE + , stat_label = c("p.format", "p.signif") ) + +#wilcox.test(wf_encomdds$`EnCOM ΔΔS`[wf_encomdds$mutation_info == "DM"] +# , wf_encomdds$`EnCOM ΔΔS`[wf_encomdds$mutation_info == "OM"]) + +###################### +# Make plot: plotly +###################### +# FIXME: This labels are not working as I want! +# lf_bp_plotly(lf_df = lf_deepddg +# , p_title = "DeepDDG" +# , colour_categ = "deepddg_outcome" +# , x_grp = "mutation_info" +# , y_var = "param_value" +# , facet_var = "param_type" +# , n_facet_row = 1 +# , y_scales = "free_y" +# , colour_bp_strip = "khaki2" +# , dot_size = 3 +# , dot_transparency = 0.3 +# , violin_quantiles = c(0.25, 0.5, 0.75) +# ) diff --git a/scripts/functions/tests/test_lf_unpaired_stats.R b/scripts/functions/tests/test_lf_unpaired_stats.R new file mode 100644 index 0000000..a1ff9d1 --- /dev/null +++ b/scripts/functions/tests/test_lf_unpaired_stats.R @@ -0,0 +1,19 @@ +setwd("~/git/LSHTM_analysis/scripts/functions") +source("lf_unpaired_stats.R") + +##################### +# call stat function() +# a useful way to check stats +# for any lf data +##################### +# Note: Data +# run other_plots_data.R +# to get the long format data to test this function + +stat_results_df <- lf_unpaired_stats(lf_data = lf_duet + , lf_stat_value = "param_value" + , lf_stat_group = "mutation_info" + , lf_col_statvars = "param_type" + , my_paired = FALSE + , stat_adj = "none" +) \ No newline at end of file diff --git a/scripts/functions/tests/test_lineage_dist.R b/scripts/functions/tests/test_lineage_dist.R new file mode 100644 index 0000000..eeeebe5 --- /dev/null +++ b/scripts/functions/tests/test_lineage_dist.R @@ -0,0 +1,33 @@ +############################### +# TEST function lineage_dist.R +# to plot lineage +# dist plots with or without facet +############################## +getwd() +setwd("~/git/LSHTM_analysis/scripts/plotting/") +getwd() + +source("Header_TT.R") + +source("get_plotting_dfs.R") + +cat("cols imported:" + , mcsm_red2, mcsm_red1, mcsm_mid, mcsm_blue1, mcsm_blue2) + + +############################################################# +# without facet +lineage_distP(lin_dist_plot + , with_facet = F + , leg_label = "Mutation Class" +) + +# without facet +lineage_distP(lin_dist_plot + , with_facet = T + , facet_wrap_var = "mutation_info_labels" + , leg_label = "Mutation Class" + , leg_pos_wf = "none" + , leg_dir_wf = "horizontal" + +) \ No newline at end of file diff --git a/scripts/functions/test_plotting_data.R b/scripts/functions/tests/test_plotting_data.R similarity index 100% rename from scripts/functions/test_plotting_data.R rename to scripts/functions/tests/test_plotting_data.R diff --git a/scripts/plotting/Header_TT.R b/scripts/plotting/Header_TT.R index 199031b..2fa892c 100755 --- a/scripts/plotting/Header_TT.R +++ b/scripts/plotting/Header_TT.R @@ -1,14 +1,11 @@ ######################################################### -### A) Installing and loading required packages +# A) Installing and loading required packages +# B) My functions +######################################################### + ######################################################### #lib_loc = "/usr/local/lib/R/site-library") -#if (!require("gplots")) { -# install.packages("gplots", dependencies = TRUE) -# library(gplots) -#} -require(extrafont) - require("getopt", quietly = TRUE) # cmd parse arguments if (!require("tidyverse")) { @@ -16,9 +13,53 @@ if (!require("tidyverse")) { library(tidyverse) } -if (!require("ggplot2")) { - install.packages("ggplot2", dependencies = TRUE) - library(ggplot2) +if (!require("shiny")) { + install.packages("shiny", dependencies = TRUE) + library(shiny) +} + +if (!require("shinyBS")) { + install.packages("shinyBS", dependencies = TRUE) + library(shinyBS) +} + +if (!require("gridExtra")) { + install.packages("gridExtra", dependencies = TRUE) + library(gridExtra) +} + +if (!require("ggridges")) { + install.packages("ggridges", dependencies = TRUE) + library(ggridges) +} + +# if (!require("ggplot2")) { +# install.packages("ggplot2", dependencies = TRUE) +# library(ggplot2) +# } + +# if (!require ("dplyr")){ +# install.packages("dplyr") +# library(dplyr) +# } + +if (!require ("DT")){ + install.packages("DT") + library(DT) +} + +if (!require ("plyr")){ + install.packages("plyr") + library(plyr) + } + +# Install +#if(!require(devtools)) install.packages("devtools") +#devtools::install_github("kassambara/ggcorrplot") + +if (!require ("ggbeeswarm")){ + install.packages("ggbeeswarm") + library(ggbeeswarm) } if (!require("plotly")) { @@ -101,11 +142,6 @@ if (!require ("psych")){ library(psych) } -if (!require ("dplyr")){ - install.packages("dplyr") - library(dplyr) -} - if (!require ("compare")){ install.packages("compare") library(compare) @@ -116,6 +152,22 @@ if (!require ("arsenal")){ library(arsenal) } +if(!require(ggseqlogo)){ + install.packages("ggseqlogo") + library(ggseqlogo) +} + +# for PDB files +if(!require(bio3d)){ + install.packages("bio3d") + library(bio3d) +} + +library(protr) +if(!require(protr)){ + install.packages("protr") + library(protr) +} #if (!requireNamespace("BiocManager", quietly = TRUE)) # install.packages("BiocManager") @@ -123,24 +175,14 @@ if (!require ("arsenal")){ #BiocManager::install("Logolas") library("Logolas") -#install.packages("ggseqlogo") -library(ggseqlogo) +#################################### +# Load all my functions: +# only works if tidyverse is loaded +# hence included it here! +#################################### -####TIDYVERSE -# Install -#if(!require(devtools)) install.packages("devtools") -#devtools::install_github("kassambara/ggcorrplot") +func_path = "~/git/LSHTM_analysis/scripts/functions/" +source_files <- list.files(func_path, "\\.R$") # locate all .R files +map(paste0(func_path, source_files), source) # source all your R scripts! -library(ggcorrplot) - - -###for PDB files -#install.packages("bio3d") -if(!require(bio3d)){ - install.packages("bio3d") - library(bio3d) -} - -#install.packages("protr") -library(protr) diff --git a/scripts/plotting/barplots_subcolours.R b/scripts/plotting/barplots_subcolours.R index 4e4806a..1f98bec 100755 --- a/scripts/plotting/barplots_subcolours.R +++ b/scripts/plotting/barplots_subcolours.R @@ -124,4 +124,4 @@ print(outPlot_bp_lig) dev.off() ######################################################################= # End of script -######################################################################= \ No newline at end of file +######################################################################= diff --git a/scripts/plotting/basic_barplots_combined.R b/scripts/plotting/basic_barplots_combined.R old mode 100644 new mode 100755 index 7fee2d7..2643b0d --- a/scripts/plotting/basic_barplots_combined.R +++ b/scripts/plotting/basic_barplots_combined.R @@ -23,7 +23,7 @@ plot_basic_bp_combined_labelled = paste0(plotdir,"/", basic_bp_combined_labell #======================================================================= #======= -# combin DUET and Ligand affinity plots +# combine DUET and Ligand affinity plots #======= svg(plot_basic_bp_combined_labelled , width = 12, height = 12 ) diff --git a/scripts/plotting/corr_adjusted_PS_LIG.R b/scripts/plotting/corr_adjusted_PS_LIG.R old mode 100644 new mode 100755 diff --git a/scripts/plotting/corr_data.R b/scripts/plotting/corr_data.R new file mode 100644 index 0000000..3120763 --- /dev/null +++ b/scripts/plotting/corr_data.R @@ -0,0 +1,120 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: Script to format data for corr plots +######################################################### + +#================================================= +# Data for Corrplots +#================================================= +cat("\n==========================================" + , "\nCORR PLOTS data: ALL params" + , "\n=========================================") + +# use data +#merged_df2 + +#---------------------------- +# columns for corr plots:PS +#---------------------------- +# NOTE: you can add mcsm_ppi column as well, and it will only select what it can find! +big_df_colnames = data.frame(names(merged_df2)) + +corr_cols_select <- c("mutationinformation", drug, "mutation_info_labels" + , "duet_stability_change", "ligand_affinity_change", "ddg_foldx", "asa", "rsa" + , "rd_values", "kd_values", "log10_or_mychisq", "neglog_pval_fisher","af" + , "deepddg", "ddg_dynamut", "ddg_dynamut2", "mcsm_na_affinity" + , "ddg_encom", "dds_encom", "ddg_mcsm", "ddg_sdm", "ddg_duet", "ligand_distance") + +#=========================== +# Corr data for plots: PS +# big_df ps: ~ merged_df2 +#=========================== + +corr_df_m2 = merged_df2[,colnames(merged_df2)%in%corr_cols_select] + +#----------------------- +# formatting: some cols +# Add pretty colnames +#----------------------- +corr_df_m2_f <- corr_df_m2 %>% + rename( + DUET = duet_stability_change + , 'mCSM-lig' = ligand_affinity_change + , FoldX = ddg_foldx + , DeepDDG = deepddg + , ASA = asa + , RSA = rsa + , KD = kd_values + , RD = rd_values + , MAF = af + , 'Log (OR)' = log10_or_mychisq + , '-Log (P)' = neglog_pval_fisher + , Dynamut = ddg_dynamut + , 'ENCoM-DDG'= ddg_encom + , mCSM = ddg_mcsm + , SDM = ddg_sdm + , 'DUET-d' = ddg_duet + , 'ENCoM-DDS'= dds_encom + , Dynamut2 = ddg_dynamut2 + , 'mCSM-NA' = mcsm_na_affinity ) + +#=========================== +# Corr data for plots: PS +# short_df ps: ~merged_df3 +#=========================== + +corr_df_m3 = corr_df_m2[!duplicated(corr_df_m2$mutationinformation),] + +na_or = sum(is.na(corr_df_m3$log10_or_mychisq)) +check1 = nrow(corr_df_m3) - na_or; check1 + +if (nrow(corr_df_m3) == nrow(merged_df3) && nrow(merged_df3_comp) == check1) { + cat( "\nPASS: No. of rows for corr_df_m3 match" + , "\nPASS: No. of OR values checked: " , check1) +} else { + cat("\nFAIL: Numbers mismatch:" + , "\nExpected nrows: ", nrow(merged_df3) + , "\nGot: ", nrow(corr_df_m3) + , "\nExpected OR values: ", nrow(merged_df3_comp) + , "\nGot: ", check1) +} + +#----------------------- +# formatting: some cols +# Add pretty colnames +#----------------------- +corr_df_m3_f <- corr_df_m3 %>% + rename( + DUET = duet_stability_change + , 'mCSM-lig' = ligand_affinity_change + , FoldX = ddg_foldx + , DeepDDG = deepddg + , ASA = asa + , RSA = rsa + , KD = kd_values + , RD = rd_values + , MAF = af + , 'Log (OR)' = log10_or_mychisq + , '-Log (P)' = neglog_pval_fisher + , Dynamut = ddg_dynamut + , 'ENCoM-DDG'= ddg_encom + , mCSM = ddg_mcsm + , SDM = ddg_sdm + , 'DUET-d' = ddg_duet + , 'ENCoM-DDS'= dds_encom + , Dynamut2 = ddg_dynamut2 + , 'mCSM-NA' = mcsm_na_affinity ) + +######################################################################## +cat("\nCorr Data created:" +, "\n===================================" +, "\ncorr_df_m2: created from merged_df2" +, "\n===================================" +, "\nnrows:", nrow(corr_df_m2) +, "\nncols:", ncol(corr_df_m2) +, "\n===================================" +, "\ncorr_df_m3: created from merged_df3" +, "\n===================================" +, "\nnrows:", nrow(corr_df_m3) +, "\nncols:", ncol(corr_df_m3) +) diff --git a/scripts/plotting/dirs.R b/scripts/plotting/dirs.R old mode 100644 new mode 100755 diff --git a/scripts/plotting/dist_plots_check.R b/scripts/plotting/dist_plots_check.R old mode 100644 new mode 100755 diff --git a/scripts/plotting/dm_om_data.R b/scripts/plotting/dm_om_data.R new file mode 100644 index 0000000..4bd82e7 --- /dev/null +++ b/scripts/plotting/dm_om_data.R @@ -0,0 +1,416 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: Script to format data for dm om plots: +# generating LF data +# sourced by get_plotting_dfs.R +######################################################### +##======================================================================== +# cols to select: +# THINK: whu + +comb_df <- merged_df3[, c("mutationinformation", "mutation" + , "mutation_info","mutation_info_labels" + , "position" + , LigDist_colname + , "duet_stability_change", "duet_scaled", "duet_outcome" + , "ligand_affinity_change", "affinity_scaled", "ligand_outcome" + , "ddg_foldx", "foldx_scaled", "foldx_outcome" + , "deepddg", "deepddg_scaled", "deepddg_outcome" + , "asa", "rsa" + , "rd_values", "kd_values" + , "log10_or_mychisq", "neglog_pval_fisher", "af" + , "mcsm_na_affinity", "mcsm_na_scaled", "mcsm_na_outcome" + , "ddg_dynamut", "ddg_dynamut_scaled","ddg_dynamut_outcome" + , "ddg_encom", "ddg_encom_scaled", "ddg_encom_outcome" + , "dds_encom", "dds_encom_scaled", "dds_encom_outcome" + , "ddg_mcsm", "ddg_mcsm_scaled", "ddg_mcsm_outcome" + , "ddg_sdm", "ddg_sdm_scaled", "ddg_sdm_outcome" + , "ddg_duet", "ddg_duet_scaled", "ddg_duet_outcome" + , "ddg_dynamut2","ddg_dynamut2_scaled", "ddg_dynamut2_outcome")] + + +comb_df_s = arrange(comb_df, position) + +#======================================================================= +fact_cols = colnames(comb_df_s)[grepl( "_outcome|_info", colnames(comb_df_s) )] +fact_cols +lapply(comb_df_s[, fact_cols], class) +comb_df_s[, fact_cols] <- lapply(comb_df_s[, fact_cols], as.factor) + +if (any(lapply(comb_df_s[, fact_cols], class) == "character")){ + cat("\nChanging cols to factor") + comb_df_s[, fact_cols] <- lapply(comb_df_s[, fact_cols],as.factor) + if (all(lapply(comb_df_s[, fact_cols], class) == "factor")){ + cat("\nSuccessful: cols changed to factor") + } +} +lapply(comb_df_s[, fact_cols], class) + +#======================================================================= +table(comb_df_s$mutation_info) + + # further checks to make sure dr and other muts are indeed unique +dr_muts = comb_df_s[comb_df_s$mutation_info == dr_muts_col,] +dr_muts_names = unique(dr_muts$mutation) + +other_muts = comb_df_s[comb_df_s$mutation_info == other_muts_col,] +other_muts_names = unique(other_muts$mutation) + +if ( table(dr_muts_names%in%other_muts_names)[[1]] == length(dr_muts_names) && + table(other_muts_names%in%dr_muts_names)[[1]] == length(other_muts_names) ){ + cat("PASS: dr and other muts are indeed unique") +}else{ + cat("FAIL: dr and others muts are NOT unique!") + quit() +} + +# pretty display names i.e. labels to reduce major code duplication later +foo_cnames = data.frame(colnames(comb_df_s)) +names(foo_cnames) <- "old_name" + +stability_suffix <- paste0(delta_symbol, delta_symbol, "G") +flexibility_suffix <- paste0(delta_symbol, delta_symbol, "S") + +lig_dn = paste0("Ligand distance (", angstroms_symbol, ")"); lig_dn +duet_dn = paste0("DUET ", stability_suffix); duet_dn +foldx_dn = paste0("FoldX ", stability_suffix); foldx_dn +deepddg_dn = paste0("Deepddg " , stability_suffix); deepddg_dn +mcsm_na_dn = paste0("mCSM-NA affinity ", stability_suffix); mcsm_na_dn +dynamut_dn = paste0("Dynamut ", stability_suffix); dynamut_dn +dynamut2_dn = paste0("Dynamut2 " , stability_suffix); dynamut2_dn +encom_ddg_dn = paste0("EnCOM " , stability_suffix); encom_ddg_dn +encom_dds_dn = paste0("EnCOM " , flexibility_suffix ); encom_dds_dn +sdm_dn = paste0("SDM " , stability_suffix); sdm_dn +mcsm_dn = paste0("mCSM " , stability_suffix ); mcsm_dn + +# Change colnames of some columns using datatable +comb_df_sl = comb_df_s +names(comb_df_sl) + +setnames(comb_df_sl + , old = c("asa", "rsa", "rd_values", "kd_values" + , "log10_or_mychisq", "neglog_pval_fisher", "af" + , LigDist_colname + , "duet_scaled" + , "foldx_scaled" + , "deepddg_scaled" + , "mcsm_na_scaled" + , "ddg_dynamut_scaled" + , "ddg_dynamut2_scaled" + , "ddg_encom_scaled" + , "dds_encom_scaled" + , "ddg_sdm" + , "ddg_mcsm") + + , new = c("ASA", "RSA", "RD", "KD" + , "Log10 (OR)", "-Log (P)", "MAF" + , lig_dn + , duet_dn + , foldx_dn + , deepddg_dn + , mcsm_na_dn + , dynamut_dn + , dynamut2_dn + , encom_ddg_dn + , encom_dds_dn + , sdm_dn + , mcsm_dn) + ) + +foo_cnames <- cbind(foo_cnames, colnames(comb_df_sl)) + +# some more pretty labels +table(comb_df_sl$mutation_info) + +levels(comb_df_sl$mutation_info)[levels(comb_df_sl$mutation_info)==dr_muts_col] <- "DM" +levels(comb_df_sl$mutation_info)[levels(comb_df_sl$mutation_info)==other_muts_col] <- "OM" + +table(comb_df_sl$mutation_info) + +####################################################################### +#====================== +# Selecting dfs +# with appropriate cols +#======================= +static_cols_start = c("mutationinformation" + , "position" + , "mutation" + , "mutation_info") + +static_cols_end = c(lig_dn + , "ASA" + , "RSA" + , "RD" + , "KD") + +# ordering is important! + +######################################################################### +#============== +# DUET: LF +#============== +cols_to_select_duet = c(static_cols_start, c("duet_outcome", duet_dn), static_cols_end) +wf_duet = comb_df_sl[, cols_to_select_duet] + +#pivot_cols_ps = cols_to_select_ps[1:5]; pivot_cols_ps +pivot_cols_duet = cols_to_select_duet[1: (length(static_cols_start) + 1)]; pivot_cols_duet + +expected_rows_lf = nrow(wf_duet) * (length(wf_duet) - length(pivot_cols_duet)) +expected_rows_lf + +# LF data: duet +lf_duet = gather(wf_duet + , key = param_type + , value = param_value + , all_of(duet_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_duet) == expected_rows_lf){ + cat("\nPASS: long format data created for ", duet_dn) +}else{ + cat("\nFAIL: long format data could not be created for duet") + quit() +} + +############################################################################ +#============== +# FoldX: LF +#============== +cols_to_select_foldx= c(static_cols_start, c("foldx_outcome", foldx_dn), static_cols_end) +wf_foldx = comb_df_sl[, cols_to_select_foldx] + +pivot_cols_foldx = cols_to_select_foldx[1: (length(static_cols_start) + 1)]; pivot_cols_foldx + +expected_rows_lf = nrow(wf_foldx) * (length(wf_foldx) - length(pivot_cols_foldx)) +expected_rows_lf + +# LF data: Foldx +lf_foldx <<- gather(wf_foldx + , key = param_type + , value = param_value + , all_of(foldx_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_foldx) == expected_rows_lf){ + cat("\nPASS: long format data created for ", foldx_dn) +}else{ + cat("\nFAIL: long format data could not be created for duet") + quit() +} + +############################################################################ +#============== +# Deepddg: LF +#============== +cols_to_select_deepddg = c(static_cols_start, c("deepddg_outcome", deepddg_dn), static_cols_end) +wf_deepddg = comb_df_sl[, cols_to_select_deepddg] + +pivot_cols_deepddg = cols_to_select_deepddg[1: (length(static_cols_start) + 1)]; pivot_cols_deepddg + +expected_rows_lf = nrow(wf_deepddg) * (length(wf_deepddg) - length(pivot_cols_deepddg)) +expected_rows_lf + +# LF data: Deepddg +lf_deepddg = gather(wf_deepddg + , key = param_type + , value = param_value + , all_of(deepddg_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_deepddg) == expected_rows_lf){ + cat("\nPASS: long format data created for ", deepddg_dn) +}else{ + cat("\nFAIL: long format data could not be created for duet") + quit() +} + +############################################################################ +#============== +# mCSM-NA: LF +#============== +cols_to_select_mcsm_na = c(static_cols_start, c("mcsm_na_outcome", mcsm_na_dn), static_cols_end) +wf_mcsm_na = comb_df_sl[, cols_to_select_mcsm_na] + +pivot_cols_mcsm_na = cols_to_select_mcsm_na[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm_na + +expected_rows_lf = nrow(wf_mcsm_na) * (length(wf_mcsm_na) - length(pivot_cols_mcsm_na)) +expected_rows_lf + +# LF data: mcsm_na +lf_mcsm_na = gather(wf_mcsm_na + , key = param_type + , value = param_value + , all_of(mcsm_na_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_mcsm_na) == expected_rows_lf){ + cat("\nPASS: long format data created for ", mcsm_na_dn) +}else{ + cat("\nFAIL: long format data could not be created for duet") + quit() +} + +############################################################################ +#============== +# Dynamut: LF +#============== +cols_to_select_dynamut = c(static_cols_start, c("ddg_dynamut_outcome", dynamut_dn), static_cols_end) +wf_dynamut = comb_df_sl[, cols_to_select_dynamut] + +pivot_cols_dynamut = cols_to_select_dynamut[1: (length(static_cols_start) + 1)]; pivot_cols_dynamut + +expected_rows_lf = nrow(wf_dynamut) * (length(wf_dynamut) - length(pivot_cols_dynamut)) +expected_rows_lf + +# LF data: dynamut +lf_dynamut = gather(wf_dynamut + , key = param_type + , value = param_value + , all_of(dynamut_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_dynamut) == expected_rows_lf){ + cat("\nPASS: long format data created for ", dynamut_dn) +}else{ + cat("\nFAIL: long format data could not be created for duet") + quit() +} + +############################################################################ +#============== +# Dynamut2: LF +#============== +cols_to_select_dynamut2 = c(static_cols_start, c("ddg_dynamut2_outcome", dynamut2_dn), static_cols_end) + +wf_dynamut2 = comb_df_sl[, cols_to_select_dynamut2] + +pivot_cols_dynamut2 = cols_to_select_dynamut2[1: (length(static_cols_start) + 1)]; pivot_cols_dynamut2 + +expected_rows_lf = nrow(wf_dynamut2) * (length(wf_dynamut2) - length(pivot_cols_dynamut2)) +expected_rows_lf + +# LF data: dynamut2 +lf_dynamut2 = gather(wf_dynamut2 + , key = param_type + , value = param_value + , all_of(dynamut2_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_dynamut2) == expected_rows_lf){ + cat("\nPASS: long format data created for ", dynamut2_dn) +}else{ + cat("\nFAIL: long format data could not be created for duet") + quit() +} + +############################################################################ +#============== +# EnCOM ddg: LF +#============== +cols_to_select_encomddg = c(static_cols_start, c("ddg_encom_outcome", encom_ddg_dn), static_cols_end) +wf_encomddg = comb_df_sl[, cols_to_select_encomddg] + +pivot_cols_encomddg = cols_to_select_encomddg[1: (length(static_cols_start) + 1)]; pivot_cols_encomddg + +expected_rows_lf = nrow(wf_encomddg ) * (length(wf_encomddg ) - length(pivot_cols_encomddg)) +expected_rows_lf + +# LF data: encomddg +lf_encomddg = gather(wf_encomddg + , key = param_type + , value = param_value + , all_of(encom_ddg_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_encomddg) == expected_rows_lf){ + cat("\nPASS: long format data created for ", encom_ddg_dn) +}else{ + cat("\nFAIL: long format data could not be created for duet") + quit() +} +############################################################################ +#============== +# EnCOM dds: LF +#============== +cols_to_select_encomdds = c(static_cols_start, c("dds_encom_outcome", encom_dds_dn), static_cols_end) +wf_encomdds = comb_df_sl[, cols_to_select_encomdds] + +pivot_cols_encomdds = cols_to_select_encomdds[1: (length(static_cols_start) + 1)]; pivot_cols_encomdds + +expected_rows_lf = nrow(wf_encomdds) * (length(wf_encomdds) - length(pivot_cols_encomdds)) +expected_rows_lf + +# LF data: encomdds +lf_encomdds = gather(wf_encomdds + , key = param_type + , value = param_value + , all_of(encom_dds_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_encomdds) == expected_rows_lf){ + cat("\nPASS: long format data created for", encom_dds_dn) +}else{ + cat("\nFAIL: long format data could not be created for duet") + quit() +} + +############################################################################ +#============== +# SDM: LF +#============== +cols_to_select_sdm = c(static_cols_start, c("ddg_sdm_outcome", sdm_dn), static_cols_end) +wf_sdm = comb_df_sl[, cols_to_select_sdm] + +pivot_cols_sdm = cols_to_select_sdm[1: (length(static_cols_start) + 1)]; pivot_cols_sdm + +expected_rows_lf = nrow(wf_sdm) * (length(wf_sdm) - length(pivot_cols_sdm)) +expected_rows_lf + +# LF data: sdm +lf_sdm = gather(wf_sdm + , key = param_type + , value = param_value + , all_of(sdm_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_sdm) == expected_rows_lf){ + cat("\nPASS: long format data created for", sdm_dn) +}else{ + cat("\nFAIL: long format data could not be created for duet") + quit() +} + +############################################################################ +#============== +# mCSM: LF +#============== +cols_to_select_mcsm = c(static_cols_start, c("ddg_mcsm_outcome", mcsm_dn), static_cols_end) +wf_mcsm = comb_df_sl[, cols_to_select_mcsm] + +pivot_cols_mcsm = cols_to_select_mcsm[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm + +expected_rows_lf = nrow(wf_mcsm) * (length(wf_mcsm) - length(pivot_cols_mcsm)) +expected_rows_lf + +# LF data: mcsm +lf_mcsm = gather(wf_mcsm + , key = param_type + , value = param_value + , all_of(mcsm_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_mcsm) == expected_rows_lf){ + cat("\nPASS: long format data created for", mcsm_dn) +}else{ + cat("\nFAIL: long format data could not be created for duet") + quit() +} + +#========================== +# Duet-d(from Dynamut): LF +#=========================== + +#Not created, redundant and chaos! + +############################################################################ + diff --git a/scripts/plotting/extreme_muts.R b/scripts/plotting/extreme_muts.R old mode 100644 new mode 100755 diff --git a/scripts/plotting/get_plotting_dfs.R b/scripts/plotting/get_plotting_dfs.R old mode 100644 new mode 100755 index a9e78e9..d5d1535 --- a/scripts/plotting/get_plotting_dfs.R +++ b/scripts/plotting/get_plotting_dfs.R @@ -1,32 +1,27 @@ #!/usr/bin/env Rscript + ######################################################### # TASK: Get formatted data for plots -#======================================================================= +######################################################### # working dir and loading libraries getwd() setwd("~/git/LSHTM_analysis/scripts/plotting") getwd() source("Header_TT.R") -source("../functions/my_pairs_panel.R") # with lower panel turned off -source("../functions/plotting_globals.R") -source("../functions/plotting_data.R") -source("../functions/combining_dfs_plotting.R") -source("../functions/bp_subcolours.R") #******************** # cmd args passed # in from other scripts # to call this #******************** -#drug = 'streptomycin' -#gene = 'gid' + #==================== # variables for lig #==================== -LigDist_colname = "ligand_distance" -LigDist_cutoff = 10 +#LigDist_colname = "ligand_distance" +#LigDist_cutoff = 10 #=========== # input @@ -41,8 +36,8 @@ import_dirs(drug, gene) #--------------------------- if (!exists("infile_params") && exists("gene")){ #if (!is.character(infile_params) && exists("gene")){ # when running as cmd - #in_filename_params = paste0(tolower(gene), "_all_params.csv") #for pncA - in_filename_params = paste0(tolower(gene), "_comb_afor.csv") # part combined for gid + in_filename_params = paste0(tolower(gene), "_all_params.csv") #for pncA (and for gid finally) 10/09/21 + #in_filename_params = paste0(tolower(gene), "_comb_afor.csv") # part combined for gid infile_params = paste0(outdir, "/", in_filename_params) cat("\nInput file for mcsm comb data not specified, assuming filename: ", infile_params, "\n") } @@ -54,10 +49,15 @@ pd_df = plotting_data(mcsm_df , lig_dist_colname = LigDist_colname , lig_dist_cutoff = LigDist_cutoff) -my_df = pd_df[[1]] -my_df_u = pd_df[[2]] # this forms one of the input for combining_dfs_plotting() -my_df_u_lig = pd_df[[3]] -dup_muts = pd_df[[4]] +my_df = pd_df[[1]] +my_df_u = pd_df[[2]] # this forms one of the input for combining_dfs_plotting() + +max_ang <- round(max(my_df_u[LigDist_colname])) +min_ang <- round(min(my_df_u[LigDist_colname])) + +cat("\nLigand distance cut off, colname:", LigDist_colname + , "\nThe max distance", gene, "structure df" , ":", max_ang, "\u212b" + , "\nThe min distance", gene, "structure df" , ":", min_ang, "\u212b") #-------------------------------- # call: combining_dfs_plotting() @@ -81,509 +81,149 @@ all_plot_dfs = combining_dfs_plotting(my_df_u , lig_dist_colname = LigDist_colname , lig_dist_cutoff = LigDist_cutoff) -merged_df2 = all_plot_dfs[[1]] -merged_df3 = all_plot_dfs[[2]] -merged_df2_comp = all_plot_dfs[[3]] -merged_df3_comp = all_plot_dfs[[4]] -merged_df2_lig = all_plot_dfs[[5]] -merged_df3_lig = all_plot_dfs[[6]] -merged_df2_comp_lig = all_plot_dfs[[7]] -merged_df3_comp_lig = all_plot_dfs[[8]] +merged_df2 = all_plot_dfs[[1]] +merged_df3 = all_plot_dfs[[2]] +merged_df2_comp = all_plot_dfs[[3]] +merged_df3_comp = all_plot_dfs[[4]] +#====================================================================== +#TODO: Think! MOVE TO COMBINE or singular file for deepddg + +#============================ +# adding deepddg scaled values +# scale data b/w -1 and 1 +#============================ +# n = which(colnames(merged_df3) == "deepddg"); n +# +# my_min = min(merged_df3[,n]); my_min +# my_max = max(merged_df3[,n]); my_max +# +# merged_df3$deepddg_scaled = ifelse(merged_df3[,n] < 0 +# , merged_df3[,n]/abs(my_min) +# , merged_df3[,n]/my_max) +# # sanity check +# my_min = min(merged_df3$deepddg_scaled); my_min +# my_max = max(merged_df3$deepddg_scaled); my_max +# +# if (my_min == -1 && my_max == 1){ +# cat("\nPASS: DeepDDG successfully scaled b/w -1 and 1" +# #, "\nProceeding with assigning deep outcome category") +# , "\n") +# }else{ +# cat("\nFAIL: could not scale DeepDDG ddg values" +# , "Aborting!") +# } +# #################################################################### -# Data for subcols barplot (~heatmpa) +# Data for combining other dfs #################################################################### -# can include: mutation, or_kin, pwald, af_kin -cols_to_select = c("mutationinformation", "drtype" - , "wild_type" - , "position" - , "mutant_type" - , "chain", "ligand_id", "ligand_distance" - , "duet_stability_change", "duet_outcome", "duet_scaled" - , "ligand_affinity_change", "ligand_outcome", "affinity_scaled" - , "ddg_foldx", "foldx_scaled", "foldx_outcome" - , "deepddg", "deepddg_outcome" # comment out as not available for pnca - , "asa", "rsa", "rd_values", "kd_values" - , "af", "or_mychisq", "pval_fisher" - , "or_fisher", "or_logistic", "pval_logistic" - , "wt_prop_water", "mut_prop_water", "wt_prop_polarity", "mut_prop_polarity" - , "wt_calcprop", "mut_calcprop") -#======================= -# Data for sub colours -# barplot: PS -#======================= +#source("other_dfs_data.R") +# Fixed this at source i.e python script +# Moved: "other_dfs_data.R" to redundant/ -cat("\nNo. of cols to select:", length(cols_to_select)) +#################################################################### +# Data for subcols barplot (~heatmap) +#################################################################### -subcols_df_ps = merged_df3[, cols_to_select] - -cat("\nNo of unique positions for ps:" - , length(unique(subcols_df_ps$position))) - -# add count_pos col that counts the no. of nsSNPS at a position -setDT(subcols_df_ps)[, pos_count := .N, by = .(position)] - -# should be a factor -if (is.factor(subcols_df_ps$duet_outcome)){ - cat("\nDuet_outcome is factor") - table(subcols_df_ps$duet_outcome) -}else{ - cat("\nConverting duet_outcome to factor") - subcols_df_ps$duet_outcome = as.factor(subcols_df_ps$duet_outcome) - table(subcols_df_ps$duet_outcome) -} - -# should be -1 and 1 -min(subcols_df_ps$duet_scaled) -max(subcols_df_ps$duet_scaled) - -tapply(subcols_df_ps$duet_scaled, subcols_df_ps$duet_outcome, min) -tapply(subcols_df_ps$duet_scaled, subcols_df_ps$duet_outcome, max) - -# check unique values in normalised data -cat("\nNo. of unique values in duet scaled, no rounding:" - , length(unique(subcols_df_ps$duet_scaled))) - -# No rounding -my_grp = subcols_df_ps$duet_scaled; length(my_grp) - -# Add rounding is to be used -n = 3 -subcols_df_ps$duet_scaledR = round(subcols_df_ps$duet_scaled, n) - -cat("\nNo. of unique values in duet scaled", n, "places rounding:" - , length(unique(subcols_df_ps$duet_scaledR))) - -my_grp_r = subcols_df_ps$duet_scaledR # rounding - -# Add grp cols -subcols_df_ps$group <- paste0(subcols_df_ps$duet_outcome, "_", my_grp, sep = "") -subcols_df_ps$groupR <- paste0(subcols_df_ps$duet_outcome, "_", my_grp_r, sep = "") - -# Call the function to create the palette based on the group defined above -subcols_ps <- ColourPalleteMulti(subcols_df_ps, "duet_outcome", "my_grp") -subcolsR_ps <- ColourPalleteMulti(subcols_df_ps, "duet_outcome", "my_grp_r") - -print(paste0("Colour palette generated for my_grp: ", length(subcols_ps), " colours")) -print(paste0("Colour palette generated for my_grp_r: ", length(subcolsR_ps), " colours")) - -#======================= -# Data for sub colours -# barplot: LIG -#======================= -cat("\nNo. of cols to select:", length(cols_to_select)) - -subcols_df_lig = merged_df3_lig[, cols_to_select] - -cat("\nNo of unique positions for LIG:" - , length(unique(subcols_df_lig$position))) - -# should be a factor -if (is.factor(subcols_df_lig$ligand_outcome)){ - cat("\nLigand_outcome is factor") - table(subcols_df_lig$ligand_outcome) -}else{ - cat("\nConverting ligand_outcome to factor") - subcols_df_lig$ligand_outcome = as.factor(subcols_df_lig$ligand_outcome) - table(subcols_df_lig$ligand_outcome) -} - -# should be -1 and 1 -min(subcols_df_lig$affinity_scaled) -max(subcols_df_lig$affinity_scaled) - -tapply(subcols_df_lig$affinity_scaled, subcols_df_lig$ligand_outcome, min) -tapply(subcols_df_lig$affinity_scaled, subcols_df_lig$ligand_outcome, max) - -# check unique values in normalised data -cat("\nNo. of unique values in affinity scaled, no rounding:" - , length(unique(subcols_df_lig$affinity_scaled))) - -# No rounding -my_grp_lig = subcols_df_lig$affinity_scaled; length(my_grp_lig) - -# Add rounding is to be used -n = 3 -subcols_df_lig$affinity_scaledR = round(subcols_df_lig$affinity_scaled, n) - -cat("\nNo. of unique values in duet scaled", n, "places rounding:" - , length(unique(subcols_df_lig$affinity_scaledR))) - -my_grp_lig_r = subcols_df_lig$affinity_scaledR # rounding - -# Add grp cols -subcols_df_lig$group_lig <- paste0(subcols_df_lig$ligand_outcome, "_", my_grp_lig, sep = "") -subcols_df_lig$group_ligR <- paste0(subcols_df_lig$ligand_outcome, "_", my_grp_lig_r, sep = "") - -# Call the function to create the palette based on the group defined above -subcols_lig <- ColourPalleteMulti(subcols_df_lig, "ligand_outcome", "my_grp_lig") -subcolsR_lig <- ColourPalleteMulti(subcols_df_lig, "ligand_outcome", "my_grp_lig_r") - -print(paste0("Colour palette generated for my_grp: ", length(subcols_lig), " colours")) -print(paste0("Colour palette generated for my_grp_r: ", length(subcolsR_lig), " colours")) +#source("coloured_bp_data.R") +# Repurposed function so that params can be passed instead to generate +# data required for plotting. +# Moved "coloured_bp_data.R" to redundant/ #################################################################### # Data for logoplots #################################################################### -#------------------------- -# choose df for logoplot -#------------------------- -logo_data = merged_df3 -#logo_data = merged_df3_comp -# quick checks -colnames(logo_data) -str(logo_data) +source("logo_data.R") -c1 = unique(logo_data$position) -nrow(logo_data) -cat("No. of rows in my_data:", nrow(logo_data) - , "\nDistinct positions corresponding to snps:", length(c1) - , "\n===========================================================") -#======================================================================= -#================== -# logo data: OR -#================== -foo = logo_data[, c("position" - , "mutant_type","duet_scaled", "or_mychisq" - , "mut_prop_polarity", "mut_prop_water")] - -logo_data$log10or = log10(logo_data$or_mychisq) -logo_data_plot = logo_data[, c("position" - , "mutant_type", "or_mychisq", "log10or")] - -logo_data_plot_or = logo_data[, c("position", "mutant_type", "or_mychisq")] -wide_df_or <- logo_data_plot_or %>% spread(position, or_mychisq, fill = 0.0) - -wide_df_or = as.matrix(wide_df_or) -rownames(wide_df_or) = wide_df_or[,1] -dim(wide_df_or) -wide_df_or = wide_df_or[,-1] -str(wide_df_or) - -position_or = as.numeric(colnames(wide_df_or)) - -#================== -# logo data: logOR -#================== -logo_data_plot_logor = logo_data[, c("position", "mutant_type", "log10or")] -wide_df_logor <- logo_data_plot_logor %>% spread(position, log10or, fill = 0.0) - -wide_df_logor = as.matrix(wide_df_logor) - -rownames(wide_df_logor) = wide_df_logor[,1] -wide_df_logor = subset(wide_df_logor, select = -c(1) ) -colnames(wide_df_logor) -wide_df_logor_m = data.matrix(wide_df_logor) - -rownames(wide_df_logor_m) -colnames(wide_df_logor_m) - -position_logor = as.numeric(colnames(wide_df_logor_m)) - -#=============================== -# logo data: multiple nsSNPs (>1) -#================================= -#require(data.table) - -# get freq count of positions so you can subset freq<1 -setDT(logo_data)[, mut_pos_occurrence := .N, by = .(position)] - -table(logo_data$position) -table(logo_data$mut_pos_occurrence) - -max_mut = max(table(logo_data$position)) - -# extract freq_pos > 1 -my_data_snp = logo_data[logo_data$mut_pos_occurrence!=1,] -u = unique(my_data_snp$position) -max_mult_mut = max(table(my_data_snp$position)) - -if (nrow(my_data_snp) == nrow(logo_data) - table(logo_data$mut_pos_occurrence)[[1]] ){ - - cat("PASS: positions with multiple muts extracted" - , "\nNo. of mutations:", nrow(my_data_snp) - , "\nNo. of positions:", length(u) - , "\nMax no. of muts at any position", max_mult_mut) -}else{ - cat("FAIL: positions with multiple muts could NOT be extracted" - , "\nExpected:",nrow(logo_data) - table(logo_data$mut_pos_occurrence)[[1]] - , "\nGot:", nrow(my_data_snp) ) -} - -cat("\nNo. of sites with only 1 mutations:", table(logo_data$mut_pos_occurrence)[[1]]) - -#-------------------------------------- -# matrix for_mychisq mutant type -# frequency of mutant type by position -#--------------------------------------- -table(my_data_snp$mutant_type, my_data_snp$position) -tab_mt = table(my_data_snp$mutant_type, my_data_snp$position) -class(tab_mt) - -# unclass to convert to matrix -tab_mt = unclass(tab_mt) -tab_mt = as.matrix(tab_mt, rownames = T) - -# should be TRUE -is.matrix(tab_mt) - -rownames(tab_mt) #aa -colnames(tab_mt) #pos - -#------------------------------------- -# matrix for wild type -# frequency of wild type by position -#------------------------------------- -tab_wt = table(my_data_snp$wild_type, my_data_snp$position); tab_wt -tab_wt = unclass(tab_wt) - -# remove wt duplicates -wt = my_data_snp[, c("position", "wild_type")] -wt = wt[!duplicated(wt),] - -tab_wt = table(wt$wild_type, wt$position); tab_wt # should all be 1 - -rownames(tab_wt) -rownames(tab_wt) - -identical(colnames(tab_mt), colnames(tab_wt)) -identical(ncol(tab_mt), ncol(tab_wt)) - -#---------------------------------- -# logo data OR: multiple nsSNPs (>1) -#---------------------------------- -logo_data_or_mult = my_data_snp[, c("position", "mutant_type", "or_mychisq")] -#wide_df_or <- logo_data_or %>% spread(position, or_mychisq, fill = 0.0) -wide_df_or_mult <- logo_data_or_mult %>% spread(position, or_mychisq, fill = NA) - -wide_df_or_mult = as.matrix(wide_df_or_mult) -rownames(wide_df_or_mult) = wide_df_or_mult[,1] -wide_df_or_mult = wide_df_or_mult[,-1] -str(wide_df_or_mult) - -position_or_mult = as.numeric(colnames(wide_df_or_mult)) +s1 = c("\nSuccessfully sourced logo_data.R") +cat(s1) #################################################################### -# Data for Corrplots +# Data for DM OM Plots: Long format dfs #################################################################### -cat("\n==========================================" - , "\nCORR PLOTS data: PS" - , "\n===========================================") -df_ps = merged_df2 +#source("other_plots_data.R") -#-------------------- -# adding log cols : NEW UNCOMMENT -#-------------------- -#df_ps$log10_or_mychisq = log10(df_ps$or_mychisq) -#df_ps$neglog_pval_fisher = -log10(df_ps$pval_fisher) +source("dm_om_data.R") -##df_ps$log10_or_kin = log10(df_ps$or_kin) -##df_ps$neglog_pwald_kin = -log10(df_ps$pwald_kin) +s2 = c("\nSuccessfully sourced other_plots_data.R") +cat(s2) -#df_ps$mutation_info_labels = ifelse(df_ps$mutation_info == dr_muts_col, 1, 0) +#################################################################### +# Data for Lineage barplots: WF and LF dfs +#################################################################### -#---------------------------- -# columns for corr plots:PS -#---------------------------- -# subset data to generate pairwise correlations -cols_to_select = c("mutationinformation" - , "duet_scaled" - , "foldx_scaled" - #, "mutation_info_labels" - , "asa" - , "rsa" - , "rd_values" - , "kd_values" - , "log10_or_mychisq" - , "neglog_pval_fisher" - ##, "or_kin" - ##, "neglog_pwald_kin" - , "af" - ##, "af_kin" - , "duet_outcome" - , drug) +source("lineage_data.R") -corr_data_ps = df_ps[cols_to_select] +s3 = c("\nSuccessfully sourced lineage_data.R") +cat(s3) -dim(corr_data_ps) +#################################################################### +# Data for corr plots: +#################################################################### +# make sure the above script works because merged_df2_combined is needed +source("corr_data.R") -#-------------------------------------- -# assign nice colnames (for display) -#-------------------------------------- -my_corr_colnames = c("Mutation" - , "DUET" - , "FoldX" - #, "Mutation class" - , "ASA" - , "RSA" - , "RD" - , "KD" - , "Log (OR)" - , "-Log (P)" - ##, "Adjusted (OR)" - ##, "-Log (P wald)" - , "MAF" - ##, "AF_kin" - , "duet_outcome" - , drug) - -length(my_corr_colnames) - -colnames(corr_data_ps) -colnames(corr_data_ps) <- my_corr_colnames -colnames(corr_data_ps) - -start = 1 -end = which(colnames(corr_data_ps) == drug); end # should be the last column -offset = 1 - -#=========================== -# Corr data for plots: PS -# big_df ps: ~ merged_df2 -#=========================== - -#corr_ps_df2 = corr_data_ps[start:(end-offset)] # without drug -corr_ps_df2 = corr_data_ps[start:end] -head(corr_ps_df2) - -#=========================== -# Corr data for plots: PS -# short_df ps: ~merged_df3 -#=========================== -corr_ps_df3 = corr_ps_df2[!duplicated(corr_ps_df2$Mutation),] - -na_or = sum(is.na(corr_ps_df3$`Log (OR)`)) -check1 = nrow(corr_ps_df3) - na_or - -##na_adj_or = sum(is.na(corr_ps_df3$`adjusted (OR)`)) -##check2 = nrow(corr_ps_df3) - na_adj_or - -if (nrow(corr_ps_df3) == nrow(merged_df3) && nrow(merged_df3_comp) == check1) { - cat( "\nPASS: No. of rows for corr_ps_df3 match" - , "\nPASS: No. of OR values checked: " , check1) -} else { - cat("\nFAIL: Numbers mismatch:" - , "\nExpected nrows: ", nrow(merged_df3) - , "\nGot: ", nrow(corr_ps_df3) - , "\nExpected OR values: ", nrow(merged_df3_comp) - , "\nGot: ", check1) -} - -#================================= -# Data for Correlation plots: LIG -#================================= -cat("\n==========================================" - , "\nCORR PLOTS data: LIG" - , "\n===========================================") - -df_lig = merged_df2_lig - -table(df_lig$ligand_outcome) - -#-------------------- -# adding log cols : NEW UNCOMMENT -#-------------------- -#df_lig$log10_or_mychisq = log10(df_lig$or_mychisq) -#df_lig$neglog_pval_fisher = -log10(df_lig$pval_fisher) - -##df_lig$log10_or_kin = log10(df_lig$or_kin) -##df_lig$neglog_pwald_kin = -log10(df_lig$pwald_kin) - -#---------------------------- -# columns for corr plots:PS -#---------------------------- -# subset data to generate pairwise correlations -cols_to_select = c("mutationinformation" - , "affinity_scaled" - #, "mutation_info_labels" - , "asa" - , "rsa" - , "rd_values" - , "kd_values" - , "log10_or_mychisq" - , "neglog_pval_fisher" - ##, "or_kin" - ##, "neglog_pwald_kin" - , "af" - ##, "af_kin" - , "ligand_outcome" - , drug) - -corr_data_lig = df_lig[, cols_to_select] - -dim(corr_data_lig) - -#-------------------------------------- -# assign nice colnames (for display) -#-------------------------------------- -my_corr_colnames = c("Mutation" - , "Ligand Affinity" - #, "Mutation class" - , "ASA" - , "RSA" - , "RD" - , "KD" - , "Log (OR)" - , "-Log (P)" - ##, "Adjusted (OR)" - ##, "-Log (P wald)" - , "MAF" - ##, "MAF_kin" - , "ligand_outcome" - , drug) - -length(my_corr_colnames) - -colnames(corr_data_lig) -colnames(corr_data_lig) <- my_corr_colnames -colnames(corr_data_lig) - -start = 1 -end = which(colnames(corr_data_lig) == drug); end # should be the last column -offset = 1 - -#============================= -# Corr data for plots: LIG -# big_df lig: ~ merged_df2_lig -#============================== -#corr_lig_df2 = corr_data_lig[start:(end-offset)] # without drug -corr_lig_df2 = corr_data_lig[start:end] -head(corr_lig_df2) - -#============================= -# Corr data for plots: LIG -# short_df lig: ~ merged_df3_lig -#============================== -corr_lig_df3 = corr_lig_df2[!duplicated(corr_lig_df2$Mutation),] - -na_or_lig = sum(is.na(corr_lig_df3$`Log (OR)`)) -check1_lig = nrow(corr_lig_df3) - na_or_lig - -if (nrow(corr_lig_df3) == nrow(merged_df3_lig) && nrow(merged_df3_comp_lig) == check1_lig) { - cat( "\nPASS: No. of rows for corr_lig_df3 match" - , "\nPASS: No. of OR values checked: " , check1_lig) -} else { - cat("\nFAIL: Numbers mismatch:" - , "\nExpected nrows: ", nrow(merged_df3_lig) - , "\nGot: ", nrow(corr_ps_df3_lig) - , "\nExpected OR values: ", nrow(merged_df3_comp_lig) - , "\nGot: ", check1_lig) -} - -# remove unnecessary columns -identical(corr_data_lig, corr_lig_df2) -identical(corr_data_ps, corr_ps_df2) - -#rm(df_ps, df_lig, corr_data_ps, corr_data_lig) +s4 = c("\nSuccessfully sourced corr_data.R") +cat(s4) ######################################################################## # End of script ######################################################################## -rm(foo) +if ( all( length(s1), length(s2), length(s3), length(s4) ) >0 ){ + cat( + "\n##################################################" + , "\nSuccessful: get_plotting_dfs.R worked!" + , "\n###################################################\n") +} else { + cat( + "\n#################################################" + , "\nFAIL: get_plotting_dfs.R didn't complete fully!Please check" + , "\n###################################################\n" ) + } + +######################################################################## +# clear excess variables +rm(c1, c2, c3, c4, check1 + , curr_count, curr_total + , cols_check + , cols_to_select + , cols_to_select_deepddg + , cols_to_select_duet + , cols_to_select_dynamut + , cols_to_select_dynamut2 + , cols_to_select_encomddg + , cols_to_select_encomdds + , cols_to_select_mcsm + , cols_to_select_mcsm_na + , cols_to_select_sdm + , infile_metadata + , infile_params + #, infilename_dynamut + #, infilename_dynamut2 + #, infilename_mcsm_f_snps + #, infilename_mcsm_na + ) -cat("\n===================================================\n" - , "\nSuccessful: get_plotting_dfs.R worked!" - , "\n====================================================") \ No newline at end of file +rm(pivot_cols +, pivot_cols_deepddg +, pivot_cols_duet +, pivot_cols_dynamut +, pivot_cols_dynamut2 +, pivot_cols_encomddg +, pivot_cols_encomdds +, pivot_cols_foldx +, pivot_cols_mcsm +, pivot_cols_mcsm_na +, pivot_cols_n +, pivot_cols_sdm) + +rm(expected_cols +, expected_ncols +, expected_rows +, expected_rows_lf +, fact_cols) + + diff --git a/scripts/plotting/get_plotting_dfs_with_lig.R b/scripts/plotting/get_plotting_dfs_with_lig.R new file mode 100755 index 0000000..f17e997 --- /dev/null +++ b/scripts/plotting/get_plotting_dfs_with_lig.R @@ -0,0 +1,589 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: Get formatted data for plots +#======================================================================= +# working dir and loading libraries +getwd() +setwd("~/git/LSHTM_analysis/scripts/plotting") +getwd() + +source("Header_TT.R") +source("../functions/my_pairs_panel.R") # with lower panel turned off +source("../functions/plotting_globals.R") +source("../functions/plotting_data.R") +source("../functions/combining_dfs_plotting.R") +source("../functions/bp_subcolours.R") + +#******************** +# cmd args passed +# in from other scripts +# to call this +#******************** +#drug = 'streptomycin' +#gene = 'gid' +#==================== +# variables for lig +#==================== + +LigDist_colname = "ligand_distance" +LigDist_cutoff = 10 + +#=========== +# input +#=========== +#--------------------- +# call: import_dirs() +#--------------------- +import_dirs(drug, gene) + +#--------------------------- +# call: plotting_data() +#--------------------------- +if (!exists("infile_params") && exists("gene")){ +#if (!is.character(infile_params) && exists("gene")){ # when running as cmd + #in_filename_params = paste0(tolower(gene), "_all_params.csv") #for pncA + in_filename_params = paste0(tolower(gene), "_comb_afor.csv") # part combined for gid + infile_params = paste0(outdir, "/", in_filename_params) + cat("\nInput file for mcsm comb data not specified, assuming filename: ", infile_params, "\n") +} + +# Input 1: read _comb_afor.csv +cat("\nReading mcsm combined data file: ", infile_params) +mcsm_df = read.csv(infile_params, header = T) +pd_df = plotting_data(mcsm_df + , lig_dist_colname = LigDist_colname + , lig_dist_cutoff = LigDist_cutoff) + +my_df = pd_df[[1]] +my_df_u = pd_df[[2]] # this forms one of the input for combining_dfs_plotting() +my_df_u_lig = pd_df[[3]] +dup_muts = pd_df[[4]] + +#-------------------------------- +# call: combining_dfs_plotting() +#-------------------------------- +if (!exists("infile_metadata") && exists("gene")){ +#if (!is.character(infile_metadata) && exists("gene")){ # when running as cmd + in_filename_metadata = paste0(tolower(gene), "_metadata.csv") # part combined for gid + infile_metadata = paste0(outdir, "/", in_filename_metadata) + cat("\nInput file for gene metadata not specified, assuming filename: ", infile_metadata, "\n") +} + +# Input 2: read _meta data.csv +cat("\nReading meta data file: ", infile_metadata) + +gene_metadata <- read.csv(infile_metadata + , stringsAsFactors = F + , header = T) + +all_plot_dfs = combining_dfs_plotting(my_df_u + , gene_metadata + , lig_dist_colname = LigDist_colname + , lig_dist_cutoff = LigDist_cutoff) + +merged_df2 = all_plot_dfs[[1]] +merged_df3 = all_plot_dfs[[2]] +merged_df2_comp = all_plot_dfs[[3]] +merged_df3_comp = all_plot_dfs[[4]] +merged_df2_lig = all_plot_dfs[[5]] +merged_df3_lig = all_plot_dfs[[6]] +merged_df2_comp_lig = all_plot_dfs[[7]] +merged_df3_comp_lig = all_plot_dfs[[8]] + +#################################################################### +# Data for subcols barplot (~heatmap) +#################################################################### +# can include: mutation, or_kin, pwald, af_kin +cols_to_select = c("mutationinformation", "drtype" + , "wild_type" + , "position" + , "mutant_type" + , "chain", "ligand_id", "ligand_distance" + , "duet_stability_change", "duet_outcome", "duet_scaled" + , "ligand_affinity_change", "ligand_outcome", "affinity_scaled" + , "ddg_foldx", "foldx_scaled", "foldx_outcome" + , "deepddg", "deepddg_outcome" # comment out as not available for pnca + , "asa", "rsa", "rd_values", "kd_values" + , "af", "or_mychisq", "pval_fisher" + , "or_fisher", "or_logistic", "pval_logistic" + , "wt_prop_water", "mut_prop_water", "wt_prop_polarity", "mut_prop_polarity" + , "wt_calcprop", "mut_calcprop") + +#======================= +# Data for sub colours +# barplot: PS +#======================= + +cat("\nNo. of cols to select:", length(cols_to_select)) + +subcols_df_ps = merged_df3[, cols_to_select] + +cat("\nNo of unique positions for ps:" + , length(unique(subcols_df_ps$position))) + +# add count_pos col that counts the no. of nsSNPS at a position +setDT(subcols_df_ps)[, pos_count := .N, by = .(position)] + +# should be a factor +if (is.factor(subcols_df_ps$duet_outcome)){ + cat("\nDuet_outcome is factor") + table(subcols_df_ps$duet_outcome) +}else{ + cat("\nConverting duet_outcome to factor") + subcols_df_ps$duet_outcome = as.factor(subcols_df_ps$duet_outcome) + table(subcols_df_ps$duet_outcome) +} + +# should be -1 and 1 +min(subcols_df_ps$duet_scaled) +max(subcols_df_ps$duet_scaled) + +tapply(subcols_df_ps$duet_scaled, subcols_df_ps$duet_outcome, min) +tapply(subcols_df_ps$duet_scaled, subcols_df_ps$duet_outcome, max) + +# check unique values in normalised data +cat("\nNo. of unique values in duet scaled, no rounding:" + , length(unique(subcols_df_ps$duet_scaled))) + +# No rounding +my_grp = subcols_df_ps$duet_scaled; length(my_grp) + +# Add rounding is to be used +n = 3 +subcols_df_ps$duet_scaledR = round(subcols_df_ps$duet_scaled, n) + +cat("\nNo. of unique values in duet scaled", n, "places rounding:" + , length(unique(subcols_df_ps$duet_scaledR))) + +my_grp_r = subcols_df_ps$duet_scaledR # rounding + +# Add grp cols +subcols_df_ps$group <- paste0(subcols_df_ps$duet_outcome, "_", my_grp, sep = "") +subcols_df_ps$groupR <- paste0(subcols_df_ps$duet_outcome, "_", my_grp_r, sep = "") + +# Call the function to create the palette based on the group defined above +subcols_ps <- ColourPalleteMulti(subcols_df_ps, "duet_outcome", "my_grp") +subcolsR_ps <- ColourPalleteMulti(subcols_df_ps, "duet_outcome", "my_grp_r") + +print(paste0("Colour palette generated for my_grp: ", length(subcols_ps), " colours")) +print(paste0("Colour palette generated for my_grp_r: ", length(subcolsR_ps), " colours")) + +#======================= +# Data for sub colours +# barplot: LIG +#======================= +cat("\nNo. of cols to select:", length(cols_to_select)) + +subcols_df_lig = merged_df3_lig[, cols_to_select] + +cat("\nNo of unique positions for LIG:" + , length(unique(subcols_df_lig$position))) + +# should be a factor +if (is.factor(subcols_df_lig$ligand_outcome)){ + cat("\nLigand_outcome is factor") + table(subcols_df_lig$ligand_outcome) +}else{ + cat("\nConverting ligand_outcome to factor") + subcols_df_lig$ligand_outcome = as.factor(subcols_df_lig$ligand_outcome) + table(subcols_df_lig$ligand_outcome) +} + +# should be -1 and 1 +min(subcols_df_lig$affinity_scaled) +max(subcols_df_lig$affinity_scaled) + +tapply(subcols_df_lig$affinity_scaled, subcols_df_lig$ligand_outcome, min) +tapply(subcols_df_lig$affinity_scaled, subcols_df_lig$ligand_outcome, max) + +# check unique values in normalised data +cat("\nNo. of unique values in affinity scaled, no rounding:" + , length(unique(subcols_df_lig$affinity_scaled))) + +# No rounding +my_grp_lig = subcols_df_lig$affinity_scaled; length(my_grp_lig) + +# Add rounding is to be used +n = 3 +subcols_df_lig$affinity_scaledR = round(subcols_df_lig$affinity_scaled, n) + +cat("\nNo. of unique values in duet scaled", n, "places rounding:" + , length(unique(subcols_df_lig$affinity_scaledR))) + +my_grp_lig_r = subcols_df_lig$affinity_scaledR # rounding + +# Add grp cols +subcols_df_lig$group_lig <- paste0(subcols_df_lig$ligand_outcome, "_", my_grp_lig, sep = "") +subcols_df_lig$group_ligR <- paste0(subcols_df_lig$ligand_outcome, "_", my_grp_lig_r, sep = "") + +# Call the function to create the palette based on the group defined above +subcols_lig <- ColourPalleteMulti(subcols_df_lig, "ligand_outcome", "my_grp_lig") +subcolsR_lig <- ColourPalleteMulti(subcols_df_lig, "ligand_outcome", "my_grp_lig_r") + +print(paste0("Colour palette generated for my_grp: ", length(subcols_lig), " colours")) +print(paste0("Colour palette generated for my_grp_r: ", length(subcolsR_lig), " colours")) + +#################################################################### +# Data for logoplots +#################################################################### +#------------------------- +# choose df for logoplot +#------------------------- +logo_data = merged_df3 +#logo_data = merged_df3_comp + +# quick checks +colnames(logo_data) +str(logo_data) + +c1 = unique(logo_data$position) +nrow(logo_data) +cat("No. of rows in my_data:", nrow(logo_data) + , "\nDistinct positions corresponding to snps:", length(c1) + , "\n===========================================================") +#======================================================================= +#================== +# logo data: OR +#================== +foo = logo_data[, c("position" + , "mutant_type","duet_scaled", "or_mychisq" + , "mut_prop_polarity", "mut_prop_water")] + +logo_data$log10or = log10(logo_data$or_mychisq) +logo_data_plot = logo_data[, c("position" + , "mutant_type", "or_mychisq", "log10or")] + +logo_data_plot_or = logo_data[, c("position", "mutant_type", "or_mychisq")] +wide_df_or <- logo_data_plot_or %>% spread(position, or_mychisq, fill = 0.0) + +wide_df_or = as.matrix(wide_df_or) +rownames(wide_df_or) = wide_df_or[,1] +dim(wide_df_or) +wide_df_or = wide_df_or[,-1] +str(wide_df_or) + +position_or = as.numeric(colnames(wide_df_or)) + +#================== +# logo data: logOR +#================== +logo_data_plot_logor = logo_data[, c("position", "mutant_type", "log10or")] +wide_df_logor <- logo_data_plot_logor %>% spread(position, log10or, fill = 0.0) + +wide_df_logor = as.matrix(wide_df_logor) + +rownames(wide_df_logor) = wide_df_logor[,1] +wide_df_logor = subset(wide_df_logor, select = -c(1) ) +colnames(wide_df_logor) +wide_df_logor_m = data.matrix(wide_df_logor) + +rownames(wide_df_logor_m) +colnames(wide_df_logor_m) + +position_logor = as.numeric(colnames(wide_df_logor_m)) + +#=============================== +# logo data: multiple nsSNPs (>1) +#================================= +#require(data.table) + +# get freq count of positions so you can subset freq<1 +setDT(logo_data)[, mut_pos_occurrence := .N, by = .(position)] + +table(logo_data$position) +table(logo_data$mut_pos_occurrence) + +max_mut = max(table(logo_data$position)) + +# extract freq_pos > 1 +my_data_snp = logo_data[logo_data$mut_pos_occurrence!=1,] +u = unique(my_data_snp$position) +max_mult_mut = max(table(my_data_snp$position)) + +if (nrow(my_data_snp) == nrow(logo_data) - table(logo_data$mut_pos_occurrence)[[1]] ){ + + cat("PASS: positions with multiple muts extracted" + , "\nNo. of mutations:", nrow(my_data_snp) + , "\nNo. of positions:", length(u) + , "\nMax no. of muts at any position", max_mult_mut) +}else{ + cat("FAIL: positions with multiple muts could NOT be extracted" + , "\nExpected:",nrow(logo_data) - table(logo_data$mut_pos_occurrence)[[1]] + , "\nGot:", nrow(my_data_snp) ) +} + +cat("\nNo. of sites with only 1 mutations:", table(logo_data$mut_pos_occurrence)[[1]]) + +#-------------------------------------- +# matrix for_mychisq mutant type +# frequency of mutant type by position +#--------------------------------------- +table(my_data_snp$mutant_type, my_data_snp$position) +tab_mt = table(my_data_snp$mutant_type, my_data_snp$position) +class(tab_mt) + +# unclass to convert to matrix +tab_mt = unclass(tab_mt) +tab_mt = as.matrix(tab_mt, rownames = T) + +# should be TRUE +is.matrix(tab_mt) + +rownames(tab_mt) #aa +colnames(tab_mt) #pos + +#------------------------------------- +# matrix for wild type +# frequency of wild type by position +#------------------------------------- +tab_wt = table(my_data_snp$wild_type, my_data_snp$position); tab_wt +tab_wt = unclass(tab_wt) + +# remove wt duplicates +wt = my_data_snp[, c("position", "wild_type")] +wt = wt[!duplicated(wt),] + +tab_wt = table(wt$wild_type, wt$position); tab_wt # should all be 1 + +rownames(tab_wt) +rownames(tab_wt) + +identical(colnames(tab_mt), colnames(tab_wt)) +identical(ncol(tab_mt), ncol(tab_wt)) + +#---------------------------------- +# logo data OR: multiple nsSNPs (>1) +#---------------------------------- +logo_data_or_mult = my_data_snp[, c("position", "mutant_type", "or_mychisq")] +#wide_df_or <- logo_data_or %>% spread(position, or_mychisq, fill = 0.0) +wide_df_or_mult <- logo_data_or_mult %>% spread(position, or_mychisq, fill = NA) + +wide_df_or_mult = as.matrix(wide_df_or_mult) +rownames(wide_df_or_mult) = wide_df_or_mult[,1] +wide_df_or_mult = wide_df_or_mult[,-1] +str(wide_df_or_mult) + +position_or_mult = as.numeric(colnames(wide_df_or_mult)) + +#################################################################### +# Data for Corrplots +#################################################################### +cat("\n==========================================" + , "\nCORR PLOTS data: PS" + , "\n===========================================") + +df_ps = merged_df2 + +#-------------------- +# adding log cols : NEW UNCOMMENT +#-------------------- +#df_ps$log10_or_mychisq = log10(df_ps$or_mychisq) +#df_ps$neglog_pval_fisher = -log10(df_ps$pval_fisher) + +##df_ps$log10_or_kin = log10(df_ps$or_kin) +##df_ps$neglog_pwald_kin = -log10(df_ps$pwald_kin) + +#df_ps$mutation_info_labels = ifelse(df_ps$mutation_info == dr_muts_col, 1, 0) + +#---------------------------- +# columns for corr plots:PS +#---------------------------- +# subset data to generate pairwise correlations +cols_to_select = c("mutationinformation" + , "duet_scaled" + , "foldx_scaled" + #, "mutation_info_labels" + , "asa" + , "rsa" + , "rd_values" + , "kd_values" + , "log10_or_mychisq" + , "neglog_pval_fisher" + ##, "or_kin" + ##, "neglog_pwald_kin" + , "af" + ##, "af_kin" + , "duet_outcome" + , drug) + +corr_data_ps = df_ps[cols_to_select] + +dim(corr_data_ps) + +#-------------------------------------- +# assign nice colnames (for display) +#-------------------------------------- +my_corr_colnames = c("Mutation" + , "DUET" + , "FoldX" + #, "Mutation class" + , "ASA" + , "RSA" + , "RD" + , "KD" + , "Log (OR)" + , "-Log (P)" + ##, "Adjusted (OR)" + ##, "-Log (P wald)" + , "MAF" + ##, "AF_kin" + , "duet_outcome" + , drug) + +length(my_corr_colnames) + +colnames(corr_data_ps) +colnames(corr_data_ps) <- my_corr_colnames +colnames(corr_data_ps) + +start = 1 +end = which(colnames(corr_data_ps) == drug); end # should be the last column +offset = 1 + +#=========================== +# Corr data for plots: PS +# big_df ps: ~ merged_df2 +#=========================== + +#corr_ps_df2 = corr_data_ps[start:(end-offset)] # without drug +corr_ps_df2 = corr_data_ps[start:end] +head(corr_ps_df2) + +#=========================== +# Corr data for plots: PS +# short_df ps: ~merged_df3 +#=========================== +corr_ps_df3 = corr_ps_df2[!duplicated(corr_ps_df2$Mutation),] + +na_or = sum(is.na(corr_ps_df3$`Log (OR)`)) +check1 = nrow(corr_ps_df3) - na_or + +##na_adj_or = sum(is.na(corr_ps_df3$`adjusted (OR)`)) +##check2 = nrow(corr_ps_df3) - na_adj_or + +if (nrow(corr_ps_df3) == nrow(merged_df3) && nrow(merged_df3_comp) == check1) { + cat( "\nPASS: No. of rows for corr_ps_df3 match" + , "\nPASS: No. of OR values checked: " , check1) +} else { + cat("\nFAIL: Numbers mismatch:" + , "\nExpected nrows: ", nrow(merged_df3) + , "\nGot: ", nrow(corr_ps_df3) + , "\nExpected OR values: ", nrow(merged_df3_comp) + , "\nGot: ", check1) +} + +#================================= +# Data for Correlation plots: LIG +#================================= +cat("\n==========================================" + , "\nCORR PLOTS data: LIG" + , "\n===========================================") + +df_lig = merged_df2_lig + +table(df_lig$ligand_outcome) + +#-------------------- +# adding log cols : NEW UNCOMMENT +#-------------------- +#df_lig$log10_or_mychisq = log10(df_lig$or_mychisq) +#df_lig$neglog_pval_fisher = -log10(df_lig$pval_fisher) + +##df_lig$log10_or_kin = log10(df_lig$or_kin) +##df_lig$neglog_pwald_kin = -log10(df_lig$pwald_kin) + +#---------------------------- +# columns for corr plots:PS +#---------------------------- +# subset data to generate pairwise correlations +cols_to_select = c("mutationinformation" + , "affinity_scaled" + #, "mutation_info_labels" + , "asa" + , "rsa" + , "rd_values" + , "kd_values" + , "log10_or_mychisq" + , "neglog_pval_fisher" + ##, "or_kin" + ##, "neglog_pwald_kin" + , "af" + ##, "af_kin" + , "ligand_outcome" + , drug) + +corr_data_lig = df_lig[, cols_to_select] + +dim(corr_data_lig) + +#-------------------------------------- +# assign nice colnames (for display) +#-------------------------------------- +my_corr_colnames = c("Mutation" + , "Ligand Affinity" + #, "Mutation class" + , "ASA" + , "RSA" + , "RD" + , "KD" + , "Log (OR)" + , "-Log (P)" + ##, "Adjusted (OR)" + ##, "-Log (P wald)" + , "MAF" + ##, "MAF_kin" + , "ligand_outcome" + , drug) + +length(my_corr_colnames) + +colnames(corr_data_lig) +colnames(corr_data_lig) <- my_corr_colnames +colnames(corr_data_lig) + +start = 1 +end = which(colnames(corr_data_lig) == drug); end # should be the last column +offset = 1 + +#============================= +# Corr data for plots: LIG +# big_df lig: ~ merged_df2_lig +#============================== +#corr_lig_df2 = corr_data_lig[start:(end-offset)] # without drug +corr_lig_df2 = corr_data_lig[start:end] +head(corr_lig_df2) + +#============================= +# Corr data for plots: LIG +# short_df lig: ~ merged_df3_lig +#============================== +corr_lig_df3 = corr_lig_df2[!duplicated(corr_lig_df2$Mutation),] + +na_or_lig = sum(is.na(corr_lig_df3$`Log (OR)`)) +check1_lig = nrow(corr_lig_df3) - na_or_lig + +if (nrow(corr_lig_df3) == nrow(merged_df3_lig) && nrow(merged_df3_comp_lig) == check1_lig) { + cat( "\nPASS: No. of rows for corr_lig_df3 match" + , "\nPASS: No. of OR values checked: " , check1_lig) +} else { + cat("\nFAIL: Numbers mismatch:" + , "\nExpected nrows: ", nrow(merged_df3_lig) + , "\nGot: ", nrow(corr_ps_df3_lig) + , "\nExpected OR values: ", nrow(merged_df3_comp_lig) + , "\nGot: ", check1_lig) +} + +# remove unnecessary columns +identical(corr_data_lig, corr_lig_df2) +identical(corr_data_ps, corr_ps_df2) + +#rm(df_ps, df_lig, corr_data_ps, corr_data_lig) + +######################################################################## +# End of script +######################################################################## +rm(foo) + +cat("\n===================================================\n" + , "\nSuccessful: get_plotting_dfs.R worked!" + , "\n====================================================") \ No newline at end of file diff --git a/scripts/plotting/ggcorr_all_PS_LIG.R b/scripts/plotting/ggcorr_all_PS_LIG.R old mode 100644 new mode 100755 diff --git a/scripts/plotting/hist_af_or_base.R b/scripts/plotting/hist_af_or_base.R old mode 100644 new mode 100755 diff --git a/scripts/plotting/hist_af_or_combined.R b/scripts/plotting/hist_af_or_combined.R old mode 100644 new mode 100755 diff --git a/scripts/plotting/legend_adjustment.R b/scripts/plotting/legend_adjustment.R old mode 100644 new mode 100755 diff --git a/scripts/plotting/lineage_basic_barplots_combined.R b/scripts/plotting/lineage_basic_barplots_combined.R new file mode 100755 index 0000000..837e57b --- /dev/null +++ b/scripts/plotting/lineage_basic_barplots_combined.R @@ -0,0 +1,127 @@ +#!/usr/bin/env Rscript +getwd() +setwd("~/git/LSHTM_analysis/scripts/plotting/") +getwd() +######################################################### +# TASK: Basic lineage barplots showing numbers + +# Output: Basic barplot with lineage samples and mut count +# + SNP diversity + +########################################################## +# Installing and loading required packages +########################################################## +source("Header_TT.R") + +#=========== +# input +#=========== +#drug = 'streptomycin' +#gene = 'gid' + +spec = matrix(c( + "drug" , "d", 1, "character", + "gene" , "g", 1, "character", + "data_file1" , "fa", 2, "character", + "data_file2" , "fb", 2, "character" +), byrow = TRUE, ncol = 4) + +opt = getopt(spec) + +drug = opt$drug +gene = opt$gene +infile_params = opt$data_file1 +infile_metadata = opt$data_file2 + +if(is.null(drug)|is.null(gene)) { + stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)") +} + +source ('get_plotting_dfs.R') + +#======= +# output +#======= +# plot 1 +basic_bp_lineage_cl = "basic_lineage_barplots_combined.svg" +plot_basic_bp_lineage_cl = paste0(plotdir,"/", basic_bp_lineage_cl) +plot_basic_bp_lineage_cl +################################################################# +#============================= +# PLOT 1: Lineage count plot: +# LF data +#============================= +#------------------------ +# Data: All lineages or +# selected few +#------------------------ +lin_lf_plot = lin_lf[lin_lf$sel_lineages%in%c("L1", "L2", "L3", "L4"),] +str(lin_lf_plot) + +# drop unused factor levels +lin_lf_plot$sel_lineages = factor(lin_lf_plot$sel_lineages) +levels(lin_lf_plot$sel_lineages) +str(lin_lf_plot) + +#------------------------ +# plot from my function: +#------------------------ +lin_countP = lin_count_bp(lin_lf_plot + , x_categ = "sel_lineages" + , y_count = "p_count" + , bar_fill_categ = "count_categ" + , display_label_col = "p_count" + , bar_stat_stype = "identity" + , x_lab_angle = 90 + , my_xats = 20 + , bar_col_labels = c("Mutations", "Total Samples") + , bar_col_values = c("grey50", "gray75") + , y_scale_percent = F # T for diversity + , y_log10 = F + , y_label = "Count") +lin_countP +#================================ +# PLOT 2: Lineage Diversity plot +# WF data +#================================ +#------------------------ +# Data: All lineages or +# selected few +#------------------------ +lin_wf_plot = lin_wf[lin_wf$sel_lineages%in%c("L1", "L2", "L3", "L4"),] +str(lin_wf_plot) + +# drop unused factor levels +lin_wf_plot$sel_lineages = factor(lin_wf_plot$sel_lineages) +levels(lin_wf_plot$sel_lineages) +str(lin_wf_plot) + +#------------------------ +# plot from my function: +#------------------------ +lin_diversityP = lin_count_bp(lin_wf_plot + , x_categ = "sel_lineages" + , y_count = "snp_diversity" + , display_label_col = "snp_diversity_f" + , bar_stat_stype = "identity" + , x_lab_angle = 90 + , my_xats = 20 + , y_scale_percent = T + , y_label = "SNP diversity") + +lin_diversityP +#########################################################################333 +#================================ +# Combine plots +#================================ + +svg(plot_basic_bp_lineage_cl , width = 8, height = 15 ) + +lineage_bp_combined = cowplot::plot_grid(lin_countP, lin_diversityP + #, labels = c("(a)", "(b)", "(c)", "(d)") + , nrow = 2 + , labels = "AUTO" + , label_size = 25) + +lineage_bp_combined +dev.off() diff --git a/scripts/plotting/lineage_data.R b/scripts/plotting/lineage_data.R new file mode 100755 index 0000000..9549863 --- /dev/null +++ b/scripts/plotting/lineage_data.R @@ -0,0 +1,147 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: Script to format data for lineage barplots: +# WF and LF data with lineage sample, and snp counts +# sourced by get_plotting_dfs.R +######################################################### + +#================================================= +# Get data with lineage count, and snp diversity +#================================================= +table(merged_df2$lineage) + +if (table(merged_df2$lineage == "")[[2]]) { + +cat("\nMissing samples with lineage classification:", table(merged_df2$lineage == "")[[2]]) + +} + +table(merged_df2$lineage_labels) +class(merged_df2$lineage_labels); nlevels(merged_df2$lineage_labels) + +#========================================== +# WF data: lineages with +# snp count +# total_samples +# snp diversity (perc) +#========================================== +sel_lineages = levels(merged_df2$lineage_labels) + +lin_wf = data.frame(sel_lineages) #4, 1 +total_snps_u = NULL +total_samples = NULL + +for (i in sel_lineages){ + #print(i) + curr_total = length(unique(merged_df2$id)[merged_df2$lineage_labels==i]) + #print(curr_total) + total_samples = c(total_samples, curr_total) + print(total_samples) + + foo = merged_df2[merged_df2$lineage_labels==i,] + print(paste0(i, "=======\n")) + print(length(unique(foo$mutationinformation))) + curr_count = length(unique(foo$mutationinformation)) + + total_snps_u = c(total_snps_u, curr_count) +} +lin_wf + +# Add these counts as columns to the df +lin_wf$num_snps_u = total_snps_u +lin_wf$total_samples = total_samples +lin_wf + +# Add SNP diversity +lin_wf$snp_diversity = lin_wf$num_snps_u/lin_wf$total_samples +lin_wf + +#---------------------- +# Add some formatting +#---------------------- +# SNP diversity +lin_wf$snp_diversity_f = round( (lin_wf$snp_diversity * 100), digits = 0) +lin_wf$snp_diversity_f = paste0(lin_wf$snp_diversity_f, "%") + +lin_wf$sel_lineages + +# Important: Check factors so that x-axis categ appear as you want +lin_wf$sel_lineages = factor(lin_wf$sel_lineages, c("L1" + , "L2" + , "L3" + , "L4" + , "L5" + , "L6" + , "L7" + , "LBOV" + , "L1;L2" + , "L1;L3" + , "L1;L4" + , "L2;L3" + , "L2;L3;L4" + , "L2;L4" + , "L2;L6" + , "L2;LBOV" + , "L3;L4" + , "L4;L6" + , "L4;L7" + , "")) + +levels(lin_wf$sel_lineages) + +#================================= +# LF data: lineages with +# snp count +# total_samples +# snp diversity (perc) +#================================= +names(lin_wf) +tot_cols = ncol(lin_wf) +pivot_cols = c("sel_lineages", "snp_diversity", "snp_diversity_f") +pivot_cols_n = length(pivot_cols) + +expected_rows = nrow(lin_wf) * ( length(lin_wf) - pivot_cols_n ) + +lin_lf <- gather(lin_wf + , count_categ + , p_count + , num_snps_u:total_samples + , factor_key = TRUE) +lin_lf + +# quick checks +if ( nrow(lin_lf ) == expected_rows ){ + cat("\nPASS: Lineage LF data created" + , "\nnrow: ", nrow(lin_lf) + , "\nncol: ", ncol(lin_lf)) +} else { + cat("\nFAIL: numbers mismatch" + , "\nExpected nrow: ", expected_rows) +} + +# Important: Relevel factors so that x-axis categ appear as you want +lin_lf$sel_lineages = factor(lin_lf$sel_lineages, c("L1" + , "L2" + , "L3" + , "L4" + , "L5" + , "L6" + , "L7" + , "LBOV" + , "L1;L2" + , "L1;L3" + , "L1;L4" + , "L2;L3" + , "L2;L3;L4" + , "L2;L4" + , "L2;L6" + , "L2;LBOV" + , "L3;L4" + , "L4;L6" + , "L4;L7" + , "")) + +levels(lin_lf$sel_lineages) + +################################################################ + diff --git a/scripts/plotting/lineage_dist_plots.R b/scripts/plotting/lineage_dist_plots.R new file mode 100644 index 0000000..cd1563d --- /dev/null +++ b/scripts/plotting/lineage_dist_plots.R @@ -0,0 +1,143 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: Lineage dist plots: ggridges + +# Output: 1 or 2 SVGs for PS stability + +########################################################## +# Installing and loading required packages +########################################################## +getwd() +setwd("~/git/LSHTM_analysis/scripts/plotting/") +getwd() + +source("Header_TT.R") # also loads all my functions + +#=========== +# input +#=========== +drug = "streptomycin" +gene = "gid" +#source("get_plotting_dfs.R") + +spec = matrix(c( + "drug" , "d", 1, "character", + "gene" , "g", 1, "character", + "data_file1" , "fa", 2, "character", + "data_file2" , "fb", 2, "character" +), byrow = TRUE, ncol = 4) + +opt = getopt(spec) + +drug = opt$drug +gene = opt$gene +infile_params = opt$data_file1 +infile_metadata = opt$data_file2 + +if(is.null(drug)|is.null(gene)) { + stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)") +} + +#======= +# output +#======= +lineage_dist_dm_om_ps = "lineage_dist_dm_om_PS.svg" +plot_lineage_dist_dm_om_ps = paste0(plotdir,"/", lineage_dist_dm_om_ps) +#======================================================================== + +########################### +# Data for plots +# you need merged_df2_combined or merged_df2_combined_comp +# since this is one-many relationship +# i.e the same SNP can belong to multiple lineages +# using the _comp dataset means +# we lose some muts and at this level, we should use +# as much info as available, hence use df with NA +########################### + +#=================== +# Data for plots +#=================== +# quick checks +table(merged_df2_combined$mutation_info_labels); levels(merged_df2_combined$lineage_labels) +table(merged_df2_combined$lineage_labels); levels(merged_df2_combined$mutation_info_labels) + +sel_lineages = c("L1", "L2", "L3", "L4") + +lin_dist_plot = merged_df2_combined[merged_df2_combined$lineage_labels%in%sel_lineages,] +table(lin_dist_plot$lineage_labels); nlevels(lin_dist_plot$lineage_labels) + +# refactor +lin_dist_plot$lineage_labels = factor(lin_dist_plot$lineage_labels) +nlevels(lin_dist_plot$lineage_labels) + +#----------------------------------------------------------------------- +# IMPORTANT RESULTS to put inside table or text for interactive plots + +sum(table(lin_dist_plot$lineage_labels)) #{RESULT: Total number of samples for lineage} + +table(lin_dist_plot$lineage_labels)#{RESULT: No of samples within lineage} + +length(unique(lin_dist_plot$mutationinformation))#{Result: No. of unique mutations selected lineages contribute to} +length(lin_dist_plot$mutationinformation) + +u2 = unique(merged_df2_combined$mutationinformation) +u = unique(lin_dist_plot$mutationinformation) +check = u2[!u2%in%u]; print(check) #{Muts not present within selected lineages} +#----------------------------------------------------------------------- + +my_x_and_t = c("duet_scaled", "mCSM-DUET") +my_x_and_t = c("foldx_scaled", "FoldX") +#my_x_and_t = c("deepddg_scaled", "DeepDDG") + +my_x_and_t = c("ddg_dynamut2_scaled", "Dynamut2") +my_x_and_t = c("ddg_dynamut_scaled", "Dynamut") + +my_x_and_t = c("ddg_mcsm_scaled", "mCSM") +my_x_and_t = c("ddg_sdm_scaled", "SDM") +my_x_and_t = c("ddg_duet_scaled", "DUET-d") + +my_x_and_t = c("ddg_encom_scaled", "EnCOM-Stability") +my_x_and_t = c("dds_encom_scaled", "EnCOM-Flexibility") + +my_x_and_t = c("mcsm_na_scaled", "mCSM-NA") + +# TO DO +my_x_and_t = c("affinity_scaled", "mCSM-Lig") #ligdist< 10 + +#===================== +# Plot: without facet +#===================== + +linP_dm_om = lineage_distP(lin_dist_plot + , x_axis = my_x_and_t[1] + , x_lab = my_x_and_t[2] + , y_axis = "lineage_labels" + , leg_label = "Mutation Class" + , with_facet = F) +linP_dm_om + +#===================== +# Plot: with facet +#===================== + +linP_dm_om_facet = lineage_distP(lin_dist_plot + , x_axis = my_x_and_t[1] + , x_lab = my_x_and_t[2] + , y_axis = "lineage_labels" + , with_facet = T + , facet_wrap_var = "mutation_info_labels" + , leg_label = "Mutation Class" + , leg_pos_wf = "none" + , leg_dir_wf = "horizontal") +linP_dm_om_facet + +#================= +# output plot: +# without facet +#================= +svg(plot_lineage_dist_dm_om_ps) + +linP_dm_om + +dev.off() diff --git a/scripts/plotting/logo_data.R b/scripts/plotting/logo_data.R new file mode 100644 index 0000000..7eaf1b6 --- /dev/null +++ b/scripts/plotting/logo_data.R @@ -0,0 +1,142 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: Script to format data for Logo_plots +######################################################### +#------------------------- +# choose df for logoplot +#------------------------- +logo_data = merged_df3 +#logo_data = merged_df3_comp + +# quick checks +colnames(logo_data) +str(logo_data) + +c1 = unique(logo_data$position) +nrow(logo_data) +cat("No. of rows in my_data:", nrow(logo_data) + , "\nDistinct positions corresponding to snps:", length(c1) + , "\n===========================================================") +#======================================================================= +#================== +# logo data: OR +#================== +foo = logo_data[, c("position" + , "mutant_type","duet_scaled", "or_mychisq" + , "mut_prop_polarity", "mut_prop_water")] + +logo_data$log10or = log10(logo_data$or_mychisq) +logo_data_plot = logo_data[, c("position" + , "mutant_type", "or_mychisq", "log10or")] + +logo_data_plot_or = logo_data[, c("position", "mutant_type", "or_mychisq")] +wide_df_or = logo_data_plot_or %>% spread(position, or_mychisq, fill = 0.0) + +wide_df_or = as.matrix(wide_df_or) +rownames(wide_df_or) = wide_df_or[,1] +dim(wide_df_or) +wide_df_or = wide_df_or[,-1] +str(wide_df_or) + +position_or = as.numeric(colnames(wide_df_or)) + +#================== +# logo data: logOR +#================== +logo_data_plot_logor = logo_data[, c("position", "mutant_type", "log10or")] +wide_df_logor <- logo_data_plot_logor %>% spread(position, log10or, fill = 0.0) + +wide_df_logor = as.matrix(wide_df_logor) + +rownames(wide_df_logor) = wide_df_logor[,1] +wide_df_logor = subset(wide_df_logor, select = -c(1) ) +colnames(wide_df_logor) +wide_df_logor_m = data.matrix(wide_df_logor) + +rownames(wide_df_logor_m) +colnames(wide_df_logor_m) + +position_logor = as.numeric(colnames(wide_df_logor_m)) + +#=============================== +# logo data: multiple nsSNPs (>1) +#================================= +#require(data.table) + +# get freq count of positions so you can subset freq<1 +setDT(logo_data)[, mut_pos_occurrence := .N, by = .(position)] + +table(logo_data$position) +table(logo_data$mut_pos_occurrence) + +max_mut = max(table(logo_data$position)) + +# extract freq_pos > 1 +my_data_snp = logo_data[logo_data$mut_pos_occurrence!=1,] +u = unique(my_data_snp$position) +max_mult_mut = max(table(my_data_snp$position)) + +if (nrow(my_data_snp) == nrow(logo_data) - table(logo_data$mut_pos_occurrence)[[1]] ){ + + cat("PASS: positions with multiple muts extracted" + , "\nNo. of mutations:", nrow(my_data_snp) + , "\nNo. of positions:", length(u) + , "\nMax no. of muts at any position", max_mult_mut) +}else{ + cat("FAIL: positions with multiple muts could NOT be extracted" + , "\nExpected:",nrow(logo_data) - table(logo_data$mut_pos_occurrence)[[1]] + , "\nGot:", nrow(my_data_snp) ) +} + +cat("\nNo. of sites with only 1 mutations:", table(logo_data$mut_pos_occurrence)[[1]]) + +#-------------------------------------- +# matrix for_mychisq mutant type +# frequency of mutant type by position +#--------------------------------------- +table(my_data_snp$mutant_type, my_data_snp$position) +tab_mt = table(my_data_snp$mutant_type, my_data_snp$position) +class(tab_mt) + +# unclass to convert to matrix +tab_mt = unclass(tab_mt) +tab_mt = as.matrix(tab_mt, rownames = T) + +# should be TRUE +is.matrix(tab_mt) + +rownames(tab_mt) #aa +colnames(tab_mt) #pos + +#------------------------------------- +# matrix for wild type +# frequency of wild type by position +#------------------------------------- +tab_wt = table(my_data_snp$wild_type, my_data_snp$position); tab_wt +tab_wt = unclass(tab_wt) + +# remove wt duplicates +wt = my_data_snp[, c("position", "wild_type")] +wt = wt[!duplicated(wt),] + +tab_wt = table(wt$wild_type, wt$position); tab_wt # should all be 1 + +rownames(tab_wt) +rownames(tab_wt) + +identical(colnames(tab_mt), colnames(tab_wt)) +identical(ncol(tab_mt), ncol(tab_wt)) + +#---------------------------------- +# logo data OR: multiple nsSNPs (>1) +#---------------------------------- +logo_data_or_mult = my_data_snp[, c("position", "mutant_type", "or_mychisq")] +#wide_df_or = logo_data_or %>% spread(position, or_mychisq, fill = 0.0) +wide_df_or_mult = logo_data_or_mult %>% spread(position, or_mychisq, fill = NA) + +wide_df_or_mult = as.matrix(wide_df_or_mult) +rownames(wide_df_or_mult) = wide_df_or_mult[,1] +wide_df_or_mult = wide_df_or_mult[,-1] +str(wide_df_or_mult) + +position_or_mult = as.numeric(colnames(wide_df_or_mult)) diff --git a/scripts/plotting/opp_mcsm_muts.R b/scripts/plotting/opp_mcsm_muts.R old mode 100644 new mode 100755 diff --git a/scripts/plotting/or_plots_combined.R b/scripts/plotting/or_plots_combined.R old mode 100644 new mode 100755 diff --git a/scripts/plotting/other_plots_combined.R b/scripts/plotting/other_plots_combined.R old mode 100644 new mode 100755 index d927808..3047f38 --- a/scripts/plotting/other_plots_combined.R +++ b/scripts/plotting/other_plots_combined.R @@ -35,7 +35,7 @@ plot_dr_other_combined_labelled = paste0(plotdir,"/", dr_other_combined_labell #my_comparisons <- list( c(dr_muts_col, other_muts_col) ) my_comparisons <- list( c("DM", "OM") ) -my_ats = 22# axis text size +my_ats = 22 # axis text size my_als = 20 # axis label size my_fls = 20 # facet label size my_pts = 22 # plot title size @@ -45,12 +45,15 @@ my_pts = 22 # plot title size #=========== # Plot1: PS #=========== -my_stat_ps = compare_means(param_value~mutation_info, group.by = "param_type" - , data = df_lf_ps, paired = FALSE, p.adjust.method = "BH") +# my_stat_ps = compare_means(param_value~mutation_info +# , group.by = "param_type" +# , data = df_lf_ps +# , paired = FALSE +# , p.adjust.method = "BH") y_value = "param_value" -p1 = ggplot(df_lf_ps, aes(x = mutation_info +p1 = ggplot(lf_duet, aes(x = mutation_info , y = eval(parse(text=y_value)) )) + facet_wrap(~ param_type , nrow = 1 @@ -61,7 +64,7 @@ p1 = ggplot(df_lf_ps, aes(x = mutation_info geom_point(position = position_jitterdodge(dodge.width=0.01) , alpha = 0.5 , show.legend = FALSE - , aes(colour = factor(duet_outcome))) + + , aes(colour = duet_outcome)) + theme(axis.text.x = element_text(size = my_ats) , axis.text.y = element_text(size = my_ats , angle = 0 diff --git a/scripts/plotting/output_tables.R b/scripts/plotting/output_tables.R old mode 100644 new mode 100755 diff --git a/scripts/plotting/ps_plots_combined.R b/scripts/plotting/ps_plots_combined.R old mode 100644 new mode 100755 diff --git a/scripts/plotting/redundant/coloured_bp_data.R b/scripts/plotting/redundant/coloured_bp_data.R new file mode 100644 index 0000000..a1f0964 --- /dev/null +++ b/scripts/plotting/redundant/coloured_bp_data.R @@ -0,0 +1,80 @@ +#!/usr/bin/env Rscript +################################################################# +# TASK: Script to add bp colours ~ barplot heatmap +################################################################# + +my_df = merged_df3 + +cols_to_select = c("mutationinformation", "drtype" + , "wild_type" + , "position" + , "mutant_type" + , "chain", "ligand_id", "ligand_distance" + , "duet_stability_change", "duet_outcome", "duet_scaled" + , "ligand_affinity_change", "ligand_outcome", "affinity_scaled" + , "ddg_foldx", "foldx_scaled", "foldx_outcome" + , "deepddg", "deepddg_outcome" # comment out as not available for pnca + , "asa", "rsa", "rd_values", "kd_values" + , "af", "or_mychisq", "pval_fisher" + , "or_fisher", "or_logistic", "pval_logistic" + , "wt_prop_water", "mut_prop_water", "wt_prop_polarity", "mut_prop_polarity" + , "wt_calcprop", "mut_calcprop") + +#======================= +# Data for sub colours +# barplot: PS +#======================= + +cat("\nNo. of cols to select:", length(cols_to_select)) + +subcols_df_ps = my_df[, cols_to_select] + +cat("\nNo of unique positions for ps:" + , length(unique(subcols_df_ps$position))) + +# add count_pos col that counts the no. of nsSNPS at a position +setDT(subcols_df_ps)[, pos_count := .N, by = .(position)] + +# should be a factor +if (is.factor(subcols_df_ps$duet_outcome)){ + cat("\nDuet_outcome is factor") + table(subcols_df_ps$duet_outcome) +}else{ + cat("\nConverting duet_outcome to factor") + subcols_df_ps$duet_outcome = as.factor(subcols_df_ps$duet_outcome) + table(subcols_df_ps$duet_outcome) +} + +# should be -1 and 1 +min(subcols_df_ps$duet_scaled) +max(subcols_df_ps$duet_scaled) + +tapply(subcols_df_ps$duet_scaled, subcols_df_ps$duet_outcome, min) +tapply(subcols_df_ps$duet_scaled, subcols_df_ps$duet_outcome, max) + +# check unique values in normalised data +cat("\nNo. of unique values in duet scaled, no rounding:" + , length(unique(subcols_df_ps$duet_scaled))) + +# No rounding +my_grp = subcols_df_ps$duet_scaled; length(my_grp) + +# Add rounding is to be used +n = 3 +subcols_df_ps$duet_scaledR = round(subcols_df_ps$duet_scaled, n) + +cat("\nNo. of unique values in duet scaled", n, "places rounding:" + , length(unique(subcols_df_ps$duet_scaledR))) + +my_grp_r = subcols_df_ps$duet_scaledR # rounding + +# Add grp cols +subcols_df_ps$group <- paste0(subcols_df_ps$duet_outcome, "_", my_grp, sep = "") +subcols_df_ps$groupR <- paste0(subcols_df_ps$duet_outcome, "_", my_grp_r, sep = "") + +# Call the function to create the palette based on the group defined above +subcols_ps <- ColourPalleteMulti(subcols_df_ps, "duet_outcome", "my_grp") +subcolsR_ps <- ColourPalleteMulti(subcols_df_ps, "duet_outcome", "my_grp_r") + +cat("Colour palette generated for my_grp: ", length(subcols_ps), " colours") +cat("Colour palette generated for my_grp_r: ", length(subcolsR_ps), " colours") diff --git a/scripts/plotting/lineage_basic_barplot.R b/scripts/plotting/redundant/lineage_basic_barplot.R similarity index 100% rename from scripts/plotting/lineage_basic_barplot.R rename to scripts/plotting/redundant/lineage_basic_barplot.R diff --git a/scripts/plotting/lineage_dist_combined_PS.R b/scripts/plotting/redundant/lineage_dist_combined_PS.R old mode 100644 new mode 100755 similarity index 100% rename from scripts/plotting/lineage_dist_combined_PS.R rename to scripts/plotting/redundant/lineage_dist_combined_PS.R diff --git a/scripts/plotting/lineage_dist_dm_om_combined_PS.R b/scripts/plotting/redundant/lineage_dist_dm_om_combined_PS.R old mode 100644 new mode 100755 similarity index 69% rename from scripts/plotting/lineage_dist_dm_om_combined_PS.R rename to scripts/plotting/redundant/lineage_dist_dm_om_combined_PS.R index 07912ac..3875382 --- a/scripts/plotting/lineage_dist_dm_om_combined_PS.R +++ b/scripts/plotting/redundant/lineage_dist_dm_om_combined_PS.R @@ -15,44 +15,8 @@ setwd("~/git/LSHTM_analysis/scripts/plotting/") getwd() source("Header_TT.R") -library(ggridges) -library(plyr) -source("combining_dfs_plotting.R") -# PS combined: -# 1) merged_df2 -# 2) merged_df2_comp -# 3) merged_df3 -# 4) merged_df3_comp -# LIG combined: -# 5) merged_df2_lig -# 6) merged_df2_comp_lig -# 7) merged_df3_lig -# 8) merged_df3_comp_lig - -# 9) my_df_u -# 10) my_df_u_lig - -cat("Directories imported:" - , "\n====================" - , "\ndatadir:", datadir - , "\nindir:", indir - , "\noutdir:", outdir - , "\nplotdir:", plotdir) - -cat("Variables imported:" - , "\n=====================" - , "\ndrug:", drug - , "\ngene:", gene - , "\ngene_match:", gene_match - , "\nAngstrom symbol:", angstroms_symbol - , "\nNo. of duplicated muts:", dup_muts_nu - , "\nNA count for ORs:", na_count - , "\nNA count in df2:", na_count_df2 - , "\nNA count in df3:", na_count_df3 - , "\ndr_muts_col:", dr_muts_col - , "\nother_muts_col:", other_muts_col - , "\ndrtype_col:", resistance_col) +source("get_plotting_dfs.R") cat("cols imported:" , mcsm_red2, mcsm_red1, mcsm_mid, mcsm_blue1, mcsm_blue2) @@ -77,67 +41,24 @@ plot_lineage_dist_combined_dm_om_L = paste0(plotdir,"/", lineage_dist_combined # we lose some muts and at this level, we should use # as much info as available, hence use df with NA ########################### -# REASSIGNMENT -my_df = merged_df2 - -# delete variables not required -rm(my_df_u, merged_df2, merged_df2_comp, merged_df3, merged_df3_comp - , merged_df2_lig, merged_df2_comp_lig, merged_df3_lig, merged_df3_comp_lig) - -# quick checks -colnames(my_df) -str(my_df) - -table(my_df$mutation_info) #=================== # Data for plots #=================== -table(my_df$lineage); str(my_df$lineage) - -# select lineages 1-4 -sel_lineages = c("lineage1" - , "lineage2" - , "lineage3" - , "lineage4") - #, "lineage5" - #, "lineage6" - #, "lineage7") - -# works nicely with facet wrap using labeller, but not otherwise -#my_labels = c('Lineage 1' -# , 'Lineage 2' -# , 'Lineage 3' -# , 'Lineage 4') -# #, 'Lineage 5' -# #, 'Lineage 6' -# #, 'Lineage 7') - -#names(my_labels) = c('lineage1' -# , 'lineage2' -# , 'lineage3' -# , 'lineage4') -# #, 'lineage5' -# #, 'lineage6' -# #, 'lineage7') - -#========================== -# subset selected lineages -#========================== -df_lin = subset(my_df, subset = lineage %in% sel_lineages) -table(df_lin$lineage) +lin_dist_plot = merged_df2[merged_df2$lineage%in%c("lineage1", "lineage2", "lineage3", "lineage4"),] +table(lin_dist_plot$lineage) #{RESULT: Total number of samples for lineage} -sum(table(df_lin$lineage)) +sum(table(lin_dist_plot$lineage)) #{RESULT: No of samples within lineage} -table(df_lin$lineage) +table(lin_dist_plot$lineage) #{Result: No. of unique mutations the 4 lineages contribute to} -length(unique(df_lin$mutationinformation)) +length(unique(lin_dist_plot$mutationinformation)) -u2 = unique(my_df$mutationinformation) -u = unique(df_lin$mutationinformation) +u2 = unique(lin_dist_plot$mutationinformation) +u = unique(lin_dist_plot$mutationinformation) #{Result:Muts not present within selected lineages} check = u2[!u2%in%u]; print(check) @@ -148,37 +69,38 @@ check = u2[!u2%in%u]; print(check) # from "plyr" #================== #{Result:No of samples in selected lineages} -table(df_lin$lineage) +table(lin_dist_plot$lineage) -df_lin$lineage_labels = mapvalues(df_lin$lineage +lin_dist_plot$lineage_labels = mapvalues(lin_dist_plot$lineage , from = c("lineage1","lineage2", "lineage3", "lineage4") , to = c("Lineage 1", "Lineage 2", "Lineage 3", "Lineage 4")) -table(df_lin$lineage_labels) +table(lin_dist_plot$lineage_labels) -table(df_lin$lineage_labels) == table(df_lin$lineage) +table(lin_dist_plot$lineage_labels) == table(lin_dist_plot$lineage) #======================== # mutation_info: labels #======================== #{Result:No of DM and OM muts in selected lineages} -table(df_lin$mutation_info) +table(lin_dist_plot$mutation_info) -df_lin$mutation_info_labels = ifelse(df_lin$mutation_info == dr_muts_col, "DM", "OM") -table(df_lin$mutation_info_labels) - -table(df_lin$mutation_info) == table(df_lin$mutation_info_labels) +lin_dist_plot$mutation_info_labels = ifelse(lin_dist_plot$mutation_info == dr_muts_col + , "DM", "OM") +table(lin_dist_plot$mutation_info_labels) +table(lin_dist_plot$mutation_info) == table(lin_dist_plot$mutation_info_labels) #======================== # duet_outcome: labels #======================== #{Result: No. of D and S mutations in selected lineages} -table(df_lin$duet_outcome) +table(lin_dist_plot$duet_outcome) -df_lin$duet_outcome_labels = ifelse(df_lin$duet_outcome == "Destabilising", "D", "S") -table(df_lin$duet_outcome_labels) +lin_dist_plot$duet_outcome_labels = ifelse(lin_dist_plot$duet_outcome == "Destabilising" + , "D", "S") +table(lin_dist_plot$duet_outcome_labels) -table(df_lin$duet_outcome) == table(df_lin$duet_outcome_labels) +table(lin_dist_plot$duet_outcome) == table(lin_dist_plot$duet_outcome_labels) #======================= @@ -198,25 +120,14 @@ table(df_lin$duet_outcome) == table(df_lin$duet_outcome_labels) ######################################################################## # end of data extraction and cleaning for plots # ######################################################################## - -#========================== -# Distribution plots -#============================ - -#%%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -df <- df_lin -#%%%%%%%%%%%%%%%%%%%%%%%%% - -rm(df_lin) - #****************** # generate distribution plot of lineages #****************** # 2 : ggridges (good!) my_ats = 15 # axis text size my_als = 20 # axis label size -n_colours = length(unique(df$duet_scaled)) +n_colours = length(unique(lin_dist_plot$duet_scaled)) + my_palette <- colorRampPalette(c(mcsm_red2, mcsm_red1, mcsm_mid, mcsm_blue1, mcsm_blue2))(n = n_colours+1) #======================================= @@ -232,17 +143,23 @@ my_palette <- colorRampPalette(c(mcsm_red2, mcsm_red1, mcsm_mid, mcsm_blue1, mcs #plot_lineage_dist_duet_f #svg(plot_lineage_dist_duet_f) -p1 = ggplot(df, aes(x = duet_scaled - , y = duet_outcome))+ +p1 = ggplot(lin_dist_plot, aes(x = duet_scaled + #, y = duet_outcome + , y = mutation_info_labels + ))+ geom_density_ridges_gradient(aes(fill = ..x..) #, jittered_points = TRUE , scale = 3 , size = 0.3 ) + facet_wrap( ~lineage_labels + #~mutation_info_labels + # ~mutation_info_labels # , scales = "free" # , labeller = labeller(lineage = my_labels) ) + - coord_cartesian( xlim = c(-1, 1)) + + #coord_cartesian( xlim = c(-1, 1)) + + scale_x_continuous(expand = c(0.01, 0)) + + scale_fill_gradientn(colours = my_palette , name = "DUET" #, breaks = c(-1, 0, 1) @@ -264,8 +181,8 @@ p1 = ggplot(df, aes(x = duet_scaled #, legend.title = element_text(size = my_als-6) , legend.title = element_blank() , legend.position = c(-0.08, 0.41) - #, legend.direction = "horizontal" - #, legend.position = "left" + , legend.direction = "horizontal" + , legend.position = "top" )+ labs(x = "DUET") @@ -286,13 +203,14 @@ p1 #plot_lineage_dist_duet_dm_om #svg(plot_lineage_dist_duet_dm_om) -p2 = ggplot(df, aes(x = duet_scaled +p2 = ggplot(lin_dist_plot, aes(x = duet_scaled , y = lineage_labels))+ geom_density_ridges(aes(fill = factor(mutation_info_labels)) , scale = 3 , size = 0.3 , alpha = 0.8) + - coord_cartesian( xlim = c(-1, 1)) + + scale_x_continuous(expand = c(0.01, 0)) + + #coord_cartesian( xlim = c(-1, 1)) + scale_fill_manual(values = c("#E69F00", "#999999")) + theme(axis.text.x = element_text(size = my_ats , angle = 90 @@ -325,14 +243,18 @@ p2 #plot_lineage_dist_duet_nf #svg(plot_lineage_dist_duet_nf) -p3 = ggplot(df, aes(x = duet_scaled +p3 = ggplot(lin_dist_plot, aes(x = duet_scaled , y = lineage_labels))+ - geom_density_ridges_gradient(aes(fill = ..x..) - #, jittered_points = TRUE - , scale = 3 - , size = 0.3 ) + - coord_cartesian( xlim = c(-1, 1)) + - scale_fill_gradientn(colours = my_palette, name = "DUET") + + # geom_density_ridges_gradient(aes(fill = ..x..) + # #, jittered_points = TRUE + # , scale = 3 + # , size = 0.3 ) + + geom_density_ridges()+ + #facet_wrap (~mutation_info_labels) + + #coord_cartesian( xlim = c(-1, 1)) + + scale_x_continuous(expand = c(0.01, 0)) + + + #scale_fill_gradientn(colours = my_palette, name = "DUET") + theme(axis.text.x = element_text(size = my_ats , angle = 90 , hjust = 1 diff --git a/scripts/plotting/redundant/other_dfs_data.R b/scripts/plotting/redundant/other_dfs_data.R new file mode 100644 index 0000000..97b0567 --- /dev/null +++ b/scripts/plotting/redundant/other_dfs_data.R @@ -0,0 +1,117 @@ +#!/usr/bin/env Rscript + +# Didn't end up using it: sorted it at the source +# .py script to combine all dfs to output all_params + +################################################################# +# TASK: Script to add all other dfs to merged_df2 and merged_df3 + +################################################################# +# Combine other dfs: +# dynamut_df, dynamut2_df, mcsm_na_df, +# perhaps : deepddg and mcsm ppi (for embb) +################################################################ +# read other files +infilename_dynamut = paste0("~/git/Data/", drug, "/output/dynamut_results/", gene + , "_complex_dynamut_norm.csv") + +infilename_dynamut2 = paste0("~/git/Data/", drug, "/output/dynamut_results/dynamut2/", gene + , "_complex_dynamut2_norm.csv") + +infilename_mcsm_na = paste0("~/git/Data/", drug, "/output/mcsm_na_results/", gene + , "_complex_mcsm_na_norm.csv") + +infilename_mcsm_f_snps <- paste0("~/git/Data/", drug, "/output/", gene + , "_mcsm_formatted_snps.csv") + +dynamut_df = read.csv(infilename_dynamut) +dynamut2_df = read.csv(infilename_dynamut2) +mcsm_na_df = read.csv(infilename_mcsm_na) +mcsm_f_snps = read.csv(infilename_mcsm_f_snps, header = F) +names(mcsm_f_snps) = "mutationinformation" + +#================================= +# check with intersect to find the common col, but use +c1 = length(intersect(names(dynamut_df), names(dynamut2_df))) +c2 = length(intersect(names(dynamut2_df), names(mcsm_na_df))) + +if (c1 == 1 && c2 == 1) { + n_common = 1 +}else{ + cat("\nMore than one common col found, inspect before merging!") +} + +# mutationinformation column to be on the safe side +# delete chain from dynamut2_df +#dynamut2_df = subset(dynamut2_df, select = -chain) + +# quick checks +lapply(list(dynamut_df + , dynamut2_df + , mcsm_na_df), ncol) + +lapply(list(dynamut_df + , dynamut2_df + , mcsm_na_df), colnames) + +lapply(list(dynamut_df + , dynamut2_df + , mcsm_na_df), nrow) + +ncols_comb = lapply(list(dynamut_df + , dynamut2_df + , mcsm_na_df), ncol) + +#--------------------------------- +# Combine 1: all other params dfs +#--------------------------------- +combined_dfs = Reduce(inner_join, list(dynamut_df + , dynamut2_df + , mcsm_na_df)) +# Reduce("+", ncols_comb) + +#----------------------------------------- +# Combine 2: combine1 result + merged_df2 +#----------------------------------------- +drop_cols = intersect(names(combined_dfs), names(merged_df2)) +drop_cols = drop_cols + +drop_cols = drop_cols[! drop_cols %in% c("mutationinformation")] + +combined_dfs_f = combined_dfs[, !colnames(combined_dfs)%in%drop_cols] + +nrow(combined_dfs_f); nrow(merged_df2) +ncol(combined_dfs_f); ncol(merged_df2) + +#----------------------------------------- +# Combined merged_df2 +#----------------------------------------- +merged_df2_combined = merge(merged_df2 + , combined_dfs_f + , by = "mutationinformation" +) + +expected_ncols = ncol(combined_dfs_f)+ ncol(merged_df2) - 1 + +if ( nrow(merged_df2_combined) == nrow(merged_df2) && ncol(merged_df2_combined) == expected_ncols ){ + + cat("\nPASS: merged_df2 combined with other parameters dfs." + , "\nUse this for lineage distribution plots") +}else{ + + cat("\nFAIL: merged_df2 didn't combine successfully with other parameters dfs") + quit() + +} + +rm(combined_dfs, combined_dfs_f) + +#================================ +# combined data +# short_df ps: ~ merged_df3 +# TODO: later integrate properly +#================================ +#----------------------------------------- +# Combined merged_df2 +#----------------------------------------- +merged_df3_combined = merged_df2_combined[!duplicated(merged_df2_combined$mutationinformation),] diff --git a/scripts/plotting/redundant/other_plots_data.R b/scripts/plotting/redundant/other_plots_data.R new file mode 100755 index 0000000..61a508f --- /dev/null +++ b/scripts/plotting/redundant/other_plots_data.R @@ -0,0 +1,470 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: Script to format data for dm om plots: +# generating LF data +# sourced by get_plotting_dfs.R +######################################################### +# working dir and loading libraries +# getwd() +# setwd("~/git/LSHTM_analysis/scripts/plotting") +# getwd() + +# make cmd +# globals +# drug = "streptomycin" +# gene = "gid" + +# source("get_plotting_dfs.R") +#======================================================================= +# MOVE TO COMBINE or singular file for deepddg +# +# cols_to_select = c("mutation", "mutationinformation" +# , "wild_type", "position", "mutant_type" +# , "mutation_info") +# +# merged_df3_short = merged_df3[, cols_to_select] + +# infilename_mcsm_f_snps <- paste0("~/git/Data/", drug, "/output/", gene +# , "_mcsm_formatted_snps.csv") +# +# mcsm_f_snps<- read.csv(infilename_mcsm_f_snps, header = F) +# names(mcsm_f_snps) <- "mutationinformation" + +# write merged_df3 to generate structural figure on chimera +#write.csv(merged_df3_short, "merged_df3_short.csv") +#======================================================================== + +#======================================================================== +# cols to select + +cols_mcsm_df <- merged_df3[, c("mutationinformation", "mutation" + , "mutation_info", "position" + , LigDist_colname + , "duet_stability_change", "duet_scaled", "duet_outcome" + , "ligand_affinity_change", "affinity_scaled", "ligand_outcome" + , "ddg_foldx", "foldx_scaled", "foldx_outcome" + , "deepddg", "deepddg_scaled", "deepddg_outcome" + , "asa", "rsa" + , "rd_values", "kd_values" + , "log10_or_mychisq", "neglog_pval_fisher", "af")] + +cols_mcsm_na_df <- mcsm_na_df[, c("mutationinformation" + , "mcsm_na_affinity", "mcsm_na_scaled" + , "mcsm_na_outcome")] +# entire dynamut_df + +cols_dynamut2_df <- dynamut2_df[, c("mutationinformation" + , "ddg_dynamut2", "ddg_dynamut2_scaled" + , "ddg_dynamut2_outcome")] + +n_comb_cols = length(cols_mcsm_df) + length(cols_mcsm_na_df) + + length(dynamut_df) + length(cols_dynamut2_df); n_comb_cols + +i1<- intersect(names(cols_mcsm_df), names(cols_mcsm_na_df)) +i2<- intersect(names(dynamut_df), names(cols_dynamut2_df)) +merging_cols <- intersect(i1, i2) +cat("\nmerging_cols:", merging_cols) + +if (merging_cols == "mutationinformation") { + cat("\nStage 1: Found common col between dfs, checking values in it...") + c1 <- all(mcsm_f_snps[[merging_cols]]%in%cols_mcsm_df[[merging_cols]]) + c2 <- all(mcsm_f_snps[[merging_cols]]%in%cols_mcsm_na_df[[merging_cols]]) + c3 <- all(mcsm_f_snps[[merging_cols]]%in%dynamut_df[[merging_cols]]) + c4 <- all(mcsm_f_snps[[merging_cols]]%in%cols_dynamut2_df[[merging_cols]]) + cols_check <- c(c1, c2, c3, c4) + expected_cols = n_comb_cols - ( length(cols_check) - 1) + if (all(cols_check)){ + cat("\nStage 2: Proceeding with merging dfs:\n") + comb_df <- Reduce(inner_join, list(cols_mcsm_df + , cols_mcsm_na_df + , dynamut_df + , cols_dynamut2_df)) + comb_df_s = arrange(comb_df, position) + + # if ( nrow(comb_df_s) == nrow(mcsm_f_snps) && ncol(comb_df_s) == expected_cols) { + # cat("\Stage3, PASS: dfs merged sucessfully" + # , "\nnrow of merged_df: ", nrow(comb_df_s) + # , "\nncol of merged_df:", ncol(comb_df_s)) + # } + + } +} +#names(comb_df_s) +cat("\n!!!IT GOT TO HERE!!!!") +#======================================================================= +fact_cols = colnames(comb_df_s)[grepl( "_outcome|_info", colnames(comb_df_s) )] +fact_cols +lapply(comb_df_s[, fact_cols], class) +comb_df_s[, fact_cols] <- lapply(comb_df_s[, fact_cols], as.factor) + +if (any(lapply(comb_df_s[, fact_cols], class) == "character")){ + cat("\nChanging cols to factor") + comb_df_s[, fact_cols] <- lapply(comb_df_s[, fact_cols],as.factor) + if (all(lapply(comb_df_s[, fact_cols], class) == "factor")){ + cat("\nSuccessful: cols changed to factor") + } +} +lapply(comb_df_s[, fact_cols], class) + +#======================================================================= +table(comb_df_s$mutation_info) + + # further checks to make sure dr and other muts are indeed unique +dr_muts = comb_df_s[comb_df_s$mutation_info == dr_muts_col,] +dr_muts_names = unique(dr_muts$mutation) + +other_muts = comb_df_s[comb_df_s$mutation_info == other_muts_col,] +other_muts_names = unique(other_muts$mutation) + +if ( table(dr_muts_names%in%other_muts_names)[[1]] == length(dr_muts_names) && + table(other_muts_names%in%dr_muts_names)[[1]] == length(other_muts_names) ){ + cat("PASS: dr and other muts are indeed unique") +}else{ + cat("FAIL: dr and others muts are NOT unique!") + quit() +} + +# pretty display names i.e. labels to reduce major code duplication later +foo_cnames = data.frame(colnames(comb_df_s)) +names(foo_cnames) <- "old_name" + +stability_suffix <- paste0(delta_symbol, delta_symbol, "G") +flexibility_suffix <- paste0(delta_symbol, delta_symbol, "S") + +lig_dn = paste0("Ligand distance (", angstroms_symbol, ")"); lig_dn +duet_dn = paste0("DUET ", stability_suffix); duet_dn +foldx_dn = paste0("FoldX ", stability_suffix); foldx_dn +deepddg_dn = paste0("Deepddg " , stability_suffix); deepddg_dn +mcsm_na_dn = paste0("mCSM-NA affinity ", stability_suffix); mcsm_na_dn +dynamut_dn = paste0("Dynamut ", stability_suffix); dynamut_dn +dynamut2_dn = paste0("Dynamut2 " , stability_suffix); dynamut2_dn +encom_ddg_dn = paste0("EnCOM " , stability_suffix); encom_ddg_dn +encom_dds_dn = paste0("EnCOM " , flexibility_suffix ); encom_dds_dn +sdm_dn = paste0("SDM " , stability_suffix); sdm_dn +mcsm_dn = paste0("mCSM " , stability_suffix ); mcsm_dn + +# Change colnames of some columns using datatable +comb_df_sl = comb_df_s +names(comb_df_sl) + +setnames(comb_df_sl + , old = c("asa", "rsa", "rd_values", "kd_values" + , "log10_or_mychisq", "neglog_pval_fisher", "af" + , LigDist_colname + , "duet_scaled" + , "foldx_scaled" + , "deepddg_scaled" + , "mcsm_na_scaled" + , "ddg_dynamut_scaled" + , "ddg_dynamut2_scaled" + , "ddg_encom_scaled" + , "dds_encom_scaled" + , "ddg_sdm" + , "ddg_mcsm") + + , new = c("ASA", "RSA", "RD", "KD" + , "Log10 (OR)", "-Log (P)", "MAF" + , lig_dn + , duet_dn + , foldx_dn + , deepddg_dn + , mcsm_na_dn + , dynamut_dn + , dynamut2_dn + , encom_ddg_dn + , encom_dds_dn + , sdm_dn + , mcsm_dn) + ) + +foo_cnames <- cbind(foo_cnames, colnames(comb_df_sl)) + +# some more pretty labels +table(comb_df_sl$mutation_info) + +levels(comb_df_sl$mutation_info)[levels(comb_df_sl$mutation_info)==dr_muts_col] <- "DM" +levels(comb_df_sl$mutation_info)[levels(comb_df_sl$mutation_info)==other_muts_col] <- "OM" + +table(comb_df_sl$mutation_info) + +####################################################################### +#====================== +# Selecting dfs +# with appropriate cols +#======================= +static_cols_start = c("mutationinformation" + , "position" + , "mutation" + , "mutation_info") + +static_cols_end = c(lig_dn + , "ASA" + , "RSA" + , "RD" + , "KD") + +# ordering is important! + +######################################################################### +#============== +# DUET: LF +#============== +cols_to_select_duet = c(static_cols_start, c("duet_outcome", duet_dn), static_cols_end) +wf_duet = comb_df_sl[, cols_to_select_duet] + +#pivot_cols_ps = cols_to_select_ps[1:5]; pivot_cols_ps +pivot_cols_duet = cols_to_select_duet[1: (length(static_cols_start) + 1)]; pivot_cols_duet + +expected_rows_lf = nrow(wf_duet) * (length(wf_duet) - length(pivot_cols_duet)) +expected_rows_lf + +# LF data: duet +lf_duet = gather(wf_duet + , key = param_type + , value = param_value + , all_of(duet_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_duet) == expected_rows_lf){ + cat("\nPASS: long format data created for ", duet_dn) +}else{ + cat("\nFAIL: long format data could not be created for duet") + quit() +} + +############################################################################ +#============== +# FoldX: LF +#============== +cols_to_select_foldx= c(static_cols_start, c("foldx_outcome", foldx_dn), static_cols_end) +wf_foldx = comb_df_sl[, cols_to_select_foldx] + +pivot_cols_foldx = cols_to_select_foldx[1: (length(static_cols_start) + 1)]; pivot_cols_foldx + +expected_rows_lf = nrow(wf_foldx) * (length(wf_foldx) - length(pivot_cols_foldx)) +expected_rows_lf + +# LF data: duet +print("TESTXXXXXXXXXXXXXXXXXXXXX---------------------->>>>") +lf_foldx <<- gather(wf_foldx + , key = param_type + , value = param_value + , all_of(foldx_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_foldx) == expected_rows_lf){ + cat("\nPASS: long format data created for ", foldx_dn) +}else{ + cat("\nFAIL: long format data could not be created for duet") + quit() +} + +############################################################################ +#============== +# Deepddg: LF +#============== +cols_to_select_deepddg = c(static_cols_start, c("deepddg_outcome", deepddg_dn), static_cols_end) +wf_deepddg = comb_df_sl[, cols_to_select_deepddg] + +pivot_cols_deepddg = cols_to_select_deepddg[1: (length(static_cols_start) + 1)]; pivot_cols_deepddg + +expected_rows_lf = nrow(wf_deepddg) * (length(wf_deepddg) - length(pivot_cols_deepddg)) +expected_rows_lf + +# LF data: duet +lf_deepddg = gather(wf_deepddg + , key = param_type + , value = param_value + , all_of(deepddg_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_deepddg) == expected_rows_lf){ + cat("\nPASS: long format data created for ", deepddg_dn) +}else{ + cat("\nFAIL: long format data could not be created for duet") + quit() +} + +############################################################################ +#============== +# mCSM-NA: LF +#============== +cols_to_select_mcsm_na = c(static_cols_start, c("mcsm_na_outcome", mcsm_na_dn), static_cols_end) +wf_mcsm_na = comb_df_sl[, cols_to_select_mcsm_na] + +pivot_cols_mcsm_na = cols_to_select_mcsm_na[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm_na + +expected_rows_lf = nrow(wf_mcsm_na) * (length(wf_mcsm_na) - length(pivot_cols_mcsm_na)) +expected_rows_lf + +# LF data: duet +lf_mcsm_na = gather(wf_mcsm_na + , key = param_type + , value = param_value + , all_of(mcsm_na_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_mcsm_na) == expected_rows_lf){ + cat("\nPASS: long format data created for ", mcsm_na_dn) +}else{ + cat("\nFAIL: long format data could not be created for duet") + quit() +} + +############################################################################ +#============== +# Dynamut: LF +#============== +cols_to_select_dynamut = c(static_cols_start, c("ddg_dynamut_outcome", dynamut_dn), static_cols_end) +wf_dynamut = comb_df_sl[, cols_to_select_dynamut] + +pivot_cols_dynamut = cols_to_select_dynamut[1: (length(static_cols_start) + 1)]; pivot_cols_dynamut + +expected_rows_lf = nrow(wf_dynamut) * (length(wf_dynamut) - length(pivot_cols_dynamut)) +expected_rows_lf + +# LF data: duet +lf_dynamut = gather(wf_dynamut + , key = param_type + , value = param_value + , all_of(dynamut_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_dynamut) == expected_rows_lf){ + cat("\nPASS: long format data created for ", dynamut_dn) +}else{ + cat("\nFAIL: long format data could not be created for duet") + quit() +} + +############################################################################ +#============== +# Dynamut2: LF +#============== +cols_to_select_dynamut2 = c(static_cols_start, c("ddg_dynamut2_outcome", dynamut2_dn), static_cols_end) + +wf_dynamut2 = comb_df_sl[, cols_to_select_dynamut2] + +pivot_cols_dynamut2 = cols_to_select_dynamut2[1: (length(static_cols_start) + 1)]; pivot_cols_dynamut2 + +expected_rows_lf = nrow(wf_dynamut2) * (length(wf_dynamut2) - length(pivot_cols_dynamut2)) +expected_rows_lf + +# LF data: duet +lf_dynamut2 = gather(wf_dynamut2 + , key = param_type + , value = param_value + , all_of(dynamut2_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_dynamut2) == expected_rows_lf){ + cat("\nPASS: long format data created for ", dynamut2_dn) +}else{ + cat("\nFAIL: long format data could not be created for duet") + quit() +} + +############################################################################ +#============== +# EnCOM ddg: LF +#============== +cols_to_select_encomddg = c(static_cols_start, c("ddg_encom_outcome", encom_ddg_dn), static_cols_end) +wf_encomddg = comb_df_sl[, cols_to_select_encomddg] + +pivot_cols_encomddg = cols_to_select_encomddg[1: (length(static_cols_start) + 1)]; pivot_cols_encomddg + +expected_rows_lf = nrow(wf_encomddg ) * (length(wf_encomddg ) - length(pivot_cols_encomddg)) +expected_rows_lf + +# LF data: encomddg +lf_encomddg = gather(wf_encomddg + , key = param_type + , value = param_value + , all_of(encom_ddg_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_encomddg) == expected_rows_lf){ + cat("\nPASS: long format data created for ", encom_ddg_dn) +}else{ + cat("\nFAIL: long format data could not be created for duet") + quit() +} +############################################################################ +#============== +# EnCOM dds: LF +#============== +cols_to_select_encomdds = c(static_cols_start, c("dds_encom_outcome", encom_dds_dn), static_cols_end) +wf_encomdds = comb_df_sl[, cols_to_select_encomdds] + +pivot_cols_encomdds = cols_to_select_encomdds[1: (length(static_cols_start) + 1)]; pivot_cols_encomdds + +expected_rows_lf = nrow(wf_encomdds) * (length(wf_encomdds) - length(pivot_cols_encomdds)) +expected_rows_lf + +# LF data: encomddg +lf_encomdds = gather(wf_encomdds + , key = param_type + , value = param_value + , all_of(encom_dds_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_encomdds) == expected_rows_lf){ + cat("\nPASS: long format data created for", encom_dds_dn) +}else{ + cat("\nFAIL: long format data could not be created for duet") + quit() +} + +############################################################################ +#============== +# SDM: LF +#============== +cols_to_select_sdm = c(static_cols_start, c("ddg_sdm_outcome", sdm_dn), static_cols_end) +wf_sdm = comb_df_sl[, cols_to_select_sdm] + +pivot_cols_sdm = cols_to_select_sdm[1: (length(static_cols_start) + 1)]; pivot_cols_sdm + +expected_rows_lf = nrow(wf_sdm) * (length(wf_sdm) - length(pivot_cols_sdm)) +expected_rows_lf + +# LF data: encomddg +lf_sdm = gather(wf_sdm + , key = param_type + , value = param_value + , all_of(sdm_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_sdm) == expected_rows_lf){ + cat("\nPASS: long format data created for", sdm_dn) +}else{ + cat("\nFAIL: long format data could not be created for duet") + quit() +} + +############################################################################ +#============== +# mCSM: LF +#============== +cols_to_select_mcsm = c(static_cols_start, c("ddg_mcsm_outcome", mcsm_dn), static_cols_end) +wf_mcsm = comb_df_sl[, cols_to_select_mcsm] + +pivot_cols_mcsm = cols_to_select_mcsm[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm + +expected_rows_lf = nrow(wf_mcsm) * (length(wf_mcsm) - length(pivot_cols_mcsm)) +expected_rows_lf + +# LF data: encomddg +lf_mcsm = gather(wf_mcsm + , key = param_type + , value = param_value + , all_of(mcsm_dn):tail(static_cols_end,1) + , factor_key = TRUE) + +if (nrow(lf_mcsm) == expected_rows_lf){ + cat("\nPASS: long format data created for", mcsm_dn) +}else{ + cat("\nFAIL: long format data could not be created for duet") + quit() +} +############################################################################ + diff --git a/scripts/plotting/other_plots_data.R b/scripts/plotting/redundant/other_plots_data_SAFEGUARD.R similarity index 100% rename from scripts/plotting/other_plots_data.R rename to scripts/plotting/redundant/other_plots_data_SAFEGUARD.R diff --git a/scripts/plotting/resolving_ambiguous_muts.R b/scripts/plotting/resolving_ambiguous_muts.R old mode 100644 new mode 100755 diff --git a/scripts/plotting/running_plotting_scripts.txt b/scripts/plotting/running_plotting_scripts.txt index 8d7ba0c..369bc4b 100644 --- a/scripts/plotting/running_plotting_scripts.txt +++ b/scripts/plotting/running_plotting_scripts.txt @@ -112,6 +112,30 @@ note: - fa flag has default if not supplied - fb flag has default if not supplied + +#==================================== +# lineage_basic_barplots_combined.R +#==================================== +#----------------------------------------------------------------------- +./lineage_basic_barplots_combined.R-d streptomycin -g gid +#----------------------------------------------------------------------- + +It replaces (and has an added diversity plot) + ## lineage_basic_barplot.R +These have been moved to redundant/ + + +sources: + ## get_plotting_dfs.R + ## functions//bp_lineage.R" + + outputs: 1 svg in the plotdir + ## basic_lineage_barplots_combined.svg + +note: + - fa flag has default if not supplied + - fb flag has default if not supplied + ######################################################################## # TODO Delete: dirs.R