Merge branch 'gidb_dev' (including a merge conflict after adding CLI arguments for mcsm_na/format_results_mcsm_na.py)

This commit is contained in:
Tanushree Tunstall 2021-10-28 12:45:39 +01:00
commit e2bc1cdde1
70 changed files with 4048 additions and 819 deletions

2
config/gid.R Normal file
View file

@ -0,0 +1,2 @@
gene = "gid"
drug = "streptomycin"

0
dynamut/format_results_dynamut.py Normal file → Executable file
View file

0
dynamut/format_results_dynamut2.py Normal file → Executable file
View file

11
dynamut/notes.txt Normal file
View file

@ -0,0 +1,11 @@
Dynamut was painfully run for gid, partly manually and partly programmatically!
However, it was decided to ditch that and run only Dynamut2 for future targets.
Dynamut2 was run through the website in batches of 50 for:
katG: 17 batches (00..16)
rpoB: 23 batches (00..22)
alr: 6 batches (00..05)
However, the API was used for rpoB batches (09-22) from 13 Oct 2021,
as jobs started to flake and fail through the website!

51
dynamut/run_format_results_dynamut.py Normal file → Executable file
View file

@ -20,8 +20,45 @@ from format_results_dynamut2 import *
# variables
# TODO: add cmd line args
gene = 'gid'
drug = 'streptomycin'
#gene =
#drug =
#%% command line args
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help='drug name (case sensitive)', default = None)
arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = None)
arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data')
arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + <drug> + input')
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + <drug> + output')
#arg_parser.add_argument('-m', '--make_dirs', help = 'Make dir for input and output', action='store_true') # should be handled elsewhere!
arg_parser.add_argument('--debug', action ='store_true', help = 'Debug Mode')
args = arg_parser.parse_args()
#=======================================================================
#%% variable assignment: input and output paths & filenames
drug = args.drug
gene = args.gene
datadir = args.datadir
indir = args.input_dir
outdir = args.output_dir
#make_dirs = args.make_dirs
#%% input and output dirs and files
#=======
# dirs
#=======
if not datadir:
datadir = homedir + '/' + 'git/Data'
if not indir:
indir = datadir + '/' + drug + '/input'
if not outdir:
outdir = datadir + '/' + drug + '/output'
#%%=====================================================================
datadir = homedir + '/git/Data'
indir = datadir + '/' + drug + '/input'
outdir = datadir + '/' + drug + '/output'
@ -29,12 +66,12 @@ outdir_dynamut = outdir + '/dynamut_results/'
outdir_dynamut2 = outdir + '/dynamut_results/dynamut2/'
# Input file
infile_dynamut = outdir_dynamut + gene + '_dynamut_all_output_clean.csv'
infile_dynamut2 = outdir_dynamut2 + gene + '_dynamut2_output_combined_clean.csv'
#infile_dynamut = outdir_dynamut + gene.lower() + '_dynamut_all_output_clean.csv'
infile_dynamut2 = outdir_dynamut2 + gene.lower() + '_dynamut2_output_combined_clean.csv'
# Formatted output filename
outfile_dynamut_f = outdir_dynamut2 + gene + '_complex_dynamut_norm.csv'
outfile_dynamut2_f = outdir_dynamut2 + gene + '_complex_dynamut2_norm.csv'
#outfile_dynamut_f = outdir_dynamut2 + gene + '_dynamut_norm.csv'
outfile_dynamut2_f = outdir_dynamut2 + gene.lower() + '_dynamut2_norm.csv'
#===============================
# CALL: format_results_dynamut
@ -69,4 +106,4 @@ print('Finished writing file:'
, '\nExpected no. of cols:', len(dynamut2_df_f.columns)
, '\n=============================================================')
#%%#####################################################################
#%%#####################################################################

View file

@ -17,8 +17,8 @@ my_host = 'http://biosig.unimelb.edu.au'
#headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}
# TODO: add cmd line args
#gene = 'gid'
drug = 'streptomycin'
# gene =
# drug =
datadir = homedir + '/git/Data/'
indir = datadir + drug + '/input/'
outdir = datadir + drug + '/output/'
@ -41,4 +41,4 @@ get_results(url_file = my_url_file
, output_dir = outdir
, outfile_suffix = my_suffix)
########################################################################
########################################################################

View file

@ -1,6 +1,6 @@
#!/bin/bash
# FIXME: This is written for expediency to kickstart running dynamut and mcsm-NA
# FIXME: This is written for expediency to kickstart running dynamut, mcsm-PPI2 (batch of 50) and mCSM-NA (batch of 20)
# Usage: ~/git/LSHTM_analysis/dynamut/split_csv.sh <input file> <output dir> <chunk size in lines>
# copy your snp file to split into the dynamut dir
@ -12,8 +12,13 @@ CHUNK=$3
mkdir -p ${OUTDIR}/${CHUNK}
cd ${OUTDIR}/${CHUNK}
# makes the 2 dirs, hence ../..
split ../../${INFILE} -l ${CHUNK} -d snp_batch_
# use case
#~/git/LSHTM_analysis/dynamut/split_csv.sh gid_mcsm_formatted_snps.csv snp_batches 50
#~/git/LSHTM_analysis/dynamut/split_csv.sh embb_mcsm_formatted_snps.csv snp_batches 50
#~/git/LSHTM_analysis/dynamut/split_csv.sh pnca_mcsm_formatted_snps.csv snp_batches 50
#~/git/LSHTM_analysis/dynamut/split_csv.sh katg_mcsm_formatted_snps.csv snp_batches 50 #Date: 20/09/2021
# add .txt to the files

37
dynamut/split_csv_chain.sh Executable file
View file

@ -0,0 +1,37 @@
#!/bin/bash
# FIXME: This is written for expediency to kickstart running dynamut, mcsm-PPI2 (batch of 50) and mCSM-NA (batch of 20)
# Usage: ~/git/LSHTM_analysis/dynamut/split_csv_chain.sh <input file> <output dir> <chunk size in lines>
# copy your snp file to split into the dynamut dir
# use sed to add chain ID to snp file and then split to avoid post processing
#
# Writes the chunks as <output dir>/<chunk size>/chain_added/snp_batch_NN

if [ "$#" -lt 3 ]; then
    echo "Usage: $0 <input file> <output dir> <chunk size in lines>" >&2
    exit 1
fi

INFILE=$1
OUTDIR=$2
CHUNK=$3

# Resolve the input file against the starting directory BEFORE changing dir,
# so the path stays valid whatever the depth of the output dir
# (previously hard-coded as ../../../<input file>).
case "${INFILE}" in
    /*) INFILE_ABS="${INFILE}" ;;
    *)  INFILE_ABS="${PWD}/${INFILE}" ;;
esac

if [ ! -f "${INFILE_ABS}" ]; then
    echo "ERROR: input file not found: ${INFILE_ABS}" >&2
    exit 1
fi

# Stop if the dir cannot be created or entered: otherwise the batches
# would be written into the current directory.
mkdir -p "${OUTDIR}/${CHUNK}/chain_added" || exit 1
cd "${OUTDIR}/${CHUNK}/chain_added" || exit 1

# -l: lines per chunk, -d: numeric suffixes (snp_batch_00, snp_batch_01, ...)
split -l "${CHUNK}" -d "${INFILE_ABS}" snp_batch_
########################################################################
# use cases
# Date: 20/09/2021
# sed -e 's/^/A /g' katg_mcsm_formatted_snps.csv > katg_mcsm_formatted_snps_chain.csv
#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh katg_mcsm_formatted_snps_chain.csv snp_batches 50
# Date: 01/10/2021
# sed -e 's/^/A /g' rpob_mcsm_formatted_snps.csv > rpob_mcsm_formatted_snps_chain.csv
#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh rpob_mcsm_formatted_snps_chain.csv snp_batches 50
# Date: 02/10/2021
# sed -e 's/^/A /g' alr_mcsm_formatted_snps.csv > alr_mcsm_formatted_snps_chain.csv
#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh alr_mcsm_formatted_snps_chain.csv snp_batches 50
# Date: 05/10/2021
#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh alr_mcsm_formatted_snps_chain.csv snp_batches 20
# add .txt to the files
########################################################################

View file

@ -41,7 +41,7 @@ arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By
arg_parser.add_argument('-p', '--process_dir', help = 'Temp processing dir for running foldX. By default, it assmes homedir + <drug> + processing. Make sure it is somewhere with LOTS of storage as it writes all output!') #FIXME
arg_parser.add_argument('-P', '--pdb_file', help = 'PDB File to process. By default, it assmumes a file called <gene>_complex.pdb in input_dir')
arg_parser.add_argument('-m', '--mutation_file', help = 'Mutation list. By default, assumes a file called <gene>_mcsm_snps.csv exists')
arg_parser.add_argument('-m', '--mutation_file', help = 'Mutation list. By default, assumes a file called <gene>_mcsm_formatted_snps.csv exists')
# FIXME: Doesn't work with 2 chains yet!
arg_parser.add_argument('-c1', '--chain1', help = 'Chain1 ID', default = 'A') # case sensitive
@ -148,6 +148,16 @@ print('Arguments being passed:'
, '\noutput file:', outfile_foldx
, '\n=============================================================')
# make sure rotabase.txt exists in the process_dir
rotabase_file = process_dir + '/' + 'rotabase.txt'
if Path(rotabase_file).is_file():
print(f'rotabase file: {rotabase_file} exists')
else:
print(f'ERROR: rotabase file: {rotabase_file} does not exist. Please download it and put it in {process_dir}')
sys.exit()
#### Delay for 10 seconds to check the params ####
print('Sleeping for 10 seconds to give you time to cancel')
time.sleep(10)
@ -235,6 +245,13 @@ def main():
nmuts = len(mutlist)
print(nmuts)
print(mutlist)
print('start')
#subprocess.check_output(['bash','repairPDB.sh', pdbname, process_dir])
print('\033[95mSTAGE: repair PDB\033[0m')
print('EXECUTING: repairPDB.sh %s %s %s' % (indir, actual_pdb_filename, process_dir))
#subprocess.check_output(['bash','repairPDB.sh', indir, actual_pdb_filename, process_dir])
# once you decide to use the function
# repairPDB(pdbname)
print('start')
# some common parameters for foldX
@ -242,61 +259,74 @@ def main():
print('\033[95mSTAGE: repair PDB (foldx subprocess) \033[0m')
print('Running foldx RepairPDB for WT')
subprocess.call(['foldx'
fold_RepairDB = ['foldx'
, '--command=RepairPDB'
, foldx_common
, '--pdb-dir=' + os.path.dirname(pdb_filename)
# , '--pdb-dir=' + os.path.dirname(pdb_filename)
, '--pdb-dir=' + indir
, '--pdb=' + actual_pdb_filename
, 'outPDB=true'
, '--output-dir=' + process_dir])
, '--output-dir=' + process_dir]
print('CMD:', fold_RepairDB)
subprocess.call(fold_RepairDB)
print('\033[95mCOMPLETED STAGE: repair PDB\033[0m')
print('\n==========================================================')
print('\033[95mSTAGE: Foldx commands BM, PN and SD (foldx subprocess) for WT\033[0m')
print('Running foldx BuildModel for WT')
subprocess.call(['foldx'
foldx_BuildModel = ['foldx'
, '--command=BuildModel'
, foldx_common
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair.pdb'
, '--mutant-file="individual_list_' + pdbname +'.txt"'
, '--mutant-file=' + process_dir + '/' + 'individual_list_' + pdbname +'.txt'
, 'outPDB=true'
, '--numberOfRuns=1'
, '--output-dir=' + process_dir], cwd=process_dir)
, '--output-dir=' + process_dir]
print('CMD:', foldx_BuildModel)
subprocess.call( foldx_BuildModel, cwd=process_dir)
print('Running foldx PrintNetworks for WT')
subprocess.call(['foldx'
foldx_PrintNetworks = ['foldx'
, '--command=PrintNetworks'
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair.pdb'
, '--water=PREDICT'
, '--vdwDesign=1'
, '--output-dir=' + process_dir], cwd=process_dir)
, '--output-dir=' + process_dir]
print('CMD:', foldx_PrintNetworks)
subprocess.call(foldx_PrintNetworks, cwd=process_dir)
print('Running foldx SequenceDetail for WT')
subprocess.call(['foldx'
foldx_SequenceDetail = ['foldx'
, '--command=SequenceDetail'
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair.pdb'
, '--water=PREDICT'
, '--vdwDesign=1'
, '--output-dir=' + process_dir], cwd=process_dir)
, '--output-dir=' + process_dir]
print('CMD:', foldx_SequenceDetail)
subprocess.call(foldx_SequenceDetail , cwd=process_dir)
print('\033[95mCOMPLETED STAGE: Foldx commands BM, PN and SD\033[0m')
print('\n==========================================================')
print('\033[95mSTAGE: Print Networks (foldx subprocess) for MT\033[0m')
for n in range(1,nmuts+1):
print('\033[95mNETWORK:\033[0m', n)
print('Running foldx PrintNetworks for mutation', n)
subprocess.call(['foldx'
foldx_PrintNetworksMT = ['foldx'
, '--command=PrintNetworks'
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair_' + str(n) + '.pdb'
, '--water=PREDICT'
, '--vdwDesign=1'
, '--output-dir=' + process_dir], cwd=process_dir)
, '--output-dir=' + process_dir]
print('CMD:', foldx_PrintNetworksMT)
subprocess.call( foldx_PrintNetworksMT , cwd=process_dir)
print('\033[95mCOMPLETED STAGE: Print Networks (foldx subprocess) for MT\033[0m')
print('\n==========================================================')
@ -323,14 +353,16 @@ def main():
print('\033[95mSTAGE: Running foldx AnalyseComplex (foldx subprocess) for WT\033[0m')
chain1=chainA
chain2=chainB
subprocess.call(['foldx'
foldx_AnalyseComplex = ['foldx'
, '--command=AnalyseComplex'
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair.pdb'
, '--analyseComplexChains=' + chain1 + ',' + chain2
, '--water=PREDICT'
, '--vdwDesign=1'
, '--output-dir=' + process_dir], cwd=process_dir)
, '--output-dir=' + process_dir]
print('CMD:',foldx_AnalyseComplex)
subprocess.call(foldx_AnalyseComplex, cwd=process_dir)
# FIXME why would we ever need to do this?!? Cargo-culted from runcomplex.sh
ac_source = process_dir + '/Summary_' + pdbname + '_Repair_AC.fxout'
@ -340,14 +372,16 @@ def main():
for n in range(1,nmuts+1):
print('\033[95mSTAGE: Running foldx AnalyseComplex (foldx subprocess) for mutation:\033[0m', n)
subprocess.call(['foldx'
foldx_AnalyseComplex = ['foldx'
, '--command=AnalyseComplex'
, '--pdb-dir=' + process_dir
, '--pdb=' + pdbname + '_Repair_' + str(n) + '.pdb'
, '--analyseComplexChains=' + chain1 + ',' + chain2
, '--water=PREDICT'
, '--vdwDesign=1'
, '--output-dir=' + process_dir], cwd=process_dir)
, '--output-dir=' + process_dir]
print('CMD:', foldx_AnalyseComplex)
subprocess.call( foldx_AnalyseComplex , cwd=process_dir)
# FIXME why would we ever need to do this?!? Cargo-culted from runcomplex.sh
ac_mut_source = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) +'_AC.fxout'

View file

@ -104,7 +104,7 @@ if mutation_filename:
in_filename_snps = mutation_filename
else:
in_filename_snps = gene.lower() + '_mcsm_formatted_snps.csv'
infile_snps = outdir + '/' + in_filename_snps
#=======

View file

@ -51,7 +51,7 @@ def format_mcsm_na_output(mcsm_na_output_tsv):
print('Assigning meaningful colnames'
, '\n=======================================================')
my_colnames_dict = {'PDB_FILE': 'pdb_file' # relevant info from this col will be extracted and the column discarded
, 'CHAIN': 'chain' # {wild_type}<position>{mutant_type}
, 'CHAIN': 'chain'
, 'WILD_RES': 'wild_type' # one letter amino acid code
, 'RES_POS': 'position' # number
, 'MUT_RES': 'mutant_type' # one letter amino acid code
@ -65,8 +65,8 @@ def format_mcsm_na_output(mcsm_na_output_tsv):
#############
# create mutationinformation column
#############
mcsm_na_data['mutationinformation'] = mcsm_na_data['wild_type'] + mcsm_na_data.position.map(str) + mcsm_na_data['mutant_type']
#mcsm_na_data['mutationinformation'] = mcsm_na_data['wild_type'] + mcsm_na_data.position.map(str) + mcsm_na_data['mutant_type']
mcsm_na_data['mutationinformation'] = mcsm_na_data.loc[:,'wild_type'] + mcsm_na_data.loc[:,'position'].astype(int).apply(str) + mcsm_na_data.loc[:,'mutant_type']
#%%=====================================================================
#############
# Create col: mcsm_na_outcome
@ -131,5 +131,4 @@ def format_mcsm_na_output(mcsm_na_output_tsv):
, 'chain'
, 'pdb_file']]
return(mcsm_na_dataf)
#%%#####################################################################
#%%#####################################################################

View file

@ -1,21 +1,31 @@
#########################################################
### A) Installing and loading required packages
# A) Installing and loading required packages
# B) My functions
#########################################################
#########################################################
#lib_loc = "/usr/local/lib/R/site-library")
#if (!require("gplots")) {
# install.packages("gplots", dependencies = TRUE)
# library(gplots)
#}
require("getopt", quietly = TRUE) # cmd parse arguments
#if (!require("tidyverse")) {
# install.packages("tidyverse", dependencies = TRUE)
# library(tidyverse)
#}
if (!require("tidyverse")) {
install.packages("tidyverse", dependencies = TRUE)
library(tidyverse)
}
if (!require("ggplot2")) {
install.packages("ggplot2", dependencies = TRUE)
library(ggplot2)
if (!require("shiny")) {
install.packages("shiny", dependencies = TRUE)
library(shiny)
}
if (!require("shinyBS")) {
install.packages("shinyBS", dependencies = TRUE)
library(shinyBS)
}
if (!require("gridExtra")) {
install.packages("gridExtra", dependencies = TRUE)
library(gridExtra)
}
if (!require("ggridges")) {
@ -23,6 +33,35 @@ if (!require("ggridges")) {
library(ggridges)
}
# if (!require("ggplot2")) {
# install.packages("ggplot2", dependencies = TRUE)
# library(ggplot2)
# }
# if (!require ("dplyr")){
# install.packages("dplyr")
# library(dplyr)
# }
if (!require ("DT")){
install.packages("DT")
library(DT)
}
if (!require ("plyr")){
install.packages("plyr")
library(plyr)
}
# Install
#if(!require(devtools)) install.packages("devtools")
#devtools::install_github("kassambara/ggcorrplot")
if (!require ("ggbeeswarm")){
install.packages("ggbeeswarm")
library(ggbeeswarm)
}
if (!require("plotly")) {
install.packages("plotly", dependencies = TRUE)
library(plotly)
@ -103,11 +142,6 @@ if (!require ("psych")){
library(psych)
}
if (!require ("dplyr")){
install.packages("dplyr")
library(dplyr)
}
if (!require ("compare")){
install.packages("compare")
library(compare)
@ -118,18 +152,37 @@ if (!require ("arsenal")){
library(arsenal)
}
if(!require(ggseqlogo)){
install.packages("ggseqlogo")
library(ggseqlogo)
}
####TIDYVERSE
# Install
#if(!require(devtools)) install.packages("devtools")
#devtools::install_github("kassambara/ggcorrplot")
#library(ggcorrplot)
###for PDB files
#install.packages("bio3d")
# for PDB files
if(!require(bio3d)){
install.packages("bio3d")
library(bio3d)
}
library(protr)
if(!require(protr)){
install.packages("protr")
library(protr)
}
#if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#BiocManager::install("Logolas")
library("Logolas")
####################################
# Load all my functions:
# only works if tidyverse is loaded
# hence included it here!
####################################
func_path = "~/git/LSHTM_analysis/scripts/functions/"
source_files <- list.files(func_path, "\\.R$") # locate all .R files
map(paste0(func_path, source_files), source) # source all your R scripts!

View file

@ -0,0 +1,91 @@
########################################
# Lineage barplot
# Lineage and nsSNP count barplot
# Lineage Diversity barplot
########################################
# lin_count_bp(): stacked ggplot2 barplot with a text label on every bar segment.
#
# The column arguments (x_categ, y_count, bar_fill_categ, display_label_col)
# are strings that are parsed and evaluated inside aes(), so each must name a
# column of lf_data (or be a valid R expression over its columns).
#
# Args:
#   lf_data           : data frame passed to ggplot()
#   x_categ           : x-axis variable; converted to an ordered factor
#   y_count           : bar height variable
#   bar_fill_categ    : bar fill variable
#   display_label_col : variable shown as text by geom_label()
#   bar_stat_stype    : 'stat' passed to geom_bar()
#   x_lab_angle       : angle of the x-axis tick text
#   d_lab_size, d_lab_hjust, d_lab_vjust, d_lab_col :
#                       size, justification and colour of the data labels
#   my_xats, my_yats  : x/y axis text size
#   my_xals, my_yals  : x/y axis label size
#   my_lls            : legend label size
#   bar_col_labels, bar_col_values, bar_leg_name :
#                       labels, values and name given to scale_fill_manual()
#   leg_location      : legend position
#   y_log10           : if TRUE, log10 y axis with 10^x tick labels
#   y_scale_percent   : if TRUE, y-axis tick labels formatted as percentages
#   y_label           : y-axis title; when several are supplied (as in the
#                       default) the first one is used
# Returns: the ggplot object.
lin_count_bp <- function( lf_data
                          , x_categ = ""
                          , y_count = ""
                          , bar_fill_categ = ""
                          , display_label_col = ""
                          , bar_stat_stype = "identity"
                          , x_lab_angle = 90
                          , d_lab_size = 5
                          , d_lab_hjust = 0.5
                          , d_lab_vjust = 0.5
                          , d_lab_col = "black"
                          , my_xats = 20 # x axis text size
                          , my_yats = 20 # y axis text size
                          , my_xals = 22 # x axis label size
                          , my_yals = 22 # y axis label size
                          , my_lls = 22 # legend label size
                          , bar_col_labels = ""
                          , bar_col_values = ""
                          , bar_leg_name = ""
                          , leg_location = "top"
                          , y_log10 = FALSE
                          , y_scale_percent = FALSE
                          , y_label = c("Count", "SNP diversity")
) {

  # The default lists the usual choices; an axis takes a single title,
  # so keep only the first element.
  y_label = y_label[1]

  g = ggplot(lf_data
             , aes( x = factor( eval(parse(text = x_categ)), ordered = TRUE )
                    , y = eval(parse(text = y_count))
                    , fill = eval(parse(text = bar_fill_categ)) ) )

  OutPlot = g + geom_bar( stat = bar_stat_stype
                          , position = position_stack(reverse = TRUE)
                          #, alpha = 1
                          #, colour = "grey75"
  ) +

    theme(axis.text.x = element_text(size = my_xats
                                     , angle = x_lab_angle)
          , axis.text.y = element_text(size = my_yats
                                       , angle = 90
                                       , hjust = 1
                                       , vjust = 0)
          , axis.title.x = element_text(size = my_xals
                                        , colour = "black")
          , axis.title.y = element_text(size = my_yals
                                        , colour = "black")
          , legend.position = leg_location
          , legend.text = element_text(size = my_lls)) +

    # labels are stacked in the same (reversed) order as the bars
    geom_label(aes(label = eval(parse(text = display_label_col)))
               , size = d_lab_size
               , hjust = d_lab_hjust
               , vjust = d_lab_vjust
               , colour = d_lab_col
               , show.legend = FALSE
               #, check_overlap = TRUE
               , position = position_stack(reverse = TRUE)) +

    scale_fill_manual(values = bar_col_values
                      , name = bar_leg_name
                      , labels = bar_col_labels) +
    labs(title = ""
         , x = ""
         , y = y_label
         , colour = "black")

  if (y_log10){
    # namespace-qualified so this works without library(scales) attached
    OutPlot = OutPlot +
      scale_y_continuous(trans = "log10"
                         , labels = scales::trans_format("log10"
                                                         , scales::math_format(10^.x) ) )
  }

  if (y_scale_percent){
    # titles were already set by labs() above; only the tick labels change
    OutPlot = OutPlot +
      scale_y_continuous(labels = scales::percent_format(accuracy = 1))
      #scale_y_continuous(labels = scales::percent)
  }

  return(OutPlot)
}

View file

@ -3,7 +3,7 @@
# LINK: https://stackoverflow.com/questions/49818271/stacked-barplot-with-colour-gradients-for-each-bar
#########################################################
ColourPalleteMulti <- function(df, group, subgroup){
ColourPalleteMulti = function(df, group, subgroup){
# Find how many colour categories to create and the number of colours in each
categories <- aggregate(as.formula(paste(subgroup, group, sep="~" ))
@ -24,4 +24,88 @@ ColourPalleteMulti <- function(df, group, subgroup){
, category.end[i]))(categories[i,2])}))
return(colours)
}
#########################################################
#########################################################################
########################
# Generate bp with
# colour palette derived
# from the data using
# above function
#########################
# bp_stability_hmap(): count barplot per x value (position by default) in which
# each bar is filled by a 'group' built from the stability outcome and the
# stability value, using the palette returned by ColourPalleteMulti().
#
# Args:
#   plotdf                    : data frame to plot (defaults to merged_df3)
#   xvar_colname              : column used for the x axis; converted to a factor
#   stability_colname         : column holding the stability values
#   stability_outcome_colname : column holding the stability outcome category
#   p_title                   : plot title
#   my_xaxls, my_yaxls        : sizes applied to the x/y axis tick text
#   my_xaxts, my_yaxts        : sizes applied to the x/y axis titles
#   my_pts                    : plot title size
#   my_xlab, my_ylab          : axis titles
# Returns: the ggplot object.
bp_stability_hmap <- function(plotdf = merged_df3
                              , xvar_colname = "position"
                              #, bar_col_colname = "group"
                              , stability_colname = ""
                              , stability_outcome_colname = ""
                              , p_title = "" # "Protein stability (DUET)"
                              , my_xaxls = 12 # x-axis label size
                              , my_yaxls = 20 # y-axis label size
                              , my_xaxts = 18 # x-axis text size
                              , my_yaxts = 20 # y-axis text size
                              , my_pts = 20 # plot-title size
                              , my_xlab = "Position"
                              , my_ylab = "No. of nsSNPs"
)
{
  # sort rows by the x variable, then turn it into a factor
  row_order = order(plotdf[[xvar_colname]])
  plotdf = plotdf[row_order, ]
  plotdf[[xvar_colname]] = factor(plotdf[[xvar_colname]])

  # stability values, used to build the 'group' column
  stab_values = plotdf[[stability_colname]]

  cat( "\nLength of nsSNPs:", length(stab_values)
       , "\nLength of unique values for nsSNPs:", length(unique(stab_values)) )

  # group = "<outcome>_<stability value>"
  plotdf$group = paste0(plotdf[[stability_outcome_colname]], "_", stab_values)

  cat("\nNo. of unique values in", stability_colname, "no rounding:"
      , length(unique(plotdf[[stability_colname]])))

  # palette derived from the data: one set of shades per outcome category
  subcols_bp_hmap = ColourPalleteMulti(plotdf
                                       , stability_outcome_colname
                                       , stability_colname)

  cat("\nNo. of sub colours generated:", length(subcols_bp_hmap))

  #-------------------------------
  # Generate the subcols barplot
  #-------------------------------
  hmap_theme = theme( axis.text.x = element_text(size = my_xaxls
                                                 , angle = 90
                                                 , hjust = 1
                                                 , vjust = 0.4)
                      , axis.text.y = element_text(size = my_yaxls
                                                   , angle = 0
                                                   , hjust = 1
                                                   , vjust = 0)
                      , axis.title.x = element_text(size = my_xaxts)
                      , axis.title.y = element_text(size = my_yaxts )
                      , plot.title = element_text(size = my_pts
                                                  , hjust = 0.5))

  OutWidePlot = ggplot(plotdf, aes_string(x = xvar_colname)) +
    geom_bar(aes(fill = group)
             , colour = "grey") +
    scale_fill_manual( values = subcols_bp_hmap
                       , guide = "none") +
    hmap_theme +
    labs(title = p_title
         , x = my_xlab
         , y = my_ylab)

  return(OutWidePlot)
}

View file

@ -152,6 +152,46 @@ combining_dfs_plotting <- function( my_df_u
unique(meta_muts_u[! meta_muts_u %in% merged_muts_u])
quit()
}
# Quick formatting: ordering df and pretty labels
#------------------------------
# sorting by column: position
#------------------------------
merged_df2 = merged_df2[order(merged_df2$position), ]
#-----------------------
# mutation_info_labels
#-----------------------
merged_df2$mutation_info_labels = ifelse(merged_df2$mutation_info == dr_muts_col
, "DM", "OM")
merged_df2$mutation_info_labels = factor(merged_df2$mutation_info_labels)
#-----------------------
# lineage labels
#-----------------------
merged_df2$lineage_labels = gsub("lineage", "L", merged_df2$lineage)
merged_df2$lineage_labels = factor(merged_df2$lineage_labels, c("L1"
, "L2"
, "L3"
, "L4"
, "L5"
, "L6"
, "L7"
, "LBOV"
, "L1;L2"
, "L1;L3"
, "L1;L4"
, "L2;L3"
, "L2;L3;L4"
, "L2;L4"
, "L2;L6"
, "L2;LBOV"
, "L3;L4"
, "L4;L6"
, "L4;L7"
, ""))
#=================================================================
# Merge 2: merged_df3

197
scripts/functions/lf_bp.R Normal file
View file

@ -0,0 +1,197 @@
#############################
# Barplots: ggplot
# stats +/-
# violin +/-
# barplot +/
# beeswarm
#############################
# lf_bp(): faceted violin plot with beeswarm points for long-format data,
# optionally with a boxplot layer and pairwise significance annotation.
#
# Args:
#   lf_df            : long-format data frame
#   p_title          : plot title
#   colour_categ     : string, parsed and evaluated inside aes(); points are
#                      coloured by factor() of the result
#   x_grp, y_var     : column names (strings) for the x and y axes
#   facet_var        : column name (string) used to build the facet formula
#   n_facet_row      : number of facet rows
#   y_scales         : 'scales' argument of facet_wrap()
#   colour_bp_strip  : fill colour of the facet strip
#   dot_size, dot_transparency : size and alpha of the beeswarm points
#   violin_quantiles : quantiles drawn on the violins (can be NULL)
#   my_ats, my_als, my_fls, my_pts : axis text, axis label, facet label and
#                      plot title sizes
#   make_boxplot     : if TRUE, draw a boxplot between violin and points
#   bp_width         : "auto" (0.5 / number of x groups) or a width value;
#                      only the first element is used
#   add_stats        : if TRUE, add stat_compare_means() for stat_grp_comp
#   stat_grp_comp    : the two groups to compare
#   stat_method      : test passed to stat_compare_means()
#   my_paired        : 'paired' passed to stat_compare_means()
#   stat_label       : label type; with the two-element default the second
#                      ("p.signif") is used, a single value is used as given
# Returns: the ggplot object.
lf_bp <- function(lf_df
                  , p_title = ""
                  , colour_categ = ""
                  , x_grp = "mutation_info"
                  , y_var = "param_value"
                  , facet_var = "param_type"
                  , n_facet_row = 1
                  , y_scales = "free_y"
                  , colour_bp_strip = "khaki2"
                  , dot_size = 3
                  , dot_transparency = 0.3
                  , violin_quantiles = c(0.25, 0.5, 0.75) # can be NULL
                  , my_ats = 22 # axis text size
                  , my_als = 20 # axis label size
                  , my_fls = 20 # facet label size
                  , my_pts = 22 # plot title size)
                  , make_boxplot = FALSE
                  , bp_width = c("auto", 0.5)
                  , add_stats = TRUE
                  , stat_grp_comp = c("DM", "OM")
                  , stat_method = "wilcox.test"
                  , my_paired = FALSE
                  , stat_label = c("p.format", "p.signif") ){

  # one-sided facet formula, e.g. ~param_type
  fwv = as.formula(paste0("~", facet_var))
  #fwv = reformulate(facet_var)

  p2 <- ggplot(lf_df, aes_string(x = x_grp, y = y_var)) +
    facet_wrap( fwv
                , nrow = n_facet_row
                , scales = y_scales) +
    geom_violin(trim = TRUE
                , scale = "width"
                #, position = position_dodge(width = 0.9)
                , draw_quantiles = violin_quantiles)

  if (make_boxplot){

    # The default is a length-2 vector; if() needs a single value, so the
    # decision is made on the first element only.
    bp_width = bp_width[1]

    if (bp_width == "auto"){
      bp_width = 0.5/length(unique(lf_df[[x_grp]]))
      cat("\nAutomatically calculated boxplot width, using bp_width:\n", bp_width, "\n")
    }else{
      cat("\nBoxplot width value provided, using:", bp_width, "\n")
    }

    p2 = p2 + geom_boxplot(fill = "white"
                           , outlier.colour = NA
                           #, position = position_dodge(width = 0.9)
                           , width = bp_width)
  }

  # ggbeeswarm (better than geom_point); drawn last so points sit on top
  p2 = p2 + geom_beeswarm(priority = "density"
                          #, shape = 21
                          , size = dot_size
                          , alpha = dot_transparency
                          , show.legend = FALSE
                          , cex = 0.8
                          , aes(colour = factor(eval(parse(text = colour_categ))) ))

  # Add formatting to graph
  OutPlot = p2 + theme(axis.text.x = element_text(size = my_ats)
                       , axis.text.y = element_text(size = my_ats
                                                    , angle = 0
                                                    , hjust = 1
                                                    , vjust = 0)
                       , axis.title.x = element_text(size = my_ats)
                       , axis.title.y = element_text(size = my_ats)
                       , plot.title = element_text(size = my_pts
                                                   , hjust = 0.5
                                                   , colour = "black"
                                                   , face = "bold")
                       , strip.background = element_rect(fill = colour_bp_strip)
                       , strip.text.x = element_text(size = my_fls
                                                     , colour = "black")
                       , legend.title = element_text(color = "black"
                                                     , size = my_als)
                       , legend.text = element_text(size = my_ats)
                       , legend.direction = "vertical") +
    labs(title = p_title
         , x = ""
         , y = "")

  if (add_stats){
    # Keep the historical default (second element, "p.signif") but honour a
    # single label supplied by the caller instead of indexing past its end.
    stat_label_use = if (length(stat_label) > 1) stat_label[2] else stat_label

    my_comparisonsL <- list( stat_grp_comp )

    OutPlot = OutPlot + stat_compare_means(comparisons = my_comparisonsL
                                           , method = stat_method
                                           , paired = my_paired
                                           , label = stat_label_use)
  }

  return(OutPlot)
}
#############################
# Barplot NO stats: plotly
# violin +/-
# barplot +/
# beeswarm
# TODO: plot_ly()
#############################
# lf_bp_plotly(): interactive (plotly) version of the faceted violin +
# beeswarm plot, without boxplot or statistics.
#
# x_grp, y_var, facet_var and colour_categ are STRINGS: they are parsed and
# evaluated inside aes()/facet_wrap(), so each must name a column of lf_df
# (or be a valid R expression over its columns).
#
# Args:
#   lf_df            : long-format data frame
#   p_title          : plot title
#   colour_categ     : variable used to colour the points
#   x_grp, y_var     : variables for the x and y axes
#   facet_var        : faceting variable
#   n_facet_row      : number of facet rows
#   y_scales         : 'scales' argument of facet_wrap()
#   colour_bp_strip  : fill colour of the facet strip
#   dot_size, dot_transparency : size and alpha of the beeswarm points
#   violin_quantiles : quantiles drawn on the violins (can be NULL)
#   my_ats, my_als, my_fls, my_pts : axis text, axis label, facet label and
#                      plot title sizes
#   bp_width         : not used by this function; kept in the signature so
#                      existing calls keep working
# Returns: the plotly object produced by ggplotly().
lf_bp_plotly <- function(lf_df
                         , p_title = ""
                         , colour_categ = ""
                         # defaults are quoted: they go through parse(text = ...),
                         # bare symbols would fail with "object not found"
                         , x_grp = "mutation_info"
                         , y_var = "param_value"
                         , facet_var = "param_type"
                         , n_facet_row = 1
                         , y_scales = "free_y"
                         , colour_bp_strip = "khaki2"
                         , dot_size = 3
                         , dot_transparency = 0.3
                         , violin_quantiles = c(0.25, 0.5, 0.75) # can be NULL
                         , my_ats = 20 # axis text size
                         , my_als = 18 # axis label size
                         , my_fls = 18 # facet label size
                         , my_pts = 22 # plot title size)
                         #, make_boxplot = FALSE
                         , bp_width = c("auto", 0.5)
                         #, add_stats = FALSE
                         #, stat_grp_comp = c("DM", "OM")
                         #, stat_method = "wilcox.test"
                         #, my_paired = FALSE
                         #, stat_label = c("p.format", "p.signif")
){

  OutPlotly = ggplot(lf_df, aes(x = eval(parse(text = x_grp))
                                , y = eval(parse(text = y_var))
                                # extra aesthetics carried along for the tooltip
                                , label1 = x_grp
                                , label2 = y_var
                                , label3 = colour_categ) ) +

    facet_wrap(~ eval(parse(text = facet_var))
               , nrow = n_facet_row
               , scales = y_scales) +

    geom_violin(trim = TRUE
                , scale = "width"
                , draw_quantiles = violin_quantiles) +

    geom_beeswarm(priority = "density"
                  , size = dot_size
                  , alpha = dot_transparency
                  , show.legend = FALSE
                  , cex = 0.8
                  , aes(colour = factor(eval(parse(text = colour_categ) ) ) ) ) +

    theme(axis.text.x = element_text(size = my_ats)
          , axis.text.y = element_text(size = my_ats
                                       , angle = 0
                                       , hjust = 1
                                       , vjust = 0)
          , axis.title.x = element_text(size = my_ats)
          , axis.title.y = element_text(size = my_ats)
          , plot.title = element_text(size = my_pts
                                      , hjust = 0.5
                                      , colour = "black"
                                      , face = "bold")
          , strip.background = element_rect(fill = colour_bp_strip)
          , strip.text.x = element_text(size = my_fls
                                        , colour = "black")
          , legend.title = element_text(color = "black"
                                        , size = my_als)
          , legend.text = element_text(size = my_ats)
          , legend.position = "none")+

    labs(title = p_title
         , x = ""
         , y = "")

  # convert the static ggplot into an interactive plotly widget
  OutPlotly = ggplotly(OutPlotly
                       #, tooltip = c("label")
  )

  return(OutPlotly)
}

View file

@ -0,0 +1,21 @@
library(ggpubr)
###################################################################
# lf_unpaired_stats(): run compare_means() on long-format data, testing
# lf_stat_value against lf_stat_group separately within each level of
# lf_col_statvars.
#
# Args:
#   lf_data         : long-format data frame
#   lf_stat_value   : name of the column holding the values (formula LHS)
#   lf_stat_group   : name of the grouping column (formula RHS)
#   lf_col_statvars : column passed as 'group.by'
#   my_paired       : passed as 'paired'
#   stat_adj        : passed as 'p.adjust.method'
# Returns: the result of compare_means().
lf_unpaired_stats <- function(lf_data
                              , lf_stat_value = "param_value"
                              , lf_stat_group = "mutation_info"
                              , lf_col_statvars = "param_type"
                              , my_paired = FALSE
                              , stat_adj = "none"){

  # "<value>~<group>", e.g. param_value~mutation_info
  formula_text = paste0(lf_stat_value, "~", lf_stat_group)
  value_by_group = as.formula(formula_text)

  stats_table = compare_means(value_by_group
                              , data = lf_data
                              , group.by = lf_col_statvars
                              , p.adjust.method = stat_adj
                              , paired = my_paired)

  return(stats_table)
}

View file

@ -0,0 +1,69 @@
###############################
# TASK: function to plot lineage
# dist plots with or without facet
# think about color palette
# for stability
##############################
#n_colours = length(unique(lin_dist_plot$duet_scaled))
#my_palette <- colorRampPalette(c(mcsm_red2, mcsm_red1, mcsm_mid, mcsm_blue1, mcsm_blue2))(n = n_colours+1)
# lineage_distP(): ridgeline density plot (ggridges) of x_axis for every
# level of y_axis, filled by fill_categ, optionally faceted.
#
# Args:
#   plotdf          : data frame to plot
#   x_axis, y_axis  : column names (strings) for the x and y axes
#   x_lab           : x-axis title
#   with_facet      : if TRUE, facet by facet_wrap_var
#   facet_wrap_var  : column name (string) used to build the facet formula
#   fill_categ      : column name (string) used for the ridge fill
#   fill_categ_cols : colours given to scale_fill_manual()
#   my_ats          : axis text size (also applied to the x-axis title)
#   my_als          : axis label size (not used in the body; kept for callers)
#   my_leg_ts, my_leg_title : legend text and legend title sizes
#   my_strip_ts     : facet strip text size
#   leg_pos         : legend position for the plot without facets
#   leg_pos_wf      : legend position with facets; when a character vector
#                     of choices is given (as in the default) the first is used
#   leg_dir_wf      : legend direction with facets; the first element is used
#   leg_label       : legend (fill) title
# Returns: the ggplot object.
lineage_distP <- function(plotdf
                          , x_axis = "duet_scaled"
                          , y_axis = "lineage_labels"
                          , x_lab = "DUET"
                          , with_facet = F
                          , facet_wrap_var = ""
                          , fill_categ = "mutation_info_labels"
                          , fill_categ_cols = c("#E69F00", "#999999")
                          , my_ats = 15 # axis text size
                          , my_als = 20 # axis label size
                          , my_leg_ts = 16
                          , my_leg_title = 16
                          , my_strip_ts = 20
                          , leg_pos = c(0.8, 0.9)
                          , leg_pos_wf = c("top", "left", "bottom", "right")
                          , leg_dir_wf = c("horizontal", "vertical")
                          , leg_label = "")

{
  LinDistP = ggplot(plotdf, aes_string(x = x_axis
                                       , y = y_axis))+

    geom_density_ridges(aes_string(fill = fill_categ)
                        , scale = 3
                        , size = 0.3
                        , alpha = 0.8) +
    scale_x_continuous(expand = c(0.01, 0.01)) +
    #coord_cartesian( xlim = c(-1, 1)) +
    scale_fill_manual(values = fill_categ_cols) +
    theme(axis.text.x = element_text(size = my_ats
                                     , angle = 90
                                     , hjust = 1
                                     , vjust = 0.4)
          , axis.text.y = element_text(size = my_ats)
          , axis.title.x = element_text(size = my_ats)
          , axis.title.y = element_blank()
          , strip.text = element_text(size = my_strip_ts)
          , legend.text = element_text(size = my_leg_ts)
          , legend.title = element_text(size = my_leg_title)
          # honour the caller's leg_pos (default unchanged: c(0.8, 0.9))
          , legend.position = leg_pos) +
    labs(x = x_lab
         , fill = leg_label)

  if (with_facet){

    # The defaults list the allowed choices; theme() needs one position and
    # one direction, so keep the first of each. A numeric position
    # (x, y coordinates) is passed through untouched.
    if (is.character(leg_pos_wf)){
      leg_pos_wf = leg_pos_wf[1]
    }
    leg_dir_wf = leg_dir_wf[1]

    # used reformulate or make as formula
    #fwv = reformulate(facet_wrap_var)
    fwv = as.formula(paste0("~", facet_wrap_var))

    LinDistP = LinDistP +
      facet_wrap(fwv) +
      theme(legend.position = leg_pos_wf
            , legend.direction = leg_dir_wf)
  }

  return(LinDistP)
}

View file

@ -1,24 +1,40 @@
my_corr_pairs <- function (corr_data){
my_corr_pairs <- function (corr_data_all
, corr_cols = colnames(corr_data_all)
, corr_method = "spearman" # other options: "pearson" or "kendall"
, colour_categ_col = "mutation_info_labels"
, categ_colour = c("#E69F00", "#999999")
, density_show = F
, hist_col = "coral4"
, dot_size = 1.6
, ats = 1.5
, corr_lab_size = 3
, corr_value_size = 1)
{
OutPlot_corr = pairs.panels(corr_data
, method = "spearman" # correlation method
, hist.col = "grey" ##00AFBB
, density = TRUE # show density plots
, ellipses = F # show correlation ellipses
corr_data_df = corr_data_all[corr_cols]
my_bg = categ_colour[corr_data_all[[colour_categ_col]] ]
OutPlot_corr = pairs.panels(corr_data_df
, method = corr_method
, hist.col = hist_col
, density = density_show
, ellipses = F
, smooth = F
, stars = T
, rug = F
, breaks = "Sturges"
, show.points = T
#, bg = c("#f8766d", "#00bfc4")[unclass(factor(corr_ps$duet_outcome))] # foldx colours are reveresed
#, pch = 21 # for bg
, jitter = T
#, bg = c("#f8766d", "#00bfc4")[unclass(factor(corr_data$duet_outcome))] # foldx colours are reveresed
, bg = my_bg
, pch = 21
, alpha = 1
, cex = 1.8
, cex.axis = 2
, cex.labels = 3.5
, cex.cor = 1
, smooth = F)
, cex = dot_size
, cex.axis = ats
, cex.labels = corr_lab_size
, cex.cor = corr_value_size
)
return(OutPlot_corr)
#return (my_bg)
}

View file

@ -16,7 +16,9 @@ library(dplyr)
## my_df_u_lig
## dup_muts
#========================================================
plotting_data <- function(df, lig_dist_colname = 'ligand_distance', lig_dist_cutoff = 10) {
plotting_data <- function(df
, lig_dist_colname = 'ligand_distance'
, lig_dist_cutoff = 10) {
my_df = data.frame()
my_df_u = data.frame()
my_df_u_lig = data.frame()
@ -29,61 +31,6 @@ dup_muts = data.frame()
cat("\nInput dimensions:", dim(df))
#==================================
# add foldx outcome category
# and foldx scaled values
# This will enable to always have these variables available
# when calling for plots
#==================================
#------------------------------
# adding foldx scaled values
# scale data b/w -1 and 1
#------------------------------
n = which(colnames(df) == "ddg"); n
my_min = min(df[,n]); my_min
my_max = max(df[,n]); my_max
df$foldx_scaled = ifelse(df[,n] < 0
, df[,n]/abs(my_min)
, df[,n]/my_max)
# sanity check
my_min = min(df$foldx_scaled); my_min
my_max = max(df$foldx_scaled); my_max
if (my_min == -1 && my_max == 1){
cat("\nPASS: foldx ddg successfully scaled b/w -1 and 1"
, "\nProceeding with assigning foldx outcome category")
}else{
cat("\nFAIL: could not scale foldx ddg values"
, "Aborting!\n")
}
#------------------------------
# adding foldx outcome category
# ddg<0 = "Stabilising" (-ve)
#------------------------------
c1 = table(df$ddg < 0)
df$foldx_outcome = ifelse(df$ddg < 0, "Stabilising", "Destabilising")
c2 = table(df$ddg < 0)
if ( all(c1 == c2) ){
cat("\nPASS: foldx outcome successfully created")
}else{
cat("\nFAIL: foldx outcome could not be created. Aborting!\n")
exit()
}
#------------------------------
# renaming foldx column from
# "ddg" --> "ddg_foldx"
#------------------------------
# change name to foldx
colnames(df)[n] <- "ddg_foldx"
#==================================
# extract unique mutation entries
#==================================

View file

@ -32,7 +32,8 @@ import_dirs <- function(drug_name, gene_name) {
#===============================
# mcsm ligand distance cut off
#===============================
#mcsm_lig_cutoff <<- 10
LigDist_colname <<- "ligand_distance"
LigDist_cutoff <<- 10
#==================
# Angstroms symbol

View file

@ -42,7 +42,9 @@ site_snp_count_bp <- function (plotdf
, "\nNo. of cols:", ncol(plotdf)
, "\nNow adding column: frequency of mutational positions"))
# adding snpcount for each position
#-------------------------------------------
# adding column: snpcount for each position
#-------------------------------------------
setDT(plotdf)[, pos_count := .N, by = .(eval(parse(text = df_colname)))]
cat("\nCumulative nssnp count\n"
@ -64,15 +66,20 @@ site_snp_count_bp <- function (plotdf
cat(paste0("\nrevised df dimensions:"
, "\nNo. of rows:", nrow(plotdf)
, "\nNo. of cols:", ncol(plotdf)))
#------------------------------------------------------
# creating df: average count of snpcount for each position
# created in earlier step
#-------------------------------------------------------
# use group by on pos_count
snpsBYpos_df <- plotdf %>%
group_by(eval(parse(text = df_colname))) %>%
summarize(snpsBYpos = mean(pos_count))
cat("\nnssnp count\n"
, table(snpsBYpos_df$snpsBYpos))
dplyr::group_by(eval(parse(text = df_colname))) %>%
dplyr::summarise(snpsBYpos = mean(pos_count)) # changed from summarize!
cat("\nnssnp count per position\n"
, table(snpsBYpos_df$snpsBYpos)
, "\n")
# calculating total no. of sites associated with nsSNPs
tot_sites = sum(table(snpsBYpos_df$snpsBYpos))

View file

@ -0,0 +1,104 @@
#########################################################
# 1b: Define function: coloured barplot by subgroup
# LINK: https://stackoverflow.com/questions/49818271/stacked-barplot-with-colour-gradients-for-each-bar
#########################################################
# Build one colour ramp per `group` level, with as many shades in each ramp as
# that level has distinct `subgroup` values, and return all shades as a single
# character vector (ramps concatenated in the row order of the aggregate table).
#   df        data frame containing both columns
#   group     column name (string) defining the colour families
#   subgroup  column name (string) whose distinct values are counted per group
ColourPalleteMulti = function(df, group, subgroup){

  # number of distinct subgroup values within each group level
  shade_formula <- as.formula(paste(subgroup, group, sep="~" ))
  count_distinct <- function(x) length(unique(x))
  categories <- aggregate(shade_formula, df, count_distinct)
  n_categories <- nrow(categories)

  # one hue per group level: a light version (ramp start) and a dark one (ramp end)
  ramp_top <- scales::hue_pal(l = 100)(n_categories)
  ramp_bottom <- scales::hue_pal(l = 40)(n_categories)

  # interpolate between the two ends; column 2 of `categories` holds the count
  ramp_for <- function(i){
    interpolate <- colorRampPalette(colors = c(ramp_top[i], ramp_bottom[i]))
    interpolate(categories[i, 2])
  }

  colours <- unlist(lapply(seq_len(n_categories), ramp_for))
  return(colours)
}
#########################################################################
# Bar plot of nsSNP counts per x category (default: position) in which each bar
# is split and shaded by the individual stability values, using one colour ramp
# per stability outcome (ramps come from ColourPalleteMulti()).
#   plotdf                     data frame to plot
#   xvar_colname               column (string) for the x axis; converted to factor
#   stability_colname          column (string) with the stability values
#   stability_outcome_colname  column (string) with the outcome category
#   p_title, my_xlab, my_ylab  plot title and axis titles
#   my_xaxls, my_yaxls         sizes applied to the x / y axis text
#   my_xaxts, my_yaxts         sizes applied to the x / y axis titles
#   my_pts                     plot title size
# Returns: the ggplot object.
bp_stability_hmap <- function(plotdf = merged_df3
                              , xvar_colname = "position"
                              #, bar_col_colname = "group"
                              , stability_colname = "duet_scaled"
                              , stability_outcome_colname = "duet_outcome"
                              , p_title = "" # "Protein stability (DUET)"
                              , my_xaxls = 12 # x-axis label size
                              , my_yaxls = 20 # y-axis label size
                              , my_xaxts = 18 # x-axis text size
                              , my_yaxts = 20 # y-axis text size
                              , my_pts = 20 # plot-title size
                              , my_xlab = "Position"
                              , my_ylab = "No. of nsSNPs"
)
{
  # sort rows by the x variable, then make it a discrete axis
  plotdf = plotdf[order(plotdf[[xvar_colname]]), ]
  plotdf[[xvar_colname]] = factor(plotdf[[xvar_colname]])

  # stability values drive both the fill groups and the progress messages
  stability_vals = plotdf[[stability_colname]]
  n_distinct_vals = length(unique(stability_vals))

  cat( "\nLength of nsSNPs:", length(stability_vals)
       , "\nLength of unique values for nsSNPs:", n_distinct_vals )

  # fill key: "<outcome>_<stability value>"
  plotdf$group = paste0(plotdf[[stability_outcome_colname]], "_", stability_vals)

  cat("\nNo. of unique values in", stability_colname, "no rounding:"
      , n_distinct_vals)

  # one shade per distinct stability value within each outcome
  fill_shades = ColourPalleteMulti(plotdf, stability_outcome_colname, stability_colname)

  cat("\nNo. of sub colours generated:", length(fill_shades))

  #-------------------------------
  # Generate the subcols barplot
  #-------------------------------
  text_theme = theme( axis.text.x = element_text(size = my_xaxls
                                                  , angle = 90
                                                  , hjust = 1
                                                  , vjust = 0.4)
                      , axis.text.y = element_text(size = my_yaxls
                                                   , angle = 0
                                                   , hjust = 1
                                                   , vjust = 0)
                      , axis.title.x = element_text(size = my_xaxts)
                      , axis.title.y = element_text(size = my_yaxts )
                      , plot.title = element_text(size = my_pts
                                                  , hjust = 0.5))

  OutWidePlot = ggplot(plotdf, aes_string(x = xvar_colname)) +
    geom_bar(aes(fill = group)
             , colour = "grey") +
    scale_fill_manual( values = fill_shades
                       , guide = "none") +
    text_theme +
    labs(title = p_title
         , x = my_xlab
         , y = my_ylab)

  return(OutWidePlot)
}

View file

@ -0,0 +1,97 @@
library(ggpubr)
###################################################################
####################################
# Faceted violin + beeswarm plot of long-format data with a pairwise
# significance bracket added by ggpubr::stat_compare_means().
#
# Arguments:
#   lf_df            long-format data frame
#   x_grp, y_var     column names (strings) for the x groups and y values
#   facet_var        column name (string) to facet by
#   n_facet_row      number of facet rows
#   y_scales         `scales` argument passed to facet_wrap()
#   p_title          plot title
#   colour_categ     column name (string) used to colour the points;
#                    "" draws the points without a colour mapping
#   colour_bp_strip  facet strip background colour
#   stat_grp_comp    the two x_grp levels to compare
#   stat_method      test passed to stat_compare_means()
#   my_paired        whether the test is paired
#   bp_width         "auto" or a number; the default vector lists the choices
#                    and its first element is used. The value is only reported:
#                    the boxplot layer that consumed it is commented out below.
#   dot_size, dot_transparency  point size and alpha for the beeswarm layer
#   stat_label       label type for the test; the first element is used
#   my_ats, my_als, my_fls, my_pts
#                    axis text, legend title, facet label and plot title sizes
#
# Returns: the ggplot object.
lf_bp_with_stats <- function(lf_df
                             , x_grp = "mutation_info"
                             , y_var = "param_value"
                             , facet_var = "param_type"
                             , n_facet_row = 1
                             , y_scales = "free_y"
                             , p_title = ""
                             , colour_categ = ""
                             , colour_bp_strip = "khaki2"
                             , stat_grp_comp = c("DM", "OM")
                             , stat_method = "wilcox.test"
                             , my_paired = FALSE
                             , bp_width = c("auto", 0.5)
                             , dot_size = 3
                             , dot_transparency = 0.3
                             , stat_label = c("p.format", "p.signif")
                             , my_ats = 22 # axis text size
                             , my_als = 20 # axis label size
                             , my_fls = 20 # facet label size
                             , my_pts = 22 # plot title size
) {

  # The default is a length-2 vector of choices; `if` needs a single value
  # (a length-2 condition is an error from R 4.2 onwards), so use the first.
  bp_width = bp_width[1]

  if (identical(as.character(bp_width), "auto")){
    bp_width = 0.5/length(unique(lf_df[[x_grp]]))
    cat("\nAutomatically calculated boxplot width, using bp_width:\n", bp_width, "\n")
  }else{
    cat("\nBoxplot width value provided, using:", bp_width, "\n")
    # c("auto", 0.5) is a character vector, so a chosen width may arrive as text
    bp_width = as.numeric(bp_width)
  }

  my_comparisonsL <- list( stat_grp_comp )

  # Only map point colour when a column name was actually supplied:
  # evaluating an empty string yields NULL rather than a per-row vector.
  has_colour_col = is.character(colour_categ) && length(colour_categ) == 1 && nzchar(colour_categ)
  if (has_colour_col){
    swarm_mapping = aes(colour = factor(eval(parse(text = colour_categ))) )
  }else{
    swarm_mapping = aes()
  }

  bp_statP <- ggplot(lf_df, aes(x = eval(parse(text = x_grp))
                                , y = eval(parse(text = y_var)) )) +

    facet_wrap(~ eval(parse(text = facet_var))
               , nrow = n_facet_row
               , scales = y_scales) +

    geom_violin(trim = T
                , scale = "width"
                #, position = position_dodge(width = 0.9)
                , draw_quantiles = c(0.25, 0.5, 0.75)) +

    # geom_boxplot(fill = "white"
    #              , outlier.colour = NA
    #              #, position = position_dodge(width = 0.9)
    #              , width = bp_width) +

    # geom_point(position = position_jitterdodge(dodge.width = 0.5)
    #            , alpha = 0.5
    #            , show.legend = FALSE
    #            , aes(colour = factor(eval(parse(text = colour_categ))) )) +

    # ggbeeswarm (better than geom_point)
    geom_beeswarm(mapping = swarm_mapping
                  , priority = "density"
                  #, shape = 21
                  , size = dot_size
                  , alpha = dot_transparency
                  , show.legend = FALSE
                  , cex = 0.8) +

    theme(axis.text.x = element_text(size = my_ats)
          , axis.text.y = element_text(size = my_ats
                                       , angle = 0
                                       , hjust = 1
                                       , vjust = 0)
          , axis.title.x = element_text(size = my_ats)
          , axis.title.y = element_text(size = my_ats)
          , plot.title = element_text(size = my_pts
                                      , hjust = 0.5
                                      , colour = "black"
                                      , face = "bold")
          , strip.background = element_rect(fill = colour_bp_strip)
          , strip.text.x = element_text(size = my_fls
                                        , colour = "black")
          , legend.title = element_text(color = "black"
                                        , size = my_als)
          , legend.text = element_text(size = my_ats)
          , legend.direction = "vertical") +

    labs(title = p_title
         , x = ""
         , y = "")+

    stat_compare_means(comparisons = my_comparisonsL
                       , method = stat_method
                       , paired = my_paired
                       , label = stat_label[1])

  return(bp_statP)
}

View file

@ -0,0 +1,83 @@
# Interactive test script: draws the Dynamut2 long-format plot with lf_bp()
# and then prints summary statistics and group-comparison p-values for the
# "ASA" parameter, plus an Hmisc description of the whole data frame.
# The working directory is set first so the relative source() paths resolve.
setwd("~/git/LSHTM_analysis/scripts/plotting/")
source("../functions/lf_bp_with_stats.R")
source("../functions/lf_bp.R")
######################
# Make plot
######################
# Note: Data
# run other_plots_data.R
# to get the long format data to test this function
# (lf_dynamut2 is not created in this script; it must already exist)
lf_bp(lf_df = lf_dynamut2
, p_title = "Dynamut2"
, colour_categ = "ddg_dynamut2_outcome"
, x_grp = "mutation_info"
, y_var = "param_value"
, facet_var = "param_type"
, n_facet_row = 1
, y_scales = "free_y"
, colour_bp_strip = "khaki2"
, dot_size = 3
, dot_transparency = 0.3
, violin_quantiles = c(0.25, 0.5, 0.75)
, my_ats = 22 # axis text size
, my_als = 20 # axis label size
, my_fls = 20 # facet label size
, my_pts = 22 # plot title size
, make_boxplot = F
, bp_width = "auto"
, add_stats = T
, stat_grp_comp = c("DM", "OM")
, stat_method = "wilcox.test"
, my_paired = FALSE
, stat_label = c("p.format", "p.signif") )
# foo = lf_dynamut2 %>%
# group_by(mutation_info, param_type) %>%
# summarise( Mean = mean(param_value, na.rm = T)
# , SD = sd(param_value, na.rm = T)
# , Median = median(param_value, na.rm = T)
# , IQR = IQR(param_value, na.rm = T) )
# Quick tests
# keep only the rows for one parameter ("ASA")
plotdata_sel = subset(lf_dynamut2
, lf_dynamut2$param_type == "ASA")
# per mutation class: count, mean, SD and five-number summary of param_value
# NOTE(review): %>%, group_by, summarise and rename are presumably dplyr's,
# loaded by an earlier script - confirm
plot_sum = plotdata_sel %>%
group_by(mutation_info, param_type) %>%
summarise(n = n()
, Mean = mean(param_value, na.rm = T)
, SD = sd(param_value, na.rm = T)
, Min = min(param_value, na.rm = T)
, Q1 = quantile(param_value, na.rm = T, 0.25)
, Median = median(param_value, na.rm = T)
, Q3 = quantile(param_value, na.rm = T, 0.75)
, Max = max(param_value, na.rm = T) ) %>%
rename('Mutation Class' = mutation_info
, Parameter = param_type)
plot_sum = as.data.frame(plot_sum, row.names = NULL)
plot_sum
# compare param_value between mutation classes within each param_type,
# with Benjamini-Hochberg ("BH") adjustment of the p-values
bar = compare_means(param_value ~ mutation_info
, group.by = "param_type"
, data = plotdata_sel
, paired = FALSE
, p.adjust.method = "BH")
# keep the reporting columns and give them display names
bar2 = bar[c("param_type"
, "group1"
, "group2"
, "p.format"
, "p.signif"
, "p.adj")] %>%
rename(Parameter = param_type
, Group1 = group1
, Group2 = group2
, "P-value" = p.format
, "P-sig" = p.signif
, "P-adj" = p.adj)
bar2 = data.frame(bar2); bar2
# overall description of every column in the long-format data
library(Hmisc)
describe(lf_dynamut2)

View file

@ -15,39 +15,42 @@ theme_set(theme_grey())
## ...opt args
#==========================================================
stability_count_bp <- function(plotdf
, df_colname
, leg_title = "Legend title"
, axis_text_size = 25
, axis_label_size = 22
, leg_text_size = 20
, leg_title_size = 22
, df_colname = ""
, leg_title = "Legend Title"
, ats = 25 # axis text size
, als = 22 # axis label size
, lts = 20 # legend text size
, ltis = 22 # label title size
, geom_ls = 10 # geom_label size
, yaxis_title = "Number of nsSNPs"
, bp_plot_title = ""
, label_categories = c("Destabilising", "Stabilising")
, title_colour = "chocolate4"
, subtitle_text = NULL
, subtitle_size = 20
, sts = 20
, subtitle_colour = "pink"
#, leg_position = c(0.73,0.8) # within plot area
, leg_position = "top"){
OutPlot_count = ggplot(plotdf, aes(x = eval(parse(text = df_colname)))) +
# OutPlot_count = ggplot(plotdf, aes(x = eval(parse(text = df_colname)))) +
OutPlot_count = ggplot(plotdf, aes_string(x = df_colname)) +
geom_bar(aes(fill = eval(parse(text = df_colname))), show.legend = TRUE) +
geom_label(stat = "count"
, aes(label = ..count..)
, color = "black"
, show.legend = FALSE
, size = 10) +
, size = geom_ls) +
theme(axis.text.x = element_blank()
, axis.title.x = element_blank()
, axis.title.y = element_text(size = axis_label_size)
, axis.text.y = element_text(size = axis_text_size)
, axis.title.y = element_text(size = als)
, axis.text.y = element_text(size = ats)
, legend.position = leg_position
, legend.text = element_text(size = leg_text_size)
, legend.title = element_text(size = leg_title_size)
, plot.title = element_text(size = axis_label_size
, colour = title_colour)
, plot.subtitle = element_text(size = subtitle_size
, legend.text = element_text(size = lts)
, legend.title = element_text(size = ltis)
, plot.title = element_text(size = als
, colour = title_colour
, hjust = 0.5)
, plot.subtitle = element_text(size = sts
, hjust = 0.5
, colour = subtitle_colour)) +
labs(title = bp_plot_title

View file

@ -0,0 +1,62 @@
# Test script for lin_count_bp(): draws a lineage count bar plot from the
# long-format lineage data (lin_lf) and a SNP diversity bar plot from the
# wide-format lineage data (lin_wf).
# NOTE(review): lin_lf and lin_wf are presumably created by get_plotting_dfs.R
# and lin_count_bp by ../functions/bp_lineage.R - confirm
setwd("~/git/LSHTM_analysis/scripts/plotting")
source ('get_plotting_dfs.R')
source("../functions/bp_lineage.R")
#########################################
# Lineage and SNP count: lineage lf data
#########################################
#=========================
# Data: All lineages or
# selected few
#=========================
# here every factor level is selected, i.e. all lineages are kept
sel_lineages = levels(lin_lf$sel_lineages_f)
sel_lineages
lin_lf_plot = lin_lf[lin_lf$sel_lineages_f%in%sel_lineages,]
# drop unused factor levels
lin_lf_plot$sel_lineages_f = factor(lin_lf_plot$sel_lineages_f)
levels(lin_lf_plot$sel_lineages_f)
#=========================
# Lineage count plot
#=========================
lin_count_bp(lin_lf_plot
, x_categ = "sel_lineages_f"
, y_count = "p_count"
, bar_fill_categ = "count_categ"
, display_label_col = "p_count"
, bar_stat_stype = "identity"
, x_lab_angle = 90
, my_xats = 20
, bar_col_labels = c("Mutations", "Total Samples")
, bar_col_values = c("grey50", "gray75")
, y_scale_percent = F # T for diversity
, y_log10 = F
, y_label = "Count")
###############################################
# Lineage SNP diversity count: lineage wf data
###############################################
#=========================
# Data: All lineages or
# selected few
#=========================
# same selection as above, applied to the wide-format data
sel_lineages = levels(lin_wf$sel_lineages_f)
sel_lineages
lin_wf_plot = lin_wf[lin_wf$sel_lineages_f%in%sel_lineages,]
# drop unused factor levels
lin_wf_plot$sel_lineages_f = factor(lin_wf_plot$sel_lineages_f)
levels(lin_wf_plot$sel_lineages_f)
#=========================
# Lineage Diversity plot
#=========================
# y axis shown as a percentage (y_scale_percent = T)
lin_count_bp(lin_wf_plot
, x_categ = "sel_lineages_f"
, y_count = "snp_diversity"
, display_label_col = "snp_diversity_f"
, bar_stat_stype = "identity"
, x_lab_angle = 90
, my_xats = 20
, y_scale_percent = T
, y_label = "SNP diversity")

View file

@ -0,0 +1,78 @@
#!/usr/bin/env Rscript
# Draws one stability bar plot per predictor (p1 to p12) from merged_df3 with
# bp_stability_hmap(). Each call differs only in the scaled-value column, the
# matching outcome column and the plot title.
# config/gid.R sets the gene and drug; merged_df3 is not created here.
# NOTE(review): merged_df3 is presumably built by get_plotting_dfs.R - confirm
#source("~/git/Misc/shiny/myshiny/gid_data.R")
source("~/git/LSHTM_analysis/config/gid.R")
source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
source("~/git/LSHTM_analysis/scripts/functions/bp_subcolours.R")
# p1
bp_stability_hmap(plotdf = merged_df3
, stability_colname = "duet_scaled"
, stability_outcome_colname = "duet_outcome"
, p_title = "DUET" )
# p2
bp_stability_hmap(plotdf = merged_df3
, stability_colname = "foldx_scaled"
, stability_outcome_colname = "foldx_outcome"
, p_title = "FoldX" )
# p3
bp_stability_hmap(plotdf = merged_df3
, stability_colname = "deepddg_scaled"
, stability_outcome_colname = "deepddg_outcome"
, p_title = "DeepDDG" )
# p4
bp_stability_hmap(plotdf = merged_df3
, stability_colname = "ddg_dynamut2_scaled"
, stability_outcome_colname = "ddg_dynamut2_outcome"
, p_title = "Dynamut2" )
# p5
bp_stability_hmap(plotdf = merged_df3
, stability_colname = "mcsm_na_scaled"
, stability_outcome_colname = "mcsm_na_outcome"
, p_title = "mCSM-NA" )
# p6
bp_stability_hmap(plotdf = merged_df3
, stability_colname = "ddg_dynamut_scaled"
, stability_outcome_colname = "ddg_dynamut_outcome"
, p_title = "Dynamut" )
# p7
bp_stability_hmap(plotdf = merged_df3
, stability_colname = "ddg_mcsm_scaled"
, stability_outcome_colname = "ddg_mcsm_outcome"
, p_title = "mCSM" )
# p8
bp_stability_hmap(plotdf = merged_df3
, stability_colname = "ddg_duet_scaled"
, stability_outcome_colname = "ddg_duet_outcome"
, p_title = "DUET-d" )
# p9
bp_stability_hmap(plotdf = merged_df3
, stability_colname = "ddg_sdm_scaled"
, stability_outcome_colname = "ddg_sdm_outcome"
, p_title = "SDM" )
# p10
bp_stability_hmap(plotdf = merged_df3
, stability_colname = "ddg_encom_scaled"
, stability_outcome_colname = "ddg_encom_outcome"
, p_title = "ENCoM-Stability" )
# p11
bp_stability_hmap(plotdf = merged_df3
, stability_colname = "dds_encom_scaled"
, stability_outcome_colname = "dds_encom_outcome"
, p_title = "ENCoM-Flexibility" )
# p12
bp_stability_hmap(plotdf = merged_df3
, stability_colname = "affinity_scaled"
, stability_outcome_colname = "ligand_outcome"
, p_title = "mCSM-lig" )

View file

@ -0,0 +1,59 @@
#!/usr/bin/env Rscript
# Draws three stability bar plots (DUET, FoldX, DeepDDG) and then starts a
# small shiny app that shows the FoldX plot and prints the data rows near a
# mouse click.
# NOTE(review): shiny and data.table functions are used but not loaded in this
# script; presumably attached by one of the sourced files - confirm
#source("~/git/Misc/shiny/myshiny/gid_data.R")
source("~/git/LSHTM_analysis/config/gid.R")
source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
source("~/git/LSHTM_analysis/scripts/functions/bp_subcolours.R")
# p1
bp_stability_hmap(plotdf = merged_df3
, stability_colname = "duet_scaled"
, stability_outcome_colname = "duet_outcome"
, p_title = "DUET" )
# p2
bp_stability_hmap(plotdf = merged_df3
, stability_colname = "foldx_scaled"
, stability_outcome_colname = "foldx_outcome"
, p_title = "FoldX" )
# p3
bp_stability_hmap(plotdf = merged_df3
, stability_colname = "deepddg_scaled"
, stability_outcome_colname = "deepddg_outcome"
, p_title = "DeepDDG" )
##################################################
# add pos_count: number of rows sharing each position
# NOTE(review): setDT() converts by reference, so merged_df3 itself may also
# become a data.table and gain pos_count - confirm this is intended
merged_df3_f = merged_df3
setDT(merged_df3_f)[, pos_count := .N, by = position]
##################################################
# UI: one clickable plot and a text box for the click result
ui <- basicPage(
plotOutput("plot1", click = "plot_click"),
verbatimTextOutput("info")
)
server <- function(input, output) {
output$plot1 <- renderPlot({
#plot(mtcars$wt, mtcars$mpg)
bp_stability_hmap(plotdf = merged_df3_f
, xvar_colname = "position"
, stability_colname = "foldx_scaled"
, stability_outcome_colname = "foldx_outcome"
, p_title = "FoldX" )
})
output$info <- renderPrint({
# With base graphics, need to tell it what the x and y variables are.
# Here x is the position and y the per-position row count added above.
nearPoints(merged_df3_f, input$plot_click
, xvar = "position"
, yvar = "pos_count"
)
})
}
shinyApp(ui, server)

View file

@ -0,0 +1,58 @@
# Test script: draws the ENCoM-DDG long-format plot with lf_bp().
# The plotly variant at the bottom is kept commented out (see FIXME).
# The working directory is set first so the relative source() paths resolve.
setwd("~/git/LSHTM_analysis/scripts/plotting/")
source("Header_TT.R")
source("../functions/lf_bp.R")
# ================================================
# Data: run get_plotting_data.R
# to get the long format data to test this function
# drug = "streptomycin"
# gene = "gid"
# source("get_plotting_dfs.R")
# ==================================================
# (lf_encomddg is not created in this script; it must already exist)
######################
# Make plot: ggplot
######################
lf_bp(lf_df = lf_encomddg
, p_title = "ENCoM-DDG"
, colour_categ = "ddg_encom_outcome"
, x_grp = "mutation_info"
, y_var = "param_value"
, facet_var = "param_type"
, n_facet_row = 1
, y_scales = "free_y"
, colour_bp_strip = "khaki2"
, dot_size = 3
, dot_transparency = 0.3
, violin_quantiles = c(0.25, 0.5, 0.75)
, my_ats = 22 # axis text size
, my_als = 20 # axis label size
, my_fls = 20 # facet label size
, my_pts = 22 # plot title size
, make_boxplot = F
, bp_width = "auto"
, add_stats = T
, stat_grp_comp = c("DM", "OM")
, stat_method = "wilcox.test"
, my_paired = FALSE
, stat_label = c("p.format", "p.signif") )
#wilcox.test(wf_encomdds$`EnCOM ΔΔS`[wf_encomdds$mutation_info == "DM"]
# , wf_encomdds$`EnCOM ΔΔS`[wf_encomdds$mutation_info == "OM"])
######################
# Make plot: plotly
######################
# FIXME: These labels are not working as I want!
# lf_bp_plotly(lf_df = lf_deepddg
# , p_title = "DeepDDG"
# , colour_categ = "deepddg_outcome"
# , x_grp = "mutation_info"
# , y_var = "param_value"
# , facet_var = "param_type"
# , n_facet_row = 1
# , y_scales = "free_y"
# , colour_bp_strip = "khaki2"
# , dot_size = 3
# , dot_transparency = 0.3
# , violin_quantiles = c(0.25, 0.5, 0.75)
# )

View file

@ -0,0 +1,19 @@
# Test script for lf_unpaired_stats(): compares param_value between
# mutation_info groups within each param_type of the long-format DUET data,
# unpaired and with no p-value adjustment, storing the result table.
setwd("~/git/LSHTM_analysis/scripts/functions")
source("lf_unpaired_stats.R")
#####################
# call stat function()
# a useful way to check stats
# for any lf data
#####################
# Note: Data
# run other_plots_data.R
# to get the long format data to test this function
# (lf_duet is not created in this script; it must already exist)
stat_results_df <- lf_unpaired_stats(lf_data = lf_duet
, lf_stat_value = "param_value"
, lf_stat_group = "mutation_info"
, lf_col_statvars = "param_type"
, my_paired = FALSE
, stat_adj = "none"
)

View file

@ -0,0 +1,33 @@
###############################
# TEST function lineage_dist.R
# to plot lineage
# dist plots with or without facet
##############################
# getwd() is printed before and after setwd() to show the directory change
getwd()
setwd("~/git/LSHTM_analysis/scripts/plotting/")
getwd()
source("Header_TT.R")
source("get_plotting_dfs.R")
# print the imported colour values as a check that they are defined
cat("cols imported:"
, mcsm_red2, mcsm_red1, mcsm_mid, mcsm_blue1, mcsm_blue2)
#############################################################
# without facet
lineage_distP(lin_dist_plot
, with_facet = F
, leg_label = "Mutation Class"
)
# with facet: one panel per mutation class, legend hidden
lineage_distP(lin_dist_plot
, with_facet = T
, facet_wrap_var = "mutation_info_labels"
, leg_label = "Mutation Class"
, leg_pos_wf = "none"
, leg_dir_wf = "horizontal"
)

View file

@ -1,14 +1,11 @@
#########################################################
### A) Installing and loading required packages
# A) Installing and loading required packages
# B) My functions
#########################################################
#########################################################
#lib_loc = "/usr/local/lib/R/site-library")
#if (!require("gplots")) {
# install.packages("gplots", dependencies = TRUE)
# library(gplots)
#}
require(extrafont)
require("getopt", quietly = TRUE) # cmd parse arguments
if (!require("tidyverse")) {
@ -16,9 +13,53 @@ if (!require("tidyverse")) {
library(tidyverse)
}
if (!require("ggplot2")) {
install.packages("ggplot2", dependencies = TRUE)
library(ggplot2)
if (!require("shiny")) {
install.packages("shiny", dependencies = TRUE)
library(shiny)
}
if (!require("shinyBS")) {
install.packages("shinyBS", dependencies = TRUE)
library(shinyBS)
}
if (!require("gridExtra")) {
install.packages("gridExtra", dependencies = TRUE)
library(gridExtra)
}
if (!require("ggridges")) {
install.packages("ggridges", dependencies = TRUE)
library(ggridges)
}
# if (!require("ggplot2")) {
# install.packages("ggplot2", dependencies = TRUE)
# library(ggplot2)
# }
# if (!require ("dplyr")){
# install.packages("dplyr")
# library(dplyr)
# }
if (!require ("DT")){
install.packages("DT")
library(DT)
}
if (!require ("plyr")){
install.packages("plyr")
library(plyr)
}
# Install
#if(!require(devtools)) install.packages("devtools")
#devtools::install_github("kassambara/ggcorrplot")
if (!require ("ggbeeswarm")){
install.packages("ggbeeswarm")
library(ggbeeswarm)
}
if (!require("plotly")) {
@ -101,11 +142,6 @@ if (!require ("psych")){
library(psych)
}
if (!require ("dplyr")){
install.packages("dplyr")
library(dplyr)
}
if (!require ("compare")){
install.packages("compare")
library(compare)
@ -116,6 +152,22 @@ if (!require ("arsenal")){
library(arsenal)
}
if(!require(ggseqlogo)){
install.packages("ggseqlogo")
library(ggseqlogo)
}
# for PDB files
if(!require(bio3d)){
install.packages("bio3d")
library(bio3d)
}
library(protr)
if(!require(protr)){
install.packages("protr")
library(protr)
}
#if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
@ -123,24 +175,14 @@ if (!require ("arsenal")){
#BiocManager::install("Logolas")
library("Logolas")
#install.packages("ggseqlogo")
library(ggseqlogo)
####################################
# Load all my functions:
# only works if tidyverse is loaded
# hence included it here!
####################################
####TIDYVERSE
# Install
#if(!require(devtools)) install.packages("devtools")
#devtools::install_github("kassambara/ggcorrplot")
func_path = "~/git/LSHTM_analysis/scripts/functions/"
source_files <- list.files(func_path, "\\.R$") # locate all .R files
map(paste0(func_path, source_files), source) # source all your R scripts!
library(ggcorrplot)
###for PDB files
#install.packages("bio3d")
if(!require(bio3d)){
install.packages("bio3d")
library(bio3d)
}
#install.packages("protr")
library(protr)

View file

@ -124,4 +124,4 @@ print(outPlot_bp_lig)
dev.off()
######################################################################=
# End of script
######################################################################=
######################################################################=

2
scripts/plotting/basic_barplots_combined.R Normal file → Executable file
View file

@ -23,7 +23,7 @@ plot_basic_bp_combined_labelled = paste0(plotdir,"/", basic_bp_combined_labell
#=======================================================================
#=======
# combin DUET and Ligand affinity plots
# combine DUET and Ligand affinity plots
#=======
svg(plot_basic_bp_combined_labelled , width = 12, height = 12 )

0
scripts/plotting/corr_adjusted_PS_LIG.R Normal file → Executable file
View file

View file

@ -0,0 +1,120 @@
#!/usr/bin/env Rscript
#########################################################
# TASK: Script to format data for corr plots
#
# Inputs (must already exist): merged_df2, merged_df3,
# merged_df3_comp and the variable `drug`.
# Outputs: corr_df_m2 / corr_df_m2_f (all rows) and
# corr_df_m3 / corr_df_m3_f (one row per mutation);
# the *_f versions carry display column names.
#########################################################

# Give the correlation columns their display names.
# dplyr::rename is called explicitly because another attached package
# (e.g. plyr) can mask rename() with a different signature.
add_corr_pretty_colnames <- function(df) {
  dplyr::rename(df
    , DUET = duet_stability_change
    , 'mCSM-lig' = ligand_affinity_change
    , FoldX = ddg_foldx
    , DeepDDG = deepddg
    , ASA = asa
    , RSA = rsa
    , KD = kd_values
    , RD = rd_values
    , MAF = af
    , 'Log (OR)' = log10_or_mychisq
    , '-Log (P)' = neglog_pval_fisher
    , Dynamut = ddg_dynamut
    , 'ENCoM-DDG'= ddg_encom
    , mCSM = ddg_mcsm
    , SDM = ddg_sdm
    , 'DUET-d' = ddg_duet
    , 'ENCoM-DDS'= dds_encom
    , Dynamut2 = ddg_dynamut2
    , 'mCSM-NA' = mcsm_na_affinity )
}

#=================================================
# Data for Corrplots
#=================================================
cat("\n=========================================="
, "\nCORR PLOTS data: ALL params"
, "\n=========================================")
# use data
#merged_df2
#----------------------------
# columns for corr plots:PS
#----------------------------
# NOTE: you can add mcsm_ppi column as well, and it will only select what it can find!
big_df_colnames = data.frame(names(merged_df2))
corr_cols_select <- c("mutationinformation", drug, "mutation_info_labels"
, "duet_stability_change", "ligand_affinity_change", "ddg_foldx", "asa", "rsa"
, "rd_values", "kd_values", "log10_or_mychisq", "neglog_pval_fisher","af"
, "deepddg", "ddg_dynamut", "ddg_dynamut2", "mcsm_na_affinity"
, "ddg_encom", "dds_encom", "ddg_mcsm", "ddg_sdm", "ddg_duet", "ligand_distance")
#===========================
# Corr data for plots: PS
# big_df ps: ~ merged_df2
#===========================
# keep only the selected columns that are present in merged_df2
corr_df_m2 = merged_df2[,colnames(merged_df2)%in%corr_cols_select]
#-----------------------
# formatting: some cols
# Add pretty colnames
#-----------------------
corr_df_m2_f <- add_corr_pretty_colnames(corr_df_m2)
#===========================
# Corr data for plots: PS
# short_df ps: ~merged_df3
#===========================
# one row per mutation: drop repeated mutationinformation entries
corr_df_m3 = corr_df_m2[!duplicated(corr_df_m2$mutationinformation),]
# sanity check: row count must match merged_df3, and the number of
# non-missing OR values must match the row count of merged_df3_comp
na_or = sum(is.na(corr_df_m3$log10_or_mychisq))
check1 = nrow(corr_df_m3) - na_or; check1
if (nrow(corr_df_m3) == nrow(merged_df3) && nrow(merged_df3_comp) == check1) {
cat( "\nPASS: No. of rows for corr_df_m3 match"
, "\nPASS: No. of OR values checked: " , check1)
} else {
cat("\nFAIL: Numbers mismatch:"
, "\nExpected nrows: ", nrow(merged_df3)
, "\nGot: ", nrow(corr_df_m3)
, "\nExpected OR values: ", nrow(merged_df3_comp)
, "\nGot: ", check1)
}
#-----------------------
# formatting: some cols
# Add pretty colnames
#-----------------------
corr_df_m3_f <- add_corr_pretty_colnames(corr_df_m3)
########################################################################
cat("\nCorr Data created:"
, "\n==================================="
, "\ncorr_df_m2: created from merged_df2"
, "\n==================================="
, "\nnrows:", nrow(corr_df_m2)
, "\nncols:", ncol(corr_df_m2)
, "\n==================================="
, "\ncorr_df_m3: created from merged_df3"
, "\n==================================="
, "\nnrows:", nrow(corr_df_m3)
, "\nncols:", ncol(corr_df_m3)
)

0
scripts/plotting/dirs.R Normal file → Executable file
View file

0
scripts/plotting/dist_plots_check.R Normal file → Executable file
View file

View file

@ -0,0 +1,416 @@
#!/usr/bin/env Rscript
#########################################################
# TASK: Script to format data for dm om plots:
# generating LF data
# sourced by get_plotting_dfs.R
#########################################################
##========================================================================
# cols to select:
# THINK: whu
# Subset merged_df3 to the columns needed for the DM/OM plots.
# For each predictor the raw value, its scaled value and its outcome are kept.
comb_df <- merged_df3[, c(
    # mutation identifiers
    "mutationinformation", "mutation", "mutation_info", "mutation_info_labels", "position"
    # ligand distance (column name supplied by the calling script)
    , LigDist_colname
    # DUET, mCSM-lig, FoldX, DeepDDG
    , "duet_stability_change", "duet_scaled", "duet_outcome"
    , "ligand_affinity_change", "affinity_scaled", "ligand_outcome"
    , "ddg_foldx", "foldx_scaled", "foldx_outcome"
    , "deepddg", "deepddg_scaled", "deepddg_outcome"
    # structural params
    , "asa", "rsa", "rd_values", "kd_values"
    # association stats
    , "log10_or_mychisq", "neglog_pval_fisher", "af"
    # mCSM-NA
    , "mcsm_na_affinity", "mcsm_na_scaled", "mcsm_na_outcome"
    # Dynamut and the predictors reported alongside it
    , "ddg_dynamut", "ddg_dynamut_scaled", "ddg_dynamut_outcome"
    , "ddg_encom", "ddg_encom_scaled", "ddg_encom_outcome"
    , "dds_encom", "dds_encom_scaled", "dds_encom_outcome"
    , "ddg_mcsm", "ddg_mcsm_scaled", "ddg_mcsm_outcome"
    , "ddg_sdm", "ddg_sdm_scaled", "ddg_sdm_outcome"
    , "ddg_duet", "ddg_duet_scaled", "ddg_duet_outcome"
    # Dynamut2
    , "ddg_dynamut2", "ddg_dynamut2_scaled", "ddg_dynamut2_outcome"
)]

# Same data, sorted by position
comb_df_s <- arrange(comb_df, position)
#=======================================================================
# Columns holding categories: every "*_outcome" column plus the "*_info*" columns.
fact_cols <- grep("_outcome|_info", colnames(comb_df_s), value = TRUE)
fact_cols

# Classes before conversion (only shown when the script is echoed / run directly)
lapply(comb_df_s[, fact_cols, drop = FALSE], class)

# Convert all of them to factor. drop = FALSE keeps a data frame even when a
# single column matches, so lapply() always iterates over columns.
comb_df_s[, fact_cols] <- lapply(comb_df_s[, fact_cols, drop = FALSE], as.factor)

# Verify the conversion AFTER it has been done. Previously the check ran on
# already-converted columns, so its branch could never execute.
if (all(vapply(comb_df_s[, fact_cols, drop = FALSE], is.factor, logical(1)))) {
  cat("\nSuccessful: cols changed to factor")
} else {
  cat("\nFAIL: some cols could not be changed to factor")
}

# Classes after conversion
lapply(comb_df_s[, fact_cols, drop = FALSE], class)
#=======================================================================
table(comb_df_s$mutation_info)

# Further check that the dr and other mutation sets do not share any mutation.
# dr_muts_col / other_muts_col are the mutation_info labels defined by the caller.
dr_muts <- comb_df_s[comb_df_s$mutation_info == dr_muts_col, ]
dr_muts_names <- unique(dr_muts$mutation)

other_muts <- comb_df_s[comb_df_s$mutation_info == other_muts_col, ]
other_muts_names <- unique(other_muts$mutation)

# The two sets are unique iff their intersection is empty.
# (Indexing table(x %in% y)[[1]] picked the TRUE count when *all* names
#  overlapped, which reported PASS for the worst case, and errored on an
#  empty set.)
if (length(intersect(dr_muts_names, other_muts_names)) == 0) {
  cat("PASS: dr and other muts are indeed unique")
} else {
  cat("FAIL: dr and others muts are NOT unique!")
  quit()
}
# pretty display names i.e. labels to reduce major code duplication later
# Record the current column names so old -> new names can be compared later
foo_cnames <- data.frame(old_name = colnames(comb_df_s))

# Suffixes built from the symbols defined by the caller
stability_suffix <- paste0(delta_symbol, delta_symbol, "G")
flexibility_suffix <- paste0(delta_symbol, delta_symbol, "S")

# Display names (axis/legend labels); each value is also echoed
(lig_dn <- paste0("Ligand distance (", angstroms_symbol, ")"))
(duet_dn <- paste("DUET", stability_suffix))
(foldx_dn <- paste("FoldX", stability_suffix))
(deepddg_dn <- paste("Deepddg", stability_suffix))
(mcsm_na_dn <- paste("mCSM-NA affinity", stability_suffix))
(dynamut_dn <- paste("Dynamut", stability_suffix))
(dynamut2_dn <- paste("Dynamut2", stability_suffix))
(encom_ddg_dn <- paste("EnCOM", stability_suffix))
(encom_dds_dn <- paste("EnCOM", flexibility_suffix))
(sdm_dn <- paste("SDM", stability_suffix))
(mcsm_dn <- paste("mCSM", stability_suffix))
# Change colnames of some columns using datatable
# Purpose: create comb_df_sl, the labelled version of comb_df_s, in which the
# columns listed in `old` are renamed to the display names listed in `new`
# (matched by position, so both vectors must stay in the same order).
# NOTE(review): setnames() is presumably data.table::setnames, which renames by
# reference. comb_df_sl is a plain assignment of comb_df_s, so comb_df_s may end
# up renamed as well -- TODO confirm; take a data.table::copy() first if the
# original names are still needed on comb_df_s.
comb_df_sl = comb_df_s
names(comb_df_sl)
setnames(comb_df_sl
, old = c("asa", "rsa", "rd_values", "kd_values"
, "log10_or_mychisq", "neglog_pval_fisher", "af"
, LigDist_colname
, "duet_scaled"
, "foldx_scaled"
, "deepddg_scaled"
, "mcsm_na_scaled"
, "ddg_dynamut_scaled"
, "ddg_dynamut2_scaled"
, "ddg_encom_scaled"
, "dds_encom_scaled"
# NOTE(review): the two entries below are the unscaled columns, whereas every
# other predictor above uses its *_scaled column -- confirm this is intended.
, "ddg_sdm"
, "ddg_mcsm")
, new = c("ASA", "RSA", "RD", "KD"
, "Log10 (OR)", "-Log (P)", "MAF"
, lig_dn
, duet_dn
, foldx_dn
, deepddg_dn
, mcsm_na_dn
, dynamut_dn
, dynamut2_dn
, encom_ddg_dn
, encom_dds_dn
, sdm_dn
, mcsm_dn)
)
# Append the post-rename column names next to the names recorded earlier
foo_cnames <- cbind(foo_cnames, colnames(comb_df_sl))
# Shorten the mutation_info factor levels to the labels used on the plots:
# dr_muts_col -> "DM", other_muts_col -> "OM"
table(comb_df_sl$mutation_info)
levels(comb_df_sl$mutation_info) <- replace(levels(comb_df_sl$mutation_info)
                                            , levels(comb_df_sl$mutation_info) == dr_muts_col
                                            , "DM")
levels(comb_df_sl$mutation_info) <- replace(levels(comb_df_sl$mutation_info)
                                            , levels(comb_df_sl$mutation_info) == other_muts_col
                                            , "OM")
table(comb_df_sl$mutation_info)
#######################################################################
#======================
# Selecting dfs
# with appropriate cols
#=======================
# Columns shared by every per-predictor wide table.
# Ordering is important: the long-format blocks gather the column range that
# runs from the predictor's score column to the LAST entry of static_cols_end.
static_cols_start <- c("mutationinformation", "position", "mutation", "mutation_info")
static_cols_end <- c(lig_dn, "ASA", "RSA", "RD", "KD")
#########################################################################
#==============
# DUET: LF
#==============
# Wide table for DUET: id cols, outcome, score, then the shared static params
cols_to_select_duet <- c(static_cols_start, "duet_outcome", duet_dn, static_cols_end)
wf_duet <- comb_df_sl[, cols_to_select_duet]

# Columns kept as-is in the long table: the id cols plus the outcome col
pivot_cols_duet <- head(cols_to_select_duet, length(static_cols_start) + 1)
pivot_cols_duet

# Every remaining column contributes one long-format row per mutation
expected_rows_lf <- nrow(wf_duet) * (ncol(wf_duet) - length(pivot_cols_duet))
expected_rows_lf

# LF data: duet
lf_duet <- gather(wf_duet
                  , key = param_type
                  , value = param_value
                  , all_of(duet_dn):tail(static_cols_end, 1)
                  , factor_key = TRUE)

if (nrow(lf_duet) != expected_rows_lf) {
  cat("\nFAIL: long format data could not be created for duet")
  quit()
} else {
  cat("\nPASS: long format data created for ", duet_dn)
}
############################################################################
#==============
# FoldX: LF
#==============
# Wide table for FoldX: id cols, outcome, score, then the shared static params
cols_to_select_foldx <- c(static_cols_start, "foldx_outcome", foldx_dn, static_cols_end)
wf_foldx <- comb_df_sl[, cols_to_select_foldx]

# Columns kept as-is in the long table: the id cols plus the outcome col
pivot_cols_foldx <- head(cols_to_select_foldx, length(static_cols_start) + 1)
pivot_cols_foldx

# Every remaining column contributes one long-format row per mutation
expected_rows_lf <- nrow(wf_foldx) * (ncol(wf_foldx) - length(pivot_cols_foldx))
expected_rows_lf

# LF data: Foldx
# Plain assignment, like every other lf_* object in this script (was `<<-`).
lf_foldx <- gather(wf_foldx
                   , key = param_type
                   , value = param_value
                   , all_of(foldx_dn):tail(static_cols_end, 1)
                   , factor_key = TRUE)

if (nrow(lf_foldx) == expected_rows_lf) {
  cat("\nPASS: long format data created for ", foldx_dn)
} else {
  # Name the predictor that actually failed (message used to say "duet")
  cat("\nFAIL: long format data could not be created for ", foldx_dn)
  quit()
}
############################################################################
#==============
# Deepddg: LF
#==============
# Wide table for Deepddg: id cols, outcome, score, then the shared static params
cols_to_select_deepddg <- c(static_cols_start, "deepddg_outcome", deepddg_dn, static_cols_end)
wf_deepddg <- comb_df_sl[, cols_to_select_deepddg]

# Columns kept as-is in the long table: the id cols plus the outcome col
pivot_cols_deepddg <- head(cols_to_select_deepddg, length(static_cols_start) + 1)
pivot_cols_deepddg

# Every remaining column contributes one long-format row per mutation
expected_rows_lf <- nrow(wf_deepddg) * (ncol(wf_deepddg) - length(pivot_cols_deepddg))
expected_rows_lf

# LF data: Deepddg
lf_deepddg <- gather(wf_deepddg
                     , key = param_type
                     , value = param_value
                     , all_of(deepddg_dn):tail(static_cols_end, 1)
                     , factor_key = TRUE)

if (nrow(lf_deepddg) == expected_rows_lf) {
  cat("\nPASS: long format data created for ", deepddg_dn)
} else {
  # Name the predictor that actually failed (message used to say "duet")
  cat("\nFAIL: long format data could not be created for ", deepddg_dn)
  quit()
}
############################################################################
#==============
# mCSM-NA: LF
#==============
# Wide table for mCSM-NA: id cols, outcome, score, then the shared static params
cols_to_select_mcsm_na <- c(static_cols_start, "mcsm_na_outcome", mcsm_na_dn, static_cols_end)
wf_mcsm_na <- comb_df_sl[, cols_to_select_mcsm_na]

# Columns kept as-is in the long table: the id cols plus the outcome col
pivot_cols_mcsm_na <- head(cols_to_select_mcsm_na, length(static_cols_start) + 1)
pivot_cols_mcsm_na

# Every remaining column contributes one long-format row per mutation
expected_rows_lf <- nrow(wf_mcsm_na) * (ncol(wf_mcsm_na) - length(pivot_cols_mcsm_na))
expected_rows_lf

# LF data: mcsm_na
lf_mcsm_na <- gather(wf_mcsm_na
                     , key = param_type
                     , value = param_value
                     , all_of(mcsm_na_dn):tail(static_cols_end, 1)
                     , factor_key = TRUE)

if (nrow(lf_mcsm_na) == expected_rows_lf) {
  cat("\nPASS: long format data created for ", mcsm_na_dn)
} else {
  # Name the predictor that actually failed (message used to say "duet")
  cat("\nFAIL: long format data could not be created for ", mcsm_na_dn)
  quit()
}
############################################################################
#==============
# Dynamut: LF
#==============
# Wide table for Dynamut: id cols, outcome, score, then the shared static params
cols_to_select_dynamut <- c(static_cols_start, "ddg_dynamut_outcome", dynamut_dn, static_cols_end)
wf_dynamut <- comb_df_sl[, cols_to_select_dynamut]

# Columns kept as-is in the long table: the id cols plus the outcome col
pivot_cols_dynamut <- head(cols_to_select_dynamut, length(static_cols_start) + 1)
pivot_cols_dynamut

# Every remaining column contributes one long-format row per mutation
expected_rows_lf <- nrow(wf_dynamut) * (ncol(wf_dynamut) - length(pivot_cols_dynamut))
expected_rows_lf

# LF data: dynamut
lf_dynamut <- gather(wf_dynamut
                     , key = param_type
                     , value = param_value
                     , all_of(dynamut_dn):tail(static_cols_end, 1)
                     , factor_key = TRUE)

if (nrow(lf_dynamut) == expected_rows_lf) {
  cat("\nPASS: long format data created for ", dynamut_dn)
} else {
  # Name the predictor that actually failed (message used to say "duet")
  cat("\nFAIL: long format data could not be created for ", dynamut_dn)
  quit()
}
############################################################################
#==============
# Dynamut2: LF
#==============
# Wide table for Dynamut2: id cols, outcome, score, then the shared static params
cols_to_select_dynamut2 <- c(static_cols_start, "ddg_dynamut2_outcome", dynamut2_dn, static_cols_end)
wf_dynamut2 <- comb_df_sl[, cols_to_select_dynamut2]

# Columns kept as-is in the long table: the id cols plus the outcome col
pivot_cols_dynamut2 <- head(cols_to_select_dynamut2, length(static_cols_start) + 1)
pivot_cols_dynamut2

# Every remaining column contributes one long-format row per mutation
expected_rows_lf <- nrow(wf_dynamut2) * (ncol(wf_dynamut2) - length(pivot_cols_dynamut2))
expected_rows_lf

# LF data: dynamut2
lf_dynamut2 <- gather(wf_dynamut2
                      , key = param_type
                      , value = param_value
                      , all_of(dynamut2_dn):tail(static_cols_end, 1)
                      , factor_key = TRUE)

if (nrow(lf_dynamut2) == expected_rows_lf) {
  cat("\nPASS: long format data created for ", dynamut2_dn)
} else {
  # Name the predictor that actually failed (message used to say "duet")
  cat("\nFAIL: long format data could not be created for ", dynamut2_dn)
  quit()
}
############################################################################
#==============
# EnCOM ddg: LF
#==============
# Wide table for EnCOM ddg: id cols, outcome, score, then the shared static params
cols_to_select_encomddg <- c(static_cols_start, "ddg_encom_outcome", encom_ddg_dn, static_cols_end)
wf_encomddg <- comb_df_sl[, cols_to_select_encomddg]

# Columns kept as-is in the long table: the id cols plus the outcome col
pivot_cols_encomddg <- head(cols_to_select_encomddg, length(static_cols_start) + 1)
pivot_cols_encomddg

# Every remaining column contributes one long-format row per mutation
expected_rows_lf <- nrow(wf_encomddg) * (ncol(wf_encomddg) - length(pivot_cols_encomddg))
expected_rows_lf

# LF data: encomddg
lf_encomddg <- gather(wf_encomddg
                      , key = param_type
                      , value = param_value
                      , all_of(encom_ddg_dn):tail(static_cols_end, 1)
                      , factor_key = TRUE)

if (nrow(lf_encomddg) == expected_rows_lf) {
  cat("\nPASS: long format data created for ", encom_ddg_dn)
} else {
  # Name the predictor that actually failed (message used to say "duet")
  cat("\nFAIL: long format data could not be created for ", encom_ddg_dn)
  quit()
}
############################################################################
#==============
# EnCOM dds: LF
#==============
# Wide table for EnCOM dds: id cols, outcome, score, then the shared static params
cols_to_select_encomdds <- c(static_cols_start, "dds_encom_outcome", encom_dds_dn, static_cols_end)
wf_encomdds <- comb_df_sl[, cols_to_select_encomdds]

# Columns kept as-is in the long table: the id cols plus the outcome col
pivot_cols_encomdds <- head(cols_to_select_encomdds, length(static_cols_start) + 1)
pivot_cols_encomdds

# Every remaining column contributes one long-format row per mutation
expected_rows_lf <- nrow(wf_encomdds) * (ncol(wf_encomdds) - length(pivot_cols_encomdds))
expected_rows_lf

# LF data: encomdds
lf_encomdds <- gather(wf_encomdds
                      , key = param_type
                      , value = param_value
                      , all_of(encom_dds_dn):tail(static_cols_end, 1)
                      , factor_key = TRUE)

if (nrow(lf_encomdds) == expected_rows_lf) {
  cat("\nPASS: long format data created for", encom_dds_dn)
} else {
  # Name the predictor that actually failed (message used to say "duet")
  cat("\nFAIL: long format data could not be created for", encom_dds_dn)
  quit()
}
############################################################################
#==============
# SDM: LF
#==============
# Wide table for SDM: id cols, outcome, score, then the shared static params
cols_to_select_sdm <- c(static_cols_start, "ddg_sdm_outcome", sdm_dn, static_cols_end)
wf_sdm <- comb_df_sl[, cols_to_select_sdm]

# Columns kept as-is in the long table: the id cols plus the outcome col
pivot_cols_sdm <- head(cols_to_select_sdm, length(static_cols_start) + 1)
pivot_cols_sdm

# Every remaining column contributes one long-format row per mutation
expected_rows_lf <- nrow(wf_sdm) * (ncol(wf_sdm) - length(pivot_cols_sdm))
expected_rows_lf

# LF data: sdm
lf_sdm <- gather(wf_sdm
                 , key = param_type
                 , value = param_value
                 , all_of(sdm_dn):tail(static_cols_end, 1)
                 , factor_key = TRUE)

if (nrow(lf_sdm) == expected_rows_lf) {
  cat("\nPASS: long format data created for", sdm_dn)
} else {
  # Name the predictor that actually failed (message used to say "duet")
  cat("\nFAIL: long format data could not be created for", sdm_dn)
  quit()
}
############################################################################
#==============
# mCSM: LF
#==============
# Wide table for mCSM: id cols, outcome, score, then the shared static params
cols_to_select_mcsm <- c(static_cols_start, "ddg_mcsm_outcome", mcsm_dn, static_cols_end)
wf_mcsm <- comb_df_sl[, cols_to_select_mcsm]

# Columns kept as-is in the long table: the id cols plus the outcome col
pivot_cols_mcsm <- head(cols_to_select_mcsm, length(static_cols_start) + 1)
pivot_cols_mcsm

# Every remaining column contributes one long-format row per mutation
expected_rows_lf <- nrow(wf_mcsm) * (ncol(wf_mcsm) - length(pivot_cols_mcsm))
expected_rows_lf

# LF data: mcsm
lf_mcsm <- gather(wf_mcsm
                  , key = param_type
                  , value = param_value
                  , all_of(mcsm_dn):tail(static_cols_end, 1)
                  , factor_key = TRUE)

if (nrow(lf_mcsm) == expected_rows_lf) {
  cat("\nPASS: long format data created for", mcsm_dn)
} else {
  # Name the predictor that actually failed (message used to say "duet")
  cat("\nFAIL: long format data could not be created for", mcsm_dn)
  quit()
}
#==========================
# Duet-d(from Dynamut): LF
#===========================
#Not created, redundant and chaos!
############################################################################

0
scripts/plotting/extreme_muts.R Normal file → Executable file
View file

628
scripts/plotting/get_plotting_dfs.R Normal file → Executable file
View file

@ -1,32 +1,27 @@
#!/usr/bin/env Rscript
#########################################################
# TASK: Get formatted data for plots
#=======================================================================
#########################################################
# working dir and loading libraries
getwd()
setwd("~/git/LSHTM_analysis/scripts/plotting")
getwd()
source("Header_TT.R")
source("../functions/my_pairs_panel.R") # with lower panel turned off
source("../functions/plotting_globals.R")
source("../functions/plotting_data.R")
source("../functions/combining_dfs_plotting.R")
source("../functions/bp_subcolours.R")
#********************
# cmd args passed
# in from other scripts
# to call this
#********************
#drug = 'streptomycin'
#gene = 'gid'
#====================
# variables for lig
#====================
LigDist_colname = "ligand_distance"
LigDist_cutoff = 10
#LigDist_colname = "ligand_distance"
#LigDist_cutoff = 10
#===========
# input
@ -41,8 +36,8 @@ import_dirs(drug, gene)
#---------------------------
if (!exists("infile_params") && exists("gene")){
#if (!is.character(infile_params) && exists("gene")){ # when running as cmd
#in_filename_params = paste0(tolower(gene), "_all_params.csv") #for pncA
in_filename_params = paste0(tolower(gene), "_comb_afor.csv") # part combined for gid
in_filename_params = paste0(tolower(gene), "_all_params.csv") #for pncA (and for gid finally) 10/09/21
#in_filename_params = paste0(tolower(gene), "_comb_afor.csv") # part combined for gid
infile_params = paste0(outdir, "/", in_filename_params)
cat("\nInput file for mcsm comb data not specified, assuming filename: ", infile_params, "\n")
}
@ -54,10 +49,15 @@ pd_df = plotting_data(mcsm_df
, lig_dist_colname = LigDist_colname
, lig_dist_cutoff = LigDist_cutoff)
my_df = pd_df[[1]]
my_df_u = pd_df[[2]] # this forms one of the input for combining_dfs_plotting()
my_df_u_lig = pd_df[[3]]
dup_muts = pd_df[[4]]
my_df = pd_df[[1]]
my_df_u = pd_df[[2]] # this forms one of the input for combining_dfs_plotting()
max_ang <- round(max(my_df_u[LigDist_colname]))
min_ang <- round(min(my_df_u[LigDist_colname]))
cat("\nLigand distance cut off, colname:", LigDist_colname
, "\nThe max distance", gene, "structure df" , ":", max_ang, "\u212b"
, "\nThe min distance", gene, "structure df" , ":", min_ang, "\u212b")
#--------------------------------
# call: combining_dfs_plotting()
@ -81,509 +81,149 @@ all_plot_dfs = combining_dfs_plotting(my_df_u
, lig_dist_colname = LigDist_colname
, lig_dist_cutoff = LigDist_cutoff)
merged_df2 = all_plot_dfs[[1]]
merged_df3 = all_plot_dfs[[2]]
merged_df2_comp = all_plot_dfs[[3]]
merged_df3_comp = all_plot_dfs[[4]]
merged_df2_lig = all_plot_dfs[[5]]
merged_df3_lig = all_plot_dfs[[6]]
merged_df2_comp_lig = all_plot_dfs[[7]]
merged_df3_comp_lig = all_plot_dfs[[8]]
merged_df2 = all_plot_dfs[[1]]
merged_df3 = all_plot_dfs[[2]]
merged_df2_comp = all_plot_dfs[[3]]
merged_df3_comp = all_plot_dfs[[4]]
#======================================================================
#TODO: Think! MOVE TO COMBINE or singular file for deepddg
#============================
# adding deepddg scaled values
# scale data b/w -1 and 1
#============================
# n = which(colnames(merged_df3) == "deepddg"); n
#
# my_min = min(merged_df3[,n]); my_min
# my_max = max(merged_df3[,n]); my_max
#
# merged_df3$deepddg_scaled = ifelse(merged_df3[,n] < 0
# , merged_df3[,n]/abs(my_min)
# , merged_df3[,n]/my_max)
# # sanity check
# my_min = min(merged_df3$deepddg_scaled); my_min
# my_max = max(merged_df3$deepddg_scaled); my_max
#
# if (my_min == -1 && my_max == 1){
# cat("\nPASS: DeepDDG successfully scaled b/w -1 and 1"
# #, "\nProceeding with assigning deep outcome category")
# , "\n")
# }else{
# cat("\nFAIL: could not scale DeepDDG ddg values"
# , "Aborting!")
# }
#
####################################################################
# Data for subcols barplot (~heatmpa)
# Data for combining other dfs
####################################################################
# can include: mutation, or_kin, pwald, af_kin
cols_to_select = c("mutationinformation", "drtype"
, "wild_type"
, "position"
, "mutant_type"
, "chain", "ligand_id", "ligand_distance"
, "duet_stability_change", "duet_outcome", "duet_scaled"
, "ligand_affinity_change", "ligand_outcome", "affinity_scaled"
, "ddg_foldx", "foldx_scaled", "foldx_outcome"
, "deepddg", "deepddg_outcome" # comment out as not available for pnca
, "asa", "rsa", "rd_values", "kd_values"
, "af", "or_mychisq", "pval_fisher"
, "or_fisher", "or_logistic", "pval_logistic"
, "wt_prop_water", "mut_prop_water", "wt_prop_polarity", "mut_prop_polarity"
, "wt_calcprop", "mut_calcprop")
#=======================
# Data for sub colours
# barplot: PS
#=======================
#source("other_dfs_data.R")
# Fixed this at source i.e python script
# Moved: "other_dfs_data.R" to redundant/
cat("\nNo. of cols to select:", length(cols_to_select))
####################################################################
# Data for subcols barplot (~heatmap)
####################################################################
subcols_df_ps = merged_df3[, cols_to_select]
cat("\nNo of unique positions for ps:"
, length(unique(subcols_df_ps$position)))
# add count_pos col that counts the no. of nsSNPS at a position
setDT(subcols_df_ps)[, pos_count := .N, by = .(position)]
# should be a factor
if (is.factor(subcols_df_ps$duet_outcome)){
cat("\nDuet_outcome is factor")
table(subcols_df_ps$duet_outcome)
}else{
cat("\nConverting duet_outcome to factor")
subcols_df_ps$duet_outcome = as.factor(subcols_df_ps$duet_outcome)
table(subcols_df_ps$duet_outcome)
}
# should be -1 and 1
min(subcols_df_ps$duet_scaled)
max(subcols_df_ps$duet_scaled)
tapply(subcols_df_ps$duet_scaled, subcols_df_ps$duet_outcome, min)
tapply(subcols_df_ps$duet_scaled, subcols_df_ps$duet_outcome, max)
# check unique values in normalised data
cat("\nNo. of unique values in duet scaled, no rounding:"
, length(unique(subcols_df_ps$duet_scaled)))
# No rounding
my_grp = subcols_df_ps$duet_scaled; length(my_grp)
# Add rounding is to be used
n = 3
subcols_df_ps$duet_scaledR = round(subcols_df_ps$duet_scaled, n)
cat("\nNo. of unique values in duet scaled", n, "places rounding:"
, length(unique(subcols_df_ps$duet_scaledR)))
my_grp_r = subcols_df_ps$duet_scaledR # rounding
# Add grp cols
subcols_df_ps$group <- paste0(subcols_df_ps$duet_outcome, "_", my_grp, sep = "")
subcols_df_ps$groupR <- paste0(subcols_df_ps$duet_outcome, "_", my_grp_r, sep = "")
# Call the function to create the palette based on the group defined above
subcols_ps <- ColourPalleteMulti(subcols_df_ps, "duet_outcome", "my_grp")
subcolsR_ps <- ColourPalleteMulti(subcols_df_ps, "duet_outcome", "my_grp_r")
print(paste0("Colour palette generated for my_grp: ", length(subcols_ps), " colours"))
print(paste0("Colour palette generated for my_grp_r: ", length(subcolsR_ps), " colours"))
#=======================
# Data for sub colours
# barplot: LIG
#=======================
cat("\nNo. of cols to select:", length(cols_to_select))
subcols_df_lig = merged_df3_lig[, cols_to_select]
cat("\nNo of unique positions for LIG:"
, length(unique(subcols_df_lig$position)))
# should be a factor
if (is.factor(subcols_df_lig$ligand_outcome)){
cat("\nLigand_outcome is factor")
table(subcols_df_lig$ligand_outcome)
}else{
cat("\nConverting ligand_outcome to factor")
subcols_df_lig$ligand_outcome = as.factor(subcols_df_lig$ligand_outcome)
table(subcols_df_lig$ligand_outcome)
}
# should be -1 and 1
min(subcols_df_lig$affinity_scaled)
max(subcols_df_lig$affinity_scaled)
tapply(subcols_df_lig$affinity_scaled, subcols_df_lig$ligand_outcome, min)
tapply(subcols_df_lig$affinity_scaled, subcols_df_lig$ligand_outcome, max)
# check unique values in normalised data
cat("\nNo. of unique values in affinity scaled, no rounding:"
, length(unique(subcols_df_lig$affinity_scaled)))
# No rounding
my_grp_lig = subcols_df_lig$affinity_scaled; length(my_grp_lig)
# Add rounding is to be used
n = 3
subcols_df_lig$affinity_scaledR = round(subcols_df_lig$affinity_scaled, n)
cat("\nNo. of unique values in duet scaled", n, "places rounding:"
, length(unique(subcols_df_lig$affinity_scaledR)))
my_grp_lig_r = subcols_df_lig$affinity_scaledR # rounding
# Add grp cols
subcols_df_lig$group_lig <- paste0(subcols_df_lig$ligand_outcome, "_", my_grp_lig, sep = "")
subcols_df_lig$group_ligR <- paste0(subcols_df_lig$ligand_outcome, "_", my_grp_lig_r, sep = "")
# Call the function to create the palette based on the group defined above
subcols_lig <- ColourPalleteMulti(subcols_df_lig, "ligand_outcome", "my_grp_lig")
subcolsR_lig <- ColourPalleteMulti(subcols_df_lig, "ligand_outcome", "my_grp_lig_r")
print(paste0("Colour palette generated for my_grp: ", length(subcols_lig), " colours"))
print(paste0("Colour palette generated for my_grp_r: ", length(subcolsR_lig), " colours"))
#source("coloured_bp_data.R")
# Repurposed function so that params can be passed instead to generate
# data required for plotting.
# Moved "coloured_bp_data.R" to redundant/
####################################################################
# Data for logoplots
####################################################################
#-------------------------
# choose df for logoplot
#-------------------------
logo_data = merged_df3
#logo_data = merged_df3_comp
# quick checks
colnames(logo_data)
str(logo_data)
source("logo_data.R")
c1 = unique(logo_data$position)
nrow(logo_data)
cat("No. of rows in my_data:", nrow(logo_data)
, "\nDistinct positions corresponding to snps:", length(c1)
, "\n===========================================================")
#=======================================================================
#==================
# logo data: OR
#==================
foo = logo_data[, c("position"
, "mutant_type","duet_scaled", "or_mychisq"
, "mut_prop_polarity", "mut_prop_water")]
logo_data$log10or = log10(logo_data$or_mychisq)
logo_data_plot = logo_data[, c("position"
, "mutant_type", "or_mychisq", "log10or")]
logo_data_plot_or = logo_data[, c("position", "mutant_type", "or_mychisq")]
wide_df_or <- logo_data_plot_or %>% spread(position, or_mychisq, fill = 0.0)
wide_df_or = as.matrix(wide_df_or)
rownames(wide_df_or) = wide_df_or[,1]
dim(wide_df_or)
wide_df_or = wide_df_or[,-1]
str(wide_df_or)
position_or = as.numeric(colnames(wide_df_or))
#==================
# logo data: logOR
#==================
logo_data_plot_logor = logo_data[, c("position", "mutant_type", "log10or")]
wide_df_logor <- logo_data_plot_logor %>% spread(position, log10or, fill = 0.0)
wide_df_logor = as.matrix(wide_df_logor)
rownames(wide_df_logor) = wide_df_logor[,1]
wide_df_logor = subset(wide_df_logor, select = -c(1) )
colnames(wide_df_logor)
wide_df_logor_m = data.matrix(wide_df_logor)
rownames(wide_df_logor_m)
colnames(wide_df_logor_m)
position_logor = as.numeric(colnames(wide_df_logor_m))
#===============================
# logo data: multiple nsSNPs (>1)
#=================================
#require(data.table)
# get freq count of positions so you can subset freq<1
setDT(logo_data)[, mut_pos_occurrence := .N, by = .(position)]
table(logo_data$position)
table(logo_data$mut_pos_occurrence)
max_mut = max(table(logo_data$position))
# extract freq_pos > 1
my_data_snp = logo_data[logo_data$mut_pos_occurrence!=1,]
u = unique(my_data_snp$position)
max_mult_mut = max(table(my_data_snp$position))
if (nrow(my_data_snp) == nrow(logo_data) - table(logo_data$mut_pos_occurrence)[[1]] ){
cat("PASS: positions with multiple muts extracted"
, "\nNo. of mutations:", nrow(my_data_snp)
, "\nNo. of positions:", length(u)
, "\nMax no. of muts at any position", max_mult_mut)
}else{
cat("FAIL: positions with multiple muts could NOT be extracted"
, "\nExpected:",nrow(logo_data) - table(logo_data$mut_pos_occurrence)[[1]]
, "\nGot:", nrow(my_data_snp) )
}
cat("\nNo. of sites with only 1 mutations:", table(logo_data$mut_pos_occurrence)[[1]])
#--------------------------------------
# matrix for_mychisq mutant type
# frequency of mutant type by position
#---------------------------------------
table(my_data_snp$mutant_type, my_data_snp$position)
tab_mt = table(my_data_snp$mutant_type, my_data_snp$position)
class(tab_mt)
# unclass to convert to matrix
tab_mt = unclass(tab_mt)
tab_mt = as.matrix(tab_mt, rownames = T)
# should be TRUE
is.matrix(tab_mt)
rownames(tab_mt) #aa
colnames(tab_mt) #pos
#-------------------------------------
# matrix for wild type
# frequency of wild type by position
#-------------------------------------
tab_wt = table(my_data_snp$wild_type, my_data_snp$position); tab_wt
tab_wt = unclass(tab_wt)
# remove wt duplicates
wt = my_data_snp[, c("position", "wild_type")]
wt = wt[!duplicated(wt),]
tab_wt = table(wt$wild_type, wt$position); tab_wt # should all be 1
rownames(tab_wt)
rownames(tab_wt)
identical(colnames(tab_mt), colnames(tab_wt))
identical(ncol(tab_mt), ncol(tab_wt))
#----------------------------------
# logo data OR: multiple nsSNPs (>1)
#----------------------------------
logo_data_or_mult = my_data_snp[, c("position", "mutant_type", "or_mychisq")]
#wide_df_or <- logo_data_or %>% spread(position, or_mychisq, fill = 0.0)
wide_df_or_mult <- logo_data_or_mult %>% spread(position, or_mychisq, fill = NA)
wide_df_or_mult = as.matrix(wide_df_or_mult)
rownames(wide_df_or_mult) = wide_df_or_mult[,1]
wide_df_or_mult = wide_df_or_mult[,-1]
str(wide_df_or_mult)
position_or_mult = as.numeric(colnames(wide_df_or_mult))
s1 = c("\nSuccessfully sourced logo_data.R")
cat(s1)
####################################################################
# Data for Corrplots
# Data for DM OM Plots: Long format dfs
####################################################################
cat("\n=========================================="
, "\nCORR PLOTS data: PS"
, "\n===========================================")
df_ps = merged_df2
#source("other_plots_data.R")
#--------------------
# adding log cols : NEW UNCOMMENT
#--------------------
#df_ps$log10_or_mychisq = log10(df_ps$or_mychisq)
#df_ps$neglog_pval_fisher = -log10(df_ps$pval_fisher)
source("dm_om_data.R")
##df_ps$log10_or_kin = log10(df_ps$or_kin)
##df_ps$neglog_pwald_kin = -log10(df_ps$pwald_kin)
# Stage marker for the DM/OM data step; the script sourced for this step is
# dm_om_data.R, so report that name (message previously named other_plots_data.R).
s2 = c("\nSuccessfully sourced dm_om_data.R")
cat(s2)
#df_ps$mutation_info_labels = ifelse(df_ps$mutation_info == dr_muts_col, 1, 0)
####################################################################
# Data for Lineage barplots: WF and LF dfs
####################################################################
#----------------------------
# columns for corr plots:PS
#----------------------------
# subset data to generate pairwise correlations
cols_to_select = c("mutationinformation"
, "duet_scaled"
, "foldx_scaled"
#, "mutation_info_labels"
, "asa"
, "rsa"
, "rd_values"
, "kd_values"
, "log10_or_mychisq"
, "neglog_pval_fisher"
##, "or_kin"
##, "neglog_pwald_kin"
, "af"
##, "af_kin"
, "duet_outcome"
, drug)
source("lineage_data.R")
corr_data_ps = df_ps[cols_to_select]
s3 = c("\nSuccessfully sourced lineage_data.R")
cat(s3)
dim(corr_data_ps)
####################################################################
# Data for corr plots:
####################################################################
# make sure the above script works because merged_df2_combined is needed
source("corr_data.R")
#--------------------------------------
# assign nice colnames (for display)
#--------------------------------------
my_corr_colnames = c("Mutation"
, "DUET"
, "FoldX"
#, "Mutation class"
, "ASA"
, "RSA"
, "RD"
, "KD"
, "Log (OR)"
, "-Log (P)"
##, "Adjusted (OR)"
##, "-Log (P wald)"
, "MAF"
##, "AF_kin"
, "duet_outcome"
, drug)
length(my_corr_colnames)
colnames(corr_data_ps)
colnames(corr_data_ps) <- my_corr_colnames
colnames(corr_data_ps)
start = 1
end = which(colnames(corr_data_ps) == drug); end # should be the last column
offset = 1
#===========================
# Corr data for plots: PS
# big_df ps: ~ merged_df2
#===========================
#corr_ps_df2 = corr_data_ps[start:(end-offset)] # without drug
corr_ps_df2 = corr_data_ps[start:end]
head(corr_ps_df2)
#===========================
# Corr data for plots: PS
# short_df ps: ~merged_df3
#===========================
corr_ps_df3 = corr_ps_df2[!duplicated(corr_ps_df2$Mutation),]
na_or = sum(is.na(corr_ps_df3$`Log (OR)`))
check1 = nrow(corr_ps_df3) - na_or
##na_adj_or = sum(is.na(corr_ps_df3$`adjusted (OR)`))
##check2 = nrow(corr_ps_df3) - na_adj_or
if (nrow(corr_ps_df3) == nrow(merged_df3) && nrow(merged_df3_comp) == check1) {
cat( "\nPASS: No. of rows for corr_ps_df3 match"
, "\nPASS: No. of OR values checked: " , check1)
} else {
cat("\nFAIL: Numbers mismatch:"
, "\nExpected nrows: ", nrow(merged_df3)
, "\nGot: ", nrow(corr_ps_df3)
, "\nExpected OR values: ", nrow(merged_df3_comp)
, "\nGot: ", check1)
}
#=================================
# Data for Correlation plots: LIG
#=================================
cat("\n=========================================="
, "\nCORR PLOTS data: LIG"
, "\n===========================================")
df_lig = merged_df2_lig
table(df_lig$ligand_outcome)
#--------------------
# adding log cols : NEW UNCOMMENT
#--------------------
#df_lig$log10_or_mychisq = log10(df_lig$or_mychisq)
#df_lig$neglog_pval_fisher = -log10(df_lig$pval_fisher)
##df_lig$log10_or_kin = log10(df_lig$or_kin)
##df_lig$neglog_pwald_kin = -log10(df_lig$pwald_kin)
#----------------------------
# columns for corr plots:PS
#----------------------------
# subset data to generate pairwise correlations
cols_to_select = c("mutationinformation"
, "affinity_scaled"
#, "mutation_info_labels"
, "asa"
, "rsa"
, "rd_values"
, "kd_values"
, "log10_or_mychisq"
, "neglog_pval_fisher"
##, "or_kin"
##, "neglog_pwald_kin"
, "af"
##, "af_kin"
, "ligand_outcome"
, drug)
corr_data_lig = df_lig[, cols_to_select]
dim(corr_data_lig)
#--------------------------------------
# assign nice colnames (for display)
#--------------------------------------
my_corr_colnames = c("Mutation"
, "Ligand Affinity"
#, "Mutation class"
, "ASA"
, "RSA"
, "RD"
, "KD"
, "Log (OR)"
, "-Log (P)"
##, "Adjusted (OR)"
##, "-Log (P wald)"
, "MAF"
##, "MAF_kin"
, "ligand_outcome"
, drug)
length(my_corr_colnames)
colnames(corr_data_lig)
colnames(corr_data_lig) <- my_corr_colnames
colnames(corr_data_lig)
start = 1
end = which(colnames(corr_data_lig) == drug); end # should be the last column
offset = 1
#=============================
# Corr data for plots: LIG
# big_df lig: ~ merged_df2_lig
#==============================
# Keep columns start..end, i.e. including the drug column; the commented
# variant would drop it.
#corr_lig_df2 = corr_data_lig[start:(end-offset)] # without drug
corr_lig_df2 = corr_data_lig[start:end]
head(corr_lig_df2)
#=============================
# Corr data for plots: LIG
# short_df lig: ~ merged_df3_lig
#==============================
# one row per mutation: drop repeated `Mutation` values
corr_lig_df3 = corr_lig_df2[!duplicated(corr_lig_df2$Mutation),]
# number of rows that carry a non-missing `Log (OR)`
na_or_lig = sum(is.na(corr_lig_df3$`Log (OR)`))
check1_lig = nrow(corr_lig_df3) - na_or_lig
if (nrow(corr_lig_df3) == nrow(merged_df3_lig) && nrow(merged_df3_comp_lig) == check1_lig) {
  cat( "\nPASS: No. of rows for corr_lig_df3 match"
     , "\nPASS: No. of OR values checked: " , check1_lig)
} else {
  # FIX: report the frame that was actually checked (corr_lig_df3). The
  # original referenced `corr_ps_df3_lig`, which is never created, so the FAIL
  # branch aborted with "object not found" instead of printing the mismatch.
  cat("\nFAIL: Numbers mismatch:"
     , "\nExpected nrows: ", nrow(merged_df3_lig)
     , "\nGot: ", nrow(corr_lig_df3)
     , "\nExpected OR values: ", nrow(merged_df3_comp_lig)
     , "\nGot: ", check1_lig)
}
# Print whether the start:end subsets are identical to the full correlation
# frames. Nothing is removed here: the rm() call is left disabled.
identical(corr_data_lig, corr_lig_df2)
identical(corr_data_ps, corr_ps_df2)
#rm(df_ps, df_lig, corr_data_ps, corr_data_lig)

# completion marker for this section; its length is tested at the end of the script
s4 <- "\nSuccessfully sourced corr_data.R"
cat(s4)
########################################################################
# End of script
########################################################################
rm(foo)
# Each sourced section sets a marker string (s1..s4); the run is reported as
# successful only when every marker is non-empty.
# FIX: compare the lengths to 0 first and pass the resulting logicals to all().
# The original `all(length(s1), ...) > 0` handed numbers to all() (coercion
# warning) and then compared the single TRUE/FALSE result with 0.
if ( all( c(length(s1), length(s2), length(s3), length(s4)) > 0 ) ){
  cat(
    "\n##################################################"
    , "\nSuccessful: get_plotting_dfs.R worked!"
    , "\n###################################################\n")
} else {
  cat(
    "\n#################################################"
    , "\nFAIL: get_plotting_dfs.R didn't complete fully!Please check"
    , "\n###################################################\n" )
}
########################################################################
# clear excess variables
# Helper vectors and counters created while building the plotting data frames.
rm(list = c(
  "c1", "c2", "c3", "c4", "check1",
  "curr_count", "curr_total",
  "cols_check",
  "cols_to_select",
  "cols_to_select_deepddg",
  "cols_to_select_duet",
  "cols_to_select_dynamut",
  "cols_to_select_dynamut2",
  "cols_to_select_encomddg",
  "cols_to_select_encomdds",
  "cols_to_select_mcsm",
  "cols_to_select_mcsm_na",
  "cols_to_select_sdm",
  "infile_metadata",
  "infile_params"
  # not removed:
  #, "infilename_dynamut"
  #, "infilename_dynamut2"
  #, "infilename_mcsm_f_snps"
  #, "infilename_mcsm_na"
))

# NOTE(review): this banner is printed unconditionally, i.e. also after the
# FAIL branch of the end-of-script check above.
cat("\n===================================================\n",
    "\nSuccessful: get_plotting_dfs.R worked!",
    "\n====================================================")

# column-name vectors used for the wide -> long reshapes
rm(list = c(
  "pivot_cols",
  "pivot_cols_deepddg",
  "pivot_cols_duet",
  "pivot_cols_dynamut",
  "pivot_cols_dynamut2",
  "pivot_cols_encomddg",
  "pivot_cols_encomdds",
  "pivot_cols_foldx",
  "pivot_cols_mcsm",
  "pivot_cols_mcsm_na",
  "pivot_cols_n",
  "pivot_cols_sdm"
))

# expected-dimension scratch values
rm(list = c(
  "expected_cols",
  "expected_ncols",
  "expected_rows",
  "expected_rows_lf",
  "fact_cols"
))

View file

@ -0,0 +1,589 @@
#!/usr/bin/env Rscript
#########################################################
# TASK: Get formatted data for plots
# Sources the project helpers, then builds the data frames used by the
# barplot, logo-plot and correlation-plot scripts.
#=======================================================================
# working dir and loading libraries
# The relative source() paths below only resolve after this setwd().
getwd()
setwd("~/git/LSHTM_analysis/scripts/plotting")
getwd()
source("Header_TT.R")
source("../functions/my_pairs_panel.R") # with lower panel turned off
source("../functions/plotting_globals.R")
source("../functions/plotting_data.R")
source("../functions/combining_dfs_plotting.R")
source("../functions/bp_subcolours.R")
#********************
# cmd args passed
# in from other scripts
# to call this
#********************
# `drug` and `gene` are NOT set in this file (assignments are commented out);
# they must already exist in the calling environment before this is sourced.
#drug = 'streptomycin'
#gene = 'gid'
#====================
# variables for lig
#====================
# Passed to plotting_data() and combining_dfs_plotting() below as
# lig_dist_colname / lig_dist_cutoff.
LigDist_colname = "ligand_distance"
LigDist_cutoff = 10
#===========
# input
#===========
#---------------------
# call: import_dirs()
#---------------------
# NOTE(review): presumably defines the directory variables used later
# (`outdir` is read below without being assigned in this file) - confirm
# against the definition of import_dirs().
import_dirs(drug, gene)
#---------------------------
# call: plotting_data()
#---------------------------
# Use the default combined-params file when no input file was supplied.
# FIX: also treat a defined-but-NULL `infile_params` as "not supplied". Caller
# scripts assign `infile_params = opt$data_file1`, which leaves the variable
# defined with value NULL when the optional flag is omitted; exists() alone
# then skipped the default and read.csv(NULL) failed.
if ((!exists("infile_params") || is.null(infile_params)) && exists("gene")){
  #in_filename_params = paste0(tolower(gene), "_all_params.csv") #for pncA
  in_filename_params = paste0(tolower(gene), "_comb_afor.csv") # part combined for gid
  infile_params = paste0(outdir, "/", in_filename_params)
  cat("\nInput file for mcsm comb data not specified, assuming filename: ", infile_params, "\n")
}
# Input 1: read <gene>_comb_afor.csv
cat("\nReading mcsm combined data file: ", infile_params)
mcsm_df = read.csv(infile_params, header = T)
# plotting_data() returns a list; its elements are unpacked by position.
pd_df = plotting_data(mcsm_df
                      , lig_dist_colname = LigDist_colname
                      , lig_dist_cutoff = LigDist_cutoff)
my_df = pd_df[[1]]
my_df_u = pd_df[[2]] # this forms one of the input for combining_dfs_plotting()
my_df_u_lig = pd_df[[3]]
dup_muts = pd_df[[4]]
#--------------------------------
# call: combining_dfs_plotting()
#--------------------------------
# Use the default metadata file when no input file was supplied.
# FIX: a defined-but-NULL `infile_metadata` (callers assign it from the
# optional opt$data_file2) is now treated the same as an undefined one, so the
# default path is used instead of calling read.csv(NULL).
if ((!exists("infile_metadata") || is.null(infile_metadata)) && exists("gene")){
  in_filename_metadata = paste0(tolower(gene), "_metadata.csv") # part combined for gid
  infile_metadata = paste0(outdir, "/", in_filename_metadata)
  cat("\nInput file for gene metadata not specified, assuming filename: ", infile_metadata, "\n")
}
# Input 2: read <gene>_meta data.csv
cat("\nReading meta data file: ", infile_metadata)
gene_metadata <- read.csv(infile_metadata
                          , stringsAsFactors = F
                          , header = T)
# combining_dfs_plotting() returns a list of 8 data frames, unpacked by
# position: all / ligand-subset (_lig) variants of the merged_df2 and
# merged_df3 frames, each with a `_comp` counterpart.
all_plot_dfs = combining_dfs_plotting(my_df_u
                                      , gene_metadata
                                      , lig_dist_colname = LigDist_colname
                                      , lig_dist_cutoff = LigDist_cutoff)
merged_df2 = all_plot_dfs[[1]]
merged_df3 = all_plot_dfs[[2]]
merged_df2_comp = all_plot_dfs[[3]]
merged_df3_comp = all_plot_dfs[[4]]
merged_df2_lig = all_plot_dfs[[5]]
merged_df3_lig = all_plot_dfs[[6]]
merged_df2_comp_lig = all_plot_dfs[[7]]
merged_df3_comp_lig = all_plot_dfs[[8]]
####################################################################
# Data for subcols barplot (~heatmap)
####################################################################
# Columns kept for the sub-colour barplot data (PS and LIG subsets below).
# can include: mutation, or_kin, pwald, af_kin
cols_to_select <- c(
  "mutationinformation", "drtype",
  "wild_type",
  "position",
  "mutant_type",
  "chain", "ligand_id", "ligand_distance",
  "duet_stability_change", "duet_outcome", "duet_scaled",
  "ligand_affinity_change", "ligand_outcome", "affinity_scaled",
  "ddg_foldx", "foldx_scaled", "foldx_outcome",
  "deepddg", "deepddg_outcome", # comment out as not available for pnca
  "asa", "rsa", "rd_values", "kd_values",
  "af", "or_mychisq", "pval_fisher",
  "or_fisher", "or_logistic", "pval_logistic",
  "wt_prop_water", "mut_prop_water", "wt_prop_polarity", "mut_prop_polarity",
  "wt_calcprop", "mut_calcprop"
)
#=======================
# Data for sub colours
# barplot: PS
#=======================
cat("\nNo. of cols to select:", length(cols_to_select))
subcols_df_ps <- merged_df3[, cols_to_select]
cat("\nNo of unique positions for ps:",
    length(unique(subcols_df_ps$position)))

# pos_count: number of rows (nsSNPs) at each position, added by reference
setDT(subcols_df_ps)[, pos_count := .N, by = .(position)]

# duet_outcome should be a factor: convert when needed, then print the counts
if (!is.factor(subcols_df_ps$duet_outcome)) {
  cat("\nConverting duet_outcome to factor")
  subcols_df_ps$duet_outcome <- as.factor(subcols_df_ps$duet_outcome)
} else {
  cat("\nDuet_outcome is factor")
}
table(subcols_df_ps$duet_outcome)

# scaled values should span -1 and 1 (overall, then per outcome class)
min(subcols_df_ps$duet_scaled)
max(subcols_df_ps$duet_scaled)
tapply(subcols_df_ps$duet_scaled, subcols_df_ps$duet_outcome, min)
tapply(subcols_df_ps$duet_scaled, subcols_df_ps$duet_outcome, max)

# how many distinct values are there without rounding?
cat("\nNo. of unique values in duet scaled, no rounding:",
    length(unique(subcols_df_ps$duet_scaled)))

# grouping vector: unrounded
my_grp <- subcols_df_ps$duet_scaled
length(my_grp)

# grouping vector: rounded to n decimal places
n <- 3
subcols_df_ps$duet_scaledR <- round(subcols_df_ps$duet_scaled, n)
cat("\nNo. of unique values in duet scaled", n, "places rounding:",
    length(unique(subcols_df_ps$duet_scaledR)))
my_grp_r <- subcols_df_ps$duet_scaledR # rounding

# group labels: "<outcome>_<value>"
subcols_df_ps$group <- paste0(subcols_df_ps$duet_outcome, "_", my_grp)
subcols_df_ps$groupR <- paste0(subcols_df_ps$duet_outcome, "_", my_grp_r)

# Build one palette per grouping. The third argument is the NAME of the
# grouping vector defined above, not a column of subcols_df_ps.
subcols_ps <- ColourPalleteMulti(subcols_df_ps, "duet_outcome", "my_grp")
subcolsR_ps <- ColourPalleteMulti(subcols_df_ps, "duet_outcome", "my_grp_r")
print(paste0("Colour palette generated for my_grp: ", length(subcols_ps), " colours"))
print(paste0("Colour palette generated for my_grp_r: ", length(subcolsR_ps), " colours"))
#=======================
# Data for sub colours
# barplot: LIG
#=======================
# Same preparation as the PS section, using the ligand subset (merged_df3_lig),
# ligand_outcome as the class and affinity_scaled as the value.
cat("\nNo. of cols to select:", length(cols_to_select))
subcols_df_lig = merged_df3_lig[, cols_to_select]
cat("\nNo of unique positions for LIG:"
    , length(unique(subcols_df_lig$position)))
# should be a factor
if (is.factor(subcols_df_lig$ligand_outcome)){
  cat("\nLigand_outcome is factor")
  table(subcols_df_lig$ligand_outcome)
}else{
  cat("\nConverting ligand_outcome to factor")
  subcols_df_lig$ligand_outcome = as.factor(subcols_df_lig$ligand_outcome)
  table(subcols_df_lig$ligand_outcome)
}
# should be -1 and 1
min(subcols_df_lig$affinity_scaled)
max(subcols_df_lig$affinity_scaled)
tapply(subcols_df_lig$affinity_scaled, subcols_df_lig$ligand_outcome, min)
tapply(subcols_df_lig$affinity_scaled, subcols_df_lig$ligand_outcome, max)
# check unique values in normalised data
cat("\nNo. of unique values in affinity scaled, no rounding:"
    , length(unique(subcols_df_lig$affinity_scaled)))
# No rounding
my_grp_lig = subcols_df_lig$affinity_scaled; length(my_grp_lig)
# Add rounding is to be used
n = 3
subcols_df_lig$affinity_scaledR = round(subcols_df_lig$affinity_scaled, n)
# FIX: the message now names the column actually reported (affinity scaled);
# it previously said "duet scaled", copied from the PS section.
cat("\nNo. of unique values in affinity scaled", n, "places rounding:"
    , length(unique(subcols_df_lig$affinity_scaledR)))
my_grp_lig_r = subcols_df_lig$affinity_scaledR # rounding
# Add grp cols ("<outcome>_<value>"); paste0() has no `sep` argument, so the
# stray sep = "" of the original is dropped (same resulting strings)
subcols_df_lig$group_lig <- paste0(subcols_df_lig$ligand_outcome, "_", my_grp_lig)
subcols_df_lig$group_ligR <- paste0(subcols_df_lig$ligand_outcome, "_", my_grp_lig_r)
# Call the function to create the palette based on the group defined above
subcols_lig <- ColourPalleteMulti(subcols_df_lig, "ligand_outcome", "my_grp_lig")
subcolsR_lig <- ColourPalleteMulti(subcols_df_lig, "ligand_outcome", "my_grp_lig_r")
print(paste0("Colour palette generated for my_grp: ", length(subcols_lig), " colours"))
print(paste0("Colour palette generated for my_grp_r: ", length(subcolsR_lig), " colours"))
####################################################################
# Data for logoplots
####################################################################
#-------------------------
# choose df for logoplot
#-------------------------
logo_data <- merged_df3
#logo_data = merged_df3_comp

# quick checks (printed)
colnames(logo_data)
str(logo_data)
c1 <- unique(logo_data$position)
nrow(logo_data)
cat("No. of rows in my_data:", nrow(logo_data),
    "\nDistinct positions corresponding to snps:", length(c1),
    "\n===========================================================")
#=======================================================================
#==================
# logo data: OR
#==================
# scratch subset; removed again with rm(foo) at the end of the script
foo <- logo_data[, c("position",
                     "mutant_type", "duet_scaled", "or_mychisq",
                     "mut_prop_polarity", "mut_prop_water")]

# log10 of the odds ratio, stored alongside the raw value
logo_data$log10or <- log10(logo_data$or_mychisq)
logo_data_plot <- logo_data[, c("position",
                                "mutant_type", "or_mychisq", "log10or")]

# long -> wide: one row per mutant_type, one column per position, value = OR,
# missing combinations filled with 0
logo_data_plot_or <- logo_data[, c("position", "mutant_type", "or_mychisq")]
wide_df_or <- spread(logo_data_plot_or, position, or_mychisq, fill = 0.0)
wide_df_or <- as.matrix(wide_df_or)

# the first column (mutant_type) becomes the row names and is then dropped
rownames(wide_df_or) <- wide_df_or[, 1]
dim(wide_df_or)
wide_df_or <- wide_df_or[, -1]
str(wide_df_or)

# positions are the remaining column names
position_or <- as.numeric(colnames(wide_df_or))
#==================
# logo data: logOR
#==================
# Same wide layout as the OR matrix, with log10(OR) as the value.
logo_data_plot_logor <- logo_data[, c("position", "mutant_type", "log10or")]
wide_df_logor <- spread(logo_data_plot_logor, position, log10or, fill = 0.0)
wide_df_logor <- as.matrix(wide_df_logor)

# first column (mutant_type) -> row names, then drop it (result stays a matrix)
rownames(wide_df_logor) <- wide_df_logor[, 1]
wide_df_logor <- wide_df_logor[, -1, drop = FALSE]
colnames(wide_df_logor)

# NOTE(review): as.matrix() is applied while the mutant_type column is still
# present - confirm data.matrix() yields numeric values here as intended.
wide_df_logor_m <- data.matrix(wide_df_logor)
rownames(wide_df_logor_m)
colnames(wide_df_logor_m)
position_logor <- as.numeric(colnames(wide_df_logor_m))
#===============================
# logo data: multiple nsSNPs (>1)
#=================================
#require(data.table)
# mut_pos_occurrence: number of rows (mutations) at each position, added by
# reference so that positions with a single mutation can be excluded
setDT(logo_data)[, mut_pos_occurrence := .N, by = .(position)]
table(logo_data$position)
table(logo_data$mut_pos_occurrence)
max_mut = max(table(logo_data$position))
# extract freq_pos > 1
my_data_snp = logo_data[logo_data$mut_pos_occurrence!=1,]
u = unique(my_data_snp$position)
max_mult_mut = max(table(my_data_snp$position))
# FIX: count single-mutation sites explicitly. The original used
# table(logo_data$mut_pos_occurrence)[[1]], i.e. "the first cell of the table",
# which is only the count for occurrence == 1 when such sites exist; otherwise
# it silently returned the count for the smallest occurrence present.
n_single_mut = sum(logo_data$mut_pos_occurrence == 1)
expected_mult = nrow(logo_data) - n_single_mut
if (nrow(my_data_snp) == expected_mult){
  cat("PASS: positions with multiple muts extracted"
      , "\nNo. of mutations:", nrow(my_data_snp)
      , "\nNo. of positions:", length(u)
      , "\nMax no. of muts at any position", max_mult_mut)
}else{
  cat("FAIL: positions with multiple muts could NOT be extracted"
      , "\nExpected:", expected_mult
      , "\nGot:", nrow(my_data_snp) )
}
cat("\nNo. of sites with only 1 mutations:", n_single_mut)
#--------------------------------------
# matrix for mutant type
# frequency of mutant type by position
#---------------------------------------
tab_mt <- table(my_data_snp$mutant_type, my_data_snp$position)
tab_mt
class(tab_mt)

# strip the "table" class so that a plain matrix remains
tab_mt <- unclass(tab_mt)
tab_mt <- as.matrix(tab_mt, rownames = T)

# should be TRUE
is.matrix(tab_mt)
rownames(tab_mt) #aa
colnames(tab_mt) #pos
#-------------------------------------
# matrix for wild type
# frequency of wild type by position
#-------------------------------------
# first pass: counts over all rows (printed, then replaced further down)
tab_wt <- table(my_data_snp$wild_type, my_data_snp$position)
tab_wt
tab_wt <- unclass(tab_wt)

# second pass: one row per (position, wild_type) pair, so each count is 1
wt <- my_data_snp[, c("position", "wild_type")]
wt <- wt[!duplicated(wt), ]
tab_wt <- table(wt$wild_type, wt$position)
tab_wt # should all be 1
rownames(tab_wt)
rownames(tab_wt)

# both tables must cover the same positions (printed TRUE/FALSE)
identical(colnames(tab_mt), colnames(tab_wt))
identical(ncol(tab_mt), ncol(tab_wt))
#----------------------------------
# logo data OR: multiple nsSNPs (>1)
#----------------------------------
# Wide OR matrix restricted to positions with more than one mutation; unlike
# the full matrix, missing combinations are filled with NA rather than 0.
logo_data_or_mult <- my_data_snp[, c("position", "mutant_type", "or_mychisq")]
#wide_df_or <- logo_data_or %>% spread(position, or_mychisq, fill = 0.0)
wide_df_or_mult <- spread(logo_data_or_mult, position, or_mychisq, fill = NA)
wide_df_or_mult <- as.matrix(wide_df_or_mult)

# first column (mutant_type) -> row names, then drop it
rownames(wide_df_or_mult) <- wide_df_or_mult[, 1]
wide_df_or_mult <- wide_df_or_mult[, -1]
str(wide_df_or_mult)
position_or_mult <- as.numeric(colnames(wide_df_or_mult))
####################################################################
# Data for Corrplots
####################################################################
cat("\n==========================================",
    "\nCORR PLOTS data: PS",
    "\n===========================================")

# working copy of the full data
df_ps <- merged_df2

#--------------------
# log cols are NOT derived in this section; statements kept for reference
# (original marker: "NEW UNCOMMENT")
#--------------------
#df_ps$log10_or_mychisq = log10(df_ps$or_mychisq)
#df_ps$neglog_pval_fisher = -log10(df_ps$pval_fisher)
##df_ps$log10_or_kin = log10(df_ps$or_kin)
##df_ps$neglog_pwald_kin = -log10(df_ps$pwald_kin)
#df_ps$mutation_info_labels = ifelse(df_ps$mutation_info == dr_muts_col, 1, 0)

#----------------------------
# columns for corr plots:PS
#----------------------------
# Columns used for the pairwise correlations. `drug` is a variable holding the
# name of the drug column; it is listed last on purpose (see `end` below).
cols_to_select <- c(
  "mutationinformation",
  "duet_scaled",
  "foldx_scaled",
  #"mutation_info_labels",
  "asa",
  "rsa",
  "rd_values",
  "kd_values",
  "log10_or_mychisq",
  "neglog_pval_fisher",
  ##"or_kin",
  ##"neglog_pwald_kin",
  "af",
  ##"af_kin",
  "duet_outcome",
  drug
)
corr_data_ps <- df_ps[cols_to_select]
dim(corr_data_ps)

#--------------------------------------
# assign nice colnames (for display)
#--------------------------------------
# Display names, position-for-position with cols_to_select above.
my_corr_colnames <- c(
  "Mutation",
  "DUET",
  "FoldX",
  #"Mutation class",
  "ASA",
  "RSA",
  "RD",
  "KD",
  "Log (OR)",
  "-Log (P)",
  ##"Adjusted (OR)",
  ##"-Log (P wald)",
  "MAF",
  ##"AF_kin",
  "duet_outcome",
  drug
)
length(my_corr_colnames)

# print names before and after the rename
colnames(corr_data_ps)
colnames(corr_data_ps) <- my_corr_colnames
colnames(corr_data_ps)

# column range used for the subsets below
start <- 1
end <- which(colnames(corr_data_ps) == drug)
end # should be the last column
offset <- 1
#===========================
# Corr data for plots: PS
# big_df ps: ~ merged_df2
#===========================
# Columns start..end, i.e. including the drug column; the commented variant
# would drop it.
#corr_ps_df2 = corr_data_ps[start:(end-offset)] # without drug
corr_ps_df2 <- corr_data_ps[start:end]
head(corr_ps_df2)
#===========================
# Corr data for plots: PS
# short_df ps: ~merged_df3
#===========================
# one row per mutation: drop repeated `Mutation` values
corr_ps_df3 <- corr_ps_df2[!duplicated(corr_ps_df2$Mutation), ]

# rows with a non-missing `Log (OR)`
na_or <- sum(is.na(corr_ps_df3$`Log (OR)`))
check1 <- nrow(corr_ps_df3) - na_or
##na_adj_or = sum(is.na(corr_ps_df3$`adjusted (OR)`))
##check2 = nrow(corr_ps_df3) - na_adj_or

# PASS only when the row count matches merged_df3 and the non-missing OR count
# matches merged_df3_comp
if (!(nrow(corr_ps_df3) == nrow(merged_df3) && nrow(merged_df3_comp) == check1)) {
  cat("\nFAIL: Numbers mismatch:",
      "\nExpected nrows: ", nrow(merged_df3),
      "\nGot: ", nrow(corr_ps_df3),
      "\nExpected OR values: ", nrow(merged_df3_comp),
      "\nGot: ", check1)
} else {
  cat("\nPASS: No. of rows for corr_ps_df3 match",
      "\nPASS: No. of OR values checked: ", check1)
}
#=================================
# Data for Correlation plots: LIG
#=================================
cat("\n==========================================",
    "\nCORR PLOTS data: LIG",
    "\n===========================================")

# working copy of the ligand data; print the outcome class counts
df_lig <- merged_df2_lig
table(df_lig$ligand_outcome)

#--------------------
# log cols are NOT derived in this section; statements kept for reference
# (original marker: "NEW UNCOMMENT")
#--------------------
#df_lig$log10_or_mychisq = log10(df_lig$or_mychisq)
#df_lig$neglog_pval_fisher = -log10(df_lig$pval_fisher)
##df_lig$log10_or_kin = log10(df_lig$or_kin)
##df_lig$neglog_pwald_kin = -log10(df_lig$pwald_kin)

#-----------------------------
# columns for corr plots: LIG
#-----------------------------
# Columns used for the pairwise correlations. `drug` is a variable holding the
# name of the drug column; it is listed last on purpose (see `end` below).
cols_to_select <- c(
  "mutationinformation",
  "affinity_scaled",
  #"mutation_info_labels",
  "asa",
  "rsa",
  "rd_values",
  "kd_values",
  "log10_or_mychisq",
  "neglog_pval_fisher",
  ##"or_kin",
  ##"neglog_pwald_kin",
  "af",
  ##"af_kin",
  "ligand_outcome",
  drug
)
corr_data_lig <- df_lig[, cols_to_select]
dim(corr_data_lig)

#--------------------------------------
# assign nice colnames (for display)
#--------------------------------------
# Display names, position-for-position with cols_to_select above.
my_corr_colnames <- c(
  "Mutation",
  "Ligand Affinity",
  #"Mutation class",
  "ASA",
  "RSA",
  "RD",
  "KD",
  "Log (OR)",
  "-Log (P)",
  ##"Adjusted (OR)",
  ##"-Log (P wald)",
  "MAF",
  ##"MAF_kin",
  "ligand_outcome",
  drug
)
length(my_corr_colnames)

# print names before and after the rename
colnames(corr_data_lig)
colnames(corr_data_lig) <- my_corr_colnames
colnames(corr_data_lig)

# column range used for the subsets below
start <- 1
end <- which(colnames(corr_data_lig) == drug)
end # should be the last column
offset <- 1
#=============================
# Corr data for plots: LIG
# big_df lig: ~ merged_df2_lig
#==============================
# Keep columns start..end, i.e. including the drug column; the commented
# variant would drop it.
#corr_lig_df2 = corr_data_lig[start:(end-offset)] # without drug
corr_lig_df2 = corr_data_lig[start:end]
head(corr_lig_df2)
#=============================
# Corr data for plots: LIG
# short_df lig: ~ merged_df3_lig
#==============================
# one row per mutation: drop repeated `Mutation` values
corr_lig_df3 = corr_lig_df2[!duplicated(corr_lig_df2$Mutation),]
# number of rows that carry a non-missing `Log (OR)`
na_or_lig = sum(is.na(corr_lig_df3$`Log (OR)`))
check1_lig = nrow(corr_lig_df3) - na_or_lig
if (nrow(corr_lig_df3) == nrow(merged_df3_lig) && nrow(merged_df3_comp_lig) == check1_lig) {
  cat( "\nPASS: No. of rows for corr_lig_df3 match"
     , "\nPASS: No. of OR values checked: " , check1_lig)
} else {
  # FIX: report the frame that was actually checked (corr_lig_df3). The
  # original referenced `corr_ps_df3_lig`, which is never created, so the FAIL
  # branch aborted with "object not found" instead of printing the mismatch.
  cat("\nFAIL: Numbers mismatch:"
     , "\nExpected nrows: ", nrow(merged_df3_lig)
     , "\nGot: ", nrow(corr_lig_df3)
     , "\nExpected OR values: ", nrow(merged_df3_comp_lig)
     , "\nGot: ", check1_lig)
}
# Print whether the start:end subsets are identical to the full correlation
# frames. Nothing is removed here: the rm() call is left disabled.
identical(corr_data_lig, corr_lig_df2)
identical(corr_data_ps, corr_ps_df2)
#rm(df_ps, df_lig, corr_data_ps, corr_data_lig)
########################################################################
# End of script
########################################################################
# drop the scratch subset created in the logo-data section
rm(foo)
cat("\n===================================================\n",
    "\nSuccessful: get_plotting_dfs.R worked!",
    "\n====================================================")

0
scripts/plotting/ggcorr_all_PS_LIG.R Normal file → Executable file
View file

0
scripts/plotting/hist_af_or_base.R Normal file → Executable file
View file

0
scripts/plotting/hist_af_or_combined.R Normal file → Executable file
View file

0
scripts/plotting/legend_adjustment.R Normal file → Executable file
View file

View file

@ -0,0 +1,127 @@
#!/usr/bin/env Rscript
getwd()
setwd("~/git/LSHTM_analysis/scripts/plotting/")
getwd()
#########################################################
# TASK: Basic lineage barplots showing numbers
# Output: Basic barplot with lineage samples and mut count
# + SNP diversity
##########################################################
# Installing and loading required packages
##########################################################
source("Header_TT.R")
#===========
# input
#===========
#drug = 'streptomycin'
#gene = 'gid'
# Command-line spec, one row per option:
# long name, short flag, argument mask (1 = required value, 2 = optional value), type.
# NOTE(review): getopt documents the short flag as a single character; "fa" and
# "fb" are two characters - confirm only --data_file1/--data_file2 are used.
spec = matrix(c(
  "drug"       , "d" , 1, "character",
  "gene"       , "g" , 1, "character",
  "data_file1" , "fa", 2, "character",
  "data_file2" , "fb", 2, "character"
), byrow = TRUE, ncol = 4)
opt = getopt(spec)
drug = opt$drug
gene = opt$gene
# NOTE(review): when the optional flags are omitted these become NULL but the
# variables still exist - confirm get_plotting_dfs.R handles a NULL value.
infile_params = opt$data_file1
infile_metadata = opt$data_file2
# Both are scalars, so use the short-circuit `||` (the original used the
# vectorised `|`).
if (is.null(drug) || is.null(gene)) {
  stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)")
}
# builds the plotting data frames used below
source ('get_plotting_dfs.R')
#=======
# output
#=======
# plot 1: full path of the combined svg (printed)
basic_bp_lineage_cl <- "basic_lineage_barplots_combined.svg"
plot_basic_bp_lineage_cl <- paste0(plotdir, "/", basic_bp_lineage_cl)
plot_basic_bp_lineage_cl
#################################################################
#=============================
# PLOT 1: Lineage count plot:
# LF data
#=============================
#------------------------
# Data: All lineages or
# selected few
#------------------------
# keep lineages L1-L4 only
lin_lf_plot <- lin_lf[lin_lf$sel_lineages %in% c("L1", "L2", "L3", "L4"), ]
str(lin_lf_plot)

# re-create the factor so that unused levels are dropped
lin_lf_plot$sel_lineages <- factor(lin_lf_plot$sel_lineages)
levels(lin_lf_plot$sel_lineages)
str(lin_lf_plot)
#------------------------
# plot from my function:
#------------------------
# bars of p_count per lineage, filled by count category
lin_countP <- lin_count_bp(
  lin_lf_plot,
  x_categ = "sel_lineages",
  y_count = "p_count",
  bar_fill_categ = "count_categ",
  display_label_col = "p_count",
  bar_stat_stype = "identity",
  x_lab_angle = 90,
  my_xats = 20,
  bar_col_labels = c("Mutations", "Total Samples"),
  bar_col_values = c("grey50", "gray75"),
  y_scale_percent = F, # T for diversity
  y_log10 = F,
  y_label = "Count"
)
lin_countP
#================================
# PLOT 2: Lineage Diversity plot
# WF data
#================================
#------------------------
# Data: All lineages or
# selected few
#------------------------
# keep lineages L1-L4 only
lin_wf_plot <- lin_wf[lin_wf$sel_lineages %in% c("L1", "L2", "L3", "L4"), ]
str(lin_wf_plot)

# re-create the factor so that unused levels are dropped
lin_wf_plot$sel_lineages <- factor(lin_wf_plot$sel_lineages)
levels(lin_wf_plot$sel_lineages)
str(lin_wf_plot)
#------------------------
# plot from my function:
#------------------------
# bars of snp_diversity per lineage, labelled with the formatted percentage
lin_diversityP <- lin_count_bp(
  lin_wf_plot,
  x_categ = "sel_lineages",
  y_count = "snp_diversity",
  display_label_col = "snp_diversity_f",
  bar_stat_stype = "identity",
  x_lab_angle = 90,
  my_xats = 20,
  y_scale_percent = T,
  y_label = "SNP diversity"
)
lin_diversityP
#########################################################################333
#================================
# Combine plots
#================================
# Stack the count plot over the diversity plot and write them to one svg.
svg(plot_basic_bp_lineage_cl , width = 8, height = 15 )
lineage_bp_combined = cowplot::plot_grid(lin_countP, lin_diversityP
                                         #, labels = c("(a)", "(b)", "(c)", "(d)")
                                         , nrow = 2
                                         , labels = "AUTO"
                                         , label_size = 25)
# Draw explicitly: a bare object name only renders through top-level
# auto-printing, which does not happen when this file is source()d.
print(lineage_bp_combined)
dev.off()

147
scripts/plotting/lineage_data.R Executable file
View file

@ -0,0 +1,147 @@
#!/usr/bin/env Rscript
#########################################################
# TASK: Script to format data for lineage barplots:
# WF and LF data with lineage sample, and snp counts
# sourced by get_plotting_dfs.R
#########################################################
#=================================================
# Get data with lineage count, and snp diversity
#=================================================
table(merged_df2$lineage)
# Report rows whose lineage is the empty string.
# FIX: count them directly. The original read table(lineage == "")[[2]], which
# raises "subscript out of bounds" whenever the table has a single cell (no
# empty lineages at all, or only empty ones).
n_missing_lineage = sum(merged_df2$lineage == "", na.rm = TRUE)
if (n_missing_lineage > 0) {
  cat("\nMissing samples with lineage classification:", n_missing_lineage)
}
table(merged_df2$lineage_labels)
class(merged_df2$lineage_labels); nlevels(merged_df2$lineage_labels)
#==========================================
# WF data: lineages with
# snp count
# total_samples
# snp diversity (perc)
#==========================================
# one row per lineage label
sel_lineages = levels(merged_df2$lineage_labels)
lin_wf = data.frame(sel_lineages) #4, 1
total_snps_u = NULL
total_samples = NULL
for (i in sel_lineages){
  #print(i)
  # rows belonging to this lineage (which() ignores NA labels)
  in_lineage = which(merged_df2$lineage_labels == i)
  # FIX: number of UNIQUE sample ids within the lineage. The original indexed
  # unique(merged_df2$id) with a row-length logical mask, which returns one
  # element per matching row, so it counted rows rather than unique samples.
  curr_total = length(unique(merged_df2$id[in_lineage]))
  #print(curr_total)
  total_samples = c(total_samples, curr_total)
  print(total_samples)
  # number of unique mutations within the lineage
  foo = merged_df2[in_lineage,]
  print(paste0(i, "=======\n"))
  print(length(unique(foo$mutationinformation)))
  curr_count = length(unique(foo$mutationinformation))
  total_snps_u = c(total_snps_u, curr_count)
}
lin_wf
# Add these counts as columns to the df
lin_wf$num_snps_u = total_snps_u
lin_wf$total_samples = total_samples
lin_wf
# Add SNP diversity: unique mutations per sample
lin_wf$snp_diversity = lin_wf$num_snps_u/lin_wf$total_samples
lin_wf
#----------------------
# Add some formatting
#----------------------
# SNP diversity as a whole-number percentage label, e.g. "12%"
lin_wf$snp_diversity_f <- paste0(round(lin_wf$snp_diversity * 100, digits = 0), "%")
lin_wf$sel_lineages

# Important: fix the level order so that the x-axis categories appear in this
# order. Any label missing from this list becomes NA.
lin_wf$sel_lineages <- factor(
  lin_wf$sel_lineages,
  levels = c(
    "L1", "L2", "L3", "L4", "L5", "L6", "L7", "LBOV",
    "L1;L2", "L1;L3", "L1;L4",
    "L2;L3", "L2;L3;L4", "L2;L4", "L2;L6", "L2;LBOV",
    "L3;L4",
    "L4;L6", "L4;L7",
    ""
  )
)
levels(lin_wf$sel_lineages)
#=================================
# LF data: lineages with
# snp count
# total_samples
# snp diversity (perc)
#=================================
names(lin_wf)
tot_cols <- ncol(lin_wf)

# columns that stay as they are; every other column is stacked, so the long
# frame should have nrow * (ncol - kept columns) rows
pivot_cols <- c("sel_lineages", "snp_diversity", "snp_diversity_f")
pivot_cols_n <- length(pivot_cols)
expected_rows <- nrow(lin_wf) * (ncol(lin_wf) - pivot_cols_n)

# wide -> long: num_snps_u..total_samples become count_categ (name, as a
# factor) and p_count (value)
lin_lf <- gather(lin_wf,
                 key = count_categ,
                 value = p_count,
                 num_snps_u:total_samples,
                 factor_key = TRUE)
lin_lf

# quick checks
if (nrow(lin_lf) != expected_rows) {
  cat("\nFAIL: numbers mismatch",
      "\nExpected nrow: ", expected_rows)
} else {
  cat("\nPASS: Lineage LF data created",
      "\nnrow: ", nrow(lin_lf),
      "\nncol: ", ncol(lin_lf))
}
# Important: fix the level order of the long data as well, so that the x-axis
# categories appear in this order. Any label missing from this list becomes NA.
lin_lf$sel_lineages <- factor(
  lin_lf$sel_lineages,
  levels = c(
    "L1", "L2", "L3", "L4", "L5", "L6", "L7", "LBOV",
    "L1;L2", "L1;L3", "L1;L4",
    "L2;L3", "L2;L3;L4", "L2;L4", "L2;L6", "L2;LBOV",
    "L3;L4",
    "L4;L6", "L4;L7",
    ""
  )
)
levels(lin_lf$sel_lineages)
################################################################

View file

@ -0,0 +1,143 @@
#!/usr/bin/env Rscript
#########################################################
# TASK: Lineage dist plots: ggridges
# Output: 1 or 2 SVGs for PS stability
##########################################################
# Installing and loading required packages
##########################################################
getwd()
setwd("~/git/LSHTM_analysis/scripts/plotting/")
getwd()
source("Header_TT.R") # also loads all my functions
#===========
# input
#===========
# Hard-coded values disabled: they were overwritten unconditionally by the
# command-line options below, so they never took effect.
#drug = "streptomycin"
#gene = "gid"
# NOTE(review): with this source() disabled, `plotdir` and
# `merged_df2_combined` (both used below) are not created in this file -
# confirm they are provided by Header_TT.R or by the calling environment.
#source("get_plotting_dfs.R")
# Command-line spec, one row per option:
# long name, short flag, argument mask (1 = required value, 2 = optional value), type.
# NOTE(review): getopt documents the short flag as a single character; "fa" and
# "fb" are two characters - confirm only --data_file1/--data_file2 are used.
spec = matrix(c(
  "drug"       , "d" , 1, "character",
  "gene"       , "g" , 1, "character",
  "data_file1" , "fa", 2, "character",
  "data_file2" , "fb", 2, "character"
), byrow = TRUE, ncol = 4)
opt = getopt(spec)
drug = opt$drug
gene = opt$gene
infile_params = opt$data_file1
infile_metadata = opt$data_file2
# Both are scalars, so use the short-circuit `||` (the original used the
# vectorised `|`).
if (is.null(drug) || is.null(gene)) {
  stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)")
}
#=======
# output
#=======
lineage_dist_dm_om_ps = "lineage_dist_dm_om_PS.svg"
plot_lineage_dist_dm_om_ps = paste0(plotdir,"/", lineage_dist_dm_om_ps)
#========================================================================
###########################
# Data for plots
# you need merged_df2_combined or merged_df2_combined_comp
# since this is one-many relationship
# i.e the same SNP can belong to multiple lineages
# using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available, hence use df with NA
###########################
#===================
# Data for plots
#===================
# quick checks (printed)
table(merged_df2_combined$mutation_info_labels)
levels(merged_df2_combined$lineage_labels)
table(merged_df2_combined$lineage_labels)
levels(merged_df2_combined$mutation_info_labels)

# restrict to the four selected lineages
sel_lineages <- c("L1", "L2", "L3", "L4")
lin_dist_plot <- merged_df2_combined[merged_df2_combined$lineage_labels %in% sel_lineages, ]
table(lin_dist_plot$lineage_labels)
nlevels(lin_dist_plot$lineage_labels)

# refactor: drop the levels that are no longer present
lin_dist_plot$lineage_labels <- factor(lin_dist_plot$lineage_labels)
nlevels(lin_dist_plot$lineage_labels)
#-----------------------------------------------------------------------
# IMPORTANT RESULTS to put inside table or text for interactive plots
sum(table(lin_dist_plot$lineage_labels)) #{RESULT: Total number of samples for lineage}
table(lin_dist_plot$lineage_labels) #{RESULT: No of samples within lineage}
length(unique(lin_dist_plot$mutationinformation)) #{Result: No. of unique mutations selected lineages contribute to}
length(lin_dist_plot$mutationinformation)

# mutations of the full data that do not occur in the selected lineages
u2 <- unique(merged_df2_combined$mutationinformation)
u <- unique(lin_dist_plot$mutationinformation)
check <- u2[!u2 %in% u]
print(check) #{Muts not present within selected lineages}
#-----------------------------------------------------------------------
# Column to plot and its axis label: my_x_and_t[1] is passed as x_axis and
# my_x_and_t[2] as x_lab in the lineage_distP() calls below.
# Each assignment overwrites the previous one, so only the LAST active line
# takes effect (currently "affinity_scaled" / "mCSM-Lig").
# NOTE(review): the output file name ends in "_PS.svg" while the active
# selection is the ligand affinity column - confirm which one is intended.
my_x_and_t = c("duet_scaled", "mCSM-DUET")
my_x_and_t = c("foldx_scaled", "FoldX")
#my_x_and_t = c("deepddg_scaled", "DeepDDG")
my_x_and_t = c("ddg_dynamut2_scaled", "Dynamut2")
my_x_and_t = c("ddg_dynamut_scaled", "Dynamut")
my_x_and_t = c("ddg_mcsm_scaled", "mCSM")
my_x_and_t = c("ddg_sdm_scaled", "SDM")
my_x_and_t = c("ddg_duet_scaled", "DUET-d")
my_x_and_t = c("ddg_encom_scaled", "EnCOM-Stability")
my_x_and_t = c("dds_encom_scaled", "EnCOM-Flexibility")
my_x_and_t = c("mcsm_na_scaled", "mCSM-NA")
# TO DO
my_x_and_t = c("affinity_scaled", "mCSM-Lig") #ligdist< 10
#=====================
# Plot: without facet
#=====================
# distribution of the selected column per lineage, single panel
linP_dm_om <- lineage_distP(
  lin_dist_plot,
  x_axis = my_x_and_t[1],
  x_lab = my_x_and_t[2],
  y_axis = "lineage_labels",
  leg_label = "Mutation Class",
  with_facet = F
)
linP_dm_om
#=====================
# Plot: with facet
#=====================
# same plot, one panel per mutation_info_labels value, legend hidden
linP_dm_om_facet <- lineage_distP(
  lin_dist_plot,
  x_axis = my_x_and_t[1],
  x_lab = my_x_and_t[2],
  y_axis = "lineage_labels",
  with_facet = T,
  facet_wrap_var = "mutation_info_labels",
  leg_label = "Mutation Class",
  leg_pos_wf = "none",
  leg_dir_wf = "horizontal"
)
linP_dm_om_facet
#=================
# output plot:
# without facet
#=================
svg(plot_lineage_dist_dm_om_ps)
# Draw explicitly: a bare object name only renders through top-level
# auto-printing, which does not happen when this file is source()d.
print(linP_dm_om)
dev.off()

View file

@ -0,0 +1,142 @@
#!/usr/bin/env Rscript
#########################################################
# TASK: Script to format data for Logo_plots
#########################################################
#-------------------------
# choose df for logoplot
#-------------------------
logo_data <- merged_df3
#logo_data = merged_df3_comp

# quick checks (printed)
colnames(logo_data)
str(logo_data)
c1 <- unique(logo_data$position)
nrow(logo_data)
cat("No. of rows in my_data:", nrow(logo_data),
    "\nDistinct positions corresponding to snps:", length(c1),
    "\n===========================================================")
#=======================================================================
#==================
# logo data: OR
#==================
# scratch subset of the columns of interest
foo <- logo_data[, c("position",
                     "mutant_type", "duet_scaled", "or_mychisq",
                     "mut_prop_polarity", "mut_prop_water")]

# log10 of the odds ratio, stored alongside the raw value
logo_data$log10or <- log10(logo_data$or_mychisq)
logo_data_plot <- logo_data[, c("position",
                                "mutant_type", "or_mychisq", "log10or")]

# long -> wide: one row per mutant_type, one column per position, value = OR,
# missing combinations filled with 0
logo_data_plot_or <- logo_data[, c("position", "mutant_type", "or_mychisq")]
wide_df_or <- spread(logo_data_plot_or, position, or_mychisq, fill = 0.0)
wide_df_or <- as.matrix(wide_df_or)

# the first column (mutant_type) becomes the row names and is then dropped
rownames(wide_df_or) <- wide_df_or[, 1]
dim(wide_df_or)
wide_df_or <- wide_df_or[, -1]
str(wide_df_or)

# positions are the remaining column names
position_or <- as.numeric(colnames(wide_df_or))
#==================
# logo data: logOR
#==================
logo_data_plot_logor = logo_data[, c("position", "mutant_type", "log10or")]
wide_df_logor <- logo_data_plot_logor %>% spread(position, log10or, fill = 0.0)
wide_df_logor = as.matrix(wide_df_logor)
rownames(wide_df_logor) = wide_df_logor[,1]
wide_df_logor = subset(wide_df_logor, select = -c(1) )
colnames(wide_df_logor)
wide_df_logor_m = data.matrix(wide_df_logor)
rownames(wide_df_logor_m)
colnames(wide_df_logor_m)
position_logor = as.numeric(colnames(wide_df_logor_m))
#=================================
# logo data: multiple nsSNPs (>1)
#=================================
#require(data.table)
# Count the mutations (rows) at each position so that positions carrying
# more than one mutation can be subset (mut_pos_occurrence > 1).
# setDT() converts logo_data to a data.table by reference.
setDT(logo_data)[, mut_pos_occurrence := .N, by = .(position)]
table(logo_data$position)
table(logo_data$mut_pos_occurrence)
max_mut = max(table(logo_data$position))
# extract freq_pos > 1
my_data_snp = logo_data[logo_data$mut_pos_occurrence!=1,]
u = unique(my_data_snp$position)
max_mult_mut = max(table(my_data_snp$position))
# Rows whose position carries exactly one mutation (one row per such site).
# Counted explicitly instead of table(...)[[1]]: the first table entry is
# the count for the SMALLEST occurrence value, which is only "1" when at
# least one single-mutation position exists.
n_single_mut = sum(logo_data$mut_pos_occurrence == 1)
n_expected_mult = nrow(logo_data) - n_single_mut
if (nrow(my_data_snp) == n_expected_mult){
  cat("PASS: positions with multiple muts extracted"
      , "\nNo. of mutations:", nrow(my_data_snp)
      , "\nNo. of positions:", length(u)
      , "\nMax no. of muts at any position", max_mult_mut)
}else{
  cat("FAIL: positions with multiple muts could NOT be extracted"
      , "\nExpected:", n_expected_mult
      , "\nGot:", nrow(my_data_snp) )
}
cat("\nNo. of sites with only 1 mutations:", n_single_mut)
#--------------------------------------
# matrix for mutant type
# frequency of mutant type by position
#--------------------------------------
# rows: mutant amino acids, columns: positions (multi-mutation sites only)
table(my_data_snp$mutant_type, my_data_snp$position)
tab_mt = table(my_data_snp$mutant_type, my_data_snp$position)
class(tab_mt)
# unclass to convert to matrix
tab_mt = unclass(tab_mt)
tab_mt = as.matrix(tab_mt, rownames = T)
# should be TRUE
is.matrix(tab_mt)
rownames(tab_mt) #aa
colnames(tab_mt) #pos
#-------------------------------------
# matrix for wild type
# frequency of wild type by position
#-------------------------------------
# First pass counts every row, so a wild type is counted once per mutation
# at its position; this table is only displayed and is replaced below.
tab_wt = table(my_data_snp$wild_type, my_data_snp$position); tab_wt
tab_wt = unclass(tab_wt)
# remove wt duplicates
wt = my_data_snp[, c("position", "wild_type")]
wt = wt[!duplicated(wt),]
tab_wt = table(wt$wild_type, wt$position); tab_wt # should all be 1
rownames(tab_wt) #aa
# was a second rownames() call; the positions are in the column names
colnames(tab_wt) #pos
# both matrices must cover the same positions in the same order
identical(colnames(tab_mt), colnames(tab_wt))
identical(ncol(tab_mt), ncol(tab_wt))
#----------------------------------
# logo data OR: multiple nsSNPs (>1)
#----------------------------------
# Same reshaping as the OR matrix for all positions, restricted to
# my_data_snp (positions with more than one mutation).
logo_data_or_mult = my_data_snp[, c("position", "mutant_type", "or_mychisq")]
#wide_df_or = logo_data_or %>% spread(position, or_mychisq, fill = 0.0)
# absent position/mutant_type combinations are left as NA here (not 0)
wide_df_or_mult = logo_data_or_mult %>% spread(position, or_mychisq, fill = NA)
wide_df_or_mult = as.matrix(wide_df_or_mult)
# first column (mutant_type) becomes the row names and is then dropped
rownames(wide_df_or_mult) = wide_df_or_mult[,1]
wide_df_or_mult = wide_df_or_mult[,-1]
str(wide_df_or_mult)
# column names are the positions
position_or_mult = as.numeric(colnames(wide_df_or_mult))

0
scripts/plotting/opp_mcsm_muts.R Normal file → Executable file
View file

0
scripts/plotting/or_plots_combined.R Normal file → Executable file
View file

13
scripts/plotting/other_plots_combined.R Normal file → Executable file
View file

@ -35,7 +35,7 @@ plot_dr_other_combined_labelled = paste0(plotdir,"/", dr_other_combined_labell
#my_comparisons <- list( c(dr_muts_col, other_muts_col) )
my_comparisons <- list( c("DM", "OM") )
my_ats = 22# axis text size
my_ats = 22 # axis text size
my_als = 20 # axis label size
my_fls = 20 # facet label size
my_pts = 22 # plot title size
@ -45,12 +45,15 @@ my_pts = 22 # plot title size
#===========
# Plot1: PS
#===========
my_stat_ps = compare_means(param_value~mutation_info, group.by = "param_type"
, data = df_lf_ps, paired = FALSE, p.adjust.method = "BH")
# my_stat_ps = compare_means(param_value~mutation_info
# , group.by = "param_type"
# , data = df_lf_ps
# , paired = FALSE
# , p.adjust.method = "BH")
y_value = "param_value"
p1 = ggplot(df_lf_ps, aes(x = mutation_info
p1 = ggplot(lf_duet, aes(x = mutation_info
, y = eval(parse(text=y_value)) )) +
facet_wrap(~ param_type
, nrow = 1
@ -61,7 +64,7 @@ p1 = ggplot(df_lf_ps, aes(x = mutation_info
geom_point(position = position_jitterdodge(dodge.width=0.01)
, alpha = 0.5
, show.legend = FALSE
, aes(colour = factor(duet_outcome))) +
, aes(colour = duet_outcome)) +
theme(axis.text.x = element_text(size = my_ats)
, axis.text.y = element_text(size = my_ats
, angle = 0

0
scripts/plotting/output_tables.R Normal file → Executable file
View file

0
scripts/plotting/ps_plots_combined.R Normal file → Executable file
View file

View file

@ -0,0 +1,80 @@
#!/usr/bin/env Rscript
#################################################################
# TASK: Script to add bp colours ~ barplot heatmap
#################################################################
# Build the data and colour palettes used for the sub-coloured PS barplot:
# one colour group per duet_outcome, shaded by duet_scaled.
my_df = merged_df3
cols_to_select = c("mutationinformation", "drtype"
, "wild_type"
, "position"
, "mutant_type"
, "chain", "ligand_id", "ligand_distance"
, "duet_stability_change", "duet_outcome", "duet_scaled"
, "ligand_affinity_change", "ligand_outcome", "affinity_scaled"
, "ddg_foldx", "foldx_scaled", "foldx_outcome"
, "deepddg", "deepddg_outcome" # comment out as not available for pnca
, "asa", "rsa", "rd_values", "kd_values"
, "af", "or_mychisq", "pval_fisher"
, "or_fisher", "or_logistic", "pval_logistic"
, "wt_prop_water", "mut_prop_water", "wt_prop_polarity", "mut_prop_polarity"
, "wt_calcprop", "mut_calcprop")
#=======================
# Data for sub colours
# barplot: PS
#=======================
cat("\nNo. of cols to select:", length(cols_to_select))
subcols_df_ps = my_df[, cols_to_select]
cat("\nNo of unique positions for ps:"
    , length(unique(subcols_df_ps$position)))
# add pos_count col that counts the no. of nsSNPS at a position
# (setDT() converts subcols_df_ps to a data.table by reference)
setDT(subcols_df_ps)[, pos_count := .N, by = .(position)]
# duet_outcome should be a factor
if (is.factor(subcols_df_ps$duet_outcome)){
  cat("\nDuet_outcome is factor")
  table(subcols_df_ps$duet_outcome)
}else{
  cat("\nConverting duet_outcome to factor")
  subcols_df_ps$duet_outcome = as.factor(subcols_df_ps$duet_outcome)
  table(subcols_df_ps$duet_outcome)
}
# should be -1 and 1
min(subcols_df_ps$duet_scaled)
max(subcols_df_ps$duet_scaled)
# range of duet_scaled within each outcome class
tapply(subcols_df_ps$duet_scaled, subcols_df_ps$duet_outcome, min)
tapply(subcols_df_ps$duet_scaled, subcols_df_ps$duet_outcome, max)
# check unique values in normalised data
cat("\nNo. of unique values in duet scaled, no rounding:"
    , length(unique(subcols_df_ps$duet_scaled)))
# No rounding
my_grp = subcols_df_ps$duet_scaled; length(my_grp)
# Rounded version, if rounding is to be used
n = 3
subcols_df_ps$duet_scaledR = round(subcols_df_ps$duet_scaled, n)
cat("\nNo. of unique values in duet scaled", n, "places rounding:"
    , length(unique(subcols_df_ps$duet_scaledR)))
my_grp_r = subcols_df_ps$duet_scaledR # rounding
# Add grp cols: "<outcome>_<scaled value>"
# (paste0() has no `sep` argument; the former sep = "" was just pasted on
# as an extra empty string, so the result is unchanged)
subcols_df_ps$group <- paste0(subcols_df_ps$duet_outcome, "_", my_grp)
subcols_df_ps$groupR <- paste0(subcols_df_ps$duet_outcome, "_", my_grp_r)
# Call the function to create the palette based on the group defined above
# NOTE(review): "my_grp"/"my_grp_r" name the global vectors above, not
# columns of subcols_df_ps -- confirm ColourPalleteMulti() resolves them as
# intended.
subcols_ps <- ColourPalleteMulti(subcols_df_ps, "duet_outcome", "my_grp")
subcolsR_ps <- ColourPalleteMulti(subcols_df_ps, "duet_outcome", "my_grp_r")
# leading "\n" so the messages do not run into the previous output
cat("\nColour palette generated for my_grp: ", length(subcols_ps), " colours")
cat("\nColour palette generated for my_grp_r: ", length(subcolsR_ps), " colours")

View file

@ -15,44 +15,8 @@ setwd("~/git/LSHTM_analysis/scripts/plotting/")
getwd()
source("Header_TT.R")
library(ggridges)
library(plyr)
source("combining_dfs_plotting.R")
# PS combined:
# 1) merged_df2
# 2) merged_df2_comp
# 3) merged_df3
# 4) merged_df3_comp
# LIG combined:
# 5) merged_df2_lig
# 6) merged_df2_comp_lig
# 7) merged_df3_lig
# 8) merged_df3_comp_lig
# 9) my_df_u
# 10) my_df_u_lig
cat("Directories imported:"
, "\n===================="
, "\ndatadir:", datadir
, "\nindir:", indir
, "\noutdir:", outdir
, "\nplotdir:", plotdir)
cat("Variables imported:"
, "\n====================="
, "\ndrug:", drug
, "\ngene:", gene
, "\ngene_match:", gene_match
, "\nAngstrom symbol:", angstroms_symbol
, "\nNo. of duplicated muts:", dup_muts_nu
, "\nNA count for ORs:", na_count
, "\nNA count in df2:", na_count_df2
, "\nNA count in df3:", na_count_df3
, "\ndr_muts_col:", dr_muts_col
, "\nother_muts_col:", other_muts_col
, "\ndrtype_col:", resistance_col)
source("get_plotting_dfs.R")
cat("cols imported:"
, mcsm_red2, mcsm_red1, mcsm_mid, mcsm_blue1, mcsm_blue2)
@ -77,67 +41,24 @@ plot_lineage_dist_combined_dm_om_L = paste0(plotdir,"/", lineage_dist_combined
# we lose some muts and at this level, we should use
# as much info as available, hence use df with NA
###########################
# REASSIGNMENT
my_df = merged_df2
# delete variables not required
rm(my_df_u, merged_df2, merged_df2_comp, merged_df3, merged_df3_comp
, merged_df2_lig, merged_df2_comp_lig, merged_df3_lig, merged_df3_comp_lig)
# quick checks
colnames(my_df)
str(my_df)
table(my_df$mutation_info)
#===================
# Data for plots
#===================
table(my_df$lineage); str(my_df$lineage)
# select lineages 1-4
sel_lineages = c("lineage1"
, "lineage2"
, "lineage3"
, "lineage4")
#, "lineage5"
#, "lineage6"
#, "lineage7")
# works nicely with facet wrap using labeller, but not otherwise
#my_labels = c('Lineage 1'
# , 'Lineage 2'
# , 'Lineage 3'
# , 'Lineage 4')
# #, 'Lineage 5'
# #, 'Lineage 6'
# #, 'Lineage 7')
#names(my_labels) = c('lineage1'
# , 'lineage2'
# , 'lineage3'
# , 'lineage4')
# #, 'lineage5'
# #, 'lineage6'
# #, 'lineage7')
#==========================
# subset selected lineages
#==========================
df_lin = subset(my_df, subset = lineage %in% sel_lineages)
table(df_lin$lineage)
lin_dist_plot = merged_df2[merged_df2$lineage%in%c("lineage1", "lineage2", "lineage3", "lineage4"),]
table(lin_dist_plot$lineage)
#{RESULT: Total number of samples for lineage}
sum(table(df_lin$lineage))
sum(table(lin_dist_plot$lineage))
#{RESULT: No of samples within lineage}
table(df_lin$lineage)
table(lin_dist_plot$lineage)
#{Result: No. of unique mutations the 4 lineages contribute to}
length(unique(df_lin$mutationinformation))
length(unique(lin_dist_plot$mutationinformation))
u2 = unique(my_df$mutationinformation)
u = unique(df_lin$mutationinformation)
u2 = unique(lin_dist_plot$mutationinformation)
u = unique(lin_dist_plot$mutationinformation)
#{Result:Muts not present within selected lineages}
check = u2[!u2%in%u]; print(check)
@ -148,37 +69,38 @@ check = u2[!u2%in%u]; print(check)
# from "plyr"
#==================
#{Result:No of samples in selected lineages}
table(df_lin$lineage)
table(lin_dist_plot$lineage)
df_lin$lineage_labels = mapvalues(df_lin$lineage
lin_dist_plot$lineage_labels = mapvalues(lin_dist_plot$lineage
, from = c("lineage1","lineage2", "lineage3", "lineage4")
, to = c("Lineage 1", "Lineage 2", "Lineage 3", "Lineage 4"))
table(df_lin$lineage_labels)
table(lin_dist_plot$lineage_labels)
table(df_lin$lineage_labels) == table(df_lin$lineage)
table(lin_dist_plot$lineage_labels) == table(lin_dist_plot$lineage)
#========================
# mutation_info: labels
#========================
#{Result:No of DM and OM muts in selected lineages}
table(df_lin$mutation_info)
table(lin_dist_plot$mutation_info)
df_lin$mutation_info_labels = ifelse(df_lin$mutation_info == dr_muts_col, "DM", "OM")
table(df_lin$mutation_info_labels)
table(df_lin$mutation_info) == table(df_lin$mutation_info_labels)
lin_dist_plot$mutation_info_labels = ifelse(lin_dist_plot$mutation_info == dr_muts_col
, "DM", "OM")
table(lin_dist_plot$mutation_info_labels)
table(lin_dist_plot$mutation_info) == table(lin_dist_plot$mutation_info_labels)
#========================
# duet_outcome: labels
#========================
#{Result: No. of D and S mutations in selected lineages}
table(df_lin$duet_outcome)
table(lin_dist_plot$duet_outcome)
df_lin$duet_outcome_labels = ifelse(df_lin$duet_outcome == "Destabilising", "D", "S")
table(df_lin$duet_outcome_labels)
lin_dist_plot$duet_outcome_labels = ifelse(lin_dist_plot$duet_outcome == "Destabilising"
, "D", "S")
table(lin_dist_plot$duet_outcome_labels)
table(df_lin$duet_outcome) == table(df_lin$duet_outcome_labels)
table(lin_dist_plot$duet_outcome) == table(lin_dist_plot$duet_outcome_labels)
#=======================
@ -198,25 +120,14 @@ table(df_lin$duet_outcome) == table(df_lin$duet_outcome_labels)
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#==========================
# Distribution plots
#============================
#%%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
df <- df_lin
#%%%%%%%%%%%%%%%%%%%%%%%%%
rm(df_lin)
#******************
# generate distribution plot of lineages
#******************
# 2 : ggridges (good!)
my_ats = 15 # axis text size
my_als = 20 # axis label size
n_colours = length(unique(df$duet_scaled))
n_colours = length(unique(lin_dist_plot$duet_scaled))
my_palette <- colorRampPalette(c(mcsm_red2, mcsm_red1, mcsm_mid, mcsm_blue1, mcsm_blue2))(n = n_colours+1)
#=======================================
@ -232,17 +143,23 @@ my_palette <- colorRampPalette(c(mcsm_red2, mcsm_red1, mcsm_mid, mcsm_blue1, mcs
#plot_lineage_dist_duet_f
#svg(plot_lineage_dist_duet_f)
p1 = ggplot(df, aes(x = duet_scaled
, y = duet_outcome))+
p1 = ggplot(lin_dist_plot, aes(x = duet_scaled
#, y = duet_outcome
, y = mutation_info_labels
))+
geom_density_ridges_gradient(aes(fill = ..x..)
#, jittered_points = TRUE
, scale = 3
, size = 0.3 ) +
facet_wrap( ~lineage_labels
#~mutation_info_labels
# ~mutation_info_labels
# , scales = "free"
# , labeller = labeller(lineage = my_labels)
) +
coord_cartesian( xlim = c(-1, 1)) +
#coord_cartesian( xlim = c(-1, 1)) +
scale_x_continuous(expand = c(0.01, 0)) +
scale_fill_gradientn(colours = my_palette
, name = "DUET"
#, breaks = c(-1, 0, 1)
@ -264,8 +181,8 @@ p1 = ggplot(df, aes(x = duet_scaled
#, legend.title = element_text(size = my_als-6)
, legend.title = element_blank()
, legend.position = c(-0.08, 0.41)
#, legend.direction = "horizontal"
#, legend.position = "left"
, legend.direction = "horizontal"
, legend.position = "top"
)+
labs(x = "DUET")
@ -286,13 +203,14 @@ p1
#plot_lineage_dist_duet_dm_om
#svg(plot_lineage_dist_duet_dm_om)
p2 = ggplot(df, aes(x = duet_scaled
p2 = ggplot(lin_dist_plot, aes(x = duet_scaled
, y = lineage_labels))+
geom_density_ridges(aes(fill = factor(mutation_info_labels))
, scale = 3
, size = 0.3
, alpha = 0.8) +
coord_cartesian( xlim = c(-1, 1)) +
scale_x_continuous(expand = c(0.01, 0)) +
#coord_cartesian( xlim = c(-1, 1)) +
scale_fill_manual(values = c("#E69F00", "#999999")) +
theme(axis.text.x = element_text(size = my_ats
, angle = 90
@ -325,14 +243,18 @@ p2
#plot_lineage_dist_duet_nf
#svg(plot_lineage_dist_duet_nf)
p3 = ggplot(df, aes(x = duet_scaled
p3 = ggplot(lin_dist_plot, aes(x = duet_scaled
, y = lineage_labels))+
geom_density_ridges_gradient(aes(fill = ..x..)
#, jittered_points = TRUE
, scale = 3
, size = 0.3 ) +
coord_cartesian( xlim = c(-1, 1)) +
scale_fill_gradientn(colours = my_palette, name = "DUET") +
# geom_density_ridges_gradient(aes(fill = ..x..)
# #, jittered_points = TRUE
# , scale = 3
# , size = 0.3 ) +
geom_density_ridges()+
#facet_wrap (~mutation_info_labels) +
#coord_cartesian( xlim = c(-1, 1)) +
scale_x_continuous(expand = c(0.01, 0)) +
#scale_fill_gradientn(colours = my_palette, name = "DUET") +
theme(axis.text.x = element_text(size = my_ats
, angle = 90
, hjust = 1

View file

@ -0,0 +1,117 @@
#!/usr/bin/env Rscript
# Didn't end up using it: sorted it at the source
# .py script to combine all dfs to output all_params
#################################################################
# TASK: Script to add all other dfs to merged_df2 and merged_df3
#################################################################
# Combine other dfs:
# dynamut_df, dynamut2_df, mcsm_na_df,
# perhaps : deepddg and mcsm ppi (for embb)
################################################################
# read other files
# Input paths are built from the globals `drug` and `gene`, which must be
# set before this script is sourced.
infilename_dynamut = paste0("~/git/Data/", drug, "/output/dynamut_results/", gene
, "_complex_dynamut_norm.csv")
infilename_dynamut2 = paste0("~/git/Data/", drug, "/output/dynamut_results/dynamut2/", gene
, "_complex_dynamut2_norm.csv")
infilename_mcsm_na = paste0("~/git/Data/", drug, "/output/mcsm_na_results/", gene
, "_complex_mcsm_na_norm.csv")
infilename_mcsm_f_snps <- paste0("~/git/Data/", drug, "/output/", gene
, "_mcsm_formatted_snps.csv")
dynamut_df = read.csv(infilename_dynamut)
dynamut2_df = read.csv(infilename_dynamut2)
mcsm_na_df = read.csv(infilename_mcsm_na)
# the snps file is read without a header row; its single column is named
# "mutationinformation" on the next line
mcsm_f_snps = read.csv(infilename_mcsm_f_snps, header = F)
names(mcsm_f_snps) = "mutationinformation"
#=================================
# Check with intersect() how many columns the dfs have in common, but use
# the mutationinformation column to be on the safe side.
# c1: no. of columns shared by dynamut_df and dynamut2_df
# c2: no. of columns shared by dynamut2_df and mcsm_na_df
c1 = length(intersect(names(dynamut_df), names(dynamut2_df)))
c2 = length(intersect(names(dynamut2_df), names(mcsm_na_df)))
if (c1 == 1 && c2 == 1) {
  n_common = 1
}else{
  # n_common is defined on this path too (previously left undefined), and
  # the message also covers the "no common col" case
  n_common = NA
  cat("\nExpected exactly one common col, found:"
      , c1, "(dynamut vs dynamut2) and"
      , c2, "(dynamut2 vs mcsm_na); inspect before merging!")
}
# delete chain from dynamut2_df
#dynamut2_df = subset(dynamut2_df, select = -chain)
# quick checks: no. of cols, col names and no. of rows of each df
lapply(list(dynamut_df
            , dynamut2_df
            , mcsm_na_df), ncol)
lapply(list(dynamut_df
            , dynamut2_df
            , mcsm_na_df), colnames)
lapply(list(dynamut_df
            , dynamut2_df
            , mcsm_na_df), nrow)
ncols_comb = lapply(list(dynamut_df
                         , dynamut2_df
                         , mcsm_na_df), ncol)
#---------------------------------
# Combine 1: all other params dfs
#---------------------------------
# inner_join() is called without `by`, so each join is made on ALL columns
# the two dfs have in common
combined_dfs = Reduce(inner_join, list(dynamut_df
                                       , dynamut2_df
                                       , mcsm_na_df))
# Reduce("+", ncols_comb)
#-----------------------------------------
# Combine 2: combine1 result + merged_df2
#-----------------------------------------
# drop from combined_dfs every column already present in merged_df2,
# except the merge key
drop_cols = intersect(names(combined_dfs), names(merged_df2))
drop_cols = drop_cols[! drop_cols %in% c("mutationinformation")]
combined_dfs_f = combined_dfs[, !colnames(combined_dfs)%in%drop_cols]
nrow(combined_dfs_f); nrow(merged_df2)
ncol(combined_dfs_f); ncol(merged_df2)
#-----------------------------------------
# Combined merged_df2
#-----------------------------------------
merged_df2_combined = merge(merged_df2
                            , combined_dfs_f
                            , by = "mutationinformation"
)
# the key column is shared, hence the -1
expected_ncols = ncol(combined_dfs_f)+ ncol(merged_df2) - 1
if ( nrow(merged_df2_combined) == nrow(merged_df2) && ncol(merged_df2_combined) == expected_ncols ){
  cat("\nPASS: merged_df2 combined with other parameters dfs."
      , "\nUse this for lineage distribution plots")
}else{
  # stop() instead of quit(): quit() ends the whole R session (also an
  # interactive one) and, by default, with exit status 0
  stop("FAIL: merged_df2 didn't combine successfully with other parameters dfs"
       , "\nExpected rows: ", nrow(merged_df2), ", got: ", nrow(merged_df2_combined)
       , "\nExpected cols: ", expected_ncols, ", got: ", ncol(merged_df2_combined))
}
rm(combined_dfs, combined_dfs_f)
#================================
# combined data
# short_df ps: ~ merged_df3
# TODO: later integrate properly
#================================
#-----------------------------------------
# Combined merged_df3: one row per unique mutation from merged_df2_combined
#-----------------------------------------
merged_df3_combined = merged_df2_combined[!duplicated(merged_df2_combined$mutationinformation),]

View file

@ -0,0 +1,470 @@
#!/usr/bin/env Rscript
#########################################################
# TASK: Script to format data for dm om plots:
# generating LF data
# sourced by get_plotting_dfs.R
#########################################################
# working dir and loading libraries
# getwd()
# setwd("~/git/LSHTM_analysis/scripts/plotting")
# getwd()
# make cmd
# globals
# drug = "streptomycin"
# gene = "gid"
# source("get_plotting_dfs.R")
#=======================================================================
# MOVE TO COMBINE or singular file for deepddg
#
# cols_to_select = c("mutation", "mutationinformation"
# , "wild_type", "position", "mutant_type"
# , "mutation_info")
#
# merged_df3_short = merged_df3[, cols_to_select]
# infilename_mcsm_f_snps <- paste0("~/git/Data/", drug, "/output/", gene
# , "_mcsm_formatted_snps.csv")
#
# mcsm_f_snps<- read.csv(infilename_mcsm_f_snps, header = F)
# names(mcsm_f_snps) <- "mutationinformation"
# write merged_df3 to generate structural figure on chimera
#write.csv(merged_df3_short, "merged_df3_short.csv")
#========================================================================
#========================================================================
# cols to select
# Columns taken from each source df before merging; every selection keeps
# "mutationinformation", which is the column the dfs are merged on.
# NOTE(review): LigDist_colname is defined outside this script -- presumably
# the name of the ligand distance column; confirm.
cols_mcsm_df <- merged_df3[, c("mutationinformation", "mutation"
, "mutation_info", "position"
, LigDist_colname
, "duet_stability_change", "duet_scaled", "duet_outcome"
, "ligand_affinity_change", "affinity_scaled", "ligand_outcome"
, "ddg_foldx", "foldx_scaled", "foldx_outcome"
, "deepddg", "deepddg_scaled", "deepddg_outcome"
, "asa", "rsa"
, "rd_values", "kd_values"
, "log10_or_mychisq", "neglog_pval_fisher", "af")]
cols_mcsm_na_df <- mcsm_na_df[, c("mutationinformation"
, "mcsm_na_affinity", "mcsm_na_scaled"
, "mcsm_na_outcome")]
# dynamut_df is used in full (no column selection); only dynamut2_df is
# subset here
cols_dynamut2_df <- dynamut2_df[, c("mutationinformation"
, "ddg_dynamut2", "ddg_dynamut2_scaled"
, "ddg_dynamut2_outcome")]
# Total no. of columns over the four dfs to be merged
n_comb_cols = length(cols_mcsm_df) + length(cols_mcsm_na_df) +
  length(dynamut_df) + length(cols_dynamut2_df); n_comb_cols
# Column(s) common to all four dfs
i1<- intersect(names(cols_mcsm_df), names(cols_mcsm_na_df))
i2<- intersect(names(dynamut_df), names(cols_dynamut2_df))
merging_cols <- intersect(i1, i2)
cat("\nmerging_cols:", merging_cols)
# identical() always yields a single TRUE/FALSE; `==` gives a zero-length
# or multi-element condition (an error in if()) when there is no common
# column or more than one.
if (!identical(merging_cols, "mutationinformation")) {
  stop("Expected 'mutationinformation' to be the only col common to all dfs, found: "
       , paste(merging_cols, collapse = ", "))
}
cat("\nStage 1: Found common col between dfs, checking values in it...")
# every mutation in mcsm_f_snps must be present in each df
c1 <- all(mcsm_f_snps[[merging_cols]]%in%cols_mcsm_df[[merging_cols]])
c2 <- all(mcsm_f_snps[[merging_cols]]%in%cols_mcsm_na_df[[merging_cols]])
c3 <- all(mcsm_f_snps[[merging_cols]]%in%dynamut_df[[merging_cols]])
c4 <- all(mcsm_f_snps[[merging_cols]]%in%cols_dynamut2_df[[merging_cols]])
cols_check <- c(c1, c2, c3, c4)
# the merge col is shared by all dfs, so it is counted only once
expected_cols = n_comb_cols - ( length(cols_check) - 1)
# Fail here rather than carrying on: previously a failed check skipped the
# merge silently and the script continued with comb_df_s undefined (or
# left over from an earlier run in the same session).
if (!all(cols_check)){
  stop("Not all mutations in mcsm_f_snps are present in the dfs to merge"
       , " (mcsm, mcsm_na, dynamut, dynamut2): "
       , paste(cols_check, collapse = ", "))
}
cat("\nStage 2: Proceeding with merging dfs:\n")
# inner_join() is called without `by`: each join uses all columns the two
# dfs have in common
comb_df <- Reduce(inner_join, list(cols_mcsm_df
                                   , cols_mcsm_na_df
                                   , dynamut_df
                                   , cols_dynamut2_df))
comb_df_s = arrange(comb_df, position)
# if ( nrow(comb_df_s) == nrow(mcsm_f_snps) && ncol(comb_df_s) == expected_cols) {
# cat("\nStage3, PASS: dfs merged sucessfully"
# , "\nnrow of merged_df: ", nrow(comb_df_s)
# , "\nncol of merged_df:", ncol(comb_df_s))
# }
#names(comb_df_s)
#=======================================================================
# Columns holding categorical labels: the *_outcome cols and mutation_info
fact_cols = colnames(comb_df_s)[grepl( "_outcome|_info", colnames(comb_df_s) )]
fact_cols
# classes before conversion
lapply(comb_df_s[, fact_cols], class)
# Convert unconditionally: as.factor() returns an existing factor unchanged.
# (The former "is any col still character?" re-check ran after this
# conversion and could never be TRUE, so it has been removed.)
comb_df_s[, fact_cols] <- lapply(comb_df_s[, fact_cols], as.factor)
# classes after conversion: should all be "factor"
lapply(comb_df_s[, fact_cols], class)
#=======================================================================
table(comb_df_s$mutation_info)
# further checks to make sure dr and other muts are indeed unique,
# i.e. no mutation is labelled both as dr and as other
dr_muts = comb_df_s[comb_df_s$mutation_info == dr_muts_col,]
dr_muts_names = unique(dr_muts$mutation)
other_muts = comb_df_s[comb_df_s$mutation_info == other_muts_col,]
other_muts_names = unique(other_muts$mutation)
# Direct overlap test. The former table(x %in% y)[[1]] == length(x) check
# took the first table entry to be the FALSE count, but when every value is
# shared the only entry is the TRUE count, so a complete overlap was
# reported as PASS (and an empty group raised a subscript error).
shared_muts = intersect(dr_muts_names, other_muts_names)
if ( length(shared_muts) == 0 ){
  cat("PASS: dr and other muts are indeed unique")
}else{
  # stop() instead of quit(): quit() ends the whole R session and, by
  # default, with exit status 0
  stop("FAIL: dr and others muts are NOT unique! Shared muts: "
       , paste(shared_muts, collapse = ", "))
}
# pretty display names i.e. labels to reduce major code duplication later
# foo_cnames: lookup of column names before (old_name) and after renaming
foo_cnames = data.frame(colnames(comb_df_s))
names(foo_cnames) <- "old_name"
# delta_symbol and angstroms_symbol are defined outside this script
stability_suffix <- paste0(delta_symbol, delta_symbol, "G")
flexibility_suffix <- paste0(delta_symbol, delta_symbol, "S")
# *_dn: display names; these become the column names of comb_df_sl below
# and are used further down to select columns
lig_dn = paste0("Ligand distance (", angstroms_symbol, ")"); lig_dn
duet_dn = paste0("DUET ", stability_suffix); duet_dn
foldx_dn = paste0("FoldX ", stability_suffix); foldx_dn
deepddg_dn = paste0("Deepddg " , stability_suffix); deepddg_dn
mcsm_na_dn = paste0("mCSM-NA affinity ", stability_suffix); mcsm_na_dn
dynamut_dn = paste0("Dynamut ", stability_suffix); dynamut_dn
dynamut2_dn = paste0("Dynamut2 " , stability_suffix); dynamut2_dn
encom_ddg_dn = paste0("EnCOM " , stability_suffix); encom_ddg_dn
encom_dds_dn = paste0("EnCOM " , flexibility_suffix ); encom_dds_dn
sdm_dn = paste0("SDM " , stability_suffix); sdm_dn
mcsm_dn = paste0("mCSM " , stability_suffix ); mcsm_dn
# Change colnames of some columns using datatable
# NOTE(review): setnames() renames by reference and `comb_df_sl = comb_df_s`
# does not make a copy, so comb_df_s (and possibly foo_cnames$old_name)
# presumably ends up with the new names as well -- confirm, and use
# data.table::copy() here if the original names must be preserved.
comb_df_sl = comb_df_s
names(comb_df_sl)
# old[i] is renamed to new[i]: the two vectors must stay in the same order
# NOTE(review): the last two entries rename the unscaled "ddg_sdm" and
# "ddg_mcsm" while every other tool uses its *_scaled column -- confirm this
# is intended.
setnames(comb_df_sl
, old = c("asa", "rsa", "rd_values", "kd_values"
, "log10_or_mychisq", "neglog_pval_fisher", "af"
, LigDist_colname
, "duet_scaled"
, "foldx_scaled"
, "deepddg_scaled"
, "mcsm_na_scaled"
, "ddg_dynamut_scaled"
, "ddg_dynamut2_scaled"
, "ddg_encom_scaled"
, "dds_encom_scaled"
, "ddg_sdm"
, "ddg_mcsm")
, new = c("ASA", "RSA", "RD", "KD"
, "Log10 (OR)", "-Log (P)", "MAF"
, lig_dn
, duet_dn
, foldx_dn
, deepddg_dn
, mcsm_na_dn
, dynamut_dn
, dynamut2_dn
, encom_ddg_dn
, encom_dds_dn
, sdm_dn
, mcsm_dn)
)
# second column of the lookup: column names after renaming
foo_cnames <- cbind(foo_cnames, colnames(comb_df_sl))
# some more pretty labels
# relabel the mutation_info factor levels to the short forms "DM" / "OM"
table(comb_df_sl$mutation_info)
levels(comb_df_sl$mutation_info)[levels(comb_df_sl$mutation_info)==dr_muts_col] <- "DM"
levels(comb_df_sl$mutation_info)[levels(comb_df_sl$mutation_info)==other_muts_col] <- "OM"
table(comb_df_sl$mutation_info)
#######################################################################
#======================
# Selecting dfs
# with appropriate cols
#=======================
# Columns placed BEFORE the tool-specific columns in every wide df;
# these are kept as identifier columns in the long format data
static_cols_start = c("mutationinformation"
, "position"
, "mutation"
, "mutation_info")
# Columns placed AFTER the tool-specific columns in every wide df;
# these are gathered into param_type/param_value together with the
# tool's score column
static_cols_end = c(lig_dn
, "ASA"
, "RSA"
, "RD"
, "KD")
# ordering is important!
# The long format code below takes the first length(static_cols_start) + 1
# columns as identifier columns and gathers the column RANGE running from
# the tool's score column to the last entry of static_cols_end, so both
# depend on the column order defined here.
#########################################################################
#==========================================================================
# Long format (LF) data: one wide df (wf_*) and one long df (lf_*) per tool
#==========================================================================
# make_lf_data(): select the columns for one tool and reshape to long format.
#   wide_df     : df containing static_cols_start, outcome_col, value_col
#                 and static_cols_end (here: comb_df_sl)
#   outcome_col : name of the tool's outcome column, e.g. "duet_outcome"
#   value_col   : display name of the tool's score column, e.g. duet_dn
# Reads the globals static_cols_start and static_cols_end.
# Returns a list with: cols_to_select, wf, pivot_cols, expected_rows_lf, lf.
# Stops with an error naming the tool if the long df does not have the
# expected no. of rows. (The ten copy-pasted sections this replaces all
# reported failures as "duet" and called quit(), which ends the R session
# with exit status 0.)
make_lf_data <- function(wide_df, outcome_col, value_col){
  cols_sel = c(static_cols_start, outcome_col, value_col, static_cols_end)
  wf = wide_df[, cols_sel]
  # identifier cols: static_cols_start + the outcome col
  pivot_cols = cols_sel[1: (length(static_cols_start) + 1)]
  # every non-identifier col contributes nrow(wf) rows to the long df
  n_expected = nrow(wf) * (length(wf) - length(pivot_cols))
  # gather the col range: tool score ... last static end col
  lf = gather(wf
              , key = param_type
              , value = param_value
              , all_of(value_col):tail(static_cols_end,1)
              , factor_key = TRUE)
  if (nrow(lf) != n_expected){
    stop("FAIL: long format data could not be created for ", value_col
         , "\nExpected rows: ", n_expected
         , "\nGot: ", nrow(lf))
  }
  cat("\nPASS: long format data created for ", value_col)
  list(cols_to_select = cols_sel
       , wf = wf
       , pivot_cols = pivot_cols
       , expected_rows_lf = n_expected
       , lf = lf)
}
# The global names created below are the same as before.
#==============
# DUET: LF
#==============
res_lf = make_lf_data(comb_df_sl, "duet_outcome", duet_dn)
cols_to_select_duet = res_lf$cols_to_select
wf_duet = res_lf$wf
pivot_cols_duet = res_lf$pivot_cols
lf_duet = res_lf$lf
#==============
# FoldX: LF
#==============
res_lf = make_lf_data(comb_df_sl, "foldx_outcome", foldx_dn)
cols_to_select_foldx = res_lf$cols_to_select
wf_foldx = res_lf$wf
pivot_cols_foldx = res_lf$pivot_cols
lf_foldx = res_lf$lf
#==============
# Deepddg: LF
#==============
res_lf = make_lf_data(comb_df_sl, "deepddg_outcome", deepddg_dn)
cols_to_select_deepddg = res_lf$cols_to_select
wf_deepddg = res_lf$wf
pivot_cols_deepddg = res_lf$pivot_cols
lf_deepddg = res_lf$lf
#==============
# mCSM-NA: LF
#==============
res_lf = make_lf_data(comb_df_sl, "mcsm_na_outcome", mcsm_na_dn)
cols_to_select_mcsm_na = res_lf$cols_to_select
wf_mcsm_na = res_lf$wf
pivot_cols_mcsm_na = res_lf$pivot_cols
lf_mcsm_na = res_lf$lf
#==============
# Dynamut: LF
#==============
res_lf = make_lf_data(comb_df_sl, "ddg_dynamut_outcome", dynamut_dn)
cols_to_select_dynamut = res_lf$cols_to_select
wf_dynamut = res_lf$wf
pivot_cols_dynamut = res_lf$pivot_cols
lf_dynamut = res_lf$lf
#==============
# Dynamut2: LF
#==============
res_lf = make_lf_data(comb_df_sl, "ddg_dynamut2_outcome", dynamut2_dn)
cols_to_select_dynamut2 = res_lf$cols_to_select
wf_dynamut2 = res_lf$wf
pivot_cols_dynamut2 = res_lf$pivot_cols
lf_dynamut2 = res_lf$lf
#==============
# EnCOM ddg: LF
#==============
res_lf = make_lf_data(comb_df_sl, "ddg_encom_outcome", encom_ddg_dn)
cols_to_select_encomddg = res_lf$cols_to_select
wf_encomddg = res_lf$wf
pivot_cols_encomddg = res_lf$pivot_cols
lf_encomddg = res_lf$lf
#==============
# EnCOM dds: LF
#==============
res_lf = make_lf_data(comb_df_sl, "dds_encom_outcome", encom_dds_dn)
cols_to_select_encomdds = res_lf$cols_to_select
wf_encomdds = res_lf$wf
pivot_cols_encomdds = res_lf$pivot_cols
lf_encomdds = res_lf$lf
#==============
# SDM: LF
#==============
res_lf = make_lf_data(comb_df_sl, "ddg_sdm_outcome", sdm_dn)
cols_to_select_sdm = res_lf$cols_to_select
wf_sdm = res_lf$wf
pivot_cols_sdm = res_lf$pivot_cols
lf_sdm = res_lf$lf
#==============
# mCSM: LF
#==============
res_lf = make_lf_data(comb_df_sl, "ddg_mcsm_outcome", mcsm_dn)
cols_to_select_mcsm = res_lf$cols_to_select
wf_mcsm = res_lf$wf
pivot_cols_mcsm = res_lf$pivot_cols
lf_mcsm = res_lf$lf
# as before, expected_rows_lf ends up holding the value for the last tool
expected_rows_lf = res_lf$expected_rows_lf
rm(res_lf)
############################################################################

0
scripts/plotting/resolving_ambiguous_muts.R Normal file → Executable file
View file

View file

@ -112,6 +112,30 @@ note:
- fa flag has default if not supplied
- fb flag has default if not supplied
#====================================
# lineage_basic_barplots_combined.R
#====================================
#-----------------------------------------------------------------------
./lineage_basic_barplots_combined.R -d streptomycin -g gid
#-----------------------------------------------------------------------
It replaces (and has an added diversity plot)
## lineage_basic_barplot.R
These have been moved to redundant/
sources:
## get_plotting_dfs.R
## functions/bp_lineage.R
outputs: 1 svg in the plotdir
## basic_lineage_barplots_combined.svg
note:
- fa flag has default if not supplied
- fb flag has default if not supplied
########################################################################
# TODO
Delete: dirs.R