Merge branch 'gidb_dev' (including merge conflict while after adding CLI arguments for mcsm_na/format_results_mcsm_na.py)
This commit is contained in:
commit
e2bc1cdde1
70 changed files with 4048 additions and 819 deletions
2
config/gid.R
Normal file
2
config/gid.R
Normal file
|
@ -0,0 +1,2 @@
|
|||
gene = "gid"
|
||||
drug = "streptomycin"
|
0
dynamut/format_results_dynamut.py
Normal file → Executable file
0
dynamut/format_results_dynamut.py
Normal file → Executable file
0
dynamut/format_results_dynamut2.py
Normal file → Executable file
0
dynamut/format_results_dynamut2.py
Normal file → Executable file
11
dynamut/notes.txt
Normal file
11
dynamut/notes.txt
Normal file
|
@ -0,0 +1,11 @@
|
|||
Dynamut was painfully run for gid, part manually, part programatically!
|
||||
|
||||
However, it was decided to ditch that and only run Dynamut2 for future targets
|
||||
|
||||
Dynamut2 was run through the website in batches of 50 for
|
||||
katG: 17 batches (00..16)
|
||||
rpoB: 23 batches (00..22)
|
||||
alr: 6 batches (00..05)
|
||||
|
||||
However, the use of API was made for rpoB batches (09-22) from 13 Oct 2021
|
||||
as jobs started to flake and fail through the website!
|
49
dynamut/run_format_results_dynamut.py
Normal file → Executable file
49
dynamut/run_format_results_dynamut.py
Normal file → Executable file
|
@ -20,8 +20,45 @@ from format_results_dynamut2 import *
|
|||
# variables
|
||||
# TODO: add cmd line args
|
||||
|
||||
gene = 'gid'
|
||||
drug = 'streptomycin'
|
||||
#gene =
|
||||
#drug =
|
||||
|
||||
#%% command line args
|
||||
arg_parser = argparse.ArgumentParser()
|
||||
arg_parser.add_argument('-d', '--drug', help='drug name (case sensitive)', default = None)
|
||||
arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = None)
|
||||
arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data')
|
||||
arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + <drug> + input')
|
||||
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + <drug> + output')
|
||||
#arg_parser.add_argument('-m', '--make_dirs', help = 'Make dir for input and output', action='store_true') # should be handled elsewhere!
|
||||
|
||||
arg_parser.add_argument('--debug', action ='store_true', help = 'Debug Mode')
|
||||
|
||||
args = arg_parser.parse_args()
|
||||
#=======================================================================
|
||||
#%% variable assignment: input and output paths & filenames
|
||||
drug = args.drug
|
||||
gene = args.gene
|
||||
datadir = args.datadir
|
||||
indir = args.input_dir
|
||||
outdir = args.output_dir
|
||||
#make_dirs = args.make_dirs
|
||||
|
||||
#%% input and output dirs and files
|
||||
#=======
|
||||
# dirs
|
||||
#=======
|
||||
if not datadir:
|
||||
datadir = homedir + '/' + 'git/Data'
|
||||
|
||||
if not indir:
|
||||
indir = datadir + '/' + drug + '/input'
|
||||
|
||||
if not outdir:
|
||||
outdir = datadir + '/' + drug + '/output'
|
||||
|
||||
#%%=====================================================================
|
||||
|
||||
datadir = homedir + '/git/Data'
|
||||
indir = datadir + '/' + drug + '/input'
|
||||
outdir = datadir + '/' + drug + '/output'
|
||||
|
@ -29,12 +66,12 @@ outdir_dynamut = outdir + '/dynamut_results/'
|
|||
outdir_dynamut2 = outdir + '/dynamut_results/dynamut2/'
|
||||
|
||||
# Input file
|
||||
infile_dynamut = outdir_dynamut + gene + '_dynamut_all_output_clean.csv'
|
||||
infile_dynamut2 = outdir_dynamut2 + gene + '_dynamut2_output_combined_clean.csv'
|
||||
#infile_dynamut = outdir_dynamut + gene.lower() + '_dynamut_all_output_clean.csv'
|
||||
infile_dynamut2 = outdir_dynamut2 + gene.lower() + '_dynamut2_output_combined_clean.csv'
|
||||
|
||||
# Formatted output filename
|
||||
outfile_dynamut_f = outdir_dynamut2 + gene + '_complex_dynamut_norm.csv'
|
||||
outfile_dynamut2_f = outdir_dynamut2 + gene + '_complex_dynamut2_norm.csv'
|
||||
#outfile_dynamut_f = outdir_dynamut2 + gene + '_dynamut_norm.csv'
|
||||
outfile_dynamut2_f = outdir_dynamut2 + gene.lower() + '_dynamut2_norm.csv'
|
||||
|
||||
#===============================
|
||||
# CALL: format_results_dynamut
|
||||
|
|
|
@ -17,8 +17,8 @@ my_host = 'http://biosig.unimelb.edu.au'
|
|||
#headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}
|
||||
|
||||
# TODO: add cmd line args
|
||||
#gene = 'gid'
|
||||
drug = 'streptomycin'
|
||||
# gene =
|
||||
# drug =
|
||||
datadir = homedir + '/git/Data/'
|
||||
indir = datadir + drug + '/input/'
|
||||
outdir = datadir + drug + '/output/'
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
#!/bin/bash
|
||||
|
||||
# FIXME: This is written for expediency to kickstart running dynamut and mcsm-NA
|
||||
# FIXME: This is written for expediency to kickstart running dynamut, mcsm-PPI2 (batch pf 50) and mCSM-NA (batch of 20)
|
||||
|
||||
# Usage: ~/git/LSHTM_analysis/dynamut/split_csv.sh <input file> <output dir> <chunk size in lines>
|
||||
# copy your snp file to split into the dynamut dir
|
||||
|
@ -12,8 +12,13 @@ CHUNK=$3
|
|||
mkdir -p ${OUTDIR}/${CHUNK}
|
||||
cd ${OUTDIR}/${CHUNK}
|
||||
|
||||
# makes the 2 dirs, hence ../..
|
||||
split ../../${INFILE} -l ${CHUNK} -d snp_batch_
|
||||
|
||||
# use case
|
||||
#~/git/LSHTM_analysis/dynamut/split_csv.sh gid_mcsm_formatted_snps.csv snp_batches 50
|
||||
#~/git/LSHTM_analysis/dynamut/split_csv.sh embb_mcsm_formatted_snps.csv snp_batches 50
|
||||
#~/git/LSHTM_analysis/dynamut/split_csv.sh pnca_mcsm_formatted_snps.csv snp_batches 50
|
||||
#~/git/LSHTM_analysis/dynamut/split_csv.sh katg_mcsm_formatted_snps.csv snp_batches 50 #Date: 20/09/2021
|
||||
|
||||
# add .txt to the files
|
||||
|
|
37
dynamut/split_csv_chain.sh
Executable file
37
dynamut/split_csv_chain.sh
Executable file
|
@ -0,0 +1,37 @@
|
|||
#!/bin/bash
|
||||
|
||||
# FIXME: This is written for expediency to kickstart running dynamut, mcsm-PPI2 (batch pf 50) and mCSM-NA (batch of 20)
|
||||
|
||||
# Usage: ~/git/LSHTM_analysis/dynamut/split_csv.sh <input file> <output dir> <chunk size in lines>
|
||||
# copy your snp file to split into the dynamut dir
|
||||
# use sed to add chain ID to snp file and then split to avoid post processing
|
||||
|
||||
INFILE=$1
|
||||
OUTDIR=$2
|
||||
CHUNK=$3
|
||||
|
||||
mkdir -p ${OUTDIR}/${CHUNK}/chain_added
|
||||
cd ${OUTDIR}/${CHUNK}/chain_added
|
||||
|
||||
# makes the 3 dirs, hence ../..
|
||||
split ../../../${INFILE} -l ${CHUNK} -d snp_batch_
|
||||
|
||||
########################################################################
|
||||
# use cases
|
||||
# Date: 20/09/2021
|
||||
# sed -e 's/^/A /g' katg_mcsm_formatted_snps.csv > katg_mcsm_formatted_snps_chain.csv
|
||||
#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh katg_mcsm_formatted_snps_chain.csv snp_batches 50
|
||||
|
||||
# Date: 01/10/2021
|
||||
# sed -e 's/^/A /g' rpob_mcsm_formatted_snps.csv > rpob_mcsm_formatted_snps_chain.csv
|
||||
#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh rpob_mcsm_formatted_snps_chain.csv snp_batches 50
|
||||
|
||||
# Date: 02/10/2021
|
||||
# sed -e 's/^/A /g' alr_mcsm_formatted_snps.csv > alr_mcsm_formatted_snps_chain.csv
|
||||
#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh alr_mcsm_formatted_snps_chain.csv snp_batches 50
|
||||
|
||||
# Date: 05/10/2021
|
||||
#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh alr_mcsm_formatted_snps_chain.csv snp_batches 20
|
||||
|
||||
# add .txt to the files
|
||||
########################################################################
|
|
@ -41,7 +41,7 @@ arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By
|
|||
arg_parser.add_argument('-p', '--process_dir', help = 'Temp processing dir for running foldX. By default, it assmes homedir + <drug> + processing. Make sure it is somewhere with LOTS of storage as it writes all output!') #FIXME
|
||||
|
||||
arg_parser.add_argument('-P', '--pdb_file', help = 'PDB File to process. By default, it assmumes a file called <gene>_complex.pdb in input_dir')
|
||||
arg_parser.add_argument('-m', '--mutation_file', help = 'Mutation list. By default, assumes a file called <gene>_mcsm_snps.csv exists')
|
||||
arg_parser.add_argument('-m', '--mutation_file', help = 'Mutation list. By default, assumes a file called <gene>_mcsm_formatted_snps.csv exists')
|
||||
|
||||
# FIXME: Doesn't work with 2 chains yet!
|
||||
arg_parser.add_argument('-c1', '--chain1', help = 'Chain1 ID', default = 'A') # case sensitive
|
||||
|
@ -148,6 +148,16 @@ print('Arguments being passed:'
|
|||
, '\noutput file:', outfile_foldx
|
||||
, '\n=============================================================')
|
||||
|
||||
|
||||
# make sure rotabase.txt exists in the process_dir
|
||||
rotabase_file = process_dir + '/' + 'rotabase.txt'
|
||||
|
||||
if Path(rotabase_file).is_file():
|
||||
print(f'rotabase file: {rotabase_file} exists')
|
||||
else:
|
||||
print(f'ERROR: rotabase file: {rotabase_file} does not exist. Please download it and put it in {process_dir}')
|
||||
sys.exit()
|
||||
|
||||
#### Delay for 10 seconds to check the params ####
|
||||
print('Sleeping for 10 seconds to give you time to cancel')
|
||||
time.sleep(10)
|
||||
|
@ -235,6 +245,13 @@ def main():
|
|||
nmuts = len(mutlist)
|
||||
print(nmuts)
|
||||
print(mutlist)
|
||||
print('start')
|
||||
#subprocess.check_output(['bash','repairPDB.sh', pdbname, process_dir])
|
||||
print('\033[95mSTAGE: repair PDB\033[0m')
|
||||
print('EXECUTING: repairPDB.sh %s %s %s' % (indir, actual_pdb_filename, process_dir))
|
||||
#subprocess.check_output(['bash','repairPDB.sh', indir, actual_pdb_filename, process_dir])
|
||||
# once you decide to use the function
|
||||
# repairPDB(pdbname)
|
||||
|
||||
print('start')
|
||||
# some common parameters for foldX
|
||||
|
@ -242,61 +259,74 @@ def main():
|
|||
|
||||
print('\033[95mSTAGE: repair PDB (foldx subprocess) \033[0m')
|
||||
print('Running foldx RepairPDB for WT')
|
||||
subprocess.call(['foldx'
|
||||
|
||||
fold_RepairDB = ['foldx'
|
||||
, '--command=RepairPDB'
|
||||
, foldx_common
|
||||
, '--pdb-dir=' + os.path.dirname(pdb_filename)
|
||||
# , '--pdb-dir=' + os.path.dirname(pdb_filename)
|
||||
, '--pdb-dir=' + indir
|
||||
, '--pdb=' + actual_pdb_filename
|
||||
, 'outPDB=true'
|
||||
, '--output-dir=' + process_dir])
|
||||
, '--output-dir=' + process_dir]
|
||||
print('CMD:', fold_RepairDB)
|
||||
subprocess.call(fold_RepairDB)
|
||||
print('\033[95mCOMPLETED STAGE: repair PDB\033[0m')
|
||||
print('\n==========================================================')
|
||||
|
||||
|
||||
print('\033[95mSTAGE: Foldx commands BM, PN and SD (foldx subprocess) for WT\033[0m')
|
||||
print('Running foldx BuildModel for WT')
|
||||
subprocess.call(['foldx'
|
||||
|
||||
foldx_BuildModel = ['foldx'
|
||||
, '--command=BuildModel'
|
||||
, foldx_common
|
||||
, '--pdb-dir=' + process_dir
|
||||
, '--pdb=' + pdbname + '_Repair.pdb'
|
||||
, '--mutant-file="individual_list_' + pdbname +'.txt"'
|
||||
, '--mutant-file=' + process_dir + '/' + 'individual_list_' + pdbname +'.txt'
|
||||
, 'outPDB=true'
|
||||
, '--numberOfRuns=1'
|
||||
, '--output-dir=' + process_dir], cwd=process_dir)
|
||||
, '--output-dir=' + process_dir]
|
||||
print('CMD:', foldx_BuildModel)
|
||||
subprocess.call( foldx_BuildModel, cwd=process_dir)
|
||||
|
||||
print('Running foldx PrintNetworks for WT')
|
||||
subprocess.call(['foldx'
|
||||
foldx_PrintNetworks = ['foldx'
|
||||
, '--command=PrintNetworks'
|
||||
, '--pdb-dir=' + process_dir
|
||||
, '--pdb=' + pdbname + '_Repair.pdb'
|
||||
, '--water=PREDICT'
|
||||
, '--vdwDesign=1'
|
||||
, '--output-dir=' + process_dir], cwd=process_dir)
|
||||
, '--output-dir=' + process_dir]
|
||||
print('CMD:', foldx_PrintNetworks)
|
||||
subprocess.call(foldx_PrintNetworks, cwd=process_dir)
|
||||
|
||||
print('Running foldx SequenceDetail for WT')
|
||||
subprocess.call(['foldx'
|
||||
foldx_SequenceDetail = ['foldx'
|
||||
, '--command=SequenceDetail'
|
||||
, '--pdb-dir=' + process_dir
|
||||
, '--pdb=' + pdbname + '_Repair.pdb'
|
||||
, '--water=PREDICT'
|
||||
, '--vdwDesign=1'
|
||||
, '--output-dir=' + process_dir], cwd=process_dir)
|
||||
, '--output-dir=' + process_dir]
|
||||
print('CMD:', foldx_SequenceDetail)
|
||||
subprocess.call(foldx_SequenceDetail , cwd=process_dir)
|
||||
|
||||
print('\033[95mCOMPLETED STAGE: Foldx commands BM, PN and SD\033[0m')
|
||||
print('\n==========================================================')
|
||||
|
||||
|
||||
print('\033[95mSTAGE: Print Networks (foldx subprocess) for MT\033[0m')
|
||||
for n in range(1,nmuts+1):
|
||||
print('\033[95mNETWORK:\033[0m', n)
|
||||
print('Running foldx PrintNetworks for mutation', n)
|
||||
subprocess.call(['foldx'
|
||||
foldx_PrintNetworksMT = ['foldx'
|
||||
, '--command=PrintNetworks'
|
||||
, '--pdb-dir=' + process_dir
|
||||
, '--pdb=' + pdbname + '_Repair_' + str(n) + '.pdb'
|
||||
, '--water=PREDICT'
|
||||
, '--vdwDesign=1'
|
||||
, '--output-dir=' + process_dir], cwd=process_dir)
|
||||
, '--output-dir=' + process_dir]
|
||||
print('CMD:', foldx_PrintNetworksMT)
|
||||
subprocess.call( foldx_PrintNetworksMT , cwd=process_dir)
|
||||
print('\033[95mCOMPLETED STAGE: Print Networks (foldx subprocess) for MT\033[0m')
|
||||
print('\n==========================================================')
|
||||
|
||||
|
@ -323,14 +353,16 @@ def main():
|
|||
print('\033[95mSTAGE: Running foldx AnalyseComplex (foldx subprocess) for WT\033[0m')
|
||||
chain1=chainA
|
||||
chain2=chainB
|
||||
subprocess.call(['foldx'
|
||||
foldx_AnalyseComplex = ['foldx'
|
||||
, '--command=AnalyseComplex'
|
||||
, '--pdb-dir=' + process_dir
|
||||
, '--pdb=' + pdbname + '_Repair.pdb'
|
||||
, '--analyseComplexChains=' + chain1 + ',' + chain2
|
||||
, '--water=PREDICT'
|
||||
, '--vdwDesign=1'
|
||||
, '--output-dir=' + process_dir], cwd=process_dir)
|
||||
, '--output-dir=' + process_dir]
|
||||
print('CMD:',foldx_AnalyseComplex)
|
||||
subprocess.call(foldx_AnalyseComplex, cwd=process_dir)
|
||||
|
||||
# FIXME why would we ever need to do this?!? Cargo-culted from runcomplex.sh
|
||||
ac_source = process_dir + '/Summary_' + pdbname + '_Repair_AC.fxout'
|
||||
|
@ -340,14 +372,16 @@ def main():
|
|||
|
||||
for n in range(1,nmuts+1):
|
||||
print('\033[95mSTAGE: Running foldx AnalyseComplex (foldx subprocess) for mutation:\033[0m', n)
|
||||
subprocess.call(['foldx'
|
||||
foldx_AnalyseComplex = ['foldx'
|
||||
, '--command=AnalyseComplex'
|
||||
, '--pdb-dir=' + process_dir
|
||||
, '--pdb=' + pdbname + '_Repair_' + str(n) + '.pdb'
|
||||
, '--analyseComplexChains=' + chain1 + ',' + chain2
|
||||
, '--water=PREDICT'
|
||||
, '--vdwDesign=1'
|
||||
, '--output-dir=' + process_dir], cwd=process_dir)
|
||||
, '--output-dir=' + process_dir]
|
||||
print('CMD:', foldx_AnalyseComplex)
|
||||
subprocess.call( foldx_AnalyseComplex , cwd=process_dir)
|
||||
|
||||
# FIXME why would we ever need to do this?!? Cargo-culted from runcomplex.sh
|
||||
ac_mut_source = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) +'_AC.fxout'
|
||||
|
|
|
@ -51,7 +51,7 @@ def format_mcsm_na_output(mcsm_na_output_tsv):
|
|||
print('Assigning meaningful colnames'
|
||||
, '\n=======================================================')
|
||||
my_colnames_dict = {'PDB_FILE': 'pdb_file' # relevant info from this col will be extracted and the column discarded
|
||||
, 'CHAIN': 'chain' # {wild_type}<position>{mutant_type}
|
||||
, 'CHAIN': 'chain'
|
||||
, 'WILD_RES': 'wild_type' # one letter amino acid code
|
||||
, 'RES_POS': 'position' # number
|
||||
, 'MUT_RES': 'mutant_type' # one letter amino acid code
|
||||
|
@ -65,8 +65,8 @@ def format_mcsm_na_output(mcsm_na_output_tsv):
|
|||
#############
|
||||
# create mutationinformation column
|
||||
#############
|
||||
mcsm_na_data['mutationinformation'] = mcsm_na_data['wild_type'] + mcsm_na_data.position.map(str) + mcsm_na_data['mutant_type']
|
||||
|
||||
#mcsm_na_data['mutationinformation'] = mcsm_na_data['wild_type'] + mcsm_na_data.position.map(str) + mcsm_na_data['mutant_type']
|
||||
mcsm_na_data['mutationinformation'] = mcsm_na_data.loc[:,'wild_type'] + mcsm_na_data.loc[:,'position'].astype(int).apply(str) + mcsm_na_data.loc[:,'mutant_type']
|
||||
#%%=====================================================================
|
||||
#############
|
||||
# Create col: mcsm_na_outcome
|
||||
|
@ -132,4 +132,3 @@ def format_mcsm_na_output(mcsm_na_output_tsv):
|
|||
, 'pdb_file']]
|
||||
return(mcsm_na_dataf)
|
||||
#%%#####################################################################
|
||||
|
||||
|
|
107
my_header.R
107
my_header.R
|
@ -1,21 +1,31 @@
|
|||
#########################################################
|
||||
### A) Installing and loading required packages
|
||||
# A) Installing and loading required packages
|
||||
# B) My functions
|
||||
#########################################################
|
||||
|
||||
#########################################################
|
||||
#lib_loc = "/usr/local/lib/R/site-library")
|
||||
|
||||
#if (!require("gplots")) {
|
||||
# install.packages("gplots", dependencies = TRUE)
|
||||
# library(gplots)
|
||||
#}
|
||||
require("getopt", quietly = TRUE) # cmd parse arguments
|
||||
|
||||
#if (!require("tidyverse")) {
|
||||
# install.packages("tidyverse", dependencies = TRUE)
|
||||
# library(tidyverse)
|
||||
#}
|
||||
if (!require("tidyverse")) {
|
||||
install.packages("tidyverse", dependencies = TRUE)
|
||||
library(tidyverse)
|
||||
}
|
||||
|
||||
if (!require("ggplot2")) {
|
||||
install.packages("ggplot2", dependencies = TRUE)
|
||||
library(ggplot2)
|
||||
if (!require("shiny")) {
|
||||
install.packages("shiny", dependencies = TRUE)
|
||||
library(shiny)
|
||||
}
|
||||
|
||||
if (!require("shinyBS")) {
|
||||
install.packages("shinyBS", dependencies = TRUE)
|
||||
library(shinyBS)
|
||||
}
|
||||
|
||||
if (!require("gridExtra")) {
|
||||
install.packages("gridExtra", dependencies = TRUE)
|
||||
library(gridExtra)
|
||||
}
|
||||
|
||||
if (!require("ggridges")) {
|
||||
|
@ -23,6 +33,35 @@ if (!require("ggridges")) {
|
|||
library(ggridges)
|
||||
}
|
||||
|
||||
# if (!require("ggplot2")) {
|
||||
# install.packages("ggplot2", dependencies = TRUE)
|
||||
# library(ggplot2)
|
||||
# }
|
||||
|
||||
# if (!require ("dplyr")){
|
||||
# install.packages("dplyr")
|
||||
# library(dplyr)
|
||||
# }
|
||||
|
||||
if (!require ("DT")){
|
||||
install.packages("DT")
|
||||
library(DT)
|
||||
}
|
||||
|
||||
if (!require ("plyr")){
|
||||
install.packages("plyr")
|
||||
library(plyr)
|
||||
}
|
||||
|
||||
# Install
|
||||
#if(!require(devtools)) install.packages("devtools")
|
||||
#devtools::install_github("kassambara/ggcorrplot")
|
||||
|
||||
if (!require ("ggbeeswarm")){
|
||||
install.packages("ggbeeswarm")
|
||||
library(ggbeeswarm)
|
||||
}
|
||||
|
||||
if (!require("plotly")) {
|
||||
install.packages("plotly", dependencies = TRUE)
|
||||
library(plotly)
|
||||
|
@ -103,11 +142,6 @@ if (!require ("psych")){
|
|||
library(psych)
|
||||
}
|
||||
|
||||
if (!require ("dplyr")){
|
||||
install.packages("dplyr")
|
||||
library(dplyr)
|
||||
}
|
||||
|
||||
if (!require ("compare")){
|
||||
install.packages("compare")
|
||||
library(compare)
|
||||
|
@ -118,18 +152,37 @@ if (!require ("arsenal")){
|
|||
library(arsenal)
|
||||
}
|
||||
|
||||
if(!require(ggseqlogo)){
|
||||
install.packages("ggseqlogo")
|
||||
library(ggseqlogo)
|
||||
}
|
||||
|
||||
####TIDYVERSE
|
||||
# Install
|
||||
#if(!require(devtools)) install.packages("devtools")
|
||||
#devtools::install_github("kassambara/ggcorrplot")
|
||||
|
||||
#library(ggcorrplot)
|
||||
|
||||
|
||||
###for PDB files
|
||||
#install.packages("bio3d")
|
||||
# for PDB files
|
||||
if(!require(bio3d)){
|
||||
install.packages("bio3d")
|
||||
library(bio3d)
|
||||
}
|
||||
|
||||
library(protr)
|
||||
if(!require(protr)){
|
||||
install.packages("protr")
|
||||
library(protr)
|
||||
}
|
||||
|
||||
#if (!requireNamespace("BiocManager", quietly = TRUE))
|
||||
# install.packages("BiocManager")
|
||||
|
||||
#BiocManager::install("Logolas")
|
||||
library("Logolas")
|
||||
|
||||
|
||||
####################################
|
||||
# Load all my functions:
|
||||
# only works if tidyverse is loaded
|
||||
# hence included it here!
|
||||
####################################
|
||||
|
||||
func_path = "~/git/LSHTM_analysis/scripts/functions/"
|
||||
source_files <- list.files(func_path, "\\.R$") # locate all .R files
|
||||
map(paste0(func_path, source_files), source) # source all your R scripts!
|
||||
|
||||
|
|
91
scripts/functions/bp_lineage.R
Normal file
91
scripts/functions/bp_lineage.R
Normal file
|
@ -0,0 +1,91 @@
|
|||
########################################
|
||||
# Lineage barplot
|
||||
# Lineage and nsSNP count barplot
|
||||
# Lineage Diversity barplot
|
||||
########################################
|
||||
|
||||
lin_count_bp <- function( lf_data
|
||||
, x_categ = ""
|
||||
, y_count = ""
|
||||
, bar_fill_categ = ""
|
||||
, display_label_col = ""
|
||||
, bar_stat_stype = "identity"
|
||||
, x_lab_angle = 90
|
||||
, d_lab_size = 5
|
||||
, d_lab_hjust = 0.5
|
||||
, d_lab_vjust = 0.5
|
||||
, d_lab_col = "black"
|
||||
, my_xats = 20 # x axis text size
|
||||
, my_yats = 20 # y axis text size
|
||||
, my_xals = 22 # x axis label size
|
||||
, my_yals = 22 # y axis label size
|
||||
, my_lls = 22 # legend label size
|
||||
, bar_col_labels = ""
|
||||
, bar_col_values = ""
|
||||
, bar_leg_name = ""
|
||||
, leg_location = "top"
|
||||
, y_log10 = FALSE
|
||||
, y_scale_percent = FALSE
|
||||
, y_label = c("Count", "SNP diversity")
|
||||
) {
|
||||
g = ggplot(lf_data
|
||||
, aes( x = factor( eval(parse(text = x_categ)), ordered = T )
|
||||
, y = eval(parse(text = y_count))
|
||||
, fill = eval(parse(text = bar_fill_categ)) ) )
|
||||
|
||||
OutPlot = g + geom_bar( stat = bar_stat_stype
|
||||
, position = position_stack(reverse = TRUE)
|
||||
#, alpha = 1
|
||||
#, colour = "grey75"
|
||||
) +
|
||||
theme(axis.text.x = element_text(size = my_xats
|
||||
, angle = x_lab_angle)
|
||||
, axis.text.y = element_text(size = my_yats
|
||||
, angle = 90
|
||||
, hjust = 1
|
||||
, vjust = 0)
|
||||
, axis.title.x = element_text(size = my_xals
|
||||
, colour = "black")
|
||||
, axis.title.y = element_text(size = my_yals
|
||||
, colour = "black")
|
||||
, legend.position = leg_location
|
||||
, legend.text = element_text(size = my_lls)) +
|
||||
|
||||
geom_label(aes(label = eval(parse(text = display_label_col)))
|
||||
, size = d_lab_size
|
||||
, hjust = d_lab_hjust
|
||||
, vjust = d_lab_vjust
|
||||
, colour = d_lab_col
|
||||
, show.legend = FALSE
|
||||
#, check_overlap = TRUE
|
||||
, position = position_stack(reverse = T)) +
|
||||
|
||||
scale_fill_manual(values = bar_col_values
|
||||
, name = bar_leg_name
|
||||
, labels = bar_col_labels) +
|
||||
labs(title = ""
|
||||
, x = ""
|
||||
, y = y_label
|
||||
, colour = "black")
|
||||
|
||||
if (y_log10){
|
||||
|
||||
OutPlot = OutPlot +
|
||||
scale_y_continuous(trans = "log10"
|
||||
, labels = trans_format("log10", math_format(10^.x) ) )
|
||||
}
|
||||
|
||||
if (y_scale_percent){
|
||||
|
||||
OutPlot = OutPlot +
|
||||
scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
|
||||
#scale_y_continuous(labels = scales::percent) +
|
||||
|
||||
labs(title = ""
|
||||
, x = ""
|
||||
, y = y_label
|
||||
, colour = "black")
|
||||
}
|
||||
|
||||
return(OutPlot)
|
||||
}
|
|
@ -3,7 +3,7 @@
|
|||
# LINK: https://stackoverflow.com/questions/49818271/stacked-barplot-with-colour-gradients-for-each-bar
|
||||
#########################################################
|
||||
|
||||
ColourPalleteMulti <- function(df, group, subgroup){
|
||||
ColourPalleteMulti = function(df, group, subgroup){
|
||||
|
||||
# Find how many colour categories to create and the number of colours in each
|
||||
categories <- aggregate(as.formula(paste(subgroup, group, sep="~" ))
|
||||
|
@ -24,4 +24,88 @@ ColourPalleteMulti <- function(df, group, subgroup){
|
|||
, category.end[i]))(categories[i,2])}))
|
||||
return(colours)
|
||||
}
|
||||
#########################################################
|
||||
#########################################################################
|
||||
|
||||
########################
|
||||
# Generate bp with
|
||||
# colour palette derived
|
||||
# from the data using
|
||||
# above function
|
||||
#########################
|
||||
|
||||
bp_stability_hmap <- function(plotdf = merged_df3
|
||||
, xvar_colname = "position"
|
||||
#, bar_col_colname = "group"
|
||||
, stability_colname = ""
|
||||
, stability_outcome_colname = ""
|
||||
, p_title = "" # "Protein stability (DUET)"
|
||||
, my_xaxls = 12 # x-axis label size
|
||||
, my_yaxls = 20 # y-axis label size
|
||||
, my_xaxts = 18 # x-axis text size
|
||||
, my_yaxts = 20 # y-axis text size
|
||||
, my_pts = 20 # plot-title size
|
||||
, my_xlab = "Position"
|
||||
, my_ylab = "No. of nsSNPs"
|
||||
)
|
||||
{
|
||||
|
||||
# order the df by position and ensure it is a factor
|
||||
plotdf = plotdf[order(plotdf[[xvar_colname]]), ]
|
||||
plotdf[[xvar_colname]] = factor(plotdf[[xvar_colname]])
|
||||
|
||||
#cat("\nSneak peak:\n")
|
||||
head(data.frame( plotdf[[xvar_colname]], plotdf[[stability_colname]] ) )
|
||||
|
||||
# stability values isolated to help with generating column called: 'group'
|
||||
my_grp = plotdf[[stability_colname]]
|
||||
cat( "\nLength of nsSNPs:", length(my_grp)
|
||||
, "\nLength of unique values for nsSNPs:", length(unique(my_grp)) )
|
||||
|
||||
# Add col: 'group'
|
||||
plotdf$group = paste0(plotdf[[stability_outcome_colname]], "_", my_grp, sep = "")
|
||||
|
||||
# check unique values in normalised data
|
||||
cat("\nNo. of unique values in", stability_colname, "no rounding:"
|
||||
, length(unique(plotdf[[stability_colname]])))
|
||||
|
||||
# Call the function to create the palette based on the group defined above
|
||||
#subcols_ps
|
||||
subcols_bp_hmap = ColourPalleteMulti(plotdf, stability_outcome_colname, stability_colname)
|
||||
|
||||
cat("\nNo. of sub colours generated:", length(subcols_bp_hmap))
|
||||
|
||||
#-------------------------------
|
||||
# Generate the subcols barplot
|
||||
#-------------------------------
|
||||
|
||||
#g = ggplot(plotdf, aes(x = factor(position, ordered = T)))
|
||||
g = ggplot(plotdf, aes_string(x = xvar_colname
|
||||
# , ordered = T)
|
||||
))
|
||||
|
||||
|
||||
OutWidePlot = g + geom_bar(aes(fill = group)
|
||||
, colour = "grey") +
|
||||
|
||||
scale_fill_manual( values = subcols_bp_hmap
|
||||
, guide = "none") +
|
||||
|
||||
theme( axis.text.x = element_text(size = my_xaxls
|
||||
, angle = 90
|
||||
, hjust = 1
|
||||
, vjust = 0.4)
|
||||
, axis.text.y = element_text(size = my_yaxls
|
||||
, angle = 0
|
||||
, hjust = 1
|
||||
, vjust = 0)
|
||||
, axis.title.x = element_text(size = my_xaxts)
|
||||
, axis.title.y = element_text(size = my_yaxts )
|
||||
, plot.title = element_text(size = my_pts
|
||||
, hjust = 0.5)) +
|
||||
|
||||
labs(title = p_title
|
||||
, x = my_xlab
|
||||
, y = my_ylab)
|
||||
|
||||
return(OutWidePlot)
|
||||
}
|
||||
|
|
|
@ -153,6 +153,46 @@ combining_dfs_plotting <- function( my_df_u
|
|||
quit()
|
||||
}
|
||||
|
||||
# Quick formatting: ordering df and pretty labels
|
||||
|
||||
#------------------------------
|
||||
# sorting by column: position
|
||||
#------------------------------
|
||||
merged_df2 = merged_df2[order(merged_df2$position), ]
|
||||
|
||||
#-----------------------
|
||||
# mutation_info_labels
|
||||
#-----------------------
|
||||
merged_df2$mutation_info_labels = ifelse(merged_df2$mutation_info == dr_muts_col
|
||||
, "DM", "OM")
|
||||
merged_df2$mutation_info_labels = factor(merged_df2$mutation_info_labels)
|
||||
#-----------------------
|
||||
# lineage labels
|
||||
#-----------------------
|
||||
merged_df2$lineage_labels = gsub("lineage", "L", merged_df2$lineage)
|
||||
|
||||
merged_df2$lineage_labels = factor(merged_df2$lineage_labels, c("L1"
|
||||
, "L2"
|
||||
, "L3"
|
||||
, "L4"
|
||||
, "L5"
|
||||
, "L6"
|
||||
, "L7"
|
||||
, "LBOV"
|
||||
, "L1;L2"
|
||||
, "L1;L3"
|
||||
, "L1;L4"
|
||||
, "L2;L3"
|
||||
, "L2;L3;L4"
|
||||
, "L2;L4"
|
||||
, "L2;L6"
|
||||
, "L2;LBOV"
|
||||
, "L3;L4"
|
||||
, "L4;L6"
|
||||
, "L4;L7"
|
||||
, ""))
|
||||
|
||||
|
||||
#=================================================================
|
||||
# Merge 2: merged_df3
|
||||
# dfs with NAs in ORs
|
||||
|
|
197
scripts/functions/lf_bp.R
Normal file
197
scripts/functions/lf_bp.R
Normal file
|
@ -0,0 +1,197 @@
|
|||
#############################
|
||||
# Barplots: ggplot
|
||||
# stats +/-
|
||||
# violin +/-
|
||||
# barplot +/
|
||||
# beeswarm
|
||||
#############################
|
||||
|
||||
lf_bp <- function(lf_df
|
||||
, p_title = ""
|
||||
, colour_categ = ""
|
||||
, x_grp = "mutation_info"
|
||||
, y_var = "param_value"
|
||||
, facet_var = "param_type"
|
||||
, n_facet_row = 1
|
||||
, y_scales = "free_y"
|
||||
, colour_bp_strip = "khaki2"
|
||||
, dot_size = 3
|
||||
, dot_transparency = 0.3
|
||||
, violin_quantiles = c(0.25, 0.5, 0.75) # can be NULL
|
||||
, my_ats = 22 # axis text size
|
||||
, my_als = 20 # axis label size
|
||||
, my_fls = 20 # facet label size
|
||||
, my_pts = 22 # plot title size)
|
||||
, make_boxplot = FALSE
|
||||
, bp_width = c("auto", 0.5)
|
||||
, add_stats = TRUE
|
||||
, stat_grp_comp = c("DM", "OM")
|
||||
, stat_method = "wilcox.test"
|
||||
, my_paired = FALSE
|
||||
, stat_label = c("p.format", "p.signif") ){
|
||||
|
||||
fwv = as.formula(paste0("~", facet_var))
|
||||
#fwv = reformulate(facet_var)
|
||||
|
||||
p1 <- ggplot(lf_df, aes_string(x = x_grp, y = y_var)) +
|
||||
|
||||
facet_wrap( fwv
|
||||
, nrow = n_facet_row
|
||||
, scales = y_scales) +
|
||||
|
||||
geom_violin(trim = T
|
||||
, scale = "width"
|
||||
#, position = position_dodge(width = 0.9)
|
||||
, draw_quantiles = violin_quantiles)
|
||||
|
||||
if (make_boxplot){
|
||||
|
||||
if (bp_width == "auto"){
|
||||
bp_width = 0.5/length(unique(lf_df[[x_grp]]))
|
||||
cat("\nAutomatically calculated boxplot width, using bp_width:\n", bp_width, "\n")
|
||||
}else{
|
||||
cat("\nBoxplot width value provided, using:", bp_width, "\n")
|
||||
bp_width = bp_width}
|
||||
|
||||
p2 = p1 + geom_boxplot(fill = "white"
|
||||
, outlier.colour = NA
|
||||
#, position = position_dodge(width = 0.9)
|
||||
, width = bp_width) +
|
||||
geom_beeswarm(priority = "density"
|
||||
#, shape = 21
|
||||
, size = dot_size
|
||||
, alpha = dot_transparency
|
||||
, show.legend = FALSE
|
||||
, cex = 0.8
|
||||
, aes(colour = factor(eval(parse(text = colour_categ))) ))
|
||||
|
||||
} else {
|
||||
# ggbeeswarm (better than geom_point)
|
||||
p2 = p1 + geom_beeswarm(priority = "density"
|
||||
#, shape = 21
|
||||
, size = dot_size
|
||||
, alpha = dot_transparency
|
||||
, show.legend = FALSE
|
||||
, cex = 0.8
|
||||
, aes(colour = factor(eval(parse(text = colour_categ))) ))
|
||||
}
|
||||
|
||||
# Add foramtting to graph
|
||||
OutPlot = p2 + theme(axis.text.x = element_text(size = my_ats)
|
||||
, axis.text.y = element_text(size = my_ats
|
||||
, angle = 0
|
||||
, hjust = 1
|
||||
, vjust = 0)
|
||||
, axis.title.x = element_text(size = my_ats)
|
||||
, axis.title.y = element_text(size = my_ats)
|
||||
, plot.title = element_text(size = my_pts
|
||||
, hjust = 0.5
|
||||
, colour = "black"
|
||||
, face = "bold")
|
||||
, strip.background = element_rect(fill = colour_bp_strip)
|
||||
, strip.text.x = element_text(size = my_fls
|
||||
, colour = "black")
|
||||
, legend.title = element_text(color = "black"
|
||||
, size = my_als)
|
||||
, legend.text = element_text(size = my_ats)
|
||||
, legend.direction = "vertical") +
|
||||
|
||||
labs(title = p_title
|
||||
, x = ""
|
||||
, y = "")
|
||||
|
||||
if (add_stats){
|
||||
my_comparisonsL <- list( stat_grp_comp )
|
||||
|
||||
OutPlot = OutPlot + stat_compare_means(comparisons = my_comparisonsL
|
||||
, method = stat_method
|
||||
, paired = my_paired
|
||||
, label = stat_label[2])
|
||||
return(OutPlot)
|
||||
}
|
||||
|
||||
return(OutPlot)
|
||||
}
|
||||
|
||||
#############################
|
||||
# Barplot NO stats: plotly
|
||||
# violin +/-
|
||||
# barplot +/
|
||||
# beeswarm
|
||||
|
||||
# TODO: plot_ly()
|
||||
#############################
|
||||
lf_bp_plotly <- function(lf_df
|
||||
, p_title = ""
|
||||
, colour_categ = ""
|
||||
, x_grp = mutation_info
|
||||
, y_var = param_value
|
||||
, facet_var = param_type
|
||||
, n_facet_row = 1
|
||||
, y_scales = "free_y"
|
||||
, colour_bp_strip = "khaki2"
|
||||
, dot_size = 3
|
||||
, dot_transparency = 0.3
|
||||
, violin_quantiles = c(0.25, 0.5, 0.75) # can be NULL
|
||||
, my_ats = 20 # axis text size
|
||||
, my_als = 18 # axis label size
|
||||
, my_fls = 18 # facet label size
|
||||
, my_pts = 22 # plot title size)
|
||||
#, make_boxplot = FALSE
|
||||
, bp_width = c("auto", 0.5)
|
||||
#, add_stats = FALSE
|
||||
#, stat_grp_comp = c("DM", "OM")
|
||||
#, stat_method = "wilcox.test"
|
||||
#, my_paired = FALSE
|
||||
#, stat_label = c("p.format", "p.signif")
|
||||
){
|
||||
|
||||
OutPlotly = ggplot(lf_df, aes(x = eval(parse(text = x_grp))
|
||||
, y = eval(parse(text = y_var))
|
||||
, label1 = x_grp
|
||||
, label2 = y_var
|
||||
, lable3 = colour_categ) ) +
|
||||
|
||||
facet_wrap(~ eval(parse(text = facet_var))
|
||||
, nrow = n_facet_row
|
||||
, scales = y_scales) +
|
||||
|
||||
geom_violin(trim = T
|
||||
, scale = "width"
|
||||
, draw_quantiles = violin_quantiles) +
|
||||
|
||||
geom_beeswarm(priority = "density"
|
||||
, size = dot_size
|
||||
, alpha = dot_transparency
|
||||
, show.legend = FALSE
|
||||
, cex = 0.8
|
||||
, aes(colour = factor(eval(parse(text = colour_categ) ) ) ) ) +
|
||||
theme(axis.text.x = element_text(size = my_ats)
|
||||
, axis.text.y = element_text(size = my_ats
|
||||
, angle = 0
|
||||
, hjust = 1
|
||||
, vjust = 0)
|
||||
, axis.title.x = element_text(size = my_ats)
|
||||
, axis.title.y = element_text(size = my_ats)
|
||||
, plot.title = element_text(size = my_pts
|
||||
, hjust = 0.5
|
||||
, colour = "black"
|
||||
, face = "bold")
|
||||
, strip.background = element_rect(fill = colour_bp_strip)
|
||||
, strip.text.x = element_text(size = my_fls
|
||||
, colour = "black")
|
||||
, legend.title = element_text(color = "black"
|
||||
, size = my_als)
|
||||
, legend.text = element_text(size = my_ats)
|
||||
, legend.position = "none")+
|
||||
|
||||
labs(title = p_title
|
||||
, x = ""
|
||||
, y = "")
|
||||
|
||||
OutPlotly = ggplotly(OutPlotly
|
||||
#, tooltip = c("label")
|
||||
)
|
||||
return(OutPlotly)
|
||||
|
||||
}
|
21
scripts/functions/lf_unpaired_stats.R
Normal file
21
scripts/functions/lf_unpaired_stats.R
Normal file
|
@ -0,0 +1,21 @@
|
|||
library(ggpubr)
|
||||
###################################################################
|
||||
|
||||
lf_unpaired_stats <- function(lf_data
|
||||
, lf_stat_value = "param_value"
|
||||
, lf_stat_group = "mutation_info"
|
||||
, lf_col_statvars = "param_type"
|
||||
, my_paired = FALSE
|
||||
, stat_adj = "none"){
|
||||
|
||||
stat_formula = as.formula(paste0(lf_stat_value, "~", lf_stat_group))
|
||||
|
||||
my_stat_df = compare_means(stat_formula
|
||||
, group.by = lf_col_statvars
|
||||
, data = lf_data
|
||||
, paired = my_paired
|
||||
, p.adjust.method = stat_adj)
|
||||
|
||||
|
||||
return(my_stat_df)
|
||||
}
|
69
scripts/functions/lineage_dist.R
Normal file
69
scripts/functions/lineage_dist.R
Normal file
|
@ -0,0 +1,69 @@
|
|||
###############################
|
||||
# TASK: function to plot lineage
|
||||
# dist plots with or without facet
|
||||
# think about color palette
|
||||
# for stability
|
||||
##############################
|
||||
|
||||
#n_colours = length(unique(lin_dist_plot$duet_scaled))
|
||||
#my_palette <- colorRampPalette(c(mcsm_red2, mcsm_red1, mcsm_mid, mcsm_blue1, mcsm_blue2))(n = n_colours+1)
|
||||
|
||||
|
||||
lineage_distP <- function(plotdf
|
||||
, x_axis = "duet_scaled"
|
||||
, y_axis = "lineage_labels"
|
||||
, x_lab = "DUET"
|
||||
, with_facet = F
|
||||
, facet_wrap_var = ""
|
||||
, fill_categ = "mutation_info_labels"
|
||||
, fill_categ_cols = c("#E69F00", "#999999")
|
||||
, my_ats = 15 # axis text size
|
||||
, my_als = 20 # axis label size
|
||||
, my_leg_ts = 16
|
||||
, my_leg_title = 16
|
||||
, my_strip_ts = 20
|
||||
, leg_pos = c(0.8, 0.9)
|
||||
, leg_pos_wf = c("top", "left", "bottom", "right")
|
||||
, leg_dir_wf = c("horizontal", "vertical")
|
||||
, leg_label = "")
|
||||
|
||||
{
|
||||
|
||||
LinDistP = ggplot(plotdf, aes_string(x = x_axis
|
||||
, y = y_axis))+
|
||||
|
||||
geom_density_ridges(aes_string(fill = fill_categ)
|
||||
, scale = 3
|
||||
, size = 0.3
|
||||
, alpha = 0.8) +
|
||||
scale_x_continuous(expand = c(0.01, 0.01)) +
|
||||
#coord_cartesian( xlim = c(-1, 1)) +
|
||||
scale_fill_manual(values = fill_categ_cols) +
|
||||
theme(axis.text.x = element_text(size = my_ats
|
||||
, angle = 90
|
||||
, hjust = 1
|
||||
, vjust = 0.4)
|
||||
, axis.text.y = element_text(size = my_ats)
|
||||
, axis.title.x = element_text(size = my_ats)
|
||||
, axis.title.y = element_blank()
|
||||
, strip.text = element_text(size = my_strip_ts)
|
||||
, legend.text = element_text(size = my_leg_ts)
|
||||
, legend.title = element_text(size = my_leg_title)
|
||||
, legend.position = c(0.8, 0.9)) +
|
||||
labs(x = x_lab
|
||||
, fill = leg_label)
|
||||
|
||||
if (with_facet){
|
||||
|
||||
# used reformulate or make as formula
|
||||
#fwv = reformulate(facet_wrap_var)
|
||||
fwv = as.formula(paste0("~", facet_wrap_var))
|
||||
|
||||
LinDistP = LinDistP +
|
||||
facet_wrap(fwv) +
|
||||
theme(legend.position = leg_pos_wf
|
||||
, legend.direction = leg_dir_wf)
|
||||
}
|
||||
|
||||
return(LinDistP)
|
||||
}
|
|
@ -1,24 +1,40 @@
|
|||
my_corr_pairs <- function (corr_data){
|
||||
my_corr_pairs <- function (corr_data_all
|
||||
, corr_cols = colnames(corr_data_all)
|
||||
, corr_method = "spearman" # other options: "pearson" or "kendall"
|
||||
, colour_categ_col = "mutation_info_labels"
|
||||
, categ_colour = c("#E69F00", "#999999")
|
||||
, density_show = F
|
||||
, hist_col = "coral4"
|
||||
, dot_size = 1.6
|
||||
, ats = 1.5
|
||||
, corr_lab_size = 3
|
||||
, corr_value_size = 1)
|
||||
{
|
||||
|
||||
OutPlot_corr = pairs.panels(corr_data
|
||||
, method = "spearman" # correlation method
|
||||
, hist.col = "grey" ##00AFBB
|
||||
, density = TRUE # show density plots
|
||||
, ellipses = F # show correlation ellipses
|
||||
corr_data_df = corr_data_all[corr_cols]
|
||||
my_bg = categ_colour[corr_data_all[[colour_categ_col]] ]
|
||||
|
||||
OutPlot_corr = pairs.panels(corr_data_df
|
||||
, method = corr_method
|
||||
, hist.col = hist_col
|
||||
, density = density_show
|
||||
, ellipses = F
|
||||
, smooth = F
|
||||
, stars = T
|
||||
, rug = F
|
||||
, breaks = "Sturges"
|
||||
, show.points = T
|
||||
#, bg = c("#f8766d", "#00bfc4")[unclass(factor(corr_ps$duet_outcome))] # foldx colours are reveresed
|
||||
#, pch = 21 # for bg
|
||||
, jitter = T
|
||||
#, bg = c("#f8766d", "#00bfc4")[unclass(factor(corr_data$duet_outcome))] # foldx colours are reveresed
|
||||
, bg = my_bg
|
||||
, pch = 21
|
||||
, alpha = 1
|
||||
, cex = 1.8
|
||||
, cex.axis = 2
|
||||
, cex.labels = 3.5
|
||||
, cex.cor = 1
|
||||
, smooth = F)
|
||||
, cex = dot_size
|
||||
, cex.axis = ats
|
||||
, cex.labels = corr_lab_size
|
||||
, cex.cor = corr_value_size
|
||||
)
|
||||
return(OutPlot_corr)
|
||||
#return (my_bg)
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -16,7 +16,9 @@ library(dplyr)
|
|||
## my_df_u_lig
|
||||
## dup_muts
|
||||
#========================================================
|
||||
plotting_data <- function(df, lig_dist_colname = 'ligand_distance', lig_dist_cutoff = 10) {
|
||||
plotting_data <- function(df
|
||||
, lig_dist_colname = 'ligand_distance'
|
||||
, lig_dist_cutoff = 10) {
|
||||
my_df = data.frame()
|
||||
my_df_u = data.frame()
|
||||
my_df_u_lig = data.frame()
|
||||
|
@ -29,61 +31,6 @@ dup_muts = data.frame()
|
|||
|
||||
cat("\nInput dimensions:", dim(df))
|
||||
|
||||
#==================================
|
||||
# add foldx outcome category
|
||||
# and foldx scaled values
|
||||
|
||||
# This will enable to always have these variables available
|
||||
# when calling for plots
|
||||
#==================================
|
||||
|
||||
#------------------------------
|
||||
# adding foldx scaled values
|
||||
# scale data b/w -1 and 1
|
||||
#------------------------------
|
||||
n = which(colnames(df) == "ddg"); n
|
||||
|
||||
my_min = min(df[,n]); my_min
|
||||
my_max = max(df[,n]); my_max
|
||||
|
||||
df$foldx_scaled = ifelse(df[,n] < 0
|
||||
, df[,n]/abs(my_min)
|
||||
, df[,n]/my_max)
|
||||
# sanity check
|
||||
my_min = min(df$foldx_scaled); my_min
|
||||
my_max = max(df$foldx_scaled); my_max
|
||||
|
||||
if (my_min == -1 && my_max == 1){
|
||||
cat("\nPASS: foldx ddg successfully scaled b/w -1 and 1"
|
||||
, "\nProceeding with assigning foldx outcome category")
|
||||
}else{
|
||||
cat("\nFAIL: could not scale foldx ddg values"
|
||||
, "Aborting!\n")
|
||||
}
|
||||
|
||||
#------------------------------
|
||||
# adding foldx outcome category
|
||||
# ddg<0 = "Stabilising" (-ve)
|
||||
#------------------------------
|
||||
c1 = table(df$ddg < 0)
|
||||
df$foldx_outcome = ifelse(df$ddg < 0, "Stabilising", "Destabilising")
|
||||
c2 = table(df$ddg < 0)
|
||||
|
||||
if ( all(c1 == c2) ){
|
||||
cat("\nPASS: foldx outcome successfully created")
|
||||
}else{
|
||||
cat("\nFAIL: foldx outcome could not be created. Aborting!\n")
|
||||
exit()
|
||||
}
|
||||
|
||||
#------------------------------
|
||||
# renaming foldx column from
|
||||
# "ddg" --> "ddg_foldx"
|
||||
#------------------------------
|
||||
|
||||
# change name to foldx
|
||||
colnames(df)[n] <- "ddg_foldx"
|
||||
|
||||
#==================================
|
||||
# extract unique mutation entries
|
||||
#==================================
|
||||
|
|
|
@ -32,7 +32,8 @@ import_dirs <- function(drug_name, gene_name) {
|
|||
#===============================
|
||||
# mcsm ligand distance cut off
|
||||
#===============================
|
||||
#mcsm_lig_cutoff <<- 10
|
||||
LigDist_colname <<- "ligand_distance"
|
||||
LigDist_cutoff <<- 10
|
||||
|
||||
#==================
|
||||
# Angstroms symbol
|
||||
|
|
|
@ -42,7 +42,9 @@ site_snp_count_bp <- function (plotdf
|
|||
, "\nNo. of cols:", ncol(plotdf)
|
||||
, "\nNow adding column: frequency of mutational positions"))
|
||||
|
||||
# adding snpcount for each position
|
||||
#-------------------------------------------
|
||||
# adding column: snpcount for each position
|
||||
#-------------------------------------------
|
||||
setDT(plotdf)[, pos_count := .N, by = .(eval(parse(text = df_colname)))]
|
||||
|
||||
cat("\nCumulative nssnp count\n"
|
||||
|
@ -65,13 +67,18 @@ site_snp_count_bp <- function (plotdf
|
|||
, "\nNo. of rows:", nrow(plotdf)
|
||||
, "\nNo. of cols:", ncol(plotdf)))
|
||||
|
||||
#------------------------------------------------------
|
||||
# creating df: average count of snpcount for each position
|
||||
# created in earlier step
|
||||
#-------------------------------------------------------
|
||||
# use group by on pos_count
|
||||
snpsBYpos_df <- plotdf %>%
|
||||
group_by(eval(parse(text = df_colname))) %>%
|
||||
summarize(snpsBYpos = mean(pos_count))
|
||||
dplyr::group_by(eval(parse(text = df_colname))) %>%
|
||||
dplyr::summarise(snpsBYpos = mean(pos_count)) # changed from summarize!
|
||||
|
||||
cat("\nnssnp count\n"
|
||||
, table(snpsBYpos_df$snpsBYpos))
|
||||
cat("\nnssnp count per position\n"
|
||||
, table(snpsBYpos_df$snpsBYpos)
|
||||
, "\n")
|
||||
|
||||
# calculating total no. of sites associated with nsSNPs
|
||||
tot_sites = sum(table(snpsBYpos_df$snpsBYpos))
|
||||
|
|
104
scripts/functions/redundant/bp_subcolours_v2.R
Normal file
104
scripts/functions/redundant/bp_subcolours_v2.R
Normal file
|
@ -0,0 +1,104 @@
|
|||
#########################################################
|
||||
# 1b: Define function: coloured barplot by subgroup
|
||||
# LINK: https://stackoverflow.com/questions/49818271/stacked-barplot-with-colour-gradients-for-each-bar
|
||||
#########################################################
|
||||
|
||||
ColourPalleteMulti = function(df, group, subgroup){
|
||||
|
||||
# Find how many colour categories to create and the number of colours in each
|
||||
categories <- aggregate(as.formula(paste(subgroup, group, sep="~" ))
|
||||
, df
|
||||
, function(x) length(unique(x)))
|
||||
# return(categories) }
|
||||
|
||||
category.start <- (scales::hue_pal(l = 100)(nrow(categories))) # Set the top of the colour pallete
|
||||
|
||||
category.end <- (scales::hue_pal(l = 40)(nrow(categories))) # set the bottom
|
||||
|
||||
#return(category.start); return(category.end)}
|
||||
|
||||
# Build Colour pallette
|
||||
colours <- unlist(lapply(1:nrow(categories),
|
||||
function(i){
|
||||
colorRampPalette(colors = c(category.start[i]
|
||||
, category.end[i]))(categories[i,2])}))
|
||||
return(colours)
|
||||
}
|
||||
#########################################################################
|
||||
|
||||
bp_stability_hmap <- function(plotdf = merged_df3
|
||||
, xvar_colname = "position"
|
||||
#, bar_col_colname = "group"
|
||||
, stability_colname = "duet_scaled"
|
||||
, stability_outcome_colname = "duet_outcome"
|
||||
, p_title = "" # "Protein stability (DUET)"
|
||||
, my_xaxls = 12 # x-axis label size
|
||||
, my_yaxls = 20 # y-axis label size
|
||||
, my_xaxts = 18 # x-axis text size
|
||||
, my_yaxts = 20 # y-axis text size
|
||||
, my_pts = 20 # plot-title size
|
||||
, my_xlab = "Position"
|
||||
, my_ylab = "No. of nsSNPs"
|
||||
)
|
||||
{
|
||||
|
||||
# order the df by position and ensure it is a factor
|
||||
plotdf = plotdf[order(plotdf[[xvar_colname]]), ]
|
||||
plotdf[[xvar_colname]] = factor(plotdf[[xvar_colname]])
|
||||
|
||||
#cat("\nSneak peak:\n")
|
||||
head(data.frame( plotdf[[xvar_colname]], plotdf[[stability_colname]] ) )
|
||||
|
||||
# stability values isolated to help with generating column called: 'group'
|
||||
my_grp = plotdf[[stability_colname]]
|
||||
cat( "\nLength of nsSNPs:", length(my_grp)
|
||||
, "\nLength of unique values for nsSNPs:", length(unique(my_grp)) )
|
||||
|
||||
# Add col: 'group'
|
||||
plotdf$group = paste0(plotdf[[stability_outcome_colname]], "_", my_grp, sep = "")
|
||||
|
||||
# check unique values in normalised data
|
||||
cat("\nNo. of unique values in", stability_colname, "no rounding:"
|
||||
, length(unique(plotdf[[stability_colname]])))
|
||||
|
||||
# Call the function to create the palette based on the group defined above
|
||||
#subcols_ps
|
||||
subcols_bp_hmap = ColourPalleteMulti(plotdf, stability_outcome_colname, stability_colname)
|
||||
|
||||
cat("\nNo. of sub colours generated:", length(subcols_bp_hmap))
|
||||
|
||||
#-------------------------------
|
||||
# Generate the subcols barplot
|
||||
#-------------------------------
|
||||
|
||||
#g = ggplot(plotdf, aes(x = factor(position, ordered = T)))
|
||||
g = ggplot(plotdf, aes_string(x = xvar_colname
|
||||
# , ordered = T)
|
||||
))
|
||||
|
||||
|
||||
OutWidePlot = g + geom_bar(aes(fill = group)
|
||||
, colour = "grey") +
|
||||
|
||||
scale_fill_manual( values = subcols_bp_hmap
|
||||
, guide = "none") +
|
||||
|
||||
theme( axis.text.x = element_text(size = my_xaxls
|
||||
, angle = 90
|
||||
, hjust = 1
|
||||
, vjust = 0.4)
|
||||
, axis.text.y = element_text(size = my_yaxls
|
||||
, angle = 0
|
||||
, hjust = 1
|
||||
, vjust = 0)
|
||||
, axis.title.x = element_text(size = my_xaxts)
|
||||
, axis.title.y = element_text(size = my_yaxts )
|
||||
, plot.title = element_text(size = my_pts
|
||||
, hjust = 0.5)) +
|
||||
|
||||
labs(title = p_title
|
||||
, x = my_xlab
|
||||
, y = my_ylab)
|
||||
|
||||
return(OutWidePlot)
|
||||
}
|
97
scripts/functions/redundant/lf_bp_with_stats.R
Normal file
97
scripts/functions/redundant/lf_bp_with_stats.R
Normal file
|
@ -0,0 +1,97 @@
|
|||
library(ggpubr)
|
||||
###################################################################
|
||||
|
||||
####################################
|
||||
lf_bp_with_stats <- function(lf_df
|
||||
, x_grp = "mutation_info"
|
||||
, y_var = "param_value"
|
||||
, facet_var = "param_type"
|
||||
, n_facet_row = 1
|
||||
, y_scales = "free_y"
|
||||
, p_title = ""
|
||||
, colour_categ = ""
|
||||
, colour_bp_strip = "khaki2"
|
||||
, stat_grp_comp = c("DM", "OM")
|
||||
, stat_method = "wilcox.test"
|
||||
, my_paired = FALSE
|
||||
, bp_width = c("auto", 0.5)
|
||||
, dot_size = 3
|
||||
, dot_transparency = 0.3
|
||||
, stat_label = c("p.format", "p.signif")
|
||||
, my_ats = 22 # axis text size
|
||||
, my_als = 20 # axis label size
|
||||
, my_fls = 20 # facet label size
|
||||
, my_pts = 22 # plot title size
|
||||
) {
|
||||
if (bp_width == "auto"){
|
||||
bp_width = 0.5/length(unique(lf_df[[x_grp]]))
|
||||
cat("\nAutomatically calculated boxplot width, using bp_width:\n", bp_width, "\n")
|
||||
}else{
|
||||
cat("\nBoxplot width value provided, using:", bp_width, "\n")
|
||||
bp_width = bp_width
|
||||
}
|
||||
|
||||
my_comparisonsL <- list( stat_grp_comp )
|
||||
|
||||
bp_statP <- ggplot(lf_df, aes(x = eval(parse(text = x_grp))
|
||||
, y = eval(parse(text = y_var)) )) +
|
||||
|
||||
facet_wrap(~ eval(parse(text = facet_var))
|
||||
, nrow = n_facet_row
|
||||
, scales = y_scales) +
|
||||
|
||||
geom_violin(trim = T
|
||||
, scale = "width"
|
||||
#, position = position_dodge(width = 0.9)
|
||||
, draw_quantiles = c(0.25, 0.5, 0.75)) +
|
||||
|
||||
# geom_boxplot(fill = "white"
|
||||
# , outlier.colour = NA
|
||||
# #, position = position_dodge(width = 0.9)
|
||||
# , width = bp_width) +
|
||||
|
||||
# geom_point(position = position_jitterdodge(dodge.width = 0.5)
|
||||
# , alpha = 0.5
|
||||
# , show.legend = FALSE
|
||||
# , aes(colour = factor(eval(parse(text = colour_categ))) )) +
|
||||
|
||||
# ggbeeswarm (better than geom_point)
|
||||
geom_beeswarm(priority = "density"
|
||||
#, shape = 21
|
||||
, size = dot_size
|
||||
, alpha = dot_transparency
|
||||
, show.legend = FALSE
|
||||
, cex = 0.8
|
||||
, aes(colour = factor(eval(parse(text = colour_categ))) )) +
|
||||
|
||||
theme(axis.text.x = element_text(size = my_ats)
|
||||
, axis.text.y = element_text(size = my_ats
|
||||
, angle = 0
|
||||
, hjust = 1
|
||||
, vjust = 0)
|
||||
, axis.title.x = element_text(size = my_ats)
|
||||
, axis.title.y = element_text(size = my_ats)
|
||||
, plot.title = element_text(size = my_pts
|
||||
, hjust = 0.5
|
||||
, colour = "black"
|
||||
, face = "bold")
|
||||
, strip.background = element_rect(fill = colour_bp_strip)
|
||||
, strip.text.x = element_text(size = my_fls
|
||||
, colour = "black")
|
||||
, legend.title = element_text(color = "black"
|
||||
, size = my_als)
|
||||
, legend.text = element_text(size = my_ats)
|
||||
, legend.direction = "vertical") +
|
||||
|
||||
labs(title = p_title
|
||||
, x = ""
|
||||
, y = "")+
|
||||
|
||||
stat_compare_means(comparisons = my_comparisonsL
|
||||
, method = stat_method
|
||||
, paired = my_paired
|
||||
, label = stat_label[1])
|
||||
|
||||
return(bp_statP)
|
||||
|
||||
}
|
83
scripts/functions/redundant/test_lf_bp_with_stats.R
Normal file
83
scripts/functions/redundant/test_lf_bp_with_stats.R
Normal file
|
@ -0,0 +1,83 @@
|
|||
setwd("~/git/LSHTM_analysis/scripts/plotting/")
|
||||
|
||||
source("../functions/lf_bp_with_stats.R")
|
||||
source("../functions/lf_bp.R")
|
||||
|
||||
######################
|
||||
# Make plot
|
||||
######################
|
||||
# Note: Data
|
||||
# run other_plots_data.R
|
||||
# to get the long format data to test this function
|
||||
|
||||
lf_bp(lf_df = lf_dynamut2
|
||||
, p_title = "Dynamut2"
|
||||
, colour_categ = "ddg_dynamut2_outcome"
|
||||
, x_grp = "mutation_info"
|
||||
, y_var = "param_value"
|
||||
, facet_var = "param_type"
|
||||
, n_facet_row = 1
|
||||
, y_scales = "free_y"
|
||||
, colour_bp_strip = "khaki2"
|
||||
, dot_size = 3
|
||||
, dot_transparency = 0.3
|
||||
, violin_quantiles = c(0.25, 0.5, 0.75)
|
||||
, my_ats = 22 # axis text size
|
||||
, my_als = 20 # axis label size
|
||||
, my_fls = 20 # facet label size
|
||||
, my_pts = 22 # plot title size
|
||||
, make_boxplot = F
|
||||
, bp_width = "auto"
|
||||
, add_stats = T
|
||||
, stat_grp_comp = c("DM", "OM")
|
||||
, stat_method = "wilcox.test"
|
||||
, my_paired = FALSE
|
||||
, stat_label = c("p.format", "p.signif") )
|
||||
|
||||
# foo = lf_dynamut2 %>%
|
||||
# group_by(mutation_info, param_type) %>%
|
||||
# summarise( Mean = mean(param_value, na.rm = T)
|
||||
# , SD = sd(param_value, na.rm = T)
|
||||
# , Median = median(param_value, na.rm = T)
|
||||
# , IQR = IQR(param_value, na.rm = T) )
|
||||
|
||||
# Quick tests
|
||||
plotdata_sel = subset(lf_dynamut2
|
||||
, lf_dynamut2$param_type == "ASA")
|
||||
|
||||
plot_sum = plotdata_sel %>%
|
||||
group_by(mutation_info, param_type) %>%
|
||||
summarise(n = n()
|
||||
, Mean = mean(param_value, na.rm = T)
|
||||
, SD = sd(param_value, na.rm = T)
|
||||
, Min = min(param_value, na.rm = T)
|
||||
, Q1 = quantile(param_value, na.rm = T, 0.25)
|
||||
, Median = median(param_value, na.rm = T)
|
||||
, Q3 = quantile(param_value, na.rm = T, 0.75)
|
||||
, Max = max(param_value, na.rm = T) ) %>%
|
||||
rename('Mutation Class' = mutation_info
|
||||
, Parameter = param_type)
|
||||
plot_sum = as.data.frame(plot_sum, row.names = NULL)
|
||||
plot_sum
|
||||
|
||||
bar = compare_means(param_value ~ mutation_info
|
||||
, group.by = "param_type"
|
||||
, data = plotdata_sel
|
||||
, paired = FALSE
|
||||
, p.adjust.method = "BH")
|
||||
bar2 = bar[c("param_type"
|
||||
, "group1"
|
||||
, "group2"
|
||||
, "p.format"
|
||||
, "p.signif"
|
||||
, "p.adj")] %>%
|
||||
rename(Parameter = param_type
|
||||
, Group1 = group1
|
||||
, Group2 = group2
|
||||
, "P-value" = p.format
|
||||
, "P-sig" = p.signif
|
||||
, "P-adj" = p.adj)
|
||||
bar2 = data.frame(bar2); bar2
|
||||
|
||||
library(Hmisc)
|
||||
describe(lf_dynamut2)
|
|
@ -15,39 +15,42 @@ theme_set(theme_grey())
|
|||
## ...opt args
|
||||
#==========================================================
|
||||
stability_count_bp <- function(plotdf
|
||||
, df_colname
|
||||
, leg_title = "Legend title"
|
||||
, axis_text_size = 25
|
||||
, axis_label_size = 22
|
||||
, leg_text_size = 20
|
||||
, leg_title_size = 22
|
||||
, df_colname = ""
|
||||
, leg_title = "Legend Title"
|
||||
, ats = 25 # axis text size
|
||||
, als = 22 # axis label size
|
||||
, lts = 20 # legend text size
|
||||
, ltis = 22 # label title size
|
||||
, geom_ls = 10 # geom_label size
|
||||
, yaxis_title = "Number of nsSNPs"
|
||||
, bp_plot_title = ""
|
||||
, label_categories = c("Destabilising", "Stabilising")
|
||||
, title_colour = "chocolate4"
|
||||
, subtitle_text = NULL
|
||||
, subtitle_size = 20
|
||||
, sts = 20
|
||||
, subtitle_colour = "pink"
|
||||
#, leg_position = c(0.73,0.8) # within plot area
|
||||
, leg_position = "top"){
|
||||
|
||||
OutPlot_count = ggplot(plotdf, aes(x = eval(parse(text = df_colname)))) +
|
||||
# OutPlot_count = ggplot(plotdf, aes(x = eval(parse(text = df_colname)))) +
|
||||
OutPlot_count = ggplot(plotdf, aes_string(x = df_colname)) +
|
||||
geom_bar(aes(fill = eval(parse(text = df_colname))), show.legend = TRUE) +
|
||||
geom_label(stat = "count"
|
||||
, aes(label = ..count..)
|
||||
, color = "black"
|
||||
, show.legend = FALSE
|
||||
, size = 10) +
|
||||
, size = geom_ls) +
|
||||
theme(axis.text.x = element_blank()
|
||||
, axis.title.x = element_blank()
|
||||
, axis.title.y = element_text(size = axis_label_size)
|
||||
, axis.text.y = element_text(size = axis_text_size)
|
||||
, axis.title.y = element_text(size = als)
|
||||
, axis.text.y = element_text(size = ats)
|
||||
, legend.position = leg_position
|
||||
, legend.text = element_text(size = leg_text_size)
|
||||
, legend.title = element_text(size = leg_title_size)
|
||||
, plot.title = element_text(size = axis_label_size
|
||||
, colour = title_colour)
|
||||
, plot.subtitle = element_text(size = subtitle_size
|
||||
, legend.text = element_text(size = lts)
|
||||
, legend.title = element_text(size = ltis)
|
||||
, plot.title = element_text(size = als
|
||||
, colour = title_colour
|
||||
, hjust = 0.5)
|
||||
, plot.subtitle = element_text(size = sts
|
||||
, hjust = 0.5
|
||||
, colour = subtitle_colour)) +
|
||||
labs(title = bp_plot_title
|
||||
|
|
62
scripts/functions/tests/test_bp_lineage.R
Normal file
62
scripts/functions/tests/test_bp_lineage.R
Normal file
|
@ -0,0 +1,62 @@
|
|||
setwd("~/git/LSHTM_analysis/scripts/plotting")
|
||||
|
||||
source ('get_plotting_dfs.R')
|
||||
source("../functions/bp_lineage.R")
|
||||
|
||||
#########################################
|
||||
# Lineage and SNP count: lineage lf data
|
||||
#########################################
|
||||
#=========================
|
||||
# Data: All lineages or
|
||||
# selected few
|
||||
#=========================
|
||||
sel_lineages = levels(lin_lf$sel_lineages_f)
|
||||
sel_lineages
|
||||
lin_lf_plot = lin_lf[lin_lf$sel_lineages_f%in%sel_lineages,]
|
||||
|
||||
# drop unused factor levels
|
||||
lin_lf_plot$sel_lineages_f = factor(lin_lf_plot$sel_lineages_f)
|
||||
levels(lin_lf_plot$sel_lineages_f)
|
||||
#=========================
|
||||
# Lineage count plot
|
||||
#=========================
|
||||
lin_count_bp(lin_lf_plot
|
||||
, x_categ = "sel_lineages_f"
|
||||
, y_count = "p_count"
|
||||
, bar_fill_categ = "count_categ"
|
||||
, display_label_col = "p_count"
|
||||
, bar_stat_stype = "identity"
|
||||
, x_lab_angle = 90
|
||||
, my_xats = 20
|
||||
, bar_col_labels = c("Mutations", "Total Samples")
|
||||
, bar_col_values = c("grey50", "gray75")
|
||||
, y_scale_percent = F # T for diversity
|
||||
, y_log10 = F
|
||||
, y_label = "Count")
|
||||
|
||||
###############################################
|
||||
# Lineage SNP diversity count: lineage wf data
|
||||
###############################################
|
||||
#=========================
|
||||
# Data: All lineages or
|
||||
# selected few
|
||||
#=========================
|
||||
sel_lineages = levels(lin_wf$sel_lineages_f)
|
||||
sel_lineages
|
||||
lin_wf_plot = lin_wf[lin_wf$sel_lineages_f%in%sel_lineages,]
|
||||
|
||||
# drop unused factor levels
|
||||
lin_wf_plot$sel_lineages_f = factor(lin_wf_plot$sel_lineages_f)
|
||||
levels(lin_wf_plot$sel_lineages_f)
|
||||
#=========================
|
||||
# Lineage Diversity plot
|
||||
#=========================
|
||||
lin_count_bp(lin_wf_plot
|
||||
, x_categ = "sel_lineages_f"
|
||||
, y_count = "snp_diversity"
|
||||
, display_label_col = "snp_diversity_f"
|
||||
, bar_stat_stype = "identity"
|
||||
, x_lab_angle = 90
|
||||
, my_xats = 20
|
||||
, y_scale_percent = T
|
||||
, y_label = "SNP diversity")
|
78
scripts/functions/tests/test_bp_subcolours.R
Normal file
78
scripts/functions/tests/test_bp_subcolours.R
Normal file
|
@ -0,0 +1,78 @@
|
|||
#!/usr/bin/env Rscript
|
||||
#source("~/git/Misc/shiny/myshiny/gid_data.R")
|
||||
|
||||
source("~/git/LSHTM_analysis/config/gid.R")
|
||||
source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
|
||||
source("~/git/LSHTM_analysis/scripts/functions/bp_subcolours.R")
|
||||
|
||||
# p1
|
||||
bp_stability_hmap(plotdf = merged_df3
|
||||
, stability_colname = "duet_scaled"
|
||||
, stability_outcome_colname = "duet_outcome"
|
||||
, p_title = "DUET" )
|
||||
|
||||
# p2
|
||||
bp_stability_hmap(plotdf = merged_df3
|
||||
, stability_colname = "foldx_scaled"
|
||||
, stability_outcome_colname = "foldx_outcome"
|
||||
, p_title = "FoldX" )
|
||||
|
||||
# p3
|
||||
bp_stability_hmap(plotdf = merged_df3
|
||||
, stability_colname = "deepddg_scaled"
|
||||
, stability_outcome_colname = "deepddg_outcome"
|
||||
, p_title = "DeepDDG" )
|
||||
|
||||
# p4
|
||||
bp_stability_hmap(plotdf = merged_df3
|
||||
, stability_colname = "ddg_dynamut2_scaled"
|
||||
, stability_outcome_colname = "ddg_dynamut2_outcome"
|
||||
, p_title = "Dynamut2" )
|
||||
|
||||
# p5
|
||||
bp_stability_hmap(plotdf = merged_df3
|
||||
, stability_colname = "mcsm_na_scaled"
|
||||
, stability_outcome_colname = "mcsm_na_outcome"
|
||||
, p_title = "mCSM-NA" )
|
||||
|
||||
# p6
|
||||
bp_stability_hmap(plotdf = merged_df3
|
||||
, stability_colname = "ddg_dynamut_scaled"
|
||||
, stability_outcome_colname = "ddg_dynamut_outcome"
|
||||
, p_title = "Dynamut" )
|
||||
|
||||
# p7
|
||||
bp_stability_hmap(plotdf = merged_df3
|
||||
, stability_colname = "ddg_mcsm_scaled"
|
||||
, stability_outcome_colname = "ddg_mcsm_outcome"
|
||||
, p_title = "mCSM" )
|
||||
|
||||
# p8
|
||||
bp_stability_hmap(plotdf = merged_df3
|
||||
, stability_colname = "ddg_duet_scaled"
|
||||
, stability_outcome_colname = "ddg_duet_outcome"
|
||||
, p_title = "DUET-d" )
|
||||
|
||||
# p9
|
||||
bp_stability_hmap(plotdf = merged_df3
|
||||
, stability_colname = "ddg_sdm_scaled"
|
||||
, stability_outcome_colname = "ddg_sdm_outcome"
|
||||
, p_title = "SDM" )
|
||||
|
||||
# p10
|
||||
bp_stability_hmap(plotdf = merged_df3
|
||||
, stability_colname = "ddg_encom_scaled"
|
||||
, stability_outcome_colname = "ddg_encom_outcome"
|
||||
, p_title = "ENCoM-Stability" )
|
||||
|
||||
# p11
|
||||
bp_stability_hmap(plotdf = merged_df3
|
||||
, stability_colname = "dds_encom_scaled"
|
||||
, stability_outcome_colname = "dds_encom_outcome"
|
||||
, p_title = "ENCoM-Flexibility" )
|
||||
|
||||
# p12
|
||||
bp_stability_hmap(plotdf = merged_df3
|
||||
, stability_colname = "affinity_scaled"
|
||||
, stability_outcome_colname = "ligand_outcome"
|
||||
, p_title = "mCSM-lig" )
|
59
scripts/functions/tests/test_bp_subcolours_i.R
Normal file
59
scripts/functions/tests/test_bp_subcolours_i.R
Normal file
|
@ -0,0 +1,59 @@
|
|||
#!/usr/bin/env Rscript
|
||||
#source("~/git/Misc/shiny/myshiny/gid_data.R")
|
||||
|
||||
source("~/git/LSHTM_analysis/config/gid.R")
|
||||
source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
|
||||
source("~/git/LSHTM_analysis/scripts/functions/bp_subcolours.R")
|
||||
|
||||
# p1
|
||||
bp_stability_hmap(plotdf = merged_df3
|
||||
, stability_colname = "duet_scaled"
|
||||
, stability_outcome_colname = "duet_outcome"
|
||||
, p_title = "DUET" )
|
||||
|
||||
# p2
|
||||
bp_stability_hmap(plotdf = merged_df3
|
||||
, stability_colname = "foldx_scaled"
|
||||
, stability_outcome_colname = "foldx_outcome"
|
||||
, p_title = "FoldX" )
|
||||
|
||||
# p3
|
||||
bp_stability_hmap(plotdf = merged_df3
|
||||
, stability_colname = "deepddg_scaled"
|
||||
, stability_outcome_colname = "deepddg_outcome"
|
||||
, p_title = "DeepDDG" )
|
||||
|
||||
##################################################
|
||||
|
||||
merged_df3_f = merged_df3
|
||||
|
||||
setDT(merged_df3_f)[, pos_count := .N, by = position]
|
||||
|
||||
##################################################
|
||||
ui <- basicPage(
|
||||
plotOutput("plot1", click = "plot_click"),
|
||||
verbatimTextOutput("info")
|
||||
)
|
||||
|
||||
server <- function(input, output) {
|
||||
output$plot1 <- renderPlot({
|
||||
|
||||
#plot(mtcars$wt, mtcars$mpg)
|
||||
bp_stability_hmap(plotdf = merged_df3_f
|
||||
, xvar_colname = "position"
|
||||
, stability_colname = "foldx_scaled"
|
||||
, stability_outcome_colname = "foldx_outcome"
|
||||
, p_title = "FoldX" )
|
||||
|
||||
})
|
||||
|
||||
output$info <- renderPrint({
|
||||
# With base graphics, need to tell it what the x and y variables are.
|
||||
nearPoints(merged_df3_f, input$plot_click
|
||||
, xvar = "position"
|
||||
, yvar = "pos_count"
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
shinyApp(ui, server)
|
58
scripts/functions/tests/test_lf_bp.R
Normal file
58
scripts/functions/tests/test_lf_bp.R
Normal file
|
@ -0,0 +1,58 @@
|
|||
setwd("~/git/LSHTM_analysis/scripts/plotting/")
|
||||
source("Header_TT.R")
|
||||
source("../functions/lf_bp.R")
|
||||
# ================================================
|
||||
# Data: run get_plotting_data.R
|
||||
# to get the long format data to test this function
|
||||
# drug = "streptomycin"
|
||||
# gene = "gid"
|
||||
# source("get_plotting_dfs.R")
|
||||
# ==================================================
|
||||
|
||||
######################
|
||||
# Make plot: ggplot
|
||||
######################
|
||||
lf_bp(lf_df = lf_encomddg
|
||||
, p_title = "ENCoM-DDG"
|
||||
, colour_categ = "ddg_encom_outcome"
|
||||
, x_grp = "mutation_info"
|
||||
, y_var = "param_value"
|
||||
, facet_var = "param_type"
|
||||
, n_facet_row = 1
|
||||
, y_scales = "free_y"
|
||||
, colour_bp_strip = "khaki2"
|
||||
, dot_size = 3
|
||||
, dot_transparency = 0.3
|
||||
, violin_quantiles = c(0.25, 0.5, 0.75)
|
||||
, my_ats = 22 # axis text size
|
||||
, my_als = 20 # axis label size
|
||||
, my_fls = 20 # facet label size
|
||||
, my_pts = 22 # plot title size
|
||||
, make_boxplot = F
|
||||
, bp_width = "auto"
|
||||
, add_stats = T
|
||||
, stat_grp_comp = c("DM", "OM")
|
||||
, stat_method = "wilcox.test"
|
||||
, my_paired = FALSE
|
||||
, stat_label = c("p.format", "p.signif") )
|
||||
|
||||
#wilcox.test(wf_encomdds$`EnCOM ΔΔS`[wf_encomdds$mutation_info == "DM"]
|
||||
# , wf_encomdds$`EnCOM ΔΔS`[wf_encomdds$mutation_info == "OM"])
|
||||
|
||||
######################
|
||||
# Make plot: plotly
|
||||
######################
|
||||
# FIXME: This labels are not working as I want!
|
||||
# lf_bp_plotly(lf_df = lf_deepddg
|
||||
# , p_title = "DeepDDG"
|
||||
# , colour_categ = "deepddg_outcome"
|
||||
# , x_grp = "mutation_info"
|
||||
# , y_var = "param_value"
|
||||
# , facet_var = "param_type"
|
||||
# , n_facet_row = 1
|
||||
# , y_scales = "free_y"
|
||||
# , colour_bp_strip = "khaki2"
|
||||
# , dot_size = 3
|
||||
# , dot_transparency = 0.3
|
||||
# , violin_quantiles = c(0.25, 0.5, 0.75)
|
||||
# )
|
19
scripts/functions/tests/test_lf_unpaired_stats.R
Normal file
19
scripts/functions/tests/test_lf_unpaired_stats.R
Normal file
|
@ -0,0 +1,19 @@
|
|||
setwd("~/git/LSHTM_analysis/scripts/functions")
|
||||
source("lf_unpaired_stats.R")
|
||||
|
||||
#####################
|
||||
# call stat function()
|
||||
# a useful way to check stats
|
||||
# for any lf data
|
||||
#####################
|
||||
# Note: Data
|
||||
# run other_plots_data.R
|
||||
# to get the long format data to test this function
|
||||
|
||||
stat_results_df <- lf_unpaired_stats(lf_data = lf_duet
|
||||
, lf_stat_value = "param_value"
|
||||
, lf_stat_group = "mutation_info"
|
||||
, lf_col_statvars = "param_type"
|
||||
, my_paired = FALSE
|
||||
, stat_adj = "none"
|
||||
)
|
33
scripts/functions/tests/test_lineage_dist.R
Normal file
33
scripts/functions/tests/test_lineage_dist.R
Normal file
|
@ -0,0 +1,33 @@
|
|||
###############################
|
||||
# TEST function lineage_dist.R
|
||||
# to plot lineage
|
||||
# dist plots with or without facet
|
||||
##############################
|
||||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/scripts/plotting/")
|
||||
getwd()
|
||||
|
||||
source("Header_TT.R")
|
||||
|
||||
source("get_plotting_dfs.R")
|
||||
|
||||
cat("cols imported:"
|
||||
, mcsm_red2, mcsm_red1, mcsm_mid, mcsm_blue1, mcsm_blue2)
|
||||
|
||||
|
||||
#############################################################
|
||||
# without facet
|
||||
lineage_distP(lin_dist_plot
|
||||
, with_facet = F
|
||||
, leg_label = "Mutation Class"
|
||||
)
|
||||
|
||||
# without facet
|
||||
lineage_distP(lin_dist_plot
|
||||
, with_facet = T
|
||||
, facet_wrap_var = "mutation_info_labels"
|
||||
, leg_label = "Mutation Class"
|
||||
, leg_pos_wf = "none"
|
||||
, leg_dir_wf = "horizontal"
|
||||
|
||||
)
|
|
@ -1,14 +1,11 @@
|
|||
#########################################################
|
||||
### A) Installing and loading required packages
|
||||
# A) Installing and loading required packages
|
||||
# B) My functions
|
||||
#########################################################
|
||||
|
||||
#########################################################
|
||||
#lib_loc = "/usr/local/lib/R/site-library")
|
||||
|
||||
#if (!require("gplots")) {
|
||||
# install.packages("gplots", dependencies = TRUE)
|
||||
# library(gplots)
|
||||
#}
|
||||
require(extrafont)
|
||||
|
||||
require("getopt", quietly = TRUE) # cmd parse arguments
|
||||
|
||||
if (!require("tidyverse")) {
|
||||
|
@ -16,9 +13,53 @@ if (!require("tidyverse")) {
|
|||
library(tidyverse)
|
||||
}
|
||||
|
||||
if (!require("ggplot2")) {
|
||||
install.packages("ggplot2", dependencies = TRUE)
|
||||
library(ggplot2)
|
||||
if (!require("shiny")) {
|
||||
install.packages("shiny", dependencies = TRUE)
|
||||
library(shiny)
|
||||
}
|
||||
|
||||
if (!require("shinyBS")) {
|
||||
install.packages("shinyBS", dependencies = TRUE)
|
||||
library(shinyBS)
|
||||
}
|
||||
|
||||
if (!require("gridExtra")) {
|
||||
install.packages("gridExtra", dependencies = TRUE)
|
||||
library(gridExtra)
|
||||
}
|
||||
|
||||
if (!require("ggridges")) {
|
||||
install.packages("ggridges", dependencies = TRUE)
|
||||
library(ggridges)
|
||||
}
|
||||
|
||||
# if (!require("ggplot2")) {
|
||||
# install.packages("ggplot2", dependencies = TRUE)
|
||||
# library(ggplot2)
|
||||
# }
|
||||
|
||||
# if (!require ("dplyr")){
|
||||
# install.packages("dplyr")
|
||||
# library(dplyr)
|
||||
# }
|
||||
|
||||
if (!require ("DT")){
|
||||
install.packages("DT")
|
||||
library(DT)
|
||||
}
|
||||
|
||||
if (!require ("plyr")){
|
||||
install.packages("plyr")
|
||||
library(plyr)
|
||||
}
|
||||
|
||||
# Install
|
||||
#if(!require(devtools)) install.packages("devtools")
|
||||
#devtools::install_github("kassambara/ggcorrplot")
|
||||
|
||||
if (!require ("ggbeeswarm")){
|
||||
install.packages("ggbeeswarm")
|
||||
library(ggbeeswarm)
|
||||
}
|
||||
|
||||
if (!require("plotly")) {
|
||||
|
@ -101,11 +142,6 @@ if (!require ("psych")){
|
|||
library(psych)
|
||||
}
|
||||
|
||||
if (!require ("dplyr")){
|
||||
install.packages("dplyr")
|
||||
library(dplyr)
|
||||
}
|
||||
|
||||
if (!require ("compare")){
|
||||
install.packages("compare")
|
||||
library(compare)
|
||||
|
@ -116,6 +152,22 @@ if (!require ("arsenal")){
|
|||
library(arsenal)
|
||||
}
|
||||
|
||||
if(!require(ggseqlogo)){
|
||||
install.packages("ggseqlogo")
|
||||
library(ggseqlogo)
|
||||
}
|
||||
|
||||
# for PDB files
|
||||
if(!require(bio3d)){
|
||||
install.packages("bio3d")
|
||||
library(bio3d)
|
||||
}
|
||||
|
||||
library(protr)
|
||||
if(!require(protr)){
|
||||
install.packages("protr")
|
||||
library(protr)
|
||||
}
|
||||
|
||||
#if (!requireNamespace("BiocManager", quietly = TRUE))
|
||||
# install.packages("BiocManager")
|
||||
|
@ -123,24 +175,14 @@ if (!require ("arsenal")){
|
|||
#BiocManager::install("Logolas")
|
||||
library("Logolas")
|
||||
|
||||
#install.packages("ggseqlogo")
|
||||
library(ggseqlogo)
|
||||
|
||||
####################################
|
||||
# Load all my functions:
|
||||
# only works if tidyverse is loaded
|
||||
# hence included it here!
|
||||
####################################
|
||||
|
||||
####TIDYVERSE
|
||||
# Install
|
||||
#if(!require(devtools)) install.packages("devtools")
|
||||
#devtools::install_github("kassambara/ggcorrplot")
|
||||
func_path = "~/git/LSHTM_analysis/scripts/functions/"
|
||||
source_files <- list.files(func_path, "\\.R$") # locate all .R files
|
||||
map(paste0(func_path, source_files), source) # source all your R scripts!
|
||||
|
||||
library(ggcorrplot)
|
||||
|
||||
|
||||
###for PDB files
|
||||
#install.packages("bio3d")
|
||||
if(!require(bio3d)){
|
||||
install.packages("bio3d")
|
||||
library(bio3d)
|
||||
}
|
||||
|
||||
#install.packages("protr")
|
||||
library(protr)
|
||||
|
|
2
scripts/plotting/basic_barplots_combined.R
Normal file → Executable file
2
scripts/plotting/basic_barplots_combined.R
Normal file → Executable file
|
@ -23,7 +23,7 @@ plot_basic_bp_combined_labelled = paste0(plotdir,"/", basic_bp_combined_labell
|
|||
|
||||
#=======================================================================
|
||||
#=======
|
||||
# combin DUET and Ligand affinity plots
|
||||
# combine DUET and Ligand affinity plots
|
||||
#=======
|
||||
svg(plot_basic_bp_combined_labelled , width = 12, height = 12 )
|
||||
|
||||
|
|
0
scripts/plotting/corr_adjusted_PS_LIG.R
Normal file → Executable file
0
scripts/plotting/corr_adjusted_PS_LIG.R
Normal file → Executable file
120
scripts/plotting/corr_data.R
Normal file
120
scripts/plotting/corr_data.R
Normal file
|
@ -0,0 +1,120 @@
|
|||
#!/usr/bin/env Rscript
|
||||
#########################################################
|
||||
# TASK: Script to format data for corr plots
|
||||
#########################################################
|
||||
|
||||
#=================================================
|
||||
# Data for Corrplots
|
||||
#=================================================
|
||||
cat("\n=========================================="
|
||||
, "\nCORR PLOTS data: ALL params"
|
||||
, "\n=========================================")
|
||||
|
||||
# use data
|
||||
#merged_df2
|
||||
|
||||
#----------------------------
|
||||
# columns for corr plots:PS
|
||||
#----------------------------
|
||||
# NOTE: you can add mcsm_ppi column as well, and it will only select what it can find!
|
||||
big_df_colnames = data.frame(names(merged_df2))
|
||||
|
||||
corr_cols_select <- c("mutationinformation", drug, "mutation_info_labels"
|
||||
, "duet_stability_change", "ligand_affinity_change", "ddg_foldx", "asa", "rsa"
|
||||
, "rd_values", "kd_values", "log10_or_mychisq", "neglog_pval_fisher","af"
|
||||
, "deepddg", "ddg_dynamut", "ddg_dynamut2", "mcsm_na_affinity"
|
||||
, "ddg_encom", "dds_encom", "ddg_mcsm", "ddg_sdm", "ddg_duet", "ligand_distance")
|
||||
|
||||
#===========================
|
||||
# Corr data for plots: PS
|
||||
# big_df ps: ~ merged_df2
|
||||
#===========================
|
||||
|
||||
corr_df_m2 = merged_df2[,colnames(merged_df2)%in%corr_cols_select]
|
||||
|
||||
#-----------------------
|
||||
# formatting: some cols
|
||||
# Add pretty colnames
|
||||
#-----------------------
|
||||
corr_df_m2_f <- corr_df_m2 %>%
|
||||
rename(
|
||||
DUET = duet_stability_change
|
||||
, 'mCSM-lig' = ligand_affinity_change
|
||||
, FoldX = ddg_foldx
|
||||
, DeepDDG = deepddg
|
||||
, ASA = asa
|
||||
, RSA = rsa
|
||||
, KD = kd_values
|
||||
, RD = rd_values
|
||||
, MAF = af
|
||||
, 'Log (OR)' = log10_or_mychisq
|
||||
, '-Log (P)' = neglog_pval_fisher
|
||||
, Dynamut = ddg_dynamut
|
||||
, 'ENCoM-DDG'= ddg_encom
|
||||
, mCSM = ddg_mcsm
|
||||
, SDM = ddg_sdm
|
||||
, 'DUET-d' = ddg_duet
|
||||
, 'ENCoM-DDS'= dds_encom
|
||||
, Dynamut2 = ddg_dynamut2
|
||||
, 'mCSM-NA' = mcsm_na_affinity )
|
||||
|
||||
#===========================
|
||||
# Corr data for plots: PS
|
||||
# short_df ps: ~merged_df3
|
||||
#===========================
|
||||
|
||||
corr_df_m3 = corr_df_m2[!duplicated(corr_df_m2$mutationinformation),]
|
||||
|
||||
na_or = sum(is.na(corr_df_m3$log10_or_mychisq))
|
||||
check1 = nrow(corr_df_m3) - na_or; check1
|
||||
|
||||
if (nrow(corr_df_m3) == nrow(merged_df3) && nrow(merged_df3_comp) == check1) {
|
||||
cat( "\nPASS: No. of rows for corr_df_m3 match"
|
||||
, "\nPASS: No. of OR values checked: " , check1)
|
||||
} else {
|
||||
cat("\nFAIL: Numbers mismatch:"
|
||||
, "\nExpected nrows: ", nrow(merged_df3)
|
||||
, "\nGot: ", nrow(corr_df_m3)
|
||||
, "\nExpected OR values: ", nrow(merged_df3_comp)
|
||||
, "\nGot: ", check1)
|
||||
}
|
||||
|
||||
#-----------------------
|
||||
# formatting: some cols
|
||||
# Add pretty colnames
|
||||
#-----------------------
|
||||
corr_df_m3_f <- corr_df_m3 %>%
|
||||
rename(
|
||||
DUET = duet_stability_change
|
||||
, 'mCSM-lig' = ligand_affinity_change
|
||||
, FoldX = ddg_foldx
|
||||
, DeepDDG = deepddg
|
||||
, ASA = asa
|
||||
, RSA = rsa
|
||||
, KD = kd_values
|
||||
, RD = rd_values
|
||||
, MAF = af
|
||||
, 'Log (OR)' = log10_or_mychisq
|
||||
, '-Log (P)' = neglog_pval_fisher
|
||||
, Dynamut = ddg_dynamut
|
||||
, 'ENCoM-DDG'= ddg_encom
|
||||
, mCSM = ddg_mcsm
|
||||
, SDM = ddg_sdm
|
||||
, 'DUET-d' = ddg_duet
|
||||
, 'ENCoM-DDS'= dds_encom
|
||||
, Dynamut2 = ddg_dynamut2
|
||||
, 'mCSM-NA' = mcsm_na_affinity )
|
||||
|
||||
########################################################################
|
||||
cat("\nCorr Data created:"
|
||||
, "\n==================================="
|
||||
, "\ncorr_df_m2: created from merged_df2"
|
||||
, "\n==================================="
|
||||
, "\nnrows:", nrow(corr_df_m2)
|
||||
, "\nncols:", ncol(corr_df_m2)
|
||||
, "\n==================================="
|
||||
, "\ncorr_df_m3: created from merged_df3"
|
||||
, "\n==================================="
|
||||
, "\nnrows:", nrow(corr_df_m3)
|
||||
, "\nncols:", ncol(corr_df_m3)
|
||||
)
|
0
scripts/plotting/dirs.R
Normal file → Executable file
0
scripts/plotting/dirs.R
Normal file → Executable file
0
scripts/plotting/dist_plots_check.R
Normal file → Executable file
0
scripts/plotting/dist_plots_check.R
Normal file → Executable file
416
scripts/plotting/dm_om_data.R
Normal file
416
scripts/plotting/dm_om_data.R
Normal file
|
@ -0,0 +1,416 @@
|
|||
#!/usr/bin/env Rscript
|
||||
#########################################################
|
||||
# TASK: Script to format data for dm om plots:
|
||||
# generating LF data
|
||||
# sourced by get_plotting_dfs.R
|
||||
#########################################################
|
||||
##========================================================================
|
||||
# cols to select:
|
||||
# THINK: whu
|
||||
|
||||
comb_df <- merged_df3[, c("mutationinformation", "mutation"
|
||||
, "mutation_info","mutation_info_labels"
|
||||
, "position"
|
||||
, LigDist_colname
|
||||
, "duet_stability_change", "duet_scaled", "duet_outcome"
|
||||
, "ligand_affinity_change", "affinity_scaled", "ligand_outcome"
|
||||
, "ddg_foldx", "foldx_scaled", "foldx_outcome"
|
||||
, "deepddg", "deepddg_scaled", "deepddg_outcome"
|
||||
, "asa", "rsa"
|
||||
, "rd_values", "kd_values"
|
||||
, "log10_or_mychisq", "neglog_pval_fisher", "af"
|
||||
, "mcsm_na_affinity", "mcsm_na_scaled", "mcsm_na_outcome"
|
||||
, "ddg_dynamut", "ddg_dynamut_scaled","ddg_dynamut_outcome"
|
||||
, "ddg_encom", "ddg_encom_scaled", "ddg_encom_outcome"
|
||||
, "dds_encom", "dds_encom_scaled", "dds_encom_outcome"
|
||||
, "ddg_mcsm", "ddg_mcsm_scaled", "ddg_mcsm_outcome"
|
||||
, "ddg_sdm", "ddg_sdm_scaled", "ddg_sdm_outcome"
|
||||
, "ddg_duet", "ddg_duet_scaled", "ddg_duet_outcome"
|
||||
, "ddg_dynamut2","ddg_dynamut2_scaled", "ddg_dynamut2_outcome")]
|
||||
|
||||
|
||||
comb_df_s = arrange(comb_df, position)
|
||||
|
||||
#=======================================================================
|
||||
fact_cols = colnames(comb_df_s)[grepl( "_outcome|_info", colnames(comb_df_s) )]
|
||||
fact_cols
|
||||
lapply(comb_df_s[, fact_cols], class)
|
||||
comb_df_s[, fact_cols] <- lapply(comb_df_s[, fact_cols], as.factor)
|
||||
|
||||
if (any(lapply(comb_df_s[, fact_cols], class) == "character")){
|
||||
cat("\nChanging cols to factor")
|
||||
comb_df_s[, fact_cols] <- lapply(comb_df_s[, fact_cols],as.factor)
|
||||
if (all(lapply(comb_df_s[, fact_cols], class) == "factor")){
|
||||
cat("\nSuccessful: cols changed to factor")
|
||||
}
|
||||
}
|
||||
lapply(comb_df_s[, fact_cols], class)
|
||||
|
||||
#=======================================================================
|
||||
table(comb_df_s$mutation_info)
|
||||
|
||||
# further checks to make sure dr and other muts are indeed unique
|
||||
dr_muts = comb_df_s[comb_df_s$mutation_info == dr_muts_col,]
|
||||
dr_muts_names = unique(dr_muts$mutation)
|
||||
|
||||
other_muts = comb_df_s[comb_df_s$mutation_info == other_muts_col,]
|
||||
other_muts_names = unique(other_muts$mutation)
|
||||
|
||||
if ( table(dr_muts_names%in%other_muts_names)[[1]] == length(dr_muts_names) &&
|
||||
table(other_muts_names%in%dr_muts_names)[[1]] == length(other_muts_names) ){
|
||||
cat("PASS: dr and other muts are indeed unique")
|
||||
}else{
|
||||
cat("FAIL: dr and others muts are NOT unique!")
|
||||
quit()
|
||||
}
|
||||
|
||||
# pretty display names i.e. labels to reduce major code duplication later
|
||||
foo_cnames = data.frame(colnames(comb_df_s))
|
||||
names(foo_cnames) <- "old_name"
|
||||
|
||||
stability_suffix <- paste0(delta_symbol, delta_symbol, "G")
|
||||
flexibility_suffix <- paste0(delta_symbol, delta_symbol, "S")
|
||||
|
||||
lig_dn = paste0("Ligand distance (", angstroms_symbol, ")"); lig_dn
|
||||
duet_dn = paste0("DUET ", stability_suffix); duet_dn
|
||||
foldx_dn = paste0("FoldX ", stability_suffix); foldx_dn
|
||||
deepddg_dn = paste0("Deepddg " , stability_suffix); deepddg_dn
|
||||
mcsm_na_dn = paste0("mCSM-NA affinity ", stability_suffix); mcsm_na_dn
|
||||
dynamut_dn = paste0("Dynamut ", stability_suffix); dynamut_dn
|
||||
dynamut2_dn = paste0("Dynamut2 " , stability_suffix); dynamut2_dn
|
||||
encom_ddg_dn = paste0("EnCOM " , stability_suffix); encom_ddg_dn
|
||||
encom_dds_dn = paste0("EnCOM " , flexibility_suffix ); encom_dds_dn
|
||||
sdm_dn = paste0("SDM " , stability_suffix); sdm_dn
|
||||
mcsm_dn = paste0("mCSM " , stability_suffix ); mcsm_dn
|
||||
|
||||
# Change colnames of some columns using datatable
|
||||
comb_df_sl = comb_df_s
|
||||
names(comb_df_sl)
|
||||
|
||||
setnames(comb_df_sl
|
||||
, old = c("asa", "rsa", "rd_values", "kd_values"
|
||||
, "log10_or_mychisq", "neglog_pval_fisher", "af"
|
||||
, LigDist_colname
|
||||
, "duet_scaled"
|
||||
, "foldx_scaled"
|
||||
, "deepddg_scaled"
|
||||
, "mcsm_na_scaled"
|
||||
, "ddg_dynamut_scaled"
|
||||
, "ddg_dynamut2_scaled"
|
||||
, "ddg_encom_scaled"
|
||||
, "dds_encom_scaled"
|
||||
, "ddg_sdm"
|
||||
, "ddg_mcsm")
|
||||
|
||||
, new = c("ASA", "RSA", "RD", "KD"
|
||||
, "Log10 (OR)", "-Log (P)", "MAF"
|
||||
, lig_dn
|
||||
, duet_dn
|
||||
, foldx_dn
|
||||
, deepddg_dn
|
||||
, mcsm_na_dn
|
||||
, dynamut_dn
|
||||
, dynamut2_dn
|
||||
, encom_ddg_dn
|
||||
, encom_dds_dn
|
||||
, sdm_dn
|
||||
, mcsm_dn)
|
||||
)
|
||||
|
||||
foo_cnames <- cbind(foo_cnames, colnames(comb_df_sl))
|
||||
|
||||
# some more pretty labels
|
||||
table(comb_df_sl$mutation_info)
|
||||
|
||||
levels(comb_df_sl$mutation_info)[levels(comb_df_sl$mutation_info)==dr_muts_col] <- "DM"
|
||||
levels(comb_df_sl$mutation_info)[levels(comb_df_sl$mutation_info)==other_muts_col] <- "OM"
|
||||
|
||||
table(comb_df_sl$mutation_info)
|
||||
|
||||
#######################################################################
|
||||
#======================
|
||||
# Selecting dfs
|
||||
# with appropriate cols
|
||||
#=======================
|
||||
static_cols_start = c("mutationinformation"
|
||||
, "position"
|
||||
, "mutation"
|
||||
, "mutation_info")
|
||||
|
||||
static_cols_end = c(lig_dn
|
||||
, "ASA"
|
||||
, "RSA"
|
||||
, "RD"
|
||||
, "KD")
|
||||
|
||||
# ordering is important!
|
||||
|
||||
#########################################################################
|
||||
#==============
|
||||
# DUET: LF
|
||||
#==============
|
||||
cols_to_select_duet = c(static_cols_start, c("duet_outcome", duet_dn), static_cols_end)
|
||||
wf_duet = comb_df_sl[, cols_to_select_duet]
|
||||
|
||||
#pivot_cols_ps = cols_to_select_ps[1:5]; pivot_cols_ps
|
||||
pivot_cols_duet = cols_to_select_duet[1: (length(static_cols_start) + 1)]; pivot_cols_duet
|
||||
|
||||
expected_rows_lf = nrow(wf_duet) * (length(wf_duet) - length(pivot_cols_duet))
|
||||
expected_rows_lf
|
||||
|
||||
# LF data: duet
|
||||
lf_duet = gather(wf_duet
|
||||
, key = param_type
|
||||
, value = param_value
|
||||
, all_of(duet_dn):tail(static_cols_end,1)
|
||||
, factor_key = TRUE)
|
||||
|
||||
if (nrow(lf_duet) == expected_rows_lf){
|
||||
cat("\nPASS: long format data created for ", duet_dn)
|
||||
}else{
|
||||
cat("\nFAIL: long format data could not be created for duet")
|
||||
quit()
|
||||
}
|
||||
|
||||
############################################################################
|
||||
#==============
|
||||
# FoldX: LF
|
||||
#==============
|
||||
cols_to_select_foldx= c(static_cols_start, c("foldx_outcome", foldx_dn), static_cols_end)
|
||||
wf_foldx = comb_df_sl[, cols_to_select_foldx]
|
||||
|
||||
pivot_cols_foldx = cols_to_select_foldx[1: (length(static_cols_start) + 1)]; pivot_cols_foldx
|
||||
|
||||
expected_rows_lf = nrow(wf_foldx) * (length(wf_foldx) - length(pivot_cols_foldx))
|
||||
expected_rows_lf
|
||||
|
||||
# LF data: Foldx
|
||||
lf_foldx <<- gather(wf_foldx
|
||||
, key = param_type
|
||||
, value = param_value
|
||||
, all_of(foldx_dn):tail(static_cols_end,1)
|
||||
, factor_key = TRUE)
|
||||
|
||||
if (nrow(lf_foldx) == expected_rows_lf){
|
||||
cat("\nPASS: long format data created for ", foldx_dn)
|
||||
}else{
|
||||
cat("\nFAIL: long format data could not be created for duet")
|
||||
quit()
|
||||
}
|
||||
|
||||
############################################################################
|
||||
#==============
|
||||
# Deepddg: LF
|
||||
#==============
|
||||
cols_to_select_deepddg = c(static_cols_start, c("deepddg_outcome", deepddg_dn), static_cols_end)
|
||||
wf_deepddg = comb_df_sl[, cols_to_select_deepddg]
|
||||
|
||||
pivot_cols_deepddg = cols_to_select_deepddg[1: (length(static_cols_start) + 1)]; pivot_cols_deepddg
|
||||
|
||||
expected_rows_lf = nrow(wf_deepddg) * (length(wf_deepddg) - length(pivot_cols_deepddg))
|
||||
expected_rows_lf
|
||||
|
||||
# LF data: Deepddg
|
||||
lf_deepddg = gather(wf_deepddg
|
||||
, key = param_type
|
||||
, value = param_value
|
||||
, all_of(deepddg_dn):tail(static_cols_end,1)
|
||||
, factor_key = TRUE)
|
||||
|
||||
if (nrow(lf_deepddg) == expected_rows_lf){
|
||||
cat("\nPASS: long format data created for ", deepddg_dn)
|
||||
}else{
|
||||
cat("\nFAIL: long format data could not be created for duet")
|
||||
quit()
|
||||
}
|
||||
|
||||
############################################################################
|
||||
#==============
|
||||
# mCSM-NA: LF
|
||||
#==============
|
||||
cols_to_select_mcsm_na = c(static_cols_start, c("mcsm_na_outcome", mcsm_na_dn), static_cols_end)
|
||||
wf_mcsm_na = comb_df_sl[, cols_to_select_mcsm_na]
|
||||
|
||||
pivot_cols_mcsm_na = cols_to_select_mcsm_na[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm_na
|
||||
|
||||
expected_rows_lf = nrow(wf_mcsm_na) * (length(wf_mcsm_na) - length(pivot_cols_mcsm_na))
|
||||
expected_rows_lf
|
||||
|
||||
# LF data: mcsm_na
|
||||
lf_mcsm_na = gather(wf_mcsm_na
|
||||
, key = param_type
|
||||
, value = param_value
|
||||
, all_of(mcsm_na_dn):tail(static_cols_end,1)
|
||||
, factor_key = TRUE)
|
||||
|
||||
if (nrow(lf_mcsm_na) == expected_rows_lf){
|
||||
cat("\nPASS: long format data created for ", mcsm_na_dn)
|
||||
}else{
|
||||
cat("\nFAIL: long format data could not be created for duet")
|
||||
quit()
|
||||
}
|
||||
|
||||
############################################################################
|
||||
#==============
|
||||
# Dynamut: LF
|
||||
#==============
|
||||
cols_to_select_dynamut = c(static_cols_start, c("ddg_dynamut_outcome", dynamut_dn), static_cols_end)
|
||||
wf_dynamut = comb_df_sl[, cols_to_select_dynamut]
|
||||
|
||||
pivot_cols_dynamut = cols_to_select_dynamut[1: (length(static_cols_start) + 1)]; pivot_cols_dynamut
|
||||
|
||||
expected_rows_lf = nrow(wf_dynamut) * (length(wf_dynamut) - length(pivot_cols_dynamut))
|
||||
expected_rows_lf
|
||||
|
||||
# LF data: dynamut
|
||||
lf_dynamut = gather(wf_dynamut
|
||||
, key = param_type
|
||||
, value = param_value
|
||||
, all_of(dynamut_dn):tail(static_cols_end,1)
|
||||
, factor_key = TRUE)
|
||||
|
||||
if (nrow(lf_dynamut) == expected_rows_lf){
|
||||
cat("\nPASS: long format data created for ", dynamut_dn)
|
||||
}else{
|
||||
cat("\nFAIL: long format data could not be created for duet")
|
||||
quit()
|
||||
}
|
||||
|
||||
############################################################################
|
||||
#==============
|
||||
# Dynamut2: LF
|
||||
#==============
|
||||
cols_to_select_dynamut2 = c(static_cols_start, c("ddg_dynamut2_outcome", dynamut2_dn), static_cols_end)
|
||||
|
||||
wf_dynamut2 = comb_df_sl[, cols_to_select_dynamut2]
|
||||
|
||||
pivot_cols_dynamut2 = cols_to_select_dynamut2[1: (length(static_cols_start) + 1)]; pivot_cols_dynamut2
|
||||
|
||||
expected_rows_lf = nrow(wf_dynamut2) * (length(wf_dynamut2) - length(pivot_cols_dynamut2))
|
||||
expected_rows_lf
|
||||
|
||||
# LF data: dynamut2
|
||||
lf_dynamut2 = gather(wf_dynamut2
|
||||
, key = param_type
|
||||
, value = param_value
|
||||
, all_of(dynamut2_dn):tail(static_cols_end,1)
|
||||
, factor_key = TRUE)
|
||||
|
||||
if (nrow(lf_dynamut2) == expected_rows_lf){
|
||||
cat("\nPASS: long format data created for ", dynamut2_dn)
|
||||
}else{
|
||||
cat("\nFAIL: long format data could not be created for duet")
|
||||
quit()
|
||||
}
|
||||
|
||||
############################################################################
|
||||
#==============
|
||||
# EnCOM ddg: LF
|
||||
#==============
|
||||
cols_to_select_encomddg = c(static_cols_start, c("ddg_encom_outcome", encom_ddg_dn), static_cols_end)
|
||||
wf_encomddg = comb_df_sl[, cols_to_select_encomddg]
|
||||
|
||||
pivot_cols_encomddg = cols_to_select_encomddg[1: (length(static_cols_start) + 1)]; pivot_cols_encomddg
|
||||
|
||||
expected_rows_lf = nrow(wf_encomddg ) * (length(wf_encomddg ) - length(pivot_cols_encomddg))
|
||||
expected_rows_lf
|
||||
|
||||
# LF data: encomddg
|
||||
lf_encomddg = gather(wf_encomddg
|
||||
, key = param_type
|
||||
, value = param_value
|
||||
, all_of(encom_ddg_dn):tail(static_cols_end,1)
|
||||
, factor_key = TRUE)
|
||||
|
||||
if (nrow(lf_encomddg) == expected_rows_lf){
|
||||
cat("\nPASS: long format data created for ", encom_ddg_dn)
|
||||
}else{
|
||||
cat("\nFAIL: long format data could not be created for duet")
|
||||
quit()
|
||||
}
|
||||
############################################################################
|
||||
#==============
|
||||
# EnCOM dds: LF
|
||||
#==============
|
||||
cols_to_select_encomdds = c(static_cols_start, c("dds_encom_outcome", encom_dds_dn), static_cols_end)
|
||||
wf_encomdds = comb_df_sl[, cols_to_select_encomdds]
|
||||
|
||||
pivot_cols_encomdds = cols_to_select_encomdds[1: (length(static_cols_start) + 1)]; pivot_cols_encomdds
|
||||
|
||||
expected_rows_lf = nrow(wf_encomdds) * (length(wf_encomdds) - length(pivot_cols_encomdds))
|
||||
expected_rows_lf
|
||||
|
||||
# LF data: encomdds
|
||||
lf_encomdds = gather(wf_encomdds
|
||||
, key = param_type
|
||||
, value = param_value
|
||||
, all_of(encom_dds_dn):tail(static_cols_end,1)
|
||||
, factor_key = TRUE)
|
||||
|
||||
if (nrow(lf_encomdds) == expected_rows_lf){
|
||||
cat("\nPASS: long format data created for", encom_dds_dn)
|
||||
}else{
|
||||
cat("\nFAIL: long format data could not be created for duet")
|
||||
quit()
|
||||
}
|
||||
|
||||
############################################################################
|
||||
#==============
|
||||
# SDM: LF
|
||||
#==============
|
||||
cols_to_select_sdm = c(static_cols_start, c("ddg_sdm_outcome", sdm_dn), static_cols_end)
|
||||
wf_sdm = comb_df_sl[, cols_to_select_sdm]
|
||||
|
||||
pivot_cols_sdm = cols_to_select_sdm[1: (length(static_cols_start) + 1)]; pivot_cols_sdm
|
||||
|
||||
expected_rows_lf = nrow(wf_sdm) * (length(wf_sdm) - length(pivot_cols_sdm))
|
||||
expected_rows_lf
|
||||
|
||||
# LF data: sdm
|
||||
lf_sdm = gather(wf_sdm
|
||||
, key = param_type
|
||||
, value = param_value
|
||||
, all_of(sdm_dn):tail(static_cols_end,1)
|
||||
, factor_key = TRUE)
|
||||
|
||||
if (nrow(lf_sdm) == expected_rows_lf){
|
||||
cat("\nPASS: long format data created for", sdm_dn)
|
||||
}else{
|
||||
cat("\nFAIL: long format data could not be created for duet")
|
||||
quit()
|
||||
}
|
||||
|
||||
############################################################################
|
||||
#==============
|
||||
# mCSM: LF
|
||||
#==============
|
||||
cols_to_select_mcsm = c(static_cols_start, c("ddg_mcsm_outcome", mcsm_dn), static_cols_end)
|
||||
wf_mcsm = comb_df_sl[, cols_to_select_mcsm]
|
||||
|
||||
pivot_cols_mcsm = cols_to_select_mcsm[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm
|
||||
|
||||
expected_rows_lf = nrow(wf_mcsm) * (length(wf_mcsm) - length(pivot_cols_mcsm))
|
||||
expected_rows_lf
|
||||
|
||||
# LF data: mcsm
|
||||
lf_mcsm = gather(wf_mcsm
|
||||
, key = param_type
|
||||
, value = param_value
|
||||
, all_of(mcsm_dn):tail(static_cols_end,1)
|
||||
, factor_key = TRUE)
|
||||
|
||||
if (nrow(lf_mcsm) == expected_rows_lf){
|
||||
cat("\nPASS: long format data created for", mcsm_dn)
|
||||
}else{
|
||||
cat("\nFAIL: long format data could not be created for duet")
|
||||
quit()
|
||||
}
|
||||
|
||||
#==========================
|
||||
# Duet-d(from Dynamut): LF
|
||||
#===========================
|
||||
|
||||
#Not created, redundant and chaos!
|
||||
|
||||
############################################################################
|
||||
|
0
scripts/plotting/extreme_muts.R
Normal file → Executable file
0
scripts/plotting/extreme_muts.R
Normal file → Executable file
628
scripts/plotting/get_plotting_dfs.R
Normal file → Executable file
628
scripts/plotting/get_plotting_dfs.R
Normal file → Executable file
|
@ -1,32 +1,27 @@
|
|||
#!/usr/bin/env Rscript
|
||||
|
||||
#########################################################
|
||||
# TASK: Get formatted data for plots
|
||||
#=======================================================================
|
||||
#########################################################
|
||||
# working dir and loading libraries
|
||||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/scripts/plotting")
|
||||
getwd()
|
||||
|
||||
source("Header_TT.R")
|
||||
source("../functions/my_pairs_panel.R") # with lower panel turned off
|
||||
source("../functions/plotting_globals.R")
|
||||
source("../functions/plotting_data.R")
|
||||
source("../functions/combining_dfs_plotting.R")
|
||||
source("../functions/bp_subcolours.R")
|
||||
|
||||
#********************
|
||||
# cmd args passed
|
||||
# in from other scripts
|
||||
# to call this
|
||||
#********************
|
||||
#drug = 'streptomycin'
|
||||
#gene = 'gid'
|
||||
|
||||
#====================
|
||||
# variables for lig
|
||||
#====================
|
||||
|
||||
LigDist_colname = "ligand_distance"
|
||||
LigDist_cutoff = 10
|
||||
#LigDist_colname = "ligand_distance"
|
||||
#LigDist_cutoff = 10
|
||||
|
||||
#===========
|
||||
# input
|
||||
|
@ -41,8 +36,8 @@ import_dirs(drug, gene)
|
|||
#---------------------------
|
||||
if (!exists("infile_params") && exists("gene")){
|
||||
#if (!is.character(infile_params) && exists("gene")){ # when running as cmd
|
||||
#in_filename_params = paste0(tolower(gene), "_all_params.csv") #for pncA
|
||||
in_filename_params = paste0(tolower(gene), "_comb_afor.csv") # part combined for gid
|
||||
in_filename_params = paste0(tolower(gene), "_all_params.csv") #for pncA (and for gid finally) 10/09/21
|
||||
#in_filename_params = paste0(tolower(gene), "_comb_afor.csv") # part combined for gid
|
||||
infile_params = paste0(outdir, "/", in_filename_params)
|
||||
cat("\nInput file for mcsm comb data not specified, assuming filename: ", infile_params, "\n")
|
||||
}
|
||||
|
@ -54,10 +49,15 @@ pd_df = plotting_data(mcsm_df
|
|||
, lig_dist_colname = LigDist_colname
|
||||
, lig_dist_cutoff = LigDist_cutoff)
|
||||
|
||||
my_df = pd_df[[1]]
|
||||
my_df_u = pd_df[[2]] # this forms one of the input for combining_dfs_plotting()
|
||||
my_df_u_lig = pd_df[[3]]
|
||||
dup_muts = pd_df[[4]]
|
||||
my_df = pd_df[[1]]
|
||||
my_df_u = pd_df[[2]] # this forms one of the input for combining_dfs_plotting()
|
||||
|
||||
max_ang <- round(max(my_df_u[LigDist_colname]))
|
||||
min_ang <- round(min(my_df_u[LigDist_colname]))
|
||||
|
||||
cat("\nLigand distance cut off, colname:", LigDist_colname
|
||||
, "\nThe max distance", gene, "structure df" , ":", max_ang, "\u212b"
|
||||
, "\nThe min distance", gene, "structure df" , ":", min_ang, "\u212b")
|
||||
|
||||
#--------------------------------
|
||||
# call: combining_dfs_plotting()
|
||||
|
@ -81,509 +81,149 @@ all_plot_dfs = combining_dfs_plotting(my_df_u
|
|||
, lig_dist_colname = LigDist_colname
|
||||
, lig_dist_cutoff = LigDist_cutoff)
|
||||
|
||||
merged_df2 = all_plot_dfs[[1]]
|
||||
merged_df3 = all_plot_dfs[[2]]
|
||||
merged_df2_comp = all_plot_dfs[[3]]
|
||||
merged_df3_comp = all_plot_dfs[[4]]
|
||||
merged_df2_lig = all_plot_dfs[[5]]
|
||||
merged_df3_lig = all_plot_dfs[[6]]
|
||||
merged_df2_comp_lig = all_plot_dfs[[7]]
|
||||
merged_df3_comp_lig = all_plot_dfs[[8]]
|
||||
merged_df2 = all_plot_dfs[[1]]
|
||||
merged_df3 = all_plot_dfs[[2]]
|
||||
merged_df2_comp = all_plot_dfs[[3]]
|
||||
merged_df3_comp = all_plot_dfs[[4]]
|
||||
#======================================================================
|
||||
#TODO: Think! MOVE TO COMBINE or singular file for deepddg
|
||||
|
||||
#============================
|
||||
# adding deepddg scaled values
|
||||
# scale data b/w -1 and 1
|
||||
#============================
|
||||
# n = which(colnames(merged_df3) == "deepddg"); n
|
||||
#
|
||||
# my_min = min(merged_df3[,n]); my_min
|
||||
# my_max = max(merged_df3[,n]); my_max
|
||||
#
|
||||
# merged_df3$deepddg_scaled = ifelse(merged_df3[,n] < 0
|
||||
# , merged_df3[,n]/abs(my_min)
|
||||
# , merged_df3[,n]/my_max)
|
||||
# # sanity check
|
||||
# my_min = min(merged_df3$deepddg_scaled); my_min
|
||||
# my_max = max(merged_df3$deepddg_scaled); my_max
|
||||
#
|
||||
# if (my_min == -1 && my_max == 1){
|
||||
# cat("\nPASS: DeepDDG successfully scaled b/w -1 and 1"
|
||||
# #, "\nProceeding with assigning deep outcome category")
|
||||
# , "\n")
|
||||
# }else{
|
||||
# cat("\nFAIL: could not scale DeepDDG ddg values"
|
||||
# , "Aborting!")
|
||||
# }
|
||||
#
|
||||
|
||||
####################################################################
|
||||
# Data for subcols barplot (~heatmpa)
|
||||
# Data for combining other dfs
|
||||
####################################################################
|
||||
# can include: mutation, or_kin, pwald, af_kin
|
||||
cols_to_select = c("mutationinformation", "drtype"
|
||||
, "wild_type"
|
||||
, "position"
|
||||
, "mutant_type"
|
||||
, "chain", "ligand_id", "ligand_distance"
|
||||
, "duet_stability_change", "duet_outcome", "duet_scaled"
|
||||
, "ligand_affinity_change", "ligand_outcome", "affinity_scaled"
|
||||
, "ddg_foldx", "foldx_scaled", "foldx_outcome"
|
||||
, "deepddg", "deepddg_outcome" # comment out as not available for pnca
|
||||
, "asa", "rsa", "rd_values", "kd_values"
|
||||
, "af", "or_mychisq", "pval_fisher"
|
||||
, "or_fisher", "or_logistic", "pval_logistic"
|
||||
, "wt_prop_water", "mut_prop_water", "wt_prop_polarity", "mut_prop_polarity"
|
||||
, "wt_calcprop", "mut_calcprop")
|
||||
|
||||
#=======================
|
||||
# Data for sub colours
|
||||
# barplot: PS
|
||||
#=======================
|
||||
#source("other_dfs_data.R")
|
||||
# Fixed this at source i.e python script
|
||||
# Moved: "other_dfs_data.R" to redundant/
|
||||
|
||||
cat("\nNo. of cols to select:", length(cols_to_select))
|
||||
####################################################################
|
||||
# Data for subcols barplot (~heatmap)
|
||||
####################################################################
|
||||
|
||||
subcols_df_ps = merged_df3[, cols_to_select]
|
||||
|
||||
cat("\nNo of unique positions for ps:"
|
||||
, length(unique(subcols_df_ps$position)))
|
||||
|
||||
# add count_pos col that counts the no. of nsSNPS at a position
|
||||
setDT(subcols_df_ps)[, pos_count := .N, by = .(position)]
|
||||
|
||||
# should be a factor
|
||||
if (is.factor(subcols_df_ps$duet_outcome)){
|
||||
cat("\nDuet_outcome is factor")
|
||||
table(subcols_df_ps$duet_outcome)
|
||||
}else{
|
||||
cat("\nConverting duet_outcome to factor")
|
||||
subcols_df_ps$duet_outcome = as.factor(subcols_df_ps$duet_outcome)
|
||||
table(subcols_df_ps$duet_outcome)
|
||||
}
|
||||
|
||||
# should be -1 and 1
|
||||
min(subcols_df_ps$duet_scaled)
|
||||
max(subcols_df_ps$duet_scaled)
|
||||
|
||||
tapply(subcols_df_ps$duet_scaled, subcols_df_ps$duet_outcome, min)
|
||||
tapply(subcols_df_ps$duet_scaled, subcols_df_ps$duet_outcome, max)
|
||||
|
||||
# check unique values in normalised data
|
||||
cat("\nNo. of unique values in duet scaled, no rounding:"
|
||||
, length(unique(subcols_df_ps$duet_scaled)))
|
||||
|
||||
# No rounding
|
||||
my_grp = subcols_df_ps$duet_scaled; length(my_grp)
|
||||
|
||||
# Add rounding is to be used
|
||||
n = 3
|
||||
subcols_df_ps$duet_scaledR = round(subcols_df_ps$duet_scaled, n)
|
||||
|
||||
cat("\nNo. of unique values in duet scaled", n, "places rounding:"
|
||||
, length(unique(subcols_df_ps$duet_scaledR)))
|
||||
|
||||
my_grp_r = subcols_df_ps$duet_scaledR # rounding
|
||||
|
||||
# Add grp cols
|
||||
subcols_df_ps$group <- paste0(subcols_df_ps$duet_outcome, "_", my_grp, sep = "")
|
||||
subcols_df_ps$groupR <- paste0(subcols_df_ps$duet_outcome, "_", my_grp_r, sep = "")
|
||||
|
||||
# Call the function to create the palette based on the group defined above
|
||||
subcols_ps <- ColourPalleteMulti(subcols_df_ps, "duet_outcome", "my_grp")
|
||||
subcolsR_ps <- ColourPalleteMulti(subcols_df_ps, "duet_outcome", "my_grp_r")
|
||||
|
||||
print(paste0("Colour palette generated for my_grp: ", length(subcols_ps), " colours"))
|
||||
print(paste0("Colour palette generated for my_grp_r: ", length(subcolsR_ps), " colours"))
|
||||
|
||||
#=======================
|
||||
# Data for sub colours
|
||||
# barplot: LIG
|
||||
#=======================
|
||||
cat("\nNo. of cols to select:", length(cols_to_select))
|
||||
|
||||
subcols_df_lig = merged_df3_lig[, cols_to_select]
|
||||
|
||||
cat("\nNo of unique positions for LIG:"
|
||||
, length(unique(subcols_df_lig$position)))
|
||||
|
||||
# should be a factor
|
||||
if (is.factor(subcols_df_lig$ligand_outcome)){
|
||||
cat("\nLigand_outcome is factor")
|
||||
table(subcols_df_lig$ligand_outcome)
|
||||
}else{
|
||||
cat("\nConverting ligand_outcome to factor")
|
||||
subcols_df_lig$ligand_outcome = as.factor(subcols_df_lig$ligand_outcome)
|
||||
table(subcols_df_lig$ligand_outcome)
|
||||
}
|
||||
|
||||
# should be -1 and 1
|
||||
min(subcols_df_lig$affinity_scaled)
|
||||
max(subcols_df_lig$affinity_scaled)
|
||||
|
||||
tapply(subcols_df_lig$affinity_scaled, subcols_df_lig$ligand_outcome, min)
|
||||
tapply(subcols_df_lig$affinity_scaled, subcols_df_lig$ligand_outcome, max)
|
||||
|
||||
# check unique values in normalised data
|
||||
cat("\nNo. of unique values in affinity scaled, no rounding:"
|
||||
, length(unique(subcols_df_lig$affinity_scaled)))
|
||||
|
||||
# No rounding
|
||||
my_grp_lig = subcols_df_lig$affinity_scaled; length(my_grp_lig)
|
||||
|
||||
# Add rounding is to be used
|
||||
n = 3
|
||||
subcols_df_lig$affinity_scaledR = round(subcols_df_lig$affinity_scaled, n)
|
||||
|
||||
cat("\nNo. of unique values in duet scaled", n, "places rounding:"
|
||||
, length(unique(subcols_df_lig$affinity_scaledR)))
|
||||
|
||||
my_grp_lig_r = subcols_df_lig$affinity_scaledR # rounding
|
||||
|
||||
# Add grp cols
|
||||
subcols_df_lig$group_lig <- paste0(subcols_df_lig$ligand_outcome, "_", my_grp_lig, sep = "")
|
||||
subcols_df_lig$group_ligR <- paste0(subcols_df_lig$ligand_outcome, "_", my_grp_lig_r, sep = "")
|
||||
|
||||
# Call the function to create the palette based on the group defined above
|
||||
subcols_lig <- ColourPalleteMulti(subcols_df_lig, "ligand_outcome", "my_grp_lig")
|
||||
subcolsR_lig <- ColourPalleteMulti(subcols_df_lig, "ligand_outcome", "my_grp_lig_r")
|
||||
|
||||
print(paste0("Colour palette generated for my_grp: ", length(subcols_lig), " colours"))
|
||||
print(paste0("Colour palette generated for my_grp_r: ", length(subcolsR_lig), " colours"))
|
||||
#source("coloured_bp_data.R")
|
||||
# Repurposed function so that params can be passed instead to generate
|
||||
# data required for plotting.
|
||||
# Moved "coloured_bp_data.R" to redundant/
|
||||
|
||||
####################################################################
|
||||
# Data for logoplots
|
||||
####################################################################
|
||||
#-------------------------
|
||||
# choose df for logoplot
|
||||
#-------------------------
|
||||
logo_data = merged_df3
|
||||
#logo_data = merged_df3_comp
|
||||
|
||||
# quick checks
|
||||
colnames(logo_data)
|
||||
str(logo_data)
|
||||
source("logo_data.R")
|
||||
|
||||
c1 = unique(logo_data$position)
|
||||
nrow(logo_data)
|
||||
cat("No. of rows in my_data:", nrow(logo_data)
|
||||
, "\nDistinct positions corresponding to snps:", length(c1)
|
||||
, "\n===========================================================")
|
||||
#=======================================================================
|
||||
#==================
|
||||
# logo data: OR
|
||||
#==================
|
||||
foo = logo_data[, c("position"
|
||||
, "mutant_type","duet_scaled", "or_mychisq"
|
||||
, "mut_prop_polarity", "mut_prop_water")]
|
||||
|
||||
logo_data$log10or = log10(logo_data$or_mychisq)
|
||||
logo_data_plot = logo_data[, c("position"
|
||||
, "mutant_type", "or_mychisq", "log10or")]
|
||||
|
||||
logo_data_plot_or = logo_data[, c("position", "mutant_type", "or_mychisq")]
|
||||
wide_df_or <- logo_data_plot_or %>% spread(position, or_mychisq, fill = 0.0)
|
||||
|
||||
wide_df_or = as.matrix(wide_df_or)
|
||||
rownames(wide_df_or) = wide_df_or[,1]
|
||||
dim(wide_df_or)
|
||||
wide_df_or = wide_df_or[,-1]
|
||||
str(wide_df_or)
|
||||
|
||||
position_or = as.numeric(colnames(wide_df_or))
|
||||
|
||||
#==================
|
||||
# logo data: logOR
|
||||
#==================
|
||||
logo_data_plot_logor = logo_data[, c("position", "mutant_type", "log10or")]
|
||||
wide_df_logor <- logo_data_plot_logor %>% spread(position, log10or, fill = 0.0)
|
||||
|
||||
wide_df_logor = as.matrix(wide_df_logor)
|
||||
|
||||
rownames(wide_df_logor) = wide_df_logor[,1]
|
||||
wide_df_logor = subset(wide_df_logor, select = -c(1) )
|
||||
colnames(wide_df_logor)
|
||||
wide_df_logor_m = data.matrix(wide_df_logor)
|
||||
|
||||
rownames(wide_df_logor_m)
|
||||
colnames(wide_df_logor_m)
|
||||
|
||||
position_logor = as.numeric(colnames(wide_df_logor_m))
|
||||
|
||||
#===============================
|
||||
# logo data: multiple nsSNPs (>1)
|
||||
#=================================
|
||||
#require(data.table)
|
||||
|
||||
# get freq count of positions so you can subset freq<1
|
||||
setDT(logo_data)[, mut_pos_occurrence := .N, by = .(position)]
|
||||
|
||||
table(logo_data$position)
|
||||
table(logo_data$mut_pos_occurrence)
|
||||
|
||||
max_mut = max(table(logo_data$position))
|
||||
|
||||
# extract freq_pos > 1
|
||||
my_data_snp = logo_data[logo_data$mut_pos_occurrence!=1,]
|
||||
u = unique(my_data_snp$position)
|
||||
max_mult_mut = max(table(my_data_snp$position))
|
||||
|
||||
if (nrow(my_data_snp) == nrow(logo_data) - table(logo_data$mut_pos_occurrence)[[1]] ){
|
||||
|
||||
cat("PASS: positions with multiple muts extracted"
|
||||
, "\nNo. of mutations:", nrow(my_data_snp)
|
||||
, "\nNo. of positions:", length(u)
|
||||
, "\nMax no. of muts at any position", max_mult_mut)
|
||||
}else{
|
||||
cat("FAIL: positions with multiple muts could NOT be extracted"
|
||||
, "\nExpected:",nrow(logo_data) - table(logo_data$mut_pos_occurrence)[[1]]
|
||||
, "\nGot:", nrow(my_data_snp) )
|
||||
}
|
||||
|
||||
cat("\nNo. of sites with only 1 mutations:", table(logo_data$mut_pos_occurrence)[[1]])
|
||||
|
||||
#--------------------------------------
|
||||
# matrix for_mychisq mutant type
|
||||
# frequency of mutant type by position
|
||||
#---------------------------------------
|
||||
table(my_data_snp$mutant_type, my_data_snp$position)
|
||||
tab_mt = table(my_data_snp$mutant_type, my_data_snp$position)
|
||||
class(tab_mt)
|
||||
|
||||
# unclass to convert to matrix
|
||||
tab_mt = unclass(tab_mt)
|
||||
tab_mt = as.matrix(tab_mt, rownames = T)
|
||||
|
||||
# should be TRUE
|
||||
is.matrix(tab_mt)
|
||||
|
||||
rownames(tab_mt) #aa
|
||||
colnames(tab_mt) #pos
|
||||
|
||||
#-------------------------------------
|
||||
# matrix for wild type
|
||||
# frequency of wild type by position
|
||||
#-------------------------------------
|
||||
tab_wt = table(my_data_snp$wild_type, my_data_snp$position); tab_wt
|
||||
tab_wt = unclass(tab_wt)
|
||||
|
||||
# remove wt duplicates
|
||||
wt = my_data_snp[, c("position", "wild_type")]
|
||||
wt = wt[!duplicated(wt),]
|
||||
|
||||
tab_wt = table(wt$wild_type, wt$position); tab_wt # should all be 1
|
||||
|
||||
rownames(tab_wt)
|
||||
rownames(tab_wt)
|
||||
|
||||
identical(colnames(tab_mt), colnames(tab_wt))
|
||||
identical(ncol(tab_mt), ncol(tab_wt))
|
||||
|
||||
#----------------------------------
|
||||
# logo data OR: multiple nsSNPs (>1)
|
||||
#----------------------------------
|
||||
logo_data_or_mult = my_data_snp[, c("position", "mutant_type", "or_mychisq")]
|
||||
#wide_df_or <- logo_data_or %>% spread(position, or_mychisq, fill = 0.0)
|
||||
wide_df_or_mult <- logo_data_or_mult %>% spread(position, or_mychisq, fill = NA)
|
||||
|
||||
wide_df_or_mult = as.matrix(wide_df_or_mult)
|
||||
rownames(wide_df_or_mult) = wide_df_or_mult[,1]
|
||||
wide_df_or_mult = wide_df_or_mult[,-1]
|
||||
str(wide_df_or_mult)
|
||||
|
||||
position_or_mult = as.numeric(colnames(wide_df_or_mult))
|
||||
s1 = c("\nSuccessfully sourced logo_data.R")
|
||||
cat(s1)
|
||||
|
||||
####################################################################
|
||||
# Data for Corrplots
|
||||
# Data for DM OM Plots: Long format dfs
|
||||
####################################################################
|
||||
cat("\n=========================================="
|
||||
, "\nCORR PLOTS data: PS"
|
||||
, "\n===========================================")
|
||||
|
||||
df_ps = merged_df2
|
||||
#source("other_plots_data.R")
|
||||
|
||||
#--------------------
|
||||
# adding log cols : NEW UNCOMMENT
|
||||
#--------------------
|
||||
#df_ps$log10_or_mychisq = log10(df_ps$or_mychisq)
|
||||
#df_ps$neglog_pval_fisher = -log10(df_ps$pval_fisher)
|
||||
source("dm_om_data.R")
|
||||
|
||||
##df_ps$log10_or_kin = log10(df_ps$or_kin)
|
||||
##df_ps$neglog_pwald_kin = -log10(df_ps$pwald_kin)
|
||||
s2 = c("\nSuccessfully sourced other_plots_data.R")
|
||||
cat(s2)
|
||||
|
||||
#df_ps$mutation_info_labels = ifelse(df_ps$mutation_info == dr_muts_col, 1, 0)
|
||||
####################################################################
|
||||
# Data for Lineage barplots: WF and LF dfs
|
||||
####################################################################
|
||||
|
||||
#----------------------------
|
||||
# columns for corr plots:PS
|
||||
#----------------------------
|
||||
# subset data to generate pairwise correlations
|
||||
cols_to_select = c("mutationinformation"
|
||||
, "duet_scaled"
|
||||
, "foldx_scaled"
|
||||
#, "mutation_info_labels"
|
||||
, "asa"
|
||||
, "rsa"
|
||||
, "rd_values"
|
||||
, "kd_values"
|
||||
, "log10_or_mychisq"
|
||||
, "neglog_pval_fisher"
|
||||
##, "or_kin"
|
||||
##, "neglog_pwald_kin"
|
||||
, "af"
|
||||
##, "af_kin"
|
||||
, "duet_outcome"
|
||||
, drug)
|
||||
source("lineage_data.R")
|
||||
|
||||
corr_data_ps = df_ps[cols_to_select]
|
||||
s3 = c("\nSuccessfully sourced lineage_data.R")
|
||||
cat(s3)
|
||||
|
||||
dim(corr_data_ps)
|
||||
####################################################################
|
||||
# Data for corr plots:
|
||||
####################################################################
|
||||
# make sure the above script works because merged_df2_combined is needed
|
||||
source("corr_data.R")
|
||||
|
||||
#--------------------------------------
|
||||
# assign nice colnames (for display)
|
||||
#--------------------------------------
|
||||
my_corr_colnames = c("Mutation"
|
||||
, "DUET"
|
||||
, "FoldX"
|
||||
#, "Mutation class"
|
||||
, "ASA"
|
||||
, "RSA"
|
||||
, "RD"
|
||||
, "KD"
|
||||
, "Log (OR)"
|
||||
, "-Log (P)"
|
||||
##, "Adjusted (OR)"
|
||||
##, "-Log (P wald)"
|
||||
, "MAF"
|
||||
##, "AF_kin"
|
||||
, "duet_outcome"
|
||||
, drug)
|
||||
|
||||
length(my_corr_colnames)
|
||||
|
||||
colnames(corr_data_ps)
|
||||
colnames(corr_data_ps) <- my_corr_colnames
|
||||
colnames(corr_data_ps)
|
||||
|
||||
start = 1
|
||||
end = which(colnames(corr_data_ps) == drug); end # should be the last column
|
||||
offset = 1
|
||||
|
||||
#===========================
|
||||
# Corr data for plots: PS
|
||||
# big_df ps: ~ merged_df2
|
||||
#===========================
|
||||
|
||||
#corr_ps_df2 = corr_data_ps[start:(end-offset)] # without drug
|
||||
corr_ps_df2 = corr_data_ps[start:end]
|
||||
head(corr_ps_df2)
|
||||
|
||||
#===========================
|
||||
# Corr data for plots: PS
|
||||
# short_df ps: ~merged_df3
|
||||
#===========================
|
||||
corr_ps_df3 = corr_ps_df2[!duplicated(corr_ps_df2$Mutation),]
|
||||
|
||||
na_or = sum(is.na(corr_ps_df3$`Log (OR)`))
|
||||
check1 = nrow(corr_ps_df3) - na_or
|
||||
|
||||
##na_adj_or = sum(is.na(corr_ps_df3$`adjusted (OR)`))
|
||||
##check2 = nrow(corr_ps_df3) - na_adj_or
|
||||
|
||||
if (nrow(corr_ps_df3) == nrow(merged_df3) && nrow(merged_df3_comp) == check1) {
|
||||
cat( "\nPASS: No. of rows for corr_ps_df3 match"
|
||||
, "\nPASS: No. of OR values checked: " , check1)
|
||||
} else {
|
||||
cat("\nFAIL: Numbers mismatch:"
|
||||
, "\nExpected nrows: ", nrow(merged_df3)
|
||||
, "\nGot: ", nrow(corr_ps_df3)
|
||||
, "\nExpected OR values: ", nrow(merged_df3_comp)
|
||||
, "\nGot: ", check1)
|
||||
}
|
||||
|
||||
#=================================
|
||||
# Data for Correlation plots: LIG
|
||||
#=================================
|
||||
cat("\n=========================================="
|
||||
, "\nCORR PLOTS data: LIG"
|
||||
, "\n===========================================")
|
||||
|
||||
df_lig = merged_df2_lig
|
||||
|
||||
table(df_lig$ligand_outcome)
|
||||
|
||||
#--------------------
|
||||
# adding log cols : NEW UNCOMMENT
|
||||
#--------------------
|
||||
#df_lig$log10_or_mychisq = log10(df_lig$or_mychisq)
|
||||
#df_lig$neglog_pval_fisher = -log10(df_lig$pval_fisher)
|
||||
|
||||
##df_lig$log10_or_kin = log10(df_lig$or_kin)
|
||||
##df_lig$neglog_pwald_kin = -log10(df_lig$pwald_kin)
|
||||
|
||||
#----------------------------
|
||||
# columns for corr plots:PS
|
||||
#----------------------------
|
||||
# subset data to generate pairwise correlations
|
||||
cols_to_select = c("mutationinformation"
|
||||
, "affinity_scaled"
|
||||
#, "mutation_info_labels"
|
||||
, "asa"
|
||||
, "rsa"
|
||||
, "rd_values"
|
||||
, "kd_values"
|
||||
, "log10_or_mychisq"
|
||||
, "neglog_pval_fisher"
|
||||
##, "or_kin"
|
||||
##, "neglog_pwald_kin"
|
||||
, "af"
|
||||
##, "af_kin"
|
||||
, "ligand_outcome"
|
||||
, drug)
|
||||
|
||||
corr_data_lig = df_lig[, cols_to_select]
|
||||
|
||||
dim(corr_data_lig)
|
||||
|
||||
#--------------------------------------
|
||||
# assign nice colnames (for display)
|
||||
#--------------------------------------
|
||||
my_corr_colnames = c("Mutation"
|
||||
, "Ligand Affinity"
|
||||
#, "Mutation class"
|
||||
, "ASA"
|
||||
, "RSA"
|
||||
, "RD"
|
||||
, "KD"
|
||||
, "Log (OR)"
|
||||
, "-Log (P)"
|
||||
##, "Adjusted (OR)"
|
||||
##, "-Log (P wald)"
|
||||
, "MAF"
|
||||
##, "MAF_kin"
|
||||
, "ligand_outcome"
|
||||
, drug)
|
||||
|
||||
length(my_corr_colnames)
|
||||
|
||||
colnames(corr_data_lig)
|
||||
colnames(corr_data_lig) <- my_corr_colnames
|
||||
colnames(corr_data_lig)
|
||||
|
||||
start = 1
|
||||
end = which(colnames(corr_data_lig) == drug); end # should be the last column
|
||||
offset = 1
|
||||
|
||||
#=============================
|
||||
# Corr data for plots: LIG
|
||||
# big_df lig: ~ merged_df2_lig
|
||||
#==============================
|
||||
#corr_lig_df2 = corr_data_lig[start:(end-offset)] # without drug
|
||||
corr_lig_df2 = corr_data_lig[start:end]
|
||||
head(corr_lig_df2)
|
||||
|
||||
#=============================
|
||||
# Corr data for plots: LIG
|
||||
# short_df lig: ~ merged_df3_lig
|
||||
#==============================
|
||||
corr_lig_df3 = corr_lig_df2[!duplicated(corr_lig_df2$Mutation),]
|
||||
|
||||
na_or_lig = sum(is.na(corr_lig_df3$`Log (OR)`))
|
||||
check1_lig = nrow(corr_lig_df3) - na_or_lig
|
||||
|
||||
if (nrow(corr_lig_df3) == nrow(merged_df3_lig) && nrow(merged_df3_comp_lig) == check1_lig) {
|
||||
cat( "\nPASS: No. of rows for corr_lig_df3 match"
|
||||
, "\nPASS: No. of OR values checked: " , check1_lig)
|
||||
} else {
|
||||
cat("\nFAIL: Numbers mismatch:"
|
||||
, "\nExpected nrows: ", nrow(merged_df3_lig)
|
||||
, "\nGot: ", nrow(corr_ps_df3_lig)
|
||||
, "\nExpected OR values: ", nrow(merged_df3_comp_lig)
|
||||
, "\nGot: ", check1_lig)
|
||||
}
|
||||
|
||||
# remove unnecessary columns
|
||||
identical(corr_data_lig, corr_lig_df2)
|
||||
identical(corr_data_ps, corr_ps_df2)
|
||||
|
||||
#rm(df_ps, df_lig, corr_data_ps, corr_data_lig)
|
||||
s4 = c("\nSuccessfully sourced corr_data.R")
|
||||
cat(s4)
|
||||
|
||||
########################################################################
|
||||
# End of script
|
||||
########################################################################
|
||||
rm(foo)
|
||||
if ( all( length(s1), length(s2), length(s3), length(s4) ) >0 ){
|
||||
cat(
|
||||
"\n##################################################"
|
||||
, "\nSuccessful: get_plotting_dfs.R worked!"
|
||||
, "\n###################################################\n")
|
||||
} else {
|
||||
cat(
|
||||
"\n#################################################"
|
||||
, "\nFAIL: get_plotting_dfs.R didn't complete fully!Please check"
|
||||
, "\n###################################################\n" )
|
||||
}
|
||||
|
||||
########################################################################
|
||||
# clear excess variables
|
||||
rm(c1, c2, c3, c4, check1
|
||||
, curr_count, curr_total
|
||||
, cols_check
|
||||
, cols_to_select
|
||||
, cols_to_select_deepddg
|
||||
, cols_to_select_duet
|
||||
, cols_to_select_dynamut
|
||||
, cols_to_select_dynamut2
|
||||
, cols_to_select_encomddg
|
||||
, cols_to_select_encomdds
|
||||
, cols_to_select_mcsm
|
||||
, cols_to_select_mcsm_na
|
||||
, cols_to_select_sdm
|
||||
, infile_metadata
|
||||
, infile_params
|
||||
#, infilename_dynamut
|
||||
#, infilename_dynamut2
|
||||
#, infilename_mcsm_f_snps
|
||||
#, infilename_mcsm_na
|
||||
)
|
||||
|
||||
rm(pivot_cols
|
||||
, pivot_cols_deepddg
|
||||
, pivot_cols_duet
|
||||
, pivot_cols_dynamut
|
||||
, pivot_cols_dynamut2
|
||||
, pivot_cols_encomddg
|
||||
, pivot_cols_encomdds
|
||||
, pivot_cols_foldx
|
||||
, pivot_cols_mcsm
|
||||
, pivot_cols_mcsm_na
|
||||
, pivot_cols_n
|
||||
, pivot_cols_sdm)
|
||||
|
||||
rm(expected_cols
|
||||
, expected_ncols
|
||||
, expected_rows
|
||||
, expected_rows_lf
|
||||
, fact_cols)
|
||||
|
||||
|
||||
cat("\n===================================================\n"
|
||||
, "\nSuccessful: get_plotting_dfs.R worked!"
|
||||
, "\n====================================================")
|
589
scripts/plotting/get_plotting_dfs_with_lig.R
Executable file
589
scripts/plotting/get_plotting_dfs_with_lig.R
Executable file
|
@ -0,0 +1,589 @@
|
|||
#!/usr/bin/env Rscript
|
||||
#########################################################
|
||||
# TASK: Get formatted data for plots
|
||||
#=======================================================================
|
||||
# working dir and loading libraries
|
||||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/scripts/plotting")
|
||||
getwd()
|
||||
|
||||
source("Header_TT.R")
|
||||
source("../functions/my_pairs_panel.R") # with lower panel turned off
|
||||
source("../functions/plotting_globals.R")
|
||||
source("../functions/plotting_data.R")
|
||||
source("../functions/combining_dfs_plotting.R")
|
||||
source("../functions/bp_subcolours.R")
|
||||
|
||||
#********************
|
||||
# cmd args passed
|
||||
# in from other scripts
|
||||
# to call this
|
||||
#********************
|
||||
#drug = 'streptomycin'
|
||||
#gene = 'gid'
|
||||
#====================
|
||||
# variables for lig
|
||||
#====================
|
||||
|
||||
LigDist_colname = "ligand_distance"
|
||||
LigDist_cutoff = 10
|
||||
|
||||
#===========
|
||||
# input
|
||||
#===========
|
||||
#---------------------
|
||||
# call: import_dirs()
|
||||
#---------------------
|
||||
import_dirs(drug, gene)
|
||||
|
||||
#---------------------------
|
||||
# call: plotting_data()
|
||||
#---------------------------
|
||||
if (!exists("infile_params") && exists("gene")){
|
||||
#if (!is.character(infile_params) && exists("gene")){ # when running as cmd
|
||||
#in_filename_params = paste0(tolower(gene), "_all_params.csv") #for pncA
|
||||
in_filename_params = paste0(tolower(gene), "_comb_afor.csv") # part combined for gid
|
||||
infile_params = paste0(outdir, "/", in_filename_params)
|
||||
cat("\nInput file for mcsm comb data not specified, assuming filename: ", infile_params, "\n")
|
||||
}
|
||||
|
||||
# Input 1: read <gene>_comb_afor.csv
|
||||
cat("\nReading mcsm combined data file: ", infile_params)
|
||||
mcsm_df = read.csv(infile_params, header = T)
|
||||
pd_df = plotting_data(mcsm_df
|
||||
, lig_dist_colname = LigDist_colname
|
||||
, lig_dist_cutoff = LigDist_cutoff)
|
||||
|
||||
my_df = pd_df[[1]]
|
||||
my_df_u = pd_df[[2]] # this forms one of the input for combining_dfs_plotting()
|
||||
my_df_u_lig = pd_df[[3]]
|
||||
dup_muts = pd_df[[4]]
|
||||
|
||||
#--------------------------------
|
||||
# call: combining_dfs_plotting()
|
||||
#--------------------------------
|
||||
if (!exists("infile_metadata") && exists("gene")){
|
||||
#if (!is.character(infile_metadata) && exists("gene")){ # when running as cmd
|
||||
in_filename_metadata = paste0(tolower(gene), "_metadata.csv") # part combined for gid
|
||||
infile_metadata = paste0(outdir, "/", in_filename_metadata)
|
||||
cat("\nInput file for gene metadata not specified, assuming filename: ", infile_metadata, "\n")
|
||||
}
|
||||
|
||||
# Input 2: read <gene>_meta data.csv
|
||||
cat("\nReading meta data file: ", infile_metadata)
|
||||
|
||||
gene_metadata <- read.csv(infile_metadata
|
||||
, stringsAsFactors = F
|
||||
, header = T)
|
||||
|
||||
all_plot_dfs = combining_dfs_plotting(my_df_u
|
||||
, gene_metadata
|
||||
, lig_dist_colname = LigDist_colname
|
||||
, lig_dist_cutoff = LigDist_cutoff)
|
||||
|
||||
merged_df2 = all_plot_dfs[[1]]
|
||||
merged_df3 = all_plot_dfs[[2]]
|
||||
merged_df2_comp = all_plot_dfs[[3]]
|
||||
merged_df3_comp = all_plot_dfs[[4]]
|
||||
merged_df2_lig = all_plot_dfs[[5]]
|
||||
merged_df3_lig = all_plot_dfs[[6]]
|
||||
merged_df2_comp_lig = all_plot_dfs[[7]]
|
||||
merged_df3_comp_lig = all_plot_dfs[[8]]
|
||||
|
||||
####################################################################
|
||||
# Data for subcols barplot (~heatmap)
|
||||
####################################################################
|
||||
# can include: mutation, or_kin, pwald, af_kin
|
||||
cols_to_select = c("mutationinformation", "drtype"
|
||||
, "wild_type"
|
||||
, "position"
|
||||
, "mutant_type"
|
||||
, "chain", "ligand_id", "ligand_distance"
|
||||
, "duet_stability_change", "duet_outcome", "duet_scaled"
|
||||
, "ligand_affinity_change", "ligand_outcome", "affinity_scaled"
|
||||
, "ddg_foldx", "foldx_scaled", "foldx_outcome"
|
||||
, "deepddg", "deepddg_outcome" # comment out as not available for pnca
|
||||
, "asa", "rsa", "rd_values", "kd_values"
|
||||
, "af", "or_mychisq", "pval_fisher"
|
||||
, "or_fisher", "or_logistic", "pval_logistic"
|
||||
, "wt_prop_water", "mut_prop_water", "wt_prop_polarity", "mut_prop_polarity"
|
||||
, "wt_calcprop", "mut_calcprop")
|
||||
|
||||
#=======================
|
||||
# Data for sub colours
|
||||
# barplot: PS
|
||||
#=======================
|
||||
|
||||
cat("\nNo. of cols to select:", length(cols_to_select))
|
||||
|
||||
subcols_df_ps = merged_df3[, cols_to_select]
|
||||
|
||||
cat("\nNo of unique positions for ps:"
|
||||
, length(unique(subcols_df_ps$position)))
|
||||
|
||||
# add count_pos col that counts the no. of nsSNPS at a position
|
||||
setDT(subcols_df_ps)[, pos_count := .N, by = .(position)]
|
||||
|
||||
# should be a factor
|
||||
if (is.factor(subcols_df_ps$duet_outcome)){
|
||||
cat("\nDuet_outcome is factor")
|
||||
table(subcols_df_ps$duet_outcome)
|
||||
}else{
|
||||
cat("\nConverting duet_outcome to factor")
|
||||
subcols_df_ps$duet_outcome = as.factor(subcols_df_ps$duet_outcome)
|
||||
table(subcols_df_ps$duet_outcome)
|
||||
}
|
||||
|
||||
# should be -1 and 1
|
||||
min(subcols_df_ps$duet_scaled)
|
||||
max(subcols_df_ps$duet_scaled)
|
||||
|
||||
tapply(subcols_df_ps$duet_scaled, subcols_df_ps$duet_outcome, min)
|
||||
tapply(subcols_df_ps$duet_scaled, subcols_df_ps$duet_outcome, max)
|
||||
|
||||
# check unique values in normalised data
|
||||
cat("\nNo. of unique values in duet scaled, no rounding:"
|
||||
, length(unique(subcols_df_ps$duet_scaled)))
|
||||
|
||||
# No rounding
|
||||
my_grp = subcols_df_ps$duet_scaled; length(my_grp)
|
||||
|
||||
# Add rounding is to be used
|
||||
n = 3
|
||||
subcols_df_ps$duet_scaledR = round(subcols_df_ps$duet_scaled, n)
|
||||
|
||||
cat("\nNo. of unique values in duet scaled", n, "places rounding:"
|
||||
, length(unique(subcols_df_ps$duet_scaledR)))
|
||||
|
||||
my_grp_r = subcols_df_ps$duet_scaledR # rounding
|
||||
|
||||
# Add grp cols
|
||||
subcols_df_ps$group <- paste0(subcols_df_ps$duet_outcome, "_", my_grp, sep = "")
|
||||
subcols_df_ps$groupR <- paste0(subcols_df_ps$duet_outcome, "_", my_grp_r, sep = "")
|
||||
|
||||
# Call the function to create the palette based on the group defined above
|
||||
subcols_ps <- ColourPalleteMulti(subcols_df_ps, "duet_outcome", "my_grp")
|
||||
subcolsR_ps <- ColourPalleteMulti(subcols_df_ps, "duet_outcome", "my_grp_r")
|
||||
|
||||
print(paste0("Colour palette generated for my_grp: ", length(subcols_ps), " colours"))
|
||||
print(paste0("Colour palette generated for my_grp_r: ", length(subcolsR_ps), " colours"))
|
||||
|
||||
#=======================
|
||||
# Data for sub colours
|
||||
# barplot: LIG
|
||||
#=======================
|
||||
cat("\nNo. of cols to select:", length(cols_to_select))
|
||||
|
||||
subcols_df_lig = merged_df3_lig[, cols_to_select]
|
||||
|
||||
cat("\nNo of unique positions for LIG:"
|
||||
, length(unique(subcols_df_lig$position)))
|
||||
|
||||
# should be a factor
|
||||
if (is.factor(subcols_df_lig$ligand_outcome)){
|
||||
cat("\nLigand_outcome is factor")
|
||||
table(subcols_df_lig$ligand_outcome)
|
||||
}else{
|
||||
cat("\nConverting ligand_outcome to factor")
|
||||
subcols_df_lig$ligand_outcome = as.factor(subcols_df_lig$ligand_outcome)
|
||||
table(subcols_df_lig$ligand_outcome)
|
||||
}
|
||||
|
||||
# should be -1 and 1
|
||||
min(subcols_df_lig$affinity_scaled)
|
||||
max(subcols_df_lig$affinity_scaled)
|
||||
|
||||
tapply(subcols_df_lig$affinity_scaled, subcols_df_lig$ligand_outcome, min)
|
||||
tapply(subcols_df_lig$affinity_scaled, subcols_df_lig$ligand_outcome, max)
|
||||
|
||||
# check unique values in normalised data
|
||||
cat("\nNo. of unique values in affinity scaled, no rounding:"
|
||||
, length(unique(subcols_df_lig$affinity_scaled)))
|
||||
|
||||
# No rounding
|
||||
my_grp_lig = subcols_df_lig$affinity_scaled; length(my_grp_lig)
|
||||
|
||||
# Add rounding is to be used
|
||||
n = 3
|
||||
subcols_df_lig$affinity_scaledR = round(subcols_df_lig$affinity_scaled, n)
|
||||
|
||||
cat("\nNo. of unique values in duet scaled", n, "places rounding:"
|
||||
, length(unique(subcols_df_lig$affinity_scaledR)))
|
||||
|
||||
my_grp_lig_r = subcols_df_lig$affinity_scaledR # rounding
|
||||
|
||||
# Add grp cols
|
||||
subcols_df_lig$group_lig <- paste0(subcols_df_lig$ligand_outcome, "_", my_grp_lig, sep = "")
|
||||
subcols_df_lig$group_ligR <- paste0(subcols_df_lig$ligand_outcome, "_", my_grp_lig_r, sep = "")
|
||||
|
||||
# Call the function to create the palette based on the group defined above
|
||||
subcols_lig <- ColourPalleteMulti(subcols_df_lig, "ligand_outcome", "my_grp_lig")
|
||||
subcolsR_lig <- ColourPalleteMulti(subcols_df_lig, "ligand_outcome", "my_grp_lig_r")
|
||||
|
||||
print(paste0("Colour palette generated for my_grp: ", length(subcols_lig), " colours"))
|
||||
print(paste0("Colour palette generated for my_grp_r: ", length(subcolsR_lig), " colours"))
|
||||
|
||||
####################################################################
|
||||
# Data for logoplots
|
||||
####################################################################
|
||||
#-------------------------
|
||||
# choose df for logoplot
|
||||
#-------------------------
|
||||
logo_data = merged_df3
|
||||
#logo_data = merged_df3_comp
|
||||
|
||||
# quick checks
|
||||
colnames(logo_data)
|
||||
str(logo_data)
|
||||
|
||||
c1 = unique(logo_data$position)
|
||||
nrow(logo_data)
|
||||
cat("No. of rows in my_data:", nrow(logo_data)
|
||||
, "\nDistinct positions corresponding to snps:", length(c1)
|
||||
, "\n===========================================================")
|
||||
#=======================================================================
|
||||
#==================
|
||||
# logo data: OR
|
||||
#==================
|
||||
foo = logo_data[, c("position"
|
||||
, "mutant_type","duet_scaled", "or_mychisq"
|
||||
, "mut_prop_polarity", "mut_prop_water")]
|
||||
|
||||
logo_data$log10or = log10(logo_data$or_mychisq)
|
||||
logo_data_plot = logo_data[, c("position"
|
||||
, "mutant_type", "or_mychisq", "log10or")]
|
||||
|
||||
logo_data_plot_or = logo_data[, c("position", "mutant_type", "or_mychisq")]
|
||||
wide_df_or <- logo_data_plot_or %>% spread(position, or_mychisq, fill = 0.0)
|
||||
|
||||
wide_df_or = as.matrix(wide_df_or)
|
||||
rownames(wide_df_or) = wide_df_or[,1]
|
||||
dim(wide_df_or)
|
||||
wide_df_or = wide_df_or[,-1]
|
||||
str(wide_df_or)
|
||||
|
||||
position_or = as.numeric(colnames(wide_df_or))
|
||||
|
||||
#==================
|
||||
# logo data: logOR
|
||||
#==================
|
||||
logo_data_plot_logor = logo_data[, c("position", "mutant_type", "log10or")]
|
||||
wide_df_logor <- logo_data_plot_logor %>% spread(position, log10or, fill = 0.0)
|
||||
|
||||
wide_df_logor = as.matrix(wide_df_logor)
|
||||
|
||||
rownames(wide_df_logor) = wide_df_logor[,1]
|
||||
wide_df_logor = subset(wide_df_logor, select = -c(1) )
|
||||
colnames(wide_df_logor)
|
||||
wide_df_logor_m = data.matrix(wide_df_logor)
|
||||
|
||||
rownames(wide_df_logor_m)
|
||||
colnames(wide_df_logor_m)
|
||||
|
||||
position_logor = as.numeric(colnames(wide_df_logor_m))
|
||||
|
||||
#===============================
|
||||
# logo data: multiple nsSNPs (>1)
|
||||
#=================================
|
||||
#require(data.table)
|
||||
|
||||
# get freq count of positions so you can subset freq<1
|
||||
setDT(logo_data)[, mut_pos_occurrence := .N, by = .(position)]
|
||||
|
||||
table(logo_data$position)
|
||||
table(logo_data$mut_pos_occurrence)
|
||||
|
||||
max_mut = max(table(logo_data$position))
|
||||
|
||||
# extract freq_pos > 1
|
||||
my_data_snp = logo_data[logo_data$mut_pos_occurrence!=1,]
|
||||
u = unique(my_data_snp$position)
|
||||
max_mult_mut = max(table(my_data_snp$position))
|
||||
|
||||
if (nrow(my_data_snp) == nrow(logo_data) - table(logo_data$mut_pos_occurrence)[[1]] ){
|
||||
|
||||
cat("PASS: positions with multiple muts extracted"
|
||||
, "\nNo. of mutations:", nrow(my_data_snp)
|
||||
, "\nNo. of positions:", length(u)
|
||||
, "\nMax no. of muts at any position", max_mult_mut)
|
||||
}else{
|
||||
cat("FAIL: positions with multiple muts could NOT be extracted"
|
||||
, "\nExpected:",nrow(logo_data) - table(logo_data$mut_pos_occurrence)[[1]]
|
||||
, "\nGot:", nrow(my_data_snp) )
|
||||
}
|
||||
|
||||
cat("\nNo. of sites with only 1 mutations:", table(logo_data$mut_pos_occurrence)[[1]])
|
||||
|
||||
#--------------------------------------
|
||||
# matrix for_mychisq mutant type
|
||||
# frequency of mutant type by position
|
||||
#---------------------------------------
|
||||
table(my_data_snp$mutant_type, my_data_snp$position)
|
||||
tab_mt = table(my_data_snp$mutant_type, my_data_snp$position)
|
||||
class(tab_mt)
|
||||
|
||||
# unclass to convert to matrix
|
||||
tab_mt = unclass(tab_mt)
|
||||
tab_mt = as.matrix(tab_mt, rownames = T)
|
||||
|
||||
# should be TRUE
|
||||
is.matrix(tab_mt)
|
||||
|
||||
rownames(tab_mt) #aa
|
||||
colnames(tab_mt) #pos
|
||||
|
||||
#-------------------------------------
|
||||
# matrix for wild type
|
||||
# frequency of wild type by position
|
||||
#-------------------------------------
|
||||
tab_wt = table(my_data_snp$wild_type, my_data_snp$position); tab_wt
|
||||
tab_wt = unclass(tab_wt)
|
||||
|
||||
# remove wt duplicates
|
||||
wt = my_data_snp[, c("position", "wild_type")]
|
||||
wt = wt[!duplicated(wt),]
|
||||
|
||||
tab_wt = table(wt$wild_type, wt$position); tab_wt # should all be 1
|
||||
|
||||
rownames(tab_wt)
|
||||
rownames(tab_wt)
|
||||
|
||||
identical(colnames(tab_mt), colnames(tab_wt))
|
||||
identical(ncol(tab_mt), ncol(tab_wt))
|
||||
|
||||
#----------------------------------
|
||||
# logo data OR: multiple nsSNPs (>1)
|
||||
#----------------------------------
|
||||
logo_data_or_mult = my_data_snp[, c("position", "mutant_type", "or_mychisq")]
|
||||
#wide_df_or <- logo_data_or %>% spread(position, or_mychisq, fill = 0.0)
|
||||
wide_df_or_mult <- logo_data_or_mult %>% spread(position, or_mychisq, fill = NA)
|
||||
|
||||
wide_df_or_mult = as.matrix(wide_df_or_mult)
|
||||
rownames(wide_df_or_mult) = wide_df_or_mult[,1]
|
||||
wide_df_or_mult = wide_df_or_mult[,-1]
|
||||
str(wide_df_or_mult)
|
||||
|
||||
position_or_mult = as.numeric(colnames(wide_df_or_mult))
|
||||
|
||||
####################################################################
|
||||
# Data for Corrplots
|
||||
####################################################################
|
||||
cat("\n=========================================="
|
||||
, "\nCORR PLOTS data: PS"
|
||||
, "\n===========================================")
|
||||
|
||||
df_ps = merged_df2
|
||||
|
||||
#--------------------
|
||||
# adding log cols : NEW UNCOMMENT
|
||||
#--------------------
|
||||
#df_ps$log10_or_mychisq = log10(df_ps$or_mychisq)
|
||||
#df_ps$neglog_pval_fisher = -log10(df_ps$pval_fisher)
|
||||
|
||||
##df_ps$log10_or_kin = log10(df_ps$or_kin)
|
||||
##df_ps$neglog_pwald_kin = -log10(df_ps$pwald_kin)
|
||||
|
||||
#df_ps$mutation_info_labels = ifelse(df_ps$mutation_info == dr_muts_col, 1, 0)
|
||||
|
||||
#----------------------------
|
||||
# columns for corr plots:PS
|
||||
#----------------------------
|
||||
# subset data to generate pairwise correlations
|
||||
cols_to_select = c("mutationinformation"
|
||||
, "duet_scaled"
|
||||
, "foldx_scaled"
|
||||
#, "mutation_info_labels"
|
||||
, "asa"
|
||||
, "rsa"
|
||||
, "rd_values"
|
||||
, "kd_values"
|
||||
, "log10_or_mychisq"
|
||||
, "neglog_pval_fisher"
|
||||
##, "or_kin"
|
||||
##, "neglog_pwald_kin"
|
||||
, "af"
|
||||
##, "af_kin"
|
||||
, "duet_outcome"
|
||||
, drug)
|
||||
|
||||
corr_data_ps = df_ps[cols_to_select]
|
||||
|
||||
dim(corr_data_ps)
|
||||
|
||||
#--------------------------------------
|
||||
# assign nice colnames (for display)
|
||||
#--------------------------------------
|
||||
my_corr_colnames = c("Mutation"
|
||||
, "DUET"
|
||||
, "FoldX"
|
||||
#, "Mutation class"
|
||||
, "ASA"
|
||||
, "RSA"
|
||||
, "RD"
|
||||
, "KD"
|
||||
, "Log (OR)"
|
||||
, "-Log (P)"
|
||||
##, "Adjusted (OR)"
|
||||
##, "-Log (P wald)"
|
||||
, "MAF"
|
||||
##, "AF_kin"
|
||||
, "duet_outcome"
|
||||
, drug)
|
||||
|
||||
length(my_corr_colnames)
|
||||
|
||||
colnames(corr_data_ps)
|
||||
colnames(corr_data_ps) <- my_corr_colnames
|
||||
colnames(corr_data_ps)
|
||||
|
||||
start = 1
|
||||
end = which(colnames(corr_data_ps) == drug); end # should be the last column
|
||||
offset = 1
|
||||
|
||||
#===========================
|
||||
# Corr data for plots: PS
|
||||
# big_df ps: ~ merged_df2
|
||||
#===========================
|
||||
|
||||
#corr_ps_df2 = corr_data_ps[start:(end-offset)] # without drug
|
||||
corr_ps_df2 = corr_data_ps[start:end]
|
||||
head(corr_ps_df2)
|
||||
|
||||
#===========================
|
||||
# Corr data for plots: PS
|
||||
# short_df ps: ~merged_df3
|
||||
#===========================
|
||||
corr_ps_df3 = corr_ps_df2[!duplicated(corr_ps_df2$Mutation),]
|
||||
|
||||
na_or = sum(is.na(corr_ps_df3$`Log (OR)`))
|
||||
check1 = nrow(corr_ps_df3) - na_or
|
||||
|
||||
##na_adj_or = sum(is.na(corr_ps_df3$`adjusted (OR)`))
|
||||
##check2 = nrow(corr_ps_df3) - na_adj_or
|
||||
|
||||
if (nrow(corr_ps_df3) == nrow(merged_df3) && nrow(merged_df3_comp) == check1) {
|
||||
cat( "\nPASS: No. of rows for corr_ps_df3 match"
|
||||
, "\nPASS: No. of OR values checked: " , check1)
|
||||
} else {
|
||||
cat("\nFAIL: Numbers mismatch:"
|
||||
, "\nExpected nrows: ", nrow(merged_df3)
|
||||
, "\nGot: ", nrow(corr_ps_df3)
|
||||
, "\nExpected OR values: ", nrow(merged_df3_comp)
|
||||
, "\nGot: ", check1)
|
||||
}
|
||||
|
||||
#=================================
|
||||
# Data for Correlation plots: LIG
|
||||
#=================================
|
||||
cat("\n=========================================="
|
||||
, "\nCORR PLOTS data: LIG"
|
||||
, "\n===========================================")
|
||||
|
||||
df_lig = merged_df2_lig
|
||||
|
||||
table(df_lig$ligand_outcome)
|
||||
|
||||
#--------------------
|
||||
# adding log cols : NEW UNCOMMENT
|
||||
#--------------------
|
||||
#df_lig$log10_or_mychisq = log10(df_lig$or_mychisq)
|
||||
#df_lig$neglog_pval_fisher = -log10(df_lig$pval_fisher)
|
||||
|
||||
##df_lig$log10_or_kin = log10(df_lig$or_kin)
|
||||
##df_lig$neglog_pwald_kin = -log10(df_lig$pwald_kin)
|
||||
|
||||
#----------------------------
|
||||
# columns for corr plots:PS
|
||||
#----------------------------
|
||||
# subset data to generate pairwise correlations
|
||||
cols_to_select = c("mutationinformation"
|
||||
, "affinity_scaled"
|
||||
#, "mutation_info_labels"
|
||||
, "asa"
|
||||
, "rsa"
|
||||
, "rd_values"
|
||||
, "kd_values"
|
||||
, "log10_or_mychisq"
|
||||
, "neglog_pval_fisher"
|
||||
##, "or_kin"
|
||||
##, "neglog_pwald_kin"
|
||||
, "af"
|
||||
##, "af_kin"
|
||||
, "ligand_outcome"
|
||||
, drug)
|
||||
|
||||
corr_data_lig = df_lig[, cols_to_select]
|
||||
|
||||
dim(corr_data_lig)
|
||||
|
||||
#--------------------------------------
|
||||
# assign nice colnames (for display)
|
||||
#--------------------------------------
|
||||
my_corr_colnames = c("Mutation"
|
||||
, "Ligand Affinity"
|
||||
#, "Mutation class"
|
||||
, "ASA"
|
||||
, "RSA"
|
||||
, "RD"
|
||||
, "KD"
|
||||
, "Log (OR)"
|
||||
, "-Log (P)"
|
||||
##, "Adjusted (OR)"
|
||||
##, "-Log (P wald)"
|
||||
, "MAF"
|
||||
##, "MAF_kin"
|
||||
, "ligand_outcome"
|
||||
, drug)
|
||||
|
||||
length(my_corr_colnames)
|
||||
|
||||
colnames(corr_data_lig)
|
||||
colnames(corr_data_lig) <- my_corr_colnames
|
||||
colnames(corr_data_lig)
|
||||
|
||||
start = 1
|
||||
end = which(colnames(corr_data_lig) == drug); end # should be the last column
|
||||
offset = 1
|
||||
|
||||
#=============================
|
||||
# Corr data for plots: LIG
|
||||
# big_df lig: ~ merged_df2_lig
|
||||
#==============================
|
||||
#corr_lig_df2 = corr_data_lig[start:(end-offset)] # without drug
|
||||
corr_lig_df2 = corr_data_lig[start:end]
|
||||
head(corr_lig_df2)
|
||||
|
||||
#=============================
|
||||
# Corr data for plots: LIG
|
||||
# short_df lig: ~ merged_df3_lig
|
||||
#==============================
|
||||
corr_lig_df3 = corr_lig_df2[!duplicated(corr_lig_df2$Mutation),]
|
||||
|
||||
na_or_lig = sum(is.na(corr_lig_df3$`Log (OR)`))
|
||||
check1_lig = nrow(corr_lig_df3) - na_or_lig
|
||||
|
||||
if (nrow(corr_lig_df3) == nrow(merged_df3_lig) && nrow(merged_df3_comp_lig) == check1_lig) {
|
||||
cat( "\nPASS: No. of rows for corr_lig_df3 match"
|
||||
, "\nPASS: No. of OR values checked: " , check1_lig)
|
||||
} else {
|
||||
cat("\nFAIL: Numbers mismatch:"
|
||||
, "\nExpected nrows: ", nrow(merged_df3_lig)
|
||||
, "\nGot: ", nrow(corr_ps_df3_lig)
|
||||
, "\nExpected OR values: ", nrow(merged_df3_comp_lig)
|
||||
, "\nGot: ", check1_lig)
|
||||
}
|
||||
|
||||
# remove unnecessary columns
|
||||
identical(corr_data_lig, corr_lig_df2)
|
||||
identical(corr_data_ps, corr_ps_df2)
|
||||
|
||||
#rm(df_ps, df_lig, corr_data_ps, corr_data_lig)
|
||||
|
||||
########################################################################
|
||||
# End of script
|
||||
########################################################################
|
||||
rm(foo)
|
||||
|
||||
cat("\n===================================================\n"
|
||||
, "\nSuccessful: get_plotting_dfs.R worked!"
|
||||
, "\n====================================================")
|
0
scripts/plotting/ggcorr_all_PS_LIG.R
Normal file → Executable file
0
scripts/plotting/ggcorr_all_PS_LIG.R
Normal file → Executable file
0
scripts/plotting/hist_af_or_base.R
Normal file → Executable file
0
scripts/plotting/hist_af_or_base.R
Normal file → Executable file
0
scripts/plotting/hist_af_or_combined.R
Normal file → Executable file
0
scripts/plotting/hist_af_or_combined.R
Normal file → Executable file
0
scripts/plotting/legend_adjustment.R
Normal file → Executable file
0
scripts/plotting/legend_adjustment.R
Normal file → Executable file
127
scripts/plotting/lineage_basic_barplots_combined.R
Executable file
127
scripts/plotting/lineage_basic_barplots_combined.R
Executable file
|
@ -0,0 +1,127 @@
|
|||
#!/usr/bin/env Rscript
|
||||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/scripts/plotting/")
|
||||
getwd()
|
||||
#########################################################
|
||||
# TASK: Basic lineage barplots showing numbers
|
||||
|
||||
# Output: Basic barplot with lineage samples and mut count
|
||||
# + SNP diversity
|
||||
|
||||
##########################################################
|
||||
# Installing and loading required packages
|
||||
##########################################################
|
||||
source("Header_TT.R")
|
||||
|
||||
#===========
|
||||
# input
|
||||
#===========
|
||||
#drug = 'streptomycin'
|
||||
#gene = 'gid'
|
||||
|
||||
spec = matrix(c(
|
||||
"drug" , "d", 1, "character",
|
||||
"gene" , "g", 1, "character",
|
||||
"data_file1" , "fa", 2, "character",
|
||||
"data_file2" , "fb", 2, "character"
|
||||
), byrow = TRUE, ncol = 4)
|
||||
|
||||
opt = getopt(spec)
|
||||
|
||||
drug = opt$drug
|
||||
gene = opt$gene
|
||||
infile_params = opt$data_file1
|
||||
infile_metadata = opt$data_file2
|
||||
|
||||
if(is.null(drug)|is.null(gene)) {
|
||||
stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)")
|
||||
}
|
||||
|
||||
source ('get_plotting_dfs.R')
|
||||
|
||||
#=======
|
||||
# output
|
||||
#=======
|
||||
# plot 1
|
||||
basic_bp_lineage_cl = "basic_lineage_barplots_combined.svg"
|
||||
plot_basic_bp_lineage_cl = paste0(plotdir,"/", basic_bp_lineage_cl)
|
||||
plot_basic_bp_lineage_cl
|
||||
#################################################################
|
||||
#=============================
|
||||
# PLOT 1: Lineage count plot:
|
||||
# LF data
|
||||
#=============================
|
||||
#------------------------
|
||||
# Data: All lineages or
|
||||
# selected few
|
||||
#------------------------
|
||||
lin_lf_plot = lin_lf[lin_lf$sel_lineages%in%c("L1", "L2", "L3", "L4"),]
|
||||
str(lin_lf_plot)
|
||||
|
||||
# drop unused factor levels
|
||||
lin_lf_plot$sel_lineages = factor(lin_lf_plot$sel_lineages)
|
||||
levels(lin_lf_plot$sel_lineages)
|
||||
str(lin_lf_plot)
|
||||
|
||||
#------------------------
|
||||
# plot from my function:
|
||||
#------------------------
|
||||
lin_countP = lin_count_bp(lin_lf_plot
|
||||
, x_categ = "sel_lineages"
|
||||
, y_count = "p_count"
|
||||
, bar_fill_categ = "count_categ"
|
||||
, display_label_col = "p_count"
|
||||
, bar_stat_stype = "identity"
|
||||
, x_lab_angle = 90
|
||||
, my_xats = 20
|
||||
, bar_col_labels = c("Mutations", "Total Samples")
|
||||
, bar_col_values = c("grey50", "gray75")
|
||||
, y_scale_percent = F # T for diversity
|
||||
, y_log10 = F
|
||||
, y_label = "Count")
|
||||
lin_countP
|
||||
#================================
|
||||
# PLOT 2: Lineage Diversity plot
|
||||
# WF data
|
||||
#================================
|
||||
#------------------------
|
||||
# Data: All lineages or
|
||||
# selected few
|
||||
#------------------------
|
||||
lin_wf_plot = lin_wf[lin_wf$sel_lineages%in%c("L1", "L2", "L3", "L4"),]
|
||||
str(lin_wf_plot)
|
||||
|
||||
# drop unused factor levels
|
||||
lin_wf_plot$sel_lineages = factor(lin_wf_plot$sel_lineages)
|
||||
levels(lin_wf_plot$sel_lineages)
|
||||
str(lin_wf_plot)
|
||||
|
||||
#------------------------
|
||||
# plot from my function:
|
||||
#------------------------
|
||||
lin_diversityP = lin_count_bp(lin_wf_plot
|
||||
, x_categ = "sel_lineages"
|
||||
, y_count = "snp_diversity"
|
||||
, display_label_col = "snp_diversity_f"
|
||||
, bar_stat_stype = "identity"
|
||||
, x_lab_angle = 90
|
||||
, my_xats = 20
|
||||
, y_scale_percent = T
|
||||
, y_label = "SNP diversity")
|
||||
|
||||
lin_diversityP
|
||||
#########################################################################333
|
||||
#================================
|
||||
# Combine plots
|
||||
#================================
|
||||
|
||||
svg(plot_basic_bp_lineage_cl , width = 8, height = 15 )
|
||||
|
||||
lineage_bp_combined = cowplot::plot_grid(lin_countP, lin_diversityP
|
||||
#, labels = c("(a)", "(b)", "(c)", "(d)")
|
||||
, nrow = 2
|
||||
, labels = "AUTO"
|
||||
, label_size = 25)
|
||||
|
||||
lineage_bp_combined
|
||||
dev.off()
|
147
scripts/plotting/lineage_data.R
Executable file
147
scripts/plotting/lineage_data.R
Executable file
|
@ -0,0 +1,147 @@
|
|||
#!/usr/bin/env Rscript
|
||||
#########################################################
|
||||
# TASK: Script to format data for lineage barplots:
|
||||
# WF and LF data with lineage sample, and snp counts
|
||||
# sourced by get_plotting_dfs.R
|
||||
#########################################################
|
||||
|
||||
#=================================================
|
||||
# Get data with lineage count, and snp diversity
|
||||
#=================================================
|
||||
table(merged_df2$lineage)
|
||||
|
||||
if (table(merged_df2$lineage == "")[[2]]) {
|
||||
|
||||
cat("\nMissing samples with lineage classification:", table(merged_df2$lineage == "")[[2]])
|
||||
|
||||
}
|
||||
|
||||
table(merged_df2$lineage_labels)
|
||||
class(merged_df2$lineage_labels); nlevels(merged_df2$lineage_labels)
|
||||
|
||||
#==========================================
|
||||
# WF data: lineages with
|
||||
# snp count
|
||||
# total_samples
|
||||
# snp diversity (perc)
|
||||
#==========================================
|
||||
sel_lineages = levels(merged_df2$lineage_labels)
|
||||
|
||||
lin_wf = data.frame(sel_lineages) #4, 1
|
||||
total_snps_u = NULL
|
||||
total_samples = NULL
|
||||
|
||||
for (i in sel_lineages){
|
||||
#print(i)
|
||||
curr_total = length(unique(merged_df2$id)[merged_df2$lineage_labels==i])
|
||||
#print(curr_total)
|
||||
total_samples = c(total_samples, curr_total)
|
||||
print(total_samples)
|
||||
|
||||
foo = merged_df2[merged_df2$lineage_labels==i,]
|
||||
print(paste0(i, "=======\n"))
|
||||
print(length(unique(foo$mutationinformation)))
|
||||
curr_count = length(unique(foo$mutationinformation))
|
||||
|
||||
total_snps_u = c(total_snps_u, curr_count)
|
||||
}
|
||||
lin_wf
|
||||
|
||||
# Add these counts as columns to the df
|
||||
lin_wf$num_snps_u = total_snps_u
|
||||
lin_wf$total_samples = total_samples
|
||||
lin_wf
|
||||
|
||||
# Add SNP diversity
|
||||
lin_wf$snp_diversity = lin_wf$num_snps_u/lin_wf$total_samples
|
||||
lin_wf
|
||||
|
||||
#----------------------
|
||||
# Add some formatting
|
||||
#----------------------
|
||||
# SNP diversity
|
||||
lin_wf$snp_diversity_f = round( (lin_wf$snp_diversity * 100), digits = 0)
|
||||
lin_wf$snp_diversity_f = paste0(lin_wf$snp_diversity_f, "%")
|
||||
|
||||
lin_wf$sel_lineages
|
||||
|
||||
# Important: Check factors so that x-axis categ appear as you want
|
||||
lin_wf$sel_lineages = factor(lin_wf$sel_lineages, c("L1"
|
||||
, "L2"
|
||||
, "L3"
|
||||
, "L4"
|
||||
, "L5"
|
||||
, "L6"
|
||||
, "L7"
|
||||
, "LBOV"
|
||||
, "L1;L2"
|
||||
, "L1;L3"
|
||||
, "L1;L4"
|
||||
, "L2;L3"
|
||||
, "L2;L3;L4"
|
||||
, "L2;L4"
|
||||
, "L2;L6"
|
||||
, "L2;LBOV"
|
||||
, "L3;L4"
|
||||
, "L4;L6"
|
||||
, "L4;L7"
|
||||
, ""))
|
||||
|
||||
levels(lin_wf$sel_lineages)
|
||||
|
||||
#=================================
|
||||
# LF data: lineages with
|
||||
# snp count
|
||||
# total_samples
|
||||
# snp diversity (perc)
|
||||
#=================================
|
||||
names(lin_wf)
|
||||
tot_cols = ncol(lin_wf)
|
||||
pivot_cols = c("sel_lineages", "snp_diversity", "snp_diversity_f")
|
||||
pivot_cols_n = length(pivot_cols)
|
||||
|
||||
expected_rows = nrow(lin_wf) * ( length(lin_wf) - pivot_cols_n )
|
||||
|
||||
lin_lf <- gather(lin_wf
|
||||
, count_categ
|
||||
, p_count
|
||||
, num_snps_u:total_samples
|
||||
, factor_key = TRUE)
|
||||
lin_lf
|
||||
|
||||
# quick checks
|
||||
if ( nrow(lin_lf ) == expected_rows ){
|
||||
cat("\nPASS: Lineage LF data created"
|
||||
, "\nnrow: ", nrow(lin_lf)
|
||||
, "\nncol: ", ncol(lin_lf))
|
||||
} else {
|
||||
cat("\nFAIL: numbers mismatch"
|
||||
, "\nExpected nrow: ", expected_rows)
|
||||
}
|
||||
|
||||
# Important: Relevel factors so that x-axis categ appear as you want
|
||||
lin_lf$sel_lineages = factor(lin_lf$sel_lineages, c("L1"
|
||||
, "L2"
|
||||
, "L3"
|
||||
, "L4"
|
||||
, "L5"
|
||||
, "L6"
|
||||
, "L7"
|
||||
, "LBOV"
|
||||
, "L1;L2"
|
||||
, "L1;L3"
|
||||
, "L1;L4"
|
||||
, "L2;L3"
|
||||
, "L2;L3;L4"
|
||||
, "L2;L4"
|
||||
, "L2;L6"
|
||||
, "L2;LBOV"
|
||||
, "L3;L4"
|
||||
, "L4;L6"
|
||||
, "L4;L7"
|
||||
, ""))
|
||||
|
||||
levels(lin_lf$sel_lineages)
|
||||
|
||||
################################################################
|
||||
|
143
scripts/plotting/lineage_dist_plots.R
Normal file
143
scripts/plotting/lineage_dist_plots.R
Normal file
|
@ -0,0 +1,143 @@
|
|||
#!/usr/bin/env Rscript
|
||||
#########################################################
|
||||
# TASK: Lineage dist plots: ggridges
|
||||
|
||||
# Output: 1 or 2 SVGs for PS stability
|
||||
|
||||
##########################################################
|
||||
# Installing and loading required packages
|
||||
##########################################################
|
||||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/scripts/plotting/")
|
||||
getwd()
|
||||
|
||||
source("Header_TT.R") # also loads all my functions
|
||||
|
||||
#===========
|
||||
# input
|
||||
#===========
|
||||
drug = "streptomycin"
|
||||
gene = "gid"
|
||||
#source("get_plotting_dfs.R")
|
||||
|
||||
spec = matrix(c(
|
||||
"drug" , "d", 1, "character",
|
||||
"gene" , "g", 1, "character",
|
||||
"data_file1" , "fa", 2, "character",
|
||||
"data_file2" , "fb", 2, "character"
|
||||
), byrow = TRUE, ncol = 4)
|
||||
|
||||
opt = getopt(spec)
|
||||
|
||||
drug = opt$drug
|
||||
gene = opt$gene
|
||||
infile_params = opt$data_file1
|
||||
infile_metadata = opt$data_file2
|
||||
|
||||
if(is.null(drug)|is.null(gene)) {
|
||||
stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)")
|
||||
}
|
||||
|
||||
#=======
|
||||
# output
|
||||
#=======
|
||||
lineage_dist_dm_om_ps = "lineage_dist_dm_om_PS.svg"
|
||||
plot_lineage_dist_dm_om_ps = paste0(plotdir,"/", lineage_dist_dm_om_ps)
|
||||
#========================================================================
|
||||
|
||||
###########################
|
||||
# Data for plots
|
||||
# you need merged_df2_combined or merged_df2_combined_comp
|
||||
# since this is one-many relationship
|
||||
# i.e the same SNP can belong to multiple lineages
|
||||
# using the _comp dataset means
|
||||
# we lose some muts and at this level, we should use
|
||||
# as much info as available, hence use df with NA
|
||||
###########################
|
||||
|
||||
#===================
|
||||
# Data for plots
|
||||
#===================
|
||||
# quick checks
|
||||
table(merged_df2_combined$mutation_info_labels); levels(merged_df2_combined$lineage_labels)
|
||||
table(merged_df2_combined$lineage_labels); levels(merged_df2_combined$mutation_info_labels)
|
||||
|
||||
sel_lineages = c("L1", "L2", "L3", "L4")
|
||||
|
||||
lin_dist_plot = merged_df2_combined[merged_df2_combined$lineage_labels%in%sel_lineages,]
|
||||
table(lin_dist_plot$lineage_labels); nlevels(lin_dist_plot$lineage_labels)
|
||||
|
||||
# refactor
|
||||
lin_dist_plot$lineage_labels = factor(lin_dist_plot$lineage_labels)
|
||||
nlevels(lin_dist_plot$lineage_labels)
|
||||
|
||||
#-----------------------------------------------------------------------
|
||||
# IMPORTANT RESULTS to put inside table or text for interactive plots
|
||||
|
||||
sum(table(lin_dist_plot$lineage_labels)) #{RESULT: Total number of samples for lineage}
|
||||
|
||||
table(lin_dist_plot$lineage_labels)#{RESULT: No of samples within lineage}
|
||||
|
||||
length(unique(lin_dist_plot$mutationinformation))#{Result: No. of unique mutations selected lineages contribute to}
|
||||
length(lin_dist_plot$mutationinformation)
|
||||
|
||||
u2 = unique(merged_df2_combined$mutationinformation)
|
||||
u = unique(lin_dist_plot$mutationinformation)
|
||||
check = u2[!u2%in%u]; print(check) #{Muts not present within selected lineages}
|
||||
#-----------------------------------------------------------------------
|
||||
|
||||
my_x_and_t = c("duet_scaled", "mCSM-DUET")
|
||||
my_x_and_t = c("foldx_scaled", "FoldX")
|
||||
#my_x_and_t = c("deepddg_scaled", "DeepDDG")
|
||||
|
||||
my_x_and_t = c("ddg_dynamut2_scaled", "Dynamut2")
|
||||
my_x_and_t = c("ddg_dynamut_scaled", "Dynamut")
|
||||
|
||||
my_x_and_t = c("ddg_mcsm_scaled", "mCSM")
|
||||
my_x_and_t = c("ddg_sdm_scaled", "SDM")
|
||||
my_x_and_t = c("ddg_duet_scaled", "DUET-d")
|
||||
|
||||
my_x_and_t = c("ddg_encom_scaled", "EnCOM-Stability")
|
||||
my_x_and_t = c("dds_encom_scaled", "EnCOM-Flexibility")
|
||||
|
||||
my_x_and_t = c("mcsm_na_scaled", "mCSM-NA")
|
||||
|
||||
# TO DO
|
||||
my_x_and_t = c("affinity_scaled", "mCSM-Lig") #ligdist< 10
|
||||
|
||||
#=====================
|
||||
# Plot: without facet
|
||||
#=====================
|
||||
|
||||
linP_dm_om = lineage_distP(lin_dist_plot
|
||||
, x_axis = my_x_and_t[1]
|
||||
, x_lab = my_x_and_t[2]
|
||||
, y_axis = "lineage_labels"
|
||||
, leg_label = "Mutation Class"
|
||||
, with_facet = F)
|
||||
linP_dm_om
|
||||
|
||||
#=====================
|
||||
# Plot: with facet
|
||||
#=====================
|
||||
|
||||
linP_dm_om_facet = lineage_distP(lin_dist_plot
|
||||
, x_axis = my_x_and_t[1]
|
||||
, x_lab = my_x_and_t[2]
|
||||
, y_axis = "lineage_labels"
|
||||
, with_facet = T
|
||||
, facet_wrap_var = "mutation_info_labels"
|
||||
, leg_label = "Mutation Class"
|
||||
, leg_pos_wf = "none"
|
||||
, leg_dir_wf = "horizontal")
|
||||
linP_dm_om_facet
|
||||
|
||||
#=================
|
||||
# output plot:
|
||||
# without facet
|
||||
#=================
|
||||
svg(plot_lineage_dist_dm_om_ps)
|
||||
|
||||
linP_dm_om
|
||||
|
||||
dev.off()
|
142
scripts/plotting/logo_data.R
Normal file
142
scripts/plotting/logo_data.R
Normal file
|
@ -0,0 +1,142 @@
|
|||
#!/usr/bin/env Rscript
|
||||
#########################################################
|
||||
# TASK: Script to format data for Logo_plots
|
||||
#########################################################
|
||||
#-------------------------
|
||||
# choose df for logoplot
|
||||
#-------------------------
|
||||
logo_data = merged_df3
|
||||
#logo_data = merged_df3_comp
|
||||
|
||||
# quick checks
|
||||
colnames(logo_data)
|
||||
str(logo_data)
|
||||
|
||||
c1 = unique(logo_data$position)
|
||||
nrow(logo_data)
|
||||
cat("No. of rows in my_data:", nrow(logo_data)
|
||||
, "\nDistinct positions corresponding to snps:", length(c1)
|
||||
, "\n===========================================================")
|
||||
#=======================================================================
|
||||
#==================
|
||||
# logo data: OR
|
||||
#==================
|
||||
foo = logo_data[, c("position"
|
||||
, "mutant_type","duet_scaled", "or_mychisq"
|
||||
, "mut_prop_polarity", "mut_prop_water")]
|
||||
|
||||
logo_data$log10or = log10(logo_data$or_mychisq)
|
||||
logo_data_plot = logo_data[, c("position"
|
||||
, "mutant_type", "or_mychisq", "log10or")]
|
||||
|
||||
logo_data_plot_or = logo_data[, c("position", "mutant_type", "or_mychisq")]
|
||||
wide_df_or = logo_data_plot_or %>% spread(position, or_mychisq, fill = 0.0)
|
||||
|
||||
wide_df_or = as.matrix(wide_df_or)
|
||||
rownames(wide_df_or) = wide_df_or[,1]
|
||||
dim(wide_df_or)
|
||||
wide_df_or = wide_df_or[,-1]
|
||||
str(wide_df_or)
|
||||
|
||||
position_or = as.numeric(colnames(wide_df_or))
|
||||
|
||||
#==================
|
||||
# logo data: logOR
|
||||
#==================
|
||||
logo_data_plot_logor = logo_data[, c("position", "mutant_type", "log10or")]
|
||||
wide_df_logor <- logo_data_plot_logor %>% spread(position, log10or, fill = 0.0)
|
||||
|
||||
wide_df_logor = as.matrix(wide_df_logor)
|
||||
|
||||
rownames(wide_df_logor) = wide_df_logor[,1]
|
||||
wide_df_logor = subset(wide_df_logor, select = -c(1) )
|
||||
colnames(wide_df_logor)
|
||||
wide_df_logor_m = data.matrix(wide_df_logor)
|
||||
|
||||
rownames(wide_df_logor_m)
|
||||
colnames(wide_df_logor_m)
|
||||
|
||||
position_logor = as.numeric(colnames(wide_df_logor_m))
|
||||
|
||||
#===============================
|
||||
# logo data: multiple nsSNPs (>1)
|
||||
#=================================
|
||||
#require(data.table)
|
||||
|
||||
# get freq count of positions so you can subset freq<1
|
||||
setDT(logo_data)[, mut_pos_occurrence := .N, by = .(position)]
|
||||
|
||||
table(logo_data$position)
|
||||
table(logo_data$mut_pos_occurrence)
|
||||
|
||||
max_mut = max(table(logo_data$position))
|
||||
|
||||
# extract freq_pos > 1
|
||||
my_data_snp = logo_data[logo_data$mut_pos_occurrence!=1,]
|
||||
u = unique(my_data_snp$position)
|
||||
max_mult_mut = max(table(my_data_snp$position))
|
||||
|
||||
if (nrow(my_data_snp) == nrow(logo_data) - table(logo_data$mut_pos_occurrence)[[1]] ){
|
||||
|
||||
cat("PASS: positions with multiple muts extracted"
|
||||
, "\nNo. of mutations:", nrow(my_data_snp)
|
||||
, "\nNo. of positions:", length(u)
|
||||
, "\nMax no. of muts at any position", max_mult_mut)
|
||||
}else{
|
||||
cat("FAIL: positions with multiple muts could NOT be extracted"
|
||||
, "\nExpected:",nrow(logo_data) - table(logo_data$mut_pos_occurrence)[[1]]
|
||||
, "\nGot:", nrow(my_data_snp) )
|
||||
}
|
||||
|
||||
cat("\nNo. of sites with only 1 mutations:", table(logo_data$mut_pos_occurrence)[[1]])
|
||||
|
||||
#--------------------------------------
|
||||
# matrix for_mychisq mutant type
|
||||
# frequency of mutant type by position
|
||||
#---------------------------------------
|
||||
table(my_data_snp$mutant_type, my_data_snp$position)
|
||||
tab_mt = table(my_data_snp$mutant_type, my_data_snp$position)
|
||||
class(tab_mt)
|
||||
|
||||
# unclass to convert to matrix
|
||||
tab_mt = unclass(tab_mt)
|
||||
tab_mt = as.matrix(tab_mt, rownames = T)
|
||||
|
||||
# should be TRUE
|
||||
is.matrix(tab_mt)
|
||||
|
||||
rownames(tab_mt) #aa
|
||||
colnames(tab_mt) #pos
|
||||
|
||||
#-------------------------------------
|
||||
# matrix for wild type
|
||||
# frequency of wild type by position
|
||||
#-------------------------------------
|
||||
tab_wt = table(my_data_snp$wild_type, my_data_snp$position); tab_wt
|
||||
tab_wt = unclass(tab_wt)
|
||||
|
||||
# remove wt duplicates
|
||||
wt = my_data_snp[, c("position", "wild_type")]
|
||||
wt = wt[!duplicated(wt),]
|
||||
|
||||
tab_wt = table(wt$wild_type, wt$position); tab_wt # should all be 1
|
||||
|
||||
rownames(tab_wt)
|
||||
rownames(tab_wt)
|
||||
|
||||
identical(colnames(tab_mt), colnames(tab_wt))
|
||||
identical(ncol(tab_mt), ncol(tab_wt))
|
||||
|
||||
#----------------------------------
|
||||
# logo data OR: multiple nsSNPs (>1)
|
||||
#----------------------------------
|
||||
logo_data_or_mult = my_data_snp[, c("position", "mutant_type", "or_mychisq")]
|
||||
#wide_df_or = logo_data_or %>% spread(position, or_mychisq, fill = 0.0)
|
||||
wide_df_or_mult = logo_data_or_mult %>% spread(position, or_mychisq, fill = NA)
|
||||
|
||||
wide_df_or_mult = as.matrix(wide_df_or_mult)
|
||||
rownames(wide_df_or_mult) = wide_df_or_mult[,1]
|
||||
wide_df_or_mult = wide_df_or_mult[,-1]
|
||||
str(wide_df_or_mult)
|
||||
|
||||
position_or_mult = as.numeric(colnames(wide_df_or_mult))
|
0
scripts/plotting/opp_mcsm_muts.R
Normal file → Executable file
0
scripts/plotting/opp_mcsm_muts.R
Normal file → Executable file
0
scripts/plotting/or_plots_combined.R
Normal file → Executable file
0
scripts/plotting/or_plots_combined.R
Normal file → Executable file
13
scripts/plotting/other_plots_combined.R
Normal file → Executable file
13
scripts/plotting/other_plots_combined.R
Normal file → Executable file
|
@ -35,7 +35,7 @@ plot_dr_other_combined_labelled = paste0(plotdir,"/", dr_other_combined_labell
|
|||
#my_comparisons <- list( c(dr_muts_col, other_muts_col) )
|
||||
my_comparisons <- list( c("DM", "OM") )
|
||||
|
||||
my_ats = 22# axis text size
|
||||
my_ats = 22 # axis text size
|
||||
my_als = 20 # axis label size
|
||||
my_fls = 20 # facet label size
|
||||
my_pts = 22 # plot title size
|
||||
|
@ -45,12 +45,15 @@ my_pts = 22 # plot title size
|
|||
#===========
|
||||
# Plot1: PS
|
||||
#===========
|
||||
my_stat_ps = compare_means(param_value~mutation_info, group.by = "param_type"
|
||||
, data = df_lf_ps, paired = FALSE, p.adjust.method = "BH")
|
||||
# my_stat_ps = compare_means(param_value~mutation_info
|
||||
# , group.by = "param_type"
|
||||
# , data = df_lf_ps
|
||||
# , paired = FALSE
|
||||
# , p.adjust.method = "BH")
|
||||
|
||||
y_value = "param_value"
|
||||
|
||||
p1 = ggplot(df_lf_ps, aes(x = mutation_info
|
||||
p1 = ggplot(lf_duet, aes(x = mutation_info
|
||||
, y = eval(parse(text=y_value)) )) +
|
||||
facet_wrap(~ param_type
|
||||
, nrow = 1
|
||||
|
@ -61,7 +64,7 @@ p1 = ggplot(df_lf_ps, aes(x = mutation_info
|
|||
geom_point(position = position_jitterdodge(dodge.width=0.01)
|
||||
, alpha = 0.5
|
||||
, show.legend = FALSE
|
||||
, aes(colour = factor(duet_outcome))) +
|
||||
, aes(colour = duet_outcome)) +
|
||||
theme(axis.text.x = element_text(size = my_ats)
|
||||
, axis.text.y = element_text(size = my_ats
|
||||
, angle = 0
|
||||
|
|
0
scripts/plotting/output_tables.R
Normal file → Executable file
0
scripts/plotting/output_tables.R
Normal file → Executable file
0
scripts/plotting/ps_plots_combined.R
Normal file → Executable file
0
scripts/plotting/ps_plots_combined.R
Normal file → Executable file
80
scripts/plotting/redundant/coloured_bp_data.R
Normal file
80
scripts/plotting/redundant/coloured_bp_data.R
Normal file
|
@ -0,0 +1,80 @@
|
|||
#!/usr/bin/env Rscript
|
||||
#################################################################
|
||||
# TASK: Script to add bp colours ~ barplot heatmap
|
||||
#################################################################
|
||||
|
||||
my_df = merged_df3
|
||||
|
||||
cols_to_select = c("mutationinformation", "drtype"
|
||||
, "wild_type"
|
||||
, "position"
|
||||
, "mutant_type"
|
||||
, "chain", "ligand_id", "ligand_distance"
|
||||
, "duet_stability_change", "duet_outcome", "duet_scaled"
|
||||
, "ligand_affinity_change", "ligand_outcome", "affinity_scaled"
|
||||
, "ddg_foldx", "foldx_scaled", "foldx_outcome"
|
||||
, "deepddg", "deepddg_outcome" # comment out as not available for pnca
|
||||
, "asa", "rsa", "rd_values", "kd_values"
|
||||
, "af", "or_mychisq", "pval_fisher"
|
||||
, "or_fisher", "or_logistic", "pval_logistic"
|
||||
, "wt_prop_water", "mut_prop_water", "wt_prop_polarity", "mut_prop_polarity"
|
||||
, "wt_calcprop", "mut_calcprop")
|
||||
|
||||
#=======================
|
||||
# Data for sub colours
|
||||
# barplot: PS
|
||||
#=======================
|
||||
|
||||
cat("\nNo. of cols to select:", length(cols_to_select))
|
||||
|
||||
subcols_df_ps = my_df[, cols_to_select]
|
||||
|
||||
cat("\nNo of unique positions for ps:"
|
||||
, length(unique(subcols_df_ps$position)))
|
||||
|
||||
# add count_pos col that counts the no. of nsSNPS at a position
|
||||
setDT(subcols_df_ps)[, pos_count := .N, by = .(position)]
|
||||
|
||||
# should be a factor
|
||||
if (is.factor(subcols_df_ps$duet_outcome)){
|
||||
cat("\nDuet_outcome is factor")
|
||||
table(subcols_df_ps$duet_outcome)
|
||||
}else{
|
||||
cat("\nConverting duet_outcome to factor")
|
||||
subcols_df_ps$duet_outcome = as.factor(subcols_df_ps$duet_outcome)
|
||||
table(subcols_df_ps$duet_outcome)
|
||||
}
|
||||
|
||||
# should be -1 and 1
|
||||
min(subcols_df_ps$duet_scaled)
|
||||
max(subcols_df_ps$duet_scaled)
|
||||
|
||||
tapply(subcols_df_ps$duet_scaled, subcols_df_ps$duet_outcome, min)
|
||||
tapply(subcols_df_ps$duet_scaled, subcols_df_ps$duet_outcome, max)
|
||||
|
||||
# check unique values in normalised data
|
||||
cat("\nNo. of unique values in duet scaled, no rounding:"
|
||||
, length(unique(subcols_df_ps$duet_scaled)))
|
||||
|
||||
# No rounding
|
||||
my_grp = subcols_df_ps$duet_scaled; length(my_grp)
|
||||
|
||||
# Add rounding is to be used
|
||||
n = 3
|
||||
subcols_df_ps$duet_scaledR = round(subcols_df_ps$duet_scaled, n)
|
||||
|
||||
cat("\nNo. of unique values in duet scaled", n, "places rounding:"
|
||||
, length(unique(subcols_df_ps$duet_scaledR)))
|
||||
|
||||
my_grp_r = subcols_df_ps$duet_scaledR # rounding
|
||||
|
||||
# Add grp cols
|
||||
subcols_df_ps$group <- paste0(subcols_df_ps$duet_outcome, "_", my_grp, sep = "")
|
||||
subcols_df_ps$groupR <- paste0(subcols_df_ps$duet_outcome, "_", my_grp_r, sep = "")
|
||||
|
||||
# Call the function to create the palette based on the group defined above
|
||||
subcols_ps <- ColourPalleteMulti(subcols_df_ps, "duet_outcome", "my_grp")
|
||||
subcolsR_ps <- ColourPalleteMulti(subcols_df_ps, "duet_outcome", "my_grp_r")
|
||||
|
||||
cat("Colour palette generated for my_grp: ", length(subcols_ps), " colours")
|
||||
cat("Colour palette generated for my_grp_r: ", length(subcolsR_ps), " colours")
|
0
scripts/plotting/lineage_dist_combined_PS.R → scripts/plotting/redundant/lineage_dist_combined_PS.R
Normal file → Executable file
0
scripts/plotting/lineage_dist_combined_PS.R → scripts/plotting/redundant/lineage_dist_combined_PS.R
Normal file → Executable file
176
scripts/plotting/lineage_dist_dm_om_combined_PS.R → scripts/plotting/redundant/lineage_dist_dm_om_combined_PS.R
Normal file → Executable file
176
scripts/plotting/lineage_dist_dm_om_combined_PS.R → scripts/plotting/redundant/lineage_dist_dm_om_combined_PS.R
Normal file → Executable file
|
@ -15,44 +15,8 @@ setwd("~/git/LSHTM_analysis/scripts/plotting/")
|
|||
getwd()
|
||||
|
||||
source("Header_TT.R")
|
||||
library(ggridges)
|
||||
library(plyr)
|
||||
source("combining_dfs_plotting.R")
|
||||
# PS combined:
|
||||
# 1) merged_df2
|
||||
# 2) merged_df2_comp
|
||||
# 3) merged_df3
|
||||
# 4) merged_df3_comp
|
||||
|
||||
# LIG combined:
|
||||
# 5) merged_df2_lig
|
||||
# 6) merged_df2_comp_lig
|
||||
# 7) merged_df3_lig
|
||||
# 8) merged_df3_comp_lig
|
||||
|
||||
# 9) my_df_u
|
||||
# 10) my_df_u_lig
|
||||
|
||||
cat("Directories imported:"
|
||||
, "\n===================="
|
||||
, "\ndatadir:", datadir
|
||||
, "\nindir:", indir
|
||||
, "\noutdir:", outdir
|
||||
, "\nplotdir:", plotdir)
|
||||
|
||||
cat("Variables imported:"
|
||||
, "\n====================="
|
||||
, "\ndrug:", drug
|
||||
, "\ngene:", gene
|
||||
, "\ngene_match:", gene_match
|
||||
, "\nAngstrom symbol:", angstroms_symbol
|
||||
, "\nNo. of duplicated muts:", dup_muts_nu
|
||||
, "\nNA count for ORs:", na_count
|
||||
, "\nNA count in df2:", na_count_df2
|
||||
, "\nNA count in df3:", na_count_df3
|
||||
, "\ndr_muts_col:", dr_muts_col
|
||||
, "\nother_muts_col:", other_muts_col
|
||||
, "\ndrtype_col:", resistance_col)
|
||||
source("get_plotting_dfs.R")
|
||||
|
||||
cat("cols imported:"
|
||||
, mcsm_red2, mcsm_red1, mcsm_mid, mcsm_blue1, mcsm_blue2)
|
||||
|
@ -77,67 +41,24 @@ plot_lineage_dist_combined_dm_om_L = paste0(plotdir,"/", lineage_dist_combined
|
|||
# we lose some muts and at this level, we should use
|
||||
# as much info as available, hence use df with NA
|
||||
###########################
|
||||
# REASSIGNMENT
|
||||
my_df = merged_df2
|
||||
|
||||
# delete variables not required
|
||||
rm(my_df_u, merged_df2, merged_df2_comp, merged_df3, merged_df3_comp
|
||||
, merged_df2_lig, merged_df2_comp_lig, merged_df3_lig, merged_df3_comp_lig)
|
||||
|
||||
# quick checks
|
||||
colnames(my_df)
|
||||
str(my_df)
|
||||
|
||||
table(my_df$mutation_info)
|
||||
|
||||
#===================
|
||||
# Data for plots
|
||||
#===================
|
||||
table(my_df$lineage); str(my_df$lineage)
|
||||
|
||||
# select lineages 1-4
|
||||
sel_lineages = c("lineage1"
|
||||
, "lineage2"
|
||||
, "lineage3"
|
||||
, "lineage4")
|
||||
#, "lineage5"
|
||||
#, "lineage6"
|
||||
#, "lineage7")
|
||||
|
||||
# works nicely with facet wrap using labeller, but not otherwise
|
||||
#my_labels = c('Lineage 1'
|
||||
# , 'Lineage 2'
|
||||
# , 'Lineage 3'
|
||||
# , 'Lineage 4')
|
||||
# #, 'Lineage 5'
|
||||
# #, 'Lineage 6'
|
||||
# #, 'Lineage 7')
|
||||
|
||||
#names(my_labels) = c('lineage1'
|
||||
# , 'lineage2'
|
||||
# , 'lineage3'
|
||||
# , 'lineage4')
|
||||
# #, 'lineage5'
|
||||
# #, 'lineage6'
|
||||
# #, 'lineage7')
|
||||
|
||||
#==========================
|
||||
# subset selected lineages
|
||||
#==========================
|
||||
df_lin = subset(my_df, subset = lineage %in% sel_lineages)
|
||||
table(df_lin$lineage)
|
||||
lin_dist_plot = merged_df2[merged_df2$lineage%in%c("lineage1", "lineage2", "lineage3", "lineage4"),]
|
||||
table(lin_dist_plot$lineage)
|
||||
|
||||
#{RESULT: Total number of samples for lineage}
|
||||
sum(table(df_lin$lineage))
|
||||
sum(table(lin_dist_plot$lineage))
|
||||
|
||||
#{RESULT: No of samples within lineage}
|
||||
table(df_lin$lineage)
|
||||
table(lin_dist_plot$lineage)
|
||||
|
||||
#{Result: No. of unique mutations the 4 lineages contribute to}
|
||||
length(unique(df_lin$mutationinformation))
|
||||
length(unique(lin_dist_plot$mutationinformation))
|
||||
|
||||
u2 = unique(my_df$mutationinformation)
|
||||
u = unique(df_lin$mutationinformation)
|
||||
u2 = unique(lin_dist_plot$mutationinformation)
|
||||
u = unique(lin_dist_plot$mutationinformation)
|
||||
|
||||
#{Result:Muts not present within selected lineages}
|
||||
check = u2[!u2%in%u]; print(check)
|
||||
|
@ -148,37 +69,38 @@ check = u2[!u2%in%u]; print(check)
|
|||
# from "plyr"
|
||||
#==================
|
||||
#{Result:No of samples in selected lineages}
|
||||
table(df_lin$lineage)
|
||||
table(lin_dist_plot$lineage)
|
||||
|
||||
df_lin$lineage_labels = mapvalues(df_lin$lineage
|
||||
lin_dist_plot$lineage_labels = mapvalues(lin_dist_plot$lineage
|
||||
, from = c("lineage1","lineage2", "lineage3", "lineage4")
|
||||
, to = c("Lineage 1", "Lineage 2", "Lineage 3", "Lineage 4"))
|
||||
table(df_lin$lineage_labels)
|
||||
table(lin_dist_plot$lineage_labels)
|
||||
|
||||
table(df_lin$lineage_labels) == table(df_lin$lineage)
|
||||
table(lin_dist_plot$lineage_labels) == table(lin_dist_plot$lineage)
|
||||
|
||||
#========================
|
||||
# mutation_info: labels
|
||||
#========================
|
||||
#{Result:No of DM and OM muts in selected lineages}
|
||||
table(df_lin$mutation_info)
|
||||
table(lin_dist_plot$mutation_info)
|
||||
|
||||
df_lin$mutation_info_labels = ifelse(df_lin$mutation_info == dr_muts_col, "DM", "OM")
|
||||
table(df_lin$mutation_info_labels)
|
||||
|
||||
table(df_lin$mutation_info) == table(df_lin$mutation_info_labels)
|
||||
lin_dist_plot$mutation_info_labels = ifelse(lin_dist_plot$mutation_info == dr_muts_col
|
||||
, "DM", "OM")
|
||||
table(lin_dist_plot$mutation_info_labels)
|
||||
|
||||
table(lin_dist_plot$mutation_info) == table(lin_dist_plot$mutation_info_labels)
|
||||
|
||||
#========================
|
||||
# duet_outcome: labels
|
||||
#========================
|
||||
#{Result: No. of D and S mutations in selected lineages}
|
||||
table(df_lin$duet_outcome)
|
||||
table(lin_dist_plot$duet_outcome)
|
||||
|
||||
df_lin$duet_outcome_labels = ifelse(df_lin$duet_outcome == "Destabilising", "D", "S")
|
||||
table(df_lin$duet_outcome_labels)
|
||||
lin_dist_plot$duet_outcome_labels = ifelse(lin_dist_plot$duet_outcome == "Destabilising"
|
||||
, "D", "S")
|
||||
table(lin_dist_plot$duet_outcome_labels)
|
||||
|
||||
table(df_lin$duet_outcome) == table(df_lin$duet_outcome_labels)
|
||||
table(lin_dist_plot$duet_outcome) == table(lin_dist_plot$duet_outcome_labels)
|
||||
|
||||
|
||||
#=======================
|
||||
|
@ -198,25 +120,14 @@ table(df_lin$duet_outcome) == table(df_lin$duet_outcome_labels)
|
|||
########################################################################
|
||||
# end of data extraction and cleaning for plots #
|
||||
########################################################################
|
||||
|
||||
#==========================
|
||||
# Distribution plots
|
||||
#============================
|
||||
|
||||
#%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
# REASSIGNMENT
|
||||
df <- df_lin
|
||||
#%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
|
||||
rm(df_lin)
|
||||
|
||||
#******************
|
||||
# generate distribution plot of lineages
|
||||
#******************
|
||||
# 2 : ggridges (good!)
|
||||
my_ats = 15 # axis text size
|
||||
my_als = 20 # axis label size
|
||||
n_colours = length(unique(df$duet_scaled))
|
||||
n_colours = length(unique(lin_dist_plot$duet_scaled))
|
||||
|
||||
my_palette <- colorRampPalette(c(mcsm_red2, mcsm_red1, mcsm_mid, mcsm_blue1, mcsm_blue2))(n = n_colours+1)
|
||||
|
||||
#=======================================
|
||||
|
@ -232,17 +143,23 @@ my_palette <- colorRampPalette(c(mcsm_red2, mcsm_red1, mcsm_mid, mcsm_blue1, mcs
|
|||
#plot_lineage_dist_duet_f
|
||||
#svg(plot_lineage_dist_duet_f)
|
||||
|
||||
p1 = ggplot(df, aes(x = duet_scaled
|
||||
, y = duet_outcome))+
|
||||
p1 = ggplot(lin_dist_plot, aes(x = duet_scaled
|
||||
#, y = duet_outcome
|
||||
, y = mutation_info_labels
|
||||
))+
|
||||
geom_density_ridges_gradient(aes(fill = ..x..)
|
||||
#, jittered_points = TRUE
|
||||
, scale = 3
|
||||
, size = 0.3 ) +
|
||||
facet_wrap( ~lineage_labels
|
||||
#~mutation_info_labels
|
||||
# ~mutation_info_labels
|
||||
# , scales = "free"
|
||||
# , labeller = labeller(lineage = my_labels)
|
||||
) +
|
||||
coord_cartesian( xlim = c(-1, 1)) +
|
||||
#coord_cartesian( xlim = c(-1, 1)) +
|
||||
scale_x_continuous(expand = c(0.01, 0)) +
|
||||
|
||||
scale_fill_gradientn(colours = my_palette
|
||||
, name = "DUET"
|
||||
#, breaks = c(-1, 0, 1)
|
||||
|
@ -264,8 +181,8 @@ p1 = ggplot(df, aes(x = duet_scaled
|
|||
#, legend.title = element_text(size = my_als-6)
|
||||
, legend.title = element_blank()
|
||||
, legend.position = c(-0.08, 0.41)
|
||||
#, legend.direction = "horizontal"
|
||||
#, legend.position = "left"
|
||||
, legend.direction = "horizontal"
|
||||
, legend.position = "top"
|
||||
)+
|
||||
labs(x = "DUET")
|
||||
|
||||
|
@ -286,13 +203,14 @@ p1
|
|||
#plot_lineage_dist_duet_dm_om
|
||||
#svg(plot_lineage_dist_duet_dm_om)
|
||||
|
||||
p2 = ggplot(df, aes(x = duet_scaled
|
||||
p2 = ggplot(lin_dist_plot, aes(x = duet_scaled
|
||||
, y = lineage_labels))+
|
||||
geom_density_ridges(aes(fill = factor(mutation_info_labels))
|
||||
, scale = 3
|
||||
, size = 0.3
|
||||
, alpha = 0.8) +
|
||||
coord_cartesian( xlim = c(-1, 1)) +
|
||||
scale_x_continuous(expand = c(0.01, 0)) +
|
||||
#coord_cartesian( xlim = c(-1, 1)) +
|
||||
scale_fill_manual(values = c("#E69F00", "#999999")) +
|
||||
theme(axis.text.x = element_text(size = my_ats
|
||||
, angle = 90
|
||||
|
@ -325,14 +243,18 @@ p2
|
|||
#plot_lineage_dist_duet_nf
|
||||
#svg(plot_lineage_dist_duet_nf)
|
||||
|
||||
p3 = ggplot(df, aes(x = duet_scaled
|
||||
p3 = ggplot(lin_dist_plot, aes(x = duet_scaled
|
||||
, y = lineage_labels))+
|
||||
geom_density_ridges_gradient(aes(fill = ..x..)
|
||||
#, jittered_points = TRUE
|
||||
, scale = 3
|
||||
, size = 0.3 ) +
|
||||
coord_cartesian( xlim = c(-1, 1)) +
|
||||
scale_fill_gradientn(colours = my_palette, name = "DUET") +
|
||||
# geom_density_ridges_gradient(aes(fill = ..x..)
|
||||
# #, jittered_points = TRUE
|
||||
# , scale = 3
|
||||
# , size = 0.3 ) +
|
||||
geom_density_ridges()+
|
||||
#facet_wrap (~mutation_info_labels) +
|
||||
#coord_cartesian( xlim = c(-1, 1)) +
|
||||
scale_x_continuous(expand = c(0.01, 0)) +
|
||||
|
||||
#scale_fill_gradientn(colours = my_palette, name = "DUET") +
|
||||
theme(axis.text.x = element_text(size = my_ats
|
||||
, angle = 90
|
||||
, hjust = 1
|
117
scripts/plotting/redundant/other_dfs_data.R
Normal file
117
scripts/plotting/redundant/other_dfs_data.R
Normal file
|
@ -0,0 +1,117 @@
|
|||
#!/usr/bin/env Rscript
|
||||
|
||||
# Didn't end up using it: sorted it at the source
|
||||
# .py script to combine all dfs to output all_params
|
||||
|
||||
#################################################################
|
||||
# TASK: Script to add all other dfs to merged_df2 and merged_df3
|
||||
|
||||
#################################################################
|
||||
# Combine other dfs:
|
||||
# dynamut_df, dynamut2_df, mcsm_na_df,
|
||||
# perhaps : deepddg and mcsm ppi (for embb)
|
||||
################################################################
|
||||
# read other files
|
||||
infilename_dynamut = paste0("~/git/Data/", drug, "/output/dynamut_results/", gene
|
||||
, "_complex_dynamut_norm.csv")
|
||||
|
||||
infilename_dynamut2 = paste0("~/git/Data/", drug, "/output/dynamut_results/dynamut2/", gene
|
||||
, "_complex_dynamut2_norm.csv")
|
||||
|
||||
infilename_mcsm_na = paste0("~/git/Data/", drug, "/output/mcsm_na_results/", gene
|
||||
, "_complex_mcsm_na_norm.csv")
|
||||
|
||||
infilename_mcsm_f_snps <- paste0("~/git/Data/", drug, "/output/", gene
|
||||
, "_mcsm_formatted_snps.csv")
|
||||
|
||||
dynamut_df = read.csv(infilename_dynamut)
|
||||
dynamut2_df = read.csv(infilename_dynamut2)
|
||||
mcsm_na_df = read.csv(infilename_mcsm_na)
|
||||
mcsm_f_snps = read.csv(infilename_mcsm_f_snps, header = F)
|
||||
names(mcsm_f_snps) = "mutationinformation"
|
||||
|
||||
#=================================
|
||||
# check with intersect to find the common col, but use
|
||||
c1 = length(intersect(names(dynamut_df), names(dynamut2_df)))
|
||||
c2 = length(intersect(names(dynamut2_df), names(mcsm_na_df)))
|
||||
|
||||
if (c1 == 1 && c2 == 1) {
|
||||
n_common = 1
|
||||
}else{
|
||||
cat("\nMore than one common col found, inspect before merging!")
|
||||
}
|
||||
|
||||
# mutationinformation column to be on the safe side
|
||||
# delete chain from dynamut2_df
|
||||
#dynamut2_df = subset(dynamut2_df, select = -chain)
|
||||
|
||||
# quick checks
|
||||
lapply(list(dynamut_df
|
||||
, dynamut2_df
|
||||
, mcsm_na_df), ncol)
|
||||
|
||||
lapply(list(dynamut_df
|
||||
, dynamut2_df
|
||||
, mcsm_na_df), colnames)
|
||||
|
||||
lapply(list(dynamut_df
|
||||
, dynamut2_df
|
||||
, mcsm_na_df), nrow)
|
||||
|
||||
ncols_comb = lapply(list(dynamut_df
|
||||
, dynamut2_df
|
||||
, mcsm_na_df), ncol)
|
||||
|
||||
#---------------------------------
|
||||
# Combine 1: all other params dfs
|
||||
#---------------------------------
|
||||
combined_dfs = Reduce(inner_join, list(dynamut_df
|
||||
, dynamut2_df
|
||||
, mcsm_na_df))
|
||||
# Reduce("+", ncols_comb)
|
||||
|
||||
#-----------------------------------------
|
||||
# Combine 2: combine1 result + merged_df2
|
||||
#-----------------------------------------
|
||||
drop_cols = intersect(names(combined_dfs), names(merged_df2))
|
||||
drop_cols = drop_cols
|
||||
|
||||
drop_cols = drop_cols[! drop_cols %in% c("mutationinformation")]
|
||||
|
||||
combined_dfs_f = combined_dfs[, !colnames(combined_dfs)%in%drop_cols]
|
||||
|
||||
nrow(combined_dfs_f); nrow(merged_df2)
|
||||
ncol(combined_dfs_f); ncol(merged_df2)
|
||||
|
||||
#-----------------------------------------
|
||||
# Combined merged_df2
|
||||
#-----------------------------------------
|
||||
merged_df2_combined = merge(merged_df2
|
||||
, combined_dfs_f
|
||||
, by = "mutationinformation"
|
||||
)
|
||||
|
||||
expected_ncols = ncol(combined_dfs_f)+ ncol(merged_df2) - 1
|
||||
|
||||
if ( nrow(merged_df2_combined) == nrow(merged_df2) && ncol(merged_df2_combined) == expected_ncols ){
|
||||
|
||||
cat("\nPASS: merged_df2 combined with other parameters dfs."
|
||||
, "\nUse this for lineage distribution plots")
|
||||
}else{
|
||||
|
||||
cat("\nFAIL: merged_df2 didn't combine successfully with other parameters dfs")
|
||||
quit()
|
||||
|
||||
}
|
||||
|
||||
rm(combined_dfs, combined_dfs_f)
|
||||
|
||||
#================================
|
||||
# combined data
|
||||
# short_df ps: ~ merged_df3
|
||||
# TODO: later integrate properly
|
||||
#================================
|
||||
#-----------------------------------------
|
||||
# Combined merged_df2
|
||||
#-----------------------------------------
|
||||
merged_df3_combined = merged_df2_combined[!duplicated(merged_df2_combined$mutationinformation),]
|
470
scripts/plotting/redundant/other_plots_data.R
Executable file
470
scripts/plotting/redundant/other_plots_data.R
Executable file
|
@ -0,0 +1,470 @@
|
|||
#!/usr/bin/env Rscript
|
||||
#########################################################
|
||||
# TASK: Script to format data for dm om plots:
|
||||
# generating LF data
|
||||
# sourced by get_plotting_dfs.R
|
||||
#########################################################
|
||||
# working dir and loading libraries
|
||||
# getwd()
|
||||
# setwd("~/git/LSHTM_analysis/scripts/plotting")
|
||||
# getwd()
|
||||
|
||||
# make cmd
|
||||
# globals
|
||||
# drug = "streptomycin"
|
||||
# gene = "gid"
|
||||
|
||||
# source("get_plotting_dfs.R")
|
||||
#=======================================================================
|
||||
# MOVE TO COMBINE or singular file for deepddg
|
||||
#
|
||||
# cols_to_select = c("mutation", "mutationinformation"
|
||||
# , "wild_type", "position", "mutant_type"
|
||||
# , "mutation_info")
|
||||
#
|
||||
# merged_df3_short = merged_df3[, cols_to_select]
|
||||
|
||||
# infilename_mcsm_f_snps <- paste0("~/git/Data/", drug, "/output/", gene
|
||||
# , "_mcsm_formatted_snps.csv")
|
||||
#
|
||||
# mcsm_f_snps<- read.csv(infilename_mcsm_f_snps, header = F)
|
||||
# names(mcsm_f_snps) <- "mutationinformation"
|
||||
|
||||
# write merged_df3 to generate structural figure on chimera
|
||||
#write.csv(merged_df3_short, "merged_df3_short.csv")
|
||||
#========================================================================
|
||||
|
||||
#========================================================================
|
||||
# cols to select
|
||||
|
||||
cols_mcsm_df <- merged_df3[, c("mutationinformation", "mutation"
|
||||
, "mutation_info", "position"
|
||||
, LigDist_colname
|
||||
, "duet_stability_change", "duet_scaled", "duet_outcome"
|
||||
, "ligand_affinity_change", "affinity_scaled", "ligand_outcome"
|
||||
, "ddg_foldx", "foldx_scaled", "foldx_outcome"
|
||||
, "deepddg", "deepddg_scaled", "deepddg_outcome"
|
||||
, "asa", "rsa"
|
||||
, "rd_values", "kd_values"
|
||||
, "log10_or_mychisq", "neglog_pval_fisher", "af")]
|
||||
|
||||
cols_mcsm_na_df <- mcsm_na_df[, c("mutationinformation"
|
||||
, "mcsm_na_affinity", "mcsm_na_scaled"
|
||||
, "mcsm_na_outcome")]
|
||||
# entire dynamut_df
|
||||
|
||||
cols_dynamut2_df <- dynamut2_df[, c("mutationinformation"
|
||||
, "ddg_dynamut2", "ddg_dynamut2_scaled"
|
||||
, "ddg_dynamut2_outcome")]
|
||||
|
||||
n_comb_cols = length(cols_mcsm_df) + length(cols_mcsm_na_df) +
|
||||
length(dynamut_df) + length(cols_dynamut2_df); n_comb_cols
|
||||
|
||||
i1<- intersect(names(cols_mcsm_df), names(cols_mcsm_na_df))
|
||||
i2<- intersect(names(dynamut_df), names(cols_dynamut2_df))
|
||||
merging_cols <- intersect(i1, i2)
|
||||
cat("\nmerging_cols:", merging_cols)
|
||||
|
||||
if (merging_cols == "mutationinformation") {
|
||||
cat("\nStage 1: Found common col between dfs, checking values in it...")
|
||||
c1 <- all(mcsm_f_snps[[merging_cols]]%in%cols_mcsm_df[[merging_cols]])
|
||||
c2 <- all(mcsm_f_snps[[merging_cols]]%in%cols_mcsm_na_df[[merging_cols]])
|
||||
c3 <- all(mcsm_f_snps[[merging_cols]]%in%dynamut_df[[merging_cols]])
|
||||
c4 <- all(mcsm_f_snps[[merging_cols]]%in%cols_dynamut2_df[[merging_cols]])
|
||||
cols_check <- c(c1, c2, c3, c4)
|
||||
expected_cols = n_comb_cols - ( length(cols_check) - 1)
|
||||
if (all(cols_check)){
|
||||
cat("\nStage 2: Proceeding with merging dfs:\n")
|
||||
comb_df <- Reduce(inner_join, list(cols_mcsm_df
|
||||
, cols_mcsm_na_df
|
||||
, dynamut_df
|
||||
, cols_dynamut2_df))
|
||||
comb_df_s = arrange(comb_df, position)
|
||||
|
||||
# if ( nrow(comb_df_s) == nrow(mcsm_f_snps) && ncol(comb_df_s) == expected_cols) {
|
||||
# cat("\Stage3, PASS: dfs merged sucessfully"
|
||||
# , "\nnrow of merged_df: ", nrow(comb_df_s)
|
||||
# , "\nncol of merged_df:", ncol(comb_df_s))
|
||||
# }
|
||||
|
||||
}
|
||||
}
|
||||
#names(comb_df_s)
|
||||
cat("\n!!!IT GOT TO HERE!!!!")
|
||||
#=======================================================================
|
||||
fact_cols = colnames(comb_df_s)[grepl( "_outcome|_info", colnames(comb_df_s) )]
|
||||
fact_cols
|
||||
lapply(comb_df_s[, fact_cols], class)
|
||||
comb_df_s[, fact_cols] <- lapply(comb_df_s[, fact_cols], as.factor)
|
||||
|
||||
if (any(lapply(comb_df_s[, fact_cols], class) == "character")){
|
||||
cat("\nChanging cols to factor")
|
||||
comb_df_s[, fact_cols] <- lapply(comb_df_s[, fact_cols],as.factor)
|
||||
if (all(lapply(comb_df_s[, fact_cols], class) == "factor")){
|
||||
cat("\nSuccessful: cols changed to factor")
|
||||
}
|
||||
}
|
||||
lapply(comb_df_s[, fact_cols], class)
|
||||
|
||||
#=======================================================================
|
||||
table(comb_df_s$mutation_info)
|
||||
|
||||
# further checks to make sure dr and other muts are indeed unique
|
||||
dr_muts = comb_df_s[comb_df_s$mutation_info == dr_muts_col,]
|
||||
dr_muts_names = unique(dr_muts$mutation)
|
||||
|
||||
other_muts = comb_df_s[comb_df_s$mutation_info == other_muts_col,]
|
||||
other_muts_names = unique(other_muts$mutation)
|
||||
|
||||
if ( table(dr_muts_names%in%other_muts_names)[[1]] == length(dr_muts_names) &&
|
||||
table(other_muts_names%in%dr_muts_names)[[1]] == length(other_muts_names) ){
|
||||
cat("PASS: dr and other muts are indeed unique")
|
||||
}else{
|
||||
cat("FAIL: dr and others muts are NOT unique!")
|
||||
quit()
|
||||
}
|
||||
|
||||
# pretty display names i.e. labels to reduce major code duplication later
|
||||
foo_cnames = data.frame(colnames(comb_df_s))
|
||||
names(foo_cnames) <- "old_name"
|
||||
|
||||
stability_suffix <- paste0(delta_symbol, delta_symbol, "G")
|
||||
flexibility_suffix <- paste0(delta_symbol, delta_symbol, "S")
|
||||
|
||||
lig_dn = paste0("Ligand distance (", angstroms_symbol, ")"); lig_dn
|
||||
duet_dn = paste0("DUET ", stability_suffix); duet_dn
|
||||
foldx_dn = paste0("FoldX ", stability_suffix); foldx_dn
|
||||
deepddg_dn = paste0("Deepddg " , stability_suffix); deepddg_dn
|
||||
mcsm_na_dn = paste0("mCSM-NA affinity ", stability_suffix); mcsm_na_dn
|
||||
dynamut_dn = paste0("Dynamut ", stability_suffix); dynamut_dn
|
||||
dynamut2_dn = paste0("Dynamut2 " , stability_suffix); dynamut2_dn
|
||||
encom_ddg_dn = paste0("EnCOM " , stability_suffix); encom_ddg_dn
|
||||
encom_dds_dn = paste0("EnCOM " , flexibility_suffix ); encom_dds_dn
|
||||
sdm_dn = paste0("SDM " , stability_suffix); sdm_dn
|
||||
mcsm_dn = paste0("mCSM " , stability_suffix ); mcsm_dn
|
||||
|
||||
# Change colnames of some columns using datatable
|
||||
comb_df_sl = comb_df_s
|
||||
names(comb_df_sl)
|
||||
|
||||
setnames(comb_df_sl
|
||||
, old = c("asa", "rsa", "rd_values", "kd_values"
|
||||
, "log10_or_mychisq", "neglog_pval_fisher", "af"
|
||||
, LigDist_colname
|
||||
, "duet_scaled"
|
||||
, "foldx_scaled"
|
||||
, "deepddg_scaled"
|
||||
, "mcsm_na_scaled"
|
||||
, "ddg_dynamut_scaled"
|
||||
, "ddg_dynamut2_scaled"
|
||||
, "ddg_encom_scaled"
|
||||
, "dds_encom_scaled"
|
||||
, "ddg_sdm"
|
||||
, "ddg_mcsm")
|
||||
|
||||
, new = c("ASA", "RSA", "RD", "KD"
|
||||
, "Log10 (OR)", "-Log (P)", "MAF"
|
||||
, lig_dn
|
||||
, duet_dn
|
||||
, foldx_dn
|
||||
, deepddg_dn
|
||||
, mcsm_na_dn
|
||||
, dynamut_dn
|
||||
, dynamut2_dn
|
||||
, encom_ddg_dn
|
||||
, encom_dds_dn
|
||||
, sdm_dn
|
||||
, mcsm_dn)
|
||||
)
|
||||
|
||||
foo_cnames <- cbind(foo_cnames, colnames(comb_df_sl))
|
||||
|
||||
# some more pretty labels
|
||||
table(comb_df_sl$mutation_info)
|
||||
|
||||
levels(comb_df_sl$mutation_info)[levels(comb_df_sl$mutation_info)==dr_muts_col] <- "DM"
|
||||
levels(comb_df_sl$mutation_info)[levels(comb_df_sl$mutation_info)==other_muts_col] <- "OM"
|
||||
|
||||
table(comb_df_sl$mutation_info)
|
||||
|
||||
#######################################################################
|
||||
#======================
|
||||
# Selecting dfs
|
||||
# with appropriate cols
|
||||
#=======================
|
||||
static_cols_start = c("mutationinformation"
|
||||
, "position"
|
||||
, "mutation"
|
||||
, "mutation_info")
|
||||
|
||||
static_cols_end = c(lig_dn
|
||||
, "ASA"
|
||||
, "RSA"
|
||||
, "RD"
|
||||
, "KD")
|
||||
|
||||
# ordering is important!
|
||||
|
||||
#########################################################################
|
||||
#==============
|
||||
# DUET: LF
|
||||
#==============
|
||||
cols_to_select_duet = c(static_cols_start, c("duet_outcome", duet_dn), static_cols_end)
|
||||
wf_duet = comb_df_sl[, cols_to_select_duet]
|
||||
|
||||
#pivot_cols_ps = cols_to_select_ps[1:5]; pivot_cols_ps
|
||||
pivot_cols_duet = cols_to_select_duet[1: (length(static_cols_start) + 1)]; pivot_cols_duet
|
||||
|
||||
expected_rows_lf = nrow(wf_duet) * (length(wf_duet) - length(pivot_cols_duet))
|
||||
expected_rows_lf
|
||||
|
||||
# LF data: duet
|
||||
lf_duet = gather(wf_duet
|
||||
, key = param_type
|
||||
, value = param_value
|
||||
, all_of(duet_dn):tail(static_cols_end,1)
|
||||
, factor_key = TRUE)
|
||||
|
||||
if (nrow(lf_duet) == expected_rows_lf){
|
||||
cat("\nPASS: long format data created for ", duet_dn)
|
||||
}else{
|
||||
cat("\nFAIL: long format data could not be created for duet")
|
||||
quit()
|
||||
}
|
||||
|
||||
############################################################################
|
||||
#==============
|
||||
# FoldX: LF
|
||||
#==============
|
||||
cols_to_select_foldx= c(static_cols_start, c("foldx_outcome", foldx_dn), static_cols_end)
|
||||
wf_foldx = comb_df_sl[, cols_to_select_foldx]
|
||||
|
||||
pivot_cols_foldx = cols_to_select_foldx[1: (length(static_cols_start) + 1)]; pivot_cols_foldx
|
||||
|
||||
expected_rows_lf = nrow(wf_foldx) * (length(wf_foldx) - length(pivot_cols_foldx))
|
||||
expected_rows_lf
|
||||
|
||||
# LF data: duet
|
||||
print("TESTXXXXXXXXXXXXXXXXXXXXX---------------------->>>>")
|
||||
lf_foldx <<- gather(wf_foldx
|
||||
, key = param_type
|
||||
, value = param_value
|
||||
, all_of(foldx_dn):tail(static_cols_end,1)
|
||||
, factor_key = TRUE)
|
||||
|
||||
if (nrow(lf_foldx) == expected_rows_lf){
|
||||
cat("\nPASS: long format data created for ", foldx_dn)
|
||||
}else{
|
||||
cat("\nFAIL: long format data could not be created for duet")
|
||||
quit()
|
||||
}
|
||||
|
||||
############################################################################
|
||||
#==============
|
||||
# Deepddg: LF
|
||||
#==============
|
||||
cols_to_select_deepddg = c(static_cols_start, c("deepddg_outcome", deepddg_dn), static_cols_end)
|
||||
wf_deepddg = comb_df_sl[, cols_to_select_deepddg]
|
||||
|
||||
pivot_cols_deepddg = cols_to_select_deepddg[1: (length(static_cols_start) + 1)]; pivot_cols_deepddg
|
||||
|
||||
expected_rows_lf = nrow(wf_deepddg) * (length(wf_deepddg) - length(pivot_cols_deepddg))
|
||||
expected_rows_lf
|
||||
|
||||
# LF data: duet
|
||||
lf_deepddg = gather(wf_deepddg
|
||||
, key = param_type
|
||||
, value = param_value
|
||||
, all_of(deepddg_dn):tail(static_cols_end,1)
|
||||
, factor_key = TRUE)
|
||||
|
||||
if (nrow(lf_deepddg) == expected_rows_lf){
|
||||
cat("\nPASS: long format data created for ", deepddg_dn)
|
||||
}else{
|
||||
cat("\nFAIL: long format data could not be created for duet")
|
||||
quit()
|
||||
}
|
||||
|
||||
############################################################################
|
||||
#==============
|
||||
# mCSM-NA: LF
|
||||
#==============
|
||||
cols_to_select_mcsm_na = c(static_cols_start, c("mcsm_na_outcome", mcsm_na_dn), static_cols_end)
|
||||
wf_mcsm_na = comb_df_sl[, cols_to_select_mcsm_na]
|
||||
|
||||
pivot_cols_mcsm_na = cols_to_select_mcsm_na[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm_na
|
||||
|
||||
expected_rows_lf = nrow(wf_mcsm_na) * (length(wf_mcsm_na) - length(pivot_cols_mcsm_na))
|
||||
expected_rows_lf
|
||||
|
||||
# LF data: duet
|
||||
lf_mcsm_na = gather(wf_mcsm_na
|
||||
, key = param_type
|
||||
, value = param_value
|
||||
, all_of(mcsm_na_dn):tail(static_cols_end,1)
|
||||
, factor_key = TRUE)
|
||||
|
||||
if (nrow(lf_mcsm_na) == expected_rows_lf){
|
||||
cat("\nPASS: long format data created for ", mcsm_na_dn)
|
||||
}else{
|
||||
cat("\nFAIL: long format data could not be created for duet")
|
||||
quit()
|
||||
}
|
||||
|
||||
############################################################################
|
||||
#==============
|
||||
# Dynamut: LF
|
||||
#==============
|
||||
cols_to_select_dynamut = c(static_cols_start, c("ddg_dynamut_outcome", dynamut_dn), static_cols_end)
|
||||
wf_dynamut = comb_df_sl[, cols_to_select_dynamut]
|
||||
|
||||
pivot_cols_dynamut = cols_to_select_dynamut[1: (length(static_cols_start) + 1)]; pivot_cols_dynamut
|
||||
|
||||
expected_rows_lf = nrow(wf_dynamut) * (length(wf_dynamut) - length(pivot_cols_dynamut))
|
||||
expected_rows_lf
|
||||
|
||||
# LF data: duet
|
||||
lf_dynamut = gather(wf_dynamut
|
||||
, key = param_type
|
||||
, value = param_value
|
||||
, all_of(dynamut_dn):tail(static_cols_end,1)
|
||||
, factor_key = TRUE)
|
||||
|
||||
if (nrow(lf_dynamut) == expected_rows_lf){
|
||||
cat("\nPASS: long format data created for ", dynamut_dn)
|
||||
}else{
|
||||
cat("\nFAIL: long format data could not be created for duet")
|
||||
quit()
|
||||
}
|
||||
|
||||
############################################################################
|
||||
#==============
|
||||
# Dynamut2: LF
|
||||
#==============
|
||||
cols_to_select_dynamut2 = c(static_cols_start, c("ddg_dynamut2_outcome", dynamut2_dn), static_cols_end)
|
||||
|
||||
wf_dynamut2 = comb_df_sl[, cols_to_select_dynamut2]
|
||||
|
||||
pivot_cols_dynamut2 = cols_to_select_dynamut2[1: (length(static_cols_start) + 1)]; pivot_cols_dynamut2
|
||||
|
||||
expected_rows_lf = nrow(wf_dynamut2) * (length(wf_dynamut2) - length(pivot_cols_dynamut2))
|
||||
expected_rows_lf
|
||||
|
||||
# LF data: duet
|
||||
lf_dynamut2 = gather(wf_dynamut2
|
||||
, key = param_type
|
||||
, value = param_value
|
||||
, all_of(dynamut2_dn):tail(static_cols_end,1)
|
||||
, factor_key = TRUE)
|
||||
|
||||
if (nrow(lf_dynamut2) == expected_rows_lf){
|
||||
cat("\nPASS: long format data created for ", dynamut2_dn)
|
||||
}else{
|
||||
cat("\nFAIL: long format data could not be created for duet")
|
||||
quit()
|
||||
}
|
||||
|
||||
############################################################################
|
||||
#==============
|
||||
# EnCOM ddg: LF
|
||||
#==============
|
||||
cols_to_select_encomddg = c(static_cols_start, c("ddg_encom_outcome", encom_ddg_dn), static_cols_end)
|
||||
wf_encomddg = comb_df_sl[, cols_to_select_encomddg]
|
||||
|
||||
pivot_cols_encomddg = cols_to_select_encomddg[1: (length(static_cols_start) + 1)]; pivot_cols_encomddg
|
||||
|
||||
expected_rows_lf = nrow(wf_encomddg ) * (length(wf_encomddg ) - length(pivot_cols_encomddg))
|
||||
expected_rows_lf
|
||||
|
||||
# LF data: encomddg
|
||||
lf_encomddg = gather(wf_encomddg
|
||||
, key = param_type
|
||||
, value = param_value
|
||||
, all_of(encom_ddg_dn):tail(static_cols_end,1)
|
||||
, factor_key = TRUE)
|
||||
|
||||
if (nrow(lf_encomddg) == expected_rows_lf){
|
||||
cat("\nPASS: long format data created for ", encom_ddg_dn)
|
||||
}else{
|
||||
cat("\nFAIL: long format data could not be created for duet")
|
||||
quit()
|
||||
}
|
||||
############################################################################
|
||||
#==============
|
||||
# EnCOM dds: LF
|
||||
#==============
|
||||
cols_to_select_encomdds = c(static_cols_start, c("dds_encom_outcome", encom_dds_dn), static_cols_end)
|
||||
wf_encomdds = comb_df_sl[, cols_to_select_encomdds]
|
||||
|
||||
pivot_cols_encomdds = cols_to_select_encomdds[1: (length(static_cols_start) + 1)]; pivot_cols_encomdds
|
||||
|
||||
expected_rows_lf = nrow(wf_encomdds) * (length(wf_encomdds) - length(pivot_cols_encomdds))
|
||||
expected_rows_lf
|
||||
|
||||
# LF data: encomddg
|
||||
lf_encomdds = gather(wf_encomdds
|
||||
, key = param_type
|
||||
, value = param_value
|
||||
, all_of(encom_dds_dn):tail(static_cols_end,1)
|
||||
, factor_key = TRUE)
|
||||
|
||||
if (nrow(lf_encomdds) == expected_rows_lf){
|
||||
cat("\nPASS: long format data created for", encom_dds_dn)
|
||||
}else{
|
||||
cat("\nFAIL: long format data could not be created for duet")
|
||||
quit()
|
||||
}
|
||||
|
||||
############################################################################
|
||||
#==============
|
||||
# SDM: LF
|
||||
#==============
|
||||
cols_to_select_sdm = c(static_cols_start, c("ddg_sdm_outcome", sdm_dn), static_cols_end)
|
||||
wf_sdm = comb_df_sl[, cols_to_select_sdm]
|
||||
|
||||
pivot_cols_sdm = cols_to_select_sdm[1: (length(static_cols_start) + 1)]; pivot_cols_sdm
|
||||
|
||||
expected_rows_lf = nrow(wf_sdm) * (length(wf_sdm) - length(pivot_cols_sdm))
|
||||
expected_rows_lf
|
||||
|
||||
# LF data: encomddg
|
||||
lf_sdm = gather(wf_sdm
|
||||
, key = param_type
|
||||
, value = param_value
|
||||
, all_of(sdm_dn):tail(static_cols_end,1)
|
||||
, factor_key = TRUE)
|
||||
|
||||
if (nrow(lf_sdm) == expected_rows_lf){
|
||||
cat("\nPASS: long format data created for", sdm_dn)
|
||||
}else{
|
||||
cat("\nFAIL: long format data could not be created for duet")
|
||||
quit()
|
||||
}
|
||||
|
||||
############################################################################
|
||||
#==============
|
||||
# mCSM: LF
|
||||
#==============
|
||||
cols_to_select_mcsm = c(static_cols_start, c("ddg_mcsm_outcome", mcsm_dn), static_cols_end)
|
||||
wf_mcsm = comb_df_sl[, cols_to_select_mcsm]
|
||||
|
||||
pivot_cols_mcsm = cols_to_select_mcsm[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm
|
||||
|
||||
expected_rows_lf = nrow(wf_mcsm) * (length(wf_mcsm) - length(pivot_cols_mcsm))
|
||||
expected_rows_lf
|
||||
|
||||
# LF data: encomddg
|
||||
lf_mcsm = gather(wf_mcsm
|
||||
, key = param_type
|
||||
, value = param_value
|
||||
, all_of(mcsm_dn):tail(static_cols_end,1)
|
||||
, factor_key = TRUE)
|
||||
|
||||
if (nrow(lf_mcsm) == expected_rows_lf){
|
||||
cat("\nPASS: long format data created for", mcsm_dn)
|
||||
}else{
|
||||
cat("\nFAIL: long format data could not be created for duet")
|
||||
quit()
|
||||
}
|
||||
############################################################################
|
||||
|
0
scripts/plotting/resolving_ambiguous_muts.R
Normal file → Executable file
0
scripts/plotting/resolving_ambiguous_muts.R
Normal file → Executable file
|
@ -112,6 +112,30 @@ note:
|
|||
- fa flag has default if not supplied
|
||||
- fb flag has default if not supplied
|
||||
|
||||
|
||||
#====================================
|
||||
# lineage_basic_barplots_combined.R
|
||||
#====================================
|
||||
#-----------------------------------------------------------------------
|
||||
./lineage_basic_barplots_combined.R-d streptomycin -g gid
|
||||
#-----------------------------------------------------------------------
|
||||
|
||||
It replaces (and has an added diversity plot)
|
||||
## lineage_basic_barplot.R
|
||||
These have been moved to redundant/
|
||||
|
||||
|
||||
sources:
|
||||
## get_plotting_dfs.R
|
||||
## functions//bp_lineage.R"
|
||||
|
||||
outputs: 1 svg in the plotdir
|
||||
## basic_lineage_barplots_combined.svg
|
||||
|
||||
note:
|
||||
- fa flag has default if not supplied
|
||||
- fb flag has default if not supplied
|
||||
|
||||
########################################################################
|
||||
# TODO
|
||||
Delete: dirs.R
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue