refactoring code to make it take command line args

Tanushree Tunstall 2020-04-06 19:03:41 +01:00
parent 0191ee3493
commit 3905a81c38
42 changed files with 456 additions and 7206 deletions


@ -1,8 +1,10 @@
mCSM Analysis
=============
This repo does mCSM analysis using bash, python and R.
This contains scripts that do the following:
1. mCSM analysis: using bash, python and R
2. meta data analysis: using python and R
Requires an additional 'Data' directory. Batteries not included:-)
## Assumptions
@ -19,17 +21,14 @@ subdirs within this repo
*.R
*.py
mcsm_analysis/
<drug>/
scripts/
*.R
*.py
mcsm/
*.sh
*.py
*.R
plotting/
*.R
mcsm_analysis
# <drug>/
foldx_analysis
plotting
*.R
```
More docs here as I write them.


@ -1,130 +0,0 @@
#########################################################
### A) Installing and loading required packages
#########################################################
#lib_loc = "/usr/local/lib/R/site-library")
#if (!require("gplots")) {
# install.packages("gplots", dependencies = TRUE)
# library(gplots)
#}
#if (!require("tidyverse")) {
# install.packages("tidyverse", dependencies = TRUE)
# library(tidyverse)
#}
if (!require("ggplot2")) {
install.packages("ggplot2", dependencies = TRUE)
library(ggplot2)
}
if (!require("plotly")) {
install.packages("plotly", dependencies = TRUE)
library(plotly)
}
if (!require("cowplot")) {
install.packages("copwplot", dependencies = TRUE)
library(cowplot)
}
if (!require("ggcorrplot")) {
install.packages("ggcorrplot", dependencies = TRUE)
library(ggcorrplot)
}
if (!require("ggpubr")) {
install.packages("ggpubr", dependencies = TRUE)
library(ggpubr)
}
if (!require("RColorBrewer")) {
install.packages("RColorBrewer", dependencies = TRUE)
library(RColorBrewer)
}
if (!require ("GOplot")) {
install.packages("GOplot")
library(GOplot)
}
if(!require("VennDiagram")) {
install.packages("VennDiagram", dependencies = T)
library(VennDiagram)
}
if(!require("scales")) {
install.packages("scales", dependencies = T)
library(scales)
}
if(!require("plotrix")) {
install.packages("plotrix", dependencies = T)
library(plotrix)
}
if(!require("stats")) {
install.packages("stats", dependencies = T)
library(stats)
}
if(!require("stats4")) {
install.packages("stats4", dependencies = T)
library(stats4)
}
if(!require("data.table")) {
install.packages("data.table")
library(data.table)
}
if (!require("PerformanceAnalytics")){
install.packages("PerformanceAnalytics", dependencies = T)
library(PerformanceAnalytics)
}
if (!require ("GGally")){
install.packages("GGally")
library(GGally)
}
if (!require ("corrr")){
install.packages("corrr")
library(corrr)
}
if (!require ("psych")){
install.packages("psych")
library(psych)
}
if (!require ("dplyr")){
install.packages("dplyr")
library(dplyr)
}
if (!require ("compare")){
install.packages("compare")
library(compare)
}
if (!require ("arsenal")){
install.packages("arsenal")
library(arsenal)
}
####TIDYVERSE
# Install
#if(!require(devtools)) install.packages("devtools")
#devtools::install_github("kassambara/ggcorrplot")
library(ggcorrplot)
###for PDB files
#install.packages("bio3d")
if(!require(bio3d)){
install.packages("bio3d")
library(bio3d)
}
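# NOTE (sketch, not part of the original header, kept commented so behaviour
# is unchanged): the install-if-missing pattern above is repeated for every
# package; an equivalent compact loop is shown below. The package vector is
# illustrative and should be extended to the full set required above.
#pkgs = c("ggplot2", "plotly", "cowplot", "ggcorrplot", "ggpubr", "RColorBrewer"
#         , "GOplot", "VennDiagram", "scales", "plotrix", "data.table", "bio3d")
#for (p in pkgs){
#  if (!require(p, character.only = TRUE)) {
#    install.packages(p, dependencies = TRUE)
#    library(p, character.only = TRUE)
#  }
#}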


@ -1,157 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("../Header_TT.R")
#source("../barplot_colour_function.R")
#require(data.table)
########################################################################
# Read file: call script for combining df for PS #
########################################################################
source("../combining_two_df.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA for pyrazinamide:
# merged_df2
# merged_df3
# df without NA for pyrazinamide:
# merged_df2_comp
# merged_df3_comp
#===========================
###########################
# Data for plots
# you need merged_df2 or merged_df2_comp
# since this is a one-to-many relationship
# i.e. the same SNP can belong to multiple lineages.
# Using the _comp dataset means we lose some muts,
# and at this level we should use as much info as
# is available, hence use the df with NAs
###########################
# uncomment as necessary
#%%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
my_df = merged_df2
#my_df = merged_df2_comp
#%%%%%%%%%%%%%%%%%%%%%%%%%
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# Ensure correct data type in columns to plot: need to be factor
is.factor(my_df$lineage)
my_df$lineage = as.factor(my_df$lineage)
is.factor(my_df$lineage)
table(my_df$mutation_info); str(my_df$mutation_info)
# subset df with dr muts only
my_df_dr = subset(my_df, mutation_info == "dr_mutations_pyrazinamide")
table(my_df_dr$mutation_info)
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#==========================
# Run two times:
# uncomment as necessary
# 1) for all muts
# 2) for dr_muts
#===========================
#%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
#================
# for ALL muts
#================
#plot_df = my_df
#================
# for dr muts ONLY
#================
plot_df = my_df_dr
#%%%%%%%%%%%%%%%%%%%%%%%%
#============================
# Plot: Lineage Distribution
# x = mcsm_values, y = dist
# fill = stability
#============================
table(plot_df$lineage); str(plot_df$lineage)
# subset only lineages1-4
sel_lineages = c("lineage1"
, "lineage2"
, "lineage3"
, "lineage4")
# uncomment as necessary
df_lin = subset(plot_df, subset = lineage %in% sel_lineages )
# refactor
df_lin$lineage = factor(df_lin$lineage)
table(df_lin$lineage) #{RESULT: No of samples within lineage}
#lineage1 lineage2 lineage3 lineage4
length(unique(df_lin$Mutationinformation))
#{Result: No. of unique mutations the 4 lineages contribute to}
# sanity checks
r1 = 2:5 # when merged_df2 is used: because there are missing lineages
if(sum(table(plot_df$lineage)[r1]) == nrow(df_lin)) {
print ("sanity check passed: numbers match")
} else{
print("Error!: check your numbers")
}
#%%%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
df <- df_lin
#%%%%%%%%%%%%%%%%%%%%%%%%%%
rm(df_lin)
# COMPARING DISTRIBUTIONS
head(df$lineage)
df$lineage = as.character(df$lineage)
lin1 = df[df$lineage == "lineage1",]$ratioDUET
lin2 = df[df$lineage == "lineage2",]$ratioDUET
lin3 = df[df$lineage == "lineage3",]$ratioDUET
lin4 = df[df$lineage == "lineage4",]$ratioDUET
# ks test
ks.test(lin1,lin2)
ks.test(lin1,lin3)
ks.test(lin1,lin4)
ks.test(lin2,lin3)
ks.test(lin2,lin4)
ks.test(lin3,lin4)
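# The six pairwise calls above can also be written as a loop (sketch;
# equivalent to the explicit ks.test calls, assuming df$lineage and
# df$ratioDUET as defined above)
lin_pairs = combn(sel_lineages, 2)
ks_pvals = apply(lin_pairs, 2, function(p){
  ks.test(df[df$lineage == p[1],]$ratioDUET
          , df[df$lineage == p[2],]$ratioDUET)$p.value
})
names(ks_pvals) = paste(lin_pairs[1,], lin_pairs[2,], sep = "_vs_")
ks_pvals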


@ -1,27 +0,0 @@
#########################################################
# 1b: Define function: coloured barplot by subgroup
# LINK: https://stackoverflow.com/questions/49818271/stacked-barplot-with-colour-gradients-for-each-bar
#########################################################
ColourPalleteMulti <- function(df, group, subgroup){
# Find how many colour categories to create and the number of colours in each
categories <- aggregate(as.formula(paste(subgroup, group, sep="~" ))
, df
, function(x) length(unique(x)))
# return(categories) }
category.start <- (scales::hue_pal(l = 100)(nrow(categories))) # Set the top of the colour palette
category.end <- (scales::hue_pal(l = 40)(nrow(categories))) # set the bottom
#return(category.start); return(category.end)}
# Build Colour palette
colours <- unlist(lapply(1:nrow(categories),
function(i){
colorRampPalette(colors = c(category.start[i]
, category.end[i]))(categories[i,2])}))
return(colours)
}
#########################################################
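# Usage sketch (hypothetical data, kept commented so sourcing this file has
# no side effects): builds one colour per (group, subgroup) combination,
# colour-graded within each group.
#toy_df = data.frame(grp = rep(c("A", "B"), each = 4)
#                    , sub = paste0("s", 1:4))
#my_cols = ColourPalleteMulti(toy_df, "grp", "sub")
#barplot(rep(1, length(my_cols)), col = my_cols, border = NA)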


@ -1,417 +0,0 @@
#########################################################
# TASK: To combine mcsm and meta data with af and or files
# Input csv files:
# 1) mcsm normalised and struct params
# 2) gene associated meta_data_with_AFandOR
# Output:
# 1) muts with opposite effects on stability
# 2) large combined df including NAs for AF, OR,etc
# Dim: same no. of rows as gene associated meta_data_with_AFandOR
# 3) small combined df including NAs for AF, OR, etc.
# Dim: same as mcsm data
# 4) large combined df excluding NAs
# Dim: dim(#1) - no. of NAs(AF|OR) + 1
# 5) small combined df excluding NAs
# Dim: dim(#2) - no. of unique NAs - 1
# This script is sourced from other .R scripts for plotting
#########################################################
getwd()
setwd('~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/')
getwd()
##########################################################
# Installing and loading required packages
##########################################################
source('Header_TT.R')
#require(data.table)
#require(arsenal)
#require(compare)
#library(tidyverse)
#################################
# Read file: normalised file
# output of step 4 mcsm_pipeline
#################################
#%% variable assignment: input and output paths & filenames
drug = 'pyrazinamide'
gene = 'pncA'
gene_match = paste0(gene,'_p.')
cat(gene_match)
#===========
# data dir
#===========
datadir = paste0('~/git/Data')
#===========
# input
#===========
# infile1: mCSM data
#indir = '~/git/Data/pyrazinamide/input/processed/'
indir = paste0(datadir, '/', drug, '/', 'output') # revised {TODO: change in mcsm pipeline}
#in_filename = 'mcsm_complex1_normalised.csv'
in_filename = 'pnca_mcsm_struct_params.csv'
infile = paste0(indir, '/', in_filename)
cat(paste0('Reading infile1: mCSM output file', ' ', infile) )
# infile2: gene associated meta data combined with AF and OR
#indir: same as above
in_filename_comb = paste0(tolower(gene), '_meta_data_with_AFandOR.csv')
infile_comb = paste0(indir, '/', in_filename_comb)
cat(paste0('Reading infile2: gene associated combined metadata:', infile_comb))
#===========
# output
#===========
# Uncomment if and when required to output
outdir = paste0('~/git/Data', '/', drug, '/', 'output') #same as indir
cat('Output dir: ', outdir)
#out_filename = paste0(tolower(gene), 'XXX')
#outfile = paste0(outdir, '/', out_filename)
#cat(paste0('Output file with full path:', outfile))
#%% end of variable assignment for input and output files
#################################
# Read file: normalised file
# output of step 4 mcsm_pipeline
#################################
cat('Reading mcsm_data:'
, '\nindir: ', indir
, '\ninfile: ', in_filename)
mcsm_data = read.csv(infile
, row.names = 1
, stringsAsFactors = F
, header = T)
cat('Read mcsm_data file:'
, '\nNo.of rows: ', nrow(mcsm_data)
, '\nNo. of cols:', ncol(mcsm_data))
# clear variables
rm(in_filename, infile)
str(mcsm_data)
table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) )
# spelling Correction 1: DUET
mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Stabilizing'] <- 'Stabilising'
mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Destabilizing'] <- 'Destabilising'
# checks: should be the same as above
table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) )
head(mcsm_data$DUET_outcome); tail(mcsm_data$DUET_outcome)
# spelling Correction 2: Ligand
table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) )
mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Stabilizing'] <- 'Stabilising'
mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Destabilizing'] <- 'Destabilising'
# checks: should be the same as above
table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) )
head(mcsm_data$Lig_outcome); tail(mcsm_data$Lig_outcome)
# muts with opposing effects on protomer and ligand stability
table(mcsm_data$DUET_outcome != mcsm_data$Lig_outcome)
changes = mcsm_data[which(mcsm_data$DUET_outcome != mcsm_data$Lig_outcome),]
# sanity check: redundant, but uber cautious!
dl_i = which(mcsm_data$DUET_outcome != mcsm_data$Lig_outcome)
ld_i = which(mcsm_data$Lig_outcome != mcsm_data$DUET_outcome)
cat('Identifying muts with opposite stability effects')
if(nrow(changes) == (table(mcsm_data$DUET_outcome != mcsm_data$Lig_outcome)[[2]]) & identical(dl_i,ld_i)) {
cat('PASS: muts with opposite effects on stability and affinity correctly identified'
, '\nNo. of such muts: ', nrow(changes))
}else {
cat('FAIL: unsuccessful in extracting muts with changed stability effects')
}
#***************************
# write file: changed muts
out_filename = 'muts_opp_effects.csv'
outfile = paste0(outdir, '/', out_filename)
cat('Writing file for muts with opp effects:'
, '\nFilename: ', outfile
, '\nPath: ', outdir)
write.csv(changes, outfile)
#****************************
# clear variables
rm(out_filename, outfile)
rm(changes, dl_i, ld_i)
# count na in each column
na_count = sapply(mcsm_data, function(y) sum(length(which(is.na(y))))); na_count
# sort by Mutationinformation
mcsm_data = mcsm_data[order(mcsm_data$Mutationinformation),]
head(mcsm_data$Mutationinformation)
orig_col = ncol(mcsm_data)
# get freq count of positions and add to the df
setDT(mcsm_data)[, occurrence := .N, by = .(Position)]
cat('Added 1 col: position frequency to see which posn has how many muts'
, '\nNo. of cols now', ncol(mcsm_data)
, '\nNo. of cols before: ', orig_col)
pos_count_check = data.frame(mcsm_data$Position, mcsm_data$occurrence)
###########################
# 2: Read file: meta data with AFandOR
###########################
cat('Reading combined meta data and AFandOR file:'
, '\nindir: ', indir
, '\ninfile_comb: ', in_filename_comb)
meta_with_afor <- read.csv(infile_comb
, stringsAsFactors = F
, header = T)
cat('Read meta_with_afor file:'
, '\nNo.of rows: ', nrow(meta_with_afor)
, '\nNo. of cols:', ncol(meta_with_afor))
# counting NAs in AF, OR cols
if (identical(sum(is.na(meta_with_afor$OR))
, sum(is.na(meta_with_afor$pvalue))
, sum(is.na(meta_with_afor$AF)))){
cat('PASS: NA count match for OR, pvalue and AF\n')
na_count = sum(is.na(meta_with_afor$AF))
cat('No. of NAs: ', sum(is.na(meta_with_afor$OR)))
} else{
cat('FAIL: NA count mismatch'
, '\nNA in OR: ', sum(is.na(meta_with_afor$OR))
, '\nNA in pvalue: ', sum(is.na(meta_with_afor$pvalue))
, '\nNA in AF:', sum(is.na(meta_with_afor$AF)))
}
# clear variables
rm(in_filename_comb, infile_comb)
str(meta_with_afor)
# sort by Mutationinformation
head(meta_with_afor$Mutationinformation)
meta_with_afor = meta_with_afor[order(meta_with_afor$Mutationinformation),]
head(meta_with_afor$Mutationinformation)
###########################
# 3: merging two dfs: with NA
###########################
# link col name = 'Mutationinformation'
head(mcsm_data$Mutationinformation)
head(meta_with_afor$Mutationinformation)
cat('Merging dfs with NAs: big df (1-many relationship b/w id & mut)'
,'\nlinking col: Mutationinformation'
,'\nfilename: merged_df2')
#########
# merge 3a (merged_df2): meta data with mcsm
#########
merged_df2 = merge(x = meta_with_afor
,y = mcsm_data
, by = 'Mutationinformation'
, all.y = T)
cat('Dim of merged_df2: '
, '\nNo. of rows: ', nrow(merged_df2)
, '\nNo. of cols: ', ncol(merged_df2))
head(merged_df2$Position)
# sanity check
cat('Checking nrows in merged_df2')
if(nrow(meta_with_afor) == nrow(merged_df2)){
cat('nrow(merged_df2) = nrow (gene associated metadata)'
,'\nExpected no. of rows: ',nrow(meta_with_afor)
,'\nGot no. of rows: ', nrow(merged_df2))
} else{
cat('nrow(merged_df2)!= nrow(gene associated metadata)'
, '\nExpected no. of rows after merge: ', nrow(meta_with_afor)
, '\nGot no. of rows: ', nrow(merged_df2)
, '\nFinding discrepancy')
merged_muts_u = unique(merged_df2$Mutationinformation)
meta_muts_u = unique(meta_with_afor$Mutationinformation)
# find the index where it differs
unique(meta_muts_u[! meta_muts_u %in% merged_muts_u])
}
# sort by Position
head(merged_df2$Position)
merged_df2 = merged_df2[order(merged_df2$Position),]
head(merged_df2$Position)
merged_df2v2 = merge(x = meta_with_afor
,y = mcsm_data
, by = 'Mutationinformation'
, all.x = T)
#!=!=!=!=!=!=!=!
# COMMENT: used all.y since position 186 is not part of the struc,
# hence doesn't have a mcsm value
# but 186 is associated with mutation
#!=!=!=!=!=!=!=!
# should be False
identical(merged_df2, merged_df2v2)
table(merged_df2$Position%in%merged_df2v2$Position)
rm(merged_df2v2)
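# Toy illustration (not part of the pipeline, kept commented): all.y keeps
# only muts present in the mcsm data, whereas all.x keeps metadata-only muts
# (e.g. a position like 186 that is absent from the structure) with NA for
# the mcsm cols.
#toy_meta = data.frame(Mutationinformation = c('A1B', 'C2D', 'E186F'), AF = c(0.1, 0.2, 0.3))
#toy_mcsm = data.frame(Mutationinformation = c('A1B', 'C2D'), DUET = c(-1.2, 0.4))
#merge(toy_meta, toy_mcsm, by = 'Mutationinformation', all.y = T) # 2 rows
#merge(toy_meta, toy_mcsm, by = 'Mutationinformation', all.x = T) # 3 rows, NA DUET for E186F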
#########
# merge 3b (merged_df3):remove duplicate mutation information
#########
cat('Merging dfs without NAs: small df (removing muts with no AF|OR associated)'
,'\nCannot trust lineage info from this'
,'\nlinking col: Mutationinformation'
,'\nfilename: merged_df3')
#==#=#=#=#=#=#
# Cannot trust lineage, country from this df as the same mutation
# can have many different lineages
# but this should be good for the numerical corr plots
#=#=#=#=#=#=#=
merged_df3 = merged_df2[!duplicated(merged_df2$Mutationinformation),]
head(merged_df3$Position); tail(merged_df3$Position) # should be sorted
# sanity check
cat('Checking nrows in merged_df3')
if(nrow(mcsm_data) == nrow(merged_df3)){
cat('PASS: No. of rows match with mcsm_data'
,'\nExpected no. of rows: ', nrow(mcsm_data)
,'\nGot no. of rows: ', nrow(merged_df3))
} else {
cat('FAIL: No. of rows mismatch'
, '\nNo. of rows mcsm_data: ', nrow(mcsm_data)
, '\nNo. of rows merged_df3: ', nrow(merged_df3))
}
# counting NAs in AF, OR cols in merged_df3
# this is because mcsm has no AF, OR cols,
# so you cannot count NAs
if (identical(sum(is.na(merged_df3$OR))
, sum(is.na(merged_df3$pvalue))
, sum(is.na(merged_df3$AF)))){
cat('PASS: NA count match for OR, pvalue and AF\n')
na_count_df3 = sum(is.na(merged_df3$AF))
cat('No. of NAs: ', sum(is.na(merged_df3$OR)))
} else{
cat('FAIL: NA count mismatch'
, '\nNA in OR: ', sum(is.na(merged_df3$OR))
, '\nNA in pvalue: ', sum(is.na(merged_df3$pvalue))
, '\nNA in AF:', sum(is.na(merged_df3$AF)))
}
###########################
# 4: merging two dfs: without NA
###########################
#########
# merge 4a (merged_df2_comp): same as merge 1 but excluding NA
#########
cat('Merging dfs without any NAs: big df (1-many relationship b/w id & mut)'
,'\nlinking col: Mutationinformation'
,'\nfilename: merged_df2_comp')
merged_df2_comp = merged_df2[!is.na(merged_df2$AF),]
#merged_df2_comp = merged_df2[!duplicated(merged_df2$Mutationinformation),]
# sanity check
cat('Checking nrows in merged_df2_comp')
if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count + 1)){
cat('PASS: No. of rows match'
,'\nDim of merged_df2_comp: '
,'\nExpected no. of rows: ', nrow(merged_df2) - na_count + 1
, '\nNo. of rows: ', nrow(merged_df2_comp)
, '\nNo. of cols: ', ncol(merged_df2_comp))
}else{
cat('FAIL: No. of rows mismatch'
,'\nExpected no. of rows: ', nrow(merged_df2) - na_count + 1
,'\nGot no. of rows: ', nrow(merged_df2_comp))
}
#########
# merge 4b (merged_df3_comp): remove duplicate mutation information
#########
merged_df3_comp = merged_df2_comp[!duplicated(merged_df2_comp$Mutationinformation),]
cat('Dim of merged_df3_comp: '
, '\nNo. of rows: ', nrow(merged_df3_comp)
, '\nNo. of cols: ', ncol(merged_df3_comp))
# alternate way of deriving merged_df3_comp
foo = merged_df3[!is.na(merged_df3$AF),]
# compare dfs: foo and merged_df3_comp
all.equal(foo, merged_df3_comp)
summary(comparedf(foo, merged_df3_comp))
# sanity check
cat('Checking nrows in merged_df3_comp')
if(nrow(merged_df3_comp) == nrow(merged_df3)){
cat('NO NAs detected in merged_df3 in AF|OR cols'
,'\nNo. of rows are identical: ', nrow(merged_df3))
} else{
if(nrow(merged_df3_comp) == nrow(merged_df3) - na_count_df3) {
cat('PASS: NAs detected in merged_df3 in AF|OR cols'
, '\nNo. of NAs: ', na_count_df3
, '\nExpected no. of rows in merged_df3_comp: ', nrow(merged_df3) - na_count_df3
, '\nGot no. of rows: ', nrow(merged_df3_comp))
}
}
#=============== end of combining df
#*********************
# writing 1 file in the style of a loop: merged_df3
# print(output dir)
#i = 'merged_df3'
#out_filename = paste0(i, '.csv')
#outfile = paste0(outdir, '/', out_filename)
#cat('Writing output file: '
# ,'\nFilename: ', out_filename
# ,'\nPath: ', outdir)
#template: write.csv(merged_df3, 'merged_df3.csv')
#write.csv(get(i), outfile, row.names = FALSE)
#cat('Finished writing: ', outfile
# , '\nNo. of rows: ', nrow(get(i))
# , '\nNo. of cols: ', ncol(get(i)))
#%% write_output files; all 4 files:
outvars = c('merged_df2'
, 'merged_df3'
, 'merged_df2_comp'
, 'merged_df3_comp')
cat('Writing output files: '
, '\nPath:', outdir)
for (i in outvars){
# cat(i, '\n')
out_filename = paste0(i, '.csv')
# cat(out_filename, '\n')
# cat('getting value of variable: ', get(i))
outfile = paste0(outdir, '/', out_filename)
# cat('Full output path: ', outfile, '\n')
cat('Writing output file:'
,'\nFilename: ', out_filename,'\n')
write.csv(get(i), outfile, row.names = FALSE)
cat('Finished writing: ', outfile
, '\nNo. of rows: ', nrow(get(i))
, '\nNo. of cols: ', ncol(get(i)), '\n')
}
# alternate way: replace the explicit loop above with an implicit one
#sapply(outvars, function(i) write.csv(get(i), paste0(outdir, '/', i, '.csv'), row.names = FALSE))
#*************************
# clear variables
rm(mcsm_data, meta_with_afor, foo, drug, gene, gene_match, indir, merged_muts_u, meta_muts_u, na_count, orig_col, outdir)
rm(pos_count_check)
#============================= end of script


@ -1,330 +0,0 @@
#########################################################
# TASK: To combine mcsm and meta data with af and or
# by filtering for distance to ligand (<10Ang).
# This script doesn't output anything.
# This script is sourced from other .R scripts for plotting ligand plots
# Input csv files:
# 1) mcsm normalised and struct params
# 2) gene associated meta_data_with_AFandOR
#########################################################
getwd()
setwd('~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/')
getwd()
##########################################################
# Installing and loading required packages
##########################################################
source('Header_TT.R')
#require(data.table)
#require(arsenal)
#require(compare)
#library(tidyverse)
#################################
# Read file: normalised file
# output of step 4 mcsm_pipeline
#################################
#%% variable assignment: input and output paths & filenames
drug = 'pyrazinamide'
gene = 'pncA'
gene_match = paste0(gene,'_p.')
cat(gene_match)
#===========
# input
#===========
# infile1: mCSM data
#indir = '~/git/Data/pyrazinamide/input/processed/'
indir = paste0('~/git/Data', '/', drug, '/', 'output') # revised {TODO: change in mcsm pipeline}
#in_filename = 'mcsm_complex1_normalised.csv'
in_filename = 'pnca_mcsm_struct_params.csv'
infile = paste0(indir, '/', in_filename)
cat(paste0('Reading infile1: mCSM output file', ' ', infile) )
# infile2: gene associated meta data combined with AF and OR
#indir: same as above
in_filename_comb = paste0(tolower(gene), '_meta_data_with_AFandOR.csv')
infile_comb = paste0(indir, '/', in_filename_comb)
cat(paste0('Reading infile2: gene associated combined metadata:', infile_comb))
#===========
# output
#===========
# Uncomment if and when required to output
outdir = paste0('~/git/Data', '/', drug, '/', 'output') #same as indir
cat('Output dir: ', outdir)
#out_filename = paste0(tolower(gene), 'XXX')
#outfile = paste0(outdir, '/', out_filename)
#cat(paste0('Output file with full path:', outfile))
#%% end of variable assignment for input and output files
#################################
# Read file: normalised file
# output of step 4 mcsm_pipeline
#################################
cat('Reading mcsm_data:'
, '\nindir: ', indir
, '\ninfile: ', in_filename)
mcsm_data = read.csv(infile
, row.names = 1
, stringsAsFactors = F
, header = T)
cat('Read mcsm_data file:'
, '\nNo.of rows: ', nrow(mcsm_data)
, '\nNo. of cols:', ncol(mcsm_data))
# clear variables
rm(in_filename, infile)
str(mcsm_data)
table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) )
# spelling Correction 1: DUET
mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Stabilizing'] <- 'Stabilising'
mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Destabilizing'] <- 'Destabilising'
# checks: should be the same as above
table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) )
head(mcsm_data$DUET_outcome); tail(mcsm_data$DUET_outcome)
# spelling Correction 2: Ligand
table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) )
mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Stabilizing'] <- 'Stabilising'
mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Destabilizing'] <- 'Destabilising'
# checks: should be the same as above
table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) )
head(mcsm_data$Lig_outcome); tail(mcsm_data$Lig_outcome)
# muts with opposing effects on protomer and ligand stability
# excluded from here as it is redundant.
# check 'combining_two_df.R' to refer if required.
########################### !!! only for mcsm_lig
# 4: Filter/subset data
# Lig plots < 10Ang
# Filter the lig plots for Dis_to_lig < 10Ang
###########################
# check range of distances
max(mcsm_data$Dis_lig_Ang)
min(mcsm_data$Dis_lig_Ang)
# count
table(mcsm_data$Dis_lig_Ang<10)
# subset data to have only values less than 10 Ang
mcsm_data2 = subset(mcsm_data, mcsm_data$Dis_lig_Ang < 10)
# sanity checks
max(mcsm_data2$Dis_lig_Ang)
min(mcsm_data2$Dis_lig_Ang)
# count no of unique positions
length(unique(mcsm_data2$Position))
# count no of unique mutations
length(unique(mcsm_data2$Mutationinformation))
# count destabilising and stabilising
table(mcsm_data2$Lig_outcome) #{RESULT: no of mutations within 10Ang}
#############################
# Extra sanity check:
# for mcsm_lig ONLY
# Dis_lig_Ang should be <10
#############################
if (max(mcsm_data2$Dis_lig_Ang) < 10){
print ("Sanity check passed: lig data is <10Ang")
}else{
print ("Error: data should be filtered to be within 10Ang")
}
#!!!!!!!!!!!!!!!!!!!!!
# REASSIGNMENT: so as not to alter the script
mcsm_data = mcsm_data2
#!!!!!!!!!!!!!!!!!!!!!
# clear variables
rm(mcsm_data2)
# count na in each column
na_count = sapply(mcsm_data, function(y) sum(length(which(is.na(y))))); na_count
# sort by Mutationinformation
mcsm_data = mcsm_data[order(mcsm_data$Mutationinformation),]
head(mcsm_data$Mutationinformation)
orig_col = ncol(mcsm_data)
# get freq count of positions and add to the df
setDT(mcsm_data)[, occurrence := .N, by = .(Position)]
cat('Added 1 col: position frequency to see which posn has how many muts'
, '\nNo. of cols now', ncol(mcsm_data)
, '\nNo. of cols before: ', orig_col)
pos_count_check = data.frame(mcsm_data$Position, mcsm_data$occurrence)
###########################
# 2: Read file: meta data with AFandOR
###########################
cat('Reading combined meta data and AFandOR file:'
, '\nindir: ', indir
, '\ninfile_comb: ', in_filename_comb)
meta_with_afor <- read.csv(infile_comb
, stringsAsFactors = F
, header = T)
cat('Read meta_with_afor file:'
, '\nNo.of rows: ', nrow(meta_with_afor)
, '\nNo. of cols:', ncol(meta_with_afor))
# clear variables
rm(in_filename_comb, infile_comb)
str(meta_with_afor)
# sort by Mutationinformation
head(meta_with_afor$Mutationinformation)
meta_with_afor = meta_with_afor[order(meta_with_afor$Mutationinformation),]
head(meta_with_afor$Mutationinformation)
###########################
# 3: merging two dfs: with NA
###########################
# link col name = 'Mutationinformation'
cat('Merging dfs with NAs: big df (1-many relationship b/w id & mut)'
,'\nlinking col: Mutationinformation'
,'\nfilename: merged_df2')
head(mcsm_data$Mutationinformation)
head(meta_with_afor$Mutationinformation)
#########
# merge 3a: meta data with mcsm
#########
merged_df2 = merge(x = meta_with_afor
,y = mcsm_data
, by = 'Mutationinformation'
, all.y = T)
cat('Dim of merged_df2: '
, '\nNo. of rows: ', nrow(merged_df2)
, '\nNo. of cols: ', ncol(merged_df2))
head(merged_df2$Position)
if(nrow(meta_with_afor) == nrow(merged_df2)){
cat('nrow(merged_df2) = nrow (gene associated metadata)'
,'\nExpected no. of rows: ',nrow(meta_with_afor)
,'\nGot no. of rows: ', nrow(merged_df2))
} else{
cat('nrow(merged_df2)!= nrow(gene associated metadata)'
, '\nExpected no. of rows after merge: ', nrow(meta_with_afor)
, '\nGot no. of rows: ', nrow(merged_df2)
, '\nFinding discrepancy')
merged_muts_u = unique(merged_df2$Mutationinformation)
meta_muts_u = unique(meta_with_afor$Mutationinformation)
# find the index where it differs
unique(meta_muts_u[! meta_muts_u %in% merged_muts_u])
}
# sort by Position
head(merged_df2$Position)
merged_df2 = merged_df2[order(merged_df2$Position),]
head(merged_df2$Position)
merged_df2v2 = merge(x = meta_with_afor
,y = mcsm_data
, by = 'Mutationinformation'
, all.x = T)
#!=!=!=!=!=!=!=!
# COMMENT: used all.y since position 186 is not part of the struc,
# hence doesn't have a mcsm value
# but 186 is associated with mutation
#!=!=!=!=!=!=!=!
# should be False
identical(merged_df2, merged_df2v2)
table(merged_df2$Position%in%merged_df2v2$Position)
rm(merged_df2v2)
#########
# merge 3b:remove duplicate mutation information
#########
cat('Merging dfs with NAs: small df (removing duplicate muts)'
,'\nCannot trust lineage info from this'
,'\nlinking col: Mutationinformation'
,'\nfilename: merged_df3')
#==#=#=#=#=#=#
# Cannot trust lineage, country from this df as the same mutation
# can have many different lineages
# but this should be good for the numerical corr plots
#=#=#=#=#=#=#=
merged_df3 = merged_df2[!duplicated(merged_df2$Mutationinformation),]
head(merged_df3$Position); tail(merged_df3$Position) # should be sorted
# sanity checks
# nrows of merged_df3 should be the same as the nrows of mcsm_data
if(nrow(mcsm_data) == nrow(merged_df3)){
cat('PASS: No. of rows match with mcsm_data'
,'\nExpected no. of rows: ', nrow(mcsm_data)
,'\nGot no. of rows: ', nrow(merged_df3))
} else {
cat('FAIL: No. of rows mismatch'
, '\nNo. of rows mcsm_data: ', nrow(mcsm_data)
, '\nNo. of rows merged_df3: ', nrow(merged_df3))
}
###########################
# 4: merging two dfs: without NA
###########################
cat('Merging dfs without any NAs: big df (1-many relationship b/w id & mut)'
,'\nlinking col: Mutationinformation'
,'\nfilename: merged_df2_comp')
#########
# merge 4a: same as merge 1 but excluding NA
#########
merged_df2_comp = merged_df2[!is.na(merged_df2$AF),]
#merged_df2_comp = merged_df2[!duplicated(merged_df2$Mutationinformation),]
cat('Dim of merged_df2_comp: '
, '\nNo. of rows: ', nrow(merged_df2_comp)
, '\nNo. of cols: ', ncol(merged_df2_comp))
#########
# merge 4b: remove duplicate mutation information
#########
merged_df3_comp = merged_df2_comp[!duplicated(merged_df2_comp$Mutationinformation),]
cat('Dim of merged_df3_comp: '
, '\nNo. of rows: ', nrow(merged_df3_comp)
, '\nNo. of cols: ', ncol(merged_df3_comp))
# alternate way of deriving merged_df3_comp
foo = merged_df3[!is.na(merged_df3$AF),]
# compare dfs: foo and merged_df3_comp
all.equal(foo, merged_df3_comp)
summary(comparedf(foo, merged_df3_comp))
#=============== end of combining df
#*********************
# write_output files
# Not required as this is a subset of the combining_two_df.R
#*************************
# clear variables
rm(mcsm_data, meta_with_afor, foo, drug, gene, gene_match, indir, merged_muts_u, meta_muts_u, na_count, orig_col, outdir)
rm(pos_count_check)
#============================= end of script


@ -1,215 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 25 08:46:36 2019
@author: tanushree
"""
############################################
# load libraries
import os
import pandas as pd
import numpy as np
from Bio import SeqIO
############################################
#********************************************************************
# TASK: Read in fasta files and create mutant sequences akin to an MSA,
# to allow generation of logo plots
# Requirements:
# input: Fasta file of protein/target for which mut seqs will be created
# path: "Data/<drug>/input/original/<filename>"
# output: MSA for mutant sequences
# path: "Data/<drug>/input/processed/<filename>"
#***********************************************************************
#%%
# specify input and output variables
homedir = os.path.expanduser('~')
#=======
# input
#=======
#############
# fasta file
#############
indir = 'git/Data/pyrazinamide/input/original'
in_filename_fasta = "3pl1.fasta.txt"
infile_fasta = homedir + '/' + indir + '/' + in_filename_fasta
print(infile_fasta)
#############
# meta data
#############
# FIXME when you change the dir struc
inpath_p = "git/Data/pyrazinamide/input/processed"
in_filename_meta_data = "meta_data_with_AFandOR.csv"
infile_meta_data = homedir + '/' + inpath_p + '/' + in_filename_meta_data
print("Input file is:", infile_meta_data)
#=======
# output
#=======
outdir = 'git/Data/pyrazinamide/output'
# filenames in respective sections
################## end of variable assignment for input and output files
#%%
#==========
# read files
#==========
#############
# fasta file
#############
my_fasta_o = str()
for seq_record in SeqIO.parse(infile_fasta, "fasta"):
    my_seq = seq_record.seq
    my_fasta_o = str(my_seq)  # convert to a string
print(my_fasta_o)
print(len(my_fasta_o))
# print( type(my_fasta) )
# remove non_struc positions from fasta
def remove_char(s, n):
    first_part = s[:n]
    last_part = s[n+1:]
    return first_part + last_part
#print(remove_char('Python', 0))
ns_pos_o = 186
offset = 1 # 0 based indexing
ns_pos = ns_pos_o - offset
my_fasta = remove_char(my_fasta_o, ns_pos)
print("orig length:", len(my_fasta_o))
print("new length:", len(my_fasta))
#############
# SNP info and no of MSA to generate
#############
# read mutant_info file and extract cols with positions and mutant_info
# This should be all samples with pncA muts
#my_data = pd.read_csv('mcsm_complex1_normalised.csv')
my_data = pd.read_csv(infile_meta_data)
list(my_data.columns)
#my_data['OR'].value_counts()
#my_data['OR'].isna().sum()
#FIXME: You need a better way to identify this
# ideally this file should not contain any non_struc pos
# remove positions not in the structure
my_data = my_data[my_data.position != ns_pos_o]
# if multiple positions, then try the example below;
# https://stackoverflow.com/questions/29017525/deleting-rows-based-on-multiple-conditions-python-pandas
#df = df[(df.one > 0) | (df.two > 0) | (df.three > 0) & (df.four < 1)]
# count mutations per sample
mut_info = my_data[['id', 'Mutationinformation', 'wild_type', 'position', 'mutant_type']]
# test
foo = mut_info[mut_info.Mutationinformation.str.contains('C72Y')]
foo = mut_info.pivot_table(values = ['Mutationinformation']
, index = ['Mutationinformation', 'id']
# , columns =
, aggfunc = 'count')
# table
foo_tab = mut_info.pivot_table(values = ['Mutationinformation']
# , index = ['Mutationinformation']
, columns = ['id', 'Mutationinformation']
, aggfunc = 'count'
# , margins = True)
)
foo_tab.stack('id')
mut_info.to_csv('mutinfo.csv')
mut_info1 = my_data[['position', 'mutant_type']]
#%%
################
# data cleaning
################
# extract only those positions that have a frequency count of pos>1
###mut_info['freq_pos'] = mut_info.groupby('Position').count()#### dodgy
# add a column of frequency for each position
#mut_info1['freq_pos'] = mut_info1.groupby('position')['position'].transform('count')
mut_info1['freq_pos'] = mut_info1.position.map(mut_info1.position.value_counts())
# sort by position
mut_info2 = mut_info1.sort_values(by=['position'])
# count how many pos have freq 1 as you will need to exclude those
mutfreq1_count = mut_info2[mut_info2.freq_pos == 1].sum().freq_pos
# extract entries with freq_pos>1
# should be 3093-211 = 3072
mut_info3 = mut_info2.loc[mut_info2['freq_pos'] >1] #3072
print("orig length:", len(mut_info1))
print("No. of excluded values:", mutfreq1_count)
print("new length:", len(mut_info3))
# sanity check
if ( (len(mut_info1) - mutfreq1_count) == len(mut_info3) ):
    print("Sanity check passed: Filtered data correctly")
else:
    print("Error: Debug your code")
# reset index to allow iteration !!!!!!!!!! IMPORTANT
mut_info = mut_info3.reset_index(drop = True)
##del(mut_info1, mut_info2, mut_info3, my_data)
###################
# generate mut seqs
###################
mut_seqsL = []
# iterate
for i, pos in enumerate(mut_info['position']):
    my_fastaL = list(my_fasta)
    mut = mut_info['mutant_type'][i]
    offset_pos = pos-1
    print('1-index:', pos, '0-index cur:', offset_pos, my_fastaL[offset_pos], 'mut:', mut)
    my_fastaL[offset_pos] = mut
    print('1-index:', pos, '0-index new:', offset_pos, my_fastaL[offset_pos], 'mut:', mut)
    mut_seq = "".join(my_fastaL)
    # print(mut_seq + '\n')
    print('original:', my_fasta, ',', 'replaced:', my_fasta[offset_pos], 'at', pos, 'with', mut, mut_seq)
    mut_seqsL.append(mut_seq)
###############
# sanity check
################
len_orig = len(my_fasta)
# checking if all the mutant sequences have the same length as the original fasta file sequence
for seqs in mut_seqsL:
    # print(seqs)
    # print(len(seqs))
    if len(seqs) != len_orig:
        print('sequence lengths mismatch' + '\n', 'mutant seq length:', len(seqs), 'vs original seq length:', len_orig)
    else:
        print('**Hooray** Length of mutant and original sequences match')
del(i, len_orig, mut, mut_seq, my_fastaL, offset_pos, pos, seqs)
############
# write file
############
#filepath = homedir +'/git/LSHTM_Y1_PNCA/combined_v3/logo_plot/snp_seqsfile'
#filepath = homedir + '/git/LSHTM_Y1_PNCA/mcsm_analysis/pyrazinamide/Data/gene_msa.txt'
print(outdir)
out_filename = "gene_msa.txt"
outfile_gene = homedir + '/' + outdir + '/' + out_filename
print(outfile_gene)
with open(outfile_gene, 'w') as file_handler:
    for item in mut_seqsL:
        file_handler.write("{}\n".format(item))
#R = "\n".join(mut_seqsL)
#f = open('Columns.csv','w')
#f.write(R)
#f.close()


@ -1,9 +0,0 @@
#!/bin/bash
# run all bash scripts for mcsm
#./step0_check_duplicate_SNPs.sh
#./step1_lig_output_urls.sh
./step2_lig_results.sh
./step3a_results_format_interim.sh


@ -1,25 +0,0 @@
#!/bin/bash
#*************************************
# need to be in the correct directory
#*************************************
##: comments for code
#: commented out code
#**********************************************************************
# TASK: Text file containing a list of SNPs, one SNP (format: C2E)
# per line. Sort unique entries, which automatically removes duplicates,
# and save the output file (path set below)
#**********************************************************************
infile="${HOME}/git/Data/pyrazinamide/input/processed/pnca_mis_SNPs_v2.csv"
outfile="${HOME}/git/Data/pyrazinamide/input/processed/pnca_mis_SNPs_v2_unique.csv"
# sort unique entries and output to current directory
sort -u ${infile} > ${outfile}
# count no. of unique snps mCSM will run on
count=$(wc -l < ${outfile})
# print to console no. of unique snps mCSM will run on
echo "${count} unique mutations for mCSM to run on"


@ -1,104 +0,0 @@
#!/bin/bash
#**********************************************************************
# TASK: submit requests using curl: HANDLE redirects and refresh url.
# Iterate over mutation file and write/append result urls to a file
# Mutation file must have one mutation (format A1B) per line
# Requirements
# input: mutation list (format: A1B), complex struc: (pdb format)
# mutation: outFile from step0, one unique mutation/line, no chain ID
# path: "Data/<drug>/input/processed/<filename>"
# structure: pdb file of drug-target complex
# path: "Data/<drug>/input/structure/<filename>"
# output: should be n urls (n=no. of unique mutations in file)
# path: "Data/<drug>/input/processed/<filename>"
# NOTE: these are just result urls, not actual values for results
#**********************************************************************
############# specify variables for input and output paths and filenames
homedir="${HOME}"
#echo Home directory is ${homedir}
basedir="/git/Data/pyrazinamide/input"
# input
inpath_mut="/processed"
in_filename_mut="/pnca_mis_SNPs_v2_unique.csv"
infile_mut="${homedir}${basedir}${inpath_mut}${in_filename_mut}"
echo Input Mut filename: ${infile_mut}
inpath_struc="/structure"
in_filename_struc="/complex1_no_water.pdb"
infile_struc="${homedir}${basedir}${inpath_struc}${in_filename_struc}"
echo Input Struc filename: ${infile_struc}
# output
outpath="/processed"
out_filename="/complex1_result_url.txt"
outfile="${homedir}${basedir}${outpath}${out_filename}"
#echo Output filename: ${outfile}
################## end of variable assignment for input and output files
# iterate over mutation file (infile_mut); line by line and
# submit query using curl
# some useful messages
echo -n -e "Processing $(wc -l < ${infile_mut}) entries from ${infile_mut}\n"
COUNT=0
while read -r line; do
((COUNT++))
mutation="${line}"
# echo "${mutation}"
#pdb='../Data/complex1_no_water.pdb'
pdb="${infile_struc}"
mutation="${mutation}"
chain="A"
lig_id="PZA"
affin_wt="0.99"
host="http://biosig.unimelb.edu.au"
call_url="/mcsm_lig/prediction"
#=========================================
##html field_names names required for curl
##complex_field:wild=@
##mutation_field:mutation=@
##chain_field:chain=@
##ligand_field:lig_id@
##energy_field:affin_wt
#=========================================
refresh_url=$(curl -L \
-sS \
-F "wild=@${pdb}" \
-F "mutation=${mutation}" \
-F "chain=${chain}" \
-F "lig_id=${lig_id}" \
-F "affin_wt=${affin_wt}" \
${host}${call_url} | grep "http-equiv")
#echo Refresh URL: $refresh_url
#echo Host+Refresh: ${host}${refresh_url}
# use regex to extract the relevant bit from the refresh url
# regex:sed -r 's/.*(\/mcsm.*)".*$/\1/g'
# Now build: result url using host and refresh url and write the urls to a file
result_url=$(echo $refresh_url | sed -r 's/.*(\/mcsm.*)".*$/\1/g')
sleep 10
echo -e "${mutation} : processing entry ${COUNT}/$(wc -l < ${infile_mut})..."
# create output file with the added number of muts from file
# after much thought, bad idea as less generic!
#echo -e "${host}${result_url}" >> ../Results/$(wc -l < ${filename})_complex1_result_url.txt
echo -e "${host}${result_url}" >> ${outfile}
#echo -n '.'
done < "${infile_mut}"
#FIXME: stop executing if error else these echo statements are misleading!
echo
echo Output filename: ${outfile}
echo
echo Number of urls saved: $(wc -l < ${infile_mut})
echo
echo "Processing Complete"
# end of submitting query, receiving result url and storing results url in a file


@ -1,76 +0,0 @@
#!/bin/bash
#********************************************************************
# TASK: submit result urls and fetch actual results using curl
# Iterate over each result url from the output of step1 stored in processed/
# Use curl to fetch results and extract relevant sections using hxtools
# and store these in another file in processed/
# Requirements:
# input: output of step1, file containing result urls
# path: "Data/<drug>/input/processed/<filename>"
# output: name of the file where extracted results will be stored
# path: "Data/<drug>/input/processed/<filename>"
# Optional: can make these command line args you pass when calling script
# by uncommenting code as indicated
#*********************************************************************
############################# uncomment: to make it command line args
#if [ "$#" -ne 2 ]; then
#if [ -z $1 ]; then
# echo "
# Please provide both Input and Output files.
# Usage: batch_read_urls.sh INFILE OUTFILE
# "
# exit 1
#fi
# First argument: Input File
# Second argument: Output File
#infile=$1
#outfile=$2
############################ end of code block to make command line args
############# specify variables for input and output paths and filenames
homedir="${HOME}"
#echo Home directory is ${homedir}
basedir="/git/Data/pyrazinamide/input"
# input
inpath="/processed"
in_filename="/complex1_result_url.txt"
infile="${homedir}${basedir}${inpath}${in_filename}"
echo Input Mut filename: ${infile}
# output
outpath="/processed"
out_filename="/complex1_output_MASTER.txt"
outfile="${homedir}${basedir}${outpath}${out_filename}"
echo Output filename: ${outfile}
################## end of variable assignment for input and output files
# Iterate over each result url, and extract results using hxtools
# which nicely cleans and formats html
echo -n "Processing $(wc -l < ${infile}) entries from ${infile}"
echo
COUNT=0
while read -r line; do
#COUNT=$(($COUNT+1))
((COUNT++))
curl --silent ${line} \
| hxnormalize -x \
| hxselect -c div.span4 \
| hxselect -c div.well \
| sed -r -e 's/<[^>]*>//g' \
| sed -re 's/ +//g' \
>> ${outfile}
#| tee -a ${outfile}
# echo -n '.'
echo -e "Processing entry ${COUNT}/$(wc -l < ${infile})..."
done < "${infile}"
echo
echo "Processing Complete"


@ -1,74 +0,0 @@
#!/bin/bash
#********************************************************************
# TASK: Intermediate results processing
# output file has a convenient delimiter of ":" that can be used to
# format the file into two columns (col1: field_desc and col2: values)
# However, the sections "PredictedAffinityChange:..." and
# "DUETstabilitychange:..." are split over multiple lines, which
# prevents this. Additionally, there are empty lines that need to be
# omitted. This script joins those sections onto single lines and
# removes the empty lines.
# Requirements:
# input: output of step2, file containing mcsm results as described above
# path: "Data/<drug>/input/processed/<filename>"
# output: replaces file in place.
# Therefore first create a copy of the input file
# but rename it to remove the word "MASTER" and add the word "processed"
# file format: .txt
# NOTE: This replaces the file in place!
# the output is a txt file with each record on a single line,
# formatted as "<colname>:<value>"
#***********************************************************************
############# specify variables for input and output paths and filenames
homedir="${HOME}"
basedir="/git/Data/pyrazinamide/input"
inpath="/processed"
# Create input file: copy and rename output file of step2
oldfile="${homedir}${basedir}${inpath}/complex1_output_MASTER.txt"
newfile="${homedir}${basedir}${inpath}/complex1_output_processed.txt"
cp $oldfile $newfile
echo Input filename is ${oldfile}
echo
echo Output i.e copied filename is ${newfile}
# output: No output per se
# Replacement in place inside the copied file
################## end of variable assignment for input and output files
#sed -i '/PredictedAffinityChange:/ { N; N; N; N; s/\n//g;}' ${newfile} \
# | sed -i '/DUETstabilitychange:/ {x; N; N; s/\n//g; p;d;}' ${newfile}
# Outputs records separated by a newline, that look something like this:
# PredictedAffinityChange:-2.2log(affinityfoldchange)-Destabilizing
# Mutationinformation:
# Wild-type:L
# Position:4
# Mutant-type:W
# Chain:A
# LigandID:PZA
# Distancetoligand:15.911&Aring;
# DUETstabilitychange:-2.169Kcal/mol
#
# PredictedAffinityChange:-1.538log(affinityfoldchange)-Destabilizing
# (...etc)
# This script brings everything in a convenient format for further processing in python.
sed -i '/PredictedAffinityChange/ {
N
N
N
N
s/\n//g
}
/DUETstabilitychange:/ {
N
N
s/\n//g
}
/^$/d' ${newfile}


@ -1,63 +0,0 @@
#!/usr/bin/python
###################
# load libraries
import os, sys
import pandas as pd
from collections import defaultdict
####################
#********************************************************************
# TASK: Formatting results with nice colnames
# step3a processed the mcsm results to remove all newlines and
# brought data in a format where the delimiter ":" splits
# data into a convenient format of "colname": "value".
# this script formats the data and outputs a df with each row
# as a mutation and its corresponding mcsm_values
# Requirements:
# input: output of step3a, file containing "..._output_processed.txt"
# path: "Data/<drug>/input/processed/<filename>"
# output: formatted .csv file
# path: "Data/<drug>/input/processed/<filename>"
#***********************************************************************
############# specify variables for input and output paths and filenames
homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
basedir = "/git/Data/pyrazinamide/input"
# input
inpath = "/processed"
in_filename = "/complex1_output_processed.txt"
infile = homedir + basedir + inpath + in_filename
print("Input file is:", infile)
# output
outpath = "/processed"
out_filename = "/complex1_formatted_results.csv"
outfile = homedir + basedir + outpath + out_filename
print("Output file is:", outfile)
################## end of variable assignment for input and output files
outCols=[
'PredictedAffinityChange',
'Mutationinformation',
'Wild-type',
'Position',
'Mutant-type',
'Chain',
'LigandID',
'Distancetoligand',
'DUETstabilitychange'
]
lines = [line.rstrip('\n') for line in open(infile)]
outputs = defaultdict(list)
for item in lines:
    col, val = item.split(':')
    outputs[col].append(val)
dfOut=pd.DataFrame(outputs)
pd.DataFrame.to_csv(dfOut, outfile, columns=outCols)


@ -1,230 +0,0 @@
getwd()
#setwd("~/git/LSHTM_analysis/mcsm_complex1/Results")
getwd()
#=======================================================
# TASK: read formatted_results_df.csv to complete
# missing info, adding DUET categories, assigning
# meaningful colnames, etc.
# Requirements:
# input: output of step3b, python processing,
# path: Data/<drug>/input/processed/<filename>"
# output: NO output as the next scripts refer to this
# for yet more processing
#=======================================================
# specify variables for input and output paths and filenames
homedir = "~"
basedir = "/git/Data/pyrazinamide/input"
inpath = "/processed"
in_filename = "/complex1_formatted_results.csv"
infile = paste0(homedir, basedir, inpath, in_filename)
print(paste0("Input file is:", infile))
#======================================================
#TASK: To tidy the columns so you can generate figures
#=======================================================
####################
#### read file #####: this will be the output from python script (csv file)
####################
data = read.csv(infile
, header = T
, stringsAsFactors = FALSE)
dim(data)
str(data)
# clear variables
rm(homedir, basedir, inpath, in_filename, infile)
###########################
##### Data processing #####
###########################
# populate mutation information columns as currently it is empty
head(data$Mutationinformation)
tail(data$Mutationinformation)
# should not be blank: create mutation information
data$Mutationinformation = paste0(data$Wild.type, data$Position, data$Mutant.type)
head(data$Mutationinformation)
tail(data$Mutationinformation)
#write.csv(data, 'test.csv')
##########################################
# Remove duplicate SNPs as a sanity check
##########################################
# very important
table(duplicated(data$Mutationinformation))
# extract duplicated entries
dups = data[duplicated(data$Mutationinformation),] #0
# No of dups should match with the no. of TRUE in the above table
#u_dups = unique(dups$Mutationinformation) #10
sum( table(dups$Mutationinformation) )
#***************************************************************
# select non-duplicated SNPs and create a new df
df = data[!duplicated(data$Mutationinformation),]
#***************************************************************
# sanity check
u = unique(df$Mutationinformation)
u2 = unique(data$Mutationinformation)
table(u%in%u2)
# should all be 1
sum(table(df$Mutationinformation) == 1)
# sort df by Position
# MANUAL CHECKPOINT:
#foo <- df[order(df$Position),]
#df <- df[order(df$Position),]
# clear variables
rm(u, u2, dups)
####################
#### give meaningful colnames to reflect units to enable correct data type
####################
#=======
#STEP 1
#========
# make a copy of the PredictedAffinityColumn and call it Lig_outcome
df$Lig_outcome = df$PredictedAffinityChange
#make Predicted...column numeric and outcome column categorical
head(df$PredictedAffinityChange)
df$PredictedAffinityChange = gsub("log.*"
, ""
, df$PredictedAffinityChange)
# sanity checks
head(df$PredictedAffinityChange)
# should be numeric, check and if not make it numeric
is.numeric( df$PredictedAffinityChange )
# change to numeric
df$PredictedAffinityChange = as.numeric(df$PredictedAffinityChange)
# should be TRUE
is.numeric( df$PredictedAffinityChange )
# change the column name to indicate units
n = which(colnames(df) == "PredictedAffinityChange"); n
colnames(df)[n] = "PredAffLog"
colnames(df)[n]
#========
#STEP 2
#========
# make Lig_outcome column categorical showing effect of mutation
head(df$Lig_outcome)
df$Lig_outcome = gsub("^.*-"
, "",
df$Lig_outcome)
# sanity checks
head(df$Lig_outcome)
# should be factor, check and if not change it to factor
is.factor(df$Lig_outcome)
# change to factor
df$Lig_outcome = as.factor(df$Lig_outcome)
# should be TRUE
is.factor(df$Lig_outcome)
#========
#STEP 3
#========
# gsub
head(df$Distancetoligand)
df$Distancetoligand = gsub("&Aring;"
, ""
, df$Distancetoligand)
# sanity checks
head(df$Distancetoligand)
# should be numeric, check if not change it to numeric
is.numeric(df$Distancetoligand)
# change to numeric
df$Distancetoligand = as.numeric(df$Distancetoligand)
# should be TRUE
is.numeric(df$Distancetoligand)
# change the column name to indicate units
n = which(colnames(df) == "Distancetoligand")
colnames(df)[n] <- "Dis_lig_Ang"
colnames(df)[n]
#========
#STEP 4
#========
#gsub
head(df$DUETstabilitychange)
df$DUETstabilitychange = gsub("Kcal/mol"
, ""
, df$DUETstabilitychange)
# sanity checks
head(df$DUETstabilitychange)
# should be numeric, check if not change it to numeric
is.numeric(df$DUETstabilitychange)
# change to numeric
df$DUETstabilitychange = as.numeric(df$DUETstabilitychange)
# should be TRUE
is.numeric(df$DUETstabilitychange)
# change the column name to indicate units
n = which(colnames(df) == "DUETstabilitychange"); n
colnames(df)[n] = "DUETStability_Kcalpermol"
colnames(df)[n]
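# NOTE (sketch, kept commented): STEPs 1, 3 and 4 above repeat the same
# "strip unit text -> as.numeric" pattern; it could be factored into a
# small helper, e.g.
#strip_to_numeric = function(x, pattern){ as.numeric(gsub(pattern, "", x)) }
# so that, for instance, STEP 3 would become:
#df$Distancetoligand = strip_to_numeric(df$Distancetoligand, "&Aring;")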
#========
#STEP 5
#========
# create yet another extra column: classification of DUET stability only
df$DUET_outcome = ifelse(df$DUETStability_Kcalpermol >=0
, "Stabilizing"
, "Destabilizing") # spelling to be consistent with mcsm
table(df$Lig_outcome)
table(df$DUET_outcome)
#==============================
#FIXME
#Insert a venn diagram
#================================
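# Sketch for the FIXME above (kept commented; assumes df, DUET_outcome and
# Lig_outcome as created in this script, and that the VennDiagram package is
# available, e.g. via Header_TT.R):
#duet_destab = df$Mutationinformation[df$DUET_outcome == "Destabilizing"]
#lig_destab = df$Mutationinformation[df$Lig_outcome == "Destabilizing"]
#grid::grid.newpage()
#VennDiagram::draw.pairwise.venn(area1 = length(duet_destab)
#                                , area2 = length(lig_destab)
#                                , cross.area = length(intersect(duet_destab, lig_destab))
#                                , category = c("DUET destabilising", "Lig destabilising"))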
#========
#STEP 6
#========
# assign wild and mutant colnames correctly
wt = which(colnames(df) == "Wild.type"); wt
colnames(df)[wt] <- "Wild_type"
colnames(df[wt])
mut = which(colnames(df) == "Mutant.type"); mut
colnames(df)[mut] <- "Mutant_type"
colnames(df[mut])
#========
#STEP 7
#========
# create an extra column: maybe useful for some plots
df$WildPos = paste0(df$Wild_type, df$Position)
# clear variables
rm(n, wt, mut)
################ end of data cleaning


@ -1,275 +0,0 @@
##################
# load libraries
library(compare)
##################
getwd()
#=======================================================
# TASK:read cleaned data and perform rescaling
# of DUET stability scores
# of Pred affinity
# compare scaling methods with plots
# Requirements:
# input: R script, step3c_results_cleaning.R
# path: Data/<drug>/input/processed/<filename>"
# output: NO output as the next scripts refer to this
# for yet more processing
# output normalised file
#=======================================================
# specify variables for input and output paths and filenames
homedir = "~"
currdir = "/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/mcsm"
in_filename = "/step3c_results_cleaning.R"
infile = paste0(homedir, currdir, in_filename)
print(paste0("Input file is:", infile))
# output file
basedir = "/git/Data/pyrazinamide/input"
outpath = "/processed"
out_filename = "/mcsm_complex1_normalised.csv"
outfile = paste0(homedir, basedir, outpath, out_filename)
print(paste0("Output file is:", outfile))
####################
#### read file #####: this will be the output of my R script that cleans the data columns
####################
source(infile)
#This will output two dataframes:
# data: unclean data: 10 cols
# df : cleaned df: 13 cols
# you can remove data if you want as you will not need it
rm(data)
colnames(df)
#===================
#3a: PredAffLog
#===================
n = which(colnames(df) == "PredAffLog"); n
group = which(colnames(df) == "Lig_outcome"); group
#===================================================
# order according to PredAffLog values
#===================================================
# Ordering makes it easier to see the results of rescaling when debugging
head(df$PredAffLog)
# ORDER BY PredAff scores: negative values at the top and positive at the bottom
df = df[order(df$PredAffLog),]
head(df$PredAffLog)
# sanity checks
head(df[,n]) # all negatives
tail(df[,n]) # all positives
# sanity checks
mean(df[,n])
#-0.9526746
tapply(df[,n], df[,group], mean)
#===========================
# Same as above: in 2 steps
#===========================
# find range of your data
my_min = min(df[,n]); my_min #
my_max = max(df[,n]); my_max #
#===============================================
# WITHIN GROUP rescaling 2: method "ratio"
# create column to store the rescaled values
# Rescaling separately (Less dangerous)
# =====> chosen one: preserves sign
#===============================================
df$ratioPredAff = ifelse(df[,n] < 0
, df[,n]/abs(my_min)
, df[,n]/my_max
)# 14 cols
# sanity checks
head(df$ratioPredAff)
tail(df$ratioPredAff)
min(df$ratioPredAff); max(df$ratioPredAff)
tapply(df$ratioPredAff, df$Lig_outcome, min)
tapply(df$ratioPredAff, df$Lig_outcome, max)
# should be the same as below
sum(df$ratioPredAff < 0); sum(df$ratioPredAff > 0)
table(df$Lig_outcome)
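# Toy illustration of the ratio rescaling above (made-up values):
# with scores c(-2, -0.5, 0.25, 1), my_min = -2 and my_max = 1, so the
# negatives are divided by 2 and the positives by 1, giving
# c(-1, -0.25, 0.25, 1): the range becomes [-1, 1] and the sign is preserved
toy_scores = c(-2, -0.5, 0.25, 1)
ifelse(toy_scores < 0
       , toy_scores/abs(min(toy_scores))
       , toy_scores/max(toy_scores))
rm(toy_scores)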
#===============================================
# Hist and density plots to compare the rescaling
# methods: Base R
#===============================================
# uncomment as necessary
my_title = "Ligand_stability"
# my_title = colnames(df[n])
# Set the margin on all sides
par(oma = c(3,2,3,0)
, mar = c(1,3,5,2)
, mfrow = c(2,2))
hist(df[,n]
, xlab = ""
, main = "Raw values"
)
hist(df$ratioPredAff
, xlab = ""
, main = "ratio rescaling"
)
# Plot density plots underneath
plot(density( df[,n] )
, main = "Raw values"
)
plot(density( df$ratioPredAff )
, main = "ratio rescaling"
)
# titles
mtext(text = "Frequency"
, side = 2
, line = 0
, outer = TRUE)
mtext(text = my_title
, side = 3
, line = 0
, outer = TRUE)
#clear variables
rm(my_min, my_max, my_title, n, group)
#===================
# 3b: DUET stability
#===================
dim(df) # 14 cols
n = which(colnames(df) == "DUETStability_Kcalpermol"); n # 10
group = which(colnames(df) == "DUET_outcome"); group #12
#===================================================
# order according to DUET scores
#===================================================
# Ordering makes it easier to see the results of rescaling when debugging
head(df$DUETStability_Kcalpermol)
# ORDER BY DUET scores: negative values at the top and positive at the bottom
df = df[order(df$DUETStability_Kcalpermol),]
# sanity checks
head(df[,n]) # negatives
tail(df[,n]) # positives
# sanity checks
mean(df[,n])
tapply(df[,n], df[,group], mean)
#===============================================
# WITHIN GROUP rescaling 2: method "ratio"
# create column to store the rescaled values
# Rescaling separately (Less dangerous)
# =====> chosen one: preserves sign
#===============================================
# find range of your data
my_min = min(df[,n]); my_min
my_max = max(df[,n]); my_max
df$ratioDUET = ifelse(df[,n] < 0
, df[,n]/abs(my_min)
, df[,n]/my_max
) # 15 cols
# sanity check
head(df$ratioDUET)
tail(df$ratioDUET)
min(df$ratioDUET); max(df$ratioDUET)
# sanity checks
tapply(df$ratioDUET, df$DUET_outcome, min)
tapply(df$ratioDUET, df$DUET_outcome, max)
# should be the same as below (267 and 42)
sum(df$ratioDUET < 0); sum(df$ratioDUET > 0)
table(df$DUET_outcome)
#===============================================
# Hist and density plots to compare the rescaling
# methods: Base R
#===============================================
# uncomment as necessary
my_title = "DUET_stability"
#my_title = colnames(df[n])
# Set the margin on all sides
par(oma = c(3,2,3,0)
, mar = c(1,3,5,2)
, mfrow = c(2,2))
hist(df[,n]
, xlab = ""
, main = "Raw values"
)
hist(df$ratioDUET
, xlab = ""
, main = "ratio rescaling"
)
# Plot density plots underneath
plot(density( df[,n] )
, main = "Raw values"
)
plot(density( df$ratioDUET )
, main = "ratio rescaling"
)
# graph titles
mtext(text = "Frequency"
, side = 2
, line = 0
, outer = TRUE)
mtext(text = my_title
, side = 3
, line = 0
, outer = TRUE)
# reorder by column name
#data <- data[c("A", "B", "C")]
colnames(df)
df2 = df[c("X", "Mutationinformation", "WildPos", "Position"
, "Wild_type", "Mutant_type"
, "DUETStability_Kcalpermol", "DUET_outcome"
, "Dis_lig_Ang", "PredAffLog", "Lig_outcome"
, "ratioDUET", "ratioPredAff"
, "LigandID","Chain")]
# sanity check
# should be True
#compare(df, df2, allowAll = T)
compare(df, df2, ignoreColOrder = T)
#TRUE
#reordered columns
#===================
# write output as csv file
#===================
#write.csv(df, "../Data/mcsm_complex1_normalised.csv", row.names = FALSE)
write.csv(df2, outfile, row.names = FALSE)

View file

@@ -1,131 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("../Header_TT.R")
#source("barplot_colour_function.R")
require(data.table)
require(dplyr)
########################################################################
# Read file: call script for combining df for PS #
########################################################################
source("../combining_two_df.R")
###########################
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
###########################
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
###########################
# you need merged_df3
# or
# merged_df3_comp
# since these have unique SNPs
# I prefer to use the merged_df3
# because using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available
###########################
# uncomment as necessary
#%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
my_df = merged_df3
#my_df = merged_df3_comp
#%%%%%%%%%%%%%%%%%%%%%%%%
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
###########################
# Data for bfactor figure
# PS average
# Lig average
###########################
head(my_df$Position)
head(my_df$ratioDUET)
# order data frame
df = my_df[order(my_df$Position),]
head(df$Position)
head(df$ratioDUET)
#***********
# PS: average by position
#***********
mean_DUET_by_position <- df %>%
group_by(Position) %>%
summarize(averaged.DUET = mean(ratioDUET))
#***********
# Lig: average by position
#***********
mean_Lig_by_position <- df %>%
group_by(Position) %>%
summarize(averaged.Lig = mean(ratioPredAff))
#***********
# cbind:mean_DUET_by_position and mean_Lig_by_position
#***********
combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position ))
# sanity check
# mean_PS_Lig_Bfactor
colnames(combined)
colnames(combined) = c("Position"
, "average_DUETR"
, "Position2"
, "average_PredAffR")
colnames(combined)
identical(combined$Position, combined$Position2)
n = which(colnames(combined) == "Position2"); n
combined_df = combined[,-n]
max(combined_df$average_DUETR) ; min(combined_df$average_DUETR)
max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR)
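# Alternative sketch (illustrative, not used for the output below): merging
# on Position gives the same two averages without relying on both summaries
# having an identical row order, which the cbind() approach above assumes
combined_df_check = merge(mean_DUET_by_position, mean_Lig_by_position, by = "Position")
stopifnot(nrow(combined_df_check) == nrow(combined_df))
rm(combined_df_check)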
#=============
# output csv
#============
outDir = "~/git/Data/pyrazinamide/input/processed/"
outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv")
print(paste0("Output file with path will be:","", outFile))
head(combined_df$Position); tail(combined_df$Position)
write.csv(combined_df, outFile
, row.names = F)

View file

@@ -1,250 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("../Header_TT.R")
#source("barplot_colour_function.R")
require(cowplot)
########################################################################
# Read file: call script for combining df for PS #
########################################################################
source("../combining_two_df.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#===========================
###########################
# Data for OR and stability plots
# you need merged_df3_comp
# since these are matched
# to allow pairwise corr
###########################
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df3_comp
#my_df = merged_df3
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# sanity check
# Ensure correct data type in columns to plot: need to be factor
is.numeric(my_df$OR)
#[1] TRUE
#<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
# FOR PS Plots
#<<<<<<<<<<<<<<<<<<<
PS_df = my_df
rm(my_df)
#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> end of section 1
########################################################################
# Read file: call script for combining df for lig #
########################################################################
getwd()
source("combining_two_df_lig.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#===========================
###########################
# Data for OR and stability plots
# you need merged_df3_comp
# since these are matched
# to allow pairwise corr
###########################
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df2 = merged_df3_comp
#my_df2 = merged_df3
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df2)
str(my_df2)
# sanity check
# Ensure correct data type in columns to plot: need to be factor
is.numeric(my_df2$OR)
#[1] TRUE
# sanity check: should be <10
if (max(my_df2$Dis_lig_Ang) < 10){
print ("Sanity check passed: lig data is <10Ang")
}else{
print ("Error: data should be filtered to be within 10Ang")
}
#<<<<<<<<<<<<<<<<
# REASSIGNMENT
# FOR Lig Plots
#<<<<<<<<<<<<<<<<
Lig_df = my_df2
rm(my_df2)
#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> end of section 1
#############
# Plots: Bubble plot
# x = Position, Y = stability
# size of dots = OR
# col: stability
#############
#=================
# generate plot 1: DUET vs OR by position as geom_points
#=================
my_ats = 20 # axis text size
my_als = 22 # axis label size
# Spelling Correction: made redundant as already corrected at the source
#PS_df$DUET_outcome[PS_df$DUET_outcome=='Stabilizing'] <- 'Stabilising'
#PS_df$DUET_outcome[PS_df$DUET_outcome=='Destabilizing'] <- 'Destabilising'
table(PS_df$DUET_outcome) ; sum(table(PS_df$DUET_outcome))
g = ggplot(PS_df, aes(x = factor(Position)
, y = ratioDUET))
p1 = g +
geom_point(aes(col = DUET_outcome
, size = OR)) +
theme(axis.text.x = element_text(size = my_ats
, angle = 90
, hjust = 1
, vjust = 0.4)
, axis.text.y = element_text(size = my_ats
, angle = 0
, hjust = 1
, vjust = 0)
, axis.title.x = element_text(size = my_als)
, axis.title.y = element_text(size = my_als)
, legend.text = element_text(size = my_als)
, legend.title = element_text(size = my_als) ) +
#, legend.key.size = unit(1, "cm")) +
labs(title = ""
, x = "Position"
, y = "DUET(PS)"
, size = "Odds Ratio"
, colour = "DUET Outcome") +
guides(colour = guide_legend(override.aes = list(size=4)))
p1
#=================
# generate plot 2: Lig vs OR by position as geom_points
#=================
my_ats = 20 # axis text size
my_als = 22 # axis label size
# Spelling Correction: made redundant as already corrected at the source
#Lig_df$Lig_outcome[Lig_df$Lig_outcome=='Stabilizing'] <- 'Stabilising'
#Lig_df$Lig_outcome[Lig_df$Lig_outcome=='Destabilizing'] <- 'Destabilising'
table(Lig_df$Lig_outcome)
g = ggplot(Lig_df, aes(x = factor(Position)
, y = ratioPredAff))
p2 = g +
geom_point(aes(col = Lig_outcome
, size = OR))+
theme(axis.text.x = element_text(size = my_ats
, angle = 90
, hjust = 1
, vjust = 0.4)
, axis.text.y = element_text(size = my_ats
, angle = 0
, hjust = 1
, vjust = 0)
, axis.title.x = element_text(size = my_als)
, axis.title.y = element_text(size = my_als)
, legend.text = element_text(size = my_als)
, legend.title = element_text(size = my_als) ) +
#, legend.key.size = unit(1, "cm")) +
labs(title = ""
, x = "Position"
, y = "Ligand Affinity"
, size = "Odds Ratio"
, colour = "Ligand Outcome"
) +
guides(colour = guide_legend(override.aes = list(size=4)))
p2
#======================
#combine using cowplot
#======================
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()
svg('PS_Lig_OR_combined.svg', width = 32, height = 12) #inches
#png('PS_Lig_OR_combined.png', width = 2800, height = 1080) #300dpi
theme_set(theme_gray()) # to preserve default theme
printFile = cowplot::plot_grid(plot_grid(p1, p2
, ncol = 1
, align = 'v'
, labels = c("(a)", "(b)")
, label_size = my_als+5))
print(printFile)
dev.off()

View file

@@ -1,154 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("../Header_TT.R")
########################################################################
# Read file: call script for combining df for lig #
########################################################################
source("../combining_two_df_lig.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#===========================
###########################
# Data for Lig plots
# you need merged_df3
# or
# merged_df3_comp
# since these have unique SNPs
# I prefer to use the merged_df3
# because using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available
###########################
# uncomment as necessary
#%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
my_df = merged_df3
#my_df = merged_df3_comp
#%%%%%%%%%%%%%%%%%%%%%%%%
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
#############################
# Extra sanity check:
# for mcsm_lig ONLY
# Dis_lig_Ang should be <10
#############################
if (max(my_df$Dis_lig_Ang) < 10){
print ("Sanity check passed: lig data is <10Ang")
}else{
print ("Error: data should be filtered to be within 10Ang")
}
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#==========================
# Plot: Barplot with scores (unordered)
# corresponds to Lig_outcome
# Stacked Barplot with colours: Lig_outcome @ position coloured by
# Lig_outcome. This is a barplot where each bar corresponds
# to a SNP and is coloured by its corresponding Lig_outcome.
#============================
#===================
# Data for plots
#===================
#%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
df = my_df
#%%%%%%%%%%%%%%%%%%%%%%%%
rm(my_df)
# sanity checks
upos = unique(df$Position)
# should be a factor
is.factor(df$Lig_outcome)
#TRUE
table(df$Lig_outcome)
# should be -1 and 1: may not be in this case because you have filtered the data
# FIXME: normalisation before or after filtering?
min(df$ratioPredAff) #
max(df$ratioPredAff) #
# sanity checks
tapply(df$ratioPredAff, df$Lig_outcome, min)
tapply(df$ratioPredAff, df$Lig_outcome, max)
#******************
# generate plot
#******************
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()
my_title = "Ligand affinity"
# axis label size
my_xaxls = 13
my_yaxls = 15
# axes text size
my_xaxts = 15
my_yaxts = 15
# no ordering of x-axis
g = ggplot(df, aes(factor(Position, ordered = T)))
g +
geom_bar(aes(fill = Lig_outcome), colour = "grey") +
theme( axis.text.x = element_text(size = my_xaxls
, angle = 90
, hjust = 1
, vjust = 0.4)
, axis.text.y = element_text(size = my_yaxls
, angle = 0
, hjust = 1
, vjust = 0)
, axis.title.x = element_text(size = my_xaxts)
, axis.title.y = element_text(size = my_yaxts ) ) +
labs(title = my_title
, x = "Position"
, y = "Frequency")
# for sanity and good practice
rm(df)
#======================= end of plot
# axis colours labels
# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label

View file

@@ -1,149 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages and functions #
########################################################################
source("../Header_TT.R")
########################################################################
# Read file: call script for combining df for PS #
########################################################################
source("../combining_two_df.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#==========================
###########################
# Data for DUET plots
# you need merged_df3
# or
# merged_df3_comp
# since these have unique SNPs
# I prefer to use the merged_df3
# because using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available
###########################
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df3
#my_df = merged_df3_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# Ensure correct data type in columns to plot: need to be factor
# sanity check
is.factor(my_df$DUET_outcome)
my_df$DUET_outcome = as.factor(my_df$DUET_outcome)
is.factor(my_df$DUET_outcome)
#[1] TRUE
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#==========================
# Plot 2: Barplot with scores (unordered)
# corresponds to DUET_outcome
# Stacked Barplot with colours: DUET_outcome @ position coloured by
# DUET outcome. This is a barplot where each bar corresponds
# to a SNP and is coloured by its corresponding DUET_outcome
#============================
#===================
# Data for plots
#===================
#<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df = my_df
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(my_df)
# sanity checks
upos = unique(df$Position)
# should be a factor
is.factor(df$DUET_outcome)
#[1] TRUE
table(df$DUET_outcome)
# should be -1 and 1
min(df$ratioDUET)
max(df$ratioDUET)
tapply(df$ratioDUET, df$DUET_outcome, min)
tapply(df$ratioDUET, df$DUET_outcome, max)
#******************
# generate plot
#******************
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()
my_title = "Protein stability (DUET)"
# axis label size
my_xaxls = 13
my_yaxls = 15
# axes text size
my_xaxts = 15
my_yaxts = 15
# no ordering of x-axis
g = ggplot(df, aes(factor(Position, ordered = T)))
g +
geom_bar(aes(fill = DUET_outcome), colour = "grey") +
theme( axis.text.x = element_text(size = my_xaxls
, angle = 90
, hjust = 1
, vjust = 0.4)
, axis.text.y = element_text(size = my_yaxls
, angle = 0
, hjust = 1
, vjust = 0)
, axis.title.x = element_text(size = my_xaxts)
, axis.title.y = element_text(size = my_yaxts ) ) +
labs(title = my_title
, x = "Position"
, y = "Frequency")
# for sanity and good practice
rm(df)
#======================= end of plot
# axis colours labels
# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label

View file

@@ -1,202 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages and functions #
########################################################################
source("../Header_TT.R")
source("../barplot_colour_function.R")
########################################################################
# Read file: call script for combining df for lig #
########################################################################
source("../combining_two_df_lig.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#===========================
###########################
# Data for Lig plots
# you need merged_df3
# or
# merged_df3_comp
# since these have unique SNPs
# I prefer to use the merged_df3
# because using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available
###########################
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df3
#my_df = merged_df3_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# Ensure correct data type in columns to plot: need to be factor
# sanity check
is.factor(my_df$Lig_outcome)
my_df$Lig_outcome = as.factor(my_df$Lig_outcome)
is.factor(my_df$Lig_outcome)
#[1] TRUE
#############################
# Extra sanity check:
# for mcsm_lig ONLY
# Dis_lig_Ang should be <10
#############################
if (max(my_df$Dis_lig_Ang) < 10){
print ("Sanity check passed: lig data is <10Ang")
}else{
print ("Error: data should be filtered to be within 10Ang")
}
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#==========================
# Plot: Barplot with scores (unordered)
# corresponds to Lig_outcome
# Stacked Barplot with colours: Lig_outcome @ position coloured by
# stability scores. This is a barplot where each bar corresponds
# to a SNP and is coloured by its corresponding Lig stability value.
# Normalised values (range between -1 and 1 ) to aid visualisation
# NOTE: since barplot plots discrete values, colour = score, so number of
# colours will be equal to the no. of unique normalised scores
# rather than a continuous scale
# will require generating the colour scale separately.
#============================
#===================
# Data for plots
#===================
#<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df = my_df
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(my_df)
# sanity checks
table(df$Lig_outcome)
# should be -1 and 1: may not be in this case because you have filtered the data
# FIXME: normalisation before or after filtering?
min(df$ratioPredAff) #
max(df$ratioPredAff) #
# sanity checks
# very important!!!!
tapply(df$ratioPredAff, df$Lig_outcome, min)
tapply(df$ratioPredAff, df$Lig_outcome, max)
#******************
# generate plot
#******************
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()
# My colour FUNCTION: based on group and subgroup
# in my case;
# df = df
# group = Lig_outcome
# subgroup = normalised score i.e ratioPredAff
# Prepare data: round off ratioLig scores
# round off to 3 significant digits:
# 165 if no rounding is performed: used to generate the original graph
# 156 if rounded to 3 places
# FIXME: check if reducing precision creates any ML prob
# check unique values in normalised data
u = unique(df$ratioPredAff)
# <<<<< -------------------------------------------
# Run this section if rounding is to be used
# specify number for rounding
n = 3
df$ratioLigR = round(df$ratioPredAff, n)
u = unique(df$ratioLigR) # 156
# create an extra column called group which contains the "gp name and score"
# so colours can be generated for each unique values in this column
my_grp = df$ratioLigR
df$group <- paste0(df$Lig_outcome, "_", my_grp, sep = "")
# else
# uncomment the below if rounding is not required
#my_grp = df$ratioPredAff
#df$group <- paste0(df$Lig_outcome, "_", my_grp, sep = "")
# <<<<< -----------------------------------------------
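# Illustration only: barplot_colour_function.R is not shown here, so the
# sketch below is an assumption of what a ColourPalleteMulti()-style helper
# could look like -- one colour per unique score within each outcome group,
# shaded light-to-dark (assumes exactly two outcome levels). The repo's own
# function, called below, may well differ.
ColourPalleteMulti_sketch <- function(df, group, subgroup){
  # no. of distinct (rounded) scores within each outcome level
  counts = tapply(df[[subgroup]], df[[group]], function(x) length(unique(x)))
  # one light-to-dark ramp per outcome level (deep red / deep blue endpoints)
  ramps = list(c("#fee0d2", "#ae301e"), c("#deebf7", "#007d85"))
  unlist(lapply(seq_along(counts), function(i)
    grDevices::colorRampPalette(ramps[[i]])(counts[[i]])))
}
# e.g. colours_sketch = ColourPalleteMulti_sketch(df, "Lig_outcome", "ratioLigR")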
# Call the function to create the palette based on the group defined above
colours <- ColourPalleteMulti(df, "Lig_outcome", "my_grp")
my_title = "Ligand affinity"
# axis label size
my_xaxls = 13
my_yaxls = 15
# axes text size
my_xaxts = 15
my_yaxts = 15
# no ordering of x-axis
g = ggplot(df, aes(factor(Position, ordered = T)))
g +
geom_bar(aes(fill = group), colour = "grey") +
scale_fill_manual( values = colours
, guide = 'none') +
theme( axis.text.x = element_text(size = my_xaxls
, angle = 90
, hjust = 1
, vjust = 0.4)
, axis.text.y = element_text(size = my_yaxls
, angle = 0
, hjust = 1
, vjust = 0)
, axis.title.x = element_text(size = my_xaxts)
, axis.title.y = element_text(size = my_yaxts ) ) +
labs(title = my_title
, x = "Position"
, y = "Frequency")
# for sanity and good practice
rm(df)
#======================= end of plot
# axis colours labels
# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label

View file

@ -1,192 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages and functions #
########################################################################
source("../Header_TT.R")
source("../barplot_colour_function.R")
########################################################################
# Read file: call script for combining df for PS #
########################################################################
source("../combining_two_df.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#===========================
###########################
# Data for DUET plots
# you need merged_df3
# or
# merged_df3_comp
# since these have unique SNPs
# I prefer to use the merged_df3
# because using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available
###########################
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df3
#my_df = merged_df3_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# Ensure correct data type in columns to plot: need to be factor
# sanity check
is.factor(my_df$DUET_outcome)
my_df$DUET_outcome = as.factor(my_df$DUET_outcome)
is.factor(my_df$DUET_outcome)
#[1] TRUE
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#==========================
# Barplot with scores (unordered)
# corresponds to DUET_outcome
# Stacked Barplot with colours: DUET_outcome @ position coloured by
# stability scores. This is a barplot where each bar corresponds
# to a SNP and is coloured by its corresponding DUET stability value.
# Normalised values (range between -1 and 1 ) to aid visualisation
# NOTE: since barplot plots discrete values, colour = score, so number of
# colours will be equal to the no. of unique normalised scores
# rather than a continuous scale
# will require generating the colour scale separately.
#============================
#===================
# Data for plots
#===================
#<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df = my_df
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(my_df)
# sanity checks
upos = unique(df$Position)
# should be a factor
is.factor(df$DUET_outcome)
#[1] TRUE
table(df$DUET_outcome)
# should be -1 and 1
min(df$ratioDUET)
max(df$ratioDUET)
tapply(df$ratioDUET, df$DUET_outcome, min)
tapply(df$ratioDUET, df$DUET_outcome, max)
#******************
# generate plot
#******************
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()
# My colour FUNCTION: based on group and subgroup
# in my case;
# df = df
# group = DUET_outcome
# subgroup = normalised score i.e ratioDUET
# Prepare data: round off ratioDUET scores
# round off to 3 significant digits:
# 323 if no rounding is performed: used to generate the original graph
# 287 if rounded to 3 places
# FIXME: check if reducing precision creates any ML prob
# check unique values in normalised data
u = unique(df$ratioDUET)
# <<<<< -------------------------------------------
# Run this section if rounding is to be used
# specify number for rounding
n = 3
df$ratioDUETR = round(df$ratioDUET, n)
u = unique(df$ratioDUETR)
# create an extra column called group which contains the "gp name and score"
# so colours can be generated for each unique values in this column
my_grp = df$ratioDUETR
df$group <- paste0(df$DUET_outcome, "_", my_grp, sep = "")
# else
# uncomment the below if rounding is not required
#my_grp = df$ratioDUET
#df$group <- paste0(df$DUET_outcome, "_", my_grp, sep = "")
# <<<<< -----------------------------------------------
# Call the function to create the palette based on the group defined above
colours <- ColourPalleteMulti(df, "DUET_outcome", "my_grp")
my_title = "Protein stability (DUET)"
# axis label size
my_xaxls = 13
my_yaxls = 15
# axes text size
my_xaxts = 15
my_yaxts = 15
# no ordering of x-axis
g = ggplot(df, aes(factor(Position, ordered = T)))
g +
geom_bar(aes(fill = group), colour = "grey") +
scale_fill_manual( values = colours
, guide = 'none') +
theme( axis.text.x = element_text(size = my_xaxls
, angle = 90
, hjust = 1
, vjust = 0.4)
, axis.text.y = element_text(size = my_yaxls
, angle = 0
, hjust = 1
, vjust = 0)
, axis.title.x = element_text(size = my_xaxts)
, axis.title.y = element_text(size = my_yaxts ) ) +
labs(title = my_title
, x = "Position"
, y = "Frequency")
# for sanity and good practice
rm(df)
#======================= end of plot
# axis colours labels
# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label

View file

@@ -1,296 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
############################################################
# 1: Installing and loading required packages and functions
############################################################
source("../Header_TT.R")
source("../barplot_colour_function.R")
############################################################
# Output dir for plots
############################################################
out_dir = "~/git/Data/pyrazinamide/output/plots"
############################################################
# 2: call script that prepares the data with columns containing
# colours for axis labels
############################################################
source("subcols_axis_LIG.R")
# this should return
#mut_pos_cols: 52, 4
#my_df: 169, 39
# clear excess variable
# "mut_pos_cols" is just for inspection in case you need to cross check
# position numbers and colours
# open file from desktop ("sample_axis_cols") for cross checking
table(mut_pos_cols$lab_bg)
sum( table(mut_pos_cols$lab_bg) ) == nrow(mut_pos_cols) # should be True
table(mut_pos_cols$lab_bg2)
sum( table(mut_pos_cols$lab_bg2) ) == nrow(mut_pos_cols) # should be True
table(mut_pos_cols$lab_fg)
sum( table(mut_pos_cols$lab_fg) ) == nrow(mut_pos_cols) # should be True
# very important!: should be the length of the unique positions
my_axis_colours = mut_pos_cols$lab_fg
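# check for the comment above: one axis colour per unique position
# (assumes my_df has the same 'Position' column used further below)
if (length(my_axis_colours) == length(unique(my_df$Position))){
  print("Sanity check passed: one axis colour per unique position")
}else{
  print("Error: no. of axis colours does not match no. of unique positions")
}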
# now clear mut_pos_cols
rm(mut_pos_cols)
###########################
# 2: Plot: Lig scores
###########################
#==========================
# Plot 2: Barplot with scores (unordered)
# corresponds to Lig_outcome
# Stacked Barplot with colours: Lig_outcome @ position coloured by
# stability scores. This is a barplot where each bar corresponds
# to a SNP and is coloured by its corresponding PredAff stability value.
# Normalised values (range between -1 and 1 ) to aid visualisation
# NOTE: since barplot plots discrete values, colour = score, so number of
# colours will be equal to the no. of unique normalised scores
# rather than a continuous scale
# will require generating the colour scale separately.
#============================
# sanity checks
upos = unique(my_df$Position)
str(my_df$Lig_outcome)
colnames(my_df)
#===========================
# Data preparation for plots
#===========================
#!!!!!!!!!!!!!!!!!
# REASSIGNMENT
df <- my_df
#!!!!!!!!!!!!!!!!!
rm(my_df)
# sanity checks
# should be a factor
is.factor(df$Lig_outcome);
#FALSE
df$Lig_outcome = as.factor(df$Lig_outcome)
is.factor(df$Lig_outcome);
#TRUE
table(df$Lig_outcome)
# check the range
min(df$ratioPredAff)
max(df$ratioPredAff)
# sanity checks
# very important!!!!
tapply(df$ratioPredAff, df$Lig_outcome, min)
tapply(df$ratioPredAff, df$Lig_outcome, max)
# My colour FUNCTION: based on group and subgroup
# in my case;
# df = df
# group = Lig_outcome
# subgroup = normalised score i.e ratioPredAff
# Prepare data: round off ratioPredAff scores
# round off to 3 significant digits:
# 323 if no rounding is performed: used to generate the original graph
# 287 if rounded to 3 places
# FIXME: check if reducing precision creates any ML prob
# check unique values in normalised data
u = unique(df$ratioPredAff)
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Run this section if rounding is to be used
# specify number for rounding
n = 3
df$ratioPredAffR = round(df$ratioPredAff, n)
u = unique(df$ratioPredAffR)
# create an extra column called group which contains the "gp name and score"
# so colours can be generated for each unique values in this column
my_grp = df$ratioPredAffR
df$group <- paste0(df$Lig_outcome, "_", my_grp, sep = "")
# ELSE
# uncomment the below if rounding is not required
#my_grp = df$ratioPredAff
#df$group <- paste0(df$Lig_outcome, "_", my_grp, sep = "")
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#******************
# generate plot
#******************
# Call the function to create the palette based on the group defined above
colours <- ColourPalleteMulti(df, "Lig_outcome", "my_grp")
my_title = "Ligand Affinity"
library(ggplot2)
# axis label size
my_xaxls = 13
my_yaxls = 15
# axes text size
my_xaxts = 15
my_yaxts = 15
# no ordering of x-axis according to frequency
g = ggplot(df, aes(factor(Position, ordered = T)))
g +
geom_bar(aes(fill = group), colour = "grey") +
scale_fill_manual( values = colours
, guide = 'none') +
theme( axis.text.x = element_text(size = my_xaxls
, angle = 90
, hjust = 1
, vjust = 0.4)
, axis.text.y = element_text(size = my_yaxls
, angle = 0
, hjust = 1
, vjust = 0)
, axis.title.x = element_text(size = my_xaxts)
, axis.title.y = element_text(size = my_yaxts ) ) +
labs(title = my_title
, x = "Position"
, y = "Frequency")
#========================
# plot with axis colours
#========================
class(df$lab_bg)
# make this a named vector
# define cartesian coord
my_xlim = length(unique(df$Position)); my_xlim
# axis label size
my_xals = 15
my_yals = 15
# axes text size
my_xats = 15
my_yats = 18
# using geom_tile
g = ggplot(df, aes(factor(Position, ordered = T)))
g +
coord_cartesian(xlim = c(1, my_xlim)
, ylim = c(0, 6)
, clip = "off") +
geom_bar(aes(fill = group), colour = "grey") +
scale_fill_manual( values = colours
, guide = 'none') +
geom_tile(aes(,-0.8, width = 0.95, height = 0.85)
, fill = df$lab_bg) +
geom_tile(aes(,-1.2, width = 0.95, height = -0.2)
, fill = df$lab_bg2) +
# Here it's important to specify that your axis goes from 1 to max number of levels
theme( axis.text.x = element_text(size = my_xats
, angle = 90
, hjust = 1
, vjust = 0.4
, colour = my_axis_colours)
, axis.text.y = element_text(size = my_yats
, angle = 0
, hjust = 1
, vjust = 0)
, axis.title.x = element_text(size = my_xals)
, axis.title.y = element_text(size = my_yals )
, axis.ticks.x = element_blank()
) +
labs(title = my_title
, x = "Position"
, y = "Frequency")
#========================
# output plot as svg/png
#========================
class(df$lab_bg)
# make this a named vector
# define cartesian coord
my_xlim = length(unique(df$Position)); my_xlim
# axis label size
my_xals = 18
my_yals = 18
# axes text size
my_xats = 16 #14 in PS
my_yats = 18
# set output dir for plots
#getwd()
#setwd("~/git/Data/pyrazinamide/output/plots")
#getwd()
plot_name = "barplot_LIG_acoloured.svg"
my_plot_name = paste0(out_dir, "/", plot_name); my_plot_name
svg(my_plot_name, width = 26, height = 4)
g = ggplot(df, aes(factor(Position, ordered = T)))
outFile = g +
coord_cartesian(xlim = c(1, my_xlim)
, ylim = c(0, 6)
, clip = "off"
) +
geom_bar(aes(fill = group), colour = "grey") +
scale_fill_manual( values = colours
, guide = 'none') +
# geom_tile(aes(,-0.6, width = 0.9, height = 0.7)
# , fill = df$lab_bg) +
# geom_tile(aes(,-1, width = 0.9, height = 0.3)
# , fill = df$lab_bg2) +
geom_tile(aes(,-0.8, width = 0.95, height = 0.85)
, fill = df$lab_bg) +
geom_tile(aes(,-1.2, width = 0.95, height = -0.2)
, fill = df$lab_bg2) +
# Here it's important to specify that your axis goes from 1 to max number of levels
theme( axis.text.x = element_text(size = my_xats
, angle = 90
, hjust = 1
, vjust = 0.4
, colour = my_axis_colours)
, axis.text.y = element_text(size = my_yats
, angle = 0
, hjust = 1
, vjust = 0)
, axis.title.x = element_text(size = my_xals)
, axis.title.y = element_text(size = my_yals )
, axis.ticks.x = element_blank()
) +
labs(title = ""
, x = "Position"
, y = "Frequency")
print(outFile)
dev.off()
# for sanity and good practice
#rm(df)

View file

@@ -1,292 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
############################################################
# 1: Installing and loading required packages and functions
############################################################
source("../Header_TT.R")
source("../barplot_colour_function.R")
############################################################
# Output dir for plots
############################################################
out_dir = "~/git/Data/pyrazinamide/output/plots"
############################################################
# 2: call script that prepares the data with columns containing
# colours for axis labels
############################################################
source("subcols_axis.R")
# this should return
#mut_pos_cols: 130, 4
#my_df: 335, 39
# clear excess variable
# "mut_pos_cols" is just for inspection in case you need to cross check
# position numbers and colours
# open file from desktop ("sample_axis_cols") for cross checking
table(mut_pos_cols$lab_bg)
sum( table(mut_pos_cols$lab_bg) ) == nrow(mut_pos_cols) # should be True
table(mut_pos_cols$lab_bg2)
sum( table(mut_pos_cols$lab_bg2) ) == nrow(mut_pos_cols) # should be True
table(mut_pos_cols$lab_fg)
sum( table(mut_pos_cols$lab_fg) ) == nrow(mut_pos_cols) # should be True
# very important!
my_axis_colours = mut_pos_cols$lab_fg
# now clear mut_pos_cols
rm(mut_pos_cols)
###########################
# 2: Plot: DUET scores
###########################
#==========================
# Plot 2: Barplot with scores (unordered)
# corresponds to DUET_outcome
# Stacked Barplot with colours: DUET_outcome @ position coloured by
# stability scores. This is a barplot where each bar corresponds
# to a SNP and is coloured by its corresponding DUET stability value.
# Normalised values (range between -1 and 1 ) to aid visualisation
# NOTE: since barplot plots discrete values, colour = score, so number of
# colours will be equal to the no. of unique normalised scores
# rather than a continuous scale
# will require generating the colour scale separately.
#============================
# sanity checks
upos = unique(my_df$Position)
str(my_df$DUET_outcome)
colnames(my_df)
#===========================
# Data preparation for plots
#===========================
#!!!!!!!!!!!!!!!!!
# REASSIGNMENT
df <- my_df
#!!!!!!!!!!!!!!!!!
rm(my_df)
# sanity checks
# should be a factor
is.factor(df$DUET_outcome)
#TRUE
table(df$DUET_outcome)
# should be -1 and 1
min(df$ratioDUET)
max(df$ratioDUET)
# sanity checks
# very important!!!!
tapply(df$ratioDUET, df$DUET_outcome, min)
tapply(df$ratioDUET, df$DUET_outcome, max)
# My colour FUNCTION: based on group and subgroup
# in my case;
# df = df
# group = DUET_outcome
# subgroup = normalised score i.e ratioDUET
# Prepare data: round off ratioDUET scores
# round off to 3 significant digits:
# 323 if no rounding is performed: used to generate the original graph
# 287 if rounded to 3 places
# FIXME: check if reducing precision creates any ML prob
# check unique values in normalised data
u = unique(df$ratioDUET)
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Run this section if rounding is to be used
# specify number for rounding
n = 3
df$ratioDUETR = round(df$ratioDUET, n)
u = unique(df$ratioDUETR)
# create an extra column called group which contains the "gp name and score"
# so colours can be generated for each unique values in this column
my_grp = df$ratioDUETR
df$group <- paste0(df$DUET_outcome, "_", my_grp, sep = "")
# ELSE
# uncomment the below if rounding is not required
#my_grp = df$ratioDUET
#df$group <- paste0(df$DUET_outcome, "_", my_grp, sep = "")
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#******************
# generate plot
#******************
# Call the function to create the palette based on the group defined above
colours <- ColourPalleteMulti(df, "DUET_outcome", "my_grp")
my_title = "Protein stability (DUET)"
library(ggplot2)
# axis label size
my_xaxls = 13
my_yaxls = 15
# axes text size
my_xaxts = 15
my_yaxts = 15
# no ordering of x-axis according to frequency
g = ggplot(df, aes(factor(Position, ordered = T)))
g +
geom_bar(aes(fill = group), colour = "grey") +
scale_fill_manual( values = colours
, guide = 'none') +
theme( axis.text.x = element_text(size = my_xaxls
, angle = 90
, hjust = 1
, vjust = 0.4)
, axis.text.y = element_text(size = my_yaxls
, angle = 0
, hjust = 1
, vjust = 0)
, axis.title.x = element_text(size = my_xaxts)
, axis.title.y = element_text(size = my_yaxts ) ) +
labs(title = my_title
, x = "Position"
, y = "Frequency")
#========================
# plot with axis colours
#========================
class(df$lab_bg)
# make this a named vector
# define cartesian coord
my_xlim = length(unique(df$Position)); my_xlim
# axis label size
my_xals = 15
my_yals = 15
# axes text size
my_xats = 15
my_yats = 18
# using geom_tile
g = ggplot(df, aes(factor(Position, ordered = T)))
g +
coord_cartesian(xlim = c(1, my_xlim)
, ylim = c(0, 6)
, clip = "off") +
geom_bar(aes(fill = group), colour = "grey") +
scale_fill_manual( values = colours
, guide = 'none') +
geom_tile(aes(,-0.8, width = 0.95, height = 0.85)
, fill = df$lab_bg) +
geom_tile(aes(,-1.2, width = 0.95, height = -0.2)
, fill = df$lab_bg2) +
# Here it's important to specify that your axis goes from 1 to max number of levels
theme( axis.text.x = element_text(size = my_xats
, angle = 90
, hjust = 1
, vjust = 0.4
, colour = my_axis_colours)
, axis.text.y = element_text(size = my_yats
, angle = 0
, hjust = 1
, vjust = 0)
, axis.title.x = element_text(size = my_xals)
, axis.title.y = element_text(size = my_yals )
, axis.ticks.x = element_blank()
) +
labs(title = my_title
, x = "Position"
, y = "Frequency")
#========================
# output plot as svg/png
#========================
class(df$lab_bg)
# make this a named vector
# define cartesian coord
my_xlim = length(unique(df$Position)); my_xlim
# axis label size
my_xals = 18
my_yals = 18
# axes text size
my_xats = 14
my_yats = 18
# set output dir for plots
#getwd()
#setwd("~/git/Data/pyrazinamide/output/plots")
#getwd()
plot_name = "barplot_PS_acoloured.svg"
my_plot_name = paste0(out_dir, "/", plot_name); my_plot_name
svg(my_plot_name, width = 26, height = 4)
g = ggplot(df, aes(factor(Position, ordered = T)))
outFile = g +
coord_cartesian(xlim = c(1, my_xlim)
, ylim = c(0, 6)
, clip = "off"
) +
geom_bar(aes(fill = group), colour = "grey") +
scale_fill_manual( values = colours
, guide = 'none') +
# geom_tile(aes(,-0.6, width = 0.9, height = 0.7)
# , fill = df$lab_bg) +
# geom_tile(aes(,-1, width = 0.9, height = 0.3)
# , fill = df$lab_bg2) +
geom_tile(aes(,-0.8, width = 0.95, height = 0.85)
, fill = df$lab_bg) +
geom_tile(aes(,-1.2, width = 0.95, height = -0.2)
, fill = df$lab_bg2) +
# Here it's important to specify that your axis goes from 1 to max number of levels
theme( axis.text.x = element_text(size = my_xats
, angle = 90
, hjust = 1
, vjust = 0.4
, colour = my_axis_colours)
, axis.text.y = element_text(size = my_yats
, angle = 0
, hjust = 1
, vjust = 0)
, axis.title.x = element_text(size = my_xals)
, axis.title.y = element_text(size = my_yals )
, axis.ticks.x = element_blank()
) +
labs(title = ""
, x = "Position"
, y = "Frequency")
print(outFile)
dev.off()
# for sanity and good practice
#rm(df)

View file

@@ -1,215 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("../Header_TT.R")
#require(data.table)
#require(dplyr)
########################################################################
# Read file: call script for combining df for lig #
########################################################################
source("../combining_two_df_lig.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#===========================
###########################
# Data for Lig plots
# you need merged_df3
# or
# merged_df3_comp
# since these have unique SNPs
# I prefer to use the merged_df3
# because using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available
###########################
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df3
#my_df = merged_df3_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# Ensure correct data type in columns to plot: need to be factor
# sanity check
is.factor(my_df$Lig_outcome)
my_df$Lig_outcome = as.factor(my_df$Lig_outcome)
is.factor(my_df$Lig_outcome)
#[1] TRUE
#############################
# Extra sanity check:
# for mcsm_lig ONLY
# Dis_lig_Ang should be <10
#############################
if (max(my_df$Dis_lig_Ang) < 10){
print ("Sanity check passed: lig data is <10Ang")
}else{
print ("Error: data should be filtered to be within 10Ang")
}
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#===========================
# Plot: Basic barplots
#===========================
#===================
# Data for plots
#===================
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df = my_df
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(my_df)
# sanity checks
str(df)
if (identical(df$Position, df$position)){
print("Sanity check passed: Columns 'Position' and 'position' are identical")
} else{
print("Error!: Check column names and info contained")
}
#****************
# generate plot: No of stabilising and destabilsing muts
#****************
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()
svg('basic_barplots_LIG.svg')
my_ats = 25 # axis text size
my_als = 22 # axis label size
# uncomment as necessary for either directly outputting results or
# printing on the screen
g = ggplot(df, aes(x = Lig_outcome))
prinfFile = g + geom_bar(
#g + geom_bar(
aes(fill = Lig_outcome)
, show.legend = TRUE
) + geom_label(
stat = "count"
, aes(label = ..count..)
, color = "black"
, show.legend = FALSE
, size = 10) + theme(
axis.text.x = element_blank()
, axis.title.x = element_blank()
, axis.title.y = element_text(size=my_als)
, axis.text.y = element_text(size = my_ats)
, legend.position = c(0.73,0.8)
, legend.text = element_text(size=my_als-2)
, legend.title = element_text(size=my_als)
, plot.title = element_blank()
) + labs(
title = ""
, y = "Number of SNPs"
#, fill='Ligand Outcome'
) + scale_fill_discrete(name = "Ligand Outcome"
, labels = c("Destabilising", "Stabilising"))
print(prinfFile)
dev.off()
#****************
# generate plot: No of positions
#****************
#get freq count of positions so you can subset freq<1
#require(data.table)
setDT(df)[, pos_count := .N, by = .(Position)] #169, 36
head(df$pos_count)
table(df$pos_count)
# this is cumulative (SNP-level counts); see the toy illustration below
#1 2 3 4 5 6
#5 24 36 56 30 18
# use group by on this
snpsBYpos_df <- df %>%
group_by(Position) %>%
summarize(snpsBYpos = mean(pos_count))
table(snpsBYpos_df$snpsBYpos)
#1 2 3 4 5 6
#5 12 12 14 6 3
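# Toy illustration (made-up positions) of the difference between the two
# tables above: table(pos_count) counts SNPs per frequency class (so it is
# cumulative: a position with 2 SNPs contributes 2), whereas
# table(snpsBYpos) counts positions per class (that position contributes 1)
toy = data.frame(Position = c(10, 10, 12))
setDT(toy)[, pos_count := .N, by = .(Position)]
table(toy$pos_count)                              # SNP level: 1 -> 1, 2 -> 2
table(tapply(toy$pos_count, toy$Position, mean))  # position level: 1 -> 1, 2 -> 1
rm(toy)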
# this is what will get plotted
svg('position_count_LIG.svg')
my_ats = 25 # axis text size
my_als = 22 # axis label size
g = ggplot(snpsBYpos_df, aes(x = snpsBYpos))
prinfFile = g + geom_bar(
#g + geom_bar(
aes (alpha = 0.5)
, show.legend = FALSE
) +
geom_label(
stat = "count", aes(label = ..count..)
, color = "black"
, size = 10
) +
theme(
axis.text.x = element_text(
size = my_ats
, angle = 0
)
, axis.text.y = element_text(
size = my_ats
, angle = 0
, hjust = 1
)
, axis.title.x = element_text(size = my_als)
, axis.title.y = element_text(size = my_als)
, plot.title = element_blank()
) +
labs(
x = "Number of SNPs"
, y = "Number of Sites"
)
print(prinfFile)
dev.off()
########################################################################
# end of Lig barplots #
########################################################################

View file

@@ -1,211 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages and functions #
########################################################################
source("../Header_TT.R")
########################################################################
# Read file: call script for combining df for PS #
########################################################################
source("../combining_two_df.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#==========================
###########################
# Data for DUET plots
# you need merged_df3
# or
# merged_df3_comp
# since these have unique SNPs
# I prefer to use the merged_df3
# because using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available
###########################
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df3
#my_df = merged_df3_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# Ensure correct data type in columns to plot: need to be factor
# sanity check
is.factor(my_df$DUET_outcome)
my_df$DUET_outcome = as.factor(my_df$DUET_outcome)
is.factor(my_df$DUET_outcome)
#[1] TRUE
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#===========================
# Plot: Basic barplots
#===========================
#===================
# Data for plots
#===================
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df = my_df
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(my_df)
# sanity checks
str(df)
if (identical(df$Position, df$position)){
print("Sanity check passed: Columns 'Position' and 'position' are identical")
} else{
print("Error!: Check column names and info contained")
}
#****************
# generate plot: No of stabilising and destabilsing muts
#****************
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()
svg('basic_barplots_DUET.svg')
my_ats = 25 # axis text size
my_als = 22 # axis label size
theme_set(theme_grey())
# uncomment as necessary for either directly outputting results or
# printing on the screen
g = ggplot(df, aes(x = DUET_outcome))
prinfFile = g + geom_bar(
#g + geom_bar(
aes(fill = DUET_outcome)
, show.legend = TRUE
) + geom_label(
stat = "count"
, aes(label = ..count..)
, color = "black"
, show.legend = FALSE
, size = 10) + theme(
axis.text.x = element_blank()
, axis.title.x = element_blank()
, axis.title.y = element_text(size=my_als)
, axis.text.y = element_text(size = my_ats)
, legend.position = c(0.73,0.8)
, legend.text = element_text(size=my_als-2)
, legend.title = element_text(size=my_als)
, plot.title = element_blank()
) + labs(
title = ""
, y = "Number of SNPs"
#, fill='DUET Outcome'
) + scale_fill_discrete(name = "DUET Outcome"
, labels = c("Destabilising", "Stabilising"))
print(prinfFile)
dev.off()
#****************
# generate plot: No of positions
#****************
#get freq count of positions so you can subset freq<1
#setDT(df)[, .(Freq := .N), by = .(Position)] #189, 36
setDT(df)[, pos_count := .N, by = .(Position)] #335, 36
table(df$pos_count)
# this is cumulative
#1 2 3 4 5 6
#34 76 63 104 40 18
# use group by on this
snpsBYpos_df <- df %>%
group_by(Position) %>%
summarize(snpsBYpos = mean(pos_count))
table(snpsBYpos_df$snpsBYpos)
#1 2 3 4 5 6
#34 38 21 26 8 3
foo = select(df, Mutationinformation
, WildPos
, wild_type
, mutant_type
, mutation_info
, position
, pos_count) #335, 7
getwd()
write.csv(foo, "../Data/pos_count_freq.csv")
svg('position_count_DUET.svg')
my_ats = 25 # axis text size
my_als = 22 # axis label size
g = ggplot(snpsBYpos_df, aes(x = snpsBYpos))
prinfFile = g + geom_bar(
#g + geom_bar(
aes (alpha = 0.5)
, show.legend = FALSE
) +
geom_label(
stat = "count", aes(label = ..count..)
, color = "black"
, size = 10
) +
theme(
axis.text.x = element_text(
size = my_ats
, angle = 0
)
, axis.text.y = element_text(
size = my_ats
, angle = 0
, hjust = 1
)
, axis.title.x = element_text(size = my_als)
, axis.title.y = element_text(size = my_als)
, plot.title = element_blank()
) +
labs(
x = "Number of SNPs"
, y = "Number of Sites"
)
print(prinfFile)
dev.off()
########################################################################
# end of DUET barplots #
########################################################################

View file

@@ -1,175 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages and functions #
########################################################################
source("../Header_TT.R")
#source("barplot_colour_function.R")
########################################################################
# Read file: call script for combining df for PS #
########################################################################
source("../combining_two_df.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#==========================
###########################
# Data for PS Corr plots
# you need merged_df3_comp
# since these are matched
# to allow pairwise corr
###########################
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df3_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#===========================
# Plot: Correlation plots
#===========================
#===================
# Data for plots
#===================
#!!!!!!!!!!!!!!!!!!!!!!!!
# REASSIGNMENT
df = my_df
#!!!!!!!!!!!!!!!!!!!!!!!!
rm(my_df)
# sanity checks
str(df)
table(df$DUET_outcome)
# unique positions
length(unique(df$Position)) #{RESULT: unique positions for comp data}
# subset data to generate pairwise correlations
corr_data = df[, c("ratioDUET"
# , "ratioPredAff"
# , "DUETStability_Kcalpermol"
# , "PredAffLog"
# , "OR"
, "logor"
# , "pvalue"
, "neglog10pvalue"
, "AF"
, "DUET_outcome"
# , "Lig_outcome"
, "pyrazinamide"
)]
dim(corr_data)
rm(df)
# assign nice colnames (for display)
my_corr_colnames = c("DUET"
# , "Ligand Affinity"
# , "DUET_raw"
# , "Lig_raw"
# , "OR"
, "Log(Odds Ratio)"
# , "P-value"
, "-LogP"
, "Allele Frequency"
, "DUET_outcome"
# , "Lig_outcome"
, "pyrazinamide")
# sanity check
if (length(my_corr_colnames) == length(corr_data)){
print("Sanity check passed: corr_data and corr_names match in length")
}else{
print("Error: length mismatch!")
}
colnames(corr_data)
colnames(corr_data) <- my_corr_colnames
colnames(corr_data)
###############
# PLOTS: corr
# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
###############
#default pairs plot
start = 1
end = which(colnames(corr_data) == "pyrazinamide"); end # should be the last column
offset = 1
my_corr = corr_data[start:(end-offset)]
head(my_corr)
#my_cols = c("#f8766d", "#00bfc4")
# deep blue :#007d85
# deep red: #ae301e
#==========
# psych: informative since it draws the ellipsoid
# https://jamesmarquezportfolio.com/correlation_matrices_in_r.html
# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
#==========
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots"
getwd()
svg('DUET_corr.svg', width = 15, height = 15)
printFile = pairs.panels(my_corr[1:4]
, method = "spearman" # correlation method
, hist.col = "grey" ##00AFBB
, density = TRUE # show density plots
, ellipses = F # show correlation ellipses
, stars = T
, rug = F
, breaks = "Sturges"
, show.points = T
, bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr$DUET_outcome))]
, pch = 21
, jitter = T
#, alpha = .05
#, points(pch = 19, col = c("#f8766d", "#00bfc4"))
, cex = 3
, cex.axis = 2.5
, cex.labels = 3
, cex.cor = 1
, smooth = F
)
print(printFile)
dev.off()

View file

@ -1,187 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("../Header_TT.R")
#source("barplot_colour_function.R")
########################################################################
# Read file: call script for combining df for lig #
########################################################################
source("../combining_two_df_lig.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#===========================
###########################
# Data for Lig Corr plots
# you need merged_df3_comp
# since these are matched
# to allow pairwise corr
###########################
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df3_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
#############################
# Extra sanity check:
# for mcsm_lig ONLY
# Dis_lig_Ang should be <10
#############################
if (max(my_df$Dis_lig_Ang) < 10){
print ("Sanity check passed: lig data is <10Ang")
}else{
print ("Error: data should be filtered to be within 10Ang")
}
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#===========================
# Plot: Correlation plots
#===========================
#===================
# Data for plots
#===================
#<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df = my_df
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(my_df)
# sanity checks
str(df)
table(df$Lig_outcome)
# unique positions
length(unique(df$Position)) #{RESULT: unique positions for comp data}
# subset data to generate pairwise correlations
corr_data = df[, c(#"ratioDUET",
"ratioPredAff"
# , "DUETStability_Kcalpermol"
# , "PredAffLog"
# , "OR"
, "logor"
# , "pvalue"
, "neglog10pvalue"
, "AF"
# , "DUET_outcome"
, "Lig_outcome"
, "pyrazinamide"
)]
dim(corr_data)
rm(df)
# assign nice colnames (for display)
my_corr_colnames = c(#"DUET",
"Ligand Affinity"
# ,"DUET_raw"
# , "Lig_raw"
# , "OR"
, "Log(Odds Ratio)"
# , "P-value"
, "-LogP"
, "Allele Frequency"
# , "DUET_outcome"
, "Lig_outcome"
, "pyrazinamide")
# sanity check
if (length(my_corr_colnames) == length(corr_data)){
print("Sanity check passed: corr_data and corr_names match in length")
}else{
print("Error: length mismatch!")
}
colnames(corr_data)
colnames(corr_data) <- my_corr_colnames
colnames(corr_data)
###############
# PLOTS: corr
# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
###############
# default pairs plot
start = 1
end = which(colnames(corr_data) == "pyrazinamide"); end # should be the last column
offset = 1
my_corr = corr_data[start:(end-offset)]
head(my_corr)
#my_cols = c("#f8766d", "#00bfc4")
# deep blue :#007d85
# deep red: #ae301e
#==========
# psych: informative since it draws the ellipsoid
# https://jamesmarquezportfolio.com/correlation_matrices_in_r.html
# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
#==========
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots"
getwd()
svg('Lig_corr.svg', width = 15, height = 15)
printFile = pairs.panels(my_corr[1:4]
, method = "spearman" # correlation method
, hist.col = "grey" ##00AFBB
, density = TRUE # show density plots
, ellipses = F # show correlation ellipses
, stars = T
, rug = F
, breaks = "Sturges"
, show.points = T
, bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr$Lig_outcome))]
, pch = 21
, jitter = T
# , alpha = .05
# , points(pch = 19, col = c("#f8766d", "#00bfc4"))
, cex = 3
, cex.axis = 2.5
, cex.labels = 3
, cex.cor = 1
, smooth = F
)
print(printFile)
dev.off()

View file

@ -1,227 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("../Header_TT.R")
#source("barplot_colour_function.R")
require(data.table)
########################################################################
# Read file: call script for combining df #
########################################################################
source("../combining_two_df.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#==========================
###########################
# Data for plots
# you need merged_df2, comprehensive one
# since this has one-many relationship
# i.e the same SNP can belong to multiple lineages
###########################
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df2
#my_df = merged_df2_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# Ensure correct data type in columns to plot: need to be factor
is.factor(my_df$lineage)
my_df$lineage = as.factor(my_df$lineage)
is.factor(my_df$lineage)
#==========================
# Plot: Lineage barplot
# x = lineage y = No. of samples
# col = Lineage
# fill = lineage
#============================
table(my_df$lineage)
# lineage1 lineage2 lineage3 lineage4 lineage5 lineage6 lineageBOV
#3 104 1293 264 1311 6 6 105
#===========================
# Plot: Lineage Barplots
#===========================
#===================
# Data for plots
#===================
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df <- my_df
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(my_df)
# get freq count of positions so you can subset freq<1
#setDT(df)[, lineage_count := .N, by = .(lineage)]
#******************
# generate plot: barplot of mutation by lineage
#******************
sel_lineages = c("lineage1"
, "lineage2"
, "lineage3"
, "lineage4")
df_lin = subset(df, subset = lineage %in% sel_lineages )
#FIXME; add sanity check for numbers.
# Done this manually
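# A minimal sanity-check sketch for the FIXME above (assumption: every row in
# df_lin belongs to one of the four selected lineages, so the counts should add up)
if (nrow(df_lin) == sum(table(df$lineage)[sel_lineages])) {
  print("Sanity check passed: lineage subset numbers match")
} else {
  print("Error!: lineage subset numbers do not match, check the subset step")
}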
############################################################
#########
# Data for barplot: Lineage barplot
# to show total samples and number of unique mutations
# within each linege
##########
# Create df with lineage inform & no. of unique mutations
# per lineage and total samples within lineage
# this is essentially barplot with two y axis
bar = as.data.frame(sel_lineages) #4, 1
total_snps_u = NULL
total_samples = NULL
for (i in sel_lineages){
#print(i)
curr_total = length(unique(df$id)[df$lineage==i])
total_samples = c(total_samples, curr_total)
print(total_samples)
foo = df[df$lineage==i,]
print(paste0(i, "======="))
print(length(unique(foo$Mutationinformation)))
curr_count = length(unique(foo$Mutationinformation))
total_snps_u = c(total_snps_u, curr_count)
}
print(total_snps_u)
bar$num_snps_u = total_snps_u
bar$total_samples = total_samples
bar
#*****************
# generate plot: lineage barplot with two y-axis
#https://stackoverflow.com/questions/13035295/overlay-bar-graphs-in-ggplot2
#*****************
x = sel_lineages
y1 = bar$num_snps_u
y2 = bar$total_samples
to_plot = data.frame(x = x
, y1 = y1
, y2 = y2)
to_plot
melted = melt(to_plot, id = "x")
melted
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()
svg('lineage_basic_barplot.svg')
my_ats = 20 # axis text size
my_als = 22 # axis label size
g = ggplot(melted
, aes(x = x
, y = value
, fill = variable)
)
printFile = g + geom_bar(
#g + geom_bar(
stat = "identity"
, position = position_stack(reverse = TRUE)
, alpha=.75
, colour='grey75'
) + theme(
axis.text.x = element_text(
size = my_ats
# , angle= 30
)
, axis.text.y = element_text(size = my_ats
#, angle = 30
, hjust = 1
, vjust = 0)
, axis.title.x = element_text(
size = my_als
, colour = 'black'
)
, axis.title.y = element_text(
size = my_als
, colour = 'black'
)
, legend.position = "top"
, legend.text = element_text(size = my_als)
#) + geom_text(
) + geom_label(
aes(label = value)
, size = 5
, hjust = 0.5
, vjust = 0.5
, colour = 'black'
, show.legend = FALSE
#, check_overlap = TRUE
, position = position_stack(reverse = T)
#, position = ('
) + labs(
title = ''
, x = ''
, y = "Number"
, fill = 'Variable'
, colour = 'black'
) + scale_fill_manual(
values = c('grey50', 'gray75')
, name=''
, labels=c('Mutations', 'Total Samples')
) + scale_x_discrete(
breaks = c('lineage1', 'lineage2', 'lineage3', 'lineage4')
, labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
)
print(printFile)
dev.off()

View file

@ -1,253 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("../Header_TT.R")
#source("barplot_colour_function.R")
#require(data.table)
########################################################################
# Read file: call script for combining df for Lig #
########################################################################
source("../combining_two_df_lig.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#===========================
###########################
# Data for plots
# you need merged_df2 or merged_df2_comp
# since this is one-many relationship
# i.e the same SNP can belong to multiple lineages
###########################
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df2
#my_df = merged_df2_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# Ensure correct data type in columns to plot: need to be factor
is.factor(my_df$lineage)
my_df$lineage = as.factor(my_df$lineage)
is.factor(my_df$lineage)
table(my_df$mutation_info)
#############################
# Extra sanity check:
# for mcsm_lig ONLY
# Dis_lig_Ang should be <10
#############################
if (max(my_df$Dis_lig_Ang) < 10){
print ("Sanity check passed: lig data is <10Ang")
}else{
print ("Error: data should be filtered to be within 10Ang")
}
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#==========================
# Data for plot: assign as
# necessary
#===========================
# uncomment as necessary
#!!!!!!!!!!!!!!!!!!!!!!!
# REASSIGNMENT
#==================
# data for ALL muts
#==================
plot_df = my_df
my_plot_name = 'lineage_dist_PS.svg'
#my_plot_name = 'lineage_dist_PS_comp.svg'
#=======================
# data for dr_muts ONLY
#=======================
#plot_df = my_df_dr
#my_plot_name = 'lineage_dist_dr_PS.svg'
#my_plot_name = 'lineage_dist_dr_PS_comp.svg'
#!!!!!!!!!!!!!!!!!!!!!!!
#==========================
# Plot: Lineage Distribution
# x = mcsm_values, y = dist
# fill = stability
#============================
#===================
# Data for plots
#===================
# subset only lineages1-4
sel_lineages = c("lineage1"
, "lineage2"
, "lineage3"
, "lineage4")
# uncomment as necessary
df_lin = subset(my_df, subset = lineage %in% sel_lineages ) #2037 35
# refactor
df_lin$lineage = factor(df_lin$lineage)
table(df_lin$lineage) #{RESULT: No of samples within lineage}
#lineage1 lineage2 lineage3 lineage4
#78 961 195 803
# when merged_df2_comp is used
#lineage1 lineage2 lineage3 lineage4
#77 955 194 770
length(unique(df_lin$Mutationinformation))
#{Result: No. of unique mutations the 4 lineages contribute to}
# sanity checks
r1 = 2:5 # when merged_df2 used: because there is missing lineages
if(sum(table(my_df$lineage)[r1]) == nrow(df_lin)) {
print ("sanity check passed: numbers match")
} else{
print("Error!: check your numbers")
}
#!!!!!!!!!!!!!!!!!!!!!!!!!
# REASSIGNMENT
df <- df_lin
#!!!!!!!!!!!!!!!!!!!!!!!!!
rm(df_lin)
#******************
# generate distribution plot of lineages
#******************
# basic: could improve this!
library(plotly)
library(ggridges)
my_labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
names(my_labels) = c('lineage1', 'lineage2', 'lineage3', 'lineage4')
g <- ggplot(df, aes(x = ratioPredAff)) +
geom_density(aes(fill = Lig_outcome)
, alpha = 0.5) +
facet_wrap( ~ lineage
, scales = "free"
, labeller = labeller(lineage = my_labels) ) +
coord_cartesian(xlim = c(-1, 1)
# , ylim = c(0, 6)
# , clip = "off"
) +
ggtitle("Kernel Density estimates of Ligand affinity by lineage")
ggplotly(g)
# 2 : ggridges (good!)
my_ats = 15 # axis text size
my_als = 20 # axis label size
my_labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
names(my_labels) = c('lineage1', 'lineage2', 'lineage3', 'lineage4')
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()
# check plot name
my_plot_name
svg(my_plot_name)
printFile = ggplot( df, aes(x = ratioPredAff
, y = Lig_outcome) ) +
geom_density_ridges_gradient( aes(fill = ..x..)
, scale = 3
, size = 0.3 ) +
facet_wrap( ~lineage
, scales = "free"
# , switch = 'x'
, labeller = labeller(lineage = my_labels) ) +
coord_cartesian( xlim = c(-1, 1)
# , ylim = c(0, 6)
# , clip = "off"
) +
scale_fill_gradientn( colours = c("#f8766d", "white", "#00bfc4")
, name = "Ligand Affinity" ) +
theme( axis.text.x = element_text( size = my_ats
, angle = 90
, hjust = 1
, vjust = 0.4)
# , axis.text.y = element_text( size = my_ats
# , angle = 0
# , hjust = 1
# , vjust = 0)
, axis.text.y = element_blank()
, axis.title.x = element_blank()
, axis.title.y = element_blank()
, axis.ticks.y = element_blank()
, plot.title = element_blank()
, strip.text = element_text(size = my_als)
, legend.text = element_text(size = 10)
, legend.title = element_text(size = my_als)
# , legend.position = c(0.3, 0.8)
# , legend.key.height = unit(1, 'mm')
)
print(printFile)
dev.off()
#===================================================
# COMPARING DISTRIBUTIONS
head(df$lineage)
df$lineage = as.character(df$lineage)
lin1 = df[df$lineage == "lineage1",]$ratioPredAff
lin2 = df[df$lineage == "lineage2",]$ratioPredAff
lin3 = df[df$lineage == "lineage3",]$ratioPredAff
lin4 = df[df$lineage == "lineage4",]$ratioPredAff
# ks test
ks.test(lin1,lin2)
ks.test(lin1,lin3)
ks.test(lin1,lin4)
ks.test(lin2,lin3)
ks.test(lin2,lin4)
ks.test(lin3,lin4)

View file

@ -1,229 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("../Header_TT.R")
#source("../barplot_colour_function.R")
#require(data.table)
########################################################################
# Read file: call script for combining df for PS #
########################################################################
source("../combining_two_df.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA for pyrazinamide:
# merged_df2
# merged_df3
# df without NA for pyrazinamide:
# merged_df2_comp
# merged_df3_comp
#===========================
###########################
# Data for plots
# you need merged_df2 or merged_df2_comp
# since this is one-many relationship
# i.e the same SNP can belong to multiple lineages
# using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available, hence use df with NA
###########################
# uncomment as necessary
#%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
my_df = merged_df2
#my_df = merged_df2_comp
#%%%%%%%%%%%%%%%%%%%%%%%%
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# Ensure correct data type in columns to plot: need to be factor
is.factor(my_df$lineage)
my_df$lineage = as.factor(my_df$lineage)
is.factor(my_df$lineage)
table(my_df$mutation_info); str(my_df$mutation_info)
# subset df with dr muts only
my_df_dr = subset(my_df, mutation_info == "dr_mutations_pyrazinamide")
table(my_df_dr$mutation_info)
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#==========================
# Run two times:
# uncomment as necessary
# 1) for all muts
# 2) for dr_muts
#===========================
#%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
#================
# for ALL muts
#================
plot_df = my_df
my_plot_name = 'lineage_dist_PS.svg'
#my_plot_name = 'lineage_dist_PS_comp.svg'
#================
# for dr muts ONLY
#================
#plot_df = my_df_dr
#my_plot_name = 'lineage_dist_dr_PS.svg'
#my_plot_name = 'lineage_dist_dr_PS_comp.svg'
#%%%%%%%%%%%%%%%%%%%%%%%%
#==========================
# Plot: Lineage Distribution
# x = mcsm_values, y = dist
# fill = stability
#============================
#===================
# Data for plots
#===================
table(plot_df$lineage); str(plot_df$lineage)
# subset only lineages1-4
sel_lineages = c("lineage1"
, "lineage2"
, "lineage3"
, "lineage4")
# uncomment as necessary
df_lin = subset(plot_df, subset = lineage %in% sel_lineages )
# refactor
df_lin$lineage = factor(df_lin$lineage)
table(df_lin$lineage) #{RESULT: No of samples within lineage}
#lineage1 lineage2 lineage3 lineage4
length(unique(df_lin$Mutationinformation))
#{Result: No. of unique mutations the 4 lineages contribute to}
# sanity checks
r1 = 2:5 # when merged_df2 used: because there is missing lineages
if(sum(table(plot_df$lineage)[r1]) == nrow(df_lin)) {
print ("sanity check passed: numbers match")
} else{
print("Error!: check your numbers")
}
#%%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
df <- df_lin
#%%%%%%%%%%%%%%%%%%%%%%%%%
rm(df_lin)
#******************
# generate distribution plot of lineages
#******************
# basic: could improve this!
#library(plotly)
#library(ggridges)
g <- ggplot(df, aes(x = ratioDUET)) +
geom_density(aes(fill = DUET_outcome)
, alpha = 0.5) + facet_wrap(~ lineage,
scales = "free") +
ggtitle("Kernel Density estimates of Protein stability by lineage")
ggplotly(g)
# 2 : ggridges (good!)
my_ats = 15 # axis text size
my_als = 20 # axis label size
my_labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
names(my_labels) = c('lineage1', 'lineage2', 'lineage3', 'lineage4')
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()
# check plot name
my_plot_name
# output svg
svg(my_plot_name)
printFile = ggplot(df, aes(x = ratioDUET
, y = DUET_outcome))+
#printFile=geom_density_ridges_gradient(
geom_density_ridges_gradient(aes(fill = ..x..)
, scale = 3
, size = 0.3 ) +
facet_wrap( ~lineage
, scales = "free"
# , switch = 'x'
, labeller = labeller(lineage = my_labels) ) +
coord_cartesian( xlim = c(-1, 1)
# , ylim = c(0, 6)
# , clip = "off"
) +
scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4")
, name = "DUET" ) +
theme(axis.text.x = element_text(size = my_ats
, angle = 90
, hjust = 1
, vjust = 0.4)
# , axis.text.y = element_text(size = my_ats
# , angle = 0
# , hjust = 1
# , vjust = 0)
, axis.text.y = element_blank()
, axis.title.x = element_blank()
, axis.title.y = element_blank()
, axis.ticks.y = element_blank()
, plot.title = element_blank()
, strip.text = element_text(size = my_als)
, legend.text = element_text(size = 10)
, legend.title = element_text(size = my_als)
# , legend.position = c(0.3, 0.8)
# , legend.key.height = unit(1, 'mm')
)
print(printFile)
dev.off()
#=!=!=!=!=!=!=!
# COMMENT: Not much differences in the distributions
# when using merged_df2 or merged_df2_comp.
# Also, the lineage differences disappear when looking at all muts
# The pattern we are interested in is possibly only for dr_mutations
#=!=!=!=!=!=!=!
#===================================================
# COMPARING DISTRIBUTIONS: KS test
# run: "../KS_test_PS.R"

View file

@ -1,250 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting/")
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("../Header_TT.R")
#source("barplot_colour_function.R")
library(ggseqlogo)
# Note: logomaker() used below is assumed to come from the Logolas package,
# which is not loaded by Header_TT.R
library(Logolas)
#=======
# input
#=======
#############
# msa file: output of generate_mut_sequences.py
#############
homedir = '~'
indir = 'git/Data/pyrazinamide/output'
in_filename = "gene_msa.txt"
infile = paste0(homedir, '/', indir,'/', in_filename)
print(infile)
#=======
# input
#=======
#############
# combined dfs
#############
source("../combining_two_df.R")
###########################
# Data for Logo plots
# you need big df i.e
# merged_df2
# or
# merged_df2_comp
# since these have unique SNPs
# I prefer to use the merged_df2
# because using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available
###########################
# uncomment as necessary
#%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
my_df = merged_df2
#my_df = merged_df2_comp
#%%%%%%%%%%%%%%%%%%%%%%%%
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# doesn't work if you use the big df as it has duplicate snps
#rownames(my_df) = my_df$Mutationinformation
# sanity check: should be True
table(my_df$position == my_df$Position)
c1 = unique(my_df$Position) # 130
nrow(my_df) # 3092
#FIXME
#!!! RESOLVE !!!
# get freq count of positions and add to the df
setDT(my_df)[, occurrence_sample := .N, by = .(id)]
table(my_df$occurrence_sample)
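# A hedged sketch of what the FIXME above may intend (assumption: a per-position
# frequency count, analogous to pos_count in the barplot script); kept commented
# so it does not alter the df used below
#setDT(my_df)[, occurrence_pos := .N, by = .(Position)]
#table(my_df$occurrence_pos)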
my_df2 = my_df %>%
select(id, Mutationinformation, Wild_type, WildPos, position, Mutant_type, occurrence, occurrence_sample)
write.csv(my_df2, "my_df2.csv")
# extract freq_pos>1 since freq_pos==1 will not add much to the logo plot
# pos 5 has one mutation but coming from at least 5 samples?
table(my_df$occurrence)
foo = my_df[my_df$occurrence ==1,]
# uncomment as necessary
#my_data_snp = my_df #3092
#!!! RESOLVE
# FIXME
my_data_snp = my_df[my_df$occurrence!=1,] #3072, 36...3019
u = unique(my_data_snp$Position) #96
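# A minimal sanity-check sketch for the filter above (assumption: it should drop
# exactly the occurrence == 1 rows, as in the sibling logo-plot script)
exp_dim = nrow(my_df) - sum(my_df$occurrence == 1); exp_dim
if (nrow(my_data_snp) == exp_dim){
  print("Sanity check passed: occurrence filter dims match")
} else {
  print("Error: occurrence filter dims do not match, please debug")
}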
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#########################################################
# Task: To generate a logo plot or bar plot but coloured
# aa properties.
# step1: read mcsm file and OR file
# step2: plot wild type positions
# step3: plot mutants per position coloured by aa properties
# step4: make the size of the letters/bars prop to OR if you can!
#########################################################
##useful links
#https://stackoverflow.com/questions/5438474/plotting-a-sequence-logo-using-ggplot2
#https://omarwagih.github.io/ggseqlogo/
#https://kkdey.github.io/Logolas-pages/workflow.html
#A new sequence logo plot to highlight enrichment and depletion.
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6288878/
##very good: http://www.cbs.dtu.dk/biotools/Seq2Logo-2.0/
#==============
# matrix for mutant type
# frequency of mutant type by position
#==============
table(my_data_snp$Mutant_type, my_data_snp$Position)
tab_mt = table(my_data_snp$Mutant_type, my_data_snp$Position)
class(tab_mt)
# unclass to convert to matrix
tab_mt = unclass(tab_mt)
tab_mt = as.matrix(tab_mt, rownames = T)
# should be TRUE
is.matrix(tab_mt)
rownames(tab_mt) #aa
colnames(tab_mt) #pos
#**********************
# Plot 1: mutant logo
#**********************
my_ymax = max(my_data_snp$occurrence); my_ymax
my_ylim = c(0,my_ymax) # very important
# axis sizes
# common: text and label
my_ats = 15
my_als = 20
# individual: text and label
my_xats = 15
my_yats = 20
my_xals = 15
my_yals = 20
# legend size: text and label
my_lts = 20
#my_lls = 20
# Color scheme based on chemistry of amino acids
chemistry = data.frame(
letter = c('G', 'S', 'T', 'Y', 'C', 'N', 'Q', 'K', 'R', 'H', 'D', 'E', 'P', 'A', 'W', 'F', 'L', 'I', 'M', 'V'),
group = c(rep('Polar', 5), rep('Neutral', 2), rep('Basic', 3), rep('Acidic', 2), rep('Hydrophobic', 8)),
col = c(rep('#109648', 5), rep('#5E239D', 2), rep('#255C99', 3), rep('#D62839', 2), rep('#221E22', 8)),
stringsAsFactors = F
)
# uncomment as necessary
my_type = "EDLogo"
#my_type = "Logo"
logomaker(tab_mt
, type = my_type
, return_heights = T
# , color_type = "per_row"
# , colors = chemistry$col
# , method = 'custom'
# , seq_type = 'aa'
# , col_scheme = "taylor"
# , col_scheme = "chemistry2"
) +
theme(legend.position = "bottom"
, legend.title = element_blank()
, legend.text = element_text(size = my_lts )
, axis.text.x = element_text(size = my_ats , angle = 90)
, axis.text.y = element_text(size = my_ats , angle = 90))
p0 = logomaker(tab_mt
, type = my_type
, return_heights = T
, color_type = "per_row"
, colors = chemistry$col
# , seq_type = 'aa'
# , col_scheme = "taylor"
# , col_scheme = "chemistry2"
) +
#ylab('my custom height') +
theme(axis.text.x = element_blank()) +
# theme_logo()+
# scale_x_continuous(breaks=1:51, parse (text = colnames(tab)) )
scale_x_continuous(breaks = 1:ncol(tab_mt)
, labels = colnames(tab_mt))+
scale_y_continuous( breaks = 1:my_ymax
, limits = my_ylim)
p0
# further customisation
p1 = p0 + theme(legend.position = "bottom"
, legend.title = element_blank()
, legend.text = element_text(size = my_lts)
, axis.text.x = element_text(size = my_ats , angle = 90)
, axis.text.y = element_text(size = my_ats , angle = 90))
p1
#=======
# input
#=======
#############
# msa file: output of generate_mut_sequences.py
#############
homedir = '~'
indir = 'git/Data/pyrazinamide/output'
in_filename = "gene_msa.txt"
infile = paste0(homedir, '/', indir,'/', in_filename)
print(infile)
##############
# ggseqlogo: custom matrix of my data
##############
snps = read.csv(infile
, stringsAsFactors = F
, header = F) #3072,
class(snps); str(snps) # df and chr
# turn to a character vector
snps2 = as.character(snps[1:nrow(snps),])
class(snps2); str(snps2) #character, chr
# plot
logomaker(snps2, type = my_type
, color_type = "per_row") +
theme(axis.text.x = element_blank()) +
theme_logo()+
# scale_x_continuous(breaks=1:51, parse (text = colnames(tab)) )
scale_x_continuous(breaks = 1:ncol(tab_mt)
, labels = colnames(tab_mt))+
scale_y_continuous( breaks = 0:5
, limits = my_ylim)

View file

@ -1,273 +0,0 @@
getwd()
setwd("~/git//LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
# TASK: Multiple mutations per site
# as aa symbol coloured by aa property
########################################################################
# Installing and loading required packages #
########################################################################
#source("../Header_TT.R")
#source("barplot_colour_function.R")
library(ggseqlogo)
########################################################################
# Read file: call script for combining df for lig #
########################################################################
source("../combining_two_df.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "/home/tanu/git/LSHTM_Y1_PNCA/mcsm_analysis/pyrazinamide/Scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
#merged_df2 # 3092, 35
#merged_df2_comp #3012, 35
#merged_df3 #335, 35
#merged_df3_comp #293, 35
#==========================
###########################
# Data for Logo plots
# you need small df i.e
# merged_df3
# or
# merged_df3_comp? possibly
# since these have unique SNPs
# I prefer to use the merged_df3
# because using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available
###########################
# uncomment as necessary
#%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
my_df = merged_df3 # to show multiple mutations per site
#my_df = read.csv(file.choose()) # alternative: pick a saved csv interactively
#%%%%%%%%%%%%%%%%%%%%%%%%
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
colnames(my_df)
str(my_df)
rownames(my_df) = my_df$Mutationinformation
c1 = unique(my_df$Position) #130
nrow(my_df) #335
table(my_df$occurrence)
#1 2 3 4 5 6
#34 76 63 104 40 18
# get freq count of positions so you can subset freq<1
#: already done in the combining script
#require(data.table)
#setDT(my_df)[, occurrence := .N, by = .(Position)] #189, 36
table(my_df$Position); table(my_df$occurrence)
# extract freq_pos>1
my_data_snp = my_df[my_df$occurrence!=1,] #301, 36
u_pos = unique(my_data_snp$Position) #96
# sanity check
exp_dim = nrow(my_df) - table(my_df$occurrence)[[1]]; exp_dim
if ( nrow(my_data_snp) == exp_dim ){
print("Sanity check passed: Data filtered correctly, dim match")
} else {
print("Error: Please Debug")
}
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#########################################################
# Task: To generate a logo plot or bar plot but coloured
# aa properties.
# step1: read data file
# step2: plot wild type positions
# step3: plot mutants per position coloured by aa properties
# step4: make the size of the letters/bars prop to OR if you can!
#########################################################
# useful links
# https://stackoverflow.com/questions/5438474/plotting-a-sequence-logo-using-ggplot2
# https://omarwagih.github.io/ggseqlogo/
# https://kkdey.github.io/Logolas-pages/workflow.html
# A new sequence logo plot to highlight enrichment and depletion.
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6288878/
# very good: http://www.cbs.dtu.dk/biotools/Seq2Logo-2.0/
#############
# PLOTS: Bar plot with aa properties
# using gglogo
# useful links: https://stackoverflow.com/questions/5438474/plotting-a-sequence-logo-using-ggplot2
#############
##############
# ggseqlogo: custom matrix of my data
##############
#==============
# matrix for mutant type
# frequency of mutant type by position
#==============
table(my_data_snp$Mutant_type, my_data_snp$Position)
tab_mt = table(my_data_snp$Mutant_type, my_data_snp$Position)
class(tab_mt)
# unclass to convert to matrix
tab_mt = unclass(tab_mt)
tab_mt = as.matrix(tab_mt, rownames = T)
# should be TRUE
is.matrix(tab_mt)
rownames(tab_mt) #aa
colnames(tab_mt) #pos
#==============
# matrix for wild type
# frequency of wild type by position
#==============
# remove wt duplicates
wt = my_data_snp[, c("Position", "Wild_type")] #301, 2
wt = wt[!duplicated(wt),]#96, 2
table(wt$Wild_type) # contains duplicates
tab_wt = table(wt$Wild_type, wt$Position); tab_wt # should all be 1
tab_wt = unclass(tab_wt) #important
class(tab_wt); rownames(tab_wt)
#tab_wt = as.matrix(tab_wt, rownames = T)
rownames(tab_wt)
rownames(tab_mt)
# sanity check
if (ncol(tab_wt) == length(u_pos) ){
print("Sanity check passed: wt data dim match")
} else {
print("Error: Please debug")
}
#**************
# Plot 1: mutant logo
#**************
#install.packages("digest")
#library(digest)
# following example
require(ggplot2)
require(reshape2)
library(gglogo)
library(ggrepel)
library(ggseqlogo)
# generate seq logo for mutant type
my_ymax = max(my_data_snp$occurrence); my_ymax
my_ylim = c(0, my_ymax)
#my_yrange = 1:my_ymax; my_yrange
# axis sizes
# common: text and label
my_ats = 15
my_als = 20
# individual: text and label
my_xats = 15
my_yats = 20
my_xals = 15
my_yals = 20
# legend size: text and label
my_lts = 20
#my_lls = 20
p0 = ggseqlogo(tab_mt
, method = 'custom'
, seq_type = 'aa'
# , col_scheme = "taylor"
# , col_scheme = "chemistry2"
) +
# ylab('my custom height') +
theme(axis.text.x = element_blank()) +
theme_logo()+
# scale_x_continuous(breaks=1:51, parse (text = colnames(tab_mt)) )
scale_x_continuous(breaks = 1:ncol(tab_mt)
, labels = colnames(tab_mt))+
scale_y_continuous( breaks = 1:my_ymax
, limits = my_ylim)
p0
# further customisation
p1 = p0 + theme(legend.position = "none"
, legend.title = element_blank()
, legend.text = element_text(size = my_lts)
, axis.text.x = element_text(size = my_xats, angle = 90)
, axis.text.y = element_text(size = my_yats, angle = 90))
p1
#**************
# Plot 2: for wild_type
# with custom x axis to reflect my aa positions
#**************
# sanity check: MUST BE TRUE
# for the correctnes of the x axis
identical(colnames(tab_mt), colnames(tab_wt))
identical(ncol(tab_mt), ncol(tab_wt))
p2 = ggseqlogo(tab_wt
, method = 'custom'
, seq_type = 'aa'
# , col_scheme = "taylor"
# , col_scheme = chemistry2
) +
# ylab('my custom height') +
theme(axis.text.x = element_blank()
, axis.text.y = element_blank()) +
theme_logo() +
scale_x_continuous(breaks = 1:ncol(tab_wt)
, labels = colnames(tab_wt)) +
scale_y_continuous( breaks = 0:1
, limits = my_ylim )
p2
# further customise
p3 = p2 +
theme(legend.position = "bottom"
, legend.text = element_text(size = my_lts)
, axis.text.x = element_text(size = my_ats
, angle = 90)
, axis.text.y = element_blank())
p3
# Now combine using cowplot, which ensures the plots are aligned
suppressMessages( require(cowplot) )
plot_grid(p1, p3, ncol = 1, align = 'v') #+
# background_grid(minor = "xy"
# , size.minor = 1
# , colour.minor = "grey86")
#colour scheme
#https://rdrr.io/cran/ggseqlogo/src/R/col_schemes.r

View file

@ -1,208 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
############################################################
# 1: Installing and loading required packages and functions
############################################################
#source("../Header_TT.R")
#source("../barplot_colour_function.R")
#library(tidyverse)
###########################
#2: Read file: normalised file, output of step 4 mcsm pipeline
###########################
#my_df <- read.csv("../../Data/mcsm_complex1_normalised.csv"
# , row.names = 1
# , stringsAsFactors = F
# , header = T)
# call script combining_df
source("../combining_two_df_lig.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
# from Plotting to Scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA for pyrazinamide:
#merged_df2
#merged_df2_comp
# df without NA for pyrazinamide:
#merged_df3
#merged_df3_comp
#==========================
###########################
# Data to choose:
# We will be using the small dfs
# to generate the coloured axis
###########################
# uncomment as necessary
#!!!!!!!!!!!!!!!!!!!!!!!
# REASSIGNMENT
my_df = merged_df3
#my_df = merged_df3_comp
#!!!!!!!!!!!!!!!!!!!!!!!
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
str(my_df)
my_df$Position
c1 = my_df[my_df$Mutationinformation == "A134V",]
# order my_df by Position
my_df_o = my_df[order(my_df$Position),]
head(my_df_o$Position); tail(my_df_o$Position)
c2 = my_df_o[my_df_o$Mutationinformation == "A134V",]
# sanity check
if (sum(table(c1 == c2)) == ncol(my_df)){
print ("Sanity check passsd")
}else{
print ("Error!: Please debug your code")
}
rm(my_df, c1, c2)
# create a new df with unique position numbers and cols
Position = unique(my_df_o$Position)
Position_cols = as.data.frame(Position)
head(Position_cols) ; tail(Position_cols)
# specify active site residues and bg colour
Position = c(49, 51, 57, 71
, 8, 96, 138
, 13, 68
, 103, 137
, 133, 134) #13
lab_bg = rep(c("purple"
, "yellow"
, "cornflowerblue"
, "blue"
, "green"), times = c(4, 3, 2, 2, 2)
)
# second bg colour for active site residues
#lab_bg2 = rep(c("white"
# , "green" , "white", "green"
# , "white"
# , "white"
# , "white"), times = c(4
# , 1, 1, 1
# , 2
# , 2
# , 2)
#)
#%%%%%%%%%
# revised: leave the second box coloured as the first one in case there is no second colour
#%%%%%%%%%
lab_bg2 = rep(c("purple"
, "green", "yellow", "green"
, "cornflowerblue"
, "blue"
, "green"), times = c(4
, 1, 1, 1
, 2
, 2
, 2))
# fg colour for labels for active site residues
lab_fg = rep(c("white"
, "black"
, "black"
, "white"
, "black"), times = c(4, 3, 2, 2, 2))
#%%%%%%%%%
# revised: make the purple ones black
# fg colour for labels for active site residues
#%%%%%%%%%
#lab_fg = rep(c("black"
# , "black"
# , "black"
# , "white"
# , "black"), times = c(4, 3, 2, 2, 2))
# combined df with active sites, bg and fg colours
aa_cols_ref = data.frame(Position
, lab_bg
, lab_bg2
, lab_fg
, stringsAsFactors = F) #13, 4
str(Position_cols); class(Position_cols)
str(aa_cols_ref); class(aa_cols_ref)
# since Position is int and numeric in the two dfs resp,
# converting numeric to int for consistency
aa_cols_ref$Position = as.integer(aa_cols_ref$Position)
class(aa_cols_ref$Position)
#===========
# Merge 1: merging Positions df (Position_cols) and
# active site cols (aa_cols_ref)
# linking column: "Position"
# This is so you can have colours defined for all 130 positions
#===========
head(Position_cols$Position); head(aa_cols_ref$Position)
mut_pos_cols = merge(Position_cols, aa_cols_ref
, by = "Position"
, all.x = TRUE)
head(mut_pos_cols)
# replace NA's
# :column "lab_bg" with "white"
# : column "lab_fg" with "black"
mut_pos_cols$lab_bg[is.na(mut_pos_cols$lab_bg)] <- "white"
mut_pos_cols$lab_bg2[is.na(mut_pos_cols$lab_bg2)] <- "white"
mut_pos_cols$lab_fg[is.na(mut_pos_cols$lab_fg)] <- "black"
head(mut_pos_cols)
#===========
# Merge 2: Merge mut_pos_cols with mcsm df
# Now combined the 130 positions with aa colours with
# the mcsm_data
#===========
# dfs to merge
df0 = my_df_o
df1 = mut_pos_cols
# check the column on which merge will be performed
head(df0$Position); tail(df0$Position)
head(df1$Position); tail(df1$Position)
# should now have 3 extra columns
my_df = merge(df0, df1
, by = "Position"
, all.x = TRUE)
# sanity check
my_df[my_df$Position == "49",]
my_df[my_df$Position == "13",]
my_df$Position
# clear variables
rm(aa_cols_ref
, df0
, df1
, my_df_o
, Position_cols
, lab_bg
, lab_bg2
, lab_fg
, Position
)

View file

@ -1,208 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
############################################################
# 1: Installing and loading required packages and functions
############################################################
#source("../Header_TT.R")
#source("../barplot_colour_function.R")
#library(tidyverse)
###########################
#2: Read file: normalised file, output of step 4 mcsm pipeline
###########################
#my_df <- read.csv("../../Data/mcsm_complex1_normalised.csv"
# , row.names = 1
# , stringsAsFactors = F
# , header = T)
# call script combining_df
source("../combining_two_df.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
# from Plotting to Scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA for pyrazinamide:
#merged_df2
#merged_df2_comp
# df without NA for pyrazinamide:
#merged_df3
#merged_df3_comp
#==========================
###########################
# Data to choose:
# We will be using the small dfs
# to generate the coloured axis
###########################
# uncomment as necessary
#!!!!!!!!!!!!!!!!!!!!!!!
# REASSIGNMENT
my_df = merged_df3
#my_df = merged_df3_comp
#!!!!!!!!!!!!!!!!!!!!!!!
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
str(my_df)
my_df$Position
c1 = my_df[my_df$Mutationinformation == "L4S",]
# order my_df by Position
my_df_o = my_df[order(my_df$Position),]
head(my_df_o$Position); tail(my_df_o$Position)
c2 = my_df_o[my_df_o$Mutationinformation == "L4S",]
# sanity check
if (sum(table(c1 == c2)) == ncol(my_df)){
print ("Sanity check passsd")
}else{
print ("Error!: Please debug your code")
}
rm(my_df, c1, c2)
# create a new df with unique position numbers and cols
Position = unique(my_df_o$Position) #130
Position_cols = as.data.frame(Position)
head(Position_cols) ; tail(Position_cols)
# specify active site residues and bg colour
Position = c(49, 51, 57, 71
, 8, 96, 138
, 13, 68
, 103, 137
, 133, 134) #13
lab_bg = rep(c("purple"
, "yellow"
, "cornflowerblue"
, "blue"
, "green"), times = c(4, 3, 2, 2, 2)
)
# second bg colour for active site residues
#lab_bg2 = rep(c("white"
# , "green" , "white", "green"
# , "white"
# , "white"
# , "white"), times = c(4
# , 1, 1, 1
# , 2
# , 2
# , 2)
#)
#%%%%%%%%%
# revised: leave the second box coloured as the first one in case there is no second colour
#%%%%%%%%%
lab_bg2 = rep(c("purple"
, "green", "yellow", "green"
, "cornflowerblue"
, "blue"
, "green"), times = c(4
, 1, 1, 1
, 2
, 2
, 2))
# fg colour for labels for active site residues
lab_fg = rep(c("white"
, "black"
, "black"
, "white"
, "black"), times = c(4, 3, 2, 2, 2))
#%%%%%%%%%
# revised: make the purple ones black
# fg colour for labels for active site residues
#%%%%%%%%%
#lab_fg = rep(c("black"
# , "black"
# , "black"
# , "white"
# , "black"), times = c(4, 3, 2, 2, 2))
# combined df with active sites, bg and fg colours
aa_cols_ref = data.frame(Position
, lab_bg
, lab_bg2
, lab_fg
, stringsAsFactors = F) #13, 4
str(Position_cols); class(Position_cols)
str(aa_cols_ref); class(aa_cols_ref)
# since Position is int and numeric in the two dfs resp,
# converting numeric to int for consistency
aa_cols_ref$Position = as.integer(aa_cols_ref$Position)
class(aa_cols_ref$Position)
#===========
# Merge 1: merging Positions df (Position_cols) and
# active site cols (aa_cols_ref)
# linking column: "Position"
# This is so you can have colours defined for all 130 positions
#===========
head(Position_cols$Position); head(aa_cols_ref$Position)
mut_pos_cols = merge(Position_cols, aa_cols_ref
, by = "Position"
, all.x = TRUE)
head(mut_pos_cols)
# replace NA's
# :column "lab_bg" with "white"
# : column "lab_fg" with "black"
mut_pos_cols$lab_bg[is.na(mut_pos_cols$lab_bg)] <- "white"
mut_pos_cols$lab_bg2[is.na(mut_pos_cols$lab_bg2)] <- "white"
mut_pos_cols$lab_fg[is.na(mut_pos_cols$lab_fg)] <- "black"
head(mut_pos_cols)
#===========
# Merge 2: Merge mut_pos_cols with mcsm df
# Now combined the 130 positions with aa colours with
# the mcsm_data
#===========
# dfs to merge
df0 = my_df_o
df1 = mut_pos_cols
# check the column on which merge will be performed
head(df0$Position); tail(df0$Position)
head(df1$Position); tail(df1$Position)
# should now have 3 extra columns
my_df = merge(df0, df1
, by = "Position"
, all.x = TRUE)
# sanity check
my_df[my_df$Position == "49",]
my_df[my_df$Position == "13",]
my_df$Position
# clear variables
rm(aa_cols_ref
, df0
, df1
, my_df_o
, Position_cols
, lab_bg
, lab_bg2
, lab_fg
, Position
)

View file

@ -1,27 +0,0 @@
#########################
#3: Read complex pdb file
##########################
source("Header_TT.R")
# This script only reads the pdb file of your complex
# read in pdb file complex1
inDir = "~/git/Data/pyrazinamide/input/structure/"
inFile = paste0(inDir, "complex1_no_water.pdb")
complex1 = inFile
#inFile2 = paste0(inDir, "complex2_no_water.pdb")
#complex2 = inFile2
# list of 8
my_pdb = read.pdb(complex1
, maxlines = -1
, multi = FALSE
, rm.insert = FALSE
, rm.alt = TRUE
, ATOM.only = FALSE
, hex = FALSE
, verbose = TRUE)
rm(inDir, inFile, complex1)
#====== end of script

View file

@ -1,386 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("Header_TT.R")
#########################################################
# TASK: replace B-factors in the pdb file with normalised values
# use the complex file with no water as mCSM lig was
# performed on this file. You can check it in the script: read_pdb file.
#########################################################
###########################
# 2: Read file: average stability values
# or mcsm_normalised file, output of step 4 mcsm pipeline
###########################
inDir = "~/git/Data/pyrazinamide/input/processed/"
inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
my_df <- read.csv(inFile
# , row.names = 1
# , stringsAsFactors = F
, header = T)
str(my_df)
#=========================================================
# Processing P1: Replacing B factor with mean ratioDUET scores
#=========================================================
#########################
# Read complex pdb file
# form the R script
##########################
source("read_pdb.R") # list of 8
# extract atom list into a variable
# since in the list this corresponds to data frame, variable will be a df
d = my_pdb[[1]]
# make a copy: required for downstream sanity checks
d2 = d
# sanity checks: B factor
max(d$b); min(d$b)
#*******************************************
# plot histograms for inspection
# 1: original B-factors
# 2: original DUET Scores
# 3: replaced B-factors with DUET Scores
#*********************************************
# Set the margin on all sides
par(oma = c(3,2,3,0)
, mar = c(1,3,5,2)
, mfrow = c(3,2))
#par(mfrow = c(3,2))
#1: Original B-factor
hist(d$b
, xlab = ""
, main = "B-factor")
plot(density(d$b)
, xlab = ""
, main = "B-factor")
# 2: DUET scores
hist(my_df$average_DUETR
, xlab = ""
, main = "Norm_DUET")
plot(density(my_df$average_DUETR)
, xlab = ""
, main = "Norm_DUET")
# 3: After the following replacement
#********************************
#=========
# step 0_P1: DONT RUN once you have double checked the matched output
#=========
# sanity check: match and assign to a separate column to double check
# colnames(my_df)
# d$ratioDUET = my_df$average_DUETR[match(d$resno, my_df$Position)]
#=========
# step 1_P1
#=========
# Be brave and replace in place now (don't run sanity check)
# this makes all the B-factor values in the non-matched positions as NA
d$b = my_df$average_DUETR[match(d$resno, my_df$Position)]
#=========
# step 2_P1
#=========
# count NA in Bfactor
b_na = sum(is.na(d$b)) ; b_na
# count number of 0's in B-factor
sum(d$b == 0)
#table(d$b)
# replace all NA in b factor with 0
d$b[is.na(d$b)] = 0
# sanity check: should be 0
sum(is.na(d$b))
# sanity check: should be True
if (sum(d$b == 0) == b_na){
print ("Sanity check passed: NA's replaced with 0's successfully")
} else {
print("Error: NA replacement NOT successful, Debug code!")
}
max(d$b); min(d$b)
# sanity checks: should be True
if(max(d$b) == max(my_df$average_DUETR)){
print("Sanity check passed: B-factors replaced correctly")
} else {
print ("Error: Debug code please")
}
if (min(d$b) == min(my_df$average_DUETR)){
print("Sanity check passed: B-factors replaced correctly")
} else {
print ("Error: Debug code please")
}
#=========
# step 3_P1
#=========
# sanity check: dim should be same before reassignment
# should be TRUE
dim(d) == dim(d2)
#=========
# step 4_P1
#=========
# assign it back to the pdb file
my_pdb[[1]] = d
max(d$b); min(d$b)
#=========
# step 5_P1
#=========
# output dir
getwd()
outDir = "~/git/Data/pyrazinamide/input/structure/"
outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile
write.pdb(my_pdb, outFile)
#********************************
# Add the 3rd histogram and density plots for comparisons
#********************************
# Plots continued...
# 3: hist and density of replaced B-factors with DUET Scores
hist(d$b
, xlab = ""
, main = "repalced-B")
plot(density(d$b)
, xlab = ""
, main = "replaced-B")
# graph titles
mtext(text = "Frequency"
, side = 2
, line = 0
, outer = TRUE)
mtext(text = "DUET_stability"
, side = 3
, line = 0
, outer = TRUE)
#********************************
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# NOTE: This replaced B-factor distribution has the same
# x-axis as the PredAff normalised values, but the distribution
# is affected since 0 is overinflated. This is because all the positions
# where there are no SNPs have been assigned 0.
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
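# A small optional sketch (assumption: excluding the padded 0's gives a fairer
# visual comparison with the normalised DUET distribution); kept commented so it
# does not disturb the 3x2 panel layout above
#hist(d$b[d$b != 0], xlab = "", main = "replaced-B (non-zero only)")
#plot(density(d$b[d$b != 0]), xlab = "", main = "replaced-B (non-zero only)")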
#######################################################################
#====================== end of section 1 ==============================
#######################################################################
#=========================================================
# Processing P2: Replacing B values with PredAff Scores
#=========================================================
# clear workspace
rm(list = ls())
###########################
# 2: Read file: average stability values
# or mcsm_normalised file, output of step 4 mcsm pipeline
###########################
inDir = "~/git/Data/pyrazinamide/input/processed/"
inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
my_df <- read.csv(inFile
# , row.names = 1
# , stringsAsFactors = F
, header = T)
str(my_df)
#rm(inDir, inFile)
#########################
# 3: Read complex pdb file
# form the R script
##########################
source("read_pdb.R") # list of 8
# extract atom list into a variable
# since in the list this corresponds to data frame, variable will be a df
d = my_pdb[[1]]
# make a copy: required for downstream sanity checks
d2 = d
# sanity checks: B factor
max(d$b); min(d$b)
#*******************************************
# plot histograms for inspection
# 1: original B-factors
# 2: original Pred Aff Scores
# 3: replaced B-factors with PredAff Scores
#********************************************
# Set the margin on all sides
par(oma = c(3,2,3,0)
, mar = c(1,3,5,2)
, mfrow = c(3,2))
#par(mfrow = c(3,2))
# 1: Original B-factor
hist(d$b
, xlab = ""
, main = "B-factor")
plot(density(d$b)
, xlab = ""
, main = "B-factor")
# 2: Pred Aff scores
hist(my_df$average_PredAffR
, xlab = ""
, main = "Norm_lig_average")
plot(density(my_df$average_PredAffR)
, xlab = ""
, main = "Norm_lig_average")
# 3: After the following replacement
#********************************
#=================================================
# Processing P2: Replacing B values with ratioPredAff scores
#=================================================
# use match to perform this replacement linking with "position no"
# in the pdb file, this corresponds to column "resno"
# in my_df, this corresponds to column "Position"
#=========
# step 0_P2: DONT RUN once you have double checked the matched output
#=========
# sanity check: match and assign to a separate column to double check
# colnames(my_df)
# d$ratioPredAff = my_df$average_PredAffR[match(d$resno, my_df$Position)] #1384, 17
#=========
# step 1_P2: BE BRAVE and replace in place now (don't run step 0)
#=========
# this makes all the B-factor values in the non-matched positions as NA
d$b = my_df$average_PredAffR[match(d$resno, my_df$Position)]
#=========
# step 2_P2
#=========
# count NA in Bfactor
b_na = sum(is.na(d$b)) ; b_na
# count number of 0's in B-factor
sum(d$b == 0)
#table(d$b)
# replace all NA in b factor with 0
d$b[is.na(d$b)] = 0
# sanity check: should be 0
sum(is.na(d$b))
if (sum(d$b == 0) == b_na){
print ("Sanity check passed: NA's replaced with 0's successfully")
} else {
print("Error: NA replacement NOT successful, Debug code!")
}
max(d$b); min(d$b)
# sanity checks: should be True
if (max(d$b) == max(my_df$average_PredAffR)){
print("Sanity check passed: B-factors replaced correctly")
} else {
print ("Error: Debug code please")
}
if (min(d$b) == min(my_df$average_PredAffR)){
print("Sanity check passed: B-factors replaced correctly")
} else {
print ("Error: Debug code please")
}
#=========
# step 3_P2
#=========
# sanity check: dim should be same before reassignment
# should be TRUE
dim(d) == dim(d2)
#=========
# step 4_P2
#=========
# assign it back to the pdb file
my_pdb[[1]] = d
max(d$b); min(d$b)
#=========
# step 5_P2
#=========
# output dir
outDir = "~/git/Data/pyrazinamide/input/structure/"
outFile = paste0(outDir, "complex1_BwithNormLIG.pdb"); outFile
write.pdb(my_pdb, outFile)
#********************************
# Add the 3rd histogram and density plots for comparisons
#********************************
# Plots continued...
# 3: hist and density of replaced B-factors with PredAff Scores
hist(d$b
, xlab = ""
, main = "repalced-B")
plot(density(d$b)
, xlab = ""
, main = "replaced-B")
# graph titles
mtext(text = "Frequency"
, side = 2
, line = 0
, outer = TRUE)
mtext(text = "Lig_stability"
, side = 3
, line = 0
, outer = TRUE)
#********************************
###########
# end of output files with Bfactors
##########

View file

@ -1,257 +0,0 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
getwd()
#########################################################
# 1: Installing and loading required packages #
#########################################################
source("Header_TT.R")
#source("barplot_colour_function.R")
##########################################################
# Checking: Entire data frame and for PS #
##########################################################
###########################
# 2) Read file: the combined dataframe produced by the combining script
###########################
source("combining_two_df.R")
# df with NA:
# merged_df2
# merged_df3:
# df without NA:
# merged_df2_comp:
# merged_df3_comp:
######################
# You need to check it
# with the merged_df3
########################
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
my_df = merged_df3
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#clear variables
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# should be true
identical(my_df$Position, my_df$position)
#################################
# Read file: normalised file
# output of step 4 mcsm_pipeline
#################################
inDir = "~/git/Data/pyrazinamide/input/processed/"
inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile
mcsm_data <- read.csv(inFile
, row.names = 1
, stringsAsFactors = F
, header = T)
str(mcsm_data)
my_colnames = colnames(mcsm_data)
#====================================
# subset my_df to include only the columns in mcsm data
my_df2 = my_df[my_colnames]
#====================================
# compare the two
head(mcsm_data$Mutationinformation)
head(mcsm_data$Position)
head(my_df2$Mutationinformation)
head(my_df2$Position)
# sort mcsm data by Mutationinformation
mcsm_data_s = mcsm_data[order(mcsm_data$Mutationinformation),]
head(mcsm_data_s$Mutationinformation)
head(mcsm_data_s$Position)
# now compare: should be TRUE, but returns FALSE
# possibly because of differing rownames
identical(mcsm_data_s, my_df2)
# from library dplyr
setdiff(mcsm_data_s, my_df2)
# from library compare
compare(mcsm_data_s, my_df2) # seems rownames are the problem
# FIXME: automate this
# write files: checked using meld and files are indeed identical
#write.csv(mcsm_data_s, "mcsm_data_s.csv", row.names = F)
#write.csv(my_df2, "my_df2.csv", row.names = F)
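# A possible way to automate the rownames check above (a sketch, assuming the
# two dataframes differ only in their rownames): drop rownames and re-compare
mcsm_chk = mcsm_data_s; rownames(mcsm_chk) = NULL
my_df2_chk = my_df2; rownames(my_df2_chk) = NULL
if (isTRUE(all.equal(mcsm_chk, my_df2_chk))){
  print("Sanity check passed: dataframes match once rownames are dropped")
} else {
  print("Error: dataframes differ beyond rownames, Debug code!")
}
rm(mcsm_chk, my_df2_chk)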
#====================================================== end of section 1
##########################################################
# Checking: LIG(Filtered dataframe) #
##########################################################
# clear workspace
rm(list = ls())
###########################
#3) Read file: combined_lig from the script
###########################
source("combining_two_df_lig.R")
# df with NA:
# merged_df2 :
# merged_df3:
# df without NA:
# merged_df2_comp:
# merged_df3_comp:
######################
# You need to check it
# with the merged_df3
########################
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
my_df = merged_df3
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#clear variables
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# should be true
identical(my_df$Position, my_df$position)
#################################
# Read file: normalised file
# output of step 4 mcsm_pipeline
#################################
inDir = "~/git/Data/pyrazinamide/input/processed/"
inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile
mcsm_data <- read.csv(inFile
, row.names = 1
, stringsAsFactors = F
, header = T)
str(mcsm_data)
###########################
# 4a: Filter/subset data: ONLY for LIGand analysis
# Ligand plots use residues within 10 Ang of the ligand
# i.e. filter on Dis_lig_Ang < 10
###########################
# sanity checks
upos = unique(mcsm_data$Position)
# check range of distances
max(mcsm_data$Dis_lig_Ang)
min(mcsm_data$Dis_lig_Ang)
# Lig filtered: subset data to have only values less than 10 Ang
mcsm_data2 = subset(mcsm_data, mcsm_data$Dis_lig_Ang < 10)
rm(mcsm_data) #to avoid confusion
table(mcsm_data2$Dis_lig_Ang<10)
table(mcsm_data2$Dis_lig_Ang>10)
max(mcsm_data2$Dis_lig_Ang)
min(mcsm_data2$Dis_lig_Ang)
upos_f = unique(mcsm_data2$Position); upos_f
# colnames of df that you will need to subset the bigger df from
my_colnames = colnames(mcsm_data2)
#====================================
# subset bigger df i.e my_df to include only the columns in mcsm data2
my_df2 = my_df[my_colnames]
rm(my_df) #to avoid confusion
#====================================
# compare the two
head(mcsm_data2$Mutationinformation)
head(mcsm_data2$Position)
head(my_df2$Mutationinformation)
head(my_df2$Position)
# sort mcsm data by Mutationinformation
mcsm_data2_s = mcsm_data2[order(mcsm_data2$Mutationinformation),]
head(mcsm_data2_s$Mutationinformation)
head(mcsm_data2_s$Position)
# now compare: should be TRUE, but returns FALSE
# possibly because of differing rownames
identical(mcsm_data2_s, my_df2)
# from library dplyr
setdiff(mcsm_data2_s, my_df2)
# from library compare
compare(mcsm_data2_s, my_df2) # seems rownames are the problem
#FIXME: automate this
# write files: checked using meld and files are indeed identical
#write.csv(mcsm_data2_s, "mcsm_data2_s.csv", row.names = F)
#write.csv(my_df2, "my_df2.csv", row.names = F)
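# As in the earlier section, a rownames-free comparison (sketch, same
# assumption that only rownames differ) could replace the manual meld check:
# isTRUE(all.equal(`rownames<-`(mcsm_data2_s, NULL), `rownames<-`(my_df2, NULL)))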
##########################################################
# extract and write output file for SNP posn: all #
##########################################################
head(merged_df3$Position)
foo = merged_df3[order(merged_df3$Position),]
head(foo$Position)
snp_pos_unique = unique(foo$Position); snp_pos_unique
# sanity check:
table(snp_pos_unique == combined_df$Position)
#=====================
# write_output files
#=====================
outDir = "~/Data/pyrazinamide/input/processed/"
outFile1 = paste0(outDir, "snp_pos_unique.txt"); outFile1
print(paste0("Output file name and path will be:","", outFile1))
write.table(snp_pos_unique
, outFile1
, row.names = F
, col.names = F)
##############################################################
# extract and write output file for SNP posn: complete only #
##############################################################
head(merged_df3_comp$Position)
foo = merged_df3_comp[order(merged_df3_comp$Position),]
head(foo$Position)
snp_pos_unique = unique(foo$Position); snp_pos_unique
# outDir = "~/Data/pyrazinamide/input/processed/" # already set
outFile2 = paste0(outDir, "snp_pos_unique_comp.txt")
print(paste0("Output file name and path will be:", outFile2))
write.table(snp_pos_unique
, outFile2
, row.names = F
, col.names = F)
#============================== end of script

View file

@ -48,9 +48,10 @@ gene = 'pncA'
datadir = homedir + '/' + 'git/Data'
#=======
# input
# input from outdir
#=======
indir = datadir + '/' + drug + '/' + 'output'
#indir = datadir + '/' + drug + '/' + 'output'
outdir = datadir + '/' + drug + '/' + 'output'
#in_filename = 'pnca.dssp'
in_filename = gene.lower() +'.dssp'
infile = indir + '/' + in_filename

View file

@ -4,9 +4,6 @@
## Structure:
#
# $DATA_DIR/$DRUG/input
# |- original
# |- processed
# |- structure
#
# $DATA_DIR/$DRUG/output
# |- plots
@ -15,18 +12,17 @@
DATA_DIR=~/git/Data
if [[ $1 == '' ]]; then
echo "Error"
echo "usage: mk-drug-dirs.sh <drug name>";
exit;
else
DRUG=$1
echo Creating structure for: $DRUG
echo Creating directory structure for: $DRUG
if [ -d $DATA_DIR ]
then
echo Doing creation in $DATA_DIR
mkdir -p $DATA_DIR/$DRUG/input/original
mkdir -p $DATA_DIR/$DRUG/input/processed
mkdir -p $DATA_DIR/$DRUG/input/structure
mkdir -p $DATA_DIR/$DRUG/input
mkdir -p $DATA_DIR/$DRUG/output/plots
mkdir -p $DATA_DIR/$DRUG/output/structure

View file

@ -23,34 +23,31 @@ homedir = os.path.expanduser('~')
# set working dir
#os.getcwd()
#os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
#os.chdir(homedir + '/git/LSHTM_analysis/scripts')
#os.getcwd()
#=======================================================================
#%% variable assignment: input and output
drug = 'pyrazinamide'
gene = 'pncA'
gene_match = gene + '_p.'
#drug = 'pyrazinamide'
#gene = 'pncA'
#gene_match = gene + '_p.'
#==========
# data dir
#==========
#indir = 'git/Data/pyrazinamide/input/original'
datadir = homedir + '/' + 'git/Data'
#=======
# input
#=======
indir = datadir + '/' + drug + 'input'
in_filename = 'aa_codes.csv'
infile = indir + '/' + in_filename
infile = datadir + '/' + in_filename
print('Input filename:', in_filename
, '\nInput path:', indir
, '\nInput path:', datadir
, '\n============================================================')
#=======
# output: No output
#=======
#outdir = datadir + '/' + drug + '/' + 'output'
#out_filename = ''
#outfile = outdir + '/' + out_filename
@ -76,6 +73,7 @@ my_aa = my_aa.set_index('three_letter_code_lower') #20, 5
# using 'index' creates a dict of dicts
# using 'records' creates a list of dicts
my_aa_dict = my_aa.to_dict('index') #20, with 5 subkeys
print('Printing my_aa_dict:', my_aa_dict.keys())
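# Toy illustration (assumed example, not the real aa_codes data) of the two
# to_dict() orientations:
# pd.DataFrame({'code': ['ala'], 'aa': ['A']}).set_index('code').to_dict('index')
#   -> {'ala': {'aa': 'A'}}          # dict of dicts keyed by the index
# pd.DataFrame({'code': ['ala'], 'aa': ['A']}).to_dict('records')
#   -> [{'code': 'ala', 'aa': 'A'}]  # list with one dict per row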
#================================================
# dict of aa with their corresponding properties