import commit
This commit is contained in:
commit
bccfe68192
39 changed files with 6837 additions and 0 deletions
512
mcsm_analysis/pyrazinamide/scripts/.Rhistory
Normal file
512
mcsm_analysis/pyrazinamide/scripts/.Rhistory
Normal file
|
@ -0,0 +1,512 @@
|
|||
###########################
|
||||
# you need merged_df3
|
||||
# or
|
||||
# merged_df3_comp
|
||||
# since these have unique SNPs
|
||||
# I prefer to use the merged_df3
|
||||
# because using the _comp dataset means
|
||||
# we lose some muts and at this level, we should use
|
||||
# as much info as available
|
||||
###########################
|
||||
# uncomment as necessary
|
||||
#%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
# REASSIGNMENT
|
||||
my_df = merged_df3
|
||||
#my_df = merged_df3_comp
|
||||
#%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
# delete variables not required
|
||||
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
||||
# quick checks
|
||||
colnames(my_df)
|
||||
str(my_df)
|
||||
###########################
|
||||
# Data for bfactor figure
|
||||
# PS average
|
||||
# Lig average
|
||||
###########################
|
||||
head(my_df$Position)
|
||||
head(my_df$ratioDUET)
|
||||
# order data frame
|
||||
df = my_df[order(my_df$Position),]
|
||||
head(df$Position)
|
||||
head(df$ratioDUET)
|
||||
#***********
|
||||
# PS: average by position
|
||||
#***********
|
||||
mean_DUET_by_position <- df %>%
|
||||
group_by(Position) %>%
|
||||
summarize(averaged.DUET = mean(ratioDUET))
|
||||
#***********
|
||||
# Lig: average by position
|
||||
#***********
|
||||
mean_Lig_by_position <- df %>%
|
||||
group_by(Position) %>%
|
||||
summarize(averaged.Lig = mean(ratioPredAff))
|
||||
#***********
|
||||
# cbind:mean_DUET_by_position and mean_Lig_by_position
|
||||
#***********
|
||||
combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position ))
|
||||
# sanity check
|
||||
# mean_PS_Lig_Bfactor
|
||||
colnames(combined)
|
||||
colnames(combined) = c("Position"
|
||||
, "average_DUETR"
|
||||
, "Position2"
|
||||
, "average_PredAffR")
|
||||
colnames(combined)
|
||||
identical(combined$Position, combined$Position2)
|
||||
n = which(colnames(combined) == "Position2"); n
|
||||
combined_df = combined[,-n]
|
||||
max(combined_df$average_DUETR) ; min(combined_df$average_DUETR)
|
||||
max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR)
|
||||
#=============
|
||||
# output csv
|
||||
#============
|
||||
outDir = "~/Data/pyrazinamide/input/processed/"
|
||||
outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv")
|
||||
print(paste0("Output file with path will be:","", outFile))
|
||||
head(combined_df$Position); tail(combined_df$Position)
|
||||
write.csv(combined_df, outFile
|
||||
, row.names = F)
|
||||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
|
||||
getwd()
|
||||
########################################################################
|
||||
# Installing and loading required packages #
|
||||
########################################################################
|
||||
source("../Header_TT.R")
|
||||
#source("barplot_colour_function.R")
|
||||
require(data.table)
|
||||
require(dplyr)
|
||||
########################################################################
|
||||
# Read file: call script for combining df for PS #
|
||||
########################################################################
|
||||
source("../combining_two_df.R")
|
||||
###########################
|
||||
# This will return:
|
||||
# df with NA:
|
||||
# merged_df2
|
||||
# merged_df3
|
||||
# df without NA:
|
||||
# merged_df2_comp
|
||||
# merged_df3_comp
|
||||
###########################
|
||||
#---------------------- PAY ATTENTION
|
||||
# the above changes the working dir
|
||||
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
|
||||
#---------------------- PAY ATTENTION
|
||||
###########################
|
||||
# you need merged_df3
|
||||
# or
|
||||
# merged_df3_comp
|
||||
# since these have unique SNPs
|
||||
# I prefer to use the merged_df3
|
||||
# because using the _comp dataset means
|
||||
# we lose some muts and at this level, we should use
|
||||
# as much info as available
|
||||
###########################
|
||||
# uncomment as necessary
|
||||
#%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
# REASSIGNMENT
|
||||
my_df = merged_df3
|
||||
#my_df = merged_df3_comp
|
||||
#%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
# delete variables not required
|
||||
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
||||
# quick checks
|
||||
colnames(my_df)
|
||||
str(my_df)
|
||||
###########################
|
||||
# Data for bfactor figure
|
||||
# PS average
|
||||
# Lig average
|
||||
###########################
|
||||
head(my_df$Position)
|
||||
head(my_df$ratioDUET)
|
||||
# order data frame
|
||||
df = my_df[order(my_df$Position),]
|
||||
head(df$Position)
|
||||
head(df$ratioDUET)
|
||||
#***********
|
||||
# PS: average by position
|
||||
#***********
|
||||
mean_DUET_by_position <- df %>%
|
||||
group_by(Position) %>%
|
||||
summarize(averaged.DUET = mean(ratioDUET))
|
||||
#***********
|
||||
# Lig: average by position
|
||||
#***********
|
||||
mean_Lig_by_position <- df %>%
|
||||
group_by(Position) %>%
|
||||
summarize(averaged.Lig = mean(ratioPredAff))
|
||||
#***********
|
||||
# cbind:mean_DUET_by_position and mean_Lig_by_position
|
||||
#***********
|
||||
combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position ))
|
||||
# sanity check
|
||||
# mean_PS_Lig_Bfactor
|
||||
colnames(combined)
|
||||
colnames(combined) = c("Position"
|
||||
, "average_DUETR"
|
||||
, "Position2"
|
||||
, "average_PredAffR")
|
||||
colnames(combined)
|
||||
identical(combined$Position, combined$Position2)
|
||||
n = which(colnames(combined) == "Position2"); n
|
||||
combined_df = combined[,-n]
|
||||
max(combined_df$average_DUETR) ; min(combined_df$average_DUETR)
|
||||
max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR)
|
||||
#=============
|
||||
# output csv
|
||||
#============
|
||||
outDir = "~/git/Data/pyrazinamide/input/processed/"
|
||||
outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv")
|
||||
print(paste0("Output file with path will be:","", outFile))
|
||||
head(combined_df$Position); tail(combined_df$Position)
|
||||
write.csv(combined_df, outFile
|
||||
, row.names = F)
|
||||
# read in pdb file complex1
|
||||
inDir = "~/git/Data/pyrazinamide/input/structure"
|
||||
inFile = paste0(inDir, "complex1_no_water.pdb")
|
||||
# read in pdb file complex1
|
||||
inDir = "~/git/Data/pyrazinamide/input/structure/"
|
||||
inFile = paste0(inDir, "complex1_no_water.pdb")
|
||||
complex1 = inFile
|
||||
my_pdb = read.pdb(complex1
|
||||
, maxlines = -1
|
||||
, multi = FALSE
|
||||
, rm.insert = FALSE
|
||||
, rm.alt = TRUE
|
||||
, ATOM.only = FALSE
|
||||
, hex = FALSE
|
||||
, verbose = TRUE)
|
||||
#########################
|
||||
#3: Read complex pdb file
|
||||
##########################
|
||||
source("Header_TT.R")
|
||||
# list of 8
|
||||
my_pdb = read.pdb(complex1
|
||||
, maxlines = -1
|
||||
, multi = FALSE
|
||||
, rm.insert = FALSE
|
||||
, rm.alt = TRUE
|
||||
, ATOM.only = FALSE
|
||||
, hex = FALSE
|
||||
, verbose = TRUE)
|
||||
rm(inDir, inFile)
|
||||
#====== end of script
|
||||
inDir = "~/git/Data/pyrazinamide/input/structure/"
|
||||
inFile = paste0(inDir, "complex1_no_water.pdb")
|
||||
complex1 = inFile
|
||||
complex1 = inFile
|
||||
my_pdb = read.pdb(complex1
|
||||
, maxlines = -1
|
||||
, multi = FALSE
|
||||
, rm.insert = FALSE
|
||||
, rm.alt = TRUE
|
||||
, ATOM.only = FALSE
|
||||
, hex = FALSE
|
||||
, verbose = TRUE)
|
||||
inFile
|
||||
inDir = "~/git/Data/pyrazinamide/input/structure/"
|
||||
inFile = paste0(inDir, "complex1_no_water.pdb")
|
||||
complex1 = inFile
|
||||
#inFile2 = paste0(inDir, "complex2_no_water.pdb")
|
||||
#complex2 = inFile2
|
||||
# list of 8
|
||||
my_pdb = read.pdb(complex1
|
||||
, maxlines = -1
|
||||
, multi = FALSE
|
||||
, rm.insert = FALSE
|
||||
, rm.alt = TRUE
|
||||
, ATOM.only = FALSE
|
||||
, hex = FALSE
|
||||
, verbose = TRUE)
|
||||
rm(inDir, inFile, complex1)
|
||||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
|
||||
getwd()
|
||||
source("Header_TT.R")
|
||||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
|
||||
getwd()
|
||||
########################################################################
|
||||
# Installing and loading required packages #
|
||||
########################################################################
|
||||
source("Header_TT.R")
|
||||
#########################################################
|
||||
# TASK: replace B-factors in the pdb file with normalised values
|
||||
# use the complex file with no water as mCSM lig was
|
||||
# performed on this file. You can check it in the script: read_pdb file.
|
||||
#########################################################
|
||||
###########################
|
||||
# 2: Read file: average stability values
|
||||
# or mcsm_normalised file, output of step 4 mcsm pipeline
|
||||
###########################
|
||||
inDir = "~/git/Data/pyrazinamide/input/processed/"
|
||||
inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
|
||||
my_df <- read.csv(inFile
|
||||
# , row.names = 1
|
||||
# , stringsAsFactors = F
|
||||
, header = T)
|
||||
str(my_df)
|
||||
source("read_pdb.R") # list of 8
|
||||
# extract atom list into a variable
|
||||
# since in the list this corresponds to data frame, variable will be a df
|
||||
d = my_pdb[[1]]
|
||||
# make a copy: required for downstream sanity checks
|
||||
d2 = d
|
||||
# sanity checks: B factor
|
||||
max(d$b); min(d$b)
|
||||
par(oma = c(3,2,3,0)
|
||||
, mar = c(1,3,5,2)
|
||||
, mfrow = c(3,2))
|
||||
#par(mfrow = c(3,2))
|
||||
#1: Original B-factor
|
||||
hist(d$b
|
||||
, xlab = ""
|
||||
, main = "B-factor")
|
||||
plot(density(d$b)
|
||||
, xlab = ""
|
||||
, main = "B-factor")
|
||||
# 2: DUET scores
|
||||
hist(my_df$average_DUETR
|
||||
, xlab = ""
|
||||
, main = "Norm_DUET")
|
||||
plot(density(my_df$average_DUETR)
|
||||
, xlab = ""
|
||||
, main = "Norm_DUET")
|
||||
# Set the margin on all sides
|
||||
par(oma = c(3,2,3,0)
|
||||
, mar = c(1,3,5,2)
|
||||
, mfrow = c(3,2))
|
||||
#par(mfrow = c(3,2))
|
||||
#1: Original B-factor
|
||||
hist(d$b
|
||||
, xlab = ""
|
||||
, main = "B-factor")
|
||||
plot(density(d$b)
|
||||
, xlab = ""
|
||||
, main = "B-factor")
|
||||
# 2: DUET scores
|
||||
hist(my_df$average_DUETR
|
||||
, xlab = ""
|
||||
, main = "Norm_DUET")
|
||||
plot(density(my_df$average_DUETR)
|
||||
, xlab = ""
|
||||
, main = "Norm_DUET")
|
||||
#=========
|
||||
# step 1_P1
|
||||
#=========
|
||||
# Be brave and replace in place now (don't run sanity check)
|
||||
# this makes all the B-factor values in the non-matched positions as NA
|
||||
d$b = my_df$average_DUETR[match(d$resno, my_df$Position)]
|
||||
#=========
|
||||
# step 2_P1
|
||||
#=========
|
||||
# count NA in Bfactor
|
||||
b_na = sum(is.na(d$b)) ; b_na
|
||||
# count number of 0's in B-factor
|
||||
sum(d$b == 0)
|
||||
# replace all NA in b factor with 0
|
||||
d$b[is.na(d$b)] = 0
|
||||
# sanity check: should be 0
|
||||
sum(is.na(d$b))
|
||||
# sanity check: should be True
|
||||
if (sum(d$b == 0) == b_na){
|
||||
print ("Sanity check passed: NA's replaced with 0's successfully")
|
||||
} else {
|
||||
print("Error: NA replacement NOT successful, Debug code!")
|
||||
}
|
||||
max(d$b); min(d$b)
|
||||
# sanity checks: should be True
|
||||
if(max(d$b) == max(my_df$average_DUETR)){
|
||||
print("Sanity check passed: B-factors replaced correctly")
|
||||
} else {
|
||||
print ("Error: Debug code please")
|
||||
}
|
||||
if (min(d$b) == min(my_df$average_DUETR)){
|
||||
print("Sanity check passed: B-factors replaced correctly")
|
||||
} else {
|
||||
print ("Error: Debug code please")
|
||||
}
|
||||
#=========
|
||||
# step 3_P1
|
||||
#=========
|
||||
# sanity check: dim should be same before reassignment
|
||||
# should be TRUE
|
||||
dim(d) == dim(d2)
|
||||
#=========
|
||||
# step 4_P1
|
||||
#=========
|
||||
# assign it back to the pdb file
|
||||
my_pdb[[1]] = d
|
||||
max(d$b); min(d$b)
|
||||
#=========
|
||||
# step 5_P1
|
||||
#=========
|
||||
# output dir
|
||||
getwd()
|
||||
outDir = "~/git/Data/pyrazinamide/output/"
|
||||
getwd()
|
||||
outFile = paste0(outDir, "complex1_BwithNormDUET.pdb")
|
||||
outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile
|
||||
outDir = "~/git/Data/pyrazinamide/input/structure"
|
||||
outDir = "~/git/Data/pyrazinamide/input/structure/"
|
||||
outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile
|
||||
write.pdb(my_pdb, outFile)
|
||||
hist(d$b
|
||||
, xlab = ""
|
||||
, main = "repalced-B")
|
||||
plot(density(d$b)
|
||||
, xlab = ""
|
||||
, main = "replaced-B")
|
||||
# graph titles
|
||||
mtext(text = "Frequency"
|
||||
, side = 2
|
||||
, line = 0
|
||||
, outer = TRUE)
|
||||
mtext(text = "DUET_stability"
|
||||
, side = 3
|
||||
, line = 0
|
||||
, outer = TRUE)
|
||||
#=========================================================
|
||||
# Processing P2: Replacing B values with PredAff Scores
|
||||
#=========================================================
|
||||
# clear workspace
|
||||
rm(list = ls())
|
||||
#=========================================================
|
||||
# Processing P2: Replacing B values with PredAff Scores
|
||||
#=========================================================
|
||||
# clear workspace
|
||||
rm(list = ls())
|
||||
###########################
|
||||
# 2: Read file: average stability values
|
||||
# or mcsm_normalised file, output of step 4 mcsm pipeline
|
||||
###########################
|
||||
inDir = "~/git/Data/pyrazinamide/input/processed/"
|
||||
inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
|
||||
my_df <- read.csv("../Data/mean_PS_Lig_Bfactor.csv"
|
||||
# , row.names = 1
|
||||
# , stringsAsFactors = F
|
||||
, header = T)
|
||||
str(my_df)
|
||||
#=========================================================
|
||||
# Processing P2: Replacing B factor with mean ratioLig scores
|
||||
#=========================================================
|
||||
#########################
|
||||
# 3: Read complex pdb file
|
||||
# from the R script
|
||||
##########################
|
||||
source("read_pdb.R") # list of 8
|
||||
# extract atom list into a vari
|
||||
inDir = "~/git/Data/pyrazinamide/input/processed/"
|
||||
inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
|
||||
my_df <- read.csv(inFile
|
||||
# , row.names = 1
|
||||
# , stringsAsFactors = F
|
||||
, header = T)
|
||||
str(my_df)
|
||||
# extract atom list into a variable
|
||||
# since in the list this corresponds to data frame, variable will be a df
|
||||
d = my_pdb[[1]]
|
||||
# make a copy: required for downstream sanity checks
|
||||
d2 = d
|
||||
# sanity checks: B factor
|
||||
max(d$b); min(d$b)
|
||||
par(oma = c(3,2,3,0)
|
||||
, mar = c(1,3,5,2)
|
||||
, mfrow = c(3,2))
|
||||
#par(mfrow = c(3,2))
|
||||
# 1: Original B-factor
|
||||
hist(d$b
|
||||
, xlab = ""
|
||||
, main = "B-factor")
|
||||
plot(density(d$b)
|
||||
, xlab = ""
|
||||
, main = "B-factor")
|
||||
# 2: Pred Aff scores
|
||||
hist(my_df$average_PredAffR
|
||||
, xlab = ""
|
||||
, main = "Norm_lig_average")
|
||||
plot(density(my_df$average_PredAffR)
|
||||
, xlab = ""
|
||||
, main = "Norm_lig_average")
|
||||
# 3: After the following replacement
|
||||
#********************************
|
||||
par(oma = c(3,2,3,0)
|
||||
, mar = c(1,3,5,2)
|
||||
, mfrow = c(3,2))
|
||||
#par(mfrow = c(3,2))
|
||||
# 1: Original B-factor
|
||||
hist(d$b
|
||||
, xlab = ""
|
||||
, main = "B-factor")
|
||||
plot(density(d$b)
|
||||
, xlab = ""
|
||||
, main = "B-factor")
|
||||
# 2: Pred Aff scores
|
||||
hist(my_df$average_PredAffR
|
||||
, xlab = ""
|
||||
, main = "Norm_lig_average")
|
||||
plot(density(my_df$average_PredAffR)
|
||||
, xlab = ""
|
||||
, main = "Norm_lig_average")
|
||||
# 3: After the following replacement
|
||||
#********************************
|
||||
#=========
|
||||
# step 1_P2: BE BRAVE and replace in place now (don't run step 0)
|
||||
#=========
|
||||
# this makes all the B-factor values in the non-matched positions as NA
|
||||
d$b = my_df$average_PredAffR[match(d$resno, my_df$Position)]
|
||||
#=========
|
||||
# step 2_P2
|
||||
#=========
|
||||
# count NA in Bfactor
|
||||
b_na = sum(is.na(d$b)) ; b_na
|
||||
# count number of 0's in B-factor
|
||||
sum(d$b == 0)
|
||||
# replace all NA in b factor with 0
|
||||
d$b[is.na(d$b)] = 0
|
||||
# sanity check: should be 0
|
||||
sum(is.na(d$b))
|
||||
if (sum(d$b == 0) == b_na){
|
||||
print ("Sanity check passed: NA's replaced with 0's successfully")
|
||||
} else {
|
||||
print("Error: NA replacement NOT successful, Debug code!")
|
||||
}
|
||||
max(d$b); min(d$b)
|
||||
# sanity checks: should be True
|
||||
if (max(d$b) == max(my_df$average_PredAffR)){
|
||||
print("Sanity check passed: B-factors replaced correctly")
|
||||
} else {
|
||||
print ("Error: Debug code please")
|
||||
}
|
||||
if (min(d$b) == min(my_df$average_PredAffR)){
|
||||
print("Sanity check passed: B-factors replaced correctly")
|
||||
} else {
|
||||
print ("Error: Debug code please")
|
||||
}
|
||||
#=========
|
||||
# step 3_P2
|
||||
#=========
|
||||
# sanity check: dim should be same before reassignment
|
||||
# should be TRUE
|
||||
dim(d) == dim(d2)
|
||||
#=========
|
||||
# step 4_P2
|
||||
#=========
|
||||
# assign it back to the pdb file
|
||||
my_pdb[[1]] = d
|
||||
max(d$b); min(d$b)
|
||||
#=========
|
||||
# step 5_P2
|
||||
#=========
|
||||
write.pdb(my_pdb, "Plotting/structure/complex1_BwithNormLIG.pdb")
|
||||
# output dir
|
||||
getwd()
|
||||
# output dir
|
||||
outDir = "~/git/Data/pyrazinamide/input/structure/"
|
||||
outFile = paste0(outDir, "complex1_BwithNormLIG.pdb")
|
||||
outFile = paste0(outDir, "complex1_BwithNormLIG.pdb"); outFile
|
||||
write.pdb(my_pdb, outFile)
|
129
mcsm_analysis/pyrazinamide/scripts/Header_TT.R
Normal file
129
mcsm_analysis/pyrazinamide/scripts/Header_TT.R
Normal file
|
@ -0,0 +1,129 @@
|
|||
#########################################################
### A) Installing and loading required packages
#########################################################

#if (!require("gplots")) {
#  install.packages("gplots", dependencies = TRUE)
#  library(gplots)
#}

# Every package listed below is installed when it is not yet available and is
# then attached with library(), which stops with an error if the package still
# cannot be loaded (require() would only return FALSE and carry on).
#
# Fixes relative to the previous one-branch-per-package version:
#   * cowplot: installed under its real name (was "copwplot") and attached
#     (the branch attached ggplot2 instead)
#   * data.table: now installed when missing (the branch attached stats4)
#   * PerformanceAnalytics: attached under its real name
#     (was library(PerformaceAnalytics))
#   * dplyr, compare, arsenal: attached themselves (branches attached psych)
#
# The attach order is unchanged so masking between packages stays the same.
# bio3d (last entry) is the package used for PDB files.
#
# ggcorrplot can alternatively be installed from GitHub:
#if(!require(devtools)) install.packages("devtools")
#devtools::install_github("kassambara/ggcorrplot")
local({
  # packages that were installed with `dependencies = TRUE`; all others use
  # the install.packages() default. stats and stats4 ship with R, so the
  # install step is never reached for them.
  with_all_deps <- c(
    "tidyverse", "ggplot2", "cowplot", "ggcorrplot", "ggpubr",
    "RColorBrewer", "VennDiagram", "scales", "plotrix",
    "stats", "stats4", "PerformanceAnalytics"
  )

  pkgs <- c(
    "tidyverse", "ggplot2", "cowplot", "ggcorrplot", "ggpubr",
    "RColorBrewer", "GOplot", "VennDiagram", "scales", "plotrix",
    "stats", "stats4", "data.table", "PerformanceAnalytics",
    "GGally", "corrr", "psych", "dplyr", "compare", "arsenal",
    "bio3d"
  )

  for (pkg in pkgs) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
      install.packages(
        pkg,
        dependencies = if (pkg %in% with_all_deps) TRUE else NA
      )
    }
    # library() attaches to the search path, so this is visible outside local()
    library(pkg, character.only = TRUE)
  }
})
|
27
mcsm_analysis/pyrazinamide/scripts/barplot_colour_function.R
Normal file
27
mcsm_analysis/pyrazinamide/scripts/barplot_colour_function.R
Normal file
|
@ -0,0 +1,27 @@
|
|||
#########################################################
|
||||
# 1b: Define function: coloured barplot by subgroup
|
||||
# LINK: https://stackoverflow.com/questions/49818271/stacked-barplot-with-colour-gradients-for-each-bar
|
||||
#########################################################
|
||||
|
||||
ColourPalleteMulti <- function(df, group, subgroup){
  # Build one colour gradient per `group` level, with as many shades in each
  # gradient as that group has distinct `subgroup` values.
  #   df       : data frame holding both columns
  #   group    : name (string) of the grouping column
  #   subgroup : name (string) of the column counted within each group
  # Returns a character vector of colours, gradients concatenated in the row
  # order produced by aggregate().

  # number of distinct subgroup values per group (column 2 of the result)
  model <- as.formula(paste(subgroup, group, sep = "~"))
  counts <- aggregate(model, df, function(v) length(unique(v)))
  n_groups <- nrow(counts)

  # one hue per group: light shade (l = 100) at the top of each gradient,
  # dark shade (l = 40) at the bottom
  light_ends <- scales::hue_pal(l = 100)(n_groups)
  dark_ends <- scales::hue_pal(l = 40)(n_groups)

  # interpolate from the light to the dark shade for group k
  ramp_for <- function(k) {
    colorRampPalette(colors = c(light_ends[k], dark_ends[k]))(counts[k, 2])
  }

  unlist(lapply(seq_len(n_groups), ramp_for))
}
|
||||
#########################################################
|
299
mcsm_analysis/pyrazinamide/scripts/combining_two_df.R
Normal file
299
mcsm_analysis/pyrazinamide/scripts/combining_two_df.R
Normal file
|
@ -0,0 +1,299 @@
|
|||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/")
|
||||
getwd()
|
||||
|
||||
#########################################################
|
||||
# TASK: To combine mcsm and meta data with af and or
|
||||
#########################################################
|
||||
|
||||
|
||||
########################################################################
|
||||
# Installing and loading required packages #
|
||||
########################################################################
|
||||
|
||||
source("Header_TT.R")
|
||||
#require(data.table)
|
||||
#require(arsenal)
|
||||
#require(compare)
|
||||
#library(tidyverse)
|
||||
|
||||
#################################
|
||||
# Read file: normalised file
|
||||
# output of step 4 mcsm_pipeline
|
||||
#################################
|
||||
|
||||
# Read the normalised mCSM file; the first csv column supplies the row names.
inDir <- "~/git/Data/pyrazinamide/input/processed/"
inFile <- paste0(inDir, "mcsm_complex1_normalised.csv")
inFile

mcsm_data <- read.csv(inFile,
                      row.names = 1,
                      stringsAsFactors = FALSE,
                      header = TRUE)
rm(inDir, inFile)

str(mcsm_data)

table(mcsm_data$DUET_outcome)
sum(table(mcsm_data$DUET_outcome))

# spelling Correction 1: DUET
mcsm_data$DUET_outcome[mcsm_data$DUET_outcome == "Stabilizing"] <- "Stabilising"
mcsm_data$DUET_outcome[mcsm_data$DUET_outcome == "Destabilizing"] <- "Destabilising"

# checks: the total should be the same as above
table(mcsm_data$DUET_outcome)
sum(table(mcsm_data$DUET_outcome))
head(mcsm_data$DUET_outcome)
tail(mcsm_data$DUET_outcome)

# spelling Correction 2: Ligand
table(mcsm_data$Lig_outcome)
sum(table(mcsm_data$Lig_outcome))

mcsm_data$Lig_outcome[mcsm_data$Lig_outcome == "Stabilizing"] <- "Stabilising"
mcsm_data$Lig_outcome[mcsm_data$Lig_outcome == "Destabilizing"] <- "Destabilising"

# checks: the total should be the same as above
table(mcsm_data$Lig_outcome)
sum(table(mcsm_data$Lig_outcome))
head(mcsm_data$Lig_outcome)
tail(mcsm_data$Lig_outcome)

# count na in each column (vapply guarantees an integer vector)
na_count <- vapply(mcsm_data, function(col) sum(is.na(col)), integer(1))
na_count

# sort by Mutationinformation
mcsm_data <- mcsm_data[order(mcsm_data$Mutationinformation), ]
head(mcsm_data$Mutationinformation)

# get freq count of positions and add to the df
# NOTE: setDT() converts mcsm_data to a data.table by reference
setDT(mcsm_data)[, occurrence := .N, by = .(Position)]

pos_count_check <- data.frame(mcsm_data$Position, mcsm_data$occurrence)
|
||||
|
||||
###########################
|
||||
# 2: Read file: meta data with AFandOR
|
||||
###########################
|
||||
|
||||
# Read the meta data file that carries the AF and OR columns.
inDir <- "~/git/Data/pyrazinamide/input/processed/"
inFile2 <- paste0(inDir, "meta_data_with_AFandOR.csv")
inFile2

meta_with_afor <- read.csv(inFile2,
                           stringsAsFactors = FALSE,
                           header = TRUE)

rm(inDir, inFile2)

str(meta_with_afor)

# sort by Mutationinformation (head() before and after to eyeball the change)
head(meta_with_afor$Mutationinformation)
meta_with_afor <- meta_with_afor[order(meta_with_afor$Mutationinformation), ]
head(meta_with_afor$Mutationinformation)
|
||||
|
||||
# sanity check: should be True for all the mentioned columns
#is.numeric(meta_with_afor$OR)
na_var <- c("AF", "OR", "pvalue", "logor", "neglog10pvalue")

# fail early with a readable message if an expected column is absent
absent_var <- setdiff(na_var, colnames(meta_with_afor))
if (length(absent_var) > 0) {
  stop("meta_with_afor is missing column(s): ",
       paste(absent_var, collapse = ", "), call. = FALSE)
}

# One verdict for the whole set, reported only after every column has been
# tested (previously a verdict was printed on each loop iteration).
is_num <- vapply(na_var,
                 function(v) is.numeric(meta_with_afor[[v]]),
                 logical(1))
is_num
if (all(is_num)) {
  print("Sanity check passed: These are all numeric cols")
} else {
  print("Error: Please check your respective data types")
}

# If OR, and P value are not numeric, then convert to numeric and then count
# else they will say 0
na_count <- vapply(meta_with_afor, function(col) sum(is.na(col)), integer(1))
na_count
str(na_count)

# row indices of the NAs in each column of interest, kept as a list so that
# columns with differing NA counts can still be compared (cbind-ing them
# into a data frame failed in that case)
na_idx <- lapply(na_var, function(v) which(is.na(meta_with_afor[[v]])))
names(na_idx) <- na_var

# compare if the No of "NA" are the same for all these cols
na_len <- lengths(na_idx)
na_len

# extract how many NAs:
# should be a single number since
# all the cols should have "equal" and "same" no. of NAs
unique(na_len)
if (length(unique(na_len)) != 1L) {
  print("Error: Please check your numbers")
}

#=#=#=#=#=#=#=#=#
# COMMENT: AF, OR, pvalue, logor and neglog10pvalue
# these are the same 7 ones
#=#=#=#=#=#=#=#=#

# sanity check
#which(is.na(meta_with_afor$OR))

# Now compare these indices to ensure these are the same:
# every column's NA positions are checked against those of the first column
same_idx <- vapply(na_idx[-1], identical, logical(1), na_idx[[1]])
if (all(same_idx)) {
  print("Sanity check passed: The indices for AF, OR, etc are all the same")
} else {
  print("Error: Please check indices which are NA")
}

# remove the temporaries of this section, plus pos_count_check which was
# created when the mcsm data was read
rm(na_var, absent_var, is_num
   , na_count, na_idx, na_len
   , same_idx
   , pos_count_check)
|
||||
|
||||
###########################
|
||||
# 3:merging two dfs: with NA
|
||||
###########################
|
||||
|
||||
# link col name = Mutationinformation
head(mcsm_data$Mutationinformation)
head(meta_with_afor$Mutationinformation)

#########
# merge 1a: meta data with mcsm
#########
# all.y = TRUE keeps every row of mcsm_data, matched or not
merged_df2 <- merge(x = meta_with_afor,
                    y = mcsm_data,
                    by = "Mutationinformation",
                    all.y = TRUE)

# sort by Position (head() before and after to eyeball the change)
head(merged_df2$Position)
merged_df2 <- merged_df2[order(merged_df2$Position), ]
head(merged_df2$Position)

# same merge but keeping every row of meta_with_afor; only used for the
# comparison below
merged_df2v2 <- merge(x = meta_with_afor,
                      y = mcsm_data,
                      by = "Mutationinformation",
                      all.x = TRUE)
#!=!=!=!=!=!=!=!
# COMMENT: used all.y since position 186 is not part of the struc,
# hence doesn't have a mcsm value
# but 186 is associated with a mutation
#!=!=!=!=!=!=!=!

# should be False
identical(merged_df2, merged_df2v2)
table(merged_df2$Position %in% merged_df2v2$Position)

rm(merged_df2v2)
|
||||
|
||||
#########
|
||||
# merge 1b:remove duplicate mutation information
|
||||
#########
|
||||
|
||||
#==#=#=#=#=#=#
|
||||
# Cannot trust lineage, country from this df as the same mutation
|
||||
# can have many different lineages
|
||||
# but this should be good for the numerical corr plots
|
||||
#=#=#=#=#=#=#=
|
||||
# keep only the first row seen for each Mutationinformation value
merged_df3 <- merged_df2[match(unique(merged_df2$Mutationinformation),
                               merged_df2$Mutationinformation), ]
# should be sorted
head(merged_df3$Position)
tail(merged_df3$Position)

# sanity checks
# nrows of merged_df3 should be the same as the nrows of mcsm_data
print(
  if (nrow(merged_df3) == nrow(mcsm_data)) {
    "sanity check: Passed"
  } else {
    "Error!: check data, nrows is not as expected"
  }
)
|
||||
|
||||
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
# uncomment as necessary
|
||||
# only need to run this if merged_df2v2 i.e non structural pos included
|
||||
#mcsm = mcsm_data$Mutationinformation
|
||||
#my_merged = merged_df3$Mutationinformation
|
||||
|
||||
# find the index where it differs
|
||||
#diff_n = which(!my_merged%in%mcsm)
|
||||
|
||||
#check if it is indeed pos 186
|
||||
#merged_df3[diff_n,]
|
||||
|
||||
# remove this entry
|
||||
#merged_df3 = merged_df3[-diff_n,]]
|
||||
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
|
||||
###########################
|
||||
# 3b :merging two dfs: without NA
|
||||
###########################
|
||||
|
||||
#########
|
||||
# merge 2a:same as merge 1 but excluding NA
|
||||
#########
|
||||
# rows of merged_df2 that have a non-missing AF
merged_df2_comp <- merged_df2[!is.na(merged_df2$AF), ]

#########
# merge 2b: remove duplicate mutation information
#########
merged_df3_comp <- merged_df2_comp[!duplicated(merged_df2_comp$Mutationinformation), ]

# alternate way of deriving merged_df3_comp
foo <- merged_df3[!is.na(merged_df3$AF), ]

# compare dfs: foo and merged_df3_comp
# (the comparison used to be made against merged_df3, i.e. the frame that
# still contains the NA rows, so it could never confirm the two derivations
# agree)
all.equal(foo, merged_df3_comp)

summary(comparedf(foo, merged_df3_comp))
|
||||
|
||||
#=============== end of combining df
|
||||
#clear variables
|
||||
rm(mcsm_data
|
||||
, meta_with_afor
|
||||
, foo)
|
||||
|
||||
#rm(diff_n, my_merged, mcsm)
|
||||
|
||||
#=====================
|
||||
# write_output files
|
||||
#=====================
|
||||
# output dir
|
||||
outDir = "~/git/Data/pyrazinamide/output/"
|
||||
getwd()
|
||||
|
||||
outFile1 = paste0(outDir, "merged_df3.csv"); outFile1
|
||||
write.csv(merged_df3, outFile1)
|
||||
|
||||
#outFile2 = paste0(outDir, "merged_df3_comp.csv"); outFile2
|
||||
#write.csv(merged_df3_comp, outFile2)
|
||||
|
||||
rm(outDir
|
||||
, outFile1
|
||||
# , outFile2
|
||||
)
|
||||
#============================= end of script
|
||||
|
348
mcsm_analysis/pyrazinamide/scripts/combining_two_df_lig.R
Normal file
348
mcsm_analysis/pyrazinamide/scripts/combining_two_df_lig.R
Normal file
|
@ -0,0 +1,348 @@
|
|||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/")
|
||||
getwd()
|
||||
|
||||
#########################################################
|
||||
# TASK: To combine mcsm and meta data with af and or
|
||||
# by filtering for distance to ligand (<10Ang)
|
||||
#########################################################
|
||||
|
||||
#########################################################
|
||||
# Installing and loading required packages
|
||||
#########################################################
|
||||
|
||||
#source("Header_TT.R")
|
||||
#require(data.table)
|
||||
#require(arsenal)
|
||||
#require(compare)
|
||||
#library(tidyverse)
|
||||
|
||||
#################################
|
||||
# Read file: normalised file
|
||||
# output of step 4 mcsm_pipeline
|
||||
#################################
|
||||
|
||||
inDir = "~/git/Data/pyrazinamide/input/processed/"
|
||||
inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile
|
||||
|
||||
# Read the normalised mCSM file; the first column is used as row names.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable variables.
mcsm_data = read.csv(inFile
                     , row.names = 1
                     , stringsAsFactors = FALSE
                     , header = TRUE)
|
||||
rm(inDir, inFile)
|
||||
|
||||
str(mcsm_data)
|
||||
|
||||
table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) )
|
||||
|
||||
# spelling Correction 1: DUET
|
||||
mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Stabilizing'] <- 'Stabilising'
|
||||
mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Destabilizing'] <- 'Destabilising'
|
||||
|
||||
# checks
|
||||
table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) )
|
||||
head(mcsm_data$DUET_outcome); tail(mcsm_data$DUET_outcome)
|
||||
|
||||
# spelling Correction 2: Ligand
|
||||
table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) )
|
||||
|
||||
mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Stabilizing'] <- 'Stabilising'
|
||||
mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Destabilizing'] <- 'Destabilising'
|
||||
|
||||
# checks: should be the same as above
|
||||
table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) )
|
||||
head(mcsm_data$Lig_outcome); tail(mcsm_data$Lig_outcome)
|
||||
|
||||
########################### !!! only for mcsm_lig
|
||||
# 4: Filter/subset data
|
||||
# Lig plots < 10Ang
|
||||
# Filter the lig plots for Dis_to_lig < 10Ang
|
||||
###########################
|
||||
|
||||
# check range of distances
|
||||
max(mcsm_data$Dis_lig_Ang)
|
||||
min(mcsm_data$Dis_lig_Ang)
|
||||
|
||||
# count
|
||||
table(mcsm_data$Dis_lig_Ang<10)
|
||||
|
||||
# subset data to have only values less than 10 Ang
|
||||
mcsm_data2 = subset(mcsm_data, mcsm_data$Dis_lig_Ang < 10)
|
||||
|
||||
# sanity checks
|
||||
max(mcsm_data2$Dis_lig_Ang)
|
||||
min(mcsm_data2$Dis_lig_Ang)
|
||||
|
||||
# count no of unique positions
|
||||
length(unique(mcsm_data2$Position))
|
||||
|
||||
# count no of unique mutations
|
||||
length(unique(mcsm_data2$Mutationinformation))
|
||||
|
||||
# count Destabilising and Stabilising
|
||||
table(mcsm_data2$Lig_outcome) #{RESULT: no of mutations within 10Ang}
|
||||
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT: so as not to alter the script
|
||||
mcsm_data = mcsm_data2
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
#############################
|
||||
# Extra sanity check:
|
||||
# for mcsm_lig ONLY
|
||||
# Dis_lig_Ang should be <10
|
||||
#############################
|
||||
|
||||
if (max(mcsm_data$Dis_lig_Ang) < 10){
|
||||
print ("Sanity check passed: lig data is <10Ang")
|
||||
}else{
|
||||
print ("Error: data should be filtered to be within 10Ang")
|
||||
}
|
||||
|
||||
# clear variables
|
||||
rm(mcsm_data2)
|
||||
|
||||
# count na in each column
|
||||
na_count = sapply(mcsm_data, function(y) sum(length(which(is.na(y))))); na_count
|
||||
|
||||
head(mcsm_data$Mutationinformation)
|
||||
mcsm_data[mcsm_data$Mutationinformation=="Q10P",]
|
||||
mcsm_data[mcsm_data$Mutationinformation=="L4S",]
|
||||
|
||||
# sort by Mutationinformation
|
||||
mcsm_data = mcsm_data[order(mcsm_data$Mutationinformation),]
|
||||
head(mcsm_data$Mutationinformation)
|
||||
|
||||
# check
|
||||
mcsm_data[grep("Q10P", mcsm_data$Mutationinformation),]
|
||||
mcsm_data[grep("A102T", mcsm_data$Mutationinformation),]
|
||||
|
||||
# get freq count of positions and add to the df
|
||||
setDT(mcsm_data)[, occurrence := .N, by = .(Position)]
|
||||
|
||||
pos_count_check = data.frame(mcsm_data$Position, mcsm_data$occurrence)
|
||||
|
||||
###########################
|
||||
# 2: Read file: meta data with AFandOR
|
||||
###########################
|
||||
|
||||
inDir = "~/git/Data/pyrazinamide/input/processed/"
|
||||
inFile2 = paste0(inDir, "meta_data_with_AFandOR.csv"); inFile2
|
||||
|
||||
# Read the meta data (with AF and OR) file.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable variables.
meta_with_afor <- read.csv(inFile2
                           , stringsAsFactors = FALSE
                           , header = TRUE)
|
||||
|
||||
str(meta_with_afor)
|
||||
|
||||
# sort by Mutationinformation
|
||||
head(meta_with_afor$Mutationinformation)
|
||||
meta_with_afor = meta_with_afor[order(meta_with_afor$Mutationinformation),]
|
||||
head(meta_with_afor$Mutationinformation)
|
||||
|
||||
# sanity check: should be True for all the mentioned columns
|
||||
#is.numeric(meta_with_afor$OR)
|
||||
# Columns that must be numeric for the NA counts further down to be meaningful.
na_var <- c("AF", "OR", "pvalue", "logor", "neglog10pvalue")

# One logical per column: is that column numeric?
c1 <- vapply(na_var
             , function(v) is.numeric(meta_with_afor[[v]])
             , logical(1))

# Report the verdict once, after every column has been inspected, so that a
# "passed" message can never be printed before a later column fails.
if (all(c1)) {
  print("Sanity check passed: These are all numeric cols")
} else {
  print("Error: Please check your respective data types")
  # name the offending columns to make the problem easy to locate
  print(names(c1)[!c1])
}
|
||||
|
||||
# If OR, and P value are not numeric, then convert to numeric and then count
|
||||
# else they will say 0
|
||||
|
||||
# NOW count na in each column: if you did it before, then
|
||||
# OR and Pvalue column would say 0 na since these were not numeric
|
||||
# Count NA in each column of the meta data.
na_count <- vapply(meta_with_afor, function(y) sum(is.na(y)), integer(1))
na_count
str(na_count)

# compare if the No of "NA" are the same for all these cols
na_var <- c("AF", "OR", "pvalue", "logor", "neglog10pvalue")
stopifnot(na_var %in% names(na_count))
na_len <- unname(na_count[na_var])

# A single distinct count means every column has the same number of NA.
if (length(unique(na_len)) == 1L) {
  my_nrows <- na_len[[1]]
} else {
  my_nrows <- NULL
  print("Error: Please check your numbers")
}

my_nrows

#=#=#=#=#=#=#=#=#
# COMMENT: AF, OR, pvalue, logor and neglog10pvalue
# all have 81 NA, with pyrazinamide with 960
# and these are the same 7 ones
#=#=#=#=#=#=#=#=#

# sanity check
#which(is.na(meta_with_afor$OR))

# Row indices of the NA entries, one integer vector per column.
na_idx <- lapply(na_var, function(v) which(is.na(meta_with_afor[[v]])))
names(na_idx) <- na_var

# Now compare these indices to ensure these are the same: every column is
# compared against the first one, and the verdict is printed once.
same_idx <- vapply(na_idx[-1], identical, logical(1), na_idx[[1]])

if (all(same_idx)) {
  print("Sanity check passed: The indices for AF, OR, etc are all the same")
} else {
  print("Error: Please check indices which are NA")
}

# Clear the temporaries of this section (and of the checks above); only
# objects that actually exist are removed, so rm() never warns.
rm(list = intersect(c("c", "c1", "c2", "i", "my_nrows"
                      , "na_count", "na_i", "na_len"
                      , "na_var", "temp"
                      , "na_count_df", "na_idx", "same_idx"
                      , "pos_count_check")
                    , ls()))
|
||||
|
||||
###########################
|
||||
# 3:merging two dfs: with NA
|
||||
###########################
|
||||
|
||||
# link col name = Mutationinformation
|
||||
head(mcsm_data$Mutationinformation)
|
||||
head(meta_with_afor$Mutationinformation)
|
||||
|
||||
#########
|
||||
# merge 1a: meta data with mcsm
|
||||
#########
|
||||
# Merge meta data (x) with mcsm data (y) on Mutationinformation, keeping
# every row of the mcsm data (all.y).
merged_df2 = merge(x = meta_with_afor
                   , y = mcsm_data
                   , by = "Mutationinformation"
                   , all.y = TRUE)

# sort by Position
head(merged_df2$Position)
merged_df2 = merged_df2[order(merged_df2$Position),]
head(merged_df2$Position)

# Same merge, but keeping every row of the meta data (all.x); only used
# for the comparison below and removed afterwards.
merged_df2v2 = merge(x = meta_with_afor
                     , y = mcsm_data
                     , by = "Mutationinformation"
                     , all.x = TRUE)

#!=!=!=!=!=!=!=!
# COMMENT: used all.y since position 186 is not part of the struc,
# hence doesn't have a mcsm value
# but 186 is associated with a mutation
#!=!=!=!=!=!=!=!

# should be False
identical(merged_df2, merged_df2v2)
table(merged_df2$Position%in%merged_df2v2$Position)

rm(merged_df2v2)
|
||||
|
||||
#########
|
||||
# merge 1b:remove duplicate mutation information
|
||||
#########
|
||||
|
||||
#==#=#=#=#=#=#
|
||||
# Cannot trust lineage, country from this df as the same mutation
|
||||
# can have many different lineages
|
||||
# but this should be good for the numerical corr plots
|
||||
#=#=#=#=#=#=#=
|
||||
merged_df3 = merged_df2[!duplicated(merged_df2$Mutationinformation),]
|
||||
head(merged_df3$Position) ; tail(merged_df3$Position) # should be sorted
|
||||
|
||||
# sanity checks
|
||||
# nrows of merged_df3 should be the same as the nrows of mcsm_data
|
||||
if(nrow(mcsm_data) == nrow(merged_df3)){
|
||||
print("sanity check: Passed")
|
||||
} else {
|
||||
print("Error!: check data, nrows is not as expected")
|
||||
}
|
||||
|
||||
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
# uncomment as necessary
|
||||
# only need to run this if merged_df2v2 i.e non structural pos included
|
||||
#mcsm = mcsm_data$Mutationinformation
|
||||
#my_merged = merged_df3$Mutationinformation
|
||||
|
||||
# find the index where it differs
|
||||
#diff_n = which(!my_merged%in%mcsm)
|
||||
|
||||
#check if it is indeed pos 186
|
||||
#merged_df3[diff_n,]
|
||||
|
||||
# remove this entry
|
||||
#merged_df3 = merged_df3[-diff_n,]
|
||||
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
|
||||
###########################
|
||||
# 3b :merging two dfs: without NA
|
||||
###########################
|
||||
|
||||
#########
|
||||
# merge 2a:same as merge 1 but excluding NA
|
||||
#########
|
||||
merged_df2_comp = merged_df2[!is.na(merged_df2$AF),]
|
||||
|
||||
#########
|
||||
# merge 2b: remove duplicate mutation information
|
||||
#########
|
||||
merged_df3_comp = merged_df2_comp[!duplicated(merged_df2_comp$Mutationinformation),]
|
||||
|
||||
# FIXME: add this as a sanity check. I have manually checked!
|
||||
|
||||
# Alternate way of deriving merged_df3_comp: drop the rows with missing AF
# from the already de-duplicated merged_df3.
foo = merged_df3[!is.na(merged_df3$AF),]

# Compare dfs: foo and merged_df3_comp.
# The comparison target must be merged_df3_comp (not merged_df3), otherwise
# the check does not test the alternate derivation at all.
all.equal(foo, merged_df3_comp)

summary(comparedf(foo, merged_df3_comp))
|
||||
|
||||
#=============== end of combining df
|
||||
#clear variables
|
||||
rm(mcsm_data
|
||||
, meta_with_afor
|
||||
, foo)
|
||||
|
||||
#rm(diff_n, my_merged, mcsm)
|
||||
|
||||
#===============end of script
|
||||
|
||||
#=====================
|
||||
# write_output files
|
||||
#=====================
|
||||
|
||||
# Not required as this is a subset of the "combining_two_df.R" script
|
||||
|
25
mcsm_analysis/pyrazinamide/scripts/mcsm/step0_check_duplicate_SNPs.sh
Executable file
25
mcsm_analysis/pyrazinamide/scripts/mcsm/step0_check_duplicate_SNPs.sh
Executable file
|
@ -0,0 +1,25 @@
|
|||
#!/bin/bash

#*************************************
# need to be in the correct directory
#*************************************
##: comments for code
#: commented out code

#**********************************************************************
# TASK: Text file containing a list of SNPs; SNP in the format(C2E)
# per line. Sort by unique, which automatically removes duplicates.
# The de-duplicated list is saved next to the input file.
#**********************************************************************
infile="${HOME}/git/Data/input/processed/pyrazinamide/pnca_mis_SNPs_v2.csv"
outfile="${HOME}/git/Data/input/processed/pyrazinamide/pnca_mis_SNPs_v2_unique.csv"

## stop early if the input is missing, instead of counting an empty output
if [ ! -r "${infile}" ]; then
    echo "Error: cannot read input file ${infile}" >&2
    exit 1
fi

## sort unique entries and write them to the output file
## (paths are quoted so that spaces in HOME cannot split them)
sort -u "${infile}" > "${outfile}"

## count no. of unique snps mCSM will run on
count=$(wc -l < "${outfile}")

## print to console no. of unique snps mCSM will run on
echo "${count} unique mutations for mCSM to run on"
|
||||
|
|
@ -0,0 +1,72 @@
|
|||
#!/bin/bash

#*************************************
#need to be in the correct directory
#*************************************
##: comments for code
#: commented out code

#**********************************************************************
# TASK: submit requests using curl: HANDLE redirects and refresh url.
# Iterate over mutation file and write/append result urls to a file
# result url file: stored in the /Results directory
# mutation file: one mutation per line, no chain ID
# output: in a file, should be n urls (n=no. of mutations in file)
# NOTE: these are just result urls, not actual values for results
#**********************************************************************
## iterate over mutation file; line by line and submit query using curl
filename="../Data/pnca_mis_SNPs_v2_unique.csv"

if [ ! -r "${filename}" ]; then
    echo "Error: cannot read mutation file ${filename}" >&2
    exit 1
fi

## request settings: identical for every mutation, so set once
pdb='../Data/complex1_no_water.pdb'
chain="A"
lig_id="PZA"
affin_wt="0.99"
host="http://biosig.unimelb.edu.au"
call_url="/mcsm_lig/prediction"

## count the entries once; whitespace is stripped because the count is
## embedded in the output file name
total=$(wc -l < "${filename}" | tr -d '[:space:]')
result_file="../Results/${total}_mCSM_lig_complex1_result_url.txt"

## some useful messages
echo "Processing ${total} entries from ${filename}"
COUNT=0
while read -r line; do
    ((COUNT++))
    mutation="${line}"
#   echo "${mutation}"

    ##=========================================
    ##html field_names names required for curl
    ##complex_field:wild=@
    ##mutation_field:mutation=@
    ##chain_field:chain=@
    ##ligand_field:lig_id@
    ##energy_field:affin_wt
    #=========================================
    refresh_url=$(curl -L \
        -sS \
        -F "wild=@${pdb}" \
        -F "mutation=${mutation}" \
        -F "chain=${chain}" \
        -F "lig_id=${lig_id}" \
        -F "affin_wt=${affin_wt}" \
        "${host}${call_url}" | grep "http-equiv")

    #echo $refresh_url
    #echo ${host}${refresh_url}

    #use regex to extract the relevant bit from the refresh url
    #regex:sed -r 's/.*(\/mcsm.*)".*$/\1/g'

    #Now build: result url using host and refresh url and write the urls to a file in the Results dir
    ## newlines are folded to spaces so that sed always sees a single line
    result_url=$(printf '%s' "${refresh_url}" | tr '\n' ' ' | sed -r 's/.*(\/mcsm.*)".*$/\1/g')
    sleep 10

    ## flag entries for which no refresh url came back; the line is still
    ## written below so the output keeps one line per mutation
    if [ -z "${result_url}" ]; then
        echo "Warning: no result url received for ${mutation}" >&2
    fi

    echo -e "${mutation} : processing entry ${COUNT}/${total}..."

    echo -e "${host}${result_url}" >> "${result_file}"
    #echo -n '.'
done < "${filename}"

echo
echo "Processing Complete"

##end of submitting query, receiving result url and storing results url in a file
|
||||
|
|
@ -0,0 +1,59 @@
|
|||
#!/bin/bash
#*************************************
#need to be in the correct directory
#*************************************
##: comments for code
#: commented out code

#********************************************************************
# TASK: submit result urls and fetch actual results using curl
# Iterate over each result url from the output of step1, stored in a
# file in /Results.
# Use curl to fetch results and extract relevant sections using hxtools
# and store these in another file in /Results
# This script takes two optional arguments:
# input file: file containing results url
# Default: 336_mCSM_lig_complex1_result_url.txt
# output file: name of the file where extracted results will be stored
# Default: 336_mCSM_lig_complex1_output_MASTER.txt
#*********************************************************************

#if [ "$#" -ne 2 ]; then
#if [ -Z $1 ]; then
# echo "
# Please provide both Input and Output files.

# Usage: batch_read_urls.sh INFILE OUTFILE
# "
# exit 1
#fi

## First argument: Input File
## Second argument: Output File
## Both fall back to the previously hard-coded paths when not supplied.
infile="${1:-${HOME}/git/LSHTM_analysis/mcsm_complex1/Results/336_mCSM_lig_complex1_result_url.txt}"
outfile="${2:-${HOME}/git/LSHTM_analysis/mcsm_complex1/Results/336_mCSM_lig_complex1_output_MASTER.txt}"

if [ ! -r "${infile}" ]; then
    echo "Error: cannot read input file ${infile}" >&2
    exit 1
fi

## count the urls once instead of on every iteration
total=$(wc -l < "${infile}" | tr -d '[:space:]')

echo "Processing ${total} entries from ${infile}"
COUNT=0
while read -r line; do
    #COUNT=$(($COUNT+1))
    ((COUNT++))
    ## the url is quoted so characters such as ? or & cannot be expanded by the shell
    curl --silent "${line}" \
        | hxnormalize -x \
        | hxselect -c div.span4 \
        | hxselect -c div.well \
        | sed -r -e 's/<[^>]*>//g' \
        | sed -re 's/ +//g' \
        >> "${outfile}"
    #| tee -a ${outfile}
#   echo -n '.'
    echo -e "Processing entry ${COUNT}/${total}..."

done < "${infile}"

echo
echo "Processing Complete"
|
|
@ -0,0 +1,52 @@
|
|||
#!/bin/bash
#*************************************
#need to be in the correct directory
#*************************************
##: comments for code
#: commented out code

#********************************************************************
# TASK: Intermediate results processing
# output file has a convenient delimiter of ":" that can be used to
# format the file into two columns (col1: field_desc and col2: values)
# However the sections "PredictedAffinityChange:...." and
# "DUETstabilitychange:.." are split over multiple lines, which
# prevents this. Additionally there are other empty lines
# that need to be omitted. This script joins each of these sections
# back onto a single line and deletes the empty lines.
#*********************************************************************

infile="../Results/336_mCSM_lig_complex1_output_processed.txt"

#sed -i '/PredictedAffinityChange:/ { N; N; N; N; s/\n//g;}' ${infile} \
# | sed -i '/DUETstabilitychange:/ {x; N; N; s/\n//g; p;d;}' ${infile}

# Outputs records separated by a newline, that look something like this:
# PredictedAffinityChange:-2.2log(affinityfoldchange)-Destabilizing
# Mutationinformation:
# Wild-type:L
# Position:4
# Mutant-type:W
# Chain:A
# LigandID:PZA
# Distancetoligand:15.911Å
# DUETstabilitychange:-2.169Kcal/mol
#
# PredictedAffinityChange:-1.538log(affinityfoldchange)-Destabilizing
# (...etc)

# This script brings everything in a convenient format for further processing in python.
# bear in mind, this replaces the file in place, so make sure you retain a copy for your records

## sed program, in order:
##  - on a PredictedAffinityChange line: append the next 4 lines, remove the newlines
##  - on a DUETstabilitychange: line: append the next 2 lines, remove the newlines
##  - delete empty lines
## the file name is quoted so that it cannot be split or glob-expanded
sed -i '/PredictedAffinityChange/ {
N
N
N
N
s/\n//g
}
/DUETstabilitychange:/ {
N
N
s/\n//g
}
/^$/d' "${infile}"
|
29
mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_format_results.py
Executable file
29
mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_format_results.py
Executable file
|
@ -0,0 +1,29 @@
|
|||
#!/usr/bin/python
# Reshape the processed mCSM-lig output ("field:value", one pair per line)
# into a table with one column per field, and write it out as csv.
import pandas as pd
from collections import defaultdict

#file = r'../Results/322_mCSM_lig_complex1_output_processed.txt'

# columns (and their order) written to the csv
outCols=[
    'PredictedAffinityChange',
    'Mutationinformation',
    'Wild-type',
    'Position',
    'Mutant-type',
    'Chain',
    'LigandID',
    'Distancetoligand',
    'DUETstabilitychange'
    ]

# read all lines; the context manager closes the file handle afterwards
with open('../Results/336_mCSM_lig_complex1_output_processed.txt') as infile:
    lines = [line.rstrip('\n') for line in infile]

# field name -> list of values, in file order
outputs = defaultdict(list)

for item in lines:
    # split on the first ':' only, so a value that itself contains ':'
    # is kept whole instead of raising "too many values to unpack"
    col, val = item.split(':', 1)
    outputs[col].append(val)

dfOut=pd.DataFrame(outputs)

dfOut.to_csv('../Results/336_complex1_formatted_results.csv', columns=outCols)
|
207
mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_data_cleaning.R
Normal file
207
mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_data_cleaning.R
Normal file
|
@ -0,0 +1,207 @@
|
|||
getwd()
|
||||
#setwd("~/Documents/git/LSHTM_analysis/mcsm_complex1/Results") # work
|
||||
setwd("~/git/LSHTM_analysis/mcsm_complex1/Results") # thinkpad
|
||||
#setwd("/Users/tanu/git/LSHTM_analysis/mcsm_complex1/Results") # mac
|
||||
getwd()
|
||||
|
||||
#=======================================================
|
||||
#TASK: To tidy the columns so you can generate figures
|
||||
#=======================================================
|
||||
####################
|
||||
#### read file #####: this will be the output from python script (csv file)
|
||||
####################
|
||||
data = read.csv("336_complex1_formatted_results.csv"
|
||||
, header = T
|
||||
, stringsAsFactors = FALSE)
|
||||
dim(data)
|
||||
#335, 10
|
||||
str(data)
|
||||
|
||||
###########################
|
||||
##### Data processing #####
|
||||
###########################
|
||||
|
||||
# populate mutation information columns as currently it is empty
|
||||
head(data$Mutationinformation)
|
||||
tail(data$Mutationinformation)
|
||||
|
||||
# should not be blank: create mutation information
|
||||
data$Mutationinformation = paste0(data$Wild.type, data$Position, data$Mutant.type)
|
||||
|
||||
head(data$Mutationinformation)
|
||||
tail(data$Mutationinformation)
|
||||
#write.csv(data, 'test.csv')
|
||||
##########################################
|
||||
# Remove duplicate SNPs as a sanity check
|
||||
##########################################
|
||||
#very important
|
||||
table(duplicated(data$Mutationinformation))
|
||||
#FALSE
|
||||
#335
|
||||
|
||||
#extract duplicated entries
|
||||
dups = data[duplicated(data$Mutationinformation),] #0
|
||||
|
||||
#No of dups should match with the no. of TRUE in the above table
|
||||
#u_dups = unique(dups$Mutationinformation) #10
|
||||
sum( table(dups$Mutationinformation) ) #13
|
||||
|
||||
rm(dups)
|
||||
|
||||
#***************************************************************
|
||||
#select non-duplicated SNPs and create a new df
|
||||
df = data[!duplicated(data$Mutationinformation),] #309, 10
|
||||
#***************************************************************
|
||||
#sanity check
|
||||
u = unique(df$Mutationinformation)
|
||||
u2 = unique(data$Mutationinformation)
|
||||
table(u%in%u2)
|
||||
#TRUE
|
||||
#309
|
||||
#should all be 1, hence 309 1's
|
||||
sum(table(df$Mutationinformation) == 1)
|
||||
|
||||
#sort df by Position
|
||||
#MANUAL CHECKPOINT:
|
||||
#foo <- df[order(df$Position),]
|
||||
#df <- df[order(df$Position),]
|
||||
|
||||
# dups has already been removed further up in this script, so only u and u2
# are cleared here (listing dups again makes rm() warn "object not found").
rm(u, u2)
|
||||
|
||||
####################
|
||||
#### give meaningful colnames to reflect units to enable correct data type
|
||||
####################
|
||||
|
||||
#=======
|
||||
#STEP 1
|
||||
#========
|
||||
#make a copy of the PredictedAffinityColumn and call it Lig_outcome
|
||||
df$Lig_outcome = df$PredictedAffinityChange #335, 11
|
||||
|
||||
#make Predicted...column numeric and outcome column categorical
|
||||
head(df$PredictedAffinityChange)
|
||||
df$PredictedAffinityChange = gsub("log.*"
|
||||
, ""
|
||||
, df$PredictedAffinityChange)
|
||||
|
||||
#sanity checks
|
||||
head(df$PredictedAffinityChange)
|
||||
|
||||
#should be numeric, check and if not make it numeric
|
||||
is.numeric( df$PredictedAffinityChange )
|
||||
#change to numeric
|
||||
df$PredictedAffinityChange = as.numeric(df$PredictedAffinityChange)
|
||||
#should be TRUE
|
||||
is.numeric( df$PredictedAffinityChange )
|
||||
|
||||
#change the column name to indicate units
|
||||
n = which(colnames(df) == "PredictedAffinityChange"); n
|
||||
colnames(df)[n] = "PredAffLog"
|
||||
colnames(df)[n]
|
||||
|
||||
#========
|
||||
#STEP 2
|
||||
#========
|
||||
#make Lig_outcome column categorical showing effect of mutation
|
||||
head(df$Lig_outcome)
|
||||
df$Lig_outcome = gsub("^.*-"
|
||||
, "",
|
||||
df$Lig_outcome)
|
||||
#sanity checks
|
||||
head(df$Lig_outcome)
|
||||
#should be factor, check and if not change it to factor
|
||||
is.factor(df$Lig_outcome)
|
||||
#change to factor
|
||||
df$Lig_outcome = as.factor(df$Lig_outcome)
|
||||
#should be TRUE
|
||||
is.factor(df$Lig_outcome)
|
||||
|
||||
#========
|
||||
#STEP 3
|
||||
#========
|
||||
#gsub
|
||||
head(df$Distancetoligand)
|
||||
df$Distancetoligand = gsub("Å"
|
||||
, ""
|
||||
, df$Distancetoligand)
|
||||
#sanity checks
|
||||
head(df$Distancetoligand)
|
||||
#should be numeric, check if not change it to numeric
|
||||
is.numeric(df$Distancetoligand)
|
||||
#change to numeric
|
||||
df$Distancetoligand = as.numeric(df$Distancetoligand)
|
||||
#should be TRUE
|
||||
is.numeric(df$Distancetoligand)
|
||||
|
||||
#change the column name to indicate units
|
||||
n = which(colnames(df) == "Distancetoligand")
|
||||
colnames(df)[n] <- "Dis_lig_Ang"
|
||||
colnames(df)[n]
|
||||
|
||||
#========
|
||||
#STEP 4
|
||||
#========
|
||||
#gsub
|
||||
head(df$DUETstabilitychange)
|
||||
df$DUETstabilitychange = gsub("Kcal/mol"
|
||||
, ""
|
||||
, df$DUETstabilitychange)
|
||||
#sanity checks
|
||||
head(df$DUETstabilitychange)
|
||||
#should be numeric, check if not change it to numeric
|
||||
is.numeric(df$DUETstabilitychange)
|
||||
#change to numeric
|
||||
df$DUETstabilitychange = as.numeric(df$DUETstabilitychange)
|
||||
#should be TRUE
|
||||
is.numeric(df$DUETstabilitychange)
|
||||
|
||||
#change the column name to indicate units
|
||||
n = which(colnames(df) == "DUETstabilitychange"); n
|
||||
colnames(df)[n] = "DUETStability_Kcalpermol"
|
||||
colnames(df)[n]
|
||||
|
||||
#========
|
||||
#STEP 5
|
||||
#========
|
||||
#create yet another extra column: classification of DUET stability only
|
||||
df$DUET_outcome = ifelse(df$DUETStability_Kcalpermol >=0
|
||||
, "Stabilizing"
|
||||
, "Destabilizing") #335, 12
|
||||
|
||||
table(df$Lig_outcome)
|
||||
#Destabilizing Stabilizing
|
||||
#281 54
|
||||
|
||||
table(df$DUET_outcome)
|
||||
#Destabilizing Stabilizing
|
||||
#288 47
|
||||
#==============================
|
||||
#FIXME
|
||||
#Insert a venn diagram
|
||||
|
||||
#================================
|
||||
|
||||
|
||||
#========
|
||||
#STEP 6
|
||||
#========
|
||||
# assign wild and mutant colnames correctly
|
||||
|
||||
wt = which(colnames(df) == "Wild.type"); wt
|
||||
colnames(df)[wt] <- "Wild_type"
|
||||
colnames(df[wt])
|
||||
|
||||
mut = which(colnames(df) == "Mutant.type"); mut
|
||||
colnames(df)[mut] <- "Mutant_type"
|
||||
colnames(df[mut])
|
||||
|
||||
#========
|
||||
#STEP 7
|
||||
#========
|
||||
#create an extra column: maybe useful for some plots
|
||||
df$WildPos = paste0(df$Wild_type, df$Position) #335, 13
|
||||
|
||||
#clear variables
|
||||
rm(n, wt, mut)
|
||||
|
||||
################ end of data cleaning
|
252
mcsm_analysis/pyrazinamide/scripts/mcsm/step4_normalise.R
Normal file
252
mcsm_analysis/pyrazinamide/scripts/mcsm/step4_normalise.R
Normal file
|
@ -0,0 +1,252 @@
|
|||
getwd()
|
||||
#setwd("~/Documents/git/LSHTM_analysis/mcsm_complex1/Results") # work
|
||||
setwd("~/git/LSHTM_analysis/mcsm_complex1/Results") # thinkpad
|
||||
#setwd("/Users/tanu/git/LSHTM_analysis/mcsm_complex1/Results") # mac
|
||||
getwd()
|
||||
|
||||
#=======================================================
|
||||
#TASK:read cleaned data and perform rescaling
|
||||
# of DUET stability scores
|
||||
# of Pred affinity
|
||||
#compare scaling methods with plots
|
||||
#output normalised file
|
||||
#=======================================================
|
||||
|
||||
####################
|
||||
#### read file #####: this will be the output of my R script that cleans the data columns
|
||||
####################
|
||||
source("../Scripts/step3c_data_cleaning.R")
|
||||
##This will output two dataframes:
|
||||
##data: unclean data: 335, 10
|
||||
##df : cleaned df 335, 13
|
||||
## you can remove data if you want as you will not need it
|
||||
rm(data)
|
||||
|
||||
colnames(df)
|
||||
|
||||
#===================
|
||||
#3a: PredAffLog
|
||||
#===================
|
||||
n = which(colnames(df) == "PredAffLog"); n
|
||||
group = which(colnames(df) == "Lig_outcome"); group
|
||||
|
||||
#===================================================
|
||||
# order according to PredAffLog values
|
||||
#===================================================
|
||||
# This is because this makes it easier to see the results of rescaling for debugging
|
||||
head(df$PredAffLog)
|
||||
|
||||
#ORDER BY PredAff scores: negative values at the top and positive at the bottom
|
||||
df = df[order(df$PredAffLog),]
|
||||
head(df$PredAffLog)
|
||||
|
||||
#sanity checks
|
||||
head(df[,n]) #all negatives
|
||||
tail(df[,n]) #all positives
|
||||
|
||||
#sanity checks
|
||||
mean(df[,n])
|
||||
#-0.9526746
|
||||
|
||||
tapply(df[,n], df[,group], mean)
|
||||
#Destabilizing Stabilizing
|
||||
#-1.2112100 0.3926667
|
||||
#===========================
|
||||
#Same as above: in 2 steps
|
||||
#===========================
|
||||
|
||||
#find range of your data
|
||||
my_min = min(df[,n]); my_min #-3.948
|
||||
my_max = max(df[,n]); my_max #2.23
|
||||
|
||||
#===============================================
|
||||
# WITHIN GROUP rescaling 2: method "ratio"
|
||||
# create column to store the rescaled values
|
||||
# Rescaling separately (Less dangerous)
|
||||
# =====> chosen one:as Nick prefers
|
||||
#===============================================
|
||||
df$ratioPredAff = ifelse(df[,n] < 0
|
||||
, df[,n]/abs(my_min)
|
||||
, df[,n]/my_max
|
||||
)#335 14
|
||||
#sanity checks
|
||||
head(df$ratioPredAff)
|
||||
tail(df$ratioPredAff)
|
||||
|
||||
min(df$ratioPredAff); max(df$ratioPredAff)
|
||||
|
||||
tapply(df$ratioPredAff, df$Lig_outcome, min)
|
||||
#Destabilizing Stabilizing
|
||||
#-1.000000000 0.005381166
|
||||
|
||||
tapply(df$ratioPredAff, df$Lig_outcome, max)
|
||||
#Destabilizing Stabilizing
|
||||
#-0.001266464 1.000000000
|
||||
|
||||
#should be the same as below (281 and 54)
|
||||
sum(df$ratioPredAff < 0); sum(df$ratioPredAff > 0)
|
||||
|
||||
table(df$Lig_outcome)
|
||||
#Destabilizing Stabilizing
|
||||
#281 54
|
||||
|
||||
#===============================================
|
||||
# Hist and density plots to compare the rescaling
|
||||
# methods: Base R
|
||||
#===============================================
|
||||
#uncomment as necessary
|
||||
my_title = "Ligand_stability"
|
||||
#my_title = colnames(df[n])
|
||||
|
||||
# Set the margin on all sides
|
||||
par(oma = c(3,2,3,0)
|
||||
, mar = c(1,3,5,2)
|
||||
, mfrow = c(2,2))
|
||||
|
||||
hist(df[,n]
|
||||
, xlab = ""
|
||||
, main = "Raw values"
|
||||
)
|
||||
|
||||
hist(df$ratioPredAff
|
||||
, xlab = ""
|
||||
, main = "ratio rescaling"
|
||||
)
|
||||
|
||||
# Plot density plots underneath
|
||||
plot(density( df[,n] )
|
||||
, main = "Raw values"
|
||||
)
|
||||
|
||||
plot(density( df$ratioPredAff )
|
||||
, main = "ratio rescaling"
|
||||
)
|
||||
|
||||
# titles
|
||||
mtext(text = "Frequency"
|
||||
, side = 2
|
||||
, line = 0
|
||||
, outer = TRUE)
|
||||
|
||||
mtext(text = my_title
|
||||
, side = 3
|
||||
, line = 0
|
||||
, outer = TRUE)
|
||||
|
||||
|
||||
#clear variables
|
||||
rm(my_min, my_max, my_title, n, group)
|
||||
|
||||
#===================
|
||||
# 3b: DUET stability
|
||||
#===================
|
||||
dim(df) #335, 14
|
||||
|
||||
n = which(colnames(df) == "DUETStability_Kcalpermol"); n # 10
|
||||
group = which(colnames(df) == "DUET_outcome"); group #12
|
||||
|
||||
#===================================================
|
||||
# order according to DUET scores
|
||||
#===================================================
|
||||
# This is because this makes it easier to see the results of rescaling for debugging
|
||||
head(df$DUETStability_Kcalpermol)
|
||||
|
||||
#ORDER BY DUET scores: negative values at the top and positive at the bottom
|
||||
df = df[order(df$DUETStability_Kcalpermol),]
|
||||
|
||||
#sanity checks
|
||||
head(df[,n]) #negatives
|
||||
tail(df[,n]) #positives
|
||||
|
||||
#sanity checks
|
||||
mean(df[,n])
|
||||
#[1] -1.173316
|
||||
|
||||
tapply(df[,n], df[,group], mean)
|
||||
#Destabilizing Stabilizing
|
||||
#-1.4297257 0.3978723
|
||||
|
||||
#===============================================
|
||||
# WITHIN GROUP rescaling 2: method "ratio"
|
||||
# create column to store the rescaled values
|
||||
# Rescaling separately (Less dangerous)
|
||||
# =====> chosen one:as Nick prefers
|
||||
#===============================================
|
||||
#find range of your data
|
||||
my_min = min(df[,n]); my_min #-3.87
|
||||
my_max = max(df[,n]); my_max #1.689
|
||||
|
||||
df$ratioDUET = ifelse(df[,n] < 0
|
||||
, df[,n]/abs(my_min)
|
||||
, df[,n]/my_max
|
||||
) #335, 15
|
||||
#sanity check
|
||||
head(df$ratioDUET)
|
||||
tail(df$ratioDUET)
|
||||
|
||||
min(df$ratioDUET); max(df$ratioDUET)
|
||||
|
||||
#sanity checks
|
||||
tapply(df$ratioDUET, df$DUET_outcome, min)
|
||||
#Destabilizing Stabilizing
|
||||
#-1.00000000 0.01065719
|
||||
|
||||
tapply(df$ratioDUET, df$DUET_outcome, max)
|
||||
#Destabilizing Stabilizing
|
||||
#-0.003875969 1.000000000
|
||||
|
||||
#should be the same as below (288 and 47)
|
||||
sum(df$ratioDUET < 0); sum(df$ratioDUET > 0)
|
||||
|
||||
table(df$DUET_outcome)
|
||||
#Destabilizing Stabilizing
|
||||
#288 47
|
||||
|
||||
#===============================================
|
||||
# Hist and density plots to compare the rescaling
|
||||
# methods: Base R
|
||||
#===============================================
|
||||
#uncomment as necessary
|
||||
|
||||
my_title = "DUET_stability"
|
||||
#my_title = colnames(df[n])
|
||||
|
||||
# Set the margin on all sides
|
||||
par(oma = c(3,2,3,0)
|
||||
, mar = c(1,3,5,2)
|
||||
, mfrow = c(2,2))
|
||||
|
||||
hist(df[,n]
|
||||
, xlab = ""
|
||||
, main = "Raw values"
|
||||
)
|
||||
|
||||
hist(df$ratioDUET
|
||||
, xlab = ""
|
||||
, main = "ratio rescaling"
|
||||
)
|
||||
|
||||
# Plot density plots underneath
|
||||
plot(density( df[,n] )
|
||||
, main = "Raw values"
|
||||
)
|
||||
|
||||
plot(density( df$ratioDUET )
|
||||
, main = "ratio rescaling"
|
||||
)
|
||||
|
||||
# graph titles
|
||||
mtext(text = "Frequency"
|
||||
, side = 2
|
||||
, line = 0
|
||||
, outer = TRUE)
|
||||
|
||||
mtext(text = my_title
|
||||
, side = 3
|
||||
, line = 0
|
||||
, outer = TRUE)
|
||||
|
||||
#===================
|
||||
# write output as csv file
|
||||
#===================
|
||||
write.csv(df, "../Data/mcsm_complex1_normalised.csv", row.names = FALSE) #335, 15
|
131
mcsm_analysis/pyrazinamide/scripts/mcsm_mean_stability.R
Normal file
131
mcsm_analysis/pyrazinamide/scripts/mcsm_mean_stability.R
Normal file
|
@ -0,0 +1,131 @@
|
|||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
|
||||
getwd()
|
||||
|
||||
########################################################################
|
||||
# Installing and loading required packages #
|
||||
########################################################################
|
||||
|
||||
source("../Header_TT.R")
|
||||
#source("barplot_colour_function.R")
|
||||
require(data.table)
|
||||
library(dplyr) # provides %>%, group_by() and summarize() used below; library() stops immediately if the package is missing
|
||||
|
||||
########################################################################
|
||||
# Read file: call script for combining df for PS #
|
||||
########################################################################
|
||||
|
||||
source("../combining_two_df.R")
|
||||
|
||||
###########################
|
||||
# This will return:
|
||||
|
||||
# df with NA:
|
||||
# merged_df2
|
||||
# merged_df3
|
||||
|
||||
# df without NA:
|
||||
# merged_df2_comp
|
||||
# merged_df3_comp
|
||||
###########################
|
||||
|
||||
#---------------------- PAY ATTENTION
|
||||
# the above changes the working dir
|
||||
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
|
||||
#---------------------- PAY ATTENTION
|
||||
|
||||
###########################
|
||||
# you need merged_df3
|
||||
# or
|
||||
# merged_df3_comp
|
||||
# since these have unique SNPs
|
||||
# I prefer to use the merged_df3
|
||||
# because using the _comp dataset means
|
||||
# we lose some muts and at this level, we should use
|
||||
# as much info as available
|
||||
###########################
|
||||
|
||||
# uncomment as necessary
|
||||
#%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
# REASSIGNMENT
|
||||
my_df = merged_df3
|
||||
#my_df = merged_df3_comp
|
||||
#%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
|
||||
# delete variables not required
|
||||
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
||||
|
||||
# quick checks
|
||||
colnames(my_df)
|
||||
str(my_df)
|
||||
|
||||
###########################
|
||||
# Data for bfactor figure
|
||||
# PS average
|
||||
# Lig average
|
||||
###########################
|
||||
|
||||
head(my_df$Position)
|
||||
head(my_df$ratioDUET)
|
||||
|
||||
# order data frame
|
||||
df = my_df[order(my_df$Position),]
|
||||
|
||||
head(df$Position)
|
||||
head(df$ratioDUET)
|
||||
|
||||
#***********
|
||||
# PS: average by position
|
||||
#***********
|
||||
|
||||
mean_DUET_by_position <- df %>%
|
||||
group_by(Position) %>%
|
||||
summarize(averaged.DUET = mean(ratioDUET))
|
||||
|
||||
#***********
|
||||
# Lig: average by position
|
||||
#***********
|
||||
mean_Lig_by_position <- df %>%
|
||||
group_by(Position) %>%
|
||||
summarize(averaged.Lig = mean(ratioPredAff))
|
||||
|
||||
|
||||
#***********
|
||||
# cbind:mean_DUET_by_position and mean_Lig_by_position
|
||||
#***********
|
||||
|
||||
combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position ))
|
||||
|
||||
# sanity check
|
||||
# mean_PS_Lig_Bfactor
|
||||
|
||||
colnames(combined)
|
||||
|
||||
colnames(combined) = c("Position"
|
||||
, "average_DUETR"
|
||||
, "Position2"
|
||||
, "average_PredAffR")
|
||||
|
||||
colnames(combined)
|
||||
|
||||
identical(combined$Position, combined$Position2)
|
||||
|
||||
n = which(colnames(combined) == "Position2"); n
|
||||
|
||||
combined_df = combined[,-n]
|
||||
|
||||
max(combined_df$average_DUETR) ; min(combined_df$average_DUETR)
|
||||
|
||||
max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR)
|
||||
|
||||
#=============
|
||||
# output csv
|
||||
#============
|
||||
outDir = "~/git/Data/pyrazinamide/input/processed/"
|
||||
outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv")
|
||||
print(paste0("Output file with path will be:","", outFile))
|
||||
|
||||
head(combined_df$Position); tail(combined_df$Position)
|
||||
|
||||
write.csv(combined_df, outFile, row.names = FALSE) # FALSE spelled out: F is an ordinary variable that can be reassigned
|
BIN
mcsm_analysis/pyrazinamide/scripts/plotting/.RData
Normal file
BIN
mcsm_analysis/pyrazinamide/scripts/plotting/.RData
Normal file
Binary file not shown.
0
mcsm_analysis/pyrazinamide/scripts/plotting/.Rhistory
Normal file
0
mcsm_analysis/pyrazinamide/scripts/plotting/.Rhistory
Normal file
|
@ -0,0 +1,250 @@
|
|||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
|
||||
getwd()
|
||||
|
||||
########################################################################
|
||||
# Installing and loading required packages #
|
||||
########################################################################
|
||||
|
||||
source("../Header_TT.R")
|
||||
#source("barplot_colour_function.R")
|
||||
library(cowplot) # provides plot_grid() used to combine p1 and p2; library() stops immediately if the package is missing
|
||||
|
||||
########################################################################
|
||||
# Read file: call script for combining df for PS #
|
||||
########################################################################
|
||||
|
||||
source("../combining_two_df.R")
|
||||
|
||||
#---------------------- PAY ATTENTION
|
||||
# the above changes the working dir
|
||||
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
|
||||
#---------------------- PAY ATTENTION
|
||||
|
||||
#==========================
|
||||
# This will return:
|
||||
|
||||
# df with NA:
|
||||
# merged_df2
|
||||
# merged_df3
|
||||
|
||||
# df without NA:
|
||||
# merged_df2_comp
|
||||
# merged_df3_comp
|
||||
#===========================
|
||||
|
||||
###########################
|
||||
# Data for OR and stability plots
|
||||
# you need merged_df3_comp
|
||||
# since these are matched
|
||||
# to allow pairwise corr
|
||||
###########################
|
||||
|
||||
# uncomment as necessary
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
my_df = merged_df3_comp
|
||||
#my_df = merged_df3
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
# delete variables not required
|
||||
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
||||
|
||||
# quick checks
|
||||
colnames(my_df)
|
||||
str(my_df)
|
||||
|
||||
# sanity check
|
||||
# Ensure correct data type in columns to plot: OR needs to be numeric
|
||||
is.numeric(my_df$OR)
|
||||
#[1] TRUE
|
||||
|
||||
#<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
# FOR PS Plots
|
||||
#<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
PS_df = my_df
|
||||
|
||||
rm(my_df)
|
||||
#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> end of section 1
|
||||
|
||||
########################################################################
|
||||
# Read file: call script for combining df for lig #
|
||||
########################################################################
|
||||
|
||||
getwd()
|
||||
|
||||
source("combining_two_df_lig.R")
|
||||
|
||||
#---------------------- PAY ATTENTION
|
||||
# the above changes the working dir
|
||||
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
|
||||
#---------------------- PAY ATTENTION
|
||||
#==========================
|
||||
# This will return:
|
||||
|
||||
# df with NA:
|
||||
# merged_df2
|
||||
# merged_df3
|
||||
|
||||
# df without NA:
|
||||
# merged_df2_comp
|
||||
# merged_df3_comp
|
||||
#===========================
|
||||
|
||||
###########################
|
||||
# Data for OR and stability plots
|
||||
# you need merged_df3_comp
|
||||
# since these are matched
|
||||
# to allow pairwise corr
|
||||
###########################
|
||||
|
||||
# uncomment as necessary
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
my_df2 = merged_df3_comp
|
||||
#my_df2 = merged_df3
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
# delete variables not required
|
||||
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
||||
|
||||
# quick checks
|
||||
colnames(my_df2)
|
||||
str(my_df2)
|
||||
|
||||
# sanity check
|
||||
# Ensure correct data type in columns to plot: OR needs to be numeric
|
||||
is.numeric(my_df2$OR)
|
||||
#[1] TRUE
|
||||
|
||||
# sanity check: should be <10
|
||||
if (max(my_df2$Dis_lig_Ang) < 10){
|
||||
print ("Sanity check passed: lig data is <10Ang")
|
||||
}else{
|
||||
print ("Error: data should be filtered to be within 10Ang")
|
||||
}
|
||||
|
||||
#<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
# FOR Lig Plots
|
||||
#<<<<<<<<<<<<<<<<
|
||||
|
||||
Lig_df = my_df2
|
||||
|
||||
rm(my_df2)
|
||||
|
||||
#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> end of section 1
|
||||
|
||||
#############
|
||||
# Plots: Bubble plot
|
||||
# x = Position, Y = stability
|
||||
# size of dots = OR
|
||||
# col: stability
|
||||
#############
|
||||
|
||||
#=================
|
||||
# generate plot 1: DUET vs OR by position as geom_points
|
||||
#=================
|
||||
|
||||
my_ats = 20 # axis text size
|
||||
my_als = 22 # axis label size
|
||||
|
||||
# Spelling Correction: made redundant as already corrected at the source
|
||||
|
||||
#PS_df$DUET_outcome[PS_df$DUET_outcome=='Stabilizing'] <- 'Stabilising'
|
||||
#PS_df$DUET_outcome[PS_df$DUET_outcome=='Destabilizing'] <- 'Destabilising'
|
||||
|
||||
table(PS_df$DUET_outcome) ; sum(table(PS_df$DUET_outcome))
|
||||
|
||||
g = ggplot(PS_df, aes(x = factor(Position)
|
||||
, y = ratioDUET))
|
||||
|
||||
p1 = g +
|
||||
geom_point(aes(col = DUET_outcome
|
||||
, size = OR)) +
|
||||
theme(axis.text.x = element_text(size = my_ats
|
||||
, angle = 90
|
||||
, hjust = 1
|
||||
, vjust = 0.4)
|
||||
, axis.text.y = element_text(size = my_ats
|
||||
, angle = 0
|
||||
, hjust = 1
|
||||
, vjust = 0)
|
||||
, axis.title.x = element_text(size = my_als)
|
||||
, axis.title.y = element_text(size = my_als)
|
||||
, legend.text = element_text(size = my_als)
|
||||
, legend.title = element_text(size = my_als) ) +
|
||||
#, legend.key.size = unit(1, "cm")) +
|
||||
labs(title = ""
|
||||
, x = "Position"
|
||||
, y = "DUET(PS)"
|
||||
, size = "Odds Ratio"
|
||||
, colour = "DUET Outcome") +
|
||||
guides(colour = guide_legend(override.aes = list(size=4)))
|
||||
|
||||
p1
|
||||
|
||||
#=================
|
||||
# generate plot 2: Lig vs OR by position as geom_points
|
||||
#=================
|
||||
my_ats = 20 # axis text size
|
||||
my_als = 22 # axis label size
|
||||
|
||||
# Spelling Correction: made redundant as already corrected at the source
|
||||
|
||||
#Lig_df$Lig_outcome[Lig_df$Lig_outcome=='Stabilizing'] <- 'Stabilising'
|
||||
#Lig_df$Lig_outcome[Lig_df$Lig_outcome=='Destabilizing'] <- 'Destabilising'
|
||||
|
||||
table(Lig_df$Lig_outcome)
|
||||
|
||||
g = ggplot(Lig_df, aes(x = factor(Position)
|
||||
, y = ratioPredAff))
|
||||
|
||||
p2 = g +
|
||||
geom_point(aes(col = Lig_outcome
|
||||
, size = OR))+
|
||||
theme(axis.text.x = element_text(size = my_ats
|
||||
, angle = 90
|
||||
, hjust = 1
|
||||
, vjust = 0.4)
|
||||
, axis.text.y = element_text(size = my_ats
|
||||
, angle = 0
|
||||
, hjust = 1
|
||||
, vjust = 0)
|
||||
, axis.title.x = element_text(size = my_als)
|
||||
, axis.title.y = element_text(size = my_als)
|
||||
, legend.text = element_text(size = my_als)
|
||||
, legend.title = element_text(size = my_als) ) +
|
||||
#, legend.key.size = unit(1, "cm")) +
|
||||
labs(title = ""
|
||||
, x = "Position"
|
||||
, y = "Ligand Affinity"
|
||||
, size = "Odds Ratio"
|
||||
, colour = "Ligand Outcome"
|
||||
) +
|
||||
guides(colour = guide_legend(override.aes = list(size=4)))
|
||||
|
||||
p2
|
||||
|
||||
#======================
|
||||
#combine using cowplot
|
||||
#======================
|
||||
# set output dir for plots
|
||||
getwd()
|
||||
setwd("~/git/Data/pyrazinamide/output/plots") # the svg below is written relative to this directory
|
||||
getwd()
|
||||
|
||||
svg('PS_Lig_OR_combined.svg', width = 32, height = 12) #inches
|
||||
#png('PS_Lig_OR_combined.png', width = 2800, height = 1080) #300dpi
|
||||
theme_set(theme_gray()) # to preserve default theme
|
||||
|
||||
printFile = cowplot::plot_grid(plot_grid(p1, p2
|
||||
, ncol = 1
|
||||
, align = 'v'
|
||||
, labels = c("A", "B")
|
||||
, label_size = my_als+5))
|
||||
print(printFile)
|
||||
dev.off()
|
||||
|
|
@ -0,0 +1,154 @@
|
|||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
|
||||
getwd()
|
||||
|
||||
########################################################################
|
||||
# Installing and loading required packages #
|
||||
########################################################################
|
||||
|
||||
source("../Header_TT.R")
|
||||
|
||||
########################################################################
|
||||
# Read file: call script for combining df for lig #
|
||||
########################################################################
|
||||
|
||||
source("../combining_two_df_lig.R")
|
||||
|
||||
#---------------------- PAY ATTENTION
|
||||
# the above changes the working dir
|
||||
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
|
||||
#---------------------- PAY ATTENTION
|
||||
|
||||
#==========================
|
||||
# This will return:
|
||||
|
||||
# df with NA:
|
||||
# merged_df2
|
||||
# merged_df3
|
||||
|
||||
# df without NA:
|
||||
# merged_df2_comp
|
||||
# merged_df3_comp
|
||||
#===========================
|
||||
|
||||
###########################
|
||||
# Data for Lig plots
|
||||
# you need merged_df3
|
||||
# or
|
||||
# merged_df3_comp
|
||||
# since these have unique SNPs
|
||||
# I prefer to use the merged_df3
|
||||
# because using the _comp dataset means
|
||||
# we lose some muts and at this level, we should use
|
||||
# as much info as available
|
||||
###########################
|
||||
|
||||
# uncomment as necessary
|
||||
#%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
# REASSIGNMENT
|
||||
my_df = merged_df3
|
||||
#my_df = merged_df3_comp
|
||||
#%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
|
||||
# delete variables not required
|
||||
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
||||
|
||||
# quick checks
|
||||
colnames(my_df)
|
||||
str(my_df)
|
||||
|
||||
#############################
|
||||
# Extra sanity check:
|
||||
# for mcsm_lig ONLY
|
||||
# Dis_lig_Ang should be <10
|
||||
#############################
|
||||
|
||||
if (max(my_df$Dis_lig_Ang) < 10){
|
||||
print ("Sanity check passed: lig data is <10Ang")
|
||||
}else{
|
||||
print ("Error: data should be filtered to be within 10Ang")
|
||||
}
|
||||
########################################################################
|
||||
# end of data extraction and cleaning for plots #
|
||||
########################################################################
|
||||
|
||||
#==========================
|
||||
# Plot: Barplot with scores (unordered)
|
||||
# corresponds to Lig_outcome
|
||||
# Stacked Barplot with colours: Lig_outcome @ position coloured by
|
||||
# Lig_outcome. This is a barplot where each bar corresponds
|
||||
# to a SNP and is coloured by its corresponding Lig_outcome.
|
||||
#============================
|
||||
|
||||
#===================
|
||||
# Data for plots
|
||||
#===================
|
||||
|
||||
#%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
# REASSIGNMENT
|
||||
df = my_df
|
||||
#%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
|
||||
rm(my_df)
|
||||
|
||||
# sanity checks
|
||||
upos = unique(df$Position) # use df: my_df was copied to df and then removed above
|
||||
|
||||
# should be a factor
|
||||
is.factor(df$Lig_outcome)
|
||||
#TRUE
|
||||
|
||||
table(df$Lig_outcome)
|
||||
|
||||
# should be -1 and 1: may not be in this case because you have filtered the data
|
||||
# FIXME: normalisation before or after filtering?
|
||||
min(df$ratioPredAff) #
|
||||
max(df$ratioPredAff) #
|
||||
|
||||
# sanity checks
|
||||
tapply(df$ratioPredAff, df$Lig_outcome, min)
|
||||
tapply(df$ratioPredAff, df$Lig_outcome, max)
|
||||
|
||||
#******************
|
||||
# generate plot
|
||||
#******************
|
||||
|
||||
# set output dir for plots
|
||||
getwd()
|
||||
setwd("~/git/Data/pyrazinamide/output/plots")
|
||||
getwd()
|
||||
|
||||
my_title = "Ligand affinity"
|
||||
|
||||
# axis label size
|
||||
my_xaxls = 13
|
||||
my_yaxls = 15
|
||||
|
||||
# axes text size
|
||||
my_xaxts = 15
|
||||
my_yaxts = 15
|
||||
|
||||
# no ordering of x-axis
|
||||
g = ggplot(df, aes(factor(Position, ordered = TRUE))) # TRUE spelled out: T is an ordinary variable that can be reassigned
|
||||
g +
|
||||
geom_bar(aes(fill = Lig_outcome), colour = "grey") +
|
||||
theme( axis.text.x = element_text(size = my_xaxls
|
||||
, angle = 90
|
||||
, hjust = 1
|
||||
, vjust = 0.4)
|
||||
, axis.text.y = element_text(size = my_yaxls
|
||||
, angle = 0
|
||||
, hjust = 1
|
||||
, vjust = 0)
|
||||
, axis.title.x = element_text(size = my_xaxts)
|
||||
, axis.title.y = element_text(size = my_yaxts ) ) +
|
||||
labs(title = my_title
|
||||
, x = "Position"
|
||||
, y = "Frequency")
|
||||
|
||||
# for sanity and good practice
|
||||
rm(df)
|
||||
#======================= end of plot
|
||||
# axis colours labels
|
||||
# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
|
||||
# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label
|
|
@ -0,0 +1,149 @@
|
|||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
|
||||
getwd()
|
||||
|
||||
########################################################################
|
||||
# Installing and loading required packages and functions #
|
||||
########################################################################
|
||||
|
||||
source("../Header_TT.R")
|
||||
|
||||
########################################################################
|
||||
# Read file: call script for combining df for PS #
|
||||
########################################################################
|
||||
|
||||
source("../combining_two_df.R")
|
||||
|
||||
#---------------------- PAY ATTENTION
|
||||
# the above changes the working dir
|
||||
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
|
||||
#---------------------- PAY ATTENTION
|
||||
|
||||
#==========================
|
||||
# This will return:
|
||||
|
||||
# df with NA:
|
||||
# merged_df2
|
||||
# merged_df3
|
||||
|
||||
# df without NA:
|
||||
# merged_df2_comp
|
||||
# merged_df3_comp
|
||||
#==========================
|
||||
|
||||
###########################
|
||||
# Data for DUET plots
|
||||
# you need merged_df3
|
||||
# or
|
||||
# merged_df3_comp
|
||||
# since these have unique SNPs
|
||||
# I prefer to use the merged_df3
|
||||
# because using the _comp dataset means
|
||||
# we lose some muts and at this level, we should use
|
||||
# as much info as available
|
||||
###########################
|
||||
|
||||
# uncomment as necessary
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
my_df = merged_df3
|
||||
#my_df = merged_df3_comp
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
# delete variables not required
|
||||
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
||||
|
||||
# quick checks
|
||||
colnames(my_df)
|
||||
str(my_df)
|
||||
|
||||
# Ensure correct data type in columns to plot: need to be factor
|
||||
# sanity check
|
||||
is.factor(my_df$DUET_outcome)
|
||||
my_df$DUET_outcome = as.factor(my_df$DUET_outcome)
|
||||
is.factor(my_df$DUET_outcome)
|
||||
#[1] TRUE
|
||||
|
||||
########################################################################
|
||||
# end of data extraction and cleaning for plots #
|
||||
########################################################################
|
||||
|
||||
#==========================
|
||||
# Plot 2: Barplot with scores (unordered)
|
||||
# corresponds to DUET_outcome
|
||||
# Stacked Barplot with colours: DUET_outcome @ position coloured by
|
||||
# DUET outcome. This is a barplot where each bar corresponds
|
||||
# to a SNP and is coloured by its corresponding DUET_outcome
|
||||
#============================
|
||||
|
||||
#===================
|
||||
# Data for plots
|
||||
#===================
|
||||
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
df = my_df
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
rm(my_df)
|
||||
|
||||
# sanity checks
|
||||
upos = unique(df$Position)
|
||||
|
||||
# should be a factor
|
||||
is.factor(df$DUET_outcome) # use df: my_df was copied to df and then removed above
|
||||
#[1] TRUE
|
||||
|
||||
table(df$DUET_outcome) # counts per DUET outcome class; use df because my_df was removed above
|
||||
|
||||
# should be -1 and 1
|
||||
min(df$ratioDUET)
|
||||
max(df$ratioDUET)
|
||||
|
||||
tapply(df$ratioDUET, df$DUET_outcome, min)
|
||||
tapply(df$ratioDUET, df$DUET_outcome, max)
|
||||
|
||||
#******************
|
||||
# generate plot
|
||||
#******************
|
||||
|
||||
# set output dir for plots
|
||||
getwd()
|
||||
setwd("~/git/Data/pyrazinamide/output/plots")
|
||||
getwd()
|
||||
|
||||
my_title = "Protein stability (DUET)"
|
||||
|
||||
# axis label size
|
||||
my_xaxls = 13
|
||||
my_yaxls = 15
|
||||
|
||||
# axes text size
|
||||
my_xaxts = 15
|
||||
my_yaxts = 15
|
||||
|
||||
# no ordering of x-axis
|
||||
g = ggplot(df, aes(factor(Position, ordered = TRUE))) # TRUE spelled out: T is an ordinary variable that can be reassigned
|
||||
g +
|
||||
geom_bar(aes(fill = DUET_outcome), colour = "grey") +
|
||||
|
||||
theme( axis.text.x = element_text(size = my_xaxls
|
||||
, angle = 90
|
||||
, hjust = 1
|
||||
, vjust = 0.4)
|
||||
, axis.text.y = element_text(size = my_yaxls
|
||||
, angle = 0
|
||||
, hjust = 1
|
||||
, vjust = 0)
|
||||
, axis.title.x = element_text(size = my_xaxts)
|
||||
, axis.title.y = element_text(size = my_yaxts ) ) +
|
||||
labs(title = my_title
|
||||
, x = "Position"
|
||||
, y = "Frequency")
|
||||
|
||||
# for sanity and good practice
|
||||
rm(df)
|
||||
#======================= end of plot
|
||||
# axis colours labels
|
||||
# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
|
||||
# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label
|
|
@ -0,0 +1,202 @@
|
|||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
|
||||
getwd()
|
||||
|
||||
########################################################################
|
||||
# Installing and loading required packages and functions #
|
||||
########################################################################
|
||||
|
||||
source("../Header_TT.R")
|
||||
source("../barplot_colour_function.R")
|
||||
|
||||
########################################################################
|
||||
# Read file: call script for combining df for lig #
|
||||
########################################################################
|
||||
|
||||
source("../combining_two_df_lig.R")
|
||||
|
||||
#---------------------- PAY ATTENTION
|
||||
# the above changes the working dir
|
||||
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
|
||||
#---------------------- PAY ATTENTION
|
||||
|
||||
#==========================
|
||||
# This will return:
|
||||
|
||||
# df with NA:
|
||||
# merged_df2
|
||||
# merged_df3
|
||||
|
||||
# df without NA:
|
||||
# merged_df2_comp
|
||||
# merged_df3_comp
|
||||
#===========================
|
||||
|
||||
###########################
|
||||
# Data for Lig plots
|
||||
# you need merged_df3
|
||||
# or
|
||||
# merged_df3_comp
|
||||
# since these have unique SNPs
|
||||
# I prefer to use the merged_df3
|
||||
# because using the _comp dataset means
|
||||
# we lose some muts and at this level, we should use
|
||||
# as much info as available
|
||||
###########################
|
||||
|
||||
# uncomment as necessary
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
my_df = merged_df3
|
||||
#my_df = merged_df3_comp
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
# delete variables not required
|
||||
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
||||
|
||||
# quick checks
|
||||
colnames(my_df)
|
||||
str(my_df)
|
||||
|
||||
# Ensure correct data type in columns to plot: need to be factor
|
||||
# sanity check
|
||||
is.factor(my_df$Lig_outcome)
|
||||
my_df$Lig_outcome = as.factor(my_df$Lig_outcome) # convert the outcome column to a factor for plotting
|
||||
is.factor(my_df$Lig_outcome)
|
||||
#[1] TRUE
|
||||
|
||||
#############################
|
||||
# Extra sanity check:
|
||||
# for mcsm_lig ONLY
|
||||
# Dis_lig_Ang should be <10
|
||||
#############################
|
||||
|
||||
if (max(my_df$Dis_lig_Ang) < 10){
|
||||
print ("Sanity check passed: lig data is <10Ang")
|
||||
}else{
|
||||
print ("Error: data should be filtered to be within 10Ang")
|
||||
}
|
||||
########################################################################
|
||||
# end of data extraction and cleaning for plots #
|
||||
########################################################################
|
||||
|
||||
#==========================
|
||||
# Plot: Barplot with scores (unordered)
|
||||
# corresponds to Lig_outcome
|
||||
# Stacked Barplot with colours: Lig_outcome @ position coloured by
|
||||
# stability scores. This is a barplot where each bar corresponds
|
||||
# to a SNP and is coloured by its corresponding Lig stability value.
|
||||
# Normalised values (range between -1 and 1 ) to aid visualisation
|
||||
# NOTE: since barplot plots discrete values, colour = score, so number of
|
||||
# colours will be equal to the no. of unique normalised scores
|
||||
# rather than a continuous scale
|
||||
# will require generating the colour scale separately.
|
||||
#============================
|
||||
|
||||
#===================
|
||||
# Data for plots
|
||||
#===================
|
||||
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
df = my_df
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
rm(my_df)
|
||||
|
||||
# sanity checks
|
||||
table(df$Lig_outcome)
|
||||
|
||||
# should be -1 and 1: may not be in this case because you have filtered the data
|
||||
# FIXME: normalisation before or after filtering?
|
||||
min(df$ratioPredAff) #
|
||||
max(df$ratioPredAff) #
|
||||
|
||||
# sanity checks
|
||||
# very important!!!!
|
||||
tapply(df$ratioPredAff, df$Lig_outcome, min)
|
||||
|
||||
tapply(df$ratioPredAff, df$Lig_outcome, max)
|
||||
|
||||
|
||||
#******************
|
||||
# generate plot
|
||||
#******************
|
||||
|
||||
# set output dir for plots
|
||||
getwd()
|
||||
setwd("~/git/Data/pyrazinamide/output/plots")
|
||||
getwd()
|
||||
|
||||
# My colour FUNCTION: based on group and subgroup
|
||||
# in my case;
|
||||
# df = df
|
||||
# group = Lig_outcome
|
||||
# subgroup = normalised score i.e ratioPredAff
|
||||
|
||||
# Prepare data: round off ratioLig scores
|
||||
# round off to 3 decimal places:
|
||||
# 165 if no rounding is performed: used to generate the original graph
|
||||
# 156 if rounded to 3 places
|
||||
# FIXME: check if reducing precision creates any ML prob
|
||||
|
||||
# check unique values in normalised data
|
||||
u = unique(df$ratioPredAff)
|
||||
|
||||
# <<<<< -------------------------------------------
|
||||
# Run this section if rounding is to be used
|
||||
# specify number for rounding
|
||||
n = 3
|
||||
df$ratioLigR = round(df$ratioPredAff, n)
|
||||
u = unique(df$ratioLigR) # 156
|
||||
# create an extra column called group which contains the "gp name and score"
|
||||
# so colours can be generated for each unique values in this column
|
||||
my_grp = df$ratioLigR
|
||||
df$group <- paste0(df$Lig_outcome, "_", my_grp) # "<outcome>_<rounded score>"; paste0() has no sep argument
|
||||
|
||||
# else
|
||||
# uncomment the below if rounding is not required
|
||||
|
||||
#my_grp = df$ratioLig
|
||||
#df$group <- paste0(df$Lig_outcome, "_", my_grp, sep = "")
|
||||
|
||||
# <<<<< -----------------------------------------------
|
||||
|
||||
# Call the function to create the palette based on the group defined above
|
||||
colours <- ColourPalleteMulti(df, "Lig_outcome", "my_grp")
|
||||
my_title = "Ligand affinity"
|
||||
|
||||
# axis label size
|
||||
my_xaxls = 13
|
||||
my_yaxls = 15
|
||||
|
||||
# axes text size
|
||||
my_xaxts = 15
|
||||
my_yaxts = 15
|
||||
|
||||
# no ordering of x-axis
|
||||
g = ggplot(df, aes(factor(Position, ordered = T)))
|
||||
g +
|
||||
geom_bar(aes(fill = group), colour = "grey") +
|
||||
scale_fill_manual( values = colours
|
||||
, guide = 'none') +
|
||||
theme( axis.text.x = element_text(size = my_xaxls
|
||||
, angle = 90
|
||||
, hjust = 1
|
||||
, vjust = 0.4)
|
||||
, axis.text.y = element_text(size = my_yaxls
|
||||
, angle = 0
|
||||
, hjust = 1
|
||||
, vjust = 0)
|
||||
, axis.title.x = element_text(size = my_xaxts)
|
||||
, axis.title.y = element_text(size = my_yaxts ) ) +
|
||||
labs(title = my_title
|
||||
, x = "Position"
|
||||
, y = "Frequency")
|
||||
|
||||
# for sanity and good practice
|
||||
rm(df)
|
||||
#======================= end of plot
|
||||
# axis colours labels
|
||||
# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
|
||||
# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label
|
|
@ -0,0 +1,192 @@
|
|||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
|
||||
getwd()
|
||||
|
||||
########################################################################
|
||||
# Installing and loading required packages and functions #
|
||||
########################################################################
|
||||
|
||||
source("../Header_TT.R")
|
||||
source("../barplot_colour_function.R")
|
||||
|
||||
########################################################################
|
||||
# Read file: call script for combining df for PS #
|
||||
########################################################################
|
||||
|
||||
source("../combining_two_df.R")
|
||||
|
||||
#---------------------- PAY ATTENTION
|
||||
# the above changes the working dir
|
||||
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
|
||||
#---------------------- PAY ATTENTION
|
||||
|
||||
#==========================
|
||||
# This will return:
|
||||
|
||||
# df with NA:
|
||||
# merged_df2
|
||||
# merged_df3
|
||||
|
||||
# df without NA:
|
||||
# merged_df2_comp
|
||||
# merged_df3_comp
|
||||
#===========================
|
||||
|
||||
###########################
|
||||
# Data for DUET plots
|
||||
# you need merged_df3
|
||||
# or
|
||||
# merged_df3_comp
|
||||
# since these have unique SNPs
|
||||
# I prefer to use the merged_df3
|
||||
# because using the _comp dataset means
|
||||
# we lose some muts and at this level, we should use
|
||||
# as much info as available
|
||||
###########################
|
||||
|
||||
# uncomment as necessary
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
my_df = merged_df3
|
||||
#my_df = merged_df3_comp
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
# delete variables not required
|
||||
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
||||
|
||||
# quick checks
|
||||
colnames(my_df)
|
||||
str(my_df)
|
||||
|
||||
# Ensure correct data type in columns to plot: need to be factor
|
||||
# sanity check
|
||||
is.factor(my_df$DUET_outcome)
|
||||
my_df$DUET_outcome = as.factor(my_df$DUET_outcome)
|
||||
is.factor(my_df$DUET_outcome)
|
||||
#[1] TRUE
|
||||
|
||||
########################################################################
|
||||
# end of data extraction and cleaning for plots #
|
||||
########################################################################
|
||||
|
||||
#==========================
|
||||
# Barplot with scores (unordered)
|
||||
# corresponds to DUET_outcome
|
||||
# Stacked Barplot with colours: DUET_outcome @ position coloured by
|
||||
# stability scores. This is a barplot where each bar corresponds
|
||||
# to a SNP and is coloured by its corresponding DUET stability value.
|
||||
# Normalised values (range between -1 and 1 ) to aid visualisation
|
||||
# NOTE: since barplot plots discrete values, colour = score, so number of
|
||||
# colours will be equal to the no. of unique normalised scores
|
||||
# rather than a continuous scale
|
||||
# will require generating the colour scale separately.
|
||||
#============================
|
||||
|
||||
#===================
|
||||
# Data for plots
|
||||
#===================
|
||||
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
df = my_df
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
rm(my_df)
|
||||
|
||||
# sanity checks
|
||||
upos = unique(df$Position)
|
||||
|
||||
# should be a factor
|
||||
# my_df was removed above after being reassigned to df, so check df here
is.factor(df$DUET_outcome)
|
||||
#[1] TRUE
|
||||
|
||||
table(df$DUET_outcome)
|
||||
|
||||
# should be -1 and 1
|
||||
min(df$ratioDUET)
|
||||
max(df$ratioDUET)
|
||||
|
||||
tapply(df$ratioDUET, df$DUET_outcome, min)
|
||||
tapply(df$ratioDUET, df$DUET_outcome, max)
|
||||
|
||||
#******************
|
||||
# generate plot
|
||||
#******************
|
||||
|
||||
# set output dir for plots
|
||||
getwd()
|
||||
setwd("~/git/Data/pyrazinamide/output/plots")
|
||||
getwd()
|
||||
|
||||
# My colour FUNCTION: based on group and subgroup
|
||||
# in my case;
|
||||
# df = df
|
||||
# group = DUET_outcome
|
||||
# subgroup = normalised score i.e ratioDUET
|
||||
|
||||
# Prepare data: round off ratioDUET scores
|
||||
# round off to 3 decimal places:
|
||||
# 323 if no rounding is performed: used to generate the original graph
|
||||
# 287 if rounded to 3 places
|
||||
# FIXME: check if reducing precision creates any ML prob
|
||||
|
||||
# check unique values in normalised data
|
||||
u = unique(df$ratioDUET)
|
||||
|
||||
# <<<<< -------------------------------------------
|
||||
# Run this section if rounding is to be used
|
||||
# specify number for rounding
|
||||
n = 3
|
||||
df$ratioDUETR = round(df$ratioDUET, n)
|
||||
u = unique(df$ratioDUETR)
|
||||
# create an extra column called group which contains the "gp name and score"
|
||||
# so colours can be generated for each unique values in this column
|
||||
my_grp = df$ratioDUETR
|
||||
# paste0() has no 'sep' argument (it always joins with ""), so none is passed
df$group <- paste0(df$DUET_outcome, "_", my_grp)
|
||||
|
||||
# else
|
||||
# uncomment the below if rounding is not required
|
||||
|
||||
#my_grp = df$ratioDUET
|
||||
#df$group <- paste0(df$DUET_outcome, "_", my_grp, sep = "")
|
||||
|
||||
# <<<<< -----------------------------------------------
|
||||
|
||||
# Call the function to create the palette based on the group defined above
|
||||
colours <- ColourPalleteMulti(df, "DUET_outcome", "my_grp")
|
||||
my_title = "Protein stability (DUET)"
|
||||
|
||||
# axis label size
|
||||
my_xaxls = 13
|
||||
my_yaxls = 15
|
||||
|
||||
# axes text size
|
||||
my_xaxts = 15
|
||||
my_yaxts = 15
|
||||
|
||||
# no ordering of x-axis
|
||||
g = ggplot(df, aes(factor(Position, ordered = T)))
|
||||
g +
|
||||
geom_bar(aes(fill = group), colour = "grey") +
|
||||
scale_fill_manual( values = colours
|
||||
, guide = 'none') +
|
||||
theme( axis.text.x = element_text(size = my_xaxls
|
||||
, angle = 90
|
||||
, hjust = 1
|
||||
, vjust = 0.4)
|
||||
, axis.text.y = element_text(size = my_yaxls
|
||||
, angle = 0
|
||||
, hjust = 1
|
||||
, vjust = 0)
|
||||
, axis.title.x = element_text(size = my_xaxts)
|
||||
, axis.title.y = element_text(size = my_yaxts ) ) +
|
||||
labs(title = my_title
|
||||
, x = "Position"
|
||||
, y = "Frequency")
|
||||
|
||||
# for sanity and good practice
|
||||
rm(df)
|
||||
#======================= end of plot
|
||||
# axis colours labels
|
||||
# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
|
||||
# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label
|
215
mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_LIG.R
Normal file
215
mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_LIG.R
Normal file
|
@ -0,0 +1,215 @@
|
|||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
|
||||
getwd()
|
||||
|
||||
########################################################################
|
||||
# Installing and loading required packages #
|
||||
########################################################################
|
||||
|
||||
source("../Header_TT.R")
|
||||
|
||||
#require(data.table)
|
||||
#require(dplyr)
|
||||
|
||||
########################################################################
|
||||
# Read file: call script for combining df for lig #
|
||||
########################################################################
|
||||
|
||||
source("../combining_two_df_lig.R")
|
||||
|
||||
#---------------------- PAY ATTENTION
|
||||
# the above changes the working dir
|
||||
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
|
||||
#---------------------- PAY ATTENTION
|
||||
|
||||
#==========================
|
||||
# This will return:
|
||||
|
||||
# df with NA:
|
||||
# merged_df2
|
||||
# merged_df3
|
||||
|
||||
# df without NA:
|
||||
# merged_df2_comp
|
||||
# merged_df3_comp
|
||||
#===========================
|
||||
|
||||
###########################
|
||||
# Data for Lig plots
|
||||
# you need merged_df3
|
||||
# or
|
||||
# merged_df3_comp
|
||||
# since these have unique SNPs
|
||||
# I prefer to use the merged_df3
|
||||
# because using the _comp dataset means
|
||||
# we lose some muts and at this level, we should use
|
||||
# as much info as available
|
||||
###########################
|
||||
|
||||
# uncomment as necessary
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
my_df = merged_df3
|
||||
#my_df = merged_df3_comp
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
# delete variables not required
|
||||
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
||||
|
||||
# quick checks
|
||||
colnames(my_df)
|
||||
str(my_df)
|
||||
|
||||
# Ensure correct data type in columns to plot: need to be factor
|
||||
# sanity check
|
||||
is.factor(my_df$Lig_outcome)
|
||||
# column names are case-sensitive: convert the 'Lig_outcome' column that is
# checked immediately before and after this line
my_df$Lig_outcome = as.factor(my_df$Lig_outcome)
|
||||
is.factor(my_df$Lig_outcome)
|
||||
#[1] TRUE
|
||||
|
||||
#############################
|
||||
# Extra sanity check:
|
||||
# for mcsm_lig ONLY
|
||||
# Dis_lig_Ang should be <10
|
||||
#############################
|
||||
|
||||
if (max(my_df$Dis_lig_Ang) < 10){
|
||||
print ("Sanity check passed: lig data is <10Ang")
|
||||
}else{
|
||||
print ("Error: data should be filtered to be within 10Ang")
|
||||
}
|
||||
|
||||
########################################################################
|
||||
# end of data extraction and cleaning for plots #
|
||||
########################################################################
|
||||
|
||||
#===========================
|
||||
# Plot: Basic barplots
|
||||
#===========================
|
||||
|
||||
#===================
|
||||
# Data for plots
|
||||
#===================
|
||||
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
df = my_df
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
rm(my_df)
|
||||
|
||||
# sanity checks
|
||||
str(df)
|
||||
|
||||
if (identical(df$Position, df$position)){
|
||||
print("Sanity check passed: Columns 'Position' and 'position' are identical")
|
||||
} else{
|
||||
print("Error!: Check column names and info contained")
|
||||
}
|
||||
|
||||
#****************
|
||||
# generate plot: No of stabilising and destabilising muts
|
||||
#****************
|
||||
# set output dir for plots
|
||||
getwd()
|
||||
setwd("~/git/Data/pyrazinamide/output/plots")
|
||||
getwd()
|
||||
|
||||
svg('basic_barplots_LIG.svg')
|
||||
|
||||
my_ats = 25 # axis text size
|
||||
my_als = 22 # axis label size
|
||||
|
||||
# uncomment as necessary for either directly outputting results or
|
||||
# printing on the screen
|
||||
g = ggplot(df, aes(x = Lig_outcome))
|
||||
# assigned to prinfFile so that print(prinfFile) below renders it into the svg device
prinfFile = g + geom_bar(
|
||||
#g + geom_bar(
|
||||
aes(fill = Lig_outcome)
|
||||
, show.legend = TRUE
|
||||
) + geom_label(
|
||||
stat = "count"
|
||||
, aes(label = ..count..)
|
||||
, color = "black"
|
||||
, show.legend = FALSE
|
||||
, size = 10) + theme(
|
||||
axis.text.x = element_blank()
|
||||
, axis.title.x = element_blank()
|
||||
, axis.title.y = element_text(size=my_als)
|
||||
, axis.text.y = element_text(size = my_ats)
|
||||
, legend.position = c(0.73,0.8)
|
||||
, legend.text = element_text(size=my_als-2)
|
||||
, legend.title = element_text(size=my_als)
|
||||
, plot.title = element_blank()
|
||||
) + labs(
|
||||
title = ""
|
||||
, y = "Number of SNPs"
|
||||
#, fill='Ligand Outcome'
|
||||
) + scale_fill_discrete(name = "Ligand Outcome"
|
||||
, labels = c("Destabilising", "Stabilising"))
|
||||
print(prinfFile)
|
||||
dev.off()
|
||||
|
||||
#****************
|
||||
# generate plot: No of positions
|
||||
#****************
|
||||
#get freq count of positions so you can subset freq<1
|
||||
#require(data.table)
|
||||
setDT(df)[, pos_count := .N, by = .(Position)] #169, 36
|
||||
|
||||
head(df$pos_count)
|
||||
table(df$pos_count)
|
||||
# this is cumulative
|
||||
#1 2 3 4 5 6
|
||||
#5 24 36 56 30 18
|
||||
|
||||
# use group by on this
|
||||
snpsBYpos_df <- df %>%
|
||||
group_by(Position) %>%
|
||||
summarize(snpsBYpos = mean(pos_count))
|
||||
|
||||
table(snpsBYpos_df$snpsBYpos)
|
||||
#1 2 3 4 5 6
|
||||
#5 12 12 14 6 3
|
||||
# this is what will get plotted
|
||||
|
||||
svg('position_count_LIG.svg')
|
||||
|
||||
my_ats = 25 # axis text size
|
||||
my_als = 22 # axis label size
|
||||
|
||||
g = ggplot(snpsBYpos_df, aes(x = snpsBYpos))
|
||||
prinfFile = g + geom_bar(
|
||||
#g + geom_bar(
|
||||
aes (alpha = 0.5)
|
||||
, show.legend = FALSE
|
||||
) +
|
||||
geom_label(
|
||||
stat = "count", aes(label = ..count..)
|
||||
, color = "black"
|
||||
, size = 10
|
||||
) +
|
||||
theme(
|
||||
axis.text.x = element_text(
|
||||
size = my_ats
|
||||
, angle = 0
|
||||
)
|
||||
, axis.text.y = element_text(
|
||||
size = my_ats
|
||||
, angle = 0
|
||||
, hjust = 1
|
||||
)
|
||||
, axis.title.x = element_text(size = my_als)
|
||||
, axis.title.y = element_text(size = my_als)
|
||||
, plot.title = element_blank()
|
||||
) +
|
||||
labs(
|
||||
x = "Number of SNPs"
|
||||
, y = "Number of Sites"
|
||||
)
|
||||
print(prinfFile)
|
||||
dev.off()
|
||||
########################################################################
|
||||
# end of Lig barplots #
|
||||
########################################################################
|
||||
|
||||
|
211
mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_PS.R
Normal file
211
mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_PS.R
Normal file
|
@ -0,0 +1,211 @@
|
|||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
|
||||
getwd()
|
||||
|
||||
########################################################################
|
||||
# Installing and loading required packages and functions #
|
||||
########################################################################
|
||||
|
||||
source("../Header_TT.R")
|
||||
|
||||
########################################################################
|
||||
# Read file: call script for combining df for PS #
|
||||
########################################################################
|
||||
|
||||
source("../combining_two_df.R")
|
||||
|
||||
#---------------------- PAY ATTENTION
|
||||
# the above changes the working dir
|
||||
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
|
||||
#---------------------- PAY ATTENTION
|
||||
|
||||
#==========================
|
||||
# This will return:
|
||||
|
||||
# df with NA:
|
||||
# merged_df2
|
||||
# merged_df3
|
||||
|
||||
# df without NA:
|
||||
# merged_df2_comp
|
||||
# merged_df3_comp
|
||||
#==========================
|
||||
|
||||
###########################
|
||||
# Data for DUET plots
|
||||
# you need merged_df3
|
||||
# or
|
||||
# merged_df3_comp
|
||||
# since these have unique SNPs
|
||||
# I prefer to use the merged_df3
|
||||
# because using the _comp dataset means
|
||||
# we lose some muts and at this level, we should use
|
||||
# as much info as available
|
||||
###########################
|
||||
|
||||
# uncomment as necessary
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
my_df = merged_df3
|
||||
#my_df = merged_df3_comp
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
# delete variables not required
|
||||
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
||||
|
||||
# quick checks
|
||||
colnames(my_df)
|
||||
str(my_df)
|
||||
|
||||
# Ensure correct data type in columns to plot: need to be factor
|
||||
# sanity check
|
||||
is.factor(my_df$DUET_outcome)
|
||||
my_df$DUET_outcome = as.factor(my_df$DUET_outcome)
|
||||
is.factor(my_df$DUET_outcome)
|
||||
#[1] TRUE
|
||||
|
||||
########################################################################
|
||||
# end of data extraction and cleaning for plots #
|
||||
########################################################################
|
||||
|
||||
#===========================
|
||||
# Plot: Basic barplots
|
||||
#===========================
|
||||
|
||||
#===================
|
||||
# Data for plots
|
||||
#===================
|
||||
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
df = my_df
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
rm(my_df)
|
||||
|
||||
# sanity checks
|
||||
str(df)
|
||||
|
||||
if (identical(df$Position, df$position)){
|
||||
print("Sanity check passed: Columns 'Position' and 'position' are identical")
|
||||
} else{
|
||||
print("Error!: Check column names and info contained")
|
||||
}
|
||||
|
||||
#****************
|
||||
# generate plot: No of stabilising and destabilising muts
|
||||
#****************
|
||||
# set output dir for plots
|
||||
getwd()
|
||||
setwd("~/git/Data/pyrazinamide/output/plots")
|
||||
getwd()
|
||||
|
||||
svg('basic_barplots_DUET.svg')
|
||||
|
||||
my_ats = 25 # axis text size
|
||||
my_als = 22 # axis label size
|
||||
|
||||
theme_set(theme_grey())
|
||||
|
||||
# uncomment as necessary for either directly outputting results or
|
||||
# printing on the screen
|
||||
g = ggplot(df, aes(x = DUET_outcome))
|
||||
prinfFile = g + geom_bar(
|
||||
#g + geom_bar(
|
||||
aes(fill = DUET_outcome)
|
||||
, show.legend = TRUE
|
||||
) + geom_label(
|
||||
stat = "count"
|
||||
, aes(label = ..count..)
|
||||
, color = "black"
|
||||
, show.legend = FALSE
|
||||
, size = 10) + theme(
|
||||
axis.text.x = element_blank()
|
||||
, axis.title.x = element_blank()
|
||||
, axis.title.y = element_text(size=my_als)
|
||||
, axis.text.y = element_text(size = my_ats)
|
||||
, legend.position = c(0.73,0.8)
|
||||
, legend.text = element_text(size=my_als-2)
|
||||
, legend.title = element_text(size=my_als)
|
||||
, plot.title = element_blank()
|
||||
) + labs(
|
||||
title = ""
|
||||
, y = "Number of SNPs"
|
||||
#, fill='DUET Outcome'
|
||||
) + scale_fill_discrete(name = "DUET Outcome"
|
||||
, labels = c("Destabilising", "Stabilising"))
|
||||
|
||||
print(prinfFile)
|
||||
dev.off()
|
||||
|
||||
#****************
|
||||
# generate plot: No of positions
|
||||
#****************
|
||||
#get freq count of positions so you can subset freq<1
|
||||
#setDT(df)[, .(Freq := .N), by = .(Position)] #189, 36
|
||||
|
||||
setDT(df)[, pos_count := .N, by = .(Position)] #335, 36
|
||||
table(df$pos_count)
|
||||
# this is cumulative
|
||||
#1 2 3 4 5 6
|
||||
#34 76 63 104 40 18
|
||||
|
||||
# use group by on this
|
||||
snpsBYpos_df <- df %>%
|
||||
group_by(Position) %>%
|
||||
summarize(snpsBYpos = mean(pos_count))
|
||||
|
||||
table(snpsBYpos_df$snpsBYpos)
|
||||
#1 2 3 4 5 6
|
||||
#34 38 21 26 8 3
|
||||
|
||||
foo = select(df, Mutationinformation
|
||||
, WildPos
|
||||
, wild_type
|
||||
, mutant_type
|
||||
, mutation_info
|
||||
, position
|
||||
, pos_count) #335, 5
|
||||
|
||||
getwd()
|
||||
write.csv(foo, "../Data/pos_count_freq.csv")
|
||||
|
||||
svg('position_count_DUET.svg')
|
||||
my_ats = 25 # axis text size
|
||||
my_als = 22 # axis label size
|
||||
|
||||
g = ggplot(snpsBYpos_df, aes(x = snpsBYpos))
|
||||
prinfFile = g + geom_bar(
|
||||
#g + geom_bar(
|
||||
aes (alpha = 0.5)
|
||||
, show.legend = FALSE
|
||||
) +
|
||||
geom_label(
|
||||
stat = "count", aes(label = ..count..)
|
||||
, color = "black"
|
||||
, size = 10
|
||||
) +
|
||||
theme(
|
||||
axis.text.x = element_text(
|
||||
size = my_ats
|
||||
, angle = 0
|
||||
)
|
||||
, axis.text.y = element_text(
|
||||
size = my_ats
|
||||
, angle = 0
|
||||
, hjust = 1
|
||||
)
|
||||
, axis.title.x = element_text(size = my_als)
|
||||
, axis.title.y = element_text(size = my_als)
|
||||
, plot.title = element_blank()
|
||||
) +
|
||||
labs(
|
||||
x = "Number of SNPs"
|
||||
, y = "Number of Sites"
|
||||
)
|
||||
print(prinfFile)
|
||||
dev.off()
|
||||
########################################################################
|
||||
# end of DUET barplots #
|
||||
########################################################################
|
||||
|
175
mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_PS.R
Normal file
175
mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_PS.R
Normal file
|
@ -0,0 +1,175 @@
|
|||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
|
||||
getwd()
|
||||
|
||||
########################################################################
|
||||
# Installing and loading required packages and functions #
|
||||
########################################################################
|
||||
|
||||
source("../Header_TT.R")
|
||||
|
||||
#source("barplot_colour_function.R")
|
||||
|
||||
########################################################################
|
||||
# Read file: call script for combining df for PS #
|
||||
########################################################################
|
||||
|
||||
source("../combining_two_df.R")
|
||||
|
||||
#---------------------- PAY ATTENTION
|
||||
# the above changes the working dir
|
||||
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
|
||||
#---------------------- PAY ATTENTION
|
||||
|
||||
#==========================
|
||||
# This will return:
|
||||
|
||||
# df with NA:
|
||||
# merged_df2
|
||||
# merged_df3
|
||||
|
||||
# df without NA:
|
||||
# merged_df2_comp
|
||||
# merged_df3_comp
|
||||
#==========================
|
||||
|
||||
###########################
|
||||
# Data for PS Corr plots
|
||||
# you need merged_df3_comp
|
||||
# since these are matched
|
||||
# to allow pairwise corr
|
||||
###########################
|
||||
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
my_df = merged_df3_comp
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
# delete variables not required
|
||||
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
||||
|
||||
# quick checks
|
||||
colnames(my_df)
|
||||
str(my_df)
|
||||
|
||||
########################################################################
|
||||
# end of data extraction and cleaning for plots #
|
||||
########################################################################
|
||||
|
||||
#===========================
|
||||
# Plot: Correlation plots
|
||||
#===========================
|
||||
|
||||
#===================
|
||||
# Data for plots
|
||||
#===================
|
||||
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
df = my_df
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
rm(my_df)
|
||||
|
||||
# sanity checks
|
||||
str(df)
|
||||
|
||||
table(df$DUET_outcome)
|
||||
|
||||
# unique positions
|
||||
length(unique(df$Position)) #{RESULT: unique positions for comp data}
|
||||
|
||||
|
||||
# subset data to generate pairwise correlations
|
||||
corr_data = df[, c("ratioDUET"
|
||||
# , "ratioPredAff"
|
||||
# , "DUETStability_Kcalpermol"
|
||||
# , "PredAffLog"
|
||||
# , "OR"
|
||||
, "logor"
|
||||
# , "pvalue"
|
||||
, "neglog10pvalue"
|
||||
, "AF"
|
||||
, "DUET_outcome"
|
||||
# , "Lig_outcome"
|
||||
, "pyrazinamide"
|
||||
)]
|
||||
dim(corr_data)
|
||||
rm(df)
|
||||
|
||||
# assign nice colnames (for display)
|
||||
my_corr_colnames = c("DUET"
|
||||
# , "Ligand Affinity"
|
||||
# , "DUET_raw"
|
||||
# , "Lig_raw"
|
||||
# , "OR"
|
||||
, "Log(Odds Ratio)"
|
||||
# , "P-value"
|
||||
, "-LogP"
|
||||
, "Allele Frequency"
|
||||
, "DUET_outcome"
|
||||
# , "Lig_outcome"
|
||||
, "pyrazinamide")
|
||||
|
||||
# sanity check
|
||||
if (length(my_corr_colnames) == length(corr_data)){
|
||||
print("Sanity check passed: corr_data and corr_names match in length")
|
||||
}else{
|
||||
print("Error: length mismatch!")
|
||||
}
|
||||
|
||||
colnames(corr_data)
|
||||
colnames(corr_data) <- my_corr_colnames
|
||||
colnames(corr_data)
|
||||
|
||||
###############
|
||||
# PLOTS: corr
|
||||
# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
|
||||
###############
|
||||
#default pairs plot
|
||||
start = 1
|
||||
end = which(colnames(corr_data) == "pyrazinamide"); end # should be the last column
|
||||
offset = 1
|
||||
|
||||
my_corr = corr_data[start:(end-offset)]
|
||||
head(my_corr)
|
||||
|
||||
#my_cols = c("#f8766d", "#00bfc4")
|
||||
# deep blue :#007d85
|
||||
# deep red: #ae301e
|
||||
|
||||
#==========
|
||||
# psych: informative since it draws the ellipsoid
|
||||
# https://jamesmarquezportfolio.com/correlation_matrices_in_r.html
|
||||
# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
|
||||
#==========
|
||||
|
||||
# set output dir for plots
|
||||
getwd()
|
||||
setwd("~/git/Data/pyrazinamide/output/plots")
|
||||
getwd()
|
||||
|
||||
svg('DUET_corr.svg', width = 15, height = 15)
|
||||
printFile = pairs.panels(my_corr[1:4]
|
||||
, method = "spearman" # correlation method
|
||||
, hist.col = "grey" ##00AFBB
|
||||
, density = TRUE # show density plots
|
||||
, ellipses = F # show correlation ellipses
|
||||
, stars = T
|
||||
, rug = F
|
||||
, breaks = "Sturges"
|
||||
, show.points = T
|
||||
, bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr$DUET_outcome))]
|
||||
, pch = 21
|
||||
, jitter = T
|
||||
#, alpha = .05
|
||||
#, points(pch = 19, col = c("#f8766d", "#00bfc4"))
|
||||
, cex = 3
|
||||
, cex.axis = 2.5
|
||||
, cex.labels = 3
|
||||
, cex.cor = 1
|
||||
, smooth = F
|
||||
)
|
||||
|
||||
print(printFile)
|
||||
dev.off()
|
187
mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_lig.R
Normal file
187
mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_lig.R
Normal file
|
@ -0,0 +1,187 @@
|
|||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
|
||||
getwd()
|
||||
|
||||
########################################################################
|
||||
# Installing and loading required packages #
|
||||
########################################################################
|
||||
|
||||
source("../Header_TT.R")
|
||||
|
||||
#source("barplot_colour_function.R")
|
||||
|
||||
########################################################################
|
||||
# Read file: call script for combining df for lig #
|
||||
########################################################################
|
||||
|
||||
source("../combining_two_df_lig.R")
|
||||
|
||||
#---------------------- PAY ATTENTION
|
||||
# the above changes the working dir
|
||||
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
|
||||
#---------------------- PAY ATTENTION
|
||||
|
||||
#==========================
|
||||
# This will return:
|
||||
|
||||
# df with NA:
|
||||
# merged_df2
|
||||
# merged_df3
|
||||
|
||||
# df without NA:
|
||||
# merged_df2_comp
|
||||
# merged_df3_comp
|
||||
#===========================
|
||||
|
||||
###########################
|
||||
# Data for Lig Corr plots
|
||||
# you need merged_df3_comp
|
||||
# since these are matched
|
||||
# to allow pairwise corr
|
||||
###########################
|
||||
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
my_df = merged_df3_comp
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
# delete variables not required
|
||||
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
||||
|
||||
# quick checks
|
||||
colnames(my_df)
|
||||
str(my_df)
|
||||
|
||||
#############################
|
||||
# Extra sanity check:
|
||||
# for mcsm_lig ONLY
|
||||
# Dis_lig_Ang should be <10
|
||||
#############################
|
||||
|
||||
if (max(my_df$Dis_lig_Ang) < 10){
|
||||
print ("Sanity check passed: lig data is <10Ang")
|
||||
}else{
|
||||
print ("Error: data should be filtered to be within 10Ang")
|
||||
}
|
||||
|
||||
########################################################################
|
||||
# end of data extraction and cleaning for plots #
|
||||
########################################################################
|
||||
|
||||
#===========================
|
||||
# Plot: Correlation plots
|
||||
#===========================
|
||||
|
||||
#===================
|
||||
# Data for plots
|
||||
#===================
|
||||
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
df = my_df
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
rm(my_df)
|
||||
|
||||
# sanity checks
|
||||
str(df)
|
||||
|
||||
table(df$Lig_outcome)
|
||||
|
||||
# unique positions
|
||||
length(unique(df$Position)) #{RESULT: unique positions for comp data}
|
||||
|
||||
# subset data to generate pairwise correlations
|
||||
corr_data = df[, c(#"ratioDUET",
|
||||
"ratioPredAff"
|
||||
# , "DUETStability_Kcalpermol"
|
||||
# , "PredAffLog"
|
||||
# , "OR"
|
||||
, "logor"
|
||||
# , "pvalue"
|
||||
, "neglog10pvalue"
|
||||
, "AF"
|
||||
# , "DUET_outcome"
|
||||
, "Lig_outcome"
|
||||
, "pyrazinamide"
|
||||
)]
|
||||
dim(corr_data)
|
||||
rm(df)
|
||||
|
||||
# assign nice colnames (for display)
|
||||
my_corr_colnames = c(#"DUET",
|
||||
"Ligand Affinity"
|
||||
# ,"DUET_raw"
|
||||
# , "Lig_raw"
|
||||
# , "OR"
|
||||
, "Log(Odds Ratio)"
|
||||
# , "P-value"
|
||||
, "-LogP"
|
||||
, "Allele Frequency"
|
||||
# , "DUET_outcome"
|
||||
, "Lig_outcome"
|
||||
, "pyrazinamide")
|
||||
|
||||
# sanity check
|
||||
if (length(my_corr_colnames) == length(corr_data)){
|
||||
print("Sanity check passed: corr_data and corr_names match in length")
|
||||
}else{
|
||||
print("Error: length mismatch!")
|
||||
}
|
||||
|
||||
colnames(corr_data)
|
||||
colnames(corr_data) <- my_corr_colnames
|
||||
colnames(corr_data)
|
||||
|
||||
###############
|
||||
# PLOTS: corr
|
||||
# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
|
||||
###############
|
||||
|
||||
# default pairs plot
|
||||
start = 1
|
||||
end = which(colnames(corr_data) == "pyrazinamide"); end # should be the last column
|
||||
offset = 1
|
||||
|
||||
my_corr = corr_data[start:(end-offset)]
|
||||
head(my_corr)
|
||||
|
||||
#my_cols = c("#f8766d", "#00bfc4")
|
||||
# deep blue :#007d85
|
||||
# deep red: #ae301e
|
||||
|
||||
#==========
|
||||
# psych: informative since it draws the ellipsoid
|
||||
# https://jamesmarquezportfolio.com/correlation_matrices_in_r.html
|
||||
# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
|
||||
#==========
|
||||
|
||||
# set output dir for plots
|
||||
getwd()
|
||||
setwd("~/git/Data/pyrazinamide/output/plots")
|
||||
getwd()
|
||||
|
||||
svg('Lig_corr.svg', width = 15, height = 15)
|
||||
printFile = pairs.panels(my_corr[1:4]
|
||||
, method = "spearman" # correlation method
|
||||
, hist.col = "grey" ##00AFBB
|
||||
, density = TRUE # show density plots
|
||||
, ellipses = F # show correlation ellipses
|
||||
, stars = T
|
||||
, rug = F
|
||||
, breaks = "Sturges"
|
||||
, show.points = T
|
||||
, bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr$Lig_outcome))]
|
||||
, pch = 21
|
||||
, jitter = T
|
||||
# , alpha = .05
|
||||
# , points(pch = 19, col = c("#f8766d", "#00bfc4"))
|
||||
, cex = 3
|
||||
, cex.axis = 2.5
|
||||
, cex.labels = 3
|
||||
, cex.cor = 1
|
||||
, smooth = F
|
||||
)
|
||||
print(printFile)
|
||||
dev.off()
|
||||
|
|
@ -0,0 +1,227 @@
|
|||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
|
||||
getwd()
|
||||
|
||||
########################################################################
|
||||
# Installing and loading required packages #
|
||||
########################################################################
|
||||
|
||||
source("../Header_TT.R")
|
||||
#source("barplot_colour_function.R")
|
||||
|
||||
# library() stops with an error if the package is missing; require() would
# only return FALSE and let the script fail later at the first use.
library(data.table)
|
||||
|
||||
########################################################################
|
||||
# Read file: call script for combining df #
|
||||
########################################################################
|
||||
|
||||
source("../combining_two_df.R")
|
||||
|
||||
#---------------------- PAY ATTENTION
|
||||
# the above changes the working dir
|
||||
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
|
||||
#---------------------- PAY ATTENTION
|
||||
|
||||
#==========================
|
||||
# This will return:
|
||||
|
||||
# df with NA:
|
||||
# merged_df2
|
||||
# merged_df3
|
||||
|
||||
# df without NA:
|
||||
# merged_df2_comp
|
||||
# merged_df3_comp
|
||||
#==========================
|
||||
|
||||
###########################
|
||||
# Data for plots
|
||||
# you need merged_df2, comprehensive one
|
||||
# since this has one-many relationship
|
||||
# i.e the same SNP can belong to multiple lineages
|
||||
###########################
|
||||
|
||||
# uncomment as necessary
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
my_df = merged_df2
|
||||
#my_df = merged_df2_comp
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
# delete variables not required
|
||||
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
||||
|
||||
# quick checks
|
||||
colnames(my_df)
|
||||
str(my_df)
|
||||
|
||||
# Ensure correct data type in columns to plot: need to be factor
|
||||
is.factor(my_df$lineage)
|
||||
my_df$lineage = as.factor(my_df$lineage)
|
||||
is.factor(my_df$lineage)
|
||||
|
||||
#==========================
|
||||
# Plot: Lineage barplot
|
||||
# x = lineage y = No. of samples
|
||||
# col = Lineage
|
||||
# fill = lineage
|
||||
#============================
|
||||
table(my_df$lineage)
|
||||
|
||||
# lineage1 lineage2 lineage3 lineage4 lineage5 lineage6 lineageBOV
|
||||
#3 104 1293 264 1311 6 6 105
|
||||
|
||||
#===========================
|
||||
# Plot: Lineage Barplots
|
||||
#===========================
|
||||
|
||||
#===================
|
||||
# Data for plots
|
||||
#===================
|
||||
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
df <- my_df
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
rm(my_df)
|
||||
|
||||
# get freq count of positions so you can subset freq<1
|
||||
#setDT(df)[, lineage_count := .N, by = .(lineage)]
|
||||
|
||||
#******************
|
||||
# generate plot: barplot of mutation by lineage
|
||||
#******************
|
||||
sel_lineages = c("lineage1"
|
||||
, "lineage2"
|
||||
, "lineage3"
|
||||
, "lineage4")
|
||||
|
||||
df_lin = subset(df, subset = lineage %in% sel_lineages )
|
||||
|
||||
#FIXME; add sanity check for numbers.
|
||||
# Done this manually
|
||||
|
||||
############################################################
|
||||
|
||||
#########
|
||||
# Data for barplot: Lineage barplot
|
||||
# to show total samples and number of unique mutations
|
||||
# within each lineage
|
||||
##########
|
||||
|
||||
# Create df with lineage inform & no. of unique mutations
|
||||
# per lineage and total samples within lineage
|
||||
# this is essentially barplot with two y axis
|
||||
|
||||
# Summary table: one row per selected lineage with
#   num_snps_u    = number of distinct mutations (Mutationinformation)
#   total_samples = number of distinct sample ids (id)
bar = as.data.frame(sel_lineages) #4, 1

# Row indices belonging to each lineage; which() drops NA lineages so they
# can never be counted towards a lineage.
rows_by_lineage = lapply(sel_lineages, function(lin) which(df$lineage == lin))

# Subset the ids by lineage FIRST and de-duplicate afterwards.
# unique(df$id)[df$lineage == i] applies a row-length mask to the shorter,
# already de-duplicated id vector, which counts rows (NA-padded) instead of
# distinct samples.
total_samples = vapply(rows_by_lineage
                       , function(rows) length(unique(df$id[rows]))
                       , integer(1))

# distinct mutations observed within each lineage
total_snps_u = vapply(rows_by_lineage
                      , function(rows) length(unique(df$Mutationinformation[rows]))
                      , integer(1))

print(total_samples)
print(total_snps_u)
bar$num_snps_u = total_snps_u
bar$total_samples = total_samples
bar
|
||||
|
||||
#*****************
|
||||
# generate plot: lineage barplot with two y-axis
|
||||
#https://stackoverflow.com/questions/13035295/overlay-bar-graphs-in-ggplot2
|
||||
#*****************
|
||||
|
||||
# Pull the plotting vectors OUT of the summary table. The assignments were
# written the wrong way round (e.g. bar$num_snps_u = y1), which reads the
# not-yet-defined y1, y2 and x and would overwrite the counts held in bar.
y1 = bar$num_snps_u     # unique mutations per lineage
y2 = bar$total_samples  # total samples per lineage
x = sel_lineages        # lineage names (x-axis categories)
|
||||
|
||||
to_plot = data.frame(x = x
|
||||
, y1 = y1
|
||||
, y2 = y2)
|
||||
to_plot
|
||||
|
||||
melted = melt(to_plot, id = "x")
|
||||
melted
|
||||
|
||||
# set output dir for plots
|
||||
getwd()
|
||||
setwd("~/git/Data/pyrazinamide/output/plots")
|
||||
getwd()
|
||||
|
||||
svg('lineage_basic_barplot.svg')
|
||||
|
||||
my_ats = 20 # axis text size
|
||||
my_als = 22 # axis label size
|
||||
|
||||
g = ggplot(melted
|
||||
, aes(x = x
|
||||
, y = value
|
||||
, fill = variable)
|
||||
)
|
||||
|
||||
|
||||
printFile = g + geom_bar(
|
||||
|
||||
#g + geom_bar(
|
||||
stat = "identity"
|
||||
, position = position_stack(reverse = TRUE)
|
||||
, alpha=.75
|
||||
, colour='grey75'
|
||||
) + theme(
|
||||
axis.text.x = element_text(
|
||||
size = my_ats
|
||||
# , angle= 30
|
||||
)
|
||||
, axis.text.y = element_text(size = my_ats
|
||||
#, angle = 30
|
||||
, hjust = 1
|
||||
, vjust = 0)
|
||||
, axis.title.x = element_text(
|
||||
size = my_als
|
||||
, colour = 'black'
|
||||
)
|
||||
, axis.title.y = element_text(
|
||||
size = my_als
|
||||
, colour = 'black'
|
||||
)
|
||||
, legend.position = "top"
|
||||
, legend.text = element_text(size = my_als)
|
||||
|
||||
#) + geom_text(
|
||||
) + geom_label(
|
||||
aes(label = value)
|
||||
, size = 5
|
||||
, hjust = 0.5
|
||||
, vjust = 0.5
|
||||
, colour = 'black'
|
||||
, show.legend = FALSE
|
||||
#, check_overlap = TRUE
|
||||
, position = position_stack(reverse = T)
|
||||
#, position = ('
|
||||
|
||||
) + labs(
|
||||
title = ''
|
||||
, x = ''
|
||||
, y = "Number"
|
||||
, fill = 'Variable'
|
||||
, colour = 'black'
|
||||
) + scale_fill_manual(
|
||||
values = c('grey50', 'gray75')
|
||||
, name=''
|
||||
, labels=c('Mutations', 'Total Samples')
|
||||
) + scale_x_discrete(
|
||||
breaks = c('lineage1', 'lineage2', 'lineage3', 'lineage4')
|
||||
, labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
|
||||
)
|
||||
print(printFile)
|
||||
dev.off()
|
233
mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_LIG.R
Normal file
233
mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_LIG.R
Normal file
|
@ -0,0 +1,233 @@
|
|||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
|
||||
getwd()
|
||||
|
||||
########################################################################
|
||||
# Installing and loading required packages #
|
||||
########################################################################
|
||||
|
||||
source("../Header_TT.R")
|
||||
#source("barplot_colour_function.R")
|
||||
#require(data.table)
|
||||
|
||||
########################################################################
|
||||
# Read file: call script for combining df for Lig #
|
||||
########################################################################
|
||||
|
||||
source("../combining_two_df_lig.R")
|
||||
|
||||
#---------------------- PAY ATTENTION
|
||||
# the above changes the working dir
|
||||
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
|
||||
#---------------------- PAY ATTENTION
|
||||
|
||||
#==========================
|
||||
# This will return:
|
||||
|
||||
# df with NA:
|
||||
# merged_df2
|
||||
# merged_df3
|
||||
|
||||
# df without NA:
|
||||
# merged_df2_comp
|
||||
# merged_df3_comp
|
||||
#===========================
|
||||
###########################
|
||||
# Data for plots
|
||||
# you need merged_df2 or merged_df2_comp
|
||||
# since this is one-many relationship
|
||||
# i.e the same SNP can belong to multiple lineages
|
||||
###########################
|
||||
|
||||
# uncomment as necessary
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
my_df = merged_df2
|
||||
#my_df = merged_df2_comp
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
# delete variables not required
|
||||
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
||||
|
||||
# quick checks
|
||||
colnames(my_df)
|
||||
str(my_df)
|
||||
|
||||
# Ensure correct data type in columns to plot: need to be factor
|
||||
is.factor(my_df$lineage)
|
||||
my_df$lineage = as.factor(my_df$lineage)
|
||||
is.factor(my_df$lineage)
|
||||
|
||||
table(my_df$mutation_info)
|
||||
|
||||
#############################
|
||||
# Extra sanity check:
|
||||
# for mcsm_lig ONLY
|
||||
# Dis_lig_Ang should be <10
|
||||
#############################
|
||||
|
||||
if (max(my_df$Dis_lig_Ang) < 10){
|
||||
print ("Sanity check passed: lig data is <10Ang")
|
||||
}else{
|
||||
print ("Error: data should be filtered to be within 10Ang")
|
||||
}
|
||||
|
||||
########################################################################
|
||||
# end of data extraction and cleaning for plots #
|
||||
########################################################################
|
||||
|
||||
#==========================
|
||||
# Plot: Lineage Distribution
|
||||
# x = mcsm_values, y = dist
|
||||
# fill = stability
|
||||
#============================
|
||||
|
||||
#===================
|
||||
# Data for plots
|
||||
#===================
|
||||
|
||||
# subset only lineages1-4
|
||||
sel_lineages = c("lineage1"
|
||||
, "lineage2"
|
||||
, "lineage3"
|
||||
, "lineage4")
|
||||
|
||||
# uncomment as necessary
|
||||
df_lin = subset(my_df, subset = lineage %in% sel_lineages ) #2037 35
|
||||
|
||||
# refactor
|
||||
df_lin$lineage = factor(df_lin$lineage)
|
||||
|
||||
table(df_lin$lineage) #{RESULT: No of samples within lineage}
|
||||
#lineage1 lineage2 lineage3 lineage4
|
||||
#78 961 195 803
|
||||
|
||||
# when merged_df2_comp is used
|
||||
#lineage1 lineage2 lineage3 lineage4
|
||||
#77 955 194 770
|
||||
|
||||
length(unique(df_lin$Mutationinformation))
|
||||
#{Result: No. of unique mutations the 4 lineages contribute to}
|
||||
|
||||
# sanity checks
|
||||
# Rows kept in df_lin must equal the combined count of the selected lineages
# in my_df. The lineages are located by name rather than by a fixed position
# (previously 2:5), so the check does not need editing when the input is
# switched between merged_df2 and merged_df2_comp
# (presumably the two differ in how many lineage levels precede lineage1).
lin_counts = table(my_df$lineage)
r1 = which(names(lin_counts) %in% sel_lineages)
if(sum(lin_counts[r1]) == nrow(df_lin)) {
  print ("sanity check passed: numbers match")
} else{
  print("Error!: check your numbers")
}
|
||||
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
df <- df_lin
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
rm(df_lin)
|
||||
|
||||
#******************
|
||||
# generate distribution plot of lineages
|
||||
#******************
|
||||
# basic: could improve this!
|
||||
library(plotly)
|
||||
library(ggridges)
|
||||
|
||||
fooNames = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
|
||||
names(fooNames) = c('lineage1', 'lineage2', 'lineage3', 'lineage4')
|
||||
|
||||
# Faceted kernel density of ratioPredAff per lineage, filled by Lig_outcome.
# Every component must be chained with '+': without it after
# coord_cartesian(), ggtitle() is evaluated as a separate statement and the
# title never reaches g.
g <- ggplot(df, aes(x = ratioPredAff)) +
  geom_density(aes(fill = Lig_outcome)
               , alpha = 0.5) +
  facet_wrap( ~ lineage
              , scales = "free"
              , labeller = labeller(lineage = fooNames) ) +
  coord_cartesian(xlim = c(-1, 1)
                  # , ylim = c(0, 6)
                  # , clip = "off"
  ) +
  ggtitle("Kernel Density estimates of Ligand affinity by lineage")
|
||||
|
||||
ggplotly(g)
|
||||
|
||||
# 2 : ggridges (good!)
|
||||
|
||||
my_ats = 15 # axis text size
|
||||
my_als = 20 # axis label size
|
||||
|
||||
fooNames = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
|
||||
names(fooNames) = c('lineage1', 'lineage2', 'lineage3', 'lineage4')
|
||||
|
||||
# set output dir for plots
|
||||
getwd()
|
||||
setwd("~/git/Data/pyrazinamide/output/plots")
|
||||
getwd()
|
||||
|
||||
svg('lineage_dist_LIG.svg')
|
||||
|
||||
printFile = ggplot( df, aes(x = ratioPredAff
|
||||
, y = Lig_outcome) ) +
|
||||
|
||||
geom_density_ridges_gradient( aes(fill = ..x..)
|
||||
, scale = 3
|
||||
, size = 0.3 ) +
|
||||
facet_wrap( ~lineage
|
||||
, scales = "free"
|
||||
# , switch = 'x'
|
||||
, labeller = labeller(lineage = fooNames) ) +
|
||||
coord_cartesian( xlim = c(-1, 1)
|
||||
# , ylim = c(0, 6)
|
||||
# , clip = "off"
|
||||
) +
|
||||
|
||||
scale_fill_gradientn( colours = c("#f8766d", "white", "#00bfc4")
|
||||
, name = "Ligand Affinity" ) +
|
||||
theme( axis.text.x = element_text( size = my_ats
|
||||
, angle = 90
|
||||
, hjust = 1
|
||||
, vjust = 0.4)
|
||||
# , axis.text.y = element_text( size = my_ats
|
||||
# , angle = 0
|
||||
# , hjust = 1
|
||||
# , vjust = 0)
|
||||
, axis.text.y = element_blank()
|
||||
, axis.title.x = element_blank()
|
||||
, axis.title.y = element_blank()
|
||||
, axis.ticks.y = element_blank()
|
||||
, plot.title = element_blank()
|
||||
, strip.text = element_text(size = my_als)
|
||||
, legend.text = element_text(size = 10)
|
||||
, legend.title = element_text(size = my_als)
|
||||
# , legend.position = c(0.3, 0.8)
|
||||
# , legend.key.height = unit(1, 'mm')
|
||||
)
|
||||
|
||||
print(printFile)
|
||||
dev.off()
|
||||
|
||||
#=!=!=!=!=!=!
|
||||
# COMMENT: When you look at all mutations, the lineage differences disappear...
|
||||
# The pattern we are interested in is possibly only for dr_mutations
|
||||
#=!=!=!=!=!=!
|
||||
|
||||
#===================================================
|
||||
|
||||
# COMPARING DISTRIBUTIONS
|
||||
head(df$lineage)
|
||||
df$lineage = as.character(df$lineage)
|
||||
|
||||
lin1 = df[df$lineage == "lineage1",]$ratioPredAff
|
||||
lin2 = df[df$lineage == "lineage2",]$ratioPredAff
|
||||
lin3 = df[df$lineage == "lineage3",]$ratioPredAff
|
||||
lin4 = df[df$lineage == "lineage4",]$ratioPredAff
|
||||
|
||||
# ks test
|
||||
ks.test(lin1,lin2)
|
||||
ks.test(lin1,lin3)
|
||||
ks.test(lin1,lin4)
|
||||
|
||||
ks.test(lin2,lin3)
|
||||
ks.test(lin2,lin4)
|
||||
|
||||
ks.test(lin3,lin4)
|
||||
|
||||
|
||||
|
212
mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R
Normal file
212
mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R
Normal file
|
@ -0,0 +1,212 @@
|
|||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") # thinkpad
|
||||
getwd()
|
||||
|
||||
########################################################################
|
||||
# Installing and loading required packages #
|
||||
########################################################################
|
||||
|
||||
source("../Header_TT.R")
|
||||
#source("barplot_colour_function.R")
|
||||
#require(data.table)
|
||||
|
||||
########################################################################
|
||||
# Read file: call script for combining df for PS #
|
||||
########################################################################
|
||||
|
||||
source("../combining_two_df.R")
|
||||
|
||||
#---------------------- PAY ATTENTION
|
||||
# the above changes the working dir
|
||||
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
|
||||
#---------------------- PAY ATTENTION
|
||||
|
||||
#==========================
|
||||
# This will return:
|
||||
|
||||
# df with NA:
|
||||
# merged_df2
|
||||
# merged_df3
|
||||
|
||||
# df without NA:
|
||||
# merged_df2_comp
|
||||
# merged_df3_comp
|
||||
#===========================
|
||||
|
||||
###########################
|
||||
# Data for plots
|
||||
# you need merged_df2 or merged_df2_comp
|
||||
# since this is one-many relationship
|
||||
# i.e the same SNP can belong to multiple lineages
|
||||
###########################
|
||||
|
||||
# uncomment as necessary
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
my_df = merged_df2
|
||||
#my_df = merged_df2_comp
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
# delete variables not required
|
||||
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
||||
|
||||
# quick checks
|
||||
colnames(my_df)
|
||||
str(my_df)
|
||||
|
||||
# Ensure correct data type in columns to plot: need to be factor
|
||||
is.factor(my_df$lineage)
|
||||
my_df$lineage = as.factor(my_df$lineage)
|
||||
is.factor(my_df$lineage)
|
||||
|
||||
table(my_df$mutation_info)
|
||||
|
||||
########################################################################
|
||||
# end of data extraction and cleaning for plots #
|
||||
########################################################################
|
||||
|
||||
#==========================
|
||||
# Plot: Lineage Distribution
|
||||
# x = mcsm_values, y = dist
|
||||
# fill = stability
|
||||
#============================
|
||||
|
||||
#===================
|
||||
# Data for plots
|
||||
#===================
|
||||
|
||||
# subset only lineages1-4
|
||||
sel_lineages = c("lineage1"
|
||||
, "lineage2"
|
||||
, "lineage3"
|
||||
, "lineage4")
|
||||
|
||||
# uncomment as necessary
|
||||
df_lin = subset(my_df, subset = lineage %in% sel_lineages )
|
||||
|
||||
# refactor
|
||||
df_lin$lineage = factor(df_lin$lineage)
|
||||
|
||||
table(df_lin$lineage) #{RESULT: No of samples within lineage}
|
||||
#lineage1 lineage2 lineage3 lineage4
|
||||
#104 1293 264 1311
|
||||
|
||||
# when merged_df2_comp is used
|
||||
#lineage1 lineage2 lineage3 lineage4
|
||||
#99 1275 263 1255
|
||||
|
||||
length(unique(df_lin$Mutationinformation))
|
||||
#{Result: No. of unique mutations the 4 lineages contribute to}
|
||||
|
||||
# sanity checks
|
||||
# Rows kept in df_lin must equal the combined count of the selected lineages
# in my_df. The lineages are located by name rather than by a fixed position
# (previously 2:5), so the check does not need editing when the input is
# switched between merged_df2 and merged_df2_comp
# (presumably the two differ in how many lineage levels precede lineage1).
lin_counts = table(my_df$lineage)
r1 = which(names(lin_counts) %in% sel_lineages)
if(sum(lin_counts[r1]) == nrow(df_lin)) {
  print ("sanity check passed: numbers match")
} else{
  print("Error!: check your numbers")
}
|
||||
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
df <- df_lin
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
rm(df_lin)
|
||||
|
||||
#******************
|
||||
# generate distribution plot of lineages
|
||||
#******************
|
||||
# basic: could improve this!
|
||||
library(plotly)
|
||||
library(ggridges)
|
||||
|
||||
g <- ggplot(df, aes(x = ratioDUET)) +
|
||||
geom_density(aes(fill = DUET_outcome)
|
||||
, alpha = 0.5) + facet_wrap(~ lineage,
|
||||
scales = "free") +
|
||||
ggtitle("Kernel Density estimates of Protein stability by lineage")
|
||||
|
||||
ggplotly(g)
|
||||
|
||||
# 2 : ggridges (good!)
|
||||
|
||||
my_ats = 15 # axis text size
|
||||
my_als = 20 # axis label size
|
||||
|
||||
fooNames=c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
|
||||
names(fooNames)=c('lineage1', 'lineage2', 'lineage3', 'lineage4')
|
||||
|
||||
# set output dir for plots
|
||||
getwd()
|
||||
setwd("~/git/Data/pyrazinamide/output/plots")
|
||||
getwd()
|
||||
|
||||
svg('lineage_dist_PS.svg')
|
||||
|
||||
printFile = ggplot( df, aes(x = ratioDUET
|
||||
, y = DUET_outcome) )+
|
||||
|
||||
#printFile=geom_density_ridges_gradient(
|
||||
geom_density_ridges_gradient( aes(fill = ..x..)
|
||||
, scale = 3
|
||||
, size = 0.3 ) +
|
||||
facet_wrap( ~lineage
|
||||
, scales = "free"
|
||||
# , switch = 'x'
|
||||
, labeller = labeller(lineage = fooNames) ) +
|
||||
coord_cartesian( xlim = c(-1, 1)
|
||||
# , ylim = c(0, 6)
|
||||
# , clip = "off"
|
||||
) +
|
||||
scale_fill_gradientn( colours = c("#f8766d", "white", "#00bfc4")
|
||||
, name = "DUET" ) +
|
||||
theme( axis.text.x = element_text( size = my_ats
|
||||
, angle = 90
|
||||
, hjust = 1
|
||||
, vjust = 0.4)
|
||||
# , axis.text.y = element_text( size = my_ats
|
||||
# , angle = 0
|
||||
# , hjust = 1
|
||||
# , vjust = 0)
|
||||
, axis.text.y = element_blank()
|
||||
, axis.title.x = element_blank()
|
||||
, axis.title.y = element_blank()
|
||||
, axis.ticks.y = element_blank()
|
||||
, plot.title = element_blank()
|
||||
, strip.text = element_text(size=my_als)
|
||||
, legend.text = element_text(size=10)
|
||||
, legend.title = element_text(size=my_als)
|
||||
# , legend.position = c(0.3, 0.8)
|
||||
# , legend.key.height = unit(1, 'mm')
|
||||
)
|
||||
|
||||
print(printFile)
|
||||
dev.off()
|
||||
|
||||
#=!=!=!=!=!=!
|
||||
# COMMENT: When you look at all mutations, the lineage differences disappear...
|
||||
# The pattern we are interested in is possibly only for dr_mutations
|
||||
#=!=!=!=!=!=!
|
||||
#===================================================
|
||||
|
||||
# COMPARING DISTRIBUTIONS
|
||||
head(df$lineage)
|
||||
df$lineage = as.character(df$lineage)
|
||||
|
||||
lin1 = df[df$lineage == "lineage1",]$ratioDUET
|
||||
lin2 = df[df$lineage == "lineage2",]$ratioDUET
|
||||
lin3 = df[df$lineage == "lineage3",]$ratioDUET
|
||||
lin4 = df[df$lineage == "lineage4",]$ratioDUET
|
||||
|
||||
# ks test
|
||||
ks.test(lin1,lin2)
|
||||
ks.test(lin1,lin3)
|
||||
ks.test(lin1,lin4)
|
||||
|
||||
ks.test(lin2,lin3)
|
||||
ks.test(lin2,lin4)
|
||||
|
||||
ks.test(lin3,lin4)
|
||||
|
||||
|
||||
|
27
mcsm_analysis/pyrazinamide/scripts/read_pdb.R
Normal file
27
mcsm_analysis/pyrazinamide/scripts/read_pdb.R
Normal file
|
@ -0,0 +1,27 @@
|
|||
#########################
|
||||
#3: Read complex pdb file
|
||||
##########################
|
||||
source("Header_TT.R")
|
||||
# This script only reads the pdb file of your complex
|
||||
|
||||
# read in pdb file complex1
|
||||
inDir = "~/git/Data/pyrazinamide/input/structure/"
|
||||
inFile = paste0(inDir, "complex1_no_water.pdb")
|
||||
complex1 = inFile
|
||||
|
||||
#inFile2 = paste0(inDir, "complex2_no_water.pdb")
|
||||
#complex2 = inFile2
|
||||
|
||||
# list of 8
|
||||
my_pdb = read.pdb(complex1
|
||||
, maxlines = -1
|
||||
, multi = FALSE
|
||||
, rm.insert = FALSE
|
||||
, rm.alt = TRUE
|
||||
, ATOM.only = FALSE
|
||||
, hex = FALSE
|
||||
, verbose = TRUE)
|
||||
|
||||
rm(inDir, inFile, complex1)
|
||||
#====== end of script
|
||||
|
386
mcsm_analysis/pyrazinamide/scripts/replaceBfactor_pdb.R
Normal file
386
mcsm_analysis/pyrazinamide/scripts/replaceBfactor_pdb.R
Normal file
|
@ -0,0 +1,386 @@
|
|||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
|
||||
getwd()
|
||||
|
||||
########################################################################
|
||||
# Installing and loading required packages #
|
||||
########################################################################
|
||||
|
||||
source("Header_TT.R")
|
||||
|
||||
#########################################################
|
||||
# TASK: replace B-factors in the pdb file with normalised values
|
||||
# use the complex file with no water as mCSM lig was
|
||||
# performed on this file. You can check it in the script: read_pdb file.
|
||||
#########################################################
|
||||
|
||||
###########################
|
||||
# 2: Read file: average stability values
|
||||
# or mcsm_normalised file, output of step 4 mcsm pipeline
|
||||
###########################
|
||||
|
||||
inDir = "~/git/Data/pyrazinamide/input/processed/"
|
||||
inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
|
||||
|
||||
my_df <- read.csv(inFile
|
||||
# , row.names = 1
|
||||
# , stringsAsFactors = F
|
||||
, header = T)
|
||||
str(my_df)
|
||||
|
||||
#=========================================================
|
||||
# Processing P1: Replacing B factor with mean ratioDUET scores
|
||||
#=========================================================
|
||||
|
||||
#########################
|
||||
# Read complex pdb file
|
||||
# from the R script
|
||||
##########################
|
||||
|
||||
source("read_pdb.R") # list of 8
|
||||
|
||||
# extract atom list into a variable
|
||||
# since in the list this corresponds to data frame, variable will be a df
|
||||
d = my_pdb[[1]]
|
||||
|
||||
# make a copy: required for downstream sanity checks
|
||||
d2 = d
|
||||
|
||||
# sanity checks: B factor
|
||||
max(d$b); min(d$b)
|
||||
|
||||
#*******************************************
|
||||
# plot histograms for inspection
|
||||
# 1: original B-factors
|
||||
# 2: original DUET Scores
|
||||
# 3: replaced B-factors with DUET Scores
|
||||
#*********************************************
|
||||
# Set the margin on all sides
|
||||
par(oma = c(3,2,3,0)
|
||||
, mar = c(1,3,5,2)
|
||||
, mfrow = c(3,2))
|
||||
#par(mfrow = c(3,2))
|
||||
|
||||
#1: Original B-factor
|
||||
hist(d$b
|
||||
, xlab = ""
|
||||
, main = "B-factor")
|
||||
|
||||
plot(density(d$b)
|
||||
, xlab = ""
|
||||
, main = "B-factor")
|
||||
|
||||
# 2: DUET scores
|
||||
hist(my_df$average_DUETR
|
||||
, xlab = ""
|
||||
, main = "Norm_DUET")
|
||||
|
||||
plot(density(my_df$average_DUETR)
|
||||
, xlab = ""
|
||||
, main = "Norm_DUET")
|
||||
|
||||
# 3: After the following replacement
|
||||
#********************************
|
||||
|
||||
#=========
|
||||
# step 0_P1: DONT RUN once you have double checked the matched output
|
||||
#=========
|
||||
# sanity check: match and assign to a separate column to double check
|
||||
# colnames(my_df)
|
||||
# d$ratioDUET = my_df$average_DUETR[match(d$resno, my_df$Position)]
|
||||
|
||||
#=========
|
||||
# step 1_P1
|
||||
#=========
|
||||
# Be brave and replace in place now (don't run sanity check)
|
||||
# this makes all the B-factor values in the non-matched positions as NA
|
||||
d$b = my_df$average_DUETR[match(d$resno, my_df$Position)]
|
||||
|
||||
#=========
|
||||
# step 2_P1
|
||||
#=========
|
||||
# count NA in B-factor: atoms whose resno found no match in my_df$Position
b_na = sum(is.na(d$b)) ; b_na

# count number of 0's in B-factor BEFORE the replacement
# na.rm = TRUE is required: with NAs present, sum(d$b == 0) is itself NA
b_zero = sum(d$b == 0, na.rm = TRUE) ; b_zero
#table(d$b)

# replace all NA in b factor with 0
d$b[is.na(d$b)] = 0

# sanity check: should be 0
sum(is.na(d$b))

# sanity check: should be True
# zeros after replacement = zeros already present + NAs that were replaced
# (comparing against b_na alone fails whenever a matched value is exactly 0)
if (sum(d$b == 0) == b_na + b_zero){
  print ("Sanity check passed: NA's replaced with 0's successfully")
} else {
  print("Error: NA replacement NOT successful, Debug code!")
}
|
||||
|
||||
max(d$b); min(d$b)
|
||||
|
||||
# sanity checks: should be True
|
||||
if(max(d$b) == max(my_df$average_DUETR)){
|
||||
print("Sanity check passed: B-factors replaced correctly")
|
||||
} else {
|
||||
print ("Error: Debug code please")
|
||||
}
|
||||
|
||||
if (min(d$b) == min(my_df$average_DUETR)){
|
||||
print("Sanity check passed: B-factors replaced correctly")
|
||||
} else {
|
||||
print ("Error: Debug code please")
|
||||
}
|
||||
|
||||
#=========
|
||||
# step 3_P1
|
||||
#=========
|
||||
# sanity check: dim should be same before reassignment
|
||||
# should be TRUE
|
||||
dim(d) == dim(d2)
|
||||
|
||||
#=========
|
||||
# step 4_P1
|
||||
#=========
|
||||
# assign it back to the pdb file
|
||||
my_pdb[[1]] = d
|
||||
|
||||
max(d$b); min(d$b)
|
||||
|
||||
#=========
|
||||
# step 5_P1
|
||||
#=========
|
||||
# output dir
|
||||
getwd()
|
||||
outDir = "~/git/Data/pyrazinamide/input/structure/"
|
||||
|
||||
outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile
|
||||
write.pdb(my_pdb, outFile)
|
||||
|
||||
#********************************
|
||||
# Add the 3rd histogram and density plots for comparisons
|
||||
#********************************
|
||||
# Plots continued...
|
||||
# 3: hist and density of replaced B-factors with DUET Scores
|
||||
# histogram of the B-factor column after replacement
# title typo fixed ("repalced-B") so it matches the density plot's title
hist(d$b
     , xlab = ""
     , main = "replaced-B")
|
||||
|
||||
plot(density(d$b)
|
||||
, xlab = ""
|
||||
, main = "replaced-B")
|
||||
|
||||
# graph titles
|
||||
mtext(text = "Frequency"
|
||||
, side = 2
|
||||
, line = 0
|
||||
, outer = TRUE)
|
||||
|
||||
mtext(text = "DUET_stability"
|
||||
, side = 3
|
||||
, line = 0
|
||||
, outer = TRUE)
|
||||
#********************************
|
||||
|
||||
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
# NOTE: This replaced B-factor distribution has the same
|
||||
# x-axis as the PredAff normalised values, but the distribution
|
||||
# is affected since 0 is overinflated. This is because all the positions
|
||||
# where there are no SNPs have been assigned 0.
|
||||
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#######################################################################
|
||||
#====================== end of section 1 ==============================
|
||||
#######################################################################
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#=========================================================
|
||||
# Processing P2: Replacing B values with PredAff Scores
|
||||
#=========================================================
|
||||
# clear workspace
|
||||
rm(list = ls())
|
||||
|
||||
###########################
|
||||
# 2: Read file: average stability values
|
||||
# or mcsm_normalised file, output of step 4 mcsm pipeline
|
||||
###########################
|
||||
|
||||
inDir = "~/git/Data/pyrazinamide/input/processed/"
|
||||
inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
|
||||
|
||||
my_df <- read.csv(inFile
|
||||
# , row.names = 1
|
||||
# , stringsAsFactors = F
|
||||
, header = T)
|
||||
str(my_df)
|
||||
#rm(inDir, inFile)
|
||||
|
||||
#########################
|
||||
# 3: Read complex pdb file
|
||||
# from the R script
|
||||
##########################
|
||||
|
||||
source("read_pdb.R") # list of 8
|
||||
|
||||
# extract atom list into a variable
|
||||
# since in the list this corresponds to data frame, variable will be a df
|
||||
d = my_pdb[[1]]
|
||||
|
||||
# make a copy: required for downstream sanity checks
|
||||
d2 = d
|
||||
|
||||
# sanity checks: B factor
|
||||
max(d$b); min(d$b)
|
||||
|
||||
#*******************************************
|
||||
# plot histograms for inspection
|
||||
# 1: original B-factors
|
||||
# 2: original Pred Aff Scores
|
||||
# 3: replaced B-factors with PredAff Scores
|
||||
#********************************************
|
||||
# Set the margin on all sides
|
||||
par(oma = c(3,2,3,0)
|
||||
, mar = c(1,3,5,2)
|
||||
, mfrow = c(3,2))
|
||||
#par(mfrow = c(3,2))
|
||||
|
||||
# 1: Original B-factor
|
||||
hist(d$b
|
||||
, xlab = ""
|
||||
, main = "B-factor")
|
||||
|
||||
plot(density(d$b)
|
||||
, xlab = ""
|
||||
, main = "B-factor")
|
||||
|
||||
# 2: Pred Aff scores
|
||||
hist(my_df$average_PredAffR
|
||||
, xlab = ""
|
||||
, main = "Norm_lig_average")
|
||||
|
||||
plot(density(my_df$average_PredAffR)
|
||||
, xlab = ""
|
||||
, main = "Norm_lig_average")
|
||||
|
||||
# 3: After the following replacement
|
||||
#********************************
|
||||
|
||||
#=================================================
|
||||
# Processing P2: Replacing B values with ratioPredAff scores
|
||||
#=================================================
|
||||
# use match to perform this replacement linking with "position no"
|
||||
# in the pdb file, this corresponds to column "resno"
|
||||
# in my_df, this corresponds to column "Position"
|
||||
|
||||
#=========
|
||||
# step 0_P2: DONT RUN once you have double checked the matched output
|
||||
#=========
|
||||
# sanity check: match and assign to a separate column to double check
|
||||
# colnames(my_df)
|
||||
# d$ratioPredAff = my_df$average_PredAffR[match(d$resno, my_df$Position)] #1384, 17
|
||||
|
||||
#=========
|
||||
# step 1_P2: BE BRAVE and replace in place now (don't run step 0)
|
||||
#=========
|
||||
# this makes all the B-factor values in the non-matched positions as NA
|
||||
d$b = my_df$average_PredAffR[match(d$resno, my_df$Position)]
|
||||
|
||||
#=========
|
||||
# step 2_P2
|
||||
#=========
|
||||
# count NA in B-factor: atoms whose resno found no match in my_df$Position
b_na = sum(is.na(d$b)) ; b_na

# count number of 0's in B-factor BEFORE the replacement
# na.rm = TRUE is required: with NAs present, sum(d$b == 0) is itself NA
b_zero = sum(d$b == 0, na.rm = TRUE) ; b_zero
#table(d$b)

# replace all NA in b factor with 0
d$b[is.na(d$b)] = 0

# sanity check: should be 0
sum(is.na(d$b))

# sanity check: should be True
# zeros after replacement = zeros already present + NAs that were replaced
# (comparing against b_na alone fails whenever a matched value is exactly 0)
if (sum(d$b == 0) == b_na + b_zero){
  print ("Sanity check passed: NA's replaced with 0's successfully")
} else {
  print("Error: NA replacement NOT successful, Debug code!")
}
|
||||
|
||||
# Range of the B-factor column after the replacement
max(d$b)
min(d$b)

# sanity checks: should be True
# The largest value in the replaced column should equal the largest score it
# was copied from ...
print(
  if (max(d$b) == max(my_df$average_PredAffR)) {
    "Sanity check passed: B-factors replaced correctly"
  } else {
    "Error: Debug code please"
  }
)

# ... and likewise for the smallest value.
print(
  if (min(d$b) == min(my_df$average_PredAffR)) {
    "Sanity check passed: B-factors replaced correctly"
  } else {
    "Error: Debug code please"
  }
)
|
||||
|
||||
#=========
|
||||
# step 3_P2
|
||||
#=========
|
||||
# sanity check: the modified table must keep the dimensions of d2
# should be TRUE (one value per dimension)
# NOTE(review): d2 is created earlier in the script, outside this section;
# presumably it is the untouched copy of d - confirm.
dim(d) == dim(d2)

#=========
# step 4_P2
#=========
# put the modified table back into the first element of the pdb object
my_pdb[[1]] <- d

max(d$b)
min(d$b)

#=========
# step 5_P2
#=========

# output dir and file; the full path is echoed before writing
outDir <- "~/git/Data/pyrazinamide/input/structure/"
outFile <- paste0(outDir, "complex1_BwithNormLIG.pdb")
outFile
write.pdb(my_pdb, outFile)
|
||||
|
||||
#********************************
|
||||
# Add the 3rd histogram and density plots for comparisons
|
||||
#********************************
|
||||
# Plots continued...
|
||||
# 3: hist and density of replaced B-factors with PredAff Scores
|
||||
# Row 3: B-factor column after it was replaced with the PredAff scores.
# Both panels now carry the same title; the histogram title used to be
# misspelt "repalced-B".
hist(d$b
     , xlab = ""
     , main = "replaced-B")

plot(density(d$b)
     , xlab = ""
     , main = "replaced-B")

# graph titles
# outer = TRUE draws the text in the outer margin shared by all panels:
# side = 2 is the left edge, side = 3 the top edge.
mtext(text = "Frequency"
      , side = 2
      , line = 0
      , outer = TRUE)

mtext(text = "Lig_stability"
      , side = 3
      , line = 0
      , outer = TRUE)
|
||||
|
||||
#********************************
|
||||
|
||||
###########
|
||||
# end of output files with Bfactors
|
||||
##########
|
257
mcsm_analysis/pyrazinamide/scripts/source_data_checks.R
Normal file
257
mcsm_analysis/pyrazinamide/scripts/source_data_checks.R
Normal file
|
@ -0,0 +1,257 @@
|
|||
# Show the working directory, move to the scripts directory (the source()
# calls below use relative file names), then show it again.
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
getwd()

#########################################################
# 1: Installing and loading required packages           #
#########################################################

source("Header_TT.R")
#source("barplot_colour_function.R")

##########################################################
# Checking: Entire data frame and for PS                 #
##########################################################

###########################
#2) Read file: combined one from the script
###########################
source("combining_two_df.R")

# df with NA:
# merged_df2
# merged_df3:

# df without NA:
# merged_df2_comp:
# merged_df3_comp:

######################
# You need to check it
# with the merged_df3
########################

#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
my_df <- merged_df3
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

# drop the merged data frames; only my_df is used from here on
rm(list = c("merged_df2", "merged_df2_comp", "merged_df3", "merged_df3_comp"))

# should be true
identical(my_df$Position, my_df$position)
|
||||
|
||||
#################################
|
||||
# Read file: normalised file
|
||||
# output of step 4 mcsm_pipeline
|
||||
#################################
|
||||
|
||||
|
||||
# Location of the normalised mCSM file; the full path is echoed
inDir <- "~/git/Data/pyrazinamide/input/processed/"
inFile <- paste0(inDir, "mcsm_complex1_normalised.csv")
inFile

# The first column of the csv is used as row names; strings stay character
mcsm_data <- read.csv(
  inFile,
  header = TRUE,
  row.names = 1,
  stringsAsFactors = FALSE
)
str(mcsm_data)

# column names used to cut the bigger data frame down to the same columns
my_colnames <- colnames(mcsm_data)

#====================================
# subset my_df to include only the columns in mcsm data
my_df2 <- my_df[my_colnames]
#====================================
|
||||
#====================================
|
||||
# compare the two
|
||||
head(mcsm_data$Mutationinformation)
head(mcsm_data$Position)

head(my_df2$Mutationinformation)
head(my_df2$Position)

# sort mcsm data by Mutationinformation
mcsm_data_s <- mcsm_data[order(mcsm_data$Mutationinformation), ]
head(mcsm_data_s$Mutationinformation)
head(mcsm_data_s$Position)

# now compare: should be True, but is false....
# possibly due to rownames!?!
identical(mcsm_data_s, my_df2)

# from library dplyr
setdiff(mcsm_data_s, my_df2)

#from lib compare
compare(mcsm_data_s, my_df2) # seems rownames are the problem

# Automated check (replaces the manual file comparison below):
# compare copies whose row names have been reset, so a difference in row
# names alone can no longer make the comparison fail. my_df2 is sorted by
# Mutationinformation as well, so row order cannot cause a mismatch either.
mcsm_cmp <- mcsm_data_s
my_df2_cmp <- my_df2[order(my_df2$Mutationinformation), ]
rownames(mcsm_cmp) <- NULL
rownames(my_df2_cmp) <- NULL

# TRUE when the contents agree; otherwise a character vector describing
# the differences that remain
all.equal(mcsm_cmp, my_df2_cmp)

# manual fallback: write both files and compare them with meld
# (done once; the files were indeed identical)
#write.csv(mcsm_data_s, "mcsm_data_s.csv", row.names = F)
#write.csv(my_df2, "my_df2.csv", row.names = F)
|
||||
|
||||
|
||||
#====================================================== end of section 1
|
||||
|
||||
|
||||
|
||||
##########################################################
|
||||
# Checking: LIG(Filtered dataframe) #
|
||||
##########################################################
|
||||
|
||||
# clear workspace
|
||||
# start the ligand checks from an empty workspace
rm(list = ls())

###########################
#3) Read file: combined_lig from the script
###########################
source("combining_two_df_lig.R")

# df with NA:
# merged_df2 :
# merged_df3:

# df without NA:
# merged_df2_comp:
# merged_df3_comp:

######################
# You need to check it
# with the merged_df3
########################

#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
my_df <- merged_df3
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

# drop the merged data frames; only my_df is used from here on
rm(list = c("merged_df2", "merged_df2_comp", "merged_df3", "merged_df3_comp"))

# should be true
identical(my_df$Position, my_df$position)
|
||||
|
||||
#################################
|
||||
# Read file: normalised file
|
||||
# output of step 4 mcsm_pipeline
|
||||
#################################
|
||||
|
||||
# Location of the normalised mCSM file; the full path is echoed
inDir <- "~/git/Data/pyrazinamide/input/processed/"
inFile <- paste0(inDir, "mcsm_complex1_normalised.csv")
inFile

# The first column of the csv is used as row names; strings stay character
mcsm_data <- read.csv(
  inFile,
  header = TRUE,
  row.names = 1,
  stringsAsFactors = FALSE
)
str(mcsm_data)

###########################
# 4a: Filter/subset data: ONLY for LIGand analysis
# Lig plots < 10Ang
# Filter the lig plots for Dis_to_lig < 10Ang
###########################
# sanity checks
upos <- unique(mcsm_data$Position)

# check range of distances
max(mcsm_data$Dis_lig_Ang)
min(mcsm_data$Dis_lig_Ang)

# Lig filtered: keep only the rows with a ligand distance below 10 Ang
mcsm_data2 <- subset(mcsm_data, Dis_lig_Ang < 10)

rm(mcsm_data) #to avoid confusion

# confirm the filter: every remaining distance is < 10
table(mcsm_data2$Dis_lig_Ang < 10)
table(mcsm_data2$Dis_lig_Ang > 10)

max(mcsm_data2$Dis_lig_Ang)
min(mcsm_data2$Dis_lig_Ang)

# positions left after filtering
upos_f <- unique(mcsm_data2$Position)
upos_f

# colnames of df that you will need to subset the bigger df from
my_colnames <- colnames(mcsm_data2)
#====================================
# subset bigger df i.e my_df to include only the columns in mcsm data2
my_df2 <- my_df[my_colnames]

rm(my_df) #to avoid confusion
|
||||
#====================================
|
||||
# compare the two
|
||||
head(mcsm_data2$Mutationinformation)
head(mcsm_data2$Position)

head(my_df2$Mutationinformation)
head(my_df2$Position)

# sort mcsm data by Mutationinformation
mcsm_data2_s <- mcsm_data2[order(mcsm_data2$Mutationinformation), ]
head(mcsm_data2_s$Mutationinformation)
head(mcsm_data2_s$Position)

# now compare: should be True, but is false....
# possibly due to rownames!?!
identical(mcsm_data2_s, my_df2)

# from library dplyr
setdiff(mcsm_data2_s, my_df2)

# from library compare
compare(mcsm_data2_s, my_df2) # seems rownames are the problem

# Automated check (replaces the manual file comparison below):
# compare copies whose row names have been reset, so a difference in row
# names alone can no longer make the comparison fail. my_df2 is sorted by
# Mutationinformation as well, so row order cannot cause a mismatch either.
mcsm_cmp <- mcsm_data2_s
my_df2_cmp <- my_df2[order(my_df2$Mutationinformation), ]
rownames(mcsm_cmp) <- NULL
rownames(my_df2_cmp) <- NULL

# TRUE when the contents agree; otherwise a character vector describing
# the differences that remain
all.equal(mcsm_cmp, my_df2_cmp)

# manual fallback: write both files and compare them with meld
# (done once; the files were indeed identical)
#write.csv(mcsm_data2_s, "mcsm_data2_s.csv", row.names = F)
#write.csv(my_df2, "my_df2.csv", row.names = F)
|
||||
|
||||
|
||||
##########################################################
|
||||
# extract and write output file for SNP posn: all #
|
||||
##########################################################
|
||||
|
||||
# merged_df3 and merged_df3_comp were removed with rm() earlier in this
# script, so they have to be created again before they can be used here.
# NOTE(review): assumes "combining_two_df.R" (rather than
# "combining_two_df_lig.R") is the intended source for the SNP position
# lists - confirm.
source("combining_two_df.R")

head(merged_df3$Position)

# order by position so the unique positions come out sorted
foo <- merged_df3[order(merged_df3$Position), ]
head(foo$Position)

snp_pos_unique <- unique(foo$Position); snp_pos_unique

# sanity check:
# NOTE(review): combined_df is not created anywhere in this script;
# presumably one of the sourced scripts provides it - confirm.
table(snp_pos_unique == combined_df$Position)

#=====================
# write_output files
#=====================
# NOTE(review): the input paths in this script start with "~/git/Data/",
# this one starts with "~/Data/" - confirm this is the intended location.
outDir <- "~/Data/pyrazinamide/input/processed/"

outFile1 <- paste0(outDir, "snp_pos_unique.txt"); outFile1
print(paste0("Output file name and path will be:", outFile1))

# one position per line, without row or column names
write.table(snp_pos_unique
            , outFile1
            , row.names = FALSE
            , col.names = FALSE)

##############################################################
# extract and write output file for SNP posn: complete only #
##############################################################
head(merged_df3_comp$Position)

foo <- merged_df3_comp[order(merged_df3_comp$Position), ]
head(foo$Position)

snp_pos_unique <- unique(foo$Position); snp_pos_unique

# outDir = "~/Data/pyrazinamide/input/processed/" # already set

outFile2 <- paste0(outDir, "snp_pos_unique_comp.txt")
print(paste0("Output file name and path will be:", outFile2))

# one position per line, without row or column names
write.table(snp_pos_unique
            , outFile2
            , row.names = FALSE
            , col.names = FALSE)
|
||||
#============================== end of script
|
||||
|
||||
|
Loading…
Add table
Add a link
Reference in a new issue