import commit

This commit is contained in:
Tanushree Tunstall 2020-01-08 16:15:33 +00:00
commit bccfe68192
39 changed files with 6837 additions and 0 deletions

59
README.md Normal file
View file

@ -0,0 +1,59 @@
mCSM Analysis
=============
This repo does mCSM analysis using Python (Pandas, numpy), bash and R.
Requires an additional 'Data' directory (Supplied separately). Batteries not included.
# directory struc
# Assumptions
1. git repos are cloned to `~/git`
2. The script
## LSHTM\_analysis: Repo
subdirs
needs to call `../Data/input/original/`
meta\_data\_analysis/
*.R
*.py
needs to call `../Data/input/processed/<drug>`
needs to output `../Data/output/results/<drug>`
mcsm\_analysis/
<drug>/ (generated by `meta_data_analysis/pnca_data_extraction.py`. To be replaced with command line args or config option "soon")
scripts/ (changed from `Scripts/`)
*.R
*.py
mcsm/
*.sh
*.py
*.R
plotting/
*.R
Data: Repo:
# subdirs
input/
original/
processed/
output/
<drug>/ (generated by `meta_data_analysis/pnca_data_extraction.py`. To be replaced with command line args or config option "soon")
results/
*.csv
*.xlsx
*.doc
*.txt
plots/
structure/
More docs here as I write them.

12
mcsm.conf Normal file
View file

@ -0,0 +1,12 @@
# This is not yet used, but will be soon :)
[DEFAULT]
mcsm_home = /home/tanu/git/github/LSHTM_analysis
[pyrazinamide]
gene = 'pnca'
server = 'http://mcsm.melb.ac.uk/'
[monkeyamide]
gene = 'abcs'
server = 'http://myserver.local/'

View file

@ -0,0 +1,512 @@
###########################
# you need merged_df3
# or
# merged_df3_comp
# since these have unique SNPs
# I prefer to use the merged_df3
# because using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available
###########################
# uncomment as necessary
#%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
my_df = merged_df3
#my_df = merged_df3_comp
#%%%%%%%%%%%%%%%%%%%%%%%%
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
###########################
# Data for bfactor figure
# PS average
# Lig average
###########################
head(my_df$Position)
head(my_df$ratioDUET)
# order data frame
df = my_df[order(my_df$Position),]
head(df$Position)
head(df$ratioDUET)
#***********
# PS: average by position
#***********
mean_DUET_by_position <- df %>%
group_by(Position) %>%
summarize(averaged.DUET = mean(ratioDUET))
#***********
# Lig: average by position
#***********
mean_Lig_by_position <- df %>%
group_by(Position) %>%
summarize(averaged.Lig = mean(ratioPredAff))
#***********
# cbind:mean_DUET_by_position and mean_Lig_by_position
#***********
combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position ))
# sanity check
# mean_PS_Lig_Bfactor
colnames(combined)
colnames(combined) = c("Position"
, "average_DUETR"
, "Position2"
, "average_PredAffR")
colnames(combined)
identical(combined$Position, combined$Position2)
n = which(colnames(combined) == "Position2"); n
combined_df = combined[,-n]
max(combined_df$average_DUETR) ; min(combined_df$average_DUETR)
max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR)
#=============
# output csv
#============
outDir = "~/Data/pyrazinamide/input/processed/"
outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv")
print(paste0("Output file with path will be:","", outFile))
head(combined_df$Position); tail(combined_df$Position)
write.csv(combined_df, outFile
, row.names = F)
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("../Header_TT.R")
#source("barplot_colour_function.R")
require(data.table)
require(dplyr)
########################################################################
# Read file: call script for combining df for PS #
########################################################################
source("../combining_two_df.R")
###########################
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
###########################
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
###########################
# you need merged_df3
# or
# merged_df3_comp
# since these have unique SNPs
# I prefer to use the merged_df3
# because using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available
###########################
# uncomment as necessary
#%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
my_df = merged_df3
#my_df = merged_df3_comp
#%%%%%%%%%%%%%%%%%%%%%%%%
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
###########################
# Data for bfactor figure
# PS average
# Lig average
###########################
head(my_df$Position)
head(my_df$ratioDUET)
# order data frame
df = my_df[order(my_df$Position),]
head(df$Position)
head(df$ratioDUET)
#***********
# PS: average by position
#***********
mean_DUET_by_position <- df %>%
group_by(Position) %>%
summarize(averaged.DUET = mean(ratioDUET))
#***********
# Lig: average by position
#***********
mean_Lig_by_position <- df %>%
group_by(Position) %>%
summarize(averaged.Lig = mean(ratioPredAff))
#***********
# cbind:mean_DUET_by_position and mean_Lig_by_position
#***********
combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position ))
# sanity check
# mean_PS_Lig_Bfactor
colnames(combined)
colnames(combined) = c("Position"
, "average_DUETR"
, "Position2"
, "average_PredAffR")
colnames(combined)
identical(combined$Position, combined$Position2)
n = which(colnames(combined) == "Position2"); n
combined_df = combined[,-n]
max(combined_df$average_DUETR) ; min(combined_df$average_DUETR)
max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR)
#=============
# output csv
#============
outDir = "~/git/Data/pyrazinamide/input/processed/"
outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv")
print(paste0("Output file with path will be:","", outFile))
head(combined_df$Position); tail(combined_df$Position)
write.csv(combined_df, outFile
, row.names = F)
# read in pdb file complex1
inDir = "~/git/Data/pyrazinamide/input/structure"
inFile = paste0(inDir, "complex1_no_water.pdb")
# read in pdb file complex1
inDir = "~/git/Data/pyrazinamide/input/structure/"
inFile = paste0(inDir, "complex1_no_water.pdb")
complex1 = inFile
my_pdb = read.pdb(complex1
, maxlines = -1
, multi = FALSE
, rm.insert = FALSE
, rm.alt = TRUE
, ATOM.only = FALSE
, hex = FALSE
, verbose = TRUE)
#########################
#3: Read complex pdb file
##########################
source("Header_TT.R")
# list of 8
my_pdb = read.pdb(complex1
, maxlines = -1
, multi = FALSE
, rm.insert = FALSE
, rm.alt = TRUE
, ATOM.only = FALSE
, hex = FALSE
, verbose = TRUE)
rm(inDir, inFile)
#====== end of script
inDir = "~/git/Data/pyrazinamide/input/structure/"
inFile = paste0(inDir, "complex1_no_water.pdb")
complex1 = inFile
complex1 = inFile
my_pdb = read.pdb(complex1
, maxlines = -1
, multi = FALSE
, rm.insert = FALSE
, rm.alt = TRUE
, ATOM.only = FALSE
, hex = FALSE
, verbose = TRUE)
inFile
inDir = "~/git/Data/pyrazinamide/input/structure/"
inFile = paste0(inDir, "complex1_no_water.pdb")
complex1 = inFile
#inFile2 = paste0(inDir, "complex2_no_water.pdb")
#complex2 = inFile2
# list of 8
my_pdb = read.pdb(complex1
, maxlines = -1
, multi = FALSE
, rm.insert = FALSE
, rm.alt = TRUE
, ATOM.only = FALSE
, hex = FALSE
, verbose = TRUE)
rm(inDir, inFile, complex1)
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
getwd()
source("Header_TT.R")
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("Header_TT.R")
#########################################################
# TASK: replace B-factors in the pdb file with normalised values
# use the complex file with no water as mCSM lig was
# performed on this file. You can check it in the script: read_pdb file.
#########################################################
###########################
# 2: Read file: average stability values
# or mcsm_normalised file, output of step 4 mcsm pipeline
###########################
inDir = "~/git/Data/pyrazinamide/input/processed/"
inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
my_df <- read.csv(inFile
# , row.names = 1
# , stringsAsFactors = F
, header = T)
str(my_df)
source("read_pdb.R") # list of 8
# extract atom list into a variable
# since in the list this corresponds to data frame, variable will be a df
d = my_pdb[[1]]
# make a copy: required for downstream sanity checks
d2 = d
# sanity checks: B factor
max(d$b); min(d$b)
par(oma = c(3,2,3,0)
, mar = c(1,3,5,2)
, mfrow = c(3,2))
#par(mfrow = c(3,2))
#1: Original B-factor
hist(d$b
, xlab = ""
, main = "B-factor")
plot(density(d$b)
, xlab = ""
, main = "B-factor")
# 2: DUET scores
hist(my_df$average_DUETR
, xlab = ""
, main = "Norm_DUET")
plot(density(my_df$average_DUETR)
, xlab = ""
, main = "Norm_DUET")
# Set the margin on all sides
par(oma = c(3,2,3,0)
, mar = c(1,3,5,2)
, mfrow = c(3,2))
#par(mfrow = c(3,2))
#1: Original B-factor
hist(d$b
, xlab = ""
, main = "B-factor")
plot(density(d$b)
, xlab = ""
, main = "B-factor")
# 2: DUET scores
hist(my_df$average_DUETR
, xlab = ""
, main = "Norm_DUET")
plot(density(my_df$average_DUETR)
, xlab = ""
, main = "Norm_DUET")
#=========
# step 1_P1
#=========
# Be brave and replace in place now (don't run sanity check)
# this makes all the B-factor values in the non-matched positions as NA
d$b = my_df$average_DUETR[match(d$resno, my_df$Position)]
#=========
# step 2_P1
#=========
# count NA in Bfactor
b_na = sum(is.na(d$b)) ; b_na
# count number of 0's in Bactor
sum(d$b == 0)
# replace all NA in b factor with 0
d$b[is.na(d$b)] = 0
# sanity check: should be 0
sum(is.na(d$b))
# sanity check: should be True
if (sum(d$b == 0) == b_na){
print ("Sanity check passed: NA's replaced with 0's successfully")
} else {
print("Error: NA replacement NOT successful, Debug code!")
}
max(d$b); min(d$b)
# sanity checks: should be True
if(max(d$b) == max(my_df$average_DUETR)){
print("Sanity check passed: B-factors replaced correctly")
} else {
print ("Error: Debug code please")
}
if (min(d$b) == min(my_df$average_DUETR)){
print("Sanity check passed: B-factors replaced correctly")
} else {
print ("Error: Debug code please")
}
#=========
# step 3_P1
#=========
# sanity check: dim should be same before reassignment
# should be TRUE
dim(d) == dim(d2)
#=========
# step 4_P1
#=========
# assign it back to the pdb file
my_pdb[[1]] = d
max(d$b); min(d$b)
#=========
# step 5_P1
#=========
# output dir
getwd()
outDir = "~/git/Data/pyrazinamide/output/"
getwd()
outFile = paste0(outDir, "complex1_BwithNormDUET.pdb")
outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile
outDir = "~/git/Data/pyrazinamide/input/structure"
outDir = "~/git/Data/pyrazinamide/input/structure/"
outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile
write.pdb(my_pdb, outFile)
hist(d$b
, xlab = ""
, main = "repalced-B")
plot(density(d$b)
, xlab = ""
, main = "replaced-B")
# graph titles
mtext(text = "Frequency"
, side = 2
, line = 0
, outer = TRUE)
mtext(text = "DUET_stability"
, side = 3
, line = 0
, outer = TRUE)
#=========================================================
# Processing P2: Replacing B values with PredAff Scores
#=========================================================
# clear workspace
rm(list = ls())
#=========================================================
# Processing P2: Replacing B values with PredAff Scores
#=========================================================
# clear workspace
rm(list = ls())
###########################
# 2: Read file: average stability values
# or mcsm_normalised file, output of step 4 mcsm pipeline
###########################
inDir = "~/git/Data/pyrazinamide/input/processed/"
inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
my_df <- read.csv("../Data/mean_PS_Lig_Bfactor.csv"
# , row.names = 1
# , stringsAsFactors = F
, header = T)
str(my_df)
#=========================================================
# Processing P2: Replacing B factor with mean ratioLig scores
#=========================================================
#########################
# 3: Read complex pdb file
# form the R script
##########################
source("read_pdb.R") # list of 8
# extract atom list into a vari
inDir = "~/git/Data/pyrazinamide/input/processed/"
inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
my_df <- read.csv(inFile
# , row.names = 1
# , stringsAsFactors = F
, header = T)
str(my_df)
# extract atom list into a variable
# since in the list this corresponds to data frame, variable will be a df
d = my_pdb[[1]]
# make a copy: required for downstream sanity checks
d2 = d
# sanity checks: B factor
max(d$b); min(d$b)
par(oma = c(3,2,3,0)
, mar = c(1,3,5,2)
, mfrow = c(3,2))
#par(mfrow = c(3,2))
# 1: Original B-factor
hist(d$b
, xlab = ""
, main = "B-factor")
plot(density(d$b)
, xlab = ""
, main = "B-factor")
# 2: Pred Aff scores
hist(my_df$average_PredAffR
, xlab = ""
, main = "Norm_lig_average")
plot(density(my_df$average_PredAffR)
, xlab = ""
, main = "Norm_lig_average")
# 3: After the following replacement
#********************************
par(oma = c(3,2,3,0)
, mar = c(1,3,5,2)
, mfrow = c(3,2))
#par(mfrow = c(3,2))
# 1: Original B-factor
hist(d$b
, xlab = ""
, main = "B-factor")
plot(density(d$b)
, xlab = ""
, main = "B-factor")
# 2: Pred Aff scores
hist(my_df$average_PredAffR
, xlab = ""
, main = "Norm_lig_average")
plot(density(my_df$average_PredAffR)
, xlab = ""
, main = "Norm_lig_average")
# 3: After the following replacement
#********************************
#=========
# step 1_P2: BE BRAVE and replace in place now (don't run step 0)
#=========
# this makes all the B-factor values in the non-matched positions as NA
d$b = my_df$average_PredAffR[match(d$resno, my_df$Position)]
#=========
# step 2_P2
#=========
# count NA in Bfactor
b_na = sum(is.na(d$b)) ; b_na
# count number of 0's in Bactor
sum(d$b == 0)
# replace all NA in b factor with 0
d$b[is.na(d$b)] = 0
# sanity check: should be 0
sum(is.na(d$b))
if (sum(d$b == 0) == b_na){
print ("Sanity check passed: NA's replaced with 0's successfully")
} else {
print("Error: NA replacement NOT successful, Debug code!")
}
max(d$b); min(d$b)
# sanity checks: should be True
if (max(d$b) == max(my_df$average_PredAffR)){
print("Sanity check passed: B-factors replaced correctly")
} else {
print ("Error: Debug code please")
}
if (min(d$b) == min(my_df$average_PredAffR)){
print("Sanity check passed: B-factors replaced correctly")
} else {
print ("Error: Debug code please")
}
#=========
# step 3_P2
#=========
# sanity check: dim should be same before reassignment
# should be TRUE
dim(d) == dim(d2)
#=========
# step 4_P2
#=========
# assign it back to the pdb file
my_pdb[[1]] = d
max(d$b); min(d$b)
#=========
# step 5_P2
#=========
write.pdb(my_pdb, "Plotting/structure/complex1_BwithNormLIG.pdb")
# output dir
getwd()
# output dir
outDir = "~/git/Data/pyrazinamide/input/structure/"
outFile = paste0(outDir, "complex1_BwithNormLIG.pdb")
outFile = paste0(outDir, "complex1_BwithNormLIG.pdb"); outFile
write.pdb(my_pdb, outFile)

View file

@ -0,0 +1,129 @@
#########################################################
### A) Installing and loading required packages
#########################################################
#if (!require("gplots")) {
# install.packages("gplots", dependencies = TRUE)
# library(gplots)
#}
if (!require("tidyverse")) {
install.packages("tidyverse", dependencies = TRUE)
library(tidyverse)
}
if (!require("ggplot2")) {
install.packages("ggplot2", dependencies = TRUE)
library(ggplot2)
}
if (!require("cowplot")) {
install.packages("copwplot", dependencies = TRUE)
library(ggplot2)
}
if (!require("ggcorrplot")) {
install.packages("ggcorrplot", dependencies = TRUE)
library(ggcorrplot)
}
if (!require("ggpubr")) {
install.packages("ggpubr", dependencies = TRUE)
library(ggpubr)
}
if (!require("RColorBrewer")) {
install.packages("RColorBrewer", dependencies = TRUE)
library(RColorBrewer)
}
if (!require ("GOplot")) {
install.packages("GOplot")
library(GOplot)
}
if(!require("VennDiagram")) {
install.packages("VennDiagram", dependencies = T)
library(VennDiagram)
}
if(!require("scales")) {
install.packages("scales", dependencies = T)
library(scales)
}
if(!require("plotrix")) {
install.packages("plotrix", dependencies = T)
library(plotrix)
}
if(!require("stats")) {
install.packages("stats", dependencies = T)
library(stats)
}
if(!require("stats4")) {
install.packages("stats4", dependencies = T)
library(stats4)
}
if(!require("data.table")) {
library(stats4)
}
if (!require("PerformanceAnalytics")){
install.packages("PerformanceAnalytics", dependencies = T)
library(PerformaceAnalytics)
}
if (!require ("GGally")){
install.packages("GGally")
library(GGally)
}
if (!require ("corrr")){
install.packages("corrr")
library(corrr)
}
if (!require ("psych")){
install.packages("psych")
library(psych)
}
if (!require ("dplyr")){
install.packages("dplyr")
library(psych)
}
if (!require ("compare")){
install.packages("compare")
library(psych)
}
if (!require ("arsenal")){
install.packages("arsenal")
library(psych)
}
####TIDYVERSE
# Install
#if(!require(devtools)) install.packages("devtools")
#devtools::install_github("kassambara/ggcorrplot")
library(ggcorrplot)
###for PDB files
#install.packages("bio3d")
if(!require(bio3d)){
install.packages("bio3d")
library(bio3d)
}

View file

@ -0,0 +1,27 @@
#########################################################
# 1b: Define function: coloured barplot by subgroup
# LINK: https://stackoverflow.com/questions/49818271/stacked-barplot-with-colour-gradients-for-each-bar
#########################################################
ColourPalleteMulti <- function(df, group, subgroup){
# Find how many colour categories to create and the number of colours in each
categories <- aggregate(as.formula(paste(subgroup, group, sep="~" ))
, df
, function(x) length(unique(x)))
# return(categories) }
category.start <- (scales::hue_pal(l = 100)(nrow(categories))) # Set the top of the colour pallete
category.end <- (scales::hue_pal(l = 40)(nrow(categories))) # set the bottom
#return(category.start); return(category.end)}
# Build Colour pallette
colours <- unlist(lapply(1:nrow(categories),
function(i){
colorRampPalette(colors = c(category.start[i]
, category.end[i]))(categories[i,2])}))
return(colours)
}
#########################################################

View file

@ -0,0 +1,299 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/")
getwd()
#########################################################
# TASK: To combine mcsm and meta data with af and or
#########################################################
########################################################################
# Installing and loading required packages #
########################################################################
source("Header_TT.R")
#require(data.table)
#require(arsenal)
#require(compare)
#library(tidyverse)
#################################
# Read file: normalised file
# output of step 4 mcsm_pipeline
#################################
inDir = "~/git/Data/pyrazinamide/input/processed/"
inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile
mcsm_data = read.csv(inFile
, row.names = 1
, stringsAsFactors = F
, header = T)
rm(inDir, inFile)
str(mcsm_data)
table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) )
# spelling Correction 1: DUET
mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Stabilizing'] <- 'Stabilising'
mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Destabilizing'] <- 'Destabilising'
# checks: should be the same as above
table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) )
head(mcsm_data$DUET_outcome); tail(mcsm_data$DUET_outcome)
# spelling Correction 2: Ligand
table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) )
mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Stabilizing'] <- 'Stabilising'
mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Destabilizing'] <- 'Destabilising'
# checks: should be the same as above
table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) )
head(mcsm_data$Lig_outcome); tail(mcsm_data$Lig_outcome)
# count na in each column
na_count = sapply(mcsm_data, function(y) sum(length(which(is.na(y))))); na_count
# sort by Mutationinformation
mcsm_data = mcsm_data[order(mcsm_data$Mutationinformation),]
head(mcsm_data$Mutationinformation)
# get freq count of positions and add to the df
setDT(mcsm_data)[, occurrence := .N, by = .(Position)]
pos_count_check = data.frame(mcsm_data$Position, mcsm_data$occurrence)
###########################
# 2: Read file: meta data with AFandOR
###########################
inDir = "~/git/Data/pyrazinamide/input/processed/"
inFile2 = paste0(inDir, "meta_data_with_AFandOR.csv"); inFile2
meta_with_afor <- read.csv(inFile2
, stringsAsFactors = F
, header = T)
rm(inDir, inFile2)
str(meta_with_afor)
# sort by Mutationinformation
head(meta_with_afor$Mutationinformation)
meta_with_afor = meta_with_afor[order(meta_with_afor$Mutationinformation),]
head(meta_with_afor$Mutationinformation)
# sanity check: should be True for all the mentioned columns
#is.numeric(meta_with_afor$OR)
na_var = c("AF", "OR", "pvalue", "logor", "neglog10pvalue")
c1 = NULL
for (i in na_var){
print(i)
c0 = is.numeric(meta_with_afor[,i])
c1 = c(c0, c1)
if ( all(c1) ){
print("Sanity check passed: These are all numeric cols")
} else{
print("Error: Please check your respective data types")
}
}
# If OR, and P value are not numeric, then convert to numeric and then count
# else they will say 0
na_count = sapply(meta_with_afor, function(y) sum(length(which(is.na(y))))); na_count
str(na_count)
# compare if the No of "NA" are the same for all these cols
na_len = NULL
for (i in na_var){
temp = na_count[[i]]
na_len = c(na_len, temp)
}
# extract how many NAs:
# should be all TRUE
# should be a single number since
# all the cols should have "equal" and "same" no. of NAs
my_nrows = NULL
for ( i in 1: (length(na_len)-1) ){
#print(compare(na_len[i]), na_len[i+1])
c = compare(na_len[i], na_len[i+1])
if ( c$result ) {
my_nrows = na_len[i] }
else {
print("Error: Please check your numbers")
}
}
my_nrows
#=#=#=#=#=#=#=#=#
# COMMENT: AF, OR, pvalue, logor and neglog10pvalue
# these are the same 7 ones
#=#=#=#=#=#=#=#=#
# sanity check
#which(is.na(meta_with_afor$OR))
# initialise an empty df with nrows as extracted above
na_count_df = data.frame(matrix(vector(mode = 'numeric'
# , length = length(na_var)
)
, nrow = my_nrows
# , ncol = length(na_var)
))
# populate the df with the indices of the cols that are NA
for (i in na_var){
print(i)
na_i = which(is.na(meta_with_afor[i]))
na_count_df = cbind(na_count_df, na_i)
colnames(na_count_df)[which(na_var == i)] <- i
}
# Now compare these indices to ensure these are the same
c2 = NULL
for ( i in 1: ( length(na_count_df)-1 ) ) {
# print(na_count_df[i] == na_count_df[i+1])
c1 = identical(na_count_df[[i]], na_count_df[[i+1]])
c2 = c(c1, c2)
if ( all(c2) ) {
print("Sanity check passed: The indices for AF, OR, etc are all the same")
} else {
print ("Error: Please check indices which are NA")
}
}
rm( c, c0, c1, c2, i, my_nrows
, na_count, na_i, na_len
, na_var, temp
, na_count_df
, pos_count_check )
###########################
# 3:merging two dfs: with NA
###########################
# link col name = Mutationinforamtion
head(mcsm_data$Mutationinformation)
head(meta_with_afor$Mutationinformation)
#########
# merge 1a: meta data with mcsm
#########
merged_df2 = merge(x = meta_with_afor
,y = mcsm_data
, by = "Mutationinformation"
, all.y = T)
head(merged_df2$Position)
# sort by Position
head(merged_df2$Position)
merged_df2 = merged_df2[order(merged_df2$Position),]
head(merged_df2$Position)
merged_df2v2 = merge(x = meta_with_afor
,y = mcsm_data
, by = "Mutationinformation"
, all.x = T)
#!=!=!=!=!=!=!=!
# COMMENT: used all.y since position 186 is not part of the struc,
# hence doesn't have a mcsm value
# but 186 is associated with with mutation
#!=!=!=!=!=!=!=!
# should be False
identical(merged_df2, merged_df2v2)
table(merged_df2$Position%in%merged_df2v2$Position)
rm(merged_df2v2)
#########
# merge 1b:remove duplicate mutation information
#########
#==#=#=#=#=#=#
# Cannot trust lineage, country from this df as the same mutation
# can have many different lineages
# but this should be good for the numerical corr plots
#=#=#=#=#=#=#=
merged_df3 = merged_df2[!duplicated(merged_df2$Mutationinformation),]
head(merged_df3$Position); tail(merged_df3$Position) # should be sorted
# sanity checks
# nrows of merged_df3 should be the same as the nrows of mcsm_data
if(nrow(mcsm_data) == nrow(merged_df3)){
print("sanity check: Passed")
} else {
print("Error!: check data, nrows is not as expected")
}
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# uncomment as necessary
# only need to run this if merged_df2v2 i.e non structural pos included
#mcsm = mcsm_data$Mutationinformation
#my_merged = merged_df3$Mutationinformation
# find the index where it differs
#diff_n = which(!my_merged%in%mcsm)
#check if it is indeed pos 186
#merged_df3[diff_n,]
# remove this entry
#merged_df3 = merged_df3[-diff_n,]]
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
###########################
# 3b :merging two dfs: without NA
###########################
#########
# merge 2a:same as merge 1 but excluding NA
#########
merged_df2_comp = merged_df2[!is.na(merged_df2$AF),]
#########
# merge 2b: remove duplicate mutation information
#########
merged_df3_comp = merged_df2_comp[!duplicated(merged_df2_comp$Mutationinformation),]
# alternate way of deriving merged_df3_comp
foo = merged_df3[!is.na(merged_df3$AF),]
# compare dfs: foo and merged_df3_com
all.equal(foo, merged_df3)
summary(comparedf(foo, merged_df3))
#=============== end of combining df
#clear variables
rm(mcsm_data
, meta_with_afor
, foo)
#rm(diff_n, my_merged, mcsm)
#=====================
# write_output files
#=====================
# output dir
outDir = "~/git/Data/pyrazinamide/output/"
getwd()
outFile1 = paste0(outDir, "merged_df3.csv"); outFile1
write.csv(merged_df3, outFile1)
#outFile2 = paste0(outDir, "merged_df3_comp.csv"); outFile2
#write.csv(merged_df3_comp, outFile2)
rm(outDir
, outFile1
# , outFile2
)
#============================= end of script

View file

@ -0,0 +1,348 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/")
getwd()
#########################################################
# TASK: To combine mcsm and meta data with af and or
# by filtering for distance to ligand (<10Ang)
#########################################################
#########################################################
# Installing and loading required packages
#########################################################
#source("Header_TT.R")
#require(data.table)
#require(arsenal)
#require(compare)
#library(tidyverse)
#################################
# Read file: normalised file
# output of step 4 mcsm_pipeline
#################################
inDir = "~/git/Data/pyrazinamide/input/processed/"
inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile
mcsm_data = read.csv(inFile
, row.names = 1
, stringsAsFactors = F
, header = T)
rm(inDir, inFile)
str(mcsm_data)
table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) )
# spelling Correction 1: DUET
mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Stabilizing'] <- 'Stabilising'
mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Destabilizing'] <- 'Destabilising'
# checks
table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) )
head(mcsm_data$DUET_outcome); tail(mcsm_data$DUET_outcome)
# spelling Correction 2: Ligand
table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) )
mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Stabilizing'] <- 'Stabilising'
mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Destabilizing'] <- 'Destabilising'
# checks: should be the same as above
table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) )
head(mcsm_data$Lig_outcome); tail(mcsm_data$Lig_outcome)
########################### !!! only for mcsm_lig
# 4: Filter/subset data
# Lig plots < 10Ang
# Filter the lig plots for Dis_to_lig < 10Ang
###########################
# check range of distances
max(mcsm_data$Dis_lig_Ang)
min(mcsm_data$Dis_lig_Ang)
# count
table(mcsm_data$Dis_lig_Ang<10)
# subset data to have only values less than 10 Ang
mcsm_data2 = subset(mcsm_data, mcsm_data$Dis_lig_Ang < 10)
# sanity checks
max(mcsm_data2$Dis_lig_Ang)
min(mcsm_data2$Dis_lig_Ang)
# count no of unique positions
length(unique(mcsm_data2$Position))
# count no of unique mutations
length(unique(mcsm_data2$Mutationinformation))
# count Destabilisinga and stabilising
table(mcsm_data2$Lig_outcome) #{RESULT: no of mutations within 10Ang}
#<<<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT: so as not to alter the script
mcsm_data = mcsm_data2
#<<<<<<<<<<<<<<<<<<<<<<<<<<<
#############################
# Extra sanity check:
# for mcsm_lig ONLY
# Dis_lig_Ang should be <10
#############################
if (max(mcsm_data$Dis_lig_Ang) < 10){
print ("Sanity check passed: lig data is <10Ang")
}else{
print ("Error: data should be filtered to be within 10Ang")
}
# clear variables
rm(mcsm_data2)
# count na in each column
na_count = sapply(mcsm_data, function(y) sum(length(which(is.na(y))))); na_count
head(mcsm_data$Mutationinformation)
mcsm_data[mcsm_data$Mutationinformation=="Q10P",]
mcsm_data[mcsm_data$Mutationinformation=="L4S",]
# sort by Mutationinformation
mcsm_data = mcsm_data[order(mcsm_data$Mutationinformation),]
head(mcsm_data$Mutationinformation)
# check
mcsm_data[grep("Q10P", mcsm_data$Mutationinformation),]
mcsm_data[grep("A102T", mcsm_data$Mutationinformation),]
# get freq count of positions and add to the df
setDT(mcsm_data)[, occurrence := .N, by = .(Position)]
pos_count_check = data.frame(mcsm_data$Position, mcsm_data$occurrence)
###########################
# 2: Read file: meta data with AFandOR
###########################
inDir = "~/git/Data/pyrazinamide/input/processed/"
inFile2 = paste0(inDir, "meta_data_with_AFandOR.csv"); inFile2
meta_with_afor <- read.csv(inFile2
, stringsAsFactors = F
, header = T)
str(meta_with_afor)
# sort by Mutationinformation
head(meta_with_afor$Mutationinformation)
meta_with_afor = meta_with_afor[order(meta_with_afor$Mutationinformation),]
head(meta_with_afor$Mutationinformation)
# sanity check: should be True for all the mentioned columns
#is.numeric(meta_with_afor$OR)
na_var = c("AF", "OR", "pvalue", "logor", "neglog10pvalue")
c1 = NULL
for (i in na_var){
print(i)
c0 = is.numeric(meta_with_afor[,i])
c1 = c(c0, c1)
if ( all(c1) ){
print("Sanity check passed: These are all numeric cols")
} else{
print("Error: Please check your respective data types")
}
}
# If OR, and P value are not numeric, then convert to numeric and then count
# else they will say 0
# NOW count na in each column: if you did it before, then
# OR and Pvalue column would say 0 na since these were not numeric
na_count = sapply(meta_with_afor, function(y) sum(length(which(is.na(y))))); na_count
str(na_count)
# compare if the No of "NA" are the same for all these cols
na_len = NULL
na_var = c("AF", "OR", "pvalue", "logor", "neglog10pvalue")
for (i in na_var){
temp = na_count[[i]]
na_len = c(na_len, temp)
}
my_nrows = NULL
for ( i in 1: (length(na_len)-1) ){
#print(compare(na_len[i]), na_len[i+1])
c = compare(na_len[i], na_len[i+1])
if ( c$result ) {
my_nrows = na_len[i] }
else {
print("Error: Please check your numbers")
}
}
my_nrows
#=#=#=#=#=#=#=#=#
# COMMENT: AF, OR, pvalue, logor and neglog10pvalue
# all have 81 NA, with pyrazinamide with 960
# and these are the same 7 ones
#=#=#=#=#=#=#=#=#
# sanity check
#which(is.na(meta_with_afor$OR))
# initialise an empty df with nrows as extracted above
na_count_df = data.frame(matrix(vector(mode = 'numeric'
# , length = length(na_var)
)
, nrow = my_nrows
# , ncol = length(na_var)
))
# populate the df with the indices of the cols that are NA
for (i in na_var){
print(i)
na_i = which(is.na(meta_with_afor[i]))
na_count_df = cbind(na_count_df, na_i)
colnames(na_count_df)[which(na_var == i)] <- i
}
# Now compare these indices to ensure these are the same
c2 = NULL
for ( i in 1: ( length(na_count_df)-1 ) ) {
# print(na_count_df[i] == na_count_df[i+1])
c1 = identical(na_count_df[[i]], na_count_df[[i+1]])
c2 = c(c1, c2)
if ( all(c2) ) {
print("Sanity check passed: The indices for AF, OR, etc are all the same")
} else {
print ("Error: Please check indices which are NA")
}
}
rm( c, c1, c2, i, my_nrows
, na_count, na_i, na_len
, na_var, temp
, na_count_df
, pos_count_check )
###########################
# 3:merging two dfs: with NA
###########################
# link col name = Mutationinforamtion
head(mcsm_data$Mutationinformation)
head(meta_with_afor$Mutationinformation)
#########
# merge 1a: meta data with mcsm
#########
merged_df2 = merge(x = meta_with_afor
, y = mcsm_data
, by = "Mutationinformation"
, all.y = T)
head(merged_df2$Position)
# sort by Position
head(merged_df2$Position)
merged_df2 = merged_df2[order(merged_df2$Position),]
head(merged_df2$Position)
merged_df2v2 = merge(x = meta_with_afor
,y = mcsm_data
, by = "Mutationinformation"
, all.x = T)
#!=!=!=!=!=!=!=!
# COMMENT: used all.y since position 186 is not part of the struc,
# hence doesn't have a mcsm value
# but 186 is associated with with mutation
#!=!=!=!=!=!=!=!
# should be False
identical(merged_df2, merged_df2v2)
table(merged_df2$Position%in%merged_df2v2$Position)
rm(merged_df2v2)
#########
# merge 1b:remove duplicate mutation information
#########
#==#=#=#=#=#=#
# Cannot trust lineage, country from this df as the same mutation
# can have many different lineages
# but this should be good for the numerical corr plots
#=#=#=#=#=#=#=
merged_df3 = merged_df2[!duplicated(merged_df2$Mutationinformation),]
head(merged_df3$Position) ; tail(merged_df3$Position) # should be sorted
# sanity checks
# nrows of merged_df3 should be the same as the nrows of mcsm_data
if(nrow(mcsm_data) == nrow(merged_df3)){
print("sanity check: Passed")
} else {
print("Error!: check data, nrows is not as expected")
}
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# uncomment as necessary
# only need to run this if merged_df2v2 i.e non structural pos included
#mcsm = mcsm_data$Mutationinformation
#my_merged = merged_df3$Mutationinformation
# find the index where it differs
#diff_n = which(!my_merged%in%mcsm)
#check if it is indeed pos 186
#merged_df3[diff_n,]
# remove this entry
#merged_df3 = merged_df3[-diff_n,]
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
###########################
# 3b :merging two dfs: without NA
###########################
#########
# merge 2a:same as merge 1 but excluding NA
#########
merged_df2_comp = merged_df2[!is.na(merged_df2$AF),]
#########
# merge 2b: remove duplicate mutation information
#########
merged_df3_comp = merged_df2_comp[!duplicated(merged_df2_comp$Mutationinformation),]
# FIXME: add this as a sanity check. I have manually checked!
# alternate way of deriving merged_df3_comp
foo = merged_df3[!is.na(merged_df3$AF),]
# compare dfs: foo and merged_df3_com
all.equal(foo, merged_df3)
summary(comparedf(foo, merged_df3))
#=============== end of combining df
#clear variables
rm(mcsm_data
, meta_with_afor
, foo)
#rm(diff_n, my_merged, mcsm)
#===============end of script
#=====================
# write_output files
#=====================
# Not required as this is a subset of the "combining_two_df.R" script

View file

@ -0,0 +1,25 @@
#!/bin/bash
#*************************************
# need to be in the correct directory
#*************************************
##: comments for code
#: commented out code
#**********************************************************************
# TASK: Text file containing a list of SNPs; SNP in the format(C2E)
# per line. Sort by unique, which automatically removes duplicates.
# sace file in current directory
#**********************************************************************
infile="${HOME}/git/Data/input/processed/pyrazinamide/pnca_mis_SNPs_v2.csv"
outfile="${HOME}/git/Data/input/processed/pyrazinamide/pnca_mis_SNPs_v2_unique.csv"
# sort unique entries and output to current directory
sort -u ${infile} > ${outfile}
# count no. of unique snps mCSM will run on
count=$(wc -l < ${outfile})
# print to console no. of unique snps mCSM will run on
echo "${count} unique mutations for mCSM to run on"

View file

@ -0,0 +1,72 @@
#!/bin/bash
#*************************************
#need to be in the correct directory
#*************************************
##: comments for code
#: commented out code
#**********************************************************************
# TASK: submit requests using curl: HANDLE redirects and refresh url.
# Iterate over mutation file and write/append result urls to a file
# result url file: stored in the /Results directory
# mutation file: one mutation per line, no chain ID
# output: in a file, should be n urls (n=no. of mutations in file)
# NOTE: these are just result urls, not actual values for results
#**********************************************************************
## iterate over mutation file; line by line and submit query using curl
filename="../Data/pnca_mis_SNPs_v2_unique.csv"
## some useful messages
echo -n -e "Processing $(wc -l < ${filename}) entries from ${filename}\n"
COUNT=0
while read -r line; do
((COUNT++))
mutation="${line}"
# echo "${mutation}"
pdb='../Data/complex1_no_water.pdb'
mutation="${mutation}"
chain="A"
lig_id="PZA"
affin_wt="0.99"
host="http://biosig.unimelb.edu.au"
call_url="/mcsm_lig/prediction"
##=========================================
##html field_names names required for curl
##complex_field:wild=@
##mutation_field:mutation=@
##chain_field:chain=@
##ligand_field:lig_id@
##energy_field:affin_wt
#=========================================
refresh_url=$(curl -L \
-sS \
-F "wild=@${pdb}" \
-F "mutation=${mutation}" \
-F "chain=${chain}" \
-F "lig_id=${lig_id}" \
-F "affin_wt=${affin_wt}" \
${host}${call_url} | grep "http-equiv")
#echo $refresh_url
#echo ${host}${refresh_url}
#use regex to extract the relevant bit from the refresh url
#regex:sed -r 's/.*(\/mcsm.*)".*$/\1/g'
#Now build: result url using host and refresh url and write the urls to a file in the Results dir
result_url=$(echo $refresh_url | sed -r 's/.*(\/mcsm.*)".*$/\1/g')
sleep 10
echo -e "${mutation} : processing entry ${COUNT}/$(wc -l < ${filename})..."
echo -e "${host}${result_url}" >> ../Results/$(wc -l < ${filename})_mCSM_lig_complex1_result_url.txt
#echo -n '.'
done < "${filename}"
echo
echo "Processing Complete"
##end of submitting query, receiving result url and storing results url in a file

View file

@ -0,0 +1,59 @@
#!/bin/bash
#*************************************
#need to be in the correct directory
#*************************************
##: comments for code
#: commented out code
#********************************************************************
# TASK: submit result urls and fetch actual results using curl
# iterate over each result url from the output of step1 in the stored
# in file in /Results.
# Use curl to fetch results and extract relevant sections using hxtools
# and store these in another file in /Results
# This script takes two arguments:
# input file: file containing results url
# In this case: 336_mCSM_lig_complex1_result_url.txt
# output file: name of the file where extracted results will be stored
# In this case : it is 336_mCSM_lig_complex1_output_MASTER.txt
#*********************************************************************
#if [ "$#" -ne 2 ]; then
#if [ -Z $1 ]; then
# echo "
# Please provide both Input and Output files.
# Usage: batch_read_urls.sh INFILE OUTFILE
# "
# exit 1
#fi
# First argument: Input File
# Second argument: Output File
#infile=$1
#outfile=$2
infile="${HOME}/git/LSHTM_analysis/mcsm_complex1/Results/336_mCSM_lig_complex1_result_url.txt"
outfile="${HOME}/git/LSHTM_analysis/mcsm_complex1/Results/336_mCSM_lig_complex1_output_MASTER.txt"
echo -n "Processing $(wc -l < ${infile}) entries from ${infile}"
echo
COUNT=0
while read -r line; do
#COUNT=$(($COUNT+1))
((COUNT++))
curl --silent ${line} \
| hxnormalize -x \
| hxselect -c div.span4 \
| hxselect -c div.well \
| sed -r -e 's/<[^>]*>//g' \
| sed -re 's/ +//g' \
>> ${outfile}
#| tee -a ${outfile}
# echo -n '.'
echo -e "Processing entry ${COUNT}/$(wc -l < ${infile})..."
done < "${infile}"
echo
echo "Processing Complete"

View file

@ -0,0 +1,52 @@
#!/bin/bash
#*************************************
#need to be in the correct directory
#*************************************
##: comments for code
#: commented out code
#********************************************************************
# TASK: Intermediate results processing
# output file has a convenient delimiter of ":" that can be used to
# format the file into two columns (col1: field_desc and col2: values)
# However the section "PredictedAffinityChange:...." and
# "DUETstabilitychange:.." are split over multiple lines and
# prevent this from happening.Additionally there are other empty lines
# that need to be omiited. In order ensure these sections are not split
# over multiple lines, this script is written.
#*********************************************************************
infile="../Results/336_mCSM_lig_complex1_output_processed.txt"
#sed -i '/PredictedAffinityChange:/ { N; N; N; N; s/\n//g;}' ${infile} \
# | sed -i '/DUETstabilitychange:/ {x; N; N; s/\n//g; p;d;}' ${infile}
# Outputs records separated by a newline, that look something like this:
# PredictedAffinityChange:-2.2log(affinityfoldchange)-Destabilizing
# Mutationinformation:
# Wild-type:L
# Position:4
# Mutant-type:W
# Chain:A
# LigandID:PZA
# Distancetoligand:15.911&Aring;
# DUETstabilitychange:-2.169Kcal/mol
#
# PredictedAffinityChange:-1.538log(affinityfoldchange)-Destabilizing
# (...etc)
# This script brings everything in a convenient format for further processing in python.
# bear in mind, this replaces the file in place, so make sure you retain a copy for your records
sed -i '/PredictedAffinityChange/ {
N
N
N
N
s/\n//g
}
/DUETstabilitychange:/ {
N
N
s/\n//g
}
/^$/d' ${infile}

View file

@ -0,0 +1,29 @@
#!/usr/bin/python
import pandas as pd
from collections import defaultdict
#file = r'../Results/322_mCSM_lig_complex1_output_processed.txt'
outCols=[
'PredictedAffinityChange',
'Mutationinformation',
'Wild-type',
'Position',
'Mutant-type',
'Chain',
'LigandID',
'Distancetoligand',
'DUETstabilitychange'
]
lines = [line.rstrip('\n') for line in open('../Results/336_mCSM_lig_complex1_output_processed.txt')]
outputs = defaultdict(list)
for item in lines:
col, val = item.split(':')
outputs[col].append(val)
dfOut=pd.DataFrame(outputs)
pd.DataFrame.to_csv(dfOut,'../Results/336_complex1_formatted_results.csv', columns=outCols)

View file

@ -0,0 +1,207 @@
getwd()
#setwd("~/Documents/git/LSHTM_analysis/mcsm_complex1/Results") # work
setwd("~/git/LSHTM_analysis/mcsm_complex1/Results") # thinkpad
#setwd("/Users/tanu/git/LSHTM_analysis/mcsm_complex1/Results") # mac
getwd()
#=======================================================
#TASK: To tidy the columns so you can generate figures
#=======================================================
####################
#### read file #####: this will be the output from python script (csv file)
####################
data = read.csv("336_complex1_formatted_results.csv"
, header = T
, stringsAsFactors = FALSE)
dim(data)
#335, 10
str(data)
###########################
##### Data processing #####
###########################
# populate mutation information columns as currently it is empty
head(data$Mutationinformation)
tail(data$Mutationinformation)
# should not be blank: create muation information
data$Mutationinformation = paste0(data$Wild.type, data$Position, data$Mutant.type)
head(data$Mutationinformation)
tail(data$Mutationinformation)
#write.csv(data, 'test.csv')
##########################################
# Remove duplicate SNPs as a sanity check
##########################################
#very important
table(duplicated(data$Mutationinformation))
#FALSE
#335
#extract duplicated entries
dups = data[duplicated(data$Mutationinformation),] #0
#No of dups should match with the no. of TRUE in the above table
#u_dups = unique(dups$Mutationinformation) #10
sum( table(dups$Mutationinformation) ) #13
rm(dups)
#***************************************************************
#select non-duplicated SNPs and create a new df
df = data[!duplicated(data$Mutationinformation),] #309, 10
#***************************************************************
#sanity check
u = unique(df$Mutationinformation)
u2 = unique(data$Mutationinformation)
table(u%in%u2)
#TRUE
#309
#should all be 1, hence 309 1's
sum(table(df$Mutationinformation) == 1)
#sort df by Position
#MANUAL CHECKPOINT:
#foo <- df[order(df$Position),]
#df <- df[order(df$Position),]
rm(u, u2, dups)
####################
#### give meaningful colnames to reflect units to enable correct data type
####################
#=======
#STEP 1
#========
#make a copy of the PredictedAffinityColumn and call it Lig_outcome
df$Lig_outcome = df$PredictedAffinityChange #335, 11
#make Predicted...column numeric and outcome column categorical
head(df$PredictedAffinityChange)
df$PredictedAffinityChange = gsub("log.*"
, ""
, df$PredictedAffinityChange)
#sanity checks
head(df$PredictedAffinityChange)
#should be numeric, check and if not make it numeric
is.numeric( df$PredictedAffinityChange )
#change to numeric
df$PredictedAffinityChange = as.numeric(df$PredictedAffinityChange)
#should be TRUE
is.numeric( df$PredictedAffinityChange )
#change the column name to indicate units
n = which(colnames(df) == "PredictedAffinityChange"); n
colnames(df)[n] = "PredAffLog"
colnames(df)[n]
#========
#STEP 2
#========
#make Lig_outcome column categorical showing effect of mutation
head(df$Lig_outcome)
df$Lig_outcome = gsub("^.*-"
, "",
df$Lig_outcome)
#sanity checks
head(df$Lig_outcome)
#should be factor, check and if not change it to factor
is.factor(df$Lig_outcome)
#change to factor
df$Lig_outcome = as.factor(df$Lig_outcome)
#should be TRUE
is.factor(df$Lig_outcome)
#========
#STEP 3
#========
#gsub
head(df$Distancetoligand)
df$Distancetoligand = gsub("&Aring;"
, ""
, df$Distancetoligand)
#sanity checks
head(df$Distancetoligand)
#should be numeric, check if not change it to numeric
is.numeric(df$Distancetoligand)
#change to numeric
df$Distancetoligand = as.numeric(df$Distancetoligand)
#should be TRUE
is.numeric(df$Distancetoligand)
#change the column name to indicate units
n = which(colnames(df) == "Distancetoligand")
colnames(df)[n] <- "Dis_lig_Ang"
colnames(df)[n]
#========
#STEP 4
#========
#gsub
head(df$DUETstabilitychange)
df$DUETstabilitychange = gsub("Kcal/mol"
, ""
, df$DUETstabilitychange)
#sanity checks
head(df$DUETstabilitychange)
#should be numeric, check if not change it to numeric
is.numeric(df$DUETstabilitychange)
#change to numeric
df$DUETstabilitychange = as.numeric(df$DUETstabilitychange)
#should be TRUE
is.numeric(df$DUETstabilitychange)
#change the column name to indicate units
n = which(colnames(df) == "DUETstabilitychange"); n
colnames(df)[n] = "DUETStability_Kcalpermol"
colnames(df)[n]
#========
#STEP 5
#========
#create yet another extra column: classification of DUET stability only
df$DUET_outcome = ifelse(df$DUETStability_Kcalpermol >=0
, "Stabilizing"
, "Destabilizing") #335, 12
table(df$Lig_outcome)
#Destabilizing Stabilizing
#281 54
table(df$DUET_outcome)
#Destabilizing Stabilizing
#288 47
#==============================
#FIXME
#Insert a venn diagram
#================================
#========
#STEP 6
#========
# assign wild and mutant colnames correctly
wt = which(colnames(df) == "Wild.type"); wt
colnames(df)[wt] <- "Wild_type"
colnames(df[wt])
mut = which(colnames(df) == "Mutant.type"); mut
colnames(df)[mut] <- "Mutant_type"
colnames(df[mut])
#========
#STEP 7
#========
#create an extra column: maybe useful for some plots
df$WildPos = paste0(df$Wild_type, df$Position) #335, 13
#clear variables
rm(n, wt, mut)
################ end of data cleaning

View file

@ -0,0 +1,252 @@
getwd()
#setwd("~/Documents/git/LSHTM_analysis/mcsm_complex1/Results") # work
setwd("~/git/LSHTM_analysis/mcsm_complex1/Results") # thinkpad
#setwd("/Users/tanu/git/LSHTM_analysis/mcsm_complex1/Results") # mac
getwd()
#=======================================================
#TASK:read cleaned data and perform rescaling
# of DUET stability scores
# of Pred affinity
#compare scaling methods with plots
#output normalised file
#=======================================================
####################
#### read file #####: this will be the output of my R script that cleans the data columns
####################
source("../Scripts/step3c_data_cleaning.R")
##This will outut two dataframes:
##data: unclean data: 335, 10
##df : cleaned df 335, 13
## you can remove data if you want as you will not need it
rm(data)
colnames(df)
#===================
#3a: PredAffLog
#===================
n = which(colnames(df) == "PredAffLog"); n
group = which(colnames(df) == "Lig_outcome"); group
#===================================================
# order according to PredAffLog values
#===================================================
# This is because this makes it easier to see the results of rescaling for debugging
head(df$PredAffLog)
#ORDER BY PredAff scrores: negative values at the top and positive at the bottoom
df = df[order(df$PredAffLog),]
head(df$PredAffLog)
#sanity checks
head(df[,n]) #all negatives
tail(df[,n]) #all positives
#sanity checks
mean(df[,n])
#-0.9526746
tapply(df[,n], df[,group], mean)
#Destabilizing Stabilizing
#-1.2112100 0.3926667
#===========================
#Same as above: in 2 steps
#===========================
#find range of your data
my_min = min(df[,n]); my_min #-3.948
my_max = max(df[,n]); my_max #2.23
#===============================================
# WITHIN GROUP rescaling 2: method "ratio"
# create column to store the rescaled values
# Rescaling separately (Less dangerous)
# =====> chosen one:as Nick prefers
#===============================================
df$ratioPredAff = ifelse(df[,n] < 0
, df[,n]/abs(my_min)
, df[,n]/my_max
)#335 14
#sanity checks
head(df$ratioPredAff)
tail(df$ratioPredAff)
min(df$ratioPredAff); max(df$ratioPredAff)
tapply(df$ratioPredAff, df$Lig_outcome, min)
#Destabilizing Stabilizing
#-1.000000000 0.005381166
tapply(df$ratioPredAff, df$Lig_outcome, max)
#Destabilizing Stabilizing
#-0.001266464 1.000000000
#should be the same as below (281 and 54)
sum(df$ratioPredAff < 0); sum(df$ratioPredAff > 0)
table(df$Lig_outcome)
#Destabilizing Stabilizing
#281 54
#===============================================
# Hist and density plots to compare the rescaling
# methods: Base R
#===============================================
#uncomment as necessary
my_title = "Ligand_stability"
#my_title = colnames(df[n])
# Set the margin on all sides
par(oma = c(3,2,3,0)
, mar = c(1,3,5,2)
, mfrow = c(2,2))
hist(df[,n]
, xlab = ""
, main = "Raw values"
)
hist(df$ratioPredAff
, xlab = ""
, main = "ratio rescaling"
)
# Plot density plots underneath
plot(density( df[,n] )
, main = "Raw values"
)
plot(density( df$ratioPredAff )
, main = "ratio rescaling"
)
# titles
mtext(text = "Frequency"
, side = 2
, line = 0
, outer = TRUE)
mtext(text = my_title
, side = 3
, line = 0
, outer = TRUE)
#clear variables
rm(my_min, my_max, my_title, n, group)
#===================
# 3b: DUET stability
#===================
dim(df) #335, 14
n = which(colnames(df) == "DUETStability_Kcalpermol"); n # 10
group = which(colnames(df) == "DUET_outcome"); group #12
#===================================================
# order according to DUET scores
#===================================================
# This is because this makes it easier to see the results of rescaling for debugging
head(df$DUETStability_Kcalpermol)
#ORDER BY DUET scores: negative values at the top and positive at the bottom
df = df[order(df$DUETStability_Kcalpermol),]
#sanity checks
head(df[,n]) #negatives
tail(df[,n]) #positives
#sanity checks
mean(df[,n])
#[1] -1.173316
tapply(df[,n], df[,group], mean)
#Destabilizing Stabilizing
#-1.4297257 0.3978723
#===============================================
# WITHIN GROUP rescaling 2: method "ratio"
# create column to store the rescaled values
# Rescaling separately (Less dangerous)
# =====> chosen one:as Nick prefers
#===============================================
#find range of your data
my_min = min(df[,n]); my_min #-3.87
my_max = max(df[,n]); my_max #1.689
df$ratioDUET = ifelse(df[,n] < 0
, df[,n]/abs(my_min)
, df[,n]/my_max
) #335, 15
#sanity check
head(df$ratioDUET)
tail(df$ratioDUET)
min(df$ratioDUET); max(df$ratioDUET)
#sanity checks
tapply(df$ratioDUET, df$DUET_outcome, min)
#Destabilizing Stabilizing
#-1.00000000 0.01065719
tapply(df$ratioDUET, df$DUET_outcome, max)
#Destabilizing Stabilizing
#-0.003875969 1.000000000
#should be the same as below (267 and 42)
sum(df$ratioDUET < 0); sum(df$ratioDUET > 0)
table(df$DUET_outcome)
#Destabilizing Stabilizing
#288 47
#===============================================
# Hist and density plots to compare the rescaling
# methods: Base R
#===============================================
#uncomment as necessary
my_title = "DUET_stability"
#my_title = colnames(df[n])
# Set the margin on all sides
par(oma = c(3,2,3,0)
, mar = c(1,3,5,2)
, mfrow = c(2,2))
hist(df[,n]
, xlab = ""
, main = "Raw values"
)
hist(df$ratioDUET
, xlab = ""
, main = "ratio rescaling"
)
# Plot density plots underneath
plot(density( df[,n] )
, main = "Raw values"
)
plot(density( df$ratioDUET )
, main = "ratio rescaling"
)
# graph titles
mtext(text = "Frequency"
, side = 2
, line = 0
, outer = TRUE)
mtext(text = my_title
, side = 3
, line = 0
, outer = TRUE)
#===================
# write output as csv file
#===================
write.csv(df, "../Data/mcsm_complex1_normalised.csv", row.names = FALSE) #335, 15

View file

@ -0,0 +1,131 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("../Header_TT.R")
#source("barplot_colour_function.R")
require(data.table)
require(dplyr)
########################################################################
# Read file: call script for combining df for PS #
########################################################################
source("../combining_two_df.R")
###########################
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
###########################
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
###########################
# you need merged_df3
# or
# merged_df3_comp
# since these have unique SNPs
# I prefer to use the merged_df3
# because using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available
###########################
# uncomment as necessary
#%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
my_df = merged_df3
#my_df = merged_df3_comp
#%%%%%%%%%%%%%%%%%%%%%%%%
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
###########################
# Data for bfactor figure
# PS average
# Lig average
###########################
head(my_df$Position)
head(my_df$ratioDUET)
# order data frame
df = my_df[order(my_df$Position),]
head(df$Position)
head(df$ratioDUET)
#***********
# PS: average by position
#***********
mean_DUET_by_position <- df %>%
group_by(Position) %>%
summarize(averaged.DUET = mean(ratioDUET))
#***********
# Lig: average by position
#***********
mean_Lig_by_position <- df %>%
group_by(Position) %>%
summarize(averaged.Lig = mean(ratioPredAff))
#***********
# cbind:mean_DUET_by_position and mean_Lig_by_position
#***********
combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position ))
# sanity check
# mean_PS_Lig_Bfactor
colnames(combined)
colnames(combined) = c("Position"
, "average_DUETR"
, "Position2"
, "average_PredAffR")
colnames(combined)
identical(combined$Position, combined$Position2)
n = which(colnames(combined) == "Position2"); n
combined_df = combined[,-n]
max(combined_df$average_DUETR) ; min(combined_df$average_DUETR)
max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR)
#=============
# output csv
#============
outDir = "~/git/Data/pyrazinamide/input/processed/"
outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv")
print(paste0("Output file with path will be:","", outFile))
head(combined_df$Position); tail(combined_df$Position)
write.csv(combined_df, outFile
, row.names = F)

Binary file not shown.

View file

@ -0,0 +1,250 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("../Header_TT.R")
#source("barplot_colour_function.R")
require(cowplot)
########################################################################
# Read file: call script for combining df for PS #
########################################################################
source("../combining_two_df.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#===========================
###########################
# Data for OR and stability plots
# you need merged_df3_comp
# since these are matched
# to allow pairwise corr
###########################
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df3_comp
#my_df = merged_df3
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# sanity check
# Ensure correct data type in columns to plot: need to be factor
is.numeric(my_df$OR)
#[1] TRUE
#<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
# FOR PS Plots
#<<<<<<<<<<<<<<<<<<<
PS_df = my_df
rm(my_df)
#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> end of section 1
########################################################################
# Read file: call script for combining df for lig #
########################################################################
getwd()
source("combining_two_df_lig.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#===========================
###########################
# Data for OR and stability plots
# you need merged_df3_comp
# since these are matched
# to allow pairwise corr
###########################
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df2 = merged_df3_comp
#my_df2 = merged_df3
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df2)
str(my_df2)
# sanity check
# Ensure correct data type in columns to plot: need to be factor
is.numeric(my_df2$OR)
#[1] TRUE
# sanity check: should be <10
if (max(my_df2$Dis_lig_Ang) < 10){
print ("Sanity check passed: lig data is <10Ang")
}else{
print ("Error: data should be filtered to be within 10Ang")
}
#<<<<<<<<<<<<<<<<
# REASSIGNMENT
# FOR Lig Plots
#<<<<<<<<<<<<<<<<
Lig_df = my_df2
rm(my_df2)
#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> end of section 1
#############
# Plots: Bubble plot
# x = Position, Y = stability
# size of dots = OR
# col: stability
#############
#=================
# generate plot 1: DUET vs OR by position as geom_points
#=================
my_ats = 20 # axis text size
my_als = 22 # axis label size
# Spelling Correction: made redundant as already corrected at the source
#PS_df$DUET_outcome[PS_df$DUET_outcome=='Stabilizing'] <- 'Stabilising'
#PS_df$DUET_outcome[PS_df$DUET_outcome=='Destabilizing'] <- 'Destabilising'
table(PS_df$DUET_outcome) ; sum(table(PS_df$DUET_outcome))
g = ggplot(PS_df, aes(x = factor(Position)
, y = ratioDUET))
p1 = g +
geom_point(aes(col = DUET_outcome
, size = OR)) +
theme(axis.text.x = element_text(size = my_ats
, angle = 90
, hjust = 1
, vjust = 0.4)
, axis.text.y = element_text(size = my_ats
, angle = 0
, hjust = 1
, vjust = 0)
, axis.title.x = element_text(size = my_als)
, axis.title.y = element_text(size = my_als)
, legend.text = element_text(size = my_als)
, legend.title = element_text(size = my_als) ) +
#, legend.key.size = unit(1, "cm")) +
labs(title = ""
, x = "Position"
, y = "DUET(PS)"
, size = "Odds Ratio"
, colour = "DUET Outcome") +
guides(colour = guide_legend(override.aes = list(size=4)))
p1
#=================
# generate plot 2: Lig vs OR by position as geom_points
#=================
my_ats = 20 # axis text size
my_als = 22 # axis label size
# Spelling Correction: made redundant as already corrected at the source
#Lig_df$Lig_outcome[Lig_df$Lig_outcome=='Stabilizing'] <- 'Stabilising'
#Lig_df$Lig_outcome[Lig_df$Lig_outcome=='Destabilizing'] <- 'Destabilising'
table(Lig_df$Lig_outcome)
g = ggplot(Lig_df, aes(x = factor(Position)
, y = ratioPredAff))
p2 = g +
geom_point(aes(col = Lig_outcome
, size = OR))+
theme(axis.text.x = element_text(size = my_ats
, angle = 90
, hjust = 1
, vjust = 0.4)
, axis.text.y = element_text(size = my_ats
, angle = 0
, hjust = 1
, vjust = 0)
, axis.title.x = element_text(size = my_als)
, axis.title.y = element_text(size = my_als)
, legend.text = element_text(size = my_als)
, legend.title = element_text(size = my_als) ) +
#, legend.key.size = unit(1, "cm")) +
labs(title = ""
, x = "Position"
, y = "Ligand Affinity"
, size = "Odds Ratio"
, colour = "Ligand Outcome"
) +
guides(colour = guide_legend(override.aes = list(size=4)))
p2
#======================
#combine using cowplot
#======================
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots"
getwd()
svg('PS_Lig_OR_combined.svg', width = 32, height = 12) #inches
#png('PS_Lig_OR_combined.png', width = 2800, height = 1080) #300dpi
theme_set(theme_gray()) # to preserve default theme
printFile = cowplot::plot_grid(plot_grid(p1, p2
, ncol = 1
, align = 'v'
, labels = c("A", "B")
, label_size = my_als+5))
print(printFile)
dev.off()

View file

@ -0,0 +1,154 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("../Header_TT.R")
########################################################################
# Read file: call script for combining df for lig #
########################################################################
source("../combining_two_df_lig.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#===========================
###########################
# Data for Lig plots
# you need merged_df3
# or
# merged_df3_comp
# since these have unique SNPs
# I prefer to use the merged_df3
# because using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available
###########################
# uncomment as necessary
#%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
my_df = merged_df3
#my_df = merged_df3_comp
#%%%%%%%%%%%%%%%%%%%%%%%%
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
#############################
# Extra sanity check:
# for mcsm_lig ONLY
# Dis_lig_Ang should be <10
#############################
if (max(my_df$Dis_lig_Ang) < 10){
print ("Sanity check passed: lig data is <10Ang")
}else{
print ("Error: data should be filtered to be within 10Ang")
}
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#==========================
# Plot: Barplot with scores (unordered)
# corresponds to Lig_outcome
# Stacked Barplot with colours: Lig_outcome @ position coloured by
# Lig_outcome. This is a barplot where each bar corresponds
# to a SNP and is coloured by its corresponding Lig_outcome.
#============================
#===================
# Data for plots
#===================
#%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
df = my_df
#%%%%%%%%%%%%%%%%%%%%%%%%
rm(my_df)
# sanity checks
upos = unique(my_df$Position)
# should be a factor
is.factor(df$Lig_outcome)
#TRUE
table(df$Lig_outcome)
# should be -1 and 1: may not be in this case because you have filtered the data
# FIXME: normalisation before or after filtering?
min(df$ratioPredAff) #
max(df$ratioPredAff) #
# sanity checks
tapply(df$ratioPredAff, df$Lig_outcome, min)
tapply(df$ratioPredAff, df$Lig_outcome, max)
#******************
# generate plot
#******************
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()
my_title = "Ligand affinity"
# axis label size
my_xaxls = 13
my_yaxls = 15
# axes text size
my_xaxts = 15
my_yaxts = 15
# no ordering of x-axis
g = ggplot(df, aes(factor(Position, ordered = T)))
g +
geom_bar(aes(fill = Lig_outcome), colour = "grey") +
theme( axis.text.x = element_text(size = my_xaxls
, angle = 90
, hjust = 1
, vjust = 0.4)
, axis.text.y = element_text(size = my_yaxls
, angle = 0
, hjust = 1
, vjust = 0)
, axis.title.x = element_text(size = my_xaxts)
, axis.title.y = element_text(size = my_yaxts ) ) +
labs(title = my_title
, x = "Position"
, y = "Frequency")
# for sanity and good practice
rm(df)
#======================= end of plot
# axis colours labels
# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label

View file

@ -0,0 +1,149 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages and functions #
########################################################################
source("../Header_TT.R")
########################################################################
# Read file: call script for combining df for PS #
########################################################################
source("../combining_two_df.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#==========================
###########################
# Data for DUET plots
# you need merged_df3
# or
# merged_df3_comp
# since these have unique SNPs
# I prefer to use the merged_df3
# because using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available
###########################
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df3
#my_df = merged_df3_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# Ensure correct data type in columns to plot: need to be factor
# sanity check
is.factor(my_df$DUET_outcome)
my_df$DUET_outcome = as.factor(my_df$DUET_outcome)
is.factor(my_df$DUET_outcome)
#[1] TRUE
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#==========================
# Plot 2: Barplot with scores (unordered)
# corresponds to DUET_outcome
# Stacked Barplot with colours: DUET_outcome @ position coloured by
# DUET outcome. This is a barplot where each bar corresponds
# to a SNP and is coloured by its corresponding DUET_outcome
#============================
#===================
# Data for plots
#===================
#<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df = my_df
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(my_df)
# sanity checks
upos = unique(df$Position)
# should be a factor
is.factor(my_df$DUET_outcome)
#[1] TRUE
table(my_df$DUET_outcome)
# should be -1 and 1
min(df$ratioDUET)
max(df$ratioDUET)
tapply(df$ratioDUET, df$DUET_outcome, min)
tapply(df$ratioDUET, df$DUET_outcome, max)
#******************
# generate plot
#******************
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()
my_title = "Protein stability (DUET)"
# axis label size
my_xaxls = 13
my_yaxls = 15
# axes text size
my_xaxts = 15
my_yaxts = 15
# no ordering of x-axis
g = ggplot(df, aes(factor(Position, ordered = T)))
g +
geom_bar(aes(fill = DUET_outcome), colour = "grey") +
theme( axis.text.x = element_text(size = my_xaxls
, angle = 90
, hjust = 1
, vjust = 0.4)
, axis.text.y = element_text(size = my_yaxls
, angle = 0
, hjust = 1
, vjust = 0)
, axis.title.x = element_text(size = my_xaxts)
, axis.title.y = element_text(size = my_yaxts ) ) +
labs(title = my_title
, x = "Position"
, y = "Frequency")
# for sanity and good practice
rm(df)
#======================= end of plot
# axis colours labels
# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label

View file

@ -0,0 +1,202 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages and functions #
########################################################################
source("../Header_TT.R")
source("../barplot_colour_function.R")
########################################################################
# Read file: call script for combining df for lig #
########################################################################
source("../combining_two_df_lig.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#===========================
###########################
# Data for Lig plots
# you need merged_df3
# or
# merged_df3_comp
# since these have unique SNPs
# I prefer to use the merged_df3
# because using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available
###########################
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df3
#my_df = merged_df3_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# Ensure correct data type in columns to plot: need to be factor
# sanity check
is.factor(my_df$Lig_outcome)
my_df$Lig_outcome = as.factor(my_df$Ligoutcome)
is.factor(my_df$Lig_outcome)
#[1] TRUE
#############################
# Extra sanity check:
# for mcsm_lig ONLY
# Dis_lig_Ang should be <10
#############################
if (max(my_df$Dis_lig_Ang) < 10){
print ("Sanity check passed: lig data is <10Ang")
}else{
print ("Error: data should be filtered to be within 10Ang")
}
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#==========================
# Plot: Barplot with scores (unordered)
# corresponds to Lig_outcome
# Stacked Barplot with colours: Lig_outcome @ position coloured by
# stability scores. This is a barplot where each bar corresponds
# to a SNP and is coloured by its corresponding Lig stability value.
# Normalised values (range between -1 and 1 ) to aid visualisation
# NOTE: since barplot plots discrete values, colour = score, so number of
# colours will be equal to the no. of unique normalised scores
# rather than a continuous scale
# will require generating the colour scale separately.
#============================
#===================
# Data for plots
#===================
#<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df = my_df
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(my_df)
# sanity checks
table(df$Lig_outcome)
# should be -1 and 1: may not be in this case because you have filtered the data
# FIXME: normalisation before or after filtering?
min(df$ratioPredAff) #
max(df$ratioPredAff) #
# sanity checks
# very important!!!!
tapply(df$ratioPredAff, df$Lig_outcome, min)
tapply(df$ratioPredAff, df$Lig_outcome, max)
#******************
# generate plot
#******************
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()
# My colour FUNCTION: based on group and subgroup
# in my case;
# df = df
# group = Lig_outcome
# subgroup = normalised score i.e ratioPredAff
# Prepare data: round off ratioLig scores
# round off to 3 significant digits:
# 165 if no rounding is performed: used to generate the originalgraph
# 156 if rounded to 3 places
# FIXME: check if reducing precision creates any ML prob
# check unique values in normalised data
u = unique(df$ratioPredAff)
# <<<<< -------------------------------------------
# Run this section if rounding is to be used
# specify number for rounding
n = 3
df$ratioLigR = round(df$ratioPredAff, n)
u = unique(df$ratioLigR) # 156
# create an extra column called group which contains the "gp name and score"
# so colours can be generated for each unique values in this column
my_grp = df$ratioLigR
df$group <- paste0(df$Lig_outcome, "_", my_grp, sep = "")
# else
# uncomment the below if rounding is not required
#my_grp = df$ratioLig
#df$group <- paste0(df$Lig_outcome, "_", my_grp, sep = "")
# <<<<< -----------------------------------------------
# Call the function to create the palette based on the group defined above
colours <- ColourPalleteMulti(df, "Lig_outcome", "my_grp")
my_title = "Ligand affinity"
# axis label size
my_xaxls = 13
my_yaxls = 15
# axes text size
my_xaxts = 15
my_yaxts = 15
# no ordering of x-axis
g = ggplot(df, aes(factor(Position, ordered = T)))
g +
geom_bar(aes(fill = group), colour = "grey") +
scale_fill_manual( values = colours
, guide = 'none') +
theme( axis.text.x = element_text(size = my_xaxls
, angle = 90
, hjust = 1
, vjust = 0.4)
, axis.text.y = element_text(size = my_yaxls
, angle = 0
, hjust = 1
, vjust = 0)
, axis.title.x = element_text(size = my_xaxts)
, axis.title.y = element_text(size = my_yaxts ) ) +
labs(title = my_title
, x = "Position"
, y = "Frequency")
# for sanity and good practice
rm(df)
#======================= end of plot
# axis colours labels
# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label

View file

@ -0,0 +1,192 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages and functions #
########################################################################
source("../Header_TT.R")
source("../barplot_colour_function.R")
########################################################################
# Read file: call script for combining df for PS #
########################################################################
source("../combining_two_df.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#===========================
###########################
# Data for DUET plots
# you need merged_df3
# or
# merged_df3_comp
# since these have unique SNPs
# I prefer to use the merged_df3
# because using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available
###########################
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df3
#my_df = merged_df3_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# Ensure correct data type in columns to plot: need to be factor
# sanity check
is.factor(my_df$DUET_outcome)
my_df$DUET_outcome = as.factor(my_df$DUET_outcome)
is.factor(my_df$DUET_outcome)
#[1] TRUE
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#==========================
# Barplot with scores (unordered)
# corresponds to DUET_outcome
# Stacked Barplot with colours: DUET_outcome @ position coloured by
# stability scores. This is a barplot where each bar corresponds
# to a SNP and is coloured by its corresponding DUET stability value.
# Normalised values (range between -1 and 1 ) to aid visualisation
# NOTE: since barplot plots discrete values, colour = score, so number of
# colours will be equal to the no. of unique normalised scores
# rather than a continuous scale
# will require generating the colour scale separately.
#============================
#===================
# Data for plots
#===================
#<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df = my_df
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(my_df)
# sanity checks
upos = unique(df$Position)
# should be a factor
is.factor(my_df$DUET_outcome)
#[1] TRUE
table(df$DUET_outcome)
# should be -1 and 1
min(df$ratioDUET)
max(df$ratioDUET)
tapply(df$ratioDUET, df$DUET_outcome, min)
tapply(df$ratioDUET, df$DUET_outcome, max)
#******************
# generate plot
#******************
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()
# My colour FUNCTION: based on group and subgroup
# in my case;
# df = df
# group = DUET_outcome
# subgroup = normalised score i.e ratioDUET
# Prepare data: round off ratioDUET scores
# round off to 3 significant digits:
# 323 if no rounding is performed: used to generate the original graph
# 287 if rounded to 3 places
# FIXME: check if reducing precicion creates any ML prob
# check unique values in normalised data
u = unique(df$ratioDUET)
# <<<<< -------------------------------------------
# Run this section if rounding is to be used
# specify number for rounding
n = 3
df$ratioDUETR = round(df$ratioDUET, n)
u = unique(df$ratioDUETR)
# create an extra column called group which contains the "gp name and score"
# so colours can be generated for each unique values in this column
my_grp = df$ratioDUETR
df$group <- paste0(df$DUET_outcome, "_", my_grp, sep = "")
# else
# uncomment the below if rounding is not required
#my_grp = df$ratioDUET
#df$group <- paste0(df$DUET_outcome, "_", my_grp, sep = "")
# <<<<< -----------------------------------------------
# Call the function to create the palette based on the group defined above
colours <- ColourPalleteMulti(df, "DUET_outcome", "my_grp")
my_title = "Protein stability (DUET)"
# axis label size
my_xaxls = 13
my_yaxls = 15
# axes text size
my_xaxts = 15
my_yaxts = 15
# no ordering of x-axis
g = ggplot(df, aes(factor(Position, ordered = T)))
g +
geom_bar(aes(fill = group), colour = "grey") +
scale_fill_manual( values = colours
, guide = 'none') +
theme( axis.text.x = element_text(size = my_xaxls
, angle = 90
, hjust = 1
, vjust = 0.4)
, axis.text.y = element_text(size = my_yaxls
, angle = 0
, hjust = 1
, vjust = 0)
, axis.title.x = element_text(size = my_xaxts)
, axis.title.y = element_text(size = my_yaxts ) ) +
labs(title = my_title
, x = "Position"
, y = "Frequency")
# for sanity and good practice
rm(df)
#======================= end of plot
# axis colours labels
# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label

View file

@ -0,0 +1,215 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("../Header_TT.R")
#require(data.table)
#require(dplyr)
########################################################################
# Read file: call script for combining df for lig #
########################################################################
source("../combining_two_df_lig.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#===========================
###########################
# Data for Lig plots
# you need merged_df3
# or
# merged_df3_comp
# since these have unique SNPs
# I prefer to use the merged_df3
# because using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available
###########################
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df3
#my_df = merged_df3_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# Ensure correct data type in columns to plot: need to be factor
# sanity check
is.factor(my_df$Lig_outcome)
my_df$Lig_outcome = as.factor(my_df$lig_outcome)
is.factor(my_df$Lig_outcome)
#[1] TRUE
#############################
# Extra sanity check:
# for mcsm_lig ONLY
# Dis_lig_Ang should be <10
#############################
if (max(my_df$Dis_lig_Ang) < 10){
print ("Sanity check passed: lig data is <10Ang")
}else{
print ("Error: data should be filtered to be within 10Ang")
}
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#===========================
# Plot: Basic barplots
#===========================
#===================
# Data for plots
#===================
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df = my_df
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(my_df)
# sanity checks
str(df)
if (identical(df$Position, df$position)){
print("Sanity check passed: Columns 'Position' and 'position' are identical")
} else{
print("Error!: Check column names and info contained")
}
#****************
# generate plot: No of stabilising and destabilsing muts
#****************
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()
svg('basic_barplots_LIG.svg')
my_ats = 25 # axis text size
my_als = 22 # axis label size
# uncomment as necessary for either directly outputting results or
# printing on the screen
g = ggplot(df, aes(x = Lig_outcome))
#prinfFile = g + geom_bar(
g + geom_bar(
aes(fill = Lig_outcome)
, show.legend = TRUE
) + geom_label(
stat = "count"
, aes(label = ..count..)
, color = "black"
, show.legend = FALSE
, size = 10) + theme(
axis.text.x = element_blank()
, axis.title.x = element_blank()
, axis.title.y = element_text(size=my_als)
, axis.text.y = element_text(size = my_ats)
, legend.position = c(0.73,0.8)
, legend.text = element_text(size=my_als-2)
, legend.title = element_text(size=my_als)
, plot.title = element_blank()
) + labs(
title = ""
, y = "Number of SNPs"
#, fill='Ligand Outcome'
) + scale_fill_discrete(name = "Ligand Outcome"
, labels = c("Destabilising", "Stabilising"))
print(prinfFile)
dev.off()
#****************
# generate plot: No of positions
#****************
#get freq count of positions so you can subset freq<1
#require(data.table)
setDT(df)[, pos_count := .N, by = .(Position)] #169, 36
head(df$pos_count)
table(df$pos_count)
# this is cummulative
#1 2 3 4 5 6
#5 24 36 56 30 18
# use group by on this
snpsBYpos_df <- df %>%
group_by(Position) %>%
summarize(snpsBYpos = mean(pos_count))
table(snpsBYpos_df$snpsBYpos)
#1 2 3 4 5 6
#5 12 12 14 6 3
# this is what will get plotted
svg('position_count_LIG.svg')
my_ats = 25 # axis text size
my_als = 22 # axis label size
g = ggplot(snpsBYpos_df, aes(x = snpsBYpos))
prinfFile = g + geom_bar(
#g + geom_bar(
aes (alpha = 0.5)
, show.legend = FALSE
) +
geom_label(
stat = "count", aes(label = ..count..)
, color = "black"
, size = 10
) +
theme(
axis.text.x = element_text(
size = my_ats
, angle = 0
)
, axis.text.y = element_text(
size = my_ats
, angle = 0
, hjust = 1
)
, axis.title.x = element_text(size = my_als)
, axis.title.y = element_text(size = my_als)
, plot.title = element_blank()
) +
labs(
x = "Number of SNPs"
, y = "Number of Sites"
)
print(prinfFile)
dev.off()
########################################################################
# end of Lig barplots #
########################################################################

View file

@ -0,0 +1,211 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages and functions #
########################################################################
source("../Header_TT.R")
########################################################################
# Read file: call script for combining df for PS #
########################################################################
source("../combining_two_df.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#==========================
###########################
# Data for DUET plots
# you need merged_df3
# or
# merged_df3_comp
# since these have unique SNPs
# I prefer to use the merged_df3
# because using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available
###########################
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df3
#my_df = merged_df3_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# Ensure correct data type in columns to plot: need to be factor
# sanity check
is.factor(my_df$DUET_outcome)
my_df$DUET_outcome = as.factor(my_df$DUET_outcome)
is.factor(my_df$DUET_outcome)
#[1] TRUE
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#===========================
# Plot: Basic barplots
#===========================
#===================
# Data for plots
#===================
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df = my_df
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(my_df)
# sanity checks
str(df)
if (identical(df$Position, df$position)){
print("Sanity check passed: Columns 'Position' and 'position' are identical")
} else{
print("Error!: Check column names and info contained")
}
#****************
# generate plot: No of stabilising and destabilsing muts
#****************
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()
svg('basic_barplots_DUET.svg')
my_ats = 25 # axis text size
my_als = 22 # axis label size
theme_set(theme_grey())
# uncomment as necessary for either directly outputting results or
# printing on the screen
g = ggplot(df, aes(x = DUET_outcome))
prinfFile = g + geom_bar(
#g + geom_bar(
aes(fill = DUET_outcome)
, show.legend = TRUE
) + geom_label(
stat = "count"
, aes(label = ..count..)
, color = "black"
, show.legend = FALSE
, size = 10) + theme(
axis.text.x = element_blank()
, axis.title.x = element_blank()
, axis.title.y = element_text(size=my_als)
, axis.text.y = element_text(size = my_ats)
, legend.position = c(0.73,0.8)
, legend.text = element_text(size=my_als-2)
, legend.title = element_text(size=my_als)
, plot.title = element_blank()
) + labs(
title = ""
, y = "Number of SNPs"
#, fill='DUET Outcome'
) + scale_fill_discrete(name = "DUET Outcome"
, labels = c("Destabilising", "Stabilising"))
print(prinfFile)
dev.off()
#****************
# generate plot: No of positions
#****************
#get freq count of positions so you can subset freq<1
#setDT(df)[, .(Freq := .N), by = .(Position)] #189, 36
setDT(df)[, pos_count := .N, by = .(Position)] #335, 36
table(df$pos_count)
# this is cummulative
#1 2 3 4 5 6
#34 76 63 104 40 18
# use group by on this
snpsBYpos_df <- df %>%
group_by(Position) %>%
summarize(snpsBYpos = mean(pos_count))
table(snpsBYpos_df$snpsBYpos)
#1 2 3 4 5 6
#34 38 21 26 8 3
foo = select(df, Mutationinformation
, WildPos
, wild_type
, mutant_type
, mutation_info
, position
, pos_count) #335, 5
getwd()
write.csv(foo, "../Data/pos_count_freq.csv")
svg('position_count_DUET.svg')
my_ats = 25 # axis text size
my_als = 22 # axis label size
g = ggplot(snpsBYpos_df, aes(x = snpsBYpos))
prinfFile = g + geom_bar(
#g + geom_bar(
aes (alpha = 0.5)
, show.legend = FALSE
) +
geom_label(
stat = "count", aes(label = ..count..)
, color = "black"
, size = 10
) +
theme(
axis.text.x = element_text(
size = my_ats
, angle = 0
)
, axis.text.y = element_text(
size = my_ats
, angle = 0
, hjust = 1
)
, axis.title.x = element_text(size = my_als)
, axis.title.y = element_text(size = my_als)
, plot.title = element_blank()
) +
labs(
x = "Number of SNPs"
, y = "Number of Sites"
)
print(prinfFile)
dev.off()
########################################################################
# end of DUET barplots #
########################################################################

View file

@ -0,0 +1,175 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages and functions #
########################################################################
source("../Header_TT.R")
#source("barplot_colour_function.R")
########################################################################
# Read file: call script for combining df for PS #
########################################################################
source("../combining_two_df.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#==========================
###########################
# Data for PS Corr plots
# you need merged_df3_comp
# since these are matched
# to allow pairwise corr
###########################
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df3_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#===========================
# Plot: Correlation plots
#===========================
#===================
# Data for plots
#===================
#<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df = my_df
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(my_df)
# sanity checks
str(df)
table(df$DUET_outcome)
# unique positions
length(unique(df$Position)) #{RESULT: unique positions for comp data}
# subset data to generate pairwise correlations
corr_data = df[, c("ratioDUET"
# , "ratioPredAff"
# , "DUETStability_Kcalpermol"
# , "PredAffLog"
# , "OR"
, "logor"
# , "pvalue"
, "neglog10pvalue"
, "AF"
, "DUET_outcome"
# , "Lig_outcome"
, "pyrazinamide"
)]
dim(corr_data)
rm(df)
# assign nice colnames (for display)
my_corr_colnames = c("DUET"
# , "Ligand Affinity"
# , "DUET_raw"
# , "Lig_raw"
# , "OR"
, "Log(Odds Ratio)"
# , "P-value"
, "-LogP"
, "Allele Frequency"
, "DUET_outcome"
# , "Lig_outcome"
, "pyrazinamide")
# sanity check
if (length(my_corr_colnames) == length(corr_data)){
print("Sanity check passed: corr_data and corr_names match in length")
}else{
print("Error: length mismatch!")
}
colnames(corr_data)
colnames(corr_data) <- my_corr_colnames
colnames(corr_data)
###############
# PLOTS: corr
# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
###############
#default pairs plot
start = 1
end = which(colnames(corr_data) == "pyrazinamide"); end # should be the last column
offset = 1
my_corr = corr_data[start:(end-offset)]
head(my_corr)
#my_cols = c("#f8766d", "#00bfc4")
# deep blue :#007d85
# deep red: #ae301e
#==========
# psych: ionformative since it draws the ellipsoid
# https://jamesmarquezportfolio.com/correlation_matrices_in_r.html
# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
#==========
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots"
getwd()
svg('DUET_corr.svg', width = 15, height = 15)
printFile = pairs.panels(my_corr[1:4]
, method = "spearman" # correlation method
, hist.col = "grey" ##00AFBB
, density = TRUE # show density plots
, ellipses = F # show correlation ellipses
, stars = T
, rug = F
, breaks = "Sturges"
, show.points = T
, bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr$DUET_outcome))]
, pch = 21
, jitter = T
#, alpha = .05
#, points(pch = 19, col = c("#f8766d", "#00bfc4"))
, cex = 3
, cex.axis = 2.5
, cex.labels = 3
, cex.cor = 1
, smooth = F
)
print(printFile)
dev.off()

View file

@ -0,0 +1,187 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("../Header_TT.R")
#source("barplot_colour_function.R")
########################################################################
# Read file: call script for combining df for lig #
########################################################################
source("../combining_two_df_lig.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#===========================
###########################
# Data for Lig Corr plots
# you need merged_df3_comp
# since these are matched
# to allow pairwise corr
###########################
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df3_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
#############################
# Extra sanity check:
# for mcsm_lig ONLY
# Dis_lig_Ang should be <10
#############################
if (max(my_df$Dis_lig_Ang) < 10){
print ("Sanity check passed: lig data is <10Ang")
}else{
print ("Error: data should be filtered to be within 10Ang")
}
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#===========================
# Plot: Correlation plots
#===========================
#===================
# Data for plots
#===================
#<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df = my_df
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(my_df)
# sanity checks
str(df)
table(df$Lig_outcome)
# unique positions
length(unique(df$Position)) #{RESULT: unique positions for comp data}
# subset data to generate pairwise correlations
corr_data = df[, c(#"ratioDUET",
"ratioPredAff"
# , "DUETStability_Kcalpermol"
# , "PredAffLog"
# , "OR"
, "logor"
# , "pvalue"
, "neglog10pvalue"
, "AF"
# , "DUET_outcome"
, "Lig_outcome"
, "pyrazinamide"
)]
dim(corr_data)
rm(df)
# assign nice colnames (for display)
my_corr_colnames = c(#"DUET",
"Ligand Affinity"
# ,"DUET_raw"
# , "Lig_raw"
# , "OR"
, "Log(Odds Ratio)"
# , "P-value"
, "-LogP"
, "Allele Frequency"
# , "DUET_outcome"
, "Lig_outcome"
, "pyrazinamide")
# sanity check
if (length(my_corr_colnames) == length(corr_data)){
print("Sanity check passed: corr_data and corr_names match in length")
}else{
print("Error: length mismatch!")
}
colnames(corr_data)
colnames(corr_data) <- my_corr_colnames
colnames(corr_data)
###############
# PLOTS: corr
# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
###############
# default pairs plot
start = 1
end = which(colnames(corr_data) == "pyrazinamide"); end # should be the last column
offset = 1
my_corr = corr_data[start:(end-offset)]
head(my_corr)
#my_cols = c("#f8766d", "#00bfc4")
# deep blue :#007d85
# deep red: #ae301e
#==========
# psych: ionformative since it draws the ellipsoid
# https://jamesmarquezportfolio.com/correlation_matrices_in_r.html
# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
#==========
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots"
getwd()
svg('Lig_corr.svg', width = 15, height = 15)
printFile = pairs.panels(my_corr[1:4]
, method = "spearman" # correlation method
, hist.col = "grey" ##00AFBB
, density = TRUE # show density plots
, ellipses = F # show correlation ellipses
, stars = T
, rug = F
, breaks = "Sturges"
, show.points = T
, bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr$Lig_outcome))]
, pch = 21
, jitter = T
# , alpha = .05
# , points(pch = 19, col = c("#f8766d", "#00bfc4"))
, cex = 3
, cex.axis = 2.5
, cex.labels = 3
, cex.cor = 1
, smooth = F
)
print(printFile)
dev.off()

View file

@ -0,0 +1,227 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("../Header_TT.R")
#source("barplot_colour_function.R")
require(data.table)
########################################################################
# Read file: call script for combining df #
########################################################################
source("../combining_two_df.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#==========================
###########################
# Data for plots
# you need merged_df2, comprehensive one
# since this has one-many relationship
# i.e the same SNP can belong to multiple lineages
###########################
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df2
#my_df = merged_df2_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# Ensure correct data type in columns to plot: need to be factor
is.factor(my_df$lineage)
my_df$lineage = as.factor(my_df$lineage)
is.factor(my_df$lineage)
#==========================
# Plot: Lineage barplot
# x = lineage y = No. of samples
# col = Lineage
# fill = lineage
#============================
table(my_df$lineage)
# lineage1 lineage2 lineage3 lineage4 lineage5 lineage6 lineageBOV
#3 104 1293 264 1311 6 6 105
#===========================
# Plot: Lineage Barplots
#===========================
#===================
# Data for plots
#===================
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df <- my_df
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(my_df)
# get freq count of positions so you can subset freq<1
#setDT(df)[, lineage_count := .N, by = .(lineage)]
#******************
# generate plot: barplot of mutation by lineage
#******************
sel_lineages = c("lineage1"
, "lineage2"
, "lineage3"
, "lineage4")
df_lin = subset(df, subset = lineage %in% sel_lineages )
#FIXME; add sanity check for numbers.
# Done this manually
############################################################
#########
# Data for barplot: Lineage barplot
# to show total samples and number of unique mutations
# within each linege
##########
# Create df with lineage inform & no. of unique mutations
# per lineage and total samples within lineage
# this is essentially barplot with two y axis
bar = bar = as.data.frame(sel_lineages) #4, 1
total_snps_u = NULL
total_samples = NULL
for (i in sel_lineages){
#print(i)
curr_total = length(unique(df$id)[df$lineage==i])
total_samples = c(total_samples, curr_total)
print(total_samples)
foo = df[df$lineage==i,]
print(paste0(i, "======="))
print(length(unique(foo$Mutationinformation)))
curr_count = length(unique(foo$Mutationinformation))
total_snps_u = c(total_snps_u, curr_count)
}
print(total_snps_u)
bar$num_snps_u = total_snps_u
bar$total_samples = total_samples
bar
#*****************
# generate plot: lineage barplot with two y-axis
#https://stackoverflow.com/questions/13035295/overlay-bar-graphs-in-ggplot2
#*****************
bar$num_snps_u = y1
bar$total_samples = y2
sel_lineages = x
to_plot = data.frame(x = x
, y1 = y1
, y2 = y2)
to_plot
melted = melt(to_plot, id = "x")
melted
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()
svg('lineage_basic_barplot.svg')
my_ats = 20 # axis text size
my_als = 22 # axis label size
g = ggplot(melted
, aes(x = x
, y = value
, fill = variable)
)
printFile = g + geom_bar(
#g + geom_bar(
stat = "identity"
, position = position_stack(reverse = TRUE)
, alpha=.75
, colour='grey75'
) + theme(
axis.text.x = element_text(
size = my_ats
# , angle= 30
)
, axis.text.y = element_text(size = my_ats
#, angle = 30
, hjust = 1
, vjust = 0)
, axis.title.x = element_text(
size = my_als
, colour = 'black'
)
, axis.title.y = element_text(
size = my_als
, colour = 'black'
)
, legend.position = "top"
, legend.text = element_text(size = my_als)
#) + geom_text(
) + geom_label(
aes(label = value)
, size = 5
, hjust = 0.5
, vjust = 0.5
, colour = 'black'
, show.legend = FALSE
#, check_overlap = TRUE
, position = position_stack(reverse = T)
#, position = ('
) + labs(
title = ''
, x = ''
, y = "Number"
, fill = 'Variable'
, colour = 'black'
) + scale_fill_manual(
values = c('grey50', 'gray75')
, name=''
, labels=c('Mutations', 'Total Samples')
) + scale_x_discrete(
breaks = c('lineage1', 'lineage2', 'lineage3', 'lineage4')
, labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
)
print(printFile)
dev.off()

View file

@ -0,0 +1,233 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("../Header_TT.R")
#source("barplot_colour_function.R")
#require(data.table)
########################################################################
# Read file: call script for combining df for Lig #
########################################################################
source("../combining_two_df_lig.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#===========================
###########################
# Data for plots
# you need merged_df2 or merged_df2_comp
# since this is one-many relationship
# i.e the same SNP can belong to multiple lineages
###########################
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df2
#my_df = merged_df2_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# Ensure correct data type in columns to plot: need to be factor
is.factor(my_df$lineage)
my_df$lineage = as.factor(my_df$lineage)
is.factor(my_df$lineage)
table(my_df$mutation_info)
#############################
# Extra sanity check:
# for mcsm_lig ONLY
# Dis_lig_Ang should be <10
#############################
if (max(my_df$Dis_lig_Ang) < 10){
print ("Sanity check passed: lig data is <10Ang")
}else{
print ("Error: data should be filtered to be within 10Ang")
}
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#==========================
# Plot: Lineage Distribution
# x = mcsm_values, y = dist
# fill = stability
#============================
#===================
# Data for plots
#===================
# subset only lineages1-4
sel_lineages = c("lineage1"
, "lineage2"
, "lineage3"
, "lineage4")
# uncomment as necessary
df_lin = subset(my_df, subset = lineage %in% sel_lineages ) #2037 35
# refactor
df_lin$lineage = factor(df_lin$lineage)
table(df_lin$lineage) #{RESULT: No of samples within lineage}
#lineage1 lineage2 lineage3 lineage4
#78 961 195 803
# when merged_df2_comp is used
#lineage1 lineage2 lineage3 lineage4
#77 955 194 770
length(unique(df_lin$Mutationinformation))
#{Result: No. of unique mutations the 4 lineages contribute to}
# sanity checks
r1 = 2:5 # when merged_df2 used: because there is missing lineages
if(sum(table(my_df$lineage)[r1]) == nrow(df_lin)) {
print ("sanity check passed: numbers match")
} else{
print("Error!: check your numbers")
}
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df <- df_lin
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(df_lin)
#******************
# generate distribution plot of lineages
#******************
# basic: could improve this!
library(plotly)
library(ggridges)
fooNames = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
names(fooNames) = c('lineage1', 'lineage2', 'lineage3', 'lineage4')
g <- ggplot(df, aes(x = ratioPredAff)) +
geom_density(aes(fill = Lig_outcome)
, alpha = 0.5) +
facet_wrap( ~ lineage
, scales = "free"
, labeller = labeller(lineage = fooNames) ) +
coord_cartesian(xlim = c(-1, 1)
# , ylim = c(0, 6)
# , clip = "off"
)
ggtitle("Kernel Density estimates of Ligand affinity by lineage")
ggplotly(g)
# 2 : ggridges (good!)
my_ats = 15 # axis text size
my_als = 20 # axis label size
fooNames = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
names(fooNames) = c('lineage1', 'lineage2', 'lineage3', 'lineage4')
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()
svg('lineage_dist_LIG.svg')
printFile = ggplot( df, aes(x = ratioPredAff
, y = Lig_outcome) ) +
geom_density_ridges_gradient( aes(fill = ..x..)
, scale = 3
, size = 0.3 ) +
facet_wrap( ~lineage
, scales = "free"
# , switch = 'x'
, labeller = labeller(lineage = fooNames) ) +
coord_cartesian( xlim = c(-1, 1)
# , ylim = c(0, 6)
# , clip = "off"
) +
scale_fill_gradientn( colours = c("#f8766d", "white", "#00bfc4")
, name = "Ligand Affinity" ) +
theme( axis.text.x = element_text( size = my_ats
, angle = 90
, hjust = 1
, vjust = 0.4)
# , axis.text.y = element_text( size = my_ats
# , angle = 0
# , hjust = 1
# , vjust = 0)
, axis.text.y = element_blank()
, axis.title.x = element_blank()
, axis.title.y = element_blank()
, axis.ticks.y = element_blank()
, plot.title = element_blank()
, strip.text = element_text(size = my_als)
, legend.text = element_text(size = 10)
, legend.title = element_text(size = my_als)
# , legend.position = c(0.3, 0.8)
# , legend.key.height = unit(1, 'mm')
)
print(printFile)
dev.off()
#=!=!=!=!=!=!
# COMMENT: When you look at all mutations, the lineage differences disappear...
# The pattern we are interested in is possibly only for dr_mutations
#=!=!=!=!=!=!
#===================================================
# COMPARING DISTRIBUTIONS
head(df$lineage)
df$lineage = as.character(df$lineage)
lin1 = df[df$lineage == "lineage1",]$ratioPredAff
lin2 = df[df$lineage == "lineage2",]$ratioPredAff
lin3 = df[df$lineage == "lineage3",]$ratioPredAff
lin4 = df[df$lineage == "lineage4",]$ratioPredAff
# ks test
ks.test(lin1,lin2)
ks.test(lin1,lin3)
ks.test(lin1,lin4)
ks.test(lin2,lin3)
ks.test(lin2,lin4)
ks.test(lin3,lin4)

View file

@ -0,0 +1,212 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") # thinkpad
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("../Header_TT.R")
#source("barplot_colour_function.R")
#require(data.table)
########################################################################
# Read file: call script for combining df for PS #
########################################################################
source("../combining_two_df.R")
#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
#==========================
# This will return:
# df with NA:
# merged_df2
# merged_df3
# df without NA:
# merged_df2_comp
# merged_df3_comp
#===========================
###########################
# Data for plots
# you need merged_df2 or merged_df2_comp
# since this is one-many relationship
# i.e the same SNP can belong to multiple lineages
###########################
# uncomment as necessary
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
my_df = merged_df2
#my_df = merged_df2_comp
#<<<<<<<<<<<<<<<<<<<<<<<<<
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# Ensure correct data type in columns to plot: need to be factor
is.factor(my_df$lineage)
my_df$lineage = as.factor(my_df$lineage)
is.factor(my_df$lineage)
table(my_df$mutation_info)
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#==========================
# Plot: Lineage Distribution
# x = mcsm_values, y = dist
# fill = stability
#============================
#===================
# Data for plots
#===================
# subset only lineages1-4
sel_lineages = c("lineage1"
, "lineage2"
, "lineage3"
, "lineage4")
# uncomment as necessary
df_lin = subset(my_df, subset = lineage %in% sel_lineages )
# refactor
df_lin$lineage = factor(df_lin$lineage)
table(df_lin$lineage) #{RESULT: No of samples within lineage}
#lineage1 lineage2 lineage3 lineage4
#104 1293 264 1311
# when merged_df2_comp is used
#lineage1 lineage2 lineage3 lineage4
#99 1275 263 1255
length(unique(df_lin$Mutationinformation))
#{Result: No. of unique mutations the 4 lineages contribute to}
# sanity checks
r1 = 2:5 # when merged_df2 used: because there is missing lineages
if(sum(table(my_df$lineage)[r1]) == nrow(df_lin)) {
print ("sanity check passed: numbers match")
} else{
print("Error!: check your numbers")
}
#<<<<<<<<<<<<<<<<<<<<<<<<<
# REASSIGNMENT
df <- df_lin
#<<<<<<<<<<<<<<<<<<<<<<<<<
rm(df_lin)
#******************
# generate distribution plot of lineages
#******************
# basic: could improve this!
library(plotly)
library(ggridges)
g <- ggplot(df, aes(x = ratioDUET)) +
geom_density(aes(fill = DUET_outcome)
, alpha = 0.5) + facet_wrap(~ lineage,
scales = "free") +
ggtitle("Kernel Density estimates of Protein stability by lineage")
ggplotly(g)
# 2 : ggridges (good!)
my_ats = 15 # axis text size
my_als = 20 # axis label size
fooNames=c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
names(fooNames)=c('lineage1', 'lineage2', 'lineage3', 'lineage4')
# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()
svg('lineage_dist_PS.svg')
printFile = ggplot( df, aes(x = ratioDUET
, y = DUET_outcome) )+
#printFile=geom_density_ridges_gradient(
geom_density_ridges_gradient( aes(fill = ..x..)
, scale = 3
, size = 0.3 ) +
facet_wrap( ~lineage
, scales = "free"
# , switch = 'x'
, labeller = labeller(lineage = fooNames) ) +
coord_cartesian( xlim = c(-1, 1)
# , ylim = c(0, 6)
# , clip = "off"
) +
scale_fill_gradientn( colours = c("#f8766d", "white", "#00bfc4")
, name = "DUET" ) +
theme( axis.text.x = element_text( size = my_ats
, angle = 90
, hjust = 1
, vjust = 0.4)
# , axis.text.y = element_text( size = my_ats
# , angle = 0
# , hjust = 1
# , vjust = 0)
, axis.text.y = element_blank()
, axis.title.x = element_blank()
, axis.title.y = element_blank()
, axis.ticks.y = element_blank()
, plot.title = element_blank()
, strip.text = element_text(size=my_als)
, legend.text = element_text(size=10)
, legend.title = element_text(size=my_als)
# , legend.position = c(0.3, 0.8)
# , legend.key.height = unit(1, 'mm')
)
print(printFile)
dev.off()
#=!=!=!=!=!=!
# COMMENT: When you look at all mutations, the lineage differences disappear...
# The pattern we are interested in is possibly only for dr_mutations
#=!=!=!=!=!=!
#===================================================
# COMPARING DISTRIBUTIONS
head(df$lineage)
df$lineage = as.character(df$lineage)
lin1 = df[df$lineage == "lineage1",]$ratioDUET
lin2 = df[df$lineage == "lineage2",]$ratioDUET
lin3 = df[df$lineage == "lineage3",]$ratioDUET
lin4 = df[df$lineage == "lineage4",]$ratioDUET
# ks test
ks.test(lin1,lin2)
ks.test(lin1,lin3)
ks.test(lin1,lin4)
ks.test(lin2,lin3)
ks.test(lin2,lin4)
ks.test(lin3,lin4)

View file

@ -0,0 +1,27 @@
#########################
#3: Read complex pdb file
##########################
source("Header_TT.R")
# This script only reads the pdb file of your complex
# read in pdb file complex1
inDir = "~/git/Data/pyrazinamide/input/structure/"
inFile = paste0(inDir, "complex1_no_water.pdb")
complex1 = inFile
#inFile2 = paste0(inDir, "complex2_no_water.pdb")
#complex2 = inFile2
# list of 8
my_pdb = read.pdb(complex1
, maxlines = -1
, multi = FALSE
, rm.insert = FALSE
, rm.alt = TRUE
, ATOM.only = FALSE
, hex = FALSE
, verbose = TRUE)
rm(inDir, inFile, complex1)
#====== end of script

View file

@ -0,0 +1,386 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
getwd()
########################################################################
# Installing and loading required packages #
########################################################################
source("Header_TT.R")
#########################################################
# TASK: replace B-factors in the pdb file with normalised values
# use the complex file with no water as mCSM lig was
# performed on this file. You can check it in the script: read_pdb file.
#########################################################
###########################
# 2: Read file: average stability values
# or mcsm_normalised file, output of step 4 mcsm pipeline
###########################
inDir = "~/git/Data/pyrazinamide/input/processed/"
inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
my_df <- read.csv(inFile
# , row.names = 1
# , stringsAsFactors = F
, header = T)
str(my_df)
#=========================================================
# Processing P1: Replacing B factor with mean ratioDUET scores
#=========================================================
#########################
# Read complex pdb file
# form the R script
##########################
source("read_pdb.R") # list of 8
# extract atom list into a variable
# since in the list this corresponds to data frame, variable will be a df
d = my_pdb[[1]]
# make a copy: required for downstream sanity checks
d2 = d
# sanity checks: B factor
max(d$b); min(d$b)
#*******************************************
# plot histograms for inspection
# 1: original B-factors
# 2: original DUET Scores
# 3: replaced B-factors with DUET Scores
#*********************************************
# Set the margin on all sides
par(oma = c(3,2,3,0)
, mar = c(1,3,5,2)
, mfrow = c(3,2))
#par(mfrow = c(3,2))
#1: Original B-factor
hist(d$b
, xlab = ""
, main = "B-factor")
plot(density(d$b)
, xlab = ""
, main = "B-factor")
# 2: DUET scores
hist(my_df$average_DUETR
, xlab = ""
, main = "Norm_DUET")
plot(density(my_df$average_DUETR)
, xlab = ""
, main = "Norm_DUET")
# 3: After the following replacement
#********************************
#=========
# step 0_P1: DONT RUN once you have double checked the matched output
#=========
# sanity check: match and assign to a separate column to double check
# colnames(my_df)
# d$ratioDUET = my_df$averge_DUETR[match(d$resno, my_df$Position)]
#=========
# step 1_P1
#=========
# Be brave and replace in place now (don't run sanity check)
# this makes all the B-factor values in the non-matched positions as NA
d$b = my_df$average_DUETR[match(d$resno, my_df$Position)]
#=========
# step 2_P1
#=========
# count NA in Bfactor
b_na = sum(is.na(d$b)) ; b_na
# count number of 0's in Bactor
sum(d$b == 0)
#table(d$b)
# replace all NA in b factor with 0
d$b[is.na(d$b)] = 0
# sanity check: should be 0
sum(is.na(d$b))
# sanity check: should be True
if (sum(d$b == 0) == b_na){
print ("Sanity check passed: NA's replaced with 0's successfully")
} else {
print("Error: NA replacement NOT successful, Debug code!")
}
max(d$b); min(d$b)
# sanity checks: should be True
if(max(d$b) == max(my_df$average_DUETR)){
print("Sanity check passed: B-factors replaced correctly")
} else {
print ("Error: Debug code please")
}
if (min(d$b) == min(my_df$average_DUETR)){
print("Sanity check passed: B-factors replaced correctly")
} else {
print ("Error: Debug code please")
}
#=========
# step 3_P1
#=========
# sanity check: dim should be same before reassignment
# should be TRUE
dim(d) == dim(d2)
#=========
# step 4_P1
#=========
# assign it back to the pdb file
my_pdb[[1]] = d
max(d$b); min(d$b)
#=========
# step 5_P1
#=========
# output dir
getwd()
outDir = "~/git/Data/pyrazinamide/input/structure/"
outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile
write.pdb(my_pdb, outFile)
#********************************
# Add the 3rd histogram and density plots for comparisons
#********************************
# Plots continued...
# 3: hist and density of replaced B-factors with DUET Scores
hist(d$b
, xlab = ""
, main = "repalced-B")
plot(density(d$b)
, xlab = ""
, main = "replaced-B")
# graph titles
mtext(text = "Frequency"
, side = 2
, line = 0
, outer = TRUE)
mtext(text = "DUET_stability"
, side = 3
, line = 0
, outer = TRUE)
#********************************
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# NOTE: This replaced B-factor distribution has the same
# x-axis as the PredAff normalised values, but the distribution
# is affected since 0 is overinflated. This is because all the positions
# where there are no SNPs have been assigned 0.
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
#######################################################################
#====================== end of section 1 ==============================
#######################################################################
#=========================================================
# Processing P2: Replacing B values with PredAff Scores
#=========================================================
# clear workspace
rm(list = ls())
###########################
# 2: Read file: average stability values
# or mcsm_normalised file, output of step 4 mcsm pipeline
###########################
inDir = "~/git/Data/pyrazinamide/input/processed/"
inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
my_df <- read.csv(inFile
# , row.names = 1
# , stringsAsFactors = F
, header = T)
str(my_df)
#rm(inDir, inFile)
#########################
# 3: Read complex pdb file
# form the R script
##########################
source("read_pdb.R") # list of 8
# extract atom list into a variable
# since in the list this corresponds to data frame, variable will be a df
d = my_pdb[[1]]
# make a copy: required for downstream sanity checks
d2 = d
# sanity checks: B factor
max(d$b); min(d$b)
#*******************************************
# plot histograms for inspection
# 1: original B-factors
# 2: original Pred Aff Scores
# 3: replaced B-factors with PredAff Scores
#********************************************
# Set the margin on all sides
par(oma = c(3,2,3,0)
, mar = c(1,3,5,2)
, mfrow = c(3,2))
#par(mfrow = c(3,2))
# 1: Original B-factor
hist(d$b
, xlab = ""
, main = "B-factor")
plot(density(d$b)
, xlab = ""
, main = "B-factor")
# 2: Pred Aff scores
hist(my_df$average_PredAffR
, xlab = ""
, main = "Norm_lig_average")
plot(density(my_df$average_PredAffR)
, xlab = ""
, main = "Norm_lig_average")
# 3: After the following replacement
#********************************
#=================================================
# Processing P2: Replacing B values with ratioPredAff scores
#=================================================
# use match to perform this replacement linking with "position no"
# in the pdb file, this corresponds to column "resno"
# in my_df, this corresponds to column "Position"
#=========
# step 0_P2: DONT RUN once you have double checked the matched output
#=========
# sanity check: match and assign to a separate column to double check
# colnames(my_df)
# d$ratioPredAff = my_df$average_PredAffR[match(d$resno, my_df$Position)] #1384, 17
#=========
# step 1_P2: BE BRAVE and replace in place now (don't run step 0)
#=========
# this makes all the B-factor values in the non-matched positions as NA
d$b = my_df$average_PredAffR[match(d$resno, my_df$Position)]
#=========
# step 2_P2
#=========
# count NA in Bfactor
b_na = sum(is.na(d$b)) ; b_na
# count number of 0's in Bactor
sum(d$b == 0)
#table(d$b)
# replace all NA in b factor with 0
d$b[is.na(d$b)] = 0
# sanity check: should be 0
sum(is.na(d$b))
if (sum(d$b == 0) == b_na){
print ("Sanity check passed: NA's replaced with 0's successfully")
} else {
print("Error: NA replacement NOT successful, Debug code!")
}
max(d$b); min(d$b)
# sanity checks: should be True
if (max(d$b) == max(my_df$average_PredAffR)){
print("Sanity check passed: B-factors replaced correctly")
} else {
print ("Error: Debug code please")
}
if (min(d$b) == min(my_df$average_PredAffR)){
print("Sanity check passed: B-factors replaced correctly")
} else {
print ("Error: Debug code please")
}
#=========
# step 3_P2
#=========
# sanity check: dim should be same before reassignment
# should be TRUE
dim(d) == dim(d2)
#=========
# step 4_P2
#=========
# assign it back to the pdb file
my_pdb[[1]] = d
max(d$b); min(d$b)
#=========
# step 5_P2
#=========
# output dir
outDir = "~/git/Data/pyrazinamide/input/structure/"
outFile = paste0(outDir, "complex1_BwithNormLIG.pdb"); outFile
write.pdb(my_pdb, outFile)
#********************************
# Add the 3rd histogram and density plots for comparisons
#********************************
# Plots continued...
# 3: hist and density of replaced B-factors with PredAff Scores
hist(d$b
, xlab = ""
, main = "repalced-B")
plot(density(d$b)
, xlab = ""
, main = "replaced-B")
# graph titles
mtext(text = "Frequency"
, side = 2
, line = 0
, outer = TRUE)
mtext(text = "Lig_stability"
, side = 3
, line = 0
, outer = TRUE)
#********************************
###########
# end of output files with Bfactors
##########

View file

@ -0,0 +1,257 @@
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
getwd()
#########################################################
# 1: Installing and loading required packages #
#########################################################
source("Header_TT.R")
#source("barplot_colour_function.R")
##########################################################
# Checking: Entire data frame and for PS #
##########################################################
###########################
#2) Read file: combined one from the script
###########################
source("combining_two_df.R")
# df with NA:
# merged_df2
# merged_df3:
# df without NA:
# merged_df2_comp:
# merged_df3_comp:
######################
# You need to check it
# with the merged_df3
########################
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
my_df = merged_df3
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#clear variables
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# should be true
identical(my_df$Position, my_df$position)
#################################
# Read file: normalised file
# output of step 4 mcsm_pipeline
#################################
inDir = "~/git/Data/pyrazinamide/input/processed/"
inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile
mcsm_data <- read.csv(inFile
, row.names = 1
, stringsAsFactors = F
, header = T)
str(mcsm_data)
my_colnames = colnames(mcsm_data)
#====================================
# subset my_df to include only the columns in mcsm data
my_df2 = my_df[my_colnames]
#====================================
# compare the two
head(mcsm_data$Mutationinformation)
head(mcsm_data$Position)
head(my_df2$Mutationinformation)
head(my_df2$Position)
# sort mcsm data by Mutationinformation
mcsm_data_s = mcsm_data[order(mcsm_data$Mutationinformation),]
head(mcsm_data_s$Mutationinformation)
head(mcsm_data_s$Position)
# now compare: should be True, but is false....
# possibly due to rownames!?!
identical(mcsm_data_s, my_df2)
# from library dplyr
setdiff(mcsm_data_s, my_df2)
#from lib compare
compare(mcsm_data_s, my_df2) # seems rownames are the problem
# FIXME: automate this
# write files: checked using meld and files are indeed identical
#write.csv(mcsm_data_s, "mcsm_data_s.csv", row.names = F)
#write.csv(my_df2, "my_df2.csv", row.names = F)
#====================================================== end of section 1
##########################################################
# Checking: LIG(Filtered dataframe) #
##########################################################
# clear workspace
rm(list = ls())
###########################
#3) Read file: combined_lig from the script
###########################
source("combining_two_df_lig.R")
# df with NA:
# merged_df2 :
# merged_df3:
# df without NA:
# merged_df2_comp:
# merged_df3_comp:
######################
# You need to check it
# with the merged_df3
########################
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
my_df = merged_df3
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#clear variables
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# should be true
identical(my_df$Position, my_df$position)
#################################
# Read file: normalised file
# output of step 4 mcsm_pipeline
#################################
inDir = "~/git/Data/pyrazinamide/input/processed/"
inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile
mcsm_data <- read.csv(inFile
, row.names = 1
, stringsAsFactors = F
, header = T)
str(mcsm_data)
###########################
# 4a: Filter/subset data: ONLY for LIGand analysis
# Lig plots < 10Ang
# Filter the lig plots for Dis_to_lig < 10Ang
###########################
# sanity checks
upos = unique(mcsm_data$Position)
# check range of distances
max(mcsm_data$Dis_lig_Ang)
min(mcsm_data$Dis_lig_Ang)
# Lig filtered: subset data to have only values less than 10 Ang
mcsm_data2 = subset(mcsm_data, mcsm_data$Dis_lig_Ang < 10)
rm(mcsm_data) #to avoid confusion
table(mcsm_data2$Dis_lig_Ang<10)
table(mcsm_data2$Dis_lig_Ang>10)
max(mcsm_data2$Dis_lig_Ang)
min(mcsm_data2$Dis_lig_Ang)
upos_f = unique(mcsm_data2$Position); upos_f
# colnames of df that you will need to subset the bigger df from
my_colnames = colnames(mcsm_data2)
#====================================
# subset bigger df i.e my_df to include only the columns in mcsm data2
my_df2 = my_df[my_colnames]
rm(my_df) #to avoid confusion
#====================================
# compare the two
head(mcsm_data2$Mutationinformation)
head(mcsm_data2$Position)
head(my_df2$Mutationinformation)
head(my_df2$Position)
# sort mcsm data by Mutationinformation
mcsm_data2_s = mcsm_data2[order(mcsm_data2$Mutationinformation),]
head(mcsm_data2_s$Mutationinformation)
head(mcsm_data2_s$Position)
# now compare: should be True, but is false....
# possibly due to rownames!?!
identical(mcsm_data2_s, my_df2)
# from library dplyr
setdiff(mcsm_data2_s, my_df2)
# from library compare
compare(mcsm_data2_s, my_df2) # seems rownames are the problem
#FIXME: automate this
# write files: checked using meld and files are indeed identical
#write.csv(mcsm_data2_s, "mcsm_data2_s.csv", row.names = F)
#write.csv(my_df2, "my_df2.csv", row.names = F)
##########################################################
# extract and write output file for SNP posn: all #
##########################################################
head(merged_df3$Position)
foo = merged_df3[order(merged_df3$Position),]
head(foo$Position)
snp_pos_unique = unique(foo$Position); snp_pos_unique
# sanity check:
table(snp_pos_unique == combined_df$Position)
#=====================
# write_output files
#=====================
outDir = "~/Data/pyrazinamide/input/processed/"
outFile1 = paste0(outDir, "snp_pos_unique.txt"); outFile1
print(paste0("Output file name and path will be:","", outFile1))
write.table(snp_pos_unique
, outFile1
, row.names = F
, col.names = F)
##############################################################
# extract and write output file for SNP posn: complete only #
##############################################################
head(merged_df3_comp$Position)
foo = merged_df3_comp[order(merged_df3_comp$Position),]
head(foo$Position)
snp_pos_unique = unique(foo$Position); snp_pos_unique
# outDir = "~/Data/pyrazinamide/input/processed/" # already set
outFile2 = paste0(outDir, "snp_pos_unique_comp.txt")
print(paste0("Output file name and path will be:", outFile2))
write.table(snp_pos_unique
, outFile2
, row.names = F
, col.names = F)
#============================== end of script

View file

@ -0,0 +1,512 @@
, stringsAsFactors = F)
x = as.numeric(grepl(i,raw_data$all_muts_pza))
# DV: pyrazinamide 0 or 1
y = as.numeric(raw_data$pyrazinamide)
table(y,x)
# run glm model
model = glm(y ~ x, family = binomial)
#model = glm(y ~ x, family = binomial(link = "logit"))
summary(model)
#**********
# extract relevant model output
#**********
# extract log OR i.e the Beta estimate of the logistic model for a given snp
my_logor = summary(model)$coefficients[2,1]
print(paste0('Beta:', my_logor))
# extract SE of the logistic model for a given snp
my_se = summary(model)$coefficients[2,2]
print(paste0('SE:', my_se))
# extract Z of the logistic model for a given snp
my_zval = summary(model)$coefficients[2,3]
print(paste0('Z-value:', my_zval))
# Dervive OR i.e exp(my_logor) from the logistic model for a given snp
#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
my_or = exp(summary(model)$coefficients[2,1])
print(paste0('OR:', my_or))
# sanity check : should be True
log(my_or) == my_logor
# extract P-value of the logistic model for a given snp
my_pval = summary(model)$coefficients[2,4]
print(paste0('P-value:', my_pval))
# extract confint interval of snp (2 steps, since the output is a named number)
ci_mod = exp(confint(model))[2,]
my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
print(paste0('CI:', my_ci))
#*************
# Assign the regression output in the original df
# you can use ('=' or '<-/->')
#*************
#pnca_snps_or$logistic_logOR[pnca_snps_or$Mutationinformation == i] = my_logor
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$Mutationinformation == i]
my_logor
pnca_snps_or$Mutationinformation == i
View(pnca_snps_or)
#===============
# Step 4: Calculate for one snp
# using i, when you run the loop, it is easy
#===============
i = "pnca_p.trp68gly"
pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
, stringsAsFactors = F
, header = T) #2133
# uncomment as necessary
pnca_snps_or = pnca_snps_or[1:5,]
pnca_snps_or = pnca_snps_or[c(1:5),]
#===============
pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
, stringsAsFactors = F
, header = T) #2133
pnca_snps_or = pnca_snps_or[1:5,]
pnca_snps_or = pnca_snps_or[c(1:5),]
pnca_snps_or = pnca_snps_or[1:5]
pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
, stringsAsFactors = F
, header = T) #2133
pnca_snps_or = pnca_snps_or[1:5]
pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
, stringsAsFactors = F
, header = T) #2133
foo = pnca_snps_or[c(1:5,)]
foo = pnca_snps_or[c(1:5),]
foo = as.data.frame(pnca_snps_or[c(1:5),])
View(foo)
# create an empty dataframe
pnca_snps_or = as.data.frame(pnca_snps_or[c(1:5),])
# IV: corresponds to each unique snp (extracted using grep)
x = as.numeric(grepl(i,raw_data$all_muts_pza))
# DV: pyrazinamide 0 or 1
y = as.numeric(raw_data$pyrazinamide)
table(y,x)
# run glm model
model = glm(y ~ x, family = binomial)
#model = glm(y ~ x, family = binomial(link = "logit"))
summary(model)
my_logor = summary(model)$coefficients[2,1]
print(paste0('Beta:', my_logor))
# extract SE of the logistic model for a given snp
my_se = summary(model)$coefficients[2,2]
print(paste0('SE:', my_se))
# extract Z of the logistic model for a given snp
my_zval = summary(model)$coefficients[2,3]
print(paste0('Z-value:', my_zval))
# Dervive OR i.e exp(my_logor) from the logistic model for a given snp
#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
my_or = exp(summary(model)$coefficients[2,1])
print(paste0('OR:', my_or))
# sanity check : should be True
log(my_or) == my_logor
# extract P-value of the logistic model for a given snp
my_pval = summary(model)$coefficients[2,4]
print(paste0('P-value:', my_pval))
# extract confint interval of snp (2 steps, since the output is a named number)
ci_mod = exp(confint(model))[2,]
my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
print(paste0('CI:', my_ci))
#*************
# Assign the regression output in the original df
# you can use ('=' or '<-/->')
#*************
#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
#===============
# Step 4: Iterate through this unique list
# and calculate OR, but only for one snp
# this is test before you apply it all others
#===============
pnca_snps_or$mutation == i
View(pnca_snps_or)
# create an empty dataframe
pnca_snps_or = data.frame(mutation = i)
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
View(pnca_snps_or_copy)
#===============
# Step 4: Iterate through this unique list
# and calculate OR, but only for one snp
# this is test before you apply it all others
#===============
#reset original df so you don't make a mistake
pnca_snps_or = pnca_snps_or_copy
for (i in pnca_snps_unique){
print(i)
}
pnca_snps_or = pnca_snps_or_copy #2133, 1
#........................................
# create an empty dataframe : uncomment as necessary
pnca_snps_or = data.frame(mutation = c(i, "blank_mut")
#........................................
# create an empty dataframe : uncomment as necessary
pnca_snps_or = data.frame(mutation = c(i, "blank_mut"))
#........................................
# create an empty dataframe : uncomment as necessary
pnca_snps_or = data.frame(mutation = c(i, "blank_mut"))
View(pnca_snps_or)
# IV: corresponds to each unique snp (extracted using grep)
x = as.numeric(grepl(i,raw_data$all_muts_pza))
# DV: pyrazinamide 0 or 1
y = as.numeric(raw_data$pyrazinamide)
table(y,x)
# run glm model
model = glm(y ~ x, family = binomial)
#model = glm(y ~ x, family = binomial(link = "logit"))
summary(model)
#**********
# extract relevant model output
#**********
# extract log OR i.e the Beta estimate of the logistic model for a given snp
my_logor = summary(model)$coefficients[2,1]
print(paste0('Beta:', my_logor))
# extract SE of the logistic model for a given snp
my_se = summary(model)$coefficients[2,2]
print(paste0('SE:', my_se))
# extract Z of the logistic model for a given snp
my_zval = summary(model)$coefficients[2,3]
print(paste0('Z-value:', my_zval))
# Dervive OR i.e exp(my_logor) from the logistic model for a given snp
#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
my_or = exp(summary(model)$coefficients[2,1])
print(paste0('OR:', my_or))
# sanity check : should be True
log(my_or) == my_logor
# extract P-value of the logistic model for a given snp
my_pval = summary(model)$coefficients[2,4]
print(paste0('P-value:', my_pval))
# extract confint interval of snp (2 steps, since the output is a named number)
ci_mod = exp(confint(model))[2,]
my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
print(paste0('CI:', my_ci))
#*************
# Assign the regression output in the original df
# you can use ('=' or '<-/->')
#*************
#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
View(pnca_snps_or)
pnca_snps_or = pnca_snps_or_copy #2133, 1
for (i in pnca_snps_unique){
print(i)
#*************
# start logistic regression model building
#*************
# set the IV and DV for the logistic regression model
# IV: corresponds to each unique snp (extracted using grep)
x = as.numeric(grepl(i,raw_data$all_muts_pza))
# DV: pyrazinamide 0 or 1
y = as.numeric(raw_data$pyrazinamide)
table(y,x)
# run glm model
model = glm(y ~ x, family = binomial)
#model = glm(y ~ x, family = binomial(link = "logit"))
summary(model)
#**********
# extract relevant model output
#**********
# extract log OR i.e the Beta estimate of the logistic model for a given snp
my_logor = summary(model)$coefficients[2,1]
print(paste0('Beta:', my_logor))
# extract SE of the logistic model for a given snp
my_se = summary(model)$coefficients[2,2]
print(paste0('SE:', my_se))
# extract Z of the logistic model for a given snp
my_zval = summary(model)$coefficients[2,3]
print(paste0('Z-value:', my_zval))
# Dervive OR i.e exp(my_logor) from the logistic model for a given snp
#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
my_or = exp(summary(model)$coefficients[2,1])
print(paste0('OR:', my_or))
# sanity check : should be True
log(my_or) == my_logor
# extract P-value of the logistic model for a given snp
my_pval = summary(model)$coefficients[2,4]
print(paste0('P-value:', my_pval))
# extract confint interval of snp (2 steps, since the output is a named number)
ci_mod = exp(confint(model))[2,]
my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
print(paste0('CI:', my_ci))
#*************
# Assign the regression output in the original df
# you can use ('=' or '<-/->')
#*************
#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
}
warnings()
View(pnca_snps_or)
View(pnca_snps_or_copy)
#sanity check
pnca_snps_or$mutation == i1
#sanity check
pnca_snps_or[pnca_snps_or$mutation == i1]
pnca_snps_or[pnca_snps_or$mutation == i2]
pnca_snps_or[pnca_snps_or$mutation == i2,]
pnca_snps_or1 = unique(pnca_snps_or)
write.csv(pnca_snps_or1, "../Data_original/valid_pnca_snps_with_OR.csv")
# you only need it for the unique mutations
pnca_snps_or = unique(pnca_snps_or) #2133, 1
for (i in pnca_snps_unique){
print(i)
#*************
# start logistic regression model building
#*************
# set the IV and DV for the logistic regression model
# IV: corresponds to each unique snp (extracted using grep)
x = as.numeric(grepl(i,raw_data$all_muts_pza))
# DV: pyrazinamide 0 or 1
y = as.numeric(raw_data$pyrazinamide)
table(y,x)
# run glm model
model = glm(y ~ x, family = binomial)
#model = glm(y ~ x, family = binomial(link = "logit"))
summary(model)
#**********
# extract relevant model output
#**********
# extract log OR i.e the Beta estimate of the logistic model for a given snp
my_logor = summary(model)$coefficients[2,1]
print(paste0('Beta:', my_logor))
# extract SE of the logistic model for a given snp
my_se = summary(model)$coefficients[2,2]
print(paste0('SE:', my_se))
# extract Z of the logistic model for a given snp
my_zval = summary(model)$coefficients[2,3]
print(paste0('Z-value:', my_zval))
# Dervive OR i.e exp(my_logor) from the logistic model for a given snp
#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
my_or = exp(summary(model)$coefficients[2,1])
print(paste0('OR:', my_or))
# sanity check : should be True
log(my_or) == my_logor
# extract P-value of the logistic model for a given snp
my_pval = summary(model)$coefficients[2,4]
print(paste0('P-value:', my_pval))
# extract confint interval of snp (2 steps, since the output is a named number)
ci_mod = exp(confint(model))[2,]
my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
print(paste0('CI:', my_ci))
#*************
# Assign the regression output in the original df
# you can use ('=' or '<-/->')
#*************
#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
}
View(pnca_snps_or)
2.290256e+01
1.561132e+06
3.242285e-04
#sanity check
pnca_snps_or[pnca_snps_or$mutation == i1]
pnca_snps_or[pnca_snps_or$mutation == i2,]
write.csv(pnca_snps_or1, "../Data_original/valid_pnca_snps_with_OR.csv")
my_data = read.csv("../Data_original/meta_pza_with_AF.csv"
, stringsAsFactors = FALSE) #11374, 19
View(my_data)
# remove the first column
my_data = my_data[-1] #11374, 18
# check if first col is 'id': should be TRUE
colnames(my_data)[1] == 'id'
# sanity check
snps_all = unique(my_data$mutation)# 337
pnca_snps_or = snps_all
pnca_snps_or = as.data.frame(snps_all)
View(pnca_snps_or)
snps_all[-"true_wt"]
pnca_snps_or = as.data.frame(snps_all[-c(1,1)])
View(pnca_snps_or)
snps_all = as.data.frame(snps_all)
View(snps_all)
#remove true_wt entry
w1 = which(rownames(snps_all) == "true_wt")
View(snps_all)
#remove true_wt entry
w1 = which(snps_all$snps_all == "true_wt")
rm(pnca_snps_or)
pnca_snps_or = snps_all[-w1]
pnca_snps_or = snps_all[,-w1]
pnca_snps_or = as.data.frame(snps_all[-c(1,1)])
#remove true_wt entry
w1 = which(snps_all) == "true_wt"
pnca_snps_or = as.data.frame(snps_all[-c(1,1)])
my_data = read.csv("../Data_original/meta_pza_with_AF.csv"
, stringsAsFactors = FALSE) #11374, 19
# remove the first column
my_data = my_data[-1] #11374, 18
# check if first col is 'id': should be TRUE
colnames(my_data)[1] == 'id'
# sanity check
snps_all = unique(my_data$mutation)# 337
snps_all = as.data.frame(snps_all)
snps_all[-c(1,1)]
pnca_snps_or = as.data.frame(snps_all[-c(1,1)])
pnca_snps_or = as.data.frame(snps_all[, -c(1,1)])
#remove true_wt entry
#w1 = which(snps_all) == "true_wt"
pnca_snps_or = snps_all
pnca_snps_or = pnca_snps_or_copy
#remove true_wt entry
#w1 = which(snps_all) == "true_wt"
pnca_snps_or = snps_all
pnca_snps_or -> pnca_snps_or_copy
#===============
# Step 4: Iterate through this unique list
# and calculate OR for each snp
# and assign to the pnca_snps_or df that has
# each row as a unique snp
#===============
# reset original df so you don't make a mistake: IMPORTANT
pnca_snps_or = pnca_snps_or_copy #2133, 1
# you only need it for the unique mutations
pnca_snps_or = unique(pnca_snps_or) #337, 1
for (i in pnca_snps_unique){
print(i)
#*************
# start logistic regression model building
#*************
# set the IV and DV for the logistic regression model
# IV: corresponds to each unique snp (extracted using grep)
x = as.numeric(grepl(i,raw_data$all_muts_pza))
# DV: pyrazinamide 0 or 1
y = as.numeric(raw_data$pyrazinamide)
table(y,x)
# run glm model
model = glm(y ~ x, family = binomial)
#model = glm(y ~ x, family = binomial(link = "logit"))
summary(model)
#**********
# extract relevant model output
#**********
# extract log OR i.e the Beta estimate of the logistic model for a given snp
my_logor = summary(model)$coefficients[2,1]
print(paste0('Beta:', my_logor))
# extract SE of the logistic model for a given snp
my_se = summary(model)$coefficients[2,2]
print(paste0('SE:', my_se))
# extract Z of the logistic model for a given snp
my_zval = summary(model)$coefficients[2,3]
print(paste0('Z-value:', my_zval))
# Dervive OR i.e exp(my_logor) from the logistic model for a given snp
#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
my_or = exp(summary(model)$coefficients[2,1])
print(paste0('OR:', my_or))
# sanity check : should be True
log(my_or) == my_logor
# extract P-value of the logistic model for a given snp
my_pval = summary(model)$coefficients[2,4]
print(paste0('P-value:', my_pval))
# extract confint interval of snp (2 steps, since the output is a named number)
ci_mod = exp(confint(model))[2,]
my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
print(paste0('CI:', my_ci))
#*************
# Assign the regression output in the original df
# you can use ('=' or '<-/->')
#*************
#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
}
getwd()
#setwd("~/Documents/git/LSHTM_Y1_PNCA/meta_data_analysis") # work
setwd("~/git/LSHTM_Y1_PNCA/meta_data_analysis") # thinkpad
#setwd("/Users/tanu/git/LSHTM_Y1_PNCA/meta_data_analysis") # mac
getwd()
#===============
# Step 1: read raw data
#===============
raw_data<-read.csv("../Data_original/original_tanushree_data_v2.csv"
,stringsAsFactors = F)[,c("id","pyrazinamide","dr_mutations_pyrazinamide","other_mutations_pyrazinamide")]#19265, 4
raw_data<-raw_data[!is.na(raw_data$pyrazinamide),]#12511, 4
# combine the two mutation columns
raw_data$all_mutations_pyrazinamide<-paste(raw_data$dr_mutations_pyrazinamide, raw_data$other_mutations_pyrazinamide)#12511, 5
head(raw_data$all_mutations_pyrazinamide)
# create yet another column that contains all the mutations but in lower case
raw_data$all_muts_pza = tolower(raw_data$all_mutations_pyrazinamide) #12511, 6
table(grepl("pnca_p",raw_data$all_muts_pza))
#FALSE TRUE
#10603 1908
pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
, stringsAsFactors = F
, header = T) #2133
# subset a snall section to test
#pnca_snps_or_copy = pnca_snps_or
#pnca_snps_or = pnca_snps_or_copy
pnca_snps_unique = unique(pnca_snps_or$mutation) #293
i2 = "pnca_p.trp68gly" # Should exist
grep(i2, pnca_snps_unique)
my_data = read.csv("../Data_original/meta_pza_with_AF.csv"
, stringsAsFactors = FALSE) #11374, 19
# remove the first column
my_data = my_data[-1] #11374, 18
# check if first col is 'id': should be TRUE
colnames(my_data)[1] == 'id'
# sanity check
head(my_data$mutation)
my_data = unique(my_data$mutation)
my_data[!duplicated(my_data$mutation)]
my_data_unique = my_data[!duplicated(my_data$mutation),]
my_data[!duplicated('mutation'),]
my_data_unique = my_data[!duplicated(my_data[,'mutation']),]
my_data_unique = my_data[!duplicated(my_data['mutation']),]
getwd()
setwd("/git/LSHTM_analysis/meta_data_analysis")
getwd()
getwd()
setwd("/git/github/LSHTM_analysis/meta_data_analysis")
getwd()
#===============
# Step 1: read GWAS raw data stored in Data_original/
#===============
infile = read.csv("../Data_original", file.choose(), stringsAsFactors = F))
c = file.choose()
c = file.choose(../Data_original)
c = read.csv(file.choose(), stringsAsFactors = F)
#===============
# Step 1: read GWAS raw data stored in Data_original/
#===============
infile = read.csv(file.choose(), stringsAsFactors = F))
c = read.csv(file.choose(), stringsAsFactors = F)
#===============
# Step 1: read GWAS raw data stored in Data_original/
#===============
infile = read.csv(file.choose(), stringsAsFactors = F)
#===============
# Step 1: read GWAS raw data stored in Data_original/
#===============
infile = read.csv(file.choose(), stringsAsFactors = F)
raw_data = infile[,c("id","pyrazinamide","dr_mutations_pyrazinamide","other_mutations_pyrazinamide")]
outdir = paste0("../mcsm_analysis",drug,"/Data/")
# define output variables
drug = 'pyrazinamide'
outdir = paste0("../mcsm_analysis",drug,"/Data/")
outdir = paste0("../mcsm_analysis/",drug,"/Data/")
outFile = "meta_data_with_AFandOR.csv"
output_filename = paste0(outdir, outFile)
output_filename

View file

@ -0,0 +1,7 @@
#!/usr/bin/python3
# Initialise a blank 'Data' directory and drug subdirs etc.
# TODO:
# - Read base dir from config file
# - Create eg: '~/git/Data/{original,processed}
# - Create eg: '~/git/Data/processed/' + drug (for each drug)
# - Create eg: '~/git/Data/output/' + drug + '{plots, structure}'

View file

@ -0,0 +1,241 @@
getwd()
setwd("/git/github/git/LSHTM_analysis/meta_data_analysis")
getwd()
#===============
# Step 1: read GWAS raw data stored in Data_original/
#===============
infile = read.csv(file.choose(), stringsAsFactors = F)
raw_data = infile[,c("id"
, "pyrazinamide"
, "dr_mutations_pyrazinamide"
, "other_mutations_pyrazinamide")]
#####
# 1a: exclude na
#####
raw_data = raw_data[!is.na(raw_data$pyrazinamide),]
total_samples = length(unique(raw_data$id))
print(total_samples)
# sanity check: should be true
is.numeric(total_samples)
#####
# 1b: combine the two mutation columns
#####
raw_data$all_mutations_pyrazinamide = paste(raw_data$dr_mutations_pyrazinamide
, raw_data$other_mutations_pyrazinamide)
head(raw_data$all_mutations_pyrazinamide)
#####
# 1c: create yet another column that contains all the mutations but in lower case
#####
raw_data$all_muts_pnca = tolower(raw_data$all_mutations_pyrazinamide)
# sanity checks
table(grepl("pnca_p",raw_data$all_muts_pnca))
# sanity check: should be TRUE
sum(table(grepl("pnca_p",raw_data$all_muts_pnca))) == total_samples
# set up variables: can be used for logistic regression as well
i = "pnca_p.ala134gly" # has a NA, should NOT exist
table(grepl(i,raw_data$all_muts_pnca))
i = "pnca_p.trp68gly"
table(grepl(i,raw_data$all_muts_pnca))
mut = grepl(i,raw_data$all_muts_pnca)
dst = raw_data$pyrazinamide
table(mut, dst)
#chisq.test(table(mut,dst))
#fisher.test(table(mut, dst))
#table(mut)
###### read list of muts to calculate OR for (fname3 from pnca_data_extraction.py)
pnca_snps_or = read.csv(file.choose()
, stringsAsFactors = F
, header = T)
# extract unique snps to iterate over for AF and OR calcs
# total no of unique snps
# AF and OR calculations
pnca_snps_unique = unique(pnca_snps_or$mutation)
# Define OR function
x = as.numeric(mut)
y = dst
or = function(x,y){
tab = as.matrix(table(x,y))
a = tab[2,2]
if (a==0){ a<-0.5}
b = tab[2,1]
if (b==0){ b<-0.5}
c = tab[1,2]
if (c==0){ c<-0.5}
d = tab[1,1]
if (d==0){ d<-0.5}
(a/b)/(c/d)
}
dst = raw_data$pyrazinamide
ors = sapply(pnca_snps_unique,function(m){
mut = grepl(m,raw_data$all_muts_pnca)
or(mut,dst)
})
ors
pvals = sapply(pnca_snps_unique,function(m){
mut = grepl(m,raw_data$all_muts_pnca)
fisher.test(mut,dst)$p.value
})
pvals
afs = sapply(pnca_snps_unique,function(m){
mut = grepl(m,raw_data$all_muts_pnca)
mean(mut)
})
afs
# check ..hmmm
afs['pnca_p.trp68gly']
afs['pnca_p.gln10pro']
afs['pnca_p.leu4ser']
#plot(density(log(ors)))
#plot(-log10(pvals))
#hist(log(ors)
# ,breaks = 100
# )
# subset df cols to add to the calc param df
pnca_snps_cols = pnca_snps_or[c('mutation_info', 'mutation', 'Mutationinformation')]
pnca_snps_cols = pnca_snps_cols[!duplicated(pnca_snps_cols$mutation),]
rownames(pnca_snps_cols) = pnca_snps_cols$mutation
head(rownames(pnca_snps_cols))
#snps_with_AF_and_OR
# combine
comb_AF_and_OR = data.frame(ors, pvals, afs)
head(rownames(comb_AF_and_OR))
# sanity checks: should be the same
dim(comb_AF_and_OR); dim(pnca_snps_cols)
table(rownames(comb_AF_and_OR)%in%rownames(pnca_snps_cols))
table(rownames(pnca_snps_cols)%in%rownames(comb_AF_and_OR))
# merge the above two df whose dim you checked
snps_with_AF_and_OR = merge(comb_AF_and_OR, pnca_snps_cols
, by = "row.names"
# , all.x = T
)
#rm(pnca_snps_cols, pnca_snps_or, raw_data)
#===============
# Step 3: Read data file where you will add the calculated OR
# Note: this is the big file with one-many relationship between snps and lineages
# i.e fname4 from 'pnca_extraction.py'
#===============
my_data = read.csv(file.choose()
, row.names = 1
, stringsAsFactors = FALSE)
head(my_data)
length(unique(my_data$id))
# check if first col is 'id': should be TRUE
colnames(my_data)[1] == 'id'
# sanity check
head(my_data$mutation)
# FILES TO MERGE:
# comb_AF_and_OR: file containing OR
# my_data = big meta data file
# linking column: mutation
head(my_data)
merged_df = merge(my_data # big file
, snps_with_AF_and_OR # small (afor file)
, by = "mutation"
, all.x = T) # because you want all the entries of the meta data
# sanity checks: should be True
# FIXME: I have checked this manually, but make it so it is a pass or a fail!
comb_AF_and_OR[rownames(comb_AF_and_OR) == "pnca_p.gln10pro",]$ors
merged_df[merged_df$Mutationinformation.x == "Q10P",]$ors
merged_df[merged_df$Mutationinformation.x == "Q10P",]
# sanity check: very important!
colnames(merged_df)
table(merged_df$mutation_info.x == merged_df$mutation_info.y)
#FIXME: what happened to other 7 and FALSE
table(merged_df$Mutationinformation.x == merged_df$Mutationinformation.y)
# problem
identical(merged_df$Mutationinformation.x, merged_df$Mutationinformation.y)
#merged_df[merged_df$Mutationinformation.x != merged_df$Mutationinformation.y,]
#throw away the y because that is a smaller df
d1 = which(colnames(merged_df) == "mutation_info.y") #21
d2 = which(colnames(merged_df) == "Mutationinformation.y") #22
merged_df2 = merged_df[-c(d1, d2)] #3093 20
colnames(merged_df2)
# rename cols
colnames(merged_df2)[colnames(merged_df2)== "mutation_info.x"] <- "mutation_info"
colnames(merged_df2)[colnames(merged_df2)== "Mutationinformation.x"] <- "Mutationinformation"
colnames(merged_df2)
# should be 0
sum(is.na(merged_df2$Mutationinformation))
# count na in each column
na_count = sapply(merged_df2, function(y) sum(length(which(is.na(y))))); na_count
# only some or and Af should be NA
#Row.names ors pvals afs
#81 81 81 81
colnames(merged_df2)[colnames(merged_df2)== "ors"] <- "OR"
colnames(merged_df2)[colnames(merged_df2)== "afs"] <- "AF"
colnames(merged_df2)[colnames(merged_df2)== "pvals"] <- "pvalue"
colnames(merged_df2)
# add log OR and neglog pvalue
merged_df2$logor = log(merged_df2$OR)
is.numeric(merged_df2$logor)
merged_df2$neglog10pvalue = -log10(merged_df2$pvalue)
is.numeric(merged_df2$neglog10pvalue)
# write file out
#write.csv(merged_df, "../Data/meta_data_with_AFandOR_JP_TT.csv")
# define output variables
drug = 'pyrazinamide'
out_dir = paste0("../mcsm_analysis/",drug,"/Data/")
outFile = "meta_data_with_AFandOR.csv"
output_filename = paste0(outdir, outFile)
write.csv(merged_df2, output_filename
, row.names = F)

View file

@ -0,0 +1,626 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 6 12:56:03 2019
@author: tanu
"""
# FIXME: include error checking to enure you only
# concentrate on positions that have structural info?
#%% load libraries
###################
# load libraries
import os, sys
import pandas as pd
#import numpy as np
#from pandas.api.types import is_string_dtype
#from pandas.api.types import is_numeric_dtype
# to create dir
#my_dir = os.path.expanduser('~/some_dir')
#make sure mcsm_analysis/ exists
#or specify the output directory
#%%
#%%
#%%
#========================================================
# TASK: extract ALL pncA mutations from GWAS data
#========================================================
#%%
####################
# my working dir
os.getcwd()
homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
os.getcwd()
#%%
from reference_dict import my_aa_dict #CHECK DIR STRUC THERE!
#%%
#NOTE: Out_dir MUST exis
# User defined dir strpyrazinamide
drug = 'pyrazinamide'
gene = 'pnca'
out_dir = homedir + '/git/LSHTM_analysis/mcsm_analysis/'
# = out_dir + drug
data_dir = homedir + '/git/Data'
if not os.path.exists(data_dir):
print('Error!', data_dir, 'does not exist. Please ensure it exists and contains the appropriate raw data')
os.makedirs(data_dir)
die()
if not os.path.exists(out_dir):
print('Error!', out_dir, 'does not exist. Please create it')
exit()
#if not os.path.exists(work_dir):
# print('creating dir that does not exist', 'dir_name:', work_dir)
# os.makedirs(work_dir)
else:
print('Dir exists: Carrying on')
# now create sub dir structure within work_dir
# pyrazinamide/mcsm_analysis
# we need three dir
# Data
# Scripts
# Plotting
# Results
# Plots
# create a list of dir names
#dir_names = ['Data', 'Scripts', 'Results']
#for i in dir_names:
# this_dir = (work_dir + '/' + i)
# if not os.path.exists(this_dir):
# print('creating dir that does not exist:', this_dir)
# os.makedirs(this_dir)
#else:
# print('Dir exists: Carrying on')
# Create sub dirs
# 1)
# Scripts
# Plotting
#subdir_plotting = work_dir + '/Scripts/Plotting'
#if not os.path.exists(subdir_plotting):
# print('creating dir that does not exist:', subdir_plotting)
# os.makedirs(subdir_plotting)
#else:
# print('Dir exists: Carrying on')
# 2)
# Results
# Plots
#subdir_plots = work_dir + '/Results/Plots'
#if not os.path.exists(subdir_plots):
# print('creating dir that does not exist:', subdir_plots)
# os.makedirs(subdir_plots)
#else:
# print('Dir exists: Carrying on')
# clear varaibles
#del(dir_names, drug, i, subdir_plots, subdir_plotting)
#exit()
#%%
#==============================================================================
############
# STEP 1: Read file original_tanushree_data_v2.csv
############
data_file = data_dir + '/input/original/original_tanushree_data_v2.csv'
meta_data = pd.read_csv(data_file, sep = ',')
# column names
list(meta_data.columns)
# extract elevant columns to extract from meta data related to the pyrazinamide
meta_data = meta_data[['id'
,'country'
,'lineage'
,'sublineage'
,'drtype'
, 'pyrazinamide'
, 'dr_mutations_pyrazinamide'
, 'other_mutations_pyrazinamide'
]]
# checks
total_samples = meta_data['id'].nunique() # 19265
# counts NA per column
meta_data.isna().sum()
# glance
meta_data.head()
# equivalent of table in R
# pyrazinamide counts
meta_data.pyrazinamide.value_counts()
#%%
############
# STEP 2: extract entries containing selected genes:
# pyrazinamide: pnca_p.
# in the dr_mutations and other mutations"
# as we are interested in the mutations in the protein coding region only
# (corresponding to a structure)
# and drop the entries with NA
#############
meta_pza = meta_data.loc[meta_data.dr_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
meta_pza = meta_data.loc[meta_data.other_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
del(meta_pza)
##########################
# pyrazinamide: pnca_p.
##########################
meta_data_pnca = meta_data[['id'
,'country'
,'lineage'
,'sublineage'
,'drtype'
, 'pyrazinamide'
, 'dr_mutations_pyrazinamide'
, 'other_mutations_pyrazinamide'
]]
del(meta_data)
# sanity checks
# dr_mutations only
meta_pnca_dr = meta_data_pnca.loc[meta_data_pnca.dr_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
meta_pnca_dr['id'].nunique()
del(meta_pnca_dr)
# other mutations
meta_pnca_other = meta_data_pnca.loc[meta_data_pnca.other_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
meta_pnca_other['id'].nunique()
del(meta_pnca_other)
# Now extract "all" mutations
meta_pnca_all = meta_data_pnca.loc[meta_data_pnca.dr_mutations_pyrazinamide.str.contains('pncA_p.*') | meta_data_pnca.other_mutations_pyrazinamide.str.contains('pncA_p.*') ]
meta_pnca_all['id'].nunique()
pnca_samples = len(meta_pnca_all)
pnca_na = meta_pnca_all['pyrazinamide'].isna().sum()
comp_pnca_samples = pnca_samples - pnca_na
#=#=#=#=#=#=#
# COMMENT: use it later to check number of complete samples from LF data
#=#=#=#=#=#=#
# sanity checks
meta_pnca_all.dr_mutations_pyrazinamide.value_counts()
meta_pnca_all.other_mutations_pyrazinamide.value_counts()
# more sanity checks
# !CAUTION!: muts will change depending on your gene
# dr muts : insert
meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.Gln10Pro')] #
meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.Phe106Leu')] # empty
meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.Val139Leu')]
meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.')] # exists # rows
m = meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.')] # exists # rows
# other_muts
meta_pnca_all.loc[meta_pnca_all.other_mutations_pyrazinamide.str.contains('pncA_p.Gln10Pro*')] # empty
meta_pnca_all.loc[meta_pnca_all.other_mutations_pyrazinamide.str.contains('pncA_p.Phe106Leu')]
#=#=#=#=#=#=#=#=#=#
# FIXME
# COMMENTS: both mutations columns are separated by ;
# CHECK if there are mutations that exist both in dr and other_muts!
# doesn't make any sense for the same mut to exist in both, I would have thought!
#=#=#=#=#=#=#=#=#=#
# remove not required variables
del(meta_data_pnca)
############
# STEP 3: split the columns of
# a) dr_mutation_... (;) as
# the column has snps related to multiple genes.
# useful links
# https://stackoverflow.com/questions/17116814/pandas-how-do-i-split-text-in-a-column-into-multiple-rows
# this one works beautifully
# https://stackoverflow.com/questions/12680754/split-explode-pandas-dataframe-string-entry-to-separate-rows
############
# sanity check: counts NA per column afer subsetted df: i.e in meta_pza(with pncA_p. extracted mutations)
meta_pnca_all.isna().sum()
#=#=#=#=#=#=#=#=#=#
# COMMENT: no NA's in dr_mutations/other_mutations_columns
#=#=#=#=#=#=#=#=#=#
# define the split function
def tidy_split(df, column, sep='|', keep=False):
"""
Split the values of a column and expand so the new DataFrame has one split
value per row. Filters rows where the column is missing.
Params
------
df : pandas.DataFrame
dataframe with the column to split and expand
column : str
the column to split and expand
sep : str
the string used to split the column's values
keep : bool
whether to retain the presplit value as it's own row
Returns
-------
pandas.DataFrame
Returns a dataframe with the same columns as `df`.
"""
indexes = list()
new_values = list()
#df = df.dropna(subset=[column])#<<<<<<-----see this incase you need to uncomment based on use case
for i, presplit in enumerate(df[column].astype(str)):
values = presplit.split(sep)
if keep and len(values) > 1:
indexes.append(i)
new_values.append(presplit)
for value in values:
indexes.append(i)
new_values.append(value)
new_df = df.iloc[indexes, :].copy()
new_df[column] = new_values
return new_df
########
# 3a: call tidy_split() on 'dr_mutations_pyrazinamide' column and remove leading white spaces
#https://stackoverflow.com/questions/41476150/removing-space-from-dataframe-columns-in-pandas
########
meta_pnca_WF0 = tidy_split(meta_pnca_all, 'dr_mutations_pyrazinamide', sep = ';')
# remove leading white space else these are counted as distinct mutations as well
meta_pnca_WF0['dr_mutations_pyrazinamide'] = meta_pnca_WF0['dr_mutations_pyrazinamide'].str.lstrip()
########
# 3b: call function on 'other_mutations_pyrazinamide' column and remove leading white spaces
########
meta_pnca_WF1 = tidy_split(meta_pnca_WF0, 'other_mutations_pyrazinamide', sep = ';')
# remove the leading white spaces in the column
meta_pnca_WF1['other_mutations_pyrazinamide'] = meta_pnca_WF1['other_mutations_pyrazinamide'].str.strip()
##########
# Step 4: Reshape data so that all mutations are in one column and the
# annotations for the mutation reflect the column name
# LINK: http://www.datasciencemadesimple.com/reshape-wide-long-pandas-python-melt-function/
# data frame “df” is passed to melt() function
# id_vars is the variable which need to be left unaltered
# var_name are the column names so we named it as 'mutation_info'
# value_name are its values so we named it as 'mutation'
##########
meta_pnca_WF1.columns
meta_pnca_LF0 = pd.melt(meta_pnca_WF1
, id_vars = ['id', 'country', 'lineage', 'sublineage', 'drtype', 'pyrazinamide']
, var_name = 'mutation_info'
, value_name = 'mutation')
# sanity check: should be true
if len(meta_pnca_LF0) == len(meta_pnca_WF1)*2:
print('sanity check passed: Long format df has the expected length')
else:
print("Sanity check failed: Debug please!")
###########
# Step 5: This is still dirty data. Filter LF data so that you only have
# mutations corresponding to pnca_p.
# this will be your list you run OR calcs
###########
meta_pnca_LF1 = meta_pnca_LF0[meta_pnca_LF0['mutation'].str.contains('pncA_p.*')]
# sanity checks
# unique samples
meta_pnca_LF1['id'].nunique()
if len(meta_pnca_all) == meta_pnca_LF1['id'].nunique():
print("Sanity check passed: No of samples with pncA mutations match")
else:
print("Sanity check failed: Debug please!")
# count if all the mutations are indeed in the protein coding region
# i.e begin with pnca_p
meta_pnca_LF1['mutation'].str.count('pncA_p.').sum() # 3093
# should be true.
# and check against the length of the df, which should match
if len(meta_pnca_LF1) == meta_pnca_LF1['mutation'].str.count('pncA_p.').sum():
print("Sanity check passed: Long format data containing pnca mutations indeed correspond to pncA_p region")
else:
print("Sanity check failed: Debug please!")
###########
# Step 6: Filter dataframe with "na" in the drug column
# This is because for OR, you can't use the snps that have the
# NA in the specified drug column
# it creates problems when performing calcs in R inside the loop
# so best to filter it here
###########
# NOT NEEDED FOR all snps, only for extracting valid OR snps
del (meta_pnca_WF0, meta_pnca_WF1, meta_pnca_LF0, meta_pnca_all)
###########
# Step 7: count unique pncA_p mutations (all and comp cases)
###########
meta_pnca_LF1['mutation'].nunique()
meta_pnca_LF1.groupby('mutation_info').nunique()
meta_pnca_LF1['id'].nunique()
meta_pnca_LF1['mutation'].nunique()
meta_pnca_LF1.groupby('id').nunique()
###########
# Step 8: convert all snps only (IN LOWERCASE)
# because my mcsm file intergated has lowercase
###########
# convert mutation to lower case as it needs to exactly match the dict key
#meta_pnca_LF1['mutation'] = meta_pnca_LF1.mutation.str.lower() # WARNINGS: suggested to use .loc
meta_pnca_LF1['mutation'] = meta_pnca_LF1.loc[:, 'mutation'].str.lower()
###########
# Step 9 : Split 'mutation' column into three: wild_type, position and
# mutant_type separately. Then map three letter code to one from the
# referece_dict imported pncaeady. First convert to mutation to lowercase
# to allow to match entries from dict
###########
#=======
# Step 9a: iterate through the dict, create a lookup dict i.e
# lookup_dict = {three_letter_code: one_letter_code}.
# lookup dict should be the key and the value (you want to create a column for)
# Then use this to perform the mapping separetly for wild type and mutant cols.
# The three letter code is extracted using a regex match from the dataframe and then converted
# to 'pandas series'since map only works in pandas series
#=======
# initialise a sub dict that is a lookup dict for three letter code to one
lookup_dict = dict()
for k, v in my_aa_dict.items():
lookup_dict[k] = v['one_letter_code']
wt = meta_pnca_LF1['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on
meta_pnca_LF1['wild_type'] = wt.map(lookup_dict)
mut = meta_pnca_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze()
meta_pnca_LF1['mutant_type'] = mut.map(lookup_dict)
# extract position info from mutation column separetly using regex
meta_pnca_LF1['position'] = meta_pnca_LF1['mutation'].str.extract(r'(\d+)')
# clear variables
del(k, v, wt, mut, lookup_dict)
#=========
# Step 9b: iterate through the dict, create a lookup dict that i.e
# lookup_dict = {three_letter_code: aa_prop_water}
# Do this for both wild_type and mutant as above.
#=========
# initialise a sub dict that is lookup dict for three letter code to aa prop
lookup_dict = dict()
for k, v in my_aa_dict.items():
lookup_dict[k] = v['aa_prop_water']
#print(lookup_dict)
wt = meta_pnca_LF1['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on
meta_pnca_LF1['wt_prop_water'] = wt.map(lookup_dict)
mut = meta_pnca_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze()
meta_pnca_LF1['mut_prop_water'] = mut.map(lookup_dict)
# added two more cols
# clear variables
del(k, v, wt, mut, lookup_dict)
#========
# Step 9c: iterate through the dict, create a lookup dict that i.e
# lookup_dict = {three_letter_code: aa_prop_polarity}
# Do this for both wild_type and mutant as above.
#=========
# initialise a sub dict that is lookup dict for three letter code to aa prop
lookup_dict = dict()
for k, v in my_aa_dict.items():
lookup_dict[k] = v['aa_prop_polarity']
#print(lookup_dict)
wt = meta_pnca_LF1['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on
meta_pnca_LF1['wt_prop_polarity'] = wt.map(lookup_dict)
mut = meta_pnca_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze()
meta_pnca_LF1['mut_prop_polarity'] = mut.map(lookup_dict)
# added two more cols
# clear variables
del(k, v, wt, mut, lookup_dict)
########
# Step 10: combine the wild_type+poistion+mutant_type columns to generate
# Mutationinformation (matches mCSM output field)
# Remember to use .map(str) for int col types to allow string concatenation
#########
meta_pnca_LF1['Mutationinformation'] = meta_pnca_LF1['wild_type'] + meta_pnca_LF1.position.map(str) + meta_pnca_LF1['mutant_type']
#=#=#=#=#=#=#
# Step 11:
# COMMENT: there is more processing in the older version of this script
# consult if necessary
# possibly due to the presence of true_wt
# since this file doesn't contain any true_wt, we won't need it(hopefully!)
#=#=#=#=#=#=#
#%%
###########
# Step 12: Output files for only SNPs to run mCSM
###########
#=========
# Step 12a: all SNPs to run mCSM
#=========
snps_only = pd.DataFrame(meta_pnca_LF1['Mutationinformation'].unique())
pos_only = pd.DataFrame(meta_pnca_LF1['position'].unique())
# assign meaningful colnames
#snps_only.rename({0 : 'all_pnca_snps'}, axis = 1, inplace = True)
#list(snps_only.columns)
snps_only.isna().sum() # should be 0
# output csv: all SNPS for mCSM analysis
# specify variable name for output file
gene = 'pnca'
#drug = 'pyrazinamide'
my_fname1 = '_snps_'
nrows = len(snps_only)
#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/'
#output_file_path = work_dir + '/Data/'
output_file_path = data_dir + '/input/processed/' + drug + '/'
if not os.path.exists(output_file_path):
print( output_file_path, 'does not exist. Creating')
os.makedirs(output_file_path)
exit()
output_filename = output_file_path + gene + my_fname1 + str(nrows) + '.csv'
print(output_filename) #<<<- check
# write to csv: without column or row names
# Bad practice: numbers at the start of a filename
snps_only.to_csv(output_filename, header = False, index = False)
#=========
# Step 12b: all snps with annotation
#=========
# all snps, selected cols
#pnca_snps_ALL = meta_pnca_LF1[['id','country','lineage', 'sublineage'
# , 'drtype', 'pyrazinamide'
# , 'mutation_info', 'mutation', 'Mutationinformation']]
#len(pnca_snps_ALL)
# sanity check
#meta_pnca_LF1['mutation'].nunique()
# output csv: WITH column but WITHOUT row names(all snps with meta data)
# specify variable name for output file
#gene = 'pnca'
#drug = 'pyrazinamide'
#my_fname2 = '_snps_with_metadata_'
#nrows = len(pnca_snps_ALL)
#output_file_path = work_dir + '/Data/'
#output_filename = output_file_path + gene + my_fname2 + str(nrows) + '.csv'
#print(output_filename) #<<<- check
# write out file
#pnca_snps_ALL.to_csv(output_filename, header = True, index = False)
#=========
# Step 12c: comp snps for OR calcs with annotation
#=========
# remove all NA's from pyrazinamide column from LF1
# counts NA per column
meta_pnca_LF1.isna().sum()
# remove NA
meta_pnca_LF2 = meta_pnca_LF1.dropna(subset=['pyrazinamide'])
# sanity checks
# should be True
len(meta_pnca_LF2) == len(meta_pnca_LF1) - meta_pnca_LF1['pyrazinamide'].isna().sum()
# unique counts
meta_pnca_LF2['mutation'].nunique()
meta_pnca_LF2.groupby('mutation_info').nunique()
# sanity check
meta_pnca_LF2['id'].nunique()
# should be True
if meta_pnca_LF2['id'].nunique() == comp_pnca_samples:
print ('sanity check passed: complete numbers match')
else:
print('Error: Please Debug!')
# value counts
meta_pnca_LF2.mutation.value_counts()
#meta_pnca_LF2.groupby(['mutation_info', 'mutation']).size()
# valid/comp snps
# uncomment as necessary
pnca_snps_COMP = pd.DataFrame(meta_pnca_LF2['Mutationinformation'].unique())
len(pnca_snps_COMP)
# output csv: WITH column but WITHOUT row names (COMP snps with meta data)
# specify variable name for output file
gene = 'pnca'
#drug = 'pyrazinamide'
my_fname3 = '_comp_snps_with_metadata_'
nrows = len(pnca_snps_COMP)
#output_filename = output_file_path + gene + my_fname3 + str(nrows) + '.csv'
#print(output_filename) #<<<-check
# write out file
#pnca_snps_COMP.to_csv(output_filename, header = True, index = False)
#=========
# Step 12d: comp snps only
#=========
# output csv: comp SNPS for info (i.e snps for which OR exist)
# specify variable name for output file
snps_only = pd.DataFrame(meta_pnca_LF2['Mutationinformation'].unique())
gene = 'pnca'
#drug = 'pyrazinamide'
my_fname1 = '_comp_snps_'
nrows = len(snps_only)
output_filename = output_file_path + gene + my_fname1 + str(nrows) + '.csv'
print(output_filename) #<<<- check
# write to csv: without column or row names
snps_only.to_csv(output_filename, header = False, index = False)
#=#=#=#=#=#=#=#
# COMMENT: LF1 is the file to extract all unique snps for mcsm
# but you have that already in file called pnca_snps...
# LF2: is the file for extracting snps tested for DS and hence OR calcs
#=#=#=#=#=#=#=#
###########
# Step 13 : Output the whole df i.e
# file for meta_data which is now formatted with
# each row as a unique snp rather than the original version where
# each row is a unique id
# ***** This is the file you will ADD the AF and OR calculations to *****
###########
# output csv: the entire DF
# specify variable name for output file
gene = 'pnca'
#drug = 'pyrazinamide'
my_fname4 = '_metadata'
#nrows = len(meta_pnca_LF1)
output_filename = output_file_path + gene + my_fname4 + '.csv'
print(output_filename) #<<<-check
# write out file
meta_pnca_LF1.to_csv(output_filename)

View file

@ -0,0 +1,121 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 18 11:32:28 2019
@author: tanushree
"""
############################################
#load libraries
import pandas as pd
import os
#############################################
#!#########################!
# REQUIREMNETS:
# Data_original/ must exist
# containing GWAS data
#!#########################!
print(os.getcwd())
homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
os.chdir(homedir + '/git/Data/input/original')
print(os.getcwd())
#==========
#read file
#==========
my_aa = pd.read_csv('aa_codes.csv') #20, 6
#assign the one_letter code as the row names so that it is easier to create a dict of dicts using index
#my_aa = pd.read_csv('aa_codes.csv', index_col = 0) #20, 6 #a way to it since it is the first column
my_aa = my_aa.set_index('three_letter_code_lower') #20, 5
#=========================================================
#convert file to dict of dicts
#=========================================================
#convert each row into a dict of dicts so that there are 20 aa and 5 keys within
#with your choice of column name that you have assigned to index as the "primary key".
#using 'index' creates a dict of dicts
#using 'records' creates a list of dicts
my_aa_dict = my_aa.to_dict('index') #20, with 5 subkeys
#================================================
#dict of aa with their corresponding properties
#This is defined twice
#================================================
#7 categories: no overlap
qualities1 = { ('R', 'H', 'K'): 'Basic'
, ('D', 'E'): 'Acidic'
, ('N', 'Q'): 'Amidic'
, ('G', 'A', 'V', 'L', 'I', 'P'): 'Hydrophobic'
, ('S', 'T'): 'Hydroxylic'
, ('F', 'W', 'Y'): 'Aromatic'
, ('C', 'M'): 'Sulphur'
}
#9 categories: allowing for overlap
qualities2 = { ('R', 'H', 'K'): 'Basic'
, ('D', 'E'): 'Acidc'
, ('S', 'T', 'N', 'Q'): 'Polar'
, ('V', 'I', 'L', 'M', 'F', 'Y', 'W'): 'Hydrophobic'
, ('S', 'T', 'H', 'N', 'Q', 'E', 'D', 'K', 'R'): 'Hydrophilic'
, ('S', 'G', 'A', 'P'): 'Small'
, ('F', 'W', 'Y', 'H'): 'Aromatic'
, ('V', 'I', 'L', 'M'): 'Aliphatic'
, ('C', 'G', 'P'): 'Special'
}
qualities_taylor = { ('R', 'H', 'K'): 'Basic'
, ('D', 'E'): 'Acidc'
, ('S', 'T', 'N', 'Q', 'C', 'Y', 'W', 'H', 'K', 'R', 'D', 'E'): 'Polar'
, ('V', 'I', 'L', 'M', 'F', 'Y', 'W', 'C', 'A', 'G', 'T', 'H'): 'Hydrophobic'
#, ('S', 'T', 'H', 'N', 'Q', 'E', 'D', 'K', 'R'): 'Hydrophilic', #C, W, y MISSING FROM POLAR!
, ('S', 'G', 'A', 'P', 'C', 'T', 'N', 'D', 'V'): 'Small'
, ('F', 'W', 'Y', 'H'): 'Aromatic'
, ('V', 'I', 'L', 'M'): 'Aliphatic' #although M is not strictly in the circle!
, ('C', 'G', 'P'): 'Special'
}
qualities_water = { ('D', 'E', 'N', 'P', 'Q', 'R', 'S'): 'hydrophilic'
, ('A', 'C', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'T', 'V', 'W', 'X', 'Y'): 'hydrophobic'
}
qualities_polarity = { ('D', 'E'): 'acidic'
, ('H', 'K', 'R'): 'basic'
, ('C', 'G', 'N', 'Q', 'S', 'T', 'Y'): 'neutral'
, ('A', 'F', 'I', 'L', 'M', 'P', 'V', 'W'): 'non-polar'
}
#==============================================================================
#adding amino acid properties to my dict of dicts
for k, v in my_aa_dict.items():
#print (k,v)
v['aa_prop1'] = str() #initialise keys
v['aa_prop2'] = list() #initialise keys (allows for overalpping properties)
v['aa_taylor'] = list() #initialise keys (allows for overalpping properties)
v['aa_prop_water'] = str() #initialise keys
v['aa_prop_polarity'] = str() #initialise keys
for group in qualities1:
if v['one_letter_code'] in group:
v['aa_prop1']+= qualities1[group] # += for str concat
for group in qualities2:
if v['one_letter_code'] in group:
v['aa_prop2'].append(qualities2[group]) # append to list
for group in qualities_taylor:
if v['one_letter_code'] in group:
v['aa_taylor'].append(qualities_taylor[group]) # append to list
for group in qualities_water:
if v['one_letter_code'] in group:
v['aa_prop_water']+= qualities_water[group] # += for str concat
for group in qualities_polarity:
if v['one_letter_code'] in group:
v['aa_prop_polarity']+= qualities_polarity[group] # += for str concat
#COMMENT:VOILA!!! my_aa_dict is now a dict of dicts containing all associated properties for each aa
#==============================================================================

40
mk-drug-dirs.sh Executable file
View file

@ -0,0 +1,40 @@
#!/bin/bash
# Create a drug directory structure for processing
#
#
# Structure:
#
# $DATA_DIR/$DRUG/input
# |- original
# |- processed
# |- structure
#
# $DATA_DIR/$DRUG/output
# |- plots
# |- structure
DATA_DIR=~/git/Data
if [[ $1 == '' ]]; then
echo "usage: mk-drug-dirs.sh <drug name>";
exit;
else
DRUG=$1
echo Creating structure for: $DRUG
if [ -d $DATA_DIR ]
then
echo Doing creation in $DATA_DIR
mkdir -p $DATA_DIR/$DRUG/input/original
mkdir -p $DATA_DIR/$DRUG/input/processed
mkdir -p $DATA_DIR/$DRUG/input/structure
mkdir -p $DATA_DIR/$DRUG/output/plots
mkdir -p $DATA_DIR/$DRUG/output/structure
else
echo "Error: $DATA_DIR does not exist. Did you check it out of git?"
exit
fi
fi