commit bccfe68192b50ea9089f03bfc26dca775f14a30a Author: Tanushree Tunstall Date: Wed Jan 8 16:15:33 2020 +0000 import commit diff --git a/README.md b/README.md new file mode 100644 index 0000000..1fe45f7 --- /dev/null +++ b/README.md @@ -0,0 +1,59 @@ +mCSM Analysis +============= + +This repo does mCSM analysis using Python (Pandas, numpy), bash and R. + +Requires an additional 'Data' directory (Supplied separately). Batteries not included. + + # directory struc + + # Assumptions + + 1. git repos are cloned to `~/git` + 2. The script + +## LSHTM\_analysis: Repo + + subdirs + + needs to call `../Data/input/original/` + + meta\_data\_analysis/ + *.R + *.py + + needs to call `../Data/input/processed/` + needs to output `../Data/output/results/` + + mcsm\_analysis/ + / (generated by `meta_data_analysis/pnca_data_extraction.py`. To be replaced with command line args or config option "soon") + scripts/ (changed from `Scripts/`) + *.R + *.py + mcsm/ + *.sh + *.py + *.R + plotting/ + *.R + + +Data: Repo: + + # subdirs + + input/ + original/ + processed/ + + output/ + / (generated by `meta_data_analysis/pnca_data_extraction.py`. To be replaced with command line args or config option "soon") + results/ + *.csv + *.xlsx + *.doc + *.txt + plots/ + structure/ + +More docs here as I write them. diff --git a/mcsm.conf b/mcsm.conf new file mode 100644 index 0000000..5310227 --- /dev/null +++ b/mcsm.conf @@ -0,0 +1,12 @@ +# This is not yet used, but will be soon :) +[DEFAULT] +mcsm_home = /home/tanu/git/github/LSHTM_analysis + +[pyrazinamide] + gene = 'pnca' + server = 'http://mcsm.melb.ac.uk/' + +[monkeyamide] + gene = 'abcs' + server = 'http://myserver.local/' + diff --git a/mcsm_analysis/pyrazinamide/scripts/.Rhistory b/mcsm_analysis/pyrazinamide/scripts/.Rhistory new file mode 100644 index 0000000..8055405 --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/.Rhistory @@ -0,0 +1,512 @@ +########################### +# you need merged_df3 +# or +# merged_df3_comp +# since these have unique SNPs +# I prefer to use the merged_df3 +# because using the _comp dataset means +# we lose some muts and at this level, we should use +# as much info as available +########################### +# uncomment as necessary +#%%%%%%%%%%%%%%%%%%%%%%%% +# REASSIGNMENT +my_df = merged_df3 +#my_df = merged_df3_comp +#%%%%%%%%%%%%%%%%%%%%%%%% +# delete variables not required +rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) +# quick checks +colnames(my_df) +str(my_df) +########################### +# Data for bfactor figure +# PS average +# Lig average +########################### +head(my_df$Position) +head(my_df$ratioDUET) +# order data frame +df = my_df[order(my_df$Position),] +head(df$Position) +head(df$ratioDUET) +#*********** +# PS: average by position +#*********** +mean_DUET_by_position <- df %>% +group_by(Position) %>% +summarize(averaged.DUET = mean(ratioDUET)) +#*********** +# Lig: average by position +#*********** +mean_Lig_by_position <- df %>% +group_by(Position) %>% +summarize(averaged.Lig = mean(ratioPredAff)) +#*********** +# cbind:mean_DUET_by_position and mean_Lig_by_position +#*********** +combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position )) +# sanity check +# mean_PS_Lig_Bfactor +colnames(combined) +colnames(combined) = c("Position" +, "average_DUETR" +, "Position2" +, "average_PredAffR") +colnames(combined) +identical(combined$Position, combined$Position2) +n = which(colnames(combined) == "Position2"); n +combined_df = combined[,-n] +max(combined_df$average_DUETR) ; min(combined_df$average_DUETR) +max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR) +#============= +# output csv +#============ +outDir = "~/Data/pyrazinamide/input/processed/" +outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv") +print(paste0("Output file with path will be:","", outFile)) +head(combined_df$Position); tail(combined_df$Position) +write.csv(combined_df, outFile +, row.names = F) +getwd() +setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") +getwd() +######################################################################## +# Installing and loading required packages # +######################################################################## +source("../Header_TT.R") +#source("barplot_colour_function.R") +require(data.table) +require(dplyr) +######################################################################## +# Read file: call script for combining df for PS # +######################################################################## +source("../combining_two_df.R") +########################### +# This will return: +# df with NA: +# merged_df2 +# merged_df3 +# df without NA: +# merged_df2_comp +# merged_df3_comp +########################### +#---------------------- PAY ATTENTION +# the above changes the working dir +#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" +#---------------------- PAY ATTENTION +########################### +# you need merged_df3 +# or +# merged_df3_comp +# since these have unique SNPs +# I prefer to use the merged_df3 +# because using the _comp dataset means +# we lose some muts and at this level, we should use +# as much info as available +########################### +# uncomment as necessary +#%%%%%%%%%%%%%%%%%%%%%%%% +# REASSIGNMENT +my_df = merged_df3 +#my_df = merged_df3_comp +#%%%%%%%%%%%%%%%%%%%%%%%% +# delete variables not required +rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) +# quick checks +colnames(my_df) +str(my_df) +########################### +# Data for bfactor figure +# PS average +# Lig average +########################### +head(my_df$Position) +head(my_df$ratioDUET) +# order data frame +df = my_df[order(my_df$Position),] +head(df$Position) +head(df$ratioDUET) +#*********** +# PS: average by position +#*********** +mean_DUET_by_position <- df %>% +group_by(Position) %>% +summarize(averaged.DUET = mean(ratioDUET)) +#*********** +# Lig: average by position +#*********** +mean_Lig_by_position <- df %>% +group_by(Position) %>% +summarize(averaged.Lig = mean(ratioPredAff)) +#*********** +# cbind:mean_DUET_by_position and mean_Lig_by_position +#*********** +combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position )) +# sanity check +# mean_PS_Lig_Bfactor +colnames(combined) +colnames(combined) = c("Position" +, "average_DUETR" +, "Position2" +, "average_PredAffR") +colnames(combined) +identical(combined$Position, combined$Position2) +n = which(colnames(combined) == "Position2"); n +combined_df = combined[,-n] +max(combined_df$average_DUETR) ; min(combined_df$average_DUETR) +max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR) +#============= +# output csv +#============ +outDir = "~/git/Data/pyrazinamide/input/processed/" +outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv") +print(paste0("Output file with path will be:","", outFile)) +head(combined_df$Position); tail(combined_df$Position) +write.csv(combined_df, outFile +, row.names = F) +# read in pdb file complex1 +inDir = "~/git/Data/pyrazinamide/input/structure" +inFile = paste0(inDir, "complex1_no_water.pdb") +# read in pdb file complex1 +inDir = "~/git/Data/pyrazinamide/input/structure/" +inFile = paste0(inDir, "complex1_no_water.pdb") +complex1 = inFile +my_pdb = read.pdb(complex1 +, maxlines = -1 +, multi = FALSE +, rm.insert = FALSE +, rm.alt = TRUE +, ATOM.only = FALSE +, hex = FALSE +, verbose = TRUE) +######################### +#3: Read complex pdb file +########################## +source("Header_TT.R") +# list of 8 +my_pdb = read.pdb(complex1 +, maxlines = -1 +, multi = FALSE +, rm.insert = FALSE +, rm.alt = TRUE +, ATOM.only = FALSE +, hex = FALSE +, verbose = TRUE) +rm(inDir, inFile) +#====== end of script +inDir = "~/git/Data/pyrazinamide/input/structure/" +inFile = paste0(inDir, "complex1_no_water.pdb") +complex1 = inFile +complex1 = inFile +my_pdb = read.pdb(complex1 +, maxlines = -1 +, multi = FALSE +, rm.insert = FALSE +, rm.alt = TRUE +, ATOM.only = FALSE +, hex = FALSE +, verbose = TRUE) +inFile +inDir = "~/git/Data/pyrazinamide/input/structure/" +inFile = paste0(inDir, "complex1_no_water.pdb") +complex1 = inFile +#inFile2 = paste0(inDir, "complex2_no_water.pdb") +#complex2 = inFile2 +# list of 8 +my_pdb = read.pdb(complex1 +, maxlines = -1 +, multi = FALSE +, rm.insert = FALSE +, rm.alt = TRUE +, ATOM.only = FALSE +, hex = FALSE +, verbose = TRUE) +rm(inDir, inFile, complex1) +getwd() +setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts") +getwd() +source("Header_TT.R") +getwd() +setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts") +getwd() +######################################################################## +# Installing and loading required packages # +######################################################################## +source("Header_TT.R") +######################################################### +# TASK: replace B-factors in the pdb file with normalised values +# use the complex file with no water as mCSM lig was +# performed on this file. You can check it in the script: read_pdb file. +######################################################### +########################### +# 2: Read file: average stability values +# or mcsm_normalised file, output of step 4 mcsm pipeline +########################### +inDir = "~/git/Data/pyrazinamide/input/processed/" +inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile +my_df <- read.csv(inFile +# , row.names = 1 +# , stringsAsFactors = F +, header = T) +str(my_df) +source("read_pdb.R") # list of 8 +# extract atom list into a variable +# since in the list this corresponds to data frame, variable will be a df +d = my_pdb[[1]] +# make a copy: required for downstream sanity checks +d2 = d +# sanity checks: B factor +max(d$b); min(d$b) +par(oma = c(3,2,3,0) +, mar = c(1,3,5,2) +, mfrow = c(3,2)) +#par(mfrow = c(3,2)) +#1: Original B-factor +hist(d$b +, xlab = "" +, main = "B-factor") +plot(density(d$b) +, xlab = "" +, main = "B-factor") +# 2: DUET scores +hist(my_df$average_DUETR +, xlab = "" +, main = "Norm_DUET") +plot(density(my_df$average_DUETR) +, xlab = "" +, main = "Norm_DUET") +# Set the margin on all sides +par(oma = c(3,2,3,0) +, mar = c(1,3,5,2) +, mfrow = c(3,2)) +#par(mfrow = c(3,2)) +#1: Original B-factor +hist(d$b +, xlab = "" +, main = "B-factor") +plot(density(d$b) +, xlab = "" +, main = "B-factor") +# 2: DUET scores +hist(my_df$average_DUETR +, xlab = "" +, main = "Norm_DUET") +plot(density(my_df$average_DUETR) +, xlab = "" +, main = "Norm_DUET") +#========= +# step 1_P1 +#========= +# Be brave and replace in place now (don't run sanity check) +# this makes all the B-factor values in the non-matched positions as NA +d$b = my_df$average_DUETR[match(d$resno, my_df$Position)] +#========= +# step 2_P1 +#========= +# count NA in Bfactor +b_na = sum(is.na(d$b)) ; b_na +# count number of 0's in Bactor +sum(d$b == 0) +# replace all NA in b factor with 0 +d$b[is.na(d$b)] = 0 +# sanity check: should be 0 +sum(is.na(d$b)) +# sanity check: should be True +if (sum(d$b == 0) == b_na){ +print ("Sanity check passed: NA's replaced with 0's successfully") +} else { +print("Error: NA replacement NOT successful, Debug code!") +} +max(d$b); min(d$b) +# sanity checks: should be True +if(max(d$b) == max(my_df$average_DUETR)){ +print("Sanity check passed: B-factors replaced correctly") +} else { +print ("Error: Debug code please") +} +if (min(d$b) == min(my_df$average_DUETR)){ +print("Sanity check passed: B-factors replaced correctly") +} else { +print ("Error: Debug code please") +} +#========= +# step 3_P1 +#========= +# sanity check: dim should be same before reassignment +# should be TRUE +dim(d) == dim(d2) +#========= +# step 4_P1 +#========= +# assign it back to the pdb file +my_pdb[[1]] = d +max(d$b); min(d$b) +#========= +# step 5_P1 +#========= +# output dir +getwd() +outDir = "~/git/Data/pyrazinamide/output/" +getwd() +outFile = paste0(outDir, "complex1_BwithNormDUET.pdb") +outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile +outDir = "~/git/Data/pyrazinamide/input/structure" +outDir = "~/git/Data/pyrazinamide/input/structure/" +outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile +write.pdb(my_pdb, outFile) +hist(d$b +, xlab = "" +, main = "repalced-B") +plot(density(d$b) +, xlab = "" +, main = "replaced-B") +# graph titles +mtext(text = "Frequency" +, side = 2 +, line = 0 +, outer = TRUE) +mtext(text = "DUET_stability" +, side = 3 +, line = 0 +, outer = TRUE) +#========================================================= +# Processing P2: Replacing B values with PredAff Scores +#========================================================= +# clear workspace +rm(list = ls()) +#========================================================= +# Processing P2: Replacing B values with PredAff Scores +#========================================================= +# clear workspace +rm(list = ls()) +########################### +# 2: Read file: average stability values +# or mcsm_normalised file, output of step 4 mcsm pipeline +########################### +inDir = "~/git/Data/pyrazinamide/input/processed/" +inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile +my_df <- read.csv("../Data/mean_PS_Lig_Bfactor.csv" +# , row.names = 1 +# , stringsAsFactors = F +, header = T) +str(my_df) +#========================================================= +# Processing P2: Replacing B factor with mean ratioLig scores +#========================================================= +######################### +# 3: Read complex pdb file +# form the R script +########################## +source("read_pdb.R") # list of 8 +# extract atom list into a vari +inDir = "~/git/Data/pyrazinamide/input/processed/" +inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile +my_df <- read.csv(inFile +# , row.names = 1 +# , stringsAsFactors = F +, header = T) +str(my_df) +# extract atom list into a variable +# since in the list this corresponds to data frame, variable will be a df +d = my_pdb[[1]] +# make a copy: required for downstream sanity checks +d2 = d +# sanity checks: B factor +max(d$b); min(d$b) +par(oma = c(3,2,3,0) +, mar = c(1,3,5,2) +, mfrow = c(3,2)) +#par(mfrow = c(3,2)) +# 1: Original B-factor +hist(d$b +, xlab = "" +, main = "B-factor") +plot(density(d$b) +, xlab = "" +, main = "B-factor") +# 2: Pred Aff scores +hist(my_df$average_PredAffR +, xlab = "" +, main = "Norm_lig_average") +plot(density(my_df$average_PredAffR) +, xlab = "" +, main = "Norm_lig_average") +# 3: After the following replacement +#******************************** +par(oma = c(3,2,3,0) +, mar = c(1,3,5,2) +, mfrow = c(3,2)) +#par(mfrow = c(3,2)) +# 1: Original B-factor +hist(d$b +, xlab = "" +, main = "B-factor") +plot(density(d$b) +, xlab = "" +, main = "B-factor") +# 2: Pred Aff scores +hist(my_df$average_PredAffR +, xlab = "" +, main = "Norm_lig_average") +plot(density(my_df$average_PredAffR) +, xlab = "" +, main = "Norm_lig_average") +# 3: After the following replacement +#******************************** +#========= +# step 1_P2: BE BRAVE and replace in place now (don't run step 0) +#========= +# this makes all the B-factor values in the non-matched positions as NA +d$b = my_df$average_PredAffR[match(d$resno, my_df$Position)] +#========= +# step 2_P2 +#========= +# count NA in Bfactor +b_na = sum(is.na(d$b)) ; b_na +# count number of 0's in Bactor +sum(d$b == 0) +# replace all NA in b factor with 0 +d$b[is.na(d$b)] = 0 +# sanity check: should be 0 +sum(is.na(d$b)) +if (sum(d$b == 0) == b_na){ +print ("Sanity check passed: NA's replaced with 0's successfully") +} else { +print("Error: NA replacement NOT successful, Debug code!") +} +max(d$b); min(d$b) +# sanity checks: should be True +if (max(d$b) == max(my_df$average_PredAffR)){ +print("Sanity check passed: B-factors replaced correctly") +} else { +print ("Error: Debug code please") +} +if (min(d$b) == min(my_df$average_PredAffR)){ +print("Sanity check passed: B-factors replaced correctly") +} else { +print ("Error: Debug code please") +} +#========= +# step 3_P2 +#========= +# sanity check: dim should be same before reassignment +# should be TRUE +dim(d) == dim(d2) +#========= +# step 4_P2 +#========= +# assign it back to the pdb file +my_pdb[[1]] = d +max(d$b); min(d$b) +#========= +# step 5_P2 +#========= +write.pdb(my_pdb, "Plotting/structure/complex1_BwithNormLIG.pdb") +# output dir +getwd() +# output dir +outDir = "~/git/Data/pyrazinamide/input/structure/" +outFile = paste0(outDir, "complex1_BwithNormLIG.pdb") +outFile = paste0(outDir, "complex1_BwithNormLIG.pdb"); outFile +write.pdb(my_pdb, outFile) diff --git a/mcsm_analysis/pyrazinamide/scripts/Header_TT.R b/mcsm_analysis/pyrazinamide/scripts/Header_TT.R new file mode 100644 index 0000000..2069fa7 --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/Header_TT.R @@ -0,0 +1,129 @@ +######################################################### +### A) Installing and loading required packages +######################################################### + +#if (!require("gplots")) { +# install.packages("gplots", dependencies = TRUE) +# library(gplots) +#} + +if (!require("tidyverse")) { + install.packages("tidyverse", dependencies = TRUE) + library(tidyverse) +} + +if (!require("ggplot2")) { + install.packages("ggplot2", dependencies = TRUE) + library(ggplot2) +} + +if (!require("cowplot")) { + install.packages("copwplot", dependencies = TRUE) + library(ggplot2) +} + +if (!require("ggcorrplot")) { + install.packages("ggcorrplot", dependencies = TRUE) + library(ggcorrplot) +} + +if (!require("ggpubr")) { + install.packages("ggpubr", dependencies = TRUE) + library(ggpubr) +} + +if (!require("RColorBrewer")) { + install.packages("RColorBrewer", dependencies = TRUE) + library(RColorBrewer) +} + +if (!require ("GOplot")) { + install.packages("GOplot") + library(GOplot) +} + +if(!require("VennDiagram")) { + + install.packages("VennDiagram", dependencies = T) + library(VennDiagram) +} + +if(!require("scales")) { + + install.packages("scales", dependencies = T) + library(scales) +} + +if(!require("plotrix")) { + + install.packages("plotrix", dependencies = T) + library(plotrix) +} + +if(!require("stats")) { + + install.packages("stats", dependencies = T) + library(stats) +} + +if(!require("stats4")) { + + install.packages("stats4", dependencies = T) + library(stats4) +} + +if(!require("data.table")) { + library(stats4) +} + +if (!require("PerformanceAnalytics")){ + install.packages("PerformanceAnalytics", dependencies = T) + library(PerformaceAnalytics) +} + +if (!require ("GGally")){ + install.packages("GGally") + library(GGally) +} + +if (!require ("corrr")){ + install.packages("corrr") + library(corrr) +} + +if (!require ("psych")){ + install.packages("psych") + library(psych) +} + +if (!require ("dplyr")){ + install.packages("dplyr") + library(psych) +} + + +if (!require ("compare")){ + install.packages("compare") + library(psych) +} + +if (!require ("arsenal")){ + install.packages("arsenal") + library(psych) +} + + +####TIDYVERSE +# Install +#if(!require(devtools)) install.packages("devtools") +#devtools::install_github("kassambara/ggcorrplot") + +library(ggcorrplot) + + +###for PDB files +#install.packages("bio3d") +if(!require(bio3d)){ + install.packages("bio3d") + library(bio3d) +} diff --git a/mcsm_analysis/pyrazinamide/scripts/barplot_colour_function.R b/mcsm_analysis/pyrazinamide/scripts/barplot_colour_function.R new file mode 100644 index 0000000..a3cc403 --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/barplot_colour_function.R @@ -0,0 +1,27 @@ +######################################################### +# 1b: Define function: coloured barplot by subgroup +# LINK: https://stackoverflow.com/questions/49818271/stacked-barplot-with-colour-gradients-for-each-bar +######################################################### + +ColourPalleteMulti <- function(df, group, subgroup){ + + # Find how many colour categories to create and the number of colours in each + categories <- aggregate(as.formula(paste(subgroup, group, sep="~" )) + , df + , function(x) length(unique(x))) + # return(categories) } + + category.start <- (scales::hue_pal(l = 100)(nrow(categories))) # Set the top of the colour pallete + + category.end <- (scales::hue_pal(l = 40)(nrow(categories))) # set the bottom + + #return(category.start); return(category.end)} + + # Build Colour pallette + colours <- unlist(lapply(1:nrow(categories), + function(i){ + colorRampPalette(colors = c(category.start[i] + , category.end[i]))(categories[i,2])})) + return(colours) +} +######################################################### \ No newline at end of file diff --git a/mcsm_analysis/pyrazinamide/scripts/combining_two_df.R b/mcsm_analysis/pyrazinamide/scripts/combining_two_df.R new file mode 100644 index 0000000..57e476a --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/combining_two_df.R @@ -0,0 +1,299 @@ +getwd() +setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/") +getwd() + +######################################################### +# TASK: To combine mcsm and meta data with af and or +######################################################### + + +######################################################################## +# Installing and loading required packages # +######################################################################## + +source("Header_TT.R") +#require(data.table) +#require(arsenal) +#require(compare) +#library(tidyverse) + +################################# +# Read file: normalised file +# output of step 4 mcsm_pipeline +################################# + +inDir = "~/git/Data/pyrazinamide/input/processed/" +inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile + +mcsm_data = read.csv(inFile + , row.names = 1 + , stringsAsFactors = F + , header = T) +rm(inDir, inFile) + +str(mcsm_data) + +table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) ) + +# spelling Correction 1: DUET +mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Stabilizing'] <- 'Stabilising' +mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Destabilizing'] <- 'Destabilising' + +# checks: should be the same as above +table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) ) +head(mcsm_data$DUET_outcome); tail(mcsm_data$DUET_outcome) + +# spelling Correction 2: Ligand +table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) ) + +mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Stabilizing'] <- 'Stabilising' +mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Destabilizing'] <- 'Destabilising' + +# checks: should be the same as above +table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) ) +head(mcsm_data$Lig_outcome); tail(mcsm_data$Lig_outcome) + +# count na in each column +na_count = sapply(mcsm_data, function(y) sum(length(which(is.na(y))))); na_count + +# sort by Mutationinformation +mcsm_data = mcsm_data[order(mcsm_data$Mutationinformation),] +head(mcsm_data$Mutationinformation) + +# get freq count of positions and add to the df +setDT(mcsm_data)[, occurrence := .N, by = .(Position)] + +pos_count_check = data.frame(mcsm_data$Position, mcsm_data$occurrence) + +########################### +# 2: Read file: meta data with AFandOR +########################### + +inDir = "~/git/Data/pyrazinamide/input/processed/" +inFile2 = paste0(inDir, "meta_data_with_AFandOR.csv"); inFile2 + +meta_with_afor <- read.csv(inFile2 + , stringsAsFactors = F + , header = T) + +rm(inDir, inFile2) + +str(meta_with_afor) + +# sort by Mutationinformation +head(meta_with_afor$Mutationinformation) +meta_with_afor = meta_with_afor[order(meta_with_afor$Mutationinformation),] +head(meta_with_afor$Mutationinformation) + +# sanity check: should be True for all the mentioned columns +#is.numeric(meta_with_afor$OR) +na_var = c("AF", "OR", "pvalue", "logor", "neglog10pvalue") + +c1 = NULL +for (i in na_var){ + print(i) + c0 = is.numeric(meta_with_afor[,i]) + c1 = c(c0, c1) + if ( all(c1) ){ + print("Sanity check passed: These are all numeric cols") + } else{ + print("Error: Please check your respective data types") + } +} + +# If OR, and P value are not numeric, then convert to numeric and then count +# else they will say 0 +na_count = sapply(meta_with_afor, function(y) sum(length(which(is.na(y))))); na_count +str(na_count) + +# compare if the No of "NA" are the same for all these cols +na_len = NULL +for (i in na_var){ + temp = na_count[[i]] + na_len = c(na_len, temp) +} + +# extract how many NAs: +# should be all TRUE +# should be a single number since +# all the cols should have "equal" and "same" no. of NAs + +my_nrows = NULL +for ( i in 1: (length(na_len)-1) ){ + #print(compare(na_len[i]), na_len[i+1]) + c = compare(na_len[i], na_len[i+1]) + if ( c$result ) { + my_nrows = na_len[i] } + else { + print("Error: Please check your numbers") + } +} + +my_nrows + +#=#=#=#=#=#=#=#=# +# COMMENT: AF, OR, pvalue, logor and neglog10pvalue +# these are the same 7 ones +#=#=#=#=#=#=#=#=# + +# sanity check +#which(is.na(meta_with_afor$OR)) + +# initialise an empty df with nrows as extracted above +na_count_df = data.frame(matrix(vector(mode = 'numeric' +# , length = length(na_var) + ) + , nrow = my_nrows +# , ncol = length(na_var) + )) + +# populate the df with the indices of the cols that are NA +for (i in na_var){ + print(i) + na_i = which(is.na(meta_with_afor[i])) + na_count_df = cbind(na_count_df, na_i) + colnames(na_count_df)[which(na_var == i)] <- i +} + +# Now compare these indices to ensure these are the same +c2 = NULL +for ( i in 1: ( length(na_count_df)-1 ) ) { +# print(na_count_df[i] == na_count_df[i+1]) + c1 = identical(na_count_df[[i]], na_count_df[[i+1]]) + c2 = c(c1, c2) + if ( all(c2) ) { + print("Sanity check passed: The indices for AF, OR, etc are all the same") + } else { + print ("Error: Please check indices which are NA") + } +} + +rm( c, c0, c1, c2, i, my_nrows + , na_count, na_i, na_len + , na_var, temp + , na_count_df + , pos_count_check ) + +########################### +# 3:merging two dfs: with NA +########################### + +# link col name = Mutationinforamtion +head(mcsm_data$Mutationinformation) +head(meta_with_afor$Mutationinformation) + +######### +# merge 1a: meta data with mcsm +######### +merged_df2 = merge(x = meta_with_afor + ,y = mcsm_data + , by = "Mutationinformation" + , all.y = T) + +head(merged_df2$Position) + +# sort by Position +head(merged_df2$Position) +merged_df2 = merged_df2[order(merged_df2$Position),] +head(merged_df2$Position) + +merged_df2v2 = merge(x = meta_with_afor + ,y = mcsm_data + , by = "Mutationinformation" + , all.x = T) +#!=!=!=!=!=!=!=! +# COMMENT: used all.y since position 186 is not part of the struc, +# hence doesn't have a mcsm value +# but 186 is associated with with mutation +#!=!=!=!=!=!=!=! + +# should be False +identical(merged_df2, merged_df2v2) +table(merged_df2$Position%in%merged_df2v2$Position) + +rm(merged_df2v2) + +######### +# merge 1b:remove duplicate mutation information +######### + +#==#=#=#=#=#=# +# Cannot trust lineage, country from this df as the same mutation +# can have many different lineages +# but this should be good for the numerical corr plots +#=#=#=#=#=#=#= +merged_df3 = merged_df2[!duplicated(merged_df2$Mutationinformation),] +head(merged_df3$Position); tail(merged_df3$Position) # should be sorted + +# sanity checks +# nrows of merged_df3 should be the same as the nrows of mcsm_data +if(nrow(mcsm_data) == nrow(merged_df3)){ + print("sanity check: Passed") +} else { + print("Error!: check data, nrows is not as expected") +} + +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +# uncomment as necessary +# only need to run this if merged_df2v2 i.e non structural pos included +#mcsm = mcsm_data$Mutationinformation +#my_merged = merged_df3$Mutationinformation + +# find the index where it differs +#diff_n = which(!my_merged%in%mcsm) + +#check if it is indeed pos 186 +#merged_df3[diff_n,] + +# remove this entry +#merged_df3 = merged_df3[-diff_n,]] +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +########################### +# 3b :merging two dfs: without NA +########################### + +######### +# merge 2a:same as merge 1 but excluding NA +######### +merged_df2_comp = merged_df2[!is.na(merged_df2$AF),] + +######### +# merge 2b: remove duplicate mutation information +######### +merged_df3_comp = merged_df2_comp[!duplicated(merged_df2_comp$Mutationinformation),] + +# alternate way of deriving merged_df3_comp +foo = merged_df3[!is.na(merged_df3$AF),] +# compare dfs: foo and merged_df3_com +all.equal(foo, merged_df3) + +summary(comparedf(foo, merged_df3)) + +#=============== end of combining df +#clear variables +rm(mcsm_data + , meta_with_afor + , foo) + +#rm(diff_n, my_merged, mcsm) + +#===================== +# write_output files +#===================== +# output dir +outDir = "~/git/Data/pyrazinamide/output/" +getwd() + +outFile1 = paste0(outDir, "merged_df3.csv"); outFile1 +write.csv(merged_df3, outFile1) + +#outFile2 = paste0(outDir, "merged_df3_comp.csv"); outFile2 +#write.csv(merged_df3_comp, outFile2) + +rm(outDir + , outFile1 +# , outFile2 +) +#============================= end of script + diff --git a/mcsm_analysis/pyrazinamide/scripts/combining_two_df_lig.R b/mcsm_analysis/pyrazinamide/scripts/combining_two_df_lig.R new file mode 100644 index 0000000..1dccd65 --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/combining_two_df_lig.R @@ -0,0 +1,348 @@ +getwd() +setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/") +getwd() + +######################################################### +# TASK: To combine mcsm and meta data with af and or +# by filtering for distance to ligand (<10Ang) +######################################################### + +######################################################### +# Installing and loading required packages +######################################################### + +#source("Header_TT.R") +#require(data.table) +#require(arsenal) +#require(compare) +#library(tidyverse) + +################################# +# Read file: normalised file +# output of step 4 mcsm_pipeline +################################# + +inDir = "~/git/Data/pyrazinamide/input/processed/" +inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile + +mcsm_data = read.csv(inFile + , row.names = 1 + , stringsAsFactors = F + , header = T) +rm(inDir, inFile) + +str(mcsm_data) + +table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) ) + +# spelling Correction 1: DUET +mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Stabilizing'] <- 'Stabilising' +mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Destabilizing'] <- 'Destabilising' + +# checks +table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) ) +head(mcsm_data$DUET_outcome); tail(mcsm_data$DUET_outcome) + +# spelling Correction 2: Ligand +table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) ) + +mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Stabilizing'] <- 'Stabilising' +mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Destabilizing'] <- 'Destabilising' + +# checks: should be the same as above +table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) ) +head(mcsm_data$Lig_outcome); tail(mcsm_data$Lig_outcome) + +########################### !!! only for mcsm_lig +# 4: Filter/subset data +# Lig plots < 10Ang +# Filter the lig plots for Dis_to_lig < 10Ang +########################### + +# check range of distances +max(mcsm_data$Dis_lig_Ang) +min(mcsm_data$Dis_lig_Ang) + +# count +table(mcsm_data$Dis_lig_Ang<10) + +# subset data to have only values less than 10 Ang +mcsm_data2 = subset(mcsm_data, mcsm_data$Dis_lig_Ang < 10) + +# sanity checks +max(mcsm_data2$Dis_lig_Ang) +min(mcsm_data2$Dis_lig_Ang) + +# count no of unique positions +length(unique(mcsm_data2$Position)) + +# count no of unique mutations +length(unique(mcsm_data2$Mutationinformation)) + +# count Destabilisinga and stabilising +table(mcsm_data2$Lig_outcome) #{RESULT: no of mutations within 10Ang} + +#<<<<<<<<<<<<<<<<<<<<<<<<<<< +# REASSIGNMENT: so as not to alter the script +mcsm_data = mcsm_data2 +#<<<<<<<<<<<<<<<<<<<<<<<<<<< + +############################# +# Extra sanity check: +# for mcsm_lig ONLY +# Dis_lig_Ang should be <10 +############################# + +if (max(mcsm_data$Dis_lig_Ang) < 10){ + print ("Sanity check passed: lig data is <10Ang") +}else{ + print ("Error: data should be filtered to be within 10Ang") +} + +# clear variables +rm(mcsm_data2) + +# count na in each column +na_count = sapply(mcsm_data, function(y) sum(length(which(is.na(y))))); na_count + +head(mcsm_data$Mutationinformation) +mcsm_data[mcsm_data$Mutationinformation=="Q10P",] +mcsm_data[mcsm_data$Mutationinformation=="L4S",] + +# sort by Mutationinformation +mcsm_data = mcsm_data[order(mcsm_data$Mutationinformation),] +head(mcsm_data$Mutationinformation) + +# check +mcsm_data[grep("Q10P", mcsm_data$Mutationinformation),] +mcsm_data[grep("A102T", mcsm_data$Mutationinformation),] + +# get freq count of positions and add to the df +setDT(mcsm_data)[, occurrence := .N, by = .(Position)] + +pos_count_check = data.frame(mcsm_data$Position, mcsm_data$occurrence) + +########################### +# 2: Read file: meta data with AFandOR +########################### + +inDir = "~/git/Data/pyrazinamide/input/processed/" +inFile2 = paste0(inDir, "meta_data_with_AFandOR.csv"); inFile2 + +meta_with_afor <- read.csv(inFile2 + , stringsAsFactors = F + , header = T) + +str(meta_with_afor) + +# sort by Mutationinformation +head(meta_with_afor$Mutationinformation) +meta_with_afor = meta_with_afor[order(meta_with_afor$Mutationinformation),] +head(meta_with_afor$Mutationinformation) + +# sanity check: should be True for all the mentioned columns +#is.numeric(meta_with_afor$OR) +na_var = c("AF", "OR", "pvalue", "logor", "neglog10pvalue") + +c1 = NULL +for (i in na_var){ + print(i) + c0 = is.numeric(meta_with_afor[,i]) + c1 = c(c0, c1) + if ( all(c1) ){ + print("Sanity check passed: These are all numeric cols") + } else{ + print("Error: Please check your respective data types") + } +} + +# If OR, and P value are not numeric, then convert to numeric and then count +# else they will say 0 + +# NOW count na in each column: if you did it before, then +# OR and Pvalue column would say 0 na since these were not numeric +na_count = sapply(meta_with_afor, function(y) sum(length(which(is.na(y))))); na_count +str(na_count) + +# compare if the No of "NA" are the same for all these cols +na_len = NULL +na_var = c("AF", "OR", "pvalue", "logor", "neglog10pvalue") +for (i in na_var){ + temp = na_count[[i]] + na_len = c(na_len, temp) +} + +my_nrows = NULL + +for ( i in 1: (length(na_len)-1) ){ + #print(compare(na_len[i]), na_len[i+1]) + c = compare(na_len[i], na_len[i+1]) + if ( c$result ) { + my_nrows = na_len[i] } + else { + print("Error: Please check your numbers") + } +} + +my_nrows + +#=#=#=#=#=#=#=#=# +# COMMENT: AF, OR, pvalue, logor and neglog10pvalue +# all have 81 NA, with pyrazinamide with 960 +# and these are the same 7 ones +#=#=#=#=#=#=#=#=# + +# sanity check +#which(is.na(meta_with_afor$OR)) + +# initialise an empty df with nrows as extracted above +na_count_df = data.frame(matrix(vector(mode = 'numeric' +# , length = length(na_var) + ) + , nrow = my_nrows +# , ncol = length(na_var) + )) + +# populate the df with the indices of the cols that are NA +for (i in na_var){ + print(i) + na_i = which(is.na(meta_with_afor[i])) + na_count_df = cbind(na_count_df, na_i) + colnames(na_count_df)[which(na_var == i)] <- i +} + +# Now compare these indices to ensure these are the same +c2 = NULL +for ( i in 1: ( length(na_count_df)-1 ) ) { + # print(na_count_df[i] == na_count_df[i+1]) + c1 = identical(na_count_df[[i]], na_count_df[[i+1]]) + c2 = c(c1, c2) + if ( all(c2) ) { + print("Sanity check passed: The indices for AF, OR, etc are all the same") + } else { + print ("Error: Please check indices which are NA") + } +} + +rm( c, c1, c2, i, my_nrows + , na_count, na_i, na_len + , na_var, temp + , na_count_df + , pos_count_check ) + +########################### +# 3:merging two dfs: with NA +########################### + +# link col name = Mutationinforamtion +head(mcsm_data$Mutationinformation) +head(meta_with_afor$Mutationinformation) + +######### +# merge 1a: meta data with mcsm +######### +merged_df2 = merge(x = meta_with_afor + , y = mcsm_data + , by = "Mutationinformation" + , all.y = T) + +head(merged_df2$Position) + +# sort by Position +head(merged_df2$Position) +merged_df2 = merged_df2[order(merged_df2$Position),] +head(merged_df2$Position) + +merged_df2v2 = merge(x = meta_with_afor + ,y = mcsm_data + , by = "Mutationinformation" + , all.x = T) + +#!=!=!=!=!=!=!=! +# COMMENT: used all.y since position 186 is not part of the struc, +# hence doesn't have a mcsm value +# but 186 is associated with with mutation +#!=!=!=!=!=!=!=! + +# should be False +identical(merged_df2, merged_df2v2) +table(merged_df2$Position%in%merged_df2v2$Position) + +rm(merged_df2v2) + +######### +# merge 1b:remove duplicate mutation information +######### + +#==#=#=#=#=#=# +# Cannot trust lineage, country from this df as the same mutation +# can have many different lineages +# but this should be good for the numerical corr plots +#=#=#=#=#=#=#= +merged_df3 = merged_df2[!duplicated(merged_df2$Mutationinformation),] +head(merged_df3$Position) ; tail(merged_df3$Position) # should be sorted + +# sanity checks +# nrows of merged_df3 should be the same as the nrows of mcsm_data +if(nrow(mcsm_data) == nrow(merged_df3)){ + print("sanity check: Passed") +} else { + print("Error!: check data, nrows is not as expected") +} + +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +# uncomment as necessary +# only need to run this if merged_df2v2 i.e non structural pos included +#mcsm = mcsm_data$Mutationinformation +#my_merged = merged_df3$Mutationinformation + +# find the index where it differs +#diff_n = which(!my_merged%in%mcsm) + +#check if it is indeed pos 186 +#merged_df3[diff_n,] + +# remove this entry +#merged_df3 = merged_df3[-diff_n,] +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +########################### +# 3b :merging two dfs: without NA +########################### + +######### +# merge 2a:same as merge 1 but excluding NA +######### +merged_df2_comp = merged_df2[!is.na(merged_df2$AF),] + +######### +# merge 2b: remove duplicate mutation information +######### +merged_df3_comp = merged_df2_comp[!duplicated(merged_df2_comp$Mutationinformation),] + +# FIXME: add this as a sanity check. I have manually checked! + +# alternate way of deriving merged_df3_comp +foo = merged_df3[!is.na(merged_df3$AF),] + +# compare dfs: foo and merged_df3_com +all.equal(foo, merged_df3) + +summary(comparedf(foo, merged_df3)) + +#=============== end of combining df +#clear variables +rm(mcsm_data + , meta_with_afor + , foo) + +#rm(diff_n, my_merged, mcsm) + +#===============end of script + +#===================== +# write_output files +#===================== + +# Not required as this is a subset of the "combining_two_df.R" script + diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step0_check_duplicate_SNPs.sh b/mcsm_analysis/pyrazinamide/scripts/mcsm/step0_check_duplicate_SNPs.sh new file mode 100755 index 0000000..22fada7 --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step0_check_duplicate_SNPs.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +#************************************* +# need to be in the correct directory +#************************************* +##: comments for code +#: commented out code + +#********************************************************************** +# TASK: Text file containing a list of SNPs; SNP in the format(C2E) +# per line. Sort by unique, which automatically removes duplicates. +# sace file in current directory +#********************************************************************** +infile="${HOME}/git/Data/input/processed/pyrazinamide/pnca_mis_SNPs_v2.csv" +outfile="${HOME}/git/Data/input/processed/pyrazinamide/pnca_mis_SNPs_v2_unique.csv" + +# sort unique entries and output to current directory +sort -u ${infile} > ${outfile} + +# count no. of unique snps mCSM will run on +count=$(wc -l < ${outfile}) + +# print to console no. of unique snps mCSM will run on +echo "${count} unique mutations for mCSM to run on" + diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step1_mCSMLig_curl_submit_store_outputurl.sh b/mcsm_analysis/pyrazinamide/scripts/mcsm/step1_mCSMLig_curl_submit_store_outputurl.sh new file mode 100755 index 0000000..87f4265 --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step1_mCSMLig_curl_submit_store_outputurl.sh @@ -0,0 +1,72 @@ +#!/bin/bash + +#************************************* +#need to be in the correct directory +#************************************* +##: comments for code +#: commented out code + +#********************************************************************** +# TASK: submit requests using curl: HANDLE redirects and refresh url. +# Iterate over mutation file and write/append result urls to a file +# result url file: stored in the /Results directory +# mutation file: one mutation per line, no chain ID +# output: in a file, should be n urls (n=no. of mutations in file) +# NOTE: these are just result urls, not actual values for results +#********************************************************************** +## iterate over mutation file; line by line and submit query using curl +filename="../Data/pnca_mis_SNPs_v2_unique.csv" + +## some useful messages +echo -n -e "Processing $(wc -l < ${filename}) entries from ${filename}\n" +COUNT=0 +while read -r line; do +((COUNT++)) + mutation="${line}" +# echo "${mutation}" +pdb='../Data/complex1_no_water.pdb' +mutation="${mutation}" +chain="A" +lig_id="PZA" +affin_wt="0.99" +host="http://biosig.unimelb.edu.au" +call_url="/mcsm_lig/prediction" + +##========================================= +##html field_names names required for curl +##complex_field:wild=@ +##mutation_field:mutation=@ +##chain_field:chain=@ +##ligand_field:lig_id@ +##energy_field:affin_wt +#========================================= +refresh_url=$(curl -L \ + -sS \ + -F "wild=@${pdb}" \ + -F "mutation=${mutation}" \ + -F "chain=${chain}" \ + -F "lig_id=${lig_id}" \ + -F "affin_wt=${affin_wt}" \ + ${host}${call_url} | grep "http-equiv") + +#echo $refresh_url +#echo ${host}${refresh_url} + +#use regex to extract the relevant bit from the refresh url +#regex:sed -r 's/.*(\/mcsm.*)".*$/\1/g' + +#Now build: result url using host and refresh url and write the urls to a file in the Results dir +result_url=$(echo $refresh_url | sed -r 's/.*(\/mcsm.*)".*$/\1/g') +sleep 10 + +echo -e "${mutation} : processing entry ${COUNT}/$(wc -l < ${filename})..." + +echo -e "${host}${result_url}" >> ../Results/$(wc -l < ${filename})_mCSM_lig_complex1_result_url.txt +#echo -n '.' +done < "${filename}" + +echo +echo "Processing Complete" + +##end of submitting query, receiving result url and storing results url in a file + diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_mCSM_LIG_batch_outputurls_results.sh b/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_mCSM_LIG_batch_outputurls_results.sh new file mode 100755 index 0000000..e250fe8 --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_mCSM_LIG_batch_outputurls_results.sh @@ -0,0 +1,59 @@ +#!/bin/bash +#************************************* +#need to be in the correct directory +#************************************* +##: comments for code +#: commented out code + +#******************************************************************** +# TASK: submit result urls and fetch actual results using curl +# iterate over each result url from the output of step1 in the stored +# in file in /Results. +# Use curl to fetch results and extract relevant sections using hxtools +# and store these in another file in /Results +# This script takes two arguments: +# input file: file containing results url +# In this case: 336_mCSM_lig_complex1_result_url.txt +# output file: name of the file where extracted results will be stored +# In this case : it is 336_mCSM_lig_complex1_output_MASTER.txt +#********************************************************************* + +#if [ "$#" -ne 2 ]; then + #if [ -Z $1 ]; then +# echo " +# Please provide both Input and Output files. + +# Usage: batch_read_urls.sh INFILE OUTFILE +# " +# exit 1 +#fi + +# First argument: Input File +# Second argument: Output File +#infile=$1 +#outfile=$2 + +infile="${HOME}/git/LSHTM_analysis/mcsm_complex1/Results/336_mCSM_lig_complex1_result_url.txt" +outfile="${HOME}/git/LSHTM_analysis/mcsm_complex1/Results/336_mCSM_lig_complex1_output_MASTER.txt" + +echo -n "Processing $(wc -l < ${infile}) entries from ${infile}" +echo +COUNT=0 +while read -r line; do +#COUNT=$(($COUNT+1)) +((COUNT++)) + curl --silent ${line} \ + | hxnormalize -x \ + | hxselect -c div.span4 \ + | hxselect -c div.well \ + | sed -r -e 's/<[^>]*>//g' \ + | sed -re 's/ +//g' \ + >> ${outfile} + #| tee -a ${outfile} +# echo -n '.' +echo -e "Processing entry ${COUNT}/$(wc -l < ${infile})..." + +done < "${infile}" + +echo +echo "Processing Complete" diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_mCSM_LIG_regex_output_formatting.sh b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_mCSM_LIG_regex_output_formatting.sh new file mode 100755 index 0000000..78dbdf5 --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_mCSM_LIG_regex_output_formatting.sh @@ -0,0 +1,52 @@ +#!/bin/bash +#************************************* +#need to be in the correct directory +#************************************* +##: comments for code +#: commented out code + +#******************************************************************** +# TASK: Intermediate results processing +# output file has a convenient delimiter of ":" that can be used to +# format the file into two columns (col1: field_desc and col2: values) +# However the section "PredictedAffinityChange:...." and +# "DUETstabilitychange:.." are split over multiple lines and +# prevent this from happening.Additionally there are other empty lines +# that need to be omiited. In order ensure these sections are not split +# over multiple lines, this script is written. +#********************************************************************* + +infile="../Results/336_mCSM_lig_complex1_output_processed.txt" + +#sed -i '/PredictedAffinityChange:/ { N; N; N; N; s/\n//g;}' ${infile} \ +# | sed -i '/DUETstabilitychange:/ {x; N; N; s/\n//g; p;d;}' ${infile} + +# Outputs records separated by a newline, that look something like this: +# PredictedAffinityChange:-2.2log(affinityfoldchange)-Destabilizing +# Mutationinformation: +# Wild-type:L +# Position:4 +# Mutant-type:W +# Chain:A +# LigandID:PZA +# Distancetoligand:15.911Å +# DUETstabilitychange:-2.169Kcal/mol +# +# PredictedAffinityChange:-1.538log(affinityfoldchange)-Destabilizing +# (...etc) + +# This script brings everything in a convenient format for further processing in python. +# bear in mind, this replaces the file in place, so make sure you retain a copy for your records +sed -i '/PredictedAffinityChange/ { +N +N +N +N +s/\n//g +} +/DUETstabilitychange:/ { +N +N +s/\n//g +} +/^$/d' ${infile} diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_format_results.py b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_format_results.py new file mode 100755 index 0000000..a780576 --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_format_results.py @@ -0,0 +1,29 @@ +#!/usr/bin/python +import pandas as pd +from collections import defaultdict + +#file = r'../Results/322_mCSM_lig_complex1_output_processed.txt' + +outCols=[ + 'PredictedAffinityChange', + 'Mutationinformation', + 'Wild-type', + 'Position', + 'Mutant-type', + 'Chain', + 'LigandID', + 'Distancetoligand', + 'DUETstabilitychange' + ] + +lines = [line.rstrip('\n') for line in open('../Results/336_mCSM_lig_complex1_output_processed.txt')] + +outputs = defaultdict(list) + +for item in lines: + col, val = item.split(':') + outputs[col].append(val) + +dfOut=pd.DataFrame(outputs) + +pd.DataFrame.to_csv(dfOut,'../Results/336_complex1_formatted_results.csv', columns=outCols) diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_data_cleaning.R b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_data_cleaning.R new file mode 100644 index 0000000..4876b5e --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_data_cleaning.R @@ -0,0 +1,207 @@ +getwd() +#setwd("~/Documents/git/LSHTM_analysis/mcsm_complex1/Results") # work +setwd("~/git/LSHTM_analysis/mcsm_complex1/Results") # thinkpad +#setwd("/Users/tanu/git/LSHTM_analysis/mcsm_complex1/Results") # mac +getwd() + +#======================================================= +#TASK: To tidy the columns so you can generate figures +#======================================================= +#################### +#### read file #####: this will be the output from python script (csv file) +#################### +data = read.csv("336_complex1_formatted_results.csv" + , header = T + , stringsAsFactors = FALSE) +dim(data) +#335, 10 +str(data) + +########################### +##### Data processing ##### +########################### + +# populate mutation information columns as currently it is empty +head(data$Mutationinformation) +tail(data$Mutationinformation) + +# should not be blank: create muation information +data$Mutationinformation = paste0(data$Wild.type, data$Position, data$Mutant.type) + +head(data$Mutationinformation) +tail(data$Mutationinformation) +#write.csv(data, 'test.csv') +########################################## +# Remove duplicate SNPs as a sanity check +########################################## +#very important +table(duplicated(data$Mutationinformation)) +#FALSE +#335 + +#extract duplicated entries +dups = data[duplicated(data$Mutationinformation),] #0 + +#No of dups should match with the no. of TRUE in the above table +#u_dups = unique(dups$Mutationinformation) #10 +sum( table(dups$Mutationinformation) ) #13 + +rm(dups) + +#*************************************************************** +#select non-duplicated SNPs and create a new df +df = data[!duplicated(data$Mutationinformation),] #309, 10 +#*************************************************************** +#sanity check +u = unique(df$Mutationinformation) +u2 = unique(data$Mutationinformation) +table(u%in%u2) +#TRUE +#309 +#should all be 1, hence 309 1's +sum(table(df$Mutationinformation) == 1) + +#sort df by Position +#MANUAL CHECKPOINT: +#foo <- df[order(df$Position),] +#df <- df[order(df$Position),] + +rm(u, u2, dups) + +#################### +#### give meaningful colnames to reflect units to enable correct data type +#################### + +#======= +#STEP 1 +#======== +#make a copy of the PredictedAffinityColumn and call it Lig_outcome +df$Lig_outcome = df$PredictedAffinityChange #335, 11 + +#make Predicted...column numeric and outcome column categorical +head(df$PredictedAffinityChange) +df$PredictedAffinityChange = gsub("log.*" + , "" + , df$PredictedAffinityChange) + +#sanity checks +head(df$PredictedAffinityChange) + +#should be numeric, check and if not make it numeric +is.numeric( df$PredictedAffinityChange ) +#change to numeric +df$PredictedAffinityChange = as.numeric(df$PredictedAffinityChange) +#should be TRUE +is.numeric( df$PredictedAffinityChange ) + +#change the column name to indicate units +n = which(colnames(df) == "PredictedAffinityChange"); n +colnames(df)[n] = "PredAffLog" +colnames(df)[n] + +#======== +#STEP 2 +#======== +#make Lig_outcome column categorical showing effect of mutation +head(df$Lig_outcome) +df$Lig_outcome = gsub("^.*-" + , "", + df$Lig_outcome) +#sanity checks +head(df$Lig_outcome) +#should be factor, check and if not change it to factor +is.factor(df$Lig_outcome) +#change to factor +df$Lig_outcome = as.factor(df$Lig_outcome) +#should be TRUE +is.factor(df$Lig_outcome) + +#======== +#STEP 3 +#======== +#gsub +head(df$Distancetoligand) +df$Distancetoligand = gsub("Å" + , "" + , df$Distancetoligand) +#sanity checks +head(df$Distancetoligand) +#should be numeric, check if not change it to numeric +is.numeric(df$Distancetoligand) +#change to numeric +df$Distancetoligand = as.numeric(df$Distancetoligand) +#should be TRUE +is.numeric(df$Distancetoligand) + +#change the column name to indicate units +n = which(colnames(df) == "Distancetoligand") +colnames(df)[n] <- "Dis_lig_Ang" +colnames(df)[n] + +#======== +#STEP 4 +#======== +#gsub +head(df$DUETstabilitychange) +df$DUETstabilitychange = gsub("Kcal/mol" + , "" + , df$DUETstabilitychange) +#sanity checks +head(df$DUETstabilitychange) +#should be numeric, check if not change it to numeric +is.numeric(df$DUETstabilitychange) +#change to numeric +df$DUETstabilitychange = as.numeric(df$DUETstabilitychange) +#should be TRUE +is.numeric(df$DUETstabilitychange) + +#change the column name to indicate units +n = which(colnames(df) == "DUETstabilitychange"); n +colnames(df)[n] = "DUETStability_Kcalpermol" +colnames(df)[n] + +#======== +#STEP 5 +#======== +#create yet another extra column: classification of DUET stability only +df$DUET_outcome = ifelse(df$DUETStability_Kcalpermol >=0 + , "Stabilizing" + , "Destabilizing") #335, 12 + +table(df$Lig_outcome) +#Destabilizing Stabilizing +#281 54 + +table(df$DUET_outcome) +#Destabilizing Stabilizing +#288 47 +#============================== +#FIXME +#Insert a venn diagram + +#================================ + + +#======== +#STEP 6 +#======== +# assign wild and mutant colnames correctly + +wt = which(colnames(df) == "Wild.type"); wt +colnames(df)[wt] <- "Wild_type" +colnames(df[wt]) + +mut = which(colnames(df) == "Mutant.type"); mut +colnames(df)[mut] <- "Mutant_type" +colnames(df[mut]) + +#======== +#STEP 7 +#======== +#create an extra column: maybe useful for some plots +df$WildPos = paste0(df$Wild_type, df$Position) #335, 13 + +#clear variables +rm(n, wt, mut) + +################ end of data cleaning diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step4_normalise.R b/mcsm_analysis/pyrazinamide/scripts/mcsm/step4_normalise.R new file mode 100644 index 0000000..4721e29 --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step4_normalise.R @@ -0,0 +1,252 @@ +getwd() +#setwd("~/Documents/git/LSHTM_analysis/mcsm_complex1/Results") # work +setwd("~/git/LSHTM_analysis/mcsm_complex1/Results") # thinkpad +#setwd("/Users/tanu/git/LSHTM_analysis/mcsm_complex1/Results") # mac +getwd() + +#======================================================= +#TASK:read cleaned data and perform rescaling + # of DUET stability scores + # of Pred affinity +#compare scaling methods with plots +#output normalised file +#======================================================= + +#################### +#### read file #####: this will be the output of my R script that cleans the data columns +#################### +source("../Scripts/step3c_data_cleaning.R") +##This will outut two dataframes: +##data: unclean data: 335, 10 +##df : cleaned df 335, 13 +## you can remove data if you want as you will not need it +rm(data) + +colnames(df) + +#=================== +#3a: PredAffLog +#=================== +n = which(colnames(df) == "PredAffLog"); n +group = which(colnames(df) == "Lig_outcome"); group + +#=================================================== +# order according to PredAffLog values +#=================================================== +# This is because this makes it easier to see the results of rescaling for debugging +head(df$PredAffLog) + +#ORDER BY PredAff scrores: negative values at the top and positive at the bottoom +df = df[order(df$PredAffLog),] +head(df$PredAffLog) + +#sanity checks +head(df[,n]) #all negatives +tail(df[,n]) #all positives + +#sanity checks +mean(df[,n]) +#-0.9526746 + +tapply(df[,n], df[,group], mean) +#Destabilizing Stabilizing +#-1.2112100 0.3926667 +#=========================== +#Same as above: in 2 steps +#=========================== + +#find range of your data +my_min = min(df[,n]); my_min #-3.948 +my_max = max(df[,n]); my_max #2.23 + +#=============================================== +# WITHIN GROUP rescaling 2: method "ratio" +# create column to store the rescaled values +# Rescaling separately (Less dangerous) +# =====> chosen one:as Nick prefers +#=============================================== +df$ratioPredAff = ifelse(df[,n] < 0 + , df[,n]/abs(my_min) + , df[,n]/my_max + )#335 14 +#sanity checks +head(df$ratioPredAff) +tail(df$ratioPredAff) + +min(df$ratioPredAff); max(df$ratioPredAff) + +tapply(df$ratioPredAff, df$Lig_outcome, min) +#Destabilizing Stabilizing +#-1.000000000 0.005381166 + +tapply(df$ratioPredAff, df$Lig_outcome, max) +#Destabilizing Stabilizing +#-0.001266464 1.000000000 + +#should be the same as below (281 and 54) +sum(df$ratioPredAff < 0); sum(df$ratioPredAff > 0) + +table(df$Lig_outcome) +#Destabilizing Stabilizing +#281 54 + +#=============================================== +# Hist and density plots to compare the rescaling +# methods: Base R +#=============================================== +#uncomment as necessary +my_title = "Ligand_stability" +#my_title = colnames(df[n]) + +# Set the margin on all sides +par(oma = c(3,2,3,0) + , mar = c(1,3,5,2) + , mfrow = c(2,2)) + +hist(df[,n] + , xlab = "" + , main = "Raw values" +) + +hist(df$ratioPredAff + , xlab = "" + , main = "ratio rescaling" +) + +# Plot density plots underneath +plot(density( df[,n] ) + , main = "Raw values" +) + +plot(density( df$ratioPredAff ) + , main = "ratio rescaling" +) + +# titles +mtext(text = "Frequency" + , side = 2 + , line = 0 + , outer = TRUE) + +mtext(text = my_title + , side = 3 + , line = 0 + , outer = TRUE) + + +#clear variables +rm(my_min, my_max, my_title, n, group) + +#=================== +# 3b: DUET stability +#=================== +dim(df) #335, 14 + +n = which(colnames(df) == "DUETStability_Kcalpermol"); n # 10 +group = which(colnames(df) == "DUET_outcome"); group #12 + +#=================================================== +# order according to DUET scores +#=================================================== +# This is because this makes it easier to see the results of rescaling for debugging +head(df$DUETStability_Kcalpermol) + +#ORDER BY DUET scores: negative values at the top and positive at the bottom +df = df[order(df$DUETStability_Kcalpermol),] + +#sanity checks +head(df[,n]) #negatives +tail(df[,n]) #positives + +#sanity checks +mean(df[,n]) +#[1] -1.173316 + +tapply(df[,n], df[,group], mean) +#Destabilizing Stabilizing +#-1.4297257 0.3978723 + +#=============================================== +# WITHIN GROUP rescaling 2: method "ratio" +# create column to store the rescaled values +# Rescaling separately (Less dangerous) +# =====> chosen one:as Nick prefers +#=============================================== +#find range of your data +my_min = min(df[,n]); my_min #-3.87 +my_max = max(df[,n]); my_max #1.689 + +df$ratioDUET = ifelse(df[,n] < 0 + , df[,n]/abs(my_min) + , df[,n]/my_max + ) #335, 15 +#sanity check +head(df$ratioDUET) +tail(df$ratioDUET) + +min(df$ratioDUET); max(df$ratioDUET) + +#sanity checks +tapply(df$ratioDUET, df$DUET_outcome, min) +#Destabilizing Stabilizing +#-1.00000000 0.01065719 + +tapply(df$ratioDUET, df$DUET_outcome, max) +#Destabilizing Stabilizing +#-0.003875969 1.000000000 + +#should be the same as below (267 and 42) +sum(df$ratioDUET < 0); sum(df$ratioDUET > 0) + +table(df$DUET_outcome) +#Destabilizing Stabilizing +#288 47 + +#=============================================== +# Hist and density plots to compare the rescaling +# methods: Base R +#=============================================== +#uncomment as necessary + +my_title = "DUET_stability" +#my_title = colnames(df[n]) + +# Set the margin on all sides +par(oma = c(3,2,3,0) + , mar = c(1,3,5,2) + , mfrow = c(2,2)) + +hist(df[,n] + , xlab = "" + , main = "Raw values" +) + +hist(df$ratioDUET + , xlab = "" + , main = "ratio rescaling" +) + +# Plot density plots underneath +plot(density( df[,n] ) + , main = "Raw values" +) + +plot(density( df$ratioDUET ) + , main = "ratio rescaling" +) + +# graph titles +mtext(text = "Frequency" + , side = 2 + , line = 0 + , outer = TRUE) + +mtext(text = my_title + , side = 3 + , line = 0 + , outer = TRUE) + +#=================== +# write output as csv file +#=================== +write.csv(df, "../Data/mcsm_complex1_normalised.csv", row.names = FALSE) #335, 15 diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm_mean_stability.R b/mcsm_analysis/pyrazinamide/scripts/mcsm_mean_stability.R new file mode 100644 index 0000000..877215a --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/mcsm_mean_stability.R @@ -0,0 +1,131 @@ +getwd() +setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") +getwd() + +######################################################################## +# Installing and loading required packages # +######################################################################## + +source("../Header_TT.R") +#source("barplot_colour_function.R") +require(data.table) +require(dplyr) + +######################################################################## +# Read file: call script for combining df for PS # +######################################################################## + +source("../combining_two_df.R") + +########################### +# This will return: + +# df with NA: +# merged_df2 +# merged_df3 + +# df without NA: +# merged_df2_comp +# merged_df3_comp +########################### + +#---------------------- PAY ATTENTION +# the above changes the working dir +#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" +#---------------------- PAY ATTENTION + +########################### +# you need merged_df3 +# or +# merged_df3_comp +# since these have unique SNPs +# I prefer to use the merged_df3 +# because using the _comp dataset means +# we lose some muts and at this level, we should use +# as much info as available +########################### + +# uncomment as necessary +#%%%%%%%%%%%%%%%%%%%%%%%% +# REASSIGNMENT +my_df = merged_df3 +#my_df = merged_df3_comp +#%%%%%%%%%%%%%%%%%%%%%%%% + +# delete variables not required +rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) + +# quick checks +colnames(my_df) +str(my_df) + +########################### +# Data for bfactor figure +# PS average +# Lig average +########################### + +head(my_df$Position) +head(my_df$ratioDUET) + +# order data frame +df = my_df[order(my_df$Position),] + +head(df$Position) +head(df$ratioDUET) + +#*********** +# PS: average by position +#*********** + +mean_DUET_by_position <- df %>% + group_by(Position) %>% + summarize(averaged.DUET = mean(ratioDUET)) + +#*********** +# Lig: average by position +#*********** +mean_Lig_by_position <- df %>% + group_by(Position) %>% + summarize(averaged.Lig = mean(ratioPredAff)) + + +#*********** +# cbind:mean_DUET_by_position and mean_Lig_by_position +#*********** + +combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position )) + +# sanity check +# mean_PS_Lig_Bfactor + +colnames(combined) + +colnames(combined) = c("Position" + , "average_DUETR" + , "Position2" + , "average_PredAffR") + +colnames(combined) + +identical(combined$Position, combined$Position2) + +n = which(colnames(combined) == "Position2"); n + +combined_df = combined[,-n] + +max(combined_df$average_DUETR) ; min(combined_df$average_DUETR) + +max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR) + +#============= +# output csv +#============ +outDir = "~/git/Data/pyrazinamide/input/processed/" +outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv") +print(paste0("Output file with path will be:","", outFile)) + +head(combined_df$Position); tail(combined_df$Position) + +write.csv(combined_df, outFile + , row.names = F) diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/.RData b/mcsm_analysis/pyrazinamide/scripts/plotting/.RData new file mode 100644 index 0000000..9ebc62b Binary files /dev/null and b/mcsm_analysis/pyrazinamide/scripts/plotting/.RData differ diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/.Rhistory b/mcsm_analysis/pyrazinamide/scripts/plotting/.Rhistory new file mode 100644 index 0000000..e69de29 diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/OR_PS_Ligand_combined_plot.R b/mcsm_analysis/pyrazinamide/scripts/plotting/OR_PS_Ligand_combined_plot.R new file mode 100644 index 0000000..015d45f --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/plotting/OR_PS_Ligand_combined_plot.R @@ -0,0 +1,250 @@ +getwd() +setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") +getwd() + +######################################################################## +# Installing and loading required packages # +######################################################################## + +source("../Header_TT.R") +#source("barplot_colour_function.R") +require(cowplot) + +######################################################################## +# Read file: call script for combining df for PS # +######################################################################## + +source("../combining_two_df.R") + +#---------------------- PAY ATTENTION +# the above changes the working dir +#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" +#---------------------- PAY ATTENTION + +#========================== +# This will return: + +# df with NA: +# merged_df2 +# merged_df3 + +# df without NA: +# merged_df2_comp +# merged_df3_comp +#=========================== + +########################### +# Data for OR and stability plots +# you need merged_df3_comp +# since these are matched +# to allow pairwise corr +########################### + +# uncomment as necessary +#<<<<<<<<<<<<<<<<<<<<<<<<< +# REASSIGNMENT +my_df = merged_df3_comp +#my_df = merged_df3 +#<<<<<<<<<<<<<<<<<<<<<<<<< + +# delete variables not required +rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) + +# quick checks +colnames(my_df) +str(my_df) + +# sanity check +# Ensure correct data type in columns to plot: need to be factor +is.numeric(my_df$OR) +#[1] TRUE + +#<<<<<<<<<<<<<<<<<<< +# REASSIGNMENT +# FOR PS Plots +#<<<<<<<<<<<<<<<<<<< + +PS_df = my_df + +rm(my_df) +#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> end of section 1 + +######################################################################## +# Read file: call script for combining df for lig # +######################################################################## + +getwd() + +source("combining_two_df_lig.R") + +#---------------------- PAY ATTENTION +# the above changes the working dir +#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" +#---------------------- PAY ATTENTION +#========================== +# This will return: + +# df with NA: +# merged_df2 +# merged_df3 + +# df without NA: +# merged_df2_comp +# merged_df3_comp +#=========================== + +########################### +# Data for OR and stability plots +# you need merged_df3_comp +# since these are matched +# to allow pairwise corr +########################### + +# uncomment as necessary +#<<<<<<<<<<<<<<<<<<<<<<<<< +# REASSIGNMENT +my_df2 = merged_df3_comp +#my_df2 = merged_df3 +#<<<<<<<<<<<<<<<<<<<<<<<<< + +# delete variables not required +rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) + +# quick checks +colnames(my_df2) +str(my_df2) + +# sanity check +# Ensure correct data type in columns to plot: need to be factor +is.numeric(my_df2$OR) +#[1] TRUE + +# sanity check: should be <10 +if (max(my_df2$Dis_lig_Ang) < 10){ + print ("Sanity check passed: lig data is <10Ang") +}else{ + print ("Error: data should be filtered to be within 10Ang") +} + +#<<<<<<<<<<<<<<<< +# REASSIGNMENT +# FOR Lig Plots +#<<<<<<<<<<<<<<<< + +Lig_df = my_df2 + +rm(my_df2) + +#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> end of section 1 + +############# +# Plots: Bubble plot +# x = Position, Y = stability +# size of dots = OR +# col: stability +############# + +#================= +# generate plot 1: DUET vs OR by position as geom_points +#================= + +my_ats = 20 # axis text size +my_als = 22 # axis label size + +# Spelling Correction: made redundant as already corrected at the source + +#PS_df$DUET_outcome[PS_df$DUET_outcome=='Stabilizing'] <- 'Stabilising' +#PS_df$DUET_outcome[PS_df$DUET_outcome=='Destabilizing'] <- 'Destabilising' + +table(PS_df$DUET_outcome) ; sum(table(PS_df$DUET_outcome)) + +g = ggplot(PS_df, aes(x = factor(Position) + , y = ratioDUET)) + +p1 = g + + geom_point(aes(col = DUET_outcome + , size = OR)) + + theme(axis.text.x = element_text(size = my_ats + , angle = 90 + , hjust = 1 + , vjust = 0.4) + , axis.text.y = element_text(size = my_ats + , angle = 0 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = my_als) + , axis.title.y = element_text(size = my_als) + , legend.text = element_text(size = my_als) + , legend.title = element_text(size = my_als) ) + + #, legend.key.size = unit(1, "cm")) + + labs(title = "" + , x = "Position" + , y = "DUET(PS)" + , size = "Odds Ratio" + , colour = "DUET Outcome") + + guides(colour = guide_legend(override.aes = list(size=4))) + +p1 + +#================= +# generate plot 2: Lig vs OR by position as geom_points +#================= +my_ats = 20 # axis text size +my_als = 22 # axis label size + +# Spelling Correction: made redundant as already corrected at the source + +#Lig_df$Lig_outcome[Lig_df$Lig_outcome=='Stabilizing'] <- 'Stabilising' +#Lig_df$Lig_outcome[Lig_df$Lig_outcome=='Destabilizing'] <- 'Destabilising' + +table(Lig_df$Lig_outcome) + +g = ggplot(Lig_df, aes(x = factor(Position) + , y = ratioPredAff)) + +p2 = g + + geom_point(aes(col = Lig_outcome + , size = OR))+ + theme(axis.text.x = element_text(size = my_ats + , angle = 90 + , hjust = 1 + , vjust = 0.4) + , axis.text.y = element_text(size = my_ats + , angle = 0 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = my_als) + , axis.title.y = element_text(size = my_als) + , legend.text = element_text(size = my_als) + , legend.title = element_text(size = my_als) ) + + #, legend.key.size = unit(1, "cm")) + + labs(title = "" + , x = "Position" + , y = "Ligand Affinity" + , size = "Odds Ratio" + , colour = "Ligand Outcome" + ) + + guides(colour = guide_legend(override.aes = list(size=4))) + +p2 + +#====================== +#combine using cowplot +#====================== +# set output dir for plots +getwd() +setwd("~/git/Data/pyrazinamide/output/plots" +getwd() + +svg('PS_Lig_OR_combined.svg', width = 32, height = 12) #inches +#png('PS_Lig_OR_combined.png', width = 2800, height = 1080) #300dpi +theme_set(theme_gray()) # to preserve default theme + +printFile = cowplot::plot_grid(plot_grid(p1, p2 + , ncol = 1 + , align = 'v' + , labels = c("A", "B") + , label_size = my_als+5)) +print(printFile) +dev.off() + diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_LIG.R b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_LIG.R new file mode 100644 index 0000000..30b9981 --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_LIG.R @@ -0,0 +1,154 @@ +getwd() +setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") +getwd() + +######################################################################## +# Installing and loading required packages # +######################################################################## + +source("../Header_TT.R") + +######################################################################## +# Read file: call script for combining df for lig # +######################################################################## + +source("../combining_two_df_lig.R") + +#---------------------- PAY ATTENTION +# the above changes the working dir +#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" +#---------------------- PAY ATTENTION + +#========================== +# This will return: + +# df with NA: +# merged_df2 +# merged_df3 + +# df without NA: +# merged_df2_comp +# merged_df3_comp +#=========================== + +########################### +# Data for Lig plots +# you need merged_df3 +# or +# merged_df3_comp +# since these have unique SNPs +# I prefer to use the merged_df3 +# because using the _comp dataset means +# we lose some muts and at this level, we should use +# as much info as available +########################### + +# uncomment as necessary +#%%%%%%%%%%%%%%%%%%%%%%%% +# REASSIGNMENT +my_df = merged_df3 +#my_df = merged_df3_comp +#%%%%%%%%%%%%%%%%%%%%%%%% + +# delete variables not required +rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) + +# quick checks +colnames(my_df) +str(my_df) + +############################# +# Extra sanity check: +# for mcsm_lig ONLY +# Dis_lig_Ang should be <10 +############################# + +if (max(my_df$Dis_lig_Ang) < 10){ + print ("Sanity check passed: lig data is <10Ang") +}else{ + print ("Error: data should be filtered to be within 10Ang") +} +######################################################################## +# end of data extraction and cleaning for plots # +######################################################################## + +#========================== +# Plot: Barplot with scores (unordered) +# corresponds to Lig_outcome +# Stacked Barplot with colours: Lig_outcome @ position coloured by +# Lig_outcome. This is a barplot where each bar corresponds +# to a SNP and is coloured by its corresponding Lig_outcome. +#============================ + +#=================== +# Data for plots +#=================== + +#%%%%%%%%%%%%%%%%%%%%%%%% +# REASSIGNMENT +df = my_df +#%%%%%%%%%%%%%%%%%%%%%%%% + +rm(my_df) + +# sanity checks +upos = unique(my_df$Position) + +# should be a factor +is.factor(df$Lig_outcome) +#TRUE + +table(df$Lig_outcome) + +# should be -1 and 1: may not be in this case because you have filtered the data +# FIXME: normalisation before or after filtering? +min(df$ratioPredAff) # +max(df$ratioPredAff) # + +# sanity checks +tapply(df$ratioPredAff, df$Lig_outcome, min) +tapply(df$ratioPredAff, df$Lig_outcome, max) + +#****************** +# generate plot +#****************** + +# set output dir for plots +getwd() +setwd("~/git/Data/pyrazinamide/output/plots") +getwd() + +my_title = "Ligand affinity" + +# axis label size +my_xaxls = 13 +my_yaxls = 15 + +# axes text size +my_xaxts = 15 +my_yaxts = 15 + +# no ordering of x-axis +g = ggplot(df, aes(factor(Position, ordered = T))) +g + + geom_bar(aes(fill = Lig_outcome), colour = "grey") + + theme( axis.text.x = element_text(size = my_xaxls + , angle = 90 + , hjust = 1 + , vjust = 0.4) + , axis.text.y = element_text(size = my_yaxls + , angle = 0 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = my_xaxts) + , axis.title.y = element_text(size = my_yaxts ) ) + + labs(title = my_title + , x = "Position" + , y = "Frequency") + +# for sanity and good practice +rm(df) +#======================= end of plot +# axis colours labels +# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors +# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_PS.R new file mode 100644 index 0000000..169bdaf --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_PS.R @@ -0,0 +1,149 @@ +getwd() +setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") +getwd() + +######################################################################## +# Installing and loading required packages and functions # +######################################################################## + +source("../Header_TT.R") + +######################################################################## +# Read file: call script for combining df for PS # +######################################################################## + +source("../combining_two_df.R") + +#---------------------- PAY ATTENTION +# the above changes the working dir +#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" +#---------------------- PAY ATTENTION + +#========================== +# This will return: + +# df with NA: +# merged_df2 +# merged_df3 + +# df without NA: +# merged_df2_comp +# merged_df3_comp +#========================== + +########################### +# Data for DUET plots +# you need merged_df3 +# or +# merged_df3_comp +# since these have unique SNPs +# I prefer to use the merged_df3 +# because using the _comp dataset means +# we lose some muts and at this level, we should use +# as much info as available +########################### + +# uncomment as necessary +#<<<<<<<<<<<<<<<<<<<<<<<<< +# REASSIGNMENT +my_df = merged_df3 +#my_df = merged_df3_comp +#<<<<<<<<<<<<<<<<<<<<<<<<< + +# delete variables not required +rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) + +# quick checks +colnames(my_df) +str(my_df) + +# Ensure correct data type in columns to plot: need to be factor +# sanity check +is.factor(my_df$DUET_outcome) +my_df$DUET_outcome = as.factor(my_df$DUET_outcome) +is.factor(my_df$DUET_outcome) +#[1] TRUE + +######################################################################## +# end of data extraction and cleaning for plots # +######################################################################## + +#========================== +# Plot 2: Barplot with scores (unordered) +# corresponds to DUET_outcome +# Stacked Barplot with colours: DUET_outcome @ position coloured by +# DUET outcome. This is a barplot where each bar corresponds +# to a SNP and is coloured by its corresponding DUET_outcome +#============================ + +#=================== +# Data for plots +#=================== + +#<<<<<<<<<<<<<<<<<<<<<<<< +# REASSIGNMENT +df = my_df +#<<<<<<<<<<<<<<<<<<<<<<<<< + +rm(my_df) + +# sanity checks +upos = unique(df$Position) + +# should be a factor +is.factor(my_df$DUET_outcome) +#[1] TRUE + +table(my_df$DUET_outcome) + +# should be -1 and 1 +min(df$ratioDUET) +max(df$ratioDUET) + +tapply(df$ratioDUET, df$DUET_outcome, min) +tapply(df$ratioDUET, df$DUET_outcome, max) + +#****************** +# generate plot +#****************** + +# set output dir for plots +getwd() +setwd("~/git/Data/pyrazinamide/output/plots") +getwd() + +my_title = "Protein stability (DUET)" + +# axis label size +my_xaxls = 13 +my_yaxls = 15 + +# axes text size +my_xaxts = 15 +my_yaxts = 15 + +# no ordering of x-axis +g = ggplot(df, aes(factor(Position, ordered = T))) +g + + geom_bar(aes(fill = DUET_outcome), colour = "grey") + + + theme( axis.text.x = element_text(size = my_xaxls + , angle = 90 + , hjust = 1 + , vjust = 0.4) + , axis.text.y = element_text(size = my_yaxls + , angle = 0 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = my_xaxts) + , axis.title.y = element_text(size = my_yaxts ) ) + + labs(title = my_title + , x = "Position" + , y = "Frequency") + +# for sanity and good practice +rm(df) +#======================= end of plot +# axis colours labels +# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors +# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_LIG.R b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_LIG.R new file mode 100644 index 0000000..a5d9361 --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_LIG.R @@ -0,0 +1,202 @@ +getwd() +setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") +getwd() + +######################################################################## +# Installing and loading required packages and functions # +######################################################################## + +source("../Header_TT.R") +source("../barplot_colour_function.R") + +######################################################################## +# Read file: call script for combining df for lig # +######################################################################## + +source("../combining_two_df_lig.R") + +#---------------------- PAY ATTENTION +# the above changes the working dir +#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" +#---------------------- PAY ATTENTION + +#========================== +# This will return: + +# df with NA: +# merged_df2 +# merged_df3 + +# df without NA: +# merged_df2_comp +# merged_df3_comp +#=========================== + +########################### +# Data for Lig plots +# you need merged_df3 +# or +# merged_df3_comp +# since these have unique SNPs +# I prefer to use the merged_df3 +# because using the _comp dataset means +# we lose some muts and at this level, we should use +# as much info as available +########################### + +# uncomment as necessary +#<<<<<<<<<<<<<<<<<<<<<<<<< +# REASSIGNMENT +my_df = merged_df3 +#my_df = merged_df3_comp +#<<<<<<<<<<<<<<<<<<<<<<<<< + +# delete variables not required +rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) + +# quick checks +colnames(my_df) +str(my_df) + +# Ensure correct data type in columns to plot: need to be factor +# sanity check +is.factor(my_df$Lig_outcome) +my_df$Lig_outcome = as.factor(my_df$Ligoutcome) +is.factor(my_df$Lig_outcome) +#[1] TRUE + +############################# +# Extra sanity check: +# for mcsm_lig ONLY +# Dis_lig_Ang should be <10 +############################# + +if (max(my_df$Dis_lig_Ang) < 10){ + print ("Sanity check passed: lig data is <10Ang") +}else{ + print ("Error: data should be filtered to be within 10Ang") +} +######################################################################## +# end of data extraction and cleaning for plots # +######################################################################## + +#========================== +# Plot: Barplot with scores (unordered) +# corresponds to Lig_outcome +# Stacked Barplot with colours: Lig_outcome @ position coloured by +# stability scores. This is a barplot where each bar corresponds +# to a SNP and is coloured by its corresponding Lig stability value. +# Normalised values (range between -1 and 1 ) to aid visualisation +# NOTE: since barplot plots discrete values, colour = score, so number of +# colours will be equal to the no. of unique normalised scores +# rather than a continuous scale +# will require generating the colour scale separately. +#============================ + +#=================== +# Data for plots +#=================== + +#<<<<<<<<<<<<<<<<<<<<<<<< +# REASSIGNMENT +df = my_df +#<<<<<<<<<<<<<<<<<<<<<<<<< + +rm(my_df) + +# sanity checks +table(df$Lig_outcome) + +# should be -1 and 1: may not be in this case because you have filtered the data +# FIXME: normalisation before or after filtering? +min(df$ratioPredAff) # +max(df$ratioPredAff) # + +# sanity checks +# very important!!!! +tapply(df$ratioPredAff, df$Lig_outcome, min) + +tapply(df$ratioPredAff, df$Lig_outcome, max) + + +#****************** +# generate plot +#****************** + +# set output dir for plots +getwd() +setwd("~/git/Data/pyrazinamide/output/plots") +getwd() + +# My colour FUNCTION: based on group and subgroup +# in my case; +# df = df +# group = Lig_outcome +# subgroup = normalised score i.e ratioPredAff + +# Prepare data: round off ratioLig scores +# round off to 3 significant digits: +# 165 if no rounding is performed: used to generate the originalgraph +# 156 if rounded to 3 places +# FIXME: check if reducing precision creates any ML prob + +# check unique values in normalised data +u = unique(df$ratioPredAff) + +# <<<<< ------------------------------------------- +# Run this section if rounding is to be used +# specify number for rounding +n = 3 +df$ratioLigR = round(df$ratioPredAff, n) +u = unique(df$ratioLigR) # 156 +# create an extra column called group which contains the "gp name and score" +# so colours can be generated for each unique values in this column +my_grp = df$ratioLigR +df$group <- paste0(df$Lig_outcome, "_", my_grp, sep = "") + +# else +# uncomment the below if rounding is not required + +#my_grp = df$ratioLig +#df$group <- paste0(df$Lig_outcome, "_", my_grp, sep = "") + +# <<<<< ----------------------------------------------- + +# Call the function to create the palette based on the group defined above +colours <- ColourPalleteMulti(df, "Lig_outcome", "my_grp") +my_title = "Ligand affinity" + +# axis label size +my_xaxls = 13 +my_yaxls = 15 + +# axes text size +my_xaxts = 15 +my_yaxts = 15 + +# no ordering of x-axis +g = ggplot(df, aes(factor(Position, ordered = T))) +g + + geom_bar(aes(fill = group), colour = "grey") + + scale_fill_manual( values = colours + , guide = 'none') + + theme( axis.text.x = element_text(size = my_xaxls + , angle = 90 + , hjust = 1 + , vjust = 0.4) + , axis.text.y = element_text(size = my_yaxls + , angle = 0 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = my_xaxts) + , axis.title.y = element_text(size = my_yaxts ) ) + + labs(title = my_title + , x = "Position" + , y = "Frequency") + +# for sanity and good practice +rm(df) +#======================= end of plot +# axis colours labels +# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors +# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_PS.R new file mode 100644 index 0000000..8828e90 --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_PS.R @@ -0,0 +1,192 @@ +getwd() +setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") +getwd() + +######################################################################## +# Installing and loading required packages and functions # +######################################################################## + +source("../Header_TT.R") +source("../barplot_colour_function.R") + +######################################################################## +# Read file: call script for combining df for PS # +######################################################################## + +source("../combining_two_df.R") + +#---------------------- PAY ATTENTION +# the above changes the working dir +#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" +#---------------------- PAY ATTENTION + +#========================== +# This will return: + +# df with NA: +# merged_df2 +# merged_df3 + +# df without NA: +# merged_df2_comp +# merged_df3_comp +#=========================== + +########################### +# Data for DUET plots +# you need merged_df3 +# or +# merged_df3_comp +# since these have unique SNPs +# I prefer to use the merged_df3 +# because using the _comp dataset means +# we lose some muts and at this level, we should use +# as much info as available +########################### + +# uncomment as necessary +#<<<<<<<<<<<<<<<<<<<<<<<<< +# REASSIGNMENT +my_df = merged_df3 +#my_df = merged_df3_comp +#<<<<<<<<<<<<<<<<<<<<<<<<< + +# delete variables not required +rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) + +# quick checks +colnames(my_df) +str(my_df) + +# Ensure correct data type in columns to plot: need to be factor +# sanity check +is.factor(my_df$DUET_outcome) +my_df$DUET_outcome = as.factor(my_df$DUET_outcome) +is.factor(my_df$DUET_outcome) +#[1] TRUE + +######################################################################## +# end of data extraction and cleaning for plots # +######################################################################## + +#========================== +# Barplot with scores (unordered) +# corresponds to DUET_outcome +# Stacked Barplot with colours: DUET_outcome @ position coloured by +# stability scores. This is a barplot where each bar corresponds +# to a SNP and is coloured by its corresponding DUET stability value. +# Normalised values (range between -1 and 1 ) to aid visualisation +# NOTE: since barplot plots discrete values, colour = score, so number of +# colours will be equal to the no. of unique normalised scores +# rather than a continuous scale +# will require generating the colour scale separately. +#============================ + +#=================== +# Data for plots +#=================== + +#<<<<<<<<<<<<<<<<<<<<<<<< +# REASSIGNMENT +df = my_df +#<<<<<<<<<<<<<<<<<<<<<<<<< + +rm(my_df) + +# sanity checks +upos = unique(df$Position) + +# should be a factor +is.factor(my_df$DUET_outcome) +#[1] TRUE + +table(df$DUET_outcome) + +# should be -1 and 1 +min(df$ratioDUET) +max(df$ratioDUET) + +tapply(df$ratioDUET, df$DUET_outcome, min) +tapply(df$ratioDUET, df$DUET_outcome, max) + +#****************** +# generate plot +#****************** + +# set output dir for plots +getwd() +setwd("~/git/Data/pyrazinamide/output/plots") +getwd() + +# My colour FUNCTION: based on group and subgroup +# in my case; +# df = df +# group = DUET_outcome +# subgroup = normalised score i.e ratioDUET + +# Prepare data: round off ratioDUET scores +# round off to 3 significant digits: +# 323 if no rounding is performed: used to generate the original graph +# 287 if rounded to 3 places +# FIXME: check if reducing precicion creates any ML prob + +# check unique values in normalised data +u = unique(df$ratioDUET) + +# <<<<< ------------------------------------------- +# Run this section if rounding is to be used +# specify number for rounding +n = 3 +df$ratioDUETR = round(df$ratioDUET, n) +u = unique(df$ratioDUETR) +# create an extra column called group which contains the "gp name and score" +# so colours can be generated for each unique values in this column +my_grp = df$ratioDUETR +df$group <- paste0(df$DUET_outcome, "_", my_grp, sep = "") + +# else +# uncomment the below if rounding is not required + +#my_grp = df$ratioDUET +#df$group <- paste0(df$DUET_outcome, "_", my_grp, sep = "") + +# <<<<< ----------------------------------------------- + +# Call the function to create the palette based on the group defined above +colours <- ColourPalleteMulti(df, "DUET_outcome", "my_grp") +my_title = "Protein stability (DUET)" + +# axis label size +my_xaxls = 13 +my_yaxls = 15 + +# axes text size +my_xaxts = 15 +my_yaxts = 15 + +# no ordering of x-axis +g = ggplot(df, aes(factor(Position, ordered = T))) +g + + geom_bar(aes(fill = group), colour = "grey") + + scale_fill_manual( values = colours + , guide = 'none') + + theme( axis.text.x = element_text(size = my_xaxls + , angle = 90 + , hjust = 1 + , vjust = 0.4) + , axis.text.y = element_text(size = my_yaxls + , angle = 0 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = my_xaxts) + , axis.title.y = element_text(size = my_yaxts ) ) + + labs(title = my_title + , x = "Position" + , y = "Frequency") + +# for sanity and good practice +rm(df) +#======================= end of plot +# axis colours labels +# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors +# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_LIG.R b/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_LIG.R new file mode 100644 index 0000000..c4826d3 --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_LIG.R @@ -0,0 +1,215 @@ +getwd() +setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") +getwd() + +######################################################################## +# Installing and loading required packages # +######################################################################## + +source("../Header_TT.R") + +#require(data.table) +#require(dplyr) + +######################################################################## +# Read file: call script for combining df for lig # +######################################################################## + +source("../combining_two_df_lig.R") + +#---------------------- PAY ATTENTION +# the above changes the working dir +#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" +#---------------------- PAY ATTENTION + +#========================== +# This will return: + +# df with NA: +# merged_df2 +# merged_df3 + +# df without NA: +# merged_df2_comp +# merged_df3_comp +#=========================== + +########################### +# Data for Lig plots +# you need merged_df3 +# or +# merged_df3_comp +# since these have unique SNPs +# I prefer to use the merged_df3 +# because using the _comp dataset means +# we lose some muts and at this level, we should use +# as much info as available +########################### + +# uncomment as necessary +#<<<<<<<<<<<<<<<<<<<<<<<<< +# REASSIGNMENT +my_df = merged_df3 +#my_df = merged_df3_comp +#<<<<<<<<<<<<<<<<<<<<<<<<< + +# delete variables not required +rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) + +# quick checks +colnames(my_df) +str(my_df) + +# Ensure correct data type in columns to plot: need to be factor +# sanity check +is.factor(my_df$Lig_outcome) +my_df$Lig_outcome = as.factor(my_df$lig_outcome) +is.factor(my_df$Lig_outcome) +#[1] TRUE + +############################# +# Extra sanity check: +# for mcsm_lig ONLY +# Dis_lig_Ang should be <10 +############################# + +if (max(my_df$Dis_lig_Ang) < 10){ + print ("Sanity check passed: lig data is <10Ang") +}else{ + print ("Error: data should be filtered to be within 10Ang") +} + +######################################################################## +# end of data extraction and cleaning for plots # +######################################################################## + +#=========================== +# Plot: Basic barplots +#=========================== + +#=================== +# Data for plots +#=================== + +#<<<<<<<<<<<<<<<<<<<<<<<<< +# REASSIGNMENT +df = my_df +#<<<<<<<<<<<<<<<<<<<<<<<<< +rm(my_df) + +# sanity checks +str(df) + +if (identical(df$Position, df$position)){ + print("Sanity check passed: Columns 'Position' and 'position' are identical") +} else{ + print("Error!: Check column names and info contained") +} + +#**************** +# generate plot: No of stabilising and destabilsing muts +#**************** +# set output dir for plots +getwd() +setwd("~/git/Data/pyrazinamide/output/plots") +getwd() + +svg('basic_barplots_LIG.svg') + +my_ats = 25 # axis text size +my_als = 22 # axis label size + +# uncomment as necessary for either directly outputting results or +# printing on the screen +g = ggplot(df, aes(x = Lig_outcome)) +#prinfFile = g + geom_bar( + g + geom_bar( + aes(fill = Lig_outcome) + , show.legend = TRUE +) + geom_label( + stat = "count" + , aes(label = ..count..) + , color = "black" + , show.legend = FALSE + , size = 10) + theme( + axis.text.x = element_blank() + , axis.title.x = element_blank() + , axis.title.y = element_text(size=my_als) + , axis.text.y = element_text(size = my_ats) + , legend.position = c(0.73,0.8) + , legend.text = element_text(size=my_als-2) + , legend.title = element_text(size=my_als) + , plot.title = element_blank() + ) + labs( + title = "" + , y = "Number of SNPs" + #, fill='Ligand Outcome' + ) + scale_fill_discrete(name = "Ligand Outcome" + , labels = c("Destabilising", "Stabilising")) +print(prinfFile) +dev.off() + +#**************** +# generate plot: No of positions +#**************** +#get freq count of positions so you can subset freq<1 +#require(data.table) +setDT(df)[, pos_count := .N, by = .(Position)] #169, 36 + +head(df$pos_count) +table(df$pos_count) +# this is cummulative +#1 2 3 4 5 6 +#5 24 36 56 30 18 + +# use group by on this +snpsBYpos_df <- df %>% + group_by(Position) %>% + summarize(snpsBYpos = mean(pos_count)) + +table(snpsBYpos_df$snpsBYpos) +#1 2 3 4 5 6 +#5 12 12 14 6 3 +# this is what will get plotted + +svg('position_count_LIG.svg') + +my_ats = 25 # axis text size +my_als = 22 # axis label size + +g = ggplot(snpsBYpos_df, aes(x = snpsBYpos)) +prinfFile = g + geom_bar( + #g + geom_bar( + aes (alpha = 0.5) + , show.legend = FALSE +) + + geom_label( + stat = "count", aes(label = ..count..) + , color = "black" + , size = 10 + ) + + theme( + axis.text.x = element_text( + size = my_ats + , angle = 0 + ) + , axis.text.y = element_text( + size = my_ats + , angle = 0 + , hjust = 1 + ) + , axis.title.x = element_text(size = my_als) + , axis.title.y = element_text(size = my_als) + , plot.title = element_blank() + ) + + labs( + x = "Number of SNPs" + , y = "Number of Sites" + ) +print(prinfFile) +dev.off() +######################################################################## +# end of Lig barplots # +######################################################################## + + diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_PS.R new file mode 100644 index 0000000..51a2812 --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_PS.R @@ -0,0 +1,211 @@ +getwd() +setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") +getwd() + +######################################################################## +# Installing and loading required packages and functions # +######################################################################## + +source("../Header_TT.R") + +######################################################################## +# Read file: call script for combining df for PS # +######################################################################## + +source("../combining_two_df.R") + +#---------------------- PAY ATTENTION +# the above changes the working dir +#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" +#---------------------- PAY ATTENTION + +#========================== +# This will return: + +# df with NA: +# merged_df2 +# merged_df3 + +# df without NA: +# merged_df2_comp +# merged_df3_comp +#========================== + +########################### +# Data for DUET plots +# you need merged_df3 +# or +# merged_df3_comp +# since these have unique SNPs +# I prefer to use the merged_df3 +# because using the _comp dataset means +# we lose some muts and at this level, we should use +# as much info as available +########################### + +# uncomment as necessary +#<<<<<<<<<<<<<<<<<<<<<<<<< +# REASSIGNMENT +my_df = merged_df3 +#my_df = merged_df3_comp +#<<<<<<<<<<<<<<<<<<<<<<<<< + +# delete variables not required +rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) + +# quick checks +colnames(my_df) +str(my_df) + +# Ensure correct data type in columns to plot: need to be factor +# sanity check +is.factor(my_df$DUET_outcome) +my_df$DUET_outcome = as.factor(my_df$DUET_outcome) +is.factor(my_df$DUET_outcome) +#[1] TRUE + +######################################################################## +# end of data extraction and cleaning for plots # +######################################################################## + +#=========================== +# Plot: Basic barplots +#=========================== + +#=================== +# Data for plots +#=================== + +#<<<<<<<<<<<<<<<<<<<<<<<<< +# REASSIGNMENT +df = my_df +#<<<<<<<<<<<<<<<<<<<<<<<<< + +rm(my_df) + +# sanity checks +str(df) + +if (identical(df$Position, df$position)){ + print("Sanity check passed: Columns 'Position' and 'position' are identical") +} else{ + print("Error!: Check column names and info contained") + } + +#**************** +# generate plot: No of stabilising and destabilsing muts +#**************** +# set output dir for plots +getwd() +setwd("~/git/Data/pyrazinamide/output/plots") +getwd() + +svg('basic_barplots_DUET.svg') + +my_ats = 25 # axis text size +my_als = 22 # axis label size + +theme_set(theme_grey()) + +# uncomment as necessary for either directly outputting results or +# printing on the screen +g = ggplot(df, aes(x = DUET_outcome)) +prinfFile = g + geom_bar( +#g + geom_bar( + aes(fill = DUET_outcome) + , show.legend = TRUE + ) + geom_label( + stat = "count" + , aes(label = ..count..) + , color = "black" + , show.legend = FALSE + , size = 10) + theme( + axis.text.x = element_blank() + , axis.title.x = element_blank() + , axis.title.y = element_text(size=my_als) + , axis.text.y = element_text(size = my_ats) + , legend.position = c(0.73,0.8) + , legend.text = element_text(size=my_als-2) + , legend.title = element_text(size=my_als) + , plot.title = element_blank() + ) + labs( + title = "" + , y = "Number of SNPs" + #, fill='DUET Outcome' + ) + scale_fill_discrete(name = "DUET Outcome" + , labels = c("Destabilising", "Stabilising")) + +print(prinfFile) +dev.off() + +#**************** +# generate plot: No of positions +#**************** +#get freq count of positions so you can subset freq<1 +#setDT(df)[, .(Freq := .N), by = .(Position)] #189, 36 + +setDT(df)[, pos_count := .N, by = .(Position)] #335, 36 +table(df$pos_count) +# this is cummulative +#1 2 3 4 5 6 +#34 76 63 104 40 18 + +# use group by on this +snpsBYpos_df <- df %>% + group_by(Position) %>% + summarize(snpsBYpos = mean(pos_count)) + +table(snpsBYpos_df$snpsBYpos) +#1 2 3 4 5 6 +#34 38 21 26 8 3 + +foo = select(df, Mutationinformation + , WildPos + , wild_type + , mutant_type + , mutation_info + , position + , pos_count) #335, 5 + +getwd() +write.csv(foo, "../Data/pos_count_freq.csv") + +svg('position_count_DUET.svg') +my_ats = 25 # axis text size +my_als = 22 # axis label size + +g = ggplot(snpsBYpos_df, aes(x = snpsBYpos)) +prinfFile = g + geom_bar( +#g + geom_bar( + aes (alpha = 0.5) + , show.legend = FALSE + ) + + geom_label( + stat = "count", aes(label = ..count..) + , color = "black" + , size = 10 + ) + + theme( + axis.text.x = element_text( + size = my_ats + , angle = 0 + ) + , axis.text.y = element_text( + size = my_ats + , angle = 0 + , hjust = 1 + ) + , axis.title.x = element_text(size = my_als) + , axis.title.y = element_text(size = my_als) + , plot.title = element_blank() + ) + + labs( + x = "Number of SNPs" + , y = "Number of Sites" + ) +print(prinfFile) +dev.off() +######################################################################## +# end of DUET barplots # +######################################################################## + diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_PS.R new file mode 100644 index 0000000..f3a507e --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_PS.R @@ -0,0 +1,175 @@ +getwd() +setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") +getwd() + +######################################################################## +# Installing and loading required packages and functions # +######################################################################## + +source("../Header_TT.R") + +#source("barplot_colour_function.R") + +######################################################################## +# Read file: call script for combining df for PS # +######################################################################## + +source("../combining_two_df.R") + +#---------------------- PAY ATTENTION +# the above changes the working dir +#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" +#---------------------- PAY ATTENTION + +#========================== +# This will return: + +# df with NA: +# merged_df2 +# merged_df3 + +# df without NA: +# merged_df2_comp +# merged_df3_comp +#========================== + +########################### +# Data for PS Corr plots +# you need merged_df3_comp +# since these are matched +# to allow pairwise corr +########################### + +#<<<<<<<<<<<<<<<<<<<<<<<<< +# REASSIGNMENT +my_df = merged_df3_comp +#<<<<<<<<<<<<<<<<<<<<<<<<< + +# delete variables not required +rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) + +# quick checks +colnames(my_df) +str(my_df) + +######################################################################## +# end of data extraction and cleaning for plots # +######################################################################## + +#=========================== +# Plot: Correlation plots +#=========================== + +#=================== +# Data for plots +#=================== + +#<<<<<<<<<<<<<<<<<<<<<<<< +# REASSIGNMENT +df = my_df +#<<<<<<<<<<<<<<<<<<<<<<<<< + +rm(my_df) + +# sanity checks +str(df) + +table(df$DUET_outcome) + +# unique positions +length(unique(df$Position)) #{RESULT: unique positions for comp data} + + +# subset data to generate pairwise correlations +corr_data = df[, c("ratioDUET" +# , "ratioPredAff" +# , "DUETStability_Kcalpermol" +# , "PredAffLog" +# , "OR" + , "logor" +# , "pvalue" + , "neglog10pvalue" + , "AF" + , "DUET_outcome" +# , "Lig_outcome" + , "pyrazinamide" + )] +dim(corr_data) +rm(df) + +# assign nice colnames (for display) +my_corr_colnames = c("DUET" +# , "Ligand Affinity" +# , "DUET_raw" +# , "Lig_raw" +# , "OR" + , "Log(Odds Ratio)" +# , "P-value" + , "-LogP" + , "Allele Frequency" + , "DUET_outcome" +# , "Lig_outcome" + , "pyrazinamide") + +# sanity check +if (length(my_corr_colnames) == length(corr_data)){ + print("Sanity check passed: corr_data and corr_names match in length") +}else{ + print("Error: length mismatch!") +} + +colnames(corr_data) +colnames(corr_data) <- my_corr_colnames +colnames(corr_data) + +############### +# PLOTS: corr +# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs +############### +#default pairs plot +start = 1 +end = which(colnames(corr_data) == "pyrazinamide"); end # should be the last column +offset = 1 + +my_corr = corr_data[start:(end-offset)] +head(my_corr) + +#my_cols = c("#f8766d", "#00bfc4") +# deep blue :#007d85 +# deep red: #ae301e + +#========== +# psych: ionformative since it draws the ellipsoid +# https://jamesmarquezportfolio.com/correlation_matrices_in_r.html +# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs +#========== + +# set output dir for plots +getwd() +setwd("~/git/Data/pyrazinamide/output/plots" +getwd() + +svg('DUET_corr.svg', width = 15, height = 15) +printFile = pairs.panels(my_corr[1:4] + , method = "spearman" # correlation method + , hist.col = "grey" ##00AFBB + , density = TRUE # show density plots + , ellipses = F # show correlation ellipses + , stars = T + , rug = F + , breaks = "Sturges" + , show.points = T + , bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr$DUET_outcome))] + , pch = 21 + , jitter = T + #, alpha = .05 + #, points(pch = 19, col = c("#f8766d", "#00bfc4")) + , cex = 3 + , cex.axis = 2.5 + , cex.labels = 3 + , cex.cor = 1 + , smooth = F +) + +print(printFile) +dev.off() diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_lig.R b/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_lig.R new file mode 100644 index 0000000..4e05d41 --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_lig.R @@ -0,0 +1,187 @@ +getwd() +setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") +getwd() + +######################################################################## +# Installing and loading required packages # +######################################################################## + +source("../Header_TT.R") + +#source("barplot_colour_function.R") + +######################################################################## +# Read file: call script for combining df for lig # +######################################################################## + +source("../combining_two_df_lig.R") + +#---------------------- PAY ATTENTION +# the above changes the working dir +#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" +#---------------------- PAY ATTENTION + +#========================== +# This will return: + +# df with NA: +# merged_df2 +# merged_df3 + +# df without NA: +# merged_df2_comp +# merged_df3_comp +#=========================== + +########################### +# Data for Lig Corr plots +# you need merged_df3_comp +# since these are matched +# to allow pairwise corr +########################### + +#<<<<<<<<<<<<<<<<<<<<<<<<< +# REASSIGNMENT +my_df = merged_df3_comp +#<<<<<<<<<<<<<<<<<<<<<<<<< + +# delete variables not required +rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) + +# quick checks +colnames(my_df) +str(my_df) + +############################# +# Extra sanity check: +# for mcsm_lig ONLY +# Dis_lig_Ang should be <10 +############################# + +if (max(my_df$Dis_lig_Ang) < 10){ + print ("Sanity check passed: lig data is <10Ang") +}else{ + print ("Error: data should be filtered to be within 10Ang") +} + +######################################################################## +# end of data extraction and cleaning for plots # +######################################################################## + +#=========================== +# Plot: Correlation plots +#=========================== + +#=================== +# Data for plots +#=================== + +#<<<<<<<<<<<<<<<<<<<<<<<< +# REASSIGNMENT +df = my_df +#<<<<<<<<<<<<<<<<<<<<<<<<< + +rm(my_df) + +# sanity checks +str(df) + +table(df$Lig_outcome) + +# unique positions +length(unique(df$Position)) #{RESULT: unique positions for comp data} + +# subset data to generate pairwise correlations +corr_data = df[, c(#"ratioDUET", + "ratioPredAff" +# , "DUETStability_Kcalpermol" +# , "PredAffLog" +# , "OR" + , "logor" +# , "pvalue" + , "neglog10pvalue" + , "AF" +# , "DUET_outcome" + , "Lig_outcome" + , "pyrazinamide" + )] +dim(corr_data) +rm(df) + +# assign nice colnames (for display) +my_corr_colnames = c(#"DUET", + "Ligand Affinity" +# ,"DUET_raw" +# , "Lig_raw" +# , "OR" + , "Log(Odds Ratio)" +# , "P-value" + , "-LogP" + , "Allele Frequency" +# , "DUET_outcome" + , "Lig_outcome" + , "pyrazinamide") + +# sanity check +if (length(my_corr_colnames) == length(corr_data)){ + print("Sanity check passed: corr_data and corr_names match in length") +}else{ + print("Error: length mismatch!") +} + +colnames(corr_data) +colnames(corr_data) <- my_corr_colnames +colnames(corr_data) + +############### +# PLOTS: corr +# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs +############### + +# default pairs plot +start = 1 +end = which(colnames(corr_data) == "pyrazinamide"); end # should be the last column +offset = 1 + +my_corr = corr_data[start:(end-offset)] +head(my_corr) + +#my_cols = c("#f8766d", "#00bfc4") +# deep blue :#007d85 +# deep red: #ae301e + +#========== +# psych: ionformative since it draws the ellipsoid +# https://jamesmarquezportfolio.com/correlation_matrices_in_r.html +# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs +#========== + +# set output dir for plots +getwd() +setwd("~/git/Data/pyrazinamide/output/plots" +getwd() + +svg('Lig_corr.svg', width = 15, height = 15) +printFile = pairs.panels(my_corr[1:4] + , method = "spearman" # correlation method + , hist.col = "grey" ##00AFBB + , density = TRUE # show density plots + , ellipses = F # show correlation ellipses + , stars = T + , rug = F + , breaks = "Sturges" + , show.points = T + , bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr$Lig_outcome))] + , pch = 21 + , jitter = T +# , alpha = .05 +# , points(pch = 19, col = c("#f8766d", "#00bfc4")) + , cex = 3 + , cex.axis = 2.5 + , cex.labels = 3 + , cex.cor = 1 + , smooth = F +) +print(printFile) +dev.off() + diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_basic_barplot.R b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_basic_barplot.R new file mode 100644 index 0000000..1f868e4 --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_basic_barplot.R @@ -0,0 +1,227 @@ +getwd() +setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") +getwd() + +######################################################################## +# Installing and loading required packages # +######################################################################## + +source("../Header_TT.R") +#source("barplot_colour_function.R") + +require(data.table) + +######################################################################## +# Read file: call script for combining df # +######################################################################## + +source("../combining_two_df.R") + +#---------------------- PAY ATTENTION +# the above changes the working dir +#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" +#---------------------- PAY ATTENTION + +#========================== +# This will return: + +# df with NA: +# merged_df2 +# merged_df3 + +# df without NA: +# merged_df2_comp +# merged_df3_comp +#========================== + +########################### +# Data for plots +# you need merged_df2, comprehensive one +# since this has one-many relationship +# i.e the same SNP can belong to multiple lineages +########################### + +# uncomment as necessary +#<<<<<<<<<<<<<<<<<<<<<<<<< +# REASSIGNMENT +my_df = merged_df2 +#my_df = merged_df2_comp +#<<<<<<<<<<<<<<<<<<<<<<<<< + +# delete variables not required +rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) + +# quick checks +colnames(my_df) +str(my_df) + +# Ensure correct data type in columns to plot: need to be factor +is.factor(my_df$lineage) +my_df$lineage = as.factor(my_df$lineage) +is.factor(my_df$lineage) + +#========================== +# Plot: Lineage barplot +# x = lineage y = No. of samples +# col = Lineage +# fill = lineage +#============================ +table(my_df$lineage) + +# lineage1 lineage2 lineage3 lineage4 lineage5 lineage6 lineageBOV +#3 104 1293 264 1311 6 6 105 + +#=========================== +# Plot: Lineage Barplots +#=========================== + +#=================== +# Data for plots +#=================== + +#<<<<<<<<<<<<<<<<<<<<<<<<< +# REASSIGNMENT +df <- my_df +#<<<<<<<<<<<<<<<<<<<<<<<<< +rm(my_df) + +# get freq count of positions so you can subset freq<1 +#setDT(df)[, lineage_count := .N, by = .(lineage)] + +#****************** +# generate plot: barplot of mutation by lineage +#****************** +sel_lineages = c("lineage1" + , "lineage2" + , "lineage3" + , "lineage4") + +df_lin = subset(df, subset = lineage %in% sel_lineages ) + +#FIXME; add sanity check for numbers. +# Done this manually + +############################################################ + +######### +# Data for barplot: Lineage barplot +# to show total samples and number of unique mutations +# within each linege +########## + +# Create df with lineage inform & no. of unique mutations +# per lineage and total samples within lineage +# this is essentially barplot with two y axis + +bar = bar = as.data.frame(sel_lineages) #4, 1 +total_snps_u = NULL +total_samples = NULL + +for (i in sel_lineages){ + #print(i) + curr_total = length(unique(df$id)[df$lineage==i]) + total_samples = c(total_samples, curr_total) + print(total_samples) + + foo = df[df$lineage==i,] + print(paste0(i, "=======")) + print(length(unique(foo$Mutationinformation))) + curr_count = length(unique(foo$Mutationinformation)) + + total_snps_u = c(total_snps_u, curr_count) +} + +print(total_snps_u) +bar$num_snps_u = total_snps_u +bar$total_samples = total_samples +bar + +#***************** +# generate plot: lineage barplot with two y-axis +#https://stackoverflow.com/questions/13035295/overlay-bar-graphs-in-ggplot2 +#***************** + +bar$num_snps_u = y1 +bar$total_samples = y2 +sel_lineages = x + +to_plot = data.frame(x = x + , y1 = y1 + , y2 = y2) +to_plot + +melted = melt(to_plot, id = "x") +melted + +# set output dir for plots +getwd() +setwd("~/git/Data/pyrazinamide/output/plots") +getwd() + +svg('lineage_basic_barplot.svg') + +my_ats = 20 # axis text size +my_als = 22 # axis label size + +g = ggplot(melted + , aes(x = x + , y = value + , fill = variable) + ) + + +printFile = g + geom_bar( + +#g + geom_bar( + stat = "identity" + , position = position_stack(reverse = TRUE) + , alpha=.75 + , colour='grey75' + ) + theme( + axis.text.x = element_text( + size = my_ats +# , angle= 30 + ) + , axis.text.y = element_text(size = my_ats + #, angle = 30 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text( + size = my_als + , colour = 'black' + ) + , axis.title.y = element_text( + size = my_als + , colour = 'black' + ) + , legend.position = "top" + , legend.text = element_text(size = my_als) + + #) + geom_text( + ) + geom_label( + aes(label = value) + , size = 5 + , hjust = 0.5 + , vjust = 0.5 + , colour = 'black' + , show.legend = FALSE + #, check_overlap = TRUE + , position = position_stack(reverse = T) + #, position = (' + + ) + labs( + title = '' + , x = '' + , y = "Number" + , fill = 'Variable' + , colour = 'black' + ) + scale_fill_manual( + values = c('grey50', 'gray75') + , name='' + , labels=c('Mutations', 'Total Samples') + ) + scale_x_discrete( + breaks = c('lineage1', 'lineage2', 'lineage3', 'lineage4') + , labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4') + ) +print(printFile) +dev.off() diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_LIG.R b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_LIG.R new file mode 100644 index 0000000..ebbec76 --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_LIG.R @@ -0,0 +1,233 @@ +getwd() +setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") +getwd() + +######################################################################## +# Installing and loading required packages # +######################################################################## + +source("../Header_TT.R") +#source("barplot_colour_function.R") +#require(data.table) + +######################################################################## +# Read file: call script for combining df for Lig # +######################################################################## + +source("../combining_two_df_lig.R") + +#---------------------- PAY ATTENTION +# the above changes the working dir +#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" +#---------------------- PAY ATTENTION + +#========================== +# This will return: + +# df with NA: +# merged_df2 +# merged_df3 + +# df without NA: +# merged_df2_comp +# merged_df3_comp +#=========================== +########################### +# Data for plots +# you need merged_df2 or merged_df2_comp +# since this is one-many relationship +# i.e the same SNP can belong to multiple lineages +########################### + +# uncomment as necessary +#<<<<<<<<<<<<<<<<<<<<<<<<< +# REASSIGNMENT +my_df = merged_df2 +#my_df = merged_df2_comp +#<<<<<<<<<<<<<<<<<<<<<<<<< + +# delete variables not required +rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) + +# quick checks +colnames(my_df) +str(my_df) + +# Ensure correct data type in columns to plot: need to be factor +is.factor(my_df$lineage) +my_df$lineage = as.factor(my_df$lineage) +is.factor(my_df$lineage) + +table(my_df$mutation_info) + +############################# +# Extra sanity check: +# for mcsm_lig ONLY +# Dis_lig_Ang should be <10 +############################# + +if (max(my_df$Dis_lig_Ang) < 10){ + print ("Sanity check passed: lig data is <10Ang") +}else{ + print ("Error: data should be filtered to be within 10Ang") +} + +######################################################################## +# end of data extraction and cleaning for plots # +######################################################################## + +#========================== +# Plot: Lineage Distribution +# x = mcsm_values, y = dist +# fill = stability +#============================ + +#=================== +# Data for plots +#=================== + +# subset only lineages1-4 +sel_lineages = c("lineage1" + , "lineage2" + , "lineage3" + , "lineage4") + +# uncomment as necessary +df_lin = subset(my_df, subset = lineage %in% sel_lineages ) #2037 35 + +# refactor +df_lin$lineage = factor(df_lin$lineage) + +table(df_lin$lineage) #{RESULT: No of samples within lineage} +#lineage1 lineage2 lineage3 lineage4 +#78 961 195 803 + +# when merged_df2_comp is used +#lineage1 lineage2 lineage3 lineage4 +#77 955 194 770 + +length(unique(df_lin$Mutationinformation)) +#{Result: No. of unique mutations the 4 lineages contribute to} + +# sanity checks +r1 = 2:5 # when merged_df2 used: because there is missing lineages +if(sum(table(my_df$lineage)[r1]) == nrow(df_lin)) { + print ("sanity check passed: numbers match") +} else{ + print("Error!: check your numbers") +} + +#<<<<<<<<<<<<<<<<<<<<<<<<< +# REASSIGNMENT +df <- df_lin +#<<<<<<<<<<<<<<<<<<<<<<<<< + +rm(df_lin) + +#****************** +# generate distribution plot of lineages +#****************** +# basic: could improve this! +library(plotly) +library(ggridges) + +fooNames = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4') +names(fooNames) = c('lineage1', 'lineage2', 'lineage3', 'lineage4') + +g <- ggplot(df, aes(x = ratioPredAff)) + + geom_density(aes(fill = Lig_outcome) + , alpha = 0.5) + + facet_wrap( ~ lineage + , scales = "free" + , labeller = labeller(lineage = fooNames) ) + + coord_cartesian(xlim = c(-1, 1) +# , ylim = c(0, 6) +# , clip = "off" +) + ggtitle("Kernel Density estimates of Ligand affinity by lineage") + +ggplotly(g) + +# 2 : ggridges (good!) + +my_ats = 15 # axis text size +my_als = 20 # axis label size + +fooNames = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4') +names(fooNames) = c('lineage1', 'lineage2', 'lineage3', 'lineage4') + +# set output dir for plots +getwd() +setwd("~/git/Data/pyrazinamide/output/plots") +getwd() + +svg('lineage_dist_LIG.svg') + +printFile = ggplot( df, aes(x = ratioPredAff + , y = Lig_outcome) ) + + + geom_density_ridges_gradient( aes(fill = ..x..) + , scale = 3 + , size = 0.3 ) + + facet_wrap( ~lineage + , scales = "free" +# , switch = 'x' + , labeller = labeller(lineage = fooNames) ) + + coord_cartesian( xlim = c(-1, 1) +# , ylim = c(0, 6) +# , clip = "off" + ) + + + scale_fill_gradientn( colours = c("#f8766d", "white", "#00bfc4") + , name = "Ligand Affinity" ) + + theme( axis.text.x = element_text( size = my_ats + , angle = 90 + , hjust = 1 + , vjust = 0.4) +# , axis.text.y = element_text( size = my_ats +# , angle = 0 +# , hjust = 1 +# , vjust = 0) + , axis.text.y = element_blank() + , axis.title.x = element_blank() + , axis.title.y = element_blank() + , axis.ticks.y = element_blank() + , plot.title = element_blank() + , strip.text = element_text(size = my_als) + , legend.text = element_text(size = 10) + , legend.title = element_text(size = my_als) +# , legend.position = c(0.3, 0.8) +# , legend.key.height = unit(1, 'mm') + ) + +print(printFile) +dev.off() + +#=!=!=!=!=!=! +# COMMENT: When you look at all mutations, the lineage differences disappear... +# The pattern we are interested in is possibly only for dr_mutations +#=!=!=!=!=!=! + +#=================================================== + +# COMPARING DISTRIBUTIONS +head(df$lineage) +df$lineage = as.character(df$lineage) + +lin1 = df[df$lineage == "lineage1",]$ratioPredAff +lin2 = df[df$lineage == "lineage2",]$ratioPredAff +lin3 = df[df$lineage == "lineage3",]$ratioPredAff +lin4 = df[df$lineage == "lineage4",]$ratioPredAff + +# ks test +ks.test(lin1,lin2) +ks.test(lin1,lin3) +ks.test(lin1,lin4) + +ks.test(lin2,lin3) +ks.test(lin2,lin4) + +ks.test(lin3,lin4) + + + diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R new file mode 100644 index 0000000..cc7e15a --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R @@ -0,0 +1,212 @@ +getwd() +setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") # thinkpad +getwd() + +######################################################################## +# Installing and loading required packages # +######################################################################## + +source("../Header_TT.R") +#source("barplot_colour_function.R") +#require(data.table) + +######################################################################## +# Read file: call script for combining df for PS # +######################################################################## + +source("../combining_two_df.R") + +#---------------------- PAY ATTENTION +# the above changes the working dir +#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" +#---------------------- PAY ATTENTION + +#========================== +# This will return: + +# df with NA: +# merged_df2 +# merged_df3 + +# df without NA: +# merged_df2_comp +# merged_df3_comp +#=========================== + +########################### +# Data for plots +# you need merged_df2 or merged_df2_comp +# since this is one-many relationship +# i.e the same SNP can belong to multiple lineages +########################### + +# uncomment as necessary +#<<<<<<<<<<<<<<<<<<<<<<<<< +# REASSIGNMENT +my_df = merged_df2 +#my_df = merged_df2_comp +#<<<<<<<<<<<<<<<<<<<<<<<<< + +# delete variables not required +rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) + +# quick checks +colnames(my_df) +str(my_df) + +# Ensure correct data type in columns to plot: need to be factor +is.factor(my_df$lineage) +my_df$lineage = as.factor(my_df$lineage) +is.factor(my_df$lineage) + +table(my_df$mutation_info) + +######################################################################## +# end of data extraction and cleaning for plots # +######################################################################## + +#========================== +# Plot: Lineage Distribution +# x = mcsm_values, y = dist +# fill = stability +#============================ + +#=================== +# Data for plots +#=================== + +# subset only lineages1-4 +sel_lineages = c("lineage1" + , "lineage2" + , "lineage3" + , "lineage4") + +# uncomment as necessary +df_lin = subset(my_df, subset = lineage %in% sel_lineages ) + +# refactor +df_lin$lineage = factor(df_lin$lineage) + +table(df_lin$lineage) #{RESULT: No of samples within lineage} +#lineage1 lineage2 lineage3 lineage4 +#104 1293 264 1311 + +# when merged_df2_comp is used +#lineage1 lineage2 lineage3 lineage4 +#99 1275 263 1255 + +length(unique(df_lin$Mutationinformation)) +#{Result: No. of unique mutations the 4 lineages contribute to} + +# sanity checks +r1 = 2:5 # when merged_df2 used: because there is missing lineages +if(sum(table(my_df$lineage)[r1]) == nrow(df_lin)) { + print ("sanity check passed: numbers match") +} else{ + print("Error!: check your numbers") +} + +#<<<<<<<<<<<<<<<<<<<<<<<<< +# REASSIGNMENT +df <- df_lin +#<<<<<<<<<<<<<<<<<<<<<<<<< + +rm(df_lin) + +#****************** +# generate distribution plot of lineages +#****************** +# basic: could improve this! +library(plotly) +library(ggridges) + +g <- ggplot(df, aes(x = ratioDUET)) + + geom_density(aes(fill = DUET_outcome) + , alpha = 0.5) + facet_wrap(~ lineage, + scales = "free") + + ggtitle("Kernel Density estimates of Protein stability by lineage") + +ggplotly(g) + +# 2 : ggridges (good!) + +my_ats = 15 # axis text size +my_als = 20 # axis label size + +fooNames=c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4') +names(fooNames)=c('lineage1', 'lineage2', 'lineage3', 'lineage4') + +# set output dir for plots +getwd() +setwd("~/git/Data/pyrazinamide/output/plots") +getwd() + +svg('lineage_dist_PS.svg') + +printFile = ggplot( df, aes(x = ratioDUET + , y = DUET_outcome) )+ + + #printFile=geom_density_ridges_gradient( + geom_density_ridges_gradient( aes(fill = ..x..) + , scale = 3 + , size = 0.3 ) + + facet_wrap( ~lineage + , scales = "free" +# , switch = 'x' + , labeller = labeller(lineage = fooNames) ) + + coord_cartesian( xlim = c(-1, 1) +# , ylim = c(0, 6) +# , clip = "off" + ) + + scale_fill_gradientn( colours = c("#f8766d", "white", "#00bfc4") + , name = "DUET" ) + + theme( axis.text.x = element_text( size = my_ats + , angle = 90 + , hjust = 1 + , vjust = 0.4) +# , axis.text.y = element_text( size = my_ats +# , angle = 0 +# , hjust = 1 +# , vjust = 0) + , axis.text.y = element_blank() + , axis.title.x = element_blank() + , axis.title.y = element_blank() + , axis.ticks.y = element_blank() + , plot.title = element_blank() + , strip.text = element_text(size=my_als) + , legend.text = element_text(size=10) + , legend.title = element_text(size=my_als) +# , legend.position = c(0.3, 0.8) +# , legend.key.height = unit(1, 'mm') + ) + +print(printFile) +dev.off() + +#=!=!=!=!=!=! +# COMMENT: When you look at all mutations, the lineage differences disappear... +# The pattern we are interested in is possibly only for dr_mutations +#=!=!=!=!=!=! +#=================================================== + +# COMPARING DISTRIBUTIONS +head(df$lineage) +df$lineage = as.character(df$lineage) + +lin1 = df[df$lineage == "lineage1",]$ratioDUET +lin2 = df[df$lineage == "lineage2",]$ratioDUET +lin3 = df[df$lineage == "lineage3",]$ratioDUET +lin4 = df[df$lineage == "lineage4",]$ratioDUET + +# ks test +ks.test(lin1,lin2) +ks.test(lin1,lin3) +ks.test(lin1,lin4) + +ks.test(lin2,lin3) +ks.test(lin2,lin4) + +ks.test(lin3,lin4) + + + diff --git a/mcsm_analysis/pyrazinamide/scripts/read_pdb.R b/mcsm_analysis/pyrazinamide/scripts/read_pdb.R new file mode 100644 index 0000000..41ca884 --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/read_pdb.R @@ -0,0 +1,27 @@ +######################### +#3: Read complex pdb file +########################## +source("Header_TT.R") +# This script only reads the pdb file of your complex + +# read in pdb file complex1 +inDir = "~/git/Data/pyrazinamide/input/structure/" +inFile = paste0(inDir, "complex1_no_water.pdb") +complex1 = inFile + +#inFile2 = paste0(inDir, "complex2_no_water.pdb") +#complex2 = inFile2 + +# list of 8 +my_pdb = read.pdb(complex1 + , maxlines = -1 + , multi = FALSE + , rm.insert = FALSE + , rm.alt = TRUE + , ATOM.only = FALSE + , hex = FALSE + , verbose = TRUE) + +rm(inDir, inFile, complex1) +#====== end of script + diff --git a/mcsm_analysis/pyrazinamide/scripts/replaceBfactor_pdb.R b/mcsm_analysis/pyrazinamide/scripts/replaceBfactor_pdb.R new file mode 100644 index 0000000..658eec4 --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/replaceBfactor_pdb.R @@ -0,0 +1,386 @@ +getwd() +setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts") +getwd() + +######################################################################## +# Installing and loading required packages # +######################################################################## + +source("Header_TT.R") + +######################################################### +# TASK: replace B-factors in the pdb file with normalised values +# use the complex file with no water as mCSM lig was +# performed on this file. You can check it in the script: read_pdb file. +######################################################### + +########################### +# 2: Read file: average stability values +# or mcsm_normalised file, output of step 4 mcsm pipeline +########################### + +inDir = "~/git/Data/pyrazinamide/input/processed/" +inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile + +my_df <- read.csv(inFile +# , row.names = 1 +# , stringsAsFactors = F + , header = T) +str(my_df) + +#========================================================= +# Processing P1: Replacing B factor with mean ratioDUET scores +#========================================================= + +######################### +# Read complex pdb file +# form the R script +########################## + +source("read_pdb.R") # list of 8 + +# extract atom list into a variable +# since in the list this corresponds to data frame, variable will be a df +d = my_pdb[[1]] + +# make a copy: required for downstream sanity checks +d2 = d + +# sanity checks: B factor +max(d$b); min(d$b) + +#******************************************* +# plot histograms for inspection +# 1: original B-factors +# 2: original DUET Scores +# 3: replaced B-factors with DUET Scores +#********************************************* +# Set the margin on all sides +par(oma = c(3,2,3,0) + , mar = c(1,3,5,2) + , mfrow = c(3,2)) +#par(mfrow = c(3,2)) + + #1: Original B-factor +hist(d$b + , xlab = "" + , main = "B-factor") + +plot(density(d$b) + , xlab = "" + , main = "B-factor") + +# 2: DUET scores +hist(my_df$average_DUETR + , xlab = "" + , main = "Norm_DUET") + +plot(density(my_df$average_DUETR) + , xlab = "" + , main = "Norm_DUET") + +# 3: After the following replacement +#******************************** + +#========= +# step 0_P1: DONT RUN once you have double checked the matched output +#========= +# sanity check: match and assign to a separate column to double check +# colnames(my_df) +# d$ratioDUET = my_df$averge_DUETR[match(d$resno, my_df$Position)] + +#========= +# step 1_P1 +#========= +# Be brave and replace in place now (don't run sanity check) +# this makes all the B-factor values in the non-matched positions as NA +d$b = my_df$average_DUETR[match(d$resno, my_df$Position)] + +#========= +# step 2_P1 +#========= +# count NA in Bfactor +b_na = sum(is.na(d$b)) ; b_na + +# count number of 0's in Bactor +sum(d$b == 0) +#table(d$b) + +# replace all NA in b factor with 0 +d$b[is.na(d$b)] = 0 + +# sanity check: should be 0 +sum(is.na(d$b)) + +# sanity check: should be True +if (sum(d$b == 0) == b_na){ + print ("Sanity check passed: NA's replaced with 0's successfully") +} else { + print("Error: NA replacement NOT successful, Debug code!") +} + +max(d$b); min(d$b) + +# sanity checks: should be True +if(max(d$b) == max(my_df$average_DUETR)){ + print("Sanity check passed: B-factors replaced correctly") +} else { + print ("Error: Debug code please") +} + +if (min(d$b) == min(my_df$average_DUETR)){ + print("Sanity check passed: B-factors replaced correctly") +} else { + print ("Error: Debug code please") +} + +#========= +# step 3_P1 +#========= +# sanity check: dim should be same before reassignment +# should be TRUE +dim(d) == dim(d2) + +#========= +# step 4_P1 +#========= +# assign it back to the pdb file +my_pdb[[1]] = d + +max(d$b); min(d$b) + +#========= +# step 5_P1 +#========= +# output dir +getwd() +outDir = "~/git/Data/pyrazinamide/input/structure/" + +outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile +write.pdb(my_pdb, outFile) + +#******************************** +# Add the 3rd histogram and density plots for comparisons +#******************************** +# Plots continued... +# 3: hist and density of replaced B-factors with DUET Scores +hist(d$b + , xlab = "" + , main = "repalced-B") + +plot(density(d$b) + , xlab = "" + , main = "replaced-B") + +# graph titles +mtext(text = "Frequency" + , side = 2 + , line = 0 + , outer = TRUE) + +mtext(text = "DUET_stability" + , side = 3 + , line = 0 + , outer = TRUE) +#******************************** + +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +# NOTE: This replaced B-factor distribution has the same +# x-axis as the PredAff normalised values, but the distribution +# is affected since 0 is overinflated. This is because all the positions +# where there are no SNPs have been assigned 0. +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + + + + + +####################################################################### +#====================== end of section 1 ============================== +####################################################################### + + + + + +#========================================================= +# Processing P2: Replacing B values with PredAff Scores +#========================================================= +# clear workspace +rm(list = ls()) + +########################### +# 2: Read file: average stability values +# or mcsm_normalised file, output of step 4 mcsm pipeline +########################### + +inDir = "~/git/Data/pyrazinamide/input/processed/" +inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile + +my_df <- read.csv(inFile +# , row.names = 1 +# , stringsAsFactors = F + , header = T) +str(my_df) +#rm(inDir, inFile) + +######################### +# 3: Read complex pdb file +# form the R script +########################## + +source("read_pdb.R") # list of 8 + +# extract atom list into a variable +# since in the list this corresponds to data frame, variable will be a df +d = my_pdb[[1]] + +# make a copy: required for downstream sanity checks +d2 = d + +# sanity checks: B factor +max(d$b); min(d$b) + +#******************************************* +# plot histograms for inspection +# 1: original B-factors +# 2: original Pred Aff Scores +# 3: replaced B-factors with PredAff Scores +#******************************************** +# Set the margin on all sides +par(oma = c(3,2,3,0) + , mar = c(1,3,5,2) + , mfrow = c(3,2)) +#par(mfrow = c(3,2)) + +# 1: Original B-factor +hist(d$b + , xlab = "" + , main = "B-factor") + +plot(density(d$b) + , xlab = "" + , main = "B-factor") + +# 2: Pred Aff scores +hist(my_df$average_PredAffR + , xlab = "" + , main = "Norm_lig_average") + +plot(density(my_df$average_PredAffR) + , xlab = "" + , main = "Norm_lig_average") + +# 3: After the following replacement +#******************************** + +#================================================= +# Processing P2: Replacing B values with ratioPredAff scores +#================================================= +# use match to perform this replacement linking with "position no" +# in the pdb file, this corresponds to column "resno" +# in my_df, this corresponds to column "Position" + +#========= +# step 0_P2: DONT RUN once you have double checked the matched output +#========= +# sanity check: match and assign to a separate column to double check +# colnames(my_df) +# d$ratioPredAff = my_df$average_PredAffR[match(d$resno, my_df$Position)] #1384, 17 + +#========= +# step 1_P2: BE BRAVE and replace in place now (don't run step 0) +#========= +# this makes all the B-factor values in the non-matched positions as NA +d$b = my_df$average_PredAffR[match(d$resno, my_df$Position)] + +#========= +# step 2_P2 +#========= +# count NA in Bfactor +b_na = sum(is.na(d$b)) ; b_na + +# count number of 0's in Bactor +sum(d$b == 0) +#table(d$b) + +# replace all NA in b factor with 0 +d$b[is.na(d$b)] = 0 + +# sanity check: should be 0 +sum(is.na(d$b)) + +if (sum(d$b == 0) == b_na){ + print ("Sanity check passed: NA's replaced with 0's successfully") +} else { + print("Error: NA replacement NOT successful, Debug code!") +} + +max(d$b); min(d$b) + +# sanity checks: should be True +if (max(d$b) == max(my_df$average_PredAffR)){ + print("Sanity check passed: B-factors replaced correctly") +} else { + print ("Error: Debug code please") +} + +if (min(d$b) == min(my_df$average_PredAffR)){ + print("Sanity check passed: B-factors replaced correctly") +} else { + print ("Error: Debug code please") +} + +#========= +# step 3_P2 +#========= +# sanity check: dim should be same before reassignment +# should be TRUE +dim(d) == dim(d2) + +#========= +# step 4_P2 +#========= +# assign it back to the pdb file +my_pdb[[1]] = d + +max(d$b); min(d$b) + +#========= +# step 5_P2 +#========= + +# output dir +outDir = "~/git/Data/pyrazinamide/input/structure/" +outFile = paste0(outDir, "complex1_BwithNormLIG.pdb"); outFile +write.pdb(my_pdb, outFile) + +#******************************** +# Add the 3rd histogram and density plots for comparisons +#******************************** +# Plots continued... +# 3: hist and density of replaced B-factors with PredAff Scores +hist(d$b + , xlab = "" + , main = "repalced-B") + +plot(density(d$b) + , xlab = "" + , main = "replaced-B") + +# graph titles +mtext(text = "Frequency" + , side = 2 + , line = 0 + , outer = TRUE) + +mtext(text = "Lig_stability" + , side = 3 + , line = 0 + , outer = TRUE) + +#******************************** + +########### +# end of output files with Bfactors +########## diff --git a/mcsm_analysis/pyrazinamide/scripts/source_data_checks.R b/mcsm_analysis/pyrazinamide/scripts/source_data_checks.R new file mode 100644 index 0000000..9f30f28 --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/source_data_checks.R @@ -0,0 +1,257 @@ +getwd() +setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts") +getwd() + +######################################################### +# 1: Installing and loading required packages # +######################################################### + +source("Header_TT.R") +#source("barplot_colour_function.R") + +########################################################## +# Checking: Entire data frame and for PS # +########################################################## + +########################### +#2) Read file: combined one from the script +########################### +source("combining_two_df.R") + +# df with NA: +# merged_df2 +# merged_df3: + +# df without NA: +# merged_df2_comp: +# merged_df3_comp: + +###################### +# You need to check it +# with the merged_df3 +######################## + +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# REASSIGNMENT +my_df = merged_df3 +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +#clear variables +rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) + +# should be true +identical(my_df$Position, my_df$position) + +################################# +# Read file: normalised file +# output of step 4 mcsm_pipeline +################################# + + +inDir = "~/git/Data/pyrazinamide/input/processed/" +inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile + +mcsm_data <- read.csv(inFile + , row.names = 1 + , stringsAsFactors = F + , header = T) +str(mcsm_data) +my_colnames = colnames(mcsm_data) + +#==================================== +# subset my_df to include only the columns in mcsm data +my_df2 = my_df[my_colnames] +#==================================== +# compare the two +head(mcsm_data$Mutationinformation) +head(mcsm_data$Position) + +head(my_df2$Mutationinformation) +head(my_df2$Position) + +# sort mcsm data by Mutationinformation +mcsm_data_s = mcsm_data[order(mcsm_data$Mutationinformation),] +head(mcsm_data_s$Mutationinformation) +head(mcsm_data_s$Position) + +# now compare: should be True, but is false.... +# possibly due to rownames!?! +identical(mcsm_data_s, my_df2) + +# from library dplyr +setdiff(mcsm_data_s, my_df2) + +#from lib compare +compare(mcsm_data_s, my_df2) # seems rownames are the problem + +# FIXME: automate this +# write files: checked using meld and files are indeed identical +#write.csv(mcsm_data_s, "mcsm_data_s.csv", row.names = F) +#write.csv(my_df2, "my_df2.csv", row.names = F) + + +#====================================================== end of section 1 + + + +########################################################## +# Checking: LIG(Filtered dataframe) # +########################################################## + +# clear workspace +rm(list = ls()) + +########################### +#3) Read file: combined_lig from the script +########################### +source("combining_two_df_lig.R") + +# df with NA: +# merged_df2 : +# merged_df3: + +# df without NA: +# merged_df2_comp: +# merged_df3_comp: + +###################### +# You need to check it +# with the merged_df3 +######################## + +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# REASSIGNMENT +my_df = merged_df3 +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +#clear variables +rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) + +# should be true +identical(my_df$Position, my_df$position) + +################################# +# Read file: normalised file +# output of step 4 mcsm_pipeline +################################# + +inDir = "~/git/Data/pyrazinamide/input/processed/" +inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile + +mcsm_data <- read.csv(inFile + , row.names = 1 + , stringsAsFactors = F + , header = T) +str(mcsm_data) + +########################### +# 4a: Filter/subset data: ONLY for LIGand analysis +# Lig plots < 10Ang +# Filter the lig plots for Dis_to_lig < 10Ang +########################### +# sanity checks +upos = unique(mcsm_data$Position) + +# check range of distances +max(mcsm_data$Dis_lig_Ang) +min(mcsm_data$Dis_lig_Ang) + +# Lig filtered: subset data to have only values less than 10 Ang +mcsm_data2 = subset(mcsm_data, mcsm_data$Dis_lig_Ang < 10) + +rm(mcsm_data) #to avoid confusion + +table(mcsm_data2$Dis_lig_Ang<10) +table(mcsm_data2$Dis_lig_Ang>10) + +max(mcsm_data2$Dis_lig_Ang) +min(mcsm_data2$Dis_lig_Ang) + +upos_f = unique(mcsm_data2$Position); upos_f + +# colnames of df that you will need to subset the bigger df from +my_colnames = colnames(mcsm_data2) +#==================================== +# subset bigger df i.e my_df to include only the columns in mcsm data2 +my_df2 = my_df[my_colnames] + +rm(my_df) #to avoid confusion +#==================================== +# compare the two +head(mcsm_data2$Mutationinformation) +head(mcsm_data2$Position) + +head(my_df2$Mutationinformation) +head(my_df2$Position) + +# sort mcsm data by Mutationinformation +mcsm_data2_s = mcsm_data2[order(mcsm_data2$Mutationinformation),] +head(mcsm_data2_s$Mutationinformation) +head(mcsm_data2_s$Position) + +# now compare: should be True, but is false.... +# possibly due to rownames!?! +identical(mcsm_data2_s, my_df2) + +# from library dplyr +setdiff(mcsm_data2_s, my_df2) + +# from library compare +compare(mcsm_data2_s, my_df2) # seems rownames are the problem + +#FIXME: automate this +# write files: checked using meld and files are indeed identical +#write.csv(mcsm_data2_s, "mcsm_data2_s.csv", row.names = F) +#write.csv(my_df2, "my_df2.csv", row.names = F) + + +########################################################## +# extract and write output file for SNP posn: all # +########################################################## + +head(merged_df3$Position) + +foo = merged_df3[order(merged_df3$Position),] +head(foo$Position) + +snp_pos_unique = unique(foo$Position); snp_pos_unique + +# sanity check: +table(snp_pos_unique == combined_df$Position) + +#===================== +# write_output files +#===================== +outDir = "~/Data/pyrazinamide/input/processed/" + + +outFile1 = paste0(outDir, "snp_pos_unique.txt"); outFile1 +print(paste0("Output file name and path will be:","", outFile1)) + +write.table(snp_pos_unique + , outFile1 + , row.names = F + , col.names = F) + +############################################################## +# extract and write output file for SNP posn: complete only # +############################################################## +head(merged_df3_comp$Position) + +foo = merged_df3_comp[order(merged_df3_comp$Position),] +head(foo$Position) + +snp_pos_unique = unique(foo$Position); snp_pos_unique + +# outDir = "~/Data/pyrazinamide/input/processed/" # already set + +outFile2 = paste0(outDir, "snp_pos_unique_comp.txt") +print(paste0("Output file name and path will be:", outFile2)) + +write.table(snp_pos_unique + , outFile2 + , row.names = F + , col.names = F) +#============================== end of script + + diff --git a/meta_data_analysis/.Rhistory b/meta_data_analysis/.Rhistory new file mode 100644 index 0000000..28b1797 --- /dev/null +++ b/meta_data_analysis/.Rhistory @@ -0,0 +1,512 @@ +, stringsAsFactors = F) +x = as.numeric(grepl(i,raw_data$all_muts_pza)) +# DV: pyrazinamide 0 or 1 +y = as.numeric(raw_data$pyrazinamide) +table(y,x) +# run glm model +model = glm(y ~ x, family = binomial) +#model = glm(y ~ x, family = binomial(link = "logit")) +summary(model) +#********** +# extract relevant model output +#********** +# extract log OR i.e the Beta estimate of the logistic model for a given snp +my_logor = summary(model)$coefficients[2,1] +print(paste0('Beta:', my_logor)) +# extract SE of the logistic model for a given snp +my_se = summary(model)$coefficients[2,2] +print(paste0('SE:', my_se)) +# extract Z of the logistic model for a given snp +my_zval = summary(model)$coefficients[2,3] +print(paste0('Z-value:', my_zval)) +# Dervive OR i.e exp(my_logor) from the logistic model for a given snp +#my_or = round(exp(summary(model)$coefficients[2,1]), roundto) +my_or = exp(summary(model)$coefficients[2,1]) +print(paste0('OR:', my_or)) +# sanity check : should be True +log(my_or) == my_logor +# extract P-value of the logistic model for a given snp +my_pval = summary(model)$coefficients[2,4] +print(paste0('P-value:', my_pval)) +# extract confint interval of snp (2 steps, since the output is a named number) +ci_mod = exp(confint(model))[2,] +my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]]) +print(paste0('CI:', my_ci)) +#************* +# Assign the regression output in the original df +# you can use ('=' or '<-/->') +#************* +#pnca_snps_or$logistic_logOR[pnca_snps_or$Mutationinformation == i] = my_logor +my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$Mutationinformation == i] +my_logor +pnca_snps_or$Mutationinformation == i +View(pnca_snps_or) +#=============== +# Step 4: Calculate for one snp +# using i, when you run the loop, it is easy +#=============== +i = "pnca_p.trp68gly" +pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv" +, stringsAsFactors = F +, header = T) #2133 +# uncomment as necessary +pnca_snps_or = pnca_snps_or[1:5,] +pnca_snps_or = pnca_snps_or[c(1:5),] +#=============== +pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv" +, stringsAsFactors = F +, header = T) #2133 +pnca_snps_or = pnca_snps_or[1:5,] +pnca_snps_or = pnca_snps_or[c(1:5),] +pnca_snps_or = pnca_snps_or[1:5] +pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv" +, stringsAsFactors = F +, header = T) #2133 +pnca_snps_or = pnca_snps_or[1:5] +pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv" +, stringsAsFactors = F +, header = T) #2133 +foo = pnca_snps_or[c(1:5,)] +foo = pnca_snps_or[c(1:5),] +foo = as.data.frame(pnca_snps_or[c(1:5),]) +View(foo) +# create an empty dataframe +pnca_snps_or = as.data.frame(pnca_snps_or[c(1:5),]) +# IV: corresponds to each unique snp (extracted using grep) +x = as.numeric(grepl(i,raw_data$all_muts_pza)) +# DV: pyrazinamide 0 or 1 +y = as.numeric(raw_data$pyrazinamide) +table(y,x) +# run glm model +model = glm(y ~ x, family = binomial) +#model = glm(y ~ x, family = binomial(link = "logit")) +summary(model) +my_logor = summary(model)$coefficients[2,1] +print(paste0('Beta:', my_logor)) +# extract SE of the logistic model for a given snp +my_se = summary(model)$coefficients[2,2] +print(paste0('SE:', my_se)) +# extract Z of the logistic model for a given snp +my_zval = summary(model)$coefficients[2,3] +print(paste0('Z-value:', my_zval)) +# Dervive OR i.e exp(my_logor) from the logistic model for a given snp +#my_or = round(exp(summary(model)$coefficients[2,1]), roundto) +my_or = exp(summary(model)$coefficients[2,1]) +print(paste0('OR:', my_or)) +# sanity check : should be True +log(my_or) == my_logor +# extract P-value of the logistic model for a given snp +my_pval = summary(model)$coefficients[2,4] +print(paste0('P-value:', my_pval)) +# extract confint interval of snp (2 steps, since the output is a named number) +ci_mod = exp(confint(model))[2,] +my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]]) +print(paste0('CI:', my_ci)) +#************* +# Assign the regression output in the original df +# you can use ('=' or '<-/->') +#************* +#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor +my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] +my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i] +my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i] +my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i] +my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i] +my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i] +#=============== +# Step 4: Iterate through this unique list +# and calculate OR, but only for one snp +# this is test before you apply it all others +#=============== +pnca_snps_or$mutation == i +View(pnca_snps_or) +# create an empty dataframe +pnca_snps_or = data.frame(mutation = i) +my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] +my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i] +my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i] +my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i] +my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i] +my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i] +View(pnca_snps_or_copy) +#=============== +# Step 4: Iterate through this unique list +# and calculate OR, but only for one snp +# this is test before you apply it all others +#=============== +#reset original df so you don't make a mistake +pnca_snps_or = pnca_snps_or_copy +for (i in pnca_snps_unique){ +print(i) +} +pnca_snps_or = pnca_snps_or_copy #2133, 1 +#........................................ +# create an empty dataframe : uncomment as necessary +pnca_snps_or = data.frame(mutation = c(i, "blank_mut") +#........................................ +# create an empty dataframe : uncomment as necessary +pnca_snps_or = data.frame(mutation = c(i, "blank_mut")) +#........................................ +# create an empty dataframe : uncomment as necessary +pnca_snps_or = data.frame(mutation = c(i, "blank_mut")) +View(pnca_snps_or) +# IV: corresponds to each unique snp (extracted using grep) +x = as.numeric(grepl(i,raw_data$all_muts_pza)) +# DV: pyrazinamide 0 or 1 +y = as.numeric(raw_data$pyrazinamide) +table(y,x) +# run glm model +model = glm(y ~ x, family = binomial) +#model = glm(y ~ x, family = binomial(link = "logit")) +summary(model) +#********** +# extract relevant model output +#********** +# extract log OR i.e the Beta estimate of the logistic model for a given snp +my_logor = summary(model)$coefficients[2,1] +print(paste0('Beta:', my_logor)) +# extract SE of the logistic model for a given snp +my_se = summary(model)$coefficients[2,2] +print(paste0('SE:', my_se)) +# extract Z of the logistic model for a given snp +my_zval = summary(model)$coefficients[2,3] +print(paste0('Z-value:', my_zval)) +# Dervive OR i.e exp(my_logor) from the logistic model for a given snp +#my_or = round(exp(summary(model)$coefficients[2,1]), roundto) +my_or = exp(summary(model)$coefficients[2,1]) +print(paste0('OR:', my_or)) +# sanity check : should be True +log(my_or) == my_logor +# extract P-value of the logistic model for a given snp +my_pval = summary(model)$coefficients[2,4] +print(paste0('P-value:', my_pval)) +# extract confint interval of snp (2 steps, since the output is a named number) +ci_mod = exp(confint(model))[2,] +my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]]) +print(paste0('CI:', my_ci)) +#************* +# Assign the regression output in the original df +# you can use ('=' or '<-/->') +#************* +#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor +my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] +my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i] +my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i] +my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i] +my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i] +my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i] +View(pnca_snps_or) +pnca_snps_or = pnca_snps_or_copy #2133, 1 +for (i in pnca_snps_unique){ +print(i) +#************* +# start logistic regression model building +#************* +# set the IV and DV for the logistic regression model +# IV: corresponds to each unique snp (extracted using grep) +x = as.numeric(grepl(i,raw_data$all_muts_pza)) +# DV: pyrazinamide 0 or 1 +y = as.numeric(raw_data$pyrazinamide) +table(y,x) +# run glm model +model = glm(y ~ x, family = binomial) +#model = glm(y ~ x, family = binomial(link = "logit")) +summary(model) +#********** +# extract relevant model output +#********** +# extract log OR i.e the Beta estimate of the logistic model for a given snp +my_logor = summary(model)$coefficients[2,1] +print(paste0('Beta:', my_logor)) +# extract SE of the logistic model for a given snp +my_se = summary(model)$coefficients[2,2] +print(paste0('SE:', my_se)) +# extract Z of the logistic model for a given snp +my_zval = summary(model)$coefficients[2,3] +print(paste0('Z-value:', my_zval)) +# Dervive OR i.e exp(my_logor) from the logistic model for a given snp +#my_or = round(exp(summary(model)$coefficients[2,1]), roundto) +my_or = exp(summary(model)$coefficients[2,1]) +print(paste0('OR:', my_or)) +# sanity check : should be True +log(my_or) == my_logor +# extract P-value of the logistic model for a given snp +my_pval = summary(model)$coefficients[2,4] +print(paste0('P-value:', my_pval)) +# extract confint interval of snp (2 steps, since the output is a named number) +ci_mod = exp(confint(model))[2,] +my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]]) +print(paste0('CI:', my_ci)) +#************* +# Assign the regression output in the original df +# you can use ('=' or '<-/->') +#************* +#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor +my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] +my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i] +my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i] +my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i] +my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i] +my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i] +} +warnings() +View(pnca_snps_or) +View(pnca_snps_or_copy) +#sanity check +pnca_snps_or$mutation == i1 +#sanity check +pnca_snps_or[pnca_snps_or$mutation == i1] +pnca_snps_or[pnca_snps_or$mutation == i2] +pnca_snps_or[pnca_snps_or$mutation == i2,] +pnca_snps_or1 = unique(pnca_snps_or) +write.csv(pnca_snps_or1, "../Data_original/valid_pnca_snps_with_OR.csv") +# you only need it for the unique mutations +pnca_snps_or = unique(pnca_snps_or) #2133, 1 +for (i in pnca_snps_unique){ +print(i) +#************* +# start logistic regression model building +#************* +# set the IV and DV for the logistic regression model +# IV: corresponds to each unique snp (extracted using grep) +x = as.numeric(grepl(i,raw_data$all_muts_pza)) +# DV: pyrazinamide 0 or 1 +y = as.numeric(raw_data$pyrazinamide) +table(y,x) +# run glm model +model = glm(y ~ x, family = binomial) +#model = glm(y ~ x, family = binomial(link = "logit")) +summary(model) +#********** +# extract relevant model output +#********** +# extract log OR i.e the Beta estimate of the logistic model for a given snp +my_logor = summary(model)$coefficients[2,1] +print(paste0('Beta:', my_logor)) +# extract SE of the logistic model for a given snp +my_se = summary(model)$coefficients[2,2] +print(paste0('SE:', my_se)) +# extract Z of the logistic model for a given snp +my_zval = summary(model)$coefficients[2,3] +print(paste0('Z-value:', my_zval)) +# Dervive OR i.e exp(my_logor) from the logistic model for a given snp +#my_or = round(exp(summary(model)$coefficients[2,1]), roundto) +my_or = exp(summary(model)$coefficients[2,1]) +print(paste0('OR:', my_or)) +# sanity check : should be True +log(my_or) == my_logor +# extract P-value of the logistic model for a given snp +my_pval = summary(model)$coefficients[2,4] +print(paste0('P-value:', my_pval)) +# extract confint interval of snp (2 steps, since the output is a named number) +ci_mod = exp(confint(model))[2,] +my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]]) +print(paste0('CI:', my_ci)) +#************* +# Assign the regression output in the original df +# you can use ('=' or '<-/->') +#************* +#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor +my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] +my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i] +my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i] +my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i] +my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i] +my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i] +} +View(pnca_snps_or) +2.290256e+01 +1.561132e+06 +3.242285e-04 +#sanity check +pnca_snps_or[pnca_snps_or$mutation == i1] +pnca_snps_or[pnca_snps_or$mutation == i2,] +write.csv(pnca_snps_or1, "../Data_original/valid_pnca_snps_with_OR.csv") +my_data = read.csv("../Data_original/meta_pza_with_AF.csv" +, stringsAsFactors = FALSE) #11374, 19 +View(my_data) +# remove the first column +my_data = my_data[-1] #11374, 18 +# check if first col is 'id': should be TRUE +colnames(my_data)[1] == 'id' +# sanity check +snps_all = unique(my_data$mutation)# 337 +pnca_snps_or = snps_all +pnca_snps_or = as.data.frame(snps_all) +View(pnca_snps_or) +snps_all[-"true_wt"] +pnca_snps_or = as.data.frame(snps_all[-c(1,1)]) +View(pnca_snps_or) +snps_all = as.data.frame(snps_all) +View(snps_all) +#remove true_wt entry +w1 = which(rownames(snps_all) == "true_wt") +View(snps_all) +#remove true_wt entry +w1 = which(snps_all$snps_all == "true_wt") +rm(pnca_snps_or) +pnca_snps_or = snps_all[-w1] +pnca_snps_or = snps_all[,-w1] +pnca_snps_or = as.data.frame(snps_all[-c(1,1)]) +#remove true_wt entry +w1 = which(snps_all) == "true_wt" +pnca_snps_or = as.data.frame(snps_all[-c(1,1)]) +my_data = read.csv("../Data_original/meta_pza_with_AF.csv" +, stringsAsFactors = FALSE) #11374, 19 +# remove the first column +my_data = my_data[-1] #11374, 18 +# check if first col is 'id': should be TRUE +colnames(my_data)[1] == 'id' +# sanity check +snps_all = unique(my_data$mutation)# 337 +snps_all = as.data.frame(snps_all) +snps_all[-c(1,1)] +pnca_snps_or = as.data.frame(snps_all[-c(1,1)]) +pnca_snps_or = as.data.frame(snps_all[, -c(1,1)]) +#remove true_wt entry +#w1 = which(snps_all) == "true_wt" +pnca_snps_or = snps_all +pnca_snps_or = pnca_snps_or_copy +#remove true_wt entry +#w1 = which(snps_all) == "true_wt" +pnca_snps_or = snps_all +pnca_snps_or -> pnca_snps_or_copy +#=============== +# Step 4: Iterate through this unique list +# and calculate OR for each snp +# and assign to the pnca_snps_or df that has +# each row as a unique snp +#=============== +# reset original df so you don't make a mistake: IMPORTANT +pnca_snps_or = pnca_snps_or_copy #2133, 1 +# you only need it for the unique mutations +pnca_snps_or = unique(pnca_snps_or) #337, 1 +for (i in pnca_snps_unique){ +print(i) +#************* +# start logistic regression model building +#************* +# set the IV and DV for the logistic regression model +# IV: corresponds to each unique snp (extracted using grep) +x = as.numeric(grepl(i,raw_data$all_muts_pza)) +# DV: pyrazinamide 0 or 1 +y = as.numeric(raw_data$pyrazinamide) +table(y,x) +# run glm model +model = glm(y ~ x, family = binomial) +#model = glm(y ~ x, family = binomial(link = "logit")) +summary(model) +#********** +# extract relevant model output +#********** +# extract log OR i.e the Beta estimate of the logistic model for a given snp +my_logor = summary(model)$coefficients[2,1] +print(paste0('Beta:', my_logor)) +# extract SE of the logistic model for a given snp +my_se = summary(model)$coefficients[2,2] +print(paste0('SE:', my_se)) +# extract Z of the logistic model for a given snp +my_zval = summary(model)$coefficients[2,3] +print(paste0('Z-value:', my_zval)) +# Dervive OR i.e exp(my_logor) from the logistic model for a given snp +#my_or = round(exp(summary(model)$coefficients[2,1]), roundto) +my_or = exp(summary(model)$coefficients[2,1]) +print(paste0('OR:', my_or)) +# sanity check : should be True +log(my_or) == my_logor +# extract P-value of the logistic model for a given snp +my_pval = summary(model)$coefficients[2,4] +print(paste0('P-value:', my_pval)) +# extract confint interval of snp (2 steps, since the output is a named number) +ci_mod = exp(confint(model))[2,] +my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]]) +print(paste0('CI:', my_ci)) +#************* +# Assign the regression output in the original df +# you can use ('=' or '<-/->') +#************* +#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor +my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] +my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i] +my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i] +my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i] +my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i] +my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i] +} +getwd() +#setwd("~/Documents/git/LSHTM_Y1_PNCA/meta_data_analysis") # work +setwd("~/git/LSHTM_Y1_PNCA/meta_data_analysis") # thinkpad +#setwd("/Users/tanu/git/LSHTM_Y1_PNCA/meta_data_analysis") # mac +getwd() +#=============== +# Step 1: read raw data +#=============== +raw_data<-read.csv("../Data_original/original_tanushree_data_v2.csv" +,stringsAsFactors = F)[,c("id","pyrazinamide","dr_mutations_pyrazinamide","other_mutations_pyrazinamide")]#19265, 4 +raw_data<-raw_data[!is.na(raw_data$pyrazinamide),]#12511, 4 +# combine the two mutation columns +raw_data$all_mutations_pyrazinamide<-paste(raw_data$dr_mutations_pyrazinamide, raw_data$other_mutations_pyrazinamide)#12511, 5 +head(raw_data$all_mutations_pyrazinamide) +# create yet another column that contains all the mutations but in lower case +raw_data$all_muts_pza = tolower(raw_data$all_mutations_pyrazinamide) #12511, 6 +table(grepl("pnca_p",raw_data$all_muts_pza)) +#FALSE TRUE +#10603 1908 +pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv" +, stringsAsFactors = F +, header = T) #2133 +# subset a snall section to test +#pnca_snps_or_copy = pnca_snps_or +#pnca_snps_or = pnca_snps_or_copy +pnca_snps_unique = unique(pnca_snps_or$mutation) #293 +i2 = "pnca_p.trp68gly" # Should exist +grep(i2, pnca_snps_unique) +my_data = read.csv("../Data_original/meta_pza_with_AF.csv" +, stringsAsFactors = FALSE) #11374, 19 +# remove the first column +my_data = my_data[-1] #11374, 18 +# check if first col is 'id': should be TRUE +colnames(my_data)[1] == 'id' +# sanity check +head(my_data$mutation) +my_data = unique(my_data$mutation) +my_data[!duplicated(my_data$mutation)] +my_data_unique = my_data[!duplicated(my_data$mutation),] +my_data[!duplicated('mutation'),] +my_data_unique = my_data[!duplicated(my_data[,'mutation']),] +my_data_unique = my_data[!duplicated(my_data['mutation']),] +getwd() +setwd("/git/LSHTM_analysis/meta_data_analysis") +getwd() +getwd() +setwd("/git/github/LSHTM_analysis/meta_data_analysis") +getwd() +#=============== +# Step 1: read GWAS raw data stored in Data_original/ +#=============== +infile = read.csv("../Data_original", file.choose(), stringsAsFactors = F)) +c = file.choose() +c = file.choose(../Data_original) +c = read.csv(file.choose(), stringsAsFactors = F) +#=============== +# Step 1: read GWAS raw data stored in Data_original/ +#=============== +infile = read.csv(file.choose(), stringsAsFactors = F)) +c = read.csv(file.choose(), stringsAsFactors = F) +#=============== +# Step 1: read GWAS raw data stored in Data_original/ +#=============== +infile = read.csv(file.choose(), stringsAsFactors = F) +#=============== +# Step 1: read GWAS raw data stored in Data_original/ +#=============== +infile = read.csv(file.choose(), stringsAsFactors = F) +raw_data = infile[,c("id","pyrazinamide","dr_mutations_pyrazinamide","other_mutations_pyrazinamide")] +outdir = paste0("../mcsm_analysis",drug,"/Data/") +# define output variables +drug = 'pyrazinamide' +outdir = paste0("../mcsm_analysis",drug,"/Data/") +outdir = paste0("../mcsm_analysis/",drug,"/Data/") +outFile = "meta_data_with_AFandOR.csv" +output_filename = paste0(outdir, outFile) +output_filename diff --git a/meta_data_analysis/__pycache__/reference_dict.cpython-37.pyc b/meta_data_analysis/__pycache__/reference_dict.cpython-37.pyc new file mode 100644 index 0000000..1db3c83 Binary files /dev/null and b/meta_data_analysis/__pycache__/reference_dict.cpython-37.pyc differ diff --git a/meta_data_analysis/init_data_dirs.py b/meta_data_analysis/init_data_dirs.py new file mode 100755 index 0000000..8dd7446 --- /dev/null +++ b/meta_data_analysis/init_data_dirs.py @@ -0,0 +1,7 @@ +#!/usr/bin/python3 +# Initialise a blank 'Data' directory and drug subdirs etc. +# TODO: +# - Read base dir from config file +# - Create eg: '~/git/Data/{original,processed} +# - Create eg: '~/git/Data/processed/' + drug (for each drug) +# - Create eg: '~/git/Data/output/' + drug + '{plots, structure}' diff --git a/meta_data_analysis/pnca_AF_and_OR_calcs.R b/meta_data_analysis/pnca_AF_and_OR_calcs.R new file mode 100644 index 0000000..3c84670 --- /dev/null +++ b/meta_data_analysis/pnca_AF_and_OR_calcs.R @@ -0,0 +1,241 @@ +getwd() +setwd("/git/github/git/LSHTM_analysis/meta_data_analysis") +getwd() + +#=============== +# Step 1: read GWAS raw data stored in Data_original/ +#=============== +infile = read.csv(file.choose(), stringsAsFactors = F) + +raw_data = infile[,c("id" + , "pyrazinamide" + , "dr_mutations_pyrazinamide" + , "other_mutations_pyrazinamide")] + +##### +# 1a: exclude na +##### +raw_data = raw_data[!is.na(raw_data$pyrazinamide),] + +total_samples = length(unique(raw_data$id)) +print(total_samples) + +# sanity check: should be true +is.numeric(total_samples) + +##### +# 1b: combine the two mutation columns +##### +raw_data$all_mutations_pyrazinamide = paste(raw_data$dr_mutations_pyrazinamide + , raw_data$other_mutations_pyrazinamide) +head(raw_data$all_mutations_pyrazinamide) + +##### +# 1c: create yet another column that contains all the mutations but in lower case +##### +raw_data$all_muts_pnca = tolower(raw_data$all_mutations_pyrazinamide) + +# sanity checks +table(grepl("pnca_p",raw_data$all_muts_pnca)) + +# sanity check: should be TRUE +sum(table(grepl("pnca_p",raw_data$all_muts_pnca))) == total_samples + +# set up variables: can be used for logistic regression as well +i = "pnca_p.ala134gly" # has a NA, should NOT exist +table(grepl(i,raw_data$all_muts_pnca)) + +i = "pnca_p.trp68gly" +table(grepl(i,raw_data$all_muts_pnca)) + +mut = grepl(i,raw_data$all_muts_pnca) +dst = raw_data$pyrazinamide +table(mut, dst) + +#chisq.test(table(mut,dst)) +#fisher.test(table(mut, dst)) +#table(mut) + +###### read list of muts to calculate OR for (fname3 from pnca_data_extraction.py) +pnca_snps_or = read.csv(file.choose() + , stringsAsFactors = F + , header = T) + +# extract unique snps to iterate over for AF and OR calcs +# total no of unique snps +# AF and OR calculations + +pnca_snps_unique = unique(pnca_snps_or$mutation) + +# Define OR function +x = as.numeric(mut) +y = dst +or = function(x,y){ + tab = as.matrix(table(x,y)) + a = tab[2,2] + if (a==0){ a<-0.5} + b = tab[2,1] + if (b==0){ b<-0.5} + c = tab[1,2] + if (c==0){ c<-0.5} + d = tab[1,1] + if (d==0){ d<-0.5} + (a/b)/(c/d) + + } + +dst = raw_data$pyrazinamide +ors = sapply(pnca_snps_unique,function(m){ + mut = grepl(m,raw_data$all_muts_pnca) + or(mut,dst) +}) + +ors + +pvals = sapply(pnca_snps_unique,function(m){ + mut = grepl(m,raw_data$all_muts_pnca) + fisher.test(mut,dst)$p.value +}) + +pvals + +afs = sapply(pnca_snps_unique,function(m){ + mut = grepl(m,raw_data$all_muts_pnca) + mean(mut) +}) + +afs + +# check ..hmmm +afs['pnca_p.trp68gly'] +afs['pnca_p.gln10pro'] +afs['pnca_p.leu4ser'] + +#plot(density(log(ors))) +#plot(-log10(pvals)) +#hist(log(ors) +# ,breaks = 100 +# ) + +# subset df cols to add to the calc param df +pnca_snps_cols = pnca_snps_or[c('mutation_info', 'mutation', 'Mutationinformation')] +pnca_snps_cols = pnca_snps_cols[!duplicated(pnca_snps_cols$mutation),] + +rownames(pnca_snps_cols) = pnca_snps_cols$mutation +head(rownames(pnca_snps_cols)) +#snps_with_AF_and_OR + +# combine +comb_AF_and_OR = data.frame(ors, pvals, afs) +head(rownames(comb_AF_and_OR)) + +# sanity checks: should be the same +dim(comb_AF_and_OR); dim(pnca_snps_cols) +table(rownames(comb_AF_and_OR)%in%rownames(pnca_snps_cols)) + +table(rownames(pnca_snps_cols)%in%rownames(comb_AF_and_OR)) + +# merge the above two df whose dim you checked +snps_with_AF_and_OR = merge(comb_AF_and_OR, pnca_snps_cols + , by = "row.names" +# , all.x = T + ) + +#rm(pnca_snps_cols, pnca_snps_or, raw_data) + +#=============== +# Step 3: Read data file where you will add the calculated OR +# Note: this is the big file with one-many relationship between snps and lineages +# i.e fname4 from 'pnca_extraction.py' +#=============== +my_data = read.csv(file.choose() + , row.names = 1 + , stringsAsFactors = FALSE) + +head(my_data) +length(unique(my_data$id)) + +# check if first col is 'id': should be TRUE +colnames(my_data)[1] == 'id' + +# sanity check +head(my_data$mutation) + +# FILES TO MERGE: +# comb_AF_and_OR: file containing OR +# my_data = big meta data file +# linking column: mutation + +head(my_data) +merged_df = merge(my_data # big file + , snps_with_AF_and_OR # small (afor file) + , by = "mutation" + , all.x = T) # because you want all the entries of the meta data + +# sanity checks: should be True +# FIXME: I have checked this manually, but make it so it is a pass or a fail! +comb_AF_and_OR[rownames(comb_AF_and_OR) == "pnca_p.gln10pro",]$ors +merged_df[merged_df$Mutationinformation.x == "Q10P",]$ors + +merged_df[merged_df$Mutationinformation.x == "Q10P",] + +# sanity check: very important! +colnames(merged_df) + +table(merged_df$mutation_info.x == merged_df$mutation_info.y) + +#FIXME: what happened to other 7 and FALSE +table(merged_df$Mutationinformation.x == merged_df$Mutationinformation.y) + +# problem +identical(merged_df$Mutationinformation.x, merged_df$Mutationinformation.y) + +#merged_df[merged_df$Mutationinformation.x != merged_df$Mutationinformation.y,] + +#throw away the y because that is a smaller df +d1 = which(colnames(merged_df) == "mutation_info.y") #21 +d2 = which(colnames(merged_df) == "Mutationinformation.y") #22 + +merged_df2 = merged_df[-c(d1, d2)] #3093 20 +colnames(merged_df2) + +# rename cols +colnames(merged_df2)[colnames(merged_df2)== "mutation_info.x"] <- "mutation_info" +colnames(merged_df2)[colnames(merged_df2)== "Mutationinformation.x"] <- "Mutationinformation" + +colnames(merged_df2) + +# should be 0 +sum(is.na(merged_df2$Mutationinformation)) + +# count na in each column +na_count = sapply(merged_df2, function(y) sum(length(which(is.na(y))))); na_count +# only some or and Af should be NA +#Row.names ors pvals afs +#81 81 81 81 + + +colnames(merged_df2)[colnames(merged_df2)== "ors"] <- "OR" +colnames(merged_df2)[colnames(merged_df2)== "afs"] <- "AF" +colnames(merged_df2)[colnames(merged_df2)== "pvals"] <- "pvalue" + +colnames(merged_df2) + +# add log OR and neglog pvalue +merged_df2$logor = log(merged_df2$OR) +is.numeric(merged_df2$logor) + +merged_df2$neglog10pvalue = -log10(merged_df2$pvalue) +is.numeric(merged_df2$neglog10pvalue) + +# write file out +#write.csv(merged_df, "../Data/meta_data_with_AFandOR_JP_TT.csv") + +# define output variables +drug = 'pyrazinamide' +out_dir = paste0("../mcsm_analysis/",drug,"/Data/") +outFile = "meta_data_with_AFandOR.csv" +output_filename = paste0(outdir, outFile) + +write.csv(merged_df2, output_filename + , row.names = F) diff --git a/meta_data_analysis/pnca_data_extraction.py b/meta_data_analysis/pnca_data_extraction.py new file mode 100755 index 0000000..0eb5aab --- /dev/null +++ b/meta_data_analysis/pnca_data_extraction.py @@ -0,0 +1,626 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Tue Aug 6 12:56:03 2019 + +@author: tanu +""" + +# FIXME: include error checking to enure you only +# concentrate on positions that have structural info? + +#%% load libraries +################### +# load libraries +import os, sys +import pandas as pd +#import numpy as np + +#from pandas.api.types import is_string_dtype +#from pandas.api.types import is_numeric_dtype + +# to create dir +#my_dir = os.path.expanduser('~/some_dir') +#make sure mcsm_analysis/ exists +#or specify the output directory + +#%% +#%% +#%% +#======================================================== +# TASK: extract ALL pncA mutations from GWAS data +#======================================================== +#%% +#################### +# my working dir +os.getcwd() +homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde +os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis') +os.getcwd() +#%% +from reference_dict import my_aa_dict #CHECK DIR STRUC THERE! +#%% +#NOTE: Out_dir MUST exis +# User defined dir strpyrazinamide +drug = 'pyrazinamide' +gene = 'pnca' +out_dir = homedir + '/git/LSHTM_analysis/mcsm_analysis/' +# = out_dir + drug +data_dir = homedir + '/git/Data' + +if not os.path.exists(data_dir): + print('Error!', data_dir, 'does not exist. Please ensure it exists and contains the appropriate raw data') + os.makedirs(data_dir) + die() + +if not os.path.exists(out_dir): + print('Error!', out_dir, 'does not exist. Please create it') + exit() + +#if not os.path.exists(work_dir): +# print('creating dir that does not exist', 'dir_name:', work_dir) +# os.makedirs(work_dir) +else: + print('Dir exists: Carrying on') + +# now create sub dir structure within work_dir +# pyrazinamide/mcsm_analysis + +# we need three dir +# Data +# Scripts + # Plotting +# Results + # Plots + +# create a list of dir names +#dir_names = ['Data', 'Scripts', 'Results'] + + +#for i in dir_names: +# this_dir = (work_dir + '/' + i) +# if not os.path.exists(this_dir): +# print('creating dir that does not exist:', this_dir) +# os.makedirs(this_dir) +#else: +# print('Dir exists: Carrying on') + +# Create sub dirs +# 1) +# Scripts + # Plotting +#subdir_plotting = work_dir + '/Scripts/Plotting' +#if not os.path.exists(subdir_plotting): +# print('creating dir that does not exist:', subdir_plotting) +# os.makedirs(subdir_plotting) +#else: +# print('Dir exists: Carrying on') + +# 2) +# Results + # Plots +#subdir_plots = work_dir + '/Results/Plots' +#if not os.path.exists(subdir_plots): +# print('creating dir that does not exist:', subdir_plots) +# os.makedirs(subdir_plots) +#else: +# print('Dir exists: Carrying on') + +# clear varaibles +#del(dir_names, drug, i, subdir_plots, subdir_plotting) + +#exit() +#%% +#============================================================================== +############ +# STEP 1: Read file original_tanushree_data_v2.csv +############ +data_file = data_dir + '/input/original/original_tanushree_data_v2.csv' +meta_data = pd.read_csv(data_file, sep = ',') + +# column names +list(meta_data.columns) + +# extract elevant columns to extract from meta data related to the pyrazinamide +meta_data = meta_data[['id' + ,'country' + ,'lineage' + ,'sublineage' + ,'drtype' + , 'pyrazinamide' + , 'dr_mutations_pyrazinamide' + , 'other_mutations_pyrazinamide' + ]] + +# checks +total_samples = meta_data['id'].nunique() # 19265 + +# counts NA per column +meta_data.isna().sum() + +# glance +meta_data.head() + +# equivalent of table in R +# pyrazinamide counts +meta_data.pyrazinamide.value_counts() + +#%% +############ +# STEP 2: extract entries containing selected genes: +# pyrazinamide: pnca_p. +# in the dr_mutations and other mutations" +# as we are interested in the mutations in the protein coding region only +# (corresponding to a structure) +# and drop the entries with NA +############# +meta_pza = meta_data.loc[meta_data.dr_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)] +meta_pza = meta_data.loc[meta_data.other_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)] + +del(meta_pza) + +########################## +# pyrazinamide: pnca_p. +########################## +meta_data_pnca = meta_data[['id' + ,'country' + ,'lineage' + ,'sublineage' + ,'drtype' + , 'pyrazinamide' + , 'dr_mutations_pyrazinamide' + , 'other_mutations_pyrazinamide' + ]] + +del(meta_data) + +# sanity checks + +# dr_mutations only +meta_pnca_dr = meta_data_pnca.loc[meta_data_pnca.dr_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)] +meta_pnca_dr['id'].nunique() +del(meta_pnca_dr) + +# other mutations +meta_pnca_other = meta_data_pnca.loc[meta_data_pnca.other_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)] +meta_pnca_other['id'].nunique() +del(meta_pnca_other) + +# Now extract "all" mutations +meta_pnca_all = meta_data_pnca.loc[meta_data_pnca.dr_mutations_pyrazinamide.str.contains('pncA_p.*') | meta_data_pnca.other_mutations_pyrazinamide.str.contains('pncA_p.*') ] + +meta_pnca_all['id'].nunique() +pnca_samples = len(meta_pnca_all) +pnca_na = meta_pnca_all['pyrazinamide'].isna().sum() +comp_pnca_samples = pnca_samples - pnca_na + +#=#=#=#=#=#=# +# COMMENT: use it later to check number of complete samples from LF data +#=#=#=#=#=#=# + +# sanity checks +meta_pnca_all.dr_mutations_pyrazinamide.value_counts() +meta_pnca_all.other_mutations_pyrazinamide.value_counts() + +# more sanity checks +# !CAUTION!: muts will change depending on your gene + +# dr muts : insert +meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.Gln10Pro')] # +meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.Phe106Leu')] # empty +meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.Val139Leu')] + +meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.')] # exists # rows +m = meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.')] # exists # rows + +# other_muts +meta_pnca_all.loc[meta_pnca_all.other_mutations_pyrazinamide.str.contains('pncA_p.Gln10Pro*')] # empty +meta_pnca_all.loc[meta_pnca_all.other_mutations_pyrazinamide.str.contains('pncA_p.Phe106Leu')] + +#=#=#=#=#=#=#=#=#=# +# FIXME +# COMMENTS: both mutations columns are separated by ; +# CHECK if there are mutations that exist both in dr and other_muts! +# doesn't make any sense for the same mut to exist in both, I would have thought! +#=#=#=#=#=#=#=#=#=# + +# remove not required variables +del(meta_data_pnca) + +############ +# STEP 3: split the columns of +# a) dr_mutation_... (;) as +# the column has snps related to multiple genes. +# useful links +# https://stackoverflow.com/questions/17116814/pandas-how-do-i-split-text-in-a-column-into-multiple-rows +# this one works beautifully +# https://stackoverflow.com/questions/12680754/split-explode-pandas-dataframe-string-entry-to-separate-rows +############ + +# sanity check: counts NA per column afer subsetted df: i.e in meta_pza(with pncA_p. extracted mutations) +meta_pnca_all.isna().sum() + +#=#=#=#=#=#=#=#=#=# +# COMMENT: no NA's in dr_mutations/other_mutations_columns +#=#=#=#=#=#=#=#=#=# +# define the split function +def tidy_split(df, column, sep='|', keep=False): + """ + Split the values of a column and expand so the new DataFrame has one split + value per row. Filters rows where the column is missing. + + Params + ------ + df : pandas.DataFrame + dataframe with the column to split and expand + column : str + the column to split and expand + sep : str + the string used to split the column's values + keep : bool + whether to retain the presplit value as it's own row + + Returns + ------- + pandas.DataFrame + Returns a dataframe with the same columns as `df`. + """ + indexes = list() + new_values = list() + #df = df.dropna(subset=[column])#<<<<<<-----see this incase you need to uncomment based on use case + for i, presplit in enumerate(df[column].astype(str)): + values = presplit.split(sep) + if keep and len(values) > 1: + indexes.append(i) + new_values.append(presplit) + for value in values: + indexes.append(i) + new_values.append(value) + new_df = df.iloc[indexes, :].copy() + new_df[column] = new_values + return new_df + +######## +# 3a: call tidy_split() on 'dr_mutations_pyrazinamide' column and remove leading white spaces +#https://stackoverflow.com/questions/41476150/removing-space-from-dataframe-columns-in-pandas +######## +meta_pnca_WF0 = tidy_split(meta_pnca_all, 'dr_mutations_pyrazinamide', sep = ';') + +# remove leading white space else these are counted as distinct mutations as well +meta_pnca_WF0['dr_mutations_pyrazinamide'] = meta_pnca_WF0['dr_mutations_pyrazinamide'].str.lstrip() + +######## +# 3b: call function on 'other_mutations_pyrazinamide' column and remove leading white spaces +######## +meta_pnca_WF1 = tidy_split(meta_pnca_WF0, 'other_mutations_pyrazinamide', sep = ';') + +# remove the leading white spaces in the column +meta_pnca_WF1['other_mutations_pyrazinamide'] = meta_pnca_WF1['other_mutations_pyrazinamide'].str.strip() + +########## +# Step 4: Reshape data so that all mutations are in one column and the +# annotations for the mutation reflect the column name +# LINK: http://www.datasciencemadesimple.com/reshape-wide-long-pandas-python-melt-function/ + +# data frame “df” is passed to melt() function +# id_vars is the variable which need to be left unaltered +# var_name are the column names so we named it as 'mutation_info' +# value_name are its values so we named it as 'mutation' +########## +meta_pnca_WF1.columns + +meta_pnca_LF0 = pd.melt(meta_pnca_WF1 + , id_vars = ['id', 'country', 'lineage', 'sublineage', 'drtype', 'pyrazinamide'] + , var_name = 'mutation_info' + , value_name = 'mutation') + +# sanity check: should be true +if len(meta_pnca_LF0) == len(meta_pnca_WF1)*2: + print('sanity check passed: Long format df has the expected length') +else: + print("Sanity check failed: Debug please!") + +########### +# Step 5: This is still dirty data. Filter LF data so that you only have +# mutations corresponding to pnca_p. +# this will be your list you run OR calcs +########### +meta_pnca_LF1 = meta_pnca_LF0[meta_pnca_LF0['mutation'].str.contains('pncA_p.*')] + +# sanity checks +# unique samples +meta_pnca_LF1['id'].nunique() +if len(meta_pnca_all) == meta_pnca_LF1['id'].nunique(): + print("Sanity check passed: No of samples with pncA mutations match") +else: + print("Sanity check failed: Debug please!") + +# count if all the mutations are indeed in the protein coding region +# i.e begin with pnca_p +meta_pnca_LF1['mutation'].str.count('pncA_p.').sum() # 3093 + +# should be true. +# and check against the length of the df, which should match +if len(meta_pnca_LF1) == meta_pnca_LF1['mutation'].str.count('pncA_p.').sum(): + print("Sanity check passed: Long format data containing pnca mutations indeed correspond to pncA_p region") +else: + print("Sanity check failed: Debug please!") + +########### +# Step 6: Filter dataframe with "na" in the drug column +# This is because for OR, you can't use the snps that have the +# NA in the specified drug column +# it creates problems when performing calcs in R inside the loop +# so best to filter it here +########### +# NOT NEEDED FOR all snps, only for extracting valid OR snps +del (meta_pnca_WF0, meta_pnca_WF1, meta_pnca_LF0, meta_pnca_all) + +########### +# Step 7: count unique pncA_p mutations (all and comp cases) +########### +meta_pnca_LF1['mutation'].nunique() +meta_pnca_LF1.groupby('mutation_info').nunique() + +meta_pnca_LF1['id'].nunique() +meta_pnca_LF1['mutation'].nunique() +meta_pnca_LF1.groupby('id').nunique() + +########### +# Step 8: convert all snps only (IN LOWERCASE) +# because my mcsm file intergated has lowercase +########### +# convert mutation to lower case as it needs to exactly match the dict key +#meta_pnca_LF1['mutation'] = meta_pnca_LF1.mutation.str.lower() # WARNINGS: suggested to use .loc +meta_pnca_LF1['mutation'] = meta_pnca_LF1.loc[:, 'mutation'].str.lower() + +########### +# Step 9 : Split 'mutation' column into three: wild_type, position and +# mutant_type separately. Then map three letter code to one from the +# referece_dict imported pncaeady. First convert to mutation to lowercase +# to allow to match entries from dict +########### +#======= +# Step 9a: iterate through the dict, create a lookup dict i.e +# lookup_dict = {three_letter_code: one_letter_code}. +# lookup dict should be the key and the value (you want to create a column for) +# Then use this to perform the mapping separetly for wild type and mutant cols. +# The three letter code is extracted using a regex match from the dataframe and then converted +# to 'pandas series'since map only works in pandas series +#======= +# initialise a sub dict that is a lookup dict for three letter code to one +lookup_dict = dict() +for k, v in my_aa_dict.items(): + lookup_dict[k] = v['one_letter_code'] + wt = meta_pnca_LF1['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on + meta_pnca_LF1['wild_type'] = wt.map(lookup_dict) + mut = meta_pnca_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze() + meta_pnca_LF1['mutant_type'] = mut.map(lookup_dict) + +# extract position info from mutation column separetly using regex +meta_pnca_LF1['position'] = meta_pnca_LF1['mutation'].str.extract(r'(\d+)') + +# clear variables +del(k, v, wt, mut, lookup_dict) + +#========= +# Step 9b: iterate through the dict, create a lookup dict that i.e +# lookup_dict = {three_letter_code: aa_prop_water} +# Do this for both wild_type and mutant as above. +#========= +# initialise a sub dict that is lookup dict for three letter code to aa prop +lookup_dict = dict() + +for k, v in my_aa_dict.items(): + lookup_dict[k] = v['aa_prop_water'] + #print(lookup_dict) + wt = meta_pnca_LF1['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on + meta_pnca_LF1['wt_prop_water'] = wt.map(lookup_dict) + mut = meta_pnca_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze() + meta_pnca_LF1['mut_prop_water'] = mut.map(lookup_dict) + +# added two more cols + +# clear variables +del(k, v, wt, mut, lookup_dict) + +#======== +# Step 9c: iterate through the dict, create a lookup dict that i.e +# lookup_dict = {three_letter_code: aa_prop_polarity} +# Do this for both wild_type and mutant as above. +#========= +# initialise a sub dict that is lookup dict for three letter code to aa prop +lookup_dict = dict() + +for k, v in my_aa_dict.items(): + lookup_dict[k] = v['aa_prop_polarity'] + #print(lookup_dict) + wt = meta_pnca_LF1['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on + meta_pnca_LF1['wt_prop_polarity'] = wt.map(lookup_dict) + mut = meta_pnca_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze() + meta_pnca_LF1['mut_prop_polarity'] = mut.map(lookup_dict) + +# added two more cols + +# clear variables +del(k, v, wt, mut, lookup_dict) + +######## +# Step 10: combine the wild_type+poistion+mutant_type columns to generate +# Mutationinformation (matches mCSM output field) +# Remember to use .map(str) for int col types to allow string concatenation +######### +meta_pnca_LF1['Mutationinformation'] = meta_pnca_LF1['wild_type'] + meta_pnca_LF1.position.map(str) + meta_pnca_LF1['mutant_type'] + +#=#=#=#=#=#=# +# Step 11: +# COMMENT: there is more processing in the older version of this script +# consult if necessary +# possibly due to the presence of true_wt +# since this file doesn't contain any true_wt, we won't need it(hopefully!) +#=#=#=#=#=#=# + +#%% +########### +# Step 12: Output files for only SNPs to run mCSM +########### + +#========= +# Step 12a: all SNPs to run mCSM +#========= +snps_only = pd.DataFrame(meta_pnca_LF1['Mutationinformation'].unique()) +pos_only = pd.DataFrame(meta_pnca_LF1['position'].unique()) + +# assign meaningful colnames +#snps_only.rename({0 : 'all_pnca_snps'}, axis = 1, inplace = True) +#list(snps_only.columns) +snps_only.isna().sum() # should be 0 + +# output csv: all SNPS for mCSM analysis +# specify variable name for output file +gene = 'pnca' +#drug = 'pyrazinamide' +my_fname1 = '_snps_' +nrows = len(snps_only) + +#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/' +#output_file_path = work_dir + '/Data/' +output_file_path = data_dir + '/input/processed/' + drug + '/' + +if not os.path.exists(output_file_path): + print( output_file_path, 'does not exist. Creating') + os.makedirs(output_file_path) + exit() + +output_filename = output_file_path + gene + my_fname1 + str(nrows) + '.csv' +print(output_filename) #<<<- check + +# write to csv: without column or row names +# Bad practice: numbers at the start of a filename +snps_only.to_csv(output_filename, header = False, index = False) + +#========= +# Step 12b: all snps with annotation +#========= +# all snps, selected cols +#pnca_snps_ALL = meta_pnca_LF1[['id','country','lineage', 'sublineage' +# , 'drtype', 'pyrazinamide' +# , 'mutation_info', 'mutation', 'Mutationinformation']] + +#len(pnca_snps_ALL) + +# sanity check +#meta_pnca_LF1['mutation'].nunique() + +# output csv: WITH column but WITHOUT row names(all snps with meta data) +# specify variable name for output file +#gene = 'pnca' +#drug = 'pyrazinamide' +#my_fname2 = '_snps_with_metadata_' +#nrows = len(pnca_snps_ALL) + +#output_file_path = work_dir + '/Data/' +#output_filename = output_file_path + gene + my_fname2 + str(nrows) + '.csv' +#print(output_filename) #<<<- check + +# write out file +#pnca_snps_ALL.to_csv(output_filename, header = True, index = False) + +#========= +# Step 12c: comp snps for OR calcs with annotation +#========= +# remove all NA's from pyrazinamide column from LF1 + +# counts NA per column +meta_pnca_LF1.isna().sum() + +# remove NA +meta_pnca_LF2 = meta_pnca_LF1.dropna(subset=['pyrazinamide']) + +# sanity checks +# should be True +len(meta_pnca_LF2) == len(meta_pnca_LF1) - meta_pnca_LF1['pyrazinamide'].isna().sum() + +# unique counts +meta_pnca_LF2['mutation'].nunique() + +meta_pnca_LF2.groupby('mutation_info').nunique() + +# sanity check +meta_pnca_LF2['id'].nunique() + +# should be True +if meta_pnca_LF2['id'].nunique() == comp_pnca_samples: + print ('sanity check passed: complete numbers match') +else: + print('Error: Please Debug!') + +# value counts +meta_pnca_LF2.mutation.value_counts() +#meta_pnca_LF2.groupby(['mutation_info', 'mutation']).size() + +# valid/comp snps +# uncomment as necessary +pnca_snps_COMP = pd.DataFrame(meta_pnca_LF2['Mutationinformation'].unique()) +len(pnca_snps_COMP) + +# output csv: WITH column but WITHOUT row names (COMP snps with meta data) +# specify variable name for output file + +gene = 'pnca' +#drug = 'pyrazinamide' +my_fname3 = '_comp_snps_with_metadata_' +nrows = len(pnca_snps_COMP) + +#output_filename = output_file_path + gene + my_fname3 + str(nrows) + '.csv' +#print(output_filename) #<<<-check + +# write out file +#pnca_snps_COMP.to_csv(output_filename, header = True, index = False) + + +#========= +# Step 12d: comp snps only +#========= +# output csv: comp SNPS for info (i.e snps for which OR exist) +# specify variable name for output file + +snps_only = pd.DataFrame(meta_pnca_LF2['Mutationinformation'].unique()) + +gene = 'pnca' +#drug = 'pyrazinamide' +my_fname1 = '_comp_snps_' +nrows = len(snps_only) + +output_filename = output_file_path + gene + my_fname1 + str(nrows) + '.csv' +print(output_filename) #<<<- check + +# write to csv: without column or row names +snps_only.to_csv(output_filename, header = False, index = False) + + +#=#=#=#=#=#=#=# +# COMMENT: LF1 is the file to extract all unique snps for mcsm +# but you have that already in file called pnca_snps... +# LF2: is the file for extracting snps tested for DS and hence OR calcs +#=#=#=#=#=#=#=# + +########### +# Step 13 : Output the whole df i.e +# file for meta_data which is now formatted with +# each row as a unique snp rather than the original version where +# each row is a unique id +# ***** This is the file you will ADD the AF and OR calculations to ***** +########### + +# output csv: the entire DF +# specify variable name for output file +gene = 'pnca' +#drug = 'pyrazinamide' +my_fname4 = '_metadata' +#nrows = len(meta_pnca_LF1) +output_filename = output_file_path + gene + my_fname4 + '.csv' +print(output_filename) #<<<-check + +# write out file +meta_pnca_LF1.to_csv(output_filename) diff --git a/meta_data_analysis/reference_dict.py b/meta_data_analysis/reference_dict.py new file mode 100755 index 0000000..bf6561b --- /dev/null +++ b/meta_data_analysis/reference_dict.py @@ -0,0 +1,121 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Tue Jun 18 11:32:28 2019 + +@author: tanushree +""" +############################################ +#load libraries +import pandas as pd +import os +############################################# + +#!#########################! +# REQUIREMNETS: +# Data_original/ must exist +# containing GWAS data +#!#########################! + +print(os.getcwd()) +homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde +os.chdir(homedir + '/git/Data/input/original') +print(os.getcwd()) +#========== +#read file +#========== +my_aa = pd.read_csv('aa_codes.csv') #20, 6 +#assign the one_letter code as the row names so that it is easier to create a dict of dicts using index +#my_aa = pd.read_csv('aa_codes.csv', index_col = 0) #20, 6 #a way to it since it is the first column +my_aa = my_aa.set_index('three_letter_code_lower') #20, 5 + +#========================================================= +#convert file to dict of dicts +#========================================================= +#convert each row into a dict of dicts so that there are 20 aa and 5 keys within +#with your choice of column name that you have assigned to index as the "primary key". +#using 'index' creates a dict of dicts +#using 'records' creates a list of dicts +my_aa_dict = my_aa.to_dict('index') #20, with 5 subkeys + +#================================================ +#dict of aa with their corresponding properties +#This is defined twice +#================================================ +#7 categories: no overlap +qualities1 = { ('R', 'H', 'K'): 'Basic' + , ('D', 'E'): 'Acidic' + , ('N', 'Q'): 'Amidic' + , ('G', 'A', 'V', 'L', 'I', 'P'): 'Hydrophobic' + , ('S', 'T'): 'Hydroxylic' + , ('F', 'W', 'Y'): 'Aromatic' + , ('C', 'M'): 'Sulphur' +} + +#9 categories: allowing for overlap +qualities2 = { ('R', 'H', 'K'): 'Basic' + , ('D', 'E'): 'Acidc' + , ('S', 'T', 'N', 'Q'): 'Polar' + , ('V', 'I', 'L', 'M', 'F', 'Y', 'W'): 'Hydrophobic' + , ('S', 'T', 'H', 'N', 'Q', 'E', 'D', 'K', 'R'): 'Hydrophilic' + , ('S', 'G', 'A', 'P'): 'Small' + , ('F', 'W', 'Y', 'H'): 'Aromatic' + , ('V', 'I', 'L', 'M'): 'Aliphatic' + , ('C', 'G', 'P'): 'Special' +} + +qualities_taylor = { ('R', 'H', 'K'): 'Basic' + , ('D', 'E'): 'Acidc' + , ('S', 'T', 'N', 'Q', 'C', 'Y', 'W', 'H', 'K', 'R', 'D', 'E'): 'Polar' + , ('V', 'I', 'L', 'M', 'F', 'Y', 'W', 'C', 'A', 'G', 'T', 'H'): 'Hydrophobic' + #, ('S', 'T', 'H', 'N', 'Q', 'E', 'D', 'K', 'R'): 'Hydrophilic', #C, W, y MISSING FROM POLAR! + , ('S', 'G', 'A', 'P', 'C', 'T', 'N', 'D', 'V'): 'Small' + , ('F', 'W', 'Y', 'H'): 'Aromatic' + , ('V', 'I', 'L', 'M'): 'Aliphatic' #although M is not strictly in the circle! + , ('C', 'G', 'P'): 'Special' +} + +qualities_water = { ('D', 'E', 'N', 'P', 'Q', 'R', 'S'): 'hydrophilic' + , ('A', 'C', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'T', 'V', 'W', 'X', 'Y'): 'hydrophobic' +} + +qualities_polarity = { ('D', 'E'): 'acidic' + , ('H', 'K', 'R'): 'basic' + , ('C', 'G', 'N', 'Q', 'S', 'T', 'Y'): 'neutral' + , ('A', 'F', 'I', 'L', 'M', 'P', 'V', 'W'): 'non-polar' +} + +#============================================================================== +#adding amino acid properties to my dict of dicts +for k, v in my_aa_dict.items(): + #print (k,v) + v['aa_prop1'] = str() #initialise keys + v['aa_prop2'] = list() #initialise keys (allows for overalpping properties) + v['aa_taylor'] = list() #initialise keys (allows for overalpping properties) + v['aa_prop_water'] = str() #initialise keys + v['aa_prop_polarity'] = str() #initialise keys + + for group in qualities1: + if v['one_letter_code'] in group: + v['aa_prop1']+= qualities1[group] # += for str concat + + for group in qualities2: + if v['one_letter_code'] in group: + v['aa_prop2'].append(qualities2[group]) # append to list + + for group in qualities_taylor: + if v['one_letter_code'] in group: + v['aa_taylor'].append(qualities_taylor[group]) # append to list + + for group in qualities_water: + if v['one_letter_code'] in group: + v['aa_prop_water']+= qualities_water[group] # += for str concat + + for group in qualities_polarity: + if v['one_letter_code'] in group: + v['aa_prop_polarity']+= qualities_polarity[group] # += for str concat + +#COMMENT:VOILA!!! my_aa_dict is now a dict of dicts containing all associated properties for each aa +#============================================================================== + + diff --git a/mk-drug-dirs.sh b/mk-drug-dirs.sh new file mode 100755 index 0000000..56fb820 --- /dev/null +++ b/mk-drug-dirs.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# Create a drug directory structure for processing +# +# +# Structure: +# +# $DATA_DIR/$DRUG/input +# |- original +# |- processed +# |- structure +# +# $DATA_DIR/$DRUG/output +# |- plots +# |- structure + +DATA_DIR=~/git/Data + +if [[ $1 == '' ]]; then + echo "usage: mk-drug-dirs.sh "; + exit; +else + DRUG=$1 + echo Creating structure for: $DRUG + + if [ -d $DATA_DIR ] + then + echo Doing creation in $DATA_DIR + mkdir -p $DATA_DIR/$DRUG/input/original + mkdir -p $DATA_DIR/$DRUG/input/processed + mkdir -p $DATA_DIR/$DRUG/input/structure + mkdir -p $DATA_DIR/$DRUG/output/plots + mkdir -p $DATA_DIR/$DRUG/output/structure + + else + echo "Error: $DATA_DIR does not exist. Did you check it out of git?" + exit + fi + +fi +