From bccfe68192b50ea9089f03bfc26dca775f14a30a Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Wed, 8 Jan 2020 16:15:33 +0000 Subject: [PATCH] import commit --- README.md | 59 ++ mcsm.conf | 12 + mcsm_analysis/pyrazinamide/scripts/.Rhistory | 512 ++++++++++++++ .../pyrazinamide/scripts/Header_TT.R | 129 ++++ .../scripts/barplot_colour_function.R | 27 + .../pyrazinamide/scripts/combining_two_df.R | 299 +++++++++ .../scripts/combining_two_df_lig.R | 348 ++++++++++ .../mcsm/step0_check_duplicate_SNPs.sh | 25 + ...ep1_mCSMLig_curl_submit_store_outputurl.sh | 72 ++ ...step2_mCSM_LIG_batch_outputurls_results.sh | 59 ++ ...step3a_mCSM_LIG_regex_output_formatting.sh | 52 ++ .../scripts/mcsm/step3b_format_results.py | 29 + .../scripts/mcsm/step3c_data_cleaning.R | 207 ++++++ .../scripts/mcsm/step4_normalise.R | 252 +++++++ .../scripts/mcsm_mean_stability.R | 131 ++++ .../pyrazinamide/scripts/plotting/.RData | Bin 0 -> 43777 bytes .../pyrazinamide/scripts/plotting/.Rhistory | 0 .../plotting/OR_PS_Ligand_combined_plot.R | 250 +++++++ .../scripts/plotting/barplots_2colours_LIG.R | 154 +++++ .../scripts/plotting/barplots_2colours_PS.R | 149 +++++ .../plotting/barplots_subcolours_LIG.R | 202 ++++++ .../scripts/plotting/barplots_subcolours_PS.R | 192 ++++++ .../scripts/plotting/basic_barplots_LIG.R | 215 ++++++ .../scripts/plotting/basic_barplots_PS.R | 211 ++++++ .../scripts/plotting/corr_plots_v3_PS.R | 175 +++++ .../scripts/plotting/corr_plots_v3_lig.R | 187 ++++++ .../scripts/plotting/lineage_basic_barplot.R | 227 +++++++ .../scripts/plotting/lineage_dist_LIG.R | 233 +++++++ .../scripts/plotting/lineage_dist_PS.R | 212 ++++++ mcsm_analysis/pyrazinamide/scripts/read_pdb.R | 27 + .../pyrazinamide/scripts/replaceBfactor_pdb.R | 386 +++++++++++ .../pyrazinamide/scripts/source_data_checks.R | 257 +++++++ meta_data_analysis/.Rhistory | 512 ++++++++++++++ .../__pycache__/reference_dict.cpython-37.pyc | Bin 0 -> 2203 bytes meta_data_analysis/init_data_dirs.py | 7 + meta_data_analysis/pnca_AF_and_OR_calcs.R | 241 +++++++ meta_data_analysis/pnca_data_extraction.py | 626 ++++++++++++++++++ meta_data_analysis/reference_dict.py | 121 ++++ mk-drug-dirs.sh | 40 ++ 39 files changed, 6837 insertions(+) create mode 100644 README.md create mode 100644 mcsm.conf create mode 100644 mcsm_analysis/pyrazinamide/scripts/.Rhistory create mode 100644 mcsm_analysis/pyrazinamide/scripts/Header_TT.R create mode 100644 mcsm_analysis/pyrazinamide/scripts/barplot_colour_function.R create mode 100644 mcsm_analysis/pyrazinamide/scripts/combining_two_df.R create mode 100644 mcsm_analysis/pyrazinamide/scripts/combining_two_df_lig.R create mode 100755 mcsm_analysis/pyrazinamide/scripts/mcsm/step0_check_duplicate_SNPs.sh create mode 100755 mcsm_analysis/pyrazinamide/scripts/mcsm/step1_mCSMLig_curl_submit_store_outputurl.sh create mode 100755 mcsm_analysis/pyrazinamide/scripts/mcsm/step2_mCSM_LIG_batch_outputurls_results.sh create mode 100755 mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_mCSM_LIG_regex_output_formatting.sh create mode 100755 mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_format_results.py create mode 100644 mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_data_cleaning.R create mode 100644 mcsm_analysis/pyrazinamide/scripts/mcsm/step4_normalise.R create mode 100644 mcsm_analysis/pyrazinamide/scripts/mcsm_mean_stability.R create mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/.RData create mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/.Rhistory create mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/OR_PS_Ligand_combined_plot.R create mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_LIG.R create mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_PS.R create mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_LIG.R create mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_PS.R create mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_LIG.R create mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_PS.R create mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_PS.R create mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_lig.R create mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/lineage_basic_barplot.R create mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_LIG.R create mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R create mode 100644 mcsm_analysis/pyrazinamide/scripts/read_pdb.R create mode 100644 mcsm_analysis/pyrazinamide/scripts/replaceBfactor_pdb.R create mode 100644 mcsm_analysis/pyrazinamide/scripts/source_data_checks.R create mode 100644 meta_data_analysis/.Rhistory create mode 100644 meta_data_analysis/__pycache__/reference_dict.cpython-37.pyc create mode 100755 meta_data_analysis/init_data_dirs.py create mode 100644 meta_data_analysis/pnca_AF_and_OR_calcs.R create mode 100755 meta_data_analysis/pnca_data_extraction.py create mode 100755 meta_data_analysis/reference_dict.py create mode 100755 mk-drug-dirs.sh diff --git a/README.md b/README.md new file mode 100644 index 0000000..1fe45f7 --- /dev/null +++ b/README.md @@ -0,0 +1,59 @@ +mCSM Analysis +============= + +This repo does mCSM analysis using Python (Pandas, numpy), bash and R. + +Requires an additional 'Data' directory (Supplied separately). Batteries not included. + + # directory struc + + # Assumptions + + 1. git repos are cloned to `~/git` + 2. The script + +## LSHTM\_analysis: Repo + + subdirs + + needs to call `../Data/input/original/` + + meta\_data\_analysis/ + *.R + *.py + + needs to call `../Data/input/processed/` + needs to output `../Data/output/results/` + + mcsm\_analysis/ + / (generated by `meta_data_analysis/pnca_data_extraction.py`. To be replaced with command line args or config option "soon") + scripts/ (changed from `Scripts/`) + *.R + *.py + mcsm/ + *.sh + *.py + *.R + plotting/ + *.R + + +Data: Repo: + + # subdirs + + input/ + original/ + processed/ + + output/ + / (generated by `meta_data_analysis/pnca_data_extraction.py`. To be replaced with command line args or config option "soon") + results/ + *.csv + *.xlsx + *.doc + *.txt + plots/ + structure/ + +More docs here as I write them. diff --git a/mcsm.conf b/mcsm.conf new file mode 100644 index 0000000..5310227 --- /dev/null +++ b/mcsm.conf @@ -0,0 +1,12 @@ +# This is not yet used, but will be soon :) +[DEFAULT] +mcsm_home = /home/tanu/git/github/LSHTM_analysis + +[pyrazinamide] + gene = 'pnca' + server = 'http://mcsm.melb.ac.uk/' + +[monkeyamide] + gene = 'abcs' + server = 'http://myserver.local/' + diff --git a/mcsm_analysis/pyrazinamide/scripts/.Rhistory b/mcsm_analysis/pyrazinamide/scripts/.Rhistory new file mode 100644 index 0000000..8055405 --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/.Rhistory @@ -0,0 +1,512 @@ +########################### +# you need merged_df3 +# or +# merged_df3_comp +# since these have unique SNPs +# I prefer to use the merged_df3 +# because using the _comp dataset means +# we lose some muts and at this level, we should use +# as much info as available +########################### +# uncomment as necessary +#%%%%%%%%%%%%%%%%%%%%%%%% +# REASSIGNMENT +my_df = merged_df3 +#my_df = merged_df3_comp +#%%%%%%%%%%%%%%%%%%%%%%%% +# delete variables not required +rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) +# quick checks +colnames(my_df) +str(my_df) +########################### +# Data for bfactor figure +# PS average +# Lig average +########################### +head(my_df$Position) +head(my_df$ratioDUET) +# order data frame +df = my_df[order(my_df$Position),] +head(df$Position) +head(df$ratioDUET) +#*********** +# PS: average by position +#*********** +mean_DUET_by_position <- df %>% +group_by(Position) %>% +summarize(averaged.DUET = mean(ratioDUET)) +#*********** +# Lig: average by position +#*********** +mean_Lig_by_position <- df %>% +group_by(Position) %>% +summarize(averaged.Lig = mean(ratioPredAff)) +#*********** +# cbind:mean_DUET_by_position and mean_Lig_by_position +#*********** +combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position )) +# sanity check +# mean_PS_Lig_Bfactor +colnames(combined) +colnames(combined) = c("Position" +, "average_DUETR" +, "Position2" +, "average_PredAffR") +colnames(combined) +identical(combined$Position, combined$Position2) +n = which(colnames(combined) == "Position2"); n +combined_df = combined[,-n] +max(combined_df$average_DUETR) ; min(combined_df$average_DUETR) +max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR) +#============= +# output csv +#============ +outDir = "~/Data/pyrazinamide/input/processed/" +outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv") +print(paste0("Output file with path will be:","", outFile)) +head(combined_df$Position); tail(combined_df$Position) +write.csv(combined_df, outFile +, row.names = F) +getwd() +setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") +getwd() +######################################################################## +# Installing and loading required packages # +######################################################################## +source("../Header_TT.R") +#source("barplot_colour_function.R") +require(data.table) +require(dplyr) +######################################################################## +# Read file: call script for combining df for PS # +######################################################################## +source("../combining_two_df.R") +########################### +# This will return: +# df with NA: +# merged_df2 +# merged_df3 +# df without NA: +# merged_df2_comp +# merged_df3_comp +########################### +#---------------------- PAY ATTENTION +# the above changes the working dir +#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" +#---------------------- PAY ATTENTION +########################### +# you need merged_df3 +# or +# merged_df3_comp +# since these have unique SNPs +# I prefer to use the merged_df3 +# because using the _comp dataset means +# we lose some muts and at this level, we should use +# as much info as available +########################### +# uncomment as necessary +#%%%%%%%%%%%%%%%%%%%%%%%% +# REASSIGNMENT +my_df = merged_df3 +#my_df = merged_df3_comp +#%%%%%%%%%%%%%%%%%%%%%%%% +# delete variables not required +rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) +# quick checks +colnames(my_df) +str(my_df) +########################### +# Data for bfactor figure +# PS average +# Lig average +########################### +head(my_df$Position) +head(my_df$ratioDUET) +# order data frame +df = my_df[order(my_df$Position),] +head(df$Position) +head(df$ratioDUET) +#*********** +# PS: average by position +#*********** +mean_DUET_by_position <- df %>% +group_by(Position) %>% +summarize(averaged.DUET = mean(ratioDUET)) +#*********** +# Lig: average by position +#*********** +mean_Lig_by_position <- df %>% +group_by(Position) %>% +summarize(averaged.Lig = mean(ratioPredAff)) +#*********** +# cbind:mean_DUET_by_position and mean_Lig_by_position +#*********** +combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position )) +# sanity check +# mean_PS_Lig_Bfactor +colnames(combined) +colnames(combined) = c("Position" +, "average_DUETR" +, "Position2" +, "average_PredAffR") +colnames(combined) +identical(combined$Position, combined$Position2) +n = which(colnames(combined) == "Position2"); n +combined_df = combined[,-n] +max(combined_df$average_DUETR) ; min(combined_df$average_DUETR) +max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR) +#============= +# output csv +#============ +outDir = "~/git/Data/pyrazinamide/input/processed/" +outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv") +print(paste0("Output file with path will be:","", outFile)) +head(combined_df$Position); tail(combined_df$Position) +write.csv(combined_df, outFile +, row.names = F) +# read in pdb file complex1 +inDir = "~/git/Data/pyrazinamide/input/structure" +inFile = paste0(inDir, "complex1_no_water.pdb") +# read in pdb file complex1 +inDir = "~/git/Data/pyrazinamide/input/structure/" +inFile = paste0(inDir, "complex1_no_water.pdb") +complex1 = inFile +my_pdb = read.pdb(complex1 +, maxlines = -1 +, multi = FALSE +, rm.insert = FALSE +, rm.alt = TRUE +, ATOM.only = FALSE +, hex = FALSE +, verbose = TRUE) +######################### +#3: Read complex pdb file +########################## +source("Header_TT.R") +# list of 8 +my_pdb = read.pdb(complex1 +, maxlines = -1 +, multi = FALSE +, rm.insert = FALSE +, rm.alt = TRUE +, ATOM.only = FALSE +, hex = FALSE +, verbose = TRUE) +rm(inDir, inFile) +#====== end of script +inDir = "~/git/Data/pyrazinamide/input/structure/" +inFile = paste0(inDir, "complex1_no_water.pdb") +complex1 = inFile +complex1 = inFile +my_pdb = read.pdb(complex1 +, maxlines = -1 +, multi = FALSE +, rm.insert = FALSE +, rm.alt = TRUE +, ATOM.only = FALSE +, hex = FALSE +, verbose = TRUE) +inFile +inDir = "~/git/Data/pyrazinamide/input/structure/" +inFile = paste0(inDir, "complex1_no_water.pdb") +complex1 = inFile +#inFile2 = paste0(inDir, "complex2_no_water.pdb") +#complex2 = inFile2 +# list of 8 +my_pdb = read.pdb(complex1 +, maxlines = -1 +, multi = FALSE +, rm.insert = FALSE +, rm.alt = TRUE +, ATOM.only = FALSE +, hex = FALSE +, verbose = TRUE) +rm(inDir, inFile, complex1) +getwd() +setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts") +getwd() +source("Header_TT.R") +getwd() +setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts") +getwd() +######################################################################## +# Installing and loading required packages # +######################################################################## +source("Header_TT.R") +######################################################### +# TASK: replace B-factors in the pdb file with normalised values +# use the complex file with no water as mCSM lig was +# performed on this file. You can check it in the script: read_pdb file. +######################################################### +########################### +# 2: Read file: average stability values +# or mcsm_normalised file, output of step 4 mcsm pipeline +########################### +inDir = "~/git/Data/pyrazinamide/input/processed/" +inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile +my_df <- read.csv(inFile +# , row.names = 1 +# , stringsAsFactors = F +, header = T) +str(my_df) +source("read_pdb.R") # list of 8 +# extract atom list into a variable +# since in the list this corresponds to data frame, variable will be a df +d = my_pdb[[1]] +# make a copy: required for downstream sanity checks +d2 = d +# sanity checks: B factor +max(d$b); min(d$b) +par(oma = c(3,2,3,0) +, mar = c(1,3,5,2) +, mfrow = c(3,2)) +#par(mfrow = c(3,2)) +#1: Original B-factor +hist(d$b +, xlab = "" +, main = "B-factor") +plot(density(d$b) +, xlab = "" +, main = "B-factor") +# 2: DUET scores +hist(my_df$average_DUETR +, xlab = "" +, main = "Norm_DUET") +plot(density(my_df$average_DUETR) +, xlab = "" +, main = "Norm_DUET") +# Set the margin on all sides +par(oma = c(3,2,3,0) +, mar = c(1,3,5,2) +, mfrow = c(3,2)) +#par(mfrow = c(3,2)) +#1: Original B-factor +hist(d$b +, xlab = "" +, main = "B-factor") +plot(density(d$b) +, xlab = "" +, main = "B-factor") +# 2: DUET scores +hist(my_df$average_DUETR +, xlab = "" +, main = "Norm_DUET") +plot(density(my_df$average_DUETR) +, xlab = "" +, main = "Norm_DUET") +#========= +# step 1_P1 +#========= +# Be brave and replace in place now (don't run sanity check) +# this makes all the B-factor values in the non-matched positions as NA +d$b = my_df$average_DUETR[match(d$resno, my_df$Position)] +#========= +# step 2_P1 +#========= +# count NA in Bfactor +b_na = sum(is.na(d$b)) ; b_na +# count number of 0's in Bactor +sum(d$b == 0) +# replace all NA in b factor with 0 +d$b[is.na(d$b)] = 0 +# sanity check: should be 0 +sum(is.na(d$b)) +# sanity check: should be True +if (sum(d$b == 0) == b_na){ +print ("Sanity check passed: NA's replaced with 0's successfully") +} else { +print("Error: NA replacement NOT successful, Debug code!") +} +max(d$b); min(d$b) +# sanity checks: should be True +if(max(d$b) == max(my_df$average_DUETR)){ +print("Sanity check passed: B-factors replaced correctly") +} else { +print ("Error: Debug code please") +} +if (min(d$b) == min(my_df$average_DUETR)){ +print("Sanity check passed: B-factors replaced correctly") +} else { +print ("Error: Debug code please") +} +#========= +# step 3_P1 +#========= +# sanity check: dim should be same before reassignment +# should be TRUE +dim(d) == dim(d2) +#========= +# step 4_P1 +#========= +# assign it back to the pdb file +my_pdb[[1]] = d +max(d$b); min(d$b) +#========= +# step 5_P1 +#========= +# output dir +getwd() +outDir = "~/git/Data/pyrazinamide/output/" +getwd() +outFile = paste0(outDir, "complex1_BwithNormDUET.pdb") +outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile +outDir = "~/git/Data/pyrazinamide/input/structure" +outDir = "~/git/Data/pyrazinamide/input/structure/" +outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile +write.pdb(my_pdb, outFile) +hist(d$b +, xlab = "" +, main = "repalced-B") +plot(density(d$b) +, xlab = "" +, main = "replaced-B") +# graph titles +mtext(text = "Frequency" +, side = 2 +, line = 0 +, outer = TRUE) +mtext(text = "DUET_stability" +, side = 3 +, line = 0 +, outer = TRUE) +#========================================================= +# Processing P2: Replacing B values with PredAff Scores +#========================================================= +# clear workspace +rm(list = ls()) +#========================================================= +# Processing P2: Replacing B values with PredAff Scores +#========================================================= +# clear workspace +rm(list = ls()) +########################### +# 2: Read file: average stability values +# or mcsm_normalised file, output of step 4 mcsm pipeline +########################### +inDir = "~/git/Data/pyrazinamide/input/processed/" +inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile +my_df <- read.csv("../Data/mean_PS_Lig_Bfactor.csv" +# , row.names = 1 +# , stringsAsFactors = F +, header = T) +str(my_df) +#========================================================= +# Processing P2: Replacing B factor with mean ratioLig scores +#========================================================= +######################### +# 3: Read complex pdb file +# form the R script +########################## +source("read_pdb.R") # list of 8 +# extract atom list into a vari +inDir = "~/git/Data/pyrazinamide/input/processed/" +inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile +my_df <- read.csv(inFile +# , row.names = 1 +# , stringsAsFactors = F +, header = T) +str(my_df) +# extract atom list into a variable +# since in the list this corresponds to data frame, variable will be a df +d = my_pdb[[1]] +# make a copy: required for downstream sanity checks +d2 = d +# sanity checks: B factor +max(d$b); min(d$b) +par(oma = c(3,2,3,0) +, mar = c(1,3,5,2) +, mfrow = c(3,2)) +#par(mfrow = c(3,2)) +# 1: Original B-factor +hist(d$b +, xlab = "" +, main = "B-factor") +plot(density(d$b) +, xlab = "" +, main = "B-factor") +# 2: Pred Aff scores +hist(my_df$average_PredAffR +, xlab = "" +, main = "Norm_lig_average") +plot(density(my_df$average_PredAffR) +, xlab = "" +, main = "Norm_lig_average") +# 3: After the following replacement +#******************************** +par(oma = c(3,2,3,0) +, mar = c(1,3,5,2) +, mfrow = c(3,2)) +#par(mfrow = c(3,2)) +# 1: Original B-factor +hist(d$b +, xlab = "" +, main = "B-factor") +plot(density(d$b) +, xlab = "" +, main = "B-factor") +# 2: Pred Aff scores +hist(my_df$average_PredAffR +, xlab = "" +, main = "Norm_lig_average") +plot(density(my_df$average_PredAffR) +, xlab = "" +, main = "Norm_lig_average") +# 3: After the following replacement +#******************************** +#========= +# step 1_P2: BE BRAVE and replace in place now (don't run step 0) +#========= +# this makes all the B-factor values in the non-matched positions as NA +d$b = my_df$average_PredAffR[match(d$resno, my_df$Position)] +#========= +# step 2_P2 +#========= +# count NA in Bfactor +b_na = sum(is.na(d$b)) ; b_na +# count number of 0's in Bactor +sum(d$b == 0) +# replace all NA in b factor with 0 +d$b[is.na(d$b)] = 0 +# sanity check: should be 0 +sum(is.na(d$b)) +if (sum(d$b == 0) == b_na){ +print ("Sanity check passed: NA's replaced with 0's successfully") +} else { +print("Error: NA replacement NOT successful, Debug code!") +} +max(d$b); min(d$b) +# sanity checks: should be True +if (max(d$b) == max(my_df$average_PredAffR)){ +print("Sanity check passed: B-factors replaced correctly") +} else { +print ("Error: Debug code please") +} +if (min(d$b) == min(my_df$average_PredAffR)){ +print("Sanity check passed: B-factors replaced correctly") +} else { +print ("Error: Debug code please") +} +#========= +# step 3_P2 +#========= +# sanity check: dim should be same before reassignment +# should be TRUE +dim(d) == dim(d2) +#========= +# step 4_P2 +#========= +# assign it back to the pdb file +my_pdb[[1]] = d +max(d$b); min(d$b) +#========= +# step 5_P2 +#========= +write.pdb(my_pdb, "Plotting/structure/complex1_BwithNormLIG.pdb") +# output dir +getwd() +# output dir +outDir = "~/git/Data/pyrazinamide/input/structure/" +outFile = paste0(outDir, "complex1_BwithNormLIG.pdb") +outFile = paste0(outDir, "complex1_BwithNormLIG.pdb"); outFile +write.pdb(my_pdb, outFile) diff --git a/mcsm_analysis/pyrazinamide/scripts/Header_TT.R b/mcsm_analysis/pyrazinamide/scripts/Header_TT.R new file mode 100644 index 0000000..2069fa7 --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/Header_TT.R @@ -0,0 +1,129 @@ +######################################################### +### A) Installing and loading required packages +######################################################### + +#if (!require("gplots")) { +# install.packages("gplots", dependencies = TRUE) +# library(gplots) +#} + +if (!require("tidyverse")) { + install.packages("tidyverse", dependencies = TRUE) + library(tidyverse) +} + +if (!require("ggplot2")) { + install.packages("ggplot2", dependencies = TRUE) + library(ggplot2) +} + +if (!require("cowplot")) { + install.packages("copwplot", dependencies = TRUE) + library(ggplot2) +} + +if (!require("ggcorrplot")) { + install.packages("ggcorrplot", dependencies = TRUE) + library(ggcorrplot) +} + +if (!require("ggpubr")) { + install.packages("ggpubr", dependencies = TRUE) + library(ggpubr) +} + +if (!require("RColorBrewer")) { + install.packages("RColorBrewer", dependencies = TRUE) + library(RColorBrewer) +} + +if (!require ("GOplot")) { + install.packages("GOplot") + library(GOplot) +} + +if(!require("VennDiagram")) { + + install.packages("VennDiagram", dependencies = T) + library(VennDiagram) +} + +if(!require("scales")) { + + install.packages("scales", dependencies = T) + library(scales) +} + +if(!require("plotrix")) { + + install.packages("plotrix", dependencies = T) + library(plotrix) +} + +if(!require("stats")) { + + install.packages("stats", dependencies = T) + library(stats) +} + +if(!require("stats4")) { + + install.packages("stats4", dependencies = T) + library(stats4) +} + +if(!require("data.table")) { + library(stats4) +} + +if (!require("PerformanceAnalytics")){ + install.packages("PerformanceAnalytics", dependencies = T) + library(PerformaceAnalytics) +} + +if (!require ("GGally")){ + install.packages("GGally") + library(GGally) +} + +if (!require ("corrr")){ + install.packages("corrr") + library(corrr) +} + +if (!require ("psych")){ + install.packages("psych") + library(psych) +} + +if (!require ("dplyr")){ + install.packages("dplyr") + library(psych) +} + + +if (!require ("compare")){ + install.packages("compare") + library(psych) +} + +if (!require ("arsenal")){ + install.packages("arsenal") + library(psych) +} + + +####TIDYVERSE +# Install +#if(!require(devtools)) install.packages("devtools") +#devtools::install_github("kassambara/ggcorrplot") + +library(ggcorrplot) + + +###for PDB files +#install.packages("bio3d") +if(!require(bio3d)){ + install.packages("bio3d") + library(bio3d) +} diff --git a/mcsm_analysis/pyrazinamide/scripts/barplot_colour_function.R b/mcsm_analysis/pyrazinamide/scripts/barplot_colour_function.R new file mode 100644 index 0000000..a3cc403 --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/barplot_colour_function.R @@ -0,0 +1,27 @@ +######################################################### +# 1b: Define function: coloured barplot by subgroup +# LINK: https://stackoverflow.com/questions/49818271/stacked-barplot-with-colour-gradients-for-each-bar +######################################################### + +ColourPalleteMulti <- function(df, group, subgroup){ + + # Find how many colour categories to create and the number of colours in each + categories <- aggregate(as.formula(paste(subgroup, group, sep="~" )) + , df + , function(x) length(unique(x))) + # return(categories) } + + category.start <- (scales::hue_pal(l = 100)(nrow(categories))) # Set the top of the colour pallete + + category.end <- (scales::hue_pal(l = 40)(nrow(categories))) # set the bottom + + #return(category.start); return(category.end)} + + # Build Colour pallette + colours <- unlist(lapply(1:nrow(categories), + function(i){ + colorRampPalette(colors = c(category.start[i] + , category.end[i]))(categories[i,2])})) + return(colours) +} +######################################################### \ No newline at end of file diff --git a/mcsm_analysis/pyrazinamide/scripts/combining_two_df.R b/mcsm_analysis/pyrazinamide/scripts/combining_two_df.R new file mode 100644 index 0000000..57e476a --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/combining_two_df.R @@ -0,0 +1,299 @@ +getwd() +setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/") +getwd() + +######################################################### +# TASK: To combine mcsm and meta data with af and or +######################################################### + + +######################################################################## +# Installing and loading required packages # +######################################################################## + +source("Header_TT.R") +#require(data.table) +#require(arsenal) +#require(compare) +#library(tidyverse) + +################################# +# Read file: normalised file +# output of step 4 mcsm_pipeline +################################# + +inDir = "~/git/Data/pyrazinamide/input/processed/" +inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile + +mcsm_data = read.csv(inFile + , row.names = 1 + , stringsAsFactors = F + , header = T) +rm(inDir, inFile) + +str(mcsm_data) + +table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) ) + +# spelling Correction 1: DUET +mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Stabilizing'] <- 'Stabilising' +mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Destabilizing'] <- 'Destabilising' + +# checks: should be the same as above +table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) ) +head(mcsm_data$DUET_outcome); tail(mcsm_data$DUET_outcome) + +# spelling Correction 2: Ligand +table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) ) + +mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Stabilizing'] <- 'Stabilising' +mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Destabilizing'] <- 'Destabilising' + +# checks: should be the same as above +table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) ) +head(mcsm_data$Lig_outcome); tail(mcsm_data$Lig_outcome) + +# count na in each column +na_count = sapply(mcsm_data, function(y) sum(length(which(is.na(y))))); na_count + +# sort by Mutationinformation +mcsm_data = mcsm_data[order(mcsm_data$Mutationinformation),] +head(mcsm_data$Mutationinformation) + +# get freq count of positions and add to the df +setDT(mcsm_data)[, occurrence := .N, by = .(Position)] + +pos_count_check = data.frame(mcsm_data$Position, mcsm_data$occurrence) + +########################### +# 2: Read file: meta data with AFandOR +########################### + +inDir = "~/git/Data/pyrazinamide/input/processed/" +inFile2 = paste0(inDir, "meta_data_with_AFandOR.csv"); inFile2 + +meta_with_afor <- read.csv(inFile2 + , stringsAsFactors = F + , header = T) + +rm(inDir, inFile2) + +str(meta_with_afor) + +# sort by Mutationinformation +head(meta_with_afor$Mutationinformation) +meta_with_afor = meta_with_afor[order(meta_with_afor$Mutationinformation),] +head(meta_with_afor$Mutationinformation) + +# sanity check: should be True for all the mentioned columns +#is.numeric(meta_with_afor$OR) +na_var = c("AF", "OR", "pvalue", "logor", "neglog10pvalue") + +c1 = NULL +for (i in na_var){ + print(i) + c0 = is.numeric(meta_with_afor[,i]) + c1 = c(c0, c1) + if ( all(c1) ){ + print("Sanity check passed: These are all numeric cols") + } else{ + print("Error: Please check your respective data types") + } +} + +# If OR, and P value are not numeric, then convert to numeric and then count +# else they will say 0 +na_count = sapply(meta_with_afor, function(y) sum(length(which(is.na(y))))); na_count +str(na_count) + +# compare if the No of "NA" are the same for all these cols +na_len = NULL +for (i in na_var){ + temp = na_count[[i]] + na_len = c(na_len, temp) +} + +# extract how many NAs: +# should be all TRUE +# should be a single number since +# all the cols should have "equal" and "same" no. of NAs + +my_nrows = NULL +for ( i in 1: (length(na_len)-1) ){ + #print(compare(na_len[i]), na_len[i+1]) + c = compare(na_len[i], na_len[i+1]) + if ( c$result ) { + my_nrows = na_len[i] } + else { + print("Error: Please check your numbers") + } +} + +my_nrows + +#=#=#=#=#=#=#=#=# +# COMMENT: AF, OR, pvalue, logor and neglog10pvalue +# these are the same 7 ones +#=#=#=#=#=#=#=#=# + +# sanity check +#which(is.na(meta_with_afor$OR)) + +# initialise an empty df with nrows as extracted above +na_count_df = data.frame(matrix(vector(mode = 'numeric' +# , length = length(na_var) + ) + , nrow = my_nrows +# , ncol = length(na_var) + )) + +# populate the df with the indices of the cols that are NA +for (i in na_var){ + print(i) + na_i = which(is.na(meta_with_afor[i])) + na_count_df = cbind(na_count_df, na_i) + colnames(na_count_df)[which(na_var == i)] <- i +} + +# Now compare these indices to ensure these are the same +c2 = NULL +for ( i in 1: ( length(na_count_df)-1 ) ) { +# print(na_count_df[i] == na_count_df[i+1]) + c1 = identical(na_count_df[[i]], na_count_df[[i+1]]) + c2 = c(c1, c2) + if ( all(c2) ) { + print("Sanity check passed: The indices for AF, OR, etc are all the same") + } else { + print ("Error: Please check indices which are NA") + } +} + +rm( c, c0, c1, c2, i, my_nrows + , na_count, na_i, na_len + , na_var, temp + , na_count_df + , pos_count_check ) + +########################### +# 3:merging two dfs: with NA +########################### + +# link col name = Mutationinforamtion +head(mcsm_data$Mutationinformation) +head(meta_with_afor$Mutationinformation) + +######### +# merge 1a: meta data with mcsm +######### +merged_df2 = merge(x = meta_with_afor + ,y = mcsm_data + , by = "Mutationinformation" + , all.y = T) + +head(merged_df2$Position) + +# sort by Position +head(merged_df2$Position) +merged_df2 = merged_df2[order(merged_df2$Position),] +head(merged_df2$Position) + +merged_df2v2 = merge(x = meta_with_afor + ,y = mcsm_data + , by = "Mutationinformation" + , all.x = T) +#!=!=!=!=!=!=!=! +# COMMENT: used all.y since position 186 is not part of the struc, +# hence doesn't have a mcsm value +# but 186 is associated with with mutation +#!=!=!=!=!=!=!=! + +# should be False +identical(merged_df2, merged_df2v2) +table(merged_df2$Position%in%merged_df2v2$Position) + +rm(merged_df2v2) + +######### +# merge 1b:remove duplicate mutation information +######### + +#==#=#=#=#=#=# +# Cannot trust lineage, country from this df as the same mutation +# can have many different lineages +# but this should be good for the numerical corr plots +#=#=#=#=#=#=#= +merged_df3 = merged_df2[!duplicated(merged_df2$Mutationinformation),] +head(merged_df3$Position); tail(merged_df3$Position) # should be sorted + +# sanity checks +# nrows of merged_df3 should be the same as the nrows of mcsm_data +if(nrow(mcsm_data) == nrow(merged_df3)){ + print("sanity check: Passed") +} else { + print("Error!: check data, nrows is not as expected") +} + +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +# uncomment as necessary +# only need to run this if merged_df2v2 i.e non structural pos included +#mcsm = mcsm_data$Mutationinformation +#my_merged = merged_df3$Mutationinformation + +# find the index where it differs +#diff_n = which(!my_merged%in%mcsm) + +#check if it is indeed pos 186 +#merged_df3[diff_n,] + +# remove this entry +#merged_df3 = merged_df3[-diff_n,]] +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +########################### +# 3b :merging two dfs: without NA +########################### + +######### +# merge 2a:same as merge 1 but excluding NA +######### +merged_df2_comp = merged_df2[!is.na(merged_df2$AF),] + +######### +# merge 2b: remove duplicate mutation information +######### +merged_df3_comp = merged_df2_comp[!duplicated(merged_df2_comp$Mutationinformation),] + +# alternate way of deriving merged_df3_comp +foo = merged_df3[!is.na(merged_df3$AF),] +# compare dfs: foo and merged_df3_com +all.equal(foo, merged_df3) + +summary(comparedf(foo, merged_df3)) + +#=============== end of combining df +#clear variables +rm(mcsm_data + , meta_with_afor + , foo) + +#rm(diff_n, my_merged, mcsm) + +#===================== +# write_output files +#===================== +# output dir +outDir = "~/git/Data/pyrazinamide/output/" +getwd() + +outFile1 = paste0(outDir, "merged_df3.csv"); outFile1 +write.csv(merged_df3, outFile1) + +#outFile2 = paste0(outDir, "merged_df3_comp.csv"); outFile2 +#write.csv(merged_df3_comp, outFile2) + +rm(outDir + , outFile1 +# , outFile2 +) +#============================= end of script + diff --git a/mcsm_analysis/pyrazinamide/scripts/combining_two_df_lig.R b/mcsm_analysis/pyrazinamide/scripts/combining_two_df_lig.R new file mode 100644 index 0000000..1dccd65 --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/combining_two_df_lig.R @@ -0,0 +1,348 @@ +getwd() +setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/") +getwd() + +######################################################### +# TASK: To combine mcsm and meta data with af and or +# by filtering for distance to ligand (<10Ang) +######################################################### + +######################################################### +# Installing and loading required packages +######################################################### + +#source("Header_TT.R") +#require(data.table) +#require(arsenal) +#require(compare) +#library(tidyverse) + +################################# +# Read file: normalised file +# output of step 4 mcsm_pipeline +################################# + +inDir = "~/git/Data/pyrazinamide/input/processed/" +inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile + +mcsm_data = read.csv(inFile + , row.names = 1 + , stringsAsFactors = F + , header = T) +rm(inDir, inFile) + +str(mcsm_data) + +table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) ) + +# spelling Correction 1: DUET +mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Stabilizing'] <- 'Stabilising' +mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Destabilizing'] <- 'Destabilising' + +# checks +table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) ) +head(mcsm_data$DUET_outcome); tail(mcsm_data$DUET_outcome) + +# spelling Correction 2: Ligand +table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) ) + +mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Stabilizing'] <- 'Stabilising' +mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Destabilizing'] <- 'Destabilising' + +# checks: should be the same as above +table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) ) +head(mcsm_data$Lig_outcome); tail(mcsm_data$Lig_outcome) + +########################### !!! only for mcsm_lig +# 4: Filter/subset data +# Lig plots < 10Ang +# Filter the lig plots for Dis_to_lig < 10Ang +########################### + +# check range of distances +max(mcsm_data$Dis_lig_Ang) +min(mcsm_data$Dis_lig_Ang) + +# count +table(mcsm_data$Dis_lig_Ang<10) + +# subset data to have only values less than 10 Ang +mcsm_data2 = subset(mcsm_data, mcsm_data$Dis_lig_Ang < 10) + +# sanity checks +max(mcsm_data2$Dis_lig_Ang) +min(mcsm_data2$Dis_lig_Ang) + +# count no of unique positions +length(unique(mcsm_data2$Position)) + +# count no of unique mutations +length(unique(mcsm_data2$Mutationinformation)) + +# count Destabilisinga and stabilising +table(mcsm_data2$Lig_outcome) #{RESULT: no of mutations within 10Ang} + +#<<<<<<<<<<<<<<<<<<<<<<<<<<< +# REASSIGNMENT: so as not to alter the script +mcsm_data = mcsm_data2 +#<<<<<<<<<<<<<<<<<<<<<<<<<<< + +############################# +# Extra sanity check: +# for mcsm_lig ONLY +# Dis_lig_Ang should be <10 +############################# + +if (max(mcsm_data$Dis_lig_Ang) < 10){ + print ("Sanity check passed: lig data is <10Ang") +}else{ + print ("Error: data should be filtered to be within 10Ang") +} + +# clear variables +rm(mcsm_data2) + +# count na in each column +na_count = sapply(mcsm_data, function(y) sum(length(which(is.na(y))))); na_count + +head(mcsm_data$Mutationinformation) +mcsm_data[mcsm_data$Mutationinformation=="Q10P",] +mcsm_data[mcsm_data$Mutationinformation=="L4S",] + +# sort by Mutationinformation +mcsm_data = mcsm_data[order(mcsm_data$Mutationinformation),] +head(mcsm_data$Mutationinformation) + +# check +mcsm_data[grep("Q10P", mcsm_data$Mutationinformation),] +mcsm_data[grep("A102T", mcsm_data$Mutationinformation),] + +# get freq count of positions and add to the df +setDT(mcsm_data)[, occurrence := .N, by = .(Position)] + +pos_count_check = data.frame(mcsm_data$Position, mcsm_data$occurrence) + +########################### +# 2: Read file: meta data with AFandOR +########################### + +inDir = "~/git/Data/pyrazinamide/input/processed/" +inFile2 = paste0(inDir, "meta_data_with_AFandOR.csv"); inFile2 + +meta_with_afor <- read.csv(inFile2 + , stringsAsFactors = F + , header = T) + +str(meta_with_afor) + +# sort by Mutationinformation +head(meta_with_afor$Mutationinformation) +meta_with_afor = meta_with_afor[order(meta_with_afor$Mutationinformation),] +head(meta_with_afor$Mutationinformation) + +# sanity check: should be True for all the mentioned columns +#is.numeric(meta_with_afor$OR) +na_var = c("AF", "OR", "pvalue", "logor", "neglog10pvalue") + +c1 = NULL +for (i in na_var){ + print(i) + c0 = is.numeric(meta_with_afor[,i]) + c1 = c(c0, c1) + if ( all(c1) ){ + print("Sanity check passed: These are all numeric cols") + } else{ + print("Error: Please check your respective data types") + } +} + +# If OR, and P value are not numeric, then convert to numeric and then count +# else they will say 0 + +# NOW count na in each column: if you did it before, then +# OR and Pvalue column would say 0 na since these were not numeric +na_count = sapply(meta_with_afor, function(y) sum(length(which(is.na(y))))); na_count +str(na_count) + +# compare if the No of "NA" are the same for all these cols +na_len = NULL +na_var = c("AF", "OR", "pvalue", "logor", "neglog10pvalue") +for (i in na_var){ + temp = na_count[[i]] + na_len = c(na_len, temp) +} + +my_nrows = NULL + +for ( i in 1: (length(na_len)-1) ){ + #print(compare(na_len[i]), na_len[i+1]) + c = compare(na_len[i], na_len[i+1]) + if ( c$result ) { + my_nrows = na_len[i] } + else { + print("Error: Please check your numbers") + } +} + +my_nrows + +#=#=#=#=#=#=#=#=# +# COMMENT: AF, OR, pvalue, logor and neglog10pvalue +# all have 81 NA, with pyrazinamide with 960 +# and these are the same 7 ones +#=#=#=#=#=#=#=#=# + +# sanity check +#which(is.na(meta_with_afor$OR)) + +# initialise an empty df with nrows as extracted above +na_count_df = data.frame(matrix(vector(mode = 'numeric' +# , length = length(na_var) + ) + , nrow = my_nrows +# , ncol = length(na_var) + )) + +# populate the df with the indices of the cols that are NA +for (i in na_var){ + print(i) + na_i = which(is.na(meta_with_afor[i])) + na_count_df = cbind(na_count_df, na_i) + colnames(na_count_df)[which(na_var == i)] <- i +} + +# Now compare these indices to ensure these are the same +c2 = NULL +for ( i in 1: ( length(na_count_df)-1 ) ) { + # print(na_count_df[i] == na_count_df[i+1]) + c1 = identical(na_count_df[[i]], na_count_df[[i+1]]) + c2 = c(c1, c2) + if ( all(c2) ) { + print("Sanity check passed: The indices for AF, OR, etc are all the same") + } else { + print ("Error: Please check indices which are NA") + } +} + +rm( c, c1, c2, i, my_nrows + , na_count, na_i, na_len + , na_var, temp + , na_count_df + , pos_count_check ) + +########################### +# 3:merging two dfs: with NA +########################### + +# link col name = Mutationinforamtion +head(mcsm_data$Mutationinformation) +head(meta_with_afor$Mutationinformation) + +######### +# merge 1a: meta data with mcsm +######### +merged_df2 = merge(x = meta_with_afor + , y = mcsm_data + , by = "Mutationinformation" + , all.y = T) + +head(merged_df2$Position) + +# sort by Position +head(merged_df2$Position) +merged_df2 = merged_df2[order(merged_df2$Position),] +head(merged_df2$Position) + +merged_df2v2 = merge(x = meta_with_afor + ,y = mcsm_data + , by = "Mutationinformation" + , all.x = T) + +#!=!=!=!=!=!=!=! +# COMMENT: used all.y since position 186 is not part of the struc, +# hence doesn't have a mcsm value +# but 186 is associated with with mutation +#!=!=!=!=!=!=!=! + +# should be False +identical(merged_df2, merged_df2v2) +table(merged_df2$Position%in%merged_df2v2$Position) + +rm(merged_df2v2) + +######### +# merge 1b:remove duplicate mutation information +######### + +#==#=#=#=#=#=# +# Cannot trust lineage, country from this df as the same mutation +# can have many different lineages +# but this should be good for the numerical corr plots +#=#=#=#=#=#=#= +merged_df3 = merged_df2[!duplicated(merged_df2$Mutationinformation),] +head(merged_df3$Position) ; tail(merged_df3$Position) # should be sorted + +# sanity checks +# nrows of merged_df3 should be the same as the nrows of mcsm_data +if(nrow(mcsm_data) == nrow(merged_df3)){ + print("sanity check: Passed") +} else { + print("Error!: check data, nrows is not as expected") +} + +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +# uncomment as necessary +# only need to run this if merged_df2v2 i.e non structural pos included +#mcsm = mcsm_data$Mutationinformation +#my_merged = merged_df3$Mutationinformation + +# find the index where it differs +#diff_n = which(!my_merged%in%mcsm) + +#check if it is indeed pos 186 +#merged_df3[diff_n,] + +# remove this entry +#merged_df3 = merged_df3[-diff_n,] +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +########################### +# 3b :merging two dfs: without NA +########################### + +######### +# merge 2a:same as merge 1 but excluding NA +######### +merged_df2_comp = merged_df2[!is.na(merged_df2$AF),] + +######### +# merge 2b: remove duplicate mutation information +######### +merged_df3_comp = merged_df2_comp[!duplicated(merged_df2_comp$Mutationinformation),] + +# FIXME: add this as a sanity check. I have manually checked! + +# alternate way of deriving merged_df3_comp +foo = merged_df3[!is.na(merged_df3$AF),] + +# compare dfs: foo and merged_df3_com +all.equal(foo, merged_df3) + +summary(comparedf(foo, merged_df3)) + +#=============== end of combining df +#clear variables +rm(mcsm_data + , meta_with_afor + , foo) + +#rm(diff_n, my_merged, mcsm) + +#===============end of script + +#===================== +# write_output files +#===================== + +# Not required as this is a subset of the "combining_two_df.R" script + diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step0_check_duplicate_SNPs.sh b/mcsm_analysis/pyrazinamide/scripts/mcsm/step0_check_duplicate_SNPs.sh new file mode 100755 index 0000000..22fada7 --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step0_check_duplicate_SNPs.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +#************************************* +# need to be in the correct directory +#************************************* +##: comments for code +#: commented out code + +#********************************************************************** +# TASK: Text file containing a list of SNPs; SNP in the format(C2E) +# per line. Sort by unique, which automatically removes duplicates. +# sace file in current directory +#********************************************************************** +infile="${HOME}/git/Data/input/processed/pyrazinamide/pnca_mis_SNPs_v2.csv" +outfile="${HOME}/git/Data/input/processed/pyrazinamide/pnca_mis_SNPs_v2_unique.csv" + +# sort unique entries and output to current directory +sort -u ${infile} > ${outfile} + +# count no. of unique snps mCSM will run on +count=$(wc -l < ${outfile}) + +# print to console no. of unique snps mCSM will run on +echo "${count} unique mutations for mCSM to run on" + diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step1_mCSMLig_curl_submit_store_outputurl.sh b/mcsm_analysis/pyrazinamide/scripts/mcsm/step1_mCSMLig_curl_submit_store_outputurl.sh new file mode 100755 index 0000000..87f4265 --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step1_mCSMLig_curl_submit_store_outputurl.sh @@ -0,0 +1,72 @@ +#!/bin/bash + +#************************************* +#need to be in the correct directory +#************************************* +##: comments for code +#: commented out code + +#********************************************************************** +# TASK: submit requests using curl: HANDLE redirects and refresh url. +# Iterate over mutation file and write/append result urls to a file +# result url file: stored in the /Results directory +# mutation file: one mutation per line, no chain ID +# output: in a file, should be n urls (n=no. of mutations in file) +# NOTE: these are just result urls, not actual values for results +#********************************************************************** +## iterate over mutation file; line by line and submit query using curl +filename="../Data/pnca_mis_SNPs_v2_unique.csv" + +## some useful messages +echo -n -e "Processing $(wc -l < ${filename}) entries from ${filename}\n" +COUNT=0 +while read -r line; do +((COUNT++)) + mutation="${line}" +# echo "${mutation}" +pdb='../Data/complex1_no_water.pdb' +mutation="${mutation}" +chain="A" +lig_id="PZA" +affin_wt="0.99" +host="http://biosig.unimelb.edu.au" +call_url="/mcsm_lig/prediction" + +##========================================= +##html field_names names required for curl +##complex_field:wild=@ +##mutation_field:mutation=@ +##chain_field:chain=@ +##ligand_field:lig_id@ +##energy_field:affin_wt +#========================================= +refresh_url=$(curl -L \ + -sS \ + -F "wild=@${pdb}" \ + -F "mutation=${mutation}" \ + -F "chain=${chain}" \ + -F "lig_id=${lig_id}" \ + -F "affin_wt=${affin_wt}" \ + ${host}${call_url} | grep "http-equiv") + +#echo $refresh_url +#echo ${host}${refresh_url} + +#use regex to extract the relevant bit from the refresh url +#regex:sed -r 's/.*(\/mcsm.*)".*$/\1/g' + +#Now build: result url using host and refresh url and write the urls to a file in the Results dir +result_url=$(echo $refresh_url | sed -r 's/.*(\/mcsm.*)".*$/\1/g') +sleep 10 + +echo -e "${mutation} : processing entry ${COUNT}/$(wc -l < ${filename})..." + +echo -e "${host}${result_url}" >> ../Results/$(wc -l < ${filename})_mCSM_lig_complex1_result_url.txt +#echo -n '.' +done < "${filename}" + +echo +echo "Processing Complete" + +##end of submitting query, receiving result url and storing results url in a file + diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_mCSM_LIG_batch_outputurls_results.sh b/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_mCSM_LIG_batch_outputurls_results.sh new file mode 100755 index 0000000..e250fe8 --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_mCSM_LIG_batch_outputurls_results.sh @@ -0,0 +1,59 @@ +#!/bin/bash +#************************************* +#need to be in the correct directory +#************************************* +##: comments for code +#: commented out code + +#******************************************************************** +# TASK: submit result urls and fetch actual results using curl +# iterate over each result url from the output of step1 in the stored +# in file in /Results. +# Use curl to fetch results and extract relevant sections using hxtools +# and store these in another file in /Results +# This script takes two arguments: +# input file: file containing results url +# In this case: 336_mCSM_lig_complex1_result_url.txt +# output file: name of the file where extracted results will be stored +# In this case : it is 336_mCSM_lig_complex1_output_MASTER.txt +#********************************************************************* + +#if [ "$#" -ne 2 ]; then + #if [ -Z $1 ]; then +# echo " +# Please provide both Input and Output files. + +# Usage: batch_read_urls.sh INFILE OUTFILE +# " +# exit 1 +#fi + +# First argument: Input File +# Second argument: Output File +#infile=$1 +#outfile=$2 + +infile="${HOME}/git/LSHTM_analysis/mcsm_complex1/Results/336_mCSM_lig_complex1_result_url.txt" +outfile="${HOME}/git/LSHTM_analysis/mcsm_complex1/Results/336_mCSM_lig_complex1_output_MASTER.txt" + +echo -n "Processing $(wc -l < ${infile}) entries from ${infile}" +echo +COUNT=0 +while read -r line; do +#COUNT=$(($COUNT+1)) +((COUNT++)) + curl --silent ${line} \ + | hxnormalize -x \ + | hxselect -c div.span4 \ + | hxselect -c div.well \ + | sed -r -e 's/<[^>]*>//g' \ + | sed -re 's/ +//g' \ + >> ${outfile} + #| tee -a ${outfile} +# echo -n '.' +echo -e "Processing entry ${COUNT}/$(wc -l < ${infile})..." + +done < "${infile}" + +echo +echo "Processing Complete" diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_mCSM_LIG_regex_output_formatting.sh b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_mCSM_LIG_regex_output_formatting.sh new file mode 100755 index 0000000..78dbdf5 --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_mCSM_LIG_regex_output_formatting.sh @@ -0,0 +1,52 @@ +#!/bin/bash +#************************************* +#need to be in the correct directory +#************************************* +##: comments for code +#: commented out code + +#******************************************************************** +# TASK: Intermediate results processing +# output file has a convenient delimiter of ":" that can be used to +# format the file into two columns (col1: field_desc and col2: values) +# However the section "PredictedAffinityChange:...." and +# "DUETstabilitychange:.." are split over multiple lines and +# prevent this from happening.Additionally there are other empty lines +# that need to be omiited. In order ensure these sections are not split +# over multiple lines, this script is written. +#********************************************************************* + +infile="../Results/336_mCSM_lig_complex1_output_processed.txt" + +#sed -i '/PredictedAffinityChange:/ { N; N; N; N; s/\n//g;}' ${infile} \ +# | sed -i '/DUETstabilitychange:/ {x; N; N; s/\n//g; p;d;}' ${infile} + +# Outputs records separated by a newline, that look something like this: +# PredictedAffinityChange:-2.2log(affinityfoldchange)-Destabilizing +# Mutationinformation: +# Wild-type:L +# Position:4 +# Mutant-type:W +# Chain:A +# LigandID:PZA +# Distancetoligand:15.911Å +# DUETstabilitychange:-2.169Kcal/mol +# +# PredictedAffinityChange:-1.538log(affinityfoldchange)-Destabilizing +# (...etc) + +# This script brings everything in a convenient format for further processing in python. +# bear in mind, this replaces the file in place, so make sure you retain a copy for your records +sed -i '/PredictedAffinityChange/ { +N +N +N +N +s/\n//g +} +/DUETstabilitychange:/ { +N +N +s/\n//g +} +/^$/d' ${infile} diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_format_results.py b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_format_results.py new file mode 100755 index 0000000..a780576 --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_format_results.py @@ -0,0 +1,29 @@ +#!/usr/bin/python +import pandas as pd +from collections import defaultdict + +#file = r'../Results/322_mCSM_lig_complex1_output_processed.txt' + +outCols=[ + 'PredictedAffinityChange', + 'Mutationinformation', + 'Wild-type', + 'Position', + 'Mutant-type', + 'Chain', + 'LigandID', + 'Distancetoligand', + 'DUETstabilitychange' + ] + +lines = [line.rstrip('\n') for line in open('../Results/336_mCSM_lig_complex1_output_processed.txt')] + +outputs = defaultdict(list) + +for item in lines: + col, val = item.split(':') + outputs[col].append(val) + +dfOut=pd.DataFrame(outputs) + +pd.DataFrame.to_csv(dfOut,'../Results/336_complex1_formatted_results.csv', columns=outCols) diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_data_cleaning.R b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_data_cleaning.R new file mode 100644 index 0000000..4876b5e --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_data_cleaning.R @@ -0,0 +1,207 @@ +getwd() +#setwd("~/Documents/git/LSHTM_analysis/mcsm_complex1/Results") # work +setwd("~/git/LSHTM_analysis/mcsm_complex1/Results") # thinkpad +#setwd("/Users/tanu/git/LSHTM_analysis/mcsm_complex1/Results") # mac +getwd() + +#======================================================= +#TASK: To tidy the columns so you can generate figures +#======================================================= +#################### +#### read file #####: this will be the output from python script (csv file) +#################### +data = read.csv("336_complex1_formatted_results.csv" + , header = T + , stringsAsFactors = FALSE) +dim(data) +#335, 10 +str(data) + +########################### +##### Data processing ##### +########################### + +# populate mutation information columns as currently it is empty +head(data$Mutationinformation) +tail(data$Mutationinformation) + +# should not be blank: create muation information +data$Mutationinformation = paste0(data$Wild.type, data$Position, data$Mutant.type) + +head(data$Mutationinformation) +tail(data$Mutationinformation) +#write.csv(data, 'test.csv') +########################################## +# Remove duplicate SNPs as a sanity check +########################################## +#very important +table(duplicated(data$Mutationinformation)) +#FALSE +#335 + +#extract duplicated entries +dups = data[duplicated(data$Mutationinformation),] #0 + +#No of dups should match with the no. of TRUE in the above table +#u_dups = unique(dups$Mutationinformation) #10 +sum( table(dups$Mutationinformation) ) #13 + +rm(dups) + +#*************************************************************** +#select non-duplicated SNPs and create a new df +df = data[!duplicated(data$Mutationinformation),] #309, 10 +#*************************************************************** +#sanity check +u = unique(df$Mutationinformation) +u2 = unique(data$Mutationinformation) +table(u%in%u2) +#TRUE +#309 +#should all be 1, hence 309 1's +sum(table(df$Mutationinformation) == 1) + +#sort df by Position +#MANUAL CHECKPOINT: +#foo <- df[order(df$Position),] +#df <- df[order(df$Position),] + +rm(u, u2, dups) + +#################### +#### give meaningful colnames to reflect units to enable correct data type +#################### + +#======= +#STEP 1 +#======== +#make a copy of the PredictedAffinityColumn and call it Lig_outcome +df$Lig_outcome = df$PredictedAffinityChange #335, 11 + +#make Predicted...column numeric and outcome column categorical +head(df$PredictedAffinityChange) +df$PredictedAffinityChange = gsub("log.*" + , "" + , df$PredictedAffinityChange) + +#sanity checks +head(df$PredictedAffinityChange) + +#should be numeric, check and if not make it numeric +is.numeric( df$PredictedAffinityChange ) +#change to numeric +df$PredictedAffinityChange = as.numeric(df$PredictedAffinityChange) +#should be TRUE +is.numeric( df$PredictedAffinityChange ) + +#change the column name to indicate units +n = which(colnames(df) == "PredictedAffinityChange"); n +colnames(df)[n] = "PredAffLog" +colnames(df)[n] + +#======== +#STEP 2 +#======== +#make Lig_outcome column categorical showing effect of mutation +head(df$Lig_outcome) +df$Lig_outcome = gsub("^.*-" + , "", + df$Lig_outcome) +#sanity checks +head(df$Lig_outcome) +#should be factor, check and if not change it to factor +is.factor(df$Lig_outcome) +#change to factor +df$Lig_outcome = as.factor(df$Lig_outcome) +#should be TRUE +is.factor(df$Lig_outcome) + +#======== +#STEP 3 +#======== +#gsub +head(df$Distancetoligand) +df$Distancetoligand = gsub("Å" + , "" + , df$Distancetoligand) +#sanity checks +head(df$Distancetoligand) +#should be numeric, check if not change it to numeric +is.numeric(df$Distancetoligand) +#change to numeric +df$Distancetoligand = as.numeric(df$Distancetoligand) +#should be TRUE +is.numeric(df$Distancetoligand) + +#change the column name to indicate units +n = which(colnames(df) == "Distancetoligand") +colnames(df)[n] <- "Dis_lig_Ang" +colnames(df)[n] + +#======== +#STEP 4 +#======== +#gsub +head(df$DUETstabilitychange) +df$DUETstabilitychange = gsub("Kcal/mol" + , "" + , df$DUETstabilitychange) +#sanity checks +head(df$DUETstabilitychange) +#should be numeric, check if not change it to numeric +is.numeric(df$DUETstabilitychange) +#change to numeric +df$DUETstabilitychange = as.numeric(df$DUETstabilitychange) +#should be TRUE +is.numeric(df$DUETstabilitychange) + +#change the column name to indicate units +n = which(colnames(df) == "DUETstabilitychange"); n +colnames(df)[n] = "DUETStability_Kcalpermol" +colnames(df)[n] + +#======== +#STEP 5 +#======== +#create yet another extra column: classification of DUET stability only +df$DUET_outcome = ifelse(df$DUETStability_Kcalpermol >=0 + , "Stabilizing" + , "Destabilizing") #335, 12 + +table(df$Lig_outcome) +#Destabilizing Stabilizing +#281 54 + +table(df$DUET_outcome) +#Destabilizing Stabilizing +#288 47 +#============================== +#FIXME +#Insert a venn diagram + +#================================ + + +#======== +#STEP 6 +#======== +# assign wild and mutant colnames correctly + +wt = which(colnames(df) == "Wild.type"); wt +colnames(df)[wt] <- "Wild_type" +colnames(df[wt]) + +mut = which(colnames(df) == "Mutant.type"); mut +colnames(df)[mut] <- "Mutant_type" +colnames(df[mut]) + +#======== +#STEP 7 +#======== +#create an extra column: maybe useful for some plots +df$WildPos = paste0(df$Wild_type, df$Position) #335, 13 + +#clear variables +rm(n, wt, mut) + +################ end of data cleaning diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step4_normalise.R b/mcsm_analysis/pyrazinamide/scripts/mcsm/step4_normalise.R new file mode 100644 index 0000000..4721e29 --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step4_normalise.R @@ -0,0 +1,252 @@ +getwd() +#setwd("~/Documents/git/LSHTM_analysis/mcsm_complex1/Results") # work +setwd("~/git/LSHTM_analysis/mcsm_complex1/Results") # thinkpad +#setwd("/Users/tanu/git/LSHTM_analysis/mcsm_complex1/Results") # mac +getwd() + +#======================================================= +#TASK:read cleaned data and perform rescaling + # of DUET stability scores + # of Pred affinity +#compare scaling methods with plots +#output normalised file +#======================================================= + +#################### +#### read file #####: this will be the output of my R script that cleans the data columns +#################### +source("../Scripts/step3c_data_cleaning.R") +##This will outut two dataframes: +##data: unclean data: 335, 10 +##df : cleaned df 335, 13 +## you can remove data if you want as you will not need it +rm(data) + +colnames(df) + +#=================== +#3a: PredAffLog +#=================== +n = which(colnames(df) == "PredAffLog"); n +group = which(colnames(df) == "Lig_outcome"); group + +#=================================================== +# order according to PredAffLog values +#=================================================== +# This is because this makes it easier to see the results of rescaling for debugging +head(df$PredAffLog) + +#ORDER BY PredAff scrores: negative values at the top and positive at the bottoom +df = df[order(df$PredAffLog),] +head(df$PredAffLog) + +#sanity checks +head(df[,n]) #all negatives +tail(df[,n]) #all positives + +#sanity checks +mean(df[,n]) +#-0.9526746 + +tapply(df[,n], df[,group], mean) +#Destabilizing Stabilizing +#-1.2112100 0.3926667 +#=========================== +#Same as above: in 2 steps +#=========================== + +#find range of your data +my_min = min(df[,n]); my_min #-3.948 +my_max = max(df[,n]); my_max #2.23 + +#=============================================== +# WITHIN GROUP rescaling 2: method "ratio" +# create column to store the rescaled values +# Rescaling separately (Less dangerous) +# =====> chosen one:as Nick prefers +#=============================================== +df$ratioPredAff = ifelse(df[,n] < 0 + , df[,n]/abs(my_min) + , df[,n]/my_max + )#335 14 +#sanity checks +head(df$ratioPredAff) +tail(df$ratioPredAff) + +min(df$ratioPredAff); max(df$ratioPredAff) + +tapply(df$ratioPredAff, df$Lig_outcome, min) +#Destabilizing Stabilizing +#-1.000000000 0.005381166 + +tapply(df$ratioPredAff, df$Lig_outcome, max) +#Destabilizing Stabilizing +#-0.001266464 1.000000000 + +#should be the same as below (281 and 54) +sum(df$ratioPredAff < 0); sum(df$ratioPredAff > 0) + +table(df$Lig_outcome) +#Destabilizing Stabilizing +#281 54 + +#=============================================== +# Hist and density plots to compare the rescaling +# methods: Base R +#=============================================== +#uncomment as necessary +my_title = "Ligand_stability" +#my_title = colnames(df[n]) + +# Set the margin on all sides +par(oma = c(3,2,3,0) + , mar = c(1,3,5,2) + , mfrow = c(2,2)) + +hist(df[,n] + , xlab = "" + , main = "Raw values" +) + +hist(df$ratioPredAff + , xlab = "" + , main = "ratio rescaling" +) + +# Plot density plots underneath +plot(density( df[,n] ) + , main = "Raw values" +) + +plot(density( df$ratioPredAff ) + , main = "ratio rescaling" +) + +# titles +mtext(text = "Frequency" + , side = 2 + , line = 0 + , outer = TRUE) + +mtext(text = my_title + , side = 3 + , line = 0 + , outer = TRUE) + + +#clear variables +rm(my_min, my_max, my_title, n, group) + +#=================== +# 3b: DUET stability +#=================== +dim(df) #335, 14 + +n = which(colnames(df) == "DUETStability_Kcalpermol"); n # 10 +group = which(colnames(df) == "DUET_outcome"); group #12 + +#=================================================== +# order according to DUET scores +#=================================================== +# This is because this makes it easier to see the results of rescaling for debugging +head(df$DUETStability_Kcalpermol) + +#ORDER BY DUET scores: negative values at the top and positive at the bottom +df = df[order(df$DUETStability_Kcalpermol),] + +#sanity checks +head(df[,n]) #negatives +tail(df[,n]) #positives + +#sanity checks +mean(df[,n]) +#[1] -1.173316 + +tapply(df[,n], df[,group], mean) +#Destabilizing Stabilizing +#-1.4297257 0.3978723 + +#=============================================== +# WITHIN GROUP rescaling 2: method "ratio" +# create column to store the rescaled values +# Rescaling separately (Less dangerous) +# =====> chosen one:as Nick prefers +#=============================================== +#find range of your data +my_min = min(df[,n]); my_min #-3.87 +my_max = max(df[,n]); my_max #1.689 + +df$ratioDUET = ifelse(df[,n] < 0 + , df[,n]/abs(my_min) + , df[,n]/my_max + ) #335, 15 +#sanity check +head(df$ratioDUET) +tail(df$ratioDUET) + +min(df$ratioDUET); max(df$ratioDUET) + +#sanity checks +tapply(df$ratioDUET, df$DUET_outcome, min) +#Destabilizing Stabilizing +#-1.00000000 0.01065719 + +tapply(df$ratioDUET, df$DUET_outcome, max) +#Destabilizing Stabilizing +#-0.003875969 1.000000000 + +#should be the same as below (267 and 42) +sum(df$ratioDUET < 0); sum(df$ratioDUET > 0) + +table(df$DUET_outcome) +#Destabilizing Stabilizing +#288 47 + +#=============================================== +# Hist and density plots to compare the rescaling +# methods: Base R +#=============================================== +#uncomment as necessary + +my_title = "DUET_stability" +#my_title = colnames(df[n]) + +# Set the margin on all sides +par(oma = c(3,2,3,0) + , mar = c(1,3,5,2) + , mfrow = c(2,2)) + +hist(df[,n] + , xlab = "" + , main = "Raw values" +) + +hist(df$ratioDUET + , xlab = "" + , main = "ratio rescaling" +) + +# Plot density plots underneath +plot(density( df[,n] ) + , main = "Raw values" +) + +plot(density( df$ratioDUET ) + , main = "ratio rescaling" +) + +# graph titles +mtext(text = "Frequency" + , side = 2 + , line = 0 + , outer = TRUE) + +mtext(text = my_title + , side = 3 + , line = 0 + , outer = TRUE) + +#=================== +# write output as csv file +#=================== +write.csv(df, "../Data/mcsm_complex1_normalised.csv", row.names = FALSE) #335, 15 diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm_mean_stability.R b/mcsm_analysis/pyrazinamide/scripts/mcsm_mean_stability.R new file mode 100644 index 0000000..877215a --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/mcsm_mean_stability.R @@ -0,0 +1,131 @@ +getwd() +setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") +getwd() + +######################################################################## +# Installing and loading required packages # +######################################################################## + +source("../Header_TT.R") +#source("barplot_colour_function.R") +require(data.table) +require(dplyr) + +######################################################################## +# Read file: call script for combining df for PS # +######################################################################## + +source("../combining_two_df.R") + +########################### +# This will return: + +# df with NA: +# merged_df2 +# merged_df3 + +# df without NA: +# merged_df2_comp +# merged_df3_comp +########################### + +#---------------------- PAY ATTENTION +# the above changes the working dir +#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" +#---------------------- PAY ATTENTION + +########################### +# you need merged_df3 +# or +# merged_df3_comp +# since these have unique SNPs +# I prefer to use the merged_df3 +# because using the _comp dataset means +# we lose some muts and at this level, we should use +# as much info as available +########################### + +# uncomment as necessary +#%%%%%%%%%%%%%%%%%%%%%%%% +# REASSIGNMENT +my_df = merged_df3 +#my_df = merged_df3_comp +#%%%%%%%%%%%%%%%%%%%%%%%% + +# delete variables not required +rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) + +# quick checks +colnames(my_df) +str(my_df) + +########################### +# Data for bfactor figure +# PS average +# Lig average +########################### + +head(my_df$Position) +head(my_df$ratioDUET) + +# order data frame +df = my_df[order(my_df$Position),] + +head(df$Position) +head(df$ratioDUET) + +#*********** +# PS: average by position +#*********** + +mean_DUET_by_position <- df %>% + group_by(Position) %>% + summarize(averaged.DUET = mean(ratioDUET)) + +#*********** +# Lig: average by position +#*********** +mean_Lig_by_position <- df %>% + group_by(Position) %>% + summarize(averaged.Lig = mean(ratioPredAff)) + + +#*********** +# cbind:mean_DUET_by_position and mean_Lig_by_position +#*********** + +combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position )) + +# sanity check +# mean_PS_Lig_Bfactor + +colnames(combined) + +colnames(combined) = c("Position" + , "average_DUETR" + , "Position2" + , "average_PredAffR") + +colnames(combined) + +identical(combined$Position, combined$Position2) + +n = which(colnames(combined) == "Position2"); n + +combined_df = combined[,-n] + +max(combined_df$average_DUETR) ; min(combined_df$average_DUETR) + +max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR) + +#============= +# output csv +#============ +outDir = "~/git/Data/pyrazinamide/input/processed/" +outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv") +print(paste0("Output file with path will be:","", outFile)) + +head(combined_df$Position); tail(combined_df$Position) + +write.csv(combined_df, outFile + , row.names = F) diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/.RData b/mcsm_analysis/pyrazinamide/scripts/plotting/.RData new file mode 100644 index 0000000000000000000000000000000000000000..9ebc62b0d6fbe54e858e3c9da53c9878c18035b4 GIT binary patch literal 43777 zcmV)QK(xOfiwFP!000001MNKrd=$mkbLovVdJnya^g{6x2~7fq5GkUDBe{^1G%gp4 z6)X1Md+!AmJ30{+yHXXf&{PmX5o}1wx3l|Z_wC&7l@!GPm;8P&H#0l)=Djy>`plcc zyqPHxGb0Eg0VFUehy(;m&Y%G4pM?aGaOpR!a!K)G+hSL(E|p-h1Y;#QTY{@47%#yq5}YnUssx=RxJiN}37(f=js$%q zXfHuG2~LyXR0-NjkS#%m1j8kGMgqpUOM+wx91={B;9d#3O0Z0VAt0msB{)ffyCf)+ z;5-S=WwMmuSqZEXFqsdMV2T6}Nl+od6%ve;;5i8%kYJ1i*GX`V1P@D4EWt$*luJ-3 zfm4Dw2}&i1mmpt)t0Z_rf)^!NBEhQ?+$cd031&!;BbON=k4SK(1Sd-nCBbM3Zjs;u z3DPB)F2M^D^phZ0f-@yZkl+*v9wba(2#JxPT7q5@L`v|I1d}D0C&A+qTx+09OKgiP zyy+6?9EvyCFkVz*p{=U4x-zlWZf7!(^d>-(iRES{->yMn8DDPiHs|xd3Lk#<{`cSM zch%J8H};7A{F<20v#0&Oe9{@&7yXm_{mLVYN5@&uUH4r3Gs5=`9Z+^(K;_I!zdbeY z{;vw_-gvuBz^K0m-TKhi)4#qm`|yWnTyb#X-i%q#{=MMR>#utF?eW789QbWa-jVQY zLZ7za4&|6*pJ{`JG&T;8j8_p~;vb}o2lR>}kWr~P=kv-jJmI(G$mQczos+A38%`-FWx*`x_3`MqIyS$BXx$Uo`xzv>BfSFP?mT%A4P> zdvnRh_0F91Ay@Xe{=!Mm-*$U(ZO*6Njs>2)@B3acLCa!OM<4lR;Jm(fc1w9-$dFUQ zx{SW?;%~lRzGG|cU-!IxBxpo_yYR~&>hj{SqwzaK{@VH34Lz23xOme=n|>TQ^TC@p z&j@+2eDM8$-`4hqg>mohJ@wrMZ%z8jl=f0;lOm{r|Z;KIpHsLyPYo`}t$B zfBpT=_;YTY)?vg+0d2?ZO6&h+-m97OS3H|JV(oj8Ba`=>(>L+7@>hy~P1tz-pLKmc zN?1|-*v^_WFT7~X>W}u%d};A@-wvgRUJu@Q^+#{~_Qn_Omaoi6Sl@8y$p=rHzVGyk zi#8>$df=4J-DCR4+`e)2$H%^3ob+1i3tjI1uIJ)Ir(`}g!T#4D3-7)0y%(m}58U}z z#()<)t=trJ-PHNxM|PUqz4(*YZt3pqG-C9$(=WSb$JNEX!UBJMW%%bc7k9Vay!8Cl z$tnNbv+c>-_gxTn1_^&{a_q>Mu(KlWUst*JvcetJQ>?=twJ+Uw#w~}Yrc~7>4ZC#4 zlwGHV_Kkk*{Lgj=?3?lPwOuA$S#j2_I-BSk8@_P`S_-;iKo4^ zdh?6LML(BKYk&I250-V?`_kN=J#Sf67jkV()K^31?uc2t=&dE))?`-BJpH>lQDvQW z{y6d6^AB4(@30ShqTlmtr{B8fx3#BNzi`QA-EN5A`+DFb|Li?DujbV&-dgwJ^0Zl9 z$J{kI?~5P$-aPxl4tpX)o|^yt4HtF%BlWRK`xAE5bXqy=-2(+TT~WK^ns0N;fBN#h z{L?zWc4YL3PY(P!eLUV7*6qh1()uCsXZcQsdCv!QR?!$Y#xl~>#qG3UUV zOzXClSsj15Wpzl0ixW!*)_)tkbJXfyXWTIH(eHY9nYoo9!q?a6uNrWLP7&KbFBeg4t-+qcCGUG&fm zb<3Wh7Z1B8VE7X&KfC8j=c8Y`_TPBU4SBy-^_qF`-p5At?l`jM#-hce3Zom^P0HxE z>YC(pw*8R&*LfYv-d+>=)N2nNnSRZ{7>a+ zQ+hr0`<}={H~sZc(VT9}>`~A5FWmM|aroSjOYeH(s%tOKZ}>3zywr>PrF^k;P0jGQ z5AuEtJ#*8U`}dx+;y_hW&9Xf!pT6_{qA|toPJZ?3GapF5<*AiJcD%kOU`6-2cL#j2 zyWNy!pN75u`H{y0EeG0%hb?_(#rMD8@b$PWuKnfq5pRF9zTmS9$5l8_t-s~m)e8!f zzBsn+m6Dr2-C0@iz{^SJUi`#@_}#PaP1!s3@oz>&eNyPmd%s;x#f|;@PP**9tin6) z=<@!myVewr-Z1L#wt2UFM9#bTv((V~&2McUWtm%e!?KgE>HW(orD1D#et6B=m%DX) zJ~-p1on@s78-(oF~4?q1W8lammV~vG1ikwEXd$1K(z? z=;2yFA>{Y7-&i?0CicErckKNuGOcKw?W_2A_t!p98NBt=wmav~>9Lor8l4g{F#Y`Y z&y4OeeAKA;2It)3X-OaaesbR>Ma#3S{jb?Fjl6S1(2IW_dHnO3TRKg6_vAlP-yk=Q z+%YOMbb7&~r^ZfO@Y+aM*N`g%PRkl}`y;Mtwy-xocx>3umPalbcE@Skez>M#`>41( z_F2zPMKv*#yI-9CNc{)xM8Nt#{z=$vgq&%Zn8PE zpDa6V)kjyQ^nM_VOuQ-Tr;jd}Kl{(+mCs!K=gV*WwRe5>T_b`HmG|C!T})d38K>vo z_t|^Bx@3;{dw>0*g<*xcZIW2ZJ80{`2K;=UeW7dj9CYg96%qdF9MsN-q2~vERnpGgnYQHvMW!Eh^dv1U7+n$dWU-sk&U3Q+dyu-1fXN-+H|JKKE`{4cG zuU}qV_s-{M-o54UtUIne^T-3|TxHGs@QK7(=f!otZC~JxKgsYB{>TYj$RO{I2_M8nxi5Ip=jA^3jh$ zryTy{ffb(~+;r{fIg1lc%`CgNb4S+)iJv5I9`*bK&prF;EkDh;sQ-r#Z2jktV@EST z{N%fLZ~N?rLAHN>9B^0GpI0y1zVyTK7lk^-aIMJp9kI9{oFT#h7VhPtBS!cge+z*BrU&-9ypW-ao{3(aL+4 z#@_VtsMTN7H*;3qr>b4pT4#A?t7!Rbr|r$4Y`^7oYoGDL!)I>jePo*bo%d%~ zuNnAzj4;*+K8DD!04YF5Xal4QaL-tuSy+Ql_ysmq z{G73nYNxGgo}Ken6QOb-KBZ?ytWjzUuaC>CqtpQ4`-di#EtV>DDj%emlI& z{cqD>M%O(2Rm5m|K#J{Jc{a$M&C%Xg=0z4Z4`TDiJ0*8#2{IHq$gPsQo4!mer*aAA zCMTs7U{4|T6k*SF&6Apzhdtx5rvQ6UnzYnB>=}kXwtNnNo<{JUlorIYkqv6*$) z?kUKP<4i^FLgd!+O3y@iGEv%0lr|HkMRaDOyfRTeXTIi~tU0x087P@rxD&IJrXjaV$D}m14Poxd$UOzQ zwfNauPb0Y}Bli^Ko{HRs$gOsF`N_#zPs%)nnkOZx0DI7vbX91wX9n_MZ0TzL%#)43 zWMgbv$Csa+nNol~C|Xv^IP95*J!;>VpR8uX+$#V1DVpX zHSAd^c9zyuO-{~CpNKt^uqPjTim*o=LrhMds5vKTPPM$JBx`zFke-%{-1*2o6}h$e z87cY5U5MN`PLbx$M)9*z{A?6o8@Uu@W~%Y0CMQim?gHdir8PAvD^Gh-rhYvEa$Du>gPQ?%g>^Nhou@yL^fzht2=shU)n zXDs&UqM=MvlgD8Xj+TwFW$V7=V-LcXo}P<6ls@+@uk&fIF& zv(mKD{j3ZWCtK2JUl)QR0N6%yJp6KZJW&>*z^_{*Y6loCT#yz{i-i+n;Wfd+3(3O8 zW8utMcu84!17zW1vvAF|@QQBXHQ2&?77LfFg-gZ4+j$G`t1Y~twaoHGDuheL+i?T- z?P4UwSbz}0&44Yl#VPLpA5v7MV(|@hQn6@KIbK->7M$)=X3FAVPU7Rg-uvRP=$!sr z-(L9PitL<$ciqn%IRmC_?yzh~Ku+&VP#?vD7k*S9 z3E`3Qn&X1{Na^dTTSxjRoB$Ky@P+0!dG=bDZLXujagL*E9t1{CLqRQFOf5qRJJiD# zT^A7LfJ&wWl|(u46A0g`1WGREtt?w%)!eCRnma8Oxid6(MzZG4#O^Gvgt;l)8IYT* z2Bs#dP8D5XZe}*ez}6di@R;N@)tSlZ3(QMaNoDJZ$epdZQ&D(Yx|%dqB_uB`TXm*u zp&8s6kf)N7m!%?OOKCiKY;w95oSBN;D%E3CGkGvu9^=8|lGO~yrK{PD%TiH~%T`N( ztwSMqmgY`DaZ*z>cZMpd@yVI0Q;i)sUMr&U*(!Z(g@y-DPzyb9LVB|1)FNbRp;>Cy zY-xiBPfX=80w$)bE>!{(vsE>il%|q1DO(jZTMW?LDM^}JC5X9Eob+tXttv|po1Sy@ z(u`bnrl`C~^KMRGzA7VWdd&kTtNF1xGmkJu(*tR;t2#B=OH)+UnW<(bO+-01HrwQG zHorvfRLz~1gxp!0Tg@||Q00|P88vqXa;w!VP?{F1PE~0Fr=@EVGSW3?hUQdRn5L?C z5Sztm?qn3M$~1^gx-@qNim#$%?kvr%NlTgvsZO=M36v&3sxwQ)!6rD!ou#?e7Kgc$ zkvm&+r=zs#2&dW{1hH8Ga;u6yT`jplX}GUBRsETssk&4lOQUy=i;cf|@C;21Y|PAq zXQ}OR;4D>V0%v7tq1oxaclUJdl1H;9-2VanD7!H0>fqn#mJNfiTy#?#`toV)ZyH_m zcSF`^6~pMa*UrA~r{AOK*0b-5>oM~=>$~41ez{=&YxMJn=bb<9iwL?d=c`W>23?@y z-B08{`s?aVD~?nyqra47U-M(??R4M9vb-xSZRoy7Ub$(@r1^B;;*+2K>eAtKU$4a< z52*N&?!Bjb*Pa~*)4gXfM_ao0^b0cHfBbueqj1hqG;kDA;IxmDLqIiG-rKy3SQnUCFbEVo8Ls zP)4dm9gpHdYN91> z+sg9kUiKFRzXwowcW^f|==auZ>m(scfq{sKL-5PTPoeYX)l^iwc=HmRnwXrJq#$b3 z;F);+Nr+mh%;Y_IGjiHiRaX_)fa(-O^Z-U&OoI?LsMcP=&NLqeR3Ye_X3@?RL6`W_BRUbg#sn$*r3Oz@mA)jDn*#F4p|l> z`2n)d4W1EqX)r^!q^bJjsH&-R>5E9t6%|W6Or5F=byXFPT9?t63R_K0#S-m{$W5FX zHvzL}8-zyKq|tx5-Q_6pq7?(y8;sap?9Oh2QFcN7998{LfeV@T6B`6m9o7lUt;MUTp64iG=ey)j@(l*- ze!~BJ)iO&}W9oN>y=tDT+y{Yr$~t16-DTJf7*@*=si$YHk$36>Mhz1r91L}zNA2KG zRq#@$hl;Gak>e;c3LQOK3mpu26LH>hi4yJ42zQ9n?y7TEdCHwPn~4p?tET7#9%w5q zH7}P$4|+1(s8m87wTV@A6%_`Y3dxgABsxs2IAJwT`@&+IJm~PXgGArCL3J{ZuEuAYFDKU2tBACbEhr>2B&G6f0)VQ_+iM6;)}kt+maw7cX+!YKVKU z8##?JmLN?O5(X%rGa3w{B@IY$)F#*}oOWC3k_4%+YVB37#Dv_^Qb*Oi1Y1?L)ap4C zYJI{rXu!~fMdgl?@`Oc>ii!lMy{5ueVo#7TB-j^AV-1E;#+2wqdHm`9EG7OI3Ldhh zH@Fj`braeUDN)owgJx5mZ}3C15pp5a<})n9AQ-Ctym`dET1T+qXEB-?MDxts1CeYp z#4g&g2~kI)G(z2pvi`|_*^Gc)X5^z_3(I>ej8XG4qB?kLkrSw^T1g9 zVy5b~(x9n!VpXY~59xi#cXUhS%ke}4o&ev#pFCLH(P*+5Bj~eE0YfY8m2>S*)@Z{7 zSwA3=8p|kgJ{VHW&&UVsvC1W7F{lwjUzV82+J(c!y?Px-tsjZM6CfV>M+^Q|b^dqM z*_TO46PjcBR)NHJV2FT9U6NT)S6y4@R2Q|_f?QpNP0;D6#IaQi9nR{iN~u?s{6|#< zfr@kiF&HqaOPRWXl)9iNRMl!{sog2xdRDO(>E#eE%9Kr$j8{?Xs-UrF$-S(N>1)4K zOg4GV1aB4^wJkv<6^n&`V4KLC0 z;kKvcH+?ZVYKzO9cC@%5M-_ynUtCb`@esq+wDm!J&&2Av^c!=Mryem*BLfMHXKxO@wNS^@Mw!Q-;bT#^S~1pOzSW3Q+|ELeTXk zOzNzC>P0m33#5T#vY1=@uu9a%@B^)F%Y%W~)qX<1(r3b=6GDEMsyaibw_B*Yy+ zc&*pYocP#B4HNqy&UK8-X>(K+m)M*xd#%G(C7`kB(e%b>_z0JPE=m5$f?un_FkRx` zh=zmfpdl+BSeR-OPn*stWzF>rB~27iO*TQ*QHa-8@K56G8ujQtu%HcOs_R?^Nj#7u zvY?HY=1TLj`NeF%$AB0vH-CxsK*^bAl!V%xT=r(!G>Sp_&Y;|d88FfahQf^)@_9A@@F^ArTM|7mX-2d+;rvdtp#*%-5*w00{rA%aK4ysVgNS*t zvhTqjz7zKvfGBr~qI@AkOVa>x-kxxu5L6OAYwg7?p<-8!? zHiLUMxYh2h4FEeR-}b6P?LT7*Bp=(0eidxIv$3i7zdX>MrRBYP7r@gFc;S;%o(EUnNW~xCfC`S& z6b>|b`SXj;SYpsJS)kcE2n<+IgizVQh^!5$%*ss4%#LNt%g95zQcvl)9L9ni0(^u} z1pEq5ID$w6h(^HE#2|=65D(BEK|F$venK~Z9wyKYK^qPHUfAUgwFQRrs3FVZ{s5vi z96r>g8MvWo?z$8~+&h2($cuLdK|~uV9@PA8g@$=D)USTHenkj$VjU9xnpVda6RoxN zL8vnuQ2Y}?sG)*T!}LNe`d5Uy1ZCO_GUXWsA>dtHsE@$n8nqBc5Tb|X*lN)q9Gz<` zneTGis%qIzNU_1vHD28<#8Xyb$=Yc1ULJkGNVKrQ3n}3$8D=L}*(qS(h;ctJA_+)c zkcLS3D4GK!;~c3y6eLzYIo#+fb5>XC1;E$Zr(vc8h+Y6*X-!wF~MTwdHKZ)KTT&>v`4QR3@y{QR}i* zl?W=^52UzDW@;CW@n#iG;lV!LT=!8W4 zlw%P>zQ90K-Wma;%`y2q+6dDf0tg;cUF|H@5R736;;`n*mk+A!HmCG~b&S7x*f$Gt z-^$Q^571Lv(9rxiYYcEURe=_uFeh?1mul1+;PGviBzA7NJ3q%{&v?v`Aks@Nb%jfh_#WN8qvCghz*uJaSz>8-+DjFYM2- zpt1$>Z|o>Q?7#i#>#OmviEfydjfb|VwR~Eze1tkR-G_E#26E3t?peUA?o@|KnVxPU zQ=hidJm|_)6vnrE__o1h^i6DYCKcfS4?l0WaVg28G?$fU?L6Y(UPhwjamjaUd2r)u~n4RRmj#R5~`|Q369E|3OhT{ZZA!6R3*5|9kmH1teT8WkQl71 zaMdPM+Lk2DwI?icR#(kS6xP~;C43c$?i_H+T4dge$DK$*O&Me2Z^F@Z+g$eEM1WaY z6hc_Os3WGw;C1-hIPH@DIE&?@lM9Wr(M#M_ZkG@2dYPdX8|g<-zw^%Pr-hpy0uNu3 zs2veC>Ygy8p*2oNrOmm-;8S?5y`;LTRQYJo-EsBM2&}M|xdfskk%$C(L$qsxD!Bu6F0cS_Wp5Gke_6b|Akn`f>)=LwC6 z)+O4N+Fz_7L0O74DR&eS=knvd$IzltBNmGm8|5)sX`^(zPfcyAmavRWS|`6&lH6Jb zTWyWK#AUKA6(X~OFLNLq<$}=g3dht2<1|`9C^s)EY)h&|TOqjBT2jsL?yGH}K~XF- zQuGKV2YF*Hz>H3Bt*S$4QELRAeOhhyrt@?C(0PyhI-!dbD{W`1_q0Mx?$ldO(gFvg z2x3Ee6&8xx6cFw2m;v&jx+;gu2b^N938|9m$U^bLjwenXRc4%Cjoh9Xyw(o}uki&> z6Pa4#srYbhoP>bmA*s_7+leKRq2eWRGt+LU>9j#7K^Jg1|S=sKBev3m1-gu>8| zm@2DF9c4>A+p`K&z8c;%3jvtP&>q zUT>&Lt`_g%|BKu2|K|2{OKjd#-gqLYLwH&;1A{8r7^ssQWQB z=;XMS2k}@TU!*}Vvl^b5bvwZdhb4_nCwLlMwr(|O(7M$?JnM=!#tSe#5GjuFf=o8Z z{22M+EwU**&sM!{)mtAdm-P)m>4FwlIM|d5*&Wk=Ut;)O?(Kt#UeEDNLW*;zmpBYh zK97TWQMPj2L(FD6L8P&kj7%tV&s&F&*I#Z&34CD1Ntk+q7{PE8kj8evPl179E*AkJ zU3cQ!Z6BS*;K#lvL+VBfK$!5{)1Xak?nMI|qvD>_U|!63HlSTP!{W^shhwdsuNJSh z5p!$AWGyo=kLIaUZq5$Z?0kVYI{X3s!dh*)(@{0wm}`FOp-E=dqD!hPYwBEf!@VuH zy(e!=w~-2#Vz7bSRQ4lFowh~FpuJW@J-1%ml?MWCl?x3G9b+eNW5&?{!_sp3;qJ|=YfAJ_zOV>6Cm$tMAH+-`)CNW0}a!`UkbF$ zlwcOPXT!IX1U$^m>d#rkuR(yn7K&1Y_?L-;glMAQX(8y_F#$JkL``VXb|jR;55@A^ zJ_3$FJ&yB5`h)b}MTdC4=8=V?SgwsK@(jY!PvNx^?LfwPQg-#O$9D$-$!$ zF+U^rqI_g81h#MWsoM~}f<+U*6zo<{3MN(pPh7j6)UUc@4wu>4ogD#%)5oLXcKl$Kz>Pk~pNp%I5Lr$0~ z1KnVJ?h4x85GU5#1k@!HW&LI6B-f@o&DCV-Mn6IY!Qt8j<2fj#5{7t$?B@ zF=@%zdBDUHA$EdE>~NOB42EAw(2F#_=`iBBw;ta$-kJ>jEtcTwrA*9?^5~<_<5f|+ z`})VzMMgY%;;|p`tyH7rQZ2W$muhv{Nv*ee@kyD@vZb^Nf*n8<^&9a**Uk?6H6kbw#HV% zkIoy%U}Lah;;TE2!6A;dag1Pj^~o!Wxy?X^#~3OOPeqjIV-d-%S~2WZUCyJ~P^qh)4YvN7MNxqv_+$10m)GmP|Yr*fOz>rx4@Tt<5}_ZEl6#eNZFPq+MY^ zF`nnysIk-@Rr-5b{2HpY%+lX$@c9{#H=jvz`M)V9x16JVSmCW9P4yrgGeuzO(mZ?@ zh06V6{sc||8HB%K60XC8tTW0hN72N+3%9tNpgo`avBgGTP3HzFetCK*{XX45TE=uP zrz^^Ut^lB&uL3ig`fL2j<$^3ZKFRr2wiuL^B7aQn29xIB;))XR1RyZ^^D7?<3<3@U zPHu#e8)sy{;Y2g;$ji$DmFpn-h6mIvJW`NrwBo;&``h>_-!{!CUmx;4uO;%ez~G1B zje_4eCH!0-9_y!EV*e$%G-tjR7}7JmF^YdV_qRoKjUsw0$ZXYw>1}_YuNTnF=x1#J zUoGzCA#Nzd<<@+8IMd%M$gel>!OHAx2ouvY`l7w$#p#Lxd}eX6Mdx^s1GH_~(5Ak? z<RMv1Qw^W66YeG2IPJRZqG+8CAz;r0SM7{7lRR) z#T6|q^XWUcjf&fwD5ACf5kh)o(by|pF$m1!qCdZMTa_(Zh_JvSg#{KQEHLt6GUnyY z`Yo|OD*5UA+U_WiaK7Nf7kBtV4qvq4<;xdiI8A(!B}ZeD(;(Jy)Cc)n$5(J3yDI)y zxW6ajHH^3)!Qu6T2)rjK{#S8d?duq&7t!hmuNN&bM&UTambxQq3SQChGM86x;FbLb z3UUhu-g_(lYs~6?tUv=_4B)(QW6VDMHHs&Hbf6DuybkmwIvu#^zor8gm=iHxdcbf0 zM#^!3;(xs{avVssK8W`{|0Ow^)g>-3PjzV!(dv>9<0o~zJ%}{l_$ikBck1h4(s(@^ zLUej|$A3-FEU-wyc z9EKBjMT3PU_Blz3o)2a7CQp{A_*Zg&iUAsJEy#x&^gPCN9A`c+w?w{D{I`5;IbLg_ z{NMI5|7ZRsxxe`@<-4MVxghuBxvaOh zKk(Zgc8iOafTuUCjp!WFv9>&M{ov$(-2J4 zz~8%&`eXE!HgD#u@X>rv*9i>GK!rJK6BpSOqc;J_)D^@_HcAuXa?F*@$mkF97vxSE zo3BJ2*PyYZf+sS9?UgmIC4SHlTr$^DRjMUmq`cSQJ$Iw4?%d44dqT$JcwBMO0O52P zNAC+@e_g2zdNS(k=@Usz+AsK#VH+#Y-w#cRyPlt(brA?7CT zbvOdN;0Yi)p}gY*jg8U#Rk+YiwCr6!6bj>h-v`{QQJjBWX+4yksca3(u~oh%@)dOY zQ4^hh%tY>wBli=?jW1F?iQGQjof68<$TkT-OM96QykZRS)SKS-J9%hg&+-=o{tpEW%+Vk6T4~1PF$@1bokV z!FR5o_?}1jmLqpIa=#$RzN5g~?ZS7n_?-3&h3{N1)wA6m%1^{~k&k(ETJJE64|Q4I z)spkqcDq=(yvt=&=7Ky|n8-6r$D44hSqbZ85Eg3?*3CUkILxZVOE3rvGYIQ#5T*t- zmt0;(vUml#UqxFfCsg`@ea9Srp+jmzMR?f|HJlDUn@#_)@p#zY`*dJ_mbjo2)~1`V}p z&7mdpHsOC6;(w)~Lg1r~&%(rlprb2hivDpS1*PG;K^|0HMHl6A+8kA?9WP9>+58C4p-kCo zuZTicgR6-CKwKoK0i)wGqHNv=5zV~UL=Xi#Y|snB!yr5yh4%)$tm+K_yd8*g7=hzs zV-seEX(Wh|<;xBroJy3c91nE1gI`8>f1;WA>VR-@8rVewc6or^U1uXjc9G)@$hQc> zIuOmA7t4oV-0DFzGhU4T1n~EQ-`dAb$E0?I7nR0zU%%_ zI~z9)_+bYVY6O0aOnh;El89#N3-gmg^d`TC13#A{exitWxgmxqy1@=M_|a@?y=7PGyH(7sG*(d&<mj{(gh-cO{Kdov>hZc6i3-wx?<{@~)_)g`NIh;uhcAU9bgWdZ9IsKxNGD(BX*^~&pl}f%O{!`mt~~fs}NG{mMa;r3g%`0z}W@BS3O=o>R>%DY0 z8~m*y8-3ACZ@(7kq%A-YpD;-NFStK|_+)U)WB+LZKR?LK@OAZ!&treekGEQuDq#rE z7dRcihKl`N^>}r7%qD;qDDZu(kX<>k0XUjnyk(4ej!4EkQ&Z zATfOW%6&MZBbpeRX^`tEpR`3I9{D8N?6|V<*Qm*9IMH9f<~*~JIKHyXaIBA< z_VA_dL>c_~d$@iLFC#8ne$Bp(@_ilu4(18gbAL4PX%hY!j(;2{|7%(z-&kVODEUCp zx_?O!kGDj=7HDi4-Z-M!H(fDTF=tI66{-u03{!95j(GvM`JXZ;L0tMiAW4NA%AYkF#wnI3AU=zUi zWOyBkAzA|#hoG$&z;|Id4V@8mLcr}ya6i5~)4h=pCqN$nc0Z;cKz{@Sya2va!}%GC z0Nx3g#123nX|ge!^08299Hr546wFP=Cs#n!G>oG zj%i_ojZiRCp9|{j+81%V=sk?d090a`3KKyzk!6l5M{T(uH)70WV?tMjPk(>#q-p?56XuDoSlFXBM2YNLYdW6EO829!OjY0 zRT867+vrN$V)xv+I7g2L^uqU6 zANh_$O}D1baywUHo9p&qQcJhy=a{uHY)s6?HEf)CfEb@X;3p3FHYMMl3nAL8M=TD1 z?I;|@i6q(v=_wbW565VN*zG7ZBfiyKM%YOLHRHk>IZMkKg3L2^a&2ZzRPmmReA-Fkh^lU*Twk ztz%vxYiuhgFN*0&o&k_9nFk6^haaBrW-ADwyn>P``VN>97k00`QZMXg8&W?{d1%HAjafmqWB~>s^U|@1>=34x z>N!{*YN;TG`{^QCKM4MxMY2?cPGY#R?hWByQ-m%Ph69ixcFd>`w$XikQ7RtS<+;@^ zSGCyA2>bhP)j67YuFf%@rR1MBskp9 zGh)Wiwn^B2_sWd09lys@{1&gqIXYH-sK(If$hEKuwRW$c>BUOAj~A_Qc;Q#ELV0Mc z8dHZo6!Z@!1@xm)0h{6X=Zr(l64pk@uNo9r*vrH-Q^Hw?S4BM(&N{d?5W*P+Z+ceL zoc+CUoWN_H5{?TP_LwEr6<+%L{kYvwNhkV0Gm?c@w!D(uogCL@!0+NQZ3dXu@tZI| zH<)12^`mo%;R9IS>ihw-I^WYjJo^MZjqSS=h;Q{W7S&6q!P`u|i06vgn|Q96eLS=8 zU&-eX-JiFL+1IlS+_YSH9qTtQJ@-Me=k8A$*Sve;L3mBeD-Q$72~<;abgcSNQ=LSs zsf?RJq|qJb0Y5u!!wP(yD=rZ_Tnlu_{IHe}d*6@Y_|14url9S?uDpDIJG0 z#1{|9`BFxq%~(c0Rh}H(7615{emNfb#v)u?rQ>m#(fz8x&1gYo<`*saP-Htl z%c5N~$|RN;HCl6hlvaBW(lVm5Dbi}IkLbPX_{NcZQGtv<&#M4#>{OwMA5M6hApar|r1C8nK zd+O=i*HZMcyt)zXVX5Y<<-Li%miHkBwLC{7{9mZ$2RUsCM0?!CAMNf(j%)2ccs%lr zMYy;=_9y1Oj}a|R>3#YV?Qtf5%3@$M$|RN;wKPs2r8UTdw2Y{1inInGdA;t# zAB>(U^cCU}v^&m_fS{KN^hMCmPZ)$?@NtGj1iqAQk_n_DNcR)65M&=`up-DYfiVd3 z`~>`b#KhwalM%#e(3GnOcQxXWyBuzSOQj)@pJr8FD}eL7^XAn!t6kOnJnX!A?pQb` ztAr_l%`AA!$=hW=DP?Z_)IjTV0-C!y&E20U*Bn}Hsir!BT(_g^-C>IrnXxSAqsynaj;87gj%_d#AGT2-=DC51d#T$y%Q z^k4fO z@lR`2XsrtWZtF~rqS}H<>t1FnaaB797F5?dToQdl6RMr1cBj2GVRS;#Ad~83L3LUE z7|tgQX!H^yTk@?{YoTI%07#WaIyH`}d7KSbxxLcPM?EFg5*dyvn@f!zT!z>TtEs4V z72B&8YWcuDa4fO_Bn5~kiAPGQQLjyNl<>f|GAGJ!B;eV@`w8BwUQe9Eyo43;LEU9xYX^c0d%28WutNM@mLTjq4Tn<}FGk8O4 z>*m^=b^mRjt9nVbHb89k{SWyf7FJixuU%xDXRm4v`5z3?Plp0*Y0cn@u%#s|sI$3T z&gv5+B%=`_@g>RWNl6J(OIPQdYpY5~SyXzQzl4=kI~UqrE_*ZiW9G{_xoqXt_JlMw zn`Q|PskBu%=GHaQ7g=I++7{}B@ZS!KE49s+8qXLc_XLVJJd*kOh*^IZO-0rNNZ^zx(fAdQu zYY^?a8GNII+Em#VI_BA2)lSqP_#PNjSzTA;OR{T$l~7nJ6#tE$oo zzlDQ>OKg?uT37RWqGV4(ZFPk-==_(%LZyjDbwx#Ucss~zktNPLN3A4DX^tulOf-3V z{u-yOTu5`1lIlv^3I8E*p3Bh)XYkq5l+JdXJrPy*MG155c2|YH#X~}BY_+vESBvH*df-+cMEw$f z;iQ=Q%k@UAPvUPd@GJgu{p9+j@@-O{YW)%A1>tdC%GIoV#q#vVca-oOCj9bnv3~LV zItcBFS-op-lE1fK3-HN*{Y@{HQzX#Af5mbV^K)ybOyuKlI*_B89=qwas5fSMXvQD^ zC0>4^f3P0&u|A0MhV&vHQ+z^tzx%f+B_2@@^zD&`b6)^#0>FMx-!=*EO$Om&d@+ug zm(e#dy%|nVagD+~(QhUPGrtkdVjkXP=?RA?K0TF}H@=Ky;!oN5A~R3vyy5Zpdt-Ik z3|C|7gFkfpK|imrBuIb|1fITO2x|hE4%id=6b{ ztE#hA46Jky$rz+dV_P`LXB=V*vls?Agj{Pg{LNKMVTQB9G=)i?Y0c#~rO0axAGm5Z zDjVl_lUnMDMPG$TUlSziU7x-AO)+@m!rvUdrT4Cz{`2IAWPO%oQ%l>sjjcbDLohVZFqaSIsRyTKh;{|`S_qG1ER!z_b(@~Ar_RekW zY4kkF(YfPMWoLHsffZIocU-HO_b5&2`J?6sDuY=F=U6 zF>}lGN&b&D8Z+K+1|9W2i#TpFYl7bT+yeSB6ZDj;egt2uwr3Vk z6eBk6;5U}&O{~YlH)e4phG^!aW00T4VQ0}I;g{i#g76sh8-wEUxD0OxlwTbBjRoL& zMj|}12rthw8bLV9rwu?j(F`2%-zXH`2Kgg}c#$Z6w2*HbA%7gkabW(iJg{8YiMLKf zd8(!(xVwPA6O<3f(_ZkmMbHj_=OvaImk*~E^A=^oFXpK|y-;`uAzzM% z8PMvE;&J{s&25RkF2)h9zIF%bNt6d>x&U+~nyz#Me`f)(ZgDwsJ&A+#9f@A&a9PFy zUY>7v_~r8J0KolmNCsF};!&QwjH8f@`8OWM<-p}0h2n6(SWf(VJjw^>7boBl({dSb z{&*R8gkL5%R@Mms-Ql-4(Vlk!v1tS?;zoL$i`uD?7FkuFg-U5TdW_#2lclRc|P z{ot3&gXhKTV_Ou?%Y)Y;#>-HHa1I?&oIxmFHz8g(^o;{63w4HGbHY2+vA5kV#NE+c zm`i=gtGOJ$@O3=1XdFGhq&g0qp7;)T^=n+55`Q8e2Zt|J#)ATQsGc6g(sH~vjvil< zHx6NQ3E_Ck@c*;Jg93P{DZXf(hpOjSx7FBwSwDgZW!BhjC9eNnZ>6KO)^U#ACvRYF z-NHsVLsM$)HkW1?t_fbhvh{~Guih%hLd|yD3F!?{OilbtzDDYXvGd<$r3D%Lx7dxS zt(&W__^l!ShBR+`;=%jxvUwcz{rEbYu>)+1_h0I|nl8U)t3OTfHiH%%uW~+726jys zZ_9T}CkXq^==~q-kc^*+4ROt_C@$sOnt_s!)5soqNGx;ODj{hEOUkxT0+veu8X6ip z5dHHIk&rTq9lK(WPSiQo!G3UMb*Wu_&!W^(Tf;p;OYAmvN>H*yjpK6Ks`$$gfu&AW zr%PN*xU+l_cLtQH=}IbWwfr3|{u~BeXC?nZY|DD4EOVYC0+I4`eXBo~32> zhKY)!4SMoSDty!DjpLBp>`__u0p2D6!{3JGAE+@F!|?)RZm$6aO;ZE`>G@+o$Jj5s zm-ibneazS|zNE0;5au(U025pRq*1bA59`DmHynN=P#U)H&Gu^}A$=%O_F|&|4?lNMPLzG@VSqapaKyuJTS(9LaEl;r z2T05CbOE0D9`rz>>^Jv<{G%YO3kqZW4K#ZLQzI)~FpvRph5+A7l9xmi1E2k0OAkXP$;dDmngNd=wB5SK`^XC9Sa`F{3Qrvq8L<*hJeOL8 z=S&5Su>_ixVx(VqG~#2bs;jts1mzR6(a>9hjL_rw>WhkHjw*-C4qx#*jb^zpW}=A4 z@}DINv5h`{2XFDC_3_t=-(!gHRRX7rMd_G7jx?@NBII$o-NoL-bC;CaFQz2kvjnwK zs?9l1Y{c3U&nnwa|2-RzBH8!$#JDyFFQ}`QRl!566vvTt&>^uz3Jk<~YC?z)trqV_ zxxbU1=xsuN+f6HfX=zQx5@RA9Vxm}_upoEJ*nEMi&Uz|(V*~+6T89lXH0#(rs6m`j zJP!)HOz;mest9Hk(F0ylQ12PxWq+*VTCDpC{_|87jMOm9cQlaS5}rR=*6<`EtpUdCR%!|(PJC&Q zy~VQ-OMua|FO+E0FGl@D_|B)OL68Qz)V9joCUoHM-y;IC2B!VW-J7r@yC zXle&E@nL)*xbgQ8^gS54IgMe!6Y!X&@T7FX2Z=iUbZ~!=;-7_i?yywPWS|EiL^G9S z1kE9@FycPVWBm0;eDkslmisLFVX(Gsl*D)N zOZ80gr5CXt@$lxVKN1~bK(DF?A@JKqS0>63o`pvvIfk3aQIruc=NO{3HDa3*F37;- zuTkY4LEQC_%a<)};C3PyXjSlQ7aM=50zeF>edX0bd&s3abZ25XS4G zhtafL52A^?-tz`CdZ~uXQ_sSUj^ZHyMgU4%!e>Sns7HLnEB2t`2%B%H81zHHj~yev!3LLcf?J_KPsyQ{CgAq zMp|!(+a2V@HilTg$I4OMTni)mzED_M_tqCSwcT^yMrx+^&xd}uS=vGOe%ZJ^w1BL5 z-H!ELR*jihE9&L{#M(km{E4+FI<@TmZP&n`1MON|OSNkj&?O7#2Jef+6`X)3LbyBl zdsx7&@EgywGWO#5It%i|bO#wgC;|h%%-~<+m*bh;h_VcA0<=L8Lp0+T9OAqHkJk=? zXi%Xu(JtX*qZM6*Fwr1`2>9}PPcP8hH)zCR7`AEgu_4!jL8eGtCBR)J%Xw1qh#GvkE@yxrr;97nGO#y z(QhO5%|Zg|Y)+f9Vu}P3?h7B0rS>vgU4;uyT8MJR8WC)uW{mbL8q5#G`f>Q_L zFe*0_zTm?rWygq<48{rTOim`or}g**f-lX-K^(pi7ecfSJ|4b%T9D(D57t3Mp}5fy zCl-F=AZ`rNEGhB0+{y~?WI7Vf(h8sabcXcp4bpR~FC2P6+-|^20`Y57G+g|ypW%@A zg^7siVKrd_lPu!(W#VCkS6C=CJued?*GlokjM)*m;M$rBhs%I9oLIjO=yi?Ky@BwB zKVdP^l)qqhd|zZwSSjo-%o2Nu90_OMgq1~00xy)ttStJmBHB?m#WOhDCkWlrOO%_i(x53FyHO%UF{;pJ71`ek>_M?Nnb4<=4W*&Tf;xLibw#Z%rKPVYE%2o(g}R?Q zMaepQBg`_;I6{AugyJE^Ol{Tkp~ZG_Jto_ILZtc1Q2<;uE0ycX?ActKUp z>P0^AhrX+FRdJL$t7{;x(R!CTF4H`E2{P&-<{36^;^cz-iQ{JZkS}fv^ao(w_mZOZ za`O3fV}x_AqpCEq%u!)-a;=wB z&)fLBN*97?u|R*;GOKiohN7LJ?%W^dlf1O_VJ#ppPrC;!y?*yVkrN)+!-}IE$Cq98 zo*?_~P*y#GuLPp)xLb%@zwdEyY%mx;uHzbUsoMhh#j}VW$~XbXZ>}=t`t3uVjBBC% zb*tZP1$+Srj8<#Cz`wZ)-HXdh*~PT<=Ke?nc^lO&ZUPn6tv=cWS{XUvdKLpXQ%K8B zpz*{x62)l)abk!zfo5@8uQ-xu=5f7Ep!sB((ITAGZ{`WG@p$=PoB(eU^AS$B^b_d! zIC?gi^@?+&{$hT@1lDBuY#AV?6XvdNc!jyDPorpqxm~l2rVZwR9!J#%vp|odYxj|2 zV@BDQfxhtSUY3?YJf1J3Y5&KN3LXE#7_y&#V@RcLH1`>(q84VmnvxFb6?^S z5=2fQr=6%_Q?)amBZlLa$61egp^cQRM$BL(!xrewnfGB?Muw-JhuF#v7}f} zG;2Tc`7kw2l%qDWq}=ATmAK@p58pfDb_3@D&w((+M`PMmFT&fe2mnfO$0n z2|@E}Y+@+~N@XuZ3$|6%l-qnL1yye_!y;|jNI-f$fx%t~#4$-fvZ0aq!^)~9c`Lez z*|Q8WC>_gS^Bj^e1==ab8ke51T8#J>PDCJT%emldKpys;(Gv99Xfa z)Gt~rIT~x0+y?L2;r2y68W*C*PgUc`VttxhVJn%>BU%4eBVhxc*lkb=)X_qSrv}Wu zol}%dr`Lk;eLKbHsZ;UaUgmhE(TDV4FH;^4>TE~`aOGUg9ZQJ*;3lTove+naiB@2t zB7WaH2GdM=V=soQ4Yc?{*Jf}v;#B?rP$%5(1j<8&Al`-xe6#QoZWA2fKZrL?f*gW) zqlsh?#2ZQ>EJsVP%Ezr)iI4{ZVAIE&LDxd$eo}KQmL$9!r$<8rOR|SnOV{T%t>?F- z`kIeH-|lw$UQ(QDXDK>)kFh=pzvYMm-X|0qaHL_KH?K;u&LdW)2*t=U8(3%@rMYut z84qFoj5|~HdyzJfu%tH_#=(OPONJ81lBsBdB@6rp9lDx+gl8JW4qee@FE~vG38pwl z0nX7HPNRNOO^=2E7ORk#m%diT;cs8d=sdZhW%JNMh`D)XziPEXjr@<+Y_(oAu`XLC z81guVkG7(R_JJZGpW^_3)I>?`HtQ*_5vUGBW)D zSR~#oiDo(uao_;}%NQQ7bw#{&MZ6IY0O+qL*45gJ;ofE~@!j};2v|eoI@(^rEz8wO zJnJIY!wy8*LFT6tLy5b=e-<~4cs}*W;zS$7i6eR=|M5TvEALdI8CAn4AUhE4E%h<6gC<8<|eG`--rH{_iFdG|qa`$3!m@H-IV z_J=quU$&6RmWkPE))Jzec40V*APm#Z9+|H|ai&8Urm-EkS=kLIezg;(_k|+$R@gBL z{b5l{CoziveHP*3cm1JIuLsQ8>XZD3nU}T{t-EW+Kju)qjt&LE-sE;RXjx#j7{k3YN3NCO%Q1b%>1dVwa=@P1ih6 z^mT0lpCUey9}9GO;#qLk%J^a@8J`ypAv#M}{9Lno2MSKQ$;~_;jYcgcx<5Yh3`8LO z9hWoDL3$n>nC6x^^K5}`PCFB?-3JUod_)u9EK{BVJQN|`c==!pi~I~bFBeV+=k<75 zqio9g&v3YuE`o#~rZoj3^e_Ehl;5QQ_e*yRVc#?fGj^CU&Fkm%Zy%B;><9e8* zI&l2vq(?agZ<)&R$M`9qm}aC0^BGMv>)=McRp9JV!-%e-EyWEpOB$#6?MefzA1a^@ zsK`@C;SCROw0JxVGT_dS*y4(2)R34NMu)g+!vPoe2H*yxr3|Atwju9;WMVlEhUYEN z0kgd075_Bu$9B~Dk{OpjZ@gL98Nd^6yg5Ie5Fb1*acRfLOCgPtzdv?CJ1aEG11f)- zl+L92sxZgX+;%}3on28`Fu#5v=vI$Q^?s%7cs=I$2=T8y@P>!;?*#rmEcK~t9UQ{E zfG-zxViClfKsy8-OrSFWj~5AGbo9T62jFpdI)*F33-m?M-v<~BFbF{(0PY_GFbn~Y z!~H`LAd{EPbx4_QgM^!#r&R0@`eu>Rg2TMnK~&BBjcYKVOTgQ~f4pewEN{7-ur?+6 zVxwP1B=g|+G$Y}m{2F0IlG|kTWH(Z||6-i)iE1NKnb{b~$lNAobqzC(Elm(KW0Il@ zG}VZtqrok$Wje3bL_0x={qiPz>*l)hXAsu9xvo$DzrMMyI~GoaqqfAS!?6Ok(}Wsk zwhd`?K+UMO`FMNJtb6wPwjoatsOMhpV;jp4KPzs=8}CATpi=IsFIhR>I1eSCU{w3X z76v!DbJ1zn?(PKO6NP%KR0d6$QD33iRc z({2|w$iqs?9TlZcdzFGjH9n_PiFx^sT4fi>GQ+GfJtiOyFSk{dR@jTHY)!KY%%}=B zD041WSe#Z4ZX&j`4C0z~?c(=P_nygJyk?9mV?wy{=?~}3z09oEv)L`oT1gr4%@k8V zqM2f1arlENArObnu)0E=a3KzVe55<*0-Fi3SrWHt%r@$yiM9g6(!>#MgPpm{A#Dca z5l^(k1l@rrmX5WpMG&_G!qWxPbphT70=>P!-3__43G4R6*+{^|)u+SG|F_`#uprBc7D|^Z>K>uCb1RNyB7fu_v(I=c9 z(5kzmHnGfB;*uXc#tDRD0^Ten>t1ma88tXEplx71WOgN$NY?JU8ywEww=({I+g%b{)tJ$=~i$% z&*`4@qJM^4@)$opB4Dt^@3V2+;L-}ks?nF)jKhJB?@+ z!&Svh^wDOu<;1hld3l5$moQ~nVjAbqS+Bbau~$F2N3N9 z13%e-_OxM>Oxpm(d+q*iB!YFe(URHamh*H}I!S^=sJsL`@9OxN<7%zgbvx)I(dQTHi3vuiIPtXT(eCy^7 zCTSkKdxJSziuhfab2X~_Z-(RCEjAo*p9s0X!XQk1nss<^;`Qi?uJIld_k_?}FT*{k z1>1U)P>*v>Q3}lFguw*PXxWZKwtl% zF*gO>Ek??wihQ$$hY_h3BZG-2TSmI9yjmgpe_M4g-vlxw=K>d%K#SrvWJEvthPBl8Y)#$MqlWb3rZYnWfOfG<5HiBm7Iry?0(^fVW zVw%dPsG}k6DS(TmIf-aXh-?aimk^^MA3k;B=Qd)AZ*Oh)CdcckFpJv~Uw}TnmnAQy zcOgVy!}xbkbP7v}W><*)${*v`FF9&GG5_x0F%V-u&3+ zMw-#x&Obb6bdTb6>kUCzMvHWJ(0@Nk{4Q+Bc%vuDxE3m7Q6qi@!w$7#{(`{yx>x1>+5Fyma%zOZR`77qefyiJ**MmHN*&J<#!kR36^+ z;|EF}EqVL^$+|>nC&l}ld?U_|`Q!5OfXA>mHN#_pUJY&5lyiS5@ozIKlpNRbdSOfC zYXP0tHn)`Cm~ZIB5>LG`r#hY(-(nVLXl&ff&u52#pP#|zT!a(dvu1Ly?yr_dBF-$( zv$y2-n6Gq2>FFvuQRr%;qpRgw1q+OZIDH+6vEDGf@I`O5f0dijnxQOZ3|PgizJ(Ip;|^5M^4y~=88`NosRmv39W97mog z<=c+@&y?>-TrOP<_~+{`?STJ8qLeL;!;b~9urP?jdpU-0I8oY-p+u8S8-lH~XuYMP zTgqtM0;6w!sO}))2O*Uaqyo zZF%v?1K)ygLRpN$Wf9{?S;PWu!;yUXVbiXHe7l>-ubYYdxZGm=kQr2{C_I#S~9~39bNletnUL-v;Yr3qTLl$1%Ns2b|1&z0E^8vN+s) z{ahZeb)T-q=KcTWeL7y3o1-oA=SI0$m${Dd8+UNi&Ju;{rMixUdDC5>GieBnD!gzg zL_H<1YY0NnDfkd#i5Dv!#NpdIaRM`Z@sgiNA0pt!#-4=30m=zw9#F?K1_7Usjh<-wr!mG90t{r}&E}C%M#jbvymWR!ARm7f_&7XW z?t|!kdfMEG9V4={O~hn)U^R^QOHDbnYH6O1Uw06O%e(cNbJE%?HnCUq*6_K_58Ov< zO{*$^UG0y)`!9;}H)g&xFLzM&!1U$YqjCM9)8XBhIi3>7OTIN#L9( zRMyqH5}bBdowF*zmVk5c+Orr8aN7A=Vg{>;^sgcE5yk~vdjfO$3XshAUF2z*oRO) z;~?%RqS;0mFQl6Q>6lDkM&EdTOlGSA-z1=KGSGJ_{1(9PRG@bX$}eBQ^^qX|X^{3b z_?-?oicp-H@a=T?HVeXLNbtRZJebU9L%bQlA2ZB;0OFiU6un_`VMk$$C71(pA4RmG zYda_>CY!ks2d66)(urk14}Qym?lR(D)@MVU3Wzh`AWjv;sew4vLL6Skte!~iS2G@~ z^?Dn7XanFphQdANAE5s(-k={K#upCuXp7I<+k?9JL4MJ%j4DO?86GdkYX9_v$L!Y^ z`3O@nuksOY{`7%O$0qqbN}Bs_U2io}St7|*1+X-}*^ zzG!sLExvFJ*X_y0c>VXzf~YIZinOigbd~-VQ9I(fl@s1m>nwinp#Sc6+}qPJTK2t@ zhwsF_Rvt2v3jr&(-uU&@nh2LH%q}OScq(`H7faAGHMgEF62da{bbgn8@9N>ZyI_nT zR`VtpBf9iwbv0(%)=KrwUR!J{EiE?bQ9K{xb?XV@j7fI8xZqZ|dia+Q+J(&P z8&`cjb2Uh$tDjzqe!o!IpLlM`2I#*hd7_8?v93ovg5_&#GSCYxgFMi}z6&ON2McAg zz@+9hiRK~1_ww?Tmp^$9^&?Lff0#c0PVsx9{(D~|xgq?6#q`68apM>2sB+oo$vX>f zBJjb;ldr#+lq4M~%KKU#SQ%tCU`QsORWZec<$(VEg_x=Om_C~rOVg*j`gkZ-Ix$x) z^RB~9s*N`9L`dPaTyTUs|_US5>8~rF)`g zgrC%m)WuhxVl-=Fjq)4iXMUrJaXkxX11dgH^pybdy9L?7RJPcZ^#*i7X+HO}bpn5n zEC-=HEHF1d5sp&BG21^|nl1~BqxeNIe&L7bsa@1jChz>BPGjw`!d6ar1InB8)sl8t zoAM%Zq@HF~AK`k9?h(f*frkKqKm<+U#{x5G?LwKQiuN19RO>>tBkx8HQqXMAkGv9M zGRqA>xBI$4M15aoS=gxQvl-%8vjh^m)jUBr;(xQEoN`y2mAIB}?mVcW1M_hlPjU=s z$$Wh+haS3eu++3%^R7Kro#3Z@5}Hvy3|~(ro%%3(Z$Ihj-2&w$-rVX9W!0OQ-`wh> z_)p*m%l)Z+%_&um*8+n$hQF8MAISav{G_j6^U^nn<7JCD(ZqdD#=Z@J-~P(C!92}i z_#Fhl1L1dw5^pDuH%#&O<^DvUc-CZ=*o-{;AnU;`nJ+ifD%LPuvv{jGT#{Q?Aap3sPNlzO5 zveh0VonYUDH4!hCAC&a~KJ+36I0;eP3puViZsRozUvM+JGvQZQFY$|hj)&uIEUh9Q z=4a~~{?vhNay;vR#nlq|GJIJI{;3@QNI&H;vYF`_q2RCM_(%Im&*)~RXOxD|(o?_} zfFKA#h!@}^F&>}e!}l-SAf98qKwAW6cPp`ZZGN}2GlF;kZkoFz05|8&O?mV9-4Ss5 zdmuH0DTegbo~$vKrj$sFarL{JoocBeFYfS2uL-7bOf1x!e|6m6F3<`F2ERo zJbtN=wmz$_D(fk2eUYykfK3j-o(TGQb-18}l|D>s&!Mf4|q zvk{*o4fq@dd1n*tG;kl_CkMWtgz&I9xscZw_|1dgvG6+%XgHa;>&tlHjk$RP83=s8 z4frPs_$LECe!4dsWKIQ{pGvfo!R)sX(iK2Cu}06+W+Ksl|Cio}Qto3AExzgYSrGBG zedc~@#mgyEpH6RLd?CcMy!<;Y>i_9pFQ-M__}3p1Rd(|{#TSl@Hp{78BRL93Ouh1< zKXU4wk2W6oVBe!X^C{}Z5u*kQzt4undCbq!N6ar?A7AaXBWhhv zM@?~QwcDw^w)zxaPqwzxb5!LEN1iwoPMEsY#18uJZfB*06Q<%>sXoRf?)|J3!!4Xn zO%{0ddM3@&sZt_b|7G!oL!oNvg;v=YB`OCO@klJB#^$ukHXm8yu4akn+G_2_ryh-H z77l>=YNlJf%4|~8jKf*J-95`n+aq8@PkuWUBj|w;Fn>=yLZ9Dt4qZ^^D4AbcWHTb<~uORsFy;_v!% zpv`rP*u7l%EW)>Zl+8y^dFrwngEv~gZmJx?F0^_ABi2uO z#x|oo8NL{Wp2Zx0JfbH?kaIcE#bnnJsIzY!2)6&KwXw<}Im;66?}FsX_fPoYMtq2o zx1DfU?p&<KyY6gKuwpoJYXV_fd6&0uOi$2kT~dEa-9i zp2Ypoj@je%XdKOw_CQ4uO?)#=&wlxeFM+tBL_2t8f$_Hw+kC2;C6E*2fK)FeT2XZW zy#rD$Ayp6gj&6y3Ii6_W<=G_pT42!0_<+sOaPIFzw1a0-xNM6ERp>I4QMw(cC&aBG zC^KnL4AbFbv~1Sq?_jJS(GA8iq=FB!oZJZD`hpLuOh9l7f_yJ91>h6`3cQ0q zbxUtgQGQmJz0q`h4CEOZJ{+WlX;3yfusDo1U( zb{MC&amIkX2m@OYv~>4T7;csk1;&Oo==`WgaZ7{hqY3|ItCE&EC?7YglFd*lF#cw; z8mYtSkJZQ^oYp|WwfSw0Q+}(?s9?PnAU3<2X3m6a23KTWr}(N-t`eIIf@4u|Sf#C| z#!)qIgh7(vk_xHL`Ix_m*b-q#oaIs@;=*dHZ1e4obL`r;6cjJamu8ITAupyGSd|wE z3>cB1QFWQG4XhEHtZe6s-6kPsY+Eiznfb`3zU6EYHV>ZwZ0hTlKWw_G8>*DJAz~G{ z(3c7v%^PX?<;mp6bvw<}EE_&2;H^&=pBczP=}ev+(6+h_aJxwjXbbX%p2m+&44ZtB zA5Y^HpLp@E0Vyr70Y)9BKP5iNL{h$RZ}=)gaK8N5fM+A`EG2)Jr<7xEeSx6m7lxZN zUydgd@C5h<{!SQW+JlrVcuhhp|BGEt8(6v&D`TYcvBu`KRYHD5iSj(pKay*0l`YX( zY2>ji<_`XnL2zxkt;Vh%ny9X*uH*V1RLjOA;4X7iRPZFWikfm8Pg3i0R?p|rxOoPC zxq!b4B2I7IUiSbhRkjZ`A+cP&<-}qFqQ#M#4NGE!HYq#WpfzQID6L!JhAgr<7V-y3 z)VCOd760E2>IhkW%8S2z(r8wOa^yTAoZ{%K)0>iVd&&GnMQ`+xb#7gvNF@>1ClARO zq5tUUSSmB{kI;&;cd-)lICDh^s1rHz(v4UKO;`7s?;t$v>s@4OWVpZ`6BRUzgLM z?yJ4nJ$a%EMYLRpM*swV12hA&hrrKwy7>estG97LB8|Xqr|ucrfd$(En}~dXt2%(7{_RVmxQ@b#ztOS{O|$j5$hS zC^G;bg+8ZM=uH#4!8F%r;SX&Uev^c6IJft;?f<7$_>C97A+#aUj?zsHq_tt-@md=O zuRBVju31sNf<<7@N>e8{eQiyeNp;oOXtbn`u@oB!T1wWSUhkzrZHpSzpg0)p z1Zz1lO;fFm7#!z`;}pa9-VK^$`rt+yzGEERDn>8NbpY{NVOwh#X{}wyw~?18Hxi1# z5CK&HPq?72y0*^ACmx~921#9oO^KbyVjop7jpYiXZuFEoYHKPWo90I!SiWe9CW3Gg zF+O)=GIgt~V)uTDHvb^RbY8+R$YkyS8+-&cy7rk+ z=AO3>A79o9sc=*}T>SJxGQKON@MnAWE8U5TRjD99VyhgGb zqFhO1vHLYB1 z)98s;&FC@J)y~o}HmA#8>#*@Nf5CES6P8??3mDB0co>R!>gFF3y7{|XyLl~MYkl&m zn}5Jd*KFF=8g%mpbNzCsqiVjEn)P;fdFk!`5PCZ!3Lj|gNtC}<&2Qd0H}5sV$Jt8k zt|@ier42d2TZeHJ4azWS>g*W@Dsx`n25q`n;e05GPQ8azusuX|1PY66h9}~KUFG&l zK1Yu%VI85P%H~p|2bUpM!)hw3UB%2avYO9adqu6u0sxZ)#FIoMrPQcb*$%5*QtWcL zD(r$dIu%GFwL7X3YF)OujtYlsNy5Oq)5aDJ64D1Sb7}HQN3;T#Q|WKk^Y-0XUQ2iX zFemrQ%zXOKIoroS*X~@pF?xDXrZ8U# z8o~&8fF6~zUtryF|Jw^cT#-!=K>ROc?@X#>6G8uGsm4A;{|rm+|I^})^vEBpu0Lbo zpVl9NzC$BRPw)0j82#yE#_%2W7KzHkyDI5^#?=M?Sbt-&=A2A_1$;Z1R8HMmZ&7L= z-F}!6e4BNJ!Z+PBSw=v2%K=m9mn`q5HuPWE?KeP}yb@YWZ8zO2V< zQ}C#Bo-l6C*-rlf`Rq~nuP4vRjM7bz&pMF*pECKGbhVP7 z_0fx123_e-P!38q(N+rO`nwXZo+#mTJrl?0o9QvBRHdS&KQihAqUgt<7k?<5RnD+JNYO+3&8O_|GkRDlSGr*zi`a`EW?Y=!o$gljhyMARTu}8EM*XA@ z=rKhuO_bGZt|5otUTcDgbG4vOPJiNW*<+&E*u@2-FsnlQV2GEC}Kwox%ejQchN%sIhzbSs} zHmLW2FUJDq^}Sp?1M0(oj_>8-JD(l|Is60d#BWObLU)3EcPjOpeh=-!F{Rw;KfK+8 z`nLzlZHLk>TR*u>Ca6A=*9)jOL4cpreNe9M0rZ%n&-5=vU#$xIne~B6J+@u}?cs~i zz8sK0$I=f$zDI$t{j55yexg1c+S$FJ*Sn$J`Ul#n)j;n7ki(B<@*kCR@4c4(&a{8p z-}Dcb{OIrXF;KoorppzXw`=q_S!}1P@@Rl|?jI_nVZ2gLpuB$qI>VImwSKAStMyCh zSN16FBmG%vFY5`kW4}W^cmw=LmHwhW1nM8}*M4B#@vryML!d7QfZv~BoCEdQ`V#cd z8-$DKV8``g}N;_E}1ntcs zXg42+a&!T{f0Xj74+XtFsPIuAtng+18T9CHr970$i{(_JJZ^Q1`li(x?n31&`q~ER*^pxOa$$zu-n=SoLli)7N&5YumA^l1dGmA8VwOENf z3AQjpbafIuPh?|!7G{v|ISFo*+{2{b>C$hy1k3=OG)A^CBXu*S-#nsN*?vX>cBS=c z2{uaMqls+n&C*Nyy^1Kt2Var=sYEgAcC`feNWkW27fMh8c#yND>S2i`0BbBOYo*EJG6D8ZLRF*sOHWb4V6ED3TY_v_Lx zdo1iS>6d-~T!Q-~cvkYqNWU);dE#ne1|MIO;1UTgBFY0{Otwi9bdcOnN^r5{W)n^J zNZMHvoJJJmg3=7d!hYY7e#a7df^Qio{Yo<%OM>)!o&={8`87mK52Cz|$P5zBlHfKe zj4dYIEy4K`R7w8+5-=IOCH;;eicv#m0CKVfpAcnHVz~rQNbrsXYbE$zf>$MYRDyRU zSSG;+2^LDQT7u7rGP!1x^0`DYxa@jifoeL4hf2h^6;IcUlzVF@~h-#crK8B6N&P08=L7qE`gIM55ldG zzOhAvKc!zrV~GSiB)2qcv#>>*Jg6TpNWUpWF^2diQH+(c-k8+~_RD5?t0dSZ!SzJ3 zUVelq#vC^jWpQG^1dA0>$ge=YC&BF!{7sa_oZS-alfJXLVQ(ToFlS+PPpYScjo?{5 zVr3`QGYk8EtpwjmZnmhy9@b+ERX-5ffTm?QQ4A`w`pfFEBzp_1C#?MTO0Zpm4~SyC z@+YDgNW5QybBMBt!WK&YAc~PpR(B=&StPvVItg|X#VB*N1ZyPND#2q!HVkWFbxW$h zvLQ%m=4@g7e@e91OV5_VSRG|;$Ztfpu5V#7O_Jaj3H~CAHU9My{74kzkBk>qzdw?I z9lV(%!KFm8cqFwG7AAXEhuB&JTk2+YpB;!{ZO4Za?2+OyoUHslmY_z0LlXQf!Ics` zOq7lNucUY^zu%=_rt_@ce=E6}Y|fS78zMh=Xkl`d>XL=&>(|QcJ``3GBwc|;h}`6) zlmhH2#GWGTnXY+K)AF!qJoXe|4@#4knuk5(v8Mogim(SG$QX+~I86pllY!D?BqL8| z@>uLCz@8%P!D+I1w4mIStRmH&n#3ao=BAESgEbnNy8yY1kbAo7&de@CZZ+RA$*Ea| z*fWdA8k4L_o4E^-dj@iA@iS8jk(-AH=Ow3T;*>nuOeZ2M|GgarAjyzgPk4?=~Ka5LG%E-r_Y1pHR zXk2pY804Oa-1*2o4Y{?n=~?;6t;NYo9)sLCTuYmsHU_!#k$W0)tL%(VPD;>xeJjS$5ACRJ~Jy9 zxyK;)H00LeXJ?K?y<^wGK?s8e`_3g4}AGH6dM7S;;*G zxpADS$X$rsT3+dy2u~(Tn~Bn9qO^$4Oq5q9$}1D)m5K7oO36cRmHCOusj8;S9<|k- zn5wC<JsG*DAh#Aj zTkB~g_hjUrg4|P)yAZk6?k+z$S?fuer%>~xBo$x}`jV~+P4>({9*iwr&7XO)@t16j zP3!pblQUBaum?rUN*RYe)38VF`|^|3Y?xc+KR-p&Jjt!fB0n{~0J(X1V7^+HC8vfx z3&qaTnySgknduX;XA<`0V^0zGsAGu9$rCl_B+aRo_mpH!PYcr1a*;b9xu+tx7C$2; zAGr&W8^Zlag|=XCn3#XrAP> z$=HLwq^3{A9^IE|nkOwK7kegQ&t&XD2r@8j8A-Y?g~*eMD9Fgtd8S|wMv#SQ%hUu_ zl%{p|l6xX@PeE=ieumZq6=h~jL~boSD^ulgdUA?3oME1E*fSn^vhbHI^d(i33iFJ` z9$hq)X=?H~?7`8pF}7^omwfC&*wWK;v1dH?;AojBnl_S_J>#(l<&%Y(%SsxDJ>#(l zr_n~N(^Hajk$XIHt3@+CO=}({w-zTY9fhl01x{DlkeuT*CyFppb84hyt7HXE&(=_A zBA$_)toP(&Pa*cG?dpsSZGb4b$0K(>au*`Es_3(llhiT4?8(C(wOO8}&EA;EVywT3(;N} zk3jwigETlkzsD8^9Nd(s9{ZAWjSlS#V&OLg=;Cq+ggm)E@HBApOa6@mcO>B9ddSm9gFhDV^L!$K_Bcdu zEb?>QZBQPZ2F_Ou(d$JFiib|9sV^UKUSlACo^KGyh11Sy;W#)gV!8414@Yrue6IIA zjPoCaefT@bElC3BMhth4C`s{wPEPFRO4Q=XOGTu5+Q_#^v2k z3B!xDQkc@pT6pJT;q9}9H_jG5I7#UKbQ= zSPulU#dkHlnyS&74Q}Qtf-5y$MPD|U?s*nkv))q5zVc*CdCr@$XmtY`-#4q2kXsv9 zJbi-!mgR@+gEwKGd3~-8Lbj<4JMe{$zd?BnbP?E>2$5nOzidSZkd?FSq`U%lLG7(HpaH6 zT5Ecd+)=htzv9`N6HjLT*!R1L90i}H*3^|xxH>r5>djlQwcj3!*R{PfN`;*n$vmgN zdEx+lmJlpqIbWKkw^U(y6{6iYf8qdmqQ>F|X7|b>!S&OF8(V$$Q==WT@&=H>!zCs-#7yya@-{YyaQA@8H0w(M=?b5HF` zPXH#a4aY|Y8}41c#sGg>zVu0kJL#mz!(4AI9pM)qbu|_LLOb>rFCN*SZsDD?`~Vx)`S%Hl;JL?A?;H$Y(`Dy5N{peldUTWIPlLpYe@SF3-J84kHa3<^ z7g*h)wYeRAO@Vt+;_(o-MUqi^wt_ZqqSOXdi*gwK*Fgo7g0JR&qh0uRKO(Rb-gR+P zFoSdZ68ql>mAO-+AKbc(WFy`Vday+-iJsYzHtsU&6$x>8o3%22s{3!vic#w^LpJ6` z;pB@{$t%sxt)V)9>$sw`VxNU_U1Yvc;eWAwP|tzTDj1ah;r%?$rss9wO!^CD1#{tc zmrj~|!i&v7!%Jqvvw54Z>ppJ(DujR)B6^*NjJZj>6@sH5AvrNmAA9a_C za2MgZ^H;*&JU|eW(%V@VaQcRQ&KZO4(9A8$g&SWsDErw)zHpXmgjY;JAsa> ze$tR3Epf|6{hALqzJPaNhE*D&nf48Fo+kVDhnf#_a>j@o!H*r5Ks5Nbpah zw;iv>J*A{*cModPzF344#w^WK{l6IxW{i(R>mt6XNHHHW0lj=Bj;AmWK4t(+2{)xP zn*C!$lA#iWqPLig-6xu_)k^_>Wf3isQhLqjN+{Dd%@5PYZJW=18=F32zXf9DjOX}Y zYT>iV8MkisPD_bMj36nMmPRyse3V6^+xr?n_|HtpzII6)c>djyFU{0k79;{nQd<)} z@OL(kHR2CDfI_k&$TA*NWfa2oyGLReH_3|Dw1F@UB}}pET}ZBZF#;W7XPU-ST4)-6 zFqS3j0S0C(fEBVW%+*Sx2V5@pA@cm2@0Q|K4*ZK`JwC<`#F4EJA5e;=tpSYNDS+^A z`=J`t)2@vpXZ=qJMCabq@Q15|Ek9-tvldnk{o91+Y|%xDM7O<5a8WlmCqyHJ5)~y@Yky*gSdQulcwt9pF?(Sq|$XU*=gk_=e91exqGH6cNV? zgYoL1)Rt3Z2+$!lnxU7zjGrMMhl~?+7S7xkp9o_o^w3^U zjhkp}N2s@HBZj1^xM1K<0J~+X?4y6Tiuy9vCc^M-2Iyb8;{>Gxt3n~5{0_DcU#kPK3O;Fc5E~{d!;5lo7W^@3Aek(j!Gq=d*xv3xZQngi z$i0TQ+bG3ktjva&+}nFZ@PBFk*8hwHp{2_IjE(&9o|Ij4Q8^}lK+hYmT=|H&L|NKa z&uImJU2lADeXjz_K_QqHX*>v`xJ_j*Y4O91ANlf>QT>2T^Byb-$g<7BdZ{DOmX_O? zm5d>T)jNi9Nk;_e{H4WaxC}=WBd)5RrSdO<&QVs@HL9SFzK$0hEtF>m8vmOJj@{9C zdOUPP^mpJrrp3lBVWmf;zy1%+QWDdaT%&StN*ika*^CywlP}ABATGn|+*z7k)4+wv zFu9 z!=_xyKFlh0`_0ToB)mkEU`l$)=phUdG+$TWQShvC9J3k7$`J2G<~f?!Ec&5B)idJl zEg}(O;{Ud{Jz#bI-&%k|2{-F3dpKy87&kIq`dt!T`d|p52jh1vNP4?vOt1=p72VjR zzv|vMgsBVWpW$+0|BGoV1v)BW9*JKtRQY-BNU_`45cc2mr4r!7W`2lxiDOSCPxQdl zGp8k1^>IlAB7Qb7Mz)ba*1s$vK+FGxc>jE`5NObF`H_=&!Jj?)#sBI1{d30>ph0sv zSiC^+@>d?q9xhtvt5D!mQqpR4YH$(ISV`=Wco8uG_UU<19`N)MUr~bAYBWEyisz(@ zNYp3br?`scWV$Y43oUQ1{c^@<|uUQ_a=yx3O~Y9B`5gq@Sa`fyB zTTkf6D5WCd7!Bga& zDZ{99s?V}%sfzW+??a(8|LWK zv&!N{!EW`FhSx4cB`4}TJCdM^yxVGOHEf3RtWz$x5Yd)#?VYdtAiA-1Zo`JAWC@x; z0_O0H{}V*#i!P+ID}hH5yv;Sj;7Y}cJO7G3VpZ1dQA6&FlQx|rmO@5i@tww;M>?k| zTBmy|F1$(p5`C>b+5}e0)ojV7_`&aDf|m%}qBd8D@f5qF(9gsgkw-H+VQdMZB5ci~ zY|KZAF;Nk_0WQvSr%wxRPVrEtc>L#idhWwHCCuAp*M*Mt<6eOC?RUaDK=4z{{i_K{uNE(Z^mP`F+2gNVq?}#dcmzXcI0y|~)W1@S%iDZ{ z(+h^3!TK;9C!vKq;pzH`=FJnD!BJxVg*Ko!+VGPS3LxoqWhOf{TKLidJM~|lE>-@|W5wM9p}Pk{3HcG{44beD5p<$_u9b?LjnM()3+A z-h1|^%KYg~%6B>v97CM!@dBP??z1J_iEH1~58c7<=_bv;qqm%;23#f*WEJNrnp+1# z#4a!0l>y9QB#@gqwD?GjM^`6;ZS|Ph(O&r&UZEy)Sk@l$=F&~Y8h1_!rOB1f(+Oj@ z`TZn9=bk3f%!=pAp;4ERjj6kHCYPB%&}CH9$&fVvhRRcgTH?~Q9(sPk@MKa4aa_vz zHj%rSW#9QUB}h~d;J{;e4ttzB&3|C0CQbLmo$d)%m0^AfO8MCyq%62Hd*TpYm|7at zVzn`u;~e25B}h6DOE}47JAPhr%!Z)GF@$|Sta`&5(}fcFiLhMF4{9$!OmZx6Em{D5 zbvgG1mt|U&@Q3{Z2!X|u``b9-ect1E$Zz|<$5bSJT*mG;Jj@cpXMLs=Fsw z_Y=W>g4Oa$#NH!V`7KV`PB+y-^7jj2<ft)`ZWeWsLTuzCMcMSV}$ma5#N=yTP zO8@)Ja9x&dh{Jd4yh1QvBJR~0E)gkqU@+sG+ETyoi+5~8zy*({X`id^5lL6e2Drj_ z-#>q-!1-eET|}6kpfV=oWQVUjr(PJm@B?m6NUu&<@Ek&Hr0dn~lV2DoDtE38^LKO- zou#TWw|`0JWp-;^bxk{0qs{qGS!}AJNMvD@(@!Ju_ONkZM!)aP<2B9iyl0?KkI#*C zWBkwcNwdLM-aL%qz`^p$>Gx2;N9NXBS|YBsY?OWu?a@eOcB03fniFOT5cl8Jr(K7j z!?&Uz#zY~DYuTzebbD?OJy}R0%w|d(N~f^fa&=(OEcH`d)e_S0<*rHoyLNiuWTv7^ zicb{OaL?QLxZ637v4lu!PX4*6id(_aNmp}R-5|#N9I?e0SDyDFt{>a5M0vD>-UMD* zNW=A+aDsg7C*Orbv+;woF06|(wDO#s&ya|az%=txT>oH1F&gou?^k$ z1QkscZciFaumYdlcaK`@)kiDq?mkrGi^@>elT7Y#xjpLpUTLw$@{w;!1f@4aA&nZfMsN519dDUlhK z*M?u`<&5I4xZ=geezRrPY{f-J_XkK?QhiR2V0^zju|>>YXMMYvetP3+pBnE^i_7*a z@y*s}+%2sFN=rO9>YgkOjm%x`|Dwg$QoA4ZV~&60T%VaW$K#YIr5_X&)s(YboOdt( zfAH~OzW)o%F4(FIyzBYh#29Sfu=)^tPq|6?n#C;%fa(37hg_6}>VA^_D|cD{vM0y7 z<5K5uk!w6+8RhQk3QmLWj_Du592@LU{N51r_wd}{=a;&BKfvSbXCY+vH_eHusKxpl zLdc4--rHl_3uj*DbD0}kOlI4^eJHPDI`}2if3cJAacDwh9O=7hTE{PM>+_Pu-O?* z`TDKQ7cu^#vi`Fw(*mwvGcUV-&onc$(Reml-;^~ye((S1k7xFc&0X!Qu&L_s?EhX@ z`ptw-UEV4FQ*xz-vPvoDr;2r3^nVkwgE7d?*$8Lgx-nyVeFt11H`IOy*4J}?@@^xpg<5NiCa`od~;k_fa%JYJLkYy{B>bQ^zP|z2qvlKTlix)C>yK`^81dpRmv+!!L z2l!R^ODx^?*ujHv5F1fF_+Cw&rL`~;wA}K72G3Fgus=Bians7VGXgE|6re^3`7$SS zNKQ3$rXL2(&*DPwJ`DaXM0fkE(e7q{h8b*lKyZM@)%pq=8kWz$QpQDe{j-~X*Gydm zSFtYoZwv<-JzR?HTy!CQv2bTKs|CIda#&DK!v7v_mu-&&i#D&IZ#|pRZ5;4t z#8!T09i=^e3Hv&{)`CzE5NGaJoA%;W%jKBT)S4{ldVm7VZAEy6D-EHm<~_4N1K9Hz zPm4E)EzAtbP-zgvP+Z&i{gA<&2#mM~&>eP-RX%^$6k3Zu>W&4?ZO`36T_+5TLhV&8 zWmg!dnI(&ARqd_8@w7rINwaT&*)j}ccAx-M4&ryk*#xp?7%grNWT`!-9EG4ox{~LT zbNk2`#T-Tp=YrO?$d7q~zJU3Id0~k47`%;P`ltmX7qz$`6HlUCW%A_?fZ{3zr=?!4 zBkVTg}NcDj%m`(0d3M za0>9ZqwJf7jEMTbJ5vC`w)=Y!-S+GuuXk%XhDGNL8cmAr16&kvoD`f=xQ|}D-O8A_ zPx^yiR^Z6Q;ysXh9qKKZ`l|iodIl{YzX$LPVWmkBLoBKUA2a>5VTit-ceYQ&in6K% z{R7q#*r|hurZzC@cKjq4`w_MVs7_RC2UdrYpFG%rg5jG#a8;vF4A@hxW={##9~>$* zOUNPhJ`~N?DMx~K4?`GSnj{{piuwH+IQ{lk0C!pCaDB0bl2VW3eGp=^(qXSvBNLwA zmjahf8~X?48LvO*B&F3xlQ8_01C+xaokQj8a1&jf!n>wzr`&T))ClsKEtQL;z;Jw) zypM~!Il@NmbxVABc9$k_pfeyOZtsjpTY3w;iGx#ez1qqTy~`I(gu zgl>V65YrMtVkr!mI zVJAg6kr7WE=B@Y8Ve&f3W2;(7d*dzg3{PUCEKZmoNSe%T(FBFtsW4Y@G^>xaNv|b( z@EM#8c2j@y-6d>$GHS=4%rSy457+pYuiOrb1zyhx;G#PnM&i@8H9u578}ytU3{1Ka zLS8;2Ub1paM;~-ZV9hDW+zK41-(!siEkP$UF5pu2^{t2~x8i`ImMsMo{T(c;0D#on zJlOwJOW&k__H`u!!g77fg%0+=_}M;I=!JsO=gnUCKoQ&)y%N5JHb zU{5Qv-$i8Bce^8^!*<}_y^<~QG3b9n`28zRM~?eg{j*}Fa+&B|k{MR%?s_N)w$9A8 zia%@#8tAhO*jdJo>R6GPqfgRCI>mvN`Wgh#%_M2w}MT4m=UL8gcvpAbVz-nxN#c}z=8?7#j*z!T5{LPDNW5Ah@rOu;vdQCdFrR@bhsW~G{ z?i%Gr$UHn#JD*1l4-8Mrlv9{}om%;94Id%b2C?b2AkD0ocWyvo&eOw4?S>T@`XG0? zo})R|q@vd;%e)jSv;Lk|t%e>xi{96($5&;Tw83<*R*3ID-Pj$At}`6_2VztflaX`H zOczDeV8L`SsNnEB?}jwIQIJOUes$gO{&wOv4Hi_$u> zCPN7DThZ_vjY-&1i=t7hxvq6$gTuPjuyV4mh_&dC;<|R4pH0FzC?Aj3A~fLR?-7&y z+Z&KTw#BqPz?3viWq4$8?=WKfXA-EGyadlnhu4LnpHQM-!y151jC3CJtNVAIwhui& z-wYYuvg&5}4w6RQi3<0*pjBwRv_YW8A12 zpVz$L58HA!W}l{qyemGB+v}54^T|{}Za73xY9UUV*&F2g{YCc6ps}{Sz|EQ3MQ7CB zx#bt8d%1AE9!3(vVy`t|D~2TQQyQMzwA?msl`*2T$4c8DJgy4(4ZwF z%wbDV`Z&M^%FtolqK}vEeBQKMF93~Z0Gk=FE-*>o{oo+I2|4k_HzO-D#|Nxh8aGx-hRbtVlnAKAOnUU{w@~4g&V2~ z4_3$JQpi6krc&eUOr!3h@HS3iXoUSu^|#k= z#NG7JZInO8HfJJP4;!6cXz-%ixp407S~9Ma1A$o;O|H9#GJ#}>ZK}?M)H`2XZL+q; zA`hxQ_x@p`uT$9VtL1HH_sGl!%_(qwg2MGIs#wjnsDh1F=wOPYK?!__O+oCy!9 zgD*`jjT1w}#b56)<(ZN~0En%vVUOGSc%5lqYOb)+*rqRwRFs7f(r>Q~g001*byHj3 zkyw?7}CFl1h=8hZ5aUt2ykYk05qPx$Xk-*hn9b6(p*^g%u^h1F&kO{V*D z_8XyZ5T^U#n~hER4}}mYuhW7`9O?Kyc3cguK`-ht zIM3nfR<>v007`W8l%u}}?gkdA6F9KGH^0C1IWcg+Iy^>udoCr=l(3)m@tnPBV=CN_ zSlu5LIFRiSBL*}TLOh&ppd+&eNa+tgK1*u#SHs^FU{}9!C(HNHmF$&I*z;;1PTUBk4-EzuY9HyO`R=cI z(Xp3{1G~;*J3(-*duq3b#S~?>gx2@ALn@htN3($(a_Ef8JYj{fh~20?t1%NPq8C%l-{8rvV$$IXS+#4t9&OxCGTrfN20vIwJ0_v?yn-F2JrBDA&9J7B?9t^5 zs1l#7q4c68lE5Pny6-||6ZF?6HNb8&hgu5(#JW*&Xzvi^UtTY z9l0DTpPhwx9`eVU_}4<@AM`ACchQ~RfB3-G=sqMt8mzu#G^W|- z&614Hdy3-_ulSk7vy)q5h2k4@^?lS%ZizUWaCRCbAo?z;T?a%0wDCLa&fhBp1d|Eo$$dVXwPs_nHy8OaC0-<;d?b? zY)=m`-(IkX&J-ve+XFw?=~#u#EvE6EP?MX71{K6C0i8R_(cfeaze3bgu&=7gVxwNT=}3THT(;l)G% zmXp!wZn~X+WaXR=?I+OT%2*m?a1fHcCF|6W!m>c4YCK6{IcVw)Sjye_uH`Yce8>5w z!{*wLlgRt{uZ8;D`WuB+156Tw8q^Gim_X9Pki+4*`Vh_oG$mZsi=8N!4?}!eyqzb9 zQo8@O0H9Rw5H10;9Qz1J87>BhkoVl2mQ4c>ocvD(9<*Y$euHb{fm~Fe)+#YnAUohH z%<#JzWR6!4$@+o9Vg`;1EFRPo4`>)uU^aUmD@} zc`B-aLM|H0%5(G{Lg^CB?^SFNApwWjWw34_&BIE=j1VtPE3O?#yn~RHD`WaW%zBPeD!Fabrf3jn(k2el-d7 z`*Zd1AGD>abmk=w=2zDU2Eq{U&7{`rbe3IFsDYzs)?PC*whkb~-#VskVnQuE88MOZ zxC+>hika6W<$vm$Jiiy?h}y4aO#+U}+ZM7UkPhvuxjQqdceTnP6+%tePzNotd(ZB8 z{Ur2bxv2Jbe@i(Y9EMo`{a<7GY>yeE#i;Wfd3XO6!xV&jj>~;DyOy%y?S=FEtYYBV zIEzx=>}MreC;VWkv&tkq%EKauyW&_n2QQvpPn3Q&V;V4kp_Cnbd)tsgc2vzML}4tg zZPv?EbGBq6VhRiDrs|7g+HZ>6JaQuIUb&)=5#8~&F6L`KU)!f` z2(EZ+gae7B(JozEA%!ICY2aDGiIj&fqryP#EbDypw4#w1X>%D?8k zpF65vIT<3DW|lUun`*0QAwl8Jr*Nll8-&7A+Z_8<(bLNS(=<6p_&vpc!L^_@ zF!YwCsu4sIW15%%aCcA5>~M>{K2<`Bc;|6&-^~3)dRrI<{4C;=bnLw;cFu-xOi4?y zLaMvw+GX=lj$9uhynbrN!|5JPAe3e{gVmKhAE;?o*{^W;(c=*XK%CfoibVGUFib=|Pt*fB&h$3Qy~??bgVK zGn1LygMK|)$~=SMMAw7bXvssxbIP-rhYWYV?my)>y$(2s1>PTiC^@98*%yf1a~38* zE_EN?nN>&XFX^A44E*O^$Py@MH4X<_g45mHI#oxeP1DwXoJql&8icm(@DSFbTCcX} z^i_Fds@|A{^@0ao0lAo}j+!nh=taU7o7PgtDhFtve!&u4?z-${*IX?i&2@?J&dMgZ zotNQ!`l_Fs^VN z6P3(i0Ox!RJf4g~IL+#(T-I-#UaCI-8r!hI@z_t`2WkBY6Y{}%A0`G04dC~!;dj&P zFKHM6`>wa^VrD)u4S=7nw{KuBVAdoka^;j~AA<}+`%^^=oLg^RB;2xSV02%2>3AKSyV*{#Ix|ihkWoWnyqAtmK8}+NfB&3!|9U~|ANnmz zuqeXV0PcKbEt{Qv| zAqRIkiLn-#hD`%!cs`F8zRIX%-{^@r!;^b-g2$xGNr!Nl?I;S1XlkY4br_k>Vw|9>m|&Bs@6RV1a}upmnmAGa?z{`c5( zKM4$n}hEz*+cFCWfyX3A!g_89A zU%m0aD~^L5r^7s}Xk%iYZFQyU4uJLRdq%D(T zsF|_1eL8WkOCv&mdX%lDk6p2EVKe=w*ETB3;F@iBbAwgo&z(49Odk*;Q{eO&5D0bx8s{#;;zj`7LT>Z#KaVy@E!7d@8x^nPX>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> end of section 1 + +######################################################################## +# Read file: call script for combining df for lig # +######################################################################## + +getwd() + +source("combining_two_df_lig.R") + +#---------------------- PAY ATTENTION +# the above changes the working dir +#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" +#---------------------- PAY ATTENTION +#========================== +# This will return: + +# df with NA: +# merged_df2 +# merged_df3 + +# df without NA: +# merged_df2_comp +# merged_df3_comp +#=========================== + +########################### +# Data for OR and stability plots +# you need merged_df3_comp +# since these are matched +# to allow pairwise corr +########################### + +# uncomment as necessary +#<<<<<<<<<<<<<<<<<<<<<<<<< +# REASSIGNMENT +my_df2 = merged_df3_comp +#my_df2 = merged_df3 +#<<<<<<<<<<<<<<<<<<<<<<<<< + +# delete variables not required +rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) + +# quick checks +colnames(my_df2) +str(my_df2) + +# sanity check +# Ensure correct data type in columns to plot: need to be factor +is.numeric(my_df2$OR) +#[1] TRUE + +# sanity check: should be <10 +if (max(my_df2$Dis_lig_Ang) < 10){ + print ("Sanity check passed: lig data is <10Ang") +}else{ + print ("Error: data should be filtered to be within 10Ang") +} + +#<<<<<<<<<<<<<<<< +# REASSIGNMENT +# FOR Lig Plots +#<<<<<<<<<<<<<<<< + +Lig_df = my_df2 + +rm(my_df2) + +#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> end of section 1 + +############# +# Plots: Bubble plot +# x = Position, Y = stability +# size of dots = OR +# col: stability +############# + +#================= +# generate plot 1: DUET vs OR by position as geom_points +#================= + +my_ats = 20 # axis text size +my_als = 22 # axis label size + +# Spelling Correction: made redundant as already corrected at the source + +#PS_df$DUET_outcome[PS_df$DUET_outcome=='Stabilizing'] <- 'Stabilising' +#PS_df$DUET_outcome[PS_df$DUET_outcome=='Destabilizing'] <- 'Destabilising' + +table(PS_df$DUET_outcome) ; sum(table(PS_df$DUET_outcome)) + +g = ggplot(PS_df, aes(x = factor(Position) + , y = ratioDUET)) + +p1 = g + + geom_point(aes(col = DUET_outcome + , size = OR)) + + theme(axis.text.x = element_text(size = my_ats + , angle = 90 + , hjust = 1 + , vjust = 0.4) + , axis.text.y = element_text(size = my_ats + , angle = 0 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = my_als) + , axis.title.y = element_text(size = my_als) + , legend.text = element_text(size = my_als) + , legend.title = element_text(size = my_als) ) + + #, legend.key.size = unit(1, "cm")) + + labs(title = "" + , x = "Position" + , y = "DUET(PS)" + , size = "Odds Ratio" + , colour = "DUET Outcome") + + guides(colour = guide_legend(override.aes = list(size=4))) + +p1 + +#================= +# generate plot 2: Lig vs OR by position as geom_points +#================= +my_ats = 20 # axis text size +my_als = 22 # axis label size + +# Spelling Correction: made redundant as already corrected at the source + +#Lig_df$Lig_outcome[Lig_df$Lig_outcome=='Stabilizing'] <- 'Stabilising' +#Lig_df$Lig_outcome[Lig_df$Lig_outcome=='Destabilizing'] <- 'Destabilising' + +table(Lig_df$Lig_outcome) + +g = ggplot(Lig_df, aes(x = factor(Position) + , y = ratioPredAff)) + +p2 = g + + geom_point(aes(col = Lig_outcome + , size = OR))+ + theme(axis.text.x = element_text(size = my_ats + , angle = 90 + , hjust = 1 + , vjust = 0.4) + , axis.text.y = element_text(size = my_ats + , angle = 0 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = my_als) + , axis.title.y = element_text(size = my_als) + , legend.text = element_text(size = my_als) + , legend.title = element_text(size = my_als) ) + + #, legend.key.size = unit(1, "cm")) + + labs(title = "" + , x = "Position" + , y = "Ligand Affinity" + , size = "Odds Ratio" + , colour = "Ligand Outcome" + ) + + guides(colour = guide_legend(override.aes = list(size=4))) + +p2 + +#====================== +#combine using cowplot +#====================== +# set output dir for plots +getwd() +setwd("~/git/Data/pyrazinamide/output/plots" +getwd() + +svg('PS_Lig_OR_combined.svg', width = 32, height = 12) #inches +#png('PS_Lig_OR_combined.png', width = 2800, height = 1080) #300dpi +theme_set(theme_gray()) # to preserve default theme + +printFile = cowplot::plot_grid(plot_grid(p1, p2 + , ncol = 1 + , align = 'v' + , labels = c("A", "B") + , label_size = my_als+5)) +print(printFile) +dev.off() + diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_LIG.R b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_LIG.R new file mode 100644 index 0000000..30b9981 --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_LIG.R @@ -0,0 +1,154 @@ +getwd() +setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") +getwd() + +######################################################################## +# Installing and loading required packages # +######################################################################## + +source("../Header_TT.R") + +######################################################################## +# Read file: call script for combining df for lig # +######################################################################## + +source("../combining_two_df_lig.R") + +#---------------------- PAY ATTENTION +# the above changes the working dir +#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" +#---------------------- PAY ATTENTION + +#========================== +# This will return: + +# df with NA: +# merged_df2 +# merged_df3 + +# df without NA: +# merged_df2_comp +# merged_df3_comp +#=========================== + +########################### +# Data for Lig plots +# you need merged_df3 +# or +# merged_df3_comp +# since these have unique SNPs +# I prefer to use the merged_df3 +# because using the _comp dataset means +# we lose some muts and at this level, we should use +# as much info as available +########################### + +# uncomment as necessary +#%%%%%%%%%%%%%%%%%%%%%%%% +# REASSIGNMENT +my_df = merged_df3 +#my_df = merged_df3_comp +#%%%%%%%%%%%%%%%%%%%%%%%% + +# delete variables not required +rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) + +# quick checks +colnames(my_df) +str(my_df) + +############################# +# Extra sanity check: +# for mcsm_lig ONLY +# Dis_lig_Ang should be <10 +############################# + +if (max(my_df$Dis_lig_Ang) < 10){ + print ("Sanity check passed: lig data is <10Ang") +}else{ + print ("Error: data should be filtered to be within 10Ang") +} +######################################################################## +# end of data extraction and cleaning for plots # +######################################################################## + +#========================== +# Plot: Barplot with scores (unordered) +# corresponds to Lig_outcome +# Stacked Barplot with colours: Lig_outcome @ position coloured by +# Lig_outcome. This is a barplot where each bar corresponds +# to a SNP and is coloured by its corresponding Lig_outcome. +#============================ + +#=================== +# Data for plots +#=================== + +#%%%%%%%%%%%%%%%%%%%%%%%% +# REASSIGNMENT +df = my_df +#%%%%%%%%%%%%%%%%%%%%%%%% + +rm(my_df) + +# sanity checks +upos = unique(my_df$Position) + +# should be a factor +is.factor(df$Lig_outcome) +#TRUE + +table(df$Lig_outcome) + +# should be -1 and 1: may not be in this case because you have filtered the data +# FIXME: normalisation before or after filtering? +min(df$ratioPredAff) # +max(df$ratioPredAff) # + +# sanity checks +tapply(df$ratioPredAff, df$Lig_outcome, min) +tapply(df$ratioPredAff, df$Lig_outcome, max) + +#****************** +# generate plot +#****************** + +# set output dir for plots +getwd() +setwd("~/git/Data/pyrazinamide/output/plots") +getwd() + +my_title = "Ligand affinity" + +# axis label size +my_xaxls = 13 +my_yaxls = 15 + +# axes text size +my_xaxts = 15 +my_yaxts = 15 + +# no ordering of x-axis +g = ggplot(df, aes(factor(Position, ordered = T))) +g + + geom_bar(aes(fill = Lig_outcome), colour = "grey") + + theme( axis.text.x = element_text(size = my_xaxls + , angle = 90 + , hjust = 1 + , vjust = 0.4) + , axis.text.y = element_text(size = my_yaxls + , angle = 0 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = my_xaxts) + , axis.title.y = element_text(size = my_yaxts ) ) + + labs(title = my_title + , x = "Position" + , y = "Frequency") + +# for sanity and good practice +rm(df) +#======================= end of plot +# axis colours labels +# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors +# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_PS.R new file mode 100644 index 0000000..169bdaf --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_PS.R @@ -0,0 +1,149 @@ +getwd() +setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") +getwd() + +######################################################################## +# Installing and loading required packages and functions # +######################################################################## + +source("../Header_TT.R") + +######################################################################## +# Read file: call script for combining df for PS # +######################################################################## + +source("../combining_two_df.R") + +#---------------------- PAY ATTENTION +# the above changes the working dir +#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" +#---------------------- PAY ATTENTION + +#========================== +# This will return: + +# df with NA: +# merged_df2 +# merged_df3 + +# df without NA: +# merged_df2_comp +# merged_df3_comp +#========================== + +########################### +# Data for DUET plots +# you need merged_df3 +# or +# merged_df3_comp +# since these have unique SNPs +# I prefer to use the merged_df3 +# because using the _comp dataset means +# we lose some muts and at this level, we should use +# as much info as available +########################### + +# uncomment as necessary +#<<<<<<<<<<<<<<<<<<<<<<<<< +# REASSIGNMENT +my_df = merged_df3 +#my_df = merged_df3_comp +#<<<<<<<<<<<<<<<<<<<<<<<<< + +# delete variables not required +rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) + +# quick checks +colnames(my_df) +str(my_df) + +# Ensure correct data type in columns to plot: need to be factor +# sanity check +is.factor(my_df$DUET_outcome) +my_df$DUET_outcome = as.factor(my_df$DUET_outcome) +is.factor(my_df$DUET_outcome) +#[1] TRUE + +######################################################################## +# end of data extraction and cleaning for plots # +######################################################################## + +#========================== +# Plot 2: Barplot with scores (unordered) +# corresponds to DUET_outcome +# Stacked Barplot with colours: DUET_outcome @ position coloured by +# DUET outcome. This is a barplot where each bar corresponds +# to a SNP and is coloured by its corresponding DUET_outcome +#============================ + +#=================== +# Data for plots +#=================== + +#<<<<<<<<<<<<<<<<<<<<<<<< +# REASSIGNMENT +df = my_df +#<<<<<<<<<<<<<<<<<<<<<<<<< + +rm(my_df) + +# sanity checks +upos = unique(df$Position) + +# should be a factor +is.factor(my_df$DUET_outcome) +#[1] TRUE + +table(my_df$DUET_outcome) + +# should be -1 and 1 +min(df$ratioDUET) +max(df$ratioDUET) + +tapply(df$ratioDUET, df$DUET_outcome, min) +tapply(df$ratioDUET, df$DUET_outcome, max) + +#****************** +# generate plot +#****************** + +# set output dir for plots +getwd() +setwd("~/git/Data/pyrazinamide/output/plots") +getwd() + +my_title = "Protein stability (DUET)" + +# axis label size +my_xaxls = 13 +my_yaxls = 15 + +# axes text size +my_xaxts = 15 +my_yaxts = 15 + +# no ordering of x-axis +g = ggplot(df, aes(factor(Position, ordered = T))) +g + + geom_bar(aes(fill = DUET_outcome), colour = "grey") + + + theme( axis.text.x = element_text(size = my_xaxls + , angle = 90 + , hjust = 1 + , vjust = 0.4) + , axis.text.y = element_text(size = my_yaxls + , angle = 0 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = my_xaxts) + , axis.title.y = element_text(size = my_yaxts ) ) + + labs(title = my_title + , x = "Position" + , y = "Frequency") + +# for sanity and good practice +rm(df) +#======================= end of plot +# axis colours labels +# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors +# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_LIG.R b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_LIG.R new file mode 100644 index 0000000..a5d9361 --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_LIG.R @@ -0,0 +1,202 @@ +getwd() +setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") +getwd() + +######################################################################## +# Installing and loading required packages and functions # +######################################################################## + +source("../Header_TT.R") +source("../barplot_colour_function.R") + +######################################################################## +# Read file: call script for combining df for lig # +######################################################################## + +source("../combining_two_df_lig.R") + +#---------------------- PAY ATTENTION +# the above changes the working dir +#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" +#---------------------- PAY ATTENTION + +#========================== +# This will return: + +# df with NA: +# merged_df2 +# merged_df3 + +# df without NA: +# merged_df2_comp +# merged_df3_comp +#=========================== + +########################### +# Data for Lig plots +# you need merged_df3 +# or +# merged_df3_comp +# since these have unique SNPs +# I prefer to use the merged_df3 +# because using the _comp dataset means +# we lose some muts and at this level, we should use +# as much info as available +########################### + +# uncomment as necessary +#<<<<<<<<<<<<<<<<<<<<<<<<< +# REASSIGNMENT +my_df = merged_df3 +#my_df = merged_df3_comp +#<<<<<<<<<<<<<<<<<<<<<<<<< + +# delete variables not required +rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) + +# quick checks +colnames(my_df) +str(my_df) + +# Ensure correct data type in columns to plot: need to be factor +# sanity check +is.factor(my_df$Lig_outcome) +my_df$Lig_outcome = as.factor(my_df$Ligoutcome) +is.factor(my_df$Lig_outcome) +#[1] TRUE + +############################# +# Extra sanity check: +# for mcsm_lig ONLY +# Dis_lig_Ang should be <10 +############################# + +if (max(my_df$Dis_lig_Ang) < 10){ + print ("Sanity check passed: lig data is <10Ang") +}else{ + print ("Error: data should be filtered to be within 10Ang") +} +######################################################################## +# end of data extraction and cleaning for plots # +######################################################################## + +#========================== +# Plot: Barplot with scores (unordered) +# corresponds to Lig_outcome +# Stacked Barplot with colours: Lig_outcome @ position coloured by +# stability scores. This is a barplot where each bar corresponds +# to a SNP and is coloured by its corresponding Lig stability value. +# Normalised values (range between -1 and 1 ) to aid visualisation +# NOTE: since barplot plots discrete values, colour = score, so number of +# colours will be equal to the no. of unique normalised scores +# rather than a continuous scale +# will require generating the colour scale separately. +#============================ + +#=================== +# Data for plots +#=================== + +#<<<<<<<<<<<<<<<<<<<<<<<< +# REASSIGNMENT +df = my_df +#<<<<<<<<<<<<<<<<<<<<<<<<< + +rm(my_df) + +# sanity checks +table(df$Lig_outcome) + +# should be -1 and 1: may not be in this case because you have filtered the data +# FIXME: normalisation before or after filtering? +min(df$ratioPredAff) # +max(df$ratioPredAff) # + +# sanity checks +# very important!!!! +tapply(df$ratioPredAff, df$Lig_outcome, min) + +tapply(df$ratioPredAff, df$Lig_outcome, max) + + +#****************** +# generate plot +#****************** + +# set output dir for plots +getwd() +setwd("~/git/Data/pyrazinamide/output/plots") +getwd() + +# My colour FUNCTION: based on group and subgroup +# in my case; +# df = df +# group = Lig_outcome +# subgroup = normalised score i.e ratioPredAff + +# Prepare data: round off ratioLig scores +# round off to 3 significant digits: +# 165 if no rounding is performed: used to generate the originalgraph +# 156 if rounded to 3 places +# FIXME: check if reducing precision creates any ML prob + +# check unique values in normalised data +u = unique(df$ratioPredAff) + +# <<<<< ------------------------------------------- +# Run this section if rounding is to be used +# specify number for rounding +n = 3 +df$ratioLigR = round(df$ratioPredAff, n) +u = unique(df$ratioLigR) # 156 +# create an extra column called group which contains the "gp name and score" +# so colours can be generated for each unique values in this column +my_grp = df$ratioLigR +df$group <- paste0(df$Lig_outcome, "_", my_grp, sep = "") + +# else +# uncomment the below if rounding is not required + +#my_grp = df$ratioLig +#df$group <- paste0(df$Lig_outcome, "_", my_grp, sep = "") + +# <<<<< ----------------------------------------------- + +# Call the function to create the palette based on the group defined above +colours <- ColourPalleteMulti(df, "Lig_outcome", "my_grp") +my_title = "Ligand affinity" + +# axis label size +my_xaxls = 13 +my_yaxls = 15 + +# axes text size +my_xaxts = 15 +my_yaxts = 15 + +# no ordering of x-axis +g = ggplot(df, aes(factor(Position, ordered = T))) +g + + geom_bar(aes(fill = group), colour = "grey") + + scale_fill_manual( values = colours + , guide = 'none') + + theme( axis.text.x = element_text(size = my_xaxls + , angle = 90 + , hjust = 1 + , vjust = 0.4) + , axis.text.y = element_text(size = my_yaxls + , angle = 0 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = my_xaxts) + , axis.title.y = element_text(size = my_yaxts ) ) + + labs(title = my_title + , x = "Position" + , y = "Frequency") + +# for sanity and good practice +rm(df) +#======================= end of plot +# axis colours labels +# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors +# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_PS.R new file mode 100644 index 0000000..8828e90 --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_PS.R @@ -0,0 +1,192 @@ +getwd() +setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") +getwd() + +######################################################################## +# Installing and loading required packages and functions # +######################################################################## + +source("../Header_TT.R") +source("../barplot_colour_function.R") + +######################################################################## +# Read file: call script for combining df for PS # +######################################################################## + +source("../combining_two_df.R") + +#---------------------- PAY ATTENTION +# the above changes the working dir +#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" +#---------------------- PAY ATTENTION + +#========================== +# This will return: + +# df with NA: +# merged_df2 +# merged_df3 + +# df without NA: +# merged_df2_comp +# merged_df3_comp +#=========================== + +########################### +# Data for DUET plots +# you need merged_df3 +# or +# merged_df3_comp +# since these have unique SNPs +# I prefer to use the merged_df3 +# because using the _comp dataset means +# we lose some muts and at this level, we should use +# as much info as available +########################### + +# uncomment as necessary +#<<<<<<<<<<<<<<<<<<<<<<<<< +# REASSIGNMENT +my_df = merged_df3 +#my_df = merged_df3_comp +#<<<<<<<<<<<<<<<<<<<<<<<<< + +# delete variables not required +rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) + +# quick checks +colnames(my_df) +str(my_df) + +# Ensure correct data type in columns to plot: need to be factor +# sanity check +is.factor(my_df$DUET_outcome) +my_df$DUET_outcome = as.factor(my_df$DUET_outcome) +is.factor(my_df$DUET_outcome) +#[1] TRUE + +######################################################################## +# end of data extraction and cleaning for plots # +######################################################################## + +#========================== +# Barplot with scores (unordered) +# corresponds to DUET_outcome +# Stacked Barplot with colours: DUET_outcome @ position coloured by +# stability scores. This is a barplot where each bar corresponds +# to a SNP and is coloured by its corresponding DUET stability value. +# Normalised values (range between -1 and 1 ) to aid visualisation +# NOTE: since barplot plots discrete values, colour = score, so number of +# colours will be equal to the no. of unique normalised scores +# rather than a continuous scale +# will require generating the colour scale separately. +#============================ + +#=================== +# Data for plots +#=================== + +#<<<<<<<<<<<<<<<<<<<<<<<< +# REASSIGNMENT +df = my_df +#<<<<<<<<<<<<<<<<<<<<<<<<< + +rm(my_df) + +# sanity checks +upos = unique(df$Position) + +# should be a factor +is.factor(my_df$DUET_outcome) +#[1] TRUE + +table(df$DUET_outcome) + +# should be -1 and 1 +min(df$ratioDUET) +max(df$ratioDUET) + +tapply(df$ratioDUET, df$DUET_outcome, min) +tapply(df$ratioDUET, df$DUET_outcome, max) + +#****************** +# generate plot +#****************** + +# set output dir for plots +getwd() +setwd("~/git/Data/pyrazinamide/output/plots") +getwd() + +# My colour FUNCTION: based on group and subgroup +# in my case; +# df = df +# group = DUET_outcome +# subgroup = normalised score i.e ratioDUET + +# Prepare data: round off ratioDUET scores +# round off to 3 significant digits: +# 323 if no rounding is performed: used to generate the original graph +# 287 if rounded to 3 places +# FIXME: check if reducing precicion creates any ML prob + +# check unique values in normalised data +u = unique(df$ratioDUET) + +# <<<<< ------------------------------------------- +# Run this section if rounding is to be used +# specify number for rounding +n = 3 +df$ratioDUETR = round(df$ratioDUET, n) +u = unique(df$ratioDUETR) +# create an extra column called group which contains the "gp name and score" +# so colours can be generated for each unique values in this column +my_grp = df$ratioDUETR +df$group <- paste0(df$DUET_outcome, "_", my_grp, sep = "") + +# else +# uncomment the below if rounding is not required + +#my_grp = df$ratioDUET +#df$group <- paste0(df$DUET_outcome, "_", my_grp, sep = "") + +# <<<<< ----------------------------------------------- + +# Call the function to create the palette based on the group defined above +colours <- ColourPalleteMulti(df, "DUET_outcome", "my_grp") +my_title = "Protein stability (DUET)" + +# axis label size +my_xaxls = 13 +my_yaxls = 15 + +# axes text size +my_xaxts = 15 +my_yaxts = 15 + +# no ordering of x-axis +g = ggplot(df, aes(factor(Position, ordered = T))) +g + + geom_bar(aes(fill = group), colour = "grey") + + scale_fill_manual( values = colours + , guide = 'none') + + theme( axis.text.x = element_text(size = my_xaxls + , angle = 90 + , hjust = 1 + , vjust = 0.4) + , axis.text.y = element_text(size = my_yaxls + , angle = 0 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = my_xaxts) + , axis.title.y = element_text(size = my_yaxts ) ) + + labs(title = my_title + , x = "Position" + , y = "Frequency") + +# for sanity and good practice +rm(df) +#======================= end of plot +# axis colours labels +# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors +# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_LIG.R b/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_LIG.R new file mode 100644 index 0000000..c4826d3 --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_LIG.R @@ -0,0 +1,215 @@ +getwd() +setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") +getwd() + +######################################################################## +# Installing and loading required packages # +######################################################################## + +source("../Header_TT.R") + +#require(data.table) +#require(dplyr) + +######################################################################## +# Read file: call script for combining df for lig # +######################################################################## + +source("../combining_two_df_lig.R") + +#---------------------- PAY ATTENTION +# the above changes the working dir +#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" +#---------------------- PAY ATTENTION + +#========================== +# This will return: + +# df with NA: +# merged_df2 +# merged_df3 + +# df without NA: +# merged_df2_comp +# merged_df3_comp +#=========================== + +########################### +# Data for Lig plots +# you need merged_df3 +# or +# merged_df3_comp +# since these have unique SNPs +# I prefer to use the merged_df3 +# because using the _comp dataset means +# we lose some muts and at this level, we should use +# as much info as available +########################### + +# uncomment as necessary +#<<<<<<<<<<<<<<<<<<<<<<<<< +# REASSIGNMENT +my_df = merged_df3 +#my_df = merged_df3_comp +#<<<<<<<<<<<<<<<<<<<<<<<<< + +# delete variables not required +rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) + +# quick checks +colnames(my_df) +str(my_df) + +# Ensure correct data type in columns to plot: need to be factor +# sanity check +is.factor(my_df$Lig_outcome) +my_df$Lig_outcome = as.factor(my_df$lig_outcome) +is.factor(my_df$Lig_outcome) +#[1] TRUE + +############################# +# Extra sanity check: +# for mcsm_lig ONLY +# Dis_lig_Ang should be <10 +############################# + +if (max(my_df$Dis_lig_Ang) < 10){ + print ("Sanity check passed: lig data is <10Ang") +}else{ + print ("Error: data should be filtered to be within 10Ang") +} + +######################################################################## +# end of data extraction and cleaning for plots # +######################################################################## + +#=========================== +# Plot: Basic barplots +#=========================== + +#=================== +# Data for plots +#=================== + +#<<<<<<<<<<<<<<<<<<<<<<<<< +# REASSIGNMENT +df = my_df +#<<<<<<<<<<<<<<<<<<<<<<<<< +rm(my_df) + +# sanity checks +str(df) + +if (identical(df$Position, df$position)){ + print("Sanity check passed: Columns 'Position' and 'position' are identical") +} else{ + print("Error!: Check column names and info contained") +} + +#**************** +# generate plot: No of stabilising and destabilsing muts +#**************** +# set output dir for plots +getwd() +setwd("~/git/Data/pyrazinamide/output/plots") +getwd() + +svg('basic_barplots_LIG.svg') + +my_ats = 25 # axis text size +my_als = 22 # axis label size + +# uncomment as necessary for either directly outputting results or +# printing on the screen +g = ggplot(df, aes(x = Lig_outcome)) +#prinfFile = g + geom_bar( + g + geom_bar( + aes(fill = Lig_outcome) + , show.legend = TRUE +) + geom_label( + stat = "count" + , aes(label = ..count..) + , color = "black" + , show.legend = FALSE + , size = 10) + theme( + axis.text.x = element_blank() + , axis.title.x = element_blank() + , axis.title.y = element_text(size=my_als) + , axis.text.y = element_text(size = my_ats) + , legend.position = c(0.73,0.8) + , legend.text = element_text(size=my_als-2) + , legend.title = element_text(size=my_als) + , plot.title = element_blank() + ) + labs( + title = "" + , y = "Number of SNPs" + #, fill='Ligand Outcome' + ) + scale_fill_discrete(name = "Ligand Outcome" + , labels = c("Destabilising", "Stabilising")) +print(prinfFile) +dev.off() + +#**************** +# generate plot: No of positions +#**************** +#get freq count of positions so you can subset freq<1 +#require(data.table) +setDT(df)[, pos_count := .N, by = .(Position)] #169, 36 + +head(df$pos_count) +table(df$pos_count) +# this is cummulative +#1 2 3 4 5 6 +#5 24 36 56 30 18 + +# use group by on this +snpsBYpos_df <- df %>% + group_by(Position) %>% + summarize(snpsBYpos = mean(pos_count)) + +table(snpsBYpos_df$snpsBYpos) +#1 2 3 4 5 6 +#5 12 12 14 6 3 +# this is what will get plotted + +svg('position_count_LIG.svg') + +my_ats = 25 # axis text size +my_als = 22 # axis label size + +g = ggplot(snpsBYpos_df, aes(x = snpsBYpos)) +prinfFile = g + geom_bar( + #g + geom_bar( + aes (alpha = 0.5) + , show.legend = FALSE +) + + geom_label( + stat = "count", aes(label = ..count..) + , color = "black" + , size = 10 + ) + + theme( + axis.text.x = element_text( + size = my_ats + , angle = 0 + ) + , axis.text.y = element_text( + size = my_ats + , angle = 0 + , hjust = 1 + ) + , axis.title.x = element_text(size = my_als) + , axis.title.y = element_text(size = my_als) + , plot.title = element_blank() + ) + + labs( + x = "Number of SNPs" + , y = "Number of Sites" + ) +print(prinfFile) +dev.off() +######################################################################## +# end of Lig barplots # +######################################################################## + + diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_PS.R new file mode 100644 index 0000000..51a2812 --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_PS.R @@ -0,0 +1,211 @@ +getwd() +setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") +getwd() + +######################################################################## +# Installing and loading required packages and functions # +######################################################################## + +source("../Header_TT.R") + +######################################################################## +# Read file: call script for combining df for PS # +######################################################################## + +source("../combining_two_df.R") + +#---------------------- PAY ATTENTION +# the above changes the working dir +#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" +#---------------------- PAY ATTENTION + +#========================== +# This will return: + +# df with NA: +# merged_df2 +# merged_df3 + +# df without NA: +# merged_df2_comp +# merged_df3_comp +#========================== + +########################### +# Data for DUET plots +# you need merged_df3 +# or +# merged_df3_comp +# since these have unique SNPs +# I prefer to use the merged_df3 +# because using the _comp dataset means +# we lose some muts and at this level, we should use +# as much info as available +########################### + +# uncomment as necessary +#<<<<<<<<<<<<<<<<<<<<<<<<< +# REASSIGNMENT +my_df = merged_df3 +#my_df = merged_df3_comp +#<<<<<<<<<<<<<<<<<<<<<<<<< + +# delete variables not required +rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) + +# quick checks +colnames(my_df) +str(my_df) + +# Ensure correct data type in columns to plot: need to be factor +# sanity check +is.factor(my_df$DUET_outcome) +my_df$DUET_outcome = as.factor(my_df$DUET_outcome) +is.factor(my_df$DUET_outcome) +#[1] TRUE + +######################################################################## +# end of data extraction and cleaning for plots # +######################################################################## + +#=========================== +# Plot: Basic barplots +#=========================== + +#=================== +# Data for plots +#=================== + +#<<<<<<<<<<<<<<<<<<<<<<<<< +# REASSIGNMENT +df = my_df +#<<<<<<<<<<<<<<<<<<<<<<<<< + +rm(my_df) + +# sanity checks +str(df) + +if (identical(df$Position, df$position)){ + print("Sanity check passed: Columns 'Position' and 'position' are identical") +} else{ + print("Error!: Check column names and info contained") + } + +#**************** +# generate plot: No of stabilising and destabilsing muts +#**************** +# set output dir for plots +getwd() +setwd("~/git/Data/pyrazinamide/output/plots") +getwd() + +svg('basic_barplots_DUET.svg') + +my_ats = 25 # axis text size +my_als = 22 # axis label size + +theme_set(theme_grey()) + +# uncomment as necessary for either directly outputting results or +# printing on the screen +g = ggplot(df, aes(x = DUET_outcome)) +prinfFile = g + geom_bar( +#g + geom_bar( + aes(fill = DUET_outcome) + , show.legend = TRUE + ) + geom_label( + stat = "count" + , aes(label = ..count..) + , color = "black" + , show.legend = FALSE + , size = 10) + theme( + axis.text.x = element_blank() + , axis.title.x = element_blank() + , axis.title.y = element_text(size=my_als) + , axis.text.y = element_text(size = my_ats) + , legend.position = c(0.73,0.8) + , legend.text = element_text(size=my_als-2) + , legend.title = element_text(size=my_als) + , plot.title = element_blank() + ) + labs( + title = "" + , y = "Number of SNPs" + #, fill='DUET Outcome' + ) + scale_fill_discrete(name = "DUET Outcome" + , labels = c("Destabilising", "Stabilising")) + +print(prinfFile) +dev.off() + +#**************** +# generate plot: No of positions +#**************** +#get freq count of positions so you can subset freq<1 +#setDT(df)[, .(Freq := .N), by = .(Position)] #189, 36 + +setDT(df)[, pos_count := .N, by = .(Position)] #335, 36 +table(df$pos_count) +# this is cummulative +#1 2 3 4 5 6 +#34 76 63 104 40 18 + +# use group by on this +snpsBYpos_df <- df %>% + group_by(Position) %>% + summarize(snpsBYpos = mean(pos_count)) + +table(snpsBYpos_df$snpsBYpos) +#1 2 3 4 5 6 +#34 38 21 26 8 3 + +foo = select(df, Mutationinformation + , WildPos + , wild_type + , mutant_type + , mutation_info + , position + , pos_count) #335, 5 + +getwd() +write.csv(foo, "../Data/pos_count_freq.csv") + +svg('position_count_DUET.svg') +my_ats = 25 # axis text size +my_als = 22 # axis label size + +g = ggplot(snpsBYpos_df, aes(x = snpsBYpos)) +prinfFile = g + geom_bar( +#g + geom_bar( + aes (alpha = 0.5) + , show.legend = FALSE + ) + + geom_label( + stat = "count", aes(label = ..count..) + , color = "black" + , size = 10 + ) + + theme( + axis.text.x = element_text( + size = my_ats + , angle = 0 + ) + , axis.text.y = element_text( + size = my_ats + , angle = 0 + , hjust = 1 + ) + , axis.title.x = element_text(size = my_als) + , axis.title.y = element_text(size = my_als) + , plot.title = element_blank() + ) + + labs( + x = "Number of SNPs" + , y = "Number of Sites" + ) +print(prinfFile) +dev.off() +######################################################################## +# end of DUET barplots # +######################################################################## + diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_PS.R new file mode 100644 index 0000000..f3a507e --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_PS.R @@ -0,0 +1,175 @@ +getwd() +setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") +getwd() + +######################################################################## +# Installing and loading required packages and functions # +######################################################################## + +source("../Header_TT.R") + +#source("barplot_colour_function.R") + +######################################################################## +# Read file: call script for combining df for PS # +######################################################################## + +source("../combining_two_df.R") + +#---------------------- PAY ATTENTION +# the above changes the working dir +#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" +#---------------------- PAY ATTENTION + +#========================== +# This will return: + +# df with NA: +# merged_df2 +# merged_df3 + +# df without NA: +# merged_df2_comp +# merged_df3_comp +#========================== + +########################### +# Data for PS Corr plots +# you need merged_df3_comp +# since these are matched +# to allow pairwise corr +########################### + +#<<<<<<<<<<<<<<<<<<<<<<<<< +# REASSIGNMENT +my_df = merged_df3_comp +#<<<<<<<<<<<<<<<<<<<<<<<<< + +# delete variables not required +rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) + +# quick checks +colnames(my_df) +str(my_df) + +######################################################################## +# end of data extraction and cleaning for plots # +######################################################################## + +#=========================== +# Plot: Correlation plots +#=========================== + +#=================== +# Data for plots +#=================== + +#<<<<<<<<<<<<<<<<<<<<<<<< +# REASSIGNMENT +df = my_df +#<<<<<<<<<<<<<<<<<<<<<<<<< + +rm(my_df) + +# sanity checks +str(df) + +table(df$DUET_outcome) + +# unique positions +length(unique(df$Position)) #{RESULT: unique positions for comp data} + + +# subset data to generate pairwise correlations +corr_data = df[, c("ratioDUET" +# , "ratioPredAff" +# , "DUETStability_Kcalpermol" +# , "PredAffLog" +# , "OR" + , "logor" +# , "pvalue" + , "neglog10pvalue" + , "AF" + , "DUET_outcome" +# , "Lig_outcome" + , "pyrazinamide" + )] +dim(corr_data) +rm(df) + +# assign nice colnames (for display) +my_corr_colnames = c("DUET" +# , "Ligand Affinity" +# , "DUET_raw" +# , "Lig_raw" +# , "OR" + , "Log(Odds Ratio)" +# , "P-value" + , "-LogP" + , "Allele Frequency" + , "DUET_outcome" +# , "Lig_outcome" + , "pyrazinamide") + +# sanity check +if (length(my_corr_colnames) == length(corr_data)){ + print("Sanity check passed: corr_data and corr_names match in length") +}else{ + print("Error: length mismatch!") +} + +colnames(corr_data) +colnames(corr_data) <- my_corr_colnames +colnames(corr_data) + +############### +# PLOTS: corr +# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs +############### +#default pairs plot +start = 1 +end = which(colnames(corr_data) == "pyrazinamide"); end # should be the last column +offset = 1 + +my_corr = corr_data[start:(end-offset)] +head(my_corr) + +#my_cols = c("#f8766d", "#00bfc4") +# deep blue :#007d85 +# deep red: #ae301e + +#========== +# psych: ionformative since it draws the ellipsoid +# https://jamesmarquezportfolio.com/correlation_matrices_in_r.html +# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs +#========== + +# set output dir for plots +getwd() +setwd("~/git/Data/pyrazinamide/output/plots" +getwd() + +svg('DUET_corr.svg', width = 15, height = 15) +printFile = pairs.panels(my_corr[1:4] + , method = "spearman" # correlation method + , hist.col = "grey" ##00AFBB + , density = TRUE # show density plots + , ellipses = F # show correlation ellipses + , stars = T + , rug = F + , breaks = "Sturges" + , show.points = T + , bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr$DUET_outcome))] + , pch = 21 + , jitter = T + #, alpha = .05 + #, points(pch = 19, col = c("#f8766d", "#00bfc4")) + , cex = 3 + , cex.axis = 2.5 + , cex.labels = 3 + , cex.cor = 1 + , smooth = F +) + +print(printFile) +dev.off() diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_lig.R b/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_lig.R new file mode 100644 index 0000000..4e05d41 --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_lig.R @@ -0,0 +1,187 @@ +getwd() +setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") +getwd() + +######################################################################## +# Installing and loading required packages # +######################################################################## + +source("../Header_TT.R") + +#source("barplot_colour_function.R") + +######################################################################## +# Read file: call script for combining df for lig # +######################################################################## + +source("../combining_two_df_lig.R") + +#---------------------- PAY ATTENTION +# the above changes the working dir +#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" +#---------------------- PAY ATTENTION + +#========================== +# This will return: + +# df with NA: +# merged_df2 +# merged_df3 + +# df without NA: +# merged_df2_comp +# merged_df3_comp +#=========================== + +########################### +# Data for Lig Corr plots +# you need merged_df3_comp +# since these are matched +# to allow pairwise corr +########################### + +#<<<<<<<<<<<<<<<<<<<<<<<<< +# REASSIGNMENT +my_df = merged_df3_comp +#<<<<<<<<<<<<<<<<<<<<<<<<< + +# delete variables not required +rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) + +# quick checks +colnames(my_df) +str(my_df) + +############################# +# Extra sanity check: +# for mcsm_lig ONLY +# Dis_lig_Ang should be <10 +############################# + +if (max(my_df$Dis_lig_Ang) < 10){ + print ("Sanity check passed: lig data is <10Ang") +}else{ + print ("Error: data should be filtered to be within 10Ang") +} + +######################################################################## +# end of data extraction and cleaning for plots # +######################################################################## + +#=========================== +# Plot: Correlation plots +#=========================== + +#=================== +# Data for plots +#=================== + +#<<<<<<<<<<<<<<<<<<<<<<<< +# REASSIGNMENT +df = my_df +#<<<<<<<<<<<<<<<<<<<<<<<<< + +rm(my_df) + +# sanity checks +str(df) + +table(df$Lig_outcome) + +# unique positions +length(unique(df$Position)) #{RESULT: unique positions for comp data} + +# subset data to generate pairwise correlations +corr_data = df[, c(#"ratioDUET", + "ratioPredAff" +# , "DUETStability_Kcalpermol" +# , "PredAffLog" +# , "OR" + , "logor" +# , "pvalue" + , "neglog10pvalue" + , "AF" +# , "DUET_outcome" + , "Lig_outcome" + , "pyrazinamide" + )] +dim(corr_data) +rm(df) + +# assign nice colnames (for display) +my_corr_colnames = c(#"DUET", + "Ligand Affinity" +# ,"DUET_raw" +# , "Lig_raw" +# , "OR" + , "Log(Odds Ratio)" +# , "P-value" + , "-LogP" + , "Allele Frequency" +# , "DUET_outcome" + , "Lig_outcome" + , "pyrazinamide") + +# sanity check +if (length(my_corr_colnames) == length(corr_data)){ + print("Sanity check passed: corr_data and corr_names match in length") +}else{ + print("Error: length mismatch!") +} + +colnames(corr_data) +colnames(corr_data) <- my_corr_colnames +colnames(corr_data) + +############### +# PLOTS: corr +# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs +############### + +# default pairs plot +start = 1 +end = which(colnames(corr_data) == "pyrazinamide"); end # should be the last column +offset = 1 + +my_corr = corr_data[start:(end-offset)] +head(my_corr) + +#my_cols = c("#f8766d", "#00bfc4") +# deep blue :#007d85 +# deep red: #ae301e + +#========== +# psych: ionformative since it draws the ellipsoid +# https://jamesmarquezportfolio.com/correlation_matrices_in_r.html +# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs +#========== + +# set output dir for plots +getwd() +setwd("~/git/Data/pyrazinamide/output/plots" +getwd() + +svg('Lig_corr.svg', width = 15, height = 15) +printFile = pairs.panels(my_corr[1:4] + , method = "spearman" # correlation method + , hist.col = "grey" ##00AFBB + , density = TRUE # show density plots + , ellipses = F # show correlation ellipses + , stars = T + , rug = F + , breaks = "Sturges" + , show.points = T + , bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr$Lig_outcome))] + , pch = 21 + , jitter = T +# , alpha = .05 +# , points(pch = 19, col = c("#f8766d", "#00bfc4")) + , cex = 3 + , cex.axis = 2.5 + , cex.labels = 3 + , cex.cor = 1 + , smooth = F +) +print(printFile) +dev.off() + diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_basic_barplot.R b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_basic_barplot.R new file mode 100644 index 0000000..1f868e4 --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_basic_barplot.R @@ -0,0 +1,227 @@ +getwd() +setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") +getwd() + +######################################################################## +# Installing and loading required packages # +######################################################################## + +source("../Header_TT.R") +#source("barplot_colour_function.R") + +require(data.table) + +######################################################################## +# Read file: call script for combining df # +######################################################################## + +source("../combining_two_df.R") + +#---------------------- PAY ATTENTION +# the above changes the working dir +#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" +#---------------------- PAY ATTENTION + +#========================== +# This will return: + +# df with NA: +# merged_df2 +# merged_df3 + +# df without NA: +# merged_df2_comp +# merged_df3_comp +#========================== + +########################### +# Data for plots +# you need merged_df2, comprehensive one +# since this has one-many relationship +# i.e the same SNP can belong to multiple lineages +########################### + +# uncomment as necessary +#<<<<<<<<<<<<<<<<<<<<<<<<< +# REASSIGNMENT +my_df = merged_df2 +#my_df = merged_df2_comp +#<<<<<<<<<<<<<<<<<<<<<<<<< + +# delete variables not required +rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) + +# quick checks +colnames(my_df) +str(my_df) + +# Ensure correct data type in columns to plot: need to be factor +is.factor(my_df$lineage) +my_df$lineage = as.factor(my_df$lineage) +is.factor(my_df$lineage) + +#========================== +# Plot: Lineage barplot +# x = lineage y = No. of samples +# col = Lineage +# fill = lineage +#============================ +table(my_df$lineage) + +# lineage1 lineage2 lineage3 lineage4 lineage5 lineage6 lineageBOV +#3 104 1293 264 1311 6 6 105 + +#=========================== +# Plot: Lineage Barplots +#=========================== + +#=================== +# Data for plots +#=================== + +#<<<<<<<<<<<<<<<<<<<<<<<<< +# REASSIGNMENT +df <- my_df +#<<<<<<<<<<<<<<<<<<<<<<<<< +rm(my_df) + +# get freq count of positions so you can subset freq<1 +#setDT(df)[, lineage_count := .N, by = .(lineage)] + +#****************** +# generate plot: barplot of mutation by lineage +#****************** +sel_lineages = c("lineage1" + , "lineage2" + , "lineage3" + , "lineage4") + +df_lin = subset(df, subset = lineage %in% sel_lineages ) + +#FIXME; add sanity check for numbers. +# Done this manually + +############################################################ + +######### +# Data for barplot: Lineage barplot +# to show total samples and number of unique mutations +# within each linege +########## + +# Create df with lineage inform & no. of unique mutations +# per lineage and total samples within lineage +# this is essentially barplot with two y axis + +bar = bar = as.data.frame(sel_lineages) #4, 1 +total_snps_u = NULL +total_samples = NULL + +for (i in sel_lineages){ + #print(i) + curr_total = length(unique(df$id)[df$lineage==i]) + total_samples = c(total_samples, curr_total) + print(total_samples) + + foo = df[df$lineage==i,] + print(paste0(i, "=======")) + print(length(unique(foo$Mutationinformation))) + curr_count = length(unique(foo$Mutationinformation)) + + total_snps_u = c(total_snps_u, curr_count) +} + +print(total_snps_u) +bar$num_snps_u = total_snps_u +bar$total_samples = total_samples +bar + +#***************** +# generate plot: lineage barplot with two y-axis +#https://stackoverflow.com/questions/13035295/overlay-bar-graphs-in-ggplot2 +#***************** + +bar$num_snps_u = y1 +bar$total_samples = y2 +sel_lineages = x + +to_plot = data.frame(x = x + , y1 = y1 + , y2 = y2) +to_plot + +melted = melt(to_plot, id = "x") +melted + +# set output dir for plots +getwd() +setwd("~/git/Data/pyrazinamide/output/plots") +getwd() + +svg('lineage_basic_barplot.svg') + +my_ats = 20 # axis text size +my_als = 22 # axis label size + +g = ggplot(melted + , aes(x = x + , y = value + , fill = variable) + ) + + +printFile = g + geom_bar( + +#g + geom_bar( + stat = "identity" + , position = position_stack(reverse = TRUE) + , alpha=.75 + , colour='grey75' + ) + theme( + axis.text.x = element_text( + size = my_ats +# , angle= 30 + ) + , axis.text.y = element_text(size = my_ats + #, angle = 30 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text( + size = my_als + , colour = 'black' + ) + , axis.title.y = element_text( + size = my_als + , colour = 'black' + ) + , legend.position = "top" + , legend.text = element_text(size = my_als) + + #) + geom_text( + ) + geom_label( + aes(label = value) + , size = 5 + , hjust = 0.5 + , vjust = 0.5 + , colour = 'black' + , show.legend = FALSE + #, check_overlap = TRUE + , position = position_stack(reverse = T) + #, position = (' + + ) + labs( + title = '' + , x = '' + , y = "Number" + , fill = 'Variable' + , colour = 'black' + ) + scale_fill_manual( + values = c('grey50', 'gray75') + , name='' + , labels=c('Mutations', 'Total Samples') + ) + scale_x_discrete( + breaks = c('lineage1', 'lineage2', 'lineage3', 'lineage4') + , labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4') + ) +print(printFile) +dev.off() diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_LIG.R b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_LIG.R new file mode 100644 index 0000000..ebbec76 --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_LIG.R @@ -0,0 +1,233 @@ +getwd() +setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") +getwd() + +######################################################################## +# Installing and loading required packages # +######################################################################## + +source("../Header_TT.R") +#source("barplot_colour_function.R") +#require(data.table) + +######################################################################## +# Read file: call script for combining df for Lig # +######################################################################## + +source("../combining_two_df_lig.R") + +#---------------------- PAY ATTENTION +# the above changes the working dir +#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" +#---------------------- PAY ATTENTION + +#========================== +# This will return: + +# df with NA: +# merged_df2 +# merged_df3 + +# df without NA: +# merged_df2_comp +# merged_df3_comp +#=========================== +########################### +# Data for plots +# you need merged_df2 or merged_df2_comp +# since this is one-many relationship +# i.e the same SNP can belong to multiple lineages +########################### + +# uncomment as necessary +#<<<<<<<<<<<<<<<<<<<<<<<<< +# REASSIGNMENT +my_df = merged_df2 +#my_df = merged_df2_comp +#<<<<<<<<<<<<<<<<<<<<<<<<< + +# delete variables not required +rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) + +# quick checks +colnames(my_df) +str(my_df) + +# Ensure correct data type in columns to plot: need to be factor +is.factor(my_df$lineage) +my_df$lineage = as.factor(my_df$lineage) +is.factor(my_df$lineage) + +table(my_df$mutation_info) + +############################# +# Extra sanity check: +# for mcsm_lig ONLY +# Dis_lig_Ang should be <10 +############################# + +if (max(my_df$Dis_lig_Ang) < 10){ + print ("Sanity check passed: lig data is <10Ang") +}else{ + print ("Error: data should be filtered to be within 10Ang") +} + +######################################################################## +# end of data extraction and cleaning for plots # +######################################################################## + +#========================== +# Plot: Lineage Distribution +# x = mcsm_values, y = dist +# fill = stability +#============================ + +#=================== +# Data for plots +#=================== + +# subset only lineages1-4 +sel_lineages = c("lineage1" + , "lineage2" + , "lineage3" + , "lineage4") + +# uncomment as necessary +df_lin = subset(my_df, subset = lineage %in% sel_lineages ) #2037 35 + +# refactor +df_lin$lineage = factor(df_lin$lineage) + +table(df_lin$lineage) #{RESULT: No of samples within lineage} +#lineage1 lineage2 lineage3 lineage4 +#78 961 195 803 + +# when merged_df2_comp is used +#lineage1 lineage2 lineage3 lineage4 +#77 955 194 770 + +length(unique(df_lin$Mutationinformation)) +#{Result: No. of unique mutations the 4 lineages contribute to} + +# sanity checks +r1 = 2:5 # when merged_df2 used: because there is missing lineages +if(sum(table(my_df$lineage)[r1]) == nrow(df_lin)) { + print ("sanity check passed: numbers match") +} else{ + print("Error!: check your numbers") +} + +#<<<<<<<<<<<<<<<<<<<<<<<<< +# REASSIGNMENT +df <- df_lin +#<<<<<<<<<<<<<<<<<<<<<<<<< + +rm(df_lin) + +#****************** +# generate distribution plot of lineages +#****************** +# basic: could improve this! +library(plotly) +library(ggridges) + +fooNames = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4') +names(fooNames) = c('lineage1', 'lineage2', 'lineage3', 'lineage4') + +g <- ggplot(df, aes(x = ratioPredAff)) + + geom_density(aes(fill = Lig_outcome) + , alpha = 0.5) + + facet_wrap( ~ lineage + , scales = "free" + , labeller = labeller(lineage = fooNames) ) + + coord_cartesian(xlim = c(-1, 1) +# , ylim = c(0, 6) +# , clip = "off" +) + ggtitle("Kernel Density estimates of Ligand affinity by lineage") + +ggplotly(g) + +# 2 : ggridges (good!) + +my_ats = 15 # axis text size +my_als = 20 # axis label size + +fooNames = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4') +names(fooNames) = c('lineage1', 'lineage2', 'lineage3', 'lineage4') + +# set output dir for plots +getwd() +setwd("~/git/Data/pyrazinamide/output/plots") +getwd() + +svg('lineage_dist_LIG.svg') + +printFile = ggplot( df, aes(x = ratioPredAff + , y = Lig_outcome) ) + + + geom_density_ridges_gradient( aes(fill = ..x..) + , scale = 3 + , size = 0.3 ) + + facet_wrap( ~lineage + , scales = "free" +# , switch = 'x' + , labeller = labeller(lineage = fooNames) ) + + coord_cartesian( xlim = c(-1, 1) +# , ylim = c(0, 6) +# , clip = "off" + ) + + + scale_fill_gradientn( colours = c("#f8766d", "white", "#00bfc4") + , name = "Ligand Affinity" ) + + theme( axis.text.x = element_text( size = my_ats + , angle = 90 + , hjust = 1 + , vjust = 0.4) +# , axis.text.y = element_text( size = my_ats +# , angle = 0 +# , hjust = 1 +# , vjust = 0) + , axis.text.y = element_blank() + , axis.title.x = element_blank() + , axis.title.y = element_blank() + , axis.ticks.y = element_blank() + , plot.title = element_blank() + , strip.text = element_text(size = my_als) + , legend.text = element_text(size = 10) + , legend.title = element_text(size = my_als) +# , legend.position = c(0.3, 0.8) +# , legend.key.height = unit(1, 'mm') + ) + +print(printFile) +dev.off() + +#=!=!=!=!=!=! +# COMMENT: When you look at all mutations, the lineage differences disappear... +# The pattern we are interested in is possibly only for dr_mutations +#=!=!=!=!=!=! + +#=================================================== + +# COMPARING DISTRIBUTIONS +head(df$lineage) +df$lineage = as.character(df$lineage) + +lin1 = df[df$lineage == "lineage1",]$ratioPredAff +lin2 = df[df$lineage == "lineage2",]$ratioPredAff +lin3 = df[df$lineage == "lineage3",]$ratioPredAff +lin4 = df[df$lineage == "lineage4",]$ratioPredAff + +# ks test +ks.test(lin1,lin2) +ks.test(lin1,lin3) +ks.test(lin1,lin4) + +ks.test(lin2,lin3) +ks.test(lin2,lin4) + +ks.test(lin3,lin4) + + + diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R new file mode 100644 index 0000000..cc7e15a --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R @@ -0,0 +1,212 @@ +getwd() +setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") # thinkpad +getwd() + +######################################################################## +# Installing and loading required packages # +######################################################################## + +source("../Header_TT.R") +#source("barplot_colour_function.R") +#require(data.table) + +######################################################################## +# Read file: call script for combining df for PS # +######################################################################## + +source("../combining_two_df.R") + +#---------------------- PAY ATTENTION +# the above changes the working dir +#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" +#---------------------- PAY ATTENTION + +#========================== +# This will return: + +# df with NA: +# merged_df2 +# merged_df3 + +# df without NA: +# merged_df2_comp +# merged_df3_comp +#=========================== + +########################### +# Data for plots +# you need merged_df2 or merged_df2_comp +# since this is one-many relationship +# i.e the same SNP can belong to multiple lineages +########################### + +# uncomment as necessary +#<<<<<<<<<<<<<<<<<<<<<<<<< +# REASSIGNMENT +my_df = merged_df2 +#my_df = merged_df2_comp +#<<<<<<<<<<<<<<<<<<<<<<<<< + +# delete variables not required +rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) + +# quick checks +colnames(my_df) +str(my_df) + +# Ensure correct data type in columns to plot: need to be factor +is.factor(my_df$lineage) +my_df$lineage = as.factor(my_df$lineage) +is.factor(my_df$lineage) + +table(my_df$mutation_info) + +######################################################################## +# end of data extraction and cleaning for plots # +######################################################################## + +#========================== +# Plot: Lineage Distribution +# x = mcsm_values, y = dist +# fill = stability +#============================ + +#=================== +# Data for plots +#=================== + +# subset only lineages1-4 +sel_lineages = c("lineage1" + , "lineage2" + , "lineage3" + , "lineage4") + +# uncomment as necessary +df_lin = subset(my_df, subset = lineage %in% sel_lineages ) + +# refactor +df_lin$lineage = factor(df_lin$lineage) + +table(df_lin$lineage) #{RESULT: No of samples within lineage} +#lineage1 lineage2 lineage3 lineage4 +#104 1293 264 1311 + +# when merged_df2_comp is used +#lineage1 lineage2 lineage3 lineage4 +#99 1275 263 1255 + +length(unique(df_lin$Mutationinformation)) +#{Result: No. of unique mutations the 4 lineages contribute to} + +# sanity checks +r1 = 2:5 # when merged_df2 used: because there is missing lineages +if(sum(table(my_df$lineage)[r1]) == nrow(df_lin)) { + print ("sanity check passed: numbers match") +} else{ + print("Error!: check your numbers") +} + +#<<<<<<<<<<<<<<<<<<<<<<<<< +# REASSIGNMENT +df <- df_lin +#<<<<<<<<<<<<<<<<<<<<<<<<< + +rm(df_lin) + +#****************** +# generate distribution plot of lineages +#****************** +# basic: could improve this! +library(plotly) +library(ggridges) + +g <- ggplot(df, aes(x = ratioDUET)) + + geom_density(aes(fill = DUET_outcome) + , alpha = 0.5) + facet_wrap(~ lineage, + scales = "free") + + ggtitle("Kernel Density estimates of Protein stability by lineage") + +ggplotly(g) + +# 2 : ggridges (good!) + +my_ats = 15 # axis text size +my_als = 20 # axis label size + +fooNames=c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4') +names(fooNames)=c('lineage1', 'lineage2', 'lineage3', 'lineage4') + +# set output dir for plots +getwd() +setwd("~/git/Data/pyrazinamide/output/plots") +getwd() + +svg('lineage_dist_PS.svg') + +printFile = ggplot( df, aes(x = ratioDUET + , y = DUET_outcome) )+ + + #printFile=geom_density_ridges_gradient( + geom_density_ridges_gradient( aes(fill = ..x..) + , scale = 3 + , size = 0.3 ) + + facet_wrap( ~lineage + , scales = "free" +# , switch = 'x' + , labeller = labeller(lineage = fooNames) ) + + coord_cartesian( xlim = c(-1, 1) +# , ylim = c(0, 6) +# , clip = "off" + ) + + scale_fill_gradientn( colours = c("#f8766d", "white", "#00bfc4") + , name = "DUET" ) + + theme( axis.text.x = element_text( size = my_ats + , angle = 90 + , hjust = 1 + , vjust = 0.4) +# , axis.text.y = element_text( size = my_ats +# , angle = 0 +# , hjust = 1 +# , vjust = 0) + , axis.text.y = element_blank() + , axis.title.x = element_blank() + , axis.title.y = element_blank() + , axis.ticks.y = element_blank() + , plot.title = element_blank() + , strip.text = element_text(size=my_als) + , legend.text = element_text(size=10) + , legend.title = element_text(size=my_als) +# , legend.position = c(0.3, 0.8) +# , legend.key.height = unit(1, 'mm') + ) + +print(printFile) +dev.off() + +#=!=!=!=!=!=! +# COMMENT: When you look at all mutations, the lineage differences disappear... +# The pattern we are interested in is possibly only for dr_mutations +#=!=!=!=!=!=! +#=================================================== + +# COMPARING DISTRIBUTIONS +head(df$lineage) +df$lineage = as.character(df$lineage) + +lin1 = df[df$lineage == "lineage1",]$ratioDUET +lin2 = df[df$lineage == "lineage2",]$ratioDUET +lin3 = df[df$lineage == "lineage3",]$ratioDUET +lin4 = df[df$lineage == "lineage4",]$ratioDUET + +# ks test +ks.test(lin1,lin2) +ks.test(lin1,lin3) +ks.test(lin1,lin4) + +ks.test(lin2,lin3) +ks.test(lin2,lin4) + +ks.test(lin3,lin4) + + + diff --git a/mcsm_analysis/pyrazinamide/scripts/read_pdb.R b/mcsm_analysis/pyrazinamide/scripts/read_pdb.R new file mode 100644 index 0000000..41ca884 --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/read_pdb.R @@ -0,0 +1,27 @@ +######################### +#3: Read complex pdb file +########################## +source("Header_TT.R") +# This script only reads the pdb file of your complex + +# read in pdb file complex1 +inDir = "~/git/Data/pyrazinamide/input/structure/" +inFile = paste0(inDir, "complex1_no_water.pdb") +complex1 = inFile + +#inFile2 = paste0(inDir, "complex2_no_water.pdb") +#complex2 = inFile2 + +# list of 8 +my_pdb = read.pdb(complex1 + , maxlines = -1 + , multi = FALSE + , rm.insert = FALSE + , rm.alt = TRUE + , ATOM.only = FALSE + , hex = FALSE + , verbose = TRUE) + +rm(inDir, inFile, complex1) +#====== end of script + diff --git a/mcsm_analysis/pyrazinamide/scripts/replaceBfactor_pdb.R b/mcsm_analysis/pyrazinamide/scripts/replaceBfactor_pdb.R new file mode 100644 index 0000000..658eec4 --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/replaceBfactor_pdb.R @@ -0,0 +1,386 @@ +getwd() +setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts") +getwd() + +######################################################################## +# Installing and loading required packages # +######################################################################## + +source("Header_TT.R") + +######################################################### +# TASK: replace B-factors in the pdb file with normalised values +# use the complex file with no water as mCSM lig was +# performed on this file. You can check it in the script: read_pdb file. +######################################################### + +########################### +# 2: Read file: average stability values +# or mcsm_normalised file, output of step 4 mcsm pipeline +########################### + +inDir = "~/git/Data/pyrazinamide/input/processed/" +inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile + +my_df <- read.csv(inFile +# , row.names = 1 +# , stringsAsFactors = F + , header = T) +str(my_df) + +#========================================================= +# Processing P1: Replacing B factor with mean ratioDUET scores +#========================================================= + +######################### +# Read complex pdb file +# form the R script +########################## + +source("read_pdb.R") # list of 8 + +# extract atom list into a variable +# since in the list this corresponds to data frame, variable will be a df +d = my_pdb[[1]] + +# make a copy: required for downstream sanity checks +d2 = d + +# sanity checks: B factor +max(d$b); min(d$b) + +#******************************************* +# plot histograms for inspection +# 1: original B-factors +# 2: original DUET Scores +# 3: replaced B-factors with DUET Scores +#********************************************* +# Set the margin on all sides +par(oma = c(3,2,3,0) + , mar = c(1,3,5,2) + , mfrow = c(3,2)) +#par(mfrow = c(3,2)) + + #1: Original B-factor +hist(d$b + , xlab = "" + , main = "B-factor") + +plot(density(d$b) + , xlab = "" + , main = "B-factor") + +# 2: DUET scores +hist(my_df$average_DUETR + , xlab = "" + , main = "Norm_DUET") + +plot(density(my_df$average_DUETR) + , xlab = "" + , main = "Norm_DUET") + +# 3: After the following replacement +#******************************** + +#========= +# step 0_P1: DONT RUN once you have double checked the matched output +#========= +# sanity check: match and assign to a separate column to double check +# colnames(my_df) +# d$ratioDUET = my_df$averge_DUETR[match(d$resno, my_df$Position)] + +#========= +# step 1_P1 +#========= +# Be brave and replace in place now (don't run sanity check) +# this makes all the B-factor values in the non-matched positions as NA +d$b = my_df$average_DUETR[match(d$resno, my_df$Position)] + +#========= +# step 2_P1 +#========= +# count NA in Bfactor +b_na = sum(is.na(d$b)) ; b_na + +# count number of 0's in Bactor +sum(d$b == 0) +#table(d$b) + +# replace all NA in b factor with 0 +d$b[is.na(d$b)] = 0 + +# sanity check: should be 0 +sum(is.na(d$b)) + +# sanity check: should be True +if (sum(d$b == 0) == b_na){ + print ("Sanity check passed: NA's replaced with 0's successfully") +} else { + print("Error: NA replacement NOT successful, Debug code!") +} + +max(d$b); min(d$b) + +# sanity checks: should be True +if(max(d$b) == max(my_df$average_DUETR)){ + print("Sanity check passed: B-factors replaced correctly") +} else { + print ("Error: Debug code please") +} + +if (min(d$b) == min(my_df$average_DUETR)){ + print("Sanity check passed: B-factors replaced correctly") +} else { + print ("Error: Debug code please") +} + +#========= +# step 3_P1 +#========= +# sanity check: dim should be same before reassignment +# should be TRUE +dim(d) == dim(d2) + +#========= +# step 4_P1 +#========= +# assign it back to the pdb file +my_pdb[[1]] = d + +max(d$b); min(d$b) + +#========= +# step 5_P1 +#========= +# output dir +getwd() +outDir = "~/git/Data/pyrazinamide/input/structure/" + +outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile +write.pdb(my_pdb, outFile) + +#******************************** +# Add the 3rd histogram and density plots for comparisons +#******************************** +# Plots continued... +# 3: hist and density of replaced B-factors with DUET Scores +hist(d$b + , xlab = "" + , main = "repalced-B") + +plot(density(d$b) + , xlab = "" + , main = "replaced-B") + +# graph titles +mtext(text = "Frequency" + , side = 2 + , line = 0 + , outer = TRUE) + +mtext(text = "DUET_stability" + , side = 3 + , line = 0 + , outer = TRUE) +#******************************** + +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +# NOTE: This replaced B-factor distribution has the same +# x-axis as the PredAff normalised values, but the distribution +# is affected since 0 is overinflated. This is because all the positions +# where there are no SNPs have been assigned 0. +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + + + + + +####################################################################### +#====================== end of section 1 ============================== +####################################################################### + + + + + +#========================================================= +# Processing P2: Replacing B values with PredAff Scores +#========================================================= +# clear workspace +rm(list = ls()) + +########################### +# 2: Read file: average stability values +# or mcsm_normalised file, output of step 4 mcsm pipeline +########################### + +inDir = "~/git/Data/pyrazinamide/input/processed/" +inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile + +my_df <- read.csv(inFile +# , row.names = 1 +# , stringsAsFactors = F + , header = T) +str(my_df) +#rm(inDir, inFile) + +######################### +# 3: Read complex pdb file +# form the R script +########################## + +source("read_pdb.R") # list of 8 + +# extract atom list into a variable +# since in the list this corresponds to data frame, variable will be a df +d = my_pdb[[1]] + +# make a copy: required for downstream sanity checks +d2 = d + +# sanity checks: B factor +max(d$b); min(d$b) + +#******************************************* +# plot histograms for inspection +# 1: original B-factors +# 2: original Pred Aff Scores +# 3: replaced B-factors with PredAff Scores +#******************************************** +# Set the margin on all sides +par(oma = c(3,2,3,0) + , mar = c(1,3,5,2) + , mfrow = c(3,2)) +#par(mfrow = c(3,2)) + +# 1: Original B-factor +hist(d$b + , xlab = "" + , main = "B-factor") + +plot(density(d$b) + , xlab = "" + , main = "B-factor") + +# 2: Pred Aff scores +hist(my_df$average_PredAffR + , xlab = "" + , main = "Norm_lig_average") + +plot(density(my_df$average_PredAffR) + , xlab = "" + , main = "Norm_lig_average") + +# 3: After the following replacement +#******************************** + +#================================================= +# Processing P2: Replacing B values with ratioPredAff scores +#================================================= +# use match to perform this replacement linking with "position no" +# in the pdb file, this corresponds to column "resno" +# in my_df, this corresponds to column "Position" + +#========= +# step 0_P2: DONT RUN once you have double checked the matched output +#========= +# sanity check: match and assign to a separate column to double check +# colnames(my_df) +# d$ratioPredAff = my_df$average_PredAffR[match(d$resno, my_df$Position)] #1384, 17 + +#========= +# step 1_P2: BE BRAVE and replace in place now (don't run step 0) +#========= +# this makes all the B-factor values in the non-matched positions as NA +d$b = my_df$average_PredAffR[match(d$resno, my_df$Position)] + +#========= +# step 2_P2 +#========= +# count NA in Bfactor +b_na = sum(is.na(d$b)) ; b_na + +# count number of 0's in Bactor +sum(d$b == 0) +#table(d$b) + +# replace all NA in b factor with 0 +d$b[is.na(d$b)] = 0 + +# sanity check: should be 0 +sum(is.na(d$b)) + +if (sum(d$b == 0) == b_na){ + print ("Sanity check passed: NA's replaced with 0's successfully") +} else { + print("Error: NA replacement NOT successful, Debug code!") +} + +max(d$b); min(d$b) + +# sanity checks: should be True +if (max(d$b) == max(my_df$average_PredAffR)){ + print("Sanity check passed: B-factors replaced correctly") +} else { + print ("Error: Debug code please") +} + +if (min(d$b) == min(my_df$average_PredAffR)){ + print("Sanity check passed: B-factors replaced correctly") +} else { + print ("Error: Debug code please") +} + +#========= +# step 3_P2 +#========= +# sanity check: dim should be same before reassignment +# should be TRUE +dim(d) == dim(d2) + +#========= +# step 4_P2 +#========= +# assign it back to the pdb file +my_pdb[[1]] = d + +max(d$b); min(d$b) + +#========= +# step 5_P2 +#========= + +# output dir +outDir = "~/git/Data/pyrazinamide/input/structure/" +outFile = paste0(outDir, "complex1_BwithNormLIG.pdb"); outFile +write.pdb(my_pdb, outFile) + +#******************************** +# Add the 3rd histogram and density plots for comparisons +#******************************** +# Plots continued... +# 3: hist and density of replaced B-factors with PredAff Scores +hist(d$b + , xlab = "" + , main = "repalced-B") + +plot(density(d$b) + , xlab = "" + , main = "replaced-B") + +# graph titles +mtext(text = "Frequency" + , side = 2 + , line = 0 + , outer = TRUE) + +mtext(text = "Lig_stability" + , side = 3 + , line = 0 + , outer = TRUE) + +#******************************** + +########### +# end of output files with Bfactors +########## diff --git a/mcsm_analysis/pyrazinamide/scripts/source_data_checks.R b/mcsm_analysis/pyrazinamide/scripts/source_data_checks.R new file mode 100644 index 0000000..9f30f28 --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/source_data_checks.R @@ -0,0 +1,257 @@ +getwd() +setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts") +getwd() + +######################################################### +# 1: Installing and loading required packages # +######################################################### + +source("Header_TT.R") +#source("barplot_colour_function.R") + +########################################################## +# Checking: Entire data frame and for PS # +########################################################## + +########################### +#2) Read file: combined one from the script +########################### +source("combining_two_df.R") + +# df with NA: +# merged_df2 +# merged_df3: + +# df without NA: +# merged_df2_comp: +# merged_df3_comp: + +###################### +# You need to check it +# with the merged_df3 +######################## + +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# REASSIGNMENT +my_df = merged_df3 +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +#clear variables +rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) + +# should be true +identical(my_df$Position, my_df$position) + +################################# +# Read file: normalised file +# output of step 4 mcsm_pipeline +################################# + + +inDir = "~/git/Data/pyrazinamide/input/processed/" +inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile + +mcsm_data <- read.csv(inFile + , row.names = 1 + , stringsAsFactors = F + , header = T) +str(mcsm_data) +my_colnames = colnames(mcsm_data) + +#==================================== +# subset my_df to include only the columns in mcsm data +my_df2 = my_df[my_colnames] +#==================================== +# compare the two +head(mcsm_data$Mutationinformation) +head(mcsm_data$Position) + +head(my_df2$Mutationinformation) +head(my_df2$Position) + +# sort mcsm data by Mutationinformation +mcsm_data_s = mcsm_data[order(mcsm_data$Mutationinformation),] +head(mcsm_data_s$Mutationinformation) +head(mcsm_data_s$Position) + +# now compare: should be True, but is false.... +# possibly due to rownames!?! +identical(mcsm_data_s, my_df2) + +# from library dplyr +setdiff(mcsm_data_s, my_df2) + +#from lib compare +compare(mcsm_data_s, my_df2) # seems rownames are the problem + +# FIXME: automate this +# write files: checked using meld and files are indeed identical +#write.csv(mcsm_data_s, "mcsm_data_s.csv", row.names = F) +#write.csv(my_df2, "my_df2.csv", row.names = F) + + +#====================================================== end of section 1 + + + +########################################################## +# Checking: LIG(Filtered dataframe) # +########################################################## + +# clear workspace +rm(list = ls()) + +########################### +#3) Read file: combined_lig from the script +########################### +source("combining_two_df_lig.R") + +# df with NA: +# merged_df2 : +# merged_df3: + +# df without NA: +# merged_df2_comp: +# merged_df3_comp: + +###################### +# You need to check it +# with the merged_df3 +######################## + +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# REASSIGNMENT +my_df = merged_df3 +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +#clear variables +rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) + +# should be true +identical(my_df$Position, my_df$position) + +################################# +# Read file: normalised file +# output of step 4 mcsm_pipeline +################################# + +inDir = "~/git/Data/pyrazinamide/input/processed/" +inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile + +mcsm_data <- read.csv(inFile + , row.names = 1 + , stringsAsFactors = F + , header = T) +str(mcsm_data) + +########################### +# 4a: Filter/subset data: ONLY for LIGand analysis +# Lig plots < 10Ang +# Filter the lig plots for Dis_to_lig < 10Ang +########################### +# sanity checks +upos = unique(mcsm_data$Position) + +# check range of distances +max(mcsm_data$Dis_lig_Ang) +min(mcsm_data$Dis_lig_Ang) + +# Lig filtered: subset data to have only values less than 10 Ang +mcsm_data2 = subset(mcsm_data, mcsm_data$Dis_lig_Ang < 10) + +rm(mcsm_data) #to avoid confusion + +table(mcsm_data2$Dis_lig_Ang<10) +table(mcsm_data2$Dis_lig_Ang>10) + +max(mcsm_data2$Dis_lig_Ang) +min(mcsm_data2$Dis_lig_Ang) + +upos_f = unique(mcsm_data2$Position); upos_f + +# colnames of df that you will need to subset the bigger df from +my_colnames = colnames(mcsm_data2) +#==================================== +# subset bigger df i.e my_df to include only the columns in mcsm data2 +my_df2 = my_df[my_colnames] + +rm(my_df) #to avoid confusion +#==================================== +# compare the two +head(mcsm_data2$Mutationinformation) +head(mcsm_data2$Position) + +head(my_df2$Mutationinformation) +head(my_df2$Position) + +# sort mcsm data by Mutationinformation +mcsm_data2_s = mcsm_data2[order(mcsm_data2$Mutationinformation),] +head(mcsm_data2_s$Mutationinformation) +head(mcsm_data2_s$Position) + +# now compare: should be True, but is false.... +# possibly due to rownames!?! +identical(mcsm_data2_s, my_df2) + +# from library dplyr +setdiff(mcsm_data2_s, my_df2) + +# from library compare +compare(mcsm_data2_s, my_df2) # seems rownames are the problem + +#FIXME: automate this +# write files: checked using meld and files are indeed identical +#write.csv(mcsm_data2_s, "mcsm_data2_s.csv", row.names = F) +#write.csv(my_df2, "my_df2.csv", row.names = F) + + +########################################################## +# extract and write output file for SNP posn: all # +########################################################## + +head(merged_df3$Position) + +foo = merged_df3[order(merged_df3$Position),] +head(foo$Position) + +snp_pos_unique = unique(foo$Position); snp_pos_unique + +# sanity check: +table(snp_pos_unique == combined_df$Position) + +#===================== +# write_output files +#===================== +outDir = "~/Data/pyrazinamide/input/processed/" + + +outFile1 = paste0(outDir, "snp_pos_unique.txt"); outFile1 +print(paste0("Output file name and path will be:","", outFile1)) + +write.table(snp_pos_unique + , outFile1 + , row.names = F + , col.names = F) + +############################################################## +# extract and write output file for SNP posn: complete only # +############################################################## +head(merged_df3_comp$Position) + +foo = merged_df3_comp[order(merged_df3_comp$Position),] +head(foo$Position) + +snp_pos_unique = unique(foo$Position); snp_pos_unique + +# outDir = "~/Data/pyrazinamide/input/processed/" # already set + +outFile2 = paste0(outDir, "snp_pos_unique_comp.txt") +print(paste0("Output file name and path will be:", outFile2)) + +write.table(snp_pos_unique + , outFile2 + , row.names = F + , col.names = F) +#============================== end of script + + diff --git a/meta_data_analysis/.Rhistory b/meta_data_analysis/.Rhistory new file mode 100644 index 0000000..28b1797 --- /dev/null +++ b/meta_data_analysis/.Rhistory @@ -0,0 +1,512 @@ +, stringsAsFactors = F) +x = as.numeric(grepl(i,raw_data$all_muts_pza)) +# DV: pyrazinamide 0 or 1 +y = as.numeric(raw_data$pyrazinamide) +table(y,x) +# run glm model +model = glm(y ~ x, family = binomial) +#model = glm(y ~ x, family = binomial(link = "logit")) +summary(model) +#********** +# extract relevant model output +#********** +# extract log OR i.e the Beta estimate of the logistic model for a given snp +my_logor = summary(model)$coefficients[2,1] +print(paste0('Beta:', my_logor)) +# extract SE of the logistic model for a given snp +my_se = summary(model)$coefficients[2,2] +print(paste0('SE:', my_se)) +# extract Z of the logistic model for a given snp +my_zval = summary(model)$coefficients[2,3] +print(paste0('Z-value:', my_zval)) +# Dervive OR i.e exp(my_logor) from the logistic model for a given snp +#my_or = round(exp(summary(model)$coefficients[2,1]), roundto) +my_or = exp(summary(model)$coefficients[2,1]) +print(paste0('OR:', my_or)) +# sanity check : should be True +log(my_or) == my_logor +# extract P-value of the logistic model for a given snp +my_pval = summary(model)$coefficients[2,4] +print(paste0('P-value:', my_pval)) +# extract confint interval of snp (2 steps, since the output is a named number) +ci_mod = exp(confint(model))[2,] +my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]]) +print(paste0('CI:', my_ci)) +#************* +# Assign the regression output in the original df +# you can use ('=' or '<-/->') +#************* +#pnca_snps_or$logistic_logOR[pnca_snps_or$Mutationinformation == i] = my_logor +my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$Mutationinformation == i] +my_logor +pnca_snps_or$Mutationinformation == i +View(pnca_snps_or) +#=============== +# Step 4: Calculate for one snp +# using i, when you run the loop, it is easy +#=============== +i = "pnca_p.trp68gly" +pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv" +, stringsAsFactors = F +, header = T) #2133 +# uncomment as necessary +pnca_snps_or = pnca_snps_or[1:5,] +pnca_snps_or = pnca_snps_or[c(1:5),] +#=============== +pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv" +, stringsAsFactors = F +, header = T) #2133 +pnca_snps_or = pnca_snps_or[1:5,] +pnca_snps_or = pnca_snps_or[c(1:5),] +pnca_snps_or = pnca_snps_or[1:5] +pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv" +, stringsAsFactors = F +, header = T) #2133 +pnca_snps_or = pnca_snps_or[1:5] +pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv" +, stringsAsFactors = F +, header = T) #2133 +foo = pnca_snps_or[c(1:5,)] +foo = pnca_snps_or[c(1:5),] +foo = as.data.frame(pnca_snps_or[c(1:5),]) +View(foo) +# create an empty dataframe +pnca_snps_or = as.data.frame(pnca_snps_or[c(1:5),]) +# IV: corresponds to each unique snp (extracted using grep) +x = as.numeric(grepl(i,raw_data$all_muts_pza)) +# DV: pyrazinamide 0 or 1 +y = as.numeric(raw_data$pyrazinamide) +table(y,x) +# run glm model +model = glm(y ~ x, family = binomial) +#model = glm(y ~ x, family = binomial(link = "logit")) +summary(model) +my_logor = summary(model)$coefficients[2,1] +print(paste0('Beta:', my_logor)) +# extract SE of the logistic model for a given snp +my_se = summary(model)$coefficients[2,2] +print(paste0('SE:', my_se)) +# extract Z of the logistic model for a given snp +my_zval = summary(model)$coefficients[2,3] +print(paste0('Z-value:', my_zval)) +# Dervive OR i.e exp(my_logor) from the logistic model for a given snp +#my_or = round(exp(summary(model)$coefficients[2,1]), roundto) +my_or = exp(summary(model)$coefficients[2,1]) +print(paste0('OR:', my_or)) +# sanity check : should be True +log(my_or) == my_logor +# extract P-value of the logistic model for a given snp +my_pval = summary(model)$coefficients[2,4] +print(paste0('P-value:', my_pval)) +# extract confint interval of snp (2 steps, since the output is a named number) +ci_mod = exp(confint(model))[2,] +my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]]) +print(paste0('CI:', my_ci)) +#************* +# Assign the regression output in the original df +# you can use ('=' or '<-/->') +#************* +#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor +my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] +my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i] +my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i] +my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i] +my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i] +my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i] +#=============== +# Step 4: Iterate through this unique list +# and calculate OR, but only for one snp +# this is test before you apply it all others +#=============== +pnca_snps_or$mutation == i +View(pnca_snps_or) +# create an empty dataframe +pnca_snps_or = data.frame(mutation = i) +my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] +my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i] +my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i] +my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i] +my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i] +my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i] +View(pnca_snps_or_copy) +#=============== +# Step 4: Iterate through this unique list +# and calculate OR, but only for one snp +# this is test before you apply it all others +#=============== +#reset original df so you don't make a mistake +pnca_snps_or = pnca_snps_or_copy +for (i in pnca_snps_unique){ +print(i) +} +pnca_snps_or = pnca_snps_or_copy #2133, 1 +#........................................ +# create an empty dataframe : uncomment as necessary +pnca_snps_or = data.frame(mutation = c(i, "blank_mut") +#........................................ +# create an empty dataframe : uncomment as necessary +pnca_snps_or = data.frame(mutation = c(i, "blank_mut")) +#........................................ +# create an empty dataframe : uncomment as necessary +pnca_snps_or = data.frame(mutation = c(i, "blank_mut")) +View(pnca_snps_or) +# IV: corresponds to each unique snp (extracted using grep) +x = as.numeric(grepl(i,raw_data$all_muts_pza)) +# DV: pyrazinamide 0 or 1 +y = as.numeric(raw_data$pyrazinamide) +table(y,x) +# run glm model +model = glm(y ~ x, family = binomial) +#model = glm(y ~ x, family = binomial(link = "logit")) +summary(model) +#********** +# extract relevant model output +#********** +# extract log OR i.e the Beta estimate of the logistic model for a given snp +my_logor = summary(model)$coefficients[2,1] +print(paste0('Beta:', my_logor)) +# extract SE of the logistic model for a given snp +my_se = summary(model)$coefficients[2,2] +print(paste0('SE:', my_se)) +# extract Z of the logistic model for a given snp +my_zval = summary(model)$coefficients[2,3] +print(paste0('Z-value:', my_zval)) +# Dervive OR i.e exp(my_logor) from the logistic model for a given snp +#my_or = round(exp(summary(model)$coefficients[2,1]), roundto) +my_or = exp(summary(model)$coefficients[2,1]) +print(paste0('OR:', my_or)) +# sanity check : should be True +log(my_or) == my_logor +# extract P-value of the logistic model for a given snp +my_pval = summary(model)$coefficients[2,4] +print(paste0('P-value:', my_pval)) +# extract confint interval of snp (2 steps, since the output is a named number) +ci_mod = exp(confint(model))[2,] +my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]]) +print(paste0('CI:', my_ci)) +#************* +# Assign the regression output in the original df +# you can use ('=' or '<-/->') +#************* +#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor +my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] +my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i] +my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i] +my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i] +my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i] +my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i] +View(pnca_snps_or) +pnca_snps_or = pnca_snps_or_copy #2133, 1 +for (i in pnca_snps_unique){ +print(i) +#************* +# start logistic regression model building +#************* +# set the IV and DV for the logistic regression model +# IV: corresponds to each unique snp (extracted using grep) +x = as.numeric(grepl(i,raw_data$all_muts_pza)) +# DV: pyrazinamide 0 or 1 +y = as.numeric(raw_data$pyrazinamide) +table(y,x) +# run glm model +model = glm(y ~ x, family = binomial) +#model = glm(y ~ x, family = binomial(link = "logit")) +summary(model) +#********** +# extract relevant model output +#********** +# extract log OR i.e the Beta estimate of the logistic model for a given snp +my_logor = summary(model)$coefficients[2,1] +print(paste0('Beta:', my_logor)) +# extract SE of the logistic model for a given snp +my_se = summary(model)$coefficients[2,2] +print(paste0('SE:', my_se)) +# extract Z of the logistic model for a given snp +my_zval = summary(model)$coefficients[2,3] +print(paste0('Z-value:', my_zval)) +# Dervive OR i.e exp(my_logor) from the logistic model for a given snp +#my_or = round(exp(summary(model)$coefficients[2,1]), roundto) +my_or = exp(summary(model)$coefficients[2,1]) +print(paste0('OR:', my_or)) +# sanity check : should be True +log(my_or) == my_logor +# extract P-value of the logistic model for a given snp +my_pval = summary(model)$coefficients[2,4] +print(paste0('P-value:', my_pval)) +# extract confint interval of snp (2 steps, since the output is a named number) +ci_mod = exp(confint(model))[2,] +my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]]) +print(paste0('CI:', my_ci)) +#************* +# Assign the regression output in the original df +# you can use ('=' or '<-/->') +#************* +#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor +my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] +my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i] +my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i] +my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i] +my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i] +my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i] +} +warnings() +View(pnca_snps_or) +View(pnca_snps_or_copy) +#sanity check +pnca_snps_or$mutation == i1 +#sanity check +pnca_snps_or[pnca_snps_or$mutation == i1] +pnca_snps_or[pnca_snps_or$mutation == i2] +pnca_snps_or[pnca_snps_or$mutation == i2,] +pnca_snps_or1 = unique(pnca_snps_or) +write.csv(pnca_snps_or1, "../Data_original/valid_pnca_snps_with_OR.csv") +# you only need it for the unique mutations +pnca_snps_or = unique(pnca_snps_or) #2133, 1 +for (i in pnca_snps_unique){ +print(i) +#************* +# start logistic regression model building +#************* +# set the IV and DV for the logistic regression model +# IV: corresponds to each unique snp (extracted using grep) +x = as.numeric(grepl(i,raw_data$all_muts_pza)) +# DV: pyrazinamide 0 or 1 +y = as.numeric(raw_data$pyrazinamide) +table(y,x) +# run glm model +model = glm(y ~ x, family = binomial) +#model = glm(y ~ x, family = binomial(link = "logit")) +summary(model) +#********** +# extract relevant model output +#********** +# extract log OR i.e the Beta estimate of the logistic model for a given snp +my_logor = summary(model)$coefficients[2,1] +print(paste0('Beta:', my_logor)) +# extract SE of the logistic model for a given snp +my_se = summary(model)$coefficients[2,2] +print(paste0('SE:', my_se)) +# extract Z of the logistic model for a given snp +my_zval = summary(model)$coefficients[2,3] +print(paste0('Z-value:', my_zval)) +# Dervive OR i.e exp(my_logor) from the logistic model for a given snp +#my_or = round(exp(summary(model)$coefficients[2,1]), roundto) +my_or = exp(summary(model)$coefficients[2,1]) +print(paste0('OR:', my_or)) +# sanity check : should be True +log(my_or) == my_logor +# extract P-value of the logistic model for a given snp +my_pval = summary(model)$coefficients[2,4] +print(paste0('P-value:', my_pval)) +# extract confint interval of snp (2 steps, since the output is a named number) +ci_mod = exp(confint(model))[2,] +my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]]) +print(paste0('CI:', my_ci)) +#************* +# Assign the regression output in the original df +# you can use ('=' or '<-/->') +#************* +#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor +my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] +my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i] +my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i] +my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i] +my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i] +my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i] +} +View(pnca_snps_or) +2.290256e+01 +1.561132e+06 +3.242285e-04 +#sanity check +pnca_snps_or[pnca_snps_or$mutation == i1] +pnca_snps_or[pnca_snps_or$mutation == i2,] +write.csv(pnca_snps_or1, "../Data_original/valid_pnca_snps_with_OR.csv") +my_data = read.csv("../Data_original/meta_pza_with_AF.csv" +, stringsAsFactors = FALSE) #11374, 19 +View(my_data) +# remove the first column +my_data = my_data[-1] #11374, 18 +# check if first col is 'id': should be TRUE +colnames(my_data)[1] == 'id' +# sanity check +snps_all = unique(my_data$mutation)# 337 +pnca_snps_or = snps_all +pnca_snps_or = as.data.frame(snps_all) +View(pnca_snps_or) +snps_all[-"true_wt"] +pnca_snps_or = as.data.frame(snps_all[-c(1,1)]) +View(pnca_snps_or) +snps_all = as.data.frame(snps_all) +View(snps_all) +#remove true_wt entry +w1 = which(rownames(snps_all) == "true_wt") +View(snps_all) +#remove true_wt entry +w1 = which(snps_all$snps_all == "true_wt") +rm(pnca_snps_or) +pnca_snps_or = snps_all[-w1] +pnca_snps_or = snps_all[,-w1] +pnca_snps_or = as.data.frame(snps_all[-c(1,1)]) +#remove true_wt entry +w1 = which(snps_all) == "true_wt" +pnca_snps_or = as.data.frame(snps_all[-c(1,1)]) +my_data = read.csv("../Data_original/meta_pza_with_AF.csv" +, stringsAsFactors = FALSE) #11374, 19 +# remove the first column +my_data = my_data[-1] #11374, 18 +# check if first col is 'id': should be TRUE +colnames(my_data)[1] == 'id' +# sanity check +snps_all = unique(my_data$mutation)# 337 +snps_all = as.data.frame(snps_all) +snps_all[-c(1,1)] +pnca_snps_or = as.data.frame(snps_all[-c(1,1)]) +pnca_snps_or = as.data.frame(snps_all[, -c(1,1)]) +#remove true_wt entry +#w1 = which(snps_all) == "true_wt" +pnca_snps_or = snps_all +pnca_snps_or = pnca_snps_or_copy +#remove true_wt entry +#w1 = which(snps_all) == "true_wt" +pnca_snps_or = snps_all +pnca_snps_or -> pnca_snps_or_copy +#=============== +# Step 4: Iterate through this unique list +# and calculate OR for each snp +# and assign to the pnca_snps_or df that has +# each row as a unique snp +#=============== +# reset original df so you don't make a mistake: IMPORTANT +pnca_snps_or = pnca_snps_or_copy #2133, 1 +# you only need it for the unique mutations +pnca_snps_or = unique(pnca_snps_or) #337, 1 +for (i in pnca_snps_unique){ +print(i) +#************* +# start logistic regression model building +#************* +# set the IV and DV for the logistic regression model +# IV: corresponds to each unique snp (extracted using grep) +x = as.numeric(grepl(i,raw_data$all_muts_pza)) +# DV: pyrazinamide 0 or 1 +y = as.numeric(raw_data$pyrazinamide) +table(y,x) +# run glm model +model = glm(y ~ x, family = binomial) +#model = glm(y ~ x, family = binomial(link = "logit")) +summary(model) +#********** +# extract relevant model output +#********** +# extract log OR i.e the Beta estimate of the logistic model for a given snp +my_logor = summary(model)$coefficients[2,1] +print(paste0('Beta:', my_logor)) +# extract SE of the logistic model for a given snp +my_se = summary(model)$coefficients[2,2] +print(paste0('SE:', my_se)) +# extract Z of the logistic model for a given snp +my_zval = summary(model)$coefficients[2,3] +print(paste0('Z-value:', my_zval)) +# Dervive OR i.e exp(my_logor) from the logistic model for a given snp +#my_or = round(exp(summary(model)$coefficients[2,1]), roundto) +my_or = exp(summary(model)$coefficients[2,1]) +print(paste0('OR:', my_or)) +# sanity check : should be True +log(my_or) == my_logor +# extract P-value of the logistic model for a given snp +my_pval = summary(model)$coefficients[2,4] +print(paste0('P-value:', my_pval)) +# extract confint interval of snp (2 steps, since the output is a named number) +ci_mod = exp(confint(model))[2,] +my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]]) +print(paste0('CI:', my_ci)) +#************* +# Assign the regression output in the original df +# you can use ('=' or '<-/->') +#************* +#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor +my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] +my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i] +my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i] +my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i] +my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i] +my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i] +} +getwd() +#setwd("~/Documents/git/LSHTM_Y1_PNCA/meta_data_analysis") # work +setwd("~/git/LSHTM_Y1_PNCA/meta_data_analysis") # thinkpad +#setwd("/Users/tanu/git/LSHTM_Y1_PNCA/meta_data_analysis") # mac +getwd() +#=============== +# Step 1: read raw data +#=============== +raw_data<-read.csv("../Data_original/original_tanushree_data_v2.csv" +,stringsAsFactors = F)[,c("id","pyrazinamide","dr_mutations_pyrazinamide","other_mutations_pyrazinamide")]#19265, 4 +raw_data<-raw_data[!is.na(raw_data$pyrazinamide),]#12511, 4 +# combine the two mutation columns +raw_data$all_mutations_pyrazinamide<-paste(raw_data$dr_mutations_pyrazinamide, raw_data$other_mutations_pyrazinamide)#12511, 5 +head(raw_data$all_mutations_pyrazinamide) +# create yet another column that contains all the mutations but in lower case +raw_data$all_muts_pza = tolower(raw_data$all_mutations_pyrazinamide) #12511, 6 +table(grepl("pnca_p",raw_data$all_muts_pza)) +#FALSE TRUE +#10603 1908 +pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv" +, stringsAsFactors = F +, header = T) #2133 +# subset a snall section to test +#pnca_snps_or_copy = pnca_snps_or +#pnca_snps_or = pnca_snps_or_copy +pnca_snps_unique = unique(pnca_snps_or$mutation) #293 +i2 = "pnca_p.trp68gly" # Should exist +grep(i2, pnca_snps_unique) +my_data = read.csv("../Data_original/meta_pza_with_AF.csv" +, stringsAsFactors = FALSE) #11374, 19 +# remove the first column +my_data = my_data[-1] #11374, 18 +# check if first col is 'id': should be TRUE +colnames(my_data)[1] == 'id' +# sanity check +head(my_data$mutation) +my_data = unique(my_data$mutation) +my_data[!duplicated(my_data$mutation)] +my_data_unique = my_data[!duplicated(my_data$mutation),] +my_data[!duplicated('mutation'),] +my_data_unique = my_data[!duplicated(my_data[,'mutation']),] +my_data_unique = my_data[!duplicated(my_data['mutation']),] +getwd() +setwd("/git/LSHTM_analysis/meta_data_analysis") +getwd() +getwd() +setwd("/git/github/LSHTM_analysis/meta_data_analysis") +getwd() +#=============== +# Step 1: read GWAS raw data stored in Data_original/ +#=============== +infile = read.csv("../Data_original", file.choose(), stringsAsFactors = F)) +c = file.choose() +c = file.choose(../Data_original) +c = read.csv(file.choose(), stringsAsFactors = F) +#=============== +# Step 1: read GWAS raw data stored in Data_original/ +#=============== +infile = read.csv(file.choose(), stringsAsFactors = F)) +c = read.csv(file.choose(), stringsAsFactors = F) +#=============== +# Step 1: read GWAS raw data stored in Data_original/ +#=============== +infile = read.csv(file.choose(), stringsAsFactors = F) +#=============== +# Step 1: read GWAS raw data stored in Data_original/ +#=============== +infile = read.csv(file.choose(), stringsAsFactors = F) +raw_data = infile[,c("id","pyrazinamide","dr_mutations_pyrazinamide","other_mutations_pyrazinamide")] +outdir = paste0("../mcsm_analysis",drug,"/Data/") +# define output variables +drug = 'pyrazinamide' +outdir = paste0("../mcsm_analysis",drug,"/Data/") +outdir = paste0("../mcsm_analysis/",drug,"/Data/") +outFile = "meta_data_with_AFandOR.csv" +output_filename = paste0(outdir, outFile) +output_filename diff --git a/meta_data_analysis/__pycache__/reference_dict.cpython-37.pyc b/meta_data_analysis/__pycache__/reference_dict.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1db3c8321b73129609403f35aa5ecc9a7fcd9421 GIT binary patch literal 2203 zcmbtUOLN;c5C-@VDO#4~x8?MqJenyFwQT3m#N+YAj@zbfl1Xb%-C~+@7y-4|P$Ugf zvE)N9`2+p|o{W6#p?{>OoN?``e<7!0K})nfo?a@B7K;zNi+y3YCnp^wf7+k3zs$`b z^fzoqKR9_P|EZxNL=iz0Q!T_qQ>#vNW-xtdo}o7wBc}gW8d_98!^DKxc4r zQA!&+)RZz4Y+_iUrE_kQT9T_$o0h3VCuoIE(kiW;TBIhiI-Qc(G@X&?ES;nCr)5$f z<0Ew8)FxAOkv8bksYa&h4f+B7kbXozJ~hb93GQ#Q>Tvc9?VuCMygp)VO;^V^eC}Y@p#1tAhsQNy=|3`KfE!M{kwJ%rzSCq{bbkF>^`d-99?}YadZ<7rA$jA2^RV^D{=9an#)FvF10usJVK- zxw^W3Z*6_;esgVQ^?~C&^m|z+=IhPOk9ui`Gv@pQZT0Lu{{48dbr58&9Y6D1L6r2e zR?LHgAo9cGitl^vn6h-aoxUau87O)o%QD6l?uGFYw{;3ooqE%0+V+ZA{Z)V>sWU*MkrdL-~uVuBxSVm^;UpOcXn0o-ivcKt9U z<*hJCI^cn@b`#bP{Ln4CIyYoyxCx{riK}r7cpG?wPXRv-qznWerEBp?;A=ptKo)?= z_lx)(kXcFG@&e5|ib=a;~R>i*G96980z)WL|6TLpu;X|N>GYwnkU%!8P+pp@+d) zuil9;U#e$gMXZ-``RK|~9DSBR)4Il`-qWunUpiY9xF}fBpini}&Ye|$UTIJj+t@K; z%h#UBm%2LU)<{`mnLh|)E+)^9y(2kqxTue|%GDqnh*}(7nhx#_VR;^n+ny&%i62ov z6FY2NC@T}A^rT-qI2Y&qtr2V= 1: + indexes.append(i) + new_values.append(presplit) + for value in values: + indexes.append(i) + new_values.append(value) + new_df = df.iloc[indexes, :].copy() + new_df[column] = new_values + return new_df + +######## +# 3a: call tidy_split() on 'dr_mutations_pyrazinamide' column and remove leading white spaces +#https://stackoverflow.com/questions/41476150/removing-space-from-dataframe-columns-in-pandas +######## +meta_pnca_WF0 = tidy_split(meta_pnca_all, 'dr_mutations_pyrazinamide', sep = ';') + +# remove leading white space else these are counted as distinct mutations as well +meta_pnca_WF0['dr_mutations_pyrazinamide'] = meta_pnca_WF0['dr_mutations_pyrazinamide'].str.lstrip() + +######## +# 3b: call function on 'other_mutations_pyrazinamide' column and remove leading white spaces +######## +meta_pnca_WF1 = tidy_split(meta_pnca_WF0, 'other_mutations_pyrazinamide', sep = ';') + +# remove the leading white spaces in the column +meta_pnca_WF1['other_mutations_pyrazinamide'] = meta_pnca_WF1['other_mutations_pyrazinamide'].str.strip() + +########## +# Step 4: Reshape data so that all mutations are in one column and the +# annotations for the mutation reflect the column name +# LINK: http://www.datasciencemadesimple.com/reshape-wide-long-pandas-python-melt-function/ + +# data frame “df” is passed to melt() function +# id_vars is the variable which need to be left unaltered +# var_name are the column names so we named it as 'mutation_info' +# value_name are its values so we named it as 'mutation' +########## +meta_pnca_WF1.columns + +meta_pnca_LF0 = pd.melt(meta_pnca_WF1 + , id_vars = ['id', 'country', 'lineage', 'sublineage', 'drtype', 'pyrazinamide'] + , var_name = 'mutation_info' + , value_name = 'mutation') + +# sanity check: should be true +if len(meta_pnca_LF0) == len(meta_pnca_WF1)*2: + print('sanity check passed: Long format df has the expected length') +else: + print("Sanity check failed: Debug please!") + +########### +# Step 5: This is still dirty data. Filter LF data so that you only have +# mutations corresponding to pnca_p. +# this will be your list you run OR calcs +########### +meta_pnca_LF1 = meta_pnca_LF0[meta_pnca_LF0['mutation'].str.contains('pncA_p.*')] + +# sanity checks +# unique samples +meta_pnca_LF1['id'].nunique() +if len(meta_pnca_all) == meta_pnca_LF1['id'].nunique(): + print("Sanity check passed: No of samples with pncA mutations match") +else: + print("Sanity check failed: Debug please!") + +# count if all the mutations are indeed in the protein coding region +# i.e begin with pnca_p +meta_pnca_LF1['mutation'].str.count('pncA_p.').sum() # 3093 + +# should be true. +# and check against the length of the df, which should match +if len(meta_pnca_LF1) == meta_pnca_LF1['mutation'].str.count('pncA_p.').sum(): + print("Sanity check passed: Long format data containing pnca mutations indeed correspond to pncA_p region") +else: + print("Sanity check failed: Debug please!") + +########### +# Step 6: Filter dataframe with "na" in the drug column +# This is because for OR, you can't use the snps that have the +# NA in the specified drug column +# it creates problems when performing calcs in R inside the loop +# so best to filter it here +########### +# NOT NEEDED FOR all snps, only for extracting valid OR snps +del (meta_pnca_WF0, meta_pnca_WF1, meta_pnca_LF0, meta_pnca_all) + +########### +# Step 7: count unique pncA_p mutations (all and comp cases) +########### +meta_pnca_LF1['mutation'].nunique() +meta_pnca_LF1.groupby('mutation_info').nunique() + +meta_pnca_LF1['id'].nunique() +meta_pnca_LF1['mutation'].nunique() +meta_pnca_LF1.groupby('id').nunique() + +########### +# Step 8: convert all snps only (IN LOWERCASE) +# because my mcsm file intergated has lowercase +########### +# convert mutation to lower case as it needs to exactly match the dict key +#meta_pnca_LF1['mutation'] = meta_pnca_LF1.mutation.str.lower() # WARNINGS: suggested to use .loc +meta_pnca_LF1['mutation'] = meta_pnca_LF1.loc[:, 'mutation'].str.lower() + +########### +# Step 9 : Split 'mutation' column into three: wild_type, position and +# mutant_type separately. Then map three letter code to one from the +# referece_dict imported pncaeady. First convert to mutation to lowercase +# to allow to match entries from dict +########### +#======= +# Step 9a: iterate through the dict, create a lookup dict i.e +# lookup_dict = {three_letter_code: one_letter_code}. +# lookup dict should be the key and the value (you want to create a column for) +# Then use this to perform the mapping separetly for wild type and mutant cols. +# The three letter code is extracted using a regex match from the dataframe and then converted +# to 'pandas series'since map only works in pandas series +#======= +# initialise a sub dict that is a lookup dict for three letter code to one +lookup_dict = dict() +for k, v in my_aa_dict.items(): + lookup_dict[k] = v['one_letter_code'] + wt = meta_pnca_LF1['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on + meta_pnca_LF1['wild_type'] = wt.map(lookup_dict) + mut = meta_pnca_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze() + meta_pnca_LF1['mutant_type'] = mut.map(lookup_dict) + +# extract position info from mutation column separetly using regex +meta_pnca_LF1['position'] = meta_pnca_LF1['mutation'].str.extract(r'(\d+)') + +# clear variables +del(k, v, wt, mut, lookup_dict) + +#========= +# Step 9b: iterate through the dict, create a lookup dict that i.e +# lookup_dict = {three_letter_code: aa_prop_water} +# Do this for both wild_type and mutant as above. +#========= +# initialise a sub dict that is lookup dict for three letter code to aa prop +lookup_dict = dict() + +for k, v in my_aa_dict.items(): + lookup_dict[k] = v['aa_prop_water'] + #print(lookup_dict) + wt = meta_pnca_LF1['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on + meta_pnca_LF1['wt_prop_water'] = wt.map(lookup_dict) + mut = meta_pnca_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze() + meta_pnca_LF1['mut_prop_water'] = mut.map(lookup_dict) + +# added two more cols + +# clear variables +del(k, v, wt, mut, lookup_dict) + +#======== +# Step 9c: iterate through the dict, create a lookup dict that i.e +# lookup_dict = {three_letter_code: aa_prop_polarity} +# Do this for both wild_type and mutant as above. +#========= +# initialise a sub dict that is lookup dict for three letter code to aa prop +lookup_dict = dict() + +for k, v in my_aa_dict.items(): + lookup_dict[k] = v['aa_prop_polarity'] + #print(lookup_dict) + wt = meta_pnca_LF1['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on + meta_pnca_LF1['wt_prop_polarity'] = wt.map(lookup_dict) + mut = meta_pnca_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze() + meta_pnca_LF1['mut_prop_polarity'] = mut.map(lookup_dict) + +# added two more cols + +# clear variables +del(k, v, wt, mut, lookup_dict) + +######## +# Step 10: combine the wild_type+poistion+mutant_type columns to generate +# Mutationinformation (matches mCSM output field) +# Remember to use .map(str) for int col types to allow string concatenation +######### +meta_pnca_LF1['Mutationinformation'] = meta_pnca_LF1['wild_type'] + meta_pnca_LF1.position.map(str) + meta_pnca_LF1['mutant_type'] + +#=#=#=#=#=#=# +# Step 11: +# COMMENT: there is more processing in the older version of this script +# consult if necessary +# possibly due to the presence of true_wt +# since this file doesn't contain any true_wt, we won't need it(hopefully!) +#=#=#=#=#=#=# + +#%% +########### +# Step 12: Output files for only SNPs to run mCSM +########### + +#========= +# Step 12a: all SNPs to run mCSM +#========= +snps_only = pd.DataFrame(meta_pnca_LF1['Mutationinformation'].unique()) +pos_only = pd.DataFrame(meta_pnca_LF1['position'].unique()) + +# assign meaningful colnames +#snps_only.rename({0 : 'all_pnca_snps'}, axis = 1, inplace = True) +#list(snps_only.columns) +snps_only.isna().sum() # should be 0 + +# output csv: all SNPS for mCSM analysis +# specify variable name for output file +gene = 'pnca' +#drug = 'pyrazinamide' +my_fname1 = '_snps_' +nrows = len(snps_only) + +#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/' +#output_file_path = work_dir + '/Data/' +output_file_path = data_dir + '/input/processed/' + drug + '/' + +if not os.path.exists(output_file_path): + print( output_file_path, 'does not exist. Creating') + os.makedirs(output_file_path) + exit() + +output_filename = output_file_path + gene + my_fname1 + str(nrows) + '.csv' +print(output_filename) #<<<- check + +# write to csv: without column or row names +# Bad practice: numbers at the start of a filename +snps_only.to_csv(output_filename, header = False, index = False) + +#========= +# Step 12b: all snps with annotation +#========= +# all snps, selected cols +#pnca_snps_ALL = meta_pnca_LF1[['id','country','lineage', 'sublineage' +# , 'drtype', 'pyrazinamide' +# , 'mutation_info', 'mutation', 'Mutationinformation']] + +#len(pnca_snps_ALL) + +# sanity check +#meta_pnca_LF1['mutation'].nunique() + +# output csv: WITH column but WITHOUT row names(all snps with meta data) +# specify variable name for output file +#gene = 'pnca' +#drug = 'pyrazinamide' +#my_fname2 = '_snps_with_metadata_' +#nrows = len(pnca_snps_ALL) + +#output_file_path = work_dir + '/Data/' +#output_filename = output_file_path + gene + my_fname2 + str(nrows) + '.csv' +#print(output_filename) #<<<- check + +# write out file +#pnca_snps_ALL.to_csv(output_filename, header = True, index = False) + +#========= +# Step 12c: comp snps for OR calcs with annotation +#========= +# remove all NA's from pyrazinamide column from LF1 + +# counts NA per column +meta_pnca_LF1.isna().sum() + +# remove NA +meta_pnca_LF2 = meta_pnca_LF1.dropna(subset=['pyrazinamide']) + +# sanity checks +# should be True +len(meta_pnca_LF2) == len(meta_pnca_LF1) - meta_pnca_LF1['pyrazinamide'].isna().sum() + +# unique counts +meta_pnca_LF2['mutation'].nunique() + +meta_pnca_LF2.groupby('mutation_info').nunique() + +# sanity check +meta_pnca_LF2['id'].nunique() + +# should be True +if meta_pnca_LF2['id'].nunique() == comp_pnca_samples: + print ('sanity check passed: complete numbers match') +else: + print('Error: Please Debug!') + +# value counts +meta_pnca_LF2.mutation.value_counts() +#meta_pnca_LF2.groupby(['mutation_info', 'mutation']).size() + +# valid/comp snps +# uncomment as necessary +pnca_snps_COMP = pd.DataFrame(meta_pnca_LF2['Mutationinformation'].unique()) +len(pnca_snps_COMP) + +# output csv: WITH column but WITHOUT row names (COMP snps with meta data) +# specify variable name for output file + +gene = 'pnca' +#drug = 'pyrazinamide' +my_fname3 = '_comp_snps_with_metadata_' +nrows = len(pnca_snps_COMP) + +#output_filename = output_file_path + gene + my_fname3 + str(nrows) + '.csv' +#print(output_filename) #<<<-check + +# write out file +#pnca_snps_COMP.to_csv(output_filename, header = True, index = False) + + +#========= +# Step 12d: comp snps only +#========= +# output csv: comp SNPS for info (i.e snps for which OR exist) +# specify variable name for output file + +snps_only = pd.DataFrame(meta_pnca_LF2['Mutationinformation'].unique()) + +gene = 'pnca' +#drug = 'pyrazinamide' +my_fname1 = '_comp_snps_' +nrows = len(snps_only) + +output_filename = output_file_path + gene + my_fname1 + str(nrows) + '.csv' +print(output_filename) #<<<- check + +# write to csv: without column or row names +snps_only.to_csv(output_filename, header = False, index = False) + + +#=#=#=#=#=#=#=# +# COMMENT: LF1 is the file to extract all unique snps for mcsm +# but you have that already in file called pnca_snps... +# LF2: is the file for extracting snps tested for DS and hence OR calcs +#=#=#=#=#=#=#=# + +########### +# Step 13 : Output the whole df i.e +# file for meta_data which is now formatted with +# each row as a unique snp rather than the original version where +# each row is a unique id +# ***** This is the file you will ADD the AF and OR calculations to ***** +########### + +# output csv: the entire DF +# specify variable name for output file +gene = 'pnca' +#drug = 'pyrazinamide' +my_fname4 = '_metadata' +#nrows = len(meta_pnca_LF1) +output_filename = output_file_path + gene + my_fname4 + '.csv' +print(output_filename) #<<<-check + +# write out file +meta_pnca_LF1.to_csv(output_filename) diff --git a/meta_data_analysis/reference_dict.py b/meta_data_analysis/reference_dict.py new file mode 100755 index 0000000..bf6561b --- /dev/null +++ b/meta_data_analysis/reference_dict.py @@ -0,0 +1,121 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Tue Jun 18 11:32:28 2019 + +@author: tanushree +""" +############################################ +#load libraries +import pandas as pd +import os +############################################# + +#!#########################! +# REQUIREMNETS: +# Data_original/ must exist +# containing GWAS data +#!#########################! + +print(os.getcwd()) +homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde +os.chdir(homedir + '/git/Data/input/original') +print(os.getcwd()) +#========== +#read file +#========== +my_aa = pd.read_csv('aa_codes.csv') #20, 6 +#assign the one_letter code as the row names so that it is easier to create a dict of dicts using index +#my_aa = pd.read_csv('aa_codes.csv', index_col = 0) #20, 6 #a way to it since it is the first column +my_aa = my_aa.set_index('three_letter_code_lower') #20, 5 + +#========================================================= +#convert file to dict of dicts +#========================================================= +#convert each row into a dict of dicts so that there are 20 aa and 5 keys within +#with your choice of column name that you have assigned to index as the "primary key". +#using 'index' creates a dict of dicts +#using 'records' creates a list of dicts +my_aa_dict = my_aa.to_dict('index') #20, with 5 subkeys + +#================================================ +#dict of aa with their corresponding properties +#This is defined twice +#================================================ +#7 categories: no overlap +qualities1 = { ('R', 'H', 'K'): 'Basic' + , ('D', 'E'): 'Acidic' + , ('N', 'Q'): 'Amidic' + , ('G', 'A', 'V', 'L', 'I', 'P'): 'Hydrophobic' + , ('S', 'T'): 'Hydroxylic' + , ('F', 'W', 'Y'): 'Aromatic' + , ('C', 'M'): 'Sulphur' +} + +#9 categories: allowing for overlap +qualities2 = { ('R', 'H', 'K'): 'Basic' + , ('D', 'E'): 'Acidc' + , ('S', 'T', 'N', 'Q'): 'Polar' + , ('V', 'I', 'L', 'M', 'F', 'Y', 'W'): 'Hydrophobic' + , ('S', 'T', 'H', 'N', 'Q', 'E', 'D', 'K', 'R'): 'Hydrophilic' + , ('S', 'G', 'A', 'P'): 'Small' + , ('F', 'W', 'Y', 'H'): 'Aromatic' + , ('V', 'I', 'L', 'M'): 'Aliphatic' + , ('C', 'G', 'P'): 'Special' +} + +qualities_taylor = { ('R', 'H', 'K'): 'Basic' + , ('D', 'E'): 'Acidc' + , ('S', 'T', 'N', 'Q', 'C', 'Y', 'W', 'H', 'K', 'R', 'D', 'E'): 'Polar' + , ('V', 'I', 'L', 'M', 'F', 'Y', 'W', 'C', 'A', 'G', 'T', 'H'): 'Hydrophobic' + #, ('S', 'T', 'H', 'N', 'Q', 'E', 'D', 'K', 'R'): 'Hydrophilic', #C, W, y MISSING FROM POLAR! + , ('S', 'G', 'A', 'P', 'C', 'T', 'N', 'D', 'V'): 'Small' + , ('F', 'W', 'Y', 'H'): 'Aromatic' + , ('V', 'I', 'L', 'M'): 'Aliphatic' #although M is not strictly in the circle! + , ('C', 'G', 'P'): 'Special' +} + +qualities_water = { ('D', 'E', 'N', 'P', 'Q', 'R', 'S'): 'hydrophilic' + , ('A', 'C', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'T', 'V', 'W', 'X', 'Y'): 'hydrophobic' +} + +qualities_polarity = { ('D', 'E'): 'acidic' + , ('H', 'K', 'R'): 'basic' + , ('C', 'G', 'N', 'Q', 'S', 'T', 'Y'): 'neutral' + , ('A', 'F', 'I', 'L', 'M', 'P', 'V', 'W'): 'non-polar' +} + +#============================================================================== +#adding amino acid properties to my dict of dicts +for k, v in my_aa_dict.items(): + #print (k,v) + v['aa_prop1'] = str() #initialise keys + v['aa_prop2'] = list() #initialise keys (allows for overalpping properties) + v['aa_taylor'] = list() #initialise keys (allows for overalpping properties) + v['aa_prop_water'] = str() #initialise keys + v['aa_prop_polarity'] = str() #initialise keys + + for group in qualities1: + if v['one_letter_code'] in group: + v['aa_prop1']+= qualities1[group] # += for str concat + + for group in qualities2: + if v['one_letter_code'] in group: + v['aa_prop2'].append(qualities2[group]) # append to list + + for group in qualities_taylor: + if v['one_letter_code'] in group: + v['aa_taylor'].append(qualities_taylor[group]) # append to list + + for group in qualities_water: + if v['one_letter_code'] in group: + v['aa_prop_water']+= qualities_water[group] # += for str concat + + for group in qualities_polarity: + if v['one_letter_code'] in group: + v['aa_prop_polarity']+= qualities_polarity[group] # += for str concat + +#COMMENT:VOILA!!! my_aa_dict is now a dict of dicts containing all associated properties for each aa +#============================================================================== + + diff --git a/mk-drug-dirs.sh b/mk-drug-dirs.sh new file mode 100755 index 0000000..56fb820 --- /dev/null +++ b/mk-drug-dirs.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# Create a drug directory structure for processing +# +# +# Structure: +# +# $DATA_DIR/$DRUG/input +# |- original +# |- processed +# |- structure +# +# $DATA_DIR/$DRUG/output +# |- plots +# |- structure + +DATA_DIR=~/git/Data + +if [[ $1 == '' ]]; then + echo "usage: mk-drug-dirs.sh "; + exit; +else + DRUG=$1 + echo Creating structure for: $DRUG + + if [ -d $DATA_DIR ] + then + echo Doing creation in $DATA_DIR + mkdir -p $DATA_DIR/$DRUG/input/original + mkdir -p $DATA_DIR/$DRUG/input/processed + mkdir -p $DATA_DIR/$DRUG/input/structure + mkdir -p $DATA_DIR/$DRUG/output/plots + mkdir -p $DATA_DIR/$DRUG/output/structure + + else + echo "Error: $DATA_DIR does not exist. Did you check it out of git?" + exit + fi + +fi +