From 3905a81c38345be20a5a8a84e6862e52a55a7bd7 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Mon, 6 Apr 2020 19:03:41 +0100 Subject: [PATCH] refactoring code to make it take command line args --- README.md | 25 +- .../pyrazinamide/scripts/Header_TT.R | 130 ---- .../pyrazinamide/scripts/KS_test_PS.R | 157 ---- .../scripts/barplot_colour_function.R | 27 - .../pyrazinamide/scripts/combining_two_df.R | 417 ----------- .../scripts/combining_two_df_lig.R | 330 -------- .../scripts/generate_mut_sequences.py | 215 ------ .../pyrazinamide/scripts/mcsm/run.sh | 9 - .../mcsm/step0_check_duplicate_SNPs.sh | 25 - .../scripts/mcsm/step1_lig_output_urls.sh | 104 --- .../scripts/mcsm/step2_lig_results.sh | 76 -- .../mcsm/step3a_results_format_interim.sh | 74 -- .../scripts/mcsm/step3b_results_format_df.py | 63 -- .../scripts/mcsm/step3c_results_cleaning.R | 230 ------ .../scripts/mcsm/step4_results_normalise.R | 275 ------- .../scripts/mcsm_mean_stability.R | 131 ---- .../pyrazinamide/scripts/plotting/.RData | Bin 43777 -> 0 bytes .../plotting/OR_PS_Ligand_combined_plot.R | 250 ------- .../scripts/plotting/barplots_2colours_LIG.R | 154 ---- .../scripts/plotting/barplots_2colours_PS.R | 149 ---- .../plotting/barplots_subcolours_LIG.R | 202 ----- .../scripts/plotting/barplots_subcolours_PS.R | 192 ----- .../plotting/barplots_subcolours_aa_LIG.R | 296 -------- .../plotting/barplots_subcolours_aa_PS.R | 292 -------- .../scripts/plotting/basic_barplots_LIG.R | 215 ------ .../scripts/plotting/basic_barplots_PS.R | 211 ------ .../scripts/plotting/corr_plots_v3_PS.R | 175 ----- .../scripts/plotting/corr_plots_v3_lig.R | 187 ----- .../scripts/plotting/lineage_basic_barplot.R | 227 ------ .../scripts/plotting/lineage_dist_LIG.R | 253 ------- .../scripts/plotting/lineage_dist_PS.R | 229 ------ .../scripts/plotting/logolas_logoplot.R | 250 ------- .../scripts/plotting/snp_logo_plot.R | 273 ------- .../scripts/plotting/subcols_axis_LIG.R | 208 ------ .../scripts/plotting/subcols_axis_PS.R | 208 ------ mcsm_analysis/pyrazinamide/scripts/read_pdb.R | 27 - .../pyrazinamide/scripts/replaceBfactor_pdb.R | 386 ---------- .../pyrazinamide/scripts/source_data_checks.R | 257 ------- meta_data_analysis/dssp_df.py | 5 +- mk_drug_dirs.sh | 10 +- .../data_extraction.py | 702 +++++++++++------- .../reference_dict.py | 16 +- 42 files changed, 456 insertions(+), 7206 deletions(-) delete mode 100644 mcsm_analysis/pyrazinamide/scripts/Header_TT.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/KS_test_PS.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/barplot_colour_function.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/combining_two_df.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/combining_two_df_lig.R delete mode 100755 mcsm_analysis/pyrazinamide/scripts/generate_mut_sequences.py delete mode 100755 mcsm_analysis/pyrazinamide/scripts/mcsm/run.sh delete mode 100755 mcsm_analysis/pyrazinamide/scripts/mcsm/step0_check_duplicate_SNPs.sh delete mode 100755 mcsm_analysis/pyrazinamide/scripts/mcsm/step1_lig_output_urls.sh delete mode 100755 mcsm_analysis/pyrazinamide/scripts/mcsm/step2_lig_results.sh delete mode 100755 mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_results_format_interim.sh delete mode 100755 mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_results_format_df.py delete mode 100644 mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_results_cleaning.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/mcsm/step4_results_normalise.R delete mode 100644 
mcsm_analysis/pyrazinamide/scripts/mcsm_mean_stability.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/.RData delete mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/OR_PS_Ligand_combined_plot.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_LIG.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_PS.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_LIG.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_PS.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_aa_LIG.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_aa_PS.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_LIG.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_PS.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_PS.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_lig.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/lineage_basic_barplot.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_LIG.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/logolas_logoplot.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/snp_logo_plot.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/subcols_axis_LIG.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/subcols_axis_PS.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/read_pdb.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/replaceBfactor_pdb.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/source_data_checks.R rename {meta_data_analysis => scripts}/data_extraction.py (57%) rename {meta_data_analysis => scripts}/reference_dict.py (95%) diff --git a/README.md b/README.md index 628cafd..dd4cfb5 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,10 @@ mCSM Analysis ============= -This repo does mCSM analysis using bash, python and R. - +This contains scripts that do the following: + 1. mCSM analysis: using bash, python and R + 2. metadata analysis: using python and R + Requires an additional 'Data' directory. Batteries not included:-) ## Assumptions @@ -19,17 +21,14 @@ subdirs within this repo *.R *.py - mcsm\_analysis/ - / - scripts/ - *.R - *.py - mcsm/ - *.sh - *.py - *.R - plotting/ - *.R + mcsm_analysis +# / + + foldx_analysis + + plotting + *.R + ``` More docs here as I write them. 
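The commit subject states the goal of this refactor: make the scripts take command-line arguments, with data_extraction.py and reference_dict.py now living under scripts/. As a minimal illustration of what such an interface could look like, the sketch below uses Python's argparse; the --drug/--gene flag names and the ~/git/Data/<drug>/ layout are assumptions inferred from paths used elsewhere in this patch, not the confirmed interface of the refactored scripts.

```python
# Hypothetical sketch only: flag names and defaults are assumptions, not
# necessarily what the refactored scripts/data_extraction.py implements.
import argparse
import os

def make_parser():
    parser = argparse.ArgumentParser(
        description='Extract gene-specific mutation data for a given drug')
    parser.add_argument('-d', '--drug', default='pyrazinamide',
                        help='drug name, used to locate ~/git/Data/<drug>/')
    parser.add_argument('-g', '--gene', default='pncA',
                        help='gene name, e.g. pncA')
    return parser

if __name__ == '__main__':
    args = make_parser().parse_args()
    homedir = os.path.expanduser('~')
    datadir = os.path.join(homedir, 'git', 'Data', args.drug)   # assumed layout
    outdir = os.path.join(datadir, 'output')
    gene_match = args.gene + '_p.'   # mutation prefix used in the metadata files
    print('Data dir:', datadir)
    print('Output dir:', outdir)
    print('Gene match:', gene_match)
```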
diff --git a/mcsm_analysis/pyrazinamide/scripts/Header_TT.R b/mcsm_analysis/pyrazinamide/scripts/Header_TT.R deleted file mode 100644 index 9eae42a..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/Header_TT.R +++ /dev/null @@ -1,130 +0,0 @@ -######################################################### -### A) Installing and loading required packages -######################################################### -#lib_loc = "/usr/local/lib/R/site-library") - -#if (!require("gplots")) { -# install.packages("gplots", dependencies = TRUE) -# library(gplots) -#} - -#if (!require("tidyverse")) { -# install.packages("tidyverse", dependencies = TRUE) -# library(tidyverse) -#} - -if (!require("ggplot2")) { - install.packages("ggplot2", dependencies = TRUE) - library(ggplot2) -} - -if (!require("plotly")) { - install.packages("plotly", dependencies = TRUE) - library(plotly) -} - -if (!require("cowplot")) { - install.packages("copwplot", dependencies = TRUE) - library(cowplot) -} - -if (!require("ggcorrplot")) { - install.packages("ggcorrplot", dependencies = TRUE) - library(ggcorrplot) -} - -if (!require("ggpubr")) { - install.packages("ggpubr", dependencies = TRUE) - library(ggpubr) -} - -if (!require("RColorBrewer")) { - install.packages("RColorBrewer", dependencies = TRUE) - library(RColorBrewer) -} - -if (!require ("GOplot")) { - install.packages("GOplot") - library(GOplot) -} - -if(!require("VennDiagram")) { - install.packages("VennDiagram", dependencies = T) - library(VennDiagram) -} - -if(!require("scales")) { - install.packages("scales", dependencies = T) - library(scales) -} - -if(!require("plotrix")) { - install.packages("plotrix", dependencies = T) - library(plotrix) -} - -if(!require("stats")) { - install.packages("stats", dependencies = T) - library(stats) -} - -if(!require("stats4")) { - install.packages("stats4", dependencies = T) - library(stats4) -} - -if(!require("data.table")) { -install.packages("data.table") - library(data.table) -} - -if (!require("PerformanceAnalytics")){ - install.packages("PerformanceAnalytics", dependencies = T) - library(PerformaceAnalytics) -} - -if (!require ("GGally")){ - install.packages("GGally") - library(GGally) -} - -if (!require ("corrr")){ - install.packages("corrr") - library(corrr) -} - -if (!require ("psych")){ - install.packages("psych") - library(psych) -} - -if (!require ("dplyr")){ - install.packages("dplyr") - library(dplyr) -} - -if (!require ("compare")){ - install.packages("compare") - library(compare) -} - -if (!require ("arsenal")){ - install.packages("arsenal") - library(arsenal) -} - - -####TIDYVERSE -# Install -#if(!require(devtools)) install.packages("devtools") -#devtools::install_github("kassambara/ggcorrplot") - -library(ggcorrplot) - - -###for PDB files -#install.packages("bio3d") -if(!require(bio3d)){ - install.packages("bio3d") - library(bio3d) -} diff --git a/mcsm_analysis/pyrazinamide/scripts/KS_test_PS.R b/mcsm_analysis/pyrazinamide/scripts/KS_test_PS.R deleted file mode 100644 index 5a827c8..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/KS_test_PS.R +++ /dev/null @@ -1,157 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages # -######################################################################## - -source("../Header_TT.R") -#source("../barplot_colour_function.R") -#require(data.table) - 
-######################################################################## -# Read file: call script for combining df for PS # -######################################################################## - -source("../combining_two_df.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION - -#========================== -# This will return: - -# df with NA for pyrazinamide: -# merged_df2 -# merged_df3 - -# df without NA for pyrazinamide: -# merged_df2_comp -# merged_df3_comp -#=========================== - -########################### -# Data for plots -# you need merged_df2 or merged_df2_comp -# since this is one-many relationship -# i.e the same SNP can belong to multiple lineages -# using the _comp dataset means -# we lose some muts and at this level, we should use -# as much info as available, hence use df with NA -########################### - -# uncomment as necessary - -#%%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -my_df = merged_df2 -#my_df = merged_df2_comp -#%%%%%%%%%%%%%%%%%%%%%%%%% - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -# Ensure correct data type in columns to plot: need to be factor -is.factor(my_df$lineage) -my_df$lineage = as.factor(my_df$lineage) -is.factor(my_df$lineage) - -table(my_df$mutation_info); str(my_df$mutation_info) - -# subset df with dr muts only -my_df_dr = subset(my_df, mutation_info == "dr_mutations_pyrazinamide") -table(my_df_dr$mutation_info) - -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -#========================== -# Run two times: -# uncomment as necessary -# 1) for all muts -# 2) for dr_muts -#=========================== -#%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT - -#================ -# for ALL muts -#================ -#plot_df = my_df - -#================ -# for dr muts ONLY -#================ -plot_df = my_df_dr - -#%%%%%%%%%%%%%%%%%%%%%%%% -#============================ -# Plot: Lineage Distribution -# x = mcsm_values, y = dist -# fill = stability -#============================ - -table(plot_df$lineage); str(plot_df$lineage) - -# subset only lineages1-4 -sel_lineages = c("lineage1" - , "lineage2" - , "lineage3" - , "lineage4") - -# uncomment as necessary -df_lin = subset(plot_df, subset = lineage %in% sel_lineages ) - -# refactor -df_lin$lineage = factor(df_lin$lineage) - -table(df_lin$lineage) #{RESULT: No of samples within lineage} -#lineage1 lineage2 lineage3 lineage4 - -length(unique(df_lin$Mutationinformation)) -#{Result: No. 
of unique mutations the 4 lineages contribute to} - -# sanity checks -r1 = 2:5 # when merged_df2 used: because there is missing lineages -if(sum(table(plot_df$lineage)[r1]) == nrow(df_lin)) { - print ("sanity check passed: numbers match") -} else{ - print("Error!: check your numbers") -} - -#%%%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -df <- df_lin -#%%%%%%%%%%%%%%%%%%%%%%%%%% - -rm(df_lin) - -# COMPARING DISTRIBUTIONS -head(df$lineage) -df$lineage = as.character(df$lineage) - -lin1 = df[df$lineage == "lineage1",]$ratioDUET -lin2 = df[df$lineage == "lineage2",]$ratioDUET -lin3 = df[df$lineage == "lineage3",]$ratioDUET -lin4 = df[df$lineage == "lineage4",]$ratioDUET - -# ks test -ks.test(lin1,lin2) -ks.test(lin1,lin3) -ks.test(lin1,lin4) - -ks.test(lin2,lin3) -ks.test(lin2,lin4) - -ks.test(lin3,lin4) - - - diff --git a/mcsm_analysis/pyrazinamide/scripts/barplot_colour_function.R b/mcsm_analysis/pyrazinamide/scripts/barplot_colour_function.R deleted file mode 100644 index a3cc403..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/barplot_colour_function.R +++ /dev/null @@ -1,27 +0,0 @@ -######################################################### -# 1b: Define function: coloured barplot by subgroup -# LINK: https://stackoverflow.com/questions/49818271/stacked-barplot-with-colour-gradients-for-each-bar -######################################################### - -ColourPalleteMulti <- function(df, group, subgroup){ - - # Find how many colour categories to create and the number of colours in each - categories <- aggregate(as.formula(paste(subgroup, group, sep="~" )) - , df - , function(x) length(unique(x))) - # return(categories) } - - category.start <- (scales::hue_pal(l = 100)(nrow(categories))) # Set the top of the colour pallete - - category.end <- (scales::hue_pal(l = 40)(nrow(categories))) # set the bottom - - #return(category.start); return(category.end)} - - # Build Colour pallette - colours <- unlist(lapply(1:nrow(categories), - function(i){ - colorRampPalette(colors = c(category.start[i] - , category.end[i]))(categories[i,2])})) - return(colours) -} -######################################################### \ No newline at end of file diff --git a/mcsm_analysis/pyrazinamide/scripts/combining_two_df.R b/mcsm_analysis/pyrazinamide/scripts/combining_two_df.R deleted file mode 100644 index 31a533b..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/combining_two_df.R +++ /dev/null @@ -1,417 +0,0 @@ -######################################################### -# TASK: To combine mcsm and meta data with af and or files -# Input csv files: -# 1) mcsm normalised and struct params -# 2) gene associated meta_data_with_AFandOR - -# Output: -# 1) muts with opposite effects on stability -# 2) large combined df including NAs for AF, OR,etc -# Dim: same no. of rows as gene associated meta_data_with_AFandOR -# 3) small combined df including NAs for AF, OR, etc. -# Dim: same as mcsm data -# 4) large combined df excluding NAs -# Dim: dim(#1) - no. of NAs(AF|OR) + 1 -# 5) small combined df excluding NAs -# Dim: dim(#2) - no. 
of unique NAs - 1 -# This script is sourced from other .R scripts for plotting -######################################################### -getwd() -setwd('~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/') -getwd() - -########################################################## -# Installing and loading required packages -########################################################## -source('Header_TT.R') -#require(data.table) -#require(arsenal) -#require(compare) -#library(tidyverse) - -################################# -# Read file: normalised file -# output of step 4 mcsm_pipeline -################################# -#%% variable assignment: input and output paths & filenames -drug = 'pyrazinamide' -gene = 'pncA' -gene_match = paste0(gene,'_p.') -cat(gene_match) - -#=========== -# data dir -#=========== -datadir = paste0('~/git/Data') - -#=========== -# input -#=========== -# infile1: mCSM data -#indir = '~/git/Data/pyrazinamide/input/processed/' -indir = paste0(datadir, '/', drug, '/', 'output') # revised {TODO: change in mcsm pipeline} -#in_filename = 'mcsm_complex1_normalised.csv' -in_filename = 'pnca_mcsm_struct_params.csv' -infile = paste0(indir, '/', in_filename) -cat(paste0('Reading infile1: mCSM output file', ' ', infile) ) - -# infile2: gene associated meta data combined with AF and OR -#indir: same as above -in_filename_comb = paste0(tolower(gene), '_meta_data_with_AFandOR.csv') -infile_comb = paste0(indir, '/', in_filename_comb) -cat(paste0('Reading infile2: gene associated combined metadata:', infile_comb)) - -#=========== -# output -#=========== -# Uncomment if and when required to output -outdir = paste0('~/git/Data', '/', drug, '/', 'output') #same as indir -cat('Output dir: ', outdir) -#out_filename = paste0(tolower(gene), 'XXX') -#outfile = paste0(outdir, '/', out_filename) -#cat(paste0('Output file with full path:', outfile)) -#%% end of variable assignment for input and output files - -################################# -# Read file: normalised file -# output of step 4 mcsm_pipeline -################################# -cat('Reading mcsm_data:' - , '\nindir: ', indir - , '\ninfile_comb: ', in_filename) - -mcsm_data = read.csv(infile - , row.names = 1 - , stringsAsFactors = F - , header = T) - -cat('Read mcsm_data file:' - , '\nNo.of rows: ', nrow(mcsm_data) - , '\nNo. of cols:', ncol(mcsm_data)) - -# clear variables -rm(in_filename, infile) - -str(mcsm_data) - -table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) ) - -# spelling Correction 1: DUET -mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Stabilizing'] <- 'Stabilising' -mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Destabilizing'] <- 'Destabilising' - -# checks: should be the same as above -table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) ) -head(mcsm_data$DUET_outcome); tail(mcsm_data$DUET_outcome) - -# spelling Correction 2: Ligand -table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) ) - -mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Stabilizing'] <- 'Stabilising' -mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Destabilizing'] <- 'Destabilising' - -# checks: should be the same as above -table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) ) -head(mcsm_data$Lig_outcome); tail(mcsm_data$Lig_outcome) - -# muts with opposing effects on protomer and ligand stability -table(mcsm_data$DUET_outcome != mcsm_data$Lig_outcome) -changes = mcsm_data[which(mcsm_data$DUET_outcome != mcsm_data$Lig_outcome),] - -# sanity check: redundant, but uber cautious! 
-dl_i = which(mcsm_data$DUET_outcome != mcsm_data$Lig_outcome) -ld_i = which(mcsm_data$Lig_outcome != mcsm_data$DUET_outcome) - -cat('Identifying muts with opposite stability effects') -if(nrow(changes) == (table(mcsm_data$DUET_outcome != mcsm_data$Lig_outcome)[[2]]) & identical(dl_i,ld_i)) { - cat('PASS: muts with opposite effects on stability and affinity correctly identified' - , '\nNo. of such muts: ', nrow(changes)) -}else { - cat('FAIL: unsuccessful in extracting muts with changed stability effects') -} - -#*************************** -# write file: changed muts -out_filename = 'muts_opp_effects.csv' -outfile = paste0(outdir, '/', out_filename) -cat('Writing file for muts with opp effects:' - , '\nFilename: ', outfile - , '\nPath: ', outdir) - -write.csv(changes, outfile) -#**************************** -# clear variables -rm(out_filename, outfile) -rm(changes, dl_i, ld_i) - -# count na in each column -na_count = sapply(mcsm_data, function(y) sum(length(which(is.na(y))))); na_count - -# sort by Mutationinformation -mcsm_data = mcsm_data[order(mcsm_data$Mutationinformation),] -head(mcsm_data$Mutationinformation) - -orig_col = ncol(mcsm_data) - -# get freq count of positions and add to the df -setDT(mcsm_data)[, occurrence := .N, by = .(Position)] - -cat('Added 1 col: position frequency to see which posn has how many muts' - , '\nNo. of cols now', ncol(mcsm_data) - , '\nNo. of cols before: ', orig_col) - -pos_count_check = data.frame(mcsm_data$Position, mcsm_data$occurrence) - -########################### -# 2: Read file: meta data with AFandOR -########################### -cat('Reading combined meta data and AFandOR file:' - , '\nindir: ', indir - , '\ninfile_comb: ', in_filename_comb) - -meta_with_afor <- read.csv(infile_comb - , stringsAsFactors = F - , header = T) - -cat('Read mcsm_data file:' - , '\nNo.of rows: ', nrow(meta_with_afor) - , '\nNo. of cols:', ncol(meta_with_afor)) - -# counting NAs in AF, OR cols -if (identical(sum(is.na(meta_with_afor$OR)) - , sum(is.na(meta_with_afor$pvalue)) - , sum(is.na(meta_with_afor$AF)))){ - cat('PASS: NA count match for OR, pvalue and AF\n') - na_count = sum(is.na(meta_with_afor$AF)) - cat('No. of NAs: ', sum(is.na(meta_with_afor$OR))) -} else{ - cat('FAIL: NA count mismatch' - , '\nNA in OR: ', sum(is.na(meta_with_afor$OR)) - , '\nNA in pvalue: ', sum(is.na(meta_with_afor$pvalue)) - , '\nNA in AF:', sum(is.na(meta_with_afor$AF))) -} - -# clear variables -rm(in_filename_comb, infile_comb) - -str(meta_with_afor) - -# sort by Mutationinformation -head(meta_with_afor$Mutationinformation) -meta_with_afor = meta_with_afor[order(meta_with_afor$Mutationinformation),] -head(meta_with_afor$Mutationinformation) - -########################### -# 3: merging two dfs: with NA -########################### -# link col name = 'Mutationinforamtion' -head(mcsm_data$Mutationinformation) -head(meta_with_afor$Mutationinformation) - -cat('Merging dfs with NAs: big df (1-many relationship b/w id & mut)' - ,'\nlinking col: Mutationinforamtion' - ,'\nfilename: merged_df2') - -######### -# merge 3a (merged_df2): meta data with mcsm -######### -merged_df2 = merge(x = meta_with_afor - ,y = mcsm_data - , by = 'Mutationinformation' - , all.y = T) - -cat('Dim of merged_df2: ' - , '\nNo. of rows: ', nrow(merged_df2) - , '\nNo. of cols: ', ncol(merged_df2)) -head(merged_df2$Position) - -# sanity check -cat('Checking nrows in merged_df2') -if(nrow(meta_with_afor) == nrow(merged_df2)){ - cat('nrow(merged_df2) = nrow (gene associated metadata)' - ,'\nExpected no. 
of rows: ',nrow(meta_with_afor) - ,'\nGot no. of rows: ', nrow(merged_df2)) -} else{ - cat('nrow(merged_df2)!= nrow(gene associated metadata)' - , '\nExpected no. of rows after merge: ', nrow(meta_with_afor) - , '\nGot no. of rows: ', nrow(merged_df2) - , '\nFinding discrepancy') - merged_muts_u = unique(merged_df2$Mutationinformation) - meta_muts_u = unique(meta_with_afor$Mutationinformation) - # find the index where it differs - unique(meta_muts_u[! meta_muts_u %in% merged_muts_u]) -} - -# sort by Position -head(merged_df2$Position) -merged_df2 = merged_df2[order(merged_df2$Position),] -head(merged_df2$Position) - -merged_df2v2 = merge(x = meta_with_afor - ,y = mcsm_data - , by = 'Mutationinformation' - , all.x = T) -#!=!=!=!=!=!=!=! -# COMMENT: used all.y since position 186 is not part of the struc, -# hence doesn't have a mcsm value -# but 186 is associated with mutation -#!=!=!=!=!=!=!=! - -# should be False -identical(merged_df2, merged_df2v2) -table(merged_df2$Position%in%merged_df2v2$Position) - -rm(merged_df2v2) - -######### -# merge 3b (merged_df3):remove duplicate mutation information -######### -cat('Merging dfs without NAs: small df (removing muts with no AF|OR associated)' - ,'\nCannot trust lineage info from this' - ,'\nlinking col: Mutationinforamtion' - ,'\nfilename: merged_df3') - -#==#=#=#=#=#=# -# Cannot trust lineage, country from this df as the same mutation -# can have many different lineages -# but this should be good for the numerical corr plots -#=#=#=#=#=#=#= -merged_df3 = merged_df2[!duplicated(merged_df2$Mutationinformation),] -head(merged_df3$Position); tail(merged_df3$Position) # should be sorted - -# sanity check -cat('Checking nrows in merged_df3') -if(nrow(mcsm_data) == nrow(merged_df3)){ - cat('PASS: No. of rows match with mcsm_data' - ,'\nExpected no. of rows: ', nrow(mcsm_data) - ,'\nGot no. of rows: ', nrow(merged_df3)) -} else { - cat('FAIL: No. of rows mismatch' - , '\nNo. of rows mcsm_data: ', nrow(mcsm_data) - , '\nNo. of rows merged_df3: ', nrow(merged_df3)) -} - -# counting NAs in AF, OR cols in merged_df3 -# this is becuase mcsm has no AF, OR cols, -# so you cannot count NAs -if (identical(sum(is.na(merged_df3$OR)) - , sum(is.na(merged_df3$pvalue)) - , sum(is.na(merged_df3$AF)))){ - cat('PASS: NA count match for OR, pvalue and AF\n') - na_count_df3 = sum(is.na(merged_df3$AF)) - cat('No. of NAs: ', sum(is.na(merged_df3$OR))) -} else{ - cat('FAIL: NA count mismatch' - , '\nNA in OR: ', sum(is.na(merged_df3$OR)) - , '\nNA in pvalue: ', sum(is.na(merged_df3$pvalue)) - , '\nNA in AF:', sum(is.na(merged_df3$AF))) -} - -########################### -# 4: merging two dfs: without NA -########################### -######### -# merge 4a (merged_df2_comp): same as merge 1 but excluding NA -######### -cat('Merging dfs without any NAs: big df (1-many relationship b/w id & mut)' - ,'\nlinking col: Mutationinforamtion' - ,'\nfilename: merged_df2_comp') - -merged_df2_comp = merged_df2[!is.na(merged_df2$AF),] -#merged_df2_comp = merged_df2[!duplicated(merged_df2$Mutationinformation),] - -# sanity check -cat('Checking nrows in merged_df2_comp') -if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count + 1)){ - cat('PASS: No. of rows match' - ,'\nDim of merged_df2_comp: ' - ,'\nExpected no. of rows: ', nrow(merged_df2) - na_count + 1 - , '\nNo. of rows: ', nrow(merged_df2_comp) - , '\nNo. of cols: ', ncol(merged_df2_comp)) -}else{ - cat('FAIL: No. of rows mismatch' - ,'\nExpected no. of rows: ', nrow(merged_df2) - na_count + 1 - ,'\nGot no. 
of rows: ', nrow(merged_df2_comp)) -} - -######### -# merge 4b (merged_df3_comp): remove duplicate mutation information -######### -merged_df3_comp = merged_df2_comp[!duplicated(merged_df2_comp$Mutationinformation),] - -cat('Dim of merged_df3_comp: ' - , '\nNo. of rows: ', nrow(merged_df3_comp) - , '\nNo. of cols: ', ncol(merged_df3_comp)) - -# alternate way of deriving merged_df3_comp -foo = merged_df3[!is.na(merged_df3$AF),] -# compare dfs: foo and merged_df3_com -all.equal(foo, merged_df3) - -summary(comparedf(foo, merged_df3)) - -# sanity check -cat('Checking nrows in merged_df3_comp') -if(nrow(merged_df3_comp) == nrow(merged_df3)){ - cat('NO NAs detected in merged_df3 in AF|OR cols' - ,'\nNo. of rows are identical: ', nrow(merged_df3)) -} else{ - if(nrow(merged_df3_comp) == nrow(merged_df3) - na_count_df3) { - cat('PASS: NAs detected in merged_df3 in AF|OR cols' - , '\nNo. of NAs: ', na_count_df3 - , '\nExpected no. of rows in merged_df3_comp: ', nrow(merged_df3) - na_count_df3 - , '\nGot no. of rows: ', nrow(merged_df3_comp)) - } -} - -#=============== end of combining df -#********************* -# writing 1 file in the style of a loop: merged_df3 -# print(output dir) -#i = 'merged_df3' -#out_filename = paste0(i, '.csv') -#outfile = paste0(outdir, '/', out_filename) - -#cat('Writing output file: ' -# ,'\nFilename: ', out_filename -# ,'\nPath: ', outdir) - -#template: write.csv(merged_df3, 'merged_df3.csv') -#write.csv(get(i), outfile, row.names = FALSE) -#cat('Finished writing: ', outfile -# , '\nNo. of rows: ', nrow(get(i)) -# , '\nNo. of cols: ', ncol(get(i))) - -#%% write_output files; all 4 files: -outvars = c('merged_df2' - , 'merged_df3' - , 'merged_df2_comp' - , 'merged_df3_comp') - -cat('Writing output files: ' - , '\nPath:', outdir) - -for (i in outvars){ -# cat(i, '\n') - out_filename = paste0(i, '.csv') -# cat(out_filename, '\n') -# cat('getting value of variable: ', get(i)) - outfile = paste0(outdir, '/', out_filename) -# cat('Full output path: ', outfile, '\n') - cat('Writing output file:' - ,'\nFilename: ', out_filename,'\n') - write.csv(get(i), outfile, row.names = FALSE) - cat('Finished writing: ', outfile - , '\nNo. of rows: ', nrow(get(i)) - , '\nNo. of cols: ', ncol(get(i)), '\n') -} - -# alternate way to replace with implicit loop -# FIXME -#sapply(outvars, function(x, y) write.csv(get(outvars), paste0(outdir, '/', outvars, '.csv'))) -#************************* -# clear variables -rm(mcsm_data, meta_with_afor, foo, drug, gene, gene_match, indir, merged_muts_u, meta_muts_u, na_count, orig_col, outdir) -rm(pos_count_check) -#============================= end of script - diff --git a/mcsm_analysis/pyrazinamide/scripts/combining_two_df_lig.R b/mcsm_analysis/pyrazinamide/scripts/combining_two_df_lig.R deleted file mode 100644 index 361b6b6..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/combining_two_df_lig.R +++ /dev/null @@ -1,330 +0,0 @@ -######################################################### -# TASK: To combine mcsm and meta data with af and or -# by filtering for distance to ligand (<10Ang). -# This script doesn't output anything. 
-# This script is sourced from other .R scripts for plotting ligand plots - -# Input csv files: -# 1) mcsm normalised and struct params -# 2) gene associated meta_data_with_AFandOR -######################################################### -getwd() -setwd('~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/') -getwd() - -########################################################## -# Installing and loading required packages -########################################################## - -source('Header_TT.R') -#require(data.table) -#require(arsenal) -#require(compare) -#library(tidyverse) - -################################# -# Read file: normalised file -# output of step 4 mcsm_pipeline -################################# - -#%% variable assignment: input and output paths & filenames -drug = 'pyrazinamide' -gene = 'pncA' -gene_match = paste0(gene,'_p.') -cat(gene_match) - -#=========== -# input -#=========== -# infile1: mCSM data -#indir = '~/git/Data/pyrazinamide/input/processed/' -indir = paste0('~/git/Data', '/', drug, '/', 'output') # revised {TODO: change in mcsm pipeline} -#in_filename = 'mcsm_complex1_normalised.csv' -in_filename = 'pnca_mcsm_struct_params.csv' -infile = paste0(indir, '/', in_filename) -cat(paste0('Reading infile1: mCSM output file', ' ', infile) ) - -# infile2: gene associated meta data combined with AF and OR -#indir: same as above -in_filename_comb = paste0(tolower(gene), '_meta_data_with_AFandOR.csv') -infile_comb = paste0(indir, '/', in_filename_comb) -cat(paste0('Reading infile2: gene associated combined metadata:', infile_comb)) - -#=========== -# output -#=========== -# Uncomment if and when required to output -outdir = paste0('~/git/Data', '/', drug, '/', 'output') #same as indir -cat('Output dir: ', outdir) -#out_filename = paste0(tolower(gene), 'XXX') -#outfile = paste0(outdir, '/', out_filename) -#cat(paste0('Output file with full path:', outfile)) -#%% end of variable assignment for input and output files - -################################# -# Read file: normalised file -# output of step 4 mcsm_pipeline -################################# -cat('Reading mcsm_data:' - , '\nindir: ', indir - , '\ninfile_comb: ', in_filename) - -mcsm_data = read.csv(infile - , row.names = 1 - , stringsAsFactors = F - , header = T) - -cat('Read mcsm_data file:' - , '\nNo.of rows: ', nrow(mcsm_data) - , '\nNo. of cols:', ncol(mcsm_data)) - -# clear variables -rm(in_filename, infile) - -str(mcsm_data) - -table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) ) - -# spelling Correction 1: DUET -mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Stabilizing'] <- 'Stabilising' -mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Destabilizing'] <- 'Destabilising' - -# checks: should be the same as above -table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) ) -head(mcsm_data$DUET_outcome); tail(mcsm_data$DUET_outcome) - -# spelling Correction 2: Ligand -table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) ) - -mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Stabilizing'] <- 'Stabilising' -mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Destabilizing'] <- 'Destabilising' - -# checks: should be the same as above -table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) ) -head(mcsm_data$Lig_outcome); tail(mcsm_data$Lig_outcome) - -# muts with opposing effects on protomer and ligand stability -# excluded from here as it is redundant. -# check 'combining_two_df.R' to refer if required. - -########################### !!! 
only for mcsm_lig -# 4: Filter/subset data -# Lig plots < 10Ang -# Filter the lig plots for Dis_to_lig < 10Ang -########################### - -# check range of distances -max(mcsm_data$Dis_lig_Ang) -min(mcsm_data$Dis_lig_Ang) - -# count -table(mcsm_data$Dis_lig_Ang<10) - -# subset data to have only values less than 10 Ang -mcsm_data2 = subset(mcsm_data, mcsm_data$Dis_lig_Ang < 10) - -# sanity checks -max(mcsm_data2$Dis_lig_Ang) -min(mcsm_data2$Dis_lig_Ang) - -# count no of unique positions -length(unique(mcsm_data2$Position)) - -# count no of unique mutations -length(unique(mcsm_data2$Mutationinformation)) - -# count Destabilisinga and stabilising -table(mcsm_data2$Lig_outcome) #{RESULT: no of mutations within 10Ang} - -############################# -# Extra sanity check: -# for mcsm_lig ONLY -# Dis_lig_Ang should be <10 -############################# - -if (max(mcsm_data2$Dis_lig_Ang) < 10){ - print ("Sanity check passed: lig data is <10Ang") -}else{ - print ("Error: data should be filtered to be within 10Ang") -} - -#!!!!!!!!!!!!!!!!!!!!! -# REASSIGNMENT: so as not to alter the script -mcsm_data = mcsm_data2 -#!!!!!!!!!!!!!!!!!!!!! -# clear variables -rm(mcsm_data2) - -# count na in each column -na_count = sapply(mcsm_data, function(y) sum(length(which(is.na(y))))); na_count - -# sort by Mutationinformation -mcsm_data = mcsm_data[order(mcsm_data$Mutationinformation),] -head(mcsm_data$Mutationinformation) - -orig_col = ncol(mcsm_data) -# get freq count of positions and add to the df -setDT(mcsm_data)[, occurrence := .N, by = .(Position)] - -cat('Added 1 col: position frequency to see which posn has how many muts' - , '\nNo. of cols now', ncol(mcsm_data) - , '\nNo. of cols before: ', orig_col) - -pos_count_check = data.frame(mcsm_data$Position, mcsm_data$occurrence) - -########################### -# 2: Read file: meta data with AFandOR -########################### -cat('Reading combined meta data and AFandOR file:' - , '\nindir: ', indir - , '\ninfile_comb: ', in_filename_comb) - -meta_with_afor <- read.csv(infile_comb - , stringsAsFactors = F - , header = T) - -cat('Read mcsm_data file:' - , '\nNo.of rows: ', nrow(meta_with_afor) - , '\nNo. of cols:', ncol(meta_with_afor)) - -# clear variables -rm(in_filename_comb, infile_comb) - -str(meta_with_afor) - -# sort by Mutationinformation -head(meta_with_afor$Mutationinformation) -meta_with_afor = meta_with_afor[order(meta_with_afor$Mutationinformation),] -head(meta_with_afor$Mutationinformation) - -########################### -# 3: merging two dfs: with NA -########################### -# link col name = 'Mutationinforamtion' -cat('Merging dfs with NAs: big df (1-many relationship b/w id & mut)' - ,'\nlinking col: Mutationinforamtion' - ,'\nfilename: merged_df2') - -head(mcsm_data$Mutationinformation) -head(meta_with_afor$Mutationinformation) - -######### -# merge 3a: meta data with mcsm -######### -merged_df2 = merge(x = meta_with_afor - ,y = mcsm_data - , by = 'Mutationinformation' - , all.y = T) - -cat('Dim of merged_df2: ' - , '\nNo. of rows: ', nrow(merged_df2) - , '\nNo. of cols: ', ncol(merged_df2)) -head(merged_df2$Position) - -if(nrow(meta_with_afor) == nrow(merged_df2)){ - cat('nrow(merged_df2) = nrow (gene associated metadata)' - ,'\nExpected no. of rows: ',nrow(meta_with_afor) - ,'\nGot no. of rows: ', nrow(merged_df2)) -} else{ - cat('nrow(merged_df2)!= nrow(gene associated metadata)' - , '\nExpected no. of rows after merge: ', nrow(meta_with_afor) - , '\nGot no. 
of rows: ', nrow(merged_df2) - , '\nFinding discrepancy') - merged_muts_u = unique(merged_df2$Mutationinformation) - meta_muts_u = unique(meta_with_afor$Mutationinformation) - # find the index where it differs - unique(meta_muts_u[! meta_muts_u %in% merged_muts_u]) -} - -# sort by Position -head(merged_df2$Position) -merged_df2 = merged_df2[order(merged_df2$Position),] -head(merged_df2$Position) - -merged_df2v2 = merge(x = meta_with_afor - ,y = mcsm_data - , by = 'Mutationinformation' - , all.x = T) -#!=!=!=!=!=!=!=! -# COMMENT: used all.y since position 186 is not part of the struc, -# hence doesn't have a mcsm value -# but 186 is associated with mutation -#!=!=!=!=!=!=!=! - -# should be False -identical(merged_df2, merged_df2v2) -table(merged_df2$Position%in%merged_df2v2$Position) - -rm(merged_df2v2) - -######### -# merge 3b:remove duplicate mutation information -######### -cat('Merging dfs with NAs: small df (removing duplicate muts)' - ,'\nCannot trust lineage info from this' - ,'\nlinking col: Mutationinforamtion' - ,'\nfilename: merged_df3') - -#==#=#=#=#=#=# -# Cannot trust lineage, country from this df as the same mutation -# can have many different lineages -# but this should be good for the numerical corr plots -#=#=#=#=#=#=#= -merged_df3 = merged_df2[!duplicated(merged_df2$Mutationinformation),] -head(merged_df3$Position); tail(merged_df3$Position) # should be sorted - -# sanity checks -# nrows of merged_df3 should be the same as the nrows of mcsm_data -if(nrow(mcsm_data) == nrow(merged_df3)){ - cat('PASS: No. of rows match with mcsm_data' - ,'\nExpected no. of rows: ', nrow(mcsm_data) - ,'\nGot no. of rows: ', nrow(merged_df3)) -} else { - cat('FAIL: No. of rows mismatch' - , '\nNo. of rows mcsm_data: ', nrow(mcsm_data) - , '\nNo. of rows merged_df3: ', nrow(merged_df3)) -} - -########################### -# 4: merging two dfs: without NA -########################### -cat('Merging dfs without any NAs: big df (1-many relationship b/w id & mut)' - ,'\nlinking col: Mutationinforamtion' - ,'\nfilename: merged_df2_comp') - -######### -# merge 4a: same as merge 1 but excluding NA -######### -merged_df2_comp = merged_df2[!is.na(merged_df2$AF),] -#merged_df2_comp = merged_df2[!duplicated(merged_df2$Mutationinformation),] - -cat('Dim of merged_df2_comp: ' - , '\nNo. of rows: ', nrow(merged_df2_comp) - , '\nNo. of cols: ', ncol(merged_df2_comp)) - -######### -# merge 4b: remove duplicate mutation information -######### -merged_df3_comp = merged_df2_comp[!duplicated(merged_df2_comp$Mutationinformation),] - -cat('Dim of merged_df3_comp: ' - , '\nNo. of rows: ', nrow(merged_df3_comp) - , '\nNo. 
of cols: ', ncol(merged_df3_comp)) - -# alternate way of deriving merged_df3_comp -foo = merged_df3[!is.na(merged_df3$AF),] -# compare dfs: foo and merged_df3_com -all.equal(foo, merged_df3) - -summary(comparedf(foo, merged_df3)) - -#=============== end of combining df -#********************* -# write_output files -# Not required as this is a subset of the combining_two_df.R -#************************* -# clear variables -rm(mcsm_data, meta_with_afor, foo, drug, gene, gene_match, indir, merged_muts_u, meta_muts_u, na_count, orig_col, outdir) -rm(pos_count_check) -#============================= end of script - diff --git a/mcsm_analysis/pyrazinamide/scripts/generate_mut_sequences.py b/mcsm_analysis/pyrazinamide/scripts/generate_mut_sequences.py deleted file mode 100755 index 5cc5f09..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/generate_mut_sequences.py +++ /dev/null @@ -1,215 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Tue Jun 25 08:46:36 2019 - -@author: tanushree -""" -############################################ -# load libraries -import os -import pandas as pd -import numpy as np -from Bio import SeqIO -############################################ -#******************************************************************** -# TASK: Read in fasta files and create mutant sequences akin to a MSA, -# to allow generation of logo plots - -# Requirements: -# input: Fasta file of protein/target for which mut seqs will be created - # path: "Data//input/original/" -# output: MSA for mutant sequences - # path: "Data//input/processed/" -#*********************************************************************** -#%% -# specify input and output variables -homedir = os.path.expanduser('~') -#======= -# input -#======= -############# -# fasta file -############# -indir = 'git/Data/pyrazinamide/input/original' -in_filename_fasta = "3pl1.fasta.txt" -infile_fasta = homedir + '/' + indir + '/' + in_filename_fasta -print(infile_fasta) - -############# -# meta data -############# -# FIXME when you change the dir struc -inpath_p = "git/Data/pyrazinamide/input/processed" -in_filename_meta_data = "meta_data_with_AFandOR.csv" -infile_meta_data = homedir + '/' + inpath_p + '/' + in_filename_meta_data -print("Input file is:", infile_meta_data) - -#======= -# output -#======= -outdir = 'git/Data/pyrazinamide/output' -# filenames in respective sections - -################## end of variable assignment for input and output files -#%% -#========== -# read files -#========== - -############# -# fasta file -############# -my_fasta_o = str() -for seq_record in SeqIO.parse(infile_fasta, "fasta"): - my_seq = seq_record.seq - my_fasta_o = str(my_seq) #convert to a string - print(my_fasta_o) - print(len(my_fasta_o)) -# print( type(my_fasta) ) - -# remove non_struc positions from fasta -def remove_char(str, n): - first_part = str[:n] - last_part = str[n+1:] - return first_part + last_part -#print(remove_char('Python', 0)) - -ns_pos_o = 186 -offset = 1 # 0 based indexing -ns_pos = ns_pos_o - offset -my_fasta = remove_char(my_fasta_o, ns_pos) -print("orig length:", len(my_fasta_o)) -print("new length:", len(my_fasta)) - -############# -# SNP info and no of MSA to generate -############# -# read mutant_info file and extract cols with positions and mutant_info -# This should be all samples with pncA muts -#my_data = pd.read_csv('mcsm_complex1_normalised.csv') -my_data = pd.read_csv(infile_meta_data) -list(my_data.columns) -#my_data['OR'].value_counts() -#my_data['OR'].isna().sum() - -#FIXME: You need a 
better way to identify this -# ideally this file should not contain any non_struc pos -# remove positions not in the structure -my_data = my_data[my_data.position != ns_pos_o] - -# if multiple positions, then try the example below; -# https://stackoverflow.com/questions/29017525/deleting-rows-based-on-multiple-conditions-python-pandas -#df = df[(df.one > 0) | (df.two > 0) | (df.three > 0) & (df.four < 1)] - -# count mutations per sample -mut_info = my_data[['id', 'Mutationinformation', 'wild_type', 'position', 'mutant_type']] - -# test -foo = mut_info[mut_info.Mutationinformation.str.contains('C72Y')] - -foo = mut_info.pivot_table(values = ['Mutationinformation'] - , index = ['Mutationinformation', 'id'] -# , columns = - , aggfunc = 'count') - -# table -foo_tab = mut_info.pivot_table(values = ['Mutationinformation'] -# , index = ['Mutationinformation'] - , columns = ['id', 'Mutationinformation'] - , aggfunc = 'count' -# , margins = True) - ) -foo_tab.stack('id') - -mut_info.to_csv('mutinfo.csv') - -mut_info1 = my_data[['position', 'mutant_type']] -#%% -################ -# data cleaning -################ -# extract only those positions that have a frequency count of pos>1 -###mut_info['freq_pos'] = mut_info.groupby('Position').count()#### dodgy - -# add a column of frequency for each position -#mut_info1['freq_pos'] = mut_info1.groupby('position')['position'].transform('count') -mut_info1['freq_pos'] = mut_info1.position.map(mut_info1.position.value_counts()) - -# sort by position -mut_info2 = mut_info1.sort_values(by=['position']) - -# count how many pos have freq 1 as you will need to exclude those -mutfreq1_count = mut_info2[mut_info2.freq_pos == 1].sum().freq_pos - -# extract entries with freq_pos>1 -# should be 3093-211 = 3072 -mut_info3 = mut_info2.loc[mut_info2['freq_pos'] >1] #3072 -print("orig length:", len(mut_info1)) -print("No. of excluded values:", mutfreq1_count) -print("new length:", len(mut_info3)) -# sanity check -if ( (len(mut_info1) - mutfreq1_count) == len(mut_info3) ): - print("Sanity check passed: Filtered data correctly") -else: - print("Error: Debug you code") - -# reset index to allow iteration !!!!!!!!!! 
IMPORTANT -mut_info = mut_info3.reset_index(drop = True) - -##del(mut_info1, mut_info2, mut_info3, my_data) - -################### -# generate mut seqs -################### -mut_seqsL = [] * len(mut_info) - -# iterate -for i, pos in enumerate(mut_info['position']): - my_fastaL = list(my_fasta) - mut = mut_info['mutant_type'][i] - offset_pos = pos-1 - - print('1-index:', pos, '0-index cur:', offset_pos, my_fastaL[offset_pos], 'mut:', mut) - my_fastaL[offset_pos] = mut - print('1-index:', pos, '0-index new:', offset_pos, my_fastaL[offset_pos], 'mut:', mut) - - mut_seq = "".join(my_fastaL) -# print(mut_seq + '\n') - print('original:', my_fasta, ',', 'replaced:', my_fasta[offset_pos], 'at', pos, 'with', mut, mut_seq) - mut_seqsL.append(mut_seq) - - -############### -# sanity check -################ -len_orig = len(my_fasta) -# checking if all the mutant sequences have the same length as the original fasta file sequence -for seqs in mut_seqsL: -# print(seqs) -# print(len(seqs)) - if len(seqs) != len_orig: - print('sequence lengths mismatch' +'\n', 'mutant seq length:', len(seqs), 'vs original seq length:', len_orig) - else: - print('**Hooray** Length of mutant and original sequences match') - -del(i, len_orig, mut, mut_seq, my_fastaL, offset_pos, pos, seqs) - -############ -# write file -############ -#filepath = homedir +'/git/LSHTM_Y1_PNCA/combined_v3/logo_plot/snp_seqsfile' -#filepath = homedir + '/git/LSHTM_Y1_PNCA/mcsm_analysis/pyrazinamide/Data/gene_msa.txt' -print(outdir) -out_filename = "gene_msa.txt" -outfile_gene = homedir + '/' + outdir + '/' + out_filename -print(outfile_gene) - -with open(outfile_gene, 'w') as file_handler: - for item in mut_seqsL: - file_handler.write("{}\n".format(item)) - -#R = "\n".join(mut_seqsL) -#f = open('Columns.csv','w') -#f.write(R) -#f.close() diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/run.sh b/mcsm_analysis/pyrazinamide/scripts/mcsm/run.sh deleted file mode 100755 index 7e00fb1..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/mcsm/run.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash - -# run all bash scripts for mcsm - -#./step0_check_duplicate_SNPs.sh -#./step1_lig_output_urls.sh -./step2_lig_results.sh -./step3a_results_format_interim.sh - diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step0_check_duplicate_SNPs.sh b/mcsm_analysis/pyrazinamide/scripts/mcsm/step0_check_duplicate_SNPs.sh deleted file mode 100755 index 4c24392..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step0_check_duplicate_SNPs.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -#************************************* -# need to be in the correct directory -#************************************* -##: comments for code -#: commented out code - -#********************************************************************** -# TASK: Text file containing a list of SNPs; SNP in the format(C2E) -# per line. Sort by unique, which automatically removes duplicates. -# sace file in current directory -#********************************************************************** -infile="${HOME}/git/Data/pyrazinamide/input/processed/pnca_mis_SNPs_v2.csv" -outfile="${HOME}/git/Data/pyrazinamide/input/processed/pnca_mis_SNPs_v2_unique.csv" - -# sort unique entries and output to current directory -sort -u ${infile} > ${outfile} - -# count no. of unique snps mCSM will run on -count=$(wc -l < ${outfile}) - -# print to console no. 
of unique snps mCSM will run on -echo "${count} unique mutations for mCSM to run on" - diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step1_lig_output_urls.sh b/mcsm_analysis/pyrazinamide/scripts/mcsm/step1_lig_output_urls.sh deleted file mode 100755 index 6361b62..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step1_lig_output_urls.sh +++ /dev/null @@ -1,104 +0,0 @@ -#!/bin/bash - -#********************************************************************** -# TASK: submit requests using curl: HANDLE redirects and refresh url. -# Iterate over mutation file and write/append result urls to a file -# Mutation file must have one mutation (format A1B) per line -# Requirements -# input: mutation list (format: A1B), complex struc: (pdb format) - # mutation: outFile from step0, one unique mutation/line, no chain ID - # path: "Data//input/processed/" - # structure: pdb file of drug-target complex - # path: "Data//input/structure/" -# output: should be n urls (n=no. of unique mutations in file) - # path: "Data//input/processed/" - -# NOTE: these are just result urls, not actual values for results -#********************************************************************** -############# specify variables for input and output paths and filenames -homedir="${HOME}" -#echo Home directory is ${homedir} -basedir="/git/Data/pyrazinamide/input" - -# input -inpath_mut="/processed" -in_filename_mut="/pnca_mis_SNPs_v2_unique.csv" -infile_mut="${homedir}${basedir}${inpath_mut}${in_filename_mut}" -echo Input Mut filename: ${infile_mut} - -inpath_struc="/structure" -in_filename_struc="/complex1_no_water.pdb" -infile_struc="${homedir}${basedir}${inpath_struc}${in_filename_struc}" -echo Input Struc filename: ${infile_struc} - -# output -outpath="/processed" -out_filename="/complex1_result_url.txt" -outfile="${homedir}${basedir}${outpath}${out_filename}" -#echo Output filename: ${outfile} -################## end of variable assignment for input and output files - -# iterate over mutation file (infile_mut); line by line and -# submit query using curl -# some useful messages -echo -n -e "Processing $(wc -l < ${infile_mut}) entries from ${infile_mut}\n" -COUNT=0 -while read -r line; do -((COUNT++)) -mutation="${line}" -# echo "${mutation}" -#pdb='../Data/complex1_no_water.pdb' -pdb="${infile_struc}" -mutation="${mutation}" -chain="A" -lig_id="PZA" -affin_wt="0.99" -host="http://biosig.unimelb.edu.au" -call_url="/mcsm_lig/prediction" - -#========================================= -##html field_names names required for curl -##complex_field:wild=@ -##mutation_field:mutation=@ -##chain_field:chain=@ -##ligand_field:lig_id@ -##energy_field:affin_wt -#========================================= -refresh_url=$(curl -L \ - -sS \ - -F "wild=@${pdb}" \ - -F "mutation=${mutation}" \ - -F "chain=${chain}" \ - -F "lig_id=${lig_id}" \ - -F "affin_wt=${affin_wt}" \ - ${host}${call_url} | grep "http-equiv") - -#echo Refresh URL: $refresh_url -#echo Host+Refresh: ${host}${refresh_url} - -# use regex to extract the relevant bit from the refresh url -# regex:sed -r 's/.*(\/mcsm.*)".*$/\1/g' - -# Now build: result url using host and refresh url and write the urls to a file -result_url=$(echo $refresh_url | sed -r 's/.*(\/mcsm.*)".*$/\1/g') -sleep 10 - -echo -e "${mutation} : processing entry ${COUNT}/$(wc -l < ${infile_mut})..." - -# create output file with the added number of muts from file -# after much thought, bad idea as less generic! 
-#echo -e "${host}${result_url}" >> ../Results/$(wc -l < ${filename})_complex1_result_url.txt -echo -e "${host}${result_url}" >> ${outfile} -#echo -n '.' -done < "${infile_mut}" - -#FIXME: stop executing if error else these echo statements are misleading! -echo -echo Output filename: ${outfile} -echo -echo Number of urls saved: $(wc -l < ${infile_mut}) -echo -echo "Processing Complete" - -# end of submitting query, receiving result url and storing results url in a file - diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_lig_results.sh b/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_lig_results.sh deleted file mode 100755 index 51a7844..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_lig_results.sh +++ /dev/null @@ -1,76 +0,0 @@ -#!/bin/bash - -#******************************************************************** -# TASK: submit result urls and fetch actual results using curl -# Iterate over each result url from the output of step1 stored in processed/ -# Use curl to fetch results and extract relevant sections using hxtools -# and store these in another file in processed/ - -# Requirements: -# input: output of step1, file containing result urls - # path: "Data//input/processed/" -# output: name of the file where extracted results will be stored - # path: "Data//input/processed/" - -# Optional: can make these command line args you pass when calling script -# by uncommenting code as indicated -#********************************************************************* -############################# uncomment: to make it command line args -#if [ "$#" -ne 2 ]; then - #if [ -Z $1 ]; then -# echo " -# Please provide both Input and Output files. - -# Usage: batch_read_urls.sh INFILE OUTFILE -# " -# exit 1 -#fi - -# First argument: Input File -# Second argument: Output File -#infile=$1 -#outfile=$2 -############################ end of code block to make command line args - -############# specify variables for input and output paths and filenames -homedir="${HOME}" -#echo Home directory is ${homedir} -basedir="/git/Data/pyrazinamide/input" - -# input -inpath="/processed" -in_filename="/complex1_result_url.txt" -infile="${homedir}${basedir}${inpath}${in_filename}" -echo Input Mut filename: ${infile} - -# output -outpath="/processed" -out_filename="/complex1_output_MASTER.txt" -outfile="${homedir}${basedir}${outpath}${out_filename}" -echo Output filename: ${outfile} -################## end of variable assignment for input and output files - -# Iterate over each result url, and extract results using hxtools -# which nicely cleans and formats html -echo -n "Processing $(wc -l < ${infile}) entries from ${infile}" -echo -COUNT=0 -while read -r line; do -#COUNT=$(($COUNT+1)) -((COUNT++)) - curl --silent ${line} \ - | hxnormalize -x \ - | hxselect -c div.span4 \ - | hxselect -c div.well \ - | sed -r -e 's/<[^>]*>//g' \ - | sed -re 's/ +//g' \ - >> ${outfile} - #| tee -a ${outfile} -# echo -n '.' -echo -e "Processing entry ${COUNT}/$(wc -l < ${infile})..." 
- -done < "${infile}" - -echo -echo "Processing Complete" - diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_results_format_interim.sh b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_results_format_interim.sh deleted file mode 100755 index 0861996..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_results_format_interim.sh +++ /dev/null @@ -1,74 +0,0 @@ -#!/bin/bash - -#******************************************************************** -# TASK: Intermediate results processing -# output file has a convenient delimiter of ":" that can be used to -# format the file into two columns (col1: field_desc and col2: values) -# However the section "PredictedAffinityChange:...." and -# "DUETstabilitychange:.." are split over multiple lines and -# prevent this from happening. Additionally there are other empty lines -# that need to be omiited. In order ensure these sections are not split -# over multiple lines, this script is written. - -# Requirements: -# input: output of step2, file containing mcsm results as described above - # path: "Data//input/processed/" -# output: replaces file in place. -# Therefore first create a copy of the input file -# but rename it to remove the word "MASTER" and add the word "processed" -# file format: .txt - -# NOTE: This replaces the file in place! -# the output is a txt file with no newlines and formatting -# to have the following format "<:> -#*********************************************************************** -############# specify variables for input and output paths and filenames -homedir="${HOME}" -basedir="/git/Data/pyrazinamide/input" - -inpath="/processed" - -# Create input file: copy and rename output file of step2 -oldfile="${homedir}${basedir}${inpath}/complex1_output_MASTER.txt" -newfile="${homedir}${basedir}${inpath}/complex1_output_processed.txt" -cp $oldfile $newfile - -echo Input filename is ${oldfile} -echo -echo Output i.e copied filename is ${newfile} - -# output: No output perse -# Replacement in place inside the copied file -################## end of variable assignment for input and output files - -#sed -i '/PredictedAffinityChange:/ { N; N; N; N; s/\n//g;}' ${newfile} \ -# | sed -i '/DUETstabilitychange:/ {x; N; N; s/\n//g; p;d;}' ${newfile} - -# Outputs records separated by a newline, that look something like this: -# PredictedAffinityChange:-2.2log(affinityfoldchange)-Destabilizing -# Mutationinformation: -# Wild-type:L -# Position:4 -# Mutant-type:W -# Chain:A -# LigandID:PZA -# Distancetoligand:15.911Å -# DUETstabilitychange:-2.169Kcal/mol -# -# PredictedAffinityChange:-1.538log(affinityfoldchange)-Destabilizing -# (...etc) - -# This script brings everything in a convenient format for further processing in python. 
-sed -i '/PredictedAffinityChange/ { -N -N -N -N -s/\n//g -} -/DUETstabilitychange:/ { -N -N -s/\n//g -} -/^$/d' ${newfile} diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_results_format_df.py b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_results_format_df.py deleted file mode 100755 index 0e07c0d..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_results_format_df.py +++ /dev/null @@ -1,63 +0,0 @@ -#!/usr/bin/python - -################### -# load libraries -import os, sys -import pandas as pd -from collections import defaultdict -#################### - -#******************************************************************** -# TASK: Formatting results with nice colnames -# step3a processed the mcsm results to remove all newlines and -# brought data in a format where the delimiter ":" splits -# data into a convenient format of "colname": "value". -# this script formats the data and outputs a df with each row -# as a mutation and its corresponding mcsm_values - -# Requirements: -# input: output of step3a, file containing "..._output_processed.txt" - # path: "Data//input/processed/" -# output: formatted .csv file - # path: "Data//input/processed/" -#*********************************************************************** -############# specify variables for input and output paths and filenames -homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde -basedir = "/git/Data/pyrazinamide/input" - -# input -inpath = "/processed" -in_filename = "/complex1_output_processed.txt" -infile = homedir + basedir + inpath + in_filename -print("Input file is:", infile) - -# output -outpath = "/processed" -out_filename = "/complex1_formatted_results.csv" -outfile = homedir + basedir + outpath + out_filename -print("Output file is:", outfile) -################## end of variable assignment for input and output files - -outCols=[ - 'PredictedAffinityChange', - 'Mutationinformation', - 'Wild-type', - 'Position', - 'Mutant-type', - 'Chain', - 'LigandID', - 'Distancetoligand', - 'DUETstabilitychange' - ] - -lines = [line.rstrip('\n') for line in open(infile)] - -outputs = defaultdict(list) - -for item in lines: - col, val = item.split(':') - outputs[col].append(val) - -dfOut=pd.DataFrame(outputs) - -pd.DataFrame.to_csv(dfOut, outfile, columns=outCols) diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_results_cleaning.R b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_results_cleaning.R deleted file mode 100644 index c58dc8b..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_results_cleaning.R +++ /dev/null @@ -1,230 +0,0 @@ -getwd() -#setwd("~/git/LSHTM_analysis/mcsm_complex1/Results") -getwd() - -#======================================================= -# TASK: read formatted_results_df.csv to complete -# missing info, adding DUET categories, assigning -# meaningful colnames, etc. 
- -# Requirements: -# input: output of step3b, python processing, - # path: Data//input/processed/" -# output: NO output as the next scripts refers to this -# for yet more processing -#======================================================= - -# specify variables for input and output paths and filenames -homedir = "~" -basedir = "/git/Data/pyrazinamide/input" -inpath = "/processed" -in_filename = "/complex1_formatted_results.csv" -infile = paste0(homedir, basedir, inpath, in_filename) -print(paste0("Input file is:", infile)) - -#====================================================== -#TASK: To tidy the columns so you can generate figures -#======================================================= -#################### -#### read file #####: this will be the output from python script (csv file) -#################### -data = read.csv(infile - , header = T - , stringsAsFactors = FALSE) -dim(data) -str(data) - -# clear variables -rm(homedir, basedir, inpath, in_filename, infile) - -########################### -##### Data processing ##### -########################### - -# populate mutation information columns as currently it is empty -head(data$Mutationinformation) -tail(data$Mutationinformation) - -# should not be blank: create muation information -data$Mutationinformation = paste0(data$Wild.type, data$Position, data$Mutant.type) - -head(data$Mutationinformation) -tail(data$Mutationinformation) -#write.csv(data, 'test.csv') - -########################################## -# Remove duplicate SNPs as a sanity check -########################################## -# very important -table(duplicated(data$Mutationinformation)) - -# extract duplicated entries -dups = data[duplicated(data$Mutationinformation),] #0 - -# No of dups should match with the no. of TRUE in the above table -#u_dups = unique(dups$Mutationinformation) #10 -sum( table(dups$Mutationinformation) ) - -#*************************************************************** -# select non-duplicated SNPs and create a new df -df = data[!duplicated(data$Mutationinformation),] -#*************************************************************** -# sanity check -u = unique(df$Mutationinformation) -u2 = unique(data$Mutationinformation) -table(u%in%u2) - -# should all be 1 -sum(table(df$Mutationinformation) == 1) - -# sort df by Position -# MANUAL CHECKPOINT: -#foo <- df[order(df$Position),] -#df <- df[order(df$Position),] - -# clear variables -rm(u, u2, dups) - -#################### -#### give meaningful colnames to reflect units to enable correct data type -#################### - -#======= -#STEP 1 -#======== -# make a copy of the PredictedAffinityColumn and call it Lig_outcome -df$Lig_outcome = df$PredictedAffinityChange - - #make Predicted...column numeric and outcome column categorical -head(df$PredictedAffinityChange) -df$PredictedAffinityChange = gsub("log.*" - , "" - , df$PredictedAffinityChange) - -# sanity checks -head(df$PredictedAffinityChange) - -# should be numeric, check and if not make it numeric -is.numeric( df$PredictedAffinityChange ) - -# change to numeric -df$PredictedAffinityChange = as.numeric(df$PredictedAffinityChange) - -# should be TRUE -is.numeric( df$PredictedAffinityChange ) - -# change the column name to indicate units -n = which(colnames(df) == "PredictedAffinityChange"); n -colnames(df)[n] = "PredAffLog" -colnames(df)[n] - -#======== -#STEP 2 -#======== -# make Lig_outcome column categorical showing effect of mutation -head(df$Lig_outcome) -df$Lig_outcome = gsub("^.*-" - , "", - df$Lig_outcome) -# sanity checks 
-head(df$Lig_outcome) - -# should be factor, check and if not change it to factor -is.factor(df$Lig_outcome) - -# change to factor -df$Lig_outcome = as.factor(df$Lig_outcome) - -# should be TRUE -is.factor(df$Lig_outcome) - -#======== -#STEP 3 -#======== -# gsub -head(df$Distancetoligand) -df$Distancetoligand = gsub("Å" - , "" - , df$Distancetoligand) -# sanity checks -head(df$Distancetoligand) - -# should be numeric, check if not change it to numeric -is.numeric(df$Distancetoligand) - -# change to numeric -df$Distancetoligand = as.numeric(df$Distancetoligand) - -# should be TRUE -is.numeric(df$Distancetoligand) - -# change the column name to indicate units -n = which(colnames(df) == "Distancetoligand") -colnames(df)[n] <- "Dis_lig_Ang" -colnames(df)[n] - -#======== -#STEP 4 -#======== -#gsub -head(df$DUETstabilitychange) -df$DUETstabilitychange = gsub("Kcal/mol" - , "" - , df$DUETstabilitychange) -# sanity checks -head(df$DUETstabilitychange) - -# should be numeric, check if not change it to numeric -is.numeric(df$DUETstabilitychange) - -# change to numeric -df$DUETstabilitychange = as.numeric(df$DUETstabilitychange) - -# should be TRUE -is.numeric(df$DUETstabilitychange) - -# change the column name to indicate units -n = which(colnames(df) == "DUETstabilitychange"); n -colnames(df)[n] = "DUETStability_Kcalpermol" -colnames(df)[n] - -#======== -#STEP 5 -#======== -# create yet another extra column: classification of DUET stability only -df$DUET_outcome = ifelse(df$DUETStability_Kcalpermol >=0 - , "Stabilizing" - , "Destabilizing") # spelling to be consistent with mcsm - -table(df$Lig_outcome) - -table(df$DUET_outcome) - -#============================== -#FIXME -#Insert a venn diagram -#================================ - -#======== -#STEP 6 -#======== -# assign wild and mutant colnames correctly - -wt = which(colnames(df) == "Wild.type"); wt -colnames(df)[wt] <- "Wild_type" -colnames(df[wt]) - -mut = which(colnames(df) == "Mutant.type"); mut -colnames(df)[mut] <- "Mutant_type" -colnames(df[mut]) - -#======== -#STEP 7 -#======== -# create an extra column: maybe useful for some plots -df$WildPos = paste0(df$Wild_type, df$Position) - -# clear variables -rm(n, wt, mut) - -################ end of data cleaning diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step4_results_normalise.R b/mcsm_analysis/pyrazinamide/scripts/mcsm/step4_results_normalise.R deleted file mode 100644 index eb24cab..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step4_results_normalise.R +++ /dev/null @@ -1,275 +0,0 @@ -################## -# load libraries - library(compare) -################## - -getwd() - -#======================================================= -# TASK:read cleaned data and perform rescaling - # of DUET stability scores - # of Pred affinity -# compare scaling methods with plots - -# Requirements: -# input: R script, step3c_results_cleaning.R - # path: Data//input/processed/" -# output: NO output as the next scripts refers to this -# for yet more processing -# output normalised file -#======================================================= - -# specify variables for input and output paths and filenames -homedir = "~" -currdir = "/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/mcsm" -in_filename = "/step3c_results_cleaning.R" -infile = paste0(homedir, currdir, in_filename) -print(paste0("Input file is:", infile)) - -# output file -basedir = "/git/Data/pyrazinamide/input" -outpath = "/processed" -out_filename = "/mcsm_complex1_normalised.csv" -outfile = paste0(homedir, basedir, outpath, 
out_filename) -print(paste0("Output file is:", outfile)) - -#################### -#### read file #####: this will be the output of my R script that cleans the data columns -#################### -source(infile) - -#This will outut two dataframes: -# data: unclean data: 10 cols -# df : cleaned df: 13 cols -# you can remove data if you want as you will not need it -rm(data) - -colnames(df) - -#=================== -#3a: PredAffLog -#=================== -n = which(colnames(df) == "PredAffLog"); n -group = which(colnames(df) == "Lig_outcome"); group - -#=================================================== -# order according to PredAffLog values -#=================================================== -# This is because this makes it easier to see the results of rescaling for debugging -head(df$PredAffLog) - -# ORDER BY PredAff scrores: negative values at the top and positive at the bottoom -df = df[order(df$PredAffLog),] -head(df$PredAffLog) - -# sanity checks -head(df[,n]) # all negatives -tail(df[,n]) # all positives - -# sanity checks -mean(df[,n]) -#-0.9526746 - -tapply(df[,n], df[,group], mean) - -#=========================== -# Same as above: in 2 steps -#=========================== - -# find range of your data -my_min = min(df[,n]); my_min # -my_max = max(df[,n]); my_max # - -#=============================================== -# WITHIN GROUP rescaling 2: method "ratio" -# create column to store the rescaled values -# Rescaling separately (Less dangerous) -# =====> chosen one: preserves sign -#=============================================== -df$ratioPredAff = ifelse(df[,n] < 0 - , df[,n]/abs(my_min) - , df[,n]/my_max - )# 14 cols -# sanity checks -head(df$ratioPredAff) -tail(df$ratioPredAff) - -min(df$ratioPredAff); max(df$ratioPredAff) - -tapply(df$ratioPredAff, df$Lig_outcome, min) - -tapply(df$ratioPredAff, df$Lig_outcome, max) - -# should be the same as below -sum(df$ratioPredAff < 0); sum(df$ratioPredAff > 0) - -table(df$Lig_outcome) - -#=============================================== -# Hist and density plots to compare the rescaling -# methods: Base R -#=============================================== -# uncomment as necessary -my_title = "Ligand_stability" -# my_title = colnames(df[n]) - -# Set the margin on all sides -par(oma = c(3,2,3,0) - , mar = c(1,3,5,2) - , mfrow = c(2,2)) - -hist(df[,n] - , xlab = "" - , main = "Raw values" -) - -hist(df$ratioPredAff - , xlab = "" - , main = "ratio rescaling" -) - -# Plot density plots underneath -plot(density( df[,n] ) - , main = "Raw values" -) - -plot(density( df$ratioPredAff ) - , main = "ratio rescaling" -) - -# titles -mtext(text = "Frequency" - , side = 2 - , line = 0 - , outer = TRUE) - -mtext(text = my_title - , side = 3 - , line = 0 - , outer = TRUE) - - -#clear variables -rm(my_min, my_max, my_title, n, group) - -#=================== -# 3b: DUET stability -#=================== -dim(df) # 14 cols - -n = which(colnames(df) == "DUETStability_Kcalpermol"); n # 10 -group = which(colnames(df) == "DUET_outcome"); group #12 - -#=================================================== -# order according to DUET scores -#=================================================== -# This is because this makes it easier to see the results of rescaling for debugging -head(df$DUETStability_Kcalpermol) - -# ORDER BY DUET scores: negative values at the top and positive at the bottom -df = df[order(df$DUETStability_Kcalpermol),] - -# sanity checks -head(df[,n]) # negatives -tail(df[,n]) # positives - -# sanity checks -mean(df[,n]) - -tapply(df[,n], df[,group], 
mean) - -#=============================================== -# WITHIN GROUP rescaling 2: method "ratio" -# create column to store the rescaled values -# Rescaling separately (Less dangerous) -# =====> chosen one: preserves sign -#=============================================== -# find range of your data -my_min = min(df[,n]); my_min -my_max = max(df[,n]); my_max - -df$ratioDUET = ifelse(df[,n] < 0 - , df[,n]/abs(my_min) - , df[,n]/my_max - ) # 15 cols -# sanity check -head(df$ratioDUET) -tail(df$ratioDUET) - -min(df$ratioDUET); max(df$ratioDUET) - -# sanity checks -tapply(df$ratioDUET, df$DUET_outcome, min) - -tapply(df$ratioDUET, df$DUET_outcome, max) - -# should be the same as below (267 and 42) -sum(df$ratioDUET < 0); sum(df$ratioDUET > 0) - -table(df$DUET_outcome) - -#=============================================== -# Hist and density plots to compare the rescaling -# methods: Base R -#=============================================== -# uncomment as necessary -my_title = "DUET_stability" -#my_title = colnames(df[n]) - -# Set the margin on all sides -par(oma = c(3,2,3,0) - , mar = c(1,3,5,2) - , mfrow = c(2,2)) - -hist(df[,n] - , xlab = "" - , main = "Raw values" -) - -hist(df$ratioDUET - , xlab = "" - , main = "ratio rescaling" -) - -# Plot density plots underneath -plot(density( df[,n] ) - , main = "Raw values" -) - -plot(density( df$ratioDUET ) - , main = "ratio rescaling" -) - -# graph titles -mtext(text = "Frequency" - , side = 2 - , line = 0 - , outer = TRUE) - -mtext(text = my_title - , side = 3 - , line = 0 - , outer = TRUE) - -# reorder by column name -#data <- data[c("A", "B", "C")] -colnames(df) -df2 = df[c("X", "Mutationinformation", "WildPos", "Position" - , "Wild_type", "Mutant_type" - , "DUETStability_Kcalpermol", "DUET_outcome" - , "Dis_lig_Ang", "PredAffLog", "Lig_outcome" - , "ratioDUET", "ratioPredAff" - , "LigandID","Chain")] - -# sanity check -# should be True -#compare(df, df2, allowAll = T) -compare(df, df2, ignoreColOrder = T) -#TRUE -#reordered columns - -#=================== -# write output as csv file -#=================== -#write.csv(df, "../Data/mcsm_complex1_normalised.csv", row.names = FALSE) -write.csv(df2, outfile, row.names = FALSE) diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm_mean_stability.R b/mcsm_analysis/pyrazinamide/scripts/mcsm_mean_stability.R deleted file mode 100644 index 877215a..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/mcsm_mean_stability.R +++ /dev/null @@ -1,131 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages # -######################################################################## - -source("../Header_TT.R") -#source("barplot_colour_function.R") -require(data.table) -require(dplyr) - -######################################################################## -# Read file: call script for combining df for PS # -######################################################################## - -source("../combining_two_df.R") - -########################### -# This will return: - -# df with NA: -# merged_df2 -# merged_df3 - -# df without NA: -# merged_df2_comp -# merged_df3_comp -########################### - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION - -########################### -# you need merged_df3 -# or -# merged_df3_comp -# 
since these have unique SNPs -# I prefer to use the merged_df3 -# because using the _comp dataset means -# we lose some muts and at this level, we should use -# as much info as available -########################### - -# uncomment as necessary -#%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -my_df = merged_df3 -#my_df = merged_df3_comp -#%%%%%%%%%%%%%%%%%%%%%%%% - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -########################### -# Data for bfactor figure -# PS average -# Lig average -########################### - -head(my_df$Position) -head(my_df$ratioDUET) - -# order data frame -df = my_df[order(my_df$Position),] - -head(df$Position) -head(df$ratioDUET) - -#*********** -# PS: average by position -#*********** - -mean_DUET_by_position <- df %>% - group_by(Position) %>% - summarize(averaged.DUET = mean(ratioDUET)) - -#*********** -# Lig: average by position -#*********** -mean_Lig_by_position <- df %>% - group_by(Position) %>% - summarize(averaged.Lig = mean(ratioPredAff)) - - -#*********** -# cbind:mean_DUET_by_position and mean_Lig_by_position -#*********** - -combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position )) - -# sanity check -# mean_PS_Lig_Bfactor - -colnames(combined) - -colnames(combined) = c("Position" - , "average_DUETR" - , "Position2" - , "average_PredAffR") - -colnames(combined) - -identical(combined$Position, combined$Position2) - -n = which(colnames(combined) == "Position2"); n - -combined_df = combined[,-n] - -max(combined_df$average_DUETR) ; min(combined_df$average_DUETR) - -max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR) - -#============= -# output csv -#============ -outDir = "~/git/Data/pyrazinamide/input/processed/" -outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv") -print(paste0("Output file with path will be:","", outFile)) - -head(combined_df$Position); tail(combined_df$Position) - -write.csv(combined_df, outFile - , row.names = F) diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/.RData b/mcsm_analysis/pyrazinamide/scripts/plotting/.RData deleted file mode 100644 index 9ebc62b0d6fbe54e858e3c9da53c9878c18035b4..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 43777 zcmV)QK(xOfiwFP!000001MNKrd=$mkbLovVdJnya^g{6x2~7fq5GkUDBe{^1G%gp4 z6)X1Md+!AmJ30{+yHXXf&{PmX5o}1wx3l|Z_wC&7l@!GPm;8P&H#0l)=Djy>`plcc zyqPHxGb0Eg0VFUehy(;m&Y%G4pM?aGaOpR!a!K)G+hSL(E|p-h1Y;#QTY{@47%#yq5}YnUssx=RxJiN}37(f=js$%q zXfHuG2~LyXR0-NjkS#%m1j8kGMgqpUOM+wx91={B;9d#3O0Z0VAt0msB{)ffyCf)+ z;5-S=WwMmuSqZEXFqsdMV2T6}Nl+od6%ve;;5i8%kYJ1i*GX`V1P@D4EWt$*luJ-3 zfm4Dw2}&i1mmpt)t0Z_rf)^!NBEhQ?+$cd031&!;BbON=k4SK(1Sd-nCBbM3Zjs;u z3DPB)F2M^D^phZ0f-@yZkl+*v9wba(2#JxPT7q5@L`v|I1d}D0C&A+qTx+09OKgiP zyy+6?9EvyCFkVz*p{=U4x-zlWZf7!(^d>-(iRES{->yMn8DDPiHs|xd3Lk#<{`cSM zch%J8H};7A{F<20v#0&Oe9{@&7yXm_{mLVYN5@&uUH4r3Gs5=`9Z+^(K;_I!zdbeY z{;vw_-gvuBz^K0m-TKhi)4#qm`|yWnTyb#X-i%q#{=MMR>#utF?eW789QbWa-jVQY zLZ7za4&|6*pJ{`JG&T;8j8_p~;vb}o2lR>}kWr~P=kv-jJmI(G$mQczos+A38%`-FWx*`x_3`MqIyS$BXx$Uo`xzv>BfSFP?mT%A4P> zdvnRh_0F91Ay@Xe{=!Mm-*$U(ZO*6Njs>2)@B3acLCa!OM<4lR;Jm(fc1w9-$dFUQ zx{SW?;%~lRzGG|cU-!IxBxpo_yYR~&>hj{SqwzaK{@VH34Lz23xOme=n|>TQ^TC@p z&j@+2eDM8$-`4hqg>mohJ@wrMZ%z8jl=f0;lOm{r|Z;KIpHsLyPYo`}t$B zfBpT=_;YTY)?vg+0d2?ZO6&h+-m97OS3H|JV(oj8Ba`=>(>L+7@>hy~P1tz-pLKmc zN?1|-*v^_WFT7~X>W}u%d};A@-wvgRUJu@Q^+#{~_Qn_Omaoi6Sl@8y$p=rHzVGyk zi#8>$df=4J-DCR4+`e)2$H%^3ob+1i3tjI1uIJ)Ir(`}g!T#4D3-7)0y%(m}58U}z 
-#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> end of section 1
-
-########################################################################
-# Read file: call script for combining df for lig #
-######################################################################## - -getwd() - -source("combining_two_df_lig.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION -#========================== -# This will return: - -# df with NA: -# merged_df2 -# merged_df3 - -# df without NA: -# merged_df2_comp -# merged_df3_comp -#=========================== - -########################### -# Data for OR and stability plots -# you need merged_df3_comp -# since these are matched -# to allow pairwise corr -########################### - -# uncomment as necessary -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -my_df2 = merged_df3_comp -#my_df2 = merged_df3 -#<<<<<<<<<<<<<<<<<<<<<<<<< - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df2) -str(my_df2) - -# sanity check -# Ensure correct data type in columns to plot: need to be factor -is.numeric(my_df2$OR) -#[1] TRUE - -# sanity check: should be <10 -if (max(my_df2$Dis_lig_Ang) < 10){ - print ("Sanity check passed: lig data is <10Ang") -}else{ - print ("Error: data should be filtered to be within 10Ang") -} - -#<<<<<<<<<<<<<<<< -# REASSIGNMENT -# FOR Lig Plots -#<<<<<<<<<<<<<<<< - -Lig_df = my_df2 - -rm(my_df2) - -#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> end of section 1 - -############# -# Plots: Bubble plot -# x = Position, Y = stability -# size of dots = OR -# col: stability -############# - -#================= -# generate plot 1: DUET vs OR by position as geom_points -#================= - -my_ats = 20 # axis text size -my_als = 22 # axis label size - -# Spelling Correction: made redundant as already corrected at the source - -#PS_df$DUET_outcome[PS_df$DUET_outcome=='Stabilizing'] <- 'Stabilising' -#PS_df$DUET_outcome[PS_df$DUET_outcome=='Destabilizing'] <- 'Destabilising' - -table(PS_df$DUET_outcome) ; sum(table(PS_df$DUET_outcome)) - -g = ggplot(PS_df, aes(x = factor(Position) - , y = ratioDUET)) - -p1 = g + - geom_point(aes(col = DUET_outcome - , size = OR)) + - theme(axis.text.x = element_text(size = my_ats - , angle = 90 - , hjust = 1 - , vjust = 0.4) - , axis.text.y = element_text(size = my_ats - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = my_als) - , axis.title.y = element_text(size = my_als) - , legend.text = element_text(size = my_als) - , legend.title = element_text(size = my_als) ) + - #, legend.key.size = unit(1, "cm")) + - labs(title = "" - , x = "Position" - , y = "DUET(PS)" - , size = "Odds Ratio" - , colour = "DUET Outcome") + - guides(colour = guide_legend(override.aes = list(size=4))) - -p1 - -#================= -# generate plot 2: Lig vs OR by position as geom_points -#================= -my_ats = 20 # axis text size -my_als = 22 # axis label size - -# Spelling Correction: made redundant as already corrected at the source - -#Lig_df$Lig_outcome[Lig_df$Lig_outcome=='Stabilizing'] <- 'Stabilising' -#Lig_df$Lig_outcome[Lig_df$Lig_outcome=='Destabilizing'] <- 'Destabilising' - -table(Lig_df$Lig_outcome) - -g = ggplot(Lig_df, aes(x = factor(Position) - , y = ratioPredAff)) - -p2 = g + - geom_point(aes(col = Lig_outcome - , size = OR))+ - theme(axis.text.x = element_text(size = my_ats - , angle = 90 - , hjust = 1 - , vjust = 0.4) - , axis.text.y = element_text(size = my_ats - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = my_als) - , axis.title.y = 
element_text(size = my_als) - , legend.text = element_text(size = my_als) - , legend.title = element_text(size = my_als) ) + - #, legend.key.size = unit(1, "cm")) + - labs(title = "" - , x = "Position" - , y = "Ligand Affinity" - , size = "Odds Ratio" - , colour = "Ligand Outcome" - ) + - guides(colour = guide_legend(override.aes = list(size=4))) - -p2 - -#====================== -#combine using cowplot -#====================== -# set output dir for plots -getwd() -setwd("~/git/Data/pyrazinamide/output/plots") -getwd() - -svg('PS_Lig_OR_combined.svg', width = 32, height = 12) #inches -#png('PS_Lig_OR_combined.png', width = 2800, height = 1080) #300dpi -theme_set(theme_gray()) # to preserve default theme - -printFile = cowplot::plot_grid(plot_grid(p1, p2 - , ncol = 1 - , align = 'v' - , labels = c("(a)", "(b)") - , label_size = my_als+5)) -print(printFile) -dev.off() - diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_LIG.R b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_LIG.R deleted file mode 100644 index 30b9981..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_LIG.R +++ /dev/null @@ -1,154 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages # -######################################################################## - -source("../Header_TT.R") - -######################################################################## -# Read file: call script for combining df for lig # -######################################################################## - -source("../combining_two_df_lig.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION - -#========================== -# This will return: - -# df with NA: -# merged_df2 -# merged_df3 - -# df without NA: -# merged_df2_comp -# merged_df3_comp -#=========================== - -########################### -# Data for Lig plots -# you need merged_df3 -# or -# merged_df3_comp -# since these have unique SNPs -# I prefer to use the merged_df3 -# because using the _comp dataset means -# we lose some muts and at this level, we should use -# as much info as available -########################### - -# uncomment as necessary -#%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -my_df = merged_df3 -#my_df = merged_df3_comp -#%%%%%%%%%%%%%%%%%%%%%%%% - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -############################# -# Extra sanity check: -# for mcsm_lig ONLY -# Dis_lig_Ang should be <10 -############################# - -if (max(my_df$Dis_lig_Ang) < 10){ - print ("Sanity check passed: lig data is <10Ang") -}else{ - print ("Error: data should be filtered to be within 10Ang") -} -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -#========================== -# Plot: Barplot with scores (unordered) -# corresponds to Lig_outcome -# Stacked Barplot with colours: Lig_outcome @ position coloured by -# Lig_outcome. This is a barplot where each bar corresponds -# to a SNP and is coloured by its corresponding Lig_outcome. 
-#============================ - -#=================== -# Data for plots -#=================== - -#%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -df = my_df -#%%%%%%%%%%%%%%%%%%%%%%%% - -rm(my_df) - -# sanity checks -upos = unique(my_df$Position) - -# should be a factor -is.factor(df$Lig_outcome) -#TRUE - -table(df$Lig_outcome) - -# should be -1 and 1: may not be in this case because you have filtered the data -# FIXME: normalisation before or after filtering? -min(df$ratioPredAff) # -max(df$ratioPredAff) # - -# sanity checks -tapply(df$ratioPredAff, df$Lig_outcome, min) -tapply(df$ratioPredAff, df$Lig_outcome, max) - -#****************** -# generate plot -#****************** - -# set output dir for plots -getwd() -setwd("~/git/Data/pyrazinamide/output/plots") -getwd() - -my_title = "Ligand affinity" - -# axis label size -my_xaxls = 13 -my_yaxls = 15 - -# axes text size -my_xaxts = 15 -my_yaxts = 15 - -# no ordering of x-axis -g = ggplot(df, aes(factor(Position, ordered = T))) -g + - geom_bar(aes(fill = Lig_outcome), colour = "grey") + - theme( axis.text.x = element_text(size = my_xaxls - , angle = 90 - , hjust = 1 - , vjust = 0.4) - , axis.text.y = element_text(size = my_yaxls - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = my_xaxts) - , axis.title.y = element_text(size = my_yaxts ) ) + - labs(title = my_title - , x = "Position" - , y = "Frequency") - -# for sanity and good practice -rm(df) -#======================= end of plot -# axis colours labels -# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors -# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_PS.R deleted file mode 100644 index 169bdaf..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_PS.R +++ /dev/null @@ -1,149 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages and functions # -######################################################################## - -source("../Header_TT.R") - -######################################################################## -# Read file: call script for combining df for PS # -######################################################################## - -source("../combining_two_df.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION - -#========================== -# This will return: - -# df with NA: -# merged_df2 -# merged_df3 - -# df without NA: -# merged_df2_comp -# merged_df3_comp -#========================== - -########################### -# Data for DUET plots -# you need merged_df3 -# or -# merged_df3_comp -# since these have unique SNPs -# I prefer to use the merged_df3 -# because using the _comp dataset means -# we lose some muts and at this level, we should use -# as much info as available -########################### - -# uncomment as necessary -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -my_df = merged_df3 -#my_df = merged_df3_comp -#<<<<<<<<<<<<<<<<<<<<<<<<< - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -# Ensure correct 
data type in columns to plot: need to be factor -# sanity check -is.factor(my_df$DUET_outcome) -my_df$DUET_outcome = as.factor(my_df$DUET_outcome) -is.factor(my_df$DUET_outcome) -#[1] TRUE - -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -#========================== -# Plot 2: Barplot with scores (unordered) -# corresponds to DUET_outcome -# Stacked Barplot with colours: DUET_outcome @ position coloured by -# DUET outcome. This is a barplot where each bar corresponds -# to a SNP and is coloured by its corresponding DUET_outcome -#============================ - -#=================== -# Data for plots -#=================== - -#<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -df = my_df -#<<<<<<<<<<<<<<<<<<<<<<<<< - -rm(my_df) - -# sanity checks -upos = unique(df$Position) - -# should be a factor -is.factor(my_df$DUET_outcome) -#[1] TRUE - -table(my_df$DUET_outcome) - -# should be -1 and 1 -min(df$ratioDUET) -max(df$ratioDUET) - -tapply(df$ratioDUET, df$DUET_outcome, min) -tapply(df$ratioDUET, df$DUET_outcome, max) - -#****************** -# generate plot -#****************** - -# set output dir for plots -getwd() -setwd("~/git/Data/pyrazinamide/output/plots") -getwd() - -my_title = "Protein stability (DUET)" - -# axis label size -my_xaxls = 13 -my_yaxls = 15 - -# axes text size -my_xaxts = 15 -my_yaxts = 15 - -# no ordering of x-axis -g = ggplot(df, aes(factor(Position, ordered = T))) -g + - geom_bar(aes(fill = DUET_outcome), colour = "grey") + - - theme( axis.text.x = element_text(size = my_xaxls - , angle = 90 - , hjust = 1 - , vjust = 0.4) - , axis.text.y = element_text(size = my_yaxls - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = my_xaxts) - , axis.title.y = element_text(size = my_yaxts ) ) + - labs(title = my_title - , x = "Position" - , y = "Frequency") - -# for sanity and good practice -rm(df) -#======================= end of plot -# axis colours labels -# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors -# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_LIG.R b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_LIG.R deleted file mode 100644 index a5d9361..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_LIG.R +++ /dev/null @@ -1,202 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages and functions # -######################################################################## - -source("../Header_TT.R") -source("../barplot_colour_function.R") - -######################################################################## -# Read file: call script for combining df for lig # -######################################################################## - -source("../combining_two_df_lig.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION - -#========================== -# This will return: - -# df with NA: -# merged_df2 -# merged_df3 - -# df without NA: -# merged_df2_comp -# merged_df3_comp -#=========================== - 
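# Added note (not in the original script): a minimal sketch of a post-source check,
# assuming ../combining_two_df_lig.R really creates the four data frames listed above.
stopifnot(exists("merged_df2"), exists("merged_df3")
          , exists("merged_df2_comp"), exists("merged_df3_comp"))
# the *_comp versions are expected to be the NA-free subsets
stopifnot(nrow(merged_df2_comp) <= nrow(merged_df2)
          , nrow(merged_df3_comp) <= nrow(merged_df3))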
-########################### -# Data for Lig plots -# you need merged_df3 -# or -# merged_df3_comp -# since these have unique SNPs -# I prefer to use the merged_df3 -# because using the _comp dataset means -# we lose some muts and at this level, we should use -# as much info as available -########################### - -# uncomment as necessary -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -my_df = merged_df3 -#my_df = merged_df3_comp -#<<<<<<<<<<<<<<<<<<<<<<<<< - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -# Ensure correct data type in columns to plot: need to be factor -# sanity check -is.factor(my_df$Lig_outcome) -my_df$Lig_outcome = as.factor(my_df$Lig_outcome) -is.factor(my_df$Lig_outcome) -#[1] TRUE - -############################# -# Extra sanity check: -# for mcsm_lig ONLY -# Dis_lig_Ang should be <10 -############################# - -if (max(my_df$Dis_lig_Ang) < 10){ - print ("Sanity check passed: lig data is <10Ang") -}else{ - print ("Error: data should be filtered to be within 10Ang") -} -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -#========================== -# Plot: Barplot with scores (unordered) -# corresponds to Lig_outcome -# Stacked Barplot with colours: Lig_outcome @ position coloured by -# stability scores. This is a barplot where each bar corresponds -# to a SNP and is coloured by its corresponding Lig stability value. -# Normalised values (range between -1 and 1 ) to aid visualisation -# NOTE: since barplot plots discrete values, colour = score, so number of -# colours will be equal to the no. of unique normalised scores -# rather than a continuous scale -# will require generating the colour scale separately. -#============================ - -#=================== -# Data for plots -#=================== - -#<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -df = my_df -#<<<<<<<<<<<<<<<<<<<<<<<<< - -rm(my_df) - -# sanity checks -table(df$Lig_outcome) - -# should be -1 and 1: may not be in this case because you have filtered the data -# FIXME: normalisation before or after filtering? -min(df$ratioPredAff) # -max(df$ratioPredAff) # - -# sanity checks -# very important!!!!
-tapply(df$ratioPredAff, df$Lig_outcome, min) - -tapply(df$ratioPredAff, df$Lig_outcome, max) - - -#****************** -# generate plot -#****************** - -# set output dir for plots -getwd() -setwd("~/git/Data/pyrazinamide/output/plots") -getwd() - -# My colour FUNCTION: based on group and subgroup -# in my case; -# df = df -# group = Lig_outcome -# subgroup = normalised score i.e ratioPredAff - -# Prepare data: round off ratioLig scores -# round off to 3 significant digits: -# 165 if no rounding is performed: used to generate the originalgraph -# 156 if rounded to 3 places -# FIXME: check if reducing precision creates any ML prob - -# check unique values in normalised data -u = unique(df$ratioPredAff) - -# <<<<< ------------------------------------------- -# Run this section if rounding is to be used -# specify number for rounding -n = 3 -df$ratioLigR = round(df$ratioPredAff, n) -u = unique(df$ratioLigR) # 156 -# create an extra column called group which contains the "gp name and score" -# so colours can be generated for each unique values in this column -my_grp = df$ratioLigR -df$group <- paste0(df$Lig_outcome, "_", my_grp, sep = "") - -# else -# uncomment the below if rounding is not required - -#my_grp = df$ratioLig -#df$group <- paste0(df$Lig_outcome, "_", my_grp, sep = "") - -# <<<<< ----------------------------------------------- - -# Call the function to create the palette based on the group defined above -colours <- ColourPalleteMulti(df, "Lig_outcome", "my_grp") -my_title = "Ligand affinity" - -# axis label size -my_xaxls = 13 -my_yaxls = 15 - -# axes text size -my_xaxts = 15 -my_yaxts = 15 - -# no ordering of x-axis -g = ggplot(df, aes(factor(Position, ordered = T))) -g + - geom_bar(aes(fill = group), colour = "grey") + - scale_fill_manual( values = colours - , guide = 'none') + - theme( axis.text.x = element_text(size = my_xaxls - , angle = 90 - , hjust = 1 - , vjust = 0.4) - , axis.text.y = element_text(size = my_yaxls - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = my_xaxts) - , axis.title.y = element_text(size = my_yaxts ) ) + - labs(title = my_title - , x = "Position" - , y = "Frequency") - -# for sanity and good practice -rm(df) -#======================= end of plot -# axis colours labels -# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors -# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_PS.R deleted file mode 100644 index 8828e90..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_PS.R +++ /dev/null @@ -1,192 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages and functions # -######################################################################## - -source("../Header_TT.R") -source("../barplot_colour_function.R") - -######################################################################## -# Read file: call script for combining df for PS # -######################################################################## - -source("../combining_two_df.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY 
ATTENTION - -#========================== -# This will return: - -# df with NA: -# merged_df2 -# merged_df3 - -# df without NA: -# merged_df2_comp -# merged_df3_comp -#=========================== - -########################### -# Data for DUET plots -# you need merged_df3 -# or -# merged_df3_comp -# since these have unique SNPs -# I prefer to use the merged_df3 -# because using the _comp dataset means -# we lose some muts and at this level, we should use -# as much info as available -########################### - -# uncomment as necessary -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -my_df = merged_df3 -#my_df = merged_df3_comp -#<<<<<<<<<<<<<<<<<<<<<<<<< - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -# Ensure correct data type in columns to plot: need to be factor -# sanity check -is.factor(my_df$DUET_outcome) -my_df$DUET_outcome = as.factor(my_df$DUET_outcome) -is.factor(my_df$DUET_outcome) -#[1] TRUE - -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -#========================== -# Barplot with scores (unordered) -# corresponds to DUET_outcome -# Stacked Barplot with colours: DUET_outcome @ position coloured by -# stability scores. This is a barplot where each bar corresponds -# to a SNP and is coloured by its corresponding DUET stability value. -# Normalised values (range between -1 and 1 ) to aid visualisation -# NOTE: since barplot plots discrete values, colour = score, so number of -# colours will be equal to the no. of unique normalised scores -# rather than a continuous scale -# will require generating the colour scale separately. 
-#============================ - -#=================== -# Data for plots -#=================== - -#<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -df = my_df -#<<<<<<<<<<<<<<<<<<<<<<<<< - -rm(my_df) - -# sanity checks -upos = unique(df$Position) - -# should be a factor -is.factor(my_df$DUET_outcome) -#[1] TRUE - -table(df$DUET_outcome) - -# should be -1 and 1 -min(df$ratioDUET) -max(df$ratioDUET) - -tapply(df$ratioDUET, df$DUET_outcome, min) -tapply(df$ratioDUET, df$DUET_outcome, max) - -#****************** -# generate plot -#****************** - -# set output dir for plots -getwd() -setwd("~/git/Data/pyrazinamide/output/plots") -getwd() - -# My colour FUNCTION: based on group and subgroup -# in my case; -# df = df -# group = DUET_outcome -# subgroup = normalised score i.e ratioDUET - -# Prepare data: round off ratioDUET scores -# round off to 3 significant digits: -# 323 if no rounding is performed: used to generate the original graph -# 287 if rounded to 3 places -# FIXME: check if reducing precicion creates any ML prob - -# check unique values in normalised data -u = unique(df$ratioDUET) - -# <<<<< ------------------------------------------- -# Run this section if rounding is to be used -# specify number for rounding -n = 3 -df$ratioDUETR = round(df$ratioDUET, n) -u = unique(df$ratioDUETR) -# create an extra column called group which contains the "gp name and score" -# so colours can be generated for each unique values in this column -my_grp = df$ratioDUETR -df$group <- paste0(df$DUET_outcome, "_", my_grp, sep = "") - -# else -# uncomment the below if rounding is not required - -#my_grp = df$ratioDUET -#df$group <- paste0(df$DUET_outcome, "_", my_grp, sep = "") - -# <<<<< ----------------------------------------------- - -# Call the function to create the palette based on the group defined above -colours <- ColourPalleteMulti(df, "DUET_outcome", "my_grp") -my_title = "Protein stability (DUET)" - -# axis label size -my_xaxls = 13 -my_yaxls = 15 - -# axes text size -my_xaxts = 15 -my_yaxts = 15 - -# no ordering of x-axis -g = ggplot(df, aes(factor(Position, ordered = T))) -g + - geom_bar(aes(fill = group), colour = "grey") + - scale_fill_manual( values = colours - , guide = 'none') + - theme( axis.text.x = element_text(size = my_xaxls - , angle = 90 - , hjust = 1 - , vjust = 0.4) - , axis.text.y = element_text(size = my_yaxls - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = my_xaxts) - , axis.title.y = element_text(size = my_yaxts ) ) + - labs(title = my_title - , x = "Position" - , y = "Frequency") - -# for sanity and good practice -rm(df) -#======================= end of plot -# axis colours labels -# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors -# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_aa_LIG.R b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_aa_LIG.R deleted file mode 100644 index 432749e..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_aa_LIG.R +++ /dev/null @@ -1,296 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -############################################################ -# 1: Installing and loading required packages and functions -############################################################ - -source("../Header_TT.R") -source("../barplot_colour_function.R") - 
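# Added sketch (not from the original repo): barplot_colour_function.R is sourced above but
# not shown in this hunk, so the following is only an assumed outline of what a
# group/subgroup palette helper such as ColourPalleteMulti() might do: one colour ramp per
# outcome group, with one shade per unique (rounded) score inside that group. The real
# helper's interface may differ (the calls further down pass the subgroup as "my_grp").
ColourPalleteMulti_sketch <- function(df, group, subgroup){
  # count the distinct subgroup values within each group
  categories <- aggregate(as.formula(paste(subgroup, group, sep = "~"))
                          , df
                          , function(x) length(unique(x)))
  # one light/dark anchor pair per group
  col_start <- scales::hue_pal(l = 100)(nrow(categories))
  col_end   <- scales::hue_pal(l = 40)(nrow(categories))
  # expand each pair into as many shades as there are subgroup values in that group
  unlist(lapply(seq_len(nrow(categories)), function(i){
    colorRampPalette(c(col_start[i], col_end[i]))(categories[i, 2])
  }))
}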
-############################################################ -# Output dir for plots -############################################################ -out_dir = "~/git/Data/pyrazinamide/output/plots" - -############################################################ -# 2: call script the prepares the data with columns containing -# colours for axis labels -############################################################ - -source("subcols_axis_LIG.R") - -# this should return -#mut_pos_cols: 52, 4 -#my_df: 169, 39 - -# clear excess variable -# "mut_pos_cols" is just for inspection in case you need to cross check -# position numbers and colours -# open file from deskptop ("sample_axis_cols") for cross checking - -table(mut_pos_cols$lab_bg) - -sum( table(mut_pos_cols$lab_bg) ) == nrow(mut_pos_cols) # should be True - -table(mut_pos_cols$lab_bg2) - -sum( table(mut_pos_cols$lab_bg2) ) == nrow(mut_pos_cols) # should be True - -table(mut_pos_cols$lab_fg) - -sum( table(mut_pos_cols$lab_fg) ) == nrow(mut_pos_cols) # should be True - -# very important!: should be the length of the unique positions -my_axis_colours = mut_pos_cols$lab_fg - -# now clear mut_pos_cols -rm(mut_pos_cols) - -########################### -# 2: Plot: Lig scores -########################### -#========================== -# Plot 2: Barplot with scores (unordered) -# corresponds to Lig_outcome -# Stacked Barplot with colours: Lig_outcome @ position coloured by -# stability scores. This is a barplot where each bar corresponds -# to a SNP and is coloured by its corresponding PredAff stability value. -# Normalised values (range between -1 and 1 ) to aid visualisation -# NOTE: since barplot plots discrete values, colour = score, so number of -# colours will be equal to the no. of unique normalised scores -# rather than a continuous scale -# will require generating the colour scale separately. -#============================ -# sanity checks -upos = unique(my_df$Position) - -str(my_df$Lig_outcome) - -colnames(my_df) - -#=========================== -# Data preparation for plots -#=========================== -#!!!!!!!!!!!!!!!!! -# REASSIGNMENT -df <- my_df -#!!!!!!!!!!!!!!!!! - -rm(my_df) - -# sanity checks -# should be a factor -is.factor(df$Lig_outcome); -#FALSE - -df$Lig_outcome = as.factor(df$Lig_outcome) -is.factor(df$Lig_outcome); -#TRUE - -table(df$Lig_outcome) - -# check the range -min(df$ratioPredAff) -max(df$ratioPredAff) - -# sanity checks -# very important!!!! 
-tapply(df$ratioPredAff, df$Lig_outcome, min) - -tapply(df$ratioPredAff, df$Lig_outcome, max) - -# My colour FUNCTION: based on group and subgroup -# in my case; -# df = df -# group = Lig_outcome -# subgroup = normalised score i.e ratioPredAff - -# Prepare data: round off ratioPredAff scores -# round off to 3 significant digits: -# 323 if no rounding is performed: used to generate the original graph -# 287 if rounded to 3 places -# FIXME: check if reducing precicion creates any ML prob - -# check unique values in normalised data -u = unique(df$ratioPredAff) - -# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -# Run this section if rounding is to be used -# specify number for rounding -n = 3 -df$ratioPredAffR = round(df$ratioPredAff, n) -u = unique(df$ratioPredAffR) - -# create an extra column called group which contains the "gp name and score" -# so colours can be generated for each unique values in this column -my_grp = df$ratioPredAffR -df$group <- paste0(df$Lig_outcome, "_", my_grp, sep = "") - -# ELSE -# uncomment the below if rounding is not required - -#my_grp = df$ratioPredAff -#df$group <- paste0(df$Lig_outcome, "_", my_grp, sep = "") - -# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -#****************** -# generate plot -#****************** - -# Call the function to create the palette based on the group defined above -colours <- ColourPalleteMulti(df, "Lig_outcome", "my_grp") -my_title = "Ligand Affinity" -library(ggplot2) - -# axis label size -my_xaxls = 13 -my_yaxls = 15 - -# axes text size -my_xaxts = 15 -my_yaxts = 15 - -# no ordering of x-axis according to frequency -g = ggplot(df, aes(factor(Position, ordered = T))) -g + - geom_bar(aes(fill = group), colour = "grey") + - scale_fill_manual( values = colours - , guide = 'none') + - theme( axis.text.x = element_text(size = my_xaxls - , angle = 90 - , hjust = 1 - , vjust = 0.4) - , axis.text.y = element_text(size = my_yaxls - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = my_xaxts) - , axis.title.y = element_text(size = my_yaxts ) ) + - labs(title = my_title - , x = "Position" - , y = "Frequency") - -#======================== -# plot with axis colours -#======================== -class(df$lab_bg) -# make this a named vector - -# define cartesian coord -my_xlim = length(unique(df$Position)); my_xlim - -# axis label size -my_xals = 15 -my_yals = 15 - -# axes text size -my_xats = 15 -my_yats = 18 - -# using geom_tile -g = ggplot(df, aes(factor(Position, ordered = T))) -g + - coord_cartesian(xlim = c(1, my_xlim) - , ylim = c(0, 6) - , clip = "off") + - - geom_bar(aes(fill = group), colour = "grey") + - scale_fill_manual( values = colours - , guide = 'none') + - geom_tile(aes(,-0.8, width = 0.95, height = 0.85) - , fill = df$lab_bg) + - geom_tile(aes(,-1.2, width = 0.95, height = -0.2) - , fill = df$lab_bg2) + - - # Here it's important to specify that your axis goes from 1 to max number of levels - theme( axis.text.x = element_text(size = my_xats - , angle = 90 - , hjust = 1 - , vjust = 0.4 - , colour = my_axis_colours) - , axis.text.y = element_text(size = my_yats - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = my_xals) - , axis.title.y = element_text(size = my_yals ) - , axis.ticks.x = element_blank() - ) + - labs(title = my_title - , x = "Position" - , y = "Frequency") - -#======================== -# output plot as svg/png -#======================== -class(df$lab_bg) -# make this a named vector - -# define cartesian coord -my_xlim = length(unique(df$Position)); 
my_xlim - -# axis label size -my_xals = 18 -my_yals = 18 - -# axes text size -my_xats = 16 #14 in PS -my_yats = 18 - -# set output dir for plots -#getwd() -#setwd("~/git/Data/pyrazinamide/output/plots") -#getwd() - -plot_name = "barplot_LIG_acoloured.svg" -my_plot_name = paste0(out_dir, "/", plot_name); my_plot_name - -svg(my_plot_name, width = 26, height = 4) - -g = ggplot(df, aes(factor(Position, ordered = T))) - -outFile = g + - coord_cartesian(xlim = c(1, my_xlim) - , ylim = c(0, 6) - , clip = "off" - ) + - - geom_bar(aes(fill = group), colour = "grey") + - scale_fill_manual( values = colours - , guide = 'none') + -# geom_tile(aes(,-0.6, width = 0.9, height = 0.7) -# , fill = df$lab_bg) + -# geom_tile(aes(,-1, width = 0.9, height = 0.3) -# , fill = df$lab_bg2) + - geom_tile(aes(,-0.8, width = 0.95, height = 0.85) - , fill = df$lab_bg) + - geom_tile(aes(,-1.2, width = 0.95, height = -0.2) - , fill = df$lab_bg2) + - -# Here it's important to specify that your axis goes from 1 to max number of levels - theme( axis.text.x = element_text(size = my_xats - , angle = 90 - , hjust = 1 - , vjust = 0.4 - , colour = my_axis_colours) - , axis.text.y = element_text(size = my_yats - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = my_xals) - , axis.title.y = element_text(size = my_yals ) - , axis.ticks.x = element_blank() - ) + - labs(title = "" - , x = "Position" - , y = "Frequency") - - -print(outFile) -dev.off() - -# for sanity and good practice -#rm(df) diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_aa_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_aa_PS.R deleted file mode 100644 index 78029be..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_aa_PS.R +++ /dev/null @@ -1,292 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -############################################################ -# 1: Installing and loading required packages and functions -############################################################ - -source("../Header_TT.R") -source("../barplot_colour_function.R") - -############################################################ -# Output dir for plots -############################################################ -out_dir = "~/git/Data/pyrazinamide/output/plots" - -############################################################ -# 2: call script the prepares the data with columns containing -# colours for axis labels -############################################################ - -source("subcols_axis.R") - -# this should return -#mut_pos_cols: 130, 4 -#my_df: 335, 39 - -# clear excess variable -# "mut_pos_cols" is just for inspection in case you need to cross check -# position numbers and colours -# open file from deskptop ("sample_axis_cols") for cross checking - -table(mut_pos_cols$lab_bg) - -sum( table(mut_pos_cols$lab_bg) ) == nrow(mut_pos_cols) # should be True - -table(mut_pos_cols$lab_bg2) - -sum( table(mut_pos_cols$lab_bg2) ) == nrow(mut_pos_cols) # should be True - -table(mut_pos_cols$lab_fg) - -sum( table(mut_pos_cols$lab_fg) ) == nrow(mut_pos_cols) # should be True - -# very important! 
-my_axis_colours = mut_pos_cols$lab_fg - -# now clear mut_pos_cols -rm(mut_pos_cols) - -########################### -# 2: Plot: DUET scores -########################### -#========================== -# Plot 2: Barplot with scores (unordered) -# corresponds to DUET_outcome -# Stacked Barplot with colours: DUET_outcome @ position coloured by -# stability scores. This is a barplot where each bar corresponds -# to a SNP and is coloured by its corresponding DUET stability value. -# Normalised values (range between -1 and 1 ) to aid visualisation -# NOTE: since barplot plots discrete values, colour = score, so number of -# colours will be equal to the no. of unique normalised scores -# rather than a continuous scale -# will require generating the colour scale separately. -#============================ -# sanity checks -upos = unique(my_df$Position) - -str(my_df$DUET_outcome) - -colnames(my_df) - -#=========================== -# Data preparation for plots -#=========================== -#!!!!!!!!!!!!!!!!! -# REASSIGNMENT -df <- my_df -#!!!!!!!!!!!!!!!!! - -rm(my_df) - -# sanity checks -# should be a factor -is.factor(df$DUET_outcome) -#TRUE - -table(df$DUET_outcome) - -# should be -1 and 1 -min(df$ratioDUET) -max(df$ratioDUET) - -# sanity checks -# very important!!!! -tapply(df$ratioDUET, df$DUET_outcome, min) - -tapply(df$ratioDUET, df$DUET_outcome, max) - -# My colour FUNCTION: based on group and subgroup -# in my case; -# df = df -# group = DUET_outcome -# subgroup = normalised score i.e ratioDUET - -# Prepare data: round off ratioDUET scores -# round off to 3 significant digits: -# 323 if no rounding is performed: used to generate the original graph -# 287 if rounded to 3 places -# FIXME: check if reducing precicion creates any ML prob - -# check unique values in normalised data -u = unique(df$ratioDUET) - -# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -# Run this section if rounding is to be used -# specify number for rounding -n = 3 -df$ratioDUETR = round(df$ratioDUET, n) -u = unique(df$ratioDUETR) - -# create an extra column called group which contains the "gp name and score" -# so colours can be generated for each unique values in this column -my_grp = df$ratioDUETR -df$group <- paste0(df$DUET_outcome, "_", my_grp, sep = "") - -# ELSE -# uncomment the below if rounding is not required - -#my_grp = df$ratioDUET -#df$group <- paste0(df$DUET_outcome, "_", my_grp, sep = "") - -# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -#****************** -# generate plot -#****************** - -# Call the function to create the palette based on the group defined above -colours <- ColourPalleteMulti(df, "DUET_outcome", "my_grp") -my_title = "Protein stability (DUET)" -library(ggplot2) - -# axis label size -my_xaxls = 13 -my_yaxls = 15 - -# axes text size -my_xaxts = 15 -my_yaxts = 15 - -# no ordering of x-axis according to frequency -g = ggplot(df, aes(factor(Position, ordered = T))) -g + - geom_bar(aes(fill = group), colour = "grey") + - scale_fill_manual( values = colours - , guide = 'none') + - theme( axis.text.x = element_text(size = my_xaxls - , angle = 90 - , hjust = 1 - , vjust = 0.4) - , axis.text.y = element_text(size = my_yaxls - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = my_xaxts) - , axis.title.y = element_text(size = my_yaxts ) ) + - labs(title = my_title - , x = "Position" - , y = "Frequency") - -#======================== -# plot with axis colours -#======================== -class(df$lab_bg) -# make this a named vector - -# define cartesian coord -my_xlim 
= length(unique(df$Position)); my_xlim - -# axis label size -my_xals = 15 -my_yals = 15 - -# axes text size -my_xats = 15 -my_yats = 18 - -# using geom_tile -g = ggplot(df, aes(factor(Position, ordered = T))) -g + - coord_cartesian(xlim = c(1, my_xlim) - , ylim = c(0, 6) - , clip = "off") + - - geom_bar(aes(fill = group), colour = "grey") + - scale_fill_manual( values = colours - , guide = 'none') + - geom_tile(aes(,-0.8, width = 0.95, height = 0.85) - , fill = df$lab_bg) + - geom_tile(aes(,-1.2, width = 0.95, height = -0.2) - , fill = df$lab_bg2) + - - # Here it's important to specify that your axis goes from 1 to max number of levels - theme( axis.text.x = element_text(size = my_xats - , angle = 90 - , hjust = 1 - , vjust = 0.4 - , colour = my_axis_colours) - , axis.text.y = element_text(size = my_yats - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = my_xals) - , axis.title.y = element_text(size = my_yals ) - , axis.ticks.x = element_blank() - ) + - labs(title = my_title - , x = "Position" - , y = "Frequency") - -#======================== -# output plot as svg/png -#======================== -class(df$lab_bg) -# make this a named vector - -# define cartesian coord -my_xlim = length(unique(df$Position)); my_xlim - -# axis label size -my_xals = 18 -my_yals = 18 - -# axes text size -my_xats = 14 -my_yats = 18 - -# set output dir for plots -#getwd() -#setwd("~/git/Data/pyrazinamide/output/plots") -#getwd() - -plot_name = "barplot_PS_acoloured.svg" -my_plot_name = paste0(out_dir, "/", plot_name); my_plot_name - -svg(my_plot_name, width = 26, height = 4) - -g = ggplot(df, aes(factor(Position, ordered = T))) - -outFile = g + - coord_cartesian(xlim = c(1, my_xlim) - , ylim = c(0, 6) - , clip = "off" - ) + - - geom_bar(aes(fill = group), colour = "grey") + - scale_fill_manual( values = colours - , guide = 'none') + -# geom_tile(aes(,-0.6, width = 0.9, height = 0.7) -# , fill = df$lab_bg) + -# geom_tile(aes(,-1, width = 0.9, height = 0.3) -# , fill = df$lab_bg2) + - geom_tile(aes(,-0.8, width = 0.95, height = 0.85) - , fill = df$lab_bg) + - geom_tile(aes(,-1.2, width = 0.95, height = -0.2) - , fill = df$lab_bg2) + - -# Here it's important to specify that your axis goes from 1 to max number of levels - theme( axis.text.x = element_text(size = my_xats - , angle = 90 - , hjust = 1 - , vjust = 0.4 - , colour = my_axis_colours) - , axis.text.y = element_text(size = my_yats - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = my_xals) - , axis.title.y = element_text(size = my_yals ) - , axis.ticks.x = element_blank() - ) + - labs(title = "" - , x = "Position" - , y = "Frequency") - - -print(outFile) -dev.off() - -# for sanity and good practice -#rm(df) diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_LIG.R b/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_LIG.R deleted file mode 100644 index c4826d3..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_LIG.R +++ /dev/null @@ -1,215 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages # -######################################################################## - -source("../Header_TT.R") - -#require(data.table) -#require(dplyr) - -######################################################################## -# Read file: call script for combining df for lig # 
-######################################################################## - -source("../combining_two_df_lig.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION - -#========================== -# This will return: - -# df with NA: -# merged_df2 -# merged_df3 - -# df without NA: -# merged_df2_comp -# merged_df3_comp -#=========================== - -########################### -# Data for Lig plots -# you need merged_df3 -# or -# merged_df3_comp -# since these have unique SNPs -# I prefer to use the merged_df3 -# because using the _comp dataset means -# we lose some muts and at this level, we should use -# as much info as available -########################### - -# uncomment as necessary -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -my_df = merged_df3 -#my_df = merged_df3_comp -#<<<<<<<<<<<<<<<<<<<<<<<<< - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -# Ensure correct data type in columns to plot: need to be factor -# sanity check -is.factor(my_df$Lig_outcome) -my_df$Lig_outcome = as.factor(my_df$Lig_outcome) -is.factor(my_df$Lig_outcome) -#[1] TRUE - -############################# -# Extra sanity check: -# for mcsm_lig ONLY -# Dis_lig_Ang should be <10 -############################# - -if (max(my_df$Dis_lig_Ang) < 10){ - print ("Sanity check passed: lig data is <10Ang") -}else{ - print ("Error: data should be filtered to be within 10Ang") -} - -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -#=========================== -# Plot: Basic barplots -#=========================== - -#=================== -# Data for plots -#=================== - -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -df = my_df -#<<<<<<<<<<<<<<<<<<<<<<<<< -rm(my_df) - -# sanity checks -str(df) - -if (identical(df$Position, df$position)){ - print("Sanity check passed: Columns 'Position' and 'position' are identical") -} else{ - print("Error!: Check column names and info contained") -} - -#**************** -# generate plot: No of stabilising and destabilising muts -#**************** -# set output dir for plots -getwd() -setwd("~/git/Data/pyrazinamide/output/plots") -getwd() - -svg('basic_barplots_LIG.svg') - -my_ats = 25 # axis text size -my_als = 22 # axis label size - -# uncomment as necessary for either directly outputting results or -# printing on the screen -g = ggplot(df, aes(x = Lig_outcome)) -prinfFile = g + geom_bar( -#g + geom_bar( - aes(fill = Lig_outcome) - , show.legend = TRUE -) + geom_label( - stat = "count" - , aes(label = ..count..)
- , color = "black" - , show.legend = FALSE - , size = 10) + theme( - axis.text.x = element_blank() - , axis.title.x = element_blank() - , axis.title.y = element_text(size=my_als) - , axis.text.y = element_text(size = my_ats) - , legend.position = c(0.73,0.8) - , legend.text = element_text(size=my_als-2) - , legend.title = element_text(size=my_als) - , plot.title = element_blank() - ) + labs( - title = "" - , y = "Number of SNPs" - #, fill='Ligand Outcome' - ) + scale_fill_discrete(name = "Ligand Outcome" - , labels = c("Destabilising", "Stabilising")) -print(prinfFile) -dev.off() - -#**************** -# generate plot: No of positions -#**************** -#get freq count of positions so you can subset freq<1 -#require(data.table) -setDT(df)[, pos_count := .N, by = .(Position)] #169, 36 - -head(df$pos_count) -table(df$pos_count) -# this is cummulative -#1 2 3 4 5 6 -#5 24 36 56 30 18 - -# use group by on this -snpsBYpos_df <- df %>% - group_by(Position) %>% - summarize(snpsBYpos = mean(pos_count)) - -table(snpsBYpos_df$snpsBYpos) -#1 2 3 4 5 6 -#5 12 12 14 6 3 -# this is what will get plotted - -svg('position_count_LIG.svg') - -my_ats = 25 # axis text size -my_als = 22 # axis label size - -g = ggplot(snpsBYpos_df, aes(x = snpsBYpos)) -prinfFile = g + geom_bar( - #g + geom_bar( - aes (alpha = 0.5) - , show.legend = FALSE -) + - geom_label( - stat = "count", aes(label = ..count..) - , color = "black" - , size = 10 - ) + - theme( - axis.text.x = element_text( - size = my_ats - , angle = 0 - ) - , axis.text.y = element_text( - size = my_ats - , angle = 0 - , hjust = 1 - ) - , axis.title.x = element_text(size = my_als) - , axis.title.y = element_text(size = my_als) - , plot.title = element_blank() - ) + - labs( - x = "Number of SNPs" - , y = "Number of Sites" - ) -print(prinfFile) -dev.off() -######################################################################## -# end of Lig barplots # -######################################################################## - - diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_PS.R deleted file mode 100644 index 51a2812..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_PS.R +++ /dev/null @@ -1,211 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages and functions # -######################################################################## - -source("../Header_TT.R") - -######################################################################## -# Read file: call script for combining df for PS # -######################################################################## - -source("../combining_two_df.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION - -#========================== -# This will return: - -# df with NA: -# merged_df2 -# merged_df3 - -# df without NA: -# merged_df2_comp -# merged_df3_comp -#========================== - -########################### -# Data for DUET plots -# you need merged_df3 -# or -# merged_df3_comp -# since these have unique SNPs -# I prefer to use the merged_df3 -# because using the _comp dataset means -# we lose some muts and at this level, we should use -# as much info as available -########################### - -# uncomment 
as necessary -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -my_df = merged_df3 -#my_df = merged_df3_comp -#<<<<<<<<<<<<<<<<<<<<<<<<< - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -# Ensure correct data type in columns to plot: need to be factor -# sanity check -is.factor(my_df$DUET_outcome) -my_df$DUET_outcome = as.factor(my_df$DUET_outcome) -is.factor(my_df$DUET_outcome) -#[1] TRUE - -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -#=========================== -# Plot: Basic barplots -#=========================== - -#=================== -# Data for plots -#=================== - -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -df = my_df -#<<<<<<<<<<<<<<<<<<<<<<<<< - -rm(my_df) - -# sanity checks -str(df) - -if (identical(df$Position, df$position)){ - print("Sanity check passed: Columns 'Position' and 'position' are identical") -} else{ - print("Error!: Check column names and info contained") - } - -#**************** -# generate plot: No of stabilising and destabilsing muts -#**************** -# set output dir for plots -getwd() -setwd("~/git/Data/pyrazinamide/output/plots") -getwd() - -svg('basic_barplots_DUET.svg') - -my_ats = 25 # axis text size -my_als = 22 # axis label size - -theme_set(theme_grey()) - -# uncomment as necessary for either directly outputting results or -# printing on the screen -g = ggplot(df, aes(x = DUET_outcome)) -prinfFile = g + geom_bar( -#g + geom_bar( - aes(fill = DUET_outcome) - , show.legend = TRUE - ) + geom_label( - stat = "count" - , aes(label = ..count..) - , color = "black" - , show.legend = FALSE - , size = 10) + theme( - axis.text.x = element_blank() - , axis.title.x = element_blank() - , axis.title.y = element_text(size=my_als) - , axis.text.y = element_text(size = my_ats) - , legend.position = c(0.73,0.8) - , legend.text = element_text(size=my_als-2) - , legend.title = element_text(size=my_als) - , plot.title = element_blank() - ) + labs( - title = "" - , y = "Number of SNPs" - #, fill='DUET Outcome' - ) + scale_fill_discrete(name = "DUET Outcome" - , labels = c("Destabilising", "Stabilising")) - -print(prinfFile) -dev.off() - -#**************** -# generate plot: No of positions -#**************** -#get freq count of positions so you can subset freq<1 -#setDT(df)[, .(Freq := .N), by = .(Position)] #189, 36 - -setDT(df)[, pos_count := .N, by = .(Position)] #335, 36 -table(df$pos_count) -# this is cummulative -#1 2 3 4 5 6 -#34 76 63 104 40 18 - -# use group by on this -snpsBYpos_df <- df %>% - group_by(Position) %>% - summarize(snpsBYpos = mean(pos_count)) - -table(snpsBYpos_df$snpsBYpos) -#1 2 3 4 5 6 -#34 38 21 26 8 3 - -foo = select(df, Mutationinformation - , WildPos - , wild_type - , mutant_type - , mutation_info - , position - , pos_count) #335, 5 - -getwd() -write.csv(foo, "../Data/pos_count_freq.csv") - -svg('position_count_DUET.svg') -my_ats = 25 # axis text size -my_als = 22 # axis label size - -g = ggplot(snpsBYpos_df, aes(x = snpsBYpos)) -prinfFile = g + geom_bar( -#g + geom_bar( - aes (alpha = 0.5) - , show.legend = FALSE - ) + - geom_label( - stat = "count", aes(label = ..count..) 
- , color = "black" - , size = 10 - ) + - theme( - axis.text.x = element_text( - size = my_ats - , angle = 0 - ) - , axis.text.y = element_text( - size = my_ats - , angle = 0 - , hjust = 1 - ) - , axis.title.x = element_text(size = my_als) - , axis.title.y = element_text(size = my_als) - , plot.title = element_blank() - ) + - labs( - x = "Number of SNPs" - , y = "Number of Sites" - ) -print(prinfFile) -dev.off() -######################################################################## -# end of DUET barplots # -######################################################################## - diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_PS.R deleted file mode 100644 index 0059bca..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_PS.R +++ /dev/null @@ -1,175 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages and functions # -######################################################################## - -source("../Header_TT.R") - -#source("barplot_colour_function.R") - -######################################################################## -# Read file: call script for combining df for PS # -######################################################################## - -source("../combining_two_df.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION - -#========================== -# This will return: - -# df with NA: -# merged_df2 -# merged_df3 - -# df without NA: -# merged_df2_comp -# merged_df3_comp -#========================== - -########################### -# Data for PS Corr plots -# you need merged_df3_comp -# since these are matched -# to allow pairwise corr -########################### - -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -my_df = merged_df3_comp -#<<<<<<<<<<<<<<<<<<<<<<<<< - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -#=========================== -# Plot: Correlation plots -#=========================== - -#=================== -# Data for plots -#=================== - -#!!!!!!!!!!!!!!!!!!!!!!!! -# REASSIGNMENT -df = my_df -#!!!!!!!!!!!!!!!!!!!!!!!! 
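# Added check (not in the original script): merged_df3_comp is used here precisely because it
# is the complete, matched dataset; a quick assumed sanity check that the columns used for the
# pairwise correlations below are NA-free.
sum(is.na(df[, c("ratioDUET", "logor", "neglog10pvalue", "AF")])) # expected: 0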
- -rm(my_df) - -# sanity checks -str(df) - -table(df$DUET_outcome) - -# unique positions -length(unique(df$Position)) #{RESULT: unique positions for comp data} - - -# subset data to generate pairwise correlations -corr_data = df[, c("ratioDUET" -# , "ratioPredAff" -# , "DUETStability_Kcalpermol" -# , "PredAffLog" -# , "OR" - , "logor" -# , "pvalue" - , "neglog10pvalue" - , "AF" - , "DUET_outcome" -# , "Lig_outcome" - , "pyrazinamide" - )] -dim(corr_data) -rm(df) - -# assign nice colnames (for display) -my_corr_colnames = c("DUET" -# , "Ligand Affinity" -# , "DUET_raw" -# , "Lig_raw" -# , "OR" - , "Log(Odds Ratio)" -# , "P-value" - , "-LogP" - , "Allele Frequency" - , "DUET_outcome" -# , "Lig_outcome" - , "pyrazinamide") - -# sanity check -if (length(my_corr_colnames) == length(corr_data)){ - print("Sanity check passed: corr_data and corr_names match in length") -}else{ - print("Error: length mismatch!") -} - -colnames(corr_data) -colnames(corr_data) <- my_corr_colnames -colnames(corr_data) - -############### -# PLOTS: corr -# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs -############### -#default pairs plot -start = 1 -end = which(colnames(corr_data) == "pyrazinamide"); end # should be the last column -offset = 1 - -my_corr = corr_data[start:(end-offset)] -head(my_corr) - -#my_cols = c("#f8766d", "#00bfc4") -# deep blue :#007d85 -# deep red: #ae301e - -#========== -# psych: informative since it draws the ellipsoid -# https://jamesmarquezportfolio.com/correlation_matrices_in_r.html -# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs -#========== - -# set output dir for plots -getwd() -setwd("~/git/Data/pyrazinamide/output/plots") -getwd() - -svg('DUET_corr.svg', width = 15, height = 15) -printFile = pairs.panels(my_corr[1:4] - , method = "spearman" # correlation method - , hist.col = "grey" ##00AFBB - , density = TRUE # show density plots - , ellipses = F # show correlation ellipses - , stars = T - , rug = F - , breaks = "Sturges" - , show.points = T - , bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr$DUET_outcome))] - , pch = 21 - , jitter = T - #, alpha = .05 - #, points(pch = 19, col = c("#f8766d", "#00bfc4")) - , cex = 3 - , cex.axis = 2.5 - , cex.labels = 3 - , cex.cor = 1 - , smooth = F -) - -print(printFile) -dev.off() diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_lig.R b/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_lig.R deleted file mode 100644 index 4e05d41..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_lig.R +++ /dev/null @@ -1,187 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages # -######################################################################## - -source("../Header_TT.R") - -#source("barplot_colour_function.R") - -######################################################################## -# Read file: call script for combining df for lig # -######################################################################## - -source("../combining_two_df_lig.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION - -#========================== -# This will return: - -# df with NA: -# merged_df2 -# merged_df3 - -# df without NA: -# merged_df2_comp -# merged_df3_comp
-#=========================== - -########################### -# Data for Lig Corr plots -# you need merged_df3_comp -# since these are matched -# to allow pairwise corr -########################### - -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -my_df = merged_df3_comp -#<<<<<<<<<<<<<<<<<<<<<<<<< - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -############################# -# Extra sanity check: -# for mcsm_lig ONLY -# Dis_lig_Ang should be <10 -############################# - -if (max(my_df$Dis_lig_Ang) < 10){ - print ("Sanity check passed: lig data is <10Ang") -}else{ - print ("Error: data should be filtered to be within 10Ang") -} - -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -#=========================== -# Plot: Correlation plots -#=========================== - -#=================== -# Data for plots -#=================== - -#<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -df = my_df -#<<<<<<<<<<<<<<<<<<<<<<<<< - -rm(my_df) - -# sanity checks -str(df) - -table(df$Lig_outcome) - -# unique positions -length(unique(df$Position)) #{RESULT: unique positions for comp data} - -# subset data to generate pairwise correlations -corr_data = df[, c(#"ratioDUET", - "ratioPredAff" -# , "DUETStability_Kcalpermol" -# , "PredAffLog" -# , "OR" - , "logor" -# , "pvalue" - , "neglog10pvalue" - , "AF" -# , "DUET_outcome" - , "Lig_outcome" - , "pyrazinamide" - )] -dim(corr_data) -rm(df) - -# assign nice colnames (for display) -my_corr_colnames = c(#"DUET", - "Ligand Affinity" -# ,"DUET_raw" -# , "Lig_raw" -# , "OR" - , "Log(Odds Ratio)" -# , "P-value" - , "-LogP" - , "Allele Frequency" -# , "DUET_outcome" - , "Lig_outcome" - , "pyrazinamide") - -# sanity check -if (length(my_corr_colnames) == length(corr_data)){ - print("Sanity check passed: corr_data and corr_names match in length") -}else{ - print("Error: length mismatch!") -} - -colnames(corr_data) -colnames(corr_data) <- my_corr_colnames -colnames(corr_data) - -############### -# PLOTS: corr -# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs -############### - -# default pairs plot -start = 1 -end = which(colnames(corr_data) == "pyrazinamide"); end # should be the last column -offset = 1 - -my_corr = corr_data[start:(end-offset)] -head(my_corr) - -#my_cols = c("#f8766d", "#00bfc4") -# deep blue :#007d85 -# deep red: #ae301e - -#========== -# psych: informative since it draws the ellipsoid -# https://jamesmarquezportfolio.com/correlation_matrices_in_r.html -# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs -#========== - -# set output dir for plots -getwd() -setwd("~/git/Data/pyrazinamide/output/plots") -getwd() - -svg('Lig_corr.svg', width = 15, height = 15) -printFile = pairs.panels(my_corr[1:4] - , method = "spearman" # correlation method - , hist.col = "grey" ##00AFBB - , density = TRUE # show density plots - , ellipses = F # show correlation ellipses - , stars = T - , rug = F - , breaks = "Sturges" - , show.points = T - , bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr$Lig_outcome))] - , pch = 21 - , jitter = T -# , alpha = .05 -# , points(pch = 19, col = c("#f8766d", "#00bfc4")) - , cex = 3 - , cex.axis = 2.5 - , cex.labels = 3 - , cex.cor = 1 - , smooth = F -) -print(printFile) -dev.off() - -diff --git
a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_basic_barplot.R b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_basic_barplot.R deleted file mode 100644 index 1f868e4..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_basic_barplot.R +++ /dev/null @@ -1,227 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages # -######################################################################## - -source("../Header_TT.R") -#source("barplot_colour_function.R") - -require(data.table) - -######################################################################## -# Read file: call script for combining df # -######################################################################## - -source("../combining_two_df.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION - -#========================== -# This will return: - -# df with NA: -# merged_df2 -# merged_df3 - -# df without NA: -# merged_df2_comp -# merged_df3_comp -#========================== - -########################### -# Data for plots -# you need merged_df2, comprehensive one -# since this has one-many relationship -# i.e the same SNP can belong to multiple lineages -########################### - -# uncomment as necessary -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -my_df = merged_df2 -#my_df = merged_df2_comp -#<<<<<<<<<<<<<<<<<<<<<<<<< - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -# Ensure correct data type in columns to plot: need to be factor -is.factor(my_df$lineage) -my_df$lineage = as.factor(my_df$lineage) -is.factor(my_df$lineage) - -#========================== -# Plot: Lineage barplot -# x = lineage y = No. of samples -# col = Lineage -# fill = lineage -#============================ -table(my_df$lineage) - -# lineage1 lineage2 lineage3 lineage4 lineage5 lineage6 lineageBOV -#3 104 1293 264 1311 6 6 105 - -#=========================== -# Plot: Lineage Barplots -#=========================== - -#=================== -# Data for plots -#=================== - -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -df <- my_df -#<<<<<<<<<<<<<<<<<<<<<<<<< -rm(my_df) - -# get freq count of positions so you can subset freq<1 -#setDT(df)[, lineage_count := .N, by = .(lineage)] - -#****************** -# generate plot: barplot of mutation by lineage -#****************** -sel_lineages = c("lineage1" - , "lineage2" - , "lineage3" - , "lineage4") - -df_lin = subset(df, subset = lineage %in% sel_lineages ) - -#FIXME; add sanity check for numbers. -# Done this manually - -############################################################ - -######### -# Data for barplot: Lineage barplot -# to show total samples and number of unique mutations -# within each linege -########## - -# Create df with lineage inform & no. 
of unique mutations -# per lineage and total samples within lineage -# this is essentially barplot with two y axis - -bar = bar = as.data.frame(sel_lineages) #4, 1 -total_snps_u = NULL -total_samples = NULL - -for (i in sel_lineages){ - #print(i) - curr_total = length(unique(df$id)[df$lineage==i]) - total_samples = c(total_samples, curr_total) - print(total_samples) - - foo = df[df$lineage==i,] - print(paste0(i, "=======")) - print(length(unique(foo$Mutationinformation))) - curr_count = length(unique(foo$Mutationinformation)) - - total_snps_u = c(total_snps_u, curr_count) -} - -print(total_snps_u) -bar$num_snps_u = total_snps_u -bar$total_samples = total_samples -bar - -#***************** -# generate plot: lineage barplot with two y-axis -#https://stackoverflow.com/questions/13035295/overlay-bar-graphs-in-ggplot2 -#***************** - -bar$num_snps_u = y1 -bar$total_samples = y2 -sel_lineages = x - -to_plot = data.frame(x = x - , y1 = y1 - , y2 = y2) -to_plot - -melted = melt(to_plot, id = "x") -melted - -# set output dir for plots -getwd() -setwd("~/git/Data/pyrazinamide/output/plots") -getwd() - -svg('lineage_basic_barplot.svg') - -my_ats = 20 # axis text size -my_als = 22 # axis label size - -g = ggplot(melted - , aes(x = x - , y = value - , fill = variable) - ) - - -printFile = g + geom_bar( - -#g + geom_bar( - stat = "identity" - , position = position_stack(reverse = TRUE) - , alpha=.75 - , colour='grey75' - ) + theme( - axis.text.x = element_text( - size = my_ats -# , angle= 30 - ) - , axis.text.y = element_text(size = my_ats - #, angle = 30 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text( - size = my_als - , colour = 'black' - ) - , axis.title.y = element_text( - size = my_als - , colour = 'black' - ) - , legend.position = "top" - , legend.text = element_text(size = my_als) - - #) + geom_text( - ) + geom_label( - aes(label = value) - , size = 5 - , hjust = 0.5 - , vjust = 0.5 - , colour = 'black' - , show.legend = FALSE - #, check_overlap = TRUE - , position = position_stack(reverse = T) - #, position = (' - - ) + labs( - title = '' - , x = '' - , y = "Number" - , fill = 'Variable' - , colour = 'black' - ) + scale_fill_manual( - values = c('grey50', 'gray75') - , name='' - , labels=c('Mutations', 'Total Samples') - ) + scale_x_discrete( - breaks = c('lineage1', 'lineage2', 'lineage3', 'lineage4') - , labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4') - ) -print(printFile) -dev.off() diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_LIG.R b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_LIG.R deleted file mode 100644 index e4e6972..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_LIG.R +++ /dev/null @@ -1,253 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages # -######################################################################## - -source("../Header_TT.R") -#source("barplot_colour_function.R") -#require(data.table) - -######################################################################## -# Read file: call script for combining df for Lig # -######################################################################## - -source("../combining_two_df_lig.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION - 
-#========================== -# This will return: - -# df with NA: -# merged_df2 -# merged_df3 - -# df without NA: -# merged_df2_comp -# merged_df3_comp -#=========================== -########################### -# Data for plots -# you need merged_df2 or merged_df2_comp -# since this is one-many relationship -# i.e the same SNP can belong to multiple lineages -########################### - -# uncomment as necessary -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -my_df = merged_df2 -#my_df = merged_df2_comp -#<<<<<<<<<<<<<<<<<<<<<<<<< - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -# Ensure correct data type in columns to plot: need to be factor -is.factor(my_df$lineage) -my_df$lineage = as.factor(my_df$lineage) -is.factor(my_df$lineage) - -table(my_df$mutation_info) - -############################# -# Extra sanity check: -# for mcsm_lig ONLY -# Dis_lig_Ang should be <10 -############################# - -if (max(my_df$Dis_lig_Ang) < 10){ - print ("Sanity check passed: lig data is <10Ang") -}else{ - print ("Error: data should be filtered to be within 10Ang") -} - -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## -#========================== -# Data for plot: assign as -# necessary -#=========================== - -# uncomment as necessary -#!!!!!!!!!!!!!!!!!!!!!!! -# REASSIGNMENT - -#================== -# data for ALL muts -#================== -plot_df = my_df -my_plot_name = 'lineage_dist_PS.svg' -#my_plot_name = 'lineage_dist_PS_comp.svg' - -#======================= -# data for dr_muts ONLY -#======================= -#plot_df = my_df_dr -#my_plot_name = 'lineage_dist_dr_PS.svg' -#my_plot_name = 'lineage_dist_dr_PS_comp.svg' -#!!!!!!!!!!!!!!!!!!!!!!! - -#========================== -# Plot: Lineage Distribution -# x = mcsm_values, y = dist -# fill = stability -#============================ - -#=================== -# Data for plots -#=================== - -# subset only lineages1-4 -sel_lineages = c("lineage1" - , "lineage2" - , "lineage3" - , "lineage4") - -# uncomment as necessary -df_lin = subset(my_df, subset = lineage %in% sel_lineages ) #2037 35 - -# refactor -df_lin$lineage = factor(df_lin$lineage) - -table(df_lin$lineage) #{RESULT: No of samples within lineage} -#lineage1 lineage2 lineage3 lineage4 -#78 961 195 803 - -# when merged_df2_comp is used -#lineage1 lineage2 lineage3 lineage4 -#77 955 194 770 - -length(unique(df_lin$Mutationinformation)) -#{Result: No. of unique mutations the 4 lineages contribute to} - -# sanity checks -r1 = 2:5 # when merged_df2 used: because there is missing lineages -if(sum(table(my_df$lineage)[r1]) == nrow(df_lin)) { - print ("sanity check passed: numbers match") -} else{ - print("Error!: check your numbers") -} - -#!!!!!!!!!!!!!!!!!!!!!!!!! -# REASSIGNMENT -df <- df_lin -#!!!!!!!!!!!!!!!!!!!!!!!!! - -rm(df_lin) - -#****************** -# generate distribution plot of lineages -#****************** -# basic: could improve this! 
-library(plotly) -library(ggridges) - -my_labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4') -names(my_labels) = c('lineage1', 'lineage2', 'lineage3', 'lineage4') - -g <- ggplot(df, aes(x = ratioPredAff)) + - geom_density(aes(fill = Lig_outcome) - , alpha = 0.5) + - facet_wrap( ~ lineage - , scales = "free" - , labeller = labeller(lineage = my_labels) ) + - coord_cartesian(xlim = c(-1, 1) -# , ylim = c(0, 6) -# , clip = "off" -) - ggtitle("Kernel Density estimates of Ligand affinity by lineage") - -ggplotly(g) - -# 2 : ggridges (good!) - -my_ats = 15 # axis text size -my_als = 20 # axis label size - -my_labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4') -names(my_labels) = c('lineage1', 'lineage2', 'lineage3', 'lineage4') - -# set output dir for plots -getwd() -setwd("~/git/Data/pyrazinamide/output/plots") -getwd() - -# check plot name -my_plot_name - -svg(my_plot_name) - -printFile = ggplot( df, aes(x = ratioPredAff - , y = Lig_outcome) ) + - - geom_density_ridges_gradient( aes(fill = ..x..) - , scale = 3 - , size = 0.3 ) + - facet_wrap( ~lineage - , scales = "free" -# , switch = 'x' - , labeller = labeller(lineage = my_labels) ) + - coord_cartesian( xlim = c(-1, 1) -# , ylim = c(0, 6) -# , clip = "off" - ) + - - scale_fill_gradientn( colours = c("#f8766d", "white", "#00bfc4") - , name = "Ligand Affinity" ) + - theme( axis.text.x = element_text( size = my_ats - , angle = 90 - , hjust = 1 - , vjust = 0.4) -# , axis.text.y = element_text( size = my_ats -# , angle = 0 -# , hjust = 1 -# , vjust = 0) - , axis.text.y = element_blank() - , axis.title.x = element_blank() - , axis.title.y = element_blank() - , axis.ticks.y = element_blank() - , plot.title = element_blank() - , strip.text = element_text(size = my_als) - , legend.text = element_text(size = 10) - , legend.title = element_text(size = my_als) -# , legend.position = c(0.3, 0.8) -# , legend.key.height = unit(1, 'mm') - ) - -print(printFile) -dev.off() -#=================================================== - -# COMPARING DISTRIBUTIONS -head(df$lineage) -df$lineage = as.character(df$lineage) - -lin1 = df[df$lineage == "lineage1",]$ratioPredAff -lin2 = df[df$lineage == "lineage2",]$ratioPredAff -lin3 = df[df$lineage == "lineage3",]$ratioPredAff -lin4 = df[df$lineage == "lineage4",]$ratioPredAff - -# ks test -ks.test(lin1,lin2) -ks.test(lin1,lin3) -ks.test(lin1,lin4) - -ks.test(lin2,lin3) -ks.test(lin2,lin4) - -ks.test(lin3,lin4) - - - diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R deleted file mode 100644 index 703a206..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R +++ /dev/null @@ -1,229 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages # -######################################################################## - -source("../Header_TT.R") -#source("../barplot_colour_function.R") -#require(data.table) - -######################################################################## -# Read file: call script for combining df for PS # -######################################################################## - -source("../combining_two_df.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION - 
-#========================== -# This will return: - -# df with NA for pyrazinamide: -# merged_df2 -# merged_df3 - -# df without NA for pyrazinamide: -# merged_df2_comp -# merged_df3_comp -#=========================== - -########################### -# Data for plots -# you need merged_df2 or merged_df2_comp -# since this is one-many relationship -# i.e the same SNP can belong to multiple lineages -# using the _comp dataset means -# we lose some muts and at this level, we should use -# as much info as available, hence use df with NA -########################### - -# uncomment as necessary -#%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -my_df = merged_df2 -#my_df = merged_df2_comp -#%%%%%%%%%%%%%%%%%%%%%%%% - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -# Ensure correct data type in columns to plot: need to be factor -is.factor(my_df$lineage) -my_df$lineage = as.factor(my_df$lineage) -is.factor(my_df$lineage) - -table(my_df$mutation_info); str(my_df$mutation_info) - -# subset df with dr muts only -my_df_dr = subset(my_df, mutation_info == "dr_mutations_pyrazinamide") -table(my_df_dr$mutation_info) - -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -#========================== -# Run two times: -# uncomment as necessary -# 1) for all muts -# 2) for dr_muts -#=========================== - -#%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT - -#================ -# for ALL muts -#================ -plot_df = my_df -my_plot_name = 'lineage_dist_PS.svg' -#my_plot_name = 'lineage_dist_PS_comp.svg' - -#================ -# for dr muts ONLY -#================ -#plot_df = my_df_dr -#my_plot_name = 'lineage_dist_dr_PS.svg' -#my_plot_name = 'lineage_dist_dr_PS_comp.svg' - -#%%%%%%%%%%%%%%%%%%%%%%%% - -#========================== -# Plot: Lineage Distribution -# x = mcsm_values, y = dist -# fill = stability -#============================ - -#=================== -# Data for plots -#=================== -table(plot_df$lineage); str(plot_df$lineage) - -# subset only lineages1-4 -sel_lineages = c("lineage1" - , "lineage2" - , "lineage3" - , "lineage4") - -# uncomment as necessary -df_lin = subset(plot_df, subset = lineage %in% sel_lineages ) - -# refactor -df_lin$lineage = factor(df_lin$lineage) - -table(df_lin$lineage) #{RESULT: No of samples within lineage} -#lineage1 lineage2 lineage3 lineage4 - -length(unique(df_lin$Mutationinformation)) -#{Result: No. of unique mutations the 4 lineages contribute to} - -# sanity checks -r1 = 2:5 # when merged_df2 used: because there is missing lineages -if(sum(table(plot_df$lineage)[r1]) == nrow(df_lin)) { - print ("sanity check passed: numbers match") -} else{ - print("Error!: check your numbers") -} - -#%%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -df <- df_lin -#%%%%%%%%%%%%%%%%%%%%%%%%% - -rm(df_lin) - -#****************** -# generate distribution plot of lineages -#****************** -# basic: could improve this! -#library(plotly) -#library(ggridges) - -g <- ggplot(df, aes(x = ratioDUET)) + - geom_density(aes(fill = DUET_outcome) - , alpha = 0.5) + facet_wrap(~ lineage, - scales = "free") + - ggtitle("Kernel Density estimates of Protein stability by lineage") - -ggplotly(g) - -# 2 : ggridges (good!) 
-my_ats = 15 # axis text size -my_als = 20 # axis label size - -my_labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4') -names(my_labels) = c('lineage1', 'lineage2', 'lineage3', 'lineage4') - -# set output dir for plots -getwd() -setwd("~/git/Data/pyrazinamide/output/plots") -getwd() - -# check plot name -my_plot_name - -# output svg -svg(my_plot_name) -printFile = ggplot(df, aes(x = ratioDUET - , y = DUET_outcome))+ - - #printFile=geom_density_ridges_gradient( - geom_density_ridges_gradient(aes(fill = ..x..) - , scale = 3 - , size = 0.3 ) + - facet_wrap( ~lineage - , scales = "free" -# , switch = 'x' - , labeller = labeller(lineage = my_labels) ) + - coord_cartesian( xlim = c(-1, 1) -# , ylim = c(0, 6) -# , clip = "off" -) + - scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4") - , name = "DUET" ) + - theme(axis.text.x = element_text(size = my_ats - , angle = 90 - , hjust = 1 - , vjust = 0.4) -# , axis.text.y = element_text(size = my_ats -# , angle = 0 -# , hjust = 1 -# , vjust = 0) - , axis.text.y = element_blank() - , axis.title.x = element_blank() - , axis.title.y = element_blank() - , axis.ticks.y = element_blank() - , plot.title = element_blank() - , strip.text = element_text(size = my_als) - , legend.text = element_text(size = 10) - , legend.title = element_text(size = my_als) -# , legend.position = c(0.3, 0.8) -# , legend.key.height = unit(1, 'mm') - ) - -print(printFile) -dev.off() - -#=!=!=!=!=!=!=! -# COMMENT: Not much differences in the distributions -# when using merged_df2 or merged_df2_comp. -# Also, the lineage differences disappear when looking at all muts -# The pattern we are interested in is possibly only for dr_mutations -#=!=!=!=!=!=!=! -#=================================================== - -# COMPARING DISTRIBUTIONS: KS test -# run: "../KS_test_PS.R" - - - diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/logolas_logoplot.R b/mcsm_analysis/pyrazinamide/scripts/plotting/logolas_logoplot.R deleted file mode 100644 index f60fb0b..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/logolas_logoplot.R +++ /dev/null @@ -1,250 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting/") -getwd() - -######################################################################## -# Installing and loading required packages # -######################################################################## - -source("../Header_TT.R") - -#source("barplot_colour_function.R") - -library(ggseqlogo) - -#======= -# input -#======= -############# -# msa file: output of generate_mut_sequences.py -############# -homedir = '~' -indir = 'git/Data/pyrazinamide/output' -in_filename = "gene_msa.txt" -infile = paste0(homedir, '/', indir,'/', in_filename) -print(infile) - -#======= -# input -#======= -############# -# combined dfs -############# -source("../combining_two_df.R") - -########################### -# Data for Logo plots -# you need big df i.e -# merged_df2 -# or -# merged_df2_comp -# since these have unique SNPs -# I prefer to use the merged_df2 -# because using the _comp dataset means -# we lose some muts and at this level, we should use -# as much info as available -########################### - -# uncomment as necessary -#%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -my_df = merged_df2 -#my_df = merged_df2_comp -#%%%%%%%%%%%%%%%%%%%%%%%% - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -# doesn't work if you use the big df as it has 
duplicate snps -#rownames(my_df) = my_df$Mutationinformation - -# sanity check: should be True -table(my_df$position == my_df$Position) - -c1 = unique(my_df$Position) # 130 -nrow(my_df) # 3092 - -#FIXME -#!!! RESOLVE !!! -# get freq count of positions and add to the df -setDT(my_df)[, occurrence_sample := .N, by = .(id)] -table(my_df$occurrence_sample) - - -my_df2 = my_df %>% - select(id, Mutationinformation, Wild_type, WildPos, position, Mutant_type, occurrence, occurrence_sample) - -write.csv(my_df2, "my_df2.csv") - -# extract freq_pos>1 since this will not add to much in the logo plot -# pos 5 has one mutation but coming from atleast 5 samples? -table(my_df$occurrence) -foo = my_df[my_df$occurrence ==1,] - -# uncomment as necessary -my_data_snp = my_df #3092 - -#!!! RESOLVE -# FIXME -my_data_snp = my_df[my_df$occurrence!=1,] #3072, 36...3019 - -u = unique(my_data_snp$Position) #96 - - -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -######################################################### -# Task: To generate a logo plot or bar plot but coloured -# aa properties. -# step1: read mcsm file and OR file -# step2: plot wild type positions -# step3: plot mutants per position coloured by aa properties -# step4: make the size of the letters/bars prop to OR if you can! -######################################################### -##useful links -#https://stackoverflow.com/questions/5438474/plotting-a-sequence-logo-using-ggplot2 -#https://omarwagih.github.io/ggseqlogo/ -#https://kkdey.github.io/Logolas-pages/workflow.html -#A new sequence logo plot to highlight enrichment and depletion. -# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6288878/ - -##very good: http://www.cbs.dtu.dk/biotools/Seq2Logo-2.0/ - -#============== -# matrix for mutant type -# frequency of mutant type by position -#============== -table(my_data_snp$Mutant_type, my_data_snp$Position) -tab_mt = table(my_data_snp$Mutant_type, my_data_snp$Position) -class(tab_mt) -# unclass to convert to matrix -tab_mt = unclass(tab_mt) -tab_mt = as.matrix(tab_mt, rownames = T) - -# should be TRUE -is.matrix(tab_mt) - -rownames(tab_mt) #aa -colnames(tab_mt) #pos - -#********************** -# Plot 1: mutant logo -#********************** -my_ymax = max(my_data_snp$occurrence); my_ymax -my_ylim = c(0,my_ymax) # very important - -# axis sizes -# common: text and label -my_ats = 15 -my_als = 20 - -# individual: text and label -my_xats = 15 -my_yats = 20 -my_xals = 15 -my_yals = 20 - -# legend size: text and label -my_lts = 20 -#my_lls = 20 - -# Color scheme based on chemistry of amino acids -chemistry = data.frame( - letter = c('G', 'S', 'T', 'Y', 'C', 'N', 'Q', 'K', 'R', 'H', 'D', 'E', 'P', 'A', 'W', 'F', 'L', 'I', 'M', 'V'), - group = c(rep('Polar', 5), rep('Neutral', 2), rep('Basic', 3), rep('Acidic', 2), rep('Hydrophobic', 8)), - col = c(rep('#109648', 5), rep('#5E239D', 2), rep('#255C99', 3), rep('#D62839', 2), rep('#221E22', 8)), - stringsAsFactors = F -) - -# uncomment as necessary -my_type = "EDLogo" -#my_type = "Logo" - -logomaker(tab_mt - , type = my_type - , return_heights = T -# , color_type = "per_row" -# , colors = chemistry$col -# , method = 'custom' -# , seq_type = 'aa' -# , col_scheme = "taylor" -# , col_scheme = "chemistry2" -) + -theme(legend.position = "bottom" - , legend.title = element_blank() - , legend.text = element_text(size = my_lts ) - , axis.text.x = element_text(size = 
my_ats , angle = 90) - , axis.text.y = element_text(size = my_ats , angle = 90)) - -p0 = logomaker(tab_mt - , type = my_type - , return_heights = T - , color_type = "per_row" - , colors = chemistry$col -# , seq_type = 'aa' -# , col_scheme = "taylor" -# , col_scheme = "chemistry2" -) + - #ylab('my custom height') + - theme(axis.text.x = element_blank()) + -# theme_logo()+ - # scale_x_continuous(breaks=1:51, parse (text = colnames(tab)) ) - scale_x_continuous(breaks = 1:ncol(tab_mt) - , labels = colnames(tab_mt))+ - scale_y_continuous( breaks = 1:my_ymax - , limits = my_ylim) - -p0 - -# further customisation -p1 = p0 + theme(legend.position = "bottom" - , legend.title = element_blank() - , legend.text = element_text(size = my_lts) - , axis.text.x = element_text(size = my_ats , angle = 90) - , axis.text.y = element_text(size = my_ats , angle = 90)) -p1 - -#======= -# input -#======= -############# -# msa file: output of generate_mut_sequences.py -############# -homedir = '~' -indir = 'git/Data/pyrazinamide/output' -in_filename = "gene_msa.txt" -infile = paste0(homedir, '/', indir,'/', in_filename) -print(infile) - -############## -# ggseqlogo: custom matrix of my data -############## -snps = read.csv(infile - , stringsAsFactors = F - , header = F) #3072, - -class(snps); str(snps) # df and chr - -# turn to a character vector -snps2 = as.character(snps[1:nrow(snps),]) - -class(snps2); str(snps2) #character, chr - -# plot -logomaker(snps2, type = my_type - , color_type = "per_row") + - theme(axis.text.x = element_blank()) + - theme_logo()+ - # scale_x_continuous(breaks=1:51, parse (text = colnames(tab)) ) - scale_x_continuous(breaks = 1:ncol(tab_mt) - , labels = colnames(tab_mt))+ - scale_y_continuous( breaks = 0:5 - , limits = my_ylim) - - diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/snp_logo_plot.R b/mcsm_analysis/pyrazinamide/scripts/plotting/snp_logo_plot.R deleted file mode 100644 index 80f1971..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/snp_logo_plot.R +++ /dev/null @@ -1,273 +0,0 @@ -getwd() -setwd("~/git//LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -# TASK: Multiple mutations per site -# as aa symbol coloured by aa property - -######################################################################## -# Installing and loading required packages # -######################################################################## - -#source("../Header_TT.R") - -#source("barplot_colour_function.R") - -library(ggseqlogo) - -######################################################################## -# Read file: call script for combining df for lig # -######################################################################## - -source("../combining_two_df.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "/home/tanu/git/LSHTM_Y1_PNCA/mcsm_analysis/pyrazinamide/Scripts" -#---------------------- PAY ATTENTION - -#========================== -# This will return: - -#merged_df2 # 3092, 35 -#merged_df2_comp #3012, 35 - -#merged_df3 #335, 35 -#merged_df3_comp #293, 35 -#========================== - -########################### -# Data for Logo plots -# you need small df i.e -# merged_df3 -# or -# merged_df3_comp? 
possibly -# since these have unique SNPs -# I prefer to use the merged_df3 -# because using the _comp dataset means -# we lose some muts and at this level, we should use -# as much info as available -########################### - -# uncomment as necessary -#%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -my_df = merged_df3 # to show multiple mutations per site -my_df = read.csv(file.choose()) -#%%%%%%%%%%%%%%%%%%%%%%%% - -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -colnames(my_df) -str(my_df) - -rownames(my_df) = my_df$Mutationinformation - -c1 = unique(my_df$Position) #130 -nrow(my_df) #335 - -table(my_df$occurrence) -#1 2 3 4 5 6 -#34 76 63 104 40 18 - -# get freq count of positions so you can subset freq<1 -#: already done in teh combining script -#require(data.table) -#setDT(my_df)[, occurrence := .N, by = .(Position)] #189, 36 - -table(my_df$Position); table(my_df$occurrence) - -# extract freq_pos>1 -my_data_snp = my_df[my_df$occurrence!=1,] #301, 36 -u_pos = unique(my_data_snp$Position) #96 - -# sanity check -exp_dim = nrow(my_df) - table(my_df$occurrence)[[1]]; exp_dim -if ( nrow(my_data_snp) == exp_dim ){ - print("Sanity check passed: Data filtered correctly, dim match") -} else { - print("Error: Please Debug") -} - -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -######################################################### -# Task: To generate a logo plot or bar plot but coloured -# aa properties. -# step1: read data file -# step2: plot wild type positions -# step3: plot mutants per position coloured by aa properties -# step4: make the size of the letters/bars prop to OR if you can! -######################################################### -# useful links -# https://stackoverflow.com/questions/5438474/plotting-a-sequence-logo-using-ggplot2 -# https://omarwagih.github.io/ggseqlogo/ -# https://kkdey.github.io/Logolas-pages/workflow.html -# A new sequence logo plot to highlight enrichment and depletion. 
-# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6288878/ -# very good: http://www.cbs.dtu.dk/biotools/Seq2Logo-2.0/ - - -############# -# PLOTS: Bar plot with aa properties -# using gglogo -# useful links: https://stackoverflow.com/questions/5438474/plotting-a-sequence-logo-using-ggplot2 -############# - -############## -# ggseqlogo: custom matrix of my data -############## - -#============== -# matrix for mutant type -# frequency of mutant type by position -#============== -table(my_data_snp$Mutant_type, my_data_snp$Position) -tab_mt = table(my_data_snp$Mutant_type, my_data_snp$Position) -class(tab_mt) - -# unclass to convert to matrix -tab_mt = unclass(tab_mt) -tab_mt = as.matrix(tab_mt, rownames = T) - -# should be TRUE -is.matrix(tab_mt) - -rownames(tab_mt) #aa -colnames(tab_mt) #pos - -#============== -# matrix for wild type -# frequency of wild type by position -#============== -# remove wt duplicates -wt = my_data_snp[, c("Position", "Wild_type")] #301, 2 -wt = wt[!duplicated(wt),]#96, 2 - -table(wt$Wild_type) # contains duplicates - -tab_wt = table(wt$Wild_type, wt$Position); tab_wt # should all be 1 - -tab_wt = unclass(tab_wt) #important -class(tab_wt); rownames(tab_wt) -#tab_wt = as.matrix(tab_wt, rownames = T) - -rownames(tab_wt) -rownames(tab_mt) - -# sanity check -if (ncol(tab_wt) == length(u_pos) ){ - print("Sanity check passed: wt data dim match") -} else { - print("Error: Please debug") -} - -#************** -# Plot 1: mutant logo -#************** -#install.packages("digest") -#library(digest) -# following example -require(ggplot2) -require(reshape2) -library(gglogo) -library(ggrepel) -library(ggseqlogo) - -# generate seq logo for mutant type -my_ymax = max(my_data_snp$occurrence); my_ymax -my_ylim = c(0, my_ymax) -#my_yrange = 1:my_ymax; my_yrange - -# axis sizes -# common: text and label -my_ats = 15 -my_als = 20 - -# individual: text and label -my_xats = 15 -my_yats = 20 -my_xals = 15 -my_yals = 20 - -# legend size: text and label -my_lts = 20 -#my_lls = 20 - -p0 = ggseqlogo(tab_mt - , method = 'custom' - , seq_type = 'aa' -# , col_scheme = "taylor" -# , col_scheme = "chemistry2" -) + -# ylab('my custom height') + - theme(axis.text.x = element_blank()) + - theme_logo()+ -# scale_x_continuous(breaks=1:51, parse (text = colnames(tab_mt)) ) - scale_x_continuous(breaks = 1:ncol(tab_mt) - , labels = colnames(tab_mt))+ - scale_y_continuous( breaks = 1:my_ymax - , limits = my_ylim) - -p0 - -# further customisation -p1 = p0 + theme(legend.position = "none" - , legend.title = element_blank() - , legend.text = element_text(size = my_lts) - , axis.text.x = element_text(size = my_xats, angle = 90) - , axis.text.y = element_text(size = my_yats, angle = 90)) -p1 - -#************** -# Plot 2: for wild_type -# with custom x axis to reflect my aa positions -#************** -# sanity check: MUST BE TRUE -# for the correctnes of the x axis -identical(colnames(tab_mt), colnames(tab_wt)) -identical(ncol(tab_mt), ncol(tab_wt)) - -p2 = ggseqlogo(tab_wt - , method = 'custom' - , seq_type = 'aa' -# , col_scheme = "taylor" -# , col_scheme = chemistry2 -) + -# ylab('my custom height') + - theme(axis.text.x = element_blank() - , axis.text.y = element_blank()) + - theme_logo() + - scale_x_continuous(breaks = 1:ncol(tab_wt) - , labels = colnames(tab_wt)) + - scale_y_continuous( breaks = 0:1 - , limits = my_ylim ) - -p2 - -# further customise - -p3 = p2 + - theme(legend.position = "bottom" - , legend.text = element_text(size = my_lts) - , axis.text.x = element_text(size = my_ats - , angle = 90) - , 
axis.text.y = element_blank()) - -p3 - - -# Now combine using cowplot, which ensures the plots are aligned -suppressMessages( require(cowplot) ) - -plot_grid(p1, p3, ncol = 1, align = 'v') #+ -# background_grid(minor = "xy" -# , size.minor = 1 -# , colour.minor = "grey86") - - -#colour scheme -#https://rdrr.io/cran/ggseqlogo/src/R/col_schemes.r - diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/subcols_axis_LIG.R b/mcsm_analysis/pyrazinamide/scripts/plotting/subcols_axis_LIG.R deleted file mode 100644 index 2049c3e..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/subcols_axis_LIG.R +++ /dev/null @@ -1,208 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -############################################################ -# 1: Installing and loading required packages and functions -############################################################ - -#source("../Header_TT.R") -#source("../barplot_colour_function.R") -#library(tidyverse) - -########################### -#2: Read file: normalised file, output of step 4 mcsm pipeline -########################### -#my_df <- read.csv("../../Data/mcsm_complex1_normalised.csv" -# , row.names = 1 -# , stringsAsFactors = F -# , header = T) - -# call script combining_df -source("../combining_two_df_lig.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -# from Plotting to Scripts" -#---------------------- PAY ATTENTION - -#========================== -# This will return: - -# df with NA for pyrazinamide: -#merged_df2 -#merged_df2_comp - -# df without NA for pyrazinamide: -#merged_df3 -#merged_df3_comp -#========================== -########################### -# Data to choose: -# We will be using the small dfs -# to generate the coloured axis -########################### - -# uncomment as necessary -#!!!!!!!!!!!!!!!!!!!!!!! -# REASSIGNMENT -my_df = merged_df3 -#my_df = merged_df3_comp -#!!!!!!!!!!!!!!!!!!!!!!! 
- -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -str(my_df) -my_df$Position -c1 = my_df[my_df$Mutationinformation == "A134V",] - -# order my_df by Position -my_df_o = my_df[order(my_df$Position),] -head(my_df_o$Position); tail(my_df_o$Position) - -c2 = my_df_o[my_df_o$Mutationinformation == "A134V",] - -# sanity check -if (sum(table(c1 == c2)) == ncol(my_df)){ - print ("Sanity check passsd") -}else{ - print ("Error!: Please debug your code") -} - -rm(my_df, c1, c2) - -# create a new df with unique position numbers and cols -Position = unique(my_df_o$Position) -Position_cols = as.data.frame(Position) - -head(Position_cols) ; tail(Position_cols) - -# specify active site residues and bg colour -Position = c(49, 51, 57, 71 - , 8, 96, 138 - , 13, 68 - , 103, 137 - , 133, 134) #13 - -lab_bg = rep(c("purple" - , "yellow" - , "cornflowerblue" - , "blue" - , "green"), times = c(4, 3, 2, 2, 2) -) - -# second bg colour for active site residues -#lab_bg2 = rep(c("white" -# , "green" , "white", "green" -# , "white" -# , "white" -# , "white"), times = c(4 -# , 1, 1, 1 -# , 2 -# , 2 -# , 2) -#) - -#%%%%%%%%% -# revised: leave the second box coloured as the first one incase there is no second colour -#%%%%%%%%% -lab_bg2 = rep(c("purple" - , "green", "yellow", "green" - , "cornflowerblue" - , "blue" - , "green"), times = c(4 - , 1, 1, 1 - , 2 - , 2 - , 2)) - -# fg colour for labels for active site residues -lab_fg = rep(c("white" - , "black" - , "black" - , "white" - , "black"), times = c(4, 3, 2, 2, 2)) - -#%%%%%%%%% -# revised: make the purple ones black -# fg colour for labels for active site residues -#%%%%%%%%% -#lab_fg = rep(c("black" -# , "black" -# , "black" -# , "white" -# , "black"), times = c(4, 3, 2, 2, 2)) - -# combined df with active sites, bg and fg colours -aa_cols_ref = data.frame(Position - , lab_bg - , lab_bg2 - , lab_fg - , stringsAsFactors = F) #13, 4 - -str(Position_cols); class(Position_cols) -str(aa_cols_ref); class(aa_cols_ref) - -# since Position is int and numeric in the two dfs resp, -# converting numeric to int for consistency -aa_cols_ref$Position = as.integer(aa_cols_ref$Position) -class(aa_cols_ref$Position) - -#=========== -# Merge 1: merging Positions df (Position_cols) and -# active site cols (aa_cols_ref) -# linking column: "Position" -# This is so you can have colours defined for all 130 positions -#=========== -head(Position_cols$Position); head(aa_cols_ref$Position) - -mut_pos_cols = merge(Position_cols, aa_cols_ref - , by = "Position" - , all.x = TRUE) - -head(mut_pos_cols) -# replace NA's -# :column "lab_bg" with "white" -# : column "lab_fg" with "black" -mut_pos_cols$lab_bg[is.na(mut_pos_cols$lab_bg)] <- "white" -mut_pos_cols$lab_bg2[is.na(mut_pos_cols$lab_bg2)] <- "white" -mut_pos_cols$lab_fg[is.na(mut_pos_cols$lab_fg)] <- "black" -head(mut_pos_cols) - -#=========== -# Merge 2: Merge mut_pos_cols with mcsm df -# Now combined the 130 positions with aa colours with -# the mcsm_data -#=========== -# dfs to merge -df0 = my_df_o -df1 = mut_pos_cols - -# check the column on which merge will be performed -head(df0$Position); tail(df0$Position) -head(df1$Position); tail(df1$Position) - -# should now have 3 extra columns -my_df = merge(df0, df1 - , by = "Position" - , all.x = TRUE) - -# sanity check -my_df[my_df$Position == "49",] -my_df[my_df$Position == "13",] - -my_df$Position - -# clear variables -rm(aa_cols_ref - , df0 - , df1 - , my_df_o - , Position_cols - , lab_bg - , lab_bg2 - , lab_fg - , Position - ) - diff 
--git a/mcsm_analysis/pyrazinamide/scripts/plotting/subcols_axis_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/subcols_axis_PS.R deleted file mode 100644 index 37dfe32..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/subcols_axis_PS.R +++ /dev/null @@ -1,208 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -############################################################ -# 1: Installing and loading required packages and functions -############################################################ - -#source("../Header_TT.R") -#source("../barplot_colour_function.R") -#library(tidyverse) - -########################### -#2: Read file: normalised file, output of step 4 mcsm pipeline -########################### -#my_df <- read.csv("../../Data/mcsm_complex1_normalised.csv" -# , row.names = 1 -# , stringsAsFactors = F -# , header = T) - -# call script combining_df -source("../combining_two_df.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -# from Plotting to Scripts" -#---------------------- PAY ATTENTION - -#========================== -# This will return: - -# df with NA for pyrazinamide: -#merged_df2 -#merged_df2_comp - -# df without NA for pyrazinamide: -#merged_df3 -#merged_df3_comp -#========================== -########################### -# Data to choose: -# We will be using the small dfs -# to generate the coloured axis -########################### - -# uncomment as necessary -#!!!!!!!!!!!!!!!!!!!!!!! -# REASSIGNMENT -my_df = merged_df3 -#my_df = merged_df3_comp -#!!!!!!!!!!!!!!!!!!!!!!! - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -str(my_df) -my_df$Position -c1 = my_df[my_df$Mutationinformation == "L4S",] - -# order my_df by Position -my_df_o = my_df[order(my_df$Position),] -head(my_df_o$Position); tail(my_df_o$Position) - -c2 = my_df_o[my_df_o$Mutationinformation == "L4S",] - -# sanity check -if (sum(table(c1 == c2)) == ncol(my_df)){ - print ("Sanity check passsd") -}else{ - print ("Error!: Please debug your code") -} - -rm(my_df, c1, c2) - -# create a new df with unique position numbers and cols -Position = unique(my_df_o$Position) #130 -Position_cols = as.data.frame(Position) - -head(Position_cols) ; tail(Position_cols) - -# specify active site residues and bg colour -Position = c(49, 51, 57, 71 - , 8, 96, 138 - , 13, 68 - , 103, 137 - , 133, 134) #13 - -lab_bg = rep(c("purple" - , "yellow" - , "cornflowerblue" - , "blue" - , "green"), times = c(4, 3, 2, 2, 2) -) - -# second bg colour for active site residues -#lab_bg2 = rep(c("white" -# , "green" , "white", "green" -# , "white" -# , "white" -# , "white"), times = c(4 -# , 1, 1, 1 -# , 2 -# , 2 -# , 2) -#) - -#%%%%%%%%% -# revised: leave the second box coloured as the first one incase there is no second colour -#%%%%%%%%% -lab_bg2 = rep(c("purple" - , "green", "yellow", "green" - , "cornflowerblue" - , "blue" - , "green"), times = c(4 - , 1, 1, 1 - , 2 - , 2 - , 2)) - -# fg colour for labels for active site residues -lab_fg = rep(c("white" - , "black" - , "black" - , "white" - , "black"), times = c(4, 3, 2, 2, 2)) - -#%%%%%%%%% -# revised: make the purple ones black -# fg colour for labels for active site residues -#%%%%%%%%% -#lab_fg = rep(c("black" -# , "black" -# , "black" -# , "white" -# , "black"), times = c(4, 3, 2, 2, 2)) - -# combined df with active sites, bg and fg colours -aa_cols_ref = data.frame(Position - , lab_bg - , lab_bg2 - , lab_fg - , stringsAsFactors = F) #13, 4 - 
-str(Position_cols); class(Position_cols) -str(aa_cols_ref); class(aa_cols_ref) - -# since Position is int and numeric in the two dfs resp, -# converting numeric to int for consistency -aa_cols_ref$Position = as.integer(aa_cols_ref$Position) -class(aa_cols_ref$Position) - -#=========== -# Merge 1: merging Positions df (Position_cols) and -# active site cols (aa_cols_ref) -# linking column: "Position" -# This is so you can have colours defined for all 130 positions -#=========== -head(Position_cols$Position); head(aa_cols_ref$Position) - -mut_pos_cols = merge(Position_cols, aa_cols_ref - , by = "Position" - , all.x = TRUE) - -head(mut_pos_cols) -# replace NA's -# :column "lab_bg" with "white" -# : column "lab_fg" with "black" -mut_pos_cols$lab_bg[is.na(mut_pos_cols$lab_bg)] <- "white" -mut_pos_cols$lab_bg2[is.na(mut_pos_cols$lab_bg2)] <- "white" -mut_pos_cols$lab_fg[is.na(mut_pos_cols$lab_fg)] <- "black" -head(mut_pos_cols) - -#=========== -# Merge 2: Merge mut_pos_cols with mcsm df -# Now combined the 130 positions with aa colours with -# the mcsm_data -#=========== -# dfs to merge -df0 = my_df_o -df1 = mut_pos_cols - -# check the column on which merge will be performed -head(df0$Position); tail(df0$Position) -head(df1$Position); tail(df1$Position) - -# should now have 3 extra columns -my_df = merge(df0, df1 - , by = "Position" - , all.x = TRUE) - -# sanity check -my_df[my_df$Position == "49",] -my_df[my_df$Position == "13",] - -my_df$Position - -# clear variables -rm(aa_cols_ref - , df0 - , df1 - , my_df_o - , Position_cols - , lab_bg - , lab_bg2 - , lab_fg - , Position - ) - diff --git a/mcsm_analysis/pyrazinamide/scripts/read_pdb.R b/mcsm_analysis/pyrazinamide/scripts/read_pdb.R deleted file mode 100644 index 41ca884..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/read_pdb.R +++ /dev/null @@ -1,27 +0,0 @@ -######################### -#3: Read complex pdb file -########################## -source("Header_TT.R") -# This script only reads the pdb file of your complex - -# read in pdb file complex1 -inDir = "~/git/Data/pyrazinamide/input/structure/" -inFile = paste0(inDir, "complex1_no_water.pdb") -complex1 = inFile - -#inFile2 = paste0(inDir, "complex2_no_water.pdb") -#complex2 = inFile2 - -# list of 8 -my_pdb = read.pdb(complex1 - , maxlines = -1 - , multi = FALSE - , rm.insert = FALSE - , rm.alt = TRUE - , ATOM.only = FALSE - , hex = FALSE - , verbose = TRUE) - -rm(inDir, inFile, complex1) -#====== end of script - diff --git a/mcsm_analysis/pyrazinamide/scripts/replaceBfactor_pdb.R b/mcsm_analysis/pyrazinamide/scripts/replaceBfactor_pdb.R deleted file mode 100644 index 658eec4..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/replaceBfactor_pdb.R +++ /dev/null @@ -1,386 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts") -getwd() - -######################################################################## -# Installing and loading required packages # -######################################################################## - -source("Header_TT.R") - -######################################################### -# TASK: replace B-factors in the pdb file with normalised values -# use the complex file with no water as mCSM lig was -# performed on this file. You can check it in the script: read_pdb file. 
-######################################################### - -########################### -# 2: Read file: average stability values -# or mcsm_normalised file, output of step 4 mcsm pipeline -########################### - -inDir = "~/git/Data/pyrazinamide/input/processed/" -inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile - -my_df <- read.csv(inFile -# , row.names = 1 -# , stringsAsFactors = F - , header = T) -str(my_df) - -#========================================================= -# Processing P1: Replacing B factor with mean ratioDUET scores -#========================================================= - -######################### -# Read complex pdb file -# form the R script -########################## - -source("read_pdb.R") # list of 8 - -# extract atom list into a variable -# since in the list this corresponds to data frame, variable will be a df -d = my_pdb[[1]] - -# make a copy: required for downstream sanity checks -d2 = d - -# sanity checks: B factor -max(d$b); min(d$b) - -#******************************************* -# plot histograms for inspection -# 1: original B-factors -# 2: original DUET Scores -# 3: replaced B-factors with DUET Scores -#********************************************* -# Set the margin on all sides -par(oma = c(3,2,3,0) - , mar = c(1,3,5,2) - , mfrow = c(3,2)) -#par(mfrow = c(3,2)) - - #1: Original B-factor -hist(d$b - , xlab = "" - , main = "B-factor") - -plot(density(d$b) - , xlab = "" - , main = "B-factor") - -# 2: DUET scores -hist(my_df$average_DUETR - , xlab = "" - , main = "Norm_DUET") - -plot(density(my_df$average_DUETR) - , xlab = "" - , main = "Norm_DUET") - -# 3: After the following replacement -#******************************** - -#========= -# step 0_P1: DONT RUN once you have double checked the matched output -#========= -# sanity check: match and assign to a separate column to double check -# colnames(my_df) -# d$ratioDUET = my_df$averge_DUETR[match(d$resno, my_df$Position)] - -#========= -# step 1_P1 -#========= -# Be brave and replace in place now (don't run sanity check) -# this makes all the B-factor values in the non-matched positions as NA -d$b = my_df$average_DUETR[match(d$resno, my_df$Position)] - -#========= -# step 2_P1 -#========= -# count NA in Bfactor -b_na = sum(is.na(d$b)) ; b_na - -# count number of 0's in Bactor -sum(d$b == 0) -#table(d$b) - -# replace all NA in b factor with 0 -d$b[is.na(d$b)] = 0 - -# sanity check: should be 0 -sum(is.na(d$b)) - -# sanity check: should be True -if (sum(d$b == 0) == b_na){ - print ("Sanity check passed: NA's replaced with 0's successfully") -} else { - print("Error: NA replacement NOT successful, Debug code!") -} - -max(d$b); min(d$b) - -# sanity checks: should be True -if(max(d$b) == max(my_df$average_DUETR)){ - print("Sanity check passed: B-factors replaced correctly") -} else { - print ("Error: Debug code please") -} - -if (min(d$b) == min(my_df$average_DUETR)){ - print("Sanity check passed: B-factors replaced correctly") -} else { - print ("Error: Debug code please") -} - -#========= -# step 3_P1 -#========= -# sanity check: dim should be same before reassignment -# should be TRUE -dim(d) == dim(d2) - -#========= -# step 4_P1 -#========= -# assign it back to the pdb file -my_pdb[[1]] = d - -max(d$b); min(d$b) - -#========= -# step 5_P1 -#========= -# output dir -getwd() -outDir = "~/git/Data/pyrazinamide/input/structure/" - -outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile -write.pdb(my_pdb, outFile) - -#******************************** -# Add the 3rd histogram and 
density plots for comparisons -#******************************** -# Plots continued... -# 3: hist and density of replaced B-factors with DUET Scores -hist(d$b - , xlab = "" - , main = "repalced-B") - -plot(density(d$b) - , xlab = "" - , main = "replaced-B") - -# graph titles -mtext(text = "Frequency" - , side = 2 - , line = 0 - , outer = TRUE) - -mtext(text = "DUET_stability" - , side = 3 - , line = 0 - , outer = TRUE) -#******************************** - -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -# NOTE: This replaced B-factor distribution has the same -# x-axis as the PredAff normalised values, but the distribution -# is affected since 0 is overinflated. This is because all the positions -# where there are no SNPs have been assigned 0. -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - - - - - -####################################################################### -#====================== end of section 1 ============================== -####################################################################### - - - - - -#========================================================= -# Processing P2: Replacing B values with PredAff Scores -#========================================================= -# clear workspace -rm(list = ls()) - -########################### -# 2: Read file: average stability values -# or mcsm_normalised file, output of step 4 mcsm pipeline -########################### - -inDir = "~/git/Data/pyrazinamide/input/processed/" -inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile - -my_df <- read.csv(inFile -# , row.names = 1 -# , stringsAsFactors = F - , header = T) -str(my_df) -#rm(inDir, inFile) - -######################### -# 3: Read complex pdb file -# form the R script -########################## - -source("read_pdb.R") # list of 8 - -# extract atom list into a variable -# since in the list this corresponds to data frame, variable will be a df -d = my_pdb[[1]] - -# make a copy: required for downstream sanity checks -d2 = d - -# sanity checks: B factor -max(d$b); min(d$b) - -#******************************************* -# plot histograms for inspection -# 1: original B-factors -# 2: original Pred Aff Scores -# 3: replaced B-factors with PredAff Scores -#******************************************** -# Set the margin on all sides -par(oma = c(3,2,3,0) - , mar = c(1,3,5,2) - , mfrow = c(3,2)) -#par(mfrow = c(3,2)) - -# 1: Original B-factor -hist(d$b - , xlab = "" - , main = "B-factor") - -plot(density(d$b) - , xlab = "" - , main = "B-factor") - -# 2: Pred Aff scores -hist(my_df$average_PredAffR - , xlab = "" - , main = "Norm_lig_average") - -plot(density(my_df$average_PredAffR) - , xlab = "" - , main = "Norm_lig_average") - -# 3: After the following replacement -#******************************** - -#================================================= -# Processing P2: Replacing B values with ratioPredAff scores -#================================================= -# use match to perform this replacement linking with "position no" -# in the pdb file, this corresponds to column "resno" -# in my_df, this corresponds to column "Position" - -#========= -# step 0_P2: DONT RUN once you have double checked the matched output -#========= -# sanity check: match and assign to a separate column to double check -# colnames(my_df) -# d$ratioPredAff = my_df$average_PredAffR[match(d$resno, my_df$Position)] #1384, 17 - -#========= -# step 1_P2: BE BRAVE and replace in place now (don't run step 0) -#========= -# this makes all the B-factor 
values in the non-matched positions as NA -d$b = my_df$average_PredAffR[match(d$resno, my_df$Position)] - -#========= -# step 2_P2 -#========= -# count NA in Bfactor -b_na = sum(is.na(d$b)) ; b_na - -# count number of 0's in Bactor -sum(d$b == 0) -#table(d$b) - -# replace all NA in b factor with 0 -d$b[is.na(d$b)] = 0 - -# sanity check: should be 0 -sum(is.na(d$b)) - -if (sum(d$b == 0) == b_na){ - print ("Sanity check passed: NA's replaced with 0's successfully") -} else { - print("Error: NA replacement NOT successful, Debug code!") -} - -max(d$b); min(d$b) - -# sanity checks: should be True -if (max(d$b) == max(my_df$average_PredAffR)){ - print("Sanity check passed: B-factors replaced correctly") -} else { - print ("Error: Debug code please") -} - -if (min(d$b) == min(my_df$average_PredAffR)){ - print("Sanity check passed: B-factors replaced correctly") -} else { - print ("Error: Debug code please") -} - -#========= -# step 3_P2 -#========= -# sanity check: dim should be same before reassignment -# should be TRUE -dim(d) == dim(d2) - -#========= -# step 4_P2 -#========= -# assign it back to the pdb file -my_pdb[[1]] = d - -max(d$b); min(d$b) - -#========= -# step 5_P2 -#========= - -# output dir -outDir = "~/git/Data/pyrazinamide/input/structure/" -outFile = paste0(outDir, "complex1_BwithNormLIG.pdb"); outFile -write.pdb(my_pdb, outFile) - -#******************************** -# Add the 3rd histogram and density plots for comparisons -#******************************** -# Plots continued... -# 3: hist and density of replaced B-factors with PredAff Scores -hist(d$b - , xlab = "" - , main = "repalced-B") - -plot(density(d$b) - , xlab = "" - , main = "replaced-B") - -# graph titles -mtext(text = "Frequency" - , side = 2 - , line = 0 - , outer = TRUE) - -mtext(text = "Lig_stability" - , side = 3 - , line = 0 - , outer = TRUE) - -#******************************** - -########### -# end of output files with Bfactors -########## diff --git a/mcsm_analysis/pyrazinamide/scripts/source_data_checks.R b/mcsm_analysis/pyrazinamide/scripts/source_data_checks.R deleted file mode 100644 index 9f30f28..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/source_data_checks.R +++ /dev/null @@ -1,257 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts") -getwd() - -######################################################### -# 1: Installing and loading required packages # -######################################################### - -source("Header_TT.R") -#source("barplot_colour_function.R") - -########################################################## -# Checking: Entire data frame and for PS # -########################################################## - -########################### -#2) Read file: combined one from the script -########################### -source("combining_two_df.R") - -# df with NA: -# merged_df2 -# merged_df3: - -# df without NA: -# merged_df2_comp: -# merged_df3_comp: - -###################### -# You need to check it -# with the merged_df3 -######################## - -#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -my_df = merged_df3 -#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -#clear variables -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# should be true -identical(my_df$Position, my_df$position) - -################################# -# Read file: normalised file -# output of step 4 mcsm_pipeline -################################# - - -inDir = "~/git/Data/pyrazinamide/input/processed/" -inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile - 
-mcsm_data <- read.csv(inFile - , row.names = 1 - , stringsAsFactors = F - , header = T) -str(mcsm_data) -my_colnames = colnames(mcsm_data) - -#==================================== -# subset my_df to include only the columns in mcsm data -my_df2 = my_df[my_colnames] -#==================================== -# compare the two -head(mcsm_data$Mutationinformation) -head(mcsm_data$Position) - -head(my_df2$Mutationinformation) -head(my_df2$Position) - -# sort mcsm data by Mutationinformation -mcsm_data_s = mcsm_data[order(mcsm_data$Mutationinformation),] -head(mcsm_data_s$Mutationinformation) -head(mcsm_data_s$Position) - -# now compare: should be True, but is false.... -# possibly due to rownames!?! -identical(mcsm_data_s, my_df2) - -# from library dplyr -setdiff(mcsm_data_s, my_df2) - -#from lib compare -compare(mcsm_data_s, my_df2) # seems rownames are the problem - -# FIXME: automate this -# write files: checked using meld and files are indeed identical -#write.csv(mcsm_data_s, "mcsm_data_s.csv", row.names = F) -#write.csv(my_df2, "my_df2.csv", row.names = F) - - -#====================================================== end of section 1 - - - -########################################################## -# Checking: LIG(Filtered dataframe) # -########################################################## - -# clear workspace -rm(list = ls()) - -########################### -#3) Read file: combined_lig from the script -########################### -source("combining_two_df_lig.R") - -# df with NA: -# merged_df2 : -# merged_df3: - -# df without NA: -# merged_df2_comp: -# merged_df3_comp: - -###################### -# You need to check it -# with the merged_df3 -######################## - -#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -my_df = merged_df3 -#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -#clear variables -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# should be true -identical(my_df$Position, my_df$position) - -################################# -# Read file: normalised file -# output of step 4 mcsm_pipeline -################################# - -inDir = "~/git/Data/pyrazinamide/input/processed/" -inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile - -mcsm_data <- read.csv(inFile - , row.names = 1 - , stringsAsFactors = F - , header = T) -str(mcsm_data) - -########################### -# 4a: Filter/subset data: ONLY for LIGand analysis -# Lig plots < 10Ang -# Filter the lig plots for Dis_to_lig < 10Ang -########################### -# sanity checks -upos = unique(mcsm_data$Position) - -# check range of distances -max(mcsm_data$Dis_lig_Ang) -min(mcsm_data$Dis_lig_Ang) - -# Lig filtered: subset data to have only values less than 10 Ang -mcsm_data2 = subset(mcsm_data, mcsm_data$Dis_lig_Ang < 10) - -rm(mcsm_data) #to avoid confusion - -table(mcsm_data2$Dis_lig_Ang<10) -table(mcsm_data2$Dis_lig_Ang>10) - -max(mcsm_data2$Dis_lig_Ang) -min(mcsm_data2$Dis_lig_Ang) - -upos_f = unique(mcsm_data2$Position); upos_f - -# colnames of df that you will need to subset the bigger df from -my_colnames = colnames(mcsm_data2) -#==================================== -# subset bigger df i.e my_df to include only the columns in mcsm data2 -my_df2 = my_df[my_colnames] - -rm(my_df) #to avoid confusion -#==================================== -# compare the two -head(mcsm_data2$Mutationinformation) -head(mcsm_data2$Position) - -head(my_df2$Mutationinformation) -head(my_df2$Position) - -# sort mcsm data by Mutationinformation -mcsm_data2_s = mcsm_data2[order(mcsm_data2$Mutationinformation),] 
-head(mcsm_data2_s$Mutationinformation) -head(mcsm_data2_s$Position) - -# now compare: should be True, but is false.... -# possibly due to rownames!?! -identical(mcsm_data2_s, my_df2) - -# from library dplyr -setdiff(mcsm_data2_s, my_df2) - -# from library compare -compare(mcsm_data2_s, my_df2) # seems rownames are the problem - -#FIXME: automate this -# write files: checked using meld and files are indeed identical -#write.csv(mcsm_data2_s, "mcsm_data2_s.csv", row.names = F) -#write.csv(my_df2, "my_df2.csv", row.names = F) - - -########################################################## -# extract and write output file for SNP posn: all # -########################################################## - -head(merged_df3$Position) - -foo = merged_df3[order(merged_df3$Position),] -head(foo$Position) - -snp_pos_unique = unique(foo$Position); snp_pos_unique - -# sanity check: -table(snp_pos_unique == combined_df$Position) - -#===================== -# write_output files -#===================== -outDir = "~/Data/pyrazinamide/input/processed/" - - -outFile1 = paste0(outDir, "snp_pos_unique.txt"); outFile1 -print(paste0("Output file name and path will be:","", outFile1)) - -write.table(snp_pos_unique - , outFile1 - , row.names = F - , col.names = F) - -############################################################## -# extract and write output file for SNP posn: complete only # -############################################################## -head(merged_df3_comp$Position) - -foo = merged_df3_comp[order(merged_df3_comp$Position),] -head(foo$Position) - -snp_pos_unique = unique(foo$Position); snp_pos_unique - -# outDir = "~/Data/pyrazinamide/input/processed/" # already set - -outFile2 = paste0(outDir, "snp_pos_unique_comp.txt") -print(paste0("Output file name and path will be:", outFile2)) - -write.table(snp_pos_unique - , outFile2 - , row.names = F - , col.names = F) -#============================== end of script - - diff --git a/meta_data_analysis/dssp_df.py b/meta_data_analysis/dssp_df.py index 7b59cfa..5d3dc64 100755 --- a/meta_data_analysis/dssp_df.py +++ b/meta_data_analysis/dssp_df.py @@ -48,9 +48,10 @@ gene = 'pncA' datadir = homedir + '/' + 'git/Data' #======= -# input +# input from outdir #======= -indir = datadir + '/' + drug + '/' + 'output' +#indir = datadir + '/' + drug + '/' + 'output' +outdir = datadir + '/' + drug + '/' + 'output' #in_filename = 'pnca.dssp' in_filename = gene.lower() +'.dssp' infile = indir + '/' + in_filename diff --git a/mk_drug_dirs.sh b/mk_drug_dirs.sh index 6a6dd6d..a336ed3 100755 --- a/mk_drug_dirs.sh +++ b/mk_drug_dirs.sh @@ -4,9 +4,6 @@ ## Structure: # # $DATA_DIR/$DRUG/input -# |- original -# |- processed -# |- structure # # $DATA_DIR/$DRUG/output # |- plots @@ -15,18 +12,17 @@ DATA_DIR=~/git/Data if [[ $1 == '' ]]; then + echo "Error" echo "usage: mk-drug-dirs.sh "; exit; else DRUG=$1 - echo Creating structure for: $DRUG + echo Creating directory structure for: $DRUG if [ -d $DATA_DIR ] then echo Doing creation in $DATA_DIR - mkdir -p $DATA_DIR/$DRUG/input/original - mkdir -p $DATA_DIR/$DRUG/input/processed - mkdir -p $DATA_DIR/$DRUG/input/structure + mkdir -p $DATA_DIR/$DRUG/input mkdir -p $DATA_DIR/$DRUG/output/plots mkdir -p $DATA_DIR/$DRUG/output/structure diff --git a/meta_data_analysis/data_extraction.py b/scripts/data_extraction.py similarity index 57% rename from meta_data_analysis/data_extraction.py rename to scripts/data_extraction.py index 70f3008..451d6cf 100755 --- a/meta_data_analysis/data_extraction.py +++ b/scripts/data_extraction.py @@ -11,63 
+11,77 @@ Created on Tue Aug 6 12:56:03 2019 # FIXME: import dirs.py to get the basic dir paths available #======================================================================= -# TASK: extract ALL pncA_p. mutations from GWAS data +# TASK: extract ALL matched mutations from GWAS data # Input data file has the following format: each row = unique sample id -# id,country,lineage,sublineage,drtype,pyrazinamide,dr_mutations_pyrazinamide,other_mutations_pyrazinamide... -# 0,sampleID,USA,lineage2,lineage2.2.1,Drug-resistant,0.0,WT,pncA_p.POS; pncA_c.POS... -# where multiple mutations and multiple mutation types are separated by ';'. We are interested in the -# protein coding region i.e mutation with the 'p.' format. - -# the script splits the mutations on the ';' and extracts protein coding muts only +# id,country,lineage,sublineage,drtype,drug,dr_muts_col,other_muts_col... +# 0,sampleID,USA,lineage2,lineage2.2.1,Drug-resistant,0.0,WT,gene_matchPOS; pncA_c.POS... +# where multiple mutations and multiple mutation types are separated by ';'. +# We are interested in the protein coding region i.e mutation with the_'p.' format. +# This script splits the mutations on the ';' and extracts protein coding muts only # where each row is a separate mutation # sample ids AND mutations are NOT unique, but the COMBINATION (sample id + mutation) = unique -# output files: -# 0) pnca_common_ids.csv -# 1) pnca_ambiguous_muts.csv -# 2) pnca_mcsm_snps.csv -# 3) pnca_metadata.csv -# 4) pnca_all_muts_msa.csv -# 5) pnca_mutational_positons.csv +# output files: all lower case +# 0) _common_ids.csv +# 1) _ambiguous_muts.csv +# 2) _mcsm_snps.csv +# 3) _metadata.csv +# 4) _all_muts_msa.csv +# 5) _mutational_positons.csv #======================================================================= #%% load libraries import os, sys import pandas as pd #import numpy as np - -#from pandas.api.types import is_string_dtype -#from pandas.api.types import is_numeric_dtype - -#%% specify homedir as python doesn't recognise tilde +import argparse +#======================================================================= +#%% homdir and curr dir and local imports homedir = os.path.expanduser('~') - # set working dir os.getcwd() -os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis') +os.chdir(homedir + '/git/LSHTM_analysis/scripts') os.getcwd() # import aa dict -from reference_dict import my_aa_dict #CHECK DIR STRUC THERE! +from reference_dict import my_aa_dict # CHECK DIR STRUC THERE! +#======================================================================= +#%% command line args +arg_parser = argparse.ArgumentParser() +arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide') +arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive +args = arg_parser.parse_args() #======================================================================= #%% variable assignment: input and output paths & filenames -drug = 'pyrazinamide' -gene = 'pncA' +#drug = 'pyrazinamide' +#gene = 'pncA' +drug = args.drug +gene = args.gene gene_match = gene + '_p.' 
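With the hard-coded drug/gene pair replaced by argparse arguments, the script can now be pointed at any drug-gene combination from the command line, falling back to the pyrazinamide/pncA defaults when no flags are given. A small self-contained sketch of how the parsed values feed the rest of the script; the isoniazid/katG pair is only an illustration, not something this patch sets up:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('-d', '--drug', help='drug name', default='pyrazinamide')
parser.add_argument('-g', '--gene', help='gene name', default='pncA')  # case sensitive

# simulate: python data_extraction.py -d isoniazid -g katG
args = parser.parse_args(['-d', 'isoniazid', '-g', 'katG'])

gene_match = args.gene + '_p.'                      # 'katG_p.'
dr_muts_col = 'dr_mutations_' + args.drug           # 'dr_mutations_isoniazid'
other_muts_col = 'other_mutations_' + args.drug     # 'other_mutations_isoniazid'
print(gene_match, dr_muts_col, other_muts_col)
```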
+# building cols to extract +dr_muts_col = 'dr_mutations_' + drug +other_muts_col = 'other_mutations_' + drug +print('Extracting columns based on variables:\n' + , drug + , '\n' + , dr_muts_col + , '\n' + , other_muts_col + , '\n===============================================================') +#======================================================================= +#%% input and output dirs and files #======= # data dir #======= -#indir = 'git/Data/pyrazinamide/input/original' datadir = homedir + '/' + 'git/Data' #======= # input #======= -#indir = 'git/Data/pyrazinamide/input/original' in_filename = 'original_tanushree_data_v2.csv' infile = datadir + '/' + in_filename print('Input filename: ', in_filename - , '\nInput path: ', indir + , '\nInput path: ', datadir , '\n============================================================') #======= @@ -88,15 +102,15 @@ master_data = pd.read_csv(infile, sep = ',') # column names #list(master_data.columns) -# extract elevant columns to extract from meta data related to the pyrazinamide +# extract elevant columns to extract from meta data related to the drug meta_data = master_data[['id' ,'country' ,'lineage' ,'sublineage' ,'drtype' - , 'pyrazinamide' - , 'dr_mutations_pyrazinamide' - , 'other_mutations_pyrazinamide' + , drug + , dr_muts_col + , other_muts_col ]] del(master_data) @@ -115,13 +129,13 @@ print('No. of NAs/column:' + '\n', meta_data.isna().sum() meta_data.head() # equivalent of table in R -# pyrazinamide counts -meta_data.pyrazinamide.value_counts() -print('RESULT: Sus and Res samples:\n', meta_data.pyrazinamide.value_counts() +# drug counts +meta_data[drug].value_counts() +print('RESULT: Sus and Res samples:\n', meta_data[drug].value_counts() , '\n===========================================================') # clear variables -del(indir, in_filename,infile) +del(in_filename,infile) #del(outdir) #%% # !!!IMPORTANT!!! sanity check: @@ -129,18 +143,18 @@ del(indir, in_filename,infile) # can use it to check if your data extraction process for dr_muts # and other_muts has worked correctly AND also to check the dim of the # final formatted data. -# This will have: unique COMBINATION of sample id and pncA_p.mutations +# This will have: unique COMBINATION of sample id and mutations #======== -# First: counting pncA_p. mutations in dr_mutations_pyrazinamide column +# First: counting mutations in dr_muts_col column #======== -print('Now counting WT & pncA_p. 
muts within the column: dr_mutations_pyrazinamide') +print('Now counting WT &', gene_match, 'muts within the column:', dr_muts_col) # drop na and extract a clean df -clean_df = meta_data.dropna(subset=['dr_mutations_pyrazinamide']) +clean_df = meta_data.dropna(subset=[dr_muts_col]) # sanity check: count na -na_count = meta_data['dr_mutations_pyrazinamide'].isna().sum() +na_count = meta_data[dr_muts_col].isna().sum() if len(clean_df) == (total_samples - na_count): print('PASS: clean_df extracted: length is', len(clean_df) @@ -150,7 +164,7 @@ else: print('FAIL: dropping NA failed' , '\n==========================================================') -dr_pnca_count = 0 +dr_gene_count = 0 wt = 0 id_dr = [] id2_dr = [] @@ -158,45 +172,44 @@ id2_dr = [] for i, id in enumerate(clean_df.id): # print (i, id) # id_dr.append(id) -# count_pnca_dr = clean_df.dr_mutations_pyrazinamide.iloc[i].count('pncA_p.') #works 2201 - count_pnca_dr = clean_df.dr_mutations_pyrazinamide.iloc[i].count(gene_match) #works 2201 - if count_pnca_dr > 0: + count_gene_dr = clean_df[dr_muts_col].iloc[i].count(gene_match) + if count_gene_dr > 0: id_dr.append(id) - if count_pnca_dr > 1: + if count_gene_dr > 1: id2_dr.append(id) -# print(id, count_pnca_dr) - dr_pnca_count = dr_pnca_count + count_pnca_dr - count_wt = clean_df.dr_mutations_pyrazinamide.iloc[i].count('WT') +# print(id, count_gene_dr) + dr_gene_count = dr_gene_count + count_gene_dr + count_wt = clean_df[dr_muts_col].iloc[i].count('WT') wt = wt + count_wt print('RESULTS:') -print('Total WT in dr_mutations_pyrazinamide:', wt) -print('Total matches of', gene_match, 'in dr_mutations_pyrazinamide:', dr_pnca_count) -print('Total samples with > 1', gene_match, 'muts in dr_mutations_pyrazinamide:', len(id2_dr) ) +print('Total WT in dr_muts_col:', wt) +print('Total matches of', gene_match, 'in dr_muts_col:', dr_gene_count) +print('Total samples with > 1', gene_match, 'muts in dr_muts_col:', len(id2_dr) ) print('=================================================================') -del(i, id, wt, id2_dr, clean_df, na_count, count_pnca_dr, count_wt) +del(i, id, wt, id2_dr, clean_df, na_count, count_gene_dr, count_wt) #======== -# Second: counting pncA_p. mutations in dr_mutations_pyrazinamide column +# Second: counting mutations in dr_muts_col column #======== -print('Now counting WT & pncA_p. 
muts within the column: other_mutations_pyrazinamide') +print('Now counting WT &', gene_match, 'muts within the column:', other_muts_col) # drop na and extract a clean df -clean_df = meta_data.dropna(subset=['other_mutations_pyrazinamide']) +clean_df = meta_data.dropna(subset=[other_muts_col]) # sanity check: count na -na_count = meta_data['other_mutations_pyrazinamide'].isna().sum() +na_count = meta_data[other_muts_col].isna().sum() if len(clean_df) == (total_samples - na_count): print('PASS: clean_df extracted: length is', len(clean_df) - , '\nNo.of NA s=', na_count, '/', total_samples + , '\nNo.of NAs =', na_count, '/', total_samples , '\n=========================================================') else: print('FAIL: dropping NA failed' , '\n=========================================================') -other_pnca_count = 0 +other_gene_count = 0 wt_other = 0 id_other = [] id2_other = [] @@ -204,63 +217,63 @@ id2_other = [] for i, id in enumerate(clean_df.id): # print (i, id) # id_other.append(id) -# count_pnca_other = clean_df.other_mutations_pyrazinamide.iloc[i].count('pncA_p.') - count_pnca_other = clean_df.other_mutations_pyrazinamide.iloc[i].count(gene_match) - if count_pnca_other > 0: +# count_gene_other = clean_df[other_muts_col].iloc[i].count('gene_match') + count_gene_other = clean_df[other_muts_col].iloc[i].count(gene_match) + if count_gene_other > 0: id_other.append(id) - if count_pnca_other > 1: + if count_gene_other > 1: id2_other.append(id) -# print(id, count_pnca_other) - other_pnca_count = other_pnca_count + count_pnca_other - count_wt = clean_df.other_mutations_pyrazinamide.iloc[i].count('WT') +# print(id, count_gene_other) + other_gene_count = other_gene_count + count_gene_other + count_wt = clean_df[other_muts_col].iloc[i].count('WT') wt_other = wt_other + count_wt print('RESULTS:') -print('Total WT in other_mutations_pyrazinamide:', wt_other) -print('Total matches of', gene_match, 'in other_mutations_pyrazinamide:', other_pnca_count) -print('Total samples with > 1', gene_match, 'muts in other_mutations_pyrazinamide:', len(id2_other) ) +print('Total WT in other_muts_col:', wt_other) +print('Total matches of', gene_match, 'in', other_muts_col, ':', other_gene_count) +print('Total samples with > 1', gene_match, 'muts in other_muts_col:', len(id2_other) ) print('=================================================================') -print('Predicting total no. of rows in your curated df:', dr_pnca_count + other_pnca_count ) -expected_rows = dr_pnca_count + other_pnca_count +print('Predicting total no. of rows in the curated df:', dr_gene_count + other_gene_count + , '\n===================================================================') +expected_rows = dr_gene_count + other_gene_count -del(i, id, wt_other, clean_df, na_count, id2_other, count_pnca_other, count_wt) +del(i, id, wt_other, clean_df, na_count, id2_other, count_gene_other, count_wt) #%% ############ # extracting dr and other muts separately along with the common cols ############# -print('=================================================================') -print('Extracting dr_muts in a dr_mutations_pyrazinamide with other meta_data') +print('Extracting dr_muts from col:', dr_muts_col, 'with other meta_data') print('gene to extract:', gene_match ) #=============== -# dr mutations: extract pncA_p. 
entries with meta data and ONLY dr_muts col +# dr mutations: extract gene_match entries with meta data and ONLY dr_muts col #=============== -# FIXME: replace pyrazinamide with variable containing the drug name +# FIXME: replace drug with variable containing the drug name # !!! important !!! meta_data_dr = meta_data[['id' ,'country' ,'lineage' ,'sublineage' ,'drtype' - , 'pyrazinamide' - , 'dr_mutations_pyrazinamide' + , drug + , dr_muts_col ]] print('expected dim should be:', len(meta_data), (len(meta_data.columns)-1) ) print('actual dim:', meta_data_dr.shape , '\n===============================================================') # Extract within this the gene of interest using string match -#meta_pnca_dr = meta_data.loc[meta_data.dr_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)] -meta_pnca_dr = meta_data_dr.loc[meta_data_dr.dr_mutations_pyrazinamide.str.contains(gene_match, na = False)] +#meta_gene_dr = meta_data.loc[meta_data[dr_muts_col].str.contains('gene_match*', na = False)] +meta_gene_dr = meta_data_dr.loc[meta_data_dr[dr_muts_col].str.contains(gene_match, na = False)] -dr_id = meta_pnca_dr['id'].unique() +dr_id = meta_gene_dr['id'].unique() print('RESULT: No. of samples with dr muts in pncA:', len(dr_id)) print('checking RESULT:', '\nexpected len =', len(id_dr), - '\nactual len =', len(meta_pnca_dr) ) + '\nactual len =', len(meta_gene_dr) ) -if len(id_dr) == len(meta_pnca_dr): +if len(id_dr) == len(meta_gene_dr): print('PASS: lengths match' , '\n===============================================================') else: @@ -270,18 +283,18 @@ else: dr_id = pd.Series(dr_id) #================= -# other mutations: extract pncA_p. entries +# other mutations: extract gene_match entries #================== -print('Extracting dr_muts in a other_mutations_pyrazinamide with other meta_data') -# FIXME: replace pyrazinamide with variable containing the drug name +print('Extracting dr_muts from:', other_muts_col,'with other meta_data') +# FIXME: replace drug with variable containing the drug name # !!! important !!! meta_data_other = meta_data[['id' ,'country' ,'lineage' ,'sublineage' ,'drtype' - , 'pyrazinamide' - , 'other_mutations_pyrazinamide' + , drug + , other_muts_col ]] print('expected dim should be:', len(meta_data), (len(meta_data.columns)-1) ) @@ -289,15 +302,15 @@ print('actual dim:', meta_data_other.shape , '\n===============================================================') # Extract within this the gene of interest using string match -meta_pnca_other = meta_data_other.loc[meta_data_other.other_mutations_pyrazinamide.str.contains(gene_match, na = False)] +meta_gene_other = meta_data_other.loc[meta_data_other[other_muts_col].str.contains(gene_match, na = False)] -other_id = meta_pnca_other['id'].unique() +other_id = meta_gene_other['id'].unique() print('RESULT: No. of samples with other muts:', len(other_id)) print('checking RESULT:', '\nexpected len =', len(id_other), - '\nactual len =', len(meta_pnca_other) ) + '\nactual len =', len(meta_gene_other) ) -if len(id_other) == len(meta_pnca_other): +if len(id_other) == len(meta_gene_other): print('PASS: lengths match' , '\n==============================================================') else: @@ -308,7 +321,7 @@ other_id = pd.Series(other_id) #%% Find common IDs print('Now extracting common_ids...') common_mut_ids = dr_id.isin(other_id).sum() -print('RESULT: No. of common Ids:', common_mut_ids) +print('RESULT: No. 
of common ids:', common_mut_ids) # sanity checks # check if True: should be since these are common @@ -327,9 +340,9 @@ common_ids2.columns = ['index', 'id2'] # should be True print(common_ids['id'].equals(common_ids2['id2'])) -# good sanity check: use it later to check pnca_sample_counts -expected_pnca_samples = ( len(meta_pnca_dr) + len(meta_pnca_other) - common_mut_ids ) -print('expected no. of pnca samples:', expected_pnca_samples) +# good sanity check: use it later to check gene_sample_counts +expected_gene_samples = ( len(meta_gene_dr) + len(meta_gene_other) - common_mut_ids ) +print('expected no. of gene samples:', expected_gene_samples) print('=================================================================') #%% write file #print(outdir) @@ -348,47 +361,47 @@ del(out_filename0) # clear variables del(dr_id, other_id, meta_data_dr, meta_data_other, common_ids, common_mut_ids, common_ids2) +#%% Now extract 'all' pncA mutations: i.e 'gene_match*' +print('extracting from string match:', gene_match, 'mutations from cols:\n' + , dr_muts_col, 'and', other_muts_col, 'using string match:' + , '\n===================================================================') +#meta_gene_all = meta_data.loc[meta_data[dr_muts_col].str.contains(gene_match) | meta_data[other_muts_col].str.contains(gene_match) ] +meta_gene_all = meta_data.loc[meta_data[dr_muts_col].str.contains(gene_match, na = False) | meta_data[other_muts_col].str.contains(gene_match, na = False) ] -#%% Now extract 'all' pncA mutations: i.e 'pncA_p.*' -print('extracting all pncA mutations from dr_ and other_ cols using string match:', gene_match - , '\n===============================================================') -#meta_pnca_all = meta_data.loc[meta_data.dr_mutations_pyrazinamide.str.contains(gene_match) | meta_data.other_mutations_pyrazinamide.str.contains(gene_match) ] -meta_pnca_all = meta_data.loc[meta_data.dr_mutations_pyrazinamide.str.contains(gene_match, na = False) | meta_data.other_mutations_pyrazinamide.str.contains(gene_match, na = False) ] +extracted_gene_samples = meta_gene_all['id'].nunique() +print('RESULT: actual no. of gene samples extracted:', extracted_gene_samples + , '\n===================================================================') -extracted_pnca_samples = meta_pnca_all['id'].nunique() -print('RESULT: actual no. of pnca samples extracted:', extracted_pnca_samples) -print('======================================================================') - -# sanity check: length of pnca samples +# sanity check: length of gene samples print('Performing sanity check:') -if extracted_pnca_samples == expected_pnca_samples: - print('No. of pnca samples:', len(meta_pnca_all) - , '\nPASS: expected & actual no. of pnca samples match' +if extracted_gene_samples == expected_gene_samples: + print('No. of gene samples:', len(meta_gene_all) + , '\nPASS: expected & actual no. of gene samples match' , '\n=========================================================') else: print('FAIL: Debug please!' , '\n===============================================================') -# count NA in pyrazinamide column -pnca_na = meta_pnca_all['pyrazinamide'].isna().sum() -print('No. of pnca samples without pza testing i.e NA in pza column:', pnca_na) +# count NA in drug column +gene_na = meta_gene_all[drug].isna().sum() +print('No. 
of gene samples without pza testing i.e NA in pza column:', gene_na) # use it later to check number of complete samples from LF data -comp_pnca_samples = len(meta_pnca_all) - pnca_na -print('comp pnca samples tested for pza:', comp_pnca_samples) +comp_gene_samples = len(meta_gene_all) - gene_na +print('comp gene samples tested for pza:', comp_gene_samples) print('=================================================================') # Comment: This is still dirty data since these -# are samples that have pncA_p. muts, but can have others as well +# are samples that have gene_match muts, but can have others as well # since the format for mutations is mut1; mut2, etc. -print('This is still dirty data: samples have pncA_p. muts, but may have others as well' +print('This is still dirty data: samples have ', gene_match, 'muts but may have others as well' , '\nsince the format for mutations is mut1; mut2, etc.' , '\n=============================================================') #%% tidy_split():Function to split mutations on specified delimiter: ';' #https://stackoverflow.com/questions/41476150/removing-space-from-dataframe-columns-in-pandas +print('Performing tidy_split(): to separate the mutations into indivdual rows') -print('Performing tidy_spllit(): to separate the mutations into indivdual rows') # define the split function def tidy_split(df, column, sep='|', keep=False): ''' @@ -428,38 +441,38 @@ def tidy_split(df, column, sep='|', keep=False): #%% end of tidy_split() #========= -# DF1: dr_mutations_pyrazinamide +# DF1: dr_muts_col #========= ######## -# tidy_split(): on 'dr_mutations_pyrazinamide' column and remove leading white spaces +# tidy_split(): on dr_muts_col column and remove leading white spaces ######## -col_to_split1 = 'dr_mutations_pyrazinamide' -print ('Firstly, applying tidy split on dr df:', meta_pnca_dr.shape - , '\ncolumn name:', col_to_split1 +col_to_split1 = dr_muts_col +print ('Firstly, applying tidy split on dr muts df', meta_gene_dr.shape + , '\ncolumn name to apply tidy_split():', col_to_split1 , '\n============================================================') # apply tidy_split() -dr_WF0 = tidy_split(meta_pnca_dr, col_to_split1, sep = ';') +dr_WF0 = tidy_split(meta_gene_dr, col_to_split1, sep = ';') # remove leading white space else these are counted as distinct mutations as well -dr_WF0['dr_mutations_pyrazinamide'] = dr_WF0['dr_mutations_pyrazinamide'].str.lstrip() +dr_WF0[dr_muts_col] = dr_WF0[dr_muts_col].str.lstrip() -# extract only the samples/rows with pncA_p. 
-dr_pnca_WF0 = dr_WF0.loc[dr_WF0.dr_mutations_pyrazinamide.str.contains(gene_match)] +# extract only the samples/rows with gene_match +dr_gene_WF0 = dr_WF0.loc[dr_WF0[dr_muts_col].str.contains(gene_match)] print('lengths after tidy split and extracting', gene_match, 'muts:' - , '\nold length:' , len(meta_pnca_dr) + , '\nold length:' , len(meta_gene_dr) , '\nlen after split:', len(dr_WF0) - , '\ndr_pnca_WF0 length:', len(dr_pnca_WF0) - , '\nexpected len:', dr_pnca_count) + , '\ndr_gene_WF0 length:', len(dr_gene_WF0) + , '\nexpected len:', dr_gene_count) -if len(dr_pnca_WF0) == dr_pnca_count: - print('PASS: length of dr_pnca_WF0 match with expected length' +if len(dr_gene_WF0) == dr_gene_count: + print('PASS: length of dr_gene_WF0 match with expected length' , '\n===============================================================') else: print('FAIL: lengths mismatch' , '\n===============================================================') # count the freq of 'dr_muts' samples -dr_muts_df = dr_pnca_WF0 [['id', 'dr_mutations_pyrazinamide']] +dr_muts_df = dr_gene_WF0 [['id', dr_muts_col]] print('dim of dr_muts_df:', dr_muts_df.shape) # add freq column @@ -468,13 +481,13 @@ dr_muts_df['dr_sample_freq'] = dr_muts_df.groupby('id')['id'].transform('count') print('revised dim of dr_muts_df:', dr_muts_df.shape) c1 = dr_muts_df.dr_sample_freq.value_counts() -print('counting no. of sample frequency:\n', c1) -print('=================================================================') +print('counting no. of sample frequency:\n', c1 + , '\n===================================================================') -# sanity check: length of pnca samples -if len(dr_pnca_WF0) == c1.sum(): +# sanity check: length of gene samples +if len(dr_gene_WF0) == c1.sum(): print('PASS: WF data has expected length' - , '\nlength of dr_pnca WFO:', c1.sum() + , '\nlength of dr_gene WFO:', c1.sum() , '\n===============================================================') else: print('FAIL: Debug please!' @@ -483,7 +496,7 @@ else: #!!! Important !!! 
# Assign 'column name' on which split was performed as an extra column # This is so you can identify if mutations are dr_type or other in the final df -dr_df = dr_pnca_WF0.assign(mutation_info = 'dr_mutations_pyrazinamide') +dr_df = dr_gene_WF0.assign(mutation_info = dr_muts_col) print('dim of dr_df:', dr_df.shape , '\n==============================================================' , '\nEnd of tidy split() on dr_muts, and added an extra column relecting mut_category' @@ -493,35 +506,35 @@ print('dim of dr_df:', dr_df.shape # DF2: other_mutations_pyrazinamdie #========= ######## -# tidy_split(): on 'other_mutations_pyrazinamide' column and remove leading white spaces +# tidy_split(): on other_muts_col column and remove leading white spaces ######## -col_to_split2 = 'other_mutations_pyrazinamide' -print ('applying second tidy split separately on df:', meta_pnca_other.shape - , '\ncolumn name:', col_to_split2 +col_to_split2 = other_muts_col +print ('applying second tidy split() separately on other muts df', meta_gene_other.shape + , '\ncolumn name to apply tidy_split():', col_to_split2 , '\n============================================================') # apply tidy_split() -other_WF1 = tidy_split(meta_pnca_other, col_to_split2, sep = ';') +other_WF1 = tidy_split(meta_gene_other, col_to_split2, sep = ';') # remove the leading white spaces in the column -other_WF1['other_mutations_pyrazinamide'] = other_WF1['other_mutations_pyrazinamide'].str.strip() +other_WF1[other_muts_col] = other_WF1[other_muts_col].str.strip() -# extract only the samples/rows with pncA_p. -other_pnca_WF1 = other_WF1.loc[other_WF1.other_mutations_pyrazinamide.str.contains(gene_match)] +# extract only the samples/rows with gene_match +other_gene_WF1 = other_WF1.loc[other_WF1[other_muts_col].str.contains(gene_match)] print('lengths after tidy split and extracting', gene_match, 'muts:', - '\nold length:' , len(meta_pnca_other), + '\nold length:' , len(meta_gene_other), '\nlen after split:', len(other_WF1), - '\nother_pnca_WF1 length:', len(other_pnca_WF1), - '\nexpected len:', other_pnca_count) + '\nother_gene_WF1 length:', len(other_gene_WF1), + '\nexpected len:', other_gene_count) -if len(other_pnca_WF1) == other_pnca_count: - print('PASS: length of dr_pnca_WF0 match with expected length +if len(other_gene_WF1) == other_gene_count: + print('PASS: length of dr_gene_WF0 match with expected length' , '\n===============================================================') else: - print('FAIL: lengths mismatch + print('FAIL: lengths mismatch' , '\n===============================================================') # count the freq of 'other muts' samples -other_muts_df = other_pnca_WF1 [['id', 'other_mutations_pyrazinamide']] +other_muts_df = other_gene_WF1 [['id', other_muts_col]] print('dim of other_muts_df:', other_muts_df.shape) # add freq column @@ -531,10 +544,10 @@ print('revised dim of other_muts_df:', other_muts_df.shape) c2 = other_muts_df.other_sample_freq.value_counts() print('counting no. of sample frequency:\n', c2) print('=================================================================') -# sanity check: length of pnca samples -if len(other_pnca_WF1) == c2.sum(): +# sanity check: length of gene samples +if len(other_gene_WF1) == c2.sum(): print('PASS: WF data has expected length' - , '\nlength of other_pnca WFO:', c2.sum() + , '\nlength of other_gene WFO:', c2.sum() , '\n===============================================================') else: print('FAIL: Debug please!' @@ -543,7 +556,7 @@ else: #!!! Important !!! 
# Assign 'column name' on which split was performed as an extra column # This is so you can identify if mutations are dr_type or other in the final df -other_df = other_pnca_WF1.assign(mutation_info = 'other_mutations_pyrazinamide') +other_df = other_gene_WF1.assign(mutation_info = other_muts_col) print('dim of other_df:', other_df.shape , '\n===============================================================' , '\nEnd of tidy split() on other_muts, and added an extra column relecting mut_category' @@ -555,17 +568,19 @@ print('dim of other_df:', other_df.shape #!!! important !!! # change column names to allow concat: # dr_muts.. & other_muts : 'mutation' -print('Now concatenating the two dfs by row') +print('Now concatenating the two dfs by row' + , '\nfirst assigning a common colname: "mutation" to the col containing muts' + , '\nthis is done for both dfs' + , '\n===================================================================') dr_df.columns -dr_df.rename(columns = {'dr_mutations_pyrazinamide': 'mutation'}, inplace = True) +dr_df.rename(columns = {dr_muts_col: 'mutation'}, inplace = True) dr_df.columns other_df.columns -other_df.rename(columns = {'other_mutations_pyrazinamide': 'mutation'}, inplace = True) +other_df.rename(columns = {other_muts_col: 'mutation'}, inplace = True) other_df.columns -print('=================================================================') print('Now appending the two dfs:' , '\ndr_df dim:', dr_df.shape , '\nother_df dim:', other_df.shape @@ -582,18 +597,18 @@ else: print('FAIL: Debug please!') # concatenate (axis = 0): Rbind -pnca_LF0 = pd.concat([dr_df, other_df], ignore_index = True, axis = 0) +gene_LF0 = pd.concat([dr_df, other_df], ignore_index = True, axis = 0) # checking colnames and length after concat print('checking colnames AFTER concatenating the two dfs...') -if (set(dr_df.columns) == set(pnca_LF0.columns)): +if (set(dr_df.columns) == set(gene_LF0.columns)): print('PASS: column names match') else: print('FAIL: Debug please!') print('checking length AFTER concatenating the two dfs...') -if len(pnca_LF0) == len(dr_df) + len(other_df): +if len(gene_LF0) == len(dr_df) + len(other_df): print('PASS:length of df after concat match' , '\n===============================================================') else: @@ -603,61 +618,59 @@ else: ########### # This is hopefully clean data, but just double check # Filter LF data so that you only have -# mutations corresponding to pncA_p.* (string match pattern) +# mutations corresponding to gene_match* (string match pattern) # this will be your list you run OR calcs ########### -print('length of pnca_LF0:', len(pnca_LF0), +print('length of gene_LF0:', len(gene_LF0), '\nThis should be what you need. But just double check and extract', gene_match, - '\nfrom LF0 (concatenated data)') + '\nfrom LF0 (concatenated data) using string match:', gene_match) -print('using string match:', gene_match) +print('Double checking and creating df: gene_LF1') +gene_LF1 = gene_LF0[gene_LF0['mutation'].str.contains(gene_match)] -print('Double checking and creating df: pnca_LF1') -pnca_LF1 = pnca_LF0[pnca_LF0['mutation'].str.contains(gene_match)] - -if len(pnca_LF0) == len(pnca_LF1): - print('PASS: length of pnca_LF0 and pnca_LF1 match', +if len(gene_LF0) == len(gene_LF1): + print('PASS: length of gene_LF0 and gene_LF1 match', '\nconfirming extraction and concatenating worked correctly') else: print('FAIL: BUT NOT FATAL!' 
- , '\npnca_LF0 and pnca_LF1 lengths differ' + , '\ngene_LF0 and gene_LF1 lengths differ' , '\nsuggesting error in extraction process' - , ' use pnca_LF1 for downstreama analysis' + , ' use gene_LF1 for downstreama analysis' , '\n=========================================================') print('length of dfs pre and post processing...' - , '\npnca WF data (unique samples in each row):', extracted_pnca_samples - , '\npnca LF data (unique mutation in each row):', len(pnca_LF1) + , '\ngene WF data (unique samples in each row):', extracted_gene_samples + , '\ngene LF data (unique mutation in each row):', len(gene_LF1) , '\n=============================================================') -#%% -# final sanity check +#%% sanity check for extraction print('Verifying whether extraction process worked correctly...') -if len(pnca_LF1) == expected_rows: +if len(gene_LF1) == expected_rows: print('PASS: extraction process performed correctly' , '\nexpected length:', expected_rows - , '\ngot:', len(pnca_LF1) - , '\nRESULT: Total no. of mutant sequences for logo plot:', len(pnca_LF1) + , '\ngot:', len(gene_LF1) + , '\nRESULT: Total no. of mutant sequences for logo plot:', len(gene_LF1) , '\n=========================================================') else: print('FAIL: extraction process has bugs' , '\nexpected length:', expected_rows - , '\ngot:', len(pnca_LF1) + , '\ngot:', len(gene_LF1) , ', \Debug please' , '\n=========================================================') #%% -print('Perfmorning some more sanity checks...') +print('Performing some more sanity checks...') + # From LF1: # no. of unique muts -distinct_muts = pnca_LF1.mutation.value_counts() -print('Distinct mutations:', len(distinct_muts)) +distinct_muts = gene_LF1.mutation.value_counts() +print('Distinct genomic mutations:', len(distinct_muts)) # no. of samples contributing the unique muta -pnca_LF1.id.nunique() -print('No.of samples contributing to distinct muts:', pnca_LF1.id.nunique() ) +gene_LF1.id.nunique() +print('No.of samples contributing to distinct genomic muts:', gene_LF1.id.nunique() ) # no. of dr and other muts -mut_grouped = pnca_LF1.groupby('mutation_info').mutation.nunique() -print('No.of unique dr and other muts:', pnca_LF1.groupby('mutation_info').mutation.nunique() ) +mut_grouped = gene_LF1.groupby('mutation_info').mutation.nunique() +print('No.of unique dr and other muts:\n', gene_LF1.groupby('mutation_info').mutation.nunique() ) # sanity check if len(distinct_muts) == mut_grouped.sum() : @@ -670,7 +683,7 @@ else: , '\nmuts should be distinct within dr* and other* type' , '\ninspecting ...' , '\n=========================================================') - muts_split = list(pnca_LF1.groupby('mutation_info')) + muts_split = list(gene_LF1.groupby('mutation_info')) dr_muts = muts_split[0][1].mutation other_muts = muts_split[1][1].mutation # print('splitting muts by mut_info:', muts_split) @@ -679,7 +692,7 @@ else: #%% # !!! IMPORTANT !!!! # sanity check: There should not be any common muts -# i.e the same mutation cannot be classed as a 'drug' AND 'others' +# i.e the same mutation cannot be classed as a drug AND 'others' if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0: print('WARNING: Ambiguous muts detected in dr_ and other_ mutation category' , '\n===============================================================') @@ -695,8 +708,8 @@ if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0: , '\nTotal no. 
of samples in dr_muts present in other_muts:', dr_muts.isin(other_muts).sum() , '\nThese are:\n', dr_muts[dr_muts.isin(other_muts)] , '\n=========================================================' - , '\nTotal no. of samples in other_muts present in dr_muts:', other_muts.isin(dr_muts).sum(), - , '\nThese are:\n', other_muts[other_muts.isin(dr_muts)], + , '\nTotal no. of samples in other_muts present in dr_muts:', other_muts.isin(dr_muts).sum() + , '\nThese are:\n', other_muts[other_muts.isin(dr_muts)] , '\n=========================================================') else: print('Error: ambiguous muts present, but extraction failed. Debug!' @@ -706,22 +719,22 @@ print('Counting no. of ambiguous muts...') if dr_muts[dr_muts.isin(other_muts)].nunique() == other_muts[other_muts.isin(dr_muts)].nunique(): common_muts = dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist() - print('Distinct no. of ambigiuous muts detected:'+ str(len(common_muts)), - 'list of ambiguous mutations (see below):', *common_muts, sep = '\n' - , '\n=========================================================') + print('Distinct no. of ambigiuous muts detected:'+ str(len(common_muts)) + , '\nlist of ambiguous mutations (see below):', *common_muts, sep = '\n') + print('\n===========================================================') else: print('Error: ambiguous muts detected, but extraction failed. Debug!' - , '\nNo. of ambiguous muts in dr:', len(dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist()) - , '\nNo. of ambiguous muts in other:', len(other_muts[other_muts.isin(dr_muts)].value_counts().keys().tolist()) + , '\nNo. of ambiguous muts in dr:' + , len(dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist()) + , '\nNo. of ambiguous muts in other:' + , len(other_muts[other_muts.isin(dr_muts)].value_counts().keys().tolist()) , '\n=========================================================') #%% clear variables -del(id_dr, id_other, meta_data, meta_pnca_dr, meta_pnca_other, mut_grouped, muts_split, other_WF1, other_df, other_muts_df, other_pnca_count, pnca_LF0, pnca_na) +del(id_dr, id_other, meta_data, meta_gene_dr, meta_gene_other, mut_grouped, muts_split, other_WF1, other_df, other_muts_df, other_gene_count, gene_LF0, gene_na) -del(c1, c2, col_to_split1, col_to_split2, comp_pnca_samples, dr_WF0, dr_df, dr_muts_df, dr_pnca_WF0, dr_pnca_count, expected_pnca_samples, other_pnca_WF1) +del(c1, c2, col_to_split1, col_to_split2, comp_gene_samples, dr_WF0, dr_df, dr_muts_df, dr_gene_WF0, dr_gene_count, expected_gene_samples, other_gene_WF1) -#%% end of data extraction and some files writing. Below are some more files writing. - #%%: write file: ambiguous muts # uncomment as necessary #print(outdir) @@ -734,8 +747,8 @@ print('Writing file: ambiguous muts', '\nFilename:', out_filename1, '\nPath:', outdir) -#common_muts = ['pncA_p.Val180Phe','pncA_p.Gln10Pro'] # test -inspect = pnca_LF1[pnca_LF1['mutation'].isin(common_muts)] +#common_muts = ['gene_matchVal180Phe','gene_matchGln10Pro'] # test +inspect = gene_LF1[gene_LF1['mutation'].isin(common_muts)] inspect.to_csv(outfile1) print('Finished writing:', out_filename1 @@ -746,22 +759,33 @@ print('Finished writing:', out_filename1 , '\n=============================================================') del(out_filename1) - -#%% read aa dict and pull relevant info -print('Reading aa dict and fetching1 letter aa code' +#%% end of data extraction and some files writing. Below are some more files writing. 
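The long-format frame built above relies on the custom tidy_split() helper to give each ';'-separated mutation its own row. On pandas 0.25 or later the same reshaping can be done with Series.str.split() plus DataFrame.explode(); the sketch below is only an alternative under that assumption, with a made-up two-sample frame, and is not what the script itself uses:

```python
import pandas as pd

# made-up frame with the same layout: one sample per row, ';'-separated mutations
df = pd.DataFrame({'id': ['s01', 's02'],
                   'dr_mutations_pyrazinamide': ['pncA_p.Val180Phe; pncA_p.Gln10Pro', 'WT']})

col = 'dr_mutations_pyrazinamide'
long_df = (df.assign(**{col: df[col].str.split(';')})   # split each cell into a list
             .explode(col)                              # one list element per row
             .reset_index(drop=True))
long_df[col] = long_df[col].str.strip()                 # same leading-whitespace clean-up as above
print(long_df)   # sample ids repeat; each mutation now sits on its own row
```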
+#============================================================================= +#%% Formatting df: read aa dict and pull relevant info +print('Now some more formatting:' + , '\nread aa dict and pull relevant info' + , '\nformat mutations:' + , '\nsplit mutation into mCSM style muts: ' , '\nFormatting mutation in mCSM style format: {WT}{MUT}' - , '\nAdding aa properties' - , '\n============================================================') - + , '\nassign aa properties: adding 2 cols at a time for each prop' + , '\n===================================================================') + +# BEWARE hardcoding : only works as we are adding aa prop once for wt and once for mut +# in each lookup cycle +ncol_mutf_add = 3 # mut split into 3 cols +ncol_aa_add = 2 # 2 aa prop add (wt & mut) in each mapping + #=========== # Split 'mutation' column into three: wild_type, position and # mutant_type separately. Then map three letter code to one using -# reference_dict. -# First: Import reference dict -# Second: convert to mutation to lowercase for compatibility with dict +# reference_dict imported at the beginning. +# After importing, convert to mutation to lowercase for compatibility with dict #=========== -pnca_LF1['mutation'] = pnca_LF1.loc[:, 'mutation'].str.lower() +gene_LF1['mutation'] = gene_LF1.loc[:, 'mutation'].str.lower() +gene_regex = gene_match.lower()+'(\w{3})' +print('gene regex being used:', gene_regex) +mylen0 = len(gene_LF1.columns) #======= # Iterate through the dict, create a lookup dict i.e # lookup_dict = {three_letter_code: one_letter_code}. @@ -770,17 +794,47 @@ pnca_LF1['mutation'] = pnca_LF1.loc[:, 'mutation'].str.lower() # The three letter code is extracted using a string match match from the dataframe and then converted # to 'pandas series'since map only works in pandas series #======= +print('Adding', ncol_mutf_add, 'more cols:\n') +# initialise a sub dict that is lookup dict for three letter code to 1-letter code +# adding three more cols lookup_dict = dict() for k, v in my_aa_dict.items(): lookup_dict[k] = v['one_letter_code'] - wt = pnca_LF1['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on - pnca_LF1['wild_type'] = wt.map(lookup_dict) - mut = pnca_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze() - pnca_LF1['mutant_type'] = mut.map(lookup_dict) +# wt = gene_LF1['mutation'].str.extract('gene_p.(\w{3})').squeeze() # converts to a series that map works on + wt = gene_LF1['mutation'].str.extract(gene_regex).squeeze() + gene_LF1['wild_type'] = wt.map(lookup_dict) + mut = gene_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze() + gene_LF1['mutant_type'] = mut.map(lookup_dict) # extract position info from mutation column separetly using string match -pnca_LF1['position'] = pnca_LF1['mutation'].str.extract(r'(\d+)') +gene_LF1['position'] = gene_LF1['mutation'].str.extract(r'(\d+)') + +mylen1 = len(gene_LF1.columns) + +# sanity checks +print('checking if 3-letter wt&mut residue extraction worked correctly') +if wt.isna().sum() & mut.isna().sum() == 0: + print('PASS: 3-letter wt&mut residue extraction worked correctly:' + , '\nNo NAs detected:' + , '\nwild-type\n', wt + , '\nmutant-type\n', mut + , '\ndim of df:', gene_LF1.shape) +else: + print('FAIL: 3-letter wt&mut residue extraction failed' + , '\nNo NAs detected:' + , '\nwild-type\n', wt + , '\nmutant-type\n', mut + , '\ndim of df:', gene_LF1.shape) + +if mylen1 == mylen0 + ncol_mutf_add: + print('PASS: successfully added', ncol_mutf_add, 'cols' + , '\nold length:', mylen0 + , 
'\nnew len:', mylen1) +else: + print('FAIL: failed to add cols:' + , '\nold length:', mylen0 + , '\nnew len:', mylen1) # clear variables del(k, v, wt, mut, lookup_dict) @@ -790,18 +844,45 @@ del(k, v, wt, mut, lookup_dict) # lookup_dict = {three_letter_code: aa_prop_water} # Do this for both wild_type and mutant as above. #========= -# initialise a sub dict that is lookup dict for three letter code to aa prop -lookup_dict = dict() +print('Adding', ncol_aa_add, 'more cols:\n') +# initialise a sub dict that is lookup dict for three letter code to aa prop +# adding two more cols +lookup_dict = dict() for k, v in my_aa_dict.items(): lookup_dict[k] = v['aa_prop_water'] #print(lookup_dict) - wt = pnca_LF1['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on - pnca_LF1['wt_prop_water'] = wt.map(lookup_dict) - mut = pnca_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze() - pnca_LF1['mut_prop_water'] = mut.map(lookup_dict) - -# added two more cols +# wt = gene_LF1['mutation'].str.extract('gene_p.(\w{3})').squeeze() # converts to a series that map works on + wt = gene_LF1['mutation'].str.extract(gene_regex).squeeze() + gene_LF1['wt_prop_water'] = wt.map(lookup_dict) + mut = gene_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze() + gene_LF1['mut_prop_water'] = mut.map(lookup_dict) + +mylen2 = len(gene_LF1.columns) + +# sanity checks +print('checking if 3-letter wt&mut residue extraction worked correctly') +if wt.isna().sum() & mut.isna().sum() == 0: + print('PASS: 3-letter wt&mut residue extraction worked correctly:' + , '\nNo NAs detected:' + , '\nwild-type\n', wt + , '\nmutant-type\n', mut + , '\ndim of df:', gene_LF1.shape) +else: + print('FAIL: 3-letter wt&mut residue extraction failed' + , '\nNo NAs detected:' + , '\nwild-type\n', wt + , '\nmutant-type\n', mut + , '\ndim of df:', gene_LF1.shape) + +if mylen2 == mylen1 + ncol_aa_add: + print('PASS: successfully added', ncol_aa_add, 'cols' + , '\nold length:', mylen1 + , '\nnew len:', mylen2) +else: + print('FAIL: failed to add cols:' + , '\nold length:', mylen1 + , '\nnew len:', mylen2) # clear variables del(k, v, wt, mut, lookup_dict) @@ -811,19 +892,92 @@ del(k, v, wt, mut, lookup_dict) # lookup_dict = {three_letter_code: aa_prop_polarity} # Do this for both wild_type and mutant as above. 
#========= -# initialise a sub dict that is lookup dict for three letter code to aa prop -lookup_dict = dict() +print('Adding', ncol_aa_add, 'more cols:\n') +# initialise a sub dict that is lookup dict for three letter code to aa prop +# adding two more cols +lookup_dict = dict() for k, v in my_aa_dict.items(): lookup_dict[k] = v['aa_prop_polarity'] #print(lookup_dict) - wt = pnca_LF1['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on - pnca_LF1['wt_prop_polarity'] = wt.map(lookup_dict) - mut = pnca_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze() - pnca_LF1['mut_prop_polarity'] = mut.map(lookup_dict) - -# added two more cols +# wt = gene_LF1['mutation'].str.extract('gene_p.(\w{3})').squeeze() # converts to a series that map works on + wt = gene_LF1['mutation'].str.extract(gene_regex).squeeze() + gene_LF1['wt_prop_polarity'] = wt.map(lookup_dict) + mut = gene_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze() + gene_LF1['mut_prop_polarity'] = mut.map(lookup_dict) +mylen3 = len(gene_LF1.columns) + +# sanity checks +print('checking if 3-letter wt&mut residue extraction worked correctly') +if wt.isna().sum() & mut.isna().sum() == 0: + print('PASS: 3-letter wt&mut residue extraction worked correctly:' + , '\nNo NAs detected:' + , '\nwild-type\n', wt + , '\nmutant-type\n', mut + , '\ndim of df:', gene_LF1.shape) +else: + print('FAIL: 3-letter wt&mut residue extraction failed' + , '\nNo NAs detected:' + , '\nwild-type\n', wt + , '\nmutant-type\n', mut + , '\ndim of df:', gene_LF1.shape) + +if mylen3 == mylen2 + ncol_aa_add: + print('PASS: successfully added', ncol_aa_add, 'cols' + , '\nold length:', mylen1 + , '\nnew len:', mylen2) +else: + print('FAIL: failed to add cols:' + , '\nold length:', mylen1 + , '\nnew len:', mylen2) + +# clear variables +del(k, v, wt, mut, lookup_dict) + +#======== +# iterate through the dict, create a lookup dict that i.e +# lookup_dict = {three_letter_code: aa_calcprop} +# Do this for both wild_type and mutant as above. +#========= +print('Adding', ncol_aa_add, 'more cols:\n') + +lookup_dict = dict() +for k, v in my_aa_dict.items(): + lookup_dict[k] = v['aa_calcprop'] + #print(lookup_dict) +# wt = gene_LF1['mutation'].str.extract('gene_p.(\w{3})').squeeze() # converts to a series that map works on + wt = gene_LF1['mutation'].str.extract(gene_regex).squeeze() + gene_LF1['wt_calcprop'] = wt.map(lookup_dict) + mut = gene_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze() + gene_LF1['mut_calcprop'] = mut.map(lookup_dict) + +mylen4 = len(gene_LF1.columns) + +# sanity checks +print('checking if 3-letter wt&mut residue extraction worked correctly') +if wt.isna().sum() & mut.isna().sum() == 0: + print('PASS: 3-letter wt&mut residue extraction worked correctly:' + , '\nNo NAs detected:' + , '\nwild-type\n', wt + , '\nmutant-type\n', mut + , '\ndim of df:', gene_LF1.shape) +else: + print('FAIL: 3-letter wt&mut residue extraction failed' + , '\nNo NAs detected:' + , '\nwild-type\n', wt + , '\nmutant-type\n', mut + , '\ndim of df:', gene_LF1.shape) + +if mylen4 == mylen3 + ncol_aa_add: + print('PASS: successfully added', ncol_aa_add, 'cols' + , '\nold length:', mylen3 + , '\nnew len:', mylen4) +else: + print('FAIL: failed to add cols:' + , '\nold length:', mylen3 + , '\nnew len:', mylen4) + # clear variables del(k, v, wt, mut, lookup_dict) @@ -833,56 +987,62 @@ del(k, v, wt, mut, lookup_dict) # Do this for both wild_type and mutant as above. 
# caution: taylor mapping will create a list within a column #========= +#print('Adding', ncol_aa_add, 'more cols:\n') #lookup_dict = dict() - #for k, v in my_aa_dict.items(): # lookup_dict[k] = v['aa_taylor'] -# #print(lookup_dict) -# wt = pnca_LF1['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on -# pnca_LF1['wt_taylor'] = wt.map(lookup_dict) -# mut = pnca_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze() -# pnca_LF1['mut_taylor'] = mut.map(lookup_dict) + #print(lookup_dict) +# wt = gene_LF1['mutation'].str.extract(gene_regex).squeeze() +# gene_LF1['wt_taylor'] = wt.map(lookup_dict) +# mut = gene_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze() +# gene_LF1['mut_taylor'] = mut.map(lookup_dict) -# added two more cols +#mylen5 = len(gene_LF1.columns) + +# sanity checks +#print('checking if 3-letter wt&mut residue extraction worked correctly') +#if wt.isna().sum() & mut.isna().sum() == 0: +# print('PASS: 3-letter wt&mut residue extraction worked correctly:' +# , '\nNo NAs detected:' +# , '\nwild-type\n', wt +# , '\nmutant-type\n', mut +# , '\ndim of df:', gene_LF1.shape) +#else: +# print('FAIL: 3-letter wt&mut residue extraction failed' +# , '\nNo NAs detected:' +# , '\nwild-type\n', wt +# , '\nmutant-type\n', mut +# , '\ndim of df:', gene_LF1.shape) + +#if mylen5 == mylen4 + ncol_aa_add: +# print('PASS: successfully added', ncol_aa_add, 'cols' +# , '\nold length:', mylen4 +# , '\nnew len:', mylen5) +#else: +# print('FAIL: failed to add cols:' +# , '\nold length:', mylen4 +# , '\nnew len:', mylen5) # clear variables #del(k, v, wt, mut, lookup_dict) -#======== -# iterate through the dict, create a lookup dict that i.e -# lookup_dict = {three_letter_code: aa_calcprop} -# Do this for both wild_type and mutant as above. 
-#========= -lookup_dict = dict() - -for k, v in my_aa_dict.items(): - lookup_dict[k] = v['aa_calcprop'] - #print(lookup_dict) - wt = pnca_LF1['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on - pnca_LF1['wt_calcprop'] = wt.map(lookup_dict) - mut = pnca_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze() - pnca_LF1['mut_calcprop'] = mut.map(lookup_dict) - -# added two more cols -# clear variables -del(k, v, wt, mut, lookup_dict) - ######## # combine the wild_type+poistion+mutant_type columns to generate # Mutationinformation (matches mCSM output field) # Remember to use .map(str) for int col types to allow string concatenation ######### -pnca_LF1['Mutationinformation'] = pnca_LF1['wild_type'] + pnca_LF1.position.map(str) + pnca_LF1['mutant_type'] +gene_LF1['Mutationinformation'] = gene_LF1['wild_type'] + gene_LF1.position.map(str) + gene_LF1['mutant_type'] print('Created column: Mutationinformation' - , '\n===============================================================') + , '\n=====================================================================' + , gene_LF1.Mutationinformation.head(10)) #%% Write file: mCSM muts -snps_only = pd.DataFrame(pnca_LF1['Mutationinformation'].unique()) +snps_only = pd.DataFrame(gene_LF1['Mutationinformation'].unique()) snps_only.head() # assign column name snps_only.columns = ['Mutationinformation'] # count how many positions this corresponds to -pos_only = pd.DataFrame(pnca_LF1['position'].unique()) +pos_only = pd.DataFrame(gene_LF1['position'].unique()) print('Checking NA in snps...')# should be 0 if snps_only.Mutationinformation.isna().sum() == 0: @@ -912,7 +1072,7 @@ print('Finished writing:', out_filename2 , '\n=============================================================') del(out_filename2) -#%% Write file: pnca_metadata (i.e pnca_LF1) +#%% Write file: gene_metadata (i.e gene_LF1) # where each row has UNIQUE mutations NOT unique sample ids out_filename3 = gene.lower() + '_' + 'metadata.csv' outfile3 = outdir + '/' + out_filename3 @@ -921,15 +1081,15 @@ print('Writing file: LF formatted data' , '\nPath:', outdir , '\n============================================================') -pnca_LF1.to_csv(outfile3, header = True, index = False) +gene_LF1.to_csv(outfile3, header = True, index = False) print('Finished writing:', out_filename3 - , '\nNo. of rows:', len(pnca_LF1) - , '\nNo. of cols:', len(pnca_LF1.columns) + , '\nNo. of rows:', len(gene_LF1) + , '\nNo. 
of cols:', len(gene_LF1.columns) , '\n=============================================================') del(out_filename3) #%% write file: mCSM style but with repitions for MSA and logo plots -all_muts_msa = pd.DataFrame(pnca_LF1['Mutationinformation']) +all_muts_msa = pd.DataFrame(gene_LF1['Mutationinformation']) all_muts_msa.head() # assign column name all_muts_msa.columns = ['Mutationinformation'] @@ -978,7 +1138,7 @@ del(out_filename4) #%% write file for mutational positions # count how many positions this corresponds to -pos_only = pd.DataFrame(pnca_LF1['position'].unique()) +pos_only = pd.DataFrame(gene_LF1['position'].unique()) # assign column name pos_only.columns = ['position'] # make sure dtype of column position is int or numeric and not string diff --git a/meta_data_analysis/reference_dict.py b/scripts/reference_dict.py similarity index 95% rename from meta_data_analysis/reference_dict.py rename to scripts/reference_dict.py index 0461523..8087009 100644 --- a/meta_data_analysis/reference_dict.py +++ b/scripts/reference_dict.py @@ -23,34 +23,31 @@ homedir = os.path.expanduser('~') # set working dir #os.getcwd() -#os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis') +#os.chdir(homedir + '/git/LSHTM_analysis/scripts') #os.getcwd() #======================================================================= #%% variable assignment: input and output -drug = 'pyrazinamide' -gene = 'pncA' -gene_match = gene + '_p.' +#drug = 'pyrazinamide' +#gene = 'pncA' +#gene_match = gene + '_p.' #========== # data dir #========== -#indir = 'git/Data/pyrazinamide/input/original' datadir = homedir + '/' + 'git/Data' #======= # input #======= -indir = datadir + '/' + drug + 'input' in_filename = 'aa_codes.csv' -infile = indir + '/' + in_filename +infile = datadir + '/' + in_filename print('Input filename:', in_filename - , '\nInput path:', indir + , '\nInput path:', datadir , '\n============================================================') #======= # output: No output #======= - #outdir = datadir + '/' + drug + '/' + 'output' #out_filename = '' #outfile = outdir + '/' + out_filename @@ -76,6 +73,7 @@ my_aa = my_aa.set_index('three_letter_code_lower') #20, 5 # using 'index' creates a dict of dicts # using 'records' creates a list of dicts my_aa_dict = my_aa.to_dict('index') #20, with 5 subkeys +print('Printing my_aa_dict:', my_aa_dict.keys()) #================================================ # dict of aa with their corresponding properties
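For reference, the my_aa_dict produced at the end of reference_dict.py is a dict of dicts keyed by the lower-case three-letter residue code (20 keys with 5 property sub-keys each, per the comments above), and this is the structure the lookup_dict loops in data_extraction.py iterate over. A tiny illustration with two made-up rows; the real aa_codes.csv carries all residues and the full set of property columns:

```python
import pandas as pd

# two illustrative rows only, standing in for the aa_codes.csv table
my_aa = pd.DataFrame({'three_letter_code_lower': ['val', 'phe'],
                      'one_letter_code': ['v', 'f']}).set_index('three_letter_code_lower')

my_aa_dict = my_aa.to_dict('index')
# {'val': {'one_letter_code': 'v'}, 'phe': {'one_letter_code': 'f'}}

# the per-property lookup dicts built in data_extraction.py then reduce to:
lookup_dict = {k: v['one_letter_code'] for k, v in my_aa_dict.items()}
# {'val': 'v', 'phe': 'f'}
```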