diff --git a/README.md b/README.md index 628cafd..dd4cfb5 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,10 @@ mCSM Analysis ============= -This repo does mCSM analysis using bash, python and R. - +This repo contains scripts that do the following: + 1. mCSM analysis: using bash, python and R + 2. meta data analysis: using python and R + Requires an additional 'Data' directory. Batteries not included:-) ## Assumptions @@ -19,17 +21,14 @@ subdirs within this repo *.R *.py - mcsm\_analysis/ - / - scripts/ - *.R - *.py - mcsm/ - *.sh - *.py - *.R - plotting/ - *.R + mcsm_analysis +# / + + foldx_analysis + + plotting + *.R + ``` More docs here as I write them. diff --git a/mcsm_analysis/pyrazinamide/scripts/Header_TT.R b/mcsm_analysis/pyrazinamide/scripts/Header_TT.R deleted file mode 100644 index 9eae42a..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/Header_TT.R +++ /dev/null @@ -1,130 +0,0 @@ -######################################################### -### A) Installing and loading required packages -######################################################### -#lib_loc = "/usr/local/lib/R/site-library") - -#if (!require("gplots")) { -# install.packages("gplots", dependencies = TRUE) -# library(gplots) -#} - -#if (!require("tidyverse")) { -# install.packages("tidyverse", dependencies = TRUE) -# library(tidyverse) -#} - -if (!require("ggplot2")) { - install.packages("ggplot2", dependencies = TRUE) - library(ggplot2) -} - -if (!require("plotly")) { - install.packages("plotly", dependencies = TRUE) - library(plotly) -} - -if (!require("cowplot")) { - install.packages("copwplot", dependencies = TRUE) - library(cowplot) -} - -if (!require("ggcorrplot")) { - install.packages("ggcorrplot", dependencies = TRUE) - library(ggcorrplot) -} - -if (!require("ggpubr")) { - install.packages("ggpubr", dependencies = TRUE) - library(ggpubr) -} - -if (!require("RColorBrewer")) { - install.packages("RColorBrewer", dependencies = TRUE) - library(RColorBrewer) -} - -if (!require ("GOplot")) { - install.packages("GOplot") - library(GOplot) -} - -if(!require("VennDiagram")) { - install.packages("VennDiagram", dependencies = T) - library(VennDiagram) -} - -if(!require("scales")) { - install.packages("scales", dependencies = T) - library(scales) -} - -if(!require("plotrix")) { - install.packages("plotrix", dependencies = T) - library(plotrix) -} - -if(!require("stats")) { - install.packages("stats", dependencies = T) - library(stats) -} - -if(!require("stats4")) { - install.packages("stats4", dependencies = T) - library(stats4) -} - -if(!require("data.table")) { -install.packages("data.table") - library(data.table) -} - -if (!require("PerformanceAnalytics")){ - install.packages("PerformanceAnalytics", dependencies = T) - library(PerformaceAnalytics) -} - -if (!require ("GGally")){ - install.packages("GGally") - library(GGally) -} - -if (!require ("corrr")){ - install.packages("corrr") - library(corrr) -} - -if (!require ("psych")){ - install.packages("psych") - library(psych) -} - -if (!require ("dplyr")){ - install.packages("dplyr") - library(dplyr) -} - -if (!require ("compare")){ - install.packages("compare") - library(compare) -} - -if (!require ("arsenal")){ - install.packages("arsenal") - library(arsenal) -} - - -####TIDYVERSE -# Install -#if(!require(devtools)) install.packages("devtools") -#devtools::install_github("kassambara/ggcorrplot") - -library(ggcorrplot) - - -###for PDB files -#install.packages("bio3d") -if(!require(bio3d)){ - install.packages("bio3d") - library(bio3d) -} diff --git
a/mcsm_analysis/pyrazinamide/scripts/KS_test_PS.R b/mcsm_analysis/pyrazinamide/scripts/KS_test_PS.R deleted file mode 100644 index 5a827c8..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/KS_test_PS.R +++ /dev/null @@ -1,157 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages # -######################################################################## - -source("../Header_TT.R") -#source("../barplot_colour_function.R") -#require(data.table) - -######################################################################## -# Read file: call script for combining df for PS # -######################################################################## - -source("../combining_two_df.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION - -#========================== -# This will return: - -# df with NA for pyrazinamide: -# merged_df2 -# merged_df3 - -# df without NA for pyrazinamide: -# merged_df2_comp -# merged_df3_comp -#=========================== - -########################### -# Data for plots -# you need merged_df2 or merged_df2_comp -# since this is one-many relationship -# i.e the same SNP can belong to multiple lineages -# using the _comp dataset means -# we lose some muts and at this level, we should use -# as much info as available, hence use df with NA -########################### - -# uncomment as necessary - -#%%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -my_df = merged_df2 -#my_df = merged_df2_comp -#%%%%%%%%%%%%%%%%%%%%%%%%% - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -# Ensure correct data type in columns to plot: need to be factor -is.factor(my_df$lineage) -my_df$lineage = as.factor(my_df$lineage) -is.factor(my_df$lineage) - -table(my_df$mutation_info); str(my_df$mutation_info) - -# subset df with dr muts only -my_df_dr = subset(my_df, mutation_info == "dr_mutations_pyrazinamide") -table(my_df_dr$mutation_info) - -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -#========================== -# Run two times: -# uncomment as necessary -# 1) for all muts -# 2) for dr_muts -#=========================== -#%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT - -#================ -# for ALL muts -#================ -#plot_df = my_df - -#================ -# for dr muts ONLY -#================ -plot_df = my_df_dr - -#%%%%%%%%%%%%%%%%%%%%%%%% -#============================ -# Plot: Lineage Distribution -# x = mcsm_values, y = dist -# fill = stability -#============================ - -table(plot_df$lineage); str(plot_df$lineage) - -# subset only lineages1-4 -sel_lineages = c("lineage1" - , "lineage2" - , "lineage3" - , "lineage4") - -# uncomment as necessary -df_lin = subset(plot_df, subset = lineage %in% sel_lineages ) - -# refactor -df_lin$lineage = factor(df_lin$lineage) - -table(df_lin$lineage) #{RESULT: No of samples within lineage} -#lineage1 lineage2 lineage3 lineage4 - -length(unique(df_lin$Mutationinformation)) -#{Result: No. 
of unique mutations the 4 lineages contribute to} - -# sanity checks -r1 = 2:5 # when merged_df2 used: because there is missing lineages -if(sum(table(plot_df$lineage)[r1]) == nrow(df_lin)) { - print ("sanity check passed: numbers match") -} else{ - print("Error!: check your numbers") -} - -#%%%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -df <- df_lin -#%%%%%%%%%%%%%%%%%%%%%%%%%% - -rm(df_lin) - -# COMPARING DISTRIBUTIONS -head(df$lineage) -df$lineage = as.character(df$lineage) - -lin1 = df[df$lineage == "lineage1",]$ratioDUET -lin2 = df[df$lineage == "lineage2",]$ratioDUET -lin3 = df[df$lineage == "lineage3",]$ratioDUET -lin4 = df[df$lineage == "lineage4",]$ratioDUET - -# ks test -ks.test(lin1,lin2) -ks.test(lin1,lin3) -ks.test(lin1,lin4) - -ks.test(lin2,lin3) -ks.test(lin2,lin4) - -ks.test(lin3,lin4) - - - diff --git a/mcsm_analysis/pyrazinamide/scripts/barplot_colour_function.R b/mcsm_analysis/pyrazinamide/scripts/barplot_colour_function.R deleted file mode 100644 index a3cc403..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/barplot_colour_function.R +++ /dev/null @@ -1,27 +0,0 @@ -######################################################### -# 1b: Define function: coloured barplot by subgroup -# LINK: https://stackoverflow.com/questions/49818271/stacked-barplot-with-colour-gradients-for-each-bar -######################################################### - -ColourPalleteMulti <- function(df, group, subgroup){ - - # Find how many colour categories to create and the number of colours in each - categories <- aggregate(as.formula(paste(subgroup, group, sep="~" )) - , df - , function(x) length(unique(x))) - # return(categories) } - - category.start <- (scales::hue_pal(l = 100)(nrow(categories))) # Set the top of the colour pallete - - category.end <- (scales::hue_pal(l = 40)(nrow(categories))) # set the bottom - - #return(category.start); return(category.end)} - - # Build Colour pallette - colours <- unlist(lapply(1:nrow(categories), - function(i){ - colorRampPalette(colors = c(category.start[i] - , category.end[i]))(categories[i,2])})) - return(colours) -} -######################################################### \ No newline at end of file diff --git a/mcsm_analysis/pyrazinamide/scripts/combining_two_df.R b/mcsm_analysis/pyrazinamide/scripts/combining_two_df.R deleted file mode 100644 index 31a533b..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/combining_two_df.R +++ /dev/null @@ -1,417 +0,0 @@ -######################################################### -# TASK: To combine mcsm and meta data with af and or files -# Input csv files: -# 1) mcsm normalised and struct params -# 2) gene associated meta_data_with_AFandOR - -# Output: -# 1) muts with opposite effects on stability -# 2) large combined df including NAs for AF, OR,etc -# Dim: same no. of rows as gene associated meta_data_with_AFandOR -# 3) small combined df including NAs for AF, OR, etc. -# Dim: same as mcsm data -# 4) large combined df excluding NAs -# Dim: dim(#1) - no. of NAs(AF|OR) + 1 -# 5) small combined df excluding NAs -# Dim: dim(#2) - no. 
of unique NAs - 1 -# This script is sourced from other .R scripts for plotting -######################################################### -getwd() -setwd('~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/') -getwd() - -########################################################## -# Installing and loading required packages -########################################################## -source('Header_TT.R') -#require(data.table) -#require(arsenal) -#require(compare) -#library(tidyverse) - -################################# -# Read file: normalised file -# output of step 4 mcsm_pipeline -################################# -#%% variable assignment: input and output paths & filenames -drug = 'pyrazinamide' -gene = 'pncA' -gene_match = paste0(gene,'_p.') -cat(gene_match) - -#=========== -# data dir -#=========== -datadir = paste0('~/git/Data') - -#=========== -# input -#=========== -# infile1: mCSM data -#indir = '~/git/Data/pyrazinamide/input/processed/' -indir = paste0(datadir, '/', drug, '/', 'output') # revised {TODO: change in mcsm pipeline} -#in_filename = 'mcsm_complex1_normalised.csv' -in_filename = 'pnca_mcsm_struct_params.csv' -infile = paste0(indir, '/', in_filename) -cat(paste0('Reading infile1: mCSM output file', ' ', infile) ) - -# infile2: gene associated meta data combined with AF and OR -#indir: same as above -in_filename_comb = paste0(tolower(gene), '_meta_data_with_AFandOR.csv') -infile_comb = paste0(indir, '/', in_filename_comb) -cat(paste0('Reading infile2: gene associated combined metadata:', infile_comb)) - -#=========== -# output -#=========== -# Uncomment if and when required to output -outdir = paste0('~/git/Data', '/', drug, '/', 'output') #same as indir -cat('Output dir: ', outdir) -#out_filename = paste0(tolower(gene), 'XXX') -#outfile = paste0(outdir, '/', out_filename) -#cat(paste0('Output file with full path:', outfile)) -#%% end of variable assignment for input and output files - -################################# -# Read file: normalised file -# output of step 4 mcsm_pipeline -################################# -cat('Reading mcsm_data:' - , '\nindir: ', indir - , '\ninfile_comb: ', in_filename) - -mcsm_data = read.csv(infile - , row.names = 1 - , stringsAsFactors = F - , header = T) - -cat('Read mcsm_data file:' - , '\nNo.of rows: ', nrow(mcsm_data) - , '\nNo. of cols:', ncol(mcsm_data)) - -# clear variables -rm(in_filename, infile) - -str(mcsm_data) - -table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) ) - -# spelling Correction 1: DUET -mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Stabilizing'] <- 'Stabilising' -mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Destabilizing'] <- 'Destabilising' - -# checks: should be the same as above -table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) ) -head(mcsm_data$DUET_outcome); tail(mcsm_data$DUET_outcome) - -# spelling Correction 2: Ligand -table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) ) - -mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Stabilizing'] <- 'Stabilising' -mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Destabilizing'] <- 'Destabilising' - -# checks: should be the same as above -table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) ) -head(mcsm_data$Lig_outcome); tail(mcsm_data$Lig_outcome) - -# muts with opposing effects on protomer and ligand stability -table(mcsm_data$DUET_outcome != mcsm_data$Lig_outcome) -changes = mcsm_data[which(mcsm_data$DUET_outcome != mcsm_data$Lig_outcome),] - -# sanity check: redundant, but uber cautious! 
-dl_i = which(mcsm_data$DUET_outcome != mcsm_data$Lig_outcome) -ld_i = which(mcsm_data$Lig_outcome != mcsm_data$DUET_outcome) - -cat('Identifying muts with opposite stability effects') -if(nrow(changes) == (table(mcsm_data$DUET_outcome != mcsm_data$Lig_outcome)[[2]]) & identical(dl_i,ld_i)) { - cat('PASS: muts with opposite effects on stability and affinity correctly identified' - , '\nNo. of such muts: ', nrow(changes)) -}else { - cat('FAIL: unsuccessful in extracting muts with changed stability effects') -} - -#*************************** -# write file: changed muts -out_filename = 'muts_opp_effects.csv' -outfile = paste0(outdir, '/', out_filename) -cat('Writing file for muts with opp effects:' - , '\nFilename: ', outfile - , '\nPath: ', outdir) - -write.csv(changes, outfile) -#**************************** -# clear variables -rm(out_filename, outfile) -rm(changes, dl_i, ld_i) - -# count na in each column -na_count = sapply(mcsm_data, function(y) sum(length(which(is.na(y))))); na_count - -# sort by Mutationinformation -mcsm_data = mcsm_data[order(mcsm_data$Mutationinformation),] -head(mcsm_data$Mutationinformation) - -orig_col = ncol(mcsm_data) - -# get freq count of positions and add to the df -setDT(mcsm_data)[, occurrence := .N, by = .(Position)] - -cat('Added 1 col: position frequency to see which posn has how many muts' - , '\nNo. of cols now', ncol(mcsm_data) - , '\nNo. of cols before: ', orig_col) - -pos_count_check = data.frame(mcsm_data$Position, mcsm_data$occurrence) - -########################### -# 2: Read file: meta data with AFandOR -########################### -cat('Reading combined meta data and AFandOR file:' - , '\nindir: ', indir - , '\ninfile_comb: ', in_filename_comb) - -meta_with_afor <- read.csv(infile_comb - , stringsAsFactors = F - , header = T) - -cat('Read mcsm_data file:' - , '\nNo.of rows: ', nrow(meta_with_afor) - , '\nNo. of cols:', ncol(meta_with_afor)) - -# counting NAs in AF, OR cols -if (identical(sum(is.na(meta_with_afor$OR)) - , sum(is.na(meta_with_afor$pvalue)) - , sum(is.na(meta_with_afor$AF)))){ - cat('PASS: NA count match for OR, pvalue and AF\n') - na_count = sum(is.na(meta_with_afor$AF)) - cat('No. of NAs: ', sum(is.na(meta_with_afor$OR))) -} else{ - cat('FAIL: NA count mismatch' - , '\nNA in OR: ', sum(is.na(meta_with_afor$OR)) - , '\nNA in pvalue: ', sum(is.na(meta_with_afor$pvalue)) - , '\nNA in AF:', sum(is.na(meta_with_afor$AF))) -} - -# clear variables -rm(in_filename_comb, infile_comb) - -str(meta_with_afor) - -# sort by Mutationinformation -head(meta_with_afor$Mutationinformation) -meta_with_afor = meta_with_afor[order(meta_with_afor$Mutationinformation),] -head(meta_with_afor$Mutationinformation) - -########################### -# 3: merging two dfs: with NA -########################### -# link col name = 'Mutationinforamtion' -head(mcsm_data$Mutationinformation) -head(meta_with_afor$Mutationinformation) - -cat('Merging dfs with NAs: big df (1-many relationship b/w id & mut)' - ,'\nlinking col: Mutationinforamtion' - ,'\nfilename: merged_df2') - -######### -# merge 3a (merged_df2): meta data with mcsm -######### -merged_df2 = merge(x = meta_with_afor - ,y = mcsm_data - , by = 'Mutationinformation' - , all.y = T) - -cat('Dim of merged_df2: ' - , '\nNo. of rows: ', nrow(merged_df2) - , '\nNo. of cols: ', ncol(merged_df2)) -head(merged_df2$Position) - -# sanity check -cat('Checking nrows in merged_df2') -if(nrow(meta_with_afor) == nrow(merged_df2)){ - cat('nrow(merged_df2) = nrow (gene associated metadata)' - ,'\nExpected no. 
of rows: ',nrow(meta_with_afor) - ,'\nGot no. of rows: ', nrow(merged_df2)) -} else{ - cat('nrow(merged_df2)!= nrow(gene associated metadata)' - , '\nExpected no. of rows after merge: ', nrow(meta_with_afor) - , '\nGot no. of rows: ', nrow(merged_df2) - , '\nFinding discrepancy') - merged_muts_u = unique(merged_df2$Mutationinformation) - meta_muts_u = unique(meta_with_afor$Mutationinformation) - # find the index where it differs - unique(meta_muts_u[! meta_muts_u %in% merged_muts_u]) -} - -# sort by Position -head(merged_df2$Position) -merged_df2 = merged_df2[order(merged_df2$Position),] -head(merged_df2$Position) - -merged_df2v2 = merge(x = meta_with_afor - ,y = mcsm_data - , by = 'Mutationinformation' - , all.x = T) -#!=!=!=!=!=!=!=! -# COMMENT: used all.y since position 186 is not part of the struc, -# hence doesn't have a mcsm value -# but 186 is associated with mutation -#!=!=!=!=!=!=!=! - -# should be False -identical(merged_df2, merged_df2v2) -table(merged_df2$Position%in%merged_df2v2$Position) - -rm(merged_df2v2) - -######### -# merge 3b (merged_df3):remove duplicate mutation information -######### -cat('Merging dfs without NAs: small df (removing muts with no AF|OR associated)' - ,'\nCannot trust lineage info from this' - ,'\nlinking col: Mutationinforamtion' - ,'\nfilename: merged_df3') - -#==#=#=#=#=#=# -# Cannot trust lineage, country from this df as the same mutation -# can have many different lineages -# but this should be good for the numerical corr plots -#=#=#=#=#=#=#= -merged_df3 = merged_df2[!duplicated(merged_df2$Mutationinformation),] -head(merged_df3$Position); tail(merged_df3$Position) # should be sorted - -# sanity check -cat('Checking nrows in merged_df3') -if(nrow(mcsm_data) == nrow(merged_df3)){ - cat('PASS: No. of rows match with mcsm_data' - ,'\nExpected no. of rows: ', nrow(mcsm_data) - ,'\nGot no. of rows: ', nrow(merged_df3)) -} else { - cat('FAIL: No. of rows mismatch' - , '\nNo. of rows mcsm_data: ', nrow(mcsm_data) - , '\nNo. of rows merged_df3: ', nrow(merged_df3)) -} - -# counting NAs in AF, OR cols in merged_df3 -# this is becuase mcsm has no AF, OR cols, -# so you cannot count NAs -if (identical(sum(is.na(merged_df3$OR)) - , sum(is.na(merged_df3$pvalue)) - , sum(is.na(merged_df3$AF)))){ - cat('PASS: NA count match for OR, pvalue and AF\n') - na_count_df3 = sum(is.na(merged_df3$AF)) - cat('No. of NAs: ', sum(is.na(merged_df3$OR))) -} else{ - cat('FAIL: NA count mismatch' - , '\nNA in OR: ', sum(is.na(merged_df3$OR)) - , '\nNA in pvalue: ', sum(is.na(merged_df3$pvalue)) - , '\nNA in AF:', sum(is.na(merged_df3$AF))) -} - -########################### -# 4: merging two dfs: without NA -########################### -######### -# merge 4a (merged_df2_comp): same as merge 1 but excluding NA -######### -cat('Merging dfs without any NAs: big df (1-many relationship b/w id & mut)' - ,'\nlinking col: Mutationinforamtion' - ,'\nfilename: merged_df2_comp') - -merged_df2_comp = merged_df2[!is.na(merged_df2$AF),] -#merged_df2_comp = merged_df2[!duplicated(merged_df2$Mutationinformation),] - -# sanity check -cat('Checking nrows in merged_df2_comp') -if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count + 1)){ - cat('PASS: No. of rows match' - ,'\nDim of merged_df2_comp: ' - ,'\nExpected no. of rows: ', nrow(merged_df2) - na_count + 1 - , '\nNo. of rows: ', nrow(merged_df2_comp) - , '\nNo. of cols: ', ncol(merged_df2_comp)) -}else{ - cat('FAIL: No. of rows mismatch' - ,'\nExpected no. of rows: ', nrow(merged_df2) - na_count + 1 - ,'\nGot no. 
of rows: ', nrow(merged_df2_comp)) -} - -######### -# merge 4b (merged_df3_comp): remove duplicate mutation information -######### -merged_df3_comp = merged_df2_comp[!duplicated(merged_df2_comp$Mutationinformation),] - -cat('Dim of merged_df3_comp: ' - , '\nNo. of rows: ', nrow(merged_df3_comp) - , '\nNo. of cols: ', ncol(merged_df3_comp)) - -# alternate way of deriving merged_df3_comp -foo = merged_df3[!is.na(merged_df3$AF),] -# compare dfs: foo and merged_df3_com -all.equal(foo, merged_df3) - -summary(comparedf(foo, merged_df3)) - -# sanity check -cat('Checking nrows in merged_df3_comp') -if(nrow(merged_df3_comp) == nrow(merged_df3)){ - cat('NO NAs detected in merged_df3 in AF|OR cols' - ,'\nNo. of rows are identical: ', nrow(merged_df3)) -} else{ - if(nrow(merged_df3_comp) == nrow(merged_df3) - na_count_df3) { - cat('PASS: NAs detected in merged_df3 in AF|OR cols' - , '\nNo. of NAs: ', na_count_df3 - , '\nExpected no. of rows in merged_df3_comp: ', nrow(merged_df3) - na_count_df3 - , '\nGot no. of rows: ', nrow(merged_df3_comp)) - } -} - -#=============== end of combining df -#********************* -# writing 1 file in the style of a loop: merged_df3 -# print(output dir) -#i = 'merged_df3' -#out_filename = paste0(i, '.csv') -#outfile = paste0(outdir, '/', out_filename) - -#cat('Writing output file: ' -# ,'\nFilename: ', out_filename -# ,'\nPath: ', outdir) - -#template: write.csv(merged_df3, 'merged_df3.csv') -#write.csv(get(i), outfile, row.names = FALSE) -#cat('Finished writing: ', outfile -# , '\nNo. of rows: ', nrow(get(i)) -# , '\nNo. of cols: ', ncol(get(i))) - -#%% write_output files; all 4 files: -outvars = c('merged_df2' - , 'merged_df3' - , 'merged_df2_comp' - , 'merged_df3_comp') - -cat('Writing output files: ' - , '\nPath:', outdir) - -for (i in outvars){ -# cat(i, '\n') - out_filename = paste0(i, '.csv') -# cat(out_filename, '\n') -# cat('getting value of variable: ', get(i)) - outfile = paste0(outdir, '/', out_filename) -# cat('Full output path: ', outfile, '\n') - cat('Writing output file:' - ,'\nFilename: ', out_filename,'\n') - write.csv(get(i), outfile, row.names = FALSE) - cat('Finished writing: ', outfile - , '\nNo. of rows: ', nrow(get(i)) - , '\nNo. of cols: ', ncol(get(i)), '\n') -} - -# alternate way to replace with implicit loop -# FIXME -#sapply(outvars, function(x, y) write.csv(get(outvars), paste0(outdir, '/', outvars, '.csv'))) -#************************* -# clear variables -rm(mcsm_data, meta_with_afor, foo, drug, gene, gene_match, indir, merged_muts_u, meta_muts_u, na_count, orig_col, outdir) -rm(pos_count_check) -#============================= end of script - diff --git a/mcsm_analysis/pyrazinamide/scripts/combining_two_df_lig.R b/mcsm_analysis/pyrazinamide/scripts/combining_two_df_lig.R deleted file mode 100644 index 361b6b6..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/combining_two_df_lig.R +++ /dev/null @@ -1,330 +0,0 @@ -######################################################### -# TASK: To combine mcsm and meta data with af and or -# by filtering for distance to ligand (<10Ang). -# This script doesn't output anything. 
-# This script is sourced from other .R scripts for plotting ligand plots - -# Input csv files: -# 1) mcsm normalised and struct params -# 2) gene associated meta_data_with_AFandOR -######################################################### -getwd() -setwd('~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/') -getwd() - -########################################################## -# Installing and loading required packages -########################################################## - -source('Header_TT.R') -#require(data.table) -#require(arsenal) -#require(compare) -#library(tidyverse) - -################################# -# Read file: normalised file -# output of step 4 mcsm_pipeline -################################# - -#%% variable assignment: input and output paths & filenames -drug = 'pyrazinamide' -gene = 'pncA' -gene_match = paste0(gene,'_p.') -cat(gene_match) - -#=========== -# input -#=========== -# infile1: mCSM data -#indir = '~/git/Data/pyrazinamide/input/processed/' -indir = paste0('~/git/Data', '/', drug, '/', 'output') # revised {TODO: change in mcsm pipeline} -#in_filename = 'mcsm_complex1_normalised.csv' -in_filename = 'pnca_mcsm_struct_params.csv' -infile = paste0(indir, '/', in_filename) -cat(paste0('Reading infile1: mCSM output file', ' ', infile) ) - -# infile2: gene associated meta data combined with AF and OR -#indir: same as above -in_filename_comb = paste0(tolower(gene), '_meta_data_with_AFandOR.csv') -infile_comb = paste0(indir, '/', in_filename_comb) -cat(paste0('Reading infile2: gene associated combined metadata:', infile_comb)) - -#=========== -# output -#=========== -# Uncomment if and when required to output -outdir = paste0('~/git/Data', '/', drug, '/', 'output') #same as indir -cat('Output dir: ', outdir) -#out_filename = paste0(tolower(gene), 'XXX') -#outfile = paste0(outdir, '/', out_filename) -#cat(paste0('Output file with full path:', outfile)) -#%% end of variable assignment for input and output files - -################################# -# Read file: normalised file -# output of step 4 mcsm_pipeline -################################# -cat('Reading mcsm_data:' - , '\nindir: ', indir - , '\ninfile_comb: ', in_filename) - -mcsm_data = read.csv(infile - , row.names = 1 - , stringsAsFactors = F - , header = T) - -cat('Read mcsm_data file:' - , '\nNo.of rows: ', nrow(mcsm_data) - , '\nNo. of cols:', ncol(mcsm_data)) - -# clear variables -rm(in_filename, infile) - -str(mcsm_data) - -table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) ) - -# spelling Correction 1: DUET -mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Stabilizing'] <- 'Stabilising' -mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Destabilizing'] <- 'Destabilising' - -# checks: should be the same as above -table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) ) -head(mcsm_data$DUET_outcome); tail(mcsm_data$DUET_outcome) - -# spelling Correction 2: Ligand -table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) ) - -mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Stabilizing'] <- 'Stabilising' -mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Destabilizing'] <- 'Destabilising' - -# checks: should be the same as above -table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) ) -head(mcsm_data$Lig_outcome); tail(mcsm_data$Lig_outcome) - -# muts with opposing effects on protomer and ligand stability -# excluded from here as it is redundant. -# check 'combining_two_df.R' to refer if required. - -########################### !!! 
only for mcsm_lig -# 4: Filter/subset data -# Lig plots < 10Ang -# Filter the lig plots for Dis_to_lig < 10Ang -########################### - -# check range of distances -max(mcsm_data$Dis_lig_Ang) -min(mcsm_data$Dis_lig_Ang) - -# count -table(mcsm_data$Dis_lig_Ang<10) - -# subset data to have only values less than 10 Ang -mcsm_data2 = subset(mcsm_data, mcsm_data$Dis_lig_Ang < 10) - -# sanity checks -max(mcsm_data2$Dis_lig_Ang) -min(mcsm_data2$Dis_lig_Ang) - -# count no of unique positions -length(unique(mcsm_data2$Position)) - -# count no of unique mutations -length(unique(mcsm_data2$Mutationinformation)) - -# count Destabilisinga and stabilising -table(mcsm_data2$Lig_outcome) #{RESULT: no of mutations within 10Ang} - -############################# -# Extra sanity check: -# for mcsm_lig ONLY -# Dis_lig_Ang should be <10 -############################# - -if (max(mcsm_data2$Dis_lig_Ang) < 10){ - print ("Sanity check passed: lig data is <10Ang") -}else{ - print ("Error: data should be filtered to be within 10Ang") -} - -#!!!!!!!!!!!!!!!!!!!!! -# REASSIGNMENT: so as not to alter the script -mcsm_data = mcsm_data2 -#!!!!!!!!!!!!!!!!!!!!! -# clear variables -rm(mcsm_data2) - -# count na in each column -na_count = sapply(mcsm_data, function(y) sum(length(which(is.na(y))))); na_count - -# sort by Mutationinformation -mcsm_data = mcsm_data[order(mcsm_data$Mutationinformation),] -head(mcsm_data$Mutationinformation) - -orig_col = ncol(mcsm_data) -# get freq count of positions and add to the df -setDT(mcsm_data)[, occurrence := .N, by = .(Position)] - -cat('Added 1 col: position frequency to see which posn has how many muts' - , '\nNo. of cols now', ncol(mcsm_data) - , '\nNo. of cols before: ', orig_col) - -pos_count_check = data.frame(mcsm_data$Position, mcsm_data$occurrence) - -########################### -# 2: Read file: meta data with AFandOR -########################### -cat('Reading combined meta data and AFandOR file:' - , '\nindir: ', indir - , '\ninfile_comb: ', in_filename_comb) - -meta_with_afor <- read.csv(infile_comb - , stringsAsFactors = F - , header = T) - -cat('Read mcsm_data file:' - , '\nNo.of rows: ', nrow(meta_with_afor) - , '\nNo. of cols:', ncol(meta_with_afor)) - -# clear variables -rm(in_filename_comb, infile_comb) - -str(meta_with_afor) - -# sort by Mutationinformation -head(meta_with_afor$Mutationinformation) -meta_with_afor = meta_with_afor[order(meta_with_afor$Mutationinformation),] -head(meta_with_afor$Mutationinformation) - -########################### -# 3: merging two dfs: with NA -########################### -# link col name = 'Mutationinforamtion' -cat('Merging dfs with NAs: big df (1-many relationship b/w id & mut)' - ,'\nlinking col: Mutationinforamtion' - ,'\nfilename: merged_df2') - -head(mcsm_data$Mutationinformation) -head(meta_with_afor$Mutationinformation) - -######### -# merge 3a: meta data with mcsm -######### -merged_df2 = merge(x = meta_with_afor - ,y = mcsm_data - , by = 'Mutationinformation' - , all.y = T) - -cat('Dim of merged_df2: ' - , '\nNo. of rows: ', nrow(merged_df2) - , '\nNo. of cols: ', ncol(merged_df2)) -head(merged_df2$Position) - -if(nrow(meta_with_afor) == nrow(merged_df2)){ - cat('nrow(merged_df2) = nrow (gene associated metadata)' - ,'\nExpected no. of rows: ',nrow(meta_with_afor) - ,'\nGot no. of rows: ', nrow(merged_df2)) -} else{ - cat('nrow(merged_df2)!= nrow(gene associated metadata)' - , '\nExpected no. of rows after merge: ', nrow(meta_with_afor) - , '\nGot no. 
of rows: ', nrow(merged_df2) - , '\nFinding discrepancy') - merged_muts_u = unique(merged_df2$Mutationinformation) - meta_muts_u = unique(meta_with_afor$Mutationinformation) - # find the index where it differs - unique(meta_muts_u[! meta_muts_u %in% merged_muts_u]) -} - -# sort by Position -head(merged_df2$Position) -merged_df2 = merged_df2[order(merged_df2$Position),] -head(merged_df2$Position) - -merged_df2v2 = merge(x = meta_with_afor - ,y = mcsm_data - , by = 'Mutationinformation' - , all.x = T) -#!=!=!=!=!=!=!=! -# COMMENT: used all.y since position 186 is not part of the struc, -# hence doesn't have a mcsm value -# but 186 is associated with mutation -#!=!=!=!=!=!=!=! - -# should be False -identical(merged_df2, merged_df2v2) -table(merged_df2$Position%in%merged_df2v2$Position) - -rm(merged_df2v2) - -######### -# merge 3b:remove duplicate mutation information -######### -cat('Merging dfs with NAs: small df (removing duplicate muts)' - ,'\nCannot trust lineage info from this' - ,'\nlinking col: Mutationinforamtion' - ,'\nfilename: merged_df3') - -#==#=#=#=#=#=# -# Cannot trust lineage, country from this df as the same mutation -# can have many different lineages -# but this should be good for the numerical corr plots -#=#=#=#=#=#=#= -merged_df3 = merged_df2[!duplicated(merged_df2$Mutationinformation),] -head(merged_df3$Position); tail(merged_df3$Position) # should be sorted - -# sanity checks -# nrows of merged_df3 should be the same as the nrows of mcsm_data -if(nrow(mcsm_data) == nrow(merged_df3)){ - cat('PASS: No. of rows match with mcsm_data' - ,'\nExpected no. of rows: ', nrow(mcsm_data) - ,'\nGot no. of rows: ', nrow(merged_df3)) -} else { - cat('FAIL: No. of rows mismatch' - , '\nNo. of rows mcsm_data: ', nrow(mcsm_data) - , '\nNo. of rows merged_df3: ', nrow(merged_df3)) -} - -########################### -# 4: merging two dfs: without NA -########################### -cat('Merging dfs without any NAs: big df (1-many relationship b/w id & mut)' - ,'\nlinking col: Mutationinforamtion' - ,'\nfilename: merged_df2_comp') - -######### -# merge 4a: same as merge 1 but excluding NA -######### -merged_df2_comp = merged_df2[!is.na(merged_df2$AF),] -#merged_df2_comp = merged_df2[!duplicated(merged_df2$Mutationinformation),] - -cat('Dim of merged_df2_comp: ' - , '\nNo. of rows: ', nrow(merged_df2_comp) - , '\nNo. of cols: ', ncol(merged_df2_comp)) - -######### -# merge 4b: remove duplicate mutation information -######### -merged_df3_comp = merged_df2_comp[!duplicated(merged_df2_comp$Mutationinformation),] - -cat('Dim of merged_df3_comp: ' - , '\nNo. of rows: ', nrow(merged_df3_comp) - , '\nNo. 
of cols: ', ncol(merged_df3_comp)) - -# alternate way of deriving merged_df3_comp -foo = merged_df3[!is.na(merged_df3$AF),] -# compare dfs: foo and merged_df3_com -all.equal(foo, merged_df3) - -summary(comparedf(foo, merged_df3)) - -#=============== end of combining df -#********************* -# write_output files -# Not required as this is a subset of the combining_two_df.R -#************************* -# clear variables -rm(mcsm_data, meta_with_afor, foo, drug, gene, gene_match, indir, merged_muts_u, meta_muts_u, na_count, orig_col, outdir) -rm(pos_count_check) -#============================= end of script - diff --git a/mcsm_analysis/pyrazinamide/scripts/generate_mut_sequences.py b/mcsm_analysis/pyrazinamide/scripts/generate_mut_sequences.py deleted file mode 100755 index 5cc5f09..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/generate_mut_sequences.py +++ /dev/null @@ -1,215 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Tue Jun 25 08:46:36 2019 - -@author: tanushree -""" -############################################ -# load libraries -import os -import pandas as pd -import numpy as np -from Bio import SeqIO -############################################ -#******************************************************************** -# TASK: Read in fasta files and create mutant sequences akin to a MSA, -# to allow generation of logo plots - -# Requirements: -# input: Fasta file of protein/target for which mut seqs will be created - # path: "Data//input/original/" -# output: MSA for mutant sequences - # path: "Data//input/processed/" -#*********************************************************************** -#%% -# specify input and output variables -homedir = os.path.expanduser('~') -#======= -# input -#======= -############# -# fasta file -############# -indir = 'git/Data/pyrazinamide/input/original' -in_filename_fasta = "3pl1.fasta.txt" -infile_fasta = homedir + '/' + indir + '/' + in_filename_fasta -print(infile_fasta) - -############# -# meta data -############# -# FIXME when you change the dir struc -inpath_p = "git/Data/pyrazinamide/input/processed" -in_filename_meta_data = "meta_data_with_AFandOR.csv" -infile_meta_data = homedir + '/' + inpath_p + '/' + in_filename_meta_data -print("Input file is:", infile_meta_data) - -#======= -# output -#======= -outdir = 'git/Data/pyrazinamide/output' -# filenames in respective sections - -################## end of variable assignment for input and output files -#%% -#========== -# read files -#========== - -############# -# fasta file -############# -my_fasta_o = str() -for seq_record in SeqIO.parse(infile_fasta, "fasta"): - my_seq = seq_record.seq - my_fasta_o = str(my_seq) #convert to a string - print(my_fasta_o) - print(len(my_fasta_o)) -# print( type(my_fasta) ) - -# remove non_struc positions from fasta -def remove_char(str, n): - first_part = str[:n] - last_part = str[n+1:] - return first_part + last_part -#print(remove_char('Python', 0)) - -ns_pos_o = 186 -offset = 1 # 0 based indexing -ns_pos = ns_pos_o - offset -my_fasta = remove_char(my_fasta_o, ns_pos) -print("orig length:", len(my_fasta_o)) -print("new length:", len(my_fasta)) - -############# -# SNP info and no of MSA to generate -############# -# read mutant_info file and extract cols with positions and mutant_info -# This should be all samples with pncA muts -#my_data = pd.read_csv('mcsm_complex1_normalised.csv') -my_data = pd.read_csv(infile_meta_data) -list(my_data.columns) -#my_data['OR'].value_counts() -#my_data['OR'].isna().sum() - -#FIXME: You need a 
better way to identify this -# ideally this file should not contain any non_struc pos -# remove positions not in the structure -my_data = my_data[my_data.position != ns_pos_o] - -# if multiple positions, then try the example below; -# https://stackoverflow.com/questions/29017525/deleting-rows-based-on-multiple-conditions-python-pandas -#df = df[(df.one > 0) | (df.two > 0) | (df.three > 0) & (df.four < 1)] - -# count mutations per sample -mut_info = my_data[['id', 'Mutationinformation', 'wild_type', 'position', 'mutant_type']] - -# test -foo = mut_info[mut_info.Mutationinformation.str.contains('C72Y')] - -foo = mut_info.pivot_table(values = ['Mutationinformation'] - , index = ['Mutationinformation', 'id'] -# , columns = - , aggfunc = 'count') - -# table -foo_tab = mut_info.pivot_table(values = ['Mutationinformation'] -# , index = ['Mutationinformation'] - , columns = ['id', 'Mutationinformation'] - , aggfunc = 'count' -# , margins = True) - ) -foo_tab.stack('id') - -mut_info.to_csv('mutinfo.csv') - -mut_info1 = my_data[['position', 'mutant_type']] -#%% -################ -# data cleaning -################ -# extract only those positions that have a frequency count of pos>1 -###mut_info['freq_pos'] = mut_info.groupby('Position').count()#### dodgy - -# add a column of frequency for each position -#mut_info1['freq_pos'] = mut_info1.groupby('position')['position'].transform('count') -mut_info1['freq_pos'] = mut_info1.position.map(mut_info1.position.value_counts()) - -# sort by position -mut_info2 = mut_info1.sort_values(by=['position']) - -# count how many pos have freq 1 as you will need to exclude those -mutfreq1_count = mut_info2[mut_info2.freq_pos == 1].sum().freq_pos - -# extract entries with freq_pos>1 -# should be 3093-211 = 3072 -mut_info3 = mut_info2.loc[mut_info2['freq_pos'] >1] #3072 -print("orig length:", len(mut_info1)) -print("No. of excluded values:", mutfreq1_count) -print("new length:", len(mut_info3)) -# sanity check -if ( (len(mut_info1) - mutfreq1_count) == len(mut_info3) ): - print("Sanity check passed: Filtered data correctly") -else: - print("Error: Debug you code") - -# reset index to allow iteration !!!!!!!!!! 
IMPORTANT -mut_info = mut_info3.reset_index(drop = True) - -##del(mut_info1, mut_info2, mut_info3, my_data) - -################### -# generate mut seqs -################### -mut_seqsL = [] * len(mut_info) - -# iterate -for i, pos in enumerate(mut_info['position']): - my_fastaL = list(my_fasta) - mut = mut_info['mutant_type'][i] - offset_pos = pos-1 - - print('1-index:', pos, '0-index cur:', offset_pos, my_fastaL[offset_pos], 'mut:', mut) - my_fastaL[offset_pos] = mut - print('1-index:', pos, '0-index new:', offset_pos, my_fastaL[offset_pos], 'mut:', mut) - - mut_seq = "".join(my_fastaL) -# print(mut_seq + '\n') - print('original:', my_fasta, ',', 'replaced:', my_fasta[offset_pos], 'at', pos, 'with', mut, mut_seq) - mut_seqsL.append(mut_seq) - - -############### -# sanity check -################ -len_orig = len(my_fasta) -# checking if all the mutant sequences have the same length as the original fasta file sequence -for seqs in mut_seqsL: -# print(seqs) -# print(len(seqs)) - if len(seqs) != len_orig: - print('sequence lengths mismatch' +'\n', 'mutant seq length:', len(seqs), 'vs original seq length:', len_orig) - else: - print('**Hooray** Length of mutant and original sequences match') - -del(i, len_orig, mut, mut_seq, my_fastaL, offset_pos, pos, seqs) - -############ -# write file -############ -#filepath = homedir +'/git/LSHTM_Y1_PNCA/combined_v3/logo_plot/snp_seqsfile' -#filepath = homedir + '/git/LSHTM_Y1_PNCA/mcsm_analysis/pyrazinamide/Data/gene_msa.txt' -print(outdir) -out_filename = "gene_msa.txt" -outfile_gene = homedir + '/' + outdir + '/' + out_filename -print(outfile_gene) - -with open(outfile_gene, 'w') as file_handler: - for item in mut_seqsL: - file_handler.write("{}\n".format(item)) - -#R = "\n".join(mut_seqsL) -#f = open('Columns.csv','w') -#f.write(R) -#f.close() diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/run.sh b/mcsm_analysis/pyrazinamide/scripts/mcsm/run.sh deleted file mode 100755 index 7e00fb1..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/mcsm/run.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash - -# run all bash scripts for mcsm - -#./step0_check_duplicate_SNPs.sh -#./step1_lig_output_urls.sh -./step2_lig_results.sh -./step3a_results_format_interim.sh - diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step0_check_duplicate_SNPs.sh b/mcsm_analysis/pyrazinamide/scripts/mcsm/step0_check_duplicate_SNPs.sh deleted file mode 100755 index 4c24392..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step0_check_duplicate_SNPs.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -#************************************* -# need to be in the correct directory -#************************************* -##: comments for code -#: commented out code - -#********************************************************************** -# TASK: Text file containing a list of SNPs; SNP in the format(C2E) -# per line. Sort by unique, which automatically removes duplicates. -# sace file in current directory -#********************************************************************** -infile="${HOME}/git/Data/pyrazinamide/input/processed/pnca_mis_SNPs_v2.csv" -outfile="${HOME}/git/Data/pyrazinamide/input/processed/pnca_mis_SNPs_v2_unique.csv" - -# sort unique entries and output to current directory -sort -u ${infile} > ${outfile} - -# count no. of unique snps mCSM will run on -count=$(wc -l < ${outfile}) - -# print to console no. 
of unique snps mCSM will run on -echo "${count} unique mutations for mCSM to run on" - diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step1_lig_output_urls.sh b/mcsm_analysis/pyrazinamide/scripts/mcsm/step1_lig_output_urls.sh deleted file mode 100755 index 6361b62..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step1_lig_output_urls.sh +++ /dev/null @@ -1,104 +0,0 @@ -#!/bin/bash - -#********************************************************************** -# TASK: submit requests using curl: HANDLE redirects and refresh url. -# Iterate over mutation file and write/append result urls to a file -# Mutation file must have one mutation (format A1B) per line -# Requirements -# input: mutation list (format: A1B), complex struc: (pdb format) - # mutation: outFile from step0, one unique mutation/line, no chain ID - # path: "Data//input/processed/" - # structure: pdb file of drug-target complex - # path: "Data//input/structure/" -# output: should be n urls (n=no. of unique mutations in file) - # path: "Data//input/processed/" - -# NOTE: these are just result urls, not actual values for results -#********************************************************************** -############# specify variables for input and output paths and filenames -homedir="${HOME}" -#echo Home directory is ${homedir} -basedir="/git/Data/pyrazinamide/input" - -# input -inpath_mut="/processed" -in_filename_mut="/pnca_mis_SNPs_v2_unique.csv" -infile_mut="${homedir}${basedir}${inpath_mut}${in_filename_mut}" -echo Input Mut filename: ${infile_mut} - -inpath_struc="/structure" -in_filename_struc="/complex1_no_water.pdb" -infile_struc="${homedir}${basedir}${inpath_struc}${in_filename_struc}" -echo Input Struc filename: ${infile_struc} - -# output -outpath="/processed" -out_filename="/complex1_result_url.txt" -outfile="${homedir}${basedir}${outpath}${out_filename}" -#echo Output filename: ${outfile} -################## end of variable assignment for input and output files - -# iterate over mutation file (infile_mut); line by line and -# submit query using curl -# some useful messages -echo -n -e "Processing $(wc -l < ${infile_mut}) entries from ${infile_mut}\n" -COUNT=0 -while read -r line; do -((COUNT++)) -mutation="${line}" -# echo "${mutation}" -#pdb='../Data/complex1_no_water.pdb' -pdb="${infile_struc}" -mutation="${mutation}" -chain="A" -lig_id="PZA" -affin_wt="0.99" -host="http://biosig.unimelb.edu.au" -call_url="/mcsm_lig/prediction" - -#========================================= -##html field_names names required for curl -##complex_field:wild=@ -##mutation_field:mutation=@ -##chain_field:chain=@ -##ligand_field:lig_id@ -##energy_field:affin_wt -#========================================= -refresh_url=$(curl -L \ - -sS \ - -F "wild=@${pdb}" \ - -F "mutation=${mutation}" \ - -F "chain=${chain}" \ - -F "lig_id=${lig_id}" \ - -F "affin_wt=${affin_wt}" \ - ${host}${call_url} | grep "http-equiv") - -#echo Refresh URL: $refresh_url -#echo Host+Refresh: ${host}${refresh_url} - -# use regex to extract the relevant bit from the refresh url -# regex:sed -r 's/.*(\/mcsm.*)".*$/\1/g' - -# Now build: result url using host and refresh url and write the urls to a file -result_url=$(echo $refresh_url | sed -r 's/.*(\/mcsm.*)".*$/\1/g') -sleep 10 - -echo -e "${mutation} : processing entry ${COUNT}/$(wc -l < ${infile_mut})..." - -# create output file with the added number of muts from file -# after much thought, bad idea as less generic! 
-#echo -e "${host}${result_url}" >> ../Results/$(wc -l < ${filename})_complex1_result_url.txt -echo -e "${host}${result_url}" >> ${outfile} -#echo -n '.' -done < "${infile_mut}" - -#FIXME: stop executing if error else these echo statements are misleading! -echo -echo Output filename: ${outfile} -echo -echo Number of urls saved: $(wc -l < ${infile_mut}) -echo -echo "Processing Complete" - -# end of submitting query, receiving result url and storing results url in a file - diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_lig_results.sh b/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_lig_results.sh deleted file mode 100755 index 51a7844..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_lig_results.sh +++ /dev/null @@ -1,76 +0,0 @@ -#!/bin/bash - -#******************************************************************** -# TASK: submit result urls and fetch actual results using curl -# Iterate over each result url from the output of step1 stored in processed/ -# Use curl to fetch results and extract relevant sections using hxtools -# and store these in another file in processed/ - -# Requirements: -# input: output of step1, file containing result urls - # path: "Data//input/processed/" -# output: name of the file where extracted results will be stored - # path: "Data//input/processed/" - -# Optional: can make these command line args you pass when calling script -# by uncommenting code as indicated -#********************************************************************* -############################# uncomment: to make it command line args -#if [ "$#" -ne 2 ]; then - #if [ -Z $1 ]; then -# echo " -# Please provide both Input and Output files. - -# Usage: batch_read_urls.sh INFILE OUTFILE -# " -# exit 1 -#fi - -# First argument: Input File -# Second argument: Output File -#infile=$1 -#outfile=$2 -############################ end of code block to make command line args - -############# specify variables for input and output paths and filenames -homedir="${HOME}" -#echo Home directory is ${homedir} -basedir="/git/Data/pyrazinamide/input" - -# input -inpath="/processed" -in_filename="/complex1_result_url.txt" -infile="${homedir}${basedir}${inpath}${in_filename}" -echo Input Mut filename: ${infile} - -# output -outpath="/processed" -out_filename="/complex1_output_MASTER.txt" -outfile="${homedir}${basedir}${outpath}${out_filename}" -echo Output filename: ${outfile} -################## end of variable assignment for input and output files - -# Iterate over each result url, and extract results using hxtools -# which nicely cleans and formats html -echo -n "Processing $(wc -l < ${infile}) entries from ${infile}" -echo -COUNT=0 -while read -r line; do -#COUNT=$(($COUNT+1)) -((COUNT++)) - curl --silent ${line} \ - | hxnormalize -x \ - | hxselect -c div.span4 \ - | hxselect -c div.well \ - | sed -r -e 's/<[^>]*>//g' \ - | sed -re 's/ +//g' \ - >> ${outfile} - #| tee -a ${outfile} -# echo -n '.' -echo -e "Processing entry ${COUNT}/$(wc -l < ${infile})..." 
- -done < "${infile}" - -echo -echo "Processing Complete" - diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_results_format_interim.sh b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_results_format_interim.sh deleted file mode 100755 index 0861996..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_results_format_interim.sh +++ /dev/null @@ -1,74 +0,0 @@ -#!/bin/bash - -#******************************************************************** -# TASK: Intermediate results processing -# output file has a convenient delimiter of ":" that can be used to -# format the file into two columns (col1: field_desc and col2: values) -# However the section "PredictedAffinityChange:...." and -# "DUETstabilitychange:.." are split over multiple lines and -# prevent this from happening. Additionally there are other empty lines -# that need to be omiited. In order ensure these sections are not split -# over multiple lines, this script is written. - -# Requirements: -# input: output of step2, file containing mcsm results as described above - # path: "Data//input/processed/" -# output: replaces file in place. -# Therefore first create a copy of the input file -# but rename it to remove the word "MASTER" and add the word "processed" -# file format: .txt - -# NOTE: This replaces the file in place! -# the output is a txt file with no newlines and formatting -# to have the following format "<:> -#*********************************************************************** -############# specify variables for input and output paths and filenames -homedir="${HOME}" -basedir="/git/Data/pyrazinamide/input" - -inpath="/processed" - -# Create input file: copy and rename output file of step2 -oldfile="${homedir}${basedir}${inpath}/complex1_output_MASTER.txt" -newfile="${homedir}${basedir}${inpath}/complex1_output_processed.txt" -cp $oldfile $newfile - -echo Input filename is ${oldfile} -echo -echo Output i.e copied filename is ${newfile} - -# output: No output perse -# Replacement in place inside the copied file -################## end of variable assignment for input and output files - -#sed -i '/PredictedAffinityChange:/ { N; N; N; N; s/\n//g;}' ${newfile} \ -# | sed -i '/DUETstabilitychange:/ {x; N; N; s/\n//g; p;d;}' ${newfile} - -# Outputs records separated by a newline, that look something like this: -# PredictedAffinityChange:-2.2log(affinityfoldchange)-Destabilizing -# Mutationinformation: -# Wild-type:L -# Position:4 -# Mutant-type:W -# Chain:A -# LigandID:PZA -# Distancetoligand:15.911Å -# DUETstabilitychange:-2.169Kcal/mol -# -# PredictedAffinityChange:-1.538log(affinityfoldchange)-Destabilizing -# (...etc) - -# This script brings everything in a convenient format for further processing in python. 
-sed -i '/PredictedAffinityChange/ { -N -N -N -N -s/\n//g -} -/DUETstabilitychange:/ { -N -N -s/\n//g -} -/^$/d' ${newfile} diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_results_format_df.py b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_results_format_df.py deleted file mode 100755 index 0e07c0d..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_results_format_df.py +++ /dev/null @@ -1,63 +0,0 @@ -#!/usr/bin/python - -################### -# load libraries -import os, sys -import pandas as pd -from collections import defaultdict -#################### - -#******************************************************************** -# TASK: Formatting results with nice colnames -# step3a processed the mcsm results to remove all newlines and -# brought data in a format where the delimiter ":" splits -# data into a convenient format of "colname": "value". -# this script formats the data and outputs a df with each row -# as a mutation and its corresponding mcsm_values - -# Requirements: -# input: output of step3a, file containing "..._output_processed.txt" - # path: "Data//input/processed/" -# output: formatted .csv file - # path: "Data//input/processed/" -#*********************************************************************** -############# specify variables for input and output paths and filenames -homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde -basedir = "/git/Data/pyrazinamide/input" - -# input -inpath = "/processed" -in_filename = "/complex1_output_processed.txt" -infile = homedir + basedir + inpath + in_filename -print("Input file is:", infile) - -# output -outpath = "/processed" -out_filename = "/complex1_formatted_results.csv" -outfile = homedir + basedir + outpath + out_filename -print("Output file is:", outfile) -################## end of variable assignment for input and output files - -outCols=[ - 'PredictedAffinityChange', - 'Mutationinformation', - 'Wild-type', - 'Position', - 'Mutant-type', - 'Chain', - 'LigandID', - 'Distancetoligand', - 'DUETstabilitychange' - ] - -lines = [line.rstrip('\n') for line in open(infile)] - -outputs = defaultdict(list) - -for item in lines: - col, val = item.split(':') - outputs[col].append(val) - -dfOut=pd.DataFrame(outputs) - -pd.DataFrame.to_csv(dfOut, outfile, columns=outCols) diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_results_cleaning.R b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_results_cleaning.R deleted file mode 100644 index c58dc8b..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_results_cleaning.R +++ /dev/null @@ -1,230 +0,0 @@ -getwd() -#setwd("~/git/LSHTM_analysis/mcsm_complex1/Results") -getwd() - -#======================================================= -# TASK: read formatted_results_df.csv to complete -# missing info, adding DUET categories, assigning -# meaningful colnames, etc. 
- -# Requirements: -# input: output of step3b, python processing, - # path: Data//input/processed/" -# output: NO output as the next scripts refers to this -# for yet more processing -#======================================================= - -# specify variables for input and output paths and filenames -homedir = "~" -basedir = "/git/Data/pyrazinamide/input" -inpath = "/processed" -in_filename = "/complex1_formatted_results.csv" -infile = paste0(homedir, basedir, inpath, in_filename) -print(paste0("Input file is:", infile)) - -#====================================================== -#TASK: To tidy the columns so you can generate figures -#======================================================= -#################### -#### read file #####: this will be the output from python script (csv file) -#################### -data = read.csv(infile - , header = T - , stringsAsFactors = FALSE) -dim(data) -str(data) - -# clear variables -rm(homedir, basedir, inpath, in_filename, infile) - -########################### -##### Data processing ##### -########################### - -# populate mutation information columns as currently it is empty -head(data$Mutationinformation) -tail(data$Mutationinformation) - -# should not be blank: create muation information -data$Mutationinformation = paste0(data$Wild.type, data$Position, data$Mutant.type) - -head(data$Mutationinformation) -tail(data$Mutationinformation) -#write.csv(data, 'test.csv') - -########################################## -# Remove duplicate SNPs as a sanity check -########################################## -# very important -table(duplicated(data$Mutationinformation)) - -# extract duplicated entries -dups = data[duplicated(data$Mutationinformation),] #0 - -# No of dups should match with the no. of TRUE in the above table -#u_dups = unique(dups$Mutationinformation) #10 -sum( table(dups$Mutationinformation) ) - -#*************************************************************** -# select non-duplicated SNPs and create a new df -df = data[!duplicated(data$Mutationinformation),] -#*************************************************************** -# sanity check -u = unique(df$Mutationinformation) -u2 = unique(data$Mutationinformation) -table(u%in%u2) - -# should all be 1 -sum(table(df$Mutationinformation) == 1) - -# sort df by Position -# MANUAL CHECKPOINT: -#foo <- df[order(df$Position),] -#df <- df[order(df$Position),] - -# clear variables -rm(u, u2, dups) - -#################### -#### give meaningful colnames to reflect units to enable correct data type -#################### - -#======= -#STEP 1 -#======== -# make a copy of the PredictedAffinityColumn and call it Lig_outcome -df$Lig_outcome = df$PredictedAffinityChange - - #make Predicted...column numeric and outcome column categorical -head(df$PredictedAffinityChange) -df$PredictedAffinityChange = gsub("log.*" - , "" - , df$PredictedAffinityChange) - -# sanity checks -head(df$PredictedAffinityChange) - -# should be numeric, check and if not make it numeric -is.numeric( df$PredictedAffinityChange ) - -# change to numeric -df$PredictedAffinityChange = as.numeric(df$PredictedAffinityChange) - -# should be TRUE -is.numeric( df$PredictedAffinityChange ) - -# change the column name to indicate units -n = which(colnames(df) == "PredictedAffinityChange"); n -colnames(df)[n] = "PredAffLog" -colnames(df)[n] - -#======== -#STEP 2 -#======== -# make Lig_outcome column categorical showing effect of mutation -head(df$Lig_outcome) -df$Lig_outcome = gsub("^.*-" - , "", - df$Lig_outcome) -# sanity checks 
-head(df$Lig_outcome) - -# should be factor, check and if not change it to factor -is.factor(df$Lig_outcome) - -# change to factor -df$Lig_outcome = as.factor(df$Lig_outcome) - -# should be TRUE -is.factor(df$Lig_outcome) - -#======== -#STEP 3 -#======== -# gsub -head(df$Distancetoligand) -df$Distancetoligand = gsub("Å" - , "" - , df$Distancetoligand) -# sanity checks -head(df$Distancetoligand) - -# should be numeric, check if not change it to numeric -is.numeric(df$Distancetoligand) - -# change to numeric -df$Distancetoligand = as.numeric(df$Distancetoligand) - -# should be TRUE -is.numeric(df$Distancetoligand) - -# change the column name to indicate units -n = which(colnames(df) == "Distancetoligand") -colnames(df)[n] <- "Dis_lig_Ang" -colnames(df)[n] - -#======== -#STEP 4 -#======== -#gsub -head(df$DUETstabilitychange) -df$DUETstabilitychange = gsub("Kcal/mol" - , "" - , df$DUETstabilitychange) -# sanity checks -head(df$DUETstabilitychange) - -# should be numeric, check if not change it to numeric -is.numeric(df$DUETstabilitychange) - -# change to numeric -df$DUETstabilitychange = as.numeric(df$DUETstabilitychange) - -# should be TRUE -is.numeric(df$DUETstabilitychange) - -# change the column name to indicate units -n = which(colnames(df) == "DUETstabilitychange"); n -colnames(df)[n] = "DUETStability_Kcalpermol" -colnames(df)[n] - -#======== -#STEP 5 -#======== -# create yet another extra column: classification of DUET stability only -df$DUET_outcome = ifelse(df$DUETStability_Kcalpermol >=0 - , "Stabilizing" - , "Destabilizing") # spelling to be consistent with mcsm - -table(df$Lig_outcome) - -table(df$DUET_outcome) - -#============================== -#FIXME -#Insert a venn diagram -#================================ - -#======== -#STEP 6 -#======== -# assign wild and mutant colnames correctly - -wt = which(colnames(df) == "Wild.type"); wt -colnames(df)[wt] <- "Wild_type" -colnames(df[wt]) - -mut = which(colnames(df) == "Mutant.type"); mut -colnames(df)[mut] <- "Mutant_type" -colnames(df[mut]) - -#======== -#STEP 7 -#======== -# create an extra column: maybe useful for some plots -df$WildPos = paste0(df$Wild_type, df$Position) - -# clear variables -rm(n, wt, mut) - -################ end of data cleaning diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step4_results_normalise.R b/mcsm_analysis/pyrazinamide/scripts/mcsm/step4_results_normalise.R deleted file mode 100644 index eb24cab..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step4_results_normalise.R +++ /dev/null @@ -1,275 +0,0 @@ -################## -# load libraries - library(compare) -################## - -getwd() - -#======================================================= -# TASK:read cleaned data and perform rescaling - # of DUET stability scores - # of Pred affinity -# compare scaling methods with plots - -# Requirements: -# input: R script, step3c_results_cleaning.R - # path: Data//input/processed/" -# output: NO output as the next scripts refers to this -# for yet more processing -# output normalised file -#======================================================= - -# specify variables for input and output paths and filenames -homedir = "~" -currdir = "/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/mcsm" -in_filename = "/step3c_results_cleaning.R" -infile = paste0(homedir, currdir, in_filename) -print(paste0("Input file is:", infile)) - -# output file -basedir = "/git/Data/pyrazinamide/input" -outpath = "/processed" -out_filename = "/mcsm_complex1_normalised.csv" -outfile = paste0(homedir, basedir, outpath, 
out_filename) -print(paste0("Output file is:", outfile)) - -#################### -#### read file #####: this will be the output of my R script that cleans the data columns -#################### -source(infile) - -#This will outut two dataframes: -# data: unclean data: 10 cols -# df : cleaned df: 13 cols -# you can remove data if you want as you will not need it -rm(data) - -colnames(df) - -#=================== -#3a: PredAffLog -#=================== -n = which(colnames(df) == "PredAffLog"); n -group = which(colnames(df) == "Lig_outcome"); group - -#=================================================== -# order according to PredAffLog values -#=================================================== -# This is because this makes it easier to see the results of rescaling for debugging -head(df$PredAffLog) - -# ORDER BY PredAff scrores: negative values at the top and positive at the bottoom -df = df[order(df$PredAffLog),] -head(df$PredAffLog) - -# sanity checks -head(df[,n]) # all negatives -tail(df[,n]) # all positives - -# sanity checks -mean(df[,n]) -#-0.9526746 - -tapply(df[,n], df[,group], mean) - -#=========================== -# Same as above: in 2 steps -#=========================== - -# find range of your data -my_min = min(df[,n]); my_min # -my_max = max(df[,n]); my_max # - -#=============================================== -# WITHIN GROUP rescaling 2: method "ratio" -# create column to store the rescaled values -# Rescaling separately (Less dangerous) -# =====> chosen one: preserves sign -#=============================================== -df$ratioPredAff = ifelse(df[,n] < 0 - , df[,n]/abs(my_min) - , df[,n]/my_max - )# 14 cols -# sanity checks -head(df$ratioPredAff) -tail(df$ratioPredAff) - -min(df$ratioPredAff); max(df$ratioPredAff) - -tapply(df$ratioPredAff, df$Lig_outcome, min) - -tapply(df$ratioPredAff, df$Lig_outcome, max) - -# should be the same as below -sum(df$ratioPredAff < 0); sum(df$ratioPredAff > 0) - -table(df$Lig_outcome) - -#=============================================== -# Hist and density plots to compare the rescaling -# methods: Base R -#=============================================== -# uncomment as necessary -my_title = "Ligand_stability" -# my_title = colnames(df[n]) - -# Set the margin on all sides -par(oma = c(3,2,3,0) - , mar = c(1,3,5,2) - , mfrow = c(2,2)) - -hist(df[,n] - , xlab = "" - , main = "Raw values" -) - -hist(df$ratioPredAff - , xlab = "" - , main = "ratio rescaling" -) - -# Plot density plots underneath -plot(density( df[,n] ) - , main = "Raw values" -) - -plot(density( df$ratioPredAff ) - , main = "ratio rescaling" -) - -# titles -mtext(text = "Frequency" - , side = 2 - , line = 0 - , outer = TRUE) - -mtext(text = my_title - , side = 3 - , line = 0 - , outer = TRUE) - - -#clear variables -rm(my_min, my_max, my_title, n, group) - -#=================== -# 3b: DUET stability -#=================== -dim(df) # 14 cols - -n = which(colnames(df) == "DUETStability_Kcalpermol"); n # 10 -group = which(colnames(df) == "DUET_outcome"); group #12 - -#=================================================== -# order according to DUET scores -#=================================================== -# This is because this makes it easier to see the results of rescaling for debugging -head(df$DUETStability_Kcalpermol) - -# ORDER BY DUET scores: negative values at the top and positive at the bottom -df = df[order(df$DUETStability_Kcalpermol),] - -# sanity checks -head(df[,n]) # negatives -tail(df[,n]) # positives - -# sanity checks -mean(df[,n]) - -tapply(df[,n], df[,group], 
mean) - -#=============================================== -# WITHIN GROUP rescaling 2: method "ratio" -# create column to store the rescaled values -# Rescaling separately (Less dangerous) -# =====> chosen one: preserves sign -#=============================================== -# find range of your data -my_min = min(df[,n]); my_min -my_max = max(df[,n]); my_max - -df$ratioDUET = ifelse(df[,n] < 0 - , df[,n]/abs(my_min) - , df[,n]/my_max - ) # 15 cols -# sanity check -head(df$ratioDUET) -tail(df$ratioDUET) - -min(df$ratioDUET); max(df$ratioDUET) - -# sanity checks -tapply(df$ratioDUET, df$DUET_outcome, min) - -tapply(df$ratioDUET, df$DUET_outcome, max) - -# should be the same as below (267 and 42) -sum(df$ratioDUET < 0); sum(df$ratioDUET > 0) - -table(df$DUET_outcome) - -#=============================================== -# Hist and density plots to compare the rescaling -# methods: Base R -#=============================================== -# uncomment as necessary -my_title = "DUET_stability" -#my_title = colnames(df[n]) - -# Set the margin on all sides -par(oma = c(3,2,3,0) - , mar = c(1,3,5,2) - , mfrow = c(2,2)) - -hist(df[,n] - , xlab = "" - , main = "Raw values" -) - -hist(df$ratioDUET - , xlab = "" - , main = "ratio rescaling" -) - -# Plot density plots underneath -plot(density( df[,n] ) - , main = "Raw values" -) - -plot(density( df$ratioDUET ) - , main = "ratio rescaling" -) - -# graph titles -mtext(text = "Frequency" - , side = 2 - , line = 0 - , outer = TRUE) - -mtext(text = my_title - , side = 3 - , line = 0 - , outer = TRUE) - -# reorder by column name -#data <- data[c("A", "B", "C")] -colnames(df) -df2 = df[c("X", "Mutationinformation", "WildPos", "Position" - , "Wild_type", "Mutant_type" - , "DUETStability_Kcalpermol", "DUET_outcome" - , "Dis_lig_Ang", "PredAffLog", "Lig_outcome" - , "ratioDUET", "ratioPredAff" - , "LigandID","Chain")] - -# sanity check -# should be True -#compare(df, df2, allowAll = T) -compare(df, df2, ignoreColOrder = T) -#TRUE -#reordered columns - -#=================== -# write output as csv file -#=================== -#write.csv(df, "../Data/mcsm_complex1_normalised.csv", row.names = FALSE) -write.csv(df2, outfile, row.names = FALSE) diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm_mean_stability.R b/mcsm_analysis/pyrazinamide/scripts/mcsm_mean_stability.R deleted file mode 100644 index 877215a..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/mcsm_mean_stability.R +++ /dev/null @@ -1,131 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages # -######################################################################## - -source("../Header_TT.R") -#source("barplot_colour_function.R") -require(data.table) -require(dplyr) - -######################################################################## -# Read file: call script for combining df for PS # -######################################################################## - -source("../combining_two_df.R") - -########################### -# This will return: - -# df with NA: -# merged_df2 -# merged_df3 - -# df without NA: -# merged_df2_comp -# merged_df3_comp -########################### - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION - -########################### -# you need merged_df3 -# or -# merged_df3_comp -# 
since these have unique SNPs -# I prefer to use the merged_df3 -# because using the _comp dataset means -# we lose some muts and at this level, we should use -# as much info as available -########################### - -# uncomment as necessary -#%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -my_df = merged_df3 -#my_df = merged_df3_comp -#%%%%%%%%%%%%%%%%%%%%%%%% - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -########################### -# Data for bfactor figure -# PS average -# Lig average -########################### - -head(my_df$Position) -head(my_df$ratioDUET) - -# order data frame -df = my_df[order(my_df$Position),] - -head(df$Position) -head(df$ratioDUET) - -#*********** -# PS: average by position -#*********** - -mean_DUET_by_position <- df %>% - group_by(Position) %>% - summarize(averaged.DUET = mean(ratioDUET)) - -#*********** -# Lig: average by position -#*********** -mean_Lig_by_position <- df %>% - group_by(Position) %>% - summarize(averaged.Lig = mean(ratioPredAff)) - - -#*********** -# cbind:mean_DUET_by_position and mean_Lig_by_position -#*********** - -combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position )) - -# sanity check -# mean_PS_Lig_Bfactor - -colnames(combined) - -colnames(combined) = c("Position" - , "average_DUETR" - , "Position2" - , "average_PredAffR") - -colnames(combined) - -identical(combined$Position, combined$Position2) - -n = which(colnames(combined) == "Position2"); n - -combined_df = combined[,-n] - -max(combined_df$average_DUETR) ; min(combined_df$average_DUETR) - -max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR) - -#============= -# output csv -#============ -outDir = "~/git/Data/pyrazinamide/input/processed/" -outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv") -print(paste0("Output file with path will be:","", outFile)) - -head(combined_df$Position); tail(combined_df$Position) - -write.csv(combined_df, outFile - , row.names = F) diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/.RData b/mcsm_analysis/pyrazinamide/scripts/plotting/.RData deleted file mode 100644 index 9ebc62b..0000000 Binary files a/mcsm_analysis/pyrazinamide/scripts/plotting/.RData and /dev/null differ diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/OR_PS_Ligand_combined_plot.R b/mcsm_analysis/pyrazinamide/scripts/plotting/OR_PS_Ligand_combined_plot.R deleted file mode 100644 index 26919ce..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/OR_PS_Ligand_combined_plot.R +++ /dev/null @@ -1,250 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages # -######################################################################## - -source("../Header_TT.R") -#source("barplot_colour_function.R") -require(cowplot) - -######################################################################## -# Read file: call script for combining df for PS # -######################################################################## - -source("../combining_two_df.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION - -#========================== -# This will return: - -# df with NA: -# merged_df2 -# merged_df3 - -# df without NA: -# merged_df2_comp -# 
merged_df3_comp -#=========================== - -########################### -# Data for OR and stability plots -# you need merged_df3_comp -# since these are matched -# to allow pairwise corr -########################### - -# uncomment as necessary -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -my_df = merged_df3_comp -#my_df = merged_df3 -#<<<<<<<<<<<<<<<<<<<<<<<<< - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -# sanity check -# Ensure correct data type in columns to plot: need to be factor -is.numeric(my_df$OR) -#[1] TRUE - -#<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -# FOR PS Plots -#<<<<<<<<<<<<<<<<<<< - -PS_df = my_df - -rm(my_df) -#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> end of section 1 - -######################################################################## -# Read file: call script for combining df for lig # -######################################################################## - -getwd() - -source("combining_two_df_lig.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION -#========================== -# This will return: - -# df with NA: -# merged_df2 -# merged_df3 - -# df without NA: -# merged_df2_comp -# merged_df3_comp -#=========================== - -########################### -# Data for OR and stability plots -# you need merged_df3_comp -# since these are matched -# to allow pairwise corr -########################### - -# uncomment as necessary -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -my_df2 = merged_df3_comp -#my_df2 = merged_df3 -#<<<<<<<<<<<<<<<<<<<<<<<<< - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df2) -str(my_df2) - -# sanity check -# Ensure correct data type in columns to plot: need to be factor -is.numeric(my_df2$OR) -#[1] TRUE - -# sanity check: should be <10 -if (max(my_df2$Dis_lig_Ang) < 10){ - print ("Sanity check passed: lig data is <10Ang") -}else{ - print ("Error: data should be filtered to be within 10Ang") -} - -#<<<<<<<<<<<<<<<< -# REASSIGNMENT -# FOR Lig Plots -#<<<<<<<<<<<<<<<< - -Lig_df = my_df2 - -rm(my_df2) - -#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> end of section 1 - -############# -# Plots: Bubble plot -# x = Position, Y = stability -# size of dots = OR -# col: stability -############# - -#================= -# generate plot 1: DUET vs OR by position as geom_points -#================= - -my_ats = 20 # axis text size -my_als = 22 # axis label size - -# Spelling Correction: made redundant as already corrected at the source - -#PS_df$DUET_outcome[PS_df$DUET_outcome=='Stabilizing'] <- 'Stabilising' -#PS_df$DUET_outcome[PS_df$DUET_outcome=='Destabilizing'] <- 'Destabilising' - -table(PS_df$DUET_outcome) ; sum(table(PS_df$DUET_outcome)) - -g = ggplot(PS_df, aes(x = factor(Position) - , y = ratioDUET)) - -p1 = g + - geom_point(aes(col = DUET_outcome - , size = OR)) + - theme(axis.text.x = element_text(size = my_ats - , angle = 90 - , hjust = 1 - , vjust = 0.4) - , axis.text.y = element_text(size = my_ats - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = my_als) - , axis.title.y = element_text(size = my_als) - , legend.text = element_text(size = my_als) - , legend.title = element_text(size = my_als) ) + - #, legend.key.size = unit(1, "cm")) + - labs(title = "" - , x = "Position" - , y = 
"DUET(PS)" - , size = "Odds Ratio" - , colour = "DUET Outcome") + - guides(colour = guide_legend(override.aes = list(size=4))) - -p1 - -#================= -# generate plot 2: Lig vs OR by position as geom_points -#================= -my_ats = 20 # axis text size -my_als = 22 # axis label size - -# Spelling Correction: made redundant as already corrected at the source - -#Lig_df$Lig_outcome[Lig_df$Lig_outcome=='Stabilizing'] <- 'Stabilising' -#Lig_df$Lig_outcome[Lig_df$Lig_outcome=='Destabilizing'] <- 'Destabilising' - -table(Lig_df$Lig_outcome) - -g = ggplot(Lig_df, aes(x = factor(Position) - , y = ratioPredAff)) - -p2 = g + - geom_point(aes(col = Lig_outcome - , size = OR))+ - theme(axis.text.x = element_text(size = my_ats - , angle = 90 - , hjust = 1 - , vjust = 0.4) - , axis.text.y = element_text(size = my_ats - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = my_als) - , axis.title.y = element_text(size = my_als) - , legend.text = element_text(size = my_als) - , legend.title = element_text(size = my_als) ) + - #, legend.key.size = unit(1, "cm")) + - labs(title = "" - , x = "Position" - , y = "Ligand Affinity" - , size = "Odds Ratio" - , colour = "Ligand Outcome" - ) + - guides(colour = guide_legend(override.aes = list(size=4))) - -p2 - -#====================== -#combine using cowplot -#====================== -# set output dir for plots -getwd() -setwd("~/git/Data/pyrazinamide/output/plots") -getwd() - -svg('PS_Lig_OR_combined.svg', width = 32, height = 12) #inches -#png('PS_Lig_OR_combined.png', width = 2800, height = 1080) #300dpi -theme_set(theme_gray()) # to preserve default theme - -printFile = cowplot::plot_grid(plot_grid(p1, p2 - , ncol = 1 - , align = 'v' - , labels = c("(a)", "(b)") - , label_size = my_als+5)) -print(printFile) -dev.off() - diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_LIG.R b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_LIG.R deleted file mode 100644 index 30b9981..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_LIG.R +++ /dev/null @@ -1,154 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages # -######################################################################## - -source("../Header_TT.R") - -######################################################################## -# Read file: call script for combining df for lig # -######################################################################## - -source("../combining_two_df_lig.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION - -#========================== -# This will return: - -# df with NA: -# merged_df2 -# merged_df3 - -# df without NA: -# merged_df2_comp -# merged_df3_comp -#=========================== - -########################### -# Data for Lig plots -# you need merged_df3 -# or -# merged_df3_comp -# since these have unique SNPs -# I prefer to use the merged_df3 -# because using the _comp dataset means -# we lose some muts and at this level, we should use -# as much info as available -########################### - -# uncomment as necessary -#%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -my_df = merged_df3 -#my_df = merged_df3_comp -#%%%%%%%%%%%%%%%%%%%%%%%% - -# delete variables not required 
-rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -############################# -# Extra sanity check: -# for mcsm_lig ONLY -# Dis_lig_Ang should be <10 -############################# - -if (max(my_df$Dis_lig_Ang) < 10){ - print ("Sanity check passed: lig data is <10Ang") -}else{ - print ("Error: data should be filtered to be within 10Ang") -} -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -#========================== -# Plot: Barplot with scores (unordered) -# corresponds to Lig_outcome -# Stacked Barplot with colours: Lig_outcome @ position coloured by -# Lig_outcome. This is a barplot where each bar corresponds -# to a SNP and is coloured by its corresponding Lig_outcome. -#============================ - -#=================== -# Data for plots -#=================== - -#%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -df = my_df -#%%%%%%%%%%%%%%%%%%%%%%%% - -rm(my_df) - -# sanity checks -upos = unique(my_df$Position) - -# should be a factor -is.factor(df$Lig_outcome) -#TRUE - -table(df$Lig_outcome) - -# should be -1 and 1: may not be in this case because you have filtered the data -# FIXME: normalisation before or after filtering? -min(df$ratioPredAff) # -max(df$ratioPredAff) # - -# sanity checks -tapply(df$ratioPredAff, df$Lig_outcome, min) -tapply(df$ratioPredAff, df$Lig_outcome, max) - -#****************** -# generate plot -#****************** - -# set output dir for plots -getwd() -setwd("~/git/Data/pyrazinamide/output/plots") -getwd() - -my_title = "Ligand affinity" - -# axis label size -my_xaxls = 13 -my_yaxls = 15 - -# axes text size -my_xaxts = 15 -my_yaxts = 15 - -# no ordering of x-axis -g = ggplot(df, aes(factor(Position, ordered = T))) -g + - geom_bar(aes(fill = Lig_outcome), colour = "grey") + - theme( axis.text.x = element_text(size = my_xaxls - , angle = 90 - , hjust = 1 - , vjust = 0.4) - , axis.text.y = element_text(size = my_yaxls - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = my_xaxts) - , axis.title.y = element_text(size = my_yaxts ) ) + - labs(title = my_title - , x = "Position" - , y = "Frequency") - -# for sanity and good practice -rm(df) -#======================= end of plot -# axis colours labels -# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors -# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_PS.R deleted file mode 100644 index 169bdaf..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_PS.R +++ /dev/null @@ -1,149 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages and functions # -######################################################################## - -source("../Header_TT.R") - -######################################################################## -# Read file: call script for combining df for PS # -######################################################################## - -source("../combining_two_df.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] 
"git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION - -#========================== -# This will return: - -# df with NA: -# merged_df2 -# merged_df3 - -# df without NA: -# merged_df2_comp -# merged_df3_comp -#========================== - -########################### -# Data for DUET plots -# you need merged_df3 -# or -# merged_df3_comp -# since these have unique SNPs -# I prefer to use the merged_df3 -# because using the _comp dataset means -# we lose some muts and at this level, we should use -# as much info as available -########################### - -# uncomment as necessary -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -my_df = merged_df3 -#my_df = merged_df3_comp -#<<<<<<<<<<<<<<<<<<<<<<<<< - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -# Ensure correct data type in columns to plot: need to be factor -# sanity check -is.factor(my_df$DUET_outcome) -my_df$DUET_outcome = as.factor(my_df$DUET_outcome) -is.factor(my_df$DUET_outcome) -#[1] TRUE - -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -#========================== -# Plot 2: Barplot with scores (unordered) -# corresponds to DUET_outcome -# Stacked Barplot with colours: DUET_outcome @ position coloured by -# DUET outcome. This is a barplot where each bar corresponds -# to a SNP and is coloured by its corresponding DUET_outcome -#============================ - -#=================== -# Data for plots -#=================== - -#<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -df = my_df -#<<<<<<<<<<<<<<<<<<<<<<<<< - -rm(my_df) - -# sanity checks -upos = unique(df$Position) - -# should be a factor -is.factor(my_df$DUET_outcome) -#[1] TRUE - -table(my_df$DUET_outcome) - -# should be -1 and 1 -min(df$ratioDUET) -max(df$ratioDUET) - -tapply(df$ratioDUET, df$DUET_outcome, min) -tapply(df$ratioDUET, df$DUET_outcome, max) - -#****************** -# generate plot -#****************** - -# set output dir for plots -getwd() -setwd("~/git/Data/pyrazinamide/output/plots") -getwd() - -my_title = "Protein stability (DUET)" - -# axis label size -my_xaxls = 13 -my_yaxls = 15 - -# axes text size -my_xaxts = 15 -my_yaxts = 15 - -# no ordering of x-axis -g = ggplot(df, aes(factor(Position, ordered = T))) -g + - geom_bar(aes(fill = DUET_outcome), colour = "grey") + - - theme( axis.text.x = element_text(size = my_xaxls - , angle = 90 - , hjust = 1 - , vjust = 0.4) - , axis.text.y = element_text(size = my_yaxls - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = my_xaxts) - , axis.title.y = element_text(size = my_yaxts ) ) + - labs(title = my_title - , x = "Position" - , y = "Frequency") - -# for sanity and good practice -rm(df) -#======================= end of plot -# axis colours labels -# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors -# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_LIG.R b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_LIG.R deleted file mode 100644 index a5d9361..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_LIG.R +++ /dev/null @@ -1,202 +0,0 @@ -getwd() 
-setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages and functions # -######################################################################## - -source("../Header_TT.R") -source("../barplot_colour_function.R") - -######################################################################## -# Read file: call script for combining df for lig # -######################################################################## - -source("../combining_two_df_lig.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION - -#========================== -# This will return: - -# df with NA: -# merged_df2 -# merged_df3 - -# df without NA: -# merged_df2_comp -# merged_df3_comp -#=========================== - -########################### -# Data for Lig plots -# you need merged_df3 -# or -# merged_df3_comp -# since these have unique SNPs -# I prefer to use the merged_df3 -# because using the _comp dataset means -# we lose some muts and at this level, we should use -# as much info as available -########################### - -# uncomment as necessary -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -my_df = merged_df3 -#my_df = merged_df3_comp -#<<<<<<<<<<<<<<<<<<<<<<<<< - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -# Ensure correct data type in columns to plot: need to be factor -# sanity check -is.factor(my_df$Lig_outcome) -my_df$Lig_outcome = as.factor(my_df$Ligoutcome) -is.factor(my_df$Lig_outcome) -#[1] TRUE - -############################# -# Extra sanity check: -# for mcsm_lig ONLY -# Dis_lig_Ang should be <10 -############################# - -if (max(my_df$Dis_lig_Ang) < 10){ - print ("Sanity check passed: lig data is <10Ang") -}else{ - print ("Error: data should be filtered to be within 10Ang") -} -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -#========================== -# Plot: Barplot with scores (unordered) -# corresponds to Lig_outcome -# Stacked Barplot with colours: Lig_outcome @ position coloured by -# stability scores. This is a barplot where each bar corresponds -# to a SNP and is coloured by its corresponding Lig stability value. -# Normalised values (range between -1 and 1 ) to aid visualisation -# NOTE: since barplot plots discrete values, colour = score, so number of -# colours will be equal to the no. of unique normalised scores -# rather than a continuous scale -# will require generating the colour scale separately. -#============================ - -#=================== -# Data for plots -#=================== - -#<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -df = my_df -#<<<<<<<<<<<<<<<<<<<<<<<<< - -rm(my_df) - -# sanity checks -table(df$Lig_outcome) - -# should be -1 and 1: may not be in this case because you have filtered the data -# FIXME: normalisation before or after filtering? -min(df$ratioPredAff) # -max(df$ratioPredAff) # - -# sanity checks -# very important!!!! 
-tapply(df$ratioPredAff, df$Lig_outcome, min) - -tapply(df$ratioPredAff, df$Lig_outcome, max) - - -#****************** -# generate plot -#****************** - -# set output dir for plots -getwd() -setwd("~/git/Data/pyrazinamide/output/plots") -getwd() - -# My colour FUNCTION: based on group and subgroup -# in my case; -# df = df -# group = Lig_outcome -# subgroup = normalised score i.e ratioPredAff - -# Prepare data: round off ratioLig scores -# round off to 3 significant digits: -# 165 if no rounding is performed: used to generate the originalgraph -# 156 if rounded to 3 places -# FIXME: check if reducing precision creates any ML prob - -# check unique values in normalised data -u = unique(df$ratioPredAff) - -# <<<<< ------------------------------------------- -# Run this section if rounding is to be used -# specify number for rounding -n = 3 -df$ratioLigR = round(df$ratioPredAff, n) -u = unique(df$ratioLigR) # 156 -# create an extra column called group which contains the "gp name and score" -# so colours can be generated for each unique values in this column -my_grp = df$ratioLigR -df$group <- paste0(df$Lig_outcome, "_", my_grp, sep = "") - -# else -# uncomment the below if rounding is not required - -#my_grp = df$ratioLig -#df$group <- paste0(df$Lig_outcome, "_", my_grp, sep = "") - -# <<<<< ----------------------------------------------- - -# Call the function to create the palette based on the group defined above -colours <- ColourPalleteMulti(df, "Lig_outcome", "my_grp") -my_title = "Ligand affinity" - -# axis label size -my_xaxls = 13 -my_yaxls = 15 - -# axes text size -my_xaxts = 15 -my_yaxts = 15 - -# no ordering of x-axis -g = ggplot(df, aes(factor(Position, ordered = T))) -g + - geom_bar(aes(fill = group), colour = "grey") + - scale_fill_manual( values = colours - , guide = 'none') + - theme( axis.text.x = element_text(size = my_xaxls - , angle = 90 - , hjust = 1 - , vjust = 0.4) - , axis.text.y = element_text(size = my_yaxls - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = my_xaxts) - , axis.title.y = element_text(size = my_yaxts ) ) + - labs(title = my_title - , x = "Position" - , y = "Frequency") - -# for sanity and good practice -rm(df) -#======================= end of plot -# axis colours labels -# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors -# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_PS.R deleted file mode 100644 index 8828e90..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_PS.R +++ /dev/null @@ -1,192 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages and functions # -######################################################################## - -source("../Header_TT.R") -source("../barplot_colour_function.R") - -######################################################################## -# Read file: call script for combining df for PS # -######################################################################## - -source("../combining_two_df.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY 
ATTENTION - -#========================== -# This will return: - -# df with NA: -# merged_df2 -# merged_df3 - -# df without NA: -# merged_df2_comp -# merged_df3_comp -#=========================== - -########################### -# Data for DUET plots -# you need merged_df3 -# or -# merged_df3_comp -# since these have unique SNPs -# I prefer to use the merged_df3 -# because using the _comp dataset means -# we lose some muts and at this level, we should use -# as much info as available -########################### - -# uncomment as necessary -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -my_df = merged_df3 -#my_df = merged_df3_comp -#<<<<<<<<<<<<<<<<<<<<<<<<< - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -# Ensure correct data type in columns to plot: need to be factor -# sanity check -is.factor(my_df$DUET_outcome) -my_df$DUET_outcome = as.factor(my_df$DUET_outcome) -is.factor(my_df$DUET_outcome) -#[1] TRUE - -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -#========================== -# Barplot with scores (unordered) -# corresponds to DUET_outcome -# Stacked Barplot with colours: DUET_outcome @ position coloured by -# stability scores. This is a barplot where each bar corresponds -# to a SNP and is coloured by its corresponding DUET stability value. -# Normalised values (range between -1 and 1 ) to aid visualisation -# NOTE: since barplot plots discrete values, colour = score, so number of -# colours will be equal to the no. of unique normalised scores -# rather than a continuous scale -# will require generating the colour scale separately. 
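## ---------------------------------------------------------------------
## Hedged sketch (added for illustration only; NOT the repo's own code).
## The palette actually used below comes from ColourPalleteMulti() in
## barplot_colour_function.R, which is not part of this diff. The
## hypothetical helper here only demonstrates the idea described in the
## comment above: one colour family per DUET_outcome group, with one
## shade per unique (rounded) score within that group, giving a discrete
## colour vector suitable for scale_fill_manual(). The function name and
## toy data are invented for this sketch.
## ---------------------------------------------------------------------
example_palette_multi = function(dat, group, subgroup) {
  scores_by_group = split(dat[[subgroup]], dat[[group]])
  base_cols = RColorBrewer::brewer.pal(max(3, length(scores_by_group)), "Set1")
  unlist(lapply(seq_along(scores_by_group), function(i) {
    n_shades = length(unique(scores_by_group[[i]])) # one shade per unique score
    grDevices::colorRampPalette(c("grey90", base_cols[i]))(n_shades + 1)[-1]
  }))
}

## toy usage: 2 outcome groups x 3 unique rounded scores each = 6 colours
toy_df = data.frame(outcome = rep(c("Destabilizing", "Stabilizing"), each = 4)
                    , score = c(-0.9, -0.5, -0.5, -0.1, 0.2, 0.2, 0.6, 1.0))
example_palette_multi(toy_df, "outcome", "score")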
-#============================ - -#=================== -# Data for plots -#=================== - -#<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -df = my_df -#<<<<<<<<<<<<<<<<<<<<<<<<< - -rm(my_df) - -# sanity checks -upos = unique(df$Position) - -# should be a factor -is.factor(my_df$DUET_outcome) -#[1] TRUE - -table(df$DUET_outcome) - -# should be -1 and 1 -min(df$ratioDUET) -max(df$ratioDUET) - -tapply(df$ratioDUET, df$DUET_outcome, min) -tapply(df$ratioDUET, df$DUET_outcome, max) - -#****************** -# generate plot -#****************** - -# set output dir for plots -getwd() -setwd("~/git/Data/pyrazinamide/output/plots") -getwd() - -# My colour FUNCTION: based on group and subgroup -# in my case; -# df = df -# group = DUET_outcome -# subgroup = normalised score i.e ratioDUET - -# Prepare data: round off ratioDUET scores -# round off to 3 significant digits: -# 323 if no rounding is performed: used to generate the original graph -# 287 if rounded to 3 places -# FIXME: check if reducing precicion creates any ML prob - -# check unique values in normalised data -u = unique(df$ratioDUET) - -# <<<<< ------------------------------------------- -# Run this section if rounding is to be used -# specify number for rounding -n = 3 -df$ratioDUETR = round(df$ratioDUET, n) -u = unique(df$ratioDUETR) -# create an extra column called group which contains the "gp name and score" -# so colours can be generated for each unique values in this column -my_grp = df$ratioDUETR -df$group <- paste0(df$DUET_outcome, "_", my_grp, sep = "") - -# else -# uncomment the below if rounding is not required - -#my_grp = df$ratioDUET -#df$group <- paste0(df$DUET_outcome, "_", my_grp, sep = "") - -# <<<<< ----------------------------------------------- - -# Call the function to create the palette based on the group defined above -colours <- ColourPalleteMulti(df, "DUET_outcome", "my_grp") -my_title = "Protein stability (DUET)" - -# axis label size -my_xaxls = 13 -my_yaxls = 15 - -# axes text size -my_xaxts = 15 -my_yaxts = 15 - -# no ordering of x-axis -g = ggplot(df, aes(factor(Position, ordered = T))) -g + - geom_bar(aes(fill = group), colour = "grey") + - scale_fill_manual( values = colours - , guide = 'none') + - theme( axis.text.x = element_text(size = my_xaxls - , angle = 90 - , hjust = 1 - , vjust = 0.4) - , axis.text.y = element_text(size = my_yaxls - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = my_xaxts) - , axis.title.y = element_text(size = my_yaxts ) ) + - labs(title = my_title - , x = "Position" - , y = "Frequency") - -# for sanity and good practice -rm(df) -#======================= end of plot -# axis colours labels -# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors -# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_aa_LIG.R b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_aa_LIG.R deleted file mode 100644 index 432749e..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_aa_LIG.R +++ /dev/null @@ -1,296 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -############################################################ -# 1: Installing and loading required packages and functions -############################################################ - -source("../Header_TT.R") -source("../barplot_colour_function.R") - 
-############################################################ -# Output dir for plots -############################################################ -out_dir = "~/git/Data/pyrazinamide/output/plots" - -############################################################ -# 2: call script the prepares the data with columns containing -# colours for axis labels -############################################################ - -source("subcols_axis_LIG.R") - -# this should return -#mut_pos_cols: 52, 4 -#my_df: 169, 39 - -# clear excess variable -# "mut_pos_cols" is just for inspection in case you need to cross check -# position numbers and colours -# open file from deskptop ("sample_axis_cols") for cross checking - -table(mut_pos_cols$lab_bg) - -sum( table(mut_pos_cols$lab_bg) ) == nrow(mut_pos_cols) # should be True - -table(mut_pos_cols$lab_bg2) - -sum( table(mut_pos_cols$lab_bg2) ) == nrow(mut_pos_cols) # should be True - -table(mut_pos_cols$lab_fg) - -sum( table(mut_pos_cols$lab_fg) ) == nrow(mut_pos_cols) # should be True - -# very important!: should be the length of the unique positions -my_axis_colours = mut_pos_cols$lab_fg - -# now clear mut_pos_cols -rm(mut_pos_cols) - -########################### -# 2: Plot: Lig scores -########################### -#========================== -# Plot 2: Barplot with scores (unordered) -# corresponds to Lig_outcome -# Stacked Barplot with colours: Lig_outcome @ position coloured by -# stability scores. This is a barplot where each bar corresponds -# to a SNP and is coloured by its corresponding PredAff stability value. -# Normalised values (range between -1 and 1 ) to aid visualisation -# NOTE: since barplot plots discrete values, colour = score, so number of -# colours will be equal to the no. of unique normalised scores -# rather than a continuous scale -# will require generating the colour scale separately. -#============================ -# sanity checks -upos = unique(my_df$Position) - -str(my_df$Lig_outcome) - -colnames(my_df) - -#=========================== -# Data preparation for plots -#=========================== -#!!!!!!!!!!!!!!!!! -# REASSIGNMENT -df <- my_df -#!!!!!!!!!!!!!!!!! - -rm(my_df) - -# sanity checks -# should be a factor -is.factor(df$Lig_outcome); -#FALSE - -df$Lig_outcome = as.factor(df$Lig_outcome) -is.factor(df$Lig_outcome); -#TRUE - -table(df$Lig_outcome) - -# check the range -min(df$ratioPredAff) -max(df$ratioPredAff) - -# sanity checks -# very important!!!! 
-tapply(df$ratioPredAff, df$Lig_outcome, min) - -tapply(df$ratioPredAff, df$Lig_outcome, max) - -# My colour FUNCTION: based on group and subgroup -# in my case; -# df = df -# group = Lig_outcome -# subgroup = normalised score i.e ratioPredAff - -# Prepare data: round off ratioPredAff scores -# round off to 3 significant digits: -# 323 if no rounding is performed: used to generate the original graph -# 287 if rounded to 3 places -# FIXME: check if reducing precicion creates any ML prob - -# check unique values in normalised data -u = unique(df$ratioPredAff) - -# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -# Run this section if rounding is to be used -# specify number for rounding -n = 3 -df$ratioPredAffR = round(df$ratioPredAff, n) -u = unique(df$ratioPredAffR) - -# create an extra column called group which contains the "gp name and score" -# so colours can be generated for each unique values in this column -my_grp = df$ratioPredAffR -df$group <- paste0(df$Lig_outcome, "_", my_grp, sep = "") - -# ELSE -# uncomment the below if rounding is not required - -#my_grp = df$ratioPredAff -#df$group <- paste0(df$Lig_outcome, "_", my_grp, sep = "") - -# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -#****************** -# generate plot -#****************** - -# Call the function to create the palette based on the group defined above -colours <- ColourPalleteMulti(df, "Lig_outcome", "my_grp") -my_title = "Ligand Affinity" -library(ggplot2) - -# axis label size -my_xaxls = 13 -my_yaxls = 15 - -# axes text size -my_xaxts = 15 -my_yaxts = 15 - -# no ordering of x-axis according to frequency -g = ggplot(df, aes(factor(Position, ordered = T))) -g + - geom_bar(aes(fill = group), colour = "grey") + - scale_fill_manual( values = colours - , guide = 'none') + - theme( axis.text.x = element_text(size = my_xaxls - , angle = 90 - , hjust = 1 - , vjust = 0.4) - , axis.text.y = element_text(size = my_yaxls - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = my_xaxts) - , axis.title.y = element_text(size = my_yaxts ) ) + - labs(title = my_title - , x = "Position" - , y = "Frequency") - -#======================== -# plot with axis colours -#======================== -class(df$lab_bg) -# make this a named vector - -# define cartesian coord -my_xlim = length(unique(df$Position)); my_xlim - -# axis label size -my_xals = 15 -my_yals = 15 - -# axes text size -my_xats = 15 -my_yats = 18 - -# using geom_tile -g = ggplot(df, aes(factor(Position, ordered = T))) -g + - coord_cartesian(xlim = c(1, my_xlim) - , ylim = c(0, 6) - , clip = "off") + - - geom_bar(aes(fill = group), colour = "grey") + - scale_fill_manual( values = colours - , guide = 'none') + - geom_tile(aes(,-0.8, width = 0.95, height = 0.85) - , fill = df$lab_bg) + - geom_tile(aes(,-1.2, width = 0.95, height = -0.2) - , fill = df$lab_bg2) + - - # Here it's important to specify that your axis goes from 1 to max number of levels - theme( axis.text.x = element_text(size = my_xats - , angle = 90 - , hjust = 1 - , vjust = 0.4 - , colour = my_axis_colours) - , axis.text.y = element_text(size = my_yats - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = my_xals) - , axis.title.y = element_text(size = my_yals ) - , axis.ticks.x = element_blank() - ) + - labs(title = my_title - , x = "Position" - , y = "Frequency") - -#======================== -# output plot as svg/png -#======================== -class(df$lab_bg) -# make this a named vector - -# define cartesian coord -my_xlim = length(unique(df$Position)); 
my_xlim - -# axis label size -my_xals = 18 -my_yals = 18 - -# axes text size -my_xats = 16 #14 in PS -my_yats = 18 - -# set output dir for plots -#getwd() -#setwd("~/git/Data/pyrazinamide/output/plots") -#getwd() - -plot_name = "barplot_LIG_acoloured.svg" -my_plot_name = paste0(out_dir, "/", plot_name); my_plot_name - -svg(my_plot_name, width = 26, height = 4) - -g = ggplot(df, aes(factor(Position, ordered = T))) - -outFile = g + - coord_cartesian(xlim = c(1, my_xlim) - , ylim = c(0, 6) - , clip = "off" - ) + - - geom_bar(aes(fill = group), colour = "grey") + - scale_fill_manual( values = colours - , guide = 'none') + -# geom_tile(aes(,-0.6, width = 0.9, height = 0.7) -# , fill = df$lab_bg) + -# geom_tile(aes(,-1, width = 0.9, height = 0.3) -# , fill = df$lab_bg2) + - geom_tile(aes(,-0.8, width = 0.95, height = 0.85) - , fill = df$lab_bg) + - geom_tile(aes(,-1.2, width = 0.95, height = -0.2) - , fill = df$lab_bg2) + - -# Here it's important to specify that your axis goes from 1 to max number of levels - theme( axis.text.x = element_text(size = my_xats - , angle = 90 - , hjust = 1 - , vjust = 0.4 - , colour = my_axis_colours) - , axis.text.y = element_text(size = my_yats - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = my_xals) - , axis.title.y = element_text(size = my_yals ) - , axis.ticks.x = element_blank() - ) + - labs(title = "" - , x = "Position" - , y = "Frequency") - - -print(outFile) -dev.off() - -# for sanity and good practice -#rm(df) diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_aa_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_aa_PS.R deleted file mode 100644 index 78029be..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_aa_PS.R +++ /dev/null @@ -1,292 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -############################################################ -# 1: Installing and loading required packages and functions -############################################################ - -source("../Header_TT.R") -source("../barplot_colour_function.R") - -############################################################ -# Output dir for plots -############################################################ -out_dir = "~/git/Data/pyrazinamide/output/plots" - -############################################################ -# 2: call script the prepares the data with columns containing -# colours for axis labels -############################################################ - -source("subcols_axis.R") - -# this should return -#mut_pos_cols: 130, 4 -#my_df: 335, 39 - -# clear excess variable -# "mut_pos_cols" is just for inspection in case you need to cross check -# position numbers and colours -# open file from deskptop ("sample_axis_cols") for cross checking - -table(mut_pos_cols$lab_bg) - -sum( table(mut_pos_cols$lab_bg) ) == nrow(mut_pos_cols) # should be True - -table(mut_pos_cols$lab_bg2) - -sum( table(mut_pos_cols$lab_bg2) ) == nrow(mut_pos_cols) # should be True - -table(mut_pos_cols$lab_fg) - -sum( table(mut_pos_cols$lab_fg) ) == nrow(mut_pos_cols) # should be True - -# very important! 
-my_axis_colours = mut_pos_cols$lab_fg - -# now clear mut_pos_cols -rm(mut_pos_cols) - -########################### -# 2: Plot: DUET scores -########################### -#========================== -# Plot 2: Barplot with scores (unordered) -# corresponds to DUET_outcome -# Stacked Barplot with colours: DUET_outcome @ position coloured by -# stability scores. This is a barplot where each bar corresponds -# to a SNP and is coloured by its corresponding DUET stability value. -# Normalised values (range between -1 and 1 ) to aid visualisation -# NOTE: since barplot plots discrete values, colour = score, so number of -# colours will be equal to the no. of unique normalised scores -# rather than a continuous scale -# will require generating the colour scale separately. -#============================ -# sanity checks -upos = unique(my_df$Position) - -str(my_df$DUET_outcome) - -colnames(my_df) - -#=========================== -# Data preparation for plots -#=========================== -#!!!!!!!!!!!!!!!!! -# REASSIGNMENT -df <- my_df -#!!!!!!!!!!!!!!!!! - -rm(my_df) - -# sanity checks -# should be a factor -is.factor(df$DUET_outcome) -#TRUE - -table(df$DUET_outcome) - -# should be -1 and 1 -min(df$ratioDUET) -max(df$ratioDUET) - -# sanity checks -# very important!!!! -tapply(df$ratioDUET, df$DUET_outcome, min) - -tapply(df$ratioDUET, df$DUET_outcome, max) - -# My colour FUNCTION: based on group and subgroup -# in my case; -# df = df -# group = DUET_outcome -# subgroup = normalised score i.e ratioDUET - -# Prepare data: round off ratioDUET scores -# round off to 3 significant digits: -# 323 if no rounding is performed: used to generate the original graph -# 287 if rounded to 3 places -# FIXME: check if reducing precicion creates any ML prob - -# check unique values in normalised data -u = unique(df$ratioDUET) - -# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -# Run this section if rounding is to be used -# specify number for rounding -n = 3 -df$ratioDUETR = round(df$ratioDUET, n) -u = unique(df$ratioDUETR) - -# create an extra column called group which contains the "gp name and score" -# so colours can be generated for each unique values in this column -my_grp = df$ratioDUETR -df$group <- paste0(df$DUET_outcome, "_", my_grp, sep = "") - -# ELSE -# uncomment the below if rounding is not required - -#my_grp = df$ratioDUET -#df$group <- paste0(df$DUET_outcome, "_", my_grp, sep = "") - -# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -#****************** -# generate plot -#****************** - -# Call the function to create the palette based on the group defined above -colours <- ColourPalleteMulti(df, "DUET_outcome", "my_grp") -my_title = "Protein stability (DUET)" -library(ggplot2) - -# axis label size -my_xaxls = 13 -my_yaxls = 15 - -# axes text size -my_xaxts = 15 -my_yaxts = 15 - -# no ordering of x-axis according to frequency -g = ggplot(df, aes(factor(Position, ordered = T))) -g + - geom_bar(aes(fill = group), colour = "grey") + - scale_fill_manual( values = colours - , guide = 'none') + - theme( axis.text.x = element_text(size = my_xaxls - , angle = 90 - , hjust = 1 - , vjust = 0.4) - , axis.text.y = element_text(size = my_yaxls - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = my_xaxts) - , axis.title.y = element_text(size = my_yaxts ) ) + - labs(title = my_title - , x = "Position" - , y = "Frequency") - -#======================== -# plot with axis colours -#======================== -class(df$lab_bg) -# make this a named vector - -# define cartesian coord -my_xlim 
= length(unique(df$Position)); my_xlim - -# axis label size -my_xals = 15 -my_yals = 15 - -# axes text size -my_xats = 15 -my_yats = 18 - -# using geom_tile -g = ggplot(df, aes(factor(Position, ordered = T))) -g + - coord_cartesian(xlim = c(1, my_xlim) - , ylim = c(0, 6) - , clip = "off") + - - geom_bar(aes(fill = group), colour = "grey") + - scale_fill_manual( values = colours - , guide = 'none') + - geom_tile(aes(,-0.8, width = 0.95, height = 0.85) - , fill = df$lab_bg) + - geom_tile(aes(,-1.2, width = 0.95, height = -0.2) - , fill = df$lab_bg2) + - - # Here it's important to specify that your axis goes from 1 to max number of levels - theme( axis.text.x = element_text(size = my_xats - , angle = 90 - , hjust = 1 - , vjust = 0.4 - , colour = my_axis_colours) - , axis.text.y = element_text(size = my_yats - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = my_xals) - , axis.title.y = element_text(size = my_yals ) - , axis.ticks.x = element_blank() - ) + - labs(title = my_title - , x = "Position" - , y = "Frequency") - -#======================== -# output plot as svg/png -#======================== -class(df$lab_bg) -# make this a named vector - -# define cartesian coord -my_xlim = length(unique(df$Position)); my_xlim - -# axis label size -my_xals = 18 -my_yals = 18 - -# axes text size -my_xats = 14 -my_yats = 18 - -# set output dir for plots -#getwd() -#setwd("~/git/Data/pyrazinamide/output/plots") -#getwd() - -plot_name = "barplot_PS_acoloured.svg" -my_plot_name = paste0(out_dir, "/", plot_name); my_plot_name - -svg(my_plot_name, width = 26, height = 4) - -g = ggplot(df, aes(factor(Position, ordered = T))) - -outFile = g + - coord_cartesian(xlim = c(1, my_xlim) - , ylim = c(0, 6) - , clip = "off" - ) + - - geom_bar(aes(fill = group), colour = "grey") + - scale_fill_manual( values = colours - , guide = 'none') + -# geom_tile(aes(,-0.6, width = 0.9, height = 0.7) -# , fill = df$lab_bg) + -# geom_tile(aes(,-1, width = 0.9, height = 0.3) -# , fill = df$lab_bg2) + - geom_tile(aes(,-0.8, width = 0.95, height = 0.85) - , fill = df$lab_bg) + - geom_tile(aes(,-1.2, width = 0.95, height = -0.2) - , fill = df$lab_bg2) + - -# Here it's important to specify that your axis goes from 1 to max number of levels - theme( axis.text.x = element_text(size = my_xats - , angle = 90 - , hjust = 1 - , vjust = 0.4 - , colour = my_axis_colours) - , axis.text.y = element_text(size = my_yats - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = my_xals) - , axis.title.y = element_text(size = my_yals ) - , axis.ticks.x = element_blank() - ) + - labs(title = "" - , x = "Position" - , y = "Frequency") - - -print(outFile) -dev.off() - -# for sanity and good practice -#rm(df) diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_LIG.R b/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_LIG.R deleted file mode 100644 index c4826d3..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_LIG.R +++ /dev/null @@ -1,215 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages # -######################################################################## - -source("../Header_TT.R") - -#require(data.table) -#require(dplyr) - -######################################################################## -# Read file: call script for combining df for lig # 
-######################################################################## - -source("../combining_two_df_lig.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION - -#========================== -# This will return: - -# df with NA: -# merged_df2 -# merged_df3 - -# df without NA: -# merged_df2_comp -# merged_df3_comp -#=========================== - -########################### -# Data for Lig plots -# you need merged_df3 -# or -# merged_df3_comp -# since these have unique SNPs -# I prefer to use the merged_df3 -# because using the _comp dataset means -# we lose some muts and at this level, we should use -# as much info as available -########################### - -# uncomment as necessary -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -my_df = merged_df3 -#my_df = merged_df3_comp -#<<<<<<<<<<<<<<<<<<<<<<<<< - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -# Ensure correct data type in columns to plot: need to be factor -# sanity check -is.factor(my_df$Lig_outcome) -my_df$Lig_outcome = as.factor(my_df$lig_outcome) -is.factor(my_df$Lig_outcome) -#[1] TRUE - -############################# -# Extra sanity check: -# for mcsm_lig ONLY -# Dis_lig_Ang should be <10 -############################# - -if (max(my_df$Dis_lig_Ang) < 10){ - print ("Sanity check passed: lig data is <10Ang") -}else{ - print ("Error: data should be filtered to be within 10Ang") -} - -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -#=========================== -# Plot: Basic barplots -#=========================== - -#=================== -# Data for plots -#=================== - -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -df = my_df -#<<<<<<<<<<<<<<<<<<<<<<<<< -rm(my_df) - -# sanity checks -str(df) - -if (identical(df$Position, df$position)){ - print("Sanity check passed: Columns 'Position' and 'position' are identical") -} else{ - print("Error!: Check column names and info contained") -} - -#**************** -# generate plot: No of stabilising and destabilsing muts -#**************** -# set output dir for plots -getwd() -setwd("~/git/Data/pyrazinamide/output/plots") -getwd() - -svg('basic_barplots_LIG.svg') - -my_ats = 25 # axis text size -my_als = 22 # axis label size - -# uncomment as necessary for either directly outputting results or -# printing on the screen -g = ggplot(df, aes(x = Lig_outcome)) -#prinfFile = g + geom_bar( - g + geom_bar( - aes(fill = Lig_outcome) - , show.legend = TRUE -) + geom_label( - stat = "count" - , aes(label = ..count..) 
- , color = "black" - , show.legend = FALSE - , size = 10) + theme( - axis.text.x = element_blank() - , axis.title.x = element_blank() - , axis.title.y = element_text(size=my_als) - , axis.text.y = element_text(size = my_ats) - , legend.position = c(0.73,0.8) - , legend.text = element_text(size=my_als-2) - , legend.title = element_text(size=my_als) - , plot.title = element_blank() - ) + labs( - title = "" - , y = "Number of SNPs" - #, fill='Ligand Outcome' - ) + scale_fill_discrete(name = "Ligand Outcome" - , labels = c("Destabilising", "Stabilising")) -print(prinfFile) -dev.off() - -#**************** -# generate plot: No of positions -#**************** -#get freq count of positions so you can subset freq<1 -#require(data.table) -setDT(df)[, pos_count := .N, by = .(Position)] #169, 36 - -head(df$pos_count) -table(df$pos_count) -# this is cummulative -#1 2 3 4 5 6 -#5 24 36 56 30 18 - -# use group by on this -snpsBYpos_df <- df %>% - group_by(Position) %>% - summarize(snpsBYpos = mean(pos_count)) - -table(snpsBYpos_df$snpsBYpos) -#1 2 3 4 5 6 -#5 12 12 14 6 3 -# this is what will get plotted - -svg('position_count_LIG.svg') - -my_ats = 25 # axis text size -my_als = 22 # axis label size - -g = ggplot(snpsBYpos_df, aes(x = snpsBYpos)) -prinfFile = g + geom_bar( - #g + geom_bar( - aes (alpha = 0.5) - , show.legend = FALSE -) + - geom_label( - stat = "count", aes(label = ..count..) - , color = "black" - , size = 10 - ) + - theme( - axis.text.x = element_text( - size = my_ats - , angle = 0 - ) - , axis.text.y = element_text( - size = my_ats - , angle = 0 - , hjust = 1 - ) - , axis.title.x = element_text(size = my_als) - , axis.title.y = element_text(size = my_als) - , plot.title = element_blank() - ) + - labs( - x = "Number of SNPs" - , y = "Number of Sites" - ) -print(prinfFile) -dev.off() -######################################################################## -# end of Lig barplots # -######################################################################## - - diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_PS.R deleted file mode 100644 index 51a2812..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_PS.R +++ /dev/null @@ -1,211 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages and functions # -######################################################################## - -source("../Header_TT.R") - -######################################################################## -# Read file: call script for combining df for PS # -######################################################################## - -source("../combining_two_df.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION - -#========================== -# This will return: - -# df with NA: -# merged_df2 -# merged_df3 - -# df without NA: -# merged_df2_comp -# merged_df3_comp -#========================== - -########################### -# Data for DUET plots -# you need merged_df3 -# or -# merged_df3_comp -# since these have unique SNPs -# I prefer to use the merged_df3 -# because using the _comp dataset means -# we lose some muts and at this level, we should use -# as much info as available -########################### - -# uncomment 
as necessary -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -my_df = merged_df3 -#my_df = merged_df3_comp -#<<<<<<<<<<<<<<<<<<<<<<<<< - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -# Ensure correct data type in columns to plot: need to be factor -# sanity check -is.factor(my_df$DUET_outcome) -my_df$DUET_outcome = as.factor(my_df$DUET_outcome) -is.factor(my_df$DUET_outcome) -#[1] TRUE - -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -#=========================== -# Plot: Basic barplots -#=========================== - -#=================== -# Data for plots -#=================== - -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -df = my_df -#<<<<<<<<<<<<<<<<<<<<<<<<< - -rm(my_df) - -# sanity checks -str(df) - -if (identical(df$Position, df$position)){ - print("Sanity check passed: Columns 'Position' and 'position' are identical") -} else{ - print("Error!: Check column names and info contained") - } - -#**************** -# generate plot: No of stabilising and destabilsing muts -#**************** -# set output dir for plots -getwd() -setwd("~/git/Data/pyrazinamide/output/plots") -getwd() - -svg('basic_barplots_DUET.svg') - -my_ats = 25 # axis text size -my_als = 22 # axis label size - -theme_set(theme_grey()) - -# uncomment as necessary for either directly outputting results or -# printing on the screen -g = ggplot(df, aes(x = DUET_outcome)) -prinfFile = g + geom_bar( -#g + geom_bar( - aes(fill = DUET_outcome) - , show.legend = TRUE - ) + geom_label( - stat = "count" - , aes(label = ..count..) - , color = "black" - , show.legend = FALSE - , size = 10) + theme( - axis.text.x = element_blank() - , axis.title.x = element_blank() - , axis.title.y = element_text(size=my_als) - , axis.text.y = element_text(size = my_ats) - , legend.position = c(0.73,0.8) - , legend.text = element_text(size=my_als-2) - , legend.title = element_text(size=my_als) - , plot.title = element_blank() - ) + labs( - title = "" - , y = "Number of SNPs" - #, fill='DUET Outcome' - ) + scale_fill_discrete(name = "DUET Outcome" - , labels = c("Destabilising", "Stabilising")) - -print(prinfFile) -dev.off() - -#**************** -# generate plot: No of positions -#**************** -#get freq count of positions so you can subset freq<1 -#setDT(df)[, .(Freq := .N), by = .(Position)] #189, 36 - -setDT(df)[, pos_count := .N, by = .(Position)] #335, 36 -table(df$pos_count) -# this is cummulative -#1 2 3 4 5 6 -#34 76 63 104 40 18 - -# use group by on this -snpsBYpos_df <- df %>% - group_by(Position) %>% - summarize(snpsBYpos = mean(pos_count)) - -table(snpsBYpos_df$snpsBYpos) -#1 2 3 4 5 6 -#34 38 21 26 8 3 - -foo = select(df, Mutationinformation - , WildPos - , wild_type - , mutant_type - , mutation_info - , position - , pos_count) #335, 5 - -getwd() -write.csv(foo, "../Data/pos_count_freq.csv") - -svg('position_count_DUET.svg') -my_ats = 25 # axis text size -my_als = 22 # axis label size - -g = ggplot(snpsBYpos_df, aes(x = snpsBYpos)) -prinfFile = g + geom_bar( -#g + geom_bar( - aes (alpha = 0.5) - , show.legend = FALSE - ) + - geom_label( - stat = "count", aes(label = ..count..) 
- , color = "black" - , size = 10 - ) + - theme( - axis.text.x = element_text( - size = my_ats - , angle = 0 - ) - , axis.text.y = element_text( - size = my_ats - , angle = 0 - , hjust = 1 - ) - , axis.title.x = element_text(size = my_als) - , axis.title.y = element_text(size = my_als) - , plot.title = element_blank() - ) + - labs( - x = "Number of SNPs" - , y = "Number of Sites" - ) -print(prinfFile) -dev.off() -######################################################################## -# end of DUET barplots # -######################################################################## - diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_PS.R deleted file mode 100644 index 0059bca..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_PS.R +++ /dev/null @@ -1,175 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages and functions # -######################################################################## - -source("../Header_TT.R") - -#source("barplot_colour_function.R") - -######################################################################## -# Read file: call script for combining df for PS # -######################################################################## - -source("../combining_two_df.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION - -#========================== -# This will return: - -# df with NA: -# merged_df2 -# merged_df3 - -# df without NA: -# merged_df2_comp -# merged_df3_comp -#========================== - -########################### -# Data for PS Corr plots -# you need merged_df3_comp -# since these are matched -# to allow pairwise corr -########################### - -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -my_df = merged_df3_comp -#<<<<<<<<<<<<<<<<<<<<<<<<< - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -#=========================== -# Plot: Correlation plots -#=========================== - -#=================== -# Data for plots -#=================== - -#!!!!!!!!!!!!!!!!!!!!!!!! -# REASSIGNMENT -df = my_df -#!!!!!!!!!!!!!!!!!!!!!!!! 
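# Illustrative sketch (not part of the original script): the Spearman
# correlations that pairs.panels() draws further down in corr_plots_v3_PS.R
# can be cross-checked numerically with cor(). Assumes 'df' (assigned from
# merged_df3_comp above) and the same numeric columns that the plotting
# subset below uses.
num_cols = c("ratioDUET", "logor", "neglog10pvalue", "AF")
round(cor(df[, num_cols]
          , method = "spearman"
          , use = "pairwise.complete.obs"), 2)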
- -rm(my_df) - -# sanity checks -str(df) - -table(df$DUET_outcome) - -# unique positions -length(unique(df$Position)) #{RESULT: unique positions for comp data} - - -# subset data to generate pairwise correlations -corr_data = df[, c("ratioDUET" -# , "ratioPredAff" -# , "DUETStability_Kcalpermol" -# , "PredAffLog" -# , "OR" - , "logor" -# , "pvalue" - , "neglog10pvalue" - , "AF" - , "DUET_outcome" -# , "Lig_outcome" - , "pyrazinamide" - )] -dim(corr_data) -rm(df) - -# assign nice colnames (for display) -my_corr_colnames = c("DUET" -# , "Ligand Affinity" -# , "DUET_raw" -# , "Lig_raw" -# , "OR" - , "Log(Odds Ratio)" -# , "P-value" - , "-LogP" - , "Allele Frequency" - , "DUET_outcome" -# , "Lig_outcome" - , "pyrazinamide") - -# sanity check -if (length(my_corr_colnames) == length(corr_data)){ - print("Sanity check passed: corr_data and corr_names match in length") -}else{ - print("Error: length mismatch!") -} - -colnames(corr_data) -colnames(corr_data) <- my_corr_colnames -colnames(corr_data) - -############### -# PLOTS: corr -# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs -############### -#default pairs plot -start = 1 -end = which(colnames(corr_data) == "pyrazinamide"); end # should be the last column -offset = 1 - -my_corr = corr_data[start:(end-offset)] -head(my_corr) - -#my_cols = c("#f8766d", "#00bfc4") -# deep blue :#007d85 -# deep red: #ae301e - -#========== -# psych: ionformative since it draws the ellipsoid -# https://jamesmarquezportfolio.com/correlation_matrices_in_r.html -# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs -#========== - -# set output dir for plots -getwd() -setwd("~/git/Data/pyrazinamide/output/plots" -getwd() - -svg('DUET_corr.svg', width = 15, height = 15) -printFile = pairs.panels(my_corr[1:4] - , method = "spearman" # correlation method - , hist.col = "grey" ##00AFBB - , density = TRUE # show density plots - , ellipses = F # show correlation ellipses - , stars = T - , rug = F - , breaks = "Sturges" - , show.points = T - , bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr$DUET_outcome))] - , pch = 21 - , jitter = T - #, alpha = .05 - #, points(pch = 19, col = c("#f8766d", "#00bfc4")) - , cex = 3 - , cex.axis = 2.5 - , cex.labels = 3 - , cex.cor = 1 - , smooth = F -) - -print(printFile) -dev.off() diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_lig.R b/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_lig.R deleted file mode 100644 index 4e05d41..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_lig.R +++ /dev/null @@ -1,187 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages # -######################################################################## - -source("../Header_TT.R") - -#source("barplot_colour_function.R") - -######################################################################## -# Read file: call script for combining df for lig # -######################################################################## - -source("../combining_two_df_lig.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION - -#========================== -# This will return: - -# df with NA: -# merged_df2 -# merged_df3 - -# df without NA: -# merged_df2_comp -# merged_df3_comp 
-#=========================== - -########################### -# Data for Lig Corr plots -# you need merged_df3_comp -# since these are matched -# to allow pairwise corr -########################### - -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -my_df = merged_df3_comp -#<<<<<<<<<<<<<<<<<<<<<<<<< - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -############################# -# Extra sanity check: -# for mcsm_lig ONLY -# Dis_lig_Ang should be <10 -############################# - -if (max(my_df$Dis_lig_Ang) < 10){ - print ("Sanity check passed: lig data is <10Ang") -}else{ - print ("Error: data should be filtered to be within 10Ang") -} - -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -#=========================== -# Plot: Correlation plots -#=========================== - -#=================== -# Data for plots -#=================== - -#<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -df = my_df -#<<<<<<<<<<<<<<<<<<<<<<<<< - -rm(my_df) - -# sanity checks -str(df) - -table(df$Lig_outcome) - -# unique positions -length(unique(df$Position)) #{RESULT: unique positions for comp data} - -# subset data to generate pairwise correlations -corr_data = df[, c(#"ratioDUET", - "ratioPredAff" -# , "DUETStability_Kcalpermol" -# , "PredAffLog" -# , "OR" - , "logor" -# , "pvalue" - , "neglog10pvalue" - , "AF" -# , "DUET_outcome" - , "Lig_outcome" - , "pyrazinamide" - )] -dim(corr_data) -rm(df) - -# assign nice colnames (for display) -my_corr_colnames = c(#"DUET", - "Ligand Affinity" -# ,"DUET_raw" -# , "Lig_raw" -# , "OR" - , "Log(Odds Ratio)" -# , "P-value" - , "-LogP" - , "Allele Frequency" -# , "DUET_outcome" - , "Lig_outcome" - , "pyrazinamide") - -# sanity check -if (length(my_corr_colnames) == length(corr_data)){ - print("Sanity check passed: corr_data and corr_names match in length") -}else{ - print("Error: length mismatch!") -} - -colnames(corr_data) -colnames(corr_data) <- my_corr_colnames -colnames(corr_data) - -############### -# PLOTS: corr -# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs -############### - -# default pairs plot -start = 1 -end = which(colnames(corr_data) == "pyrazinamide"); end # should be the last column -offset = 1 - -my_corr = corr_data[start:(end-offset)] -head(my_corr) - -#my_cols = c("#f8766d", "#00bfc4") -# deep blue :#007d85 -# deep red: #ae301e - -#========== -# psych: ionformative since it draws the ellipsoid -# https://jamesmarquezportfolio.com/correlation_matrices_in_r.html -# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs -#========== - -# set output dir for plots -getwd() -setwd("~/git/Data/pyrazinamide/output/plots" -getwd() - -svg('Lig_corr.svg', width = 15, height = 15) -printFile = pairs.panels(my_corr[1:4] - , method = "spearman" # correlation method - , hist.col = "grey" ##00AFBB - , density = TRUE # show density plots - , ellipses = F # show correlation ellipses - , stars = T - , rug = F - , breaks = "Sturges" - , show.points = T - , bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr$Lig_outcome))] - , pch = 21 - , jitter = T -# , alpha = .05 -# , points(pch = 19, col = c("#f8766d", "#00bfc4")) - , cex = 3 - , cex.axis = 2.5 - , cex.labels = 3 - , cex.cor = 1 - , smooth = F -) -print(printFile) -dev.off() - diff --git 
a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_basic_barplot.R b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_basic_barplot.R deleted file mode 100644 index 1f868e4..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_basic_barplot.R +++ /dev/null @@ -1,227 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages # -######################################################################## - -source("../Header_TT.R") -#source("barplot_colour_function.R") - -require(data.table) - -######################################################################## -# Read file: call script for combining df # -######################################################################## - -source("../combining_two_df.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION - -#========================== -# This will return: - -# df with NA: -# merged_df2 -# merged_df3 - -# df without NA: -# merged_df2_comp -# merged_df3_comp -#========================== - -########################### -# Data for plots -# you need merged_df2, comprehensive one -# since this has one-many relationship -# i.e the same SNP can belong to multiple lineages -########################### - -# uncomment as necessary -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -my_df = merged_df2 -#my_df = merged_df2_comp -#<<<<<<<<<<<<<<<<<<<<<<<<< - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -# Ensure correct data type in columns to plot: need to be factor -is.factor(my_df$lineage) -my_df$lineage = as.factor(my_df$lineage) -is.factor(my_df$lineage) - -#========================== -# Plot: Lineage barplot -# x = lineage y = No. of samples -# col = Lineage -# fill = lineage -#============================ -table(my_df$lineage) - -# lineage1 lineage2 lineage3 lineage4 lineage5 lineage6 lineageBOV -#3 104 1293 264 1311 6 6 105 - -#=========================== -# Plot: Lineage Barplots -#=========================== - -#=================== -# Data for plots -#=================== - -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -df <- my_df -#<<<<<<<<<<<<<<<<<<<<<<<<< -rm(my_df) - -# get freq count of positions so you can subset freq<1 -#setDT(df)[, lineage_count := .N, by = .(lineage)] - -#****************** -# generate plot: barplot of mutation by lineage -#****************** -sel_lineages = c("lineage1" - , "lineage2" - , "lineage3" - , "lineage4") - -df_lin = subset(df, subset = lineage %in% sel_lineages ) - -#FIXME; add sanity check for numbers. -# Done this manually - -############################################################ - -######### -# Data for barplot: Lineage barplot -# to show total samples and number of unique mutations -# within each linege -########## - -# Create df with lineage inform & no. 
of unique mutations -# per lineage and total samples within lineage -# this is essentially barplot with two y axis - -bar = bar = as.data.frame(sel_lineages) #4, 1 -total_snps_u = NULL -total_samples = NULL - -for (i in sel_lineages){ - #print(i) - curr_total = length(unique(df$id)[df$lineage==i]) - total_samples = c(total_samples, curr_total) - print(total_samples) - - foo = df[df$lineage==i,] - print(paste0(i, "=======")) - print(length(unique(foo$Mutationinformation))) - curr_count = length(unique(foo$Mutationinformation)) - - total_snps_u = c(total_snps_u, curr_count) -} - -print(total_snps_u) -bar$num_snps_u = total_snps_u -bar$total_samples = total_samples -bar - -#***************** -# generate plot: lineage barplot with two y-axis -#https://stackoverflow.com/questions/13035295/overlay-bar-graphs-in-ggplot2 -#***************** - -bar$num_snps_u = y1 -bar$total_samples = y2 -sel_lineages = x - -to_plot = data.frame(x = x - , y1 = y1 - , y2 = y2) -to_plot - -melted = melt(to_plot, id = "x") -melted - -# set output dir for plots -getwd() -setwd("~/git/Data/pyrazinamide/output/plots") -getwd() - -svg('lineage_basic_barplot.svg') - -my_ats = 20 # axis text size -my_als = 22 # axis label size - -g = ggplot(melted - , aes(x = x - , y = value - , fill = variable) - ) - - -printFile = g + geom_bar( - -#g + geom_bar( - stat = "identity" - , position = position_stack(reverse = TRUE) - , alpha=.75 - , colour='grey75' - ) + theme( - axis.text.x = element_text( - size = my_ats -# , angle= 30 - ) - , axis.text.y = element_text(size = my_ats - #, angle = 30 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text( - size = my_als - , colour = 'black' - ) - , axis.title.y = element_text( - size = my_als - , colour = 'black' - ) - , legend.position = "top" - , legend.text = element_text(size = my_als) - - #) + geom_text( - ) + geom_label( - aes(label = value) - , size = 5 - , hjust = 0.5 - , vjust = 0.5 - , colour = 'black' - , show.legend = FALSE - #, check_overlap = TRUE - , position = position_stack(reverse = T) - #, position = (' - - ) + labs( - title = '' - , x = '' - , y = "Number" - , fill = 'Variable' - , colour = 'black' - ) + scale_fill_manual( - values = c('grey50', 'gray75') - , name='' - , labels=c('Mutations', 'Total Samples') - ) + scale_x_discrete( - breaks = c('lineage1', 'lineage2', 'lineage3', 'lineage4') - , labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4') - ) -print(printFile) -dev.off() diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_LIG.R b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_LIG.R deleted file mode 100644 index e4e6972..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_LIG.R +++ /dev/null @@ -1,253 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages # -######################################################################## - -source("../Header_TT.R") -#source("barplot_colour_function.R") -#require(data.table) - -######################################################################## -# Read file: call script for combining df for Lig # -######################################################################## - -source("../combining_two_df_lig.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION - 
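# Illustrative sketch (not part of the original script): in
# lineage_basic_barplot.R above, the lines feeding 'to_plot' use x, y1 and y2
# before they are defined (the assignments appear reversed). The intended data
# preparation for the two-series barplot is presumably along these lines,
# taking the values from 'bar' built in the preceding loop:
x  = as.character(bar$sel_lineages)  # lineage labels
y1 = bar$num_snps_u                  # unique mutations per lineage
y2 = bar$total_samples               # total samples per lineage
to_plot = data.frame(x = x, y1 = y1, y2 = y2)
melted  = melt(to_plot, id = "x")    # long format; melt() from data.table/reshape2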
-#========================== -# This will return: - -# df with NA: -# merged_df2 -# merged_df3 - -# df without NA: -# merged_df2_comp -# merged_df3_comp -#=========================== -########################### -# Data for plots -# you need merged_df2 or merged_df2_comp -# since this is one-many relationship -# i.e the same SNP can belong to multiple lineages -########################### - -# uncomment as necessary -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -my_df = merged_df2 -#my_df = merged_df2_comp -#<<<<<<<<<<<<<<<<<<<<<<<<< - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -# Ensure correct data type in columns to plot: need to be factor -is.factor(my_df$lineage) -my_df$lineage = as.factor(my_df$lineage) -is.factor(my_df$lineage) - -table(my_df$mutation_info) - -############################# -# Extra sanity check: -# for mcsm_lig ONLY -# Dis_lig_Ang should be <10 -############################# - -if (max(my_df$Dis_lig_Ang) < 10){ - print ("Sanity check passed: lig data is <10Ang") -}else{ - print ("Error: data should be filtered to be within 10Ang") -} - -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## -#========================== -# Data for plot: assign as -# necessary -#=========================== - -# uncomment as necessary -#!!!!!!!!!!!!!!!!!!!!!!! -# REASSIGNMENT - -#================== -# data for ALL muts -#================== -plot_df = my_df -my_plot_name = 'lineage_dist_PS.svg' -#my_plot_name = 'lineage_dist_PS_comp.svg' - -#======================= -# data for dr_muts ONLY -#======================= -#plot_df = my_df_dr -#my_plot_name = 'lineage_dist_dr_PS.svg' -#my_plot_name = 'lineage_dist_dr_PS_comp.svg' -#!!!!!!!!!!!!!!!!!!!!!!! - -#========================== -# Plot: Lineage Distribution -# x = mcsm_values, y = dist -# fill = stability -#============================ - -#=================== -# Data for plots -#=================== - -# subset only lineages1-4 -sel_lineages = c("lineage1" - , "lineage2" - , "lineage3" - , "lineage4") - -# uncomment as necessary -df_lin = subset(my_df, subset = lineage %in% sel_lineages ) #2037 35 - -# refactor -df_lin$lineage = factor(df_lin$lineage) - -table(df_lin$lineage) #{RESULT: No of samples within lineage} -#lineage1 lineage2 lineage3 lineage4 -#78 961 195 803 - -# when merged_df2_comp is used -#lineage1 lineage2 lineage3 lineage4 -#77 955 194 770 - -length(unique(df_lin$Mutationinformation)) -#{Result: No. of unique mutations the 4 lineages contribute to} - -# sanity checks -r1 = 2:5 # when merged_df2 used: because there is missing lineages -if(sum(table(my_df$lineage)[r1]) == nrow(df_lin)) { - print ("sanity check passed: numbers match") -} else{ - print("Error!: check your numbers") -} - -#!!!!!!!!!!!!!!!!!!!!!!!!! -# REASSIGNMENT -df <- df_lin -#!!!!!!!!!!!!!!!!!!!!!!!!! - -rm(df_lin) - -#****************** -# generate distribution plot of lineages -#****************** -# basic: could improve this! 
-library(plotly) -library(ggridges) - -my_labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4') -names(my_labels) = c('lineage1', 'lineage2', 'lineage3', 'lineage4') - -g <- ggplot(df, aes(x = ratioPredAff)) + - geom_density(aes(fill = Lig_outcome) - , alpha = 0.5) + - facet_wrap( ~ lineage - , scales = "free" - , labeller = labeller(lineage = my_labels) ) + - coord_cartesian(xlim = c(-1, 1) -# , ylim = c(0, 6) -# , clip = "off" -) - ggtitle("Kernel Density estimates of Ligand affinity by lineage") - -ggplotly(g) - -# 2 : ggridges (good!) - -my_ats = 15 # axis text size -my_als = 20 # axis label size - -my_labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4') -names(my_labels) = c('lineage1', 'lineage2', 'lineage3', 'lineage4') - -# set output dir for plots -getwd() -setwd("~/git/Data/pyrazinamide/output/plots") -getwd() - -# check plot name -my_plot_name - -svg(my_plot_name) - -printFile = ggplot( df, aes(x = ratioPredAff - , y = Lig_outcome) ) + - - geom_density_ridges_gradient( aes(fill = ..x..) - , scale = 3 - , size = 0.3 ) + - facet_wrap( ~lineage - , scales = "free" -# , switch = 'x' - , labeller = labeller(lineage = my_labels) ) + - coord_cartesian( xlim = c(-1, 1) -# , ylim = c(0, 6) -# , clip = "off" - ) + - - scale_fill_gradientn( colours = c("#f8766d", "white", "#00bfc4") - , name = "Ligand Affinity" ) + - theme( axis.text.x = element_text( size = my_ats - , angle = 90 - , hjust = 1 - , vjust = 0.4) -# , axis.text.y = element_text( size = my_ats -# , angle = 0 -# , hjust = 1 -# , vjust = 0) - , axis.text.y = element_blank() - , axis.title.x = element_blank() - , axis.title.y = element_blank() - , axis.ticks.y = element_blank() - , plot.title = element_blank() - , strip.text = element_text(size = my_als) - , legend.text = element_text(size = 10) - , legend.title = element_text(size = my_als) -# , legend.position = c(0.3, 0.8) -# , legend.key.height = unit(1, 'mm') - ) - -print(printFile) -dev.off() -#=================================================== - -# COMPARING DISTRIBUTIONS -head(df$lineage) -df$lineage = as.character(df$lineage) - -lin1 = df[df$lineage == "lineage1",]$ratioPredAff -lin2 = df[df$lineage == "lineage2",]$ratioPredAff -lin3 = df[df$lineage == "lineage3",]$ratioPredAff -lin4 = df[df$lineage == "lineage4",]$ratioPredAff - -# ks test -ks.test(lin1,lin2) -ks.test(lin1,lin3) -ks.test(lin1,lin4) - -ks.test(lin2,lin3) -ks.test(lin2,lin4) - -ks.test(lin3,lin4) - - - diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R deleted file mode 100644 index 703a206..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R +++ /dev/null @@ -1,229 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages # -######################################################################## - -source("../Header_TT.R") -#source("../barplot_colour_function.R") -#require(data.table) - -######################################################################## -# Read file: call script for combining df for PS # -######################################################################## - -source("../combining_two_df.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION - 
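# Illustrative sketch (not part of the original scripts): the six explicit
# ks.test() calls at the end of lineage_dist_LIG.R above can be generated
# programmatically, which avoids copy-paste errors if more lineages are added.
# Assumes 'df' with the 'lineage' and 'ratioPredAff' columns used in that script.
lineage_pairs = combn(c("lineage1", "lineage2", "lineage3", "lineage4"), 2)
apply(lineage_pairs, 2, function(p) {
  ks.test(df$ratioPredAff[df$lineage == p[1]]
          , df$ratioPredAff[df$lineage == p[2]])
})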
-#========================== -# This will return: - -# df with NA for pyrazinamide: -# merged_df2 -# merged_df3 - -# df without NA for pyrazinamide: -# merged_df2_comp -# merged_df3_comp -#=========================== - -########################### -# Data for plots -# you need merged_df2 or merged_df2_comp -# since this is one-many relationship -# i.e the same SNP can belong to multiple lineages -# using the _comp dataset means -# we lose some muts and at this level, we should use -# as much info as available, hence use df with NA -########################### - -# uncomment as necessary -#%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -my_df = merged_df2 -#my_df = merged_df2_comp -#%%%%%%%%%%%%%%%%%%%%%%%% - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -# Ensure correct data type in columns to plot: need to be factor -is.factor(my_df$lineage) -my_df$lineage = as.factor(my_df$lineage) -is.factor(my_df$lineage) - -table(my_df$mutation_info); str(my_df$mutation_info) - -# subset df with dr muts only -my_df_dr = subset(my_df, mutation_info == "dr_mutations_pyrazinamide") -table(my_df_dr$mutation_info) - -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -#========================== -# Run two times: -# uncomment as necessary -# 1) for all muts -# 2) for dr_muts -#=========================== - -#%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT - -#================ -# for ALL muts -#================ -plot_df = my_df -my_plot_name = 'lineage_dist_PS.svg' -#my_plot_name = 'lineage_dist_PS_comp.svg' - -#================ -# for dr muts ONLY -#================ -#plot_df = my_df_dr -#my_plot_name = 'lineage_dist_dr_PS.svg' -#my_plot_name = 'lineage_dist_dr_PS_comp.svg' - -#%%%%%%%%%%%%%%%%%%%%%%%% - -#========================== -# Plot: Lineage Distribution -# x = mcsm_values, y = dist -# fill = stability -#============================ - -#=================== -# Data for plots -#=================== -table(plot_df$lineage); str(plot_df$lineage) - -# subset only lineages1-4 -sel_lineages = c("lineage1" - , "lineage2" - , "lineage3" - , "lineage4") - -# uncomment as necessary -df_lin = subset(plot_df, subset = lineage %in% sel_lineages ) - -# refactor -df_lin$lineage = factor(df_lin$lineage) - -table(df_lin$lineage) #{RESULT: No of samples within lineage} -#lineage1 lineage2 lineage3 lineage4 - -length(unique(df_lin$Mutationinformation)) -#{Result: No. of unique mutations the 4 lineages contribute to} - -# sanity checks -r1 = 2:5 # when merged_df2 used: because there is missing lineages -if(sum(table(plot_df$lineage)[r1]) == nrow(df_lin)) { - print ("sanity check passed: numbers match") -} else{ - print("Error!: check your numbers") -} - -#%%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -df <- df_lin -#%%%%%%%%%%%%%%%%%%%%%%%%% - -rm(df_lin) - -#****************** -# generate distribution plot of lineages -#****************** -# basic: could improve this! -#library(plotly) -#library(ggridges) - -g <- ggplot(df, aes(x = ratioDUET)) + - geom_density(aes(fill = DUET_outcome) - , alpha = 0.5) + facet_wrap(~ lineage, - scales = "free") + - ggtitle("Kernel Density estimates of Protein stability by lineage") - -ggplotly(g) - -# 2 : ggridges (good!) 
-my_ats = 15 # axis text size -my_als = 20 # axis label size - -my_labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4') -names(my_labels) = c('lineage1', 'lineage2', 'lineage3', 'lineage4') - -# set output dir for plots -getwd() -setwd("~/git/Data/pyrazinamide/output/plots") -getwd() - -# check plot name -my_plot_name - -# output svg -svg(my_plot_name) -printFile = ggplot(df, aes(x = ratioDUET - , y = DUET_outcome))+ - - #printFile=geom_density_ridges_gradient( - geom_density_ridges_gradient(aes(fill = ..x..) - , scale = 3 - , size = 0.3 ) + - facet_wrap( ~lineage - , scales = "free" -# , switch = 'x' - , labeller = labeller(lineage = my_labels) ) + - coord_cartesian( xlim = c(-1, 1) -# , ylim = c(0, 6) -# , clip = "off" -) + - scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4") - , name = "DUET" ) + - theme(axis.text.x = element_text(size = my_ats - , angle = 90 - , hjust = 1 - , vjust = 0.4) -# , axis.text.y = element_text(size = my_ats -# , angle = 0 -# , hjust = 1 -# , vjust = 0) - , axis.text.y = element_blank() - , axis.title.x = element_blank() - , axis.title.y = element_blank() - , axis.ticks.y = element_blank() - , plot.title = element_blank() - , strip.text = element_text(size = my_als) - , legend.text = element_text(size = 10) - , legend.title = element_text(size = my_als) -# , legend.position = c(0.3, 0.8) -# , legend.key.height = unit(1, 'mm') - ) - -print(printFile) -dev.off() - -#=!=!=!=!=!=!=! -# COMMENT: Not much differences in the distributions -# when using merged_df2 or merged_df2_comp. -# Also, the lineage differences disappear when looking at all muts -# The pattern we are interested in is possibly only for dr_mutations -#=!=!=!=!=!=!=! -#=================================================== - -# COMPARING DISTRIBUTIONS: KS test -# run: "../KS_test_PS.R" - - - diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/logolas_logoplot.R b/mcsm_analysis/pyrazinamide/scripts/plotting/logolas_logoplot.R deleted file mode 100644 index f60fb0b..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/logolas_logoplot.R +++ /dev/null @@ -1,250 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting/") -getwd() - -######################################################################## -# Installing and loading required packages # -######################################################################## - -source("../Header_TT.R") - -#source("barplot_colour_function.R") - -library(ggseqlogo) - -#======= -# input -#======= -############# -# msa file: output of generate_mut_sequences.py -############# -homedir = '~' -indir = 'git/Data/pyrazinamide/output' -in_filename = "gene_msa.txt" -infile = paste0(homedir, '/', indir,'/', in_filename) -print(infile) - -#======= -# input -#======= -############# -# combined dfs -############# -source("../combining_two_df.R") - -########################### -# Data for Logo plots -# you need big df i.e -# merged_df2 -# or -# merged_df2_comp -# since these have unique SNPs -# I prefer to use the merged_df2 -# because using the _comp dataset means -# we lose some muts and at this level, we should use -# as much info as available -########################### - -# uncomment as necessary -#%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -my_df = merged_df2 -#my_df = merged_df2_comp -#%%%%%%%%%%%%%%%%%%%%%%%% - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -# doesn't work if you use the big df as it has 
duplicate snps -#rownames(my_df) = my_df$Mutationinformation - -# sanity check: should be True -table(my_df$position == my_df$Position) - -c1 = unique(my_df$Position) # 130 -nrow(my_df) # 3092 - -#FIXME -#!!! RESOLVE !!! -# get freq count of positions and add to the df -setDT(my_df)[, occurrence_sample := .N, by = .(id)] -table(my_df$occurrence_sample) - - -my_df2 = my_df %>% - select(id, Mutationinformation, Wild_type, WildPos, position, Mutant_type, occurrence, occurrence_sample) - -write.csv(my_df2, "my_df2.csv") - -# extract freq_pos>1 since this will not add to much in the logo plot -# pos 5 has one mutation but coming from atleast 5 samples? -table(my_df$occurrence) -foo = my_df[my_df$occurrence ==1,] - -# uncomment as necessary -my_data_snp = my_df #3092 - -#!!! RESOLVE -# FIXME -my_data_snp = my_df[my_df$occurrence!=1,] #3072, 36...3019 - -u = unique(my_data_snp$Position) #96 - - -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -######################################################### -# Task: To generate a logo plot or bar plot but coloured -# aa properties. -# step1: read mcsm file and OR file -# step2: plot wild type positions -# step3: plot mutants per position coloured by aa properties -# step4: make the size of the letters/bars prop to OR if you can! -######################################################### -##useful links -#https://stackoverflow.com/questions/5438474/plotting-a-sequence-logo-using-ggplot2 -#https://omarwagih.github.io/ggseqlogo/ -#https://kkdey.github.io/Logolas-pages/workflow.html -#A new sequence logo plot to highlight enrichment and depletion. -# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6288878/ - -##very good: http://www.cbs.dtu.dk/biotools/Seq2Logo-2.0/ - -#============== -# matrix for mutant type -# frequency of mutant type by position -#============== -table(my_data_snp$Mutant_type, my_data_snp$Position) -tab_mt = table(my_data_snp$Mutant_type, my_data_snp$Position) -class(tab_mt) -# unclass to convert to matrix -tab_mt = unclass(tab_mt) -tab_mt = as.matrix(tab_mt, rownames = T) - -# should be TRUE -is.matrix(tab_mt) - -rownames(tab_mt) #aa -colnames(tab_mt) #pos - -#********************** -# Plot 1: mutant logo -#********************** -my_ymax = max(my_data_snp$occurrence); my_ymax -my_ylim = c(0,my_ymax) # very important - -# axis sizes -# common: text and label -my_ats = 15 -my_als = 20 - -# individual: text and label -my_xats = 15 -my_yats = 20 -my_xals = 15 -my_yals = 20 - -# legend size: text and label -my_lts = 20 -#my_lls = 20 - -# Color scheme based on chemistry of amino acids -chemistry = data.frame( - letter = c('G', 'S', 'T', 'Y', 'C', 'N', 'Q', 'K', 'R', 'H', 'D', 'E', 'P', 'A', 'W', 'F', 'L', 'I', 'M', 'V'), - group = c(rep('Polar', 5), rep('Neutral', 2), rep('Basic', 3), rep('Acidic', 2), rep('Hydrophobic', 8)), - col = c(rep('#109648', 5), rep('#5E239D', 2), rep('#255C99', 3), rep('#D62839', 2), rep('#221E22', 8)), - stringsAsFactors = F -) - -# uncomment as necessary -my_type = "EDLogo" -#my_type = "Logo" - -logomaker(tab_mt - , type = my_type - , return_heights = T -# , color_type = "per_row" -# , colors = chemistry$col -# , method = 'custom' -# , seq_type = 'aa' -# , col_scheme = "taylor" -# , col_scheme = "chemistry2" -) + -theme(legend.position = "bottom" - , legend.title = element_blank() - , legend.text = element_text(size = my_lts ) - , axis.text.x = element_text(size = 
my_ats , angle = 90) - , axis.text.y = element_text(size = my_ats , angle = 90)) - -p0 = logomaker(tab_mt - , type = my_type - , return_heights = T - , color_type = "per_row" - , colors = chemistry$col -# , seq_type = 'aa' -# , col_scheme = "taylor" -# , col_scheme = "chemistry2" -) + - #ylab('my custom height') + - theme(axis.text.x = element_blank()) + -# theme_logo()+ - # scale_x_continuous(breaks=1:51, parse (text = colnames(tab)) ) - scale_x_continuous(breaks = 1:ncol(tab_mt) - , labels = colnames(tab_mt))+ - scale_y_continuous( breaks = 1:my_ymax - , limits = my_ylim) - -p0 - -# further customisation -p1 = p0 + theme(legend.position = "bottom" - , legend.title = element_blank() - , legend.text = element_text(size = my_lts) - , axis.text.x = element_text(size = my_ats , angle = 90) - , axis.text.y = element_text(size = my_ats , angle = 90)) -p1 - -#======= -# input -#======= -############# -# msa file: output of generate_mut_sequences.py -############# -homedir = '~' -indir = 'git/Data/pyrazinamide/output' -in_filename = "gene_msa.txt" -infile = paste0(homedir, '/', indir,'/', in_filename) -print(infile) - -############## -# ggseqlogo: custom matrix of my data -############## -snps = read.csv(infile - , stringsAsFactors = F - , header = F) #3072, - -class(snps); str(snps) # df and chr - -# turn to a character vector -snps2 = as.character(snps[1:nrow(snps),]) - -class(snps2); str(snps2) #character, chr - -# plot -logomaker(snps2, type = my_type - , color_type = "per_row") + - theme(axis.text.x = element_blank()) + - theme_logo()+ - # scale_x_continuous(breaks=1:51, parse (text = colnames(tab)) ) - scale_x_continuous(breaks = 1:ncol(tab_mt) - , labels = colnames(tab_mt))+ - scale_y_continuous( breaks = 0:5 - , limits = my_ylim) - - diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/snp_logo_plot.R b/mcsm_analysis/pyrazinamide/scripts/plotting/snp_logo_plot.R deleted file mode 100644 index 80f1971..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/snp_logo_plot.R +++ /dev/null @@ -1,273 +0,0 @@ -getwd() -setwd("~/git//LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -# TASK: Multiple mutations per site -# as aa symbol coloured by aa property - -######################################################################## -# Installing and loading required packages # -######################################################################## - -#source("../Header_TT.R") - -#source("barplot_colour_function.R") - -library(ggseqlogo) - -######################################################################## -# Read file: call script for combining df for lig # -######################################################################## - -source("../combining_two_df.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "/home/tanu/git/LSHTM_Y1_PNCA/mcsm_analysis/pyrazinamide/Scripts" -#---------------------- PAY ATTENTION - -#========================== -# This will return: - -#merged_df2 # 3092, 35 -#merged_df2_comp #3012, 35 - -#merged_df3 #335, 35 -#merged_df3_comp #293, 35 -#========================== - -########################### -# Data for Logo plots -# you need small df i.e -# merged_df3 -# or -# merged_df3_comp? 
possibly -# since these have unique SNPs -# I prefer to use the merged_df3 -# because using the _comp dataset means -# we lose some muts and at this level, we should use -# as much info as available -########################### - -# uncomment as necessary -#%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -my_df = merged_df3 # to show multiple mutations per site -my_df = read.csv(file.choose()) -#%%%%%%%%%%%%%%%%%%%%%%%% - -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -colnames(my_df) -str(my_df) - -rownames(my_df) = my_df$Mutationinformation - -c1 = unique(my_df$Position) #130 -nrow(my_df) #335 - -table(my_df$occurrence) -#1 2 3 4 5 6 -#34 76 63 104 40 18 - -# get freq count of positions so you can subset freq<1 -#: already done in teh combining script -#require(data.table) -#setDT(my_df)[, occurrence := .N, by = .(Position)] #189, 36 - -table(my_df$Position); table(my_df$occurrence) - -# extract freq_pos>1 -my_data_snp = my_df[my_df$occurrence!=1,] #301, 36 -u_pos = unique(my_data_snp$Position) #96 - -# sanity check -exp_dim = nrow(my_df) - table(my_df$occurrence)[[1]]; exp_dim -if ( nrow(my_data_snp) == exp_dim ){ - print("Sanity check passed: Data filtered correctly, dim match") -} else { - print("Error: Please Debug") -} - -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -######################################################### -# Task: To generate a logo plot or bar plot but coloured -# aa properties. -# step1: read data file -# step2: plot wild type positions -# step3: plot mutants per position coloured by aa properties -# step4: make the size of the letters/bars prop to OR if you can! -######################################################### -# useful links -# https://stackoverflow.com/questions/5438474/plotting-a-sequence-logo-using-ggplot2 -# https://omarwagih.github.io/ggseqlogo/ -# https://kkdey.github.io/Logolas-pages/workflow.html -# A new sequence logo plot to highlight enrichment and depletion. 
-# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6288878/ -# very good: http://www.cbs.dtu.dk/biotools/Seq2Logo-2.0/ - - -############# -# PLOTS: Bar plot with aa properties -# using gglogo -# useful links: https://stackoverflow.com/questions/5438474/plotting-a-sequence-logo-using-ggplot2 -############# - -############## -# ggseqlogo: custom matrix of my data -############## - -#============== -# matrix for mutant type -# frequency of mutant type by position -#============== -table(my_data_snp$Mutant_type, my_data_snp$Position) -tab_mt = table(my_data_snp$Mutant_type, my_data_snp$Position) -class(tab_mt) - -# unclass to convert to matrix -tab_mt = unclass(tab_mt) -tab_mt = as.matrix(tab_mt, rownames = T) - -# should be TRUE -is.matrix(tab_mt) - -rownames(tab_mt) #aa -colnames(tab_mt) #pos - -#============== -# matrix for wild type -# frequency of wild type by position -#============== -# remove wt duplicates -wt = my_data_snp[, c("Position", "Wild_type")] #301, 2 -wt = wt[!duplicated(wt),]#96, 2 - -table(wt$Wild_type) # contains duplicates - -tab_wt = table(wt$Wild_type, wt$Position); tab_wt # should all be 1 - -tab_wt = unclass(tab_wt) #important -class(tab_wt); rownames(tab_wt) -#tab_wt = as.matrix(tab_wt, rownames = T) - -rownames(tab_wt) -rownames(tab_mt) - -# sanity check -if (ncol(tab_wt) == length(u_pos) ){ - print("Sanity check passed: wt data dim match") -} else { - print("Error: Please debug") -} - -#************** -# Plot 1: mutant logo -#************** -#install.packages("digest") -#library(digest) -# following example -require(ggplot2) -require(reshape2) -library(gglogo) -library(ggrepel) -library(ggseqlogo) - -# generate seq logo for mutant type -my_ymax = max(my_data_snp$occurrence); my_ymax -my_ylim = c(0, my_ymax) -#my_yrange = 1:my_ymax; my_yrange - -# axis sizes -# common: text and label -my_ats = 15 -my_als = 20 - -# individual: text and label -my_xats = 15 -my_yats = 20 -my_xals = 15 -my_yals = 20 - -# legend size: text and label -my_lts = 20 -#my_lls = 20 - -p0 = ggseqlogo(tab_mt - , method = 'custom' - , seq_type = 'aa' -# , col_scheme = "taylor" -# , col_scheme = "chemistry2" -) + -# ylab('my custom height') + - theme(axis.text.x = element_blank()) + - theme_logo()+ -# scale_x_continuous(breaks=1:51, parse (text = colnames(tab_mt)) ) - scale_x_continuous(breaks = 1:ncol(tab_mt) - , labels = colnames(tab_mt))+ - scale_y_continuous( breaks = 1:my_ymax - , limits = my_ylim) - -p0 - -# further customisation -p1 = p0 + theme(legend.position = "none" - , legend.title = element_blank() - , legend.text = element_text(size = my_lts) - , axis.text.x = element_text(size = my_xats, angle = 90) - , axis.text.y = element_text(size = my_yats, angle = 90)) -p1 - -#************** -# Plot 2: for wild_type -# with custom x axis to reflect my aa positions -#************** -# sanity check: MUST BE TRUE -# for the correctnes of the x axis -identical(colnames(tab_mt), colnames(tab_wt)) -identical(ncol(tab_mt), ncol(tab_wt)) - -p2 = ggseqlogo(tab_wt - , method = 'custom' - , seq_type = 'aa' -# , col_scheme = "taylor" -# , col_scheme = chemistry2 -) + -# ylab('my custom height') + - theme(axis.text.x = element_blank() - , axis.text.y = element_blank()) + - theme_logo() + - scale_x_continuous(breaks = 1:ncol(tab_wt) - , labels = colnames(tab_wt)) + - scale_y_continuous( breaks = 0:1 - , limits = my_ylim ) - -p2 - -# further customise - -p3 = p2 + - theme(legend.position = "bottom" - , legend.text = element_text(size = my_lts) - , axis.text.x = element_text(size = my_ats - , angle = 90) - , 
axis.text.y = element_blank()) - -p3 - - -# Now combine using cowplot, which ensures the plots are aligned -suppressMessages( require(cowplot) ) - -plot_grid(p1, p3, ncol = 1, align = 'v') #+ -# background_grid(minor = "xy" -# , size.minor = 1 -# , colour.minor = "grey86") - - -#colour scheme -#https://rdrr.io/cran/ggseqlogo/src/R/col_schemes.r - diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/subcols_axis_LIG.R b/mcsm_analysis/pyrazinamide/scripts/plotting/subcols_axis_LIG.R deleted file mode 100644 index 2049c3e..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/subcols_axis_LIG.R +++ /dev/null @@ -1,208 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -############################################################ -# 1: Installing and loading required packages and functions -############################################################ - -#source("../Header_TT.R") -#source("../barplot_colour_function.R") -#library(tidyverse) - -########################### -#2: Read file: normalised file, output of step 4 mcsm pipeline -########################### -#my_df <- read.csv("../../Data/mcsm_complex1_normalised.csv" -# , row.names = 1 -# , stringsAsFactors = F -# , header = T) - -# call script combining_df -source("../combining_two_df_lig.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -# from Plotting to Scripts" -#---------------------- PAY ATTENTION - -#========================== -# This will return: - -# df with NA for pyrazinamide: -#merged_df2 -#merged_df2_comp - -# df without NA for pyrazinamide: -#merged_df3 -#merged_df3_comp -#========================== -########################### -# Data to choose: -# We will be using the small dfs -# to generate the coloured axis -########################### - -# uncomment as necessary -#!!!!!!!!!!!!!!!!!!!!!!! -# REASSIGNMENT -my_df = merged_df3 -#my_df = merged_df3_comp -#!!!!!!!!!!!!!!!!!!!!!!! 
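# Illustrative sketch (not part of the original script): in snp_logo_plot.R
# above, the wild-type and mutant logo panels line up only if tab_wt and
# tab_mt share the same column (position) order; the script checks this with
# identical() but does not repair a mismatch. A defensive reorder before
# plotting, assuming every mutated position also has a wild-type entry:
tab_wt = tab_wt[, colnames(tab_mt), drop = FALSE]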
- -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -str(my_df) -my_df$Position -c1 = my_df[my_df$Mutationinformation == "A134V",] - -# order my_df by Position -my_df_o = my_df[order(my_df$Position),] -head(my_df_o$Position); tail(my_df_o$Position) - -c2 = my_df_o[my_df_o$Mutationinformation == "A134V",] - -# sanity check -if (sum(table(c1 == c2)) == ncol(my_df)){ - print ("Sanity check passsd") -}else{ - print ("Error!: Please debug your code") -} - -rm(my_df, c1, c2) - -# create a new df with unique position numbers and cols -Position = unique(my_df_o$Position) -Position_cols = as.data.frame(Position) - -head(Position_cols) ; tail(Position_cols) - -# specify active site residues and bg colour -Position = c(49, 51, 57, 71 - , 8, 96, 138 - , 13, 68 - , 103, 137 - , 133, 134) #13 - -lab_bg = rep(c("purple" - , "yellow" - , "cornflowerblue" - , "blue" - , "green"), times = c(4, 3, 2, 2, 2) -) - -# second bg colour for active site residues -#lab_bg2 = rep(c("white" -# , "green" , "white", "green" -# , "white" -# , "white" -# , "white"), times = c(4 -# , 1, 1, 1 -# , 2 -# , 2 -# , 2) -#) - -#%%%%%%%%% -# revised: leave the second box coloured as the first one incase there is no second colour -#%%%%%%%%% -lab_bg2 = rep(c("purple" - , "green", "yellow", "green" - , "cornflowerblue" - , "blue" - , "green"), times = c(4 - , 1, 1, 1 - , 2 - , 2 - , 2)) - -# fg colour for labels for active site residues -lab_fg = rep(c("white" - , "black" - , "black" - , "white" - , "black"), times = c(4, 3, 2, 2, 2)) - -#%%%%%%%%% -# revised: make the purple ones black -# fg colour for labels for active site residues -#%%%%%%%%% -#lab_fg = rep(c("black" -# , "black" -# , "black" -# , "white" -# , "black"), times = c(4, 3, 2, 2, 2)) - -# combined df with active sites, bg and fg colours -aa_cols_ref = data.frame(Position - , lab_bg - , lab_bg2 - , lab_fg - , stringsAsFactors = F) #13, 4 - -str(Position_cols); class(Position_cols) -str(aa_cols_ref); class(aa_cols_ref) - -# since Position is int and numeric in the two dfs resp, -# converting numeric to int for consistency -aa_cols_ref$Position = as.integer(aa_cols_ref$Position) -class(aa_cols_ref$Position) - -#=========== -# Merge 1: merging Positions df (Position_cols) and -# active site cols (aa_cols_ref) -# linking column: "Position" -# This is so you can have colours defined for all 130 positions -#=========== -head(Position_cols$Position); head(aa_cols_ref$Position) - -mut_pos_cols = merge(Position_cols, aa_cols_ref - , by = "Position" - , all.x = TRUE) - -head(mut_pos_cols) -# replace NA's -# :column "lab_bg" with "white" -# : column "lab_fg" with "black" -mut_pos_cols$lab_bg[is.na(mut_pos_cols$lab_bg)] <- "white" -mut_pos_cols$lab_bg2[is.na(mut_pos_cols$lab_bg2)] <- "white" -mut_pos_cols$lab_fg[is.na(mut_pos_cols$lab_fg)] <- "black" -head(mut_pos_cols) - -#=========== -# Merge 2: Merge mut_pos_cols with mcsm df -# Now combined the 130 positions with aa colours with -# the mcsm_data -#=========== -# dfs to merge -df0 = my_df_o -df1 = mut_pos_cols - -# check the column on which merge will be performed -head(df0$Position); tail(df0$Position) -head(df1$Position); tail(df1$Position) - -# should now have 3 extra columns -my_df = merge(df0, df1 - , by = "Position" - , all.x = TRUE) - -# sanity check -my_df[my_df$Position == "49",] -my_df[my_df$Position == "13",] - -my_df$Position - -# clear variables -rm(aa_cols_ref - , df0 - , df1 - , my_df_o - , Position_cols - , lab_bg - , lab_bg2 - , lab_fg - , Position - ) - diff 
--git a/mcsm_analysis/pyrazinamide/scripts/plotting/subcols_axis_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/subcols_axis_PS.R deleted file mode 100644 index 37dfe32..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/subcols_axis_PS.R +++ /dev/null @@ -1,208 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -############################################################ -# 1: Installing and loading required packages and functions -############################################################ - -#source("../Header_TT.R") -#source("../barplot_colour_function.R") -#library(tidyverse) - -########################### -#2: Read file: normalised file, output of step 4 mcsm pipeline -########################### -#my_df <- read.csv("../../Data/mcsm_complex1_normalised.csv" -# , row.names = 1 -# , stringsAsFactors = F -# , header = T) - -# call script combining_df -source("../combining_two_df.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -# from Plotting to Scripts" -#---------------------- PAY ATTENTION - -#========================== -# This will return: - -# df with NA for pyrazinamide: -#merged_df2 -#merged_df2_comp - -# df without NA for pyrazinamide: -#merged_df3 -#merged_df3_comp -#========================== -########################### -# Data to choose: -# We will be using the small dfs -# to generate the coloured axis -########################### - -# uncomment as necessary -#!!!!!!!!!!!!!!!!!!!!!!! -# REASSIGNMENT -my_df = merged_df3 -#my_df = merged_df3_comp -#!!!!!!!!!!!!!!!!!!!!!!! - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -str(my_df) -my_df$Position -c1 = my_df[my_df$Mutationinformation == "L4S",] - -# order my_df by Position -my_df_o = my_df[order(my_df$Position),] -head(my_df_o$Position); tail(my_df_o$Position) - -c2 = my_df_o[my_df_o$Mutationinformation == "L4S",] - -# sanity check -if (sum(table(c1 == c2)) == ncol(my_df)){ - print ("Sanity check passsd") -}else{ - print ("Error!: Please debug your code") -} - -rm(my_df, c1, c2) - -# create a new df with unique position numbers and cols -Position = unique(my_df_o$Position) #130 -Position_cols = as.data.frame(Position) - -head(Position_cols) ; tail(Position_cols) - -# specify active site residues and bg colour -Position = c(49, 51, 57, 71 - , 8, 96, 138 - , 13, 68 - , 103, 137 - , 133, 134) #13 - -lab_bg = rep(c("purple" - , "yellow" - , "cornflowerblue" - , "blue" - , "green"), times = c(4, 3, 2, 2, 2) -) - -# second bg colour for active site residues -#lab_bg2 = rep(c("white" -# , "green" , "white", "green" -# , "white" -# , "white" -# , "white"), times = c(4 -# , 1, 1, 1 -# , 2 -# , 2 -# , 2) -#) - -#%%%%%%%%% -# revised: leave the second box coloured as the first one incase there is no second colour -#%%%%%%%%% -lab_bg2 = rep(c("purple" - , "green", "yellow", "green" - , "cornflowerblue" - , "blue" - , "green"), times = c(4 - , 1, 1, 1 - , 2 - , 2 - , 2)) - -# fg colour for labels for active site residues -lab_fg = rep(c("white" - , "black" - , "black" - , "white" - , "black"), times = c(4, 3, 2, 2, 2)) - -#%%%%%%%%% -# revised: make the purple ones black -# fg colour for labels for active site residues -#%%%%%%%%% -#lab_fg = rep(c("black" -# , "black" -# , "black" -# , "white" -# , "black"), times = c(4, 3, 2, 2, 2)) - -# combined df with active sites, bg and fg colours -aa_cols_ref = data.frame(Position - , lab_bg - , lab_bg2 - , lab_fg - , stringsAsFactors = F) #13, 4 - 
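The reference data frame built above holds colours only for the 13 active-site positions; the R lines that follow left-join it onto the full list of positions and fill the gaps with the defaults (white background, black label). Purely to illustrate that join-and-fill pattern, here is a small pandas sketch with made-up positions and colours:

```
import pandas as pd

# toy stand-ins: the real scripts use ~130 positions and 13 active-site residues
positions = pd.DataFrame({'Position': range(1, 11)})
active = pd.DataFrame({'Position': [3, 8],
                       'lab_bg': ['purple', 'green'],
                       'lab_fg': ['white', 'black']})

cols = positions.merge(active, on = 'Position', how = 'left')  # left join keeps every position
cols['lab_bg'] = cols['lab_bg'].fillna('white')                # default background colour
cols['lab_fg'] = cols['lab_fg'].fillna('black')                # default label colour
print(cols)
```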
-str(Position_cols); class(Position_cols) -str(aa_cols_ref); class(aa_cols_ref) - -# since Position is int and numeric in the two dfs resp, -# converting numeric to int for consistency -aa_cols_ref$Position = as.integer(aa_cols_ref$Position) -class(aa_cols_ref$Position) - -#=========== -# Merge 1: merging Positions df (Position_cols) and -# active site cols (aa_cols_ref) -# linking column: "Position" -# This is so you can have colours defined for all 130 positions -#=========== -head(Position_cols$Position); head(aa_cols_ref$Position) - -mut_pos_cols = merge(Position_cols, aa_cols_ref - , by = "Position" - , all.x = TRUE) - -head(mut_pos_cols) -# replace NA's -# :column "lab_bg" with "white" -# : column "lab_fg" with "black" -mut_pos_cols$lab_bg[is.na(mut_pos_cols$lab_bg)] <- "white" -mut_pos_cols$lab_bg2[is.na(mut_pos_cols$lab_bg2)] <- "white" -mut_pos_cols$lab_fg[is.na(mut_pos_cols$lab_fg)] <- "black" -head(mut_pos_cols) - -#=========== -# Merge 2: Merge mut_pos_cols with mcsm df -# Now combined the 130 positions with aa colours with -# the mcsm_data -#=========== -# dfs to merge -df0 = my_df_o -df1 = mut_pos_cols - -# check the column on which merge will be performed -head(df0$Position); tail(df0$Position) -head(df1$Position); tail(df1$Position) - -# should now have 3 extra columns -my_df = merge(df0, df1 - , by = "Position" - , all.x = TRUE) - -# sanity check -my_df[my_df$Position == "49",] -my_df[my_df$Position == "13",] - -my_df$Position - -# clear variables -rm(aa_cols_ref - , df0 - , df1 - , my_df_o - , Position_cols - , lab_bg - , lab_bg2 - , lab_fg - , Position - ) - diff --git a/mcsm_analysis/pyrazinamide/scripts/read_pdb.R b/mcsm_analysis/pyrazinamide/scripts/read_pdb.R deleted file mode 100644 index 41ca884..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/read_pdb.R +++ /dev/null @@ -1,27 +0,0 @@ -######################### -#3: Read complex pdb file -########################## -source("Header_TT.R") -# This script only reads the pdb file of your complex - -# read in pdb file complex1 -inDir = "~/git/Data/pyrazinamide/input/structure/" -inFile = paste0(inDir, "complex1_no_water.pdb") -complex1 = inFile - -#inFile2 = paste0(inDir, "complex2_no_water.pdb") -#complex2 = inFile2 - -# list of 8 -my_pdb = read.pdb(complex1 - , maxlines = -1 - , multi = FALSE - , rm.insert = FALSE - , rm.alt = TRUE - , ATOM.only = FALSE - , hex = FALSE - , verbose = TRUE) - -rm(inDir, inFile, complex1) -#====== end of script - diff --git a/mcsm_analysis/pyrazinamide/scripts/replaceBfactor_pdb.R b/mcsm_analysis/pyrazinamide/scripts/replaceBfactor_pdb.R deleted file mode 100644 index 658eec4..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/replaceBfactor_pdb.R +++ /dev/null @@ -1,386 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts") -getwd() - -######################################################################## -# Installing and loading required packages # -######################################################################## - -source("Header_TT.R") - -######################################################### -# TASK: replace B-factors in the pdb file with normalised values -# use the complex file with no water as mCSM lig was -# performed on this file. You can check it in the script: read_pdb file. 
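Both processing passes below follow the same recipe: read the structure, look up each residue's normalised score (DUET first, then ligand affinity), write it into the atom B-factor column, set unmatched positions to 0, and save a new PDB for visualisation. The repo does this in R with bio3d (read_pdb.R plus the steps below); the Biopython sketch here is only an analogue of that idea, with a made-up scores dict and file names borrowed from the script:

```
from Bio.PDB import PDBParser, PDBIO

scores = {3: 0.42, 4: 0.77}  # hypothetical Position -> normalised score lookup

structure = PDBParser(QUIET = True).get_structure('complex1', 'complex1_no_water.pdb')
for atom in structure.get_atoms():
    resno = atom.get_parent().get_id()[1]       # residue number this atom belongs to
    atom.set_bfactor(scores.get(resno, 0.0))    # unmatched positions -> 0, as in the R steps

io = PDBIO()
io.set_structure(structure)
io.save('complex1_BwithNormDUET.pdb')
```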
-######################################################### - -########################### -# 2: Read file: average stability values -# or mcsm_normalised file, output of step 4 mcsm pipeline -########################### - -inDir = "~/git/Data/pyrazinamide/input/processed/" -inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile - -my_df <- read.csv(inFile -# , row.names = 1 -# , stringsAsFactors = F - , header = T) -str(my_df) - -#========================================================= -# Processing P1: Replacing B factor with mean ratioDUET scores -#========================================================= - -######################### -# Read complex pdb file -# form the R script -########################## - -source("read_pdb.R") # list of 8 - -# extract atom list into a variable -# since in the list this corresponds to data frame, variable will be a df -d = my_pdb[[1]] - -# make a copy: required for downstream sanity checks -d2 = d - -# sanity checks: B factor -max(d$b); min(d$b) - -#******************************************* -# plot histograms for inspection -# 1: original B-factors -# 2: original DUET Scores -# 3: replaced B-factors with DUET Scores -#********************************************* -# Set the margin on all sides -par(oma = c(3,2,3,0) - , mar = c(1,3,5,2) - , mfrow = c(3,2)) -#par(mfrow = c(3,2)) - - #1: Original B-factor -hist(d$b - , xlab = "" - , main = "B-factor") - -plot(density(d$b) - , xlab = "" - , main = "B-factor") - -# 2: DUET scores -hist(my_df$average_DUETR - , xlab = "" - , main = "Norm_DUET") - -plot(density(my_df$average_DUETR) - , xlab = "" - , main = "Norm_DUET") - -# 3: After the following replacement -#******************************** - -#========= -# step 0_P1: DONT RUN once you have double checked the matched output -#========= -# sanity check: match and assign to a separate column to double check -# colnames(my_df) -# d$ratioDUET = my_df$averge_DUETR[match(d$resno, my_df$Position)] - -#========= -# step 1_P1 -#========= -# Be brave and replace in place now (don't run sanity check) -# this makes all the B-factor values in the non-matched positions as NA -d$b = my_df$average_DUETR[match(d$resno, my_df$Position)] - -#========= -# step 2_P1 -#========= -# count NA in Bfactor -b_na = sum(is.na(d$b)) ; b_na - -# count number of 0's in Bactor -sum(d$b == 0) -#table(d$b) - -# replace all NA in b factor with 0 -d$b[is.na(d$b)] = 0 - -# sanity check: should be 0 -sum(is.na(d$b)) - -# sanity check: should be True -if (sum(d$b == 0) == b_na){ - print ("Sanity check passed: NA's replaced with 0's successfully") -} else { - print("Error: NA replacement NOT successful, Debug code!") -} - -max(d$b); min(d$b) - -# sanity checks: should be True -if(max(d$b) == max(my_df$average_DUETR)){ - print("Sanity check passed: B-factors replaced correctly") -} else { - print ("Error: Debug code please") -} - -if (min(d$b) == min(my_df$average_DUETR)){ - print("Sanity check passed: B-factors replaced correctly") -} else { - print ("Error: Debug code please") -} - -#========= -# step 3_P1 -#========= -# sanity check: dim should be same before reassignment -# should be TRUE -dim(d) == dim(d2) - -#========= -# step 4_P1 -#========= -# assign it back to the pdb file -my_pdb[[1]] = d - -max(d$b); min(d$b) - -#========= -# step 5_P1 -#========= -# output dir -getwd() -outDir = "~/git/Data/pyrazinamide/input/structure/" - -outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile -write.pdb(my_pdb, outFile) - -#******************************** -# Add the 3rd histogram and 
density plots for comparisons -#******************************** -# Plots continued... -# 3: hist and density of replaced B-factors with DUET Scores -hist(d$b - , xlab = "" - , main = "repalced-B") - -plot(density(d$b) - , xlab = "" - , main = "replaced-B") - -# graph titles -mtext(text = "Frequency" - , side = 2 - , line = 0 - , outer = TRUE) - -mtext(text = "DUET_stability" - , side = 3 - , line = 0 - , outer = TRUE) -#******************************** - -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -# NOTE: This replaced B-factor distribution has the same -# x-axis as the PredAff normalised values, but the distribution -# is affected since 0 is overinflated. This is because all the positions -# where there are no SNPs have been assigned 0. -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - - - - - -####################################################################### -#====================== end of section 1 ============================== -####################################################################### - - - - - -#========================================================= -# Processing P2: Replacing B values with PredAff Scores -#========================================================= -# clear workspace -rm(list = ls()) - -########################### -# 2: Read file: average stability values -# or mcsm_normalised file, output of step 4 mcsm pipeline -########################### - -inDir = "~/git/Data/pyrazinamide/input/processed/" -inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile - -my_df <- read.csv(inFile -# , row.names = 1 -# , stringsAsFactors = F - , header = T) -str(my_df) -#rm(inDir, inFile) - -######################### -# 3: Read complex pdb file -# form the R script -########################## - -source("read_pdb.R") # list of 8 - -# extract atom list into a variable -# since in the list this corresponds to data frame, variable will be a df -d = my_pdb[[1]] - -# make a copy: required for downstream sanity checks -d2 = d - -# sanity checks: B factor -max(d$b); min(d$b) - -#******************************************* -# plot histograms for inspection -# 1: original B-factors -# 2: original Pred Aff Scores -# 3: replaced B-factors with PredAff Scores -#******************************************** -# Set the margin on all sides -par(oma = c(3,2,3,0) - , mar = c(1,3,5,2) - , mfrow = c(3,2)) -#par(mfrow = c(3,2)) - -# 1: Original B-factor -hist(d$b - , xlab = "" - , main = "B-factor") - -plot(density(d$b) - , xlab = "" - , main = "B-factor") - -# 2: Pred Aff scores -hist(my_df$average_PredAffR - , xlab = "" - , main = "Norm_lig_average") - -plot(density(my_df$average_PredAffR) - , xlab = "" - , main = "Norm_lig_average") - -# 3: After the following replacement -#******************************** - -#================================================= -# Processing P2: Replacing B values with ratioPredAff scores -#================================================= -# use match to perform this replacement linking with "position no" -# in the pdb file, this corresponds to column "resno" -# in my_df, this corresponds to column "Position" - -#========= -# step 0_P2: DONT RUN once you have double checked the matched output -#========= -# sanity check: match and assign to a separate column to double check -# colnames(my_df) -# d$ratioPredAff = my_df$average_PredAffR[match(d$resno, my_df$Position)] #1384, 17 - -#========= -# step 1_P2: BE BRAVE and replace in place now (don't run step 0) -#========= -# this makes all the B-factor 
values in the non-matched positions as NA -d$b = my_df$average_PredAffR[match(d$resno, my_df$Position)] - -#========= -# step 2_P2 -#========= -# count NA in Bfactor -b_na = sum(is.na(d$b)) ; b_na - -# count number of 0's in Bactor -sum(d$b == 0) -#table(d$b) - -# replace all NA in b factor with 0 -d$b[is.na(d$b)] = 0 - -# sanity check: should be 0 -sum(is.na(d$b)) - -if (sum(d$b == 0) == b_na){ - print ("Sanity check passed: NA's replaced with 0's successfully") -} else { - print("Error: NA replacement NOT successful, Debug code!") -} - -max(d$b); min(d$b) - -# sanity checks: should be True -if (max(d$b) == max(my_df$average_PredAffR)){ - print("Sanity check passed: B-factors replaced correctly") -} else { - print ("Error: Debug code please") -} - -if (min(d$b) == min(my_df$average_PredAffR)){ - print("Sanity check passed: B-factors replaced correctly") -} else { - print ("Error: Debug code please") -} - -#========= -# step 3_P2 -#========= -# sanity check: dim should be same before reassignment -# should be TRUE -dim(d) == dim(d2) - -#========= -# step 4_P2 -#========= -# assign it back to the pdb file -my_pdb[[1]] = d - -max(d$b); min(d$b) - -#========= -# step 5_P2 -#========= - -# output dir -outDir = "~/git/Data/pyrazinamide/input/structure/" -outFile = paste0(outDir, "complex1_BwithNormLIG.pdb"); outFile -write.pdb(my_pdb, outFile) - -#******************************** -# Add the 3rd histogram and density plots for comparisons -#******************************** -# Plots continued... -# 3: hist and density of replaced B-factors with PredAff Scores -hist(d$b - , xlab = "" - , main = "repalced-B") - -plot(density(d$b) - , xlab = "" - , main = "replaced-B") - -# graph titles -mtext(text = "Frequency" - , side = 2 - , line = 0 - , outer = TRUE) - -mtext(text = "Lig_stability" - , side = 3 - , line = 0 - , outer = TRUE) - -#******************************** - -########### -# end of output files with Bfactors -########## diff --git a/mcsm_analysis/pyrazinamide/scripts/source_data_checks.R b/mcsm_analysis/pyrazinamide/scripts/source_data_checks.R deleted file mode 100644 index 9f30f28..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/source_data_checks.R +++ /dev/null @@ -1,257 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts") -getwd() - -######################################################### -# 1: Installing and loading required packages # -######################################################### - -source("Header_TT.R") -#source("barplot_colour_function.R") - -########################################################## -# Checking: Entire data frame and for PS # -########################################################## - -########################### -#2) Read file: combined one from the script -########################### -source("combining_two_df.R") - -# df with NA: -# merged_df2 -# merged_df3: - -# df without NA: -# merged_df2_comp: -# merged_df3_comp: - -###################### -# You need to check it -# with the merged_df3 -######################## - -#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -my_df = merged_df3 -#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -#clear variables -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# should be true -identical(my_df$Position, my_df$position) - -################################# -# Read file: normalised file -# output of step 4 mcsm_pipeline -################################# - - -inDir = "~/git/Data/pyrazinamide/input/processed/" -inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile - 
-mcsm_data <- read.csv(inFile - , row.names = 1 - , stringsAsFactors = F - , header = T) -str(mcsm_data) -my_colnames = colnames(mcsm_data) - -#==================================== -# subset my_df to include only the columns in mcsm data -my_df2 = my_df[my_colnames] -#==================================== -# compare the two -head(mcsm_data$Mutationinformation) -head(mcsm_data$Position) - -head(my_df2$Mutationinformation) -head(my_df2$Position) - -# sort mcsm data by Mutationinformation -mcsm_data_s = mcsm_data[order(mcsm_data$Mutationinformation),] -head(mcsm_data_s$Mutationinformation) -head(mcsm_data_s$Position) - -# now compare: should be True, but is false.... -# possibly due to rownames!?! -identical(mcsm_data_s, my_df2) - -# from library dplyr -setdiff(mcsm_data_s, my_df2) - -#from lib compare -compare(mcsm_data_s, my_df2) # seems rownames are the problem - -# FIXME: automate this -# write files: checked using meld and files are indeed identical -#write.csv(mcsm_data_s, "mcsm_data_s.csv", row.names = F) -#write.csv(my_df2, "my_df2.csv", row.names = F) - - -#====================================================== end of section 1 - - - -########################################################## -# Checking: LIG(Filtered dataframe) # -########################################################## - -# clear workspace -rm(list = ls()) - -########################### -#3) Read file: combined_lig from the script -########################### -source("combining_two_df_lig.R") - -# df with NA: -# merged_df2 : -# merged_df3: - -# df without NA: -# merged_df2_comp: -# merged_df3_comp: - -###################### -# You need to check it -# with the merged_df3 -######################## - -#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -my_df = merged_df3 -#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -#clear variables -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# should be true -identical(my_df$Position, my_df$position) - -################################# -# Read file: normalised file -# output of step 4 mcsm_pipeline -################################# - -inDir = "~/git/Data/pyrazinamide/input/processed/" -inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile - -mcsm_data <- read.csv(inFile - , row.names = 1 - , stringsAsFactors = F - , header = T) -str(mcsm_data) - -########################### -# 4a: Filter/subset data: ONLY for LIGand analysis -# Lig plots < 10Ang -# Filter the lig plots for Dis_to_lig < 10Ang -########################### -# sanity checks -upos = unique(mcsm_data$Position) - -# check range of distances -max(mcsm_data$Dis_lig_Ang) -min(mcsm_data$Dis_lig_Ang) - -# Lig filtered: subset data to have only values less than 10 Ang -mcsm_data2 = subset(mcsm_data, mcsm_data$Dis_lig_Ang < 10) - -rm(mcsm_data) #to avoid confusion - -table(mcsm_data2$Dis_lig_Ang<10) -table(mcsm_data2$Dis_lig_Ang>10) - -max(mcsm_data2$Dis_lig_Ang) -min(mcsm_data2$Dis_lig_Ang) - -upos_f = unique(mcsm_data2$Position); upos_f - -# colnames of df that you will need to subset the bigger df from -my_colnames = colnames(mcsm_data2) -#==================================== -# subset bigger df i.e my_df to include only the columns in mcsm data2 -my_df2 = my_df[my_colnames] - -rm(my_df) #to avoid confusion -#==================================== -# compare the two -head(mcsm_data2$Mutationinformation) -head(mcsm_data2$Position) - -head(my_df2$Mutationinformation) -head(my_df2$Position) - -# sort mcsm data by Mutationinformation -mcsm_data2_s = mcsm_data2[order(mcsm_data2$Mutationinformation),] 
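The identical()/compare() checks above come back FALSE apparently only because of row names, even though the cell values match. The same pitfall exists in pandas, where DataFrame.equals() also compares the index; a toy illustration with made-up values:

```
import pandas as pd

a = pd.DataFrame({'x': [1, 2]}, index = [5, 6])
b = pd.DataFrame({'x': [1, 2]})

print(a.equals(b))                                                    # False: indexes differ
print(a.reset_index(drop = True).equals(b.reset_index(drop = True)))  # True: same content
```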
-head(mcsm_data2_s$Mutationinformation) -head(mcsm_data2_s$Position) - -# now compare: should be True, but is false.... -# possibly due to rownames!?! -identical(mcsm_data2_s, my_df2) - -# from library dplyr -setdiff(mcsm_data2_s, my_df2) - -# from library compare -compare(mcsm_data2_s, my_df2) # seems rownames are the problem - -#FIXME: automate this -# write files: checked using meld and files are indeed identical -#write.csv(mcsm_data2_s, "mcsm_data2_s.csv", row.names = F) -#write.csv(my_df2, "my_df2.csv", row.names = F) - - -########################################################## -# extract and write output file for SNP posn: all # -########################################################## - -head(merged_df3$Position) - -foo = merged_df3[order(merged_df3$Position),] -head(foo$Position) - -snp_pos_unique = unique(foo$Position); snp_pos_unique - -# sanity check: -table(snp_pos_unique == combined_df$Position) - -#===================== -# write_output files -#===================== -outDir = "~/Data/pyrazinamide/input/processed/" - - -outFile1 = paste0(outDir, "snp_pos_unique.txt"); outFile1 -print(paste0("Output file name and path will be:","", outFile1)) - -write.table(snp_pos_unique - , outFile1 - , row.names = F - , col.names = F) - -############################################################## -# extract and write output file for SNP posn: complete only # -############################################################## -head(merged_df3_comp$Position) - -foo = merged_df3_comp[order(merged_df3_comp$Position),] -head(foo$Position) - -snp_pos_unique = unique(foo$Position); snp_pos_unique - -# outDir = "~/Data/pyrazinamide/input/processed/" # already set - -outFile2 = paste0(outDir, "snp_pos_unique_comp.txt") -print(paste0("Output file name and path will be:", outFile2)) - -write.table(snp_pos_unique - , outFile2 - , row.names = F - , col.names = F) -#============================== end of script - - diff --git a/meta_data_analysis/dssp_df.py b/meta_data_analysis/dssp_df.py index 7b59cfa..5d3dc64 100755 --- a/meta_data_analysis/dssp_df.py +++ b/meta_data_analysis/dssp_df.py @@ -48,9 +48,10 @@ gene = 'pncA' datadir = homedir + '/' + 'git/Data' #======= -# input +# input from outdir #======= -indir = datadir + '/' + drug + '/' + 'output' +#indir = datadir + '/' + drug + '/' + 'output' +outdir = datadir + '/' + drug + '/' + 'output' #in_filename = 'pnca.dssp' in_filename = gene.lower() +'.dssp' infile = indir + '/' + in_filename diff --git a/mk_drug_dirs.sh b/mk_drug_dirs.sh index 6a6dd6d..a336ed3 100755 --- a/mk_drug_dirs.sh +++ b/mk_drug_dirs.sh @@ -4,9 +4,6 @@ ## Structure: # # $DATA_DIR/$DRUG/input -# |- original -# |- processed -# |- structure # # $DATA_DIR/$DRUG/output # |- plots @@ -15,18 +12,17 @@ DATA_DIR=~/git/Data if [[ $1 == '' ]]; then + echo "Error" echo "usage: mk-drug-dirs.sh "; exit; else DRUG=$1 - echo Creating structure for: $DRUG + echo Creating directory structure for: $DRUG if [ -d $DATA_DIR ] then echo Doing creation in $DATA_DIR - mkdir -p $DATA_DIR/$DRUG/input/original - mkdir -p $DATA_DIR/$DRUG/input/processed - mkdir -p $DATA_DIR/$DRUG/input/structure + mkdir -p $DATA_DIR/$DRUG/input mkdir -p $DATA_DIR/$DRUG/output/plots mkdir -p $DATA_DIR/$DRUG/output/structure diff --git a/meta_data_analysis/data_extraction.py b/scripts/data_extraction.py similarity index 57% rename from meta_data_analysis/data_extraction.py rename to scripts/data_extraction.py index 70f3008..451d6cf 100755 --- a/meta_data_analysis/data_extraction.py +++ b/scripts/data_extraction.py @@ -11,63 
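Collecting the additions in this hunk: the previously hardcoded drug and gene names now come from the command line, and everything else is derived from them. A condensed sketch of what the new lines amount to (the invocation in the comment is an assumed example, not taken from the repo's docs):

```
import argparse

# e.g. python data_extraction.py -d pyrazinamide -g pncA
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help = 'drug name', default = 'pyrazinamide')
arg_parser.add_argument('-g', '--gene', help = 'gene name', default = 'pncA')  # case sensitive
args = arg_parser.parse_args()

drug, gene = args.drug, args.gene
gene_match = gene + '_p.'                    # string match used to pull protein-coding muts
dr_muts_col = 'dr_mutations_' + drug         # column holding drug-resistance-associated muts
other_muts_col = 'other_mutations_' + drug   # column holding other muts
```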
+11,77 @@ Created on Tue Aug 6 12:56:03 2019 # FIXME: import dirs.py to get the basic dir paths available #======================================================================= -# TASK: extract ALL pncA_p. mutations from GWAS data +# TASK: extract ALL matched mutations from GWAS data # Input data file has the following format: each row = unique sample id -# id,country,lineage,sublineage,drtype,pyrazinamide,dr_mutations_pyrazinamide,other_mutations_pyrazinamide... -# 0,sampleID,USA,lineage2,lineage2.2.1,Drug-resistant,0.0,WT,pncA_p.POS; pncA_c.POS... -# where multiple mutations and multiple mutation types are separated by ';'. We are interested in the -# protein coding region i.e mutation with the 'p.' format. - -# the script splits the mutations on the ';' and extracts protein coding muts only +# id,country,lineage,sublineage,drtype,drug,dr_muts_col,other_muts_col... +# 0,sampleID,USA,lineage2,lineage2.2.1,Drug-resistant,0.0,WT,gene_matchPOS; pncA_c.POS... +# where multiple mutations and multiple mutation types are separated by ';'. +# We are interested in the protein coding region i.e mutation with the_'p.' format. +# This script splits the mutations on the ';' and extracts protein coding muts only # where each row is a separate mutation # sample ids AND mutations are NOT unique, but the COMBINATION (sample id + mutation) = unique -# output files: -# 0) pnca_common_ids.csv -# 1) pnca_ambiguous_muts.csv -# 2) pnca_mcsm_snps.csv -# 3) pnca_metadata.csv -# 4) pnca_all_muts_msa.csv -# 5) pnca_mutational_positons.csv +# output files: all lower case +# 0) _common_ids.csv +# 1) _ambiguous_muts.csv +# 2) _mcsm_snps.csv +# 3) _metadata.csv +# 4) _all_muts_msa.csv +# 5) _mutational_positons.csv #======================================================================= #%% load libraries import os, sys import pandas as pd #import numpy as np - -#from pandas.api.types import is_string_dtype -#from pandas.api.types import is_numeric_dtype - -#%% specify homedir as python doesn't recognise tilde +import argparse +#======================================================================= +#%% homdir and curr dir and local imports homedir = os.path.expanduser('~') - # set working dir os.getcwd() -os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis') +os.chdir(homedir + '/git/LSHTM_analysis/scripts') os.getcwd() # import aa dict -from reference_dict import my_aa_dict #CHECK DIR STRUC THERE! +from reference_dict import my_aa_dict # CHECK DIR STRUC THERE! +#======================================================================= +#%% command line args +arg_parser = argparse.ArgumentParser() +arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide') +arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive +args = arg_parser.parse_args() #======================================================================= #%% variable assignment: input and output paths & filenames -drug = 'pyrazinamide' -gene = 'pncA' +#drug = 'pyrazinamide' +#gene = 'pncA' +drug = args.drug +gene = args.gene gene_match = gene + '_p.' 
+# building cols to extract +dr_muts_col = 'dr_mutations_' + drug +other_muts_col = 'other_mutations_' + drug +print('Extracting columns based on variables:\n' + , drug + , '\n' + , dr_muts_col + , '\n' + , other_muts_col + , '\n===============================================================') +#======================================================================= +#%% input and output dirs and files #======= # data dir #======= -#indir = 'git/Data/pyrazinamide/input/original' datadir = homedir + '/' + 'git/Data' #======= # input #======= -#indir = 'git/Data/pyrazinamide/input/original' in_filename = 'original_tanushree_data_v2.csv' infile = datadir + '/' + in_filename print('Input filename: ', in_filename - , '\nInput path: ', indir + , '\nInput path: ', datadir , '\n============================================================') #======= @@ -88,15 +102,15 @@ master_data = pd.read_csv(infile, sep = ',') # column names #list(master_data.columns) -# extract elevant columns to extract from meta data related to the pyrazinamide +# extract elevant columns to extract from meta data related to the drug meta_data = master_data[['id' ,'country' ,'lineage' ,'sublineage' ,'drtype' - , 'pyrazinamide' - , 'dr_mutations_pyrazinamide' - , 'other_mutations_pyrazinamide' + , drug + , dr_muts_col + , other_muts_col ]] del(master_data) @@ -115,13 +129,13 @@ print('No. of NAs/column:' + '\n', meta_data.isna().sum() meta_data.head() # equivalent of table in R -# pyrazinamide counts -meta_data.pyrazinamide.value_counts() -print('RESULT: Sus and Res samples:\n', meta_data.pyrazinamide.value_counts() +# drug counts +meta_data[drug].value_counts() +print('RESULT: Sus and Res samples:\n', meta_data[drug].value_counts() , '\n===========================================================') # clear variables -del(indir, in_filename,infile) +del(in_filename,infile) #del(outdir) #%% # !!!IMPORTANT!!! sanity check: @@ -129,18 +143,18 @@ del(indir, in_filename,infile) # can use it to check if your data extraction process for dr_muts # and other_muts has worked correctly AND also to check the dim of the # final formatted data. -# This will have: unique COMBINATION of sample id and pncA_p.mutations +# This will have: unique COMBINATION of sample id and mutations #======== -# First: counting pncA_p. mutations in dr_mutations_pyrazinamide column +# First: counting mutations in dr_muts_col column #======== -print('Now counting WT & pncA_p. 
muts within the column: dr_mutations_pyrazinamide') +print('Now counting WT &', gene_match, 'muts within the column:', dr_muts_col) # drop na and extract a clean df -clean_df = meta_data.dropna(subset=['dr_mutations_pyrazinamide']) +clean_df = meta_data.dropna(subset=[dr_muts_col]) # sanity check: count na -na_count = meta_data['dr_mutations_pyrazinamide'].isna().sum() +na_count = meta_data[dr_muts_col].isna().sum() if len(clean_df) == (total_samples - na_count): print('PASS: clean_df extracted: length is', len(clean_df) @@ -150,7 +164,7 @@ else: print('FAIL: dropping NA failed' , '\n==========================================================') -dr_pnca_count = 0 +dr_gene_count = 0 wt = 0 id_dr = [] id2_dr = [] @@ -158,45 +172,44 @@ id2_dr = [] for i, id in enumerate(clean_df.id): # print (i, id) # id_dr.append(id) -# count_pnca_dr = clean_df.dr_mutations_pyrazinamide.iloc[i].count('pncA_p.') #works 2201 - count_pnca_dr = clean_df.dr_mutations_pyrazinamide.iloc[i].count(gene_match) #works 2201 - if count_pnca_dr > 0: + count_gene_dr = clean_df[dr_muts_col].iloc[i].count(gene_match) + if count_gene_dr > 0: id_dr.append(id) - if count_pnca_dr > 1: + if count_gene_dr > 1: id2_dr.append(id) -# print(id, count_pnca_dr) - dr_pnca_count = dr_pnca_count + count_pnca_dr - count_wt = clean_df.dr_mutations_pyrazinamide.iloc[i].count('WT') +# print(id, count_gene_dr) + dr_gene_count = dr_gene_count + count_gene_dr + count_wt = clean_df[dr_muts_col].iloc[i].count('WT') wt = wt + count_wt print('RESULTS:') -print('Total WT in dr_mutations_pyrazinamide:', wt) -print('Total matches of', gene_match, 'in dr_mutations_pyrazinamide:', dr_pnca_count) -print('Total samples with > 1', gene_match, 'muts in dr_mutations_pyrazinamide:', len(id2_dr) ) +print('Total WT in dr_muts_col:', wt) +print('Total matches of', gene_match, 'in dr_muts_col:', dr_gene_count) +print('Total samples with > 1', gene_match, 'muts in dr_muts_col:', len(id2_dr) ) print('=================================================================') -del(i, id, wt, id2_dr, clean_df, na_count, count_pnca_dr, count_wt) +del(i, id, wt, id2_dr, clean_df, na_count, count_gene_dr, count_wt) #======== -# Second: counting pncA_p. mutations in dr_mutations_pyrazinamide column +# Second: counting mutations in dr_muts_col column #======== -print('Now counting WT & pncA_p. 
muts within the column: other_mutations_pyrazinamide') +print('Now counting WT &', gene_match, 'muts within the column:', other_muts_col) # drop na and extract a clean df -clean_df = meta_data.dropna(subset=['other_mutations_pyrazinamide']) +clean_df = meta_data.dropna(subset=[other_muts_col]) # sanity check: count na -na_count = meta_data['other_mutations_pyrazinamide'].isna().sum() +na_count = meta_data[other_muts_col].isna().sum() if len(clean_df) == (total_samples - na_count): print('PASS: clean_df extracted: length is', len(clean_df) - , '\nNo.of NA s=', na_count, '/', total_samples + , '\nNo.of NAs =', na_count, '/', total_samples , '\n=========================================================') else: print('FAIL: dropping NA failed' , '\n=========================================================') -other_pnca_count = 0 +other_gene_count = 0 wt_other = 0 id_other = [] id2_other = [] @@ -204,63 +217,63 @@ id2_other = [] for i, id in enumerate(clean_df.id): # print (i, id) # id_other.append(id) -# count_pnca_other = clean_df.other_mutations_pyrazinamide.iloc[i].count('pncA_p.') - count_pnca_other = clean_df.other_mutations_pyrazinamide.iloc[i].count(gene_match) - if count_pnca_other > 0: +# count_gene_other = clean_df[other_muts_col].iloc[i].count('gene_match') + count_gene_other = clean_df[other_muts_col].iloc[i].count(gene_match) + if count_gene_other > 0: id_other.append(id) - if count_pnca_other > 1: + if count_gene_other > 1: id2_other.append(id) -# print(id, count_pnca_other) - other_pnca_count = other_pnca_count + count_pnca_other - count_wt = clean_df.other_mutations_pyrazinamide.iloc[i].count('WT') +# print(id, count_gene_other) + other_gene_count = other_gene_count + count_gene_other + count_wt = clean_df[other_muts_col].iloc[i].count('WT') wt_other = wt_other + count_wt print('RESULTS:') -print('Total WT in other_mutations_pyrazinamide:', wt_other) -print('Total matches of', gene_match, 'in other_mutations_pyrazinamide:', other_pnca_count) -print('Total samples with > 1', gene_match, 'muts in other_mutations_pyrazinamide:', len(id2_other) ) +print('Total WT in other_muts_col:', wt_other) +print('Total matches of', gene_match, 'in', other_muts_col, ':', other_gene_count) +print('Total samples with > 1', gene_match, 'muts in other_muts_col:', len(id2_other) ) print('=================================================================') -print('Predicting total no. of rows in your curated df:', dr_pnca_count + other_pnca_count ) -expected_rows = dr_pnca_count + other_pnca_count +print('Predicting total no. of rows in the curated df:', dr_gene_count + other_gene_count + , '\n===================================================================') +expected_rows = dr_gene_count + other_gene_count -del(i, id, wt_other, clean_df, na_count, id2_other, count_pnca_other, count_wt) +del(i, id, wt_other, clean_df, na_count, id2_other, count_gene_other, count_wt) #%% ############ # extracting dr and other muts separately along with the common cols ############# -print('=================================================================') -print('Extracting dr_muts in a dr_mutations_pyrazinamide with other meta_data') +print('Extracting dr_muts from col:', dr_muts_col, 'with other meta_data') print('gene to extract:', gene_match ) #=============== -# dr mutations: extract pncA_p. 
entries with meta data and ONLY dr_muts col +# dr mutations: extract gene_match entries with meta data and ONLY dr_muts col #=============== -# FIXME: replace pyrazinamide with variable containing the drug name +# FIXME: replace drug with variable containing the drug name # !!! important !!! meta_data_dr = meta_data[['id' ,'country' ,'lineage' ,'sublineage' ,'drtype' - , 'pyrazinamide' - , 'dr_mutations_pyrazinamide' + , drug + , dr_muts_col ]] print('expected dim should be:', len(meta_data), (len(meta_data.columns)-1) ) print('actual dim:', meta_data_dr.shape , '\n===============================================================') # Extract within this the gene of interest using string match -#meta_pnca_dr = meta_data.loc[meta_data.dr_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)] -meta_pnca_dr = meta_data_dr.loc[meta_data_dr.dr_mutations_pyrazinamide.str.contains(gene_match, na = False)] +#meta_gene_dr = meta_data.loc[meta_data[dr_muts_col].str.contains('gene_match*', na = False)] +meta_gene_dr = meta_data_dr.loc[meta_data_dr[dr_muts_col].str.contains(gene_match, na = False)] -dr_id = meta_pnca_dr['id'].unique() +dr_id = meta_gene_dr['id'].unique() print('RESULT: No. of samples with dr muts in pncA:', len(dr_id)) print('checking RESULT:', '\nexpected len =', len(id_dr), - '\nactual len =', len(meta_pnca_dr) ) + '\nactual len =', len(meta_gene_dr) ) -if len(id_dr) == len(meta_pnca_dr): +if len(id_dr) == len(meta_gene_dr): print('PASS: lengths match' , '\n===============================================================') else: @@ -270,18 +283,18 @@ else: dr_id = pd.Series(dr_id) #================= -# other mutations: extract pncA_p. entries +# other mutations: extract gene_match entries #================== -print('Extracting dr_muts in a other_mutations_pyrazinamide with other meta_data') -# FIXME: replace pyrazinamide with variable containing the drug name +print('Extracting dr_muts from:', other_muts_col,'with other meta_data') +# FIXME: replace drug with variable containing the drug name # !!! important !!! meta_data_other = meta_data[['id' ,'country' ,'lineage' ,'sublineage' ,'drtype' - , 'pyrazinamide' - , 'other_mutations_pyrazinamide' + , drug + , other_muts_col ]] print('expected dim should be:', len(meta_data), (len(meta_data.columns)-1) ) @@ -289,15 +302,15 @@ print('actual dim:', meta_data_other.shape , '\n===============================================================') # Extract within this the gene of interest using string match -meta_pnca_other = meta_data_other.loc[meta_data_other.other_mutations_pyrazinamide.str.contains(gene_match, na = False)] +meta_gene_other = meta_data_other.loc[meta_data_other[other_muts_col].str.contains(gene_match, na = False)] -other_id = meta_pnca_other['id'].unique() +other_id = meta_gene_other['id'].unique() print('RESULT: No. of samples with other muts:', len(other_id)) print('checking RESULT:', '\nexpected len =', len(id_other), - '\nactual len =', len(meta_pnca_other) ) + '\nactual len =', len(meta_gene_other) ) -if len(id_other) == len(meta_pnca_other): +if len(id_other) == len(meta_gene_other): print('PASS: lengths match' , '\n==============================================================') else: @@ -308,7 +321,7 @@ other_id = pd.Series(other_id) #%% Find common IDs print('Now extracting common_ids...') common_mut_ids = dr_id.isin(other_id).sum() -print('RESULT: No. of common Ids:', common_mut_ids) +print('RESULT: No. 
of common ids:', common_mut_ids) # sanity checks # check if True: should be since these are common @@ -327,9 +340,9 @@ common_ids2.columns = ['index', 'id2'] # should be True print(common_ids['id'].equals(common_ids2['id2'])) -# good sanity check: use it later to check pnca_sample_counts -expected_pnca_samples = ( len(meta_pnca_dr) + len(meta_pnca_other) - common_mut_ids ) -print('expected no. of pnca samples:', expected_pnca_samples) +# good sanity check: use it later to check gene_sample_counts +expected_gene_samples = ( len(meta_gene_dr) + len(meta_gene_other) - common_mut_ids ) +print('expected no. of gene samples:', expected_gene_samples) print('=================================================================') #%% write file #print(outdir) @@ -348,47 +361,47 @@ del(out_filename0) # clear variables del(dr_id, other_id, meta_data_dr, meta_data_other, common_ids, common_mut_ids, common_ids2) +#%% Now extract 'all' pncA mutations: i.e 'gene_match*' +print('extracting from string match:', gene_match, 'mutations from cols:\n' + , dr_muts_col, 'and', other_muts_col, 'using string match:' + , '\n===================================================================') +#meta_gene_all = meta_data.loc[meta_data[dr_muts_col].str.contains(gene_match) | meta_data[other_muts_col].str.contains(gene_match) ] +meta_gene_all = meta_data.loc[meta_data[dr_muts_col].str.contains(gene_match, na = False) | meta_data[other_muts_col].str.contains(gene_match, na = False) ] -#%% Now extract 'all' pncA mutations: i.e 'pncA_p.*' -print('extracting all pncA mutations from dr_ and other_ cols using string match:', gene_match - , '\n===============================================================') -#meta_pnca_all = meta_data.loc[meta_data.dr_mutations_pyrazinamide.str.contains(gene_match) | meta_data.other_mutations_pyrazinamide.str.contains(gene_match) ] -meta_pnca_all = meta_data.loc[meta_data.dr_mutations_pyrazinamide.str.contains(gene_match, na = False) | meta_data.other_mutations_pyrazinamide.str.contains(gene_match, na = False) ] +extracted_gene_samples = meta_gene_all['id'].nunique() +print('RESULT: actual no. of gene samples extracted:', extracted_gene_samples + , '\n===================================================================') -extracted_pnca_samples = meta_pnca_all['id'].nunique() -print('RESULT: actual no. of pnca samples extracted:', extracted_pnca_samples) -print('======================================================================') - -# sanity check: length of pnca samples +# sanity check: length of gene samples print('Performing sanity check:') -if extracted_pnca_samples == expected_pnca_samples: - print('No. of pnca samples:', len(meta_pnca_all) - , '\nPASS: expected & actual no. of pnca samples match' +if extracted_gene_samples == expected_gene_samples: + print('No. of gene samples:', len(meta_gene_all) + , '\nPASS: expected & actual no. of gene samples match' , '\n=========================================================') else: print('FAIL: Debug please!' , '\n===============================================================') -# count NA in pyrazinamide column -pnca_na = meta_pnca_all['pyrazinamide'].isna().sum() -print('No. of pnca samples without pza testing i.e NA in pza column:', pnca_na) +# count NA in drug column +gene_na = meta_gene_all[drug].isna().sum() +print('No. 
of gene samples without pza testing i.e NA in pza column:', gene_na) # use it later to check number of complete samples from LF data -comp_pnca_samples = len(meta_pnca_all) - pnca_na -print('comp pnca samples tested for pza:', comp_pnca_samples) +comp_gene_samples = len(meta_gene_all) - gene_na +print('comp gene samples tested for pza:', comp_gene_samples) print('=================================================================') # Comment: This is still dirty data since these -# are samples that have pncA_p. muts, but can have others as well +# are samples that have gene_match muts, but can have others as well # since the format for mutations is mut1; mut2, etc. -print('This is still dirty data: samples have pncA_p. muts, but may have others as well' +print('This is still dirty data: samples have ', gene_match, 'muts but may have others as well' , '\nsince the format for mutations is mut1; mut2, etc.' , '\n=============================================================') #%% tidy_split():Function to split mutations on specified delimiter: ';' #https://stackoverflow.com/questions/41476150/removing-space-from-dataframe-columns-in-pandas +print('Performing tidy_split(): to separate the mutations into indivdual rows') -print('Performing tidy_spllit(): to separate the mutations into indivdual rows') # define the split function def tidy_split(df, column, sep='|', keep=False): ''' @@ -428,38 +441,38 @@ def tidy_split(df, column, sep='|', keep=False): #%% end of tidy_split() #========= -# DF1: dr_mutations_pyrazinamide +# DF1: dr_muts_col #========= ######## -# tidy_split(): on 'dr_mutations_pyrazinamide' column and remove leading white spaces +# tidy_split(): on dr_muts_col column and remove leading white spaces ######## -col_to_split1 = 'dr_mutations_pyrazinamide' -print ('Firstly, applying tidy split on dr df:', meta_pnca_dr.shape - , '\ncolumn name:', col_to_split1 +col_to_split1 = dr_muts_col +print ('Firstly, applying tidy split on dr muts df', meta_gene_dr.shape + , '\ncolumn name to apply tidy_split():', col_to_split1 , '\n============================================================') # apply tidy_split() -dr_WF0 = tidy_split(meta_pnca_dr, col_to_split1, sep = ';') +dr_WF0 = tidy_split(meta_gene_dr, col_to_split1, sep = ';') # remove leading white space else these are counted as distinct mutations as well -dr_WF0['dr_mutations_pyrazinamide'] = dr_WF0['dr_mutations_pyrazinamide'].str.lstrip() +dr_WF0[dr_muts_col] = dr_WF0[dr_muts_col].str.lstrip() -# extract only the samples/rows with pncA_p. 
-dr_pnca_WF0 = dr_WF0.loc[dr_WF0.dr_mutations_pyrazinamide.str.contains(gene_match)] +# extract only the samples/rows with gene_match +dr_gene_WF0 = dr_WF0.loc[dr_WF0[dr_muts_col].str.contains(gene_match)] print('lengths after tidy split and extracting', gene_match, 'muts:' - , '\nold length:' , len(meta_pnca_dr) + , '\nold length:' , len(meta_gene_dr) , '\nlen after split:', len(dr_WF0) - , '\ndr_pnca_WF0 length:', len(dr_pnca_WF0) - , '\nexpected len:', dr_pnca_count) + , '\ndr_gene_WF0 length:', len(dr_gene_WF0) + , '\nexpected len:', dr_gene_count) -if len(dr_pnca_WF0) == dr_pnca_count: - print('PASS: length of dr_pnca_WF0 match with expected length' +if len(dr_gene_WF0) == dr_gene_count: + print('PASS: length of dr_gene_WF0 match with expected length' , '\n===============================================================') else: print('FAIL: lengths mismatch' , '\n===============================================================') # count the freq of 'dr_muts' samples -dr_muts_df = dr_pnca_WF0 [['id', 'dr_mutations_pyrazinamide']] +dr_muts_df = dr_gene_WF0 [['id', dr_muts_col]] print('dim of dr_muts_df:', dr_muts_df.shape) # add freq column @@ -468,13 +481,13 @@ dr_muts_df['dr_sample_freq'] = dr_muts_df.groupby('id')['id'].transform('count') print('revised dim of dr_muts_df:', dr_muts_df.shape) c1 = dr_muts_df.dr_sample_freq.value_counts() -print('counting no. of sample frequency:\n', c1) -print('=================================================================') +print('counting no. of sample frequency:\n', c1 + , '\n===================================================================') -# sanity check: length of pnca samples -if len(dr_pnca_WF0) == c1.sum(): +# sanity check: length of gene samples +if len(dr_gene_WF0) == c1.sum(): print('PASS: WF data has expected length' - , '\nlength of dr_pnca WFO:', c1.sum() + , '\nlength of dr_gene WFO:', c1.sum() , '\n===============================================================') else: print('FAIL: Debug please!' @@ -483,7 +496,7 @@ else: #!!! Important !!! 
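tidy_split() is the step that turns the wide-format rows (one sample, several ';'-separated mutations) into long format with one mutation per row, after which leading whitespace is stripped and only the gene_match rows are kept. The repo has its own implementation (the function defined above); this is just a minimal pandas stand-in with toy values to show the reshaping it performs:

```
import pandas as pd

def tidy_split(df, column, sep = ';'):
    # one row per delimited value in `column`; the other columns are repeated
    s = df[column].str.split(sep).explode().str.strip()
    return df.drop(columns = [column]).join(s).reset_index(drop = True)

toy = pd.DataFrame({'id': ['s1', 's2'],
                    'dr_mutations_pyrazinamide': ['pncA_p.Val180Phe; pncA_c.POS', 'WT']})
print(tidy_split(toy, 'dr_mutations_pyrazinamide'))
# s1 now occupies two rows (one per mutation); s2 keeps its single WT row
```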
# Assign 'column name' on which split was performed as an extra column # This is so you can identify if mutations are dr_type or other in the final df -dr_df = dr_pnca_WF0.assign(mutation_info = 'dr_mutations_pyrazinamide') +dr_df = dr_gene_WF0.assign(mutation_info = dr_muts_col) print('dim of dr_df:', dr_df.shape , '\n==============================================================' , '\nEnd of tidy split() on dr_muts, and added an extra column relecting mut_category' @@ -493,35 +506,35 @@ print('dim of dr_df:', dr_df.shape # DF2: other_mutations_pyrazinamdie #========= ######## -# tidy_split(): on 'other_mutations_pyrazinamide' column and remove leading white spaces +# tidy_split(): on other_muts_col column and remove leading white spaces ######## -col_to_split2 = 'other_mutations_pyrazinamide' -print ('applying second tidy split separately on df:', meta_pnca_other.shape - , '\ncolumn name:', col_to_split2 +col_to_split2 = other_muts_col +print ('applying second tidy split() separately on other muts df', meta_gene_other.shape + , '\ncolumn name to apply tidy_split():', col_to_split2 , '\n============================================================') # apply tidy_split() -other_WF1 = tidy_split(meta_pnca_other, col_to_split2, sep = ';') +other_WF1 = tidy_split(meta_gene_other, col_to_split2, sep = ';') # remove the leading white spaces in the column -other_WF1['other_mutations_pyrazinamide'] = other_WF1['other_mutations_pyrazinamide'].str.strip() +other_WF1[other_muts_col] = other_WF1[other_muts_col].str.strip() -# extract only the samples/rows with pncA_p. -other_pnca_WF1 = other_WF1.loc[other_WF1.other_mutations_pyrazinamide.str.contains(gene_match)] +# extract only the samples/rows with gene_match +other_gene_WF1 = other_WF1.loc[other_WF1[other_muts_col].str.contains(gene_match)] print('lengths after tidy split and extracting', gene_match, 'muts:', - '\nold length:' , len(meta_pnca_other), + '\nold length:' , len(meta_gene_other), '\nlen after split:', len(other_WF1), - '\nother_pnca_WF1 length:', len(other_pnca_WF1), - '\nexpected len:', other_pnca_count) + '\nother_gene_WF1 length:', len(other_gene_WF1), + '\nexpected len:', other_gene_count) -if len(other_pnca_WF1) == other_pnca_count: - print('PASS: length of dr_pnca_WF0 match with expected length +if len(other_gene_WF1) == other_gene_count: + print('PASS: length of dr_gene_WF0 match with expected length' , '\n===============================================================') else: - print('FAIL: lengths mismatch + print('FAIL: lengths mismatch' , '\n===============================================================') # count the freq of 'other muts' samples -other_muts_df = other_pnca_WF1 [['id', 'other_mutations_pyrazinamide']] +other_muts_df = other_gene_WF1 [['id', other_muts_col]] print('dim of other_muts_df:', other_muts_df.shape) # add freq column @@ -531,10 +544,10 @@ print('revised dim of other_muts_df:', other_muts_df.shape) c2 = other_muts_df.other_sample_freq.value_counts() print('counting no. of sample frequency:\n', c2) print('=================================================================') -# sanity check: length of pnca samples -if len(other_pnca_WF1) == c2.sum(): +# sanity check: length of gene samples +if len(other_gene_WF1) == c2.sum(): print('PASS: WF data has expected length' - , '\nlength of other_pnca WFO:', c2.sum() + , '\nlength of other_gene WFO:', c2.sum() , '\n===============================================================') else: print('FAIL: Debug please!' @@ -543,7 +556,7 @@ else: #!!! Important !!! 
# Assign 'column name' on which split was performed as an extra column # This is so you can identify if mutations are dr_type or other in the final df -other_df = other_pnca_WF1.assign(mutation_info = 'other_mutations_pyrazinamide') +other_df = other_gene_WF1.assign(mutation_info = other_muts_col) print('dim of other_df:', other_df.shape , '\n===============================================================' , '\nEnd of tidy split() on other_muts, and added an extra column relecting mut_category' @@ -555,17 +568,19 @@ print('dim of other_df:', other_df.shape #!!! important !!! # change column names to allow concat: # dr_muts.. & other_muts : 'mutation' -print('Now concatenating the two dfs by row') +print('Now concatenating the two dfs by row' + , '\nfirst assigning a common colname: "mutation" to the col containing muts' + , '\nthis is done for both dfs' + , '\n===================================================================') dr_df.columns -dr_df.rename(columns = {'dr_mutations_pyrazinamide': 'mutation'}, inplace = True) +dr_df.rename(columns = {dr_muts_col: 'mutation'}, inplace = True) dr_df.columns other_df.columns -other_df.rename(columns = {'other_mutations_pyrazinamide': 'mutation'}, inplace = True) +other_df.rename(columns = {other_muts_col: 'mutation'}, inplace = True) other_df.columns -print('=================================================================') print('Now appending the two dfs:' , '\ndr_df dim:', dr_df.shape , '\nother_df dim:', other_df.shape @@ -582,18 +597,18 @@ else: print('FAIL: Debug please!') # concatenate (axis = 0): Rbind -pnca_LF0 = pd.concat([dr_df, other_df], ignore_index = True, axis = 0) +gene_LF0 = pd.concat([dr_df, other_df], ignore_index = True, axis = 0) # checking colnames and length after concat print('checking colnames AFTER concatenating the two dfs...') -if (set(dr_df.columns) == set(pnca_LF0.columns)): +if (set(dr_df.columns) == set(gene_LF0.columns)): print('PASS: column names match') else: print('FAIL: Debug please!') print('checking length AFTER concatenating the two dfs...') -if len(pnca_LF0) == len(dr_df) + len(other_df): +if len(gene_LF0) == len(dr_df) + len(other_df): print('PASS:length of df after concat match' , '\n===============================================================') else: @@ -603,61 +618,59 @@ else: ########### # This is hopefully clean data, but just double check # Filter LF data so that you only have -# mutations corresponding to pncA_p.* (string match pattern) +# mutations corresponding to gene_match* (string match pattern) # this will be your list you run OR calcs ########### -print('length of pnca_LF0:', len(pnca_LF0), +print('length of gene_LF0:', len(gene_LF0), '\nThis should be what you need. But just double check and extract', gene_match, - '\nfrom LF0 (concatenated data)') + '\nfrom LF0 (concatenated data) using string match:', gene_match) -print('using string match:', gene_match) +print('Double checking and creating df: gene_LF1') +gene_LF1 = gene_LF0[gene_LF0['mutation'].str.contains(gene_match)] -print('Double checking and creating df: pnca_LF1') -pnca_LF1 = pnca_LF0[pnca_LF0['mutation'].str.contains(gene_match)] - -if len(pnca_LF0) == len(pnca_LF1): - print('PASS: length of pnca_LF0 and pnca_LF1 match', +if len(gene_LF0) == len(gene_LF1): + print('PASS: length of gene_LF0 and gene_LF1 match', '\nconfirming extraction and concatenating worked correctly') else: print('FAIL: BUT NOT FATAL!' 
- , '\npnca_LF0 and pnca_LF1 lengths differ' + , '\ngene_LF0 and gene_LF1 lengths differ' , '\nsuggesting error in extraction process' - , ' use pnca_LF1 for downstreama analysis' + , ' use gene_LF1 for downstreama analysis' , '\n=========================================================') print('length of dfs pre and post processing...' - , '\npnca WF data (unique samples in each row):', extracted_pnca_samples - , '\npnca LF data (unique mutation in each row):', len(pnca_LF1) + , '\ngene WF data (unique samples in each row):', extracted_gene_samples + , '\ngene LF data (unique mutation in each row):', len(gene_LF1) , '\n=============================================================') -#%% -# final sanity check +#%% sanity check for extraction print('Verifying whether extraction process worked correctly...') -if len(pnca_LF1) == expected_rows: +if len(gene_LF1) == expected_rows: print('PASS: extraction process performed correctly' , '\nexpected length:', expected_rows - , '\ngot:', len(pnca_LF1) - , '\nRESULT: Total no. of mutant sequences for logo plot:', len(pnca_LF1) + , '\ngot:', len(gene_LF1) + , '\nRESULT: Total no. of mutant sequences for logo plot:', len(gene_LF1) , '\n=========================================================') else: print('FAIL: extraction process has bugs' , '\nexpected length:', expected_rows - , '\ngot:', len(pnca_LF1) + , '\ngot:', len(gene_LF1) , ', \Debug please' , '\n=========================================================') #%% -print('Perfmorning some more sanity checks...') +print('Performing some more sanity checks...') + # From LF1: # no. of unique muts -distinct_muts = pnca_LF1.mutation.value_counts() -print('Distinct mutations:', len(distinct_muts)) +distinct_muts = gene_LF1.mutation.value_counts() +print('Distinct genomic mutations:', len(distinct_muts)) # no. of samples contributing the unique muta -pnca_LF1.id.nunique() -print('No.of samples contributing to distinct muts:', pnca_LF1.id.nunique() ) +gene_LF1.id.nunique() +print('No.of samples contributing to distinct genomic muts:', gene_LF1.id.nunique() ) # no. of dr and other muts -mut_grouped = pnca_LF1.groupby('mutation_info').mutation.nunique() -print('No.of unique dr and other muts:', pnca_LF1.groupby('mutation_info').mutation.nunique() ) +mut_grouped = gene_LF1.groupby('mutation_info').mutation.nunique() +print('No.of unique dr and other muts:\n', gene_LF1.groupby('mutation_info').mutation.nunique() ) # sanity check if len(distinct_muts) == mut_grouped.sum() : @@ -670,7 +683,7 @@ else: , '\nmuts should be distinct within dr* and other* type' , '\ninspecting ...' , '\n=========================================================') - muts_split = list(pnca_LF1.groupby('mutation_info')) + muts_split = list(gene_LF1.groupby('mutation_info')) dr_muts = muts_split[0][1].mutation other_muts = muts_split[1][1].mutation # print('splitting muts by mut_info:', muts_split) @@ -679,7 +692,7 @@ else: #%% # !!! IMPORTANT !!!! # sanity check: There should not be any common muts -# i.e the same mutation cannot be classed as a 'drug' AND 'others' +# i.e the same mutation cannot be classed as a drug AND 'others' if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0: print('WARNING: Ambiguous muts detected in dr_ and other_ mutation category' , '\n===============================================================') @@ -695,8 +708,8 @@ if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0: , '\nTotal no. 
@@ -679,7 +692,7 @@ else:
 #%%
 # !!! IMPORTANT !!!!
 # sanity check: There should not be any common muts
-# i.e the same mutation cannot be classed as a 'drug' AND 'others'
+# i.e the same mutation cannot be classed as a drug AND 'others'
 if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
     print('WARNING: Ambiguous muts detected in dr_ and other_ mutation category'
           , '\n===============================================================')
@@ -695,8 +708,8 @@ if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
           , '\nTotal no. of samples in dr_muts present in other_muts:', dr_muts.isin(other_muts).sum()
           , '\nThese are:\n', dr_muts[dr_muts.isin(other_muts)]
           , '\n========================================================='
-          , '\nTotal no. of samples in other_muts present in dr_muts:', other_muts.isin(dr_muts).sum(),
-          , '\nThese are:\n', other_muts[other_muts.isin(dr_muts)],
+          , '\nTotal no. of samples in other_muts present in dr_muts:', other_muts.isin(dr_muts).sum()
+          , '\nThese are:\n', other_muts[other_muts.isin(dr_muts)]
           , '\n=========================================================')
 else:
     print('Error: ambiguous muts present, but extraction failed. Debug!'
           , '\n=========================================================')
@@ -706,22 +719,22 @@ print('Counting no. of ambiguous muts...')
 if dr_muts[dr_muts.isin(other_muts)].nunique() == other_muts[other_muts.isin(dr_muts)].nunique():
     common_muts = dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist()
-    print('Distinct no. of ambigiuous muts detected:'+ str(len(common_muts)),
-          'list of ambiguous mutations (see below):', *common_muts, sep = '\n'
-          , '\n=========================================================')
+    print('Distinct no. of ambiguous muts detected:'+ str(len(common_muts))
+          , '\nlist of ambiguous mutations (see below):', *common_muts, sep = '\n')
+    print('\n===========================================================')
 else:
     print('Error: ambiguous muts detected, but extraction failed. Debug!'
-          , '\nNo. of ambiguous muts in dr:', len(dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist())
-          , '\nNo. of ambiguous muts in other:', len(other_muts[other_muts.isin(dr_muts)].value_counts().keys().tolist())
+          , '\nNo. of ambiguous muts in dr:'
+          , len(dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist())
+          , '\nNo. of ambiguous muts in other:'
+          , len(other_muts[other_muts.isin(dr_muts)].value_counts().keys().tolist())
          , '\n=========================================================')
 #%% clear variables
-del(id_dr, id_other, meta_data, meta_pnca_dr, meta_pnca_other, mut_grouped, muts_split, other_WF1, other_df, other_muts_df, other_pnca_count, pnca_LF0, pnca_na)
+del(id_dr, id_other, meta_data, meta_gene_dr, meta_gene_other, mut_grouped, muts_split, other_WF1, other_df, other_muts_df, other_gene_count, gene_LF0, gene_na)
-del(c1, c2, col_to_split1, col_to_split2, comp_pnca_samples, dr_WF0, dr_df, dr_muts_df, dr_pnca_WF0, dr_pnca_count, expected_pnca_samples, other_pnca_WF1)
+del(c1, c2, col_to_split1, col_to_split2, comp_gene_samples, dr_WF0, dr_df, dr_muts_df, dr_gene_WF0, dr_gene_count, expected_gene_samples, other_gene_WF1)
-#%% end of data extraction and some files writing. Below are some more files writing.
-
 #%%: write file: ambiguous muts
 # uncomment as necessary
 #print(outdir)
@@ -734,8 +747,8 @@ print('Writing file: ambiguous muts',
      '\nFilename:', out_filename1,
      '\nPath:', outdir)
-#common_muts = ['pncA_p.Val180Phe','pncA_p.Gln10Pro'] # test
-inspect = pnca_LF1[pnca_LF1['mutation'].isin(common_muts)]
+#common_muts = ['gene_matchVal180Phe','gene_matchGln10Pro'] # test
+inspect = gene_LF1[gene_LF1['mutation'].isin(common_muts)]
 inspect.to_csv(outfile1)
 print('Finished writing:', out_filename1
@@ -746,22 +759,33 @@ print('Finished writing:', out_filename1
      , '\n=============================================================')
 del(out_filename1)
-
-#%% read aa dict and pull relevant info
-print('Reading aa dict and fetching1 letter aa code'
+#%% end of data extraction and some files writing. Below are some more files writing.
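One note on the unchanged guard `if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:` — `&` binds tighter than `>`, so this evaluates `(count_a & count_b) > 0`, a bitwise AND of the two counts; counts of 2 and 1, say, give `2 & 1` equal to `0` and the warning branch is skipped even though both counts are non-zero. Writing `(count_a > 0) and (count_b > 0)` states the intent unambiguously. The two-way `isin()` idea itself, sketched on toy Series (mutation names borrowed from the commented-out test list above):

```
import pandas as pd

dr_muts = pd.Series(['pncA_p.Val180Phe', 'pncA_p.Gln10Pro', 'pncA_p.Val180Phe'])
other_muts = pd.Series(['pncA_p.Val180Phe', 'pncA_p.Gln10Pro'])

print('dr rows also present in other:', dr_muts.isin(other_muts).sum())    # 3
print('other rows also present in dr:', other_muts.isin(dr_muts).sum())    # 2

# distinct ambiguous muts, as in the script
common_muts = dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist()
print('ambiguous muts:', common_muts)
```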
+#=============================================================================
+#%% Formatting df: read aa dict and pull relevant info
+print('Now some more formatting:'
+      , '\nread aa dict and pull relevant info'
+      , '\nformat mutations:'
+      , '\nsplit mutation into mCSM style muts: '
       , '\nFormatting mutation in mCSM style format: {WT}{MUT}'
-      , '\nAdding aa properties'
-      , '\n============================================================')
-
+      , '\nassign aa properties: adding 2 cols at a time for each prop'
+      , '\n===================================================================')
+
+# BEWARE hardcoding : only works as we are adding aa prop once for wt and once for mut
+# in each lookup cycle
+ncol_mutf_add = 3 # mut split into 3 cols
+ncol_aa_add = 2 # 2 aa prop add (wt & mut) in each mapping
+
 #===========
 # Split 'mutation' column into three: wild_type, position and
 # mutant_type separately. Then map three letter code to one using
-# reference_dict.
-# First: Import reference dict
-# Second: convert to mutation to lowercase for compatibility with dict
+# reference_dict imported at the beginning.
+# After importing, convert the mutation to lowercase for compatibility with dict
 #===========
-pnca_LF1['mutation'] = pnca_LF1.loc[:, 'mutation'].str.lower()
+gene_LF1['mutation'] = gene_LF1.loc[:, 'mutation'].str.lower()
+gene_regex = gene_match.lower()+'(\w{3})'
+print('gene regex being used:', gene_regex)
+mylen0 = len(gene_LF1.columns)
 #=======
 # Iterate through the dict, create a lookup dict i.e
 # lookup_dict = {three_letter_code: one_letter_code}.
@@ -770,17 +794,47 @@ pnca_LF1['mutation'] = pnca_LF1.loc[:, 'mutation'].str.lower()
 # The three letter code is extracted using a string match from the dataframe and then converted
 # to 'pandas series' since map only works in pandas series
 #=======
+print('Adding', ncol_mutf_add, 'more cols:\n')
+# initialise a sub dict that is lookup dict for three letter code to 1-letter code
+# adding three more cols
 lookup_dict = dict()
 for k, v in my_aa_dict.items():
     lookup_dict[k] = v['one_letter_code']
-    wt = pnca_LF1['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on
-    pnca_LF1['wild_type'] = wt.map(lookup_dict)
-    mut = pnca_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze()
-    pnca_LF1['mutant_type'] = mut.map(lookup_dict)
+#   wt = gene_LF1['mutation'].str.extract('gene_p.(\w{3})').squeeze() # converts to a series that map works on
+    wt = gene_LF1['mutation'].str.extract(gene_regex).squeeze()
+    gene_LF1['wild_type'] = wt.map(lookup_dict)
+    mut = gene_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze()
+    gene_LF1['mutant_type'] = mut.map(lookup_dict)
 # extract position info from mutation column separately using string match
-pnca_LF1['position'] = pnca_LF1['mutation'].str.extract(r'(\d+)')
+gene_LF1['position'] = gene_LF1['mutation'].str.extract(r'(\d+)')
+
+mylen1 = len(gene_LF1.columns)
+
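The extract-and-map cycle above, shown on a toy two-row Series. The miniature `my_aa_dict` only assumes the shape produced by reference_dict.py (lowercase three-letter keys with a 'one_letter_code' sub-key); the one-letter values used here are illustrative, the real ones come from aa_codes.csv.

```
import pandas as pd

my_aa_dict = {'val': {'one_letter_code': 'V'}, 'phe': {'one_letter_code': 'F'},
              'gln': {'one_letter_code': 'Q'}, 'pro': {'one_letter_code': 'P'}}
lookup_dict = {k: v['one_letter_code'] for k, v in my_aa_dict.items()}

mutation = pd.Series(['pncA_p.Val180Phe', 'pncA_p.Gln10Pro']).str.lower()
gene_regex = 'pnca_p.' + r'(\w{3})'

wt = mutation.str.extract(gene_regex).squeeze()        # ['val', 'gln']
mut = mutation.str.extract(r'\d+(\w{3})$').squeeze()   # ['phe', 'pro']
position = mutation.str.extract(r'(\d+)')              # ['180', '10']

print(wt.map(lookup_dict).tolist(),                    # ['V', 'Q']
      mut.map(lookup_dict).tolist(),                   # ['F', 'P']
      position[0].tolist())                            # ['180', '10']
```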
+# sanity checks
+print('checking if 3-letter wt&mut residue extraction worked correctly')
+if wt.isna().sum() == 0 and mut.isna().sum() == 0:
+    print('PASS: 3-letter wt&mut residue extraction worked correctly:'
+          , '\nNo NAs detected:'
+          , '\nwild-type\n', wt
+          , '\nmutant-type\n', mut
+          , '\ndim of df:', gene_LF1.shape)
+else:
+    print('FAIL: 3-letter wt&mut residue extraction failed'
+          , '\nNAs detected:'
+          , '\nwild-type\n', wt
+          , '\nmutant-type\n', mut
+          , '\ndim of df:', gene_LF1.shape)
+
+if mylen1 == mylen0 + ncol_mutf_add:
+    print('PASS: successfully added', ncol_mutf_add, 'cols'
+          , '\nold length:', mylen0
+          , '\nnew len:', mylen1)
+else:
+    print('FAIL: failed to add cols:'
+          , '\nold length:', mylen0
+          , '\nnew len:', mylen1)
 # clear variables
 del(k, v, wt, mut, lookup_dict)
@@ -790,18 +844,45 @@ del(k, v, wt, mut, lookup_dict)
 # lookup_dict = {three_letter_code: aa_prop_water}
 # Do this for both wild_type and mutant as above.
 #=========
-# initialise a sub dict that is lookup dict for three letter code to aa prop
-lookup_dict = dict()
+print('Adding', ncol_aa_add, 'more cols:\n')
+# initialise a sub dict that is lookup dict for three letter code to aa prop
+# adding two more cols
+lookup_dict = dict()
 for k, v in my_aa_dict.items():
     lookup_dict[k] = v['aa_prop_water']
     #print(lookup_dict)
-    wt = pnca_LF1['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on
-    pnca_LF1['wt_prop_water'] = wt.map(lookup_dict)
-    mut = pnca_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze()
-    pnca_LF1['mut_prop_water'] = mut.map(lookup_dict)
-
-# added two more cols
+#   wt = gene_LF1['mutation'].str.extract('gene_p.(\w{3})').squeeze() # converts to a series that map works on
+    wt = gene_LF1['mutation'].str.extract(gene_regex).squeeze()
+    gene_LF1['wt_prop_water'] = wt.map(lookup_dict)
+    mut = gene_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze()
+    gene_LF1['mut_prop_water'] = mut.map(lookup_dict)
+
+mylen2 = len(gene_LF1.columns)
+
+# sanity checks
+print('checking if 3-letter wt&mut residue extraction worked correctly')
+if wt.isna().sum() == 0 and mut.isna().sum() == 0:
+    print('PASS: 3-letter wt&mut residue extraction worked correctly:'
+          , '\nNo NAs detected:'
+          , '\nwild-type\n', wt
+          , '\nmutant-type\n', mut
+          , '\ndim of df:', gene_LF1.shape)
+else:
+    print('FAIL: 3-letter wt&mut residue extraction failed'
+          , '\nNAs detected:'
+          , '\nwild-type\n', wt
+          , '\nmutant-type\n', mut
+          , '\ndim of df:', gene_LF1.shape)
+
+if mylen2 == mylen1 + ncol_aa_add:
+    print('PASS: successfully added', ncol_aa_add, 'cols'
+          , '\nold length:', mylen1
+          , '\nnew len:', mylen2)
+else:
+    print('FAIL: failed to add cols:'
+          , '\nold length:', mylen1
+          , '\nnew len:', mylen2)
 # clear variables
 del(k, v, wt, mut, lookup_dict)
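The water-property cycle above is repeated below for polarity and calcprop with only the dict key and the two new column names changing. If this section ever gets refactored, a helper along these lines would keep each lookup cycle to one call — purely a sketch, not part of the script, and the function name and parameters are made up:

```
def add_aa_prop(df, my_aa_dict, prop, wt_col, mut_col, gene_regex):
    """One lookup cycle: map an aa property onto wild-type and mutant residues."""
    lookup_dict = {k: v[prop] for k, v in my_aa_dict.items()}
    wt = df['mutation'].str.extract(gene_regex).squeeze()
    mut = df['mutation'].str.extract(r'\d+(\w{3})$').squeeze()
    df[wt_col] = wt.map(lookup_dict)
    df[mut_col] = mut.map(lookup_dict)
    return df

# e.g.
# gene_LF1 = add_aa_prop(gene_LF1, my_aa_dict, 'aa_prop_water', 'wt_prop_water', 'mut_prop_water', gene_regex)
# gene_LF1 = add_aa_prop(gene_LF1, my_aa_dict, 'aa_prop_polarity', 'wt_prop_polarity', 'mut_prop_polarity', gene_regex)
```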
@@ -811,19 +892,92 @@ del(k, v, wt, mut, lookup_dict)
 # lookup_dict = {three_letter_code: aa_prop_polarity}
 # Do this for both wild_type and mutant as above.
 #=========
-# initialise a sub dict that is lookup dict for three letter code to aa prop
-lookup_dict = dict()
+print('Adding', ncol_aa_add, 'more cols:\n')
+# initialise a sub dict that is lookup dict for three letter code to aa prop
+# adding two more cols
+lookup_dict = dict()
 for k, v in my_aa_dict.items():
     lookup_dict[k] = v['aa_prop_polarity']
     #print(lookup_dict)
-    wt = pnca_LF1['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on
-    pnca_LF1['wt_prop_polarity'] = wt.map(lookup_dict)
-    mut = pnca_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze()
-    pnca_LF1['mut_prop_polarity'] = mut.map(lookup_dict)
-
-# added two more cols
+#   wt = gene_LF1['mutation'].str.extract('gene_p.(\w{3})').squeeze() # converts to a series that map works on
    wt = gene_LF1['mutation'].str.extract(gene_regex).squeeze()
+    gene_LF1['wt_prop_polarity'] = wt.map(lookup_dict)
+    mut = gene_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze()
+    gene_LF1['mut_prop_polarity'] = mut.map(lookup_dict)
+mylen3 = len(gene_LF1.columns)
+
+# sanity checks
+print('checking if 3-letter wt&mut residue extraction worked correctly')
+if wt.isna().sum() == 0 and mut.isna().sum() == 0:
+    print('PASS: 3-letter wt&mut residue extraction worked correctly:'
+          , '\nNo NAs detected:'
+          , '\nwild-type\n', wt
+          , '\nmutant-type\n', mut
+          , '\ndim of df:', gene_LF1.shape)
+else:
+    print('FAIL: 3-letter wt&mut residue extraction failed'
+          , '\nNAs detected:'
+          , '\nwild-type\n', wt
+          , '\nmutant-type\n', mut
+          , '\ndim of df:', gene_LF1.shape)
+
+if mylen3 == mylen2 + ncol_aa_add:
+    print('PASS: successfully added', ncol_aa_add, 'cols'
+          , '\nold length:', mylen2
+          , '\nnew len:', mylen3)
+else:
+    print('FAIL: failed to add cols:'
+          , '\nold length:', mylen2
+          , '\nnew len:', mylen3)
+
+# clear variables
+del(k, v, wt, mut, lookup_dict)
+
+#========
+# iterate through the dict, create a lookup dict i.e
+# lookup_dict = {three_letter_code: aa_calcprop}
+# Do this for both wild_type and mutant as above.
+#=========
+print('Adding', ncol_aa_add, 'more cols:\n')
+
+lookup_dict = dict()
+for k, v in my_aa_dict.items():
+    lookup_dict[k] = v['aa_calcprop']
+    #print(lookup_dict)
+#   wt = gene_LF1['mutation'].str.extract('gene_p.(\w{3})').squeeze() # converts to a series that map works on
+    wt = gene_LF1['mutation'].str.extract(gene_regex).squeeze()
+    gene_LF1['wt_calcprop'] = wt.map(lookup_dict)
+    mut = gene_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze()
+    gene_LF1['mut_calcprop'] = mut.map(lookup_dict)
+
+mylen4 = len(gene_LF1.columns)
+
+# sanity checks
+print('checking if 3-letter wt&mut residue extraction worked correctly')
+if wt.isna().sum() == 0 and mut.isna().sum() == 0:
+    print('PASS: 3-letter wt&mut residue extraction worked correctly:'
+          , '\nNo NAs detected:'
+          , '\nwild-type\n', wt
+          , '\nmutant-type\n', mut
+          , '\ndim of df:', gene_LF1.shape)
+else:
+    print('FAIL: 3-letter wt&mut residue extraction failed'
+          , '\nNAs detected:'
+          , '\nwild-type\n', wt
+          , '\nmutant-type\n', mut
+          , '\ndim of df:', gene_LF1.shape)
+
+if mylen4 == mylen3 + ncol_aa_add:
+    print('PASS: successfully added', ncol_aa_add, 'cols'
+          , '\nold length:', mylen3
+          , '\nnew len:', mylen4)
+else:
+    print('FAIL: failed to add cols:'
+          , '\nold length:', mylen3
+          , '\nnew len:', mylen4)
+
 # clear variables
 del(k, v, wt, mut, lookup_dict)
@@ -833,56 +987,62 @@ del(k, v, wt, mut, lookup_dict)
 # Do this for both wild_type and mutant as above.
 # caution: taylor mapping will create a list within a column
 #=========
+#print('Adding', ncol_aa_add, 'more cols:\n')
 #lookup_dict = dict()
-
 #for k, v in my_aa_dict.items():
 #    lookup_dict[k] = v['aa_taylor']
-#    #print(lookup_dict)
-#    wt = pnca_LF1['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on
-#    pnca_LF1['wt_taylor'] = wt.map(lookup_dict)
-#    mut = pnca_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze()
-#    pnca_LF1['mut_taylor'] = mut.map(lookup_dict)
+#    #print(lookup_dict)
+#    wt = gene_LF1['mutation'].str.extract(gene_regex).squeeze()
+#    gene_LF1['wt_taylor'] = wt.map(lookup_dict)
+#    mut = gene_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze()
+#    gene_LF1['mut_taylor'] = mut.map(lookup_dict)
-# added two more cols
+#mylen5 = len(gene_LF1.columns)
+
+# sanity checks
+#print('checking if 3-letter wt&mut residue extraction worked correctly')
+#if wt.isna().sum() & mut.isna().sum() == 0:
+#    print('PASS: 3-letter wt&mut residue extraction worked correctly:'
+#          , '\nNo NAs detected:'
+#          , '\nwild-type\n', wt
+#          , '\nmutant-type\n', mut
+#          , '\ndim of df:', gene_LF1.shape)
+#else:
+#    print('FAIL: 3-letter wt&mut residue extraction failed'
+#          , '\nNo NAs detected:'
+#          , '\nwild-type\n', wt
+#          , '\nmutant-type\n', mut
+#          , '\ndim of df:', gene_LF1.shape)
+
+#if mylen5 == mylen4 + ncol_aa_add:
+#    print('PASS: successfully added', ncol_aa_add, 'cols'
+#          , '\nold length:', mylen4
+#          , '\nnew len:', mylen5)
+#else:
+#    print('FAIL: failed to add cols:'
+#          , '\nold length:', mylen4
+#          , '\nnew len:', mylen5)
 # clear variables
 #del(k, v, wt, mut, lookup_dict)
-#========
-# iterate through the dict, create a lookup dict that i.e
-# lookup_dict = {three_letter_code: aa_calcprop}
-# Do this for both wild_type and mutant as above.
-#=========
-lookup_dict = dict()
-
-for k, v in my_aa_dict.items():
-    lookup_dict[k] = v['aa_calcprop']
-    #print(lookup_dict)
-    wt = pnca_LF1['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on
-    pnca_LF1['wt_calcprop'] = wt.map(lookup_dict)
-    mut = pnca_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze()
-    pnca_LF1['mut_calcprop'] = mut.map(lookup_dict)
-
-# added two more cols
-# clear variables
-del(k, v, wt, mut, lookup_dict)
-
 ########
 # combine the wild_type+position+mutant_type columns to generate
 # Mutationinformation (matches mCSM output field)
 # Remember to use .map(str) for int col types to allow string concatenation
 #########
-pnca_LF1['Mutationinformation'] = pnca_LF1['wild_type'] + pnca_LF1.position.map(str) + pnca_LF1['mutant_type']
+gene_LF1['Mutationinformation'] = gene_LF1['wild_type'] + gene_LF1.position.map(str) + gene_LF1['mutant_type']
 print('Created column: Mutationinformation'
-      , '\n===============================================================')
+      , '\n====================================================================='
+      , gene_LF1.Mutationinformation.head(10))
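Why `.map(str)` matters here: a string Series and an integer Series cannot be concatenated directly. A toy illustration (using an int position column purely for the sake of the example):

```
import pandas as pd

df = pd.DataFrame({'wild_type': ['V', 'Q'], 'position': [180, 10], 'mutant_type': ['F', 'P']})

# df['wild_type'] + df['position'] + df['mutant_type']   # TypeError: str + int
df['Mutationinformation'] = df['wild_type'] + df['position'].map(str) + df['mutant_type']
print(df['Mutationinformation'].tolist())                 # ['V180F', 'Q10P']
```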
 #%% Write file: mCSM muts
-snps_only = pd.DataFrame(pnca_LF1['Mutationinformation'].unique())
+snps_only = pd.DataFrame(gene_LF1['Mutationinformation'].unique())
 snps_only.head()
 # assign column name
 snps_only.columns = ['Mutationinformation']
 # count how many positions this corresponds to
-pos_only = pd.DataFrame(pnca_LF1['position'].unique())
+pos_only = pd.DataFrame(gene_LF1['position'].unique())
 print('Checking NA in snps...') # should be 0
 if snps_only.Mutationinformation.isna().sum() == 0:
@@ -912,7 +1072,7 @@ print('Finished writing:', out_filename2
      , '\n=============================================================')
 del(out_filename2)
-#%% Write file: pnca_metadata (i.e pnca_LF1)
+#%% Write file: gene_metadata (i.e gene_LF1)
 # where each row has UNIQUE mutations NOT unique sample ids
 out_filename3 = gene.lower() + '_' + 'metadata.csv'
 outfile3 = outdir + '/' + out_filename3
@@ -921,15 +1081,15 @@ print('Writing file: LF formatted data'
      , '\nPath:', outdir
      , '\n============================================================')
-pnca_LF1.to_csv(outfile3, header = True, index = False)
+gene_LF1.to_csv(outfile3, header = True, index = False)
 print('Finished writing:', out_filename3
-      , '\nNo. of rows:', len(pnca_LF1)
-      , '\nNo. of cols:', len(pnca_LF1.columns)
+      , '\nNo. of rows:', len(gene_LF1)
+      , '\nNo. of cols:', len(gene_LF1.columns)
      , '\n=============================================================')
 del(out_filename3)
 #%% write file: mCSM style but with repetitions for MSA and logo plots
-all_muts_msa = pd.DataFrame(pnca_LF1['Mutationinformation'])
+all_muts_msa = pd.DataFrame(gene_LF1['Mutationinformation'])
 all_muts_msa.head()
 # assign column name
 all_muts_msa.columns = ['Mutationinformation']
@@ -978,7 +1138,7 @@ del(out_filename4)
 #%% write file for mutational positions
 # count how many positions this corresponds to
-pos_only = pd.DataFrame(pnca_LF1['position'].unique())
+pos_only = pd.DataFrame(gene_LF1['position'].unique())
 # assign column name
 pos_only.columns = ['position']
 # make sure dtype of column position is int or numeric and not string
diff --git a/meta_data_analysis/reference_dict.py b/scripts/reference_dict.py
similarity index 95%
rename from meta_data_analysis/reference_dict.py
rename to scripts/reference_dict.py
index 0461523..8087009 100644
--- a/meta_data_analysis/reference_dict.py
+++ b/scripts/reference_dict.py
@@ -23,34 +23,31 @@ homedir = os.path.expanduser('~')
 # set working dir
 #os.getcwd()
-#os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
+#os.chdir(homedir + '/git/LSHTM_analysis/scripts')
 #os.getcwd()
 #=======================================================================
 #%% variable assignment: input and output
-drug = 'pyrazinamide'
-gene = 'pncA'
-gene_match = gene + '_p.'
+#drug = 'pyrazinamide'
+#gene = 'pncA'
+#gene_match = gene + '_p.'
 #==========
 # data dir
 #==========
-#indir = 'git/Data/pyrazinamide/input/original'
 datadir = homedir + '/' + 'git/Data'
 #=======
 # input
 #=======
-indir = datadir + '/' + drug + 'input'
 in_filename = 'aa_codes.csv'
-infile = indir + '/' + in_filename
+infile = datadir + '/' + in_filename
 print('Input filename:', in_filename
-      , '\nInput path:', indir
+      , '\nInput path:', datadir
      , '\n============================================================')
 #=======
 # output: No output
 #=======
 #outdir = datadir + '/' + drug + '/' + 'output'
 #out_filename = ''
 #outfile = outdir + '/' + out_filename
@@ -76,6 +73,7 @@ my_aa = my_aa.set_index('three_letter_code_lower') #20, 5
 # using 'index' creates a dict of dicts
 # using 'records' creates a list of dicts
 my_aa_dict = my_aa.to_dict('index') #20, with 5 subkeys
+print('Printing my_aa_dict:', my_aa_dict.keys())
 #================================================
 # dict of aa with their corresponding properties
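On the `to_dict('index')` vs `to_dict('records')` comment in reference_dict.py, a two-row illustration of the difference (the real aa_codes.csv table has 20 rows and several property columns; the values here are placeholders):

```
import pandas as pd

my_aa = pd.DataFrame({'three_letter_code_lower': ['val', 'phe'],
                      'one_letter_code': ['V', 'F']}).set_index('three_letter_code_lower')

print(my_aa.to_dict('index'))
# {'val': {'one_letter_code': 'V'}, 'phe': {'one_letter_code': 'F'}}   <- dict of dicts, keyed by index
print(my_aa.to_dict('records'))
# [{'one_letter_code': 'V'}, {'one_letter_code': 'F'}]                 <- list of dicts, index dropped
```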