From 50ade050c22acd24601c09e4d97c0af6fb9ebadb Mon Sep 17 00:00:00 2001 From: tgttunstall <54280754+tgttunstall@users.noreply.github.com> Date: Tue, 14 Jan 2020 11:22:41 +0000 Subject: [PATCH 1/7] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 4e7f8f2..0001735 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,9 @@ mCSM Analysis ============= -This repo does mCSM analysis using Python, bash and R. +This repo does mCSM analysis using bash, python and R. -Requires an additional 'Data' directory. Batteries not included. +Requires an additional 'Data' directory. Batteries not included:-) ## Assumptions From ec37e3c1f6fcddcf692caf9c36ea0653a3baf4cf Mon Sep 17 00:00:00 2001 From: tgttunstall <54280754+tgttunstall@users.noreply.github.com> Date: Tue, 14 Jan 2020 11:29:13 +0000 Subject: [PATCH 2/7] Update README.md Updated README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0001735..84f8e1c 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ subdirs within this repo *.py mcsm\_analysis/ - / (generated by `meta_data_analysis/pnca_data_extraction.py`. To be replaced with command line args or config option "soon") + / scripts/ *.R *.py From 4f06e42ee42c199cbdb475b5df7e474e993e8bae Mon Sep 17 00:00:00 2001 From: Tanu Date: Wed, 22 Jan 2020 10:12:09 +0000 Subject: [PATCH 3/7] graphs for PS lineage dist for all and dr muts --- mcsm_analysis/pyrazinamide/scripts/.Rhistory | 509 +----------------- .../pyrazinamide/scripts/Header_TT.R | 31 +- .../pyrazinamide/scripts/combining_two_df.R | 4 +- .../scripts/plotting/lineage_dist_PS.R | 116 ++-- 4 files changed, 93 insertions(+), 567 deletions(-) diff --git a/mcsm_analysis/pyrazinamide/scripts/.Rhistory b/mcsm_analysis/pyrazinamide/scripts/.Rhistory index 8055405..0112d24 100644 --- a/mcsm_analysis/pyrazinamide/scripts/.Rhistory +++ b/mcsm_analysis/pyrazinamide/scripts/.Rhistory @@ -1,512 +1,7 @@ -########################### -# you need merged_df3 -# or -# merged_df3_comp -# since these have unique SNPs -# I prefer to use the merged_df3 -# because using the _comp dataset means -# we lose some muts and at this level, we should use -# as much info as available -########################### -# uncomment as necessary -#%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -my_df = merged_df3 -#my_df = merged_df3_comp -#%%%%%%%%%%%%%%%%%%%%%%%% -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) -# quick checks -colnames(my_df) -str(my_df) -########################### -# Data for bfactor figure -# PS average -# Lig average -########################### -head(my_df$Position) -head(my_df$ratioDUET) -# order data frame -df = my_df[order(my_df$Position),] -head(df$Position) -head(df$ratioDUET) -#*********** -# PS: average by position -#*********** -mean_DUET_by_position <- df %>% -group_by(Position) %>% -summarize(averaged.DUET = mean(ratioDUET)) -#*********** -# Lig: average by position -#*********** -mean_Lig_by_position <- df %>% -group_by(Position) %>% -summarize(averaged.Lig = mean(ratioPredAff)) -#*********** -# cbind:mean_DUET_by_position and mean_Lig_by_position -#*********** -combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position )) -# sanity check -# mean_PS_Lig_Bfactor -colnames(combined) -colnames(combined) = c("Position" -, "average_DUETR" -, "Position2" -, "average_PredAffR") -colnames(combined) -identical(combined$Position, combined$Position2) -n = which(colnames(combined) == "Position2"); n -combined_df = combined[,-n] -max(combined_df$average_DUETR) ; min(combined_df$average_DUETR) -max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR) -#============= -# output csv -#============ -outDir = "~/Data/pyrazinamide/input/processed/" -outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv") -print(paste0("Output file with path will be:","", outFile)) -head(combined_df$Position); tail(combined_df$Position) -write.csv(combined_df, outFile -, row.names = F) getwd() setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") getwd() -######################################################################## -# Installing and loading required packages # -######################################################################## source("../Header_TT.R") -#source("barplot_colour_function.R") -require(data.table) -require(dplyr) -######################################################################## -# Read file: call script for combining df for PS # -######################################################################## source("../combining_two_df.R") -########################### -# This will return: -# df with NA: -# merged_df2 -# merged_df3 -# df without NA: -# merged_df2_comp -# merged_df3_comp -########################### -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION -########################### -# you need merged_df3 -# or -# merged_df3_comp -# since these have unique SNPs -# I prefer to use the merged_df3 -# because using the _comp dataset means -# we lose some muts and at this level, we should use -# as much info as available -########################### -# uncomment as necessary -#%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -my_df = merged_df3 -#my_df = merged_df3_comp -#%%%%%%%%%%%%%%%%%%%%%%%% -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) -# quick checks -colnames(my_df) -str(my_df) -########################### -# Data for bfactor figure -# PS average -# Lig average -########################### -head(my_df$Position) -head(my_df$ratioDUET) -# order data frame -df = my_df[order(my_df$Position),] -head(df$Position) -head(df$ratioDUET) -#*********** -# PS: average by position -#*********** -mean_DUET_by_position <- df %>% -group_by(Position) %>% -summarize(averaged.DUET = mean(ratioDUET)) -#*********** -# Lig: average by position -#*********** -mean_Lig_by_position <- df %>% -group_by(Position) %>% -summarize(averaged.Lig = mean(ratioPredAff)) -#*********** -# cbind:mean_DUET_by_position and mean_Lig_by_position -#*********** -combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position )) -# sanity check -# mean_PS_Lig_Bfactor -colnames(combined) -colnames(combined) = c("Position" -, "average_DUETR" -, "Position2" -, "average_PredAffR") -colnames(combined) -identical(combined$Position, combined$Position2) -n = which(colnames(combined) == "Position2"); n -combined_df = combined[,-n] -max(combined_df$average_DUETR) ; min(combined_df$average_DUETR) -max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR) -#============= -# output csv -#============ -outDir = "~/git/Data/pyrazinamide/input/processed/" -outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv") -print(paste0("Output file with path will be:","", outFile)) -head(combined_df$Position); tail(combined_df$Position) -write.csv(combined_df, outFile -, row.names = F) -# read in pdb file complex1 -inDir = "~/git/Data/pyrazinamide/input/structure" -inFile = paste0(inDir, "complex1_no_water.pdb") -# read in pdb file complex1 -inDir = "~/git/Data/pyrazinamide/input/structure/" -inFile = paste0(inDir, "complex1_no_water.pdb") -complex1 = inFile -my_pdb = read.pdb(complex1 -, maxlines = -1 -, multi = FALSE -, rm.insert = FALSE -, rm.alt = TRUE -, ATOM.only = FALSE -, hex = FALSE -, verbose = TRUE) -######################### -#3: Read complex pdb file -########################## -source("Header_TT.R") -# list of 8 -my_pdb = read.pdb(complex1 -, maxlines = -1 -, multi = FALSE -, rm.insert = FALSE -, rm.alt = TRUE -, ATOM.only = FALSE -, hex = FALSE -, verbose = TRUE) -rm(inDir, inFile) -#====== end of script -inDir = "~/git/Data/pyrazinamide/input/structure/" -inFile = paste0(inDir, "complex1_no_water.pdb") -complex1 = inFile -complex1 = inFile -my_pdb = read.pdb(complex1 -, maxlines = -1 -, multi = FALSE -, rm.insert = FALSE -, rm.alt = TRUE -, ATOM.only = FALSE -, hex = FALSE -, verbose = TRUE) -inFile -inDir = "~/git/Data/pyrazinamide/input/structure/" -inFile = paste0(inDir, "complex1_no_water.pdb") -complex1 = inFile -#inFile2 = paste0(inDir, "complex2_no_water.pdb") -#complex2 = inFile2 -# list of 8 -my_pdb = read.pdb(complex1 -, maxlines = -1 -, multi = FALSE -, rm.insert = FALSE -, rm.alt = TRUE -, ATOM.only = FALSE -, hex = FALSE -, verbose = TRUE) -rm(inDir, inFile, complex1) -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts") -getwd() -source("Header_TT.R") -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts") -getwd() -######################################################################## -# Installing and loading required packages # -######################################################################## -source("Header_TT.R") -######################################################### -# TASK: replace B-factors in the pdb file with normalised values -# use the complex file with no water as mCSM lig was -# performed on this file. You can check it in the script: read_pdb file. -######################################################### -########################### -# 2: Read file: average stability values -# or mcsm_normalised file, output of step 4 mcsm pipeline -########################### -inDir = "~/git/Data/pyrazinamide/input/processed/" -inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile -my_df <- read.csv(inFile -# , row.names = 1 -# , stringsAsFactors = F -, header = T) -str(my_df) -source("read_pdb.R") # list of 8 -# extract atom list into a variable -# since in the list this corresponds to data frame, variable will be a df -d = my_pdb[[1]] -# make a copy: required for downstream sanity checks -d2 = d -# sanity checks: B factor -max(d$b); min(d$b) -par(oma = c(3,2,3,0) -, mar = c(1,3,5,2) -, mfrow = c(3,2)) -#par(mfrow = c(3,2)) -#1: Original B-factor -hist(d$b -, xlab = "" -, main = "B-factor") -plot(density(d$b) -, xlab = "" -, main = "B-factor") -# 2: DUET scores -hist(my_df$average_DUETR -, xlab = "" -, main = "Norm_DUET") -plot(density(my_df$average_DUETR) -, xlab = "" -, main = "Norm_DUET") -# Set the margin on all sides -par(oma = c(3,2,3,0) -, mar = c(1,3,5,2) -, mfrow = c(3,2)) -#par(mfrow = c(3,2)) -#1: Original B-factor -hist(d$b -, xlab = "" -, main = "B-factor") -plot(density(d$b) -, xlab = "" -, main = "B-factor") -# 2: DUET scores -hist(my_df$average_DUETR -, xlab = "" -, main = "Norm_DUET") -plot(density(my_df$average_DUETR) -, xlab = "" -, main = "Norm_DUET") -#========= -# step 1_P1 -#========= -# Be brave and replace in place now (don't run sanity check) -# this makes all the B-factor values in the non-matched positions as NA -d$b = my_df$average_DUETR[match(d$resno, my_df$Position)] -#========= -# step 2_P1 -#========= -# count NA in Bfactor -b_na = sum(is.na(d$b)) ; b_na -# count number of 0's in Bactor -sum(d$b == 0) -# replace all NA in b factor with 0 -d$b[is.na(d$b)] = 0 -# sanity check: should be 0 -sum(is.na(d$b)) -# sanity check: should be True -if (sum(d$b == 0) == b_na){ -print ("Sanity check passed: NA's replaced with 0's successfully") -} else { -print("Error: NA replacement NOT successful, Debug code!") -} -max(d$b); min(d$b) -# sanity checks: should be True -if(max(d$b) == max(my_df$average_DUETR)){ -print("Sanity check passed: B-factors replaced correctly") -} else { -print ("Error: Debug code please") -} -if (min(d$b) == min(my_df$average_DUETR)){ -print("Sanity check passed: B-factors replaced correctly") -} else { -print ("Error: Debug code please") -} -#========= -# step 3_P1 -#========= -# sanity check: dim should be same before reassignment -# should be TRUE -dim(d) == dim(d2) -#========= -# step 4_P1 -#========= -# assign it back to the pdb file -my_pdb[[1]] = d -max(d$b); min(d$b) -#========= -# step 5_P1 -#========= -# output dir -getwd() -outDir = "~/git/Data/pyrazinamide/output/" -getwd() -outFile = paste0(outDir, "complex1_BwithNormDUET.pdb") -outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile -outDir = "~/git/Data/pyrazinamide/input/structure" -outDir = "~/git/Data/pyrazinamide/input/structure/" -outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile -write.pdb(my_pdb, outFile) -hist(d$b -, xlab = "" -, main = "repalced-B") -plot(density(d$b) -, xlab = "" -, main = "replaced-B") -# graph titles -mtext(text = "Frequency" -, side = 2 -, line = 0 -, outer = TRUE) -mtext(text = "DUET_stability" -, side = 3 -, line = 0 -, outer = TRUE) -#========================================================= -# Processing P2: Replacing B values with PredAff Scores -#========================================================= -# clear workspace -rm(list = ls()) -#========================================================= -# Processing P2: Replacing B values with PredAff Scores -#========================================================= -# clear workspace -rm(list = ls()) -########################### -# 2: Read file: average stability values -# or mcsm_normalised file, output of step 4 mcsm pipeline -########################### -inDir = "~/git/Data/pyrazinamide/input/processed/" -inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile -my_df <- read.csv("../Data/mean_PS_Lig_Bfactor.csv" -# , row.names = 1 -# , stringsAsFactors = F -, header = T) -str(my_df) -#========================================================= -# Processing P2: Replacing B factor with mean ratioLig scores -#========================================================= -######################### -# 3: Read complex pdb file -# form the R script -########################## -source("read_pdb.R") # list of 8 -# extract atom list into a vari -inDir = "~/git/Data/pyrazinamide/input/processed/" -inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile -my_df <- read.csv(inFile -# , row.names = 1 -# , stringsAsFactors = F -, header = T) -str(my_df) -# extract atom list into a variable -# since in the list this corresponds to data frame, variable will be a df -d = my_pdb[[1]] -# make a copy: required for downstream sanity checks -d2 = d -# sanity checks: B factor -max(d$b); min(d$b) -par(oma = c(3,2,3,0) -, mar = c(1,3,5,2) -, mfrow = c(3,2)) -#par(mfrow = c(3,2)) -# 1: Original B-factor -hist(d$b -, xlab = "" -, main = "B-factor") -plot(density(d$b) -, xlab = "" -, main = "B-factor") -# 2: Pred Aff scores -hist(my_df$average_PredAffR -, xlab = "" -, main = "Norm_lig_average") -plot(density(my_df$average_PredAffR) -, xlab = "" -, main = "Norm_lig_average") -# 3: After the following replacement -#******************************** -par(oma = c(3,2,3,0) -, mar = c(1,3,5,2) -, mfrow = c(3,2)) -#par(mfrow = c(3,2)) -# 1: Original B-factor -hist(d$b -, xlab = "" -, main = "B-factor") -plot(density(d$b) -, xlab = "" -, main = "B-factor") -# 2: Pred Aff scores -hist(my_df$average_PredAffR -, xlab = "" -, main = "Norm_lig_average") -plot(density(my_df$average_PredAffR) -, xlab = "" -, main = "Norm_lig_average") -# 3: After the following replacement -#******************************** -#========= -# step 1_P2: BE BRAVE and replace in place now (don't run step 0) -#========= -# this makes all the B-factor values in the non-matched positions as NA -d$b = my_df$average_PredAffR[match(d$resno, my_df$Position)] -#========= -# step 2_P2 -#========= -# count NA in Bfactor -b_na = sum(is.na(d$b)) ; b_na -# count number of 0's in Bactor -sum(d$b == 0) -# replace all NA in b factor with 0 -d$b[is.na(d$b)] = 0 -# sanity check: should be 0 -sum(is.na(d$b)) -if (sum(d$b == 0) == b_na){ -print ("Sanity check passed: NA's replaced with 0's successfully") -} else { -print("Error: NA replacement NOT successful, Debug code!") -} -max(d$b); min(d$b) -# sanity checks: should be True -if (max(d$b) == max(my_df$average_PredAffR)){ -print("Sanity check passed: B-factors replaced correctly") -} else { -print ("Error: Debug code please") -} -if (min(d$b) == min(my_df$average_PredAffR)){ -print("Sanity check passed: B-factors replaced correctly") -} else { -print ("Error: Debug code please") -} -#========= -# step 3_P2 -#========= -# sanity check: dim should be same before reassignment -# should be TRUE -dim(d) == dim(d2) -#========= -# step 4_P2 -#========= -# assign it back to the pdb file -my_pdb[[1]] = d -max(d$b); min(d$b) -#========= -# step 5_P2 -#========= -write.pdb(my_pdb, "Plotting/structure/complex1_BwithNormLIG.pdb") -# output dir -getwd() -# output dir -outDir = "~/git/Data/pyrazinamide/input/structure/" -outFile = paste0(outDir, "complex1_BwithNormLIG.pdb") -outFile = paste0(outDir, "complex1_BwithNormLIG.pdb"); outFile -write.pdb(my_pdb, outFile) +source("../combining_two_df.R") +source("../combining_two_df.R") diff --git a/mcsm_analysis/pyrazinamide/scripts/Header_TT.R b/mcsm_analysis/pyrazinamide/scripts/Header_TT.R index 2069fa7..9eae42a 100644 --- a/mcsm_analysis/pyrazinamide/scripts/Header_TT.R +++ b/mcsm_analysis/pyrazinamide/scripts/Header_TT.R @@ -1,25 +1,31 @@ ######################################################### ### A) Installing and loading required packages ######################################################### +#lib_loc = "/usr/local/lib/R/site-library") #if (!require("gplots")) { # install.packages("gplots", dependencies = TRUE) # library(gplots) #} -if (!require("tidyverse")) { - install.packages("tidyverse", dependencies = TRUE) - library(tidyverse) -} +#if (!require("tidyverse")) { +# install.packages("tidyverse", dependencies = TRUE) +# library(tidyverse) +#} if (!require("ggplot2")) { install.packages("ggplot2", dependencies = TRUE) library(ggplot2) } +if (!require("plotly")) { + install.packages("plotly", dependencies = TRUE) + library(plotly) +} + if (!require("cowplot")) { install.packages("copwplot", dependencies = TRUE) - library(ggplot2) + library(cowplot) } if (!require("ggcorrplot")) { @@ -43,37 +49,33 @@ if (!require ("GOplot")) { } if(!require("VennDiagram")) { - install.packages("VennDiagram", dependencies = T) library(VennDiagram) } if(!require("scales")) { - install.packages("scales", dependencies = T) library(scales) } if(!require("plotrix")) { - install.packages("plotrix", dependencies = T) library(plotrix) } if(!require("stats")) { - install.packages("stats", dependencies = T) library(stats) } if(!require("stats4")) { - install.packages("stats4", dependencies = T) library(stats4) } if(!require("data.table")) { - library(stats4) +install.packages("data.table") + library(data.table) } if (!require("PerformanceAnalytics")){ @@ -98,18 +100,17 @@ if (!require ("psych")){ if (!require ("dplyr")){ install.packages("dplyr") - library(psych) + library(dplyr) } - if (!require ("compare")){ install.packages("compare") - library(psych) + library(compare) } if (!require ("arsenal")){ install.packages("arsenal") - library(psych) + library(arsenal) } diff --git a/mcsm_analysis/pyrazinamide/scripts/combining_two_df.R b/mcsm_analysis/pyrazinamide/scripts/combining_two_df.R index 57e476a..d0e13cd 100644 --- a/mcsm_analysis/pyrazinamide/scripts/combining_two_df.R +++ b/mcsm_analysis/pyrazinamide/scripts/combining_two_df.R @@ -11,7 +11,7 @@ getwd() # Installing and loading required packages # ######################################################################## -source("Header_TT.R") +#source("Header_TT.R") #require(data.table) #require(arsenal) #require(compare) @@ -286,7 +286,7 @@ outDir = "~/git/Data/pyrazinamide/output/" getwd() outFile1 = paste0(outDir, "merged_df3.csv"); outFile1 -write.csv(merged_df3, outFile1) +#write.csv(merged_df3, outFile1) #outFile2 = paste0(outDir, "merged_df3_comp.csv"); outFile2 #write.csv(merged_df3_comp, outFile2) diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R index cc7e15a..4c54681 100644 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R +++ b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R @@ -1,5 +1,5 @@ getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") # thinkpad +setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") getwd() ######################################################################## @@ -24,11 +24,11 @@ source("../combining_two_df.R") #========================== # This will return: -# df with NA: +# df with NA for pyrazinamide: # merged_df2 # merged_df3 -# df without NA: +# df without NA for pyrazinamide: # merged_df2_comp # merged_df3_comp #=========================== @@ -38,14 +38,17 @@ source("../combining_two_df.R") # you need merged_df2 or merged_df2_comp # since this is one-many relationship # i.e the same SNP can belong to multiple lineages +# using the _comp dataset means +# we lose some muts and at this level, we should use +# as much info as available, hence use df with NA ########################### # uncomment as necessary -#<<<<<<<<<<<<<<<<<<<<<<<<< +#!!!!!!!!!!!!!!!!!!!!!!! # REASSIGNMENT my_df = merged_df2 #my_df = merged_df2_comp -#<<<<<<<<<<<<<<<<<<<<<<<<< +#!!!!!!!!!!!!!!!!!!!!!!! # delete variables not required rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) @@ -59,12 +62,39 @@ is.factor(my_df$lineage) my_df$lineage = as.factor(my_df$lineage) is.factor(my_df$lineage) -table(my_df$mutation_info) +table(my_df$mutation_info); str(my_df$mutation_info) + +# subset df with dr muts only +my_df_dr = subset(my_df, mutation_info == "dr_mutations_pyrazinamide") ######################################################################## # end of data extraction and cleaning for plots # ######################################################################## +#========================== +# Data for plot: assign as +# necessary +#=========================== + +# uncomment as necessary +#!!!!!!!!!!!!!!!!!!!!!!! +# REASSIGNMENT + +#================== +# data for ALL muts +#================== +plot_df = my_df +my_plot_name = 'lineage_dist_PS.svg' +#my_plot_name = 'lineage_dist_PS_comp.svg' + +#======================= +# data for dr_muts ONLY +#======================= +#plot_df = my_df_dr +#my_plot_name = 'lineage_dist_dr_PS.svg' +#my_plot_name = 'lineage_dist_dr_PS_comp.svg' +#!!!!!!!!!!!!!!!!!!!!!!! + #========================== # Plot: Lineage Distribution # x = mcsm_values, y = dist @@ -74,6 +104,7 @@ table(my_df$mutation_info) #=================== # Data for plots #=================== +table(plot_df$lineage); str(plot_df$lineage) # subset only lineages1-4 sel_lineages = c("lineage1" @@ -82,34 +113,29 @@ sel_lineages = c("lineage1" , "lineage4") # uncomment as necessary -df_lin = subset(my_df, subset = lineage %in% sel_lineages ) +df_lin = subset(plot_df, subset = lineage %in% sel_lineages ) # refactor df_lin$lineage = factor(df_lin$lineage) table(df_lin$lineage) #{RESULT: No of samples within lineage} #lineage1 lineage2 lineage3 lineage4 -#104 1293 264 1311 - -# when merged_df2_comp is used -#lineage1 lineage2 lineage3 lineage4 -#99 1275 263 1255 length(unique(df_lin$Mutationinformation)) #{Result: No. of unique mutations the 4 lineages contribute to} # sanity checks r1 = 2:5 # when merged_df2 used: because there is missing lineages -if(sum(table(my_df$lineage)[r1]) == nrow(df_lin)) { +if(sum(table(plot_df$lineage)[r1]) == nrow(df_lin)) { print ("sanity check passed: numbers match") } else{ print("Error!: check your numbers") } -#<<<<<<<<<<<<<<<<<<<<<<<<< +#!!!!!!!!!!!!!!!!!!!!!!!!! # REASSIGNMENT df <- df_lin -#<<<<<<<<<<<<<<<<<<<<<<<<< +#!!!!!!!!!!!!!!!!!!!!!!!!! rm(df_lin) @@ -117,8 +143,8 @@ rm(df_lin) # generate distribution plot of lineages #****************** # basic: could improve this! -library(plotly) -library(ggridges) +#library(plotly) +#library(ggridges) g <- ggplot(df, aes(x = ratioDUET)) + geom_density(aes(fill = DUET_outcome) @@ -129,64 +155,68 @@ g <- ggplot(df, aes(x = ratioDUET)) + ggplotly(g) # 2 : ggridges (good!) - my_ats = 15 # axis text size my_als = 20 # axis label size -fooNames=c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4') -names(fooNames)=c('lineage1', 'lineage2', 'lineage3', 'lineage4') +my_labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4') +names(my_labels) = c('lineage1', 'lineage2', 'lineage3', 'lineage4') # set output dir for plots getwd() setwd("~/git/Data/pyrazinamide/output/plots") getwd() -svg('lineage_dist_PS.svg') +# check plot name +my_plot_name -printFile = ggplot( df, aes(x = ratioDUET - , y = DUET_outcome) )+ +# output svg +svg(my_plot_name) +printFile = ggplot(df, aes(x = ratioDUET + , y = DUET_outcome))+ #printFile=geom_density_ridges_gradient( - geom_density_ridges_gradient( aes(fill = ..x..) + geom_density_ridges_gradient(aes(fill = ..x..) , scale = 3 , size = 0.3 ) + facet_wrap( ~lineage , scales = "free" # , switch = 'x' - , labeller = labeller(lineage = fooNames) ) + + , labeller = labeller(lineage = my_labels) ) + coord_cartesian( xlim = c(-1, 1) -# , ylim = c(0, 6) -# , clip = "off" - ) + - scale_fill_gradientn( colours = c("#f8766d", "white", "#00bfc4") +# , ylim = c(0, 6) +# , clip = "off" +) + + scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4") , name = "DUET" ) + - theme( axis.text.x = element_text( size = my_ats + theme(axis.text.x = element_text(size = my_ats , angle = 90 , hjust = 1 , vjust = 0.4) -# , axis.text.y = element_text( size = my_ats -# , angle = 0 -# , hjust = 1 -# , vjust = 0) +# , axis.text.y = element_text(size = my_ats +# , angle = 0 +# , hjust = 1 +# , vjust = 0) , axis.text.y = element_blank() , axis.title.x = element_blank() , axis.title.y = element_blank() , axis.ticks.y = element_blank() , plot.title = element_blank() - , strip.text = element_text(size=my_als) - , legend.text = element_text(size=10) - , legend.title = element_text(size=my_als) -# , legend.position = c(0.3, 0.8) -# , legend.key.height = unit(1, 'mm') - ) + , strip.text = element_text(size = my_als) + , legend.text = element_text(size = 10) + , legend.title = element_text(size = my_als) +# , legend.position = c(0.3, 0.8) +# , legend.key.height = unit(1, 'mm') + ) print(printFile) dev.off() -#=!=!=!=!=!=! -# COMMENT: When you look at all mutations, the lineage differences disappear... +#=!=!=!=!=!=!=! +# COMMENT: Not much differences in the distributions +# when using merged_df2 or merged_df2_comp. +# Also, the lineage differences disappear when looking at all muts # The pattern we are interested in is possibly only for dr_mutations -#=!=!=!=!=!=! +#=!=!=!=!=!=!=! #=================================================== # COMPARING DISTRIBUTIONS From bcf822d6e47b480ca7a3692ef826a99cdbc8f252 Mon Sep 17 00:00:00 2001 From: Tanu Date: Wed, 22 Jan 2020 11:34:59 +0000 Subject: [PATCH 4/7] updated lineage dist for LIG for consistency --- .../scripts/plotting/lineage_dist_LIG.R | 50 +++++++++++++------ 1 file changed, 35 insertions(+), 15 deletions(-) diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_LIG.R b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_LIG.R index ebbec76..e4e6972 100644 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_LIG.R +++ b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_LIG.R @@ -75,6 +75,29 @@ if (max(my_df$Dis_lig_Ang) < 10){ ######################################################################## # end of data extraction and cleaning for plots # ######################################################################## +#========================== +# Data for plot: assign as +# necessary +#=========================== + +# uncomment as necessary +#!!!!!!!!!!!!!!!!!!!!!!! +# REASSIGNMENT + +#================== +# data for ALL muts +#================== +plot_df = my_df +my_plot_name = 'lineage_dist_PS.svg' +#my_plot_name = 'lineage_dist_PS_comp.svg' + +#======================= +# data for dr_muts ONLY +#======================= +#plot_df = my_df_dr +#my_plot_name = 'lineage_dist_dr_PS.svg' +#my_plot_name = 'lineage_dist_dr_PS_comp.svg' +#!!!!!!!!!!!!!!!!!!!!!!! #========================== # Plot: Lineage Distribution @@ -117,10 +140,10 @@ if(sum(table(my_df$lineage)[r1]) == nrow(df_lin)) { print("Error!: check your numbers") } -#<<<<<<<<<<<<<<<<<<<<<<<<< +#!!!!!!!!!!!!!!!!!!!!!!!!! # REASSIGNMENT df <- df_lin -#<<<<<<<<<<<<<<<<<<<<<<<<< +#!!!!!!!!!!!!!!!!!!!!!!!!! rm(df_lin) @@ -131,15 +154,15 @@ rm(df_lin) library(plotly) library(ggridges) -fooNames = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4') -names(fooNames) = c('lineage1', 'lineage2', 'lineage3', 'lineage4') +my_labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4') +names(my_labels) = c('lineage1', 'lineage2', 'lineage3', 'lineage4') g <- ggplot(df, aes(x = ratioPredAff)) + geom_density(aes(fill = Lig_outcome) , alpha = 0.5) + facet_wrap( ~ lineage , scales = "free" - , labeller = labeller(lineage = fooNames) ) + + , labeller = labeller(lineage = my_labels) ) + coord_cartesian(xlim = c(-1, 1) # , ylim = c(0, 6) # , clip = "off" @@ -153,15 +176,18 @@ ggplotly(g) my_ats = 15 # axis text size my_als = 20 # axis label size -fooNames = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4') -names(fooNames) = c('lineage1', 'lineage2', 'lineage3', 'lineage4') +my_labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4') +names(my_labels) = c('lineage1', 'lineage2', 'lineage3', 'lineage4') # set output dir for plots getwd() setwd("~/git/Data/pyrazinamide/output/plots") getwd() -svg('lineage_dist_LIG.svg') +# check plot name +my_plot_name + +svg(my_plot_name) printFile = ggplot( df, aes(x = ratioPredAff , y = Lig_outcome) ) + @@ -172,7 +198,7 @@ printFile = ggplot( df, aes(x = ratioPredAff facet_wrap( ~lineage , scales = "free" # , switch = 'x' - , labeller = labeller(lineage = fooNames) ) + + , labeller = labeller(lineage = my_labels) ) + coord_cartesian( xlim = c(-1, 1) # , ylim = c(0, 6) # , clip = "off" @@ -202,12 +228,6 @@ printFile = ggplot( df, aes(x = ratioPredAff print(printFile) dev.off() - -#=!=!=!=!=!=! -# COMMENT: When you look at all mutations, the lineage differences disappear... -# The pattern we are interested in is possibly only for dr_mutations -#=!=!=!=!=!=! - #=================================================== # COMPARING DISTRIBUTIONS From 4d2d03f634a461ed1da08b653bf58f1a3c99fa21 Mon Sep 17 00:00:00 2001 From: Tanu Date: Wed, 22 Jan 2020 15:09:21 +0000 Subject: [PATCH 5/7] added coloured axis barplots --- mcsm_analysis/pyrazinamide/scripts/.Rhistory | 202 +++++++++++- .../plotting/barplots_subcolours_aa_PS.R | 292 ++++++++++++++++++ .../scripts/plotting/lineage_dist_PS.R | 2 +- .../scripts/plotting/subcols_axis.R | 205 ++++++++++++ 4 files changed, 697 insertions(+), 4 deletions(-) create mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_aa_PS.R create mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/subcols_axis.R diff --git a/mcsm_analysis/pyrazinamide/scripts/.Rhistory b/mcsm_analysis/pyrazinamide/scripts/.Rhistory index 0112d24..715e7a4 100644 --- a/mcsm_analysis/pyrazinamide/scripts/.Rhistory +++ b/mcsm_analysis/pyrazinamide/scripts/.Rhistory @@ -2,6 +2,202 @@ getwd() setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") getwd() source("../Header_TT.R") -source("../combining_two_df.R") -source("../combining_two_df.R") -source("../combining_two_df.R") +source("../barplot_colour_function.R") +############################################################ +# Output dir for plots +############################################################ +out_dir = "~/git/Data/pyrazinamide/output/plots" +source("subcols_axis.R") +table(mut_pos_cols$lab_bg) +#blue cornflowerblue green purple white yellow +#2 2 2 4 117 3 +sum( table(mut_pos_cols$lab_bg) ) == nrow(mut_pos_cols) # should be True +table(mut_pos_cols$lab_bg2) +#green white +#2 128 +sum( table(mut_pos_cols$lab_bg2) ) == nrow(mut_pos_cols) # should be True +table(mut_pos_cols$lab_fg) +#black white +#124 6 +sum( table(mut_pos_cols$lab_fg) ) == nrow(mut_pos_cols) # should be True +# very important! +my_axis_colours = mut_pos_cols$lab_fg +# now clear mut_pos_cols +rm(mut_pos_cols) +########################### +# 2: Plot: DUET scores +########################### +#========================== +# Plot 2: Barplot with scores (unordered) +# corresponds to DUET_outcome +# Stacked Barplot with colours: DUET_outcome @ position coloured by +# stability scores. This is a barplot where each bar corresponds +# to a SNP and is coloured by its corresponding DUET stability value. +# Normalised values (range between -1 and 1 ) to aid visualisation +# NOTE: since barplot plots discrete values, colour = score, so number of +# colours will be equal to the no. of unique normalised scores +# rather than a continuous scale +# will require generating the colour scale separately. +#============================ +# sanity checks +upos = unique(my_df$Position) +str(my_df$DUET_outcome) +colnames(my_df) +#=========================== +# Data preparation for plots +#=========================== +#!!!!!!!!!!!!!!!!! +# REASSIGNMENT +df <- my_df +#!!!!!!!!!!!!!!!!! +rm(my_df) +# sanity checks +# should be a factor +is.factor(df$DUET_outcome) +#TRUE +table(df$DUET_outcome) +#Destabilizing Stabilizing +#288 47 +# should be -1 and 1 +min(df$ratioDUET) +max(df$ratioDUET) +# sanity checks +# very important!!!! +tapply(df$ratioDUET, df$DUET_outcome, min) +#Destabilizing Stabilizing +#-1.0000000 0.01065719 +tapply(df$ratioDUET, df$DUET_outcome, max) +#Destabilizing Stabilizing +#-0.003875969 1.0000000 +# check unique values in normalised data +u = unique(df$ratioDUET) # 323 +# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# Run this section if rounding is to be used +# specify number for rounding +n = 3 +df$ratioDUETR = round(df$ratioDUET, n) # 335, 40 +u = unique(df$ratioDUETR) # 287 +# create an extra column called group which contains the "gp name and score" +# so colours can be generated for each unique values in this column +my_grp = df$ratioDUETR +df$group <- paste0(df$DUET_outcome, "_", my_grp, sep = "") # 335,41 +# Call the function to create the palette based on the group defined above +colours <- ColourPalleteMulti(df, "DUET_outcome", "my_grp") +my_title = "Protein stability (DUET)" +library(ggplot2) +# axis label size +my_xaxls = 13 +my_yaxls = 15 +# axes text size +my_xaxts = 15 +my_yaxts = 15 +# no ordering of x-axis according to frequency +g = ggplot(df, aes(factor(Position, ordered = T))) +g + +geom_bar(aes(fill = group), colour = "grey") + +scale_fill_manual( values = colours +, guide = 'none') + +theme( axis.text.x = element_text(size = my_xaxls +, angle = 90 +, hjust = 1 +, vjust = 0.4) +, axis.text.y = element_text(size = my_yaxls +, angle = 0 +, hjust = 1 +, vjust = 0) +, axis.title.x = element_text(size = my_xaxts) +, axis.title.y = element_text(size = my_yaxts ) ) + +labs(title = my_title +, x = "Position" +, y = "Frequency") +class(df$lab_bg) +# make this a named vector +# define cartesian coord +my_xlim = length(unique(df$Position)); my_xlim +# axis label size +my_xals = 15 +my_yals = 15 +# axes text size +my_xats = 15 +my_yats = 18 +# using geom_tile +g = ggplot(df, aes(factor(Position, ordered = T))) +g + +coord_cartesian(xlim = c(1, my_xlim) +, ylim = c(0, 6) +, clip = "off") + +geom_bar(aes(fill = group), colour = "grey") + +scale_fill_manual( values = colours +, guide = 'none') + +geom_tile(aes(,-0.8, width = 0.9, height = 0.85) +, fill = df$lab_bg) + +geom_tile(aes(,-1.2, width = 0.9, height = -0.2) +, fill = df$lab_bg2) + +# Here it's important to specify that your axis goes from 1 to max number of levels +theme( axis.text.x = element_text(size = my_xats +, angle = 90 +, hjust = 1 +, vjust = 0.4 +, colour = my_axis_colours) +, axis.text.y = element_text(size = my_yats +, angle = 0 +, hjust = 1 +, vjust = 0) +, axis.title.x = element_text(size = my_xals) +, axis.title.y = element_text(size = my_yals ) +, axis.ticks.x = element_blank() +) + +labs(title = my_title +, x = "Position" +, y = "Frequency") +class(df$lab_bg) +# make this a named vector +# define cartesian coord +my_xlim = length(unique(df$Position)); my_xlim +# axis label size +my_xals = 18 +my_yals = 18 +# axes text size +my_xats = 14 +my_yats = 18 +my_plot_name = "barplot_PS_acoloured.svg" +out_file = paste0(out_dir, "/", my_plot_name); outfile +svg(outfile, width = 26, height = 4) +svg(out_file, width = 26, height = 4) +# using geom_tile +g = ggplot(df, aes(factor(Position, ordered = T))) +outFile = g + +coord_cartesian(xlim = c(1, my_xlim) +, ylim = c(0, 6) +, clip = "off" +) + +geom_bar(aes(fill = group), colour = "grey") + +scale_fill_manual( values = colours +, guide = 'none') + +# geom_tile(aes(,-0.6, width = 0.9, height = 0.7) +# , fill = df$lab_bg) + +# geom_tile(aes(,-1, width = 0.9, height = 0.3) +# , fill = df$lab_bg2) + +geom_tile(aes(,-0.8, width = 0.9, height = 0.85) +, fill = df$lab_bg) + +geom_tile(aes(,-1.2, width = 0.9, height = -0.2) +, fill = df$lab_bg2) + +# Here it's important to specify that your axis goes from 1 to max number of levels +theme( axis.text.x = element_text(size = my_xats +, angle = 90 +, hjust = 1 +, vjust = 0.4 +, colour = my_axis_colours) +, axis.text.y = element_text(size = my_yats +, angle = 0 +, hjust = 1 +, vjust = 0) +, axis.title.x = element_text(size = my_xals) +, axis.title.y = element_text(size = my_yals ) +, axis.ticks.x = element_blank() +) + +labs(title = "" +, x = "Position" +, y = "Frequency") +print(outFile) +dev.off() diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_aa_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_aa_PS.R new file mode 100644 index 0000000..63be70a --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_aa_PS.R @@ -0,0 +1,292 @@ +getwd() +setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") +getwd() + +############################################################ +# 1: Installing and loading required packages and functions +############################################################ + +source("../Header_TT.R") +source("../barplot_colour_function.R") + +############################################################ +# Output dir for plots +############################################################ +out_dir = "~/git/Data/pyrazinamide/output/plots" + +############################################################ +# 2: call script the prepares the data with columns containing +# colours for axis labels +############################################################ + +source("subcols_axis.R") + +# this should return +#mut_pos_cols: 130, 4 +#my_df: 335, 39 + +# clear excess variable +# "mut_pos_cols" is just for inspection in case you need to cross check +# position numbers and colours +# open file from deskptop ("sample_axis_cols") for cross checking + +table(mut_pos_cols$lab_bg) + +sum( table(mut_pos_cols$lab_bg) ) == nrow(mut_pos_cols) # should be True + +table(mut_pos_cols$lab_bg2) + +sum( table(mut_pos_cols$lab_bg2) ) == nrow(mut_pos_cols) # should be True + +table(mut_pos_cols$lab_fg) + +sum( table(mut_pos_cols$lab_fg) ) == nrow(mut_pos_cols) # should be True + +# very important! +my_axis_colours = mut_pos_cols$lab_fg + +# now clear mut_pos_cols +rm(mut_pos_cols) + +########################### +# 2: Plot: DUET scores +########################### +#========================== +# Plot 2: Barplot with scores (unordered) +# corresponds to DUET_outcome +# Stacked Barplot with colours: DUET_outcome @ position coloured by +# stability scores. This is a barplot where each bar corresponds +# to a SNP and is coloured by its corresponding DUET stability value. +# Normalised values (range between -1 and 1 ) to aid visualisation +# NOTE: since barplot plots discrete values, colour = score, so number of +# colours will be equal to the no. of unique normalised scores +# rather than a continuous scale +# will require generating the colour scale separately. +#============================ +# sanity checks +upos = unique(my_df$Position) + +str(my_df$DUET_outcome) + +colnames(my_df) + +#=========================== +# Data preparation for plots +#=========================== +#!!!!!!!!!!!!!!!!! +# REASSIGNMENT +df <- my_df +#!!!!!!!!!!!!!!!!! + +rm(my_df) + +# sanity checks +# should be a factor +is.factor(df$DUET_outcome) +#TRUE + +table(df$DUET_outcome) + +# should be -1 and 1 +min(df$ratioDUET) +max(df$ratioDUET) + +# sanity checks +# very important!!!! +tapply(df$ratioDUET, df$DUET_outcome, min) + +tapply(df$ratioDUET, df$DUET_outcome, max) + +# My colour FUNCTION: based on group and subgroup +# in my case; +# df = df +# group = DUET_outcome +# subgroup = normalised score i.e ratioDUET + +# Prepare data: round off ratioDUET scores +# round off to 3 significant digits: +# 323 if no rounding is performed: used to generate the original graph +# 287 if rounded to 3 places +# FIXME: check if reducing precicion creates any ML prob + +# check unique values in normalised data +u = unique(df$ratioDUET) + +# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# Run this section if rounding is to be used +# specify number for rounding +n = 3 +df$ratioDUETR = round(df$ratioDUET, n) +u = unique(df$ratioDUETR) + +# create an extra column called group which contains the "gp name and score" +# so colours can be generated for each unique values in this column +my_grp = df$ratioDUETR +df$group <- paste0(df$DUET_outcome, "_", my_grp, sep = "") + +# ELSE +# uncomment the below if rounding is not required + +#my_grp = df$ratioDUET +#df$group <- paste0(df$DUET_outcome, "_", my_grp, sep = "") + +# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +#****************** +# generate plot +#****************** + +# Call the function to create the palette based on the group defined above +colours <- ColourPalleteMulti(df, "DUET_outcome", "my_grp") +my_title = "Protein stability (DUET)" +library(ggplot2) + +# axis label size +my_xaxls = 13 +my_yaxls = 15 + +# axes text size +my_xaxts = 15 +my_yaxts = 15 + +# no ordering of x-axis according to frequency +g = ggplot(df, aes(factor(Position, ordered = T))) +g + + geom_bar(aes(fill = group), colour = "grey") + + scale_fill_manual( values = colours + , guide = 'none') + + theme( axis.text.x = element_text(size = my_xaxls + , angle = 90 + , hjust = 1 + , vjust = 0.4) + , axis.text.y = element_text(size = my_yaxls + , angle = 0 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = my_xaxts) + , axis.title.y = element_text(size = my_yaxts ) ) + + labs(title = my_title + , x = "Position" + , y = "Frequency") + +#======================== +# plot with axis colours +#======================== +class(df$lab_bg) +# make this a named vector + +# define cartesian coord +my_xlim = length(unique(df$Position)); my_xlim + +# axis label size +my_xals = 15 +my_yals = 15 + +# axes text size +my_xats = 15 +my_yats = 18 + +# using geom_tile +g = ggplot(df, aes(factor(Position, ordered = T))) +g + + coord_cartesian(xlim = c(1, my_xlim) + , ylim = c(0, 6) + , clip = "off") + + + geom_bar(aes(fill = group), colour = "grey") + + scale_fill_manual( values = colours + , guide = 'none') + + geom_tile(aes(,-0.8, width = 0.9, height = 0.85) + , fill = df$lab_bg) + + geom_tile(aes(,-1.2, width = 0.9, height = -0.2) + , fill = df$lab_bg2) + + + # Here it's important to specify that your axis goes from 1 to max number of levels + theme( axis.text.x = element_text(size = my_xats + , angle = 90 + , hjust = 1 + , vjust = 0.4 + , colour = my_axis_colours) + , axis.text.y = element_text(size = my_yats + , angle = 0 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = my_xals) + , axis.title.y = element_text(size = my_yals ) + , axis.ticks.x = element_blank() + ) + + labs(title = my_title + , x = "Position" + , y = "Frequency") + +#======================== +# output plot as svg/png +#======================== +class(df$lab_bg) +# make this a named vector + +# define cartesian coord +my_xlim = length(unique(df$Position)); my_xlim + +# axis label size +my_xals = 18 +my_yals = 18 + +# axes text size +my_xats = 14 +my_yats = 18 + +# set output dir for plots +#getwd() +#setwd("~/git/Data/pyrazinamide/output/plots") +#getwd() + +plot_name = "barplot_PS_acoloured.svg" +my_plot_name = paste0(out_dir, "/", plot_name); my_plot_name + +svg(my_plot_name, width = 26, height = 4) + +g = ggplot(df, aes(factor(Position, ordered = T))) + +outFile = g + + coord_cartesian(xlim = c(1, my_xlim) + , ylim = c(0, 6) + , clip = "off" + ) + + + geom_bar(aes(fill = group), colour = "grey") + + scale_fill_manual( values = colours + , guide = 'none') + +# geom_tile(aes(,-0.6, width = 0.9, height = 0.7) +# , fill = df$lab_bg) + +# geom_tile(aes(,-1, width = 0.9, height = 0.3) +# , fill = df$lab_bg2) + + geom_tile(aes(,-0.8, width = 0.9, height = 0.85) + , fill = df$lab_bg) + + geom_tile(aes(,-1.2, width = 0.9, height = -0.2) + , fill = df$lab_bg2) + + +# Here it's important to specify that your axis goes from 1 to max number of levels + theme( axis.text.x = element_text(size = my_xats + , angle = 90 + , hjust = 1 + , vjust = 0.4 + , colour = my_axis_colours) + , axis.text.y = element_text(size = my_yats + , angle = 0 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = my_xals) + , axis.title.y = element_text(size = my_yals ) + , axis.ticks.x = element_blank() + ) + + labs(title = "" + , x = "Position" + , y = "Frequency") + + +print(outFile) +dev.off() + +# for sanity and good practice +#rm(df) diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R index 4c54681..453f0d3 100644 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R +++ b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R @@ -7,7 +7,7 @@ getwd() ######################################################################## source("../Header_TT.R") -#source("barplot_colour_function.R") +#source("../barplot_colour_function.R") #require(data.table) ######################################################################## diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/subcols_axis.R b/mcsm_analysis/pyrazinamide/scripts/plotting/subcols_axis.R new file mode 100644 index 0000000..c7fc436 --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/plotting/subcols_axis.R @@ -0,0 +1,205 @@ +getwd() +setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") +getwd() + +############################################################ +# 1: Installing and loading required packages and functions +############################################################ + +#source("../Header_TT.R") +#source("../barplot_colour_function.R") +#library(tidyverse) + +########################### +#2: Read file: normalised file, output of step 4 mcsm pipeline +########################### +#my_df <- read.csv("../../Data/mcsm_complex1_normalised.csv" +# , row.names = 1 +# , stringsAsFactors = F +# , header = T) + +# call script combining_df +source("../combining_two_df.R") + +#---------------------- PAY ATTENTION +# the above changes the working dir +# from Plotting to Scripts" +#---------------------- PAY ATTENTION + +#========================== +# This will return: + +# df with NA for pyrazinamide: +#merged_df2 +#merged_df2_comp + +# df without NA for pyrazinamide: +#merged_df3 +#merged_df3_comp +#========================== +########################### +# Data to choose: +# We will be using the small dfs +# to generate the coloured axis +########################### + +# uncomment as necessary +#!!!!!!!!!!!!!!!!!!!!!!! +# REASSIGNMENT +my_df = merged_df3 +#my_df = merged_df3_comp +#!!!!!!!!!!!!!!!!!!!!!!! + +# delete variables not required +rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) + +str(my_df) +my_df$Position +c1 = my_df[my_df$Mutationinformation == "L4S",] + +# order my_df by Position +my_df_o = my_df[order(my_df$Position),] +head(my_df_o$Position); tail(my_df_o$Position) + +c2 = my_df_o[my_df_o$Mutationinformation == "L4S",] + +# sanity check +if (sum(table(c1 == c2)) == ncol(my_df)){ + print ("Sanity check passsd") +}else{ + print ("Error!: Please debug your code") +} + +rm(my_df, c1, c2) + +# create a new df with unique position numbers and cols +Position = unique(my_df_o$Position) #130 +Position_cols = as.data.frame(Position) + +head(Position_cols) ; tail(Position_cols) + +# specify active site residues and bg colour +Position = c(49, 51, 57, 71 + , 8, 96, 138 + , 13, 68 + , 103, 137 + , 133, 134) #13 + +lab_bg = rep(c("purple" + , "yellow" + , "cornflowerblue" + , "blue" + , "green"), times = c(4, 3, 2, 2, 2) +) + +# second bg colour for active site residues +#lab_bg2 = rep(c("white" +# , "green" , "white", "green" +# , "white" +# , "white" +# , "white"), times = c(4 +# , 1, 1, 1 +# , 2 +# , 2 +# , 2) +#) + +# revised: leave the second box coloured as the first one incase there is no second colour +lab_bg2 = rep(c("purple" + , "green" , "yellow", "green" + , "cornflowerblue" + , "blue" + , "green"), times = c(4 + , 1, 1, 1 + , 2 + , 2 + , 2) +) + +# fg colour for labels for active site residues +#lab_fg = rep(c("white" +# , "black" +# , "black" +# , "white" +# , "black"), times = c(4, 3, 2, 2, 2)) + +# revised: make the purple ones black +# fg colour for labels for active site residues +lab_fg = rep(c("black" + , "black" + , "black" + , "white" + , "black"), times = c(4, 3, 2, 2, 2)) + +# combined df with active sites, bg and fg colours +aa_cols_ref = data.frame(Position + , lab_bg + , lab_bg2 + , lab_fg + , stringsAsFactors = F) #13, 4 + +str(Position_cols); class(Position_cols) +str(aa_cols_ref); class(aa_cols_ref) + +# since Position is int and numeric in the two dfs resp, +# converting numeric to int for consistency +aa_cols_ref$Position = as.integer(aa_cols_ref$Position) +class(aa_cols_ref$Position) + +#=========== +# Merge 1: merging Positions df (Position_cols) and +# active site cols (aa_cols_ref) +# linking column: "Position" +# This is so you can have colours defined for all 130 positions +#=========== +head(Position_cols$Position); head(aa_cols_ref$Position) + +mut_pos_cols = merge(Position_cols, aa_cols_ref + , by = "Position" + , all.x = TRUE) + +head(mut_pos_cols) +# replace NA's +# :column "lab_bg" with "white" +# : column "lab_fg" with "black" +mut_pos_cols$lab_bg[is.na(mut_pos_cols$lab_bg)] <- "white" +mut_pos_cols$lab_bg2[is.na(mut_pos_cols$lab_bg2)] <- "white" +mut_pos_cols$lab_fg[is.na(mut_pos_cols$lab_fg)] <- "black" +head(mut_pos_cols) + +#=========== +# Merge 2: Merge mut_pos_cols with mcsm df +# Now combined the 130 positions with aa colours with +# the mcsm_data +#=========== +# dfs to merge +df0 = my_df_o +df1 = mut_pos_cols + +# check the column on which merge will be performed +head(df0$Position); tail(df0$Position) +head(df1$Position); tail(df1$Position) + +# should now have 3 extra columns +my_df = merge(df0, df1 + , by = "Position" + , all.x = TRUE) + +# sanity check +my_df[my_df$Position == "49",] +my_df[my_df$Position == "13",] + +my_df$Position + +# clear variables +rm(aa_cols_ref + , df0 + , df1 + , my_df_o + , Position_cols + , lab_bg + , lab_bg2 + , lab_fg + , Position + ) + From c3c50f65f227aa58c1b1dabceb1c69b3e535f0da Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Thu, 23 Jan 2020 09:29:20 +0000 Subject: [PATCH 6/7] saving previous work from home pc --- .../__pycache__/reference_dict.cpython-37.pyc | Bin 2203 -> 2216 bytes meta_data_analysis/pnca_data_extraction.py | 227 ++++++++++-------- meta_data_analysis/reference_dict.py | 25 +- 3 files changed, 146 insertions(+), 106 deletions(-) diff --git a/meta_data_analysis/__pycache__/reference_dict.cpython-37.pyc b/meta_data_analysis/__pycache__/reference_dict.cpython-37.pyc index 1db3c8321b73129609403f35aa5ecc9a7fcd9421..3c936f99b195bc3763249c3d77aedeb5bb30dba3 100644 GIT binary patch delta 113 zcmbO&xI&QEiIeIiqIeS%;|q%N3*yTYOHzxX1Yq2P{G7z1%#zBe$$Qz-fQnVv=d&3BO)oN< Ie3JbV00TEBpa1{> delta 100 zcmZ1>I9rg{iIlF(1jEYWvKEJ@tl!&J@87&V!LZGqA)-o(WC vf};F_`0~V()S_DgFm6G9PGV7JN#(7{r`XbfiVfN4vl#;oEHa*ajr|e;wYDH+ diff --git a/meta_data_analysis/pnca_data_extraction.py b/meta_data_analysis/pnca_data_extraction.py index 0eb5aab..e05c39d 100755 --- a/meta_data_analysis/pnca_data_extraction.py +++ b/meta_data_analysis/pnca_data_extraction.py @@ -23,11 +23,7 @@ import pandas as pd #my_dir = os.path.expanduser('~/some_dir') #make sure mcsm_analysis/ exists #or specify the output directory - -#%% -#%% -#%% -#======================================================== +#==================================================== # TASK: extract ALL pncA mutations from GWAS data #======================================================== #%% @@ -40,83 +36,45 @@ os.getcwd() #%% from reference_dict import my_aa_dict #CHECK DIR STRUC THERE! #%% -#NOTE: Out_dir MUST exis -# User defined dir strpyrazinamide -drug = 'pyrazinamide' -gene = 'pnca' -out_dir = homedir + '/git/LSHTM_analysis/mcsm_analysis/' -# = out_dir + drug -data_dir = homedir + '/git/Data' +############# specify variables for input and output paths and filenames +drug = "pyrazinamide" +gene = "pnca" -if not os.path.exists(data_dir): - print('Error!', data_dir, 'does not exist. Please ensure it exists and contains the appropriate raw data') - os.makedirs(data_dir) +datadir = homedir + "/git/Data" +basedir = datadir + "/" + drug + "/input" + +# input +inpath = "/original" +in_filename = "/original_tanushree_data_v2.csv" +infile = basedir + inpath + in_filename +#print(infile) + +# output: several output files +# output filenames in respective sections at the time of outputting files +outpath = "/processed" +outdir = basedir + outpath +#print(outdir) + +if not os.path.exists(datadir): + print('Error!', datadir, 'does not exist. Please ensure it exists. Dir struc specified in README.md') + os.makedirs(datadir) die() -if not os.path.exists(out_dir): - print('Error!', out_dir, 'does not exist. Please create it') +if not os.path.exists(outdir): + print('Error!', outdir, 'does not exist.Please ensure it exists. Dir struc specified in README.md') exit() -#if not os.path.exists(work_dir): -# print('creating dir that does not exist', 'dir_name:', work_dir) -# os.makedirs(work_dir) else: print('Dir exists: Carrying on') -# now create sub dir structure within work_dir -# pyrazinamide/mcsm_analysis - -# we need three dir -# Data -# Scripts - # Plotting -# Results - # Plots - -# create a list of dir names -#dir_names = ['Data', 'Scripts', 'Results'] - - -#for i in dir_names: -# this_dir = (work_dir + '/' + i) -# if not os.path.exists(this_dir): -# print('creating dir that does not exist:', this_dir) -# os.makedirs(this_dir) -#else: -# print('Dir exists: Carrying on') - -# Create sub dirs -# 1) -# Scripts - # Plotting -#subdir_plotting = work_dir + '/Scripts/Plotting' -#if not os.path.exists(subdir_plotting): -# print('creating dir that does not exist:', subdir_plotting) -# os.makedirs(subdir_plotting) -#else: -# print('Dir exists: Carrying on') - -# 2) -# Results - # Plots -#subdir_plots = work_dir + '/Results/Plots' -#if not os.path.exists(subdir_plots): -# print('creating dir that does not exist:', subdir_plots) -# os.makedirs(subdir_plots) -#else: -# print('Dir exists: Carrying on') - -# clear varaibles -#del(dir_names, drug, i, subdir_plots, subdir_plotting) - -#exit() +################## end of variable assignment for input and output files #%% #============================================================================== ############ # STEP 1: Read file original_tanushree_data_v2.csv ############ -data_file = data_dir + '/input/original/original_tanushree_data_v2.csv' -meta_data = pd.read_csv(data_file, sep = ',') +#data_file = data_dir + '/input/original/original_tanushree_data_v2.csv' +meta_data = pd.read_csv(infile, sep = ',') # column names list(meta_data.columns) @@ -130,7 +88,7 @@ meta_data = meta_data[['id' , 'pyrazinamide' , 'dr_mutations_pyrazinamide' , 'other_mutations_pyrazinamide' - ]] + ]] #19265, 67 # checks total_samples = meta_data['id'].nunique() # 19265 @@ -143,7 +101,12 @@ meta_data.head() # equivalent of table in R # pyrazinamide counts -meta_data.pyrazinamide.value_counts() +meta_data.pyrazinamide.value_counts() #12511 +#0.0 10565 +#1.0 1946 {RESULT: No. of Resistant and Susceptible samples} + +clear variables +#del(basedir, datadir, inpath, infile) #%% ############ @@ -154,8 +117,17 @@ meta_data.pyrazinamide.value_counts() # (corresponding to a structure) # and drop the entries with NA ############# -meta_pza = meta_data.loc[meta_data.dr_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)] -meta_pza = meta_data.loc[meta_data.other_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)] +meta_pza = meta_data.loc[meta_data.dr_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)] +#2163 {RESULT: samples with dr_muts} +dr_id = meta_pza['id'].unique() + +meta_pza = meta_data.loc[meta_data.other_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)] +#526 (RESULT: samples with other_muts) +other_id = meta_pza['id'].unique() + +# FIXME: See if the sample ids are unique in each +# find any common IDs +dr_id.isin(other_id[1,1]) del(meta_pza) @@ -188,8 +160,9 @@ del(meta_pnca_other) # Now extract "all" mutations meta_pnca_all = meta_data_pnca.loc[meta_data_pnca.dr_mutations_pyrazinamide.str.contains('pncA_p.*') | meta_data_pnca.other_mutations_pyrazinamide.str.contains('pncA_p.*') ] +#2665, 8 -meta_pnca_all['id'].nunique() +meta_pnca_all['id'].nunique() {#RESULT: pnca mutations in ALL samples} pnca_samples = len(meta_pnca_all) pnca_na = meta_pnca_all['pyrazinamide'].isna().sum() comp_pnca_samples = pnca_samples - pnca_na @@ -468,8 +441,11 @@ meta_pnca_LF1['Mutationinformation'] = meta_pnca_LF1['wild_type'] + meta_pnca_LF #========= # Step 12a: all SNPs to run mCSM #========= -snps_only = pd.DataFrame(meta_pnca_LF1['Mutationinformation'].unique()) -pos_only = pd.DataFrame(meta_pnca_LF1['position'].unique()) +snps_only = pd.DataFrame(meta_pnca_LF1['Mutationinformation'].unique()) #336 +pos_only = pd.DataFrame(meta_pnca_LF1['position'].unique()) #131 + +# since 186 is not part of struc: it is one less +#FIXME! # assign meaningful colnames #snps_only.rename({0 : 'all_pnca_snps'}, axis = 1, inplace = True) @@ -480,24 +456,27 @@ snps_only.isna().sum() # should be 0 # specify variable name for output file gene = 'pnca' #drug = 'pyrazinamide' -my_fname1 = '_snps_' +my_fname1 = '_snps' nrows = len(snps_only) #output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/' #output_file_path = work_dir + '/Data/' -output_file_path = data_dir + '/input/processed/' + drug + '/' +#output_file_path = data_dir + '/input/processed/' + drug + '/' +print(outdir) -if not os.path.exists(output_file_path): - print( output_file_path, 'does not exist. Creating') - os.makedirs(output_file_path) +if not os.path.exists(outdir): + print( outdir, 'does not exist. Creating') + os.makedirs(outdir) exit() -output_filename = output_file_path + gene + my_fname1 + str(nrows) + '.csv' -print(output_filename) #<<<- check +#out_filename = gene + my_fname1 + str(nrows) + '.csv' +out_filename = '/' + gene + my_fname1 + '.csv' +outfile = outdir + out_filename +print(outfile) #<<<- check # write to csv: without column or row names # Bad practice: numbers at the start of a filename -snps_only.to_csv(output_filename, header = False, index = False) +snps_only.to_csv(out_filename, header = False, index = False) #========= # Step 12b: all snps with annotation @@ -519,12 +498,23 @@ snps_only.to_csv(output_filename, header = False, index = False) #my_fname2 = '_snps_with_metadata_' #nrows = len(pnca_snps_ALL) +#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/' #output_file_path = work_dir + '/Data/' -#output_filename = output_file_path + gene + my_fname2 + str(nrows) + '.csv' -#print(output_filename) #<<<- check +#output_file_path = data_dir + '/input/processed/' + drug + '/' +print(outdir) + +if not os.path.exists(outdir): + print( outdir, 'does not exist. Creating') + os.makedirs(outdir) + exit() + +#out_filename = gene + my_fname2 + str(nrows) + '.csv' +out_filename = '/' + gene + my_fname1 + '.csv' +outfile = outdir + out_filename +print(outfile) #<<<- check # write out file -#pnca_snps_ALL.to_csv(output_filename, header = True, index = False) +#pnca_snps_ALL.to_csv(outfile, header = True, index = False) #========= # Step 12c: comp snps for OR calcs with annotation @@ -547,7 +537,7 @@ meta_pnca_LF2['mutation'].nunique() meta_pnca_LF2.groupby('mutation_info').nunique() # sanity check -meta_pnca_LF2['id'].nunique() +meta_pnca_LF2['id'].nunique() #1908 # should be True if meta_pnca_LF2['id'].nunique() == comp_pnca_samples: @@ -569,15 +559,26 @@ len(pnca_snps_COMP) gene = 'pnca' #drug = 'pyrazinamide' -my_fname3 = '_comp_snps_with_metadata_' +my_fname3 = '_comp_snps_with_metadata' nrows = len(pnca_snps_COMP) -#output_filename = output_file_path + gene + my_fname3 + str(nrows) + '.csv' -#print(output_filename) #<<<-check +#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/' +#output_file_path = work_dir + '/Data/' +#output_file_path = data_dir + '/input/processed/' + drug + '/' +print(outdir) + +if not os.path.exists(outdir): + print( outdir, 'does not exist. Creating') + os.makedirs(outdir) + exit() + +#out_filename = gene + my_fname3 + str(nrows) + '.csv' +#out_filename = '/' + gene + my_fname3 + '.csv' +#outfile = outdir + out_filename +#print(outfile) #<<<- check # write out file -#pnca_snps_COMP.to_csv(output_filename, header = True, index = False) - +#pnca_snps_COMP.to_csv(outfile, header = True, index = False) #========= # Step 12d: comp snps only @@ -589,15 +590,26 @@ snps_only = pd.DataFrame(meta_pnca_LF2['Mutationinformation'].unique()) gene = 'pnca' #drug = 'pyrazinamide' -my_fname1 = '_comp_snps_' +my_fname1 = '_comp_snps' nrows = len(snps_only) -output_filename = output_file_path + gene + my_fname1 + str(nrows) + '.csv' -print(output_filename) #<<<- check +#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/' +#output_file_path = work_dir + '/Data/' +#output_file_path = data_dir + '/input/processed/' + drug + '/' +print(outdir) + +if not os.path.exists(outdir): + print( outdir, 'does not exist. Creating') + os.makedirs(outdir) + exit() + +#out_filename = gene + my_fname1 + str(nrows) + '.csv' +out_filename = '/' + gene + my_fname1 + '.csv' +outfile = outdir + out_filename +print(outfile) #<<<- check # write to csv: without column or row names -snps_only.to_csv(output_filename, header = False, index = False) - +snps_only.to_csv(outfile, header = False, index = False) #=#=#=#=#=#=#=# # COMMENT: LF1 is the file to extract all unique snps for mcsm @@ -619,8 +631,21 @@ gene = 'pnca' #drug = 'pyrazinamide' my_fname4 = '_metadata' #nrows = len(meta_pnca_LF1) -output_filename = output_file_path + gene + my_fname4 + '.csv' -print(output_filename) #<<<-check + +#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/' +#output_file_path = work_dir + '/Data/' +#output_file_path = data_dir + '/input/processed/' + drug + '/' +print(outdir) + +if not os.path.exists(outdir): + print( outdir, 'does not exist. Creating') + os.makedirs(outdir) + exit() + +#out_filename = gene + my_fname1 + str(nrows) + '.csv' +out_filename = '/' + gene + my_fname4 + '.csv' +outfile = outdir + out_filename +print(outfile) #<<<- check # write out file -meta_pnca_LF1.to_csv(output_filename) +meta_pnca_LF1.to_csv(outfile) diff --git a/meta_data_analysis/reference_dict.py b/meta_data_analysis/reference_dict.py index bf6561b..e83ce2c 100755 --- a/meta_data_analysis/reference_dict.py +++ b/meta_data_analysis/reference_dict.py @@ -6,7 +6,7 @@ Created on Tue Jun 18 11:32:28 2019 @author: tanushree """ ############################################ -#load libraries +# load libraries import pandas as pd import os ############################################# @@ -17,14 +17,29 @@ import os # containing GWAS data #!#########################! -print(os.getcwd()) -homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde -os.chdir(homedir + '/git/Data/input/original') +#print(os.getcwd()) +#homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde +#os.chdir(homedir + '/git/Data/pyrazinamide/input/original') print(os.getcwd()) + +#%% +############# specify variables for input and output paths and filenames +drug = 'pyrazinamide' +#gene = 'pnca' + +datadir = homedir + '/git/Data' +basedir = datadir + '/' + drug + '/input' + +# input +inpath = "/original" +in_filename = "/aa_codes.csv" +infile = basedir + inpath + in_filename +print(infile) + #========== #read file #========== -my_aa = pd.read_csv('aa_codes.csv') #20, 6 +my_aa = pd.read_csv(infile) #20, 6 #assign the one_letter code as the row names so that it is easier to create a dict of dicts using index #my_aa = pd.read_csv('aa_codes.csv', index_col = 0) #20, 6 #a way to it since it is the first column my_aa = my_aa.set_index('three_letter_code_lower') #20, 5 From 15391a57001cc66e34be42dce7e62d9b3a9b3459 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Tue, 28 Jan 2020 10:10:16 +0000 Subject: [PATCH 7/7] saving data_extraction from home --- meta_data_analysis/pnca_data_extraction.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/meta_data_analysis/pnca_data_extraction.py b/meta_data_analysis/pnca_data_extraction.py index e05c39d..2a030f9 100755 --- a/meta_data_analysis/pnca_data_extraction.py +++ b/meta_data_analysis/pnca_data_extraction.py @@ -88,10 +88,10 @@ meta_data = meta_data[['id' , 'pyrazinamide' , 'dr_mutations_pyrazinamide' , 'other_mutations_pyrazinamide' - ]] #19265, 67 + ]] # checks -total_samples = meta_data['id'].nunique() # 19265 +total_samples = meta_data['id'].nunique() # counts NA per column meta_data.isna().sum() @@ -101,9 +101,7 @@ meta_data.head() # equivalent of table in R # pyrazinamide counts -meta_data.pyrazinamide.value_counts() #12511 -#0.0 10565 -#1.0 1946 {RESULT: No. of Resistant and Susceptible samples} +meta_data.pyrazinamide.value_counts() clear variables #del(basedir, datadir, inpath, infile) @@ -160,7 +158,7 @@ del(meta_pnca_other) # Now extract "all" mutations meta_pnca_all = meta_data_pnca.loc[meta_data_pnca.dr_mutations_pyrazinamide.str.contains('pncA_p.*') | meta_data_pnca.other_mutations_pyrazinamide.str.contains('pncA_p.*') ] -#2665, 8 + meta_pnca_all['id'].nunique() {#RESULT: pnca mutations in ALL samples} pnca_samples = len(meta_pnca_all)