From 50ade050c22acd24601c09e4d97c0af6fb9ebadb Mon Sep 17 00:00:00 2001
From: tgttunstall <54280754+tgttunstall@users.noreply.github.com>
Date: Tue, 14 Jan 2020 11:22:41 +0000
Subject: [PATCH 1/7] Update README.md

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/README.md b/README.md
index 4e7f8f2..0001735 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,9 @@
 mCSM Analysis
 =============
 
-This repo does mCSM analysis using Python, bash and R.
+This repo does mCSM analysis using bash, python and R.
 
-Requires an additional 'Data' directory. Batteries not included.
+Requires an additional 'Data' directory. Batteries not included:-)
 
 ## Assumptions
 

From ec37e3c1f6fcddcf692caf9c36ea0653a3baf4cf Mon Sep 17 00:00:00 2001
From: tgttunstall <54280754+tgttunstall@users.noreply.github.com>
Date: Tue, 14 Jan 2020 11:29:13 +0000
Subject: [PATCH 2/7] Update README.md

Updated README.md
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 0001735..84f8e1c 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@ subdirs within this repo
  	*.py
 	
  mcsm\_analysis/
-	<drug>/ (generated by `meta_data_analysis/pnca_data_extraction.py`. To be replaced with command line args or config option "soon")
+	<drug>/
 		scripts/
 		*.R
 		*.py

From 4f06e42ee42c199cbdb475b5df7e474e993e8bae Mon Sep 17 00:00:00 2001
From: Tanu <tanu@tunstall.in>
Date: Wed, 22 Jan 2020 10:12:09 +0000
Subject: [PATCH 3/7] graphs for PS lineage dist for all and dr muts

---
 mcsm_analysis/pyrazinamide/scripts/.Rhistory  | 509 +-----------------
 .../pyrazinamide/scripts/Header_TT.R          |  31 +-
 .../pyrazinamide/scripts/combining_two_df.R   |   4 +-
 .../scripts/plotting/lineage_dist_PS.R        | 116 ++--
 4 files changed, 93 insertions(+), 567 deletions(-)

diff --git a/mcsm_analysis/pyrazinamide/scripts/.Rhistory b/mcsm_analysis/pyrazinamide/scripts/.Rhistory
index 8055405..0112d24 100644
--- a/mcsm_analysis/pyrazinamide/scripts/.Rhistory
+++ b/mcsm_analysis/pyrazinamide/scripts/.Rhistory
@@ -1,512 +1,7 @@
-###########################
-# you need merged_df3
-# or
-# merged_df3_comp
-# since these have unique SNPs
-# I prefer to use the merged_df3
-# because using the _comp dataset means
-# we lose some muts and at this level, we should use
-# as much info as available
-###########################
-# uncomment as necessary
-#%%%%%%%%%%%%%%%%%%%%%%%%
-# REASSIGNMENT
-my_df  = merged_df3
-#my_df = merged_df3_comp
-#%%%%%%%%%%%%%%%%%%%%%%%%
-# delete variables not required
-rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
-# quick checks
-colnames(my_df)
-str(my_df)
-###########################
-# Data for bfactor figure
-# PS average
-# Lig average
-###########################
-head(my_df$Position)
-head(my_df$ratioDUET)
-# order data frame
-df = my_df[order(my_df$Position),]
-head(df$Position)
-head(df$ratioDUET)
-#***********
-# PS: average by position
-#***********
-mean_DUET_by_position <- df %>%
-group_by(Position) %>%
-summarize(averaged.DUET = mean(ratioDUET))
-#***********
-# Lig: average by position
-#***********
-mean_Lig_by_position <- df %>%
-group_by(Position) %>%
-summarize(averaged.Lig = mean(ratioPredAff))
-#***********
-# cbind:mean_DUET_by_position and mean_Lig_by_position
-#***********
-combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position ))
-# sanity check
-# mean_PS_Lig_Bfactor
-colnames(combined)
-colnames(combined) = c("Position"
-, "average_DUETR"
-, "Position2"
-, "average_PredAffR")
-colnames(combined)
-identical(combined$Position, combined$Position2)
-n = which(colnames(combined) == "Position2"); n
-combined_df = combined[,-n]
-max(combined_df$average_DUETR) ; min(combined_df$average_DUETR)
-max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR)
-#=============
-# output csv
-#============
-outDir = "~/Data/pyrazinamide/input/processed/"
-outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv")
-print(paste0("Output file with path will be:","", outFile))
-head(combined_df$Position); tail(combined_df$Position)
-write.csv(combined_df, outFile
-, row.names = F)
 getwd()
 setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
 getwd()
-########################################################################
-# 				Installing and loading required packages 			   #
-########################################################################
 source("../Header_TT.R")
-#source("barplot_colour_function.R")
-require(data.table)
-require(dplyr)
-########################################################################
-#		 Read file: call script for combining df for PS		   	   #
-########################################################################
 source("../combining_two_df.R")
-###########################
-# This will return:
-# df with NA:
-# merged_df2
-# merged_df3
-# df without NA:
-# merged_df2_comp
-# merged_df3_comp
-###########################
-#---------------------- PAY ATTENTION
-# the above changes the working dir
-#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
-#---------------------- PAY ATTENTION
-###########################
-# you need merged_df3
-# or
-# merged_df3_comp
-# since these have unique SNPs
-# I prefer to use the merged_df3
-# because using the _comp dataset means
-# we lose some muts and at this level, we should use
-# as much info as available
-###########################
-# uncomment as necessary
-#%%%%%%%%%%%%%%%%%%%%%%%%
-# REASSIGNMENT
-my_df  = merged_df3
-#my_df = merged_df3_comp
-#%%%%%%%%%%%%%%%%%%%%%%%%
-# delete variables not required
-rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
-# quick checks
-colnames(my_df)
-str(my_df)
-###########################
-# Data for bfactor figure
-# PS average
-# Lig average
-###########################
-head(my_df$Position)
-head(my_df$ratioDUET)
-# order data frame
-df = my_df[order(my_df$Position),]
-head(df$Position)
-head(df$ratioDUET)
-#***********
-# PS: average by position
-#***********
-mean_DUET_by_position <- df %>%
-group_by(Position) %>%
-summarize(averaged.DUET = mean(ratioDUET))
-#***********
-# Lig: average by position
-#***********
-mean_Lig_by_position <- df %>%
-group_by(Position) %>%
-summarize(averaged.Lig = mean(ratioPredAff))
-#***********
-# cbind:mean_DUET_by_position and mean_Lig_by_position
-#***********
-combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position ))
-# sanity check
-# mean_PS_Lig_Bfactor
-colnames(combined)
-colnames(combined) = c("Position"
-, "average_DUETR"
-, "Position2"
-, "average_PredAffR")
-colnames(combined)
-identical(combined$Position, combined$Position2)
-n = which(colnames(combined) == "Position2"); n
-combined_df = combined[,-n]
-max(combined_df$average_DUETR) ; min(combined_df$average_DUETR)
-max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR)
-#=============
-# output csv
-#============
-outDir = "~/git/Data/pyrazinamide/input/processed/"
-outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv")
-print(paste0("Output file with path will be:","", outFile))
-head(combined_df$Position); tail(combined_df$Position)
-write.csv(combined_df, outFile
-, row.names = F)
-# read in pdb file complex1
-inDir = "~/git/Data/pyrazinamide/input/structure"
-inFile = paste0(inDir, "complex1_no_water.pdb")
-# read in pdb file complex1
-inDir = "~/git/Data/pyrazinamide/input/structure/"
-inFile = paste0(inDir, "complex1_no_water.pdb")
-complex1 = inFile
-my_pdb = read.pdb(complex1
-, maxlines = -1
-, multi = FALSE
-, rm.insert = FALSE
-, rm.alt = TRUE
-, ATOM.only = FALSE
-, hex = FALSE
-, verbose = TRUE)
-#########################
-#3: Read complex pdb file
-##########################
-source("Header_TT.R")
-# list of 8
-my_pdb = read.pdb(complex1
-, maxlines = -1
-, multi = FALSE
-, rm.insert = FALSE
-, rm.alt = TRUE
-, ATOM.only = FALSE
-, hex = FALSE
-, verbose = TRUE)
-rm(inDir, inFile)
-#====== end of script
-inDir = "~/git/Data/pyrazinamide/input/structure/"
-inFile = paste0(inDir, "complex1_no_water.pdb")
-complex1 = inFile
-complex1 = inFile
-my_pdb = read.pdb(complex1
-, maxlines = -1
-, multi = FALSE
-, rm.insert = FALSE
-, rm.alt = TRUE
-, ATOM.only = FALSE
-, hex = FALSE
-, verbose = TRUE)
-inFile
-inDir = "~/git/Data/pyrazinamide/input/structure/"
-inFile = paste0(inDir, "complex1_no_water.pdb")
-complex1 = inFile
-#inFile2 = paste0(inDir, "complex2_no_water.pdb")
-#complex2 = inFile2
-# list of 8
-my_pdb = read.pdb(complex1
-, maxlines = -1
-, multi = FALSE
-, rm.insert = FALSE
-, rm.alt = TRUE
-, ATOM.only = FALSE
-, hex = FALSE
-, verbose = TRUE)
-rm(inDir, inFile, complex1)
-getwd()
-setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
-getwd()
-source("Header_TT.R")
-getwd()
-setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
-getwd()
-########################################################################
-# 				Installing and loading required packages 			               #
-########################################################################
-source("Header_TT.R")
-#########################################################
-# TASK: replace B-factors in the pdb file with normalised values
-# use the complex file with no water as mCSM lig was
-# performed on this file. You can check it in the script: read_pdb file.
-#########################################################
-###########################
-# 2: Read file: average stability values
-# or mcsm_normalised file, output of step 4 mcsm pipeline
-###########################
-inDir = "~/git/Data/pyrazinamide/input/processed/"
-inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
-my_df <- read.csv(inFile
-#                  , row.names = 1
-#                  , stringsAsFactors = F
-, header = T)
-str(my_df)
-source("read_pdb.R") # list of 8
-# extract atom list into a variable
-# since in the list this corresponds to data frame, variable will be a df
-d = my_pdb[[1]]
-# make a copy: required for downstream sanity checks
-d2 = d
-# sanity checks: B factor
-max(d$b); min(d$b)
-par(oma = c(3,2,3,0)
-, mar = c(1,3,5,2)
-, mfrow = c(3,2))
-#par(mfrow = c(3,2))
-#1: Original B-factor
-hist(d$b
-, xlab = ""
-, main = "B-factor")
-plot(density(d$b)
-, xlab = ""
-, main = "B-factor")
-# 2: DUET scores
-hist(my_df$average_DUETR
-, xlab = ""
-, main = "Norm_DUET")
-plot(density(my_df$average_DUETR)
-, xlab = ""
-, main = "Norm_DUET")
-# Set the margin on all sides
-par(oma = c(3,2,3,0)
-, mar = c(1,3,5,2)
-, mfrow = c(3,2))
-#par(mfrow = c(3,2))
-#1: Original B-factor
-hist(d$b
-, xlab = ""
-, main = "B-factor")
-plot(density(d$b)
-, xlab = ""
-, main = "B-factor")
-# 2: DUET scores
-hist(my_df$average_DUETR
-, xlab = ""
-, main = "Norm_DUET")
-plot(density(my_df$average_DUETR)
-, xlab = ""
-, main = "Norm_DUET")
-#=========
-# step 1_P1
-#=========
-# Be brave and replace in place now (don't run sanity check)
-# this makes all the B-factor values in the non-matched positions as NA
-d$b = my_df$average_DUETR[match(d$resno, my_df$Position)]
-#=========
-# step 2_P1
-#=========
-# count NA in Bfactor
-b_na = sum(is.na(d$b)) ; b_na
-# count number of 0's in Bactor
-sum(d$b == 0)
-# replace all NA in b factor with 0
-d$b[is.na(d$b)] = 0
-# sanity check: should be 0
-sum(is.na(d$b))
-# sanity check: should be True
-if (sum(d$b == 0) == b_na){
-print ("Sanity check passed: NA's replaced with 0's successfully")
-} else {
-print("Error: NA replacement NOT successful, Debug code!")
-}
-max(d$b); min(d$b)
-# sanity checks: should be True
-if(max(d$b) == max(my_df$average_DUETR)){
-print("Sanity check passed: B-factors replaced correctly")
-} else {
-print ("Error: Debug code please")
-}
-if (min(d$b) == min(my_df$average_DUETR)){
-print("Sanity check passed: B-factors replaced correctly")
-} else {
-print ("Error: Debug code please")
-}
-#=========
-# step 3_P1
-#=========
-# sanity check: dim should be same before reassignment
-# should be TRUE
-dim(d) == dim(d2)
-#=========
-# step 4_P1
-#=========
-# assign it back to the pdb file
-my_pdb[[1]] = d
-max(d$b); min(d$b)
-#=========
-# step 5_P1
-#=========
-# output dir
-getwd()
-outDir = "~/git/Data/pyrazinamide/output/"
-getwd()
-outFile = paste0(outDir, "complex1_BwithNormDUET.pdb")
-outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile
-outDir = "~/git/Data/pyrazinamide/input/structure"
-outDir = "~/git/Data/pyrazinamide/input/structure/"
-outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile
-write.pdb(my_pdb, outFile)
-hist(d$b
-, xlab = ""
-, main = "repalced-B")
-plot(density(d$b)
-, xlab = ""
-, main = "replaced-B")
-# graph titles
-mtext(text = "Frequency"
-, side = 2
-, line = 0
-, outer = TRUE)
-mtext(text = "DUET_stability"
-, side = 3
-, line = 0
-, outer = TRUE)
-#=========================================================
-# Processing P2: Replacing  B values with PredAff Scores
-#=========================================================
-# clear workspace
-rm(list = ls())
-#=========================================================
-# Processing P2: Replacing  B values with PredAff Scores
-#=========================================================
-# clear workspace
-rm(list = ls())
-###########################
-# 2: Read file: average stability values
-# or mcsm_normalised file, output of step 4 mcsm pipeline
-###########################
-inDir = "~/git/Data/pyrazinamide/input/processed/"
-inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
-my_df <- read.csv("../Data/mean_PS_Lig_Bfactor.csv"
-#                  , row.names = 1
-#                  , stringsAsFactors = F
-, header = T)
-str(my_df)
-#=========================================================
-# Processing P2: Replacing B factor with mean ratioLig scores
-#=========================================================
-#########################
-# 3: Read complex pdb file
-# form the R script
-##########################
-source("read_pdb.R") # list of 8
-# extract atom list into a vari
-inDir = "~/git/Data/pyrazinamide/input/processed/"
-inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
-my_df <- read.csv(inFile
-#                  , row.names = 1
-#                  , stringsAsFactors = F
-, header = T)
-str(my_df)
-# extract atom list into a variable
-# since in the list this corresponds to data frame, variable will be a df
-d = my_pdb[[1]]
-# make a copy: required for downstream sanity checks
-d2 = d
-# sanity checks: B factor
-max(d$b); min(d$b)
-par(oma = c(3,2,3,0)
-, mar = c(1,3,5,2)
-, mfrow = c(3,2))
-#par(mfrow = c(3,2))
-# 1: Original B-factor
-hist(d$b
-, xlab = ""
-, main = "B-factor")
-plot(density(d$b)
-, xlab = ""
-, main = "B-factor")
-# 2: Pred Aff scores
-hist(my_df$average_PredAffR
-, xlab = ""
-, main = "Norm_lig_average")
-plot(density(my_df$average_PredAffR)
-, xlab = ""
-, main = "Norm_lig_average")
-# 3: After the following replacement
-#********************************
-par(oma = c(3,2,3,0)
-, mar = c(1,3,5,2)
-, mfrow = c(3,2))
-#par(mfrow = c(3,2))
-# 1: Original B-factor
-hist(d$b
-, xlab = ""
-, main = "B-factor")
-plot(density(d$b)
-, xlab = ""
-, main = "B-factor")
-# 2: Pred Aff scores
-hist(my_df$average_PredAffR
-, xlab = ""
-, main = "Norm_lig_average")
-plot(density(my_df$average_PredAffR)
-, xlab = ""
-, main = "Norm_lig_average")
-# 3: After the following replacement
-#********************************
-#=========
-# step 1_P2: BE BRAVE and replace in place now (don't run step 0)
-#=========
-# this makes all the B-factor values in the non-matched positions as NA
-d$b = my_df$average_PredAffR[match(d$resno, my_df$Position)]
-#=========
-# step 2_P2
-#=========
-# count NA in Bfactor
-b_na = sum(is.na(d$b)) ; b_na
-# count number of 0's in Bactor
-sum(d$b == 0)
-# replace all NA in b factor with 0
-d$b[is.na(d$b)] = 0
-# sanity check: should be 0
-sum(is.na(d$b))
-if (sum(d$b == 0) == b_na){
-print ("Sanity check passed: NA's replaced with 0's successfully")
-} else {
-print("Error: NA replacement NOT successful, Debug code!")
-}
-max(d$b); min(d$b)
-# sanity checks: should be True
-if (max(d$b) == max(my_df$average_PredAffR)){
-print("Sanity check passed: B-factors replaced correctly")
-} else {
-print ("Error: Debug code please")
-}
-if (min(d$b) == min(my_df$average_PredAffR)){
-print("Sanity check passed: B-factors replaced correctly")
-} else {
-print ("Error: Debug code please")
-}
-#=========
-# step 3_P2
-#=========
-# sanity check: dim should be same before reassignment
-# should be TRUE
-dim(d) == dim(d2)
-#=========
-# step 4_P2
-#=========
-# assign it back to the pdb file
-my_pdb[[1]] = d
-max(d$b); min(d$b)
-#=========
-# step 5_P2
-#=========
-write.pdb(my_pdb, "Plotting/structure/complex1_BwithNormLIG.pdb")
-# output dir
-getwd()
-# output dir
-outDir = "~/git/Data/pyrazinamide/input/structure/"
-outFile = paste0(outDir, "complex1_BwithNormLIG.pdb")
-outFile = paste0(outDir, "complex1_BwithNormLIG.pdb"); outFile
-write.pdb(my_pdb, outFile)
+source("../combining_two_df.R")
+source("../combining_two_df.R")
diff --git a/mcsm_analysis/pyrazinamide/scripts/Header_TT.R b/mcsm_analysis/pyrazinamide/scripts/Header_TT.R
index 2069fa7..9eae42a 100644
--- a/mcsm_analysis/pyrazinamide/scripts/Header_TT.R
+++ b/mcsm_analysis/pyrazinamide/scripts/Header_TT.R
@@ -1,25 +1,31 @@
 #########################################################
 ### A) Installing and loading required packages
 #########################################################
+#lib_loc = "/usr/local/lib/R/site-library")
 
 #if (!require("gplots")) {
 #  install.packages("gplots", dependencies = TRUE)
 #  library(gplots)
 #}
 
-if (!require("tidyverse")) {
-  install.packages("tidyverse", dependencies = TRUE)
-  library(tidyverse)
-}
+#if (!require("tidyverse")) {
+#  install.packages("tidyverse", dependencies = TRUE)
+#  library(tidyverse)
+#}
 
 if (!require("ggplot2")) {
   install.packages("ggplot2", dependencies = TRUE)
   library(ggplot2)
 }
 
+if (!require("plotly")) {
+  install.packages("plotly", dependencies = TRUE)
+  library(plotly)
+}
+
 if (!require("cowplot")) {
   install.packages("copwplot", dependencies = TRUE)
-  library(ggplot2)
+  library(cowplot)
 }
 
 if (!require("ggcorrplot")) {
@@ -43,37 +49,33 @@ if (!require ("GOplot")) {
 }
 
 if(!require("VennDiagram")) {
-  
   install.packages("VennDiagram", dependencies = T)
   library(VennDiagram)
 }
 
 if(!require("scales")) {
-  
   install.packages("scales", dependencies = T)
   library(scales)
 }
 
 if(!require("plotrix")) {
-  
   install.packages("plotrix", dependencies = T)
   library(plotrix)
 }
 
 if(!require("stats")) {
-  
   install.packages("stats", dependencies = T)
   library(stats)
 }
 
 if(!require("stats4")) {
-  
   install.packages("stats4", dependencies = T)
   library(stats4)
 }
 
 if(!require("data.table")) {
-  library(stats4)
+install.packages("data.table")
+  library(data.table)
 }
 
 if (!require("PerformanceAnalytics")){
@@ -98,18 +100,17 @@ if (!require ("psych")){
 
 if (!require ("dplyr")){
   install.packages("dplyr")
-  library(psych)
+  library(dplyr)
 }
 
-
 if (!require ("compare")){
   install.packages("compare")
-  library(psych)
+  library(compare)
 }
 
 if (!require ("arsenal")){
   install.packages("arsenal")
-  library(psych)
+  library(arsenal)
 }
 
 
diff --git a/mcsm_analysis/pyrazinamide/scripts/combining_two_df.R b/mcsm_analysis/pyrazinamide/scripts/combining_two_df.R
index 57e476a..d0e13cd 100644
--- a/mcsm_analysis/pyrazinamide/scripts/combining_two_df.R
+++ b/mcsm_analysis/pyrazinamide/scripts/combining_two_df.R
@@ -11,7 +11,7 @@ getwd()
 # 				Installing and loading required packages 			   #
 ########################################################################
 
-source("Header_TT.R")
+#source("Header_TT.R")
 #require(data.table)
 #require(arsenal)
 #require(compare)
@@ -286,7 +286,7 @@ outDir = "~/git/Data/pyrazinamide/output/"
 getwd()
 
 outFile1 = paste0(outDir, "merged_df3.csv"); outFile1
-write.csv(merged_df3, outFile1)
+#write.csv(merged_df3, outFile1)
 
 #outFile2 = paste0(outDir, "merged_df3_comp.csv"); outFile2
 #write.csv(merged_df3_comp, outFile2)
diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R
index cc7e15a..4c54681 100644
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R
@@ -1,5 +1,5 @@
 getwd()
-setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") # thinkpad
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
 getwd()
 
 ########################################################################
@@ -24,11 +24,11 @@ source("../combining_two_df.R")
 #==========================
 # This will return:
 
-# df with NA:
+# df with NA for pyrazinamide:
 # merged_df2
 # merged_df3
 
-# df without NA:
+# df without NA for pyrazinamide:
 # merged_df2_comp
 # merged_df3_comp
 #===========================
@@ -38,14 +38,17 @@ source("../combining_two_df.R")
 # you need merged_df2 or merged_df2_comp
 # since this is one-many relationship 
 # i.e the same SNP can belong to multiple lineages
+# using the _comp dataset means
+# we lose some muts and at this level, we should use
+# as much info as available, hence use df with NA
 ###########################
 
 # uncomment as necessary
-#<<<<<<<<<<<<<<<<<<<<<<<<<
+#!!!!!!!!!!!!!!!!!!!!!!!
 # REASSIGNMENT
 my_df  = merged_df2
 #my_df  = merged_df2_comp
-#<<<<<<<<<<<<<<<<<<<<<<<<<
+#!!!!!!!!!!!!!!!!!!!!!!!
 
 # delete variables not required
 rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
@@ -59,12 +62,39 @@ is.factor(my_df$lineage)
 my_df$lineage = as.factor(my_df$lineage)
 is.factor(my_df$lineage)
 
-table(my_df$mutation_info)
+table(my_df$mutation_info); str(my_df$mutation_info)
+
+# subset df with dr muts only
+my_df_dr = subset(my_df, mutation_info == "dr_mutations_pyrazinamide") 
 
 ########################################################################
 #               end of data extraction and cleaning for plots          #
 ########################################################################
 
+#==========================
+# Data for plot: assign as 
+# necessary
+#===========================
+
+# uncomment as necessary
+#!!!!!!!!!!!!!!!!!!!!!!!
+# REASSIGNMENT
+
+#==================
+# data for ALL muts
+#==================
+plot_df = my_df  
+my_plot_name = 'lineage_dist_PS.svg'
+#my_plot_name = 'lineage_dist_PS_comp.svg'
+
+#=======================
+# data for dr_muts ONLY
+#=======================
+#plot_df = my_df_dr 
+#my_plot_name = 'lineage_dist_dr_PS.svg'
+#my_plot_name = 'lineage_dist_dr_PS_comp.svg'
+#!!!!!!!!!!!!!!!!!!!!!!!
+
 #==========================
 # Plot: Lineage Distribution
 # x = mcsm_values, y = dist
@@ -74,6 +104,7 @@ table(my_df$mutation_info)
 #===================
 # Data for plots
 #===================
+table(plot_df$lineage); str(plot_df$lineage)
 
 # subset only lineages1-4
 sel_lineages = c("lineage1"
@@ -82,34 +113,29 @@ sel_lineages = c("lineage1"
                  , "lineage4")
 
 # uncomment as necessary
-df_lin = subset(my_df, subset = lineage %in% sel_lineages )
+df_lin = subset(plot_df, subset = lineage %in% sel_lineages )
 
 # refactor
 df_lin$lineage = factor(df_lin$lineage)
 
 table(df_lin$lineage) #{RESULT: No of samples within lineage}
 #lineage1 lineage2 lineage3 lineage4 
-#104     1293      264     1311 
-
-# when merged_df2_comp is used
-#lineage1 lineage2 lineage3 lineage4 
-#99     1275      263     1255
 
 length(unique(df_lin$Mutationinformation))
 #{Result: No. of unique mutations the 4 lineages contribute to}
 
 # sanity checks
 r1 = 2:5 # when merged_df2 used: because there is missing lineages 
-if(sum(table(my_df$lineage)[r1]) == nrow(df_lin)) {
+if(sum(table(plot_df$lineage)[r1]) == nrow(df_lin)) {
   print ("sanity check passed: numbers match")
 } else{
   print("Error!: check your numbers")
 } 
 
-#<<<<<<<<<<<<<<<<<<<<<<<<<
+#!!!!!!!!!!!!!!!!!!!!!!!!! 
 # REASSIGNMENT
 df <- df_lin
-#<<<<<<<<<<<<<<<<<<<<<<<<<
+#!!!!!!!!!!!!!!!!!!!!!!!!!
 
 rm(df_lin)
 
@@ -117,8 +143,8 @@ rm(df_lin)
 # generate distribution plot of lineages
 #******************
 # basic: could improve this!
-library(plotly)
-library(ggridges)
+#library(plotly)
+#library(ggridges)
 
 g <- ggplot(df, aes(x = ratioDUET)) + 
   geom_density(aes(fill = DUET_outcome)
@@ -129,64 +155,68 @@ g <- ggplot(df, aes(x = ratioDUET)) +
 ggplotly(g)
 
 # 2 : ggridges (good!)
-
 my_ats = 15 # axis text size
 my_als = 20 # axis label size
 
-fooNames=c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
-names(fooNames)=c('lineage1', 'lineage2', 'lineage3', 'lineage4')
+my_labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
+names(my_labels) = c('lineage1', 'lineage2', 'lineage3', 'lineage4')
 
 # set output dir for plots
 getwd()
 setwd("~/git/Data/pyrazinamide/output/plots")
 getwd()
 
-svg('lineage_dist_PS.svg')
+# check plot name
+my_plot_name
 
-printFile = ggplot( df, aes(x = ratioDUET
-                            , y = DUET_outcome) )+
+# output svg
+svg(my_plot_name)
+printFile = ggplot(df, aes(x = ratioDUET
+                            , y = DUET_outcome))+
   
   #printFile=geom_density_ridges_gradient(
-  geom_density_ridges_gradient( aes(fill = ..x..)
+  geom_density_ridges_gradient(aes(fill = ..x..)
                                 , scale = 3
                                 , size = 0.3 ) +
   facet_wrap( ~lineage
               , scales = "free"
 #             , switch = 'x'
-              , labeller = labeller(lineage = fooNames) ) +
+              , labeller = labeller(lineage = my_labels) ) +
   coord_cartesian( xlim = c(-1, 1)
-#                  , ylim = c(0, 6)
-#                  , clip = "off" 
-                ) +
-  scale_fill_gradientn( colours = c("#f8766d", "white", "#00bfc4")
+#                   , ylim = c(0, 6)
+#                   , clip = "off" 
+) +
+  scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4")
                         , name = "DUET" ) + 
-  theme( axis.text.x = element_text( size = my_ats
+  theme(axis.text.x = element_text(size = my_ats
                                      , angle = 90
                                      , hjust = 1
                                      , vjust = 0.4)
-#         , axis.text.y = element_text( size = my_ats
-#                                       , angle = 0
-#                                       , hjust = 1
-#                                       , vjust = 0)
+#                  , axis.text.y = element_text(size = my_ats
+#                                                , angle = 0
+#                                                , hjust = 1
+#                                                , vjust = 0)
          , axis.text.y = element_blank()
          , axis.title.x = element_blank()
          , axis.title.y = element_blank()
          , axis.ticks.y = element_blank()
          , plot.title = element_blank()
-         , strip.text = element_text(size=my_als)
-         , legend.text = element_text(size=10)
-         , legend.title = element_text(size=my_als)
-#         , legend.position = c(0.3, 0.8)
-#         , legend.key.height = unit(1, 'mm')
-        ) 
+         , strip.text = element_text(size = my_als)
+         , legend.text = element_text(size = 10)
+         , legend.title = element_text(size = my_als)
+#                  , legend.position = c(0.3, 0.8)
+#                  , legend.key.height = unit(1, 'mm')
+  ) 
 
 print(printFile)
 dev.off()
 
-#=!=!=!=!=!=!
-# COMMENT: When you look at all mutations, the lineage differences disappear...
+#=!=!=!=!=!=!=!
+# COMMENT: Not much differences in the distributions
+# when using merged_df2 or merged_df2_comp.
+# Also, the lineage differences disappear when looking at all muts
 # The pattern we are interested in is possibly only for dr_mutations
-#=!=!=!=!=!=!
+#=!=!=!=!=!=!=!
 #===================================================
 
 # COMPARING DISTRIBUTIONS

From bcf822d6e47b480ca7a3692ef826a99cdbc8f252 Mon Sep 17 00:00:00 2001
From: Tanu <tanu@tunstall.in>
Date: Wed, 22 Jan 2020 11:34:59 +0000
Subject: [PATCH 4/7] updated lineage dist for LIG for consistency

---
 .../scripts/plotting/lineage_dist_LIG.R       | 50 +++++++++++++------
 1 file changed, 35 insertions(+), 15 deletions(-)

diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_LIG.R b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_LIG.R
index ebbec76..e4e6972 100644
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_LIG.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_LIG.R
@@ -75,6 +75,29 @@ if (max(my_df$Dis_lig_Ang) < 10){
 ########################################################################
 #               end of data extraction and cleaning for plots          #
 ########################################################################
+#==========================
+# Data for plot: assign as 
+# necessary
+#===========================
+
+# uncomment as necessary
+#!!!!!!!!!!!!!!!!!!!!!!!
+# REASSIGNMENT
+
+#==================
+# data for ALL muts
+#==================
+plot_df = my_df  
+my_plot_name = 'lineage_dist_PS.svg'
+#my_plot_name = 'lineage_dist_PS_comp.svg'
+
+#=======================
+# data for dr_muts ONLY
+#=======================
+#plot_df = my_df_dr 
+#my_plot_name = 'lineage_dist_dr_PS.svg'
+#my_plot_name = 'lineage_dist_dr_PS_comp.svg'
+#!!!!!!!!!!!!!!!!!!!!!!!
 
 #==========================
 # Plot: Lineage Distribution
@@ -117,10 +140,10 @@ if(sum(table(my_df$lineage)[r1]) == nrow(df_lin)) {
   print("Error!: check your numbers")
 } 
 
-#<<<<<<<<<<<<<<<<<<<<<<<<<
+#!!!!!!!!!!!!!!!!!!!!!!!!! 
 # REASSIGNMENT
 df <- df_lin
-#<<<<<<<<<<<<<<<<<<<<<<<<<
+#!!!!!!!!!!!!!!!!!!!!!!!!! 
 
 rm(df_lin)
 
@@ -131,15 +154,15 @@ rm(df_lin)
 library(plotly)
 library(ggridges)
 
-fooNames = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
-names(fooNames) = c('lineage1', 'lineage2', 'lineage3', 'lineage4')
+my_labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
+names(my_labels) = c('lineage1', 'lineage2', 'lineage3', 'lineage4')
 
 g <- ggplot(df, aes(x = ratioPredAff)) + 
   geom_density(aes(fill = Lig_outcome)
                , alpha = 0.5) + 
   facet_wrap( ~ lineage
              , scales = "free"
-             , labeller = labeller(lineage = fooNames) ) +
+             , labeller = labeller(lineage = my_labels) ) +
   coord_cartesian(xlim = c(-1, 1)
 #                  , ylim = c(0, 6)
 #                  , clip = "off"
@@ -153,15 +176,18 @@ ggplotly(g)
 my_ats = 15 # axis text size
 my_als = 20 # axis label size
 
-fooNames = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
-names(fooNames) = c('lineage1', 'lineage2', 'lineage3', 'lineage4')
+my_labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
+names(my_labels) = c('lineage1', 'lineage2', 'lineage3', 'lineage4')
 
 # set output dir for plots
 getwd()
 setwd("~/git/Data/pyrazinamide/output/plots")
 getwd()
 
-svg('lineage_dist_LIG.svg')
+# check plot name
+my_plot_name
+
+svg(my_plot_name)
 
 printFile = ggplot( df, aes(x = ratioPredAff
                           , y = Lig_outcome) ) +
@@ -172,7 +198,7 @@ printFile = ggplot( df, aes(x = ratioPredAff
   facet_wrap( ~lineage
               , scales = "free"
 #              , switch = 'x'
-              , labeller = labeller(lineage = fooNames) ) +
+              , labeller = labeller(lineage = my_labels) ) +
   coord_cartesian( xlim = c(-1, 1)
 #                  , ylim = c(0, 6)
 #                  , clip = "off"
@@ -202,12 +228,6 @@ printFile = ggplot( df, aes(x = ratioPredAff
 
 print(printFile)
 dev.off()
-
-#=!=!=!=!=!=!
-# COMMENT: When you look at all mutations, the lineage differences disappear...
-# The pattern we are interested in is possibly only for dr_mutations
-#=!=!=!=!=!=!
-
 #===================================================
 
 # COMPARING DISTRIBUTIONS

From 4d2d03f634a461ed1da08b653bf58f1a3c99fa21 Mon Sep 17 00:00:00 2001
From: Tanu <tanu@tunstall.in>
Date: Wed, 22 Jan 2020 15:09:21 +0000
Subject: [PATCH 5/7] added coloured axis barplots

---
 mcsm_analysis/pyrazinamide/scripts/.Rhistory  | 202 +++++++++++-
 .../plotting/barplots_subcolours_aa_PS.R      | 292 ++++++++++++++++++
 .../scripts/plotting/lineage_dist_PS.R        |   2 +-
 .../scripts/plotting/subcols_axis.R           | 205 ++++++++++++
 4 files changed, 697 insertions(+), 4 deletions(-)
 create mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_aa_PS.R
 create mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/subcols_axis.R

diff --git a/mcsm_analysis/pyrazinamide/scripts/.Rhistory b/mcsm_analysis/pyrazinamide/scripts/.Rhistory
index 0112d24..715e7a4 100644
--- a/mcsm_analysis/pyrazinamide/scripts/.Rhistory
+++ b/mcsm_analysis/pyrazinamide/scripts/.Rhistory
@@ -2,6 +2,202 @@ getwd()
 setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
 getwd()
 source("../Header_TT.R")
-source("../combining_two_df.R")
-source("../combining_two_df.R")
-source("../combining_two_df.R")
+source("../barplot_colour_function.R")
+############################################################
+# Output dir for plots
+############################################################
+out_dir = "~/git/Data/pyrazinamide/output/plots"
+source("subcols_axis.R")
+table(mut_pos_cols$lab_bg)
+#blue   cornflowerblue    green       purple    white  yellow
+#2        2                2            4       117     3
+sum( table(mut_pos_cols$lab_bg) ) == nrow(mut_pos_cols) # should  be True
+table(mut_pos_cols$lab_bg2)
+#green white
+#2   128
+sum( table(mut_pos_cols$lab_bg2) ) ==  nrow(mut_pos_cols) # should  be True
+table(mut_pos_cols$lab_fg)
+#black white
+#124     6
+sum( table(mut_pos_cols$lab_fg) ) ==  nrow(mut_pos_cols) # should  be True
+# very important!
+my_axis_colours = mut_pos_cols$lab_fg
+# now clear mut_pos_cols
+rm(mut_pos_cols)
+###########################
+# 2: Plot: DUET scores
+###########################
+#==========================
+# Plot 2: Barplot with scores (unordered)
+# corresponds to DUET_outcome
+# Stacked Barplot with colours: DUET_outcome @ position coloured by
+# stability scores. This is a barplot where each bar corresponds
+# to a SNP and is coloured by its corresponding DUET stability value.
+# Normalised values (range between -1 and 1 ) to aid visualisation
+# NOTE: since barplot plots discrete values, colour = score, so number of
+# colours will be equal to the no. of unique normalised scores
+# rather than a continuous scale
+# will require generating the colour scale separately.
+#============================
+# sanity checks
+upos = unique(my_df$Position)
+str(my_df$DUET_outcome)
+colnames(my_df)
+#===========================
+# Data preparation for plots
+#===========================
+#!!!!!!!!!!!!!!!!!
+# REASSIGNMENT
+df <- my_df
+#!!!!!!!!!!!!!!!!!
+rm(my_df)
+# sanity checks
+# should be a factor
+is.factor(df$DUET_outcome)
+#TRUE
+table(df$DUET_outcome)
+#Destabilizing   Stabilizing
+#288              47
+# should be -1 and 1
+min(df$ratioDUET)
+max(df$ratioDUET)
+# sanity checks
+# very important!!!!
+tapply(df$ratioDUET, df$DUET_outcome, min)
+#Destabilizing   Stabilizing
+#-1.0000000     0.01065719
+tapply(df$ratioDUET, df$DUET_outcome, max)
+#Destabilizing   Stabilizing
+#-0.003875969     1.0000000
+# check unique values in normalised data
+u = unique(df$ratioDUET) # 323
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Run this section if rounding is to be used
+# specify number for rounding
+n = 3
+df$ratioDUETR = round(df$ratioDUET, n)  # 335, 40
+u = unique(df$ratioDUETR) # 287
+# create an extra column called group which contains the "gp name and score"
+# so colours can be generated for each unique values in this column
+my_grp = df$ratioDUETR
+df$group <- paste0(df$DUET_outcome, "_", my_grp, sep = "")  # 335,41
+# Call the function to create the palette based on the group defined above
+colours <- ColourPalleteMulti(df, "DUET_outcome", "my_grp")
+my_title = "Protein stability (DUET)"
+library(ggplot2)
+# axis label size
+my_xaxls = 13
+my_yaxls = 15
+# axes text size
+my_xaxts = 15
+my_yaxts = 15
+# no ordering of x-axis according to frequency
+g = ggplot(df, aes(factor(Position, ordered = T)))
+g +
+geom_bar(aes(fill = group), colour = "grey") +
+scale_fill_manual( values = colours
+, guide = 'none') +
+theme( axis.text.x = element_text(size = my_xaxls
+, angle = 90
+, hjust = 1
+, vjust = 0.4)
+, axis.text.y = element_text(size = my_yaxls
+, angle = 0
+, hjust = 1
+, vjust = 0)
+, axis.title.x = element_text(size = my_xaxts)
+, axis.title.y = element_text(size = my_yaxts ) ) +
+labs(title = my_title
+, x = "Position"
+, y = "Frequency")
+class(df$lab_bg)
+# make this a named vector
+# define cartesian coord
+my_xlim = length(unique(df$Position)); my_xlim
+# axis label size
+my_xals = 15
+my_yals = 15
+# axes text size
+my_xats = 15
+my_yats = 18
+# using geom_tile
+g = ggplot(df, aes(factor(Position, ordered = T)))
+g +
+coord_cartesian(xlim = c(1, my_xlim)
+, ylim = c(0, 6)
+, clip = "off") +
+geom_bar(aes(fill = group), colour = "grey") +
+scale_fill_manual( values = colours
+, guide = 'none') +
+geom_tile(aes(,-0.8, width = 0.9, height = 0.85)
+, fill = df$lab_bg) +
+geom_tile(aes(,-1.2, width = 0.9, height = -0.2)
+, fill = df$lab_bg2) +
+# Here it's important to specify that your axis goes from 1 to max number of levels
+theme( axis.text.x = element_text(size = my_xats
+, angle = 90
+, hjust = 1
+, vjust = 0.4
+, colour = my_axis_colours)
+, axis.text.y = element_text(size = my_yats
+, angle = 0
+, hjust = 1
+, vjust = 0)
+, axis.title.x = element_text(size = my_xals)
+, axis.title.y = element_text(size = my_yals )
+, axis.ticks.x = element_blank()
+) +
+labs(title = my_title
+, x = "Position"
+, y = "Frequency")
+class(df$lab_bg)
+# make this a named vector
+# define cartesian coord
+my_xlim = length(unique(df$Position)); my_xlim
+# axis label size
+my_xals = 18
+my_yals = 18
+# axes text size
+my_xats = 14
+my_yats = 18
+my_plot_name = "barplot_PS_acoloured.svg"
+out_file = paste0(out_dir, "/", my_plot_name); outfile
+svg(outfile, width = 26, height = 4)
+svg(out_file, width = 26, height = 4)
+# using geom_tile
+g = ggplot(df, aes(factor(Position, ordered = T)))
+outFile = g +
+coord_cartesian(xlim = c(1, my_xlim)
+, ylim = c(0, 6)
+, clip = "off"
+) +
+geom_bar(aes(fill = group), colour = "grey") +
+scale_fill_manual( values = colours
+, guide = 'none') +
+#  geom_tile(aes(,-0.6, width = 0.9, height = 0.7)
+#            , fill = df$lab_bg) +
+#  geom_tile(aes(,-1, width = 0.9, height = 0.3)
+#            , fill = df$lab_bg2) +
+geom_tile(aes(,-0.8, width = 0.9, height = 0.85)
+, fill = df$lab_bg) +
+geom_tile(aes(,-1.2, width = 0.9, height = -0.2)
+, fill = df$lab_bg2) +
+# Here it's important to specify that your axis goes from 1 to max number of levels
+theme( axis.text.x = element_text(size = my_xats
+, angle = 90
+, hjust = 1
+, vjust = 0.4
+, colour = my_axis_colours)
+, axis.text.y = element_text(size = my_yats
+, angle = 0
+, hjust = 1
+, vjust = 0)
+, axis.title.x = element_text(size = my_xals)
+, axis.title.y = element_text(size = my_yals )
+, axis.ticks.x = element_blank()
+) +
+labs(title = ""
+, x = "Position"
+, y = "Frequency")
+print(outFile)
+dev.off()
diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_aa_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_aa_PS.R
new file mode 100644
index 0000000..63be70a
--- /dev/null
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_aa_PS.R
@@ -0,0 +1,292 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
+getwd()
+
+############################################################
+# 1: Installing and loading required packages and functions
+############################################################
+
+source("../Header_TT.R")
+source("../barplot_colour_function.R")
+
+############################################################
+# Output dir for plots
+############################################################
+out_dir = "~/git/Data/pyrazinamide/output/plots"
+
+############################################################
+# 2: call script the prepares the data with columns containing
+# colours for axis labels
+############################################################
+
+source("subcols_axis.R")
+
+# this should return
+#mut_pos_cols: 130, 4
+#my_df: 335, 39
+
+# clear excess variable
+# "mut_pos_cols" is just for inspection in case you need to cross check
+# position numbers and colours
+# open file from deskptop ("sample_axis_cols") for cross checking
+
+table(mut_pos_cols$lab_bg)
+
+sum( table(mut_pos_cols$lab_bg) ) == nrow(mut_pos_cols) # should  be True
+     
+table(mut_pos_cols$lab_bg2)
+
+sum( table(mut_pos_cols$lab_bg2) ) ==  nrow(mut_pos_cols) # should  be True
+
+table(mut_pos_cols$lab_fg)
+
+sum( table(mut_pos_cols$lab_fg) ) ==  nrow(mut_pos_cols) # should  be True
+
+# very important!
+my_axis_colours = mut_pos_cols$lab_fg
+
+# now clear mut_pos_cols
+rm(mut_pos_cols) 
+
+###########################
+# 2: Plot: DUET scores
+###########################
+#==========================
+# Plot 2: Barplot with scores (unordered)
+# corresponds to DUET_outcome
+# Stacked Barplot with colours: DUET_outcome @ position coloured by 
+# stability scores. This is a barplot where each bar corresponds 
+# to a SNP and is coloured by its corresponding DUET stability value.
+# Normalised values (range between -1 and 1 ) to aid visualisation
+# NOTE: since barplot plots discrete values, colour = score, so number of
+# colours will be equal to the no. of unique normalised scores 
+# rather than a continuous scale
+# will require generating the colour scale separately.
+#============================
+# sanity checks
+upos = unique(my_df$Position)
+
+str(my_df$DUET_outcome)
+
+colnames(my_df)
+
+#=========================== 
+# Data preparation for plots
+#===========================
+#!!!!!!!!!!!!!!!!!
+# REASSIGNMENT
+df <- my_df
+#!!!!!!!!!!!!!!!!!
+
+rm(my_df)
+
+# sanity checks
+# should be a factor
+is.factor(df$DUET_outcome)
+#TRUE
+
+table(df$DUET_outcome)
+
+# should be -1 and 1
+min(df$ratioDUET)
+max(df$ratioDUET)
+
+# sanity checks
+# very important!!!!
+tapply(df$ratioDUET, df$DUET_outcome, min)
+
+tapply(df$ratioDUET, df$DUET_outcome, max)
+
+# My colour FUNCTION: based on group and subgroup
+# in my case;
+# df = df
+# group = DUET_outcome
+# subgroup = normalised score i.e ratioDUET
+
+# Prepare data: round off ratioDUET scores
+# round off to 3 significant digits:
+# 323 if no rounding is performed: used to generate the original graph
+# 287 if rounded to 3 places
+# FIXME: check if reducing precicion creates any ML prob
+
+# check unique values in normalised data
+u = unique(df$ratioDUET) 
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Run this section if rounding is to be used
+# specify number for rounding
+n = 3 
+df$ratioDUETR = round(df$ratioDUET, n)  
+u = unique(df$ratioDUETR)
+
+# create an extra column called group which contains the "gp name and score" 
+# so colours can be generated for each unique values in this column
+my_grp = df$ratioDUETR
+df$group <- paste0(df$DUET_outcome, "_", my_grp, sep = "") 
+
+# ELSE
+# uncomment the below if rounding is not required
+
+#my_grp = df$ratioDUET
+#df$group <- paste0(df$DUET_outcome, "_", my_grp, sep = "")  
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+#******************
+# generate plot
+#******************
+
+# Call the function to create the palette based on the group defined above
+colours <- ColourPalleteMulti(df, "DUET_outcome", "my_grp")
+my_title = "Protein stability (DUET)"
+library(ggplot2)
+
+# axis label size
+my_xaxls = 13
+my_yaxls = 15
+
+# axes text size
+my_xaxts = 15
+my_yaxts = 15
+
+# no ordering of x-axis according to frequency
+g = ggplot(df, aes(factor(Position, ordered = T)))
+g + 
+  geom_bar(aes(fill = group), colour = "grey") +
+  scale_fill_manual( values = colours
+                     , guide = 'none') +
+  theme( axis.text.x = element_text(size = my_xaxls
+                                    , angle = 90
+                                    , hjust = 1
+                                    , vjust = 0.4)
+         , axis.text.y = element_text(size = my_yaxls 
+                                      , angle = 0
+                                      , hjust = 1
+                                      , vjust = 0)
+         , axis.title.x = element_text(size = my_xaxts)
+         , axis.title.y = element_text(size = my_yaxts ) ) +
+  labs(title = my_title
+       , x = "Position"
+       , y = "Frequency")
+
+#========================
+# plot with axis colours
+#========================
+class(df$lab_bg)
+# make this a named vector
+
+# define cartesian coord
+my_xlim = length(unique(df$Position)); my_xlim
+
+# axis label size
+my_xals = 15
+my_yals = 15
+
+# axes text size
+my_xats = 15
+my_yats = 18
+
+# using geom_tile
+g = ggplot(df, aes(factor(Position, ordered = T)))
+g + 
+  coord_cartesian(xlim = c(1, my_xlim)
+                  , ylim = c(0, 6)
+                  , clip = "off") +
+  
+  geom_bar(aes(fill = group), colour = "grey") +
+  scale_fill_manual( values = colours
+                     , guide = 'none') +
+  geom_tile(aes(,-0.8, width = 0.9, height = 0.85)
+            , fill = df$lab_bg) +
+  geom_tile(aes(,-1.2, width = 0.9, height = -0.2)
+            , fill = df$lab_bg2) +
+  
+  # Here it's important to specify that your axis goes from 1 to max number of levels
+  theme( axis.text.x = element_text(size = my_xats
+                                    , angle = 90
+                                    , hjust = 1
+                                    , vjust = 0.4
+                                    , colour = my_axis_colours)
+         , axis.text.y = element_text(size = my_yats 
+                                      , angle = 0
+                                      , hjust = 1
+                                      , vjust = 0)
+         , axis.title.x = element_text(size = my_xals)
+         , axis.title.y = element_text(size = my_yals ) 
+         , axis.ticks.x = element_blank()
+  ) +
+  labs(title = my_title
+       , x = "Position"
+       , y = "Frequency")
+
+#========================
+# output plot as svg/png
+#========================
+class(df$lab_bg)
+# make this a named vector
+
+# define cartesian coord
+my_xlim = length(unique(df$Position)); my_xlim
+
+# axis label size
+my_xals = 18
+my_yals = 18
+
+# axes text size
+my_xats = 14
+my_yats = 18
+
+# set output dir for plots
+#getwd()
+#setwd("~/git/Data/pyrazinamide/output/plots") 
+#getwd()
+
+plot_name = "barplot_PS_acoloured.svg"
+my_plot_name = paste0(out_dir, "/", plot_name); my_plot_name
+
+svg(my_plot_name, width = 26, height = 4)
+
+g = ggplot(df, aes(factor(Position, ordered = T)))
+
+outFile = g + 
+  coord_cartesian(xlim = c(1, my_xlim)
+                  , ylim = c(0, 6)
+                  , clip = "off"
+  ) +
+  
+  geom_bar(aes(fill = group), colour = "grey") +
+  scale_fill_manual( values = colours
+                     , guide = 'none') +
+#  geom_tile(aes(,-0.6, width = 0.9, height = 0.7)
+#            , fill = df$lab_bg) +
+#  geom_tile(aes(,-1, width = 0.9, height = 0.3)
+#            , fill = df$lab_bg2) +
+  geom_tile(aes(,-0.8, width = 0.9, height = 0.85)
+            , fill = df$lab_bg) +
+  geom_tile(aes(,-1.2, width = 0.9, height = -0.2)
+            , fill = df$lab_bg2) +
+  
+# Here it's important to specify that your axis goes from 1 to max number of levels
+  theme( axis.text.x = element_text(size = my_xats
+                                    , angle = 90
+                                    , hjust = 1
+                                    , vjust = 0.4
+                                    , colour = my_axis_colours)
+         , axis.text.y = element_text(size = my_yats 
+                                      , angle = 0
+                                      , hjust = 1
+                                      , vjust = 0)
+         , axis.title.x = element_text(size = my_xals)
+         , axis.title.y = element_text(size = my_yals ) 
+         , axis.ticks.x = element_blank()
+  ) +
+  labs(title = ""
+       , x = "Position"
+       , y = "Frequency")
+
+
+print(outFile)
+dev.off()
+
+# for sanity and good practice
+#rm(df)
diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R
index 4c54681..453f0d3 100644
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R
@@ -7,7 +7,7 @@ getwd()
 ########################################################################
 
 source("../Header_TT.R")
-#source("barplot_colour_function.R")
+#source("../barplot_colour_function.R")
 #require(data.table)
 
 ########################################################################
diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/subcols_axis.R b/mcsm_analysis/pyrazinamide/scripts/plotting/subcols_axis.R
new file mode 100644
index 0000000..c7fc436
--- /dev/null
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/subcols_axis.R
@@ -0,0 +1,205 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
+getwd()
+
+############################################################
+# 1: Installing and loading required packages and functions
+############################################################
+
+#source("../Header_TT.R")
+#source("../barplot_colour_function.R")
+#library(tidyverse)
+
+###########################
+#2: Read file: normalised file, output of step 4 mcsm pipeline
+###########################
+#my_df <- read.csv("../../Data/mcsm_complex1_normalised.csv"
+#                  , row.names = 1
+#                  , stringsAsFactors = F
+#                  , header = T)
+
+# call script combining_df
+source("../combining_two_df.R")
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#  from Plotting to Scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA for pyrazinamide:
+#merged_df2 
+#merged_df2_comp 
+
+# df without NA for pyrazinamide:
+#merged_df3 
+#merged_df3_comp 
+#==========================
+###########################
+# Data to choose: 
+# We will be using the small dfs
+# to generate the coloured axis
+###########################
+
+# uncomment as necessary
+#!!!!!!!!!!!!!!!!!!!!!!!
+# REASSIGNMENT
+my_df  = merged_df3
+#my_df  = merged_df3_comp
+#!!!!!!!!!!!!!!!!!!!!!!!
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+str(my_df)
+my_df$Position
+c1 = my_df[my_df$Mutationinformation == "L4S",]
+
+# order my_df by Position
+my_df_o = my_df[order(my_df$Position),]
+head(my_df_o$Position); tail(my_df_o$Position)
+
+c2 = my_df_o[my_df_o$Mutationinformation == "L4S",]
+
+# sanity check
+if (sum(table(c1 == c2)) == ncol(my_df)){
+  print ("Sanity check passsd")
+}else{
+  print ("Error!: Please debug your code")
+}
+
+rm(my_df, c1, c2)
+
+# create a new df with unique position numbers and cols
+Position = unique(my_df_o$Position) #130
+Position_cols = as.data.frame(Position)
+
+head(Position_cols) ; tail(Position_cols)
+
+# specify active site residues and bg colour
+Position = c(49, 51, 57, 71
+             , 8, 96, 138
+             , 13, 68
+             , 103, 137
+             , 133, 134) #13
+
+lab_bg = rep(c("purple" 
+               , "yellow"
+               , "cornflowerblue"
+               , "blue"
+               , "green"), times = c(4, 3, 2, 2, 2)
+)
+
+# second bg colour for active site residues
+#lab_bg2 = rep(c("white" 
+#                , "green" , "white", "green"
+#                , "white"
+#                , "white"
+#                , "white"), times = c(4
+#                                      , 1, 1, 1
+#                                      , 2
+#                                      , 2
+#                                      , 2)
+#)
+
+# revised: leave the second box coloured as the first one incase there is no second colour
+lab_bg2 = rep(c("purple" 
+                , "green" , "yellow", "green"
+                , "cornflowerblue"
+                , "blue"
+                , "green"), times = c(4
+                                      , 1, 1, 1
+                                      , 2
+                                      , 2
+                                      , 2)
+)
+
+# fg colour for labels for active site residues
+#lab_fg = rep(c("white" 
+#               , "black"
+#               , "black"
+#               , "white"
+#               , "black"), times = c(4, 3, 2, 2, 2))
+
+# revised: make the purple ones black
+# fg colour for labels for active site residues
+lab_fg = rep(c("black" 
+               , "black"
+               , "black"
+               , "white"
+               , "black"), times = c(4, 3, 2, 2, 2))               
+               
+# combined df with active sites, bg and fg colours
+aa_cols_ref = data.frame(Position
+                         , lab_bg
+                         , lab_bg2
+                         , lab_fg
+                         , stringsAsFactors = F) #13, 4
+
+str(Position_cols); class(Position_cols)
+str(aa_cols_ref); class(aa_cols_ref)
+
+# since Position is int and numeric in the two dfs resp, 
+# converting numeric to int for consistency
+aa_cols_ref$Position = as.integer(aa_cols_ref$Position)
+class(aa_cols_ref$Position)
+
+#===========
+# Merge 1: merging Positions df (Position_cols) and
+# active site cols (aa_cols_ref)
+# linking column: "Position"
+# This is so you can have colours defined for all 130 positions
+#===========
+head(Position_cols$Position); head(aa_cols_ref$Position)
+
+mut_pos_cols = merge(Position_cols, aa_cols_ref
+            , by = "Position"
+            , all.x = TRUE) 
+
+head(mut_pos_cols)
+# replace NA's 
+# :column "lab_bg" with "white"
+# : column "lab_fg" with "black"
+mut_pos_cols$lab_bg[is.na(mut_pos_cols$lab_bg)] <- "white"
+mut_pos_cols$lab_bg2[is.na(mut_pos_cols$lab_bg2)] <- "white"
+mut_pos_cols$lab_fg[is.na(mut_pos_cols$lab_fg)] <- "black"
+head(mut_pos_cols)
+
+#===========
+# Merge 2: Merge mut_pos_cols with mcsm df
+# Now combined the 130 positions with aa colours with 
+# the mcsm_data
+#===========
+# dfs to merge
+df0 = my_df_o
+df1 = mut_pos_cols
+
+# check the column on which merge will be performed
+head(df0$Position); tail(df0$Position)
+head(df1$Position); tail(df1$Position)
+
+# should now have 3 extra columns
+my_df = merge(df0, df1
+              , by = "Position"
+              , all.x = TRUE)
+
+# sanity check
+my_df[my_df$Position == "49",]
+my_df[my_df$Position == "13",]
+
+my_df$Position
+
+# clear variables
+rm(aa_cols_ref
+   , df0
+   , df1
+   , my_df_o
+   , Position_cols
+   , lab_bg
+   , lab_bg2
+   , lab_fg
+   , Position
+   )
+

From c3c50f65f227aa58c1b1dabceb1c69b3e535f0da Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Thu, 23 Jan 2020 09:29:20 +0000
Subject: [PATCH 6/7] saving previous work from home pc

---
 .../__pycache__/reference_dict.cpython-37.pyc | Bin 2203 -> 2216 bytes
 meta_data_analysis/pnca_data_extraction.py    | 227 ++++++++++--------
 meta_data_analysis/reference_dict.py          |  25 +-
 3 files changed, 146 insertions(+), 106 deletions(-)

diff --git a/meta_data_analysis/__pycache__/reference_dict.cpython-37.pyc b/meta_data_analysis/__pycache__/reference_dict.cpython-37.pyc
index 1db3c8321b73129609403f35aa5ecc9a7fcd9421..3c936f99b195bc3763249c3d77aedeb5bb30dba3 100644
GIT binary patch
delta 113
zcmbO&xI&QEiI<m)0SIO<myJ{2$eY0=uBxA&S)%WfSdyq;P+62%m6?~Ao0*cjxt^(-
zneo=-->eIiqIeS%;|q%N3*yTYOHzxX1Yq2P{G7z1%#zBe$$Qz-fQnVv=d&3BO)oN<
Ie3JbV00TEBpa1{>

delta 100
zcmZ1>I9rg{iI<m)0SK7Zio}U-<jr8>lF(1jEYWvKEJ@tl!&J@87&V!LZGqA)-o(WC
vf};F_`0~V()S_DgFm6G9PGV7JN#(7{r`XbfiVfN4vl#;oEHa*ajr|e;wYDH+

diff --git a/meta_data_analysis/pnca_data_extraction.py b/meta_data_analysis/pnca_data_extraction.py
index 0eb5aab..e05c39d 100755
--- a/meta_data_analysis/pnca_data_extraction.py
+++ b/meta_data_analysis/pnca_data_extraction.py
@@ -23,11 +23,7 @@ import pandas as pd
 #my_dir = os.path.expanduser('~/some_dir')
 #make sure mcsm_analysis/ exists
 #or specify the output directory
-
-#%%
-#%%
-#%%
-#========================================================
+#====================================================
 # TASK: extract ALL pncA mutations from GWAS data
 #========================================================
 #%%
@@ -40,83 +36,45 @@ os.getcwd()
 #%%
 from reference_dict import my_aa_dict #CHECK DIR STRUC THERE!
 #%%
-#NOTE: Out_dir MUST exis
-# User defined dir strpyrazinamide
-drug = 'pyrazinamide'
-gene = 'pnca'
-out_dir = homedir + '/git/LSHTM_analysis/mcsm_analysis/'
-# = out_dir + drug
-data_dir = homedir + '/git/Data'
+############# specify variables for input and output paths and filenames
+drug = "pyrazinamide"
+gene = "pnca"
 
-if not os.path.exists(data_dir):
-    print('Error!', data_dir, 'does not exist. Please ensure it exists and contains the appropriate raw data')
-    os.makedirs(data_dir)
+datadir = homedir + "/git/Data"
+basedir =  datadir + "/" + drug + "/input"
+
+# input
+inpath = "/original"
+in_filename  = "/original_tanushree_data_v2.csv"
+infile = basedir + inpath + in_filename
+#print(infile)
+
+# output: several output files
+# output filenames in respective sections at the time of outputting files
+outpath = "/processed"
+outdir = basedir + outpath
+#print(outdir)
+
+if not os.path.exists(datadir):
+    print('Error!', datadir, 'does not exist. Please ensure it exists. Dir struc specified in README.md')
+    os.makedirs(datadir)
     die()
 
-if not os.path.exists(out_dir):
-    print('Error!', out_dir, 'does not exist. Please create it')
+if not os.path.exists(outdir):
+    print('Error!', outdir, 'does not exist.Please ensure it exists. Dir struc specified in README.md')
     exit()
     
-#if not os.path.exists(work_dir):
-#    print('creating dir that does not exist', 'dir_name:', work_dir)
-#    os.makedirs(work_dir)
 else:
     print('Dir exists: Carrying on')
 
-# now create sub dir structure within work_dir
-# pyrazinamide/mcsm_analysis
-
-# we need three dir
-# Data
-# Scripts
-    # Plotting
-# Results
-    # Plots
-    
-# create a list of dir names
-#dir_names = ['Data', 'Scripts', 'Results']
-
-
-#for i in dir_names:
-#    this_dir = (work_dir + '/' + i)
-#    if not os.path.exists(this_dir):
-#        print('creating dir that does not exist:', this_dir)
-#        os.makedirs(this_dir)
-#else:
-#    print('Dir exists: Carrying on')
-      
-# Create sub dirs
-# 1)        
-# Scripts
-    # Plotting
-#subdir_plotting = work_dir + '/Scripts/Plotting'
-#if not os.path.exists(subdir_plotting):
-#      print('creating dir that does not exist:', subdir_plotting)
-#      os.makedirs(subdir_plotting)
-#else:
-#    print('Dir exists: Carrying on')
- 
-# 2)    
-# Results
-    # Plots
-#subdir_plots = work_dir + '/Results/Plots'        
-#if not os.path.exists(subdir_plots):
-#        print('creating dir that does not exist:', subdir_plots)
-#        os.makedirs(subdir_plots)    
-#else:
-#    print('Dir exists: Carrying on')
-         
-# clear varaibles
-#del(dir_names, drug, i, subdir_plots, subdir_plotting)
-
-#exit()
+################## end of variable assignment for input and output files
 #%%
 #==============================================================================
 ############
 # STEP 1: Read file original_tanushree_data_v2.csv
 ############
-data_file = data_dir + '/input/original/original_tanushree_data_v2.csv'
-meta_data = pd.read_csv(data_file, sep = ',') 
+#data_file = data_dir + '/input/original/original_tanushree_data_v2.csv'
+meta_data = pd.read_csv(infile, sep = ',')  
 
 # column names
 list(meta_data.columns)
@@ -130,7 +88,7 @@ meta_data = meta_data[['id'
        , 'pyrazinamide'
        , 'dr_mutations_pyrazinamide'
        , 'other_mutations_pyrazinamide'
-        ]] 
+        ]] #19265, 67
 
 # checks
 total_samples = meta_data['id'].nunique() # 19265
@@ -143,7 +101,12 @@ meta_data.head()
 
 # equivalent of table in R
 # pyrazinamide counts
-meta_data.pyrazinamide.value_counts() 
+meta_data.pyrazinamide.value_counts() #12511
+#0.0    10565
+#1.0     1946 {RESULT: No. of Resistant and Susceptible samples}
+
+clear variables
+#del(basedir, datadir, inpath, infile)
 
 #%%
 ############
@@ -154,8 +117,17 @@ meta_data.pyrazinamide.value_counts()
 # (corresponding to a structure)
 # and drop the entries with NA
 #############
-meta_pza = meta_data.loc[meta_data.dr_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
-meta_pza = meta_data.loc[meta_data.other_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
+meta_pza = meta_data.loc[meta_data.dr_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)] 
+#2163 {RESULT: samples with dr_muts}
+dr_id = meta_pza['id'].unique()
+
+meta_pza = meta_data.loc[meta_data.other_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)] 
+#526 (RESULT: samples with other_muts)
+other_id = meta_pza['id'].unique()
+
+# FIXME: See if the sample ids are unique in each
+# find any common IDs
+dr_id.isin(other_id[1,1])
 
 del(meta_pza)
 
@@ -188,8 +160,9 @@ del(meta_pnca_other)
 
 # Now extract "all" mutations
 meta_pnca_all = meta_data_pnca.loc[meta_data_pnca.dr_mutations_pyrazinamide.str.contains('pncA_p.*') | meta_data_pnca.other_mutations_pyrazinamide.str.contains('pncA_p.*') ]
+#2665, 8
 
-meta_pnca_all['id'].nunique() 
+meta_pnca_all['id'].nunique() {#RESULT: pnca mutations in ALL samples}
 pnca_samples = len(meta_pnca_all)
 pnca_na = meta_pnca_all['pyrazinamide'].isna().sum() 
 comp_pnca_samples = pnca_samples - pnca_na 
@@ -468,8 +441,11 @@ meta_pnca_LF1['Mutationinformation'] = meta_pnca_LF1['wild_type'] + meta_pnca_LF
 #=========
 # Step 12a: all SNPs to run mCSM
 #=========
-snps_only = pd.DataFrame(meta_pnca_LF1['Mutationinformation'].unique()) 
-pos_only = pd.DataFrame(meta_pnca_LF1['position'].unique()) 
+snps_only = pd.DataFrame(meta_pnca_LF1['Mutationinformation'].unique()) #336
+pos_only = pd.DataFrame(meta_pnca_LF1['position'].unique()) #131
+
+# since 186 is not part of struc: it is one less
+#FIXME!
 
 # assign meaningful colnames 
 #snps_only.rename({0 : 'all_pnca_snps'}, axis = 1, inplace = True)
@@ -480,24 +456,27 @@ snps_only.isna().sum() # should be 0
 # specify variable name for output file
 gene = 'pnca'
 #drug = 'pyrazinamide'
-my_fname1 = '_snps_'
+my_fname1 = '_snps'
 nrows = len(snps_only) 
 
 #output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/'
 #output_file_path = work_dir + '/Data/'
-output_file_path = data_dir + '/input/processed/' + drug + '/'
+#output_file_path = data_dir + '/input/processed/' + drug + '/'
+print(outdir)
 
-if not os.path.exists(output_file_path):
-    print( output_file_path, 'does not exist. Creating')
-    os.makedirs(output_file_path)
+if not os.path.exists(outdir):
+    print( outdir, 'does not exist. Creating')
+    os.makedirs(outdir)
     exit()
 
-output_filename = output_file_path + gene + my_fname1 + str(nrows) + '.csv'
-print(output_filename) #<<<- check
+#out_filename = gene + my_fname1 + str(nrows) + '.csv'
+out_filename = '/' + gene + my_fname1 + '.csv'
+outfile = outdir + out_filename
+print(outfile) #<<<- check
 
 # write to csv: without column or row names
 # Bad practice: numbers at the start of a filename
-snps_only.to_csv(output_filename, header = False, index = False)
+snps_only.to_csv(out_filename, header = False, index = False)
 
 #=========
 # Step 12b: all snps with annotation
@@ -519,12 +498,23 @@ snps_only.to_csv(output_filename, header = False, index = False)
 #my_fname2 = '_snps_with_metadata_'
 #nrows = len(pnca_snps_ALL) 
 
+#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/'
 #output_file_path = work_dir + '/Data/'
-#output_filename = output_file_path + gene + my_fname2 + str(nrows) + '.csv'
-#print(output_filename)  #<<<- check
+#output_file_path = data_dir + '/input/processed/' + drug + '/'
+print(outdir)
+
+if not os.path.exists(outdir):
+    print( outdir, 'does not exist. Creating')
+    os.makedirs(outdir)
+    exit()
+
+#out_filename = gene + my_fname2 + str(nrows) + '.csv'
+out_filename = '/' + gene + my_fname1 + '.csv'
+outfile = outdir + out_filename
+print(outfile) #<<<- check
 
 # write out file
-#pnca_snps_ALL.to_csv(output_filename, header = True, index = False)
+#pnca_snps_ALL.to_csv(outfile, header = True, index = False)
 
 #=========
 # Step 12c: comp snps for OR calcs with annotation
@@ -547,7 +537,7 @@ meta_pnca_LF2['mutation'].nunique()
 meta_pnca_LF2.groupby('mutation_info').nunique() 
 
 # sanity check
-meta_pnca_LF2['id'].nunique() 
+meta_pnca_LF2['id'].nunique()  #1908
 
 # should be True
 if meta_pnca_LF2['id'].nunique() == comp_pnca_samples:
@@ -569,15 +559,26 @@ len(pnca_snps_COMP)
 
 gene = 'pnca'
 #drug = 'pyrazinamide'
-my_fname3 = '_comp_snps_with_metadata_'
+my_fname3 = '_comp_snps_with_metadata'
 nrows = len(pnca_snps_COMP) 
 
-#output_filename = output_file_path + gene + my_fname3 + str(nrows) + '.csv'
-#print(output_filename) #<<<-check
+#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/'
+#output_file_path = work_dir + '/Data/'
+#output_file_path = data_dir + '/input/processed/' + drug + '/'
+print(outdir)
+
+if not os.path.exists(outdir):
+    print( outdir, 'does not exist. Creating')
+    os.makedirs(outdir)
+    exit()
+
+#out_filename = gene + my_fname3 + str(nrows) + '.csv'
+#out_filename = '/' + gene + my_fname3 + '.csv'
+#outfile = outdir + out_filename
+#print(outfile) #<<<- check
 
 # write out file
-#pnca_snps_COMP.to_csv(output_filename, header = True, index = False)
-
+#pnca_snps_COMP.to_csv(outfile, header = True, index = False)
 
 #=========
 # Step 12d: comp snps only
@@ -589,15 +590,26 @@ snps_only = pd.DataFrame(meta_pnca_LF2['Mutationinformation'].unique())
 
 gene = 'pnca'
 #drug = 'pyrazinamide'
-my_fname1 = '_comp_snps_'
+my_fname1 = '_comp_snps'
 nrows = len(snps_only) 
 
-output_filename = output_file_path + gene + my_fname1  + str(nrows) + '.csv'
-print(output_filename) #<<<- check
+#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/'
+#output_file_path = work_dir + '/Data/'
+#output_file_path = data_dir + '/input/processed/' + drug + '/'
+print(outdir)
+
+if not os.path.exists(outdir):
+    print( outdir, 'does not exist. Creating')
+    os.makedirs(outdir)
+    exit()
+
+#out_filename = gene + my_fname1 + str(nrows) + '.csv'
+out_filename = '/' + gene + my_fname1 + '.csv'
+outfile = outdir + out_filename
+print(outfile) #<<<- check
 
 # write to csv: without column or row names
-snps_only.to_csv(output_filename, header = False, index = False)
-
+snps_only.to_csv(outfile, header = False, index = False)
 
 #=#=#=#=#=#=#=#
 # COMMENT: LF1 is the file to extract all unique snps for mcsm 
@@ -619,8 +631,21 @@ gene = 'pnca'
 #drug = 'pyrazinamide'
 my_fname4 = '_metadata'
 #nrows = len(meta_pnca_LF1)
-output_filename = output_file_path + gene + my_fname4  + '.csv'
-print(output_filename) #<<<-check
+
+#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/'
+#output_file_path = work_dir + '/Data/'
+#output_file_path = data_dir + '/input/processed/' + drug + '/'
+print(outdir)
+
+if not os.path.exists(outdir):
+    print( outdir, 'does not exist. Creating')
+    os.makedirs(outdir)
+    exit()
+
+#out_filename = gene + my_fname1 + str(nrows) + '.csv'
+out_filename = '/' + gene + my_fname4 + '.csv'
+outfile = outdir + out_filename
+print(outfile) #<<<- check
 
 # write out file
-meta_pnca_LF1.to_csv(output_filename) 
+meta_pnca_LF1.to_csv(outfile) 
diff --git a/meta_data_analysis/reference_dict.py b/meta_data_analysis/reference_dict.py
index bf6561b..e83ce2c 100755
--- a/meta_data_analysis/reference_dict.py
+++ b/meta_data_analysis/reference_dict.py
@@ -6,7 +6,7 @@ Created on Tue Jun 18 11:32:28 2019
 @author: tanushree
 """
 ############################################
-#load libraries
+# load libraries
 import pandas as pd
 import os
 #############################################
@@ -17,14 +17,29 @@ import os
 # containing GWAS data
 #!#########################!
 
-print(os.getcwd()) 
-homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
-os.chdir(homedir + '/git/Data/input/original') 
+#print(os.getcwd()) 
+#homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
+#os.chdir(homedir + '/git/Data/pyrazinamide/input/original') 
 print(os.getcwd())
+
+#%%
+############# specify variables for input and output paths and filenames
+drug = 'pyrazinamide'
+#gene = 'pnca'
+
+datadir = homedir + '/git/Data'
+basedir =  datadir + '/' + drug + '/input'
+
+# input
+inpath = "/original"
+in_filename  = "/aa_codes.csv"
+infile = basedir + inpath + in_filename
+print(infile)
+
 #==========
 #read file
 #==========
-my_aa = pd.read_csv('aa_codes.csv') #20, 6
+my_aa = pd.read_csv(infile) #20, 6
 #assign the one_letter code as the row names so that it is easier to create a dict of dicts using index
 #my_aa = pd.read_csv('aa_codes.csv', index_col = 0) #20, 6  #a way to it since it is the first column
 my_aa = my_aa.set_index('three_letter_code_lower') #20, 5

From 15391a57001cc66e34be42dce7e62d9b3a9b3459 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Tue, 28 Jan 2020 10:10:16 +0000
Subject: [PATCH 7/7] saving data_extraction from home

---
 meta_data_analysis/pnca_data_extraction.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/meta_data_analysis/pnca_data_extraction.py b/meta_data_analysis/pnca_data_extraction.py
index e05c39d..2a030f9 100755
--- a/meta_data_analysis/pnca_data_extraction.py
+++ b/meta_data_analysis/pnca_data_extraction.py
@@ -88,10 +88,10 @@ meta_data = meta_data[['id'
        , 'pyrazinamide'
        , 'dr_mutations_pyrazinamide'
        , 'other_mutations_pyrazinamide'
-        ]] #19265, 67
+        ]] 
 
 # checks
-total_samples = meta_data['id'].nunique() # 19265
+total_samples = meta_data['id'].nunique() 
 
 # counts NA per column
 meta_data.isna().sum()
@@ -101,9 +101,7 @@ meta_data.head()
 
 # equivalent of table in R
 # pyrazinamide counts
-meta_data.pyrazinamide.value_counts() #12511
-#0.0    10565
-#1.0     1946 {RESULT: No. of Resistant and Susceptible samples}
+meta_data.pyrazinamide.value_counts() 
 
 clear variables
 #del(basedir, datadir, inpath, infile)
@@ -160,7 +158,7 @@ del(meta_pnca_other)
 
 # Now extract "all" mutations
 meta_pnca_all = meta_data_pnca.loc[meta_data_pnca.dr_mutations_pyrazinamide.str.contains('pncA_p.*') | meta_data_pnca.other_mutations_pyrazinamide.str.contains('pncA_p.*') ]
-#2665, 8
+
 
 meta_pnca_all['id'].nunique() {#RESULT: pnca mutations in ALL samples}
 pnca_samples = len(meta_pnca_all)