Merge branch 'master' of github.com:tgttunstall/LSHTM_analysis

2020-09-10 16:14:46 +01:00 · 2020-09-10 16:14:46 +01:00 · c9040cad21
commit c9040cad21
parent 2540424308 a04c3d6b0d
21 changed files with 2132 additions and 243 deletions
--- a/scripts/plotting/autoviz.py
+++ b/scripts/plotting/autoviz.py
@ -1,45 +0,0 @@
-#!/usr/bin/env python3
-#=======================================================================
-#%% useful links
-#https://towardsdatascience.com/autoviz-automatically-visualize-any-dataset-ba2691a8b55a
-#https://pypi.org/project/autoviz/
-#=======================================================================
-import os, sys
-import pandas as pd
-import numpy as np
-import re
-import argparse
-from autoviz.AutoViz_Class import AutoViz_Class
-
-homedir = os.path.expanduser('~')
-os.chdir(homedir + '/git/LSHTM_analysis/scripts')
-#%%============================================================================
-# variables
-gene = 'pncA'
-drug = 'pyrazinamide'
-
-#%%============================================================================
-#==============
-# directories
-#==============
-datadir = homedir + '/' + 'git/Data'
-    
-indir = datadir + '/' + drug + '/input'
-    
-outdir = datadir + '/' + drug + '/output'
-
-#=======
-# input
-#=======
-in_filename_plotting = 'car_design.csv'
-in_filename_plotting = gene.lower() + '_all_params.csv'
-infile_plotting = outdir + '/' + in_filename_plotting
-print('plotting file: ', infile_plotting
-      , '\n============================================================')
-#=======================================================================
-plotting_df = pd.read_csv(infile_plotting, sep = ',')
-#Instantiate the AutoViz class
-AV = AutoViz_Class()
-df = AV.AutoViz(infile_plotting)
-#df2 = AV.AutoViz(plotting_df)
-plotting_df.columns[~plotting_df.columns.isin(df.columns)]
--- a/scripts/plotting/barplots_subcolours_aa_PS.R
+++ b/scripts/plotting/barplots_subcolours_aa_PS.R
@ -1,5 +1,4 @@
-#!/usr/bin/env Rscript   
-getwd()
+#!/usr/bin/env Rscript   getwd()
 setwd("~/git/LSHTM_analysis/scripts/plotting")
 getwd()

@ -19,8 +18,7 @@ getwd()
 library(ggplot2)
 library(data.table)
 source("barplot_colour_function.R")
-#source("subcols_axis.R")
-source("subcols_axis_PS.R")
+source("subcols_axis.R")

 # should return the following dfs, directories and variables
 # mut_pos_cols
@ -161,9 +159,7 @@ min(df$duet_scaled)
 max(df$duet_scaled)

 # sanity checks
-# very important!!!!
 tapply(df$duet_scaled, df$duet_outcome, min)
-
 tapply(df$duet_scaled, df$duet_outcome, max)

 # My colour FUNCTION: based on group and subgroup
@ -241,7 +237,7 @@ outPlot = g +
         , axis.ticks.x = element_blank()) +
  labs(title = ""
       #title = my_title
-       , x = "position"
+       , x = "Position"
       , y = "Frequency")

 print(outPlot)
--- a/scripts/plotting/columns_all_params.csv
+++ b/scripts/plotting/columns_all_params.csv
@ -1,87 +0,0 @@
-,x,,changes,
-1,mutationinformation,,Mutationinformation,
-2,wild_type,,,consider...wild_aa
-3,position,,Position,
-4,mutant_type,,,consider...mutant_aa
-5,chain,,,
-6,ligand_id,,,
-7,ligand_distance,,,
-8,duet_stability_change,,,
-9,duet_outcome,,DUET_outcome,
-10,ligand_affinity_change,,,
-11,ligand_outcome,,Lig_outcome,
-12,duet_scaled,,ratioDUET,
-13,affinity_scaled,,ratioPredAff,
-14,wild_pos,,WildPos,
-15,wild_chain_pos,,,
-16,ddg,,,
-17,contacts,,,
-18,electro_rr,,,
-19,electro_mm,,,
-20,electro_sm,,,
-21,electro_ss,,,
-22,disulfide_rr,,,
-23,disulfide_mm,,,
-24,disulfide_sm,,,
-25,disulfide_ss,,,
-26,hbonds_rr,,,
-27,hbonds_mm,,,
-28,hbonds_sm,,,
-29,hbonds_ss,,,
-30,partcov_rr,,,
-31,partcov_mm,,,
-32,partcov_sm,,,
-33,partcov_ss,,,
-34,vdwclashes_rr,,,
-35,vdwclashes_mm,,,
-36,vdwclashes_sm,,,
-37,vdwclashes_ss,,,
-38,volumetric_rr,,,
-39,volumetric_mm,,,
-40,volumetric_sm,,,
-41,volumetric_ss,,,
-42,wild_type_dssp,,,
-43,asa,,,
-44,rsa,,,
-45,ss,,,
-46,ss_class,,,
-47,chain_id,,,
-48,wild_type_kd,,,
-49,kd_values,,,
-50,rd_values,,,
-51,wt_3letter_caps,,,
-52,mutation,,,
-53,af,,,
-54,beta_logistic,,,
-55,or_logistic,,,
-56,pval_logistic,,,
-57,se_logistic,,,
-58,zval_logistic,,,
-59,ci_low_logistic,,,
-60,ci_hi_logistic,,,
-61,or_mychisq,,,
-62,or_fisher,,,
-63,pval_fisher,,,
-64,ci_low_fisher,,,
-65,ci_hi_fisher,,,
-66,est_chisq,,,
-67,pval_chisq,,,
-68,chromosome_number,,,
-69,ref_allele,,,
-70,alt_allele,,,
-71,mut_type,,,
-72,gene_id,,,
-73,gene_number,,,
-74,mut_region,,,
-75,mut_info,,,
-76,chr_num_allele,,,
-77,wt_3let,,,
-78,mt_3let,,,
-79,af_kin,,,
-80,or_kin,,,
-81,pwald_kin,,,
-82,beta_kin,,,
-83,se_kin,,,
-84,logl_h1_kin,,,
-85,l_remle_kin,,,
-86,n_miss,,,
--- a/scripts/plotting/combining_dfs_plotting.R
+++ b/scripts/plotting/combining_dfs_plotting.R
@ -6,6 +6,7 @@
 # 2) <gene>_meta_data.csv

 # Output: 
+# 1) muts with opposite effects on stability
 # 2) large combined df including NAs for AF, OR,etc
 # 		Dim: same no. of rows as gene associated meta_data_with_AFandOR
 # 3) small combined df including NAs for AF, OR, etc.
@ -36,17 +37,23 @@ source("plotting_data.R")
 # dup_muts

 cat("Directories imported:"
+    , "\n===================="
    , "\ndatadir:", datadir
    , "\nindir:", indir
    , "\noutdir:", outdir
    , "\nplotdir:", plotdir)

 cat("Variables imported:"
+    , "\n====================="
    , "\ndrug:", drug
    , "\ngene:", gene
    , "\ngene_match:", gene_match
-    , "\nLength of upos:", length(upos)
-    , "\nAngstrom symbol:", angstroms_symbol)     
+    , "\nAngstrom symbol:", angstroms_symbol
+    , "\nNo. of duplicated muts:", dup_muts_nu
+    , "\ndr_muts_col:", dr_muts_col
+    , "\nother_muts_col:", other_muts_col
+    , "\ndrtype_col:", resistance_col)
+

 # clear excess variable
 rm(my_df, upos, dup_muts)
@ -57,7 +64,6 @@ rm(my_df, upos, dup_muts)
 #in_file1: output of plotting_data.R
 # my_df_u

-
 # infile 2: gene associated meta data
 #in_filename_gene_metadata = paste0(tolower(gene),  "_meta_data_with_AFandOR.csv")
 in_filename_gene_metadata = paste0(tolower(gene),  "_metadata.csv")
@ -94,6 +100,8 @@ gene_metadata <- read.csv(infile_gene_metadata
                      , header = T)
 cat("Dim:", dim(gene_metadata))

+table(gene_metadata$mutation_info)
+

 # counting NAs in AF, OR cols
 # or_mychisq
@ -124,6 +132,7 @@ if (identical(sum(is.na(my_df_u$or_kin))
      , "\nNA in AF:", sum(is.na(my_df_u$af_kin)))
 }

+str(gene_metadata)

 ###################################################################
 #                           combining: PS
@ -145,7 +154,7 @@ merging_cols = intersect(colnames(my_df_u), colnames(gene_metadata))

 cat(paste0("Merging dfs with NAs: big df (1-many relationship b/w id & mut)"
           , "\nNo. of merging cols:", length(merging_cols)
-           , "\nMerging columns identified:\n"))
+           , "\nMerging columns identified:"))
 print(merging_cols)

 # important checks!
@ -160,7 +169,7 @@ merged_df2 = merge(x = gene_metadata
                  , by = merging_cols
                  , all.y = T)

-cat("Dim of merged_df2: ", dim(merged_df2), "\n")
+cat("Dim of merged_df2: ", dim(merged_df2))
 head(merged_df2$position)

 # sanity check 
@ -170,10 +179,10 @@ if(nrow(gene_metadata) == nrow(merged_df2)){
      ,"\nExpected no. of rows: ",nrow(gene_metadata) 
      ,"\nGot no. of rows: ", nrow(merged_df2))
 } else{
-  cat("\nFAIL: nrow(merged_df2)!= nrow(gene associated gene_metadata)"
+  cat("FAIL: nrow(merged_df2)!= nrow(gene associated gene_metadata)"
      , "\nExpected no. of rows after merge: ", nrow(gene_metadata)
      , "\nGot no. of rows: ", nrow(merged_df2)
-      , "\nFinding discrepancy\n")
+      , "\nFinding discrepancy")
  merged_muts_u = unique(merged_df2$mutationinformation)
  meta_muts_u = unique(gene_metadata$mutationinformation)
    # find the index where it differs
@ -227,6 +236,7 @@ if (identical(sum(is.na(merged_df3$or_kin))
      , "\nNA in AF:", sum(is.na(merged_df3$af_kin)))
 }

+
 #=========================
 # Merge3: merged_df2_comp
 # same as merge 1 but excluding NAs from ORs, etc.
@ -237,6 +247,7 @@ cat("Merging dfs without any NAs: big df (1-many relationship b/w id & mut)"

 na_count_df2 = sum(is.na(merged_df2$af))
 merged_df2_comp = merged_df2[!is.na(merged_df2$af),] 
+
 # sanity check: no +-1 gymnastics
 cat("Checking nrows in merged_df2_comp")
 if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count_df2)){
@ -246,20 +257,22 @@ if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count_df2)){
      , "\nNo. of rows: ", nrow(merged_df2_comp)
      , "\nNo. of cols: ", ncol(merged_df2_comp))
 }else{
-    cat("FAIL: No. of rows mismatch"
-        ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2
-        ,"\nGot no. of rows: ", nrow(merged_df2_comp))
+  cat("FAIL: No. of rows mismatch"
+      ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2
+      ,"\nGot no. of rows: ", nrow(merged_df2_comp))
 }
+
 #=========================
 # Merge4: merged_df3_comp
 # same as merge 2 but excluding NAs from ORs, etc or 
 # remove duplicate mutation information
 #=========================
-
 na_count_df3 = sum(is.na(merged_df3$af))
 #merged_df3_comp = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),] # a way
+
 merged_df3_comp = merged_df3[!is.na(merged_df3$af),] # another way
-cat("\nChecking nrows in merged_df3_comp")
+cat("Checking nrows in merged_df3_comp")
+
 if(nrow(merged_df3_comp) == (nrow(merged_df3) - na_count_df3)){
  cat("\nPASS: No. of rows match"
      ,"\nDim of merged_df3_comp: "
@ -267,12 +280,11 @@ if(nrow(merged_df3_comp) == (nrow(merged_df3) - na_count_df3)){
      , "\nNo. of rows: ", nrow(merged_df3_comp)
      , "\nNo. of cols: ", ncol(merged_df3_comp))
 }else{
-  cat("\nFAIL: No. of rows mismatch"
+  cat("FAIL: No. of rows mismatch"
      ,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3
-      , "\nGot no. of rows: ", nrow(merged_df3_comp))
+      ,"\nGot no. of rows: ", nrow(merged_df3_comp))
 }

-  
 # alternate way of deriving merged_df3_comp
 foo = merged_df3[!is.na(merged_df3$af),]
 bar = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),]
@ -307,7 +319,8 @@ all.equal(foo, bar)
 # clear variables
 rm(foo, bar, gene_metadata
   , in_filename_params, infile_params, merging_cols
-   , in_filename_gene_metadata, infile_gene_metadata)
+   , in_filename_gene_metadata, infile_gene_metadata
+   , merged_df2v2, merged_df2v3)
 #*************************
 #####################################################################
 #                       Combining: LIG
@ -334,4 +347,4 @@ if (nrow(merged_df3_lig) == nrow(my_df_u_lig)){

 #==========================================================================
 # end of script
-##=========================================================================
+##==========================================================================
--- a/scripts/plotting/combining_two_df_FIXME.R
+++ b/scripts/plotting/combining_two_df_FIXME.R
--- a/scripts/plotting/corr_plots.R
+++ b/scripts/plotting/corr_plots.R
@ -0,0 +1,263 @@
+#!/usr/bin/env Rscript
+#########################################################
+# TASK: Corr plots for PS and Lig 
+
+# Output: 1 svg
+
+#=======================================================================
+# working dir and loading libraries
+getwd()
+setwd("~/git/LSHTM_analysis/scripts/plotting/")
+getwd()
+
+
+source("Header_TT.R")
+require(cowplot)
+source("combining_dfs_plotting.R")
+# should return the following dfs, directories and variables
+
+# PS combined: 
+# 1) merged_df2
+# 2) merged_df2_comp
+# 3) merged_df3
+# 4) merged_df3_comp
+
+# LIG combined: 
+# 5) merged_df2_lig
+# 6) merged_df2_comp_lig
+# 7) merged_df3_lig
+# 8) merged_df3_comp_lig
+
+# 9) my_df_u
+# 10) my_df_u_lig
+
+cat(paste0("Directories imported:"
+           , "\ndatadir:", datadir
+           , "\nindir:", indir
+           , "\noutdir:", outdir
+           , "\nplotdir:", plotdir))
+
+cat(paste0("Variables imported:"
+           , "\ndrug:", drug
+           , "\ngene:", gene
+           , "\ngene_match:", gene_match
+           , "\nAngstrom symbol:", angstroms_symbol
+           , "\nNo. of duplicated muts:", dup_muts_nu
+           , "\nNA count for ORs:", na_count
+           , "\nNA count in df2:", na_count_df2
+           , "\nNA count in df3:", na_count_df3))     
+
+#=======
+# output
+#=======
+# can't combine by cowplot because not ggplots
+#corr_plot_combined = "corr_combined.svg"
+#plot_corr_plot_combined  =  paste0(plotdir,"/", corr_plot_combined)
+
+# PS
+corr_ps = "corr_PS.svg"
+plot_corr_ps =  paste0(plotdir,"/", corr_ps)
+
+# LIG
+corr_lig = "corr_LIG.svg"
+plot_corr_lig =  paste0(plotdir,"/", corr_lig)
+
+####################################################################
+#               end of loading libraries and functions                 #
+########################################################################
+
+#%%%%%%%%%%%%%%%%%%%%%%%%%
+df_ps = merged_df3_comp 
+df_lig = merged_df3_comp_lig
+#%%%%%%%%%%%%%%%%%%%%%%%%%
+
+rm( merged_df2, merged_df2_comp, merged_df2_lig, merged_df2_comp_lig, my_df_u, my_df_u_lig)
+
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#===========================
+# Data for Correlation plots:PS
+#===========================
+table(df_ps$duet_outcome)
+
+# adding log cols 
+df_ps$log10_or_mychisq = log10(df_ps$or_mychisq)
+df_ps$neglog_pval_fisher = -log10(df_ps$pval_fisher)
+
+df_ps$log10_or_kin = log10(df_ps$or_kin)
+df_ps$neglog_pwald_kin = -log10(df_ps$pwald_kin)
+
+# subset data to generate pairwise correlations
+cols_to_select =  c("duet_scaled"
+                    
+                    , "log10_or_mychisq"
+                    , "neglog_pval_fisher"
+                    
+                    , "log10_or_kin"
+                    , "neglog_pwald_kin"
+                    
+                    , "af"
+                    
+                    , "duet_outcome"
+                    , drug)
+
+corr_data_ps = df_ps[, cols_to_select]
+
+dim(corr_data_ps)
+
+#p_italic = substitute(paste("-Log(", italic('P'), ")"));p_italic 
+#p_adjusted_italic = substitute(paste("-Log(", italic('P adjusted'), ")"));p_adjusted_italic
+
+# assign nice colnames (for display)
+my_corr_colnames = c("DUET"
+                     
+                     , "Log(OR)"
+                     , "-Log(P)"
+                     
+                     , "Log(OR adjusted)"
+                     , "-Log(P wald)"
+                     
+                     , "AF"
+                     
+                     , "duet_outcome"
+                     , drug)
+
+length(my_corr_colnames)
+
+colnames(corr_data_ps)
+colnames(corr_data_ps) <- my_corr_colnames
+colnames(corr_data_ps)
+
+#-----------------
+# generate corr PS plot
+#-----------------
+start = 1
+end = which(colnames(corr_data_ps) == drug); end # should be the last column
+offset = 1
+
+my_corr_ps = corr_data_ps[start:(end-offset)]
+head(my_corr_ps)
+
+#my_cols = c("#f8766d", "#00bfc4")
+# deep blue :#007d85
+# deep red: #ae301e
+
+cat("Corr plot PS:", plot_corr_ps)
+svg(plot_corr_ps, width = 15, height = 15)
+
+OutPlot1 = pairs.panels(my_corr_ps[1:(length(my_corr_ps)-1)]
+             , method = "spearman" # correlation method
+             , hist.col = "grey" ##00AFBB
+             , density = TRUE  # show density plots
+             , ellipses = F # show correlation ellipses
+             , stars = T
+             , rug = F
+             , breaks = "Sturges"
+             , show.points = T
+             , bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr_ps$duet_outcome))]
+             , pch = 21
+             , jitter = T
+             #, alpha = .05
+             #, points(pch = 19, col = c("#f8766d", "#00bfc4"))
+             , cex = 3
+             , cex.axis = 2.5
+             , cex.labels = 2.1
+             , cex.cor = 1
+             , smooth = F
+)
+
+print(OutPlot1)
+dev.off()
+
+#===========================
+# Data for Correlation plots: LIG
+#===========================
+table(df_lig$ligand_outcome)
+
+df_lig$log10_or_mychisq = log10(df_lig$or_mychisq)
+df_lig$neglog_pval_fisher = -log10(df_lig$pval_fisher)
+
+
+df_lig$log10_or_kin = log10(df_lig$or_kin)
+df_lig$neglog_pwald_kin = -log10(df_lig$pwald_kin)
+
+
+# subset data to generate pairwise correlations
+cols_to_select =  c("affinity_scaled"
+                    
+                    , "log10_or_mychisq"
+                    , "neglog_pval_fisher"
+                    
+                    , "log10_or_kin"
+                    , "neglog_pwald_kin"
+                    
+                    , "af"
+                    
+                    , "ligand_outcome"
+                    , drug)
+
+corr_data_lig = df_lig[, cols_to_select]
+
+
+dim(corr_data_lig)
+
+# assign nice colnames (for display)
+my_corr_colnames = c("Ligand Affinity"
+                     
+                     , "Log(OR)"
+                     , "-Log(P)"
+                     
+                     , "Log(OR adjusted)"
+                     , "-Log(P wald)"
+                     
+                     , "AF"
+                     
+                     , "ligand_outcome"
+                     , drug)
+
+length(my_corr_colnames)
+
+colnames(corr_data_lig)
+colnames(corr_data_lig) <- my_corr_colnames
+colnames(corr_data_lig)
+
+#-----------------
+# generate corr LIG plot
+#-----------------
+
+start = 1
+end = which(colnames(corr_data_lig) == drug); end # should be the last column
+offset = 1
+
+my_corr_lig = corr_data_lig[start:(end-offset)]
+head(my_corr_lig)
+
+cat("Corr LIG plot:", plot_corr_lig)
+svg(plot_corr_lig, width = 15, height = 15)
+
+OutPlot2  = pairs.panels(my_corr_lig[1:(length(my_corr_lig)-1)]
+                  , method = "spearman" # correlation method
+                  , hist.col = "grey" ##00AFBB
+                  , density = TRUE  # show density plots
+                  , ellipses = F # show correlation ellipses
+                  , stars = T
+                  , rug = F
+                  , breaks = "Sturges"
+                  , show.points = T
+                  , bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr_lig$ligand_outcome))]
+                  , pch = 21
+                  , jitter = T
+                  #, alpha = .05
+                  #, points(pch = 19, col = c("#f8766d", "#00bfc4"))
+                  , cex = 3
+                  , cex.axis = 2.5
+                  , cex.labels = 2.1
+                  , cex.cor = 1
+                  , smooth = F
+)
+
+print(OutPlot2)
+dev.off()
+#######################################################
--- a/scripts/plotting/dirs.R
+++ b/scripts/plotting/dirs.R
@ -0,0 +1,56 @@
+#!/usr/bin/env Rscript             
+#########################################################
+# TASK: formatting data that will be used for various plots
+
+# useful links
+#https://stackoverflow.com/questions/38851592/r-append-column-in-a-dataframe-with-frequency-count-based-on-two-columns
+#########################################################
+# working dir and loading libraries
+getwd()
+setwd("~/git/LSHTM_analysis/scripts/plotting")
+getwd()
+
+#source("Header_TT.R")
+library(ggplot2)
+library(data.table)
+library(dplyr)
+
+require("getopt", quietly = TRUE) #cmd parse arguments
+#========================================================
+# command line args
+#spec = matrix(c(
+#  "drug"   , "d", 1, "character",
+#  "gene"   , "g", 1, "character"
+#), byrow = TRUE, ncol = 4)
+
+#opt = getopt(spec)
+
+#drug = opt$druggene = opt$gene
+
+#if(is.null(drug)|is.null(gene)) {
+#  stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)")
+#}
+#========================================================
+# FIXME: change to cmd
+#%% variable assignment: input and output paths & filenames
+drug = "pyrazinamide"
+gene = "pncA"
+gene_match = paste0(gene,"_p.")
+cat(gene_match)
+
+#=============
+# directories and variables
+#=============
+datadir = paste0("~/git/Data")
+indir = paste0(datadir, "/", drug, "/input")
+outdir = paste0("~/git/Data", "/", drug, "/output")
+plotdir = paste0("~/git/Data", "/", drug, "/output/plots")
+
+dr_muts_col = paste0('dr_mutations_', drug)
+other_muts_col  = paste0('other_mutations_', drug)
+resistance_col = "drtype"
+
+#%%===============================================================
+
+
+
--- a/scripts/plotting/lineage_basic_barplot.R
+++ b/scripts/plotting/lineage_basic_barplot.R
@ -0,0 +1,214 @@
+#!/usr/bin/env Rscript       
+getwd()
+setwd("~/git/LSHTM_analysis/scripts/plotting/")
+getwd()
+#########################################################
+# TASK: Basic lineage barplot showing numbers
+
+# Output: Basic barplot with lineage samples and mut count
+
+##########################################################
+# 				Installing and loading required packages 			 
+##########################################################
+source("Header_TT.R")
+require(data.table)
+source("combining_dfs_plotting.R")
+# should return the following dfs, directories and variables
+
+# PS combined: 
+# 1) merged_df2
+# 2) merged_df2_comp
+# 3) merged_df3
+# 4) merged_df3_comp
+
+# LIG combined: 
+# 5) merged_df2_lig
+# 6) merged_df2_comp_lig
+# 7) merged_df3_lig
+# 8) merged_df3_comp_lig
+
+# 9) my_df_u
+# 10) my_df_u_lig
+
+cat("Directories imported:"
+    , "\n===================="
+    , "\ndatadir:", datadir
+    , "\nindir:", indir
+    , "\noutdir:", outdir
+    , "\nplotdir:", plotdir)
+
+cat("Variables imported:"
+    , "\n====================="
+    , "\ndrug:", drug
+    , "\ngene:", gene
+    , "\ngene_match:", gene_match
+    , "\nAngstrom symbol:", angstroms_symbol
+    , "\nNo. of duplicated muts:", dup_muts_nu
+    , "\nNA count for ORs:", na_count
+    , "\nNA count in df2:", na_count_df2
+    , "\nNA count in df3:", na_count_df3
+    , "\ndr_muts_col:", dr_muts_col
+    , "\nother_muts_col:", other_muts_col
+    , "\ndrtype_col:", resistance_col)
+
+
+#===========
+# input
+#===========
+# output of combining_dfs_plotting.R
+
+#=======
+# output
+#=======
+# plot 1
+basic_bp_lineage = "basic_lineage_barplot.svg"
+plot_basic_bp_lineage  =  paste0(plotdir,"/", basic_bp_lineage)
+
+#=======================================================================
+#================
+# Data for plots:
+# you need merged_df2, comprehensive one
+# since this has one-many relationship
+# i.e the same SNP can belong to multiple lineages
+#================
+# REASSIGNMENT as necessary
+my_df  = merged_df2
+
+# clear excess variable
+rm(merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+# Ensure correct data type in columns to plot: need to be factor
+is.factor(my_df$lineage)
+my_df$lineage = as.factor(my_df$lineage)
+is.factor(my_df$lineage)
+
+#==========================
+# Plot: Lineage barplot
+# x = lineage y = No. of samples
+# col = Lineage
+# fill = lineage
+#============================
+table(my_df$lineage)
+as.data.frame(table(my_df$lineage))
+
+#=============
+# Data for plots
+#=============
+# REASSIGNMENT
+df <- my_df
+
+rm(my_df)
+
+# get freq count of positions so you can subset freq<1
+#setDT(df)[, lineage_count := .N, by = .(lineage)]
+
+#******************
+# generate plot: barplot of mutation by lineage
+#******************
+sel_lineages = c("lineage1"
+                 , "lineage2"
+                 , "lineage3"
+                 , "lineage4"
+                 #, "lineage5"
+                 #, "lineage6"
+                 #, "lineage7"
+                 )
+
+df_lin = subset(df, subset = lineage %in% sel_lineages)
+
+# Create df with lineage inform & no. of unique mutations
+# per lineage and total samples within lineage
+# this is essentially barplot with two y axis
+
+bar = bar = as.data.frame(sel_lineages) #4, 1
+total_snps_u = NULL
+total_samples = NULL
+
+for (i in sel_lineages){
+  #print(i)
+  curr_total = length(unique(df$id)[df$lineage==i])
+  total_samples = c(total_samples, curr_total)
+  print(total_samples)
+  
+  foo = df[df$lineage==i,]
+  print(paste0(i, "======="))
+  print(length(unique(foo$mutationinformation)))
+  curr_count = length(unique(foo$mutationinformation))
+
+  total_snps_u = c(total_snps_u, curr_count)
+}
+
+print(total_snps_u)
+bar$num_snps_u = total_snps_u
+bar$total_samples = total_samples
+bar
+
+#*****************
+# generate plot: lineage barplot with two y-axis
+#https://stackoverflow.com/questions/13035295/overlay-bar-graphs-in-ggplot2
+#*****************
+
+y1 = bar$num_snps_u
+y2 = bar$total_samples
+x = sel_lineages
+
+to_plot = data.frame(x = x
+                      , y1 = y1
+                      , y2 = y2)
+to_plot
+
+# FIXME later: will be depricated!
+melted = melt(to_plot, id = "x")
+melted
+
+
+svg(plot_basic_bp_lineage)
+
+my_ats = 20 # axis text size
+my_als = 22 # axis label size
+
+g = ggplot(melted, aes(x = x
+                 , y = value
+                 , fill = variable))
+
+printFile = g + geom_bar(stat = "identity"
+                         , position = position_stack(reverse = TRUE)
+                         , alpha=.75
+                         , colour='grey75') + 
+  theme(axis.text.x = element_text(size = my_ats)
+        , axis.text.y = element_text(size = my_ats
+                                     #, angle = 30
+                                     , hjust = 1
+                                     , vjust = 0)
+        , axis.title.x = element_text(size = my_als
+                                      , colour = 'black')
+        , axis.title.y = element_text(size = my_als
+                                      , colour = 'black')
+        , legend.position = "top"
+        , legend.text = element_text(size = my_als)) + 
+          #geom_text() +
+          geom_label(aes(label = value)
+                     , size = 5
+                     , hjust = 0.5
+                     , vjust = 0.5
+                     , colour = 'black'
+                     , show.legend = FALSE
+                     #, check_overlap = TRUE
+                     , position = position_stack(reverse = T)) + 
+          labs(title = ''
+               , x = ''
+               , y = "Number"
+               , fill = 'Variable'
+               , colour = 'black') + 
+          scale_fill_manual(values = c('grey50', 'gray75')
+                            , name=''
+                            , labels=c('Mutations', 'Total Samples')) +
+          scale_x_discrete(breaks = c('lineage1', 'lineage2', 'lineage3', 'lineage4')
+                           , labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4'))
+
+print(printFile)
+dev.off()
--- a/scripts/plotting/lineage_dist_combined_PS.R
+++ b/scripts/plotting/lineage_dist_combined_PS.R
@ -0,0 +1,301 @@
+#!/usr/bin/env Rscript  
+#########################################################
+# TASK: Lineage dist plots: ggridges
+
+# Output: 2 SVGs for PS stability
+
+# 1) all muts
+# 2) dr_muts
+
+##########################################################
+# 				Installing and loading required packages 			 
+##########################################################
+getwd()
+setwd("~/git/LSHTM_analysis/scripts/plotting/")
+getwd()
+
+source("Header_TT.R")
+library(ggridges)
+source("combining_dfs_plotting.R")
+# PS combined: 
+# 1) merged_df2
+# 2) merged_df2_comp
+# 3) merged_df3
+# 4) merged_df3_comp
+
+# LIG combined: 
+# 5) merged_df2_lig
+# 6) merged_df2_comp_lig
+# 7) merged_df3_lig
+# 8) merged_df3_comp_lig
+
+# 9) my_df_u
+# 10) my_df_u_lig
+
+cat("Directories imported:"
+    , "\n===================="
+    , "\ndatadir:", datadir
+    , "\nindir:", indir
+    , "\noutdir:", outdir
+    , "\nplotdir:", plotdir)
+
+cat("Variables imported:"
+    , "\n====================="
+    , "\ndrug:", drug
+    , "\ngene:", gene
+    , "\ngene_match:", gene_match
+    , "\nAngstrom symbol:", angstroms_symbol
+    , "\nNo. of duplicated muts:", dup_muts_nu
+    , "\nNA count for ORs:", na_count
+    , "\nNA count in df2:", na_count_df2
+    , "\nNA count in df3:", na_count_df3
+    , "\ndr_muts_col:", dr_muts_col
+    , "\nother_muts_col:", other_muts_col
+    , "\ndrtype_col:", resistance_col)
+
+#=======
+# output
+#=======
+lineage_dist_combined = "lineage_dist_combined_PS.svg"
+plot_lineage_dist_combined  =  paste0(plotdir,"/", lineage_dist_combined)
+#========================================================================
+
+###########################
+# Data for plots
+# you need merged_df2 or merged_df2_comp
+# since this is one-many relationship 
+# i.e the same SNP can belong to multiple lineages
+# using the _comp dataset means
+# we lose some muts and at this level, we should use
+# as much info as available, hence use df with NA
+###########################
+# REASSIGNMENT
+my_df  = merged_df2
+
+# delete variables not required
+rm(my_df_u, merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+# Ensure correct data type in columns to plot: need to be factor
+is.factor(my_df$lineage)
+my_df$lineage = as.factor(my_df$lineage)
+is.factor(my_df$lineage)
+
+table(my_df$mutation_info)
+
+# subset df with dr muts only
+my_df_dr = subset(my_df, mutation_info == "dr_mutations_pyrazinamide") 
+table(my_df_dr$mutation_info)
+
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#==========================
+# Plot 1: ALL Muts
+# x = mcsm_values, y = dist
+# fill = stability
+#============================
+
+my_plot_name = 'lineage_dist_PS.svg'
+
+plot_lineage_duet  =  paste0(plotdir,"/", my_plot_name)
+
+#===================
+# Data for plots
+#===================
+table(my_df$lineage); str(my_df$lineage)
+
+# subset only lineages1-4
+sel_lineages = c("lineage1"
+                 , "lineage2"
+                 , "lineage3"
+                 , "lineage4"
+                 #, "lineage5"
+                 #, "lineage6"
+                 #, "lineage7"
+                 )
+
+# uncomment as necessary
+df_lin = subset(my_df, subset = lineage %in% sel_lineages )
+table(df_lin$lineage)
+
+# refactor
+df_lin$lineage = factor(df_lin$lineage)
+
+sum(table(df_lin$lineage)) #{RESULT: Total number of samples for lineage}
+
+table(df_lin$lineage)#{RESULT: No of samples within lineage}
+
+length(unique(df_lin$mutationinformation))#{Result: No. of unique mutations the 4 lineages contribute to}
+
+length(df_lin$mutationinformation)
+
+u2 = unique(my_df$mutationinformation)
+u = unique(df_lin$mutationinformation)
+check = u2[!u2%in%u]; print(check) #{Muts not present within selected lineages}
+
+#%%%%%%%%%%%%%%%%%%%%%%%%%
+# REASSIGNMENT
+df <- df_lin
+#%%%%%%%%%%%%%%%%%%%%%%%%%
+
+rm(df_lin)
+
+#******************
+# generate distribution plot of lineages
+#******************
+# 2 : ggridges (good!)
+my_ats = 15 # axis text size
+my_als = 20 # axis label size
+
+my_labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4'
+              #, 'Lineage 5', 'Lineage 6', 'Lineage 7'
+              )
+names(my_labels) = c('lineage1', 'lineage2', 'lineage3', 'lineage4'
+                    # , 'lineage5', 'lineage6', 'lineage7'
+                     )
+# check plot name
+plot_lineage_duet
+
+# output svg
+#svg(plot_lineage_duet)
+p1 = ggplot(df, aes(x = duet_scaled
+                            , y = duet_outcome))+
+  
+  #printFile=geom_density_ridges_gradient(
+  geom_density_ridges_gradient(aes(fill = ..x..)
+                                , scale = 3
+                                , size = 0.3 ) +
+  facet_wrap( ~lineage
+              , scales = "free"
+             #, switch = 'x'
+              , labeller = labeller(lineage = my_labels) ) +
+  coord_cartesian( xlim = c(-1, 1)) +
+  scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4")
+                        , name = "DUET" ) + 
+  theme(axis.text.x = element_text(size = my_ats
+                                     , angle = 90
+                                     , hjust = 1
+                                     , vjust = 0.4)
+
+         , axis.text.y = element_blank()
+         , axis.title.x = element_blank()
+         , axis.title.y = element_blank()
+         , axis.ticks.y = element_blank()
+         , plot.title = element_blank()
+         , strip.text = element_text(size = my_als)
+         , legend.text = element_text(size = my_als-5)
+         , legend.title = element_text(size = my_als)
+) 
+
+print(p1)
+#dev.off()
+
+#######################################################################
+# lineage distribution plot for dr_muts
+#######################################################################
+
+#==========================
+# Plot 2: dr muts ONLY
+# x = mcsm_values, y = dist
+# fill = stability
+#============================
+
+my_plot_name_dr = 'lineage_dist_dr_muts_PS.svg'
+
+plot_lineage_dr_duet  =  paste0(plotdir,"/", my_plot_name_dr)
+
+#===================
+# Data for plots
+#===================
+table(my_df_dr$lineage); str(my_df_dr$lineage)
+
+# uncomment as necessary
+df_lin_dr = subset(my_df_dr, subset = lineage %in% sel_lineages)
+table(df_lin_dr$lineage)
+
+# refactor
+df_lin_dr$lineage = factor(df_lin_dr$lineage)
+
+sum(table(df_lin_dr$lineage)) #{RESULT: Total number of samples for lineage}
+
+table(df_lin_dr$lineage)#{RESULT: No of samples within lineage}
+
+length(unique(df_lin_dr$mutationinformation))#{Result: No. of unique mutations the 4 lineages contribute to}
+
+length(df_lin_dr$mutationinformation)
+
+u2 = unique(my_df_dr$mutationinformation)
+u = unique(df_lin_dr$mutationinformation)
+check = u2[!u2%in%u]; print(check) #{Muts not present within selected lineages}
+
+#%%%%%%%%%%%%%%%%%%%%%%%%%
+# REASSIGNMENT
+df_dr <- df_lin_dr
+#%%%%%%%%%%%%%%%%%%%%%%%%%
+
+rm(df_lin_dr)
+
+#******************
+# generate distribution plot of lineages
+#******************
+# 2 : ggridges (good!)
+my_ats = 15 # axis text size
+my_als = 20 # axis label size
+
+
+# check plot name
+plot_lineage_dr_duet
+
+# output svg
+#svg(plot_lineage_dr_duet)
+p2 = ggplot(df_dr, aes(x = duet_scaled
+                           , y = duet_outcome))+
+  
+  #printFile=geom_density_ridges_gradient(
+  geom_density_ridges_gradient(aes(fill = ..x..)
+                               , scale = 3
+                               , size = 0.3 ) +
+  facet_wrap( ~lineage
+              , scales = "free"
+              #, switch = 'x'
+              , labeller = labeller(lineage = my_labels) ) +
+  coord_cartesian( xlim = c(-1, 1)
+                   #, ylim = c(0, 6)
+                   #, clip = "off" 
+  ) +
+  scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4")
+                       , name = "DUET" ) + 
+  theme(axis.text.x = element_text(size = my_ats
+                                   , angle = 90
+                                   , hjust = 1
+                                   , vjust = 0.4)
+        , axis.text.y = element_blank()
+        , axis.title.x = element_blank()
+        , axis.title.y = element_blank()
+        , axis.ticks.y = element_blank()
+        , plot.title = element_blank()
+        , strip.text = element_text(size = my_als)
+        , legend.text = element_text(size = 10)
+        , legend.title = element_text(size = my_als)
+        #, legend.position = "none"
+  ) 
+
+print(p2)
+#dev.off()
+########################################################################
+#==============
+# combine plot
+#===============
+
+svg(plot_lineage_dist_combined, width = 12, height = 6)
+
+printFile = cowplot::plot_grid(p1, p2
+                               , label_size = my_als+10)
+
+print(printFile)
+dev.off()
--- a/scripts/plotting/opp_mcsm_muts.R
+++ b/scripts/plotting/opp_mcsm_muts.R
@ -0,0 +1,96 @@
+#!/usr/bin/env Rscript      
+#########################################################
+# TASK: To write muts with opposite effects on
+# protomer and ligand stability
+#########################################################
+# working dir and loading libraries
+
+getwd()
+setwd("~/git/LSHTM_analysis/scripts/plotting/")
+getwd()
+
+source("plotting_data.R")
+
+# should return the following dfs, directories and variables
+# my_df
+# my_df_u
+# my_df_u_lig
+# dup_muts
+
+cat(paste0("Directories imported:"
+           , "\ndatadir:", datadir
+           , "\nindir:", indir
+           , "\noutdir:", outdir
+           , "\nplotdir:", plotdir))
+
+cat(paste0("Variables imported:"
+           , "\ndrug:", drug
+           , "\ngene:", gene
+           , "\ngene_match:", gene_match
+           , "\nLength of upos:", length(upos)
+           , "\nAngstrom symbol:", angstroms_symbol))       
+
+# clear excess variable
+rm(my_df, upos, dup_muts)
+#========================================================
+#===========
+# input
+#===========
+#in_file1: output of plotting_data.R
+# my_df_u
+
+#===========
+# output
+#===========
+# mutations with opposite effects
+out_filename_opp_muts = paste0(tolower(gene), "_muts_opp_effects.csv")
+outfile_opp_muts = paste0(outdir, "/", out_filename_opp_muts)
+
+#%%===============================================================
+
+# spelling Correction 1: DUET incase American spelling needed!
+table(my_df_u$duet_outcome); sum(table(my_df_u$duet_outcome) )
+#my_df_u$duet_outcome[my_df_u$duet_outcome=="Stabilising"] <- "Stabilizing"
+#my_df_u$duet_outcome[my_df_u$duet_outcome=="Destabilising"] <- "Destabilizing"
+
+
+# spelling Correction 2: Ligand incase American spelling needed!
+table(my_df_u$ligand_outcome); sum(table(my_df_u$ligand_outcome) )
+#my_df_u$ligand_outcome[my_df_u$ligand_outcome=="Stabilising"] <- "Stabilizing"
+#my_df_u$ligand_outcome[my_df_u$ligand_outcome=="Destabilising"] <- "Destabilizing"
+
+
+# muts with opposing effects on protomer and ligand stability
+table(my_df_u$duet_outcome != my_df_u$ligand_outcome)
+changes = my_df_u[which(my_df_u$duet_outcome != my_df_u$ligand_outcome),]
+
+# sanity check: redundant, but uber cautious!
+dl_i = which(my_df_u$duet_outcome != my_df_u$ligand_outcome)
+ld_i = which(my_df_u$ligand_outcome != my_df_u$duet_outcome)
+
+cat("Identifying muts with opposite stability effects")
+if(nrow(changes) == (table(my_df_u$duet_outcome != my_df_u$ligand_outcome)[[2]]) & identical(dl_i,ld_i)) {
+  cat("PASS: muts with opposite effects on stability and affinity correctly identified"
+        , "\nNo. of such muts: ", nrow(changes))
+}else {
+  cat("FAIL: unsuccessful in extracting muts with changed stability effects")
+}
+
+#==========================
+# write file: changed muts
+#==========================
+write.csv(changes, outfile_opp_muts)
+
+cat("Finished writing file for muts with opp effects:"
+    , "\nFilename: ", outfile_opp_muts
+    , "\nDim:", dim(changes))
+
+# clear variables
+rm(out_filename_opp_muts, outfile_opp_muts)
+rm(changes, dl_i, ld_i)
+
+# count na in each column
+na_count = sapply(my_df_u, function(y) sum(length(which(is.na(y))))); na_count
+df_ncols = ncol(my_df_u)
+
+#===================================== end of script
--- a/scripts/plotting/or_plots_combined.R
+++ b/scripts/plotting/or_plots_combined.R
@ -0,0 +1,192 @@
+#!/usr/bin/env Rscript
+#########################################################
+# TASK: Bubble plot of OR for PS and Lig 
+
+# Output: 1 svg
+
+#=======================================================================
+# working dir and loading libraries
+getwd()
+setwd("~/git/LSHTM_analysis/scripts/plotting/")
+getwd()
+
+
+source("Header_TT.R")
+require(cowplot)
+source("combining_dfs_plotting.R")
+# should return the following dfs, directories and variables
+
+# PS combined: 
+# 1) merged_df2
+# 2) merged_df2_comp
+# 3) merged_df3
+# 4) merged_df3_comp
+
+# LIG combined: 
+# 5) merged_df2_lig
+# 6) merged_df2_comp_lig
+# 7) merged_df3_lig
+# 8) merged_df3_comp_lig
+
+# 9) my_df_u
+# 10) my_df_u_lig
+
+cat(paste0("Directories imported:"
+           , "\ndatadir:", datadir
+           , "\nindir:", indir
+           , "\noutdir:", outdir
+           , "\nplotdir:", plotdir))
+
+cat(paste0("Variables imported:"
+           , "\ndrug:", drug
+           , "\ngene:", gene
+           , "\ngene_match:", gene_match
+           , "\nAngstrom symbol:", angstroms_symbol
+           , "\nNo. of duplicated muts:", dup_muts_nu
+           , "\nNA count for ORs:", na_count
+           , "\nNA count in df2:", na_count_df2
+           , "\nNA count in df3:", na_count_df3))       
+
+#=========================
+#=======
+# output
+#=======
+or_combined = "or_combined_PS_LIG.svg"
+plot_or_combined  =  paste0(plotdir,"/", or_combined)
+
+#or_kin_combined = "or_kin_combined_PS_LIG.svg"
+#plot_or_kin_combined  =  paste0(plotdir,"/", or_kin_combined)
+#=======================================================================
+
+###########################
+# Data for OR and stability plots
+# you need merged_df3_comp
+# since these are matched 
+# to allow pairwise corr
+###########################
+
+ps_df  = merged_df3_comp
+lig_df = merged_df3_comp_lig
+
+# Ensure correct data type in columns to plot: should be TRUE
+is.numeric(ps_df$or_mychisq)
+is.numeric(lig_df$or_mychisq)
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df2_lig, merged_df2_comp_lig, my_df_u, my_df_u_lig)
+#%% end of section 1
+
+# sanity check: should be <10
+if (max(lig_df$ligand_distance) < 10){
+  print ("Sanity check passed: lig data is <10Ang")
+}else{
+  print ("Error: data should be filtered to be within 10Ang")
+}
+
+#############
+# Plots: Bubble plot
+# x = Position, Y = stability
+# size of dots = OR
+# col: stability
+#############
+
+#-----------------
+# Plot 1: DUET vs OR by position as geom_points
+#-------------------
+
+my_ats = 20 # axis text size
+my_als = 22 # axis label size
+
+# Spelling Correction: made redundant as already corrected at the source
+#ps_df$duet_outcome[ps_df$duet_outcome=='Stabilizing'] <- 'Stabilising'
+#ps_df$duet_outcome[ps_df$duet_outcome=='Destabilizing'] <- 'Destabilising'
+
+table(ps_df$duet_outcome) ; sum(table(ps_df$duet_outcome))
+
+g1 = ggplot(ps_df, aes(x = factor(position)
+                   , y = duet_scaled))
+
+p1 = g1 + 
+  geom_point(aes(col = duet_outcome
+                 , size = or_mychisq))+
+                 #, size = or_kin)) + # not good, almost like log(or)
+  theme(axis.text.x = element_text(size = my_ats
+                                   , angle = 90
+                                   , hjust = 1
+                                   , vjust = 0.4)
+        , axis.text.y = element_text(size = my_ats
+                                     , angle = 0
+                                     , hjust = 1
+                                     , vjust = 0)
+        , axis.title.x = element_text(size = my_als)
+        , axis.title.y = element_text(size = my_als) 
+        , legend.text = element_text(size = my_als)
+        , legend.title = element_text(size = my_als) ) +
+  #, legend.key.size = unit(1, "cm")) +
+  labs(title = ""
+       , x = "Position"
+       , y = "DUET(PS)"
+       , size = "Odds Ratio"
+       , colour = "DUET Outcome") +
+  guides(colour = guide_legend(override.aes = list(size=4))) 
+
+p1 
+
+#-------------------
+# generate plot 2: Lig vs OR by position as geom_points
+#-------------------
+
+# Spelling Correction: made redundant as already corrected at the source
+
+#lig_df$ligand_outcome[lig_df$ligand_outcome=='Stabilizing'] <- 'Stabilising'
+#lig_df$ligand_outcome[lig_df$ligand_outcome=='Destabilizing'] <- 'Destabilising'
+
+table(lig_df$ligand_outcome)
+
+g2 = ggplot(lig_df, aes(x = factor(position)
+                   , y = affinity_scaled))
+
+p2 = g2 + 
+  geom_point(aes(col = ligand_outcome
+                 , size = or_mychisq))+
+                 #, size = or_kin)) + # not good! almost like log(or)
+  theme(axis.text.x = element_text(size = my_ats
+                                   , angle = 90
+                                   , hjust = 1
+                                   , vjust = 0.4)
+        , axis.text.y = element_text(size = my_ats
+                                     , angle = 0
+                                     , hjust = 1
+                                     , vjust = 0)
+        , axis.title.x = element_text(size = my_als)
+        , axis.title.y = element_text(size = my_als) 
+        , legend.text = element_text(size = my_als)
+        , legend.title = element_text(size = my_als) ) +
+  #, legend.key.size = unit(1, "cm")) +
+  labs(title = ""
+       , x = "Position"
+       , y = "Ligand Affinity"
+       , size = "Odds Ratio"
+       , colour = "Ligand Outcome"
+       ) +
+  guides(colour = guide_legend(override.aes = list(size=4))) 
+
+p2
+
+#======================
+# combine using cowplot
+#======================
+
+svg(plot_or_combined, width = 32, height = 12)
+#svg(plot_or_kin_combined, width = 32, height = 12)
+
+theme_set(theme_gray()) # to preserve default theme
+
+printFile = cowplot::plot_grid(plot_grid(p1, p2
+                             , ncol = 1
+                             , align = 'v'
+                             , labels = c("", "")
+                             , label_size = my_als+5))
+print(printFile)
+dev.off()
+
--- a/scripts/plotting/plotting_data.R
+++ b/scripts/plotting/plotting_data.R
@ -15,7 +15,10 @@ library(ggplot2)
 library(data.table)
 library(dplyr)

+source("dirs.R")
+
 require("getopt", quietly = TRUE) #cmd parse arguments
+
 #========================================================
 # command line args
 #spec = matrix(c(
@ -31,19 +34,6 @@ require("getopt", quietly = TRUE) #cmd parse arguments
 #  stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)")
 #}
 #========================================================
-#%% variable assignment: input and output paths & filenames
-drug = "pyrazinamide"
-gene = "pncA"
-gene_match = paste0(gene,"_p.")
-cat(gene_match)
-
-#=============
-# directories
-#=============
-datadir = paste0("~/git/Data")
-indir = paste0(datadir, "/", drug, "/input")
-outdir = paste0("~/git/Data", "/", drug, "/output")
-plotdir = paste0("~/git/Data", "/", drug, "/output/plots")
 #======
 # input
 #======
@ -52,6 +42,17 @@ in_filename_params = paste0(tolower(gene), "_all_params.csv")
 infile_params = paste0(outdir, "/", in_filename_params)
 cat(paste0("Input file 1:", infile_params) )

+
+cat('columns based on variables:\n'
+      , drug
+      , '\n'
+      , dr_muts_col
+      , '\n'
+      , other_muts_col
+      , "\n"
+      , resistance_col
+      , '\n===============================================================')
+
 #%%===============================================================
 ###########################
 # Read file: struct params
@ -62,9 +63,6 @@ my_df = read.csv(infile_params, header = T)

 cat("\nInput dimensions:", dim(my_df)) 

-# quick checks
-#colnames(my_df)
-#str(my_df)

 ###########################
 # extract unique mutation entries
--- a/scripts/plotting/ps_plots_combined.R
+++ b/scripts/plotting/ps_plots_combined.R
@ -0,0 +1,251 @@
+#!/usr/bin/env Rscript  
+#########################################################
+# TASK: AF, OR and stability plots: PS
+# Output: 1 SVG
+
+#########################################################
+# 				Installing and loading required packages 			 
+##########################################################
+getwd()
+setwd("~/git/LSHTM_analysis/scripts/plotting/")
+getwd()
+
+source("Header_TT.R")
+library(ggplot2)
+library(data.table)
+#source("combining_dfs_plotting.R")
+source("barplot_colour_function.R")
+source("subcols_axis.R")
+
+###########################
+# Data for PS plots
+# you need merged_df3_comp/merged_df_comp
+# since these have unique SNPs
+###########################
+
+no_or_index = which(is.na(my_df_u_cols$or_mychisq))
+
+my_df_u_cols_clean = my_df_u_cols[-no_or_index,]
+
+#%%%%%%%%%%%%%%%%%%%%%%%%
+# REASSIGNMENT
+df = my_df_u_cols_clean
+#%%%%%%%%%%%%%%%%%%%%%%%%%
+cols_to_select = colnames(mut_pos_cols)
+
+mut_pos_cols_clean = df[colnames(df)%in%cols_to_select]
+mut_pos_cols_clean = unique(mut_pos_cols_clean[, 1:length(mut_pos_cols_clean)])
+  
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#===========================
+# Barplot with axis colours:
+#===========================
+#================
+# Inspecting mut_pos_cols
+# position numbers and colours and assigning axis colours based on lab_fg
+# of the correct df
+# open file from desktop ("sample_axis_cols") for cross checking
+#================
+# very important!
+#my_axis_colours = mut_pos_cols$lab_fg
+
+if (nrow(mut_pos_cols_clean) == length(unique(df$position)) ){
+  print("PASS: lengths checked, assigning axis colours")
+  my_axis_colours = mut_pos_cols_clean$lab_fg
+  cat("length of axis colours:", length(my_axis_colours)
+      , "\nwhich corresponds to the number of positions on the x-axis of the plot")
+}else{
+  print("FAIL:lengths mismatch, could not assign axis colours")
+  quit()
+}
+
+# unique positions
+upos = unique(df$position); length(upos)
+
+table(df$duet_outcome)
+
+# add frequency of positions (from lib data.table)
+setDT(df)[, pos_count := .N, by = .(position)]
+
+# this is cummulative
+table(df$pos_count)
+
+# use group by on this
+library(dplyr)
+snpsBYpos_df <- df %>%
+  group_by(position) %>%
+  summarize(snpsBYpos = mean(pos_count))
+
+table(snpsBYpos_df$snpsBYpos)
+
+snp_count = sort(unique(snpsBYpos_df$snpsBYpos))
+
+# sanity checks
+# should be a factor
+if (is.factor(df$duet_outcome)){
+  print("duet_outcome is factor")
+}else{
+  print("converting duet_outcome to factor")
+  df$duet_outcome = as.factor(df$duet_outcome)
+}
+
+is.factor(df$duet_outcome)
+
+# should be -1 and 1
+min(df$duet_scaled)
+max(df$duet_scaled)
+
+# sanity checks
+tapply(df$duet_scaled, df$duet_outcome, min)
+tapply(df$duet_scaled, df$duet_outcome, max)
+
+# My colour FUNCTION: based on group and subgroup
+# in my case;
+# df = df
+# group = duet_outcome
+# subgroup = normalised score i.e duet_scaled
+
+# check unique values in normalised data
+u = unique(df$duet_scaled) 
+cat("No. of unique values in normalised data:", length(u))
+
+# Define group
+# Create an extra column called group which contains the "gp name and score" 
+# so colours can be generated for each unique values in this column
+my_grp = df$duet_scaled # no rounding
+df$group <- paste0(df$duet_outcome, "_", my_grp, sep = "") 
+
+# Call the function to create the palette based on the group defined above
+colours <- ColourPalleteMulti(df, "duet_outcome", "my_grp")
+print(paste0("Colour palette generated for: ", length(colours), " colours"))
+my_title = "Protein stability (DUET)"
+cat("No. of axis colours: ", length(my_axis_colours))
+
+#******************
+# generate plot: barplot unordered by frequency with axis colours
+#******************
+class(df$lab_bg)
+
+# define cartesian coord
+my_xlim = length(unique(df$position)); my_xlim
+
+# axis label size
+my_xals = 18
+my_yals = 18
+
+# axes text size
+my_xats = 14
+my_yats = 18
+
+g3 = ggplot(df, aes(factor(position, ordered = T)))
+
+p3 = g3 + 
+  coord_cartesian(xlim = c(1, my_xlim)
+                  #, ylim = c(0, 6)
+                  , ylim = c(0, max(snp_count))
+                  , clip = "off") +
+  geom_bar(aes(fill = group), colour = "grey") +
+  scale_fill_manual(values = colours
+                    , guide = "none") +
+  geom_tile(aes(,-0.8, width = 0.95, height = 0.85)
+            , fill = df$lab_bg) +
+  geom_tile(aes(,-1.2, width = 0.95, height = -0.2)
+            , fill = df$lab_bg2) +
+  
+  # Here it"s important to specify that your axis goes from 1 to max number of levels
+  theme(axis.text.x = element_text(size = my_xats
+                                   , angle = 90
+                                   , hjust = 1
+                                   , vjust = 0.4
+                                   , colour = my_axis_colours)
+        , axis.text.y = element_text(size = my_yats 
+                                     , angle = 0
+                                     , hjust = 1
+                                     , vjust = 0)
+        , axis.title.x = element_text(size = my_xals)
+        , axis.title.y = element_text(size = my_yals+2 ) 
+        , axis.ticks.x = element_blank()) +
+  labs(title = ""
+       #title = my_title
+       , x = "Position"
+       , y = "Frequency")
+
+p3
+#=================
+# generate plot: AF by position unordered, not coloured
+#=================  
+my_ats = 20 # axis text size
+my_als = 22 # axis label size
+
+g1 = ggplot(df)
+p1 = g1 + 
+  geom_bar(aes(x = factor(position, ordered = T)
+               , y = af*100
+               #, fill = duet_outcome
+               )
+           , color = "black"
+           , fill = "lightgrey"
+           , stat = "identity") +
+  theme(axis.text.x = element_blank()
+         , axis.text.y = element_text(size = my_ats
+                                      , angle = 0
+                                      , hjust = 1
+                                      , vjust = 0)
+         , axis.title.x = element_blank()
+         , axis.title.y = element_text(size = my_als) ) +
+  labs(title = ""
+       #, size = 100
+       #, x = "Wild-type position"
+       , y = "AF(%)")
+       
+p1      
+#=================
+# generate plot: OR by position unordered
+#=================  
+my_ats = 20 # axis text size
+my_als = 22 # axis label size
+
+
+g2 = ggplot(df)
+
+p2 = g2 + 
+  geom_bar(aes(x = factor(position, ordered = T)
+               , y = or_mychisq
+               #, fill = duet_outcome
+               ) 
+           , colour = "black"
+           , fill = "lightgray"
+           , stat = "identity") +
+  theme(axis.text.x = element_blank()
+         , axis.text.y = element_text(size = my_ats
+                                      , angle = 0
+                                      , hjust = 1
+                                      , vjust = 0)
+         , axis.title.x = element_blank()
+         , axis.title.y = element_text(size = my_als) ) +
+  labs(#title = "OR by position"
+       #, x = "Wild-type position"
+       y = "OR") 
+
+p2
+
+########################################################################
+#               			end of DUET barplots         			               #
+########################################################################
+#=======
+# output
+#=======
+ps_plots_combined = "af_or_combined_PS.svg"
+plot_ps_plots_combined  =  paste0(plotdir,"/", ps_plots_combined )
+
+svg(plot_ps_plots_combined , width = 26, height = 12)
+
+#theme_set(theme_gray()) # to preserve default theme
+printFile = cowplot::plot_grid(p1, p2, p3
+                            , ncol = 1
+                            , align = 'v')
+printFile
+dev.off()
--- a/scripts/plotting/resolving_ambiguous_muts.R
+++ b/scripts/plotting/resolving_ambiguous_muts.R
@ -0,0 +1,136 @@
+#!/usr/bin/env Rscript
+#########################################################
+# TASK: To resolve ambiguous muts present in <gene>_metadata.csv
+#(which is one of the outputs from python script)
+
+# Input csv file: <gene>_metadata.csv
+
+# Output csv file: <gene>_metadata_clean.csv
+
+#########################################################
+#=======================================================================
+# working dir and loading libraries
+getwd()
+setwd("~/git/LSHTM_analysis/scripts/plotting/")
+getwd()
+
+source("dirs.R")
+
+cat("Directories imported:"
+    , "\n===================="
+    , "\ndatadir:", datadir
+    , "\nindir:", indir
+    , "\noutdir:", outdir
+    , "\nplotdir:", plotdir)
+
+cat("Variables imported:"
+    , "\n====================="
+    , "\ndrug:", drug
+    , "\ngene:", gene
+    , "\ngene_match:", gene_match
+    , "\ndr_muts_col:", dr_muts_col
+    , "\nother_muts_col:", other_muts_col
+    , "\ndrtype_col:", resistance_col)
+
+#========================================================
+#===========
+# input
+#===========
+# infile1: gene associated meta data
+gene_metadata_raw = paste0(tolower(gene),  "_metadata_raw.csv")
+infile_gene_metadata_raw = paste0(outdir, "/", gene_metadata_raw)
+cat("Input infile 1:", infile_gene_metadata_raw)
+
+# infile 2: ambiguous muts
+ambiguous_muts = paste0(tolower(gene),  "_ambiguous_mut_names.csv")
+infile_ambiguous_muts = paste0(outdir, "/", ambiguous_muts)
+cat("Input infile 2:", infile_ambiguous_muts)
+
+#===========
+# output
+#===========
+# clean gene_metadat with ambiguous muts resolved
+gene_metadata_clean = paste0(tolower(gene), "_metadata.csv")
+outfile_gene_metadata_clean  = paste0(outdir, "/", gene_metadata_clean)
+cat("Output file:", outfile_gene_metadata_clean)
+
+#%%===============================================================
+
+###########################
+# Read file: <gene>_meta data raw.csv
+###########################
+cat("Reading meta data file:", infile_gene_metadata_raw)
+
+gene_metadata_raw <- read.csv(infile_gene_metadata_raw
+                      , stringsAsFactors = F
+                      , header = T)
+
+cat("Dim:", dim(gene_metadata_raw))
+
+###########################
+# Read file: ambiguous muts.csv
+##########################
+ambiguous_muts = read.csv(infile_ambiguous_muts)
+
+ambiguous_muts_names = ambiguous_muts$mutation
+
+common_muts_all = gene_metadata_raw[gene_metadata_raw$mutation%in%ambiguous_muts_names,]
+
+#==============
+# resolving ambiguous muts
+#===============
+table(gene_metadata_raw$mutation_info)
+count_check = as.data.frame(cbind(table(gene_metadata_raw$mutationinformation, gene_metadata_raw$mutation_info)))
+count_check$checks = ifelse(count_check[[dr_muts_col]]&count_check[[other_muts_col]]>0
+                            , "ambi", "pass")
+table(count_check$checks)
+
+cat(table(count_check$checks)[["ambi"]], "ambiguous muts detected. Correcting these")
+
+# remove all ambiguous muts from df
+ids = gene_metadata_raw$mutation%in%common_muts_all$mutation; table(ids)
+gene_metadata_raw_unambiguous = gene_metadata_raw[!ids,]
+
+# sanity checks: should be true
+table(gene_metadata_raw_unambiguous$mutation%in%common_muts_all$mutation)[[1]] == nrow(gene_metadata_raw_unambiguous)
+nrow(gene_metadata_raw_unambiguous) + nrow(common_muts_all) == nrow(gene_metadata_raw)
+
+# check before resolving ambiguous muts
+table(common_muts_all$mutation_info)
+common_muts_all$mutation_info = as.factor(common_muts_all$mutation_info)
+
+# resolving ambiguous muts: change other_muts to dr_muts
+common_muts_all$mutation_info[common_muts_all$mutation_info==other_muts_col] <- dr_muts_col
+
+table(common_muts_all$mutation_info)
+common_muts_all$mutation_info = factor(common_muts_all$mutation_info)
+table(common_muts_all$mutation_info)
+common_muts_all$mutation_info =  as.character(common_muts_all$mutation_info)
+
+# combining to a clean df
+gene_metadata = rbind(gene_metadata_raw_unambiguous, common_muts_all)
+all(dim(gene_metadata) == dim(gene_metadata))
+table(gene_metadata$mutation_info)
+
+count_check2 = as.data.frame(cbind(table(gene_metadata$mutationinformation
+                                         , gene_metadata$mutation_info)))
+
+count_check2$checks = ifelse(count_check2[[dr_muts_col]]&count_check2[[other_muts_col]]>0
+                            , "ambi", "pass")
+
+if (table(count_check2$checks) == nrow(count_check2)){
+  cat("PASS: ambiguous muts successfully resolved."
+      , "\nwriting file")
+}else{
+  print("FAIL: ambiguous muts could not be resolved!")
+  quit()
+}
+#============
+# writing file
+#=============
+write.csv(gene_metadata, outfile_gene_metadata_clean
+          , row.names = FALSE)
+
+#=====================================================================
+# End of script
+#======================================================================
--- a/scripts/plotting/running_plot_scripts
+++ b/scripts/plotting/running_plot_scripts
@ -1,12 +1,26 @@
 #========
 # plotting
 #========
+FIXME:
+
+#=======================
+dirs.R
+#=======================
+#dirs.R: imports dir structure
+change this to cmd so you can pass in drug and gene name

 #=======================
 plotting_data.R
 #=======================
 ??? update how to run

+
+#=======================
+combining_dfs_plotting.R
+#=======================
+??? update how to run
+
+
 #=======================
 mcsm_mean_stability.R
 #=======================
@ -58,6 +72,13 @@ barplots_subcolours_PS.R
 # output plots: 1 svg
    1) barplot_coloured_PS.svg

+#=======================
+lineage_dist_PS.R
+# lineage distribution for all muts and dr muts in one figure
+#=======================
+# input: calls the "combining_dfs_plotting.R"
+# output plots: 1 svg
+    1) lineage_dist_combined_PS.svg

    
 ########################################################################
--- a/scripts/plotting/test.R
+++ b/scripts/plotting/test.R
@ -0,0 +1,96 @@
+setwd("/home/tanu/git/LSHTM_analysis/scripts/plotting")
+
+source("combining_dfs_plotting.R")
+
+table(merged_df3$mutation_info)
+
+# assign foldx
+#ddg<0 = "Stabilising" (-ve)
+table(merged_df3$ddg < 0)
+merged_df3$foldx_outcome = ifelse(merged_df3$ddg < 0, "Stabilising", "Destabilising")
+
+#===========
+# PS data
+#===========
+dr_muts = merged_df3[merged_df3$mutation_info == "dr_mutations_pyrazinamide",]
+other_muts = merged_df3[merged_df3$mutation_info == "other_mutations_pyrazinamide",]
+
+par(mfrow = c(1,1))
+par(mfrow = c(2,6))
+
+# mcsm duet
+boxplot(dr_muts$duet_scaled, other_muts$duet_scaled, main = "DUET"
+        #, col = factor(merged_df3$duet_outcome)
+        )
+wilcox.test(dr_muts$duet_scaled, other_muts$duet_scaled, paired = F)
+
+# foldx ddg
+boxplot(dr_muts$ddg, other_muts$ddg, main = "Foldx")
+wilcox.test(dr_muts$ddg, other_muts$ddg, paired = F)
+
+# rd
+boxplot(dr_muts$rd_values, other_muts$rd_values, main = "RD")
+wilcox.test(dr_muts$rd_values, other_muts$rd_values)
+
+# kd
+boxplot(dr_muts$kd_values, other_muts$kd_values, main = "KD")
+wilcox.test(dr_muts$kd_values, other_muts$kd_values)
+
+# asa
+boxplot(dr_muts$asa, other_muts$asa, main = "ASA")
+wilcox.test(dr_muts$asa, other_muts$asa)
+
+# rsa
+boxplot(dr_muts$rsa, other_muts$rsa, main = "RSA")
+wilcox.test(dr_muts$rsa, other_muts$rsa)
+
+#===================================================================
+#==========
+# LIG data
+#==========
+dr_muts_lig = merged_df3_lig[merged_df3_lig$mutation_info == "dr_mutations_pyrazinamide",]
+other_muts_lig = merged_df3_lig[merged_df3_lig$mutation_info == "other_mutations_pyrazinamide",]
+
+# mcsm ligand affinity
+boxplot(dr_muts_lig$duet_scaled, other_muts_lig$duet_scaled, main = "Ligand affinity")
+wilcox.test(dr_muts_lig$duet_scaled, other_muts_lig$duet_scaled, paired = F)
+
+# rd
+boxplot(dr_muts_lig$rd_values, other_muts_lig$rd_values, main = "RD")
+wilcox.test(dr_muts_lig$rd_values, other_muts_lig$rd_values)
+
+# kd
+boxplot(dr_muts_lig$kd_values, other_muts_lig$kd_values, main = "KD")
+wilcox.test(dr_muts_lig$kd_values, other_muts_lig$kd_values)
+
+# asa
+boxplot(dr_muts_lig$asa, other_muts_lig$asa, main = "ASA")
+wilcox.test(dr_muts_lig$asa, other_muts_lig$asa)
+
+# rsa
+boxplot(dr_muts_lig$rsa, other_muts_lig$rsa, main = "RSA")
+wilcox.test(dr_muts_lig$rsa, other_muts_lig$rsa)
+
+# checking agreement b/w mcsm and foldx
+cols_to_select = c("mutationinformation"
+                   , "mutation_info"
+                   , "duet_scaled"
+                   , "ddg"
+                   , "duet_outcome"
+                   , "foldx_outcome")
+
+merged_df3_short = select(merged_df3, cols_to_select)
+
+mcsm_foldx = merged_df3_short[which(merged_df3_short$duet_outcome != merged_df3_short$foldx_outcome),]
+
+mcsm_foldx$sign_comp = ifelse(sign(mcsm_foldx$duet_scaled)==sign(mcsm_foldx$ddg), "PASS", "FAIL")
+table(mcsm_foldx$sign_comp)
+
+# another way of checking
+merged_df3$sign_comp = ifelse(sign(merged_df3$duet_scaled)==sign(merged_df3$ddg), "PASS", "FAIL")
+table(merged_df3$sign_comp)
+
+disagreement = table(merged_df3$sign_comp)[2]/nrow(merged_df3)*100
+agreement = 100 - disagreement
+
+cat("There is", agreement, "% between mcsm and foldx predictions")