From 3f8d6695a45c4aed8922552e16238f1cb43d60c1 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Wed, 26 Aug 2020 17:18:45 +0100
Subject: [PATCH] all barplots generated for ps and lig

---
 scripts/plotting/autoviz.py                   |  45 +++
 scripts/plotting/barplots_subcolours_aa_LIG.R | 265 ++++++++++++++++++
 scripts/plotting/basic_barplots_LIG.R         | 193 +++++++++++++
 scripts/plotting/columns_all_params.csv       |  87 ++++++
 scripts/plotting/notes                        |  66 +++++
 5 files changed, 656 insertions(+)
 create mode 100644 scripts/plotting/autoviz.py
 create mode 100755 scripts/plotting/barplots_subcolours_aa_LIG.R
 create mode 100755 scripts/plotting/basic_barplots_LIG.R
 create mode 100644 scripts/plotting/columns_all_params.csv
 create mode 100644 scripts/plotting/notes

diff --git a/scripts/plotting/autoviz.py b/scripts/plotting/autoviz.py
new file mode 100644
index 0000000..8dc6405
--- /dev/null
+++ b/scripts/plotting/autoviz.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python3
+#=======================================================================
+#%% useful links
+#https://towardsdatascience.com/autoviz-automatically-visualize-any-dataset-ba2691a8b55a
+#https://pypi.org/project/autoviz/
+#=======================================================================
+import os, sys
+import pandas as pd
+import numpy as np
+import re
+import argparse
+from autoviz.AutoViz_Class import AutoViz_Class
+
+homedir = os.path.expanduser('~')
+os.chdir(homedir + '/git/LSHTM_analysis/scripts')
+#%%============================================================================
+# variables
+gene = 'pncA'
+drug = 'pyrazinamide'
+
+#%%============================================================================
+#==============
+# directories
+#==============
+datadir = homedir + '/' + 'git/Data'
+    
+indir = datadir + '/' + drug + '/input'
+    
+outdir = datadir + '/' + drug + '/output'
+
+#=======
+# input
+#=======
+in_filename_plotting = 'car_design.csv'
+in_filename_plotting = gene.lower() + '_all_params.csv'
+infile_plotting = outdir + '/' + in_filename_plotting
+print('plotting file: ', infile_plotting
+      , '\n============================================================')
+#=======================================================================
+plotting_df = pd.read_csv(infile_plotting, sep = ',')
+#Instantiate the AutoViz class
+AV = AutoViz_Class()
+df = AV.AutoViz(infile_plotting)
+#df2 = AV.AutoViz(plotting_df)
+plotting_df.columns[~plotting_df.columns.isin(df.columns)]
diff --git a/scripts/plotting/barplots_subcolours_aa_LIG.R b/scripts/plotting/barplots_subcolours_aa_LIG.R
new file mode 100755
index 0000000..f00466b
--- /dev/null
+++ b/scripts/plotting/barplots_subcolours_aa_LIG.R
@@ -0,0 +1,265 @@
+#!/usr/bin/env Rscript   
+getwd()
+setwd("~/git/LSHTM_analysis/scripts/plotting")
+getwd()
+
+#########################################################
+# TASK: output barplot by position with each bar coloured by 
+# its stability value and active site positions indicated 
+# according to colour specified in "subcols_axis_PS.R"
+#########################################################
+
+#=======================================================================
+
+############################################################
+# 1: Installing and loading required packages and functions
+############################################################
+
+#source("Header_TT.R")
+library(ggplot2)
+library(data.table)
+source("barplot_colour_function.R")
+#source("subcols_axis.R")
+source("subcols_axis_PS.R")
+
+# should return the following dfs, directories and variables
+# mut_pos_cols
+# mut_pos_cols_lig
+# my_df_cols
+# my_df_u_cols
+# my_df_u_lig_cols
+# dup_muts_cols
+
+cat(paste0("Directories imported:"
+           , "\ndatadir:", datadir
+           , "\nindir:", indir
+           , "\noutdir:", outdir
+           , "\nplotdir:", plotdir))
+
+cat(paste0("Variables imported:"
+           , "\ndrug:", drug
+           , "\ngene:", gene
+           , "\ngene_match:", gene_match
+           , "\nLength of upos:", length(upos)
+           , "\nAngstrom symbol:", angstroms_symbol))
+
+# clear excess variable
+rm(dup_muts_cols, mut_pos_cols, my_df_cols, my_df_u_cols, upos)
+
+#=======================================================================
+# !!! very important!!!!
+#================
+# Inspecting mut_pos_cols
+# position numbers and colours and assigning axis colours based on lab_fg
+# of the correct df
+# open file from desktop ("sample_axis_cols") for cross checking
+#================
+if ( nrow(mut_pos_cols_lig) == length(unique(my_df_u_cols_lig$position)) ){
+  print("PASS: lengths checked, assigning axis colours")
+  my_axis_colours = mut_pos_cols_lig$lab_fg
+  cat("length of axis colours:", length(my_axis_colours)
+      , "\nwhich corresponds to the number of positions on the x-axis of the plot")
+}else{
+  print("FAIL:lengths mismatch, could not assign axis colours")
+  quit()
+}
+
+# further sanity checks
+table(mut_pos_cols_lig$lab_bg)
+check_lab_bg = sum( table(mut_pos_cols_lig$lab_bg) ) == nrow(mut_pos_cols_lig) # should  be True
+check_lab_bg    
+
+table(mut_pos_cols_lig$lab_bg2)
+check_lab_bg2 = sum( table(mut_pos_cols_lig$lab_bg2) ) ==  nrow(mut_pos_cols_lig) # should  be True
+check_lab_bg2
+
+table(mut_pos_cols_lig$lab_fg)
+check_lab_fg = sum( table(mut_pos_cols_lig$lab_fg) ) ==  nrow(mut_pos_cols_lig) # should  be True
+check_lab_fg
+
+# sanity checks:
+if (check_lab_bg && check_lab_bg2 && check_lab_fg) {
+  print("PASS: No. of assigned colours match length")
+}else{
+  print("FAIL: length of assigned colours mismatch")
+  quit()
+}
+#=======================================================================
+#=======
+# output
+#=======
+# plot name and location
+print(paste0("plot will be in:", plotdir))
+bp_aa_subcols_ligand = "barplot_acoloured_LIG.svg"
+plot_bp_aa_subcols_ligand = paste0(plotdir, "/", bp_aa_subcols_ligand)
+
+#=======================================================================
+#================
+# Data for plots
+#================
+# REASSIGNMENT as necessary
+df  = my_df_u_cols_lig
+
+cat("ligand df dim:", dim(df))
+
+# sanity checks
+str(df)
+
+# sanity check
+df[df$position == "49",]
+df[df$position == "13",]
+df[df$position == "103",]
+
+###########################
+# Plot: Ligand affinity
+###########################
+
+#==========================
+# Barplot with scores (unordered)
+# corresponds to ligand_outcome
+# Stacked Barplot with colours: ligand_outcome @ position coloured by 
+# stability scores. This is a barplot where each bar corresponds 
+# to a SNP and is coloured by its corresponding ligand stability value.
+# Normalised values (range between -1 and 1 ) to aid visualisation
+# NOTE: since barplot plots discrete values, colour = score, so number of
+# colours will be equal to the no. of unique normalised scores 
+# rather than a continuous scale
+# will require generating the colour scale separately.
+#============================
+# sanity checks
+upos = unique(df$position)
+
+table(df$ligand_outcome)
+table(df$ligand_outcome)
+
+# add frequency of positions (from lib data.table)
+setDT(df)[, pos_count := .N, by = .(position)]
+
+# this is cummulative
+table(df$pos_count)
+
+# use group by on this
+library(dplyr)
+snpsBYpos_df <- df %>%
+  group_by(position) %>%
+  summarize(snpsBYpos = mean(pos_count))
+
+table(snpsBYpos_df$snpsBYpos)
+
+snp_count = sort(unique(snpsBYpos_df$snpsBYpos))
+
+# sanity checks
+# should be a factor
+if (is.factor(df$ligand_outcome)){
+  print("ligand_outcome is factor")
+}else{
+  print("converting ligand_outcome to factor")
+  df$ligand_outcome = as.factor(df$ligand_outcome)
+}
+
+is.factor(df$ligand_outcome)
+
+table(df$ligand_outcome)
+
+# may not be -1 and 1 since these are filtered within 10A
+min(df$affinity_scaled)
+max(df$affinity_scaled)
+
+# sanity checks
+# very important!!!!
+tapply(df$affinity_scaled, df$ligand_outcome, min)
+
+tapply(df$affinity_scaled, df$ligand_outcome, max)
+
+# My colour FUNCTION: based on group and subgroup
+# in my case;
+# df = df
+# group = ligand_outcome
+# subgroup = normalised score i.e affinity_scaled
+
+# check unique values in normalised data
+u = unique(df$affinity_scaled) 
+cat("No. of unique values in normalised data:", length(u))
+
+# Define group
+# Create an extra column called group which contains the "gp name and score" 
+# so colours can be generated for each unique values in this column
+my_grp = df$affinity_scaled 
+df$group <- paste0(df$ligand_outcome, "_", my_grp, sep = "") 
+
+# Call the function to create the palette based on the group defined above
+colours <- ColourPalleteMulti(df, "ligand_outcome", "my_grp")
+print(paste0("Colour palette generated for: ", length(colours), " colours"))
+my_title = "Ligand affinity"
+cat("No. of axis colours: ", length(my_axis_colours))
+
+#========================
+# plot with axis colours
+#========================
+class(df$lab_bg)
+
+# define cartesian coord
+my_xlim = length(unique(df$position)); my_xlim
+
+# axis label size
+my_xals = 18
+my_yals = 18
+
+# axes text size
+my_xats = 14
+my_yats = 18
+
+#******************
+# generate plot: with axis colours
+#******************
+print(paste0("plot name:", plot_bp_aa_subcols_ligand))
+
+svg(plot_bp_aa_subcols_ligand, width = 26, height = 4)
+
+g = ggplot(df, aes(factor(position, ordered = T)))
+
+outPlot = g + 
+  coord_cartesian(xlim = c(1, my_xlim)
+                  #, ylim = c(0, 6)
+                  , ylim = c(0, max(snp_count))
+                  , clip = "off") +
+  geom_bar(aes(fill = group), colour = "grey") +
+  scale_fill_manual(values = colours
+                     , guide = "none") +
+  geom_tile(aes(,-0.8, width = 0.95, height = 0.85)
+            , fill = df$lab_bg) +
+  geom_tile(aes(,-1.2, width = 0.95, height = -0.2)
+            , fill = df$lab_bg2) +
+  
+# Here it"s important to specify that your axis goes from 1 to max number of levels
+  theme(axis.text.x = element_text(size = my_xats
+                                    , angle = 90
+                                    , hjust = 1
+                                    , vjust = 0.4
+                                    , colour = my_axis_colours)
+        
+         , axis.text.y = element_text(size = my_yats 
+                                      , angle = 0
+                                      , hjust = 1
+                                      , vjust = 0)
+         , axis.title.x = element_text(size = my_xals)
+                                       #, hjust = 1
+                                       #, vjust = 0.4)
+         , axis.title.y = element_text(size = my_yals ) 
+         , axis.ticks.x = element_blank()) +
+  labs(title = ""
+       #title = my_title
+       , x = "position"
+       , y = "Frequency")
+
+print(outPlot)
+dev.off()
+
+#!!!!!!!!!!!!!!!!
+#Warning message:
+#  Vectorized input to `element_text()` is not officially supported.
+#Results may be unexpected or may change in future versions of ggplot2. 
+#!!!!!!!!!!!!!!!!!
+
+# for sanity and good practice
+#rm(df)
diff --git a/scripts/plotting/basic_barplots_LIG.R b/scripts/plotting/basic_barplots_LIG.R
new file mode 100755
index 0000000..ac1123c
--- /dev/null
+++ b/scripts/plotting/basic_barplots_LIG.R
@@ -0,0 +1,193 @@
+#!/usr/bin/env Rscript  
+#########################################################
+# TASK: producing barplots
+# basic barplots with count of mutations
+# basic barplots with frequency of count of mutations
+#########################################################
+#=======================================================================
+# working dir and loading libraries
+getwd()
+setwd("~/git/LSHTM_analysis/scripts/plotting")
+getwd()
+
+#source("Header_TT.R")
+library(ggplot2)
+library(data.table)
+library(dplyr)
+source("plotting_data.R")
+
+# should return the following dfs and directories
+# my_df
+# my_df_u
+# my_df_u_lig
+ # dup_muts
+
+cat(paste0("Directories imported:"
+           , "\ndatadir:", datadir
+           , "\nindir:", indir
+           , "\noutdir:", outdir
+           , "\nplotdir:", plotdir))
+
+ cat(paste0("Variables imported:"
+           , "\ndrug:", drug
+           , "\ngene:", gene
+           , "\ngene_match:", gene_match
+           , "\nLength of upos:", length(upos)
+           , "\nAngstrom symbol:", angstroms_symbol))       
+           
+# clear excess variable
+rm(my_df, upos, dup_muts, my_df_u)
+
+#=======================================================================
+#=======
+# output
+#=======
+# plot 1
+basic_bp_ligand = "basic_barplot_LIG.svg"
+plot_basic_bp_ligand  =  paste0(plotdir,"/", basic_bp_ligand)
+
+# plot 2
+pos_count_ligand = "position_count_LIG.svg"
+plot_pos_count_ligand = paste0(plotdir, "/", pos_count_ligand)
+
+#=======================================================================
+#================
+# Data for plots
+#================
+# REASSIGNMENT as necessary
+df  = my_df_u_lig
+rm(my_df_u, my_df, upos, dup_muts)
+
+# sanity checks
+str(df)
+#=====================================================================
+#****************
+# Plot 1:Count of stabilising and destabilsing muts
+#****************
+
+#svg("basic_barplots_LIG.svg")
+svg(plot_basic_bp_ligand)
+print(paste0("plot1 filename:", basic_bp_ligand))
+
+my_ats = 25 # axis text size
+my_als = 22 # axis label size
+
+theme_set(theme_grey())
+
+#--------------
+# start plot 1
+#--------------
+g = ggplot(df, aes(x = ligand_outcome))
+outPlot = g + geom_bar(aes(fill = ligand_outcome)
+                         , show.legend = TRUE) + 
+  geom_label(stat = "count"
+             , aes(label = ..count..)
+             , color = "black"
+             , show.legend = FALSE
+             , size = 10) + 
+  theme(axis.text.x = element_blank()
+        , axis.title.x = element_blank()
+        , axis.title.y = element_text(size=my_als)
+        , axis.text.y = element_text(size = my_ats)
+        , legend.position = c(0.73,0.8)
+        , legend.text = element_text(size=my_als-2)
+        , legend.title = element_text(size=my_als)
+        , plot.title = element_blank()) + 
+  labs(title = ""
+       , y = "Number of SNPs"
+      #, fill="ligand_outcome"
+      ) + 
+  scale_fill_discrete(name = "Ligand Outcome"
+                      , labels = c("Destabilising", "Stabilising"))
+
+print(outPlot)
+dev.off()
+
+table(df$ligand_outcome)
+#=======================================================================
+#****************
+# Plot 2: frequency of positions
+#****************
+df_ncols = ncol(df)
+df_nrows = nrow(df)
+
+cat(paste0("original df dimensions:"
+    , "\nNo. of rows:", df_nrows
+    , "\nNo. of cols:", df_ncols
+    , "\nNow adding column: frequency of mutational positions"))
+
+#setDT(df)[, .(pos_count := .N), by = .(position)] 
+setDT(df)[, pos_count := .N, by = .(position)]
+
+rm(df_nrows, df_ncols) 
+
+df_nrows = nrow(df)
+df_ncols = ncol(df)
+
+cat(paste0("revised df dimensions:"
+           , "\nNo. of rows:", df_nrows
+           , "\nNo. of cols:", df_ncols))
+
+# this is cummulative
+table(df$pos_count)
+
+# use group by on this
+snpsBYpos_df <- df %>%
+  group_by(position) %>%
+  summarize(snpsBYpos = mean(pos_count))
+
+table(snpsBYpos_df$snpsBYpos)
+
+#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+# FIXME, get this mutation_info, perhaLIG useful!
+foo = select(df, mutationinformation
+             , wild_pos
+             , wild_type
+             , mutant_type
+             #, mutation_info # comes from meta data, notused yet
+             , position
+             , pos_count)
+
+#write.csv(foo, "/pos_count_freq.csv")
+#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+#--------------
+# start plot 2
+#--------------
+#svg("position_count_LIG.svg")
+svg(plot_pos_count_ligand)
+print(paste0("plot filename:", plot_pos_count_ligand))
+
+my_ats = 25 # axis text size
+my_als = 22 # axis label size
+
+# to make x axis display all positions
+# not sure if to use with sort or directly
+my_x = sort(unique(snpsBYpos_df$snpsBYpos)) 
+
+g = ggplot(snpsBYpos_df, aes(x = snpsBYpos))
+outPlot_pos = g + geom_bar(aes (alpha = 0.5)
+                         , show.legend = FALSE) +
+  scale_x_continuous(breaks = unique(snpsBYpos_df$snpsBYpos)) +
+  #scale_x_continuous(breaks = my_x) +
+  geom_label(stat = "count", aes(label = ..count..)
+             , color = "black"
+             , size = 10) +
+  theme(axis.text.x = element_text(size = my_ats
+                                   , angle = 0)
+        , axis.text.y = element_text(size = my_ats
+                                     , angle = 0
+                                     , hjust = 1)
+        , axis.title.x = element_text(size = my_als)
+        , axis.title.y = element_text(size = my_als)
+        , plot.title = element_blank()) +
+  
+  labs(x = "Number of SNPs"
+       , y = "Number of Sites")
+
+print(outPlot_pos)
+dev.off()
+########################################################################
+#               			end of lig barplots         			   
+########################################################################
+
diff --git a/scripts/plotting/columns_all_params.csv b/scripts/plotting/columns_all_params.csv
new file mode 100644
index 0000000..fb8adc3
--- /dev/null
+++ b/scripts/plotting/columns_all_params.csv
@@ -0,0 +1,87 @@
+,x,,changes,
+1,mutationinformation,,Mutationinformation,
+2,wild_type,,,consider...wild_aa
+3,position,,Position,
+4,mutant_type,,,consider...mutant_aa
+5,chain,,,
+6,ligand_id,,,
+7,ligand_distance,,,
+8,duet_stability_change,,,
+9,duet_outcome,,DUET_outcome,
+10,ligand_affinity_change,,,
+11,ligand_outcome,,Lig_outcome,
+12,duet_scaled,,ratioDUET,
+13,affinity_scaled,,ratioPredAff,
+14,wild_pos,,WildPos,
+15,wild_chain_pos,,,
+16,ddg,,,
+17,contacts,,,
+18,electro_rr,,,
+19,electro_mm,,,
+20,electro_sm,,,
+21,electro_ss,,,
+22,disulfide_rr,,,
+23,disulfide_mm,,,
+24,disulfide_sm,,,
+25,disulfide_ss,,,
+26,hbonds_rr,,,
+27,hbonds_mm,,,
+28,hbonds_sm,,,
+29,hbonds_ss,,,
+30,partcov_rr,,,
+31,partcov_mm,,,
+32,partcov_sm,,,
+33,partcov_ss,,,
+34,vdwclashes_rr,,,
+35,vdwclashes_mm,,,
+36,vdwclashes_sm,,,
+37,vdwclashes_ss,,,
+38,volumetric_rr,,,
+39,volumetric_mm,,,
+40,volumetric_sm,,,
+41,volumetric_ss,,,
+42,wild_type_dssp,,,
+43,asa,,,
+44,rsa,,,
+45,ss,,,
+46,ss_class,,,
+47,chain_id,,,
+48,wild_type_kd,,,
+49,kd_values,,,
+50,rd_values,,,
+51,wt_3letter_caps,,,
+52,mutation,,,
+53,af,,,
+54,beta_logistic,,,
+55,or_logistic,,,
+56,pval_logistic,,,
+57,se_logistic,,,
+58,zval_logistic,,,
+59,ci_low_logistic,,,
+60,ci_hi_logistic,,,
+61,or_mychisq,,,
+62,or_fisher,,,
+63,pval_fisher,,,
+64,ci_low_fisher,,,
+65,ci_hi_fisher,,,
+66,est_chisq,,,
+67,pval_chisq,,,
+68,chromosome_number,,,
+69,ref_allele,,,
+70,alt_allele,,,
+71,mut_type,,,
+72,gene_id,,,
+73,gene_number,,,
+74,mut_region,,,
+75,mut_info,,,
+76,chr_num_allele,,,
+77,wt_3let,,,
+78,mt_3let,,,
+79,af_kin,,,
+80,or_kin,,,
+81,pwald_kin,,,
+82,beta_kin,,,
+83,se_kin,,,
+84,logl_h1_kin,,,
+85,l_remle_kin,,,
+86,n_miss,,,
diff --git a/scripts/plotting/notes b/scripts/plotting/notes
new file mode 100644
index 0000000..7cc9e0e
--- /dev/null
+++ b/scripts/plotting/notes
@@ -0,0 +1,66 @@
+#####################
+# combining_two_df.R
+#####################
+orig_col ==> df_ncols
+Mutationinformation ==> mutationinformation
+Position ==> position
+DUET_outcome ==> duet_outcome
+Lig_outcome ==> ligand_outcome
+
+infile ==> infile_params
+in_filename_comb ==> in_filename_metadata
+meta_with_afor ==> gene_metadata
+
+#!!!!!!!!!
+# FIXME: plotting_data.R
+#!!!!!!!!!
+
+# This script will be called by various plotting scripts.
+# Ensure you can call this using command line args which are currently commented out
+
+#####################
+# basic_barplots_PS.R
+#####################
+dim(my_df)
+416, 86
+
+# unique mutations
+dim(my_df_u)
+403, 86
+
+# all dups identified are destabilising
+dups_df = 13 rows
+11 unique muts
+
+upos = unique(my_df_u$position)
+145
+
+df = my_df_u
+
+df$duet_outcome
+
+Destabilising   Stabilising 
+          346           57 
+                       
+                       
+                        # with dups
+                        Destabilising   Stabilising 
+                                  359            57 
+
+table(df$pos_count)
+# this is cummulative
+#1   2   3   4   5   6 
+#39  62  81 100  85  36 
+
+                        # with dups
+                          1   2   3   4   5   6   7 
+                         39  60  75  92 100  36  14 
+
+# use group by
+table(snpsBYpos_df$snpsBYpos)
+#1  2  3  4  5  6 
+#39 31 27 25 17  6 
+
+                         # with dups
+                          1  2  3  4  5  6  7 
+                          39 30 25 23 20  6  2