From 3f8d6695a45c4aed8922552e16238f1cb43d60c1 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Wed, 26 Aug 2020 17:18:45 +0100 Subject: [PATCH] all barplots generated for ps and lig --- scripts/plotting/autoviz.py | 45 +++ scripts/plotting/barplots_subcolours_aa_LIG.R | 265 ++++++++++++++++++ scripts/plotting/basic_barplots_LIG.R | 193 +++++++++++++ scripts/plotting/columns_all_params.csv | 87 ++++++ scripts/plotting/notes | 66 +++++ 5 files changed, 656 insertions(+) create mode 100644 scripts/plotting/autoviz.py create mode 100755 scripts/plotting/barplots_subcolours_aa_LIG.R create mode 100755 scripts/plotting/basic_barplots_LIG.R create mode 100644 scripts/plotting/columns_all_params.csv create mode 100644 scripts/plotting/notes diff --git a/scripts/plotting/autoviz.py b/scripts/plotting/autoviz.py new file mode 100644 index 0000000..8dc6405 --- /dev/null +++ b/scripts/plotting/autoviz.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python3 +#======================================================================= +#%% useful links +#https://towardsdatascience.com/autoviz-automatically-visualize-any-dataset-ba2691a8b55a +#https://pypi.org/project/autoviz/ +#======================================================================= +import os, sys +import pandas as pd +import numpy as np +import re +import argparse +from autoviz.AutoViz_Class import AutoViz_Class + +homedir = os.path.expanduser('~') +os.chdir(homedir + '/git/LSHTM_analysis/scripts') +#%%============================================================================ +# variables +gene = 'pncA' +drug = 'pyrazinamide' + +#%%============================================================================ +#============== +# directories +#============== +datadir = homedir + '/' + 'git/Data' + +indir = datadir + '/' + drug + '/input' + +outdir = datadir + '/' + drug + '/output' + +#======= +# input +#======= +in_filename_plotting = 'car_design.csv' +in_filename_plotting = gene.lower() + '_all_params.csv' +infile_plotting = outdir + '/' + in_filename_plotting +print('plotting file: ', infile_plotting + , '\n============================================================') +#======================================================================= +plotting_df = pd.read_csv(infile_plotting, sep = ',') +#Instantiate the AutoViz class +AV = AutoViz_Class() +df = AV.AutoViz(infile_plotting) +#df2 = AV.AutoViz(plotting_df) +plotting_df.columns[~plotting_df.columns.isin(df.columns)] diff --git a/scripts/plotting/barplots_subcolours_aa_LIG.R b/scripts/plotting/barplots_subcolours_aa_LIG.R new file mode 100755 index 0000000..f00466b --- /dev/null +++ b/scripts/plotting/barplots_subcolours_aa_LIG.R @@ -0,0 +1,265 @@ +#!/usr/bin/env Rscript +getwd() +setwd("~/git/LSHTM_analysis/scripts/plotting") +getwd() + +######################################################### +# TASK: output barplot by position with each bar coloured by +# its stability value and active site positions indicated +# according to colour specified in "subcols_axis_PS.R" +######################################################### + +#======================================================================= + +############################################################ +# 1: Installing and loading required packages and functions +############################################################ + +#source("Header_TT.R") +library(ggplot2) +library(data.table) +source("barplot_colour_function.R") +#source("subcols_axis.R") +source("subcols_axis_PS.R") + +# should return the following dfs, directories and variables +# mut_pos_cols +# mut_pos_cols_lig +# my_df_cols +# my_df_u_cols +# my_df_u_lig_cols +# dup_muts_cols + +cat(paste0("Directories imported:" + , "\ndatadir:", datadir + , "\nindir:", indir + , "\noutdir:", outdir + , "\nplotdir:", plotdir)) + +cat(paste0("Variables imported:" + , "\ndrug:", drug + , "\ngene:", gene + , "\ngene_match:", gene_match + , "\nLength of upos:", length(upos) + , "\nAngstrom symbol:", angstroms_symbol)) + +# clear excess variable +rm(dup_muts_cols, mut_pos_cols, my_df_cols, my_df_u_cols, upos) + +#======================================================================= +# !!! very important!!!! +#================ +# Inspecting mut_pos_cols +# position numbers and colours and assigning axis colours based on lab_fg +# of the correct df +# open file from desktop ("sample_axis_cols") for cross checking +#================ +if ( nrow(mut_pos_cols_lig) == length(unique(my_df_u_cols_lig$position)) ){ + print("PASS: lengths checked, assigning axis colours") + my_axis_colours = mut_pos_cols_lig$lab_fg + cat("length of axis colours:", length(my_axis_colours) + , "\nwhich corresponds to the number of positions on the x-axis of the plot") +}else{ + print("FAIL:lengths mismatch, could not assign axis colours") + quit() +} + +# further sanity checks +table(mut_pos_cols_lig$lab_bg) +check_lab_bg = sum( table(mut_pos_cols_lig$lab_bg) ) == nrow(mut_pos_cols_lig) # should be True +check_lab_bg + +table(mut_pos_cols_lig$lab_bg2) +check_lab_bg2 = sum( table(mut_pos_cols_lig$lab_bg2) ) == nrow(mut_pos_cols_lig) # should be True +check_lab_bg2 + +table(mut_pos_cols_lig$lab_fg) +check_lab_fg = sum( table(mut_pos_cols_lig$lab_fg) ) == nrow(mut_pos_cols_lig) # should be True +check_lab_fg + +# sanity checks: +if (check_lab_bg && check_lab_bg2 && check_lab_fg) { + print("PASS: No. of assigned colours match length") +}else{ + print("FAIL: length of assigned colours mismatch") + quit() +} +#======================================================================= +#======= +# output +#======= +# plot name and location +print(paste0("plot will be in:", plotdir)) +bp_aa_subcols_ligand = "barplot_acoloured_LIG.svg" +plot_bp_aa_subcols_ligand = paste0(plotdir, "/", bp_aa_subcols_ligand) + +#======================================================================= +#================ +# Data for plots +#================ +# REASSIGNMENT as necessary +df = my_df_u_cols_lig + +cat("ligand df dim:", dim(df)) + +# sanity checks +str(df) + +# sanity check +df[df$position == "49",] +df[df$position == "13",] +df[df$position == "103",] + +########################### +# Plot: Ligand affinity +########################### + +#========================== +# Barplot with scores (unordered) +# corresponds to ligand_outcome +# Stacked Barplot with colours: ligand_outcome @ position coloured by +# stability scores. This is a barplot where each bar corresponds +# to a SNP and is coloured by its corresponding ligand stability value. +# Normalised values (range between -1 and 1 ) to aid visualisation +# NOTE: since barplot plots discrete values, colour = score, so number of +# colours will be equal to the no. of unique normalised scores +# rather than a continuous scale +# will require generating the colour scale separately. +#============================ +# sanity checks +upos = unique(df$position) + +table(df$ligand_outcome) +table(df$ligand_outcome) + +# add frequency of positions (from lib data.table) +setDT(df)[, pos_count := .N, by = .(position)] + +# this is cummulative +table(df$pos_count) + +# use group by on this +library(dplyr) +snpsBYpos_df <- df %>% + group_by(position) %>% + summarize(snpsBYpos = mean(pos_count)) + +table(snpsBYpos_df$snpsBYpos) + +snp_count = sort(unique(snpsBYpos_df$snpsBYpos)) + +# sanity checks +# should be a factor +if (is.factor(df$ligand_outcome)){ + print("ligand_outcome is factor") +}else{ + print("converting ligand_outcome to factor") + df$ligand_outcome = as.factor(df$ligand_outcome) +} + +is.factor(df$ligand_outcome) + +table(df$ligand_outcome) + +# may not be -1 and 1 since these are filtered within 10A +min(df$affinity_scaled) +max(df$affinity_scaled) + +# sanity checks +# very important!!!! +tapply(df$affinity_scaled, df$ligand_outcome, min) + +tapply(df$affinity_scaled, df$ligand_outcome, max) + +# My colour FUNCTION: based on group and subgroup +# in my case; +# df = df +# group = ligand_outcome +# subgroup = normalised score i.e affinity_scaled + +# check unique values in normalised data +u = unique(df$affinity_scaled) +cat("No. of unique values in normalised data:", length(u)) + +# Define group +# Create an extra column called group which contains the "gp name and score" +# so colours can be generated for each unique values in this column +my_grp = df$affinity_scaled +df$group <- paste0(df$ligand_outcome, "_", my_grp, sep = "") + +# Call the function to create the palette based on the group defined above +colours <- ColourPalleteMulti(df, "ligand_outcome", "my_grp") +print(paste0("Colour palette generated for: ", length(colours), " colours")) +my_title = "Ligand affinity" +cat("No. of axis colours: ", length(my_axis_colours)) + +#======================== +# plot with axis colours +#======================== +class(df$lab_bg) + +# define cartesian coord +my_xlim = length(unique(df$position)); my_xlim + +# axis label size +my_xals = 18 +my_yals = 18 + +# axes text size +my_xats = 14 +my_yats = 18 + +#****************** +# generate plot: with axis colours +#****************** +print(paste0("plot name:", plot_bp_aa_subcols_ligand)) + +svg(plot_bp_aa_subcols_ligand, width = 26, height = 4) + +g = ggplot(df, aes(factor(position, ordered = T))) + +outPlot = g + + coord_cartesian(xlim = c(1, my_xlim) + #, ylim = c(0, 6) + , ylim = c(0, max(snp_count)) + , clip = "off") + + geom_bar(aes(fill = group), colour = "grey") + + scale_fill_manual(values = colours + , guide = "none") + + geom_tile(aes(,-0.8, width = 0.95, height = 0.85) + , fill = df$lab_bg) + + geom_tile(aes(,-1.2, width = 0.95, height = -0.2) + , fill = df$lab_bg2) + + +# Here it"s important to specify that your axis goes from 1 to max number of levels + theme(axis.text.x = element_text(size = my_xats + , angle = 90 + , hjust = 1 + , vjust = 0.4 + , colour = my_axis_colours) + + , axis.text.y = element_text(size = my_yats + , angle = 0 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = my_xals) + #, hjust = 1 + #, vjust = 0.4) + , axis.title.y = element_text(size = my_yals ) + , axis.ticks.x = element_blank()) + + labs(title = "" + #title = my_title + , x = "position" + , y = "Frequency") + +print(outPlot) +dev.off() + +#!!!!!!!!!!!!!!!! +#Warning message: +# Vectorized input to `element_text()` is not officially supported. +#Results may be unexpected or may change in future versions of ggplot2. +#!!!!!!!!!!!!!!!!! + +# for sanity and good practice +#rm(df) diff --git a/scripts/plotting/basic_barplots_LIG.R b/scripts/plotting/basic_barplots_LIG.R new file mode 100755 index 0000000..ac1123c --- /dev/null +++ b/scripts/plotting/basic_barplots_LIG.R @@ -0,0 +1,193 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: producing barplots +# basic barplots with count of mutations +# basic barplots with frequency of count of mutations +######################################################### +#======================================================================= +# working dir and loading libraries +getwd() +setwd("~/git/LSHTM_analysis/scripts/plotting") +getwd() + +#source("Header_TT.R") +library(ggplot2) +library(data.table) +library(dplyr) +source("plotting_data.R") + +# should return the following dfs and directories +# my_df +# my_df_u +# my_df_u_lig + # dup_muts + +cat(paste0("Directories imported:" + , "\ndatadir:", datadir + , "\nindir:", indir + , "\noutdir:", outdir + , "\nplotdir:", plotdir)) + + cat(paste0("Variables imported:" + , "\ndrug:", drug + , "\ngene:", gene + , "\ngene_match:", gene_match + , "\nLength of upos:", length(upos) + , "\nAngstrom symbol:", angstroms_symbol)) + +# clear excess variable +rm(my_df, upos, dup_muts, my_df_u) + +#======================================================================= +#======= +# output +#======= +# plot 1 +basic_bp_ligand = "basic_barplot_LIG.svg" +plot_basic_bp_ligand = paste0(plotdir,"/", basic_bp_ligand) + +# plot 2 +pos_count_ligand = "position_count_LIG.svg" +plot_pos_count_ligand = paste0(plotdir, "/", pos_count_ligand) + +#======================================================================= +#================ +# Data for plots +#================ +# REASSIGNMENT as necessary +df = my_df_u_lig +rm(my_df_u, my_df, upos, dup_muts) + +# sanity checks +str(df) +#===================================================================== +#**************** +# Plot 1:Count of stabilising and destabilsing muts +#**************** + +#svg("basic_barplots_LIG.svg") +svg(plot_basic_bp_ligand) +print(paste0("plot1 filename:", basic_bp_ligand)) + +my_ats = 25 # axis text size +my_als = 22 # axis label size + +theme_set(theme_grey()) + +#-------------- +# start plot 1 +#-------------- +g = ggplot(df, aes(x = ligand_outcome)) +outPlot = g + geom_bar(aes(fill = ligand_outcome) + , show.legend = TRUE) + + geom_label(stat = "count" + , aes(label = ..count..) + , color = "black" + , show.legend = FALSE + , size = 10) + + theme(axis.text.x = element_blank() + , axis.title.x = element_blank() + , axis.title.y = element_text(size=my_als) + , axis.text.y = element_text(size = my_ats) + , legend.position = c(0.73,0.8) + , legend.text = element_text(size=my_als-2) + , legend.title = element_text(size=my_als) + , plot.title = element_blank()) + + labs(title = "" + , y = "Number of SNPs" + #, fill="ligand_outcome" + ) + + scale_fill_discrete(name = "Ligand Outcome" + , labels = c("Destabilising", "Stabilising")) + +print(outPlot) +dev.off() + +table(df$ligand_outcome) +#======================================================================= +#**************** +# Plot 2: frequency of positions +#**************** +df_ncols = ncol(df) +df_nrows = nrow(df) + +cat(paste0("original df dimensions:" + , "\nNo. of rows:", df_nrows + , "\nNo. of cols:", df_ncols + , "\nNow adding column: frequency of mutational positions")) + +#setDT(df)[, .(pos_count := .N), by = .(position)] +setDT(df)[, pos_count := .N, by = .(position)] + +rm(df_nrows, df_ncols) + +df_nrows = nrow(df) +df_ncols = ncol(df) + +cat(paste0("revised df dimensions:" + , "\nNo. of rows:", df_nrows + , "\nNo. of cols:", df_ncols)) + +# this is cummulative +table(df$pos_count) + +# use group by on this +snpsBYpos_df <- df %>% + group_by(position) %>% + summarize(snpsBYpos = mean(pos_count)) + +table(snpsBYpos_df$snpsBYpos) + +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +# FIXME, get this mutation_info, perhaLIG useful! +foo = select(df, mutationinformation + , wild_pos + , wild_type + , mutant_type + #, mutation_info # comes from meta data, notused yet + , position + , pos_count) + +#write.csv(foo, "/pos_count_freq.csv") +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +#-------------- +# start plot 2 +#-------------- +#svg("position_count_LIG.svg") +svg(plot_pos_count_ligand) +print(paste0("plot filename:", plot_pos_count_ligand)) + +my_ats = 25 # axis text size +my_als = 22 # axis label size + +# to make x axis display all positions +# not sure if to use with sort or directly +my_x = sort(unique(snpsBYpos_df$snpsBYpos)) + +g = ggplot(snpsBYpos_df, aes(x = snpsBYpos)) +outPlot_pos = g + geom_bar(aes (alpha = 0.5) + , show.legend = FALSE) + + scale_x_continuous(breaks = unique(snpsBYpos_df$snpsBYpos)) + + #scale_x_continuous(breaks = my_x) + + geom_label(stat = "count", aes(label = ..count..) + , color = "black" + , size = 10) + + theme(axis.text.x = element_text(size = my_ats + , angle = 0) + , axis.text.y = element_text(size = my_ats + , angle = 0 + , hjust = 1) + , axis.title.x = element_text(size = my_als) + , axis.title.y = element_text(size = my_als) + , plot.title = element_blank()) + + + labs(x = "Number of SNPs" + , y = "Number of Sites") + +print(outPlot_pos) +dev.off() +######################################################################## +# end of lig barplots +######################################################################## + diff --git a/scripts/plotting/columns_all_params.csv b/scripts/plotting/columns_all_params.csv new file mode 100644 index 0000000..fb8adc3 --- /dev/null +++ b/scripts/plotting/columns_all_params.csv @@ -0,0 +1,87 @@ +,x,,changes, +1,mutationinformation,,Mutationinformation, +2,wild_type,,,consider...wild_aa +3,position,,Position, +4,mutant_type,,,consider...mutant_aa +5,chain,,, +6,ligand_id,,, +7,ligand_distance,,, +8,duet_stability_change,,, +9,duet_outcome,,DUET_outcome, +10,ligand_affinity_change,,, +11,ligand_outcome,,Lig_outcome, +12,duet_scaled,,ratioDUET, +13,affinity_scaled,,ratioPredAff, +14,wild_pos,,WildPos, +15,wild_chain_pos,,, +16,ddg,,, +17,contacts,,, +18,electro_rr,,, +19,electro_mm,,, +20,electro_sm,,, +21,electro_ss,,, +22,disulfide_rr,,, +23,disulfide_mm,,, +24,disulfide_sm,,, +25,disulfide_ss,,, +26,hbonds_rr,,, +27,hbonds_mm,,, +28,hbonds_sm,,, +29,hbonds_ss,,, +30,partcov_rr,,, +31,partcov_mm,,, +32,partcov_sm,,, +33,partcov_ss,,, +34,vdwclashes_rr,,, +35,vdwclashes_mm,,, +36,vdwclashes_sm,,, +37,vdwclashes_ss,,, +38,volumetric_rr,,, +39,volumetric_mm,,, +40,volumetric_sm,,, +41,volumetric_ss,,, +42,wild_type_dssp,,, +43,asa,,, +44,rsa,,, +45,ss,,, +46,ss_class,,, +47,chain_id,,, +48,wild_type_kd,,, +49,kd_values,,, +50,rd_values,,, +51,wt_3letter_caps,,, +52,mutation,,, +53,af,,, +54,beta_logistic,,, +55,or_logistic,,, +56,pval_logistic,,, +57,se_logistic,,, +58,zval_logistic,,, +59,ci_low_logistic,,, +60,ci_hi_logistic,,, +61,or_mychisq,,, +62,or_fisher,,, +63,pval_fisher,,, +64,ci_low_fisher,,, +65,ci_hi_fisher,,, +66,est_chisq,,, +67,pval_chisq,,, +68,chromosome_number,,, +69,ref_allele,,, +70,alt_allele,,, +71,mut_type,,, +72,gene_id,,, +73,gene_number,,, +74,mut_region,,, +75,mut_info,,, +76,chr_num_allele,,, +77,wt_3let,,, +78,mt_3let,,, +79,af_kin,,, +80,or_kin,,, +81,pwald_kin,,, +82,beta_kin,,, +83,se_kin,,, +84,logl_h1_kin,,, +85,l_remle_kin,,, +86,n_miss,,, diff --git a/scripts/plotting/notes b/scripts/plotting/notes new file mode 100644 index 0000000..7cc9e0e --- /dev/null +++ b/scripts/plotting/notes @@ -0,0 +1,66 @@ +##################### +# combining_two_df.R +##################### +orig_col ==> df_ncols +Mutationinformation ==> mutationinformation +Position ==> position +DUET_outcome ==> duet_outcome +Lig_outcome ==> ligand_outcome + +infile ==> infile_params +in_filename_comb ==> in_filename_metadata +meta_with_afor ==> gene_metadata + +#!!!!!!!!! +# FIXME: plotting_data.R +#!!!!!!!!! + +# This script will be called by various plotting scripts. +# Ensure you can call this using command line args which are currently commented out + +##################### +# basic_barplots_PS.R +##################### +dim(my_df) +416, 86 + +# unique mutations +dim(my_df_u) +403, 86 + +# all dups identified are destabilising +dups_df = 13 rows +11 unique muts + +upos = unique(my_df_u$position) +145 + +df = my_df_u + +df$duet_outcome + +Destabilising Stabilising + 346 57 + + + # with dups + Destabilising Stabilising + 359 57 + +table(df$pos_count) +# this is cummulative +#1 2 3 4 5 6 +#39 62 81 100 85 36 + + # with dups + 1 2 3 4 5 6 7 + 39 60 75 92 100 36 14 + +# use group by +table(snpsBYpos_df$snpsBYpos) +#1 2 3 4 5 6 +#39 31 27 25 17 6 + + # with dups + 1 2 3 4 5 6 7 + 39 30 25 23 20 6 2