all barplots generated for ps and lig

This commit is contained in:
Tanushree Tunstall 2020-08-26 17:18:45 +01:00
parent 8ee7d4234e
commit de14752a0c
5 changed files with 656 additions and 0 deletions

View file

@ -0,0 +1,45 @@
#!/usr/bin/env python3
#=======================================================================
#%% useful links
#https://towardsdatascience.com/autoviz-automatically-visualize-any-dataset-ba2691a8b55a
#https://pypi.org/project/autoviz/
#=======================================================================
import os, sys
import pandas as pd
import numpy as np
import re
import argparse
from autoviz.AutoViz_Class import AutoViz_Class
homedir = os.path.expanduser('~')
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
#%%============================================================================
# variables
gene = 'pncA'
drug = 'pyrazinamide'
#%%============================================================================
#==============
# directories
#==============
datadir = homedir + '/' + 'git/Data'
indir = datadir + '/' + drug + '/input'
outdir = datadir + '/' + drug + '/output'
#=======
# input
#=======
in_filename_plotting = 'car_design.csv'
in_filename_plotting = gene.lower() + '_all_params.csv'
infile_plotting = outdir + '/' + in_filename_plotting
print('plotting file: ', infile_plotting
, '\n============================================================')
#=======================================================================
plotting_df = pd.read_csv(infile_plotting, sep = ',')
#Instantiate the AutoViz class
AV = AutoViz_Class()
df = AV.AutoViz(infile_plotting)
#df2 = AV.AutoViz(plotting_df)
plotting_df.columns[~plotting_df.columns.isin(df.columns)]

View file

@ -0,0 +1,265 @@
#!/usr/bin/env Rscript
getwd()
setwd("~/git/LSHTM_analysis/scripts/plotting")
getwd()
#########################################################
# TASK: output barplot by position with each bar coloured by
# its stability value and active site positions indicated
# according to colour specified in "subcols_axis_PS.R"
#########################################################
#=======================================================================
############################################################
# 1: Installing and loading required packages and functions
############################################################
#source("Header_TT.R")
library(ggplot2)
library(data.table)
source("barplot_colour_function.R")
#source("subcols_axis.R")
source("subcols_axis_PS.R")
# should return the following dfs, directories and variables
# mut_pos_cols
# mut_pos_cols_lig
# my_df_cols
# my_df_u_cols
# my_df_u_lig_cols
# dup_muts_cols
cat(paste0("Directories imported:"
, "\ndatadir:", datadir
, "\nindir:", indir
, "\noutdir:", outdir
, "\nplotdir:", plotdir))
cat(paste0("Variables imported:"
, "\ndrug:", drug
, "\ngene:", gene
, "\ngene_match:", gene_match
, "\nLength of upos:", length(upos)
, "\nAngstrom symbol:", angstroms_symbol))
# clear excess variable
rm(dup_muts_cols, mut_pos_cols, my_df_cols, my_df_u_cols, upos)
#=======================================================================
# !!! very important!!!!
#================
# Inspecting mut_pos_cols
# position numbers and colours and assigning axis colours based on lab_fg
# of the correct df
# open file from desktop ("sample_axis_cols") for cross checking
#================
if ( nrow(mut_pos_cols_lig) == length(unique(my_df_u_cols_lig$position)) ){
print("PASS: lengths checked, assigning axis colours")
my_axis_colours = mut_pos_cols_lig$lab_fg
cat("length of axis colours:", length(my_axis_colours)
, "\nwhich corresponds to the number of positions on the x-axis of the plot")
}else{
print("FAIL:lengths mismatch, could not assign axis colours")
quit()
}
# further sanity checks
table(mut_pos_cols_lig$lab_bg)
check_lab_bg = sum( table(mut_pos_cols_lig$lab_bg) ) == nrow(mut_pos_cols_lig) # should be True
check_lab_bg
table(mut_pos_cols_lig$lab_bg2)
check_lab_bg2 = sum( table(mut_pos_cols_lig$lab_bg2) ) == nrow(mut_pos_cols_lig) # should be True
check_lab_bg2
table(mut_pos_cols_lig$lab_fg)
check_lab_fg = sum( table(mut_pos_cols_lig$lab_fg) ) == nrow(mut_pos_cols_lig) # should be True
check_lab_fg
# sanity checks:
if (check_lab_bg && check_lab_bg2 && check_lab_fg) {
print("PASS: No. of assigned colours match length")
}else{
print("FAIL: length of assigned colours mismatch")
quit()
}
#=======================================================================
#=======
# output
#=======
# plot name and location
print(paste0("plot will be in:", plotdir))
bp_aa_subcols_ligand = "barplot_acoloured_LIG.svg"
plot_bp_aa_subcols_ligand = paste0(plotdir, "/", bp_aa_subcols_ligand)
#=======================================================================
#================
# Data for plots
#================
# REASSIGNMENT as necessary
df = my_df_u_cols_lig
cat("ligand df dim:", dim(df))
# sanity checks
str(df)
# sanity check
df[df$position == "49",]
df[df$position == "13",]
df[df$position == "103",]
###########################
# Plot: Ligand affinity
###########################
#==========================
# Barplot with scores (unordered)
# corresponds to ligand_outcome
# Stacked Barplot with colours: ligand_outcome @ position coloured by
# stability scores. This is a barplot where each bar corresponds
# to a SNP and is coloured by its corresponding ligand stability value.
# Normalised values (range between -1 and 1 ) to aid visualisation
# NOTE: since barplot plots discrete values, colour = score, so number of
# colours will be equal to the no. of unique normalised scores
# rather than a continuous scale
# will require generating the colour scale separately.
#============================
# sanity checks
upos = unique(df$position)
table(df$ligand_outcome)
table(df$ligand_outcome)
# add frequency of positions (from lib data.table)
setDT(df)[, pos_count := .N, by = .(position)]
# this is cummulative
table(df$pos_count)
# use group by on this
library(dplyr)
snpsBYpos_df <- df %>%
group_by(position) %>%
summarize(snpsBYpos = mean(pos_count))
table(snpsBYpos_df$snpsBYpos)
snp_count = sort(unique(snpsBYpos_df$snpsBYpos))
# sanity checks
# should be a factor
if (is.factor(df$ligand_outcome)){
print("ligand_outcome is factor")
}else{
print("converting ligand_outcome to factor")
df$ligand_outcome = as.factor(df$ligand_outcome)
}
is.factor(df$ligand_outcome)
table(df$ligand_outcome)
# may not be -1 and 1 since these are filtered within 10A
min(df$affinity_scaled)
max(df$affinity_scaled)
# sanity checks
# very important!!!!
tapply(df$affinity_scaled, df$ligand_outcome, min)
tapply(df$affinity_scaled, df$ligand_outcome, max)
# My colour FUNCTION: based on group and subgroup
# in my case;
# df = df
# group = ligand_outcome
# subgroup = normalised score i.e affinity_scaled
# check unique values in normalised data
u = unique(df$affinity_scaled)
cat("No. of unique values in normalised data:", length(u))
# Define group
# Create an extra column called group which contains the "gp name and score"
# so colours can be generated for each unique values in this column
my_grp = df$affinity_scaled
df$group <- paste0(df$ligand_outcome, "_", my_grp, sep = "")
# Call the function to create the palette based on the group defined above
colours <- ColourPalleteMulti(df, "ligand_outcome", "my_grp")
print(paste0("Colour palette generated for: ", length(colours), " colours"))
my_title = "Ligand affinity"
cat("No. of axis colours: ", length(my_axis_colours))
#========================
# plot with axis colours
#========================
class(df$lab_bg)
# define cartesian coord
my_xlim = length(unique(df$position)); my_xlim
# axis label size
my_xals = 18
my_yals = 18
# axes text size
my_xats = 14
my_yats = 18
#******************
# generate plot: with axis colours
#******************
print(paste0("plot name:", plot_bp_aa_subcols_ligand))
svg(plot_bp_aa_subcols_ligand, width = 26, height = 4)
g = ggplot(df, aes(factor(position, ordered = T)))
outPlot = g +
coord_cartesian(xlim = c(1, my_xlim)
#, ylim = c(0, 6)
, ylim = c(0, max(snp_count))
, clip = "off") +
geom_bar(aes(fill = group), colour = "grey") +
scale_fill_manual(values = colours
, guide = "none") +
geom_tile(aes(,-0.8, width = 0.95, height = 0.85)
, fill = df$lab_bg) +
geom_tile(aes(,-1.2, width = 0.95, height = -0.2)
, fill = df$lab_bg2) +
# Here it"s important to specify that your axis goes from 1 to max number of levels
theme(axis.text.x = element_text(size = my_xats
, angle = 90
, hjust = 1
, vjust = 0.4
, colour = my_axis_colours)
, axis.text.y = element_text(size = my_yats
, angle = 0
, hjust = 1
, vjust = 0)
, axis.title.x = element_text(size = my_xals)
#, hjust = 1
#, vjust = 0.4)
, axis.title.y = element_text(size = my_yals )
, axis.ticks.x = element_blank()) +
labs(title = ""
#title = my_title
, x = "position"
, y = "Frequency")
print(outPlot)
dev.off()
#!!!!!!!!!!!!!!!!
#Warning message:
# Vectorized input to `element_text()` is not officially supported.
#Results may be unexpected or may change in future versions of ggplot2.
#!!!!!!!!!!!!!!!!!
# for sanity and good practice
#rm(df)

View file

@ -0,0 +1,193 @@
#!/usr/bin/env Rscript
#########################################################
# TASK: producing barplots
# basic barplots with count of mutations
# basic barplots with frequency of count of mutations
#########################################################
#=======================================================================
# working dir and loading libraries
getwd()
setwd("~/git/LSHTM_analysis/scripts/plotting")
getwd()
#source("Header_TT.R")
library(ggplot2)
library(data.table)
library(dplyr)
source("plotting_data.R")
# should return the following dfs and directories
# my_df
# my_df_u
# my_df_u_lig
# dup_muts
cat(paste0("Directories imported:"
, "\ndatadir:", datadir
, "\nindir:", indir
, "\noutdir:", outdir
, "\nplotdir:", plotdir))
cat(paste0("Variables imported:"
, "\ndrug:", drug
, "\ngene:", gene
, "\ngene_match:", gene_match
, "\nLength of upos:", length(upos)
, "\nAngstrom symbol:", angstroms_symbol))
# clear excess variable
rm(my_df, upos, dup_muts, my_df_u)
#=======================================================================
#=======
# output
#=======
# plot 1
basic_bp_ligand = "basic_barplot_LIG.svg"
plot_basic_bp_ligand = paste0(plotdir,"/", basic_bp_ligand)
# plot 2
pos_count_ligand = "position_count_LIG.svg"
plot_pos_count_ligand = paste0(plotdir, "/", pos_count_ligand)
#=======================================================================
#================
# Data for plots
#================
# REASSIGNMENT as necessary
df = my_df_u_lig
rm(my_df_u, my_df, upos, dup_muts)
# sanity checks
str(df)
#=====================================================================
#****************
# Plot 1:Count of stabilising and destabilsing muts
#****************
#svg("basic_barplots_LIG.svg")
svg(plot_basic_bp_ligand)
print(paste0("plot1 filename:", basic_bp_ligand))
my_ats = 25 # axis text size
my_als = 22 # axis label size
theme_set(theme_grey())
#--------------
# start plot 1
#--------------
g = ggplot(df, aes(x = ligand_outcome))
outPlot = g + geom_bar(aes(fill = ligand_outcome)
, show.legend = TRUE) +
geom_label(stat = "count"
, aes(label = ..count..)
, color = "black"
, show.legend = FALSE
, size = 10) +
theme(axis.text.x = element_blank()
, axis.title.x = element_blank()
, axis.title.y = element_text(size=my_als)
, axis.text.y = element_text(size = my_ats)
, legend.position = c(0.73,0.8)
, legend.text = element_text(size=my_als-2)
, legend.title = element_text(size=my_als)
, plot.title = element_blank()) +
labs(title = ""
, y = "Number of SNPs"
#, fill="ligand_outcome"
) +
scale_fill_discrete(name = "Ligand Outcome"
, labels = c("Destabilising", "Stabilising"))
print(outPlot)
dev.off()
table(df$ligand_outcome)
#=======================================================================
#****************
# Plot 2: frequency of positions
#****************
df_ncols = ncol(df)
df_nrows = nrow(df)
cat(paste0("original df dimensions:"
, "\nNo. of rows:", df_nrows
, "\nNo. of cols:", df_ncols
, "\nNow adding column: frequency of mutational positions"))
#setDT(df)[, .(pos_count := .N), by = .(position)]
setDT(df)[, pos_count := .N, by = .(position)]
rm(df_nrows, df_ncols)
df_nrows = nrow(df)
df_ncols = ncol(df)
cat(paste0("revised df dimensions:"
, "\nNo. of rows:", df_nrows
, "\nNo. of cols:", df_ncols))
# this is cummulative
table(df$pos_count)
# use group by on this
snpsBYpos_df <- df %>%
group_by(position) %>%
summarize(snpsBYpos = mean(pos_count))
table(snpsBYpos_df$snpsBYpos)
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# FIXME, get this mutation_info, perhaLIG useful!
foo = select(df, mutationinformation
, wild_pos
, wild_type
, mutant_type
#, mutation_info # comes from meta data, notused yet
, position
, pos_count)
#write.csv(foo, "/pos_count_freq.csv")
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
#--------------
# start plot 2
#--------------
#svg("position_count_LIG.svg")
svg(plot_pos_count_ligand)
print(paste0("plot filename:", plot_pos_count_ligand))
my_ats = 25 # axis text size
my_als = 22 # axis label size
# to make x axis display all positions
# not sure if to use with sort or directly
my_x = sort(unique(snpsBYpos_df$snpsBYpos))
g = ggplot(snpsBYpos_df, aes(x = snpsBYpos))
outPlot_pos = g + geom_bar(aes (alpha = 0.5)
, show.legend = FALSE) +
scale_x_continuous(breaks = unique(snpsBYpos_df$snpsBYpos)) +
#scale_x_continuous(breaks = my_x) +
geom_label(stat = "count", aes(label = ..count..)
, color = "black"
, size = 10) +
theme(axis.text.x = element_text(size = my_ats
, angle = 0)
, axis.text.y = element_text(size = my_ats
, angle = 0
, hjust = 1)
, axis.title.x = element_text(size = my_als)
, axis.title.y = element_text(size = my_als)
, plot.title = element_blank()) +
labs(x = "Number of SNPs"
, y = "Number of Sites")
print(outPlot_pos)
dev.off()
########################################################################
# end of lig barplots
########################################################################

View file

@ -0,0 +1,87 @@
,x,,changes,
1,mutationinformation,,Mutationinformation,
2,wild_type,,,consider...wild_aa
3,position,,Position,
4,mutant_type,,,consider...mutant_aa
5,chain,,,
6,ligand_id,,,
7,ligand_distance,,,
8,duet_stability_change,,,
9,duet_outcome,,DUET_outcome,
10,ligand_affinity_change,,,
11,ligand_outcome,,Lig_outcome,
12,duet_scaled,,ratioDUET,
13,affinity_scaled,,ratioPredAff,
14,wild_pos,,WildPos,
15,wild_chain_pos,,,
16,ddg,,,
17,contacts,,,
18,electro_rr,,,
19,electro_mm,,,
20,electro_sm,,,
21,electro_ss,,,
22,disulfide_rr,,,
23,disulfide_mm,,,
24,disulfide_sm,,,
25,disulfide_ss,,,
26,hbonds_rr,,,
27,hbonds_mm,,,
28,hbonds_sm,,,
29,hbonds_ss,,,
30,partcov_rr,,,
31,partcov_mm,,,
32,partcov_sm,,,
33,partcov_ss,,,
34,vdwclashes_rr,,,
35,vdwclashes_mm,,,
36,vdwclashes_sm,,,
37,vdwclashes_ss,,,
38,volumetric_rr,,,
39,volumetric_mm,,,
40,volumetric_sm,,,
41,volumetric_ss,,,
42,wild_type_dssp,,,
43,asa,,,
44,rsa,,,
45,ss,,,
46,ss_class,,,
47,chain_id,,,
48,wild_type_kd,,,
49,kd_values,,,
50,rd_values,,,
51,wt_3letter_caps,,,
52,mutation,,,
53,af,,,
54,beta_logistic,,,
55,or_logistic,,,
56,pval_logistic,,,
57,se_logistic,,,
58,zval_logistic,,,
59,ci_low_logistic,,,
60,ci_hi_logistic,,,
61,or_mychisq,,,
62,or_fisher,,,
63,pval_fisher,,,
64,ci_low_fisher,,,
65,ci_hi_fisher,,,
66,est_chisq,,,
67,pval_chisq,,,
68,chromosome_number,,,
69,ref_allele,,,
70,alt_allele,,,
71,mut_type,,,
72,gene_id,,,
73,gene_number,,,
74,mut_region,,,
75,mut_info,,,
76,chr_num_allele,,,
77,wt_3let,,,
78,mt_3let,,,
79,af_kin,,,
80,or_kin,,,
81,pwald_kin,,,
82,beta_kin,,,
83,se_kin,,,
84,logl_h1_kin,,,
85,l_remle_kin,,,
86,n_miss,,,
1 x changes
2 1 mutationinformation Mutationinformation
3 2 wild_type consider...wild_aa
4 3 position Position
5 4 mutant_type consider...mutant_aa
6 5 chain
7 6 ligand_id
8 7 ligand_distance
9 8 duet_stability_change
10 9 duet_outcome DUET_outcome
11 10 ligand_affinity_change
12 11 ligand_outcome Lig_outcome
13 12 duet_scaled ratioDUET
14 13 affinity_scaled ratioPredAff
15 14 wild_pos WildPos
16 15 wild_chain_pos
17 16 ddg
18 17 contacts
19 18 electro_rr
20 19 electro_mm
21 20 electro_sm
22 21 electro_ss
23 22 disulfide_rr
24 23 disulfide_mm
25 24 disulfide_sm
26 25 disulfide_ss
27 26 hbonds_rr
28 27 hbonds_mm
29 28 hbonds_sm
30 29 hbonds_ss
31 30 partcov_rr
32 31 partcov_mm
33 32 partcov_sm
34 33 partcov_ss
35 34 vdwclashes_rr
36 35 vdwclashes_mm
37 36 vdwclashes_sm
38 37 vdwclashes_ss
39 38 volumetric_rr
40 39 volumetric_mm
41 40 volumetric_sm
42 41 volumetric_ss
43 42 wild_type_dssp
44 43 asa
45 44 rsa
46 45 ss
47 46 ss_class
48 47 chain_id
49 48 wild_type_kd
50 49 kd_values
51 50 rd_values
52 51 wt_3letter_caps
53 52 mutation
54 53 af
55 54 beta_logistic
56 55 or_logistic
57 56 pval_logistic
58 57 se_logistic
59 58 zval_logistic
60 59 ci_low_logistic
61 60 ci_hi_logistic
62 61 or_mychisq
63 62 or_fisher
64 63 pval_fisher
65 64 ci_low_fisher
66 65 ci_hi_fisher
67 66 est_chisq
68 67 pval_chisq
69 68 chromosome_number
70 69 ref_allele
71 70 alt_allele
72 71 mut_type
73 72 gene_id
74 73 gene_number
75 74 mut_region
76 75 mut_info
77 76 chr_num_allele
78 77 wt_3let
79 78 mt_3let
80 79 af_kin
81 80 or_kin
82 81 pwald_kin
83 82 beta_kin
84 83 se_kin
85 84 logl_h1_kin
86 85 l_remle_kin
87 86 n_miss

66
scripts/plotting/notes Normal file
View file

@ -0,0 +1,66 @@
#####################
# combining_two_df.R
#####################
orig_col ==> df_ncols
Mutationinformation ==> mutationinformation
Position ==> position
DUET_outcome ==> duet_outcome
Lig_outcome ==> ligand_outcome
infile ==> infile_params
in_filename_comb ==> in_filename_metadata
meta_with_afor ==> gene_metadata
#!!!!!!!!!
# FIXME: plotting_data.R
#!!!!!!!!!
# This script will be called by various plotting scripts.
# Ensure you can call this using command line args which are currently commented out
#####################
# basic_barplots_PS.R
#####################
dim(my_df)
416, 86
# unique mutations
dim(my_df_u)
403, 86
# all dups identified are destabilising
dups_df = 13 rows
11 unique muts
upos = unique(my_df_u$position)
145
df = my_df_u
df$duet_outcome
Destabilising Stabilising
346 57
# with dups
Destabilising Stabilising
359 57
table(df$pos_count)
# this is cummulative
#1 2 3 4 5 6
#39 62 81 100 85 36
# with dups
1 2 3 4 5 6 7
39 60 75 92 100 36 14
# use group by
table(snpsBYpos_df$snpsBYpos)
#1 2 3 4 5 6
#39 31 27 25 17 6
# with dups
1 2 3 4 5 6 7
39 30 25 23 20 6 2