all barplots generated for ps and lig

2020-08-26 17:18:45 +01:00 · 2020-08-26 17:18:45 +01:00 · 3f8d6695a4
commit 3f8d6695a4
parent 0220960975
5 changed files with 656 additions and 0 deletions
--- a/scripts/plotting/autoviz.py
+++ b/scripts/plotting/autoviz.py
@ -0,0 +1,45 @@
 #!/usr/bin/env python3
 #=======================================================================
 #%% useful links
 #https://towardsdatascience.com/autoviz-automatically-visualize-any-dataset-ba2691a8b55a
 #https://pypi.org/project/autoviz/
 #=======================================================================
 import os, sys
 import pandas as pd
 import numpy as np
 import re
 import argparse
 from autoviz.AutoViz_Class import AutoViz_Class
 homedir = os.path.expanduser('~')
 os.chdir(homedir + '/git/LSHTM_analysis/scripts')
 #%%============================================================================
 # variables
 gene = 'pncA'
 drug = 'pyrazinamide'
 #%%============================================================================
 #==============
 # directories
 #==============
 datadir = homedir + '/' + 'git/Data'
 indir = datadir + '/' + drug + '/input'
 outdir = datadir + '/' + drug + '/output'
 #=======
 # input
 #=======
 in_filename_plotting = 'car_design.csv'
 in_filename_plotting = gene.lower() + '_all_params.csv'
 infile_plotting = outdir + '/' + in_filename_plotting
 print('plotting file: ', infile_plotting
      , '\n============================================================')
 #=======================================================================
 plotting_df = pd.read_csv(infile_plotting, sep = ',')
 #Instantiate the AutoViz class
 AV = AutoViz_Class()
 df = AV.AutoViz(infile_plotting)
 #df2 = AV.AutoViz(plotting_df)
 plotting_df.columns[~plotting_df.columns.isin(df.columns)]
--- a/scripts/plotting/barplots_subcolours_aa_LIG.R
+++ b/scripts/plotting/barplots_subcolours_aa_LIG.R
@ -0,0 +1,265 @@
 #!/usr/bin/env Rscript   
 getwd()
 setwd("~/git/LSHTM_analysis/scripts/plotting")
 getwd()
 #########################################################
 # TASK: output barplot by position with each bar coloured by 
 # its stability value and active site positions indicated 
 # according to colour specified in "subcols_axis_PS.R"
 #########################################################
 #=======================================================================
 ############################################################
 # 1: Installing and loading required packages and functions
 ############################################################
 #source("Header_TT.R")
 library(ggplot2)
 library(data.table)
 source("barplot_colour_function.R")
 #source("subcols_axis.R")
 source("subcols_axis_PS.R")
 # should return the following dfs, directories and variables
 # mut_pos_cols
 # mut_pos_cols_lig
 # my_df_cols
 # my_df_u_cols
 # my_df_u_lig_cols
 # dup_muts_cols
 cat(paste0("Directories imported:"
           , "\ndatadir:", datadir
           , "\nindir:", indir
           , "\noutdir:", outdir
           , "\nplotdir:", plotdir))
 cat(paste0("Variables imported:"
           , "\ndrug:", drug
           , "\ngene:", gene
           , "\ngene_match:", gene_match
           , "\nLength of upos:", length(upos)
           , "\nAngstrom symbol:", angstroms_symbol))
 # clear excess variable
 rm(dup_muts_cols, mut_pos_cols, my_df_cols, my_df_u_cols, upos)
 #=======================================================================
 # !!! very important!!!!
 #================
 # Inspecting mut_pos_cols
 # position numbers and colours and assigning axis colours based on lab_fg
 # of the correct df
 # open file from desktop ("sample_axis_cols") for cross checking
 #================
 if ( nrow(mut_pos_cols_lig) == length(unique(my_df_u_cols_lig$position)) ){
  print("PASS: lengths checked, assigning axis colours")
  my_axis_colours = mut_pos_cols_lig$lab_fg
  cat("length of axis colours:", length(my_axis_colours)
      , "\nwhich corresponds to the number of positions on the x-axis of the plot")
 }else{
  print("FAIL:lengths mismatch, could not assign axis colours")
  quit()
 }
 # further sanity checks
 table(mut_pos_cols_lig$lab_bg)
 check_lab_bg = sum( table(mut_pos_cols_lig$lab_bg) ) == nrow(mut_pos_cols_lig) # should  be True
 check_lab_bg    
 table(mut_pos_cols_lig$lab_bg2)
 check_lab_bg2 = sum( table(mut_pos_cols_lig$lab_bg2) ) ==  nrow(mut_pos_cols_lig) # should  be True
 check_lab_bg2
 table(mut_pos_cols_lig$lab_fg)
 check_lab_fg = sum( table(mut_pos_cols_lig$lab_fg) ) ==  nrow(mut_pos_cols_lig) # should  be True
 check_lab_fg
 # sanity checks:
 if (check_lab_bg && check_lab_bg2 && check_lab_fg) {
  print("PASS: No. of assigned colours match length")
 }else{
  print("FAIL: length of assigned colours mismatch")
  quit()
 }
 #=======================================================================
 #=======
 # output
 #=======
 # plot name and location
 print(paste0("plot will be in:", plotdir))
 bp_aa_subcols_ligand = "barplot_acoloured_LIG.svg"
 plot_bp_aa_subcols_ligand = paste0(plotdir, "/", bp_aa_subcols_ligand)
 #=======================================================================
 #================
 # Data for plots
 #================
 # REASSIGNMENT as necessary
 df  = my_df_u_cols_lig
 cat("ligand df dim:", dim(df))
 # sanity checks
 str(df)
 # sanity check
 df[df$position == "49",]
 df[df$position == "13",]
 df[df$position == "103",]
 ###########################
 # Plot: Ligand affinity
 ###########################
 #==========================
 # Barplot with scores (unordered)
 # corresponds to ligand_outcome
 # Stacked Barplot with colours: ligand_outcome @ position coloured by 
 # stability scores. This is a barplot where each bar corresponds 
 # to a SNP and is coloured by its corresponding ligand stability value.
 # Normalised values (range between -1 and 1 ) to aid visualisation
 # NOTE: since barplot plots discrete values, colour = score, so number of
 # colours will be equal to the no. of unique normalised scores 
 # rather than a continuous scale
 # will require generating the colour scale separately.
 #============================
 # sanity checks
 upos = unique(df$position)
 table(df$ligand_outcome)
 table(df$ligand_outcome)
 # add frequency of positions (from lib data.table)
 setDT(df)[, pos_count := .N, by = .(position)]
 # this is cummulative
 table(df$pos_count)
 # use group by on this
 library(dplyr)
 snpsBYpos_df <- df %>%
  group_by(position) %>%
  summarize(snpsBYpos = mean(pos_count))
 table(snpsBYpos_df$snpsBYpos)
 snp_count = sort(unique(snpsBYpos_df$snpsBYpos))
 # sanity checks
 # should be a factor
 if (is.factor(df$ligand_outcome)){
  print("ligand_outcome is factor")
 }else{
  print("converting ligand_outcome to factor")
  df$ligand_outcome = as.factor(df$ligand_outcome)
 }
 is.factor(df$ligand_outcome)
 table(df$ligand_outcome)
 # may not be -1 and 1 since these are filtered within 10A
 min(df$affinity_scaled)
 max(df$affinity_scaled)
 # sanity checks
 # very important!!!!
 tapply(df$affinity_scaled, df$ligand_outcome, min)
 tapply(df$affinity_scaled, df$ligand_outcome, max)
 # My colour FUNCTION: based on group and subgroup
 # in my case;
 # df = df
 # group = ligand_outcome
 # subgroup = normalised score i.e affinity_scaled
 # check unique values in normalised data
 u = unique(df$affinity_scaled) 
 cat("No. of unique values in normalised data:", length(u))
 # Define group
 # Create an extra column called group which contains the "gp name and score" 
 # so colours can be generated for each unique values in this column
 my_grp = df$affinity_scaled 
 df$group <- paste0(df$ligand_outcome, "_", my_grp, sep = "") 
 # Call the function to create the palette based on the group defined above
 colours <- ColourPalleteMulti(df, "ligand_outcome", "my_grp")
 print(paste0("Colour palette generated for: ", length(colours), " colours"))
 my_title = "Ligand affinity"
 cat("No. of axis colours: ", length(my_axis_colours))
 #========================
 # plot with axis colours
 #========================
 class(df$lab_bg)
 # define cartesian coord
 my_xlim = length(unique(df$position)); my_xlim
 # axis label size
 my_xals = 18
 my_yals = 18
 # axes text size
 my_xats = 14
 my_yats = 18
 #******************
 # generate plot: with axis colours
 #******************
 print(paste0("plot name:", plot_bp_aa_subcols_ligand))
 svg(plot_bp_aa_subcols_ligand, width = 26, height = 4)
 g = ggplot(df, aes(factor(position, ordered = T)))
 outPlot = g + 
  coord_cartesian(xlim = c(1, my_xlim)
                  #, ylim = c(0, 6)
                  , ylim = c(0, max(snp_count))
                  , clip = "off") +
  geom_bar(aes(fill = group), colour = "grey") +
  scale_fill_manual(values = colours
                     , guide = "none") +
  geom_tile(aes(,-0.8, width = 0.95, height = 0.85)
            , fill = df$lab_bg) +
  geom_tile(aes(,-1.2, width = 0.95, height = -0.2)
            , fill = df$lab_bg2) +
 # Here it"s important to specify that your axis goes from 1 to max number of levels
  theme(axis.text.x = element_text(size = my_xats
                                    , angle = 90
                                    , hjust = 1
                                    , vjust = 0.4
                                    , colour = my_axis_colours)
         , axis.text.y = element_text(size = my_yats 
                                      , angle = 0
                                      , hjust = 1
                                      , vjust = 0)
         , axis.title.x = element_text(size = my_xals)
                                       #, hjust = 1
                                       #, vjust = 0.4)
         , axis.title.y = element_text(size = my_yals ) 
         , axis.ticks.x = element_blank()) +
  labs(title = ""
       #title = my_title
       , x = "position"
       , y = "Frequency")
 print(outPlot)
 dev.off()
 #!!!!!!!!!!!!!!!!
 #Warning message:
 #  Vectorized input to `element_text()` is not officially supported.
 #Results may be unexpected or may change in future versions of ggplot2. 
 #!!!!!!!!!!!!!!!!!
 # for sanity and good practice
 #rm(df)
--- a/scripts/plotting/basic_barplots_LIG.R
+++ b/scripts/plotting/basic_barplots_LIG.R
@ -0,0 +1,193 @@
 #!/usr/bin/env Rscript  
 #########################################################
 # TASK: producing barplots
 # basic barplots with count of mutations
 # basic barplots with frequency of count of mutations
 #########################################################
 #=======================================================================
 # working dir and loading libraries
 getwd()
 setwd("~/git/LSHTM_analysis/scripts/plotting")
 getwd()
 #source("Header_TT.R")
 library(ggplot2)
 library(data.table)
 library(dplyr)
 source("plotting_data.R")
 # should return the following dfs and directories
 # my_df
 # my_df_u
 # my_df_u_lig
 # dup_muts
 cat(paste0("Directories imported:"
           , "\ndatadir:", datadir
           , "\nindir:", indir
           , "\noutdir:", outdir
           , "\nplotdir:", plotdir))
 cat(paste0("Variables imported:"
           , "\ndrug:", drug
           , "\ngene:", gene
           , "\ngene_match:", gene_match
           , "\nLength of upos:", length(upos)
           , "\nAngstrom symbol:", angstroms_symbol))       
 # clear excess variable
 rm(my_df, upos, dup_muts, my_df_u)
 #=======================================================================
 #=======
 # output
 #=======
 # plot 1
 basic_bp_ligand = "basic_barplot_LIG.svg"
 plot_basic_bp_ligand  =  paste0(plotdir,"/", basic_bp_ligand)
 # plot 2
 pos_count_ligand = "position_count_LIG.svg"
 plot_pos_count_ligand = paste0(plotdir, "/", pos_count_ligand)
 #=======================================================================
 #================
 # Data for plots
 #================
 # REASSIGNMENT as necessary
 df  = my_df_u_lig
 rm(my_df_u, my_df, upos, dup_muts)
 # sanity checks
 str(df)
 #=====================================================================
 #****************
 # Plot 1:Count of stabilising and destabilsing muts
 #****************
 #svg("basic_barplots_LIG.svg")
 svg(plot_basic_bp_ligand)
 print(paste0("plot1 filename:", basic_bp_ligand))
 my_ats = 25 # axis text size
 my_als = 22 # axis label size
 theme_set(theme_grey())
 #--------------
 # start plot 1
 #--------------
 g = ggplot(df, aes(x = ligand_outcome))
 outPlot = g + geom_bar(aes(fill = ligand_outcome)
                         , show.legend = TRUE) + 
  geom_label(stat = "count"
             , aes(label = ..count..)
             , color = "black"
             , show.legend = FALSE
             , size = 10) + 
  theme(axis.text.x = element_blank()
        , axis.title.x = element_blank()
        , axis.title.y = element_text(size=my_als)
        , axis.text.y = element_text(size = my_ats)
        , legend.position = c(0.73,0.8)
        , legend.text = element_text(size=my_als-2)
        , legend.title = element_text(size=my_als)
        , plot.title = element_blank()) + 
  labs(title = ""
       , y = "Number of SNPs"
      #, fill="ligand_outcome"
      ) + 
  scale_fill_discrete(name = "Ligand Outcome"
                      , labels = c("Destabilising", "Stabilising"))
 print(outPlot)
 dev.off()
 table(df$ligand_outcome)
 #=======================================================================
 #****************
 # Plot 2: frequency of positions
 #****************
 df_ncols = ncol(df)
 df_nrows = nrow(df)
 cat(paste0("original df dimensions:"
    , "\nNo. of rows:", df_nrows
    , "\nNo. of cols:", df_ncols
    , "\nNow adding column: frequency of mutational positions"))
 #setDT(df)[, .(pos_count := .N), by = .(position)] 
 setDT(df)[, pos_count := .N, by = .(position)]
 rm(df_nrows, df_ncols) 
 df_nrows = nrow(df)
 df_ncols = ncol(df)
 cat(paste0("revised df dimensions:"
           , "\nNo. of rows:", df_nrows
           , "\nNo. of cols:", df_ncols))
 # this is cummulative
 table(df$pos_count)
 # use group by on this
 snpsBYpos_df <- df %>%
  group_by(position) %>%
  summarize(snpsBYpos = mean(pos_count))
 table(snpsBYpos_df$snpsBYpos)
 #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 # FIXME, get this mutation_info, perhaLIG useful!
 foo = select(df, mutationinformation
             , wild_pos
             , wild_type
             , mutant_type
             #, mutation_info # comes from meta data, notused yet
             , position
             , pos_count)
 #write.csv(foo, "/pos_count_freq.csv")
 #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 #--------------
 # start plot 2
 #--------------
 #svg("position_count_LIG.svg")
 svg(plot_pos_count_ligand)
 print(paste0("plot filename:", plot_pos_count_ligand))
 my_ats = 25 # axis text size
 my_als = 22 # axis label size
 # to make x axis display all positions
 # not sure if to use with sort or directly
 my_x = sort(unique(snpsBYpos_df$snpsBYpos)) 
 g = ggplot(snpsBYpos_df, aes(x = snpsBYpos))
 outPlot_pos = g + geom_bar(aes (alpha = 0.5)
                         , show.legend = FALSE) +
  scale_x_continuous(breaks = unique(snpsBYpos_df$snpsBYpos)) +
  #scale_x_continuous(breaks = my_x) +
  geom_label(stat = "count", aes(label = ..count..)
             , color = "black"
             , size = 10) +
  theme(axis.text.x = element_text(size = my_ats
                                   , angle = 0)
        , axis.text.y = element_text(size = my_ats
                                     , angle = 0
                                     , hjust = 1)
        , axis.title.x = element_text(size = my_als)
        , axis.title.y = element_text(size = my_als)
        , plot.title = element_blank()) +
  labs(x = "Number of SNPs"
       , y = "Number of Sites")
 print(outPlot_pos)
 dev.off()
 ########################################################################
 #               			end of lig barplots         			   
 ########################################################################
--- a/scripts/plotting/columns_all_params.csv
+++ b/scripts/plotting/columns_all_params.csv
@ -0,0 +1,87 @@
 ,x,,changes,
 1,mutationinformation,,Mutationinformation,
 2,wild_type,,,consider...wild_aa
 3,position,,Position,
 4,mutant_type,,,consider...mutant_aa
 5,chain,,,
 6,ligand_id,,,
 7,ligand_distance,,,
 8,duet_stability_change,,,
 9,duet_outcome,,DUET_outcome,
 10,ligand_affinity_change,,,
 11,ligand_outcome,,Lig_outcome,
 12,duet_scaled,,ratioDUET,
 13,affinity_scaled,,ratioPredAff,
 14,wild_pos,,WildPos,
 15,wild_chain_pos,,,
 16,ddg,,,
 17,contacts,,,
 18,electro_rr,,,
 19,electro_mm,,,
 20,electro_sm,,,
 21,electro_ss,,,
 22,disulfide_rr,,,
 23,disulfide_mm,,,
 24,disulfide_sm,,,
 25,disulfide_ss,,,
 26,hbonds_rr,,,
 27,hbonds_mm,,,
 28,hbonds_sm,,,
 29,hbonds_ss,,,
 30,partcov_rr,,,
 31,partcov_mm,,,
 32,partcov_sm,,,
 33,partcov_ss,,,
 34,vdwclashes_rr,,,
 35,vdwclashes_mm,,,
 36,vdwclashes_sm,,,
 37,vdwclashes_ss,,,
 38,volumetric_rr,,,
 39,volumetric_mm,,,
 40,volumetric_sm,,,
 41,volumetric_ss,,,
 42,wild_type_dssp,,,
 43,asa,,,
 44,rsa,,,
 45,ss,,,
 46,ss_class,,,
 47,chain_id,,,
 48,wild_type_kd,,,
 49,kd_values,,,
 50,rd_values,,,
 51,wt_3letter_caps,,,
 52,mutation,,,
 53,af,,,
 54,beta_logistic,,,
 55,or_logistic,,,
 56,pval_logistic,,,
 57,se_logistic,,,
 58,zval_logistic,,,
 59,ci_low_logistic,,,
 60,ci_hi_logistic,,,
 61,or_mychisq,,,
 62,or_fisher,,,
 63,pval_fisher,,,
 64,ci_low_fisher,,,
 65,ci_hi_fisher,,,
 66,est_chisq,,,
 67,pval_chisq,,,
 68,chromosome_number,,,
 69,ref_allele,,,
 70,alt_allele,,,
 71,mut_type,,,
 72,gene_id,,,
 73,gene_number,,,
 74,mut_region,,,
 75,mut_info,,,
 76,chr_num_allele,,,
 77,wt_3let,,,
 78,mt_3let,,,
 79,af_kin,,,
 80,or_kin,,,
 81,pwald_kin,,,
 82,beta_kin,,,
 83,se_kin,,,
 84,logl_h1_kin,,,
 85,l_remle_kin,,,
 86,n_miss,,,
--- a/scripts/plotting/notes
+++ b/scripts/plotting/notes
@ -0,0 +1,66 @@
 #####################
 # combining_two_df.R
 #####################
 orig_col ==> df_ncols
 Mutationinformation ==> mutationinformation
 Position ==> position
 DUET_outcome ==> duet_outcome
 Lig_outcome ==> ligand_outcome
 infile ==> infile_params
 in_filename_comb ==> in_filename_metadata
 meta_with_afor ==> gene_metadata
 #!!!!!!!!!
 # FIXME: plotting_data.R
 #!!!!!!!!!
 # This script will be called by various plotting scripts.
 # Ensure you can call this using command line args which are currently commented out
 #####################
 # basic_barplots_PS.R
 #####################
 dim(my_df)
 416, 86
 # unique mutations
 dim(my_df_u)
 403, 86
 # all dups identified are destabilising
 dups_df = 13 rows
 11 unique muts
 upos = unique(my_df_u$position)
 145
 df = my_df_u
 df$duet_outcome
 Destabilising   Stabilising 
          346           57 
                        # with dups
                        Destabilising   Stabilising 
                                  359            57 
 table(df$pos_count)
 # this is cummulative
 #1   2   3   4   5   6 
 #39  62  81 100  85  36 
                        # with dups
                          1   2   3   4   5   6   7 
                         39  60  75  92 100  36  14 
 # use group by
 table(snpsBYpos_df$snpsBYpos)
 #1  2  3  4  5  6 
 #39 31 27 25 17  6 
                         # with dups
                          1  2  3  4  5  6  7 
                          39 30 25 23 20  6  2