Merge branch 'gidb_dev'

2021-06-14 13:27:00 +01:00 · 2021-06-14 13:27:00 +01:00 · 86ed1805fc
commit 86ed1805fc
parent 687adf0ec7 ddb1a7a7aa
7 changed files with 505 additions and 7 deletions
--- a/scripts/aa_prop.py
+++ b/scripts/aa_prop.py
@ -0,0 +1,113 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+'''
+Created on Mon June 14 2021
+
+@author: tanu
+'''
+# FIXME: import dirs.py to get the basic dir paths available
+#=======================================================================
+# TASK
+
+# Input:
+
+# Output: 
+#=======================================================================
+#%% load libraries
+import os, sys
+import pandas as pd
+#import numpy as np
+#from varname import nameof
+import argparse
+
+DEBUG = False
+#=======================================================================
+#%% specify input and curr dir
+homedir = os.path.expanduser('~')
+
+# set working dir
+os.getcwd()
+os.chdir(homedir + '/git/LSHTM_analysis/scripts')
+os.getcwd()
+
+from reference_dict import oneletter_aa_dict 
+#=======================================================================
+#%%
+def get_aa_prop(df, col1 = 'aap1', col2 = 'aap2', col3 = 'aap_taylor', col4 = 'aap_kd', col5 = 'aap_polarity', col6 = 'aap_calcprop'):
+
+    """Add amino acid properties for wt and mutant residues
+    
+    @df: df containing one letter aa code for wt and mutant respectively
+    @type: pandas df
+    
+    @col1: column adding 7 aa categories (no overlap; acidic, basic, amidic, hydrophobic, hydroxylic, aromatic, sulphur)
+    @type: str
+    
+    @col2: column adding 9 aa categories (overlap; acidic, basic, polar, hydrophobic, hydrophilic, small, aromatic, aliphatic, special)
+    @type: str
+    
+    @col3: column adding 8 aa categories (overlap; acidic, basic, polar, hydrophobic, small, aromatic, aliphatic, special)
+    @type: str
+
+    @col4: column adding 3 aa categories (no overlap, hydrophobic, neutral and hydrophilic according to KD scale)
+    @type: str
+
+    @col5: column adding 4 aa categories (no overlap, acidic, basic, neutral, non-polar)
+    @type: str
+
+    @col6: column adding 4 aa categories (neg, pos, polar, non-polar)
+    @type: str
+    
+    returns df: with 6 added columns. If column names clash, the function column 
+                name will override original column
+    @rtype: pandas df
+    """
+
+    lookup_dict_p1 = dict()
+    lookup_dict_p2 = dict()
+    
+    lookup_dict_taylor = dict()
+    lookup_dict_kd = dict()
+    
+    lookup_dict_polarity = dict()
+    lookup_dict_calcprop = dict()
+    
+    for k, v in oneletter_aa_dict.items():
+      
+        lookup_dict_p1[k] = v['aa_prop1']
+        
+        lookup_dict_p2[k] = v['aa_prop2']
+              
+        lookup_dict_taylor[k] = v['aa_taylor']
+        
+        lookup_dict_kd[k] = v['aa_prop_water']
+                
+        lookup_dict_polarity[k] = v['aa_prop_polarity']
+        
+        lookup_dict_calcprop[k] = v['aa_calcprop']      
+        
+        #if DEBUG:
+        #    print('Key:', k, 'value:', v
+        #          , '\n============================================================'
+        #          , '\nlook up dict:\n')
+            
+        df['wt_aap1'] = df['wild_type'].map(lookup_dict_p1)   
+        df['mut_aap1'] = df['mutant_type'].map(lookup_dict_p1)    
+            
+        df['wt_aap2'] = df['wild_type'].map(lookup_dict_p2)   
+        df['mut_aap2'] = df['mutant_type'].map(lookup_dict_p2)   
+
+        df['wt_aap_taylor'] = df['wild_type'].map(lookup_dict_taylor)   
+        df['mut_aap_taylor'] = df['mutant_type'].map(lookup_dict_taylor)   
+
+        df['wt_aap_kd'] = df['wild_type'].map(lookup_dict_kd)   
+        df['mut_aap_kd'] = df['mutant_type'].map(lookup_dict_kd)   
+
+        df['wt_aap_polarity'] = df['wild_type'].map(lookup_dict_polarity)   
+        df['mut_aap_polarity'] = df['mutant_type'].map(lookup_dict_polarity)           
+                    
+        df['wt_aa_calcprop'] = df['wild_type'].map(lookup_dict_calcprop)   
+        df['mut_aa_calcprop'] = df['mutant_type'].map(lookup_dict_calcprop)
+
+    return df
+#========================================
--- a/scripts/add_aa_prop.py
+++ b/scripts/add_aa_prop.py
@ -0,0 +1,191 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+'''
+Created on Tue Aug  6 12:56:03 2019
+
+@author: tanu
+'''
+# FIXME: import dirs.py to get the basic dir paths available
+#=======================================================================
+# TASK: 
+# 
+
+# Input: 
+
+# Output: 
+#=======================================================================
+#%% load libraries
+import os, sys
+import pandas as pd
+#import numpy as np
+#from varname import nameof
+import argparse
+#=======================================================================
+#%% specify input and curr dir
+homedir = os.path.expanduser('~')
+
+# set working dir
+os.getcwd()
+os.chdir(homedir + '/git/LSHTM_analysis/scripts')
+os.getcwd()
+
+from aa_prop import get_aa_prop 
+#=======================================================================
+#%% command line args
+arg_parser = argparse.ArgumentParser()
+arg_parser.add_argument('-d', '--drug', help='drug name', default = None)
+arg_parser.add_argument('-g', '--gene', help='gene name', default = None)
+
+arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data')
+arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + <drug> + input')
+arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + <drug> + output')
+
+arg_parser.add_argument('--debug', action ='store_true', help = 'Debug Mode') # not used atm
+
+args = arg_parser.parse_args()
+#%% variable assignment: input and output 
+drug    = args.drug
+gene    = args.gene
+datadir = args.datadir
+indir   = args.input_dir
+outdir  = args.output_dir
+
+#%%=======================================================================
+#==============
+# directories
+#==============
+if not datadir:
+    datadir = homedir + '/' + 'git/Data'
+    
+if not indir:
+    indir = datadir + '/' + drug + '/input'
+    
+if not outdir:
+    outdir = datadir + '/' + drug + '/output'
+
+#=======
+# input
+#=======
+#in_filename  = 'merged_df3.csv'
+#in_filename  = gene.lower() + '_complex_mcsm_norm.csv'
+in_filename_mcsm  = gene.lower() + '_complex_mcsm_norm_SRY.csv' # gid
+infile_mcsm = outdir + '/' + in_filename_mcsm
+print('Input file: ', infile_mcsm
+      , '\n============================================================')
+
+#=======
+# output 
+#=======
+out_filename_aaps = gene.lower() + '_mcsm_aaps.txt'
+outfile_aaps =  outdir + '/' + out_filename_aaps
+print('Output file: ', outfile_aaps
+      , '\n============================================================')
+
+#%% end of variable assignment for input and output files
+#=======================================================================
+#%% Read input files
+print('Reading input file (merged file):', infile_mcsm)
+
+comb_df = pd.read_csv(infile_mcsm, sep = ',')  
+
+print('Input filename: ', in_filename_mcsm
+      , '\nPath :', outdir
+      , '\nNo. of rows: ', len(comb_df)
+      , '\nNo. of cols: ', len(comb_df.columns)
+      , '\n============================================================')
+
+# column names
+list(comb_df.columns)
+
+#%% sanity check
+nrows_df = len(comb_df)
+ncols_add =  12
+expected_cols = len(comb_df.columns) + ncols_add
+print('\nAdding aa properties for wt and mutant: ', ncols_add, ' columns in total'
+	, '\n===============================================================')
+
+#%% call get_aa_prop():
+get_aa_prop(df = comb_df)
+
+#%% check dim of df
+if len(comb_df) == nrows_df and len(comb_df.columns) == expected_cols:
+    print('Checking dim of df: '
+          , '\nPASS: df dim match'
+          , '\nno.of rows: ', len(comb_df)
+          , '\nno. of cols: ', len(comb_df.columns))
+else:
+    print('\FAIL: dim mismatch'
+             , 'Expected rows: ', nrows_df
+             , '\Got: ', len(comb_df)
+             , '\nExpected cols: ', expected_cols
+             , '\nGot: ', len(comb_df.columns))
+    sys.exit('Aborting')
+#%% summary output
+sys.stdout = open(outfile_aaps, 'w')
+
+print('################################################'
+      , '\n aa properties: ', gene 
+      , '\n################################################')
+
+print('\n-------------------'
+      , '\nWild-type: aap 1'
+      , '\n-------------------\n'
+      , comb_df['wt_aap1'].value_counts()
+      , '\n-------------------'
+      , '\nmutant-type: aap 1'
+      , '\n-------------------\n'
+      , comb_df['mut_aap1'].value_counts()
+      , '\n===================================================')
+
+print('\n-------------------'
+      , '\nWild-type: aap 2'
+      , '\n-------------------\n'
+      , comb_df['wt_aap2'].value_counts()
+      , '\n-------------------'
+      , '\nmutant-type: aap 2'
+      , '\n-------------------\n'
+      , comb_df['mut_aap2'].value_counts()
+      , '\n===================================================')
+
+print('\n-------------------'
+      , '\nWild-type: aap taylor'
+      , '\n-------------------\n'
+      , comb_df['wt_aap_taylor'].value_counts()
+      , '\n-------------------'
+      , '\nmutant-type: taylor'
+      , '\n-------------------\n'
+      , comb_df['mut_aap_taylor'].value_counts()
+      , '\n===================================================')
+
+print('\n-------------------'
+      , '\nWild-type: aap water/kd'
+      , '\n-------------------\n'
+      , comb_df['wt_aap_kd'].value_counts()
+      , '\n-------------------'
+      , '\nmutant-type: water/kd'
+      , '\n-------------------\n'
+      , comb_df['mut_aap_kd'].value_counts()
+      , '\n===================================================')
+
+print('\n-------------------'
+      , '\nWild-type: aap polarity'
+      , '\n-------------------\n'
+      , comb_df['wt_aap_polarity'].value_counts()
+      , '\n-------------------'
+      , '\nmutant-type: polarity'
+      , '\n-------------------\n'
+      , comb_df['mut_aap_polarity'].value_counts()
+      , '\n===================================================')
+
+print('\n-------------------'
+      , '\nWild-type: aa calcprop'
+      , '\n-------------------\n'
+      , comb_df['wt_aa_calcprop'].value_counts()
+      , '\n-------------------'
+      , '\nmutant-type: aa calcprop'
+      , '\n-------------------\n'
+      , comb_df['mut_aa_calcprop'].value_counts()
+      , '\n===================================================')
+
+#%% end of script
+#=======================================================================
--- a/scripts/functions/aa_prop_bp.R
+++ b/scripts/functions/aa_prop_bp.R
@ -0,0 +1,22 @@
+library(ggplot2)
+library(tidyverse)
+library(data.table)
+########################################################################
+# TASK: barplots to colour nssnps by chosen aa_property
+########################################################################
+aa_prop_bp <- function(plotdf
+                       , position_colname
+                       , fill_colname
+                       , fill_colours
+                       , leg_name = "LEGEND"){
+  mp = ggplot(df, aes(x = as.factor(eval(parse(text = pos_colname)))
+                      #, y = ..count..
+                      , fill = eval(parse(text = fill_colname))
+  )) +
+    geom_bar(position = 'identity') +
+    scale_fill_manual(values = fill_colours
+                      , name = leg_name)
+  return(mp)
+}
+
+
--- a/scripts/functions/test_aa_prop_bp.R
+++ b/scripts/functions/test_aa_prop_bp.R
@ -0,0 +1,62 @@
+library(ggplot2)
+library(tidyverse)
+library(data.table)
+
+setwd("~/git/LSHTM_analysis/scripts/plotting/functions")
+getwd()
+#############################################################
+#===========================================
+# load functions, data, dirs, hardocded vars
+# that will be used in testing the functions
+#===========================================
+source("../plotting_data.R")
+infile = "/home/tanu/git/Data/streptomycin/output/"
+pd_df = plotting_data(infile)
+my_df       = pd_df[[1]]
+my_df_u     = pd_df[[2]]
+my_df_u_lig = pd_df[[3]]
+dup_muts    = pd_df[[4]]
+
+source("../plotting_globals.R")
+drug = "streptomycin"
+gene = "gid"
+
+import_dirs(drug, gene)
+
+#=====================
+# functions to test 
+#=====================
+source("stability_count_bp.R")
+source("position_count_bp.R")
+#################################################################
+##############################################
+# read a sample file containing muts and prop 
+###############################################
+df<- read.csv(file.choose())
+
+setDT(df)[, pos_count := .N, by = .(position)]
+foo = data.frame(df$position, df$pos_count)
+
+#snpsBYpos_df <- df %>%
+#  group_by(position) %>%
+#  summarize(snpsBYpos = mean(pos_count))
+
+# subset df without duplicates for position
+df2 = df[!duplicated(df$position)]
+##################################################################
+# ---------------------------------------
+# barplot for nssnps, coloured by aa prop
+# ---------------------------------------
+pos_colname = "position"
+aa_prop_colname = "mut_prop_water"
+aa_prop_colours = c("black", "blue")
+my_legname = "aa_prop: water"
+
+# call function
+aa_prop_bp(plotdf = df
+           , position_colname = pos_colname
+           , fill_colname = aa_prop_colname
+           , fill_colours = aa_prop_cols
+           , leg_name = my_legname)
+           
+#===============================================================
--- a/scripts/mut_electrostatic_changes.py
+++ b/scripts/mut_electrostatic_changes.py
@ -5,13 +5,15 @@ Created on Tue Aug  6 12:56:03 2019

@author: tanu
 '''
-
 # FIXME: import dirs.py to get the basic dir paths available
 #=======================================================================
 # TASK: calculate how many mutations result in 
 # electrostatic changes wrt wt

-# Input: merged_df3
+# Input: mCSM-normalised file or any file containing one-letter aa code 
+# of wt and mut
+# Note: this can be easily modified into 3-letter code
+# TODO: turn to a function

 # Output: mut_elec_changes_results.txt 
 #=======================================================================
--- a/scripts/plotting/myshiny/app_layout_eg.R
+++ b/scripts/plotting/myshiny/app_layout_eg.R
@ -0,0 +1,106 @@
+# 
+# 
+# Home
+# Public
+# 
+# Questions
+# Tags
+# Users
+# Find a Job
+# Jobs
+# Companies
+# 
+# Teams
+# Stack Overflow for Teams – Collaborate and share knowledge with a private group. 
+# 
+# How can put multiple plots side-by-side in shiny r?
+#     Asked 5 years, 5 months ago
+# Active 2 years, 1 month ago
+# Viewed 59k times
+# 28
+# 11
+# 
+# In mainpanel, I try to handle this problem via fluidrow. However, one of my plot is optional to be displayed or not by users. When user clicks the button, the second plot appears below the first plot.
+# 
+# fluidRow(
+#     column(2, align="right",
+#            plotOutput(outputId = "plotgraph1", width  = "500px",height = "400px"),  
+#            plotOutput(outputId = "plotgraph2", width  = "500px",height = "400px")
+#     ))
+# 
+# I played with "align" and "widths", but nothing changed.
+# r
+# plot
+# ggplot2
+# shiny
+# Share
+# Improve this question
+# Follow
+# edited Jan 21 '17 at 17:51
+# Mike Wise
+# 18.8k66 gold badges7171 silver badges9595 bronze badges
+# asked Dec 20 '15 at 19:25
+# can.u
+# 42511 gold badge44 silver badges1111 bronze badges
+# Add a comment
+# 3 Answers
+# 30
+# 
+# So it is a couple years later, and while the others answers - including mine - are still valid, it is not how I would recommend approaching it today. Today I would lay it out using the grid.arrange from the gridExtra package.
+# 
+# It allows any number of plots, and can lay them out in a grid checkerboard-like. (I was erroneously under the impression splitLayout only worked with two).
+# It has more customization possibilities (you can specify rows, columns, headers, footer, padding, etc.)
+# It is ultimately easier to use, even for two plots, since laying out in the UI is finicky - it can be difficult to predict what Bootstrap will do with your elements when the screen size changes.
+# Since this question gets a lot of traffic, I kind of think more alternative should be here.
+# 
+# The cowplot package is also worth looking into, it offers similar functionality, but I am not so familiar with it.
+# 
+# Here is a small shiny program demonstrating that:
+#     
+library(shiny)
+library(ggplot2)
+library(gridExtra)
+
+u <- shinyUI(fluidPage(
+    titlePanel("title panel"),
+    sidebarLayout(position = "left",
+                  sidebarPanel("sidebar panel",
+                               checkboxInput("donum1", "Make #1 plot", value = T),
+                               checkboxInput("donum2", "Make #2 plot", value = F),
+                               checkboxInput("donum3", "Make #3 plot", value = F),
+                               sliderInput("wt1","Weight 1",min=1,max=10,value=1),
+                               sliderInput("wt2","Weight 2",min=1,max=10,value=1),
+                               sliderInput("wt3","Weight 3",min=1,max=10,value=1)
+                  ),
+                  mainPanel("main panel",
+                            column(6,plotOutput(outputId="plotgraph", width="500px",height="400px"))
+                  ))))
+
+s <- shinyServer(function(input, output) 
+{
+    set.seed(123)
+    pt1 <- reactive({
+        if (!input$donum1) return(NULL)
+        qplot(rnorm(500),fill=I("red"),binwidth=0.2,main="plotgraph1")
+    })
+    pt2 <- reactive({
+        if (!input$donum2) return(NULL)
+        qplot(rnorm(500),fill=I("blue"),binwidth=0.2,main="plotgraph2")
+    })
+    pt3 <- reactive({
+        if (!input$donum3) return(NULL)
+        qplot(rnorm(500),fill=I("green"),binwidth=0.2,main="plotgraph3")
+    })
+    output$plotgraph = renderPlot({
+        ptlist <- list(pt1(),pt2(),pt3())
+        wtlist <- c(input$wt1,input$wt2,input$wt3)
+        # remove the null plots from ptlist and wtlist
+        to_delete <- !sapply(ptlist,is.null)
+        ptlist <- ptlist[to_delete] 
+        wtlist <- wtlist[to_delete]
+        if (length(ptlist)==0) return(NULL)
+        
+        grid.arrange(grobs=ptlist,widths=wtlist,ncol=length(ptlist))
+    })
+})
+shinyApp(u,s)
--- a/scripts/reference_dict.py
+++ b/scripts/reference_dict.py
@ -132,9 +132,11 @@ qualities_taylor = { ('R', 'H', 'K'): 'Basic'
             , ('C', 'G', 'P'): 'Special'
 }

-# binary classification: hydrophilic or hydrophobic
-qualities_water = { ('D', 'E', 'N', 'P', 'Q', 'R', 'S'): 'hydrophilic'
-                   , ('A', 'C', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'T', 'V', 'W', 'X', 'Y'): 'hydrophobic'
+# ternary classification: hydrophobic --> neutral --> hydrophilic (KD scale)
+#http://www.imgt.org/IMGTeducation/Aide-memoire/_UK/aminoacids/IMGTclasses.html
+qualities_water = { ('I','V','L','F','C','M','A','W'): 'hydrophobic'
+                   , ('G','T','S','Y','P','H'): 'neutral'
+                   , ('N','D','Q','E','K','R'): 'hydrophilic'
 }

 # polarity: no overlap