renamed aa_index folder to aa_index_scripts

2022-05-30 02:24:54 +01:00 · 2022-05-30 02:24:54 +01:00 · 0c316e4a41
commit 0c316e4a41
parent 650d357afc
9 changed files with 4452 additions and 0 deletions
--- a/scripts/aa_index_scripts/aa_index.R
+++ b/scripts/aa_index_scripts/aa_index.R
@ -0,0 +1,85 @@
+library(bio3d)
+library(seqinr)
+library(bios2mds)
+library(protr)
+#############################################################
+#%% TASK
+# use this to return df for AA index and mutation properties
+
+source()
+
+##############################################################
+my_fasta_file = "~/git/Data/streptomycin/input/gid_complex.fasta"
+my_mcsmf_snps = "~/git/Data/streptomycin/output/gid_mcsm_formatted_snps.csv"
+###############################################################
+#%% fasta as vector
+gid_aa_seq_v= read.fasta(my_fasta_file
+                    , seqtype = "AA"
+                    , as.string = F)
+
+gid_aa_v = as.character(gid_aa_seq_v[[1]]); gid_aa_v
+
+#%% fasta as string
+gid_aa_seq_s = read.fasta(my_fasta_file
+                         , seqtype = "AA"
+                         , as.string = T)
+
+gid_aa_s = as.character(gid_aa_seq_s[[1]]); gid_aa_s
+###############################################################
+#===================
+# AA indices
+# https://www.genome.jp/aaindex/AAindex/list_of_indices
+#===================
+data(aa.index)
+
+# default
+aai_kd  = aa2index(gid_aa_v, index = "KYTJ820101") # Hydropathy, KD
+
+aai_rv  = aa2index(gid_aa_v, index = "BIGC670101") # Residue volume, Bigelow, 1967
+aai_rv2 = aa2index(gid_aa_v, index = "GOLD730102") # Residue volume (Goldsack-Chalifoux, 1973)
+aai_b   = aa2index(gid_aa_v, index = "VENT840101") # Bitterness (Venanzi, 1984)
+
+par(mfrow = c(1,1))
+barplot(aai_kd)
+barplot(aai_rv)
+barplot(aai_rv2)
+#barplot(aai_b, col = c("black", "yellow"))
+
+##########################################################
+#===================
+# mutation matrices
+#===================
+data(sub.mat)
+snps = read.csv(my_mcsmf_snps
+                , header = 0)
+snps
+colnames(snps) <- "mutationinformation"
+
+# run using all matrices
+sub_mat_names = as.character(unlist(attributes(sub.mat)))
+#sub_mat_names = "BLOSUM80"
+
+for (j in sub_mat_names){
+  print(j)
+  snps[[j]] <- NA
+for (i in 1:nrow(snps)) {
+    curr_snp = snps$mutationinformation[i]
+    m1 = str_match(curr_snp, "^([A-Z]{1})[0-9]*([A-Z]{1})")
+    aa1 = m1[,2]
+    aa2 = m1[,3]
+    #snps$blosum_80[i]
+    snps[[j]][i] = sub.mat[[j]][aa1,aa2]
+  }
+
+}
+snps
+##########################################################
+gid_aac = extractAAC(gid_aa_s)
+gid_dc = extractDC(gid_aa_s)
+gid_tc = extractTC(gid_aa_s)
+
+par(mfrow = c(1, 3))
+barplot(gid_aac)
+barplot(gid_dc)
+barplot(gid_tc)
+###########################################################
--- a/scripts/aa_index_scripts/aaindex.zip
+++ b/scripts/aa_index_scripts/aaindex.zip
--- a/scripts/aa_index_scripts/aaindex/data/aaindex2
+++ b/scripts/aa_index_scripts/aaindex/data/aaindex2
--- a/scripts/aa_index_scripts/aaindex/data/aaindex2.p
+++ b/scripts/aa_index_scripts/aaindex/data/aaindex2.p
--- a/scripts/aa_index_scripts/aaindex/data/aaindex3
+++ b/scripts/aa_index_scripts/aaindex/data/aaindex3
--- a/scripts/aa_index_scripts/aaindex/data/aaindex3.p
+++ b/scripts/aa_index_scripts/aaindex/data/aaindex3.p
--- a/scripts/aa_index_scripts/aaindex/data/parse_aaindex.py
+++ b/scripts/aa_index_scripts/aaindex/data/parse_aaindex.py
@ -0,0 +1,90 @@
+from collections import defaultdict
+
+import os
+import pickle
+
+DATA_FOLDER = "/home/chmrodrigues/Documents/ppi2/reverse_mutations/data/aaindex"
+
+def main():
+
+    aaindex2_file = os.path.join(DATA_FOLDER,"aaindex2")
+    aaindex3_file = os.path.join(DATA_FOLDER,"aaindex3")
+
+    lines_index2 = ' '.join([item for item in open(aaindex2_file,'r').readlines()])
+    lines_index3 = ' '.join([item for item in open(aaindex3_file,'r').readlines()])
+
+    attrs_index2 = [item for item in lines_index2.split('//\n') if len(item) != 0]
+    attrs_index3 = [item for item in lines_index3.split('//\n') if len(item) != 0]
+    
+    attr_name = str()
+    all_matrices = dict()
+    for line in attrs_index2:
+        attr_elements = line.split('\n')
+
+        attr_name = [item for item in attr_elements if item.strip().startswith("H ")][0].split()[-1]
+        rows_columns_index = [attr_elements.index(item) for item in attr_elements if item.startswith(" M rows =")][0]
+
+        rows = attr_elements[rows_columns_index].split()[3].replace(",","")
+        columns = attr_elements[rows_columns_index].split()[-1]
+
+        attr_dict = dict()
+        for row in rows:
+            attr_dict[row] = dict()
+            for col in columns:
+                attr_dict[row][col] = None
+
+        for i in range(rows_columns_index+1,len(attr_elements)):
+            values = attr_elements[i].split()
+            try:
+                row = rows[i-(rows_columns_index+1)]
+                for idx,value in enumerate(values):
+                    col = columns[idx]
+                    try:
+                        attr_dict[row][col] = float(value)
+                    except ValueError:
+                        attr_dict[row][col] = value
+            except IndexError:
+                pass
+		
+        all_matrices[attr_name] = attr_dict
+    print(len(all_matrices))
+    pickle.dump(all_matrices, open('index2.p','wb'),protocol=2)
+
+    attr_name = str()
+    all_matrices = dict()
+    for line in attrs_index3:
+        attr_elements = line.split('\n')
+
+        attr_name = [item for item in attr_elements if item.strip().startswith("H ")][0].split()[-1]
+        rows_columns_index = [attr_elements.index(item) for item in attr_elements if item.startswith(" M rows =")][0]
+
+        rows = attr_elements[rows_columns_index].split()[3].replace(",","")
+        columns = attr_elements[rows_columns_index].split()[-1]
+
+        attr_dict = dict()
+        for row in rows:
+            attr_dict[row] = dict()
+            for col in columns:
+                attr_dict[row][col] = None
+
+        for i in range(rows_columns_index+1,len(attr_elements)):
+            values = attr_elements[i].split()
+            try:
+                row = rows[i-(rows_columns_index+1)]
+                for idx,value in enumerate(values):
+                    col = columns[idx]
+                    try:
+                        attr_dict[row][col] = float(value)
+                    except ValueError:
+                        attr_dict[row][col] = value
+            except IndexError:
+                pass
+		
+        all_matrices[attr_name] = attr_dict
+    pickle.dump(all_matrices, open('index3.p','wb'),protocol=2)
+    print(len(all_matrices))
+
+    return True
+
+if __name__ == "__main__":
+    main()
--- a/scripts/aa_index_scripts/aaindex/get_scores.py
+++ b/scripts/aa_index_scripts/aaindex/get_scores.py
@ -0,0 +1,161 @@
+"""
+    RSA <= 0.2 Buried (Inaccessible)
+    RSA > 0.2 Exposed (Accessible)
+
+    SST = [H,I,G] - Helix
+    SST = [B,E] - Beta
+    SST = [T] - Turn
+    SST = [S,-] - Coil
+"""
+from Bio.PDB import PDBParser, DSSP
+import pickle
+import os
+import sys
+import warnings
+
+warnings.filterwarnings("ignore")
+
+CURRENT_FOLDER = '/home/local/BHRI/sportelli/Desktop/Important_Code/structural/aaindex'
+DATA_FOLDER = os.path.join(CURRENT_FOLDER,'data')
+
+RSA_SST_DEPENDENT = {
+    'exposed_helix' : 'KOSJ950101',
+    'exposed_beta' : 'KOSJ950102',
+    'exposed_turn' : 'KOSJ950103',
+    'exposed_coil' : 'KOSJ950104',
+    'buried_helix' : 'KOSJ950105',
+    'buried_beta' : 'KOSJ950106',
+    'buried_turn' : 'KOSJ950107',
+    'buried_coil' : 'KOSJ950108',
+}
+
+SST_DEPENDENT = {
+    'helix' : 'KOSJ950109',
+    'beta' : 'KOSJ950110',
+    'turn' : 'KOSJ950111',
+    'coil' : 'KOSJ950112',
+}
+
+RSA_DEPENDENT1 = {
+    'exposed' : 'KOSJ950113',
+    'buried' : 'KOSJ950114',
+}
+
+RSA_DEPENDENT2 = {
+    'exposed' : 'OVEJ920104',
+    'buried' : 'OVEJ920105',
+}
+
+def get_environment(pdb_file, chain, position, insertion_code=' '):
+    parser = PDBParser()
+    structure = parser.get_structure(pdb_file, pdb_file)
+    model = structure[0]
+
+    dssp = DSSP(model, pdb_file)
+    dssp_key = [item for item in dssp.keys() if item[0] == chain and item[1][1] == int(position) and item[1][2] == insertion_code]
+
+    dssp_key = dssp_key[0]
+    sst = dssp[dssp_key][2]
+    rsa = float(dssp[dssp_key][3])
+
+    return{'sst':sst, 'rsa':rsa}
+
+def main():
+    """
+        READ IMPUT
+    """
+    pdb_file = sys.argv[1]
+    chain_id = sys.argv[2]
+    mutation_code = sys.argv[3]
+
+    aa_from = mutation_code[0]
+    aa_to = mutation_code[-1]
+    position = mutation_code[1:-1]
+    insertion_code = ' '
+    if not position[-1].isdigit():
+        insertion_code = position[-1]
+        position = position[:-1]
+
+    """
+        READ DATABASES
+        index2 - Amino acid substitution indexes
+        index3 - Statistical protein contact potentials
+    """
+    index2 = pickle.load(open('{}/aaindex2.p'.format(DATA_FOLDER),'rb'))
+    index3 = pickle.load(open('{}/aaindex3.p'.format(DATA_FOLDER),'rb'))
+
+    """
+        LOOP THROUGH TABLES AND EXTRACT VALUES
+    """
+    results_index2 = dict()
+    results_index3 = dict()
+    for key in index2.keys():
+        if index2[key][aa_from][aa_to] != None:
+            results_index2[key] = index2[key][aa_from][aa_to]
+        else:
+            results_index2[key] = index2[key][aa_to][aa_from]
+
+    for key in index3.keys():
+        if index3[key][aa_from][aa_to] != None:
+            results_index3[key] = index3[key][aa_from][aa_to]
+        else:
+            results_index3[key] = index3[key][aa_to][aa_from]
+
+    """
+        GET ENVIRONMENT CHARACTERISTICS
+    """
+    environment = get_environment(pdb_file, chain_id, position, insertion_code)
+
+    buried = 'buried'
+    sst = str()
+    if environment['rsa'] <= 0.2:
+        buried = 'exposed'
+
+    if environment['sst'] in ['H','I','G']:
+        sst = 'helix'
+    elif environment['sst'] in ['B','E']:
+        sst = 'beta'
+    elif environment['sst'] in ['T']:
+        sst = 'turn'
+    else:
+        sst = 'coil'
+
+    results_index2['KOSJ950100_RSA_SST'] = results_index2[RSA_SST_DEPENDENT['{}_{}'.format(buried,sst)]]
+    results_index2['KOSJ950100_SST'] = results_index2[SST_DEPENDENT[sst]]
+    results_index2['KOSJ950110_RSA'] = results_index2[RSA_DEPENDENT1[buried]]
+    results_index2['OVEJ920100_RSA'] = results_index2[RSA_DEPENDENT2[buried]]
+
+    for value in RSA_SST_DEPENDENT.values():
+        results_index2.pop(value)
+    for value in SST_DEPENDENT.values():
+        results_index2.pop(value)
+    for value in RSA_DEPENDENT1.values():
+        results_index2.pop(value)
+    for value in RSA_DEPENDENT2.values():
+        results_index2.pop(value)
+
+    """
+        PRINT RESULTS
+    """
+    output_dict = dict()
+    output_dict.update(results_index2)
+    output_dict.update(results_index3)
+
+    keys = list(output_dict.keys())
+    keys.sort()
+    values = [str(output_dict[item]) for item in keys]
+
+    # print(",".join(keys))
+    print(",".join(values))
+
+    return True
+
+
+if __name__ == "__main__":
+
+    if len(sys.argv) != 4:
+        print("Error on parsing argument list")
+        print("Please provide a one letter code for wild-type and mutant residues")
+        print("Eg.: python get_scores.py pdb_file chain_id mutation_code")
+        sys.exit(1)
+    main()
--- a/scripts/aa_index_scripts/run_aa_index.R
+++ b/scripts/aa_index_scripts/run_aa_index.R
@ -0,0 +1,101 @@
+#!/usr/bin/env Rscript
+library(bio3d)
+library(seqinr)
+library(bios2mds)
+library(protr)
+library(stringr)
+####################################################################
+# TASK: use this to return df for AA index and mutation properties
+# useful for dfs
+
+
+#####################################################################
+# working dir and loading libraries
+getwd()
+setwd("~/git/LSHTM_analysis/scripts/")
+getwd()
+
+drug = "streptomycin"
+gene = "gid"
+
+source("functions/plotting_globals.R")
+import_dirs(drug_name = drug, gene_name = gene)
+
+##############################################################
+my_fasta_file = paste0(indir, "/",  gene,  "_complex.fasta")
+
+my_mcsmf_snps = paste0(outdir, "/", gene, "_mcsm_formatted_snps.csv")
+
+###############################################################
+#%% fasta as vector
+gid_aa_seq_v= read.fasta(my_fasta_file
+                    , seqtype = "AA"
+                    , as.string = F)
+
+gid_aa_v = as.character(gid_aa_seq_v[[1]]); gid_aa_v
+
+#%% fasta as string
+gid_aa_seq_s = read.fasta(my_fasta_file
+                         , seqtype = "AA"
+                         , as.string = T)
+
+gid_aa_s = as.character(gid_aa_seq_s[[1]]); gid_aa_s
+###############################################################
+#===================
+# AA indices
+# https://www.genome.jp/aaindex/AAindex/list_of_indices
+#===================
+data(aa.index)
+
+# default
+aai_kd  = aa2index(gid_aa_v, index = "KYTJ820101") # Hydropathy, KD
+
+aai_rv  = aa2index(gid_aa_v, index = "BIGC670101") # Residue volume, Bigelow, 1967
+aai_rv2 = aa2index(gid_aa_v, index = "GOLD730102") # Residue volume (Goldsack-Chalifoux, 1973)
+aai_b   = aa2index(gid_aa_v, index = "VENT840101") # Bitterness (Venanzi, 1984)
+##########################################################
+#===================
+# mutation matrices
+#===================
+data(sub.mat)
+snps = read.csv(my_mcsmf_snps
+                , header = 0)
+snps
+colnames(snps) <- "mutationinformation"
+
+# run using all matrices
+sub_mat_names = as.character(unlist(attributes(sub.mat)))
+#sub_mat_names = "BLOSUM80"
+
+for (j in sub_mat_names){
+  print(j)
+  snps[[j]] <- NA
+for (i in 1:nrow(snps)) {
+    curr_snp = snps$mutationinformation[i]
+    m1 = str_match(curr_snp, "^([A-Z]{1})[0-9]*([A-Z]{1})")
+    aa1 = m1[,2]
+    aa2 = m1[,3]
+    #snps$blosum_80[i]
+    snps[[j]][i] = sub.mat[[j]][aa1,aa2]
+  }
+
+}
+snps
+##########################################################
+gid_aac = extractAAC(gid_aa_s)
+gid_dc = extractDC(gid_aa_s)
+gid_tc = extractTC(gid_aa_s)
+
+##########################################################
+# Plots
+par(mfrow = c(3,2))
+
+barplot(aai_kd  , main = "AA index: KD")
+#barplot(aai_rv  , main = "AA index: Residue Volume, 1967")
+barplot(aai_rv2 , main = "AA index: Residue Volume") #1973
+barplot(aai_b   , main = "AA index: Bitterness")
+
+barplot(gid_aac , main = "AA: composition")
+barplot(gid_dc  , main = "AA: Dipeptide composition")
+barplot(gid_tc  , main = "AA: Tripeptide composition")
+###########################################################