renamed aa_index folder to aa_index_scripts
This commit is contained in:
parent
650d357afc
commit
0c316e4a41
9 changed files with 4452 additions and 0 deletions
85
scripts/aa_index_scripts/aa_index.R
Normal file
85
scripts/aa_index_scripts/aa_index.R
Normal file
|
@ -0,0 +1,85 @@
|
|||
library(bio3d)
|
||||
library(seqinr)
|
||||
library(bios2mds)
|
||||
library(protr)
|
||||
#############################################################
|
||||
#%% TASK
|
||||
# use this to return df for AA index and mutation properties
|
||||
|
||||
source()
|
||||
|
||||
##############################################################
|
||||
my_fasta_file = "~/git/Data/streptomycin/input/gid_complex.fasta"
|
||||
my_mcsmf_snps = "~/git/Data/streptomycin/output/gid_mcsm_formatted_snps.csv"
|
||||
###############################################################
|
||||
#%% fasta as vector
|
||||
gid_aa_seq_v= read.fasta(my_fasta_file
|
||||
, seqtype = "AA"
|
||||
, as.string = F)
|
||||
|
||||
gid_aa_v = as.character(gid_aa_seq_v[[1]]); gid_aa_v
|
||||
|
||||
#%% fasta as string
|
||||
gid_aa_seq_s = read.fasta(my_fasta_file
|
||||
, seqtype = "AA"
|
||||
, as.string = T)
|
||||
|
||||
gid_aa_s = as.character(gid_aa_seq_s[[1]]); gid_aa_s
|
||||
###############################################################
|
||||
#===================
|
||||
# AA indices
|
||||
# https://www.genome.jp/aaindex/AAindex/list_of_indices
|
||||
#===================
|
||||
data(aa.index)
|
||||
|
||||
# default
|
||||
aai_kd = aa2index(gid_aa_v, index = "KYTJ820101") # Hydropathy, KD
|
||||
|
||||
aai_rv = aa2index(gid_aa_v, index = "BIGC670101") # Residue volume, Bigelow, 1967
|
||||
aai_rv2 = aa2index(gid_aa_v, index = "GOLD730102") # Residue volume (Goldsack-Chalifoux, 1973)
|
||||
aai_b = aa2index(gid_aa_v, index = "VENT840101") # Bitterness (Venanzi, 1984)
|
||||
|
||||
par(mfrow = c(1,1))
|
||||
barplot(aai_kd)
|
||||
barplot(aai_rv)
|
||||
barplot(aai_rv2)
|
||||
#barplot(aai_b, col = c("black", "yellow"))
|
||||
|
||||
##########################################################
|
||||
#===================
|
||||
# mutation matrices
|
||||
#===================
|
||||
data(sub.mat)
|
||||
snps = read.csv(my_mcsmf_snps
|
||||
, header = 0)
|
||||
snps
|
||||
colnames(snps) <- "mutationinformation"
|
||||
|
||||
# run using all matrices
|
||||
sub_mat_names = as.character(unlist(attributes(sub.mat)))
|
||||
#sub_mat_names = "BLOSUM80"
|
||||
|
||||
for (j in sub_mat_names){
|
||||
print(j)
|
||||
snps[[j]] <- NA
|
||||
for (i in 1:nrow(snps)) {
|
||||
curr_snp = snps$mutationinformation[i]
|
||||
m1 = str_match(curr_snp, "^([A-Z]{1})[0-9]*([A-Z]{1})")
|
||||
aa1 = m1[,2]
|
||||
aa2 = m1[,3]
|
||||
#snps$blosum_80[i]
|
||||
snps[[j]][i] = sub.mat[[j]][aa1,aa2]
|
||||
}
|
||||
|
||||
}
|
||||
snps
|
||||
##########################################################
|
||||
gid_aac = extractAAC(gid_aa_s)
|
||||
gid_dc = extractDC(gid_aa_s)
|
||||
gid_tc = extractTC(gid_aa_s)
|
||||
|
||||
par(mfrow = c(1, 3))
|
||||
barplot(gid_aac)
|
||||
barplot(gid_dc)
|
||||
barplot(gid_tc)
|
||||
###########################################################
|
BIN
scripts/aa_index_scripts/aaindex.zip
Normal file
BIN
scripts/aa_index_scripts/aaindex.zip
Normal file
Binary file not shown.
2818
scripts/aa_index_scripts/aaindex/data/aaindex2
Normal file
2818
scripts/aa_index_scripts/aaindex/data/aaindex2
Normal file
File diff suppressed because it is too large
Load diff
BIN
scripts/aa_index_scripts/aaindex/data/aaindex2.p
Normal file
BIN
scripts/aa_index_scripts/aaindex/data/aaindex2.p
Normal file
Binary file not shown.
1383
scripts/aa_index_scripts/aaindex/data/aaindex3
Normal file
1383
scripts/aa_index_scripts/aaindex/data/aaindex3
Normal file
File diff suppressed because it is too large
Load diff
BIN
scripts/aa_index_scripts/aaindex/data/aaindex3.p
Normal file
BIN
scripts/aa_index_scripts/aaindex/data/aaindex3.p
Normal file
Binary file not shown.
90
scripts/aa_index_scripts/aaindex/data/parse_aaindex.py
Normal file
90
scripts/aa_index_scripts/aaindex/data/parse_aaindex.py
Normal file
|
@ -0,0 +1,90 @@
|
|||
from collections import defaultdict
|
||||
|
||||
import os
|
||||
import pickle
|
||||
|
||||
DATA_FOLDER = "/home/chmrodrigues/Documents/ppi2/reverse_mutations/data/aaindex"
|
||||
|
||||
def main():
|
||||
|
||||
aaindex2_file = os.path.join(DATA_FOLDER,"aaindex2")
|
||||
aaindex3_file = os.path.join(DATA_FOLDER,"aaindex3")
|
||||
|
||||
lines_index2 = ' '.join([item for item in open(aaindex2_file,'r').readlines()])
|
||||
lines_index3 = ' '.join([item for item in open(aaindex3_file,'r').readlines()])
|
||||
|
||||
attrs_index2 = [item for item in lines_index2.split('//\n') if len(item) != 0]
|
||||
attrs_index3 = [item for item in lines_index3.split('//\n') if len(item) != 0]
|
||||
|
||||
attr_name = str()
|
||||
all_matrices = dict()
|
||||
for line in attrs_index2:
|
||||
attr_elements = line.split('\n')
|
||||
|
||||
attr_name = [item for item in attr_elements if item.strip().startswith("H ")][0].split()[-1]
|
||||
rows_columns_index = [attr_elements.index(item) for item in attr_elements if item.startswith(" M rows =")][0]
|
||||
|
||||
rows = attr_elements[rows_columns_index].split()[3].replace(",","")
|
||||
columns = attr_elements[rows_columns_index].split()[-1]
|
||||
|
||||
attr_dict = dict()
|
||||
for row in rows:
|
||||
attr_dict[row] = dict()
|
||||
for col in columns:
|
||||
attr_dict[row][col] = None
|
||||
|
||||
for i in range(rows_columns_index+1,len(attr_elements)):
|
||||
values = attr_elements[i].split()
|
||||
try:
|
||||
row = rows[i-(rows_columns_index+1)]
|
||||
for idx,value in enumerate(values):
|
||||
col = columns[idx]
|
||||
try:
|
||||
attr_dict[row][col] = float(value)
|
||||
except ValueError:
|
||||
attr_dict[row][col] = value
|
||||
except IndexError:
|
||||
pass
|
||||
|
||||
all_matrices[attr_name] = attr_dict
|
||||
print(len(all_matrices))
|
||||
pickle.dump(all_matrices, open('index2.p','wb'),protocol=2)
|
||||
|
||||
attr_name = str()
|
||||
all_matrices = dict()
|
||||
for line in attrs_index3:
|
||||
attr_elements = line.split('\n')
|
||||
|
||||
attr_name = [item for item in attr_elements if item.strip().startswith("H ")][0].split()[-1]
|
||||
rows_columns_index = [attr_elements.index(item) for item in attr_elements if item.startswith(" M rows =")][0]
|
||||
|
||||
rows = attr_elements[rows_columns_index].split()[3].replace(",","")
|
||||
columns = attr_elements[rows_columns_index].split()[-1]
|
||||
|
||||
attr_dict = dict()
|
||||
for row in rows:
|
||||
attr_dict[row] = dict()
|
||||
for col in columns:
|
||||
attr_dict[row][col] = None
|
||||
|
||||
for i in range(rows_columns_index+1,len(attr_elements)):
|
||||
values = attr_elements[i].split()
|
||||
try:
|
||||
row = rows[i-(rows_columns_index+1)]
|
||||
for idx,value in enumerate(values):
|
||||
col = columns[idx]
|
||||
try:
|
||||
attr_dict[row][col] = float(value)
|
||||
except ValueError:
|
||||
attr_dict[row][col] = value
|
||||
except IndexError:
|
||||
pass
|
||||
|
||||
all_matrices[attr_name] = attr_dict
|
||||
pickle.dump(all_matrices, open('index3.p','wb'),protocol=2)
|
||||
print(len(all_matrices))
|
||||
|
||||
return True
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
161
scripts/aa_index_scripts/aaindex/get_scores.py
Normal file
161
scripts/aa_index_scripts/aaindex/get_scores.py
Normal file
|
@ -0,0 +1,161 @@
|
|||
"""
|
||||
RSA <= 0.2 Buried (Inaccessible)
|
||||
RSA > 0.2 Exposed (Accessible)
|
||||
|
||||
SST = [H,I,G] - Helix
|
||||
SST = [B,E] - Beta
|
||||
SST = [T] - Turn
|
||||
SST = [S,-] - Coil
|
||||
"""
|
||||
from Bio.PDB import PDBParser, DSSP
|
||||
import pickle
|
||||
import os
|
||||
import sys
|
||||
import warnings
|
||||
|
||||
warnings.filterwarnings("ignore")
|
||||
|
||||
CURRENT_FOLDER = '/home/local/BHRI/sportelli/Desktop/Important_Code/structural/aaindex'
|
||||
DATA_FOLDER = os.path.join(CURRENT_FOLDER,'data')
|
||||
|
||||
RSA_SST_DEPENDENT = {
|
||||
'exposed_helix' : 'KOSJ950101',
|
||||
'exposed_beta' : 'KOSJ950102',
|
||||
'exposed_turn' : 'KOSJ950103',
|
||||
'exposed_coil' : 'KOSJ950104',
|
||||
'buried_helix' : 'KOSJ950105',
|
||||
'buried_beta' : 'KOSJ950106',
|
||||
'buried_turn' : 'KOSJ950107',
|
||||
'buried_coil' : 'KOSJ950108',
|
||||
}
|
||||
|
||||
SST_DEPENDENT = {
|
||||
'helix' : 'KOSJ950109',
|
||||
'beta' : 'KOSJ950110',
|
||||
'turn' : 'KOSJ950111',
|
||||
'coil' : 'KOSJ950112',
|
||||
}
|
||||
|
||||
RSA_DEPENDENT1 = {
|
||||
'exposed' : 'KOSJ950113',
|
||||
'buried' : 'KOSJ950114',
|
||||
}
|
||||
|
||||
RSA_DEPENDENT2 = {
|
||||
'exposed' : 'OVEJ920104',
|
||||
'buried' : 'OVEJ920105',
|
||||
}
|
||||
|
||||
def get_environment(pdb_file, chain, position, insertion_code=' '):
|
||||
parser = PDBParser()
|
||||
structure = parser.get_structure(pdb_file, pdb_file)
|
||||
model = structure[0]
|
||||
|
||||
dssp = DSSP(model, pdb_file)
|
||||
dssp_key = [item for item in dssp.keys() if item[0] == chain and item[1][1] == int(position) and item[1][2] == insertion_code]
|
||||
|
||||
dssp_key = dssp_key[0]
|
||||
sst = dssp[dssp_key][2]
|
||||
rsa = float(dssp[dssp_key][3])
|
||||
|
||||
return{'sst':sst, 'rsa':rsa}
|
||||
|
||||
def main():
|
||||
"""
|
||||
READ IMPUT
|
||||
"""
|
||||
pdb_file = sys.argv[1]
|
||||
chain_id = sys.argv[2]
|
||||
mutation_code = sys.argv[3]
|
||||
|
||||
aa_from = mutation_code[0]
|
||||
aa_to = mutation_code[-1]
|
||||
position = mutation_code[1:-1]
|
||||
insertion_code = ' '
|
||||
if not position[-1].isdigit():
|
||||
insertion_code = position[-1]
|
||||
position = position[:-1]
|
||||
|
||||
"""
|
||||
READ DATABASES
|
||||
index2 - Amino acid substitution indexes
|
||||
index3 - Statistical protein contact potentials
|
||||
"""
|
||||
index2 = pickle.load(open('{}/aaindex2.p'.format(DATA_FOLDER),'rb'))
|
||||
index3 = pickle.load(open('{}/aaindex3.p'.format(DATA_FOLDER),'rb'))
|
||||
|
||||
"""
|
||||
LOOP THROUGH TABLES AND EXTRACT VALUES
|
||||
"""
|
||||
results_index2 = dict()
|
||||
results_index3 = dict()
|
||||
for key in index2.keys():
|
||||
if index2[key][aa_from][aa_to] != None:
|
||||
results_index2[key] = index2[key][aa_from][aa_to]
|
||||
else:
|
||||
results_index2[key] = index2[key][aa_to][aa_from]
|
||||
|
||||
for key in index3.keys():
|
||||
if index3[key][aa_from][aa_to] != None:
|
||||
results_index3[key] = index3[key][aa_from][aa_to]
|
||||
else:
|
||||
results_index3[key] = index3[key][aa_to][aa_from]
|
||||
|
||||
"""
|
||||
GET ENVIRONMENT CHARACTERISTICS
|
||||
"""
|
||||
environment = get_environment(pdb_file, chain_id, position, insertion_code)
|
||||
|
||||
buried = 'buried'
|
||||
sst = str()
|
||||
if environment['rsa'] <= 0.2:
|
||||
buried = 'exposed'
|
||||
|
||||
if environment['sst'] in ['H','I','G']:
|
||||
sst = 'helix'
|
||||
elif environment['sst'] in ['B','E']:
|
||||
sst = 'beta'
|
||||
elif environment['sst'] in ['T']:
|
||||
sst = 'turn'
|
||||
else:
|
||||
sst = 'coil'
|
||||
|
||||
results_index2['KOSJ950100_RSA_SST'] = results_index2[RSA_SST_DEPENDENT['{}_{}'.format(buried,sst)]]
|
||||
results_index2['KOSJ950100_SST'] = results_index2[SST_DEPENDENT[sst]]
|
||||
results_index2['KOSJ950110_RSA'] = results_index2[RSA_DEPENDENT1[buried]]
|
||||
results_index2['OVEJ920100_RSA'] = results_index2[RSA_DEPENDENT2[buried]]
|
||||
|
||||
for value in RSA_SST_DEPENDENT.values():
|
||||
results_index2.pop(value)
|
||||
for value in SST_DEPENDENT.values():
|
||||
results_index2.pop(value)
|
||||
for value in RSA_DEPENDENT1.values():
|
||||
results_index2.pop(value)
|
||||
for value in RSA_DEPENDENT2.values():
|
||||
results_index2.pop(value)
|
||||
|
||||
"""
|
||||
PRINT RESULTS
|
||||
"""
|
||||
output_dict = dict()
|
||||
output_dict.update(results_index2)
|
||||
output_dict.update(results_index3)
|
||||
|
||||
keys = list(output_dict.keys())
|
||||
keys.sort()
|
||||
values = [str(output_dict[item]) for item in keys]
|
||||
|
||||
# print(",".join(keys))
|
||||
print(",".join(values))
|
||||
|
||||
return True
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
if len(sys.argv) != 4:
|
||||
print("Error on parsing argument list")
|
||||
print("Please provide a one letter code for wild-type and mutant residues")
|
||||
print("Eg.: python get_scores.py pdb_file chain_id mutation_code")
|
||||
sys.exit(1)
|
||||
main()
|
101
scripts/aa_index_scripts/run_aa_index.R
Normal file
101
scripts/aa_index_scripts/run_aa_index.R
Normal file
|
@ -0,0 +1,101 @@
|
|||
#!/usr/bin/env Rscript
|
||||
library(bio3d)
|
||||
library(seqinr)
|
||||
library(bios2mds)
|
||||
library(protr)
|
||||
library(stringr)
|
||||
####################################################################
|
||||
# TASK: use this to return df for AA index and mutation properties
|
||||
# useful for dfs
|
||||
|
||||
|
||||
#####################################################################
|
||||
# working dir and loading libraries
|
||||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/scripts/")
|
||||
getwd()
|
||||
|
||||
drug = "streptomycin"
|
||||
gene = "gid"
|
||||
|
||||
source("functions/plotting_globals.R")
|
||||
import_dirs(drug_name = drug, gene_name = gene)
|
||||
|
||||
##############################################################
|
||||
my_fasta_file = paste0(indir, "/", gene, "_complex.fasta")
|
||||
|
||||
my_mcsmf_snps = paste0(outdir, "/", gene, "_mcsm_formatted_snps.csv")
|
||||
|
||||
###############################################################
|
||||
#%% fasta as vector
|
||||
gid_aa_seq_v= read.fasta(my_fasta_file
|
||||
, seqtype = "AA"
|
||||
, as.string = F)
|
||||
|
||||
gid_aa_v = as.character(gid_aa_seq_v[[1]]); gid_aa_v
|
||||
|
||||
#%% fasta as string
|
||||
gid_aa_seq_s = read.fasta(my_fasta_file
|
||||
, seqtype = "AA"
|
||||
, as.string = T)
|
||||
|
||||
gid_aa_s = as.character(gid_aa_seq_s[[1]]); gid_aa_s
|
||||
###############################################################
|
||||
#===================
|
||||
# AA indices
|
||||
# https://www.genome.jp/aaindex/AAindex/list_of_indices
|
||||
#===================
|
||||
data(aa.index)
|
||||
|
||||
# default
|
||||
aai_kd = aa2index(gid_aa_v, index = "KYTJ820101") # Hydropathy, KD
|
||||
|
||||
aai_rv = aa2index(gid_aa_v, index = "BIGC670101") # Residue volume, Bigelow, 1967
|
||||
aai_rv2 = aa2index(gid_aa_v, index = "GOLD730102") # Residue volume (Goldsack-Chalifoux, 1973)
|
||||
aai_b = aa2index(gid_aa_v, index = "VENT840101") # Bitterness (Venanzi, 1984)
|
||||
##########################################################
|
||||
#===================
|
||||
# mutation matrices
|
||||
#===================
|
||||
data(sub.mat)
|
||||
snps = read.csv(my_mcsmf_snps
|
||||
, header = 0)
|
||||
snps
|
||||
colnames(snps) <- "mutationinformation"
|
||||
|
||||
# run using all matrices
|
||||
sub_mat_names = as.character(unlist(attributes(sub.mat)))
|
||||
#sub_mat_names = "BLOSUM80"
|
||||
|
||||
for (j in sub_mat_names){
|
||||
print(j)
|
||||
snps[[j]] <- NA
|
||||
for (i in 1:nrow(snps)) {
|
||||
curr_snp = snps$mutationinformation[i]
|
||||
m1 = str_match(curr_snp, "^([A-Z]{1})[0-9]*([A-Z]{1})")
|
||||
aa1 = m1[,2]
|
||||
aa2 = m1[,3]
|
||||
#snps$blosum_80[i]
|
||||
snps[[j]][i] = sub.mat[[j]][aa1,aa2]
|
||||
}
|
||||
|
||||
}
|
||||
snps
|
||||
##########################################################
|
||||
gid_aac = extractAAC(gid_aa_s)
|
||||
gid_dc = extractDC(gid_aa_s)
|
||||
gid_tc = extractTC(gid_aa_s)
|
||||
|
||||
##########################################################
|
||||
# Plots
|
||||
par(mfrow = c(3,2))
|
||||
|
||||
barplot(aai_kd , main = "AA index: KD")
|
||||
#barplot(aai_rv , main = "AA index: Residue Volume, 1967")
|
||||
barplot(aai_rv2 , main = "AA index: Residue Volume") #1973
|
||||
barplot(aai_b , main = "AA index: Bitterness")
|
||||
|
||||
barplot(gid_aac , main = "AA: composition")
|
||||
barplot(gid_dc , main = "AA: Dipeptide composition")
|
||||
barplot(gid_tc , main = "AA: Tripeptide composition")
|
||||
###########################################################
|
Loading…
Add table
Add a link
Reference in a new issue