LSHTM_analysis/scripts/aa_index_scripts/run_aa_index.R

#!/usr/bin/env Rscript
library(bio3d)
library(seqinr)
library(bios2mds)
library(protr)
library(stringr)
####################################################################
# TASK: use this to return df for AA index and mutation properties
# useful for dfs


#####################################################################
# working dir and loading libraries
getwd()
setwd("~/git/LSHTM_analysis/scripts/")
getwd()

drug = "streptomycin"
gene = "gid"

source("functions/plotting_globals.R")
import_dirs(drug_name = drug, gene_name = gene)

##############################################################
my_fasta_file = paste0(indir, "/",  gene,  "_complex.fasta")

my_mcsmf_snps = paste0(outdir, "/", gene, "_mcsm_formatted_snps.csv")

###############################################################
#%% fasta as vector
gid_aa_seq_v= read.fasta(my_fasta_file
                    , seqtype = "AA"
                    , as.string = F)

gid_aa_v = as.character(gid_aa_seq_v[[1]]); gid_aa_v

#%% fasta as string
gid_aa_seq_s = read.fasta(my_fasta_file
                         , seqtype = "AA"
                         , as.string = T)

gid_aa_s = as.character(gid_aa_seq_s[[1]]); gid_aa_s
###############################################################
#===================
# AA indices
# https://www.genome.jp/aaindex/AAindex/list_of_indices
#===================
data(aa.index)

# default
aai_kd  = aa2index(gid_aa_v, index = "KYTJ820101") # Hydropathy, KD

aai_rv  = aa2index(gid_aa_v, index = "BIGC670101") # Residue volume, Bigelow, 1967
aai_rv2 = aa2index(gid_aa_v, index = "GOLD730102") # Residue volume (Goldsack-Chalifoux, 1973)
aai_b   = aa2index(gid_aa_v, index = "VENT840101") # Bitterness (Venanzi, 1984)
##########################################################
#===================
# mutation matrices
#===================
data(sub.mat)
snps = read.csv(my_mcsmf_snps
                , header = 0)
snps
colnames(snps) <- "mutationinformation"

# run using all matrices
sub_mat_names = as.character(unlist(attributes(sub.mat)))
#sub_mat_names = "BLOSUM80"

for (j in sub_mat_names){
  print(j)
  snps[[j]] <- NA
for (i in 1:nrow(snps)) {
    curr_snp = snps$mutationinformation[i]
    m1 = str_match(curr_snp, "^([A-Z]{1})[0-9]*([A-Z]{1})")
    aa1 = m1[,2]
    aa2 = m1[,3]
    #snps$blosum_80[i]
    snps[[j]][i] = sub.mat[[j]][aa1,aa2]
  }

}
snps
##########################################################
gid_aac = extractAAC(gid_aa_s)
gid_dc = extractDC(gid_aa_s)
gid_tc = extractTC(gid_aa_s)

##########################################################
# Plots
par(mfrow = c(3,2))

barplot(aai_kd  , main = "AA index: KD")
#barplot(aai_rv  , main = "AA index: Residue Volume, 1967")
barplot(aai_rv2 , main = "AA index: Residue Volume") #1973
barplot(aai_b   , main = "AA index: Bitterness")

barplot(gid_aac , main = "AA: composition")
barplot(gid_dc  , main = "AA: Dipeptide composition")
barplot(gid_tc  , main = "AA: Tripeptide composition")
###########################################################