From 2511162a1d6e9d48bf7534e558c27b2763aa3a4b Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Fri, 13 Aug 2021 16:22:11 +0100 Subject: [PATCH] added aa_index/ with script that return dfs for plots for shiny perhaps --- scripts/aa_index/aa_index.R | 85 +++++++++++++++++++++++++++ scripts/aa_index/run_aa_index.R | 101 ++++++++++++++++++++++++++++++++ 2 files changed, 186 insertions(+) create mode 100644 scripts/aa_index/aa_index.R create mode 100644 scripts/aa_index/run_aa_index.R diff --git a/scripts/aa_index/aa_index.R b/scripts/aa_index/aa_index.R new file mode 100644 index 0000000..f0ca875 --- /dev/null +++ b/scripts/aa_index/aa_index.R @@ -0,0 +1,85 @@ +library(bio3d) +library(seqinr) +library(bios2mds) +library(protr) +############################################################# +#%% TASK +# use this to return df for AA index and mutation properties + +source() + +############################################################## +my_fasta_file = "~/git/Data/streptomycin/input/gid_complex.fasta" +my_mcsmf_snps = "~/git/Data/streptomycin/output/gid_mcsm_formatted_snps.csv" +############################################################### +#%% fasta as vector +gid_aa_seq_v= read.fasta(my_fasta_file + , seqtype = "AA" + , as.string = F) + +gid_aa_v = as.character(gid_aa_seq_v[[1]]); gid_aa_v + +#%% fasta as string +gid_aa_seq_s = read.fasta(my_fasta_file + , seqtype = "AA" + , as.string = T) + +gid_aa_s = as.character(gid_aa_seq_s[[1]]); gid_aa_s +############################################################### +#=================== +# AA indices +# https://www.genome.jp/aaindex/AAindex/list_of_indices +#=================== +data(aa.index) + +# default +aai_kd = aa2index(gid_aa_v, index = "KYTJ820101") # Hydropathy, KD + +aai_rv = aa2index(gid_aa_v, index = "BIGC670101") # Residue volume, Bigelow, 1967 +aai_rv2 = aa2index(gid_aa_v, index = "GOLD730102") # Residue volume (Goldsack-Chalifoux, 1973) +aai_b = aa2index(gid_aa_v, index = "VENT840101") # Bitterness (Venanzi, 1984) + +par(mfrow = c(1,1)) +barplot(aai_kd) +barplot(aai_rv) +barplot(aai_rv2) +#barplot(aai_b, col = c("black", "yellow")) + +########################################################## +#=================== +# mutation matrices +#=================== +data(sub.mat) +snps = read.csv(my_mcsmf_snps + , header = 0) +snps +colnames(snps) <- "mutationinformation" + +# run using all matrices +sub_mat_names = as.character(unlist(attributes(sub.mat))) +#sub_mat_names = "BLOSUM80" + +for (j in sub_mat_names){ + print(j) + snps[[j]] <- NA +for (i in 1:nrow(snps)) { + curr_snp = snps$mutationinformation[i] + m1 = str_match(curr_snp, "^([A-Z]{1})[0-9]*([A-Z]{1})") + aa1 = m1[,2] + aa2 = m1[,3] + #snps$blosum_80[i] + snps[[j]][i] = sub.mat[[j]][aa1,aa2] + } + +} +snps +########################################################## +gid_aac = extractAAC(gid_aa_s) +gid_dc = extractDC(gid_aa_s) +gid_tc = extractTC(gid_aa_s) + +par(mfrow = c(1, 3)) +barplot(gid_aac) +barplot(gid_dc) +barplot(gid_tc) +########################################################### \ No newline at end of file diff --git a/scripts/aa_index/run_aa_index.R b/scripts/aa_index/run_aa_index.R new file mode 100644 index 0000000..8824ca6 --- /dev/null +++ b/scripts/aa_index/run_aa_index.R @@ -0,0 +1,101 @@ +#!/usr/bin/env Rscript +library(bio3d) +library(seqinr) +library(bios2mds) +library(protr) +library(stringr) +#################################################################### +# TASK: use this to return df for AA index and mutation properties +# useful for dfs + + +##################################################################### +# working dir and loading libraries +getwd() +setwd("~/git/LSHTM_analysis/scripts/") +getwd() + +drug = "streptomycin" +gene = "gid" + +source("functions/plotting_globals.R") +import_dirs(drug_name = drug, gene_name = gene) + +############################################################## +my_fasta_file = paste0(indir, "/", gene, "_complex.fasta") + +my_mcsmf_snps = paste0(outdir, "/", gene, "_mcsm_formatted_snps.csv") + +############################################################### +#%% fasta as vector +gid_aa_seq_v= read.fasta(my_fasta_file + , seqtype = "AA" + , as.string = F) + +gid_aa_v = as.character(gid_aa_seq_v[[1]]); gid_aa_v + +#%% fasta as string +gid_aa_seq_s = read.fasta(my_fasta_file + , seqtype = "AA" + , as.string = T) + +gid_aa_s = as.character(gid_aa_seq_s[[1]]); gid_aa_s +############################################################### +#=================== +# AA indices +# https://www.genome.jp/aaindex/AAindex/list_of_indices +#=================== +data(aa.index) + +# default +aai_kd = aa2index(gid_aa_v, index = "KYTJ820101") # Hydropathy, KD + +aai_rv = aa2index(gid_aa_v, index = "BIGC670101") # Residue volume, Bigelow, 1967 +aai_rv2 = aa2index(gid_aa_v, index = "GOLD730102") # Residue volume (Goldsack-Chalifoux, 1973) +aai_b = aa2index(gid_aa_v, index = "VENT840101") # Bitterness (Venanzi, 1984) +########################################################## +#=================== +# mutation matrices +#=================== +data(sub.mat) +snps = read.csv(my_mcsmf_snps + , header = 0) +snps +colnames(snps) <- "mutationinformation" + +# run using all matrices +sub_mat_names = as.character(unlist(attributes(sub.mat))) +#sub_mat_names = "BLOSUM80" + +for (j in sub_mat_names){ + print(j) + snps[[j]] <- NA +for (i in 1:nrow(snps)) { + curr_snp = snps$mutationinformation[i] + m1 = str_match(curr_snp, "^([A-Z]{1})[0-9]*([A-Z]{1})") + aa1 = m1[,2] + aa2 = m1[,3] + #snps$blosum_80[i] + snps[[j]][i] = sub.mat[[j]][aa1,aa2] + } + +} +snps +########################################################## +gid_aac = extractAAC(gid_aa_s) +gid_dc = extractDC(gid_aa_s) +gid_tc = extractTC(gid_aa_s) + +########################################################## +# Plots +par(mfrow = c(3,2)) + +barplot(aai_kd , main = "AA index: KD") +#barplot(aai_rv , main = "AA index: Residue Volume, 1967") +barplot(aai_rv2 , main = "AA index: Residue Volume") #1973 +barplot(aai_b , main = "AA index: Bitterness") + +barplot(gid_aac , main = "AA: composition") +barplot(gid_dc , main = "AA: Dipeptide composition") +barplot(gid_tc , main = "AA: Tripeptide composition") +########################################################### \ No newline at end of file