LSHTM_analysis/scripts/pdb_fasta_plot.R

110 lines
2.9 KiB
R

library(bio3d)
infile_pdb = "~/git/Data/ethambutol/input/embb_complex.pdb"
infile_pdb = "~/git/Data/streptomycin/input/gid_complex.pdb"
# working dir and loading libraries
getwd()
setwd("~/git/LSHTM_analysis/scripts/")
getwd()
drug = ""
gene = ""
source("functions/plotting_globals.R")
import_dirs(drug_name = drug, gene_name = gene)
########################################################################
# get atom coordinates from pdb:
~/git/LSHTM_analysis/scripts/my_pdbtools/pdbtools/scripts/pdb_seq -c B -a <input> > <output>
########################################################################
my_pdb = read.pdb(infile_pdb
, maxlines = -1
, multi = FALSE
, rm.insert = FALSE
, rm.alt = TRUE
, ATOM.only = FALSE
, hex = FALSE
, verbose = TRUE)
my_pdb = my_pdb[['atom']]
table(my_pdb$type)
table(my_pdb$type == "HETATM")
my_pdb_c1 = my_pdb[my_pdb$type != "HETATM",]
foo = table(grepl("[A-Z]{3}", my_pdb_c1$resid)); foo
my_pdb_c = my_pdb_c[grepl("[A-Z]{3}", my_pdb_c1$resid),]
table(my_pdb_c$type == "HETATM")
table(my_pdb_c$chain)
table(my_pdb_c$chain, my_pdb_c$resno)
chain_count = table(my_pdb_c$chain)
barplot(chain_count)
hetatm_df = my_pdb[my_pdb$type == "HETATM",]
barplot(table(hetatm_df$resid, hetatm_df$chain), legend =T)
# TO DO:
# DECIPHER CHAIN
sel_chain = "B" # embb
sel_chain = "A"
#my_fasta_file = paste0(indir, "/", gene, "_complex.fasta")
my_fasta_file = "~/git/Data/ethambutol/input/7bvf_B_seq.txt"
my_fasta_file = "~/git/Data/streptomycin/input/gid_complex.fasta"
aa_seq_s = read.fasta(my_fasta_file
, seqtype = "AA"
, as.string = F)
aa_seq = as.character(aa_seq_s[[1]]); aa_seq
g_len = length(aa_seq)
barplot(g_len)
table(my_pdb_c$chain[my_pdb_c$chain == sel_chain])
c_len = unique(length(table(my_pdb_c$resno[my_pdb_c$chain == sel_chain]))); c_len
barplot(c_len)
gc_lengths = c(g_len, c_len)
str_coverage = (c_len/g_len)*100
# make this interactive
barplot(gc_lengths
, horiz = T
, main = "gene length and structural coverage"
, xlab = "length"
, ylab = ""
, names.arg = c("gene_length", "chain_length")
, col = c("beige", "darkgrey"))
# add text underneath graph in Rshiny
cat("\nGene length:", g_len
, "\nChain length:", c_len
, "\n% structural coverage:", str_coverage)
#%%
foo = data.frame(my_pdb_c$resno, my_pdb_c$resid)
foo_u = foo[!duplicated(foo$my_pdb_c.resno),]
foo_u$my_pdb_c.resid
aa3_upper = foo_u$my_pdb_c.resid
aa3_upper
aa3_1up = str_to_title(aa3_upper)
aa3_1up
foo_aa1 = a(aa3_1up); foo_aa1
aa_seq
identical(foo_aa1, aa_seq)
foo_aa1_s = paste(foo_aa1, collapse = ""); foo_aa1_s
aa_seq_s = paste(aa_seq, collapse = ""); aa_seq_s
seqaln(foo_aa1_s)
# magic answer
# position 129 idendtified from global alignment from EMBOSS