diff --git a/scripts/pdb_fasta_plot.R b/scripts/pdb_fasta_plot.R new file mode 100644 index 0000000..4f98a25 --- /dev/null +++ b/scripts/pdb_fasta_plot.R @@ -0,0 +1,110 @@ +library(bio3d) +infile_pdb = "~/git/Data/ethambutol/input/embb_complex.pdb" +infile_pdb = "~/git/Data/streptomycin/input/gid_complex.pdb" + +# working dir and loading libraries +getwd() +setwd("~/git/LSHTM_analysis/scripts/") +getwd() + +drug = "" +gene = "" + +source("functions/plotting_globals.R") +import_dirs(drug_name = drug, gene_name = gene) +######################################################################## +# get atom coordinates from pdb: +~/git/LSHTM_analysis/scripts/my_pdbtools/pdbtools/scripts/pdb_seq -c B -a > + +######################################################################## +my_pdb = read.pdb(infile_pdb + , maxlines = -1 + , multi = FALSE + , rm.insert = FALSE + , rm.alt = TRUE + , ATOM.only = FALSE + , hex = FALSE + , verbose = TRUE) + +my_pdb = my_pdb[['atom']] +table(my_pdb$type) +table(my_pdb$type == "HETATM") + +my_pdb_c1 = my_pdb[my_pdb$type != "HETATM",] +foo = table(grepl("[A-Z]{3}", my_pdb_c1$resid)); foo + +my_pdb_c = my_pdb_c[grepl("[A-Z]{3}", my_pdb_c1$resid),] + +table(my_pdb_c$type == "HETATM") +table(my_pdb_c$chain) +table(my_pdb_c$chain, my_pdb_c$resno) + +chain_count = table(my_pdb_c$chain) + +barplot(chain_count) + +hetatm_df = my_pdb[my_pdb$type == "HETATM",] +barplot(table(hetatm_df$resid, hetatm_df$chain), legend =T) + + +# TO DO: +# DECIPHER CHAIN +sel_chain = "B" # embb +sel_chain = "A" + +#my_fasta_file = paste0(indir, "/", gene, "_complex.fasta") +my_fasta_file = "~/git/Data/ethambutol/input/7bvf_B_seq.txt" +my_fasta_file = "~/git/Data/streptomycin/input/gid_complex.fasta" + +aa_seq_s = read.fasta(my_fasta_file + , seqtype = "AA" + , as.string = F) + +aa_seq = as.character(aa_seq_s[[1]]); aa_seq +g_len = length(aa_seq) +barplot(g_len) + +table(my_pdb_c$chain[my_pdb_c$chain == sel_chain]) +c_len = unique(length(table(my_pdb_c$resno[my_pdb_c$chain == sel_chain]))); c_len +barplot(c_len) + +gc_lengths = c(g_len, c_len) +str_coverage = (c_len/g_len)*100 + +# make this interactive +barplot(gc_lengths + , horiz = T + , main = "gene length and structural coverage" + , xlab = "length" + , ylab = "" + , names.arg = c("gene_length", "chain_length") + , col = c("beige", "darkgrey")) + +# add text underneath graph in Rshiny +cat("\nGene length:", g_len + , "\nChain length:", c_len + , "\n% structural coverage:", str_coverage) + +#%% +foo = data.frame(my_pdb_c$resno, my_pdb_c$resid) +foo_u = foo[!duplicated(foo$my_pdb_c.resno),] + +foo_u$my_pdb_c.resid +aa3_upper = foo_u$my_pdb_c.resid +aa3_upper + + +aa3_1up = str_to_title(aa3_upper) +aa3_1up + +foo_aa1 = a(aa3_1up); foo_aa1 +aa_seq + +identical(foo_aa1, aa_seq) + +foo_aa1_s = paste(foo_aa1, collapse = ""); foo_aa1_s +aa_seq_s = paste(aa_seq, collapse = ""); aa_seq_s + +seqaln(foo_aa1_s) +# magic answer +# position 129 idendtified from global alignment from EMBOSS