191 lines
6.3 KiB
R
191 lines
6.3 KiB
R
source("~/git/LSHTM_analysis/config/rpob.R")
|
|
|
|
#================================
|
|
# output files
|
|
# In total: 6 files are written
|
|
#================================
|
|
outdir_images = paste0("~/git/Writing/thesis/images/results/", tolower(gene))
|
|
|
|
# mutational positions: all
|
|
outfile_mutpos = paste0(outdir_images, "/5uhc_", tolower(gene), "_mutpos_all.txt")
|
|
outfile_meta1 = paste0(outdir_images, "/5uhc_", tolower(gene), "_mutpos_cu.txt")
|
|
|
|
# mutational positions with sensitivity: S, R and common
|
|
outfile_mutpos_S = paste0(outdir_images, "/5uhc_", tolower(gene), "_mutpos_S.txt")
|
|
outfile_mutpos_R = paste0(outdir_images, "/5uhc_", tolower(gene), "_mutpos_R.txt")
|
|
outfile_mutpos_common = paste0(outdir_images, "/5uhc_", tolower(gene), "_mutpos_common.txt")
|
|
outfile_meta2 = paste0(outdir_images, "/5uhc_", tolower(gene), "_mutpos_annot_cu.txt")
|
|
|
|
#=============
|
|
# Input
|
|
#=============
|
|
df3_filename = paste0("/home/tanu/git/Data/", drug, "/output/", tolower(gene), "_merged_df3.csv")
|
|
df3 = read.csv(df3_filename)
|
|
|
|
chain_suffix = ".C"
|
|
|
|
# mut_info checks
|
|
table(df3$mutation_info)
|
|
table(df3$mutation_info_orig)
|
|
table(df3$mutation_info_labels_orig)
|
|
|
|
# used in plots and analyses
|
|
table(df3$mutation_info_labels) # different, and matches dst_mode
|
|
table(df3$dst_mode)
|
|
|
|
# create column based on dst mode with different colname
|
|
table(is.na(df3$dst))
|
|
table(is.na(df3$dst_mode))
|
|
|
|
#===============
|
|
# Create column: sensitivity mapped to dst_mode
|
|
#===============
|
|
df3$sensitivity = ifelse(df3$dst_mode == 1, "R", "S")
|
|
table(df3$sensitivity)
|
|
|
|
length(unique((df3$mutationinformation)))
|
|
all_colnames = as.data.frame(colnames(df3))
|
|
|
|
############################################################
|
|
cols_to_extract = c("mutationinformation"
|
|
, "wild_type"
|
|
, "chain"
|
|
, "mutant_type"
|
|
, "position"
|
|
, "X5uhc_position"
|
|
, "X5uhc_offset"
|
|
, "dst_mode"
|
|
, "mutation_info_labels_orig"
|
|
, "mutation_info_labels"
|
|
, "sensitivity")
|
|
|
|
df3_plot = df3[, cols_to_extract]
|
|
|
|
# use the x5uhc_position column
|
|
|
|
# create pos_chain column: allows easier colouring in chimera
|
|
df3_plot$pos_chain = paste(df3_plot$X5uhc_position, chain_suffix, sep = ".")
|
|
pos_cu = length(unique(df3_plot$X5uhc_position))
|
|
X5uhc_pos = unique(df3_plot$X5uhc_position)
|
|
X5uhc_pos = paste0(X5uhc_pos, chain_suffix)
|
|
|
|
#===========================
|
|
# positions with mutations
|
|
#===========================
|
|
df3_all_mut_pos = df3_plot[, c("X5uhc_position", "pos_chain")]
|
|
gene_mut_pos_u = unique(df3_all_mut_pos$pos_chain)
|
|
class(gene_mut_pos_u)
|
|
paste(gene_mut_pos_u, collapse=',')
|
|
|
|
if (length(gene_mut_pos_u) == pos_cu){
|
|
cat("\nPASS: all mutation positions extracted"
|
|
, "\nWriting file:", outfile_mutpos)
|
|
} else{
|
|
stop("\nAbort: mutation position count mismatch")
|
|
}
|
|
|
|
write.table(paste(gene_mut_pos_u, collapse=',')
|
|
, outfile_mutpos
|
|
, row.names = F
|
|
, col.names = F)
|
|
|
|
write.table(paste("Count of positions with mutations in gene"
|
|
, tolower(gene), ":", pos_cu)
|
|
, outfile_meta1
|
|
, row.names = F
|
|
, col.names = F)
|
|
#========================================
|
|
# positions with sensitivity annotations
|
|
#========================================
|
|
df3_muts_annot = df3_plot[, c("mutationinformation", "X5uhc_position", "pos_chain", "sensitivity")]
|
|
|
|
# aggregrate position count by sensitivity
|
|
result <- aggregate(sensitivity ~ X5uhc_position, data = df3_muts_annot, paste, collapse = "")
|
|
|
|
sensitive_pos = result$X5uhc_position[grep("(^S+$)", result$sensitivity)]
|
|
sensitive_pos = paste0(sensitive_pos, chain_suffix)
|
|
|
|
resistant_pos = result$X5uhc_position[grep("(^R+$)", result$sensitivity)]
|
|
resistant_pos = paste0(resistant_pos, chain_suffix)
|
|
|
|
common_pos = result$X5uhc_position[grep("SR|RS" , result$sensitivity)]
|
|
common_pos = paste0(common_pos, chain_suffix)
|
|
|
|
if (tolower(gene)!= "alr"){
|
|
length_check = length(sensitive_pos) + length(resistant_pos) + length(common_pos)
|
|
cpl = length(common_pos)
|
|
}else{
|
|
length_check = length(sensitive_pos) + length(resistant_pos)
|
|
cpl = 0
|
|
}
|
|
|
|
if (length_check == pos_cu){
|
|
cat("\nPASS: position with mutational sensitivity extracted"
|
|
, "\nWriting files: sensitive, resistant and common position counts" )
|
|
} else{
|
|
stop("\nAbort: position with mutational sensitivity count mismatch")
|
|
}
|
|
# spl handling for rpob 5uhc
|
|
revised_gene_mut_pos_u = c(sensitive_pos, resistant_pos, common_pos)
|
|
revised_pos_cu = length(unique(revised_gene_mut_pos_u))
|
|
if (length(revised_gene_mut_pos_u) == revised_pos_cu){
|
|
cat("\nPASS: all mutation positions extracted"
|
|
, "\nWriting file:", outfile_mutpos)
|
|
} else{
|
|
stop("\nAbort: mutation position count mismatch")
|
|
}
|
|
|
|
write.table(paste(revised_gene_mut_pos_u, collapse=',')
|
|
, outfile_mutpos
|
|
, row.names = F
|
|
, col.names = F)
|
|
|
|
write.table(paste("Count of positions with mutations in gene"
|
|
, tolower(gene), ":", revised_pos_cu)
|
|
, outfile_meta1
|
|
, row.names = F
|
|
, col.names = F)
|
|
|
|
# mut_annot
|
|
write.table(paste(sensitive_pos, collapse = ',')
|
|
, outfile_mutpos_S
|
|
, row.names = F, col.names = F)
|
|
|
|
write.table(paste(resistant_pos, collapse = ',')
|
|
, outfile_mutpos_R
|
|
, row.names = F, col.names = F)
|
|
|
|
write.table(paste(common_pos, collapse = ',')
|
|
, outfile_mutpos_common
|
|
, row.names = F, col.names = F)
|
|
|
|
write.table(paste("Count of positions with mutations in gene:"
|
|
, tolower(gene)
|
|
, "\nTotal mutational positions:", revised_pos_cu
|
|
, "\nsensitive:", length(sensitive_pos)
|
|
, "\nresistant:", length(resistant_pos)
|
|
, "\ncommon:" , cpl)
|
|
, outfile_meta2
|
|
, row.names = F
|
|
, col.names = F)
|
|
|
|
#Quick check to find out the discrepancy
|
|
revised_gene_mut_pos_u
|
|
gene_mut_pos_u
|
|
|
|
library("qpcR")
|
|
foo <- data.frame(qpcR:::cbind.na(gene_mut_pos_u, revised_gene_mut_pos_u))
|
|
|
|
|
|
table(!gene_mut_pos_u%in%revised_gene_mut_pos_u)
|
|
table(!revised_gene_mut_pos_u%in%gene_mut_pos_u)
|
|
|
|
X5uhc_pos
|
|
#table(!gene_mut_pos_u%in%X5uhc_pos)
|
|
table(X5uhc_pos%in%gene_mut_pos_u)
|
|
|
|
X5uhc_pos[!X5uhc_pos%in%gene_mut_pos_u]
|
|
X5uhc_pos[!gene_mut_pos_u%in%X5uhc_pos]
|
|
|
|
#TODO: NOTE
|
|
#D1148G (i.e D1154) is NOT Present in 5UHC
|