LSHTM_analysis/scripts/plotting/structure_figures/mut_landscape.R

source("~/git/LSHTM_analysis/config/alr.R")
source("~/git/LSHTM_analysis/config/embb.R")
source("~/git/LSHTM_analysis/config/gid.R")
source("~/git/LSHTM_analysis/config/katg.R")
source("~/git/LSHTM_analysis/config/pnca.R")
source("~/git/LSHTM_analysis/config/rpob.R")

#================================
# output files
# In total: 6 files are written
#================================
outdir_images = paste0("~/git/Writing/thesis/images/results/", tolower(gene))

# mutational positions: all
outfile_mutpos = paste0(outdir_images, "/", tolower(gene), "_mutpos_all.txt")
outfile_meta1 =  paste0(outdir_images, "/", tolower(gene), "_mutpos_cu.txt")

# mutational positions with sensitivity: S, R and common
outfile_mutpos_S = paste0(outdir_images, "/", tolower(gene), "_mutpos_S.txt")
outfile_mutpos_R = paste0(outdir_images, "/", tolower(gene), "_mutpos_R.txt")
outfile_mutpos_common = paste0(outdir_images, "/", tolower(gene), "_mutpos_common.txt")
outfile_meta2 =  paste0(outdir_images, "/", tolower(gene), "_mutpos_annot_cu.txt")

#=============
# Input
#=============
df3_filename = paste0("~/git/Data/", drug, "/output/", tolower(gene), "_merged_df3.csv")
df3 = read.csv(df3_filename)

# Determine for each gene
if (tolower(gene) == "embb"){
  chain_suffix = ".B"
} else{
  chain_suffix = ".A"
}

# mut_info checks
table(df3$mutation_info)
table(df3$mutation_info_orig)
table(df3$mutation_info_labels_orig)

# used in plots and analyses
table(df3$mutation_info_labels) # different, and matches dst_mode
table(df3$dst_mode)

# create column based on dst mode with different colname
table(is.na(df3$dst))
table(is.na(df3$dst_mode))

#===============
# Create column: sensitivity mapped to dst_mode
#===============
df3$sensitivity = ifelse(df3$dst_mode == 1, "R", "S")
table(df3$sensitivity)

length(unique((df3$mutationinformation)))
all_colnames = as.data.frame(colnames(df3))

############################################################
cols_to_extract = c("mutationinformation"
                    , "wild_type"
                    , "chain"
                    , "mutant_type"
                    , "position"
                    , "dst_mode"
                    , "mutation_info_labels_orig"
                    , "mutation_info_labels"
                    , "sensitivity")

df3_plot = df3[, cols_to_extract]
# create pos_chain column: allows easier colouring in chimera
df3_plot$pos_chain = paste(df3_plot$position, df3_plot$chain, sep = ".")
pos_cu = length(unique(df3_plot$position))

#===========================
# positions with mutations
#===========================
#df3_all_mut_pos = df3_plot[, c("mutationinformation", "position", "chain")]
#df3_all_mut_pos$pos_chain = paste(df3_all_mut_pos$position, df3_all_mut_pos$chain, sep = ".")

df3_all_mut_pos = df3_plot[, c("position", "pos_chain")]
gene_mut_pos_u = unique(df3_all_mut_pos$pos_chain)
class(gene_mut_pos_u)
paste(gene_mut_pos_u, collapse=',')

if (length(gene_mut_pos_u) == pos_cu){
  cat("\nPASS: all mutation positions extracted"
      , "\nWriting file:", outfile_mutpos)
} else{
  stop("\nAbort: mutation position count mismatch")
}

write.table(paste(gene_mut_pos_u, collapse=',')
            , outfile_mutpos
            , row.names = F
            , col.names = F)

write.table(paste("Count of positions with mutations in gene"
                  , tolower(gene), ":", pos_cu)
            , outfile_meta1
            , row.names = F
            , col.names = F)
#========================================
# positions with sensitivity annotations
#========================================
df3_muts_annot = df3_plot[, c("mutationinformation", "position", "pos_chain", "sensitivity")]

# aggregrate position count by sensitivity
result <- aggregate(sensitivity ~ position, data = df3_muts_annot, paste, collapse = "")

sensitive_pos = result$position[grep("(^S+$)", result$sensitivity)]
sensitive_pos = paste0(sensitive_pos, chain_suffix)

resistant_pos = result$position[grep("(^R+$)", result$sensitivity)]
resistant_pos = paste0(resistant_pos, chain_suffix)

common_pos    = result$position[grep("SR|RS" , result$sensitivity)]
common_pos    = paste0(common_pos, chain_suffix)

if (tolower(gene)!= "alr"){
  length_check = length(sensitive_pos) + length(resistant_pos) + length(common_pos)
  cpl          =  length(common_pos)
}else{
  length_check = length(sensitive_pos) + length(resistant_pos)
  cpl          = 0
}

if (length_check == pos_cu){
  cat("\nPASS: position with mutational sensitivity extracted"
      , "\nWriting files: sensitive, resistant and common position counts" )
} else{
  stop("\nAbort: position with mutational sensitivity count mismatch")
}

write.table(paste(sensitive_pos, collapse = ',')
            , outfile_mutpos_S
            , row.names = F, col.names = F)

write.table(paste(resistant_pos, collapse = ',')
            , outfile_mutpos_R
            , row.names = F, col.names = F)

write.table(paste(common_pos, collapse = ',')
            , outfile_mutpos_common
            , row.names = F, col.names = F)

write.table(paste("Count of positions with mutations in gene:"
                  , tolower(gene)
                  , "\nTotal mutational positions:", pos_cu
                  , "\nsensitive:", length(sensitive_pos)
                  , "\nresistant:", length(resistant_pos)
                  , "\ncommon:"   , cpl)
            , outfile_meta2
            , row.names = F
            , col.names = F)