source("~/git/LSHTM_analysis/config/rpob.R") #================================ # output files # In total: 6 files are written #================================ outdir_images = paste0("~/git/Writing/thesis/images/results/", tolower(gene)) # mutational positions: all outfile_mutpos = paste0(outdir_images, "/5uhc_", tolower(gene), "_mutpos_all.txt") outfile_meta1 = paste0(outdir_images, "/5uhc_", tolower(gene), "_mutpos_cu.txt") # mutational positions with sensitivity: S, R and common outfile_mutpos_S = paste0(outdir_images, "/5uhc_", tolower(gene), "_mutpos_S.txt") outfile_mutpos_R = paste0(outdir_images, "/5uhc_", tolower(gene), "_mutpos_R.txt") outfile_mutpos_common = paste0(outdir_images, "/5uhc_", tolower(gene), "_mutpos_common.txt") outfile_meta2 = paste0(outdir_images, "/5uhc_", tolower(gene), "_mutpos_annot_cu.txt") #============= # Input #============= df3_filename = paste0("~/git/Data/", drug, "/output/", tolower(gene), "_merged_df3.csv") df3 = read.csv(df3_filename) chain_suffix = ".C" # mut_info checks table(df3$mutation_info) table(df3$mutation_info_orig) table(df3$mutation_info_labels_orig) # used in plots and analyses table(df3$mutation_info_labels) # different, and matches dst_mode table(df3$dst_mode) # create column based on dst mode with different colname table(is.na(df3$dst)) table(is.na(df3$dst_mode)) #=============== # Create column: sensitivity mapped to dst_mode #=============== df3$sensitivity = ifelse(df3$dst_mode == 1, "R", "S") table(df3$sensitivity) length(unique((df3$mutationinformation))) all_colnames = as.data.frame(colnames(df3)) ############################################################ cols_to_extract = c("mutationinformation" , "wild_type" , "chain" , "mutant_type" , "position" , "X5uhc_position" , "X5uhc_offset" , "dst_mode" , "mutation_info_labels_orig" , "mutation_info_labels" , "sensitivity") df3_plot = df3[, cols_to_extract] # use the x5uhc_position column # create pos_chain column: allows easier colouring in chimera df3_plot$pos_chain = paste(df3_plot$X5uhc_position, chain_suffix, sep = ".") pos_cu = length(unique(df3_plot$X5uhc_position)) X5uhc_pos = unique(df3_plot$X5uhc_position) X5uhc_pos = paste0(X5uhc_pos, chain_suffix) #=========================== # positions with mutations #=========================== df3_all_mut_pos = df3_plot[, c("X5uhc_position", "pos_chain")] gene_mut_pos_u = unique(df3_all_mut_pos$pos_chain) class(gene_mut_pos_u) paste(gene_mut_pos_u, collapse=',') if (length(gene_mut_pos_u) == pos_cu){ cat("\nPASS: all mutation positions extracted" , "\nWriting file:", outfile_mutpos) } else{ stop("\nAbort: mutation position count mismatch") } write.table(paste(gene_mut_pos_u, collapse=',') , outfile_mutpos , row.names = F , col.names = F) write.table(paste("Count of positions with mutations in gene" , tolower(gene), ":", pos_cu) , outfile_meta1 , row.names = F , col.names = F) #======================================== # positions with sensitivity annotations #======================================== df3_muts_annot = df3_plot[, c("mutationinformation", "X5uhc_position", "pos_chain", "sensitivity")] # aggregrate position count by sensitivity result <- aggregate(sensitivity ~ X5uhc_position, data = df3_muts_annot, paste, collapse = "") sensitive_pos = result$X5uhc_position[grep("(^S+$)", result$sensitivity)] sensitive_pos = paste0(sensitive_pos, chain_suffix) resistant_pos = result$X5uhc_position[grep("(^R+$)", result$sensitivity)] resistant_pos = paste0(resistant_pos, chain_suffix) common_pos = result$X5uhc_position[grep("SR|RS" , result$sensitivity)] common_pos = paste0(common_pos, chain_suffix) if (tolower(gene)!= "alr"){ length_check = length(sensitive_pos) + length(resistant_pos) + length(common_pos) cpl = length(common_pos) }else{ length_check = length(sensitive_pos) + length(resistant_pos) cpl = 0 } if (length_check == pos_cu){ cat("\nPASS: position with mutational sensitivity extracted" , "\nWriting files: sensitive, resistant and common position counts" ) } else{ stop("\nAbort: position with mutational sensitivity count mismatch") } # spl handling for rpob 5uhc revised_gene_mut_pos_u = c(sensitive_pos, resistant_pos, common_pos) revised_pos_cu = length(unique(revised_gene_mut_pos_u)) if (length(revised_gene_mut_pos_u) == revised_pos_cu){ cat("\nPASS: all mutation positions extracted" , "\nWriting file:", outfile_mutpos) } else{ stop("\nAbort: mutation position count mismatch") } write.table(paste(revised_gene_mut_pos_u, collapse=',') , outfile_mutpos , row.names = F , col.names = F) write.table(paste("Count of positions with mutations in gene" , tolower(gene), ":", revised_pos_cu) , outfile_meta1 , row.names = F , col.names = F) # mut_annot write.table(paste(sensitive_pos, collapse = ',') , outfile_mutpos_S , row.names = F, col.names = F) write.table(paste(resistant_pos, collapse = ',') , outfile_mutpos_R , row.names = F, col.names = F) write.table(paste(common_pos, collapse = ',') , outfile_mutpos_common , row.names = F, col.names = F) write.table(paste("Count of positions with mutations in gene:" , tolower(gene) , "\nTotal mutational positions:", revised_pos_cu , "\nsensitive:", length(sensitive_pos) , "\nresistant:", length(resistant_pos) , "\ncommon:" , cpl) , outfile_meta2 , row.names = F , col.names = F) #Quick check to find out the discrepancy revised_gene_mut_pos_u gene_mut_pos_u library("qpcR") foo <- data.frame(qpcR:::cbind.na(gene_mut_pos_u, revised_gene_mut_pos_u)) table(!gene_mut_pos_u%in%revised_gene_mut_pos_u) table(!revised_gene_mut_pos_u%in%gene_mut_pos_u) X5uhc_pos #table(!gene_mut_pos_u%in%X5uhc_pos) table(X5uhc_pos%in%gene_mut_pos_u) X5uhc_pos[!X5uhc_pos%in%gene_mut_pos_u] X5uhc_pos[!gene_mut_pos_u%in%X5uhc_pos] #TODO: NOTE #D1148G (i.e D1154) is NOT Present in 5UHC