diff --git a/scripts/plotting/combined_or.R b/scripts/plotting/combined_or.R new file mode 100644 index 0000000..4bf2b80 --- /dev/null +++ b/scripts/plotting/combined_or.R @@ -0,0 +1,193 @@ + +#!/usr/bin/env Rscript +######################################################### +# TASK: Basic lineage barplot showing numbers + +# Output: Basic barplot with lineage samples and mut count + +#======================================================================= +# working dir and loading libraries +getwd() +setwd("~/git/LSHTM_analysis/scripts/plotting/") +getwd() + + +source("Header_TT.R") +require(cowplot) +source("combining_dfs_plotting.R") +# should return the following dfs, directories and variables + +# PS combined: +# 1) merged_df2 +# 2) merged_df2_comp +# 3) merged_df3 +# 4) merged_df3_comp + +# LIG combined: +# 5) merged_df2_lig +# 6) merged_df2_comp_lig +# 7) merged_df3_lig +# 8) merged_df3_comp_lig + +# 9) my_df_u +# 10) my_df_u_lig + +cat(paste0("Directories imported:" + , "\ndatadir:", datadir + , "\nindir:", indir + , "\noutdir:", outdir + , "\nplotdir:", plotdir)) + +cat(paste0("Variables imported:" + , "\ndrug:", drug + , "\ngene:", gene + , "\ngene_match:", gene_match + , "\nAngstrom symbol:", angstroms_symbol + , "\nNo. of duplicated muts:", dup_muts_nu + , "\nNA count for ORs:", na_count + , "\nNA count in df2:", na_count_df2 + , "\nNA count in df3:", na_count_df3)) + +#========================= +#======= +# output +#======= +or_combined = "or_combined_PS_LIG.svg" +plot_or_combined = paste0(plotdir,"/", or_combined) + +or_kin_combined = "or_kin_combined_PS_LIG.svg" +plot_or_kin_combined = paste0(plotdir,"/", or_kin_combined) +#======================================================================= + +########################### +# Data for OR and stability plots +# you need merged_df3_comp +# since these are matched +# to allow pairwise corr +########################### + +ps_df = merged_df3_comp +lig_df = merged_df3_comp_lig + +# Ensure correct data type in columns to plot: should be TRUE +is.numeric(ps_df$or_mychisq) +is.numeric(lig_df$or_mychisq) + +# delete variables not required +rm(merged_df2, merged_df2_comp, merged_df2_lig, merged_df2_comp_lig, my_df_u, my_df_u_lig) +#%% end of section 1 + +# sanity check: should be <10 +if (max(lig_df$ligand_distance) < 10){ + print ("Sanity check passed: lig data is <10Ang") +}else{ + print ("Error: data should be filtered to be within 10Ang") +} + +############# +# Plots: Bubble plot +# x = Position, Y = stability +# size of dots = OR +# col: stability +############# + +#----------------- +# Plot 1: DUET vs OR by position as geom_points +#------------------- + +my_ats = 20 # axis text size +my_als = 22 # axis label size + +# Spelling Correction: made redundant as already corrected at the source +#ps_df$duet_outcome[ps_df$duet_outcome=='Stabilizing'] <- 'Stabilising' +#ps_df$duet_outcome[ps_df$duet_outcome=='Destabilizing'] <- 'Destabilising' + +table(ps_df$duet_outcome) ; sum(table(ps_df$duet_outcome)) + +g1 = ggplot(ps_df, aes(x = factor(position) + , y = duet_scaled)) + +p1 = g1 + + geom_point(aes(col = duet_outcome + #, size = or_mychisq))+ + , size = or_kin)) + + theme(axis.text.x = element_text(size = my_ats + , angle = 90 + , hjust = 1 + , vjust = 0.4) + , axis.text.y = element_text(size = my_ats + , angle = 0 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = my_als) + , axis.title.y = element_text(size = my_als) + , legend.text = element_text(size = my_als) + , legend.title = element_text(size = my_als) ) + + #, legend.key.size = unit(1, "cm")) + + labs(title = "" + , x = "Position" + , y = "DUET(PS)" + , size = "Odds Ratio" + , colour = "DUET Outcome") + + guides(colour = guide_legend(override.aes = list(size=4))) + +p1 + +#------------------- +# generate plot 2: Lig vs OR by position as geom_points +#------------------- + +# Spelling Correction: made redundant as already corrected at the source + +#lig_df$ligand_outcome[lig_df$ligand_outcome=='Stabilizing'] <- 'Stabilising' +#lig_df$ligand_outcome[lig_df$ligand_outcome=='Destabilizing'] <- 'Destabilising' + +table(lig_df$ligand_outcome) + +g2 = ggplot(lig_df, aes(x = factor(position) + , y = affinity_scaled)) + +p2 = g2 + + geom_point(aes(col = ligand_outcome + #, size = or_mychisq))+ + , size = or_kin)) + + theme(axis.text.x = element_text(size = my_ats + , angle = 90 + , hjust = 1 + , vjust = 0.4) + , axis.text.y = element_text(size = my_ats + , angle = 0 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = my_als) + , axis.title.y = element_text(size = my_als) + , legend.text = element_text(size = my_als) + , legend.title = element_text(size = my_als) ) + + #, legend.key.size = unit(1, "cm")) + + labs(title = "" + , x = "Position" + , y = "Ligand Affinity" + , size = "Odds Ratio" + , colour = "Ligand Outcome" + ) + + guides(colour = guide_legend(override.aes = list(size=4))) + +p2 + +#====================== +# combine using cowplot +#====================== + +svg(plot_or_combined, width = 32, height = 12) +svg(plot_or_kin_combined, width = 32, height = 12) + +theme_set(theme_gray()) # to preserve default theme + +printFile = cowplot::plot_grid(plot_grid(p1, p2 + , ncol = 1 + , align = 'v' + , labels = c("", "") + , label_size = my_als+5)) +print(printFile) +dev.off() + diff --git a/scripts/plotting/combining_two_df.R b/scripts/plotting/combining_dfs_plotting.R similarity index 68% rename from scripts/plotting/combining_two_df.R rename to scripts/plotting/combining_dfs_plotting.R index 5df2525..7d45d1b 100644 --- a/scripts/plotting/combining_two_df.R +++ b/scripts/plotting/combining_dfs_plotting.R @@ -1,8 +1,4 @@ -#!/usr/bin/env Rscript -getwd() -setwd("~/git/LSHTM_analysis/scripts/plotting/") -getwd() - +#!/usr/bin/env Rscript ######################################################### # TASK: To combine struct params and meta data for plotting # Input csv files: @@ -16,21 +12,22 @@ getwd() # 3) small combined df including NAs for AF, OR, etc. # Dim: same as mcsm data # 4) large combined df excluding NAs -# Dim: dim(#1) - no. of NAs(AF|OR) + 1 +# Dim: dim(#1) - na_count_df2 # 5) small combined df excluding NAs -# Dim: dim(#2) - no. of unique NAs - 1 +# Dim: dim(#2) - na_count_df3 # This script is sourced from other .R scripts for plotting ######################################################### +#======================================================================= +# working dir and loading libraries +getwd() +setwd("~/git/LSHTM_analysis/scripts/plotting/") +getwd() -########################################################## -# Installing and loading required packages -########################################################## -#source("Header_TT.R") -require(data.table) -require(arsenal) -require(compare) -library(tidyverse) - +source("Header_TT.R") +#require(data.table) +#require(arsenal) +#require(compare) +#library(tidyverse) source("plotting_data.R") # should return the following dfs, directories and variables @@ -53,28 +50,13 @@ cat(paste0("Variables imported:" , "\nAngstrom symbol:", angstroms_symbol)) # clear excess variable -rm(my_df, upos, dup_muts, my_df_u_lig) +rm(my_df, upos, dup_muts) #======================================================== - -#======================================================== -#%% variable assignment: input and output paths & filenames -drug = "pyrazinamide" -gene = "pncA" -gene_match = paste0(gene,"_p.") -cat(gene_match) - -#============= -# directories -#============= -datadir = paste0("~/git/Data") -indir = paste0(datadir, "/", drug, "/input") -outdir = paste0("~/git/Data", "/", drug, "/output") -plotdir = paste0("~/git/Data", "/", drug, "/output/plots") #=========== # input #=========== #in_file1: output of plotting_data.R - +# my_df_u # infile 2: gene associated meta data #in_filename_gene_metadata = paste0(tolower(gene), "_meta_data_with_AFandOR.csv") @@ -85,56 +67,22 @@ cat(paste0("Input infile 2:", infile_gene_metadata)) #=========== # output #=========== -# mutations with opposite effects -out_filename_opp_muts = paste0(tolower(gene), "_muts_opp_effects.csv") -outfile_opp_muts = paste0(outdir, "/", out_filename_opp_muts) +# other variables that you can write +# primarily called by other scripts for plotting +# PS combined: +# 1) merged_df2 +# 2) merged_df2_comp +# 3) merged_df3 +# 4) merged_df3_comp + +# LIG combined: +# 5) merged_df2_lig +# 6) merged_df2_comp_lig +# 7) merged_df3_lig +# 8) merged_df3_comp_lig #%%=============================================================== -table(my_df_u$duet_outcome); sum(table(my_df_u$duet_outcome) ) - -# spelling Correction 1: DUET incase American spelling needed! -#my_df_u$duet_outcome[my_df_u$duet_outcome=="Stabilising"] <- "Stabilizing" -#my_df_u$duet_outcome[my_df_u$duet_outcome=="Destabilising"] <- "Destabilizing" - - -# spelling Correction 2: Ligand incase American spelling needed! -table(my_df_u$ligand_outcome); sum(table(my_df_u$ligand_outcome) ) -#my_df_u$ligand_outcome[my_df_u$ligand_outcome=="Stabilising"] <- "Stabilizing" -#my_df_u$ligand_outcome[my_df_u$ligand_outcome=="Destabilising"] <- "Destabilizing" - - -# muts with opposing effects on protomer and ligand stability -table(my_df_u$duet_outcome != my_df_u$ligand_outcome) -changes = my_df_u[which(my_df_u$duet_outcome != my_df_u$ligand_outcome),] - -# sanity check: redundant, but uber cautious! -dl_i = which(my_df_u$duet_outcome != my_df_u$ligand_outcome) -ld_i = which(my_df_u$ligand_outcome != my_df_u$duet_outcome) - -cat("Identifying muts with opposite stability effects") -if(nrow(changes) == (table(my_df_u$duet_outcome != my_df_u$ligand_outcome)[[2]]) & identical(dl_i,ld_i)) { - cat("PASS: muts with opposite effects on stability and affinity correctly identified" - , "\nNo. of such muts: ", nrow(changes)) -}else { - cat("FAIL: unsuccessful in extracting muts with changed stability effects") -} - -#*************************** -# write file: changed muts -write.csv(changes, outfile_opp_muts) - -cat("Finished writing file for muts with opp effects:" - , "\nFilename: ", outfile_opp_muts - , "\nDim:", dim(changes)) - -# clear variables -rm(out_filename_opp_muts, outfile_opp_muts) -rm(changes, dl_i, ld_i) - -# count na in each column -na_count = sapply(my_df_u, function(y) sum(length(which(is.na(y))))); na_count -df_ncols = ncol(my_df_u) ########################### # 2: Read file: _meta data.csv @@ -146,8 +94,6 @@ gene_metadata <- read.csv(infile_gene_metadata , header = T) cat("Dim:", dim(gene_metadata)) -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -# FIXME: remove # counting NAs in AF, OR cols: if (identical(sum(is.na(my_df_u$or_mychisq)) , sum(is.na(my_df_u$pval_fisher)) @@ -176,31 +122,24 @@ if (identical(sum(is.na(my_df_u$or_kin)) , "\nNA in AF:", sum(is.na(my_df_u$af_kin))) } -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -# clear variables -rm(in_filename_gene_metadata, infile_gene_metadata) - str(gene_metadata) +################################################################### +# combining: PS +################################################################### # sort by position (same as my_df) -# earlier it was mutationinformation -#head(gene_metadata$mutationinformation) -#gene_metadata = gene_metadata[order(gene_metadata$mutationinformation),] -##head(gene_metadata$mutationinformation) - head(gene_metadata$position) gene_metadata = gene_metadata[order(gene_metadata$position),] head(gene_metadata$position) -########################### -# Merge 1: two dfs with NA -# merged_df2 -########################### +#========================= +# Merge 1: merged_df2 +# dfs with NAs in ORs +#========================= head(my_df_u$mutationinformation) head(gene_metadata$mutationinformation) # Find common columns b/w two df -# FIXME: mutation has empty cell for some muts merging_cols = intersect(colnames(my_df_u), colnames(gene_metadata)) cat(paste0("Merging dfs with NAs: big df (1-many relationship b/w id & mut)" @@ -214,9 +153,6 @@ table(nchar(my_df_u$wild_type)) table(nchar(my_df_u$mutant_type)) table(nchar(my_df_u$position)) -#============= -# merged_df2: gene_metadata + my_df -#============== # all.y because x might contain non-structural positions! merged_df2 = merge(x = gene_metadata , y = my_df_u @@ -226,9 +162,7 @@ merged_df2 = merge(x = gene_metadata cat("Dim of merged_df2: ", dim(merged_df2)) head(merged_df2$position) -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -# FIXME: count how many unique muts have entries in meta data -# should PASS +# sanity check cat("Checking nrows in merged_df2") if(nrow(gene_metadata) == nrow(merged_df2)){ cat("PASS: nrow(merged_df2) = nrow (gene associated gene_metadata)" @@ -243,47 +177,23 @@ if(nrow(gene_metadata) == nrow(merged_df2)){ meta_muts_u = unique(gene_metadata$mutationinformation) # find the index where it differs unique(meta_muts_u[! meta_muts_u %in% merged_muts_u]) + quit() } -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -# sort by position -head(merged_df2$position) -merged_df2 = merged_df2[order(merged_df2$position),] -head(merged_df2$position) - -merged_df2v3 = merge(x = gene_metadata - , y = my_df_u - , by = merging_cols - , all = T) - -merged_df2v2 = merge(x = gene_metadata - , y = my_df_u - , by = merging_cols - , all.x = T) -#!=!=!=!=!=!=!=! -#identical(merged_df2, merged_df2v2) - -nrow(merged_df2[merged_df2$position==186,]) -#!=!=!=!=!=!=!=! - -# should be False -identical(merged_df2, merged_df2v2) -table(merged_df2$position%in%merged_df2v2$position) - -#!!!!!!!!!!! check why these differ - -######### -# merge 3b (merged_df3):remove duplicated mutations +#========================= +# Merge 2: merged_df3 +# dfs with NAs in ORs +# +# Cannot trust lineage, country from this df as the same mutation +# can have many different lineages +# but this should be good for the numerical corr plots +#========================= +# remove duplicated mutations cat("Merging dfs without NAs: small df (removing muts with no AF|OR associated)" ,"\nCannot trust lineage info from this" ,"\nlinking col: mutationinforamtion" ,"\nfilename: merged_df3") -#==#=#=#=#=#=# -# Cannot trust lineage, country from this df as the same mutation -# can have many different lineages -# but this should be good for the numerical corr plots -#=#=#=#=#=#=#= merged_df3 = merged_df2[!duplicated(merged_df2$mutationinformation),] head(merged_df3$position); tail(merged_df3$position) # should be sorted @@ -326,12 +236,10 @@ if ( identical( which(is.na(merged_df2$or_mychisq)), which(is.na(merged_df2$or_k quit() } -########################### -# 4: merging two dfs: without NA -########################### -######### -# merge 4a (merged_df2_comp): same as merge 1 but excluding NA -######### +#========================= +# Merge3: merged_df2_comp +# same as merge 1 but excluding NAs from ORs, etc. +#========================= cat("Merging dfs without any NAs: big df (1-many relationship b/w id & mut)" ,"\nlinking col: Mutationinforamtion" ,"\nfilename: merged_df2_comp") @@ -357,9 +265,12 @@ if ( identical( which(is.na(merged_df2$af)), which(is.na(merged_df2$af_kin))) ){ print('Index mismatch for mychisq and kin ors. Aborting NA ommission') } -######### -# merge 4b (merged_df3_comp): remove duplicate mutation information -######### +#========================= +# Merge4: merged_df3_comp +# same as merge 2 but excluding NAs from ORs, etc or +# remove duplicate mutation information +#========================= + if ( identical( which(is.na(merged_df3$af)), which(is.na(merged_df3$af_kin))) ){ print('mychisq and kin ors missing indices match. Procedding with omitting NAs') na_count_df3 = sum(is.na(merged_df3$af)) @@ -388,7 +299,6 @@ bar = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),] all.equal(foo, bar) #summary(comparedf(foo, bar)) -#=============== end of combining df #============================================================== ################# # OPTIONAL: write ALL 4 output files @@ -416,7 +326,32 @@ all.equal(foo, bar) # clear variables rm(foo, bar, gene_metadata , in_filename_params, infile_params, merging_cols + , in_filename_gene_metadata, infile_gene_metadata , merged_df2v2, merged_df2v3) +#************************* +##################################################################### +# Combining: LIG +##################################################################### -#============================= end of script +#========================= +# Merges 5-8 +#========================= +merged_df2_lig = merged_df2[merged_df2$ligand_distance<10,] +merged_df2_comp_lig = merged_df2_comp[merged_df2_comp$ligand_distance<10,] + +merged_df3_lig = merged_df3[merged_df3$ligand_distance<10,] +merged_df3_comp_lig = merged_df3_comp[merged_df3_comp$ligand_distance<10,] + +# sanity check +if (nrow(merged_df3_lig) == nrow(my_df_u_lig)){ + print("PASS: verified merged_df3_lig") +}else{ + cat(paste0('FAIL: nrow mismatch for merged_df3_lig' + , "\nExpected:", nrow(my_df_u_lig) + , "\nGot:", nrow(merged_df3_lig))) +} + +#========================================================================== +# end of script +##========================================================================== \ No newline at end of file diff --git a/scripts/plotting/combining_two_df_FIXME.R b/scripts/plotting/combining_two_df_FIXME.R deleted file mode 100755 index 0d34e06..0000000 --- a/scripts/plotting/combining_two_df_FIXME.R +++ /dev/null @@ -1,442 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/scripts/plotting/") -getwd() - -######################################################### -# TASK: To combine struct params and meta data for plotting -# Input csv files: -# 1) _all_params.csv -# 2) _meta_data.csv - -# Output: -# 1) muts with opposite effects on stability -# 2) large combined df including NAs for AF, OR,etc -# Dim: same no. of rows as gene associated meta_data_with_AFandOR -# 3) small combined df including NAs for AF, OR, etc. -# Dim: same as mcsm data -# 4) large combined df excluding NAs -# Dim: dim(#1) - no. of NAs(AF|OR) + 1 -# 5) small combined df excluding NAs -# Dim: dim(#2) - no. of unique NAs - 1 -# This script is sourced from other .R scripts for plotting -######################################################### - -########################################################## -# Installing and loading required packages -########################################################## -source("Header_TT.R") -#require(data.table) -#require(arsenal) -#require(compare) -#library(tidyverse) - - -#%% variable assignment: input and output paths & filenames -drug = "pyrazinamide" -gene = "pncA" -gene_match = paste0(gene,"_p.") -cat(gene_match) - -#============= -# directories -#============= -datadir = paste0("~/git/Data") -indir = paste0(datadir, "/", drug, "/input") -outdir = paste0("~/git/Data", "/", drug, "/output") - -#=========== -# input -#=========== -#in_filename = "mcsm_complex1_normalised.csv" -in_filename_params = paste0(tolower(gene), "_all_params.csv") -infile_params = paste0(outdir, "/", in_filename_params) -cat(paste0("Input file 1:", infile_params) ) - -# infile 2: gene associated meta data -#in_filename_gene_metadata = paste0(tolower(gene), "_meta_data_with_AFandOR.csv") -in_filename_gene_metadata = paste0(tolower(gene), "_metadata.csv") -infile_gene_metadata = paste0(outdir, "/", in_filename_gene_metadata) -cat(paste0("Input infile 2:", infile_gene_metadata)) - -#=========== -# output -#=========== -# mutations with opposite effects -out_filename_opp_muts = paste0(tolower(gene), "_muts_opp_effects.csv") -outfile_opp_muts = paste0(outdir, "/", out_filename_opp_muts) - - -#%%=============================================================== -########################### -# Read file: struct params -########################### -cat("Reading struct params including mcsm:" - , in_filename_params) - -mcsm_data = read.csv(infile_params - #, row.names = 1 - , stringsAsFactors = F - , header = T) - -cat("Input dimensions:", dim(mcsm_data)) #416, 86 - -# clear variables -rm(in_filename_params, infile_params) - -str(mcsm_data) - -table(mcsm_data$duet_outcome); sum(table(mcsm_data$duet_outcome) ) - -# spelling Correction 1: DUET incase American spelling needed! -#mcsm_data$duet_outcome[mcsm_data$duet_outcome=="Stabilising"] <- "Stabilizing" -#mcsm_data$duet_outcome[mcsm_data$duet_outcome=="Destabilising"] <- "Destabilizing" - -# checks: should be the same as above -table(mcsm_data$duet_outcome); sum(table(mcsm_data$duet_outcome) ) -head(mcsm_data$duet_outcome); tail(mcsm_data$duet_outcome) - -# spelling Correction 2: Ligand incase American spelling needed! -table(mcsm_data$ligand_outcome); sum(table(mcsm_data$ligand_outcome) ) -#mcsm_data$ligand_outcome[mcsm_data$ligand_outcome=="Stabilising"] <- "Stabilizing" -#mcsm_data$ligand_outcome[mcsm_data$ligand_outcome=="Destabilising"] <- "Destabilizing" - -# checks: should be the same as above -table(mcsm_data$ligand_outcome); sum(table(mcsm_data$ligand_outcome) ) -head(mcsm_data$ligand_outcome); tail(mcsm_data$ligand_outcome) - -# muts with opposing effects on protomer and ligand stability -table(mcsm_data$duet_outcome != mcsm_data$ligand_outcome) -changes = mcsm_data[which(mcsm_data$duet_outcome != mcsm_data$ligand_outcome),] - -# sanity check: redundant, but uber cautious! -dl_i = which(mcsm_data$duet_outcome != mcsm_data$ligand_outcome) -ld_i = which(mcsm_data$ligand_outcome != mcsm_data$duet_outcome) - -cat("Identifying muts with opposite stability effects") -if(nrow(changes) == (table(mcsm_data$duet_outcome != mcsm_data$ligand_outcome)[[2]]) & identical(dl_i,ld_i)) { - cat("PASS: muts with opposite effects on stability and affinity correctly identified" - , "\nNo. of such muts: ", nrow(changes)) -}else { - cat("FAIL: unsuccessful in extracting muts with changed stability effects") -} - -#*************************** -# write file: changed muts -write.csv(changes, outfile_opp_muts) - -cat("Finished writing file for muts with opp effects:" - , "\nFilename: ", outfile_opp_muts - , "\nDim:", dim(changes)) - -# clear variables -rm(out_filename_opp_muts, outfile_opp_muts) -rm(changes, dl_i, ld_i) - -#*************************** -# count na in each column -na_count = sapply(mcsm_data, function(y) sum(length(which(is.na(y))))); na_count - -# sort by mutationinformation -##mcsm_data = mcsm_data[order(mcsm_data$mutationinformation),] -##head(mcsm_data$mutationinformation) - -df_ncols = ncol(mcsm_data) - -# REMOVE as this is dangerous due to dup muts -# get freq count of positions and add to the df -#setDT(mcsm_data)[, occurrence := .N, by = .(position)] - -#cat("Added 1 col: position frequency to see which posn has how many muts" -# , "\nNo. of cols now", ncol(mcsm_data) -# , "\nNo. of cols before: ", df_ncols) - -#pos_count_check = data.frame(mcsm_data$position, mcsm_data$occurrence) - -# check duplicate muts -if (length(unique(mcsm_data$mutationinformation)) == length(mcsm_data$mutationinformation)){ - cat("No duplicate mutations in mcsm data") -}else{ - dup_muts = mcsm_data[duplicated(mcsm_data$mutationinformation),] - dup_muts_nu = length(unique(dup_muts$mutationinformation)) - cat(paste0("CAUTION:", nrow(dup_muts), " Duplicate mutations identified" - , "\nOf these, no. of unique mutations are:", dup_muts_nu - , "\nExtracting df with unique mutations only")) - mcsm_data_u = mcsm_data[!duplicated(mcsm_data$mutationinformation),] -} - -if (nrow(mcsm_data_u) == length(unique(mcsm_data$mutationinformation))){ - cat("Df without duplicate mutations successfully extracted") -} else{ - cat("FAIL: could not extract clean df!") - quit() -} - -########################### -# 2: Read file: _meta data.csv -########################### -cat("Reading meta data file:", infile_gene_metadata) - -gene_metadata <- read.csv(infile_gene_metadata - , stringsAsFactors = F - , header = T) -cat("Dim:", dim(gene_metadata)) - -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -# FIXME: remove -# counting NAs in AF, OR cols: -if (identical(sum(is.na(gene_metadata$OR)) - , sum(is.na(gene_metadata$pvalue)) - , sum(is.na(gene_metadata$AF)))){ - cat("PASS: NA count match for OR, pvalue and AF\n") - na_count = sum(is.na(gene_metadata$AF)) - cat("No. of NAs: ", sum(is.na(gene_metadata$OR))) -} else{ - cat("FAIL: NA count mismatch" - , "\nNA in OR: ", sum(is.na(gene_metadata$OR)) - , "\nNA in pvalue: ", sum(is.na(gene_metadata$pvalue)) - , "\nNA in AF:", sum(is.na(gene_metadata$AF))) -} -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -# clear variables -rm(in_filename_gene_metadata, infile_gene_metadata) - -str(gene_metadata) - -# sort by position (same as mcsm_data) -# earlier it was mutationinformation -#head(gene_metadata$mutationinformation) -#gene_metadata = gene_metadata[order(gene_metadata$mutationinformation),] -##head(gene_metadata$mutationinformation) - -head(gene_metadata$position) -gene_metadata = gene_metadata[order(gene_metadata$position),] -head(gene_metadata$position) - -########################### -# Merge 1: two dfs with NA -# merged_df2 -########################### -head(mcsm_data$mutationinformation) -head(gene_metadata$mutationinformation) - -# Find common columns b/w two df -merging_cols = intersect(colnames(mcsm_data), colnames(gene_metadata)) - -cat(paste0("Merging dfs with NAs: big df (1-many relationship b/w id & mut)" - , "\nNo. of merging cols:", length(merging_cols) - , "\nMerging columns identified:")) -print(merging_cols) - -#============= -# merged_df2): gene_metadata + mcsm_data -#============== -merged_df2 = merge(x = gene_metadata - , y = mcsm_data - , by = merging_cols - , all.y = T) - -cat("Dim of merged_df2: ", dim(merged_df2) #4520, 11 - ) -head(merged_df2$position) - -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -# FIXME: count how many unique muts have entries in meta data -# sanity check -cat("Checking nrows in merged_df2") -if(nrow(gene_metadata) == nrow(merged_df2)){ - cat("nrow(merged_df2) = nrow (gene associated gene_metadata)" - ,"\nExpected no. of rows: ",nrow(gene_metadata) - ,"\nGot no. of rows: ", nrow(merged_df2)) -} else{ - cat("nrow(merged_df2)!= nrow(gene associated gene_metadata)" - , "\nExpected no. of rows after merge: ", nrow(gene_metadata) - , "\nGot no. of rows: ", nrow(merged_df2) - , "\nFinding discrepancy") - merged_muts_u = unique(merged_df2$mutationinformation) - meta_muts_u = unique(gene_metadata$mutationinformation) - # find the index where it differs - unique(meta_muts_u[! meta_muts_u %in% merged_muts_u]) -} - -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - -# sort by position -head(merged_df2$position) -merged_df2 = merged_df2[order(merged_df2$position),] -head(merged_df2$position) - -merged_df2v3 = merge(x = gene_metadata - , y = mcsm_data - , by = merging_cols - , all = T) - -merged_df2v2 = merge(x = gene_metadata - , y = mcsm_data - , by = merging_cols - , all.x = T) -#!=!=!=!=!=!=!=! -# COMMENT: used all.y since position 186 is not part of the struc, -# hence doesn"t have a mcsm value -# but 186 is associated with mutation -#!=!=!=!=!=!=!=! - -# should be False -identical(merged_df2, merged_df2v2) -table(merged_df2$position%in%merged_df2v2$position) - -rm(merged_df2v2) - -#!!!!!!!!!!! check why these differ - -######### -# merge 3b (merged_df3):remove duplicate mutation information -######### -cat("Merging dfs without NAs: small df (removing muts with no AF|OR associated)" - ,"\nCannot trust lineage info from this" - ,"\nlinking col: Mutationinforamtion" - ,"\nfilename: merged_df3") - -#==#=#=#=#=#=# -# Cannot trust lineage, country from this df as the same mutation -# can have many different lineages -# but this should be good for the numerical corr plots -#=#=#=#=#=#=#= -merged_df3 = merged_df2[!duplicated(merged_df2$mutationinformation),] -head(merged_df3$position); tail(merged_df3$position) # should be sorted - -# sanity check -cat("Checking nrows in merged_df3") -if(nrow(mcsm_data) == nrow(merged_df3)){ - cat("PASS: No. of rows match with mcsm_data" - ,"\nExpected no. of rows: ", nrow(mcsm_data) - ,"\nGot no. of rows: ", nrow(merged_df3)) -} else { - cat("FAIL: No. of rows mismatch" - , "\nNo. of rows mcsm_data: ", nrow(mcsm_data) - , "\nNo. of rows merged_df3: ", nrow(merged_df3)) -} - -# counting NAs in AF, OR cols in merged_df3 -# this is becuase mcsm has no AF, OR cols, -# so you cannot count NAs -if (identical(sum(is.na(merged_df3$OR)) - , sum(is.na(merged_df3$pvalue)) - , sum(is.na(merged_df3$AF)))){ - cat("PASS: NA count match for OR, pvalue and AF\n") - na_count_df3 = sum(is.na(merged_df3$AF)) - cat("No. of NAs: ", sum(is.na(merged_df3$OR))) -} else{ - cat("FAIL: NA count mismatch" - , "\nNA in OR: ", sum(is.na(merged_df3$OR)) - , "\nNA in pvalue: ", sum(is.na(merged_df3$pvalue)) - , "\nNA in AF:", sum(is.na(merged_df3$AF))) -} - -########################### -# 4: merging two dfs: without NA -########################### -######### -# merge 4a (merged_df2_comp): same as merge 1 but excluding NA -######### -cat("Merging dfs without any NAs: big df (1-many relationship b/w id & mut)" - ,"\nlinking col: Mutationinforamtion" - ,"\nfilename: merged_df2_comp") - -merged_df2_comp = merged_df2[!is.na(merged_df2$AF),] -#merged_df2_comp = merged_df2[!duplicated(merged_df2$mutationinformation),] - -# sanity check -cat("Checking nrows in merged_df2_comp") -if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count + 1)){ - cat("PASS: No. of rows match" - ,"\nDim of merged_df2_comp: " - ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count + 1 - , "\nNo. of rows: ", nrow(merged_df2_comp) - , "\nNo. of cols: ", ncol(merged_df2_comp)) -}else{ - cat("FAIL: No. of rows mismatch" - ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count + 1 - ,"\nGot no. of rows: ", nrow(merged_df2_comp)) -} - -######### -# merge 4b (merged_df3_comp): remove duplicate mutation information -######### -merged_df3_comp = merged_df2_comp[!duplicated(merged_df2_comp$mutationinformation),] - -cat("Dim of merged_df3_comp: " - , "\nNo. of rows: ", nrow(merged_df3_comp) - , "\nNo. of cols: ", ncol(merged_df3_comp)) - -# alternate way of deriving merged_df3_comp -foo = merged_df3[!is.na(merged_df3$AF),] -# compare dfs: foo and merged_df3_com -all.equal(foo, merged_df3) - -summary(comparedf(foo, merged_df3)) - -# sanity check -cat("Checking nrows in merged_df3_comp") -if(nrow(merged_df3_comp) == nrow(merged_df3)){ - cat("NO NAs detected in merged_df3 in AF|OR cols" - ,"\nNo. of rows are identical: ", nrow(merged_df3)) -} else{ - if(nrow(merged_df3_comp) == nrow(merged_df3) - na_count_df3) { - cat("PASS: NAs detected in merged_df3 in AF|OR cols" - , "\nNo. of NAs: ", na_count_df3 - , "\nExpected no. of rows in merged_df3_comp: ", nrow(merged_df3) - na_count_df3 - , "\nGot no. of rows: ", nrow(merged_df3_comp)) - } -} - -#=============== end of combining df -#********************* -# writing 1 file in the style of a loop: merged_df3 -# print(output dir) -#i = "merged_df3" -#out_filename = paste0(i, ".csv") -#outfile = paste0(outdir, "/", out_filename) - -#cat("Writing output file: " -# ,"\nFilename: ", out_filename -# ,"\nPath: ", outdir) - -#template: write.csv(merged_df3, "merged_df3.csv") -#write.csv(get(i), outfile, row.names = FALSE) -#cat("Finished writing: ", outfile -# , "\nNo. of rows: ", nrow(get(i)) -# , "\nNo. of cols: ", ncol(get(i))) - -#%% write_output files; all 4 files: -outvars = c("merged_df2" - , "merged_df3" - , "merged_df2_comp" - , "merged_df3_comp") - -cat("Writing output files: " - , "\nPath:", outdir) - -for (i in outvars){ -# cat(i, "\n") - out_filename = paste0(i, ".csv") -# cat(out_filename, "\n") -# cat("getting value of variable: ", get(i)) - outfile = paste0(outdir, "/", out_filename) -# cat("Full output path: ", outfile, "\n") - cat("Writing output file:" - ,"\nFilename: ", out_filename,"\n") - write.csv(get(i), outfile, row.names = FALSE) - cat("Finished writing: ", outfile - , "\nNo. of rows: ", nrow(get(i)) - , "\nNo. of cols: ", ncol(get(i)), "\n") -} - -# alternate way to replace with implicit loop -# FIXME -#sapply(outvars, function(x, y) write.csv(get(outvars), paste0(outdir, "/", outvars, ".csv"))) -#************************* -# clear variables -rm(mcsm_data, gene_metadata, foo, drug, gene, gene_match, indir, merged_muts_u, meta_muts_u, na_count, df_ncols, outdir) -rm(pos_count_check) -#============================= end of script - diff --git a/scripts/plotting/lineage_basic_barplot.R b/scripts/plotting/lineage_basic_barplot.R index 1c91483..8b107bb 100644 --- a/scripts/plotting/lineage_basic_barplot.R +++ b/scripts/plotting/lineage_basic_barplot.R @@ -5,27 +5,30 @@ getwd() ######################################################### # TASK: Basic lineage barplot showing numbers -# Output: +# Output: Basic barplot with lineage samples and mut count ########################################################## # Installing and loading required packages ########################################################## source("Header_TT.R") require(data.table) -source("combining_two_df.R") - -#========================== +source("combining_dfs_plotting.R") # should return the following dfs, directories and variables -# df with NA: -# merged_df2 -# merged_df3 +# PS combined: +# 1) merged_df2 +# 2) merged_df2_comp +# 3) merged_df3 +# 4) merged_df3_comp -# df without NA: -# merged_df2_comp -# merged_df3_comp +# LIG combined: +# 5) merged_df2_lig +# 6) merged_df2_comp_lig +# 7) merged_df3_lig +# 8) merged_df3_comp_lig -# my_df_u +# 9) my_df_u +# 10) my_df_u_lig cat(paste0("Directories imported:" , "\ndatadir:", datadir @@ -38,13 +41,16 @@ cat(paste0("Variables imported:" , "\ngene:", gene , "\ngene_match:", gene_match , "\nAngstrom symbol:", angstroms_symbol - , "\nNo. of cols:", df_ncols , "\nNo. of duplicated muts:", dup_muts_nu , "\nNA count for ORs:", na_count , "\nNA count in df2:", na_count_df2 , "\nNA count in df3:", na_count_df3)) -#========================= +#=========== +# input +#=========== +# output of combining_dfs_plotting.R + #======= # output #======= @@ -82,15 +88,11 @@ is.factor(my_df$lineage) # fill = lineage #============================ table(my_df$lineage) - -#**************** -# Plot: Lineage Barplot -#**************** +as.data.frame(table(my_df$lineage)) #============= # Data for plots #============= - # REASSIGNMENT df <- my_df @@ -111,18 +113,7 @@ sel_lineages = c("lineage1" #, "lineage7" ) -df_lin = subset(df, subset = lineage %in% sel_lineages ) - -#FIXME; add sanity check for numbers. -# Done this manually - -############################################################ - -######### -# Data for barplot: Lineage barplot -# to show total samples and number of unique mutations -# within each linege -########## +df_lin = subset(df, subset = lineage %in% sel_lineages) # Create df with lineage inform & no. of unique mutations # per lineage and total samples within lineage @@ -193,7 +184,7 @@ printFile = g + geom_bar(stat = "identity" , axis.title.y = element_text(size = my_als , colour = 'black') , legend.position = "top" - , legend.text = element_text(size = my_als) + + , legend.text = element_text(size = my_als)) + #geom_text() + geom_label(aes(label = value) , size = 5 @@ -212,7 +203,7 @@ printFile = g + geom_bar(stat = "identity" , name='' , labels=c('Mutations', 'Total Samples')) + scale_x_discrete(breaks = c('lineage1', 'lineage2', 'lineage3', 'lineage4') - , labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4'))) + , labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')) print(printFile) dev.off() diff --git a/scripts/plotting/lineage_count.txt b/scripts/plotting/lineage_count.txt new file mode 100644 index 0000000..fe6a87d --- /dev/null +++ b/scripts/plotting/lineage_count.txt @@ -0,0 +1,71 @@ +#============= +# merged_df2 +#============= +---------------- +# no. of samples +---------------- + Var1 Freq +1 8 +2 lineage1 144 +3 lineage1;lineage2 3 +4 lineage1;lineage4 4 +5 lineage2 1886 +6 lineage2;lineage4 19 +7 lineage3 190 +8 lineage3;lineage4 11 +9 lineage4 2213 +10 lineage4;lineage6 1 +11 lineage4;lineage7 1 +12 lineage4;lineageBOV 1 +13 lineage5 31 +14 lineage6 9 +15 lineage7 3 +16 lineageBOV 392 + +---------------- +# no. of nsSNPs +---------------- + + sel_lineages num_snps_u total_samples +1 lineage1 74 144 +2 lineage2 277 1886 +3 lineage3 104 190 +4 lineage4 311 2213 +5 lineage5 18 31 +6 lineage6 8 9 +7 lineage7 1 3 + + +#============= +# merged_df2_comp +#============= +---------------- +# no. of samples +---------------- + + Var1 Freq +1 3 +2 lineage1 108 +3 lineage1;lineage2 2 +4 lineage1;lineage4 2 +5 lineage2 1497 +6 lineage2;lineage4 13 +7 lineage3 154 +8 lineage3;lineage4 3 +9 lineage4 1846 +10 lineage4;lineageBOV 1 +11 lineage5 12 +12 lineage6 2 +13 lineageBOV 36 + +---------------- +# no. of nsSNPs +---------------- + sel_lineages num_snps_u total_samples +1 lineage1 42 108 +2 lineage2 141 1497 +3 lineage3 75 154 +4 lineage4 148 1846 +5 lineage5 9 12 +6 lineage6 2 2 +7 lineage7 0 0 diff --git a/scripts/plotting/opp_mcsm_muts.R b/scripts/plotting/opp_mcsm_muts.R new file mode 100644 index 0000000..5b02f34 --- /dev/null +++ b/scripts/plotting/opp_mcsm_muts.R @@ -0,0 +1,95 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: To write muts with opposite effects on +# protomer and ligand stability +######################################################### +# working dir and loading libraries + +getwd() +setwd("~/git/LSHTM_analysis/scripts/plotting/") +getwd() + +source("plotting_data.R") + +# should return the following dfs, directories and variables +# my_df +# my_df_u +# my_df_u_lig +# dup_muts + +cat(paste0("Directories imported:" + , "\ndatadir:", datadir + , "\nindir:", indir + , "\noutdir:", outdir + , "\nplotdir:", plotdir)) + +cat(paste0("Variables imported:" + , "\ndrug:", drug + , "\ngene:", gene + , "\ngene_match:", gene_match + , "\nLength of upos:", length(upos) + , "\nAngstrom symbol:", angstroms_symbol)) + +# clear excess variable +rm(my_df, upos, dup_muts) +#======================================================== +#=========== +# input +#=========== +#in_file1: output of plotting_data.R +# my_df_u + +# output +#=========== +# mutations with opposite effects +out_filename_opp_muts = paste0(tolower(gene), "_muts_opp_effects.csv") +outfile_opp_muts = paste0(outdir, "/", out_filename_opp_muts) + +#%%=============================================================== + +# spelling Correction 1: DUET incase American spelling needed! +table(my_df_u$duet_outcome); sum(table(my_df_u$duet_outcome) ) +#my_df_u$duet_outcome[my_df_u$duet_outcome=="Stabilising"] <- "Stabilizing" +#my_df_u$duet_outcome[my_df_u$duet_outcome=="Destabilising"] <- "Destabilizing" + + +# spelling Correction 2: Ligand incase American spelling needed! +table(my_df_u$ligand_outcome); sum(table(my_df_u$ligand_outcome) ) +#my_df_u$ligand_outcome[my_df_u$ligand_outcome=="Stabilising"] <- "Stabilizing" +#my_df_u$ligand_outcome[my_df_u$ligand_outcome=="Destabilising"] <- "Destabilizing" + + +# muts with opposing effects on protomer and ligand stability +table(my_df_u$duet_outcome != my_df_u$ligand_outcome) +changes = my_df_u[which(my_df_u$duet_outcome != my_df_u$ligand_outcome),] + +# sanity check: redundant, but uber cautious! +dl_i = which(my_df_u$duet_outcome != my_df_u$ligand_outcome) +ld_i = which(my_df_u$ligand_outcome != my_df_u$duet_outcome) + +cat("Identifying muts with opposite stability effects") +if(nrow(changes) == (table(my_df_u$duet_outcome != my_df_u$ligand_outcome)[[2]]) & identical(dl_i,ld_i)) { + cat("PASS: muts with opposite effects on stability and affinity correctly identified" + , "\nNo. of such muts: ", nrow(changes)) +}else { + cat("FAIL: unsuccessful in extracting muts with changed stability effects") +} + +#========================== +# write file: changed muts +#========================== +write.csv(changes, outfile_opp_muts) + +cat("Finished writing file for muts with opp effects:" + , "\nFilename: ", outfile_opp_muts + , "\nDim:", dim(changes)) + +# clear variables +rm(out_filename_opp_muts, outfile_opp_muts) +rm(changes, dl_i, ld_i) + +# count na in each column +na_count = sapply(my_df_u, function(y) sum(length(which(is.na(y))))); na_count +df_ncols = ncol(my_df_u) + +#===================================== end of script