diff --git a/scripts/plotting/corr_PS_LIG.R b/scripts/plotting/corr_PS_LIG.R index c566ca9..6b1a708 100644 --- a/scripts/plotting/corr_PS_LIG.R +++ b/scripts/plotting/corr_PS_LIG.R @@ -248,6 +248,4 @@ pairs.panels(my_corr_lig[1:(length(my_corr_lig)-1)] dev.off() corr_lig_rho = corr.test(my_corr_lig[1:4], method = "spearman")$r corr_lig_p = corr.test(my_corr_lig[1:4], method = "spearman")$p -####################################################### - - +####################################################### \ No newline at end of file diff --git a/scripts/plotting/get_plotting_dfs.R b/scripts/plotting/get_plotting_dfs.R index 32a0fbc..893a929 100644 --- a/scripts/plotting/get_plotting_dfs.R +++ b/scripts/plotting/get_plotting_dfs.R @@ -110,9 +110,6 @@ cat("No. of rows in my_data:", nrow(logo_data) #======================================================================= #%% logo plots from dataframe -############# -# PLOTS -############# foo = logo_data[, c("position" , "mutant_type","duet_scaled", "or_mychisq" , "mut_prop_polarity", "mut_prop_water")] @@ -150,6 +147,76 @@ rownames(wide_df_logor_m) colnames(wide_df_logor_m) position_logor = as.numeric(colnames(wide_df_logor_m)) + +#=============================== +# logo data: multiple nsSNPs (>1) +#================================= +#require(data.table) + +# get freq count of positions so you can subset freq<1 +setDT(logo_data)[, mut_pos_occurrence := .N, by = .(position)] + +table(logo_data$position) +table(logo_data$mut_pos_occurrence) + +max_mut = max(table(logo_data$position)) + +# extract freq_pos > 1 +my_data_snp = logo_data[logo_data$mut_pos_occurrence!=1,] +u = unique(my_data_snp$position) +max_mult_mut = max(table(my_data_snp$position)) + +if (nrow(my_data_snp) == nrow(logo_data) - table(logo_data$mut_pos_occurrence)[[1]] ){ + + cat("PASS: positions with multiple muts extracted" + , "\nNo. of mutations:", nrow(my_data_snp) + , "\nNo. of positions:", length(u) + , "\nMax no. of muts at any position", max_mult_mut) +}else{ + cat("FAIL: positions with multiple muts could NOT be extracted" + , "\nExpected:",nrow(logo_data) - table(logo_data$mut_pos_occurrence)[[1]] + , "\nGot:", nrow(my_data_snp) ) +} + +cat("\nNo. of sites with only 1 mutations:", table(logo_data$mut_pos_occurrence)[[1]]) + +#-------------------------------------- +# matrix for_mychisq mutant type +# frequency of mutant type by position +#--------------------------------------- +table(my_data_snp$mutant_type, my_data_snp$position) +tab_mt = table(my_data_snp$mutant_type, my_data_snp$position) +class(tab_mt) + +# unclass to convert to matrix +tab_mt = unclass(tab_mt) +tab_mt = as.matrix(tab_mt, rownames = T) + +# should be TRUE +is.matrix(tab_mt) + +rownames(tab_mt) #aa +colnames(tab_mt) #pos + +#------------------------------------- +# matrix for wild type +# frequency of wild type by position +#------------------------------------- +tab_wt = table(my_data_snp$wild_type, my_data_snp$position); tab_wt +tab_wt = unclass(tab_wt) + +# remove wt duplicates +wt = my_data_snp[, c("position", "wild_type")] +wt = wt[!duplicated(wt),] + +tab_wt = table(wt$wild_type, wt$position); tab_wt # should all be 1 + +rownames(tab_wt) +rownames(tab_wt) + +identical(colnames(tab_mt), colnames(tab_wt)) +identical(ncol(tab_mt), ncol(tab_wt)) + ######################################################################## # End of script ######################################################################## \ No newline at end of file diff --git a/scripts/plotting/logo_combined.R b/scripts/plotting/logo_combined.R index 1d14da0..925d1ec 100755 --- a/scripts/plotting/logo_combined.R +++ b/scripts/plotting/logo_combined.R @@ -3,95 +3,6 @@ # TASK: producing logo-type plot showing # multiple muts per position coloured by aa property ######################################################### -#======================================================================= -# working dir and loading libraries -getwd() -setwd("~/git/LSHTM_analysis/scripts/plotting") -getwd() - -source("Header_TT.R") -source("../functions/plotting_globals.R") -source("../functions/plotting_data.R") -source("../functions/combining_dfs_plotting.R") -########################################################### -# command line args -#******************** -#drug = 'streptomycin' -#gene = 'gid' -#******************** -# !!!FUTURE TODO!!! -# Can pass additional params of output/plot dir by user. -# Not strictly required for my workflow since it is optimised -# to have a streamlined input/output flow without filename worries. -#******************** -spec = matrix(c( - "drug" , "d", 1, "character", - "gene" , "g", 1, "character", - "data_file1" , "fa", 2, "character", - "data_file2" , "fb", 2, "character" -), byrow = TRUE, ncol = 4) - -opt = getopt(spec) - -#FIXME: detect if script running from cmd, then set these -drug = opt$drug -gene = opt$gene -infile_params = opt$data_file1 -infile_metadata = opt$data_file2 - -if(is.null(drug)|is.null(gene)) { - stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)") -} - -#=========== -# input -#=========== -#--------------------- -# call: import_dirs() -#--------------------- -import_dirs(drug_name = drug, gene_name = gene) - -#--------------------------- -# call: plotting_data() -#--------------------------- -#if (!exists("infile_params") && exists("gene")){ -if (!is.character(infile_params) && exists("gene")){ # when running as cmd - #in_filename_params = paste0(tolower(gene), "_all_params.csv") - in_filename_params = paste0(tolower(gene), "_comb_afor.csv") # part combined for gid - infile_params = paste0(outdir, "/", in_filename_params) - cat("\nInput file for mcsm comb data not specified, assuming filename: ", infile_params, "\n") -} - -# Input 1: read _comb_afor.csv -cat("\nReading mcsm combined data file: ", infile_params) -mcsm_df = read.csv(infile_params, header = T) -pd_df = plotting_data(mcsm_df) -my_df_u = pd_df[[2]] - -#-------------------------------- -# call: combining_dfs_plotting() -#-------------------------------- -#if (!exists("infile_metadata") && exists("gene")){ -if (!is.character(infile_metadata) && exists("gene")){ # when running as cmd - in_filename_metadata = paste0(tolower(gene), "_metadata.csv") # part combined for gid - infile_metadata = paste0(outdir, "/", in_filename_metadata) - cat("\nInput file for gene metadata not specified, assuming filename: ", infile_metadata, "\n") -} - -# Input 2: read _meta data.csv -cat("\nReading meta data file: ", infile_metadata) - -gene_metadata <- read.csv(infile_metadata - , stringsAsFactors = F - , header = T) - -all_plot_dfs = combining_dfs_plotting(my_df_u - , gene_metadata - , lig_dist_colname = 'ligand_distance' - , lig_dist_cutoff = 10) - -merged_df3 = all_plot_dfs[[2]] - #=========== # output #=========== diff --git a/scripts/plotting/logo_multiple_muts.R b/scripts/plotting/logo_multiple_muts.R index 10cc496..4902c1c 100755 --- a/scripts/plotting/logo_multiple_muts.R +++ b/scripts/plotting/logo_multiple_muts.R @@ -1,29 +1,9 @@ -#!/usr/bin/env Rscript -######################################################### -# TASK: producing logo-type plot showing -# multiple muts per position coloured by aa property -######################################################### -#======================================================================= -# working dir and loading libraries +#!/usr/bin/env Rscript getwd() setwd("~/git/LSHTM_analysis/scripts/plotting") getwd() - source("Header_TT.R") -source("../functions/plotting_globals.R") -source("../functions/plotting_data.R") -source("../functions/combining_dfs_plotting.R") -########################################################### -# command line args -#******************** -#drug = 'streptomycin' -#gene = 'gid' -#******************** -# !!!FUTURE TODO!!! -# Can pass additional params of output/plot dir by user. -# Not strictly required for my workflow since it is optimised -# to have a streamlined input/output flow without filename worries. -#******************** + spec = matrix(c( "drug" , "d", 1, "character", "gene" , "g", 1, "character", @@ -33,7 +13,6 @@ spec = matrix(c( opt = getopt(spec) -#FIXME: detect if script running from cmd, then set these drug = opt$drug gene = opt$gene infile_params = opt$data_file1 @@ -42,131 +21,22 @@ infile_metadata = opt$data_file2 if(is.null(drug)|is.null(gene)) { stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)") } + #=========== -# input +# Input #=========== -#--------------------- -# call: import_dirs() -#--------------------- -import_dirs(drug_name = drug, gene_name = gene) -#--------------------------- -# call: plotting_data() -#--------------------------- -#if (!exists("infile_params") && exists("gene")){ -if (!is.character(infile_params) && exists("gene")){ # when running as cmd - #in_filename_params = paste0(tolower(gene), "_all_params.csv") - in_filename_params = paste0(tolower(gene), "_comb_afor.csv") # part combined for gid - infile_params = paste0(outdir, "/", in_filename_params) - cat("\nInput file for mcsm comb data not specified, assuming filename: ", infile_params, "\n") -} - -# Input 1: read _comb_afor.csv -cat("\nReading mcsm combined data file: ", infile_params) -mcsm_df = read.csv(infile_params, header = T) -pd_df = plotting_data(mcsm_df) -my_df_u = pd_df[[2]] # this forms one of the input for combining_dfs_plotting() - -#-------------------------------- -# call: combining_dfs_plotting() -#-------------------------------- -#if (!exists("infile_metadata") && exists("gene")){ -if (!is.character(infile_metadata) && exists("gene")){ # when running as cmd - in_filename_metadata = paste0(tolower(gene), "_metadata.csv") # part combined for gid - infile_metadata = paste0(outdir, "/", in_filename_metadata) - cat("\nInput file for gene metadata not specified, assuming filename: ", infile_metadata, "\n") -} - -# Input 2: read _meta data.csv -cat("\nReading meta data file: ", infile_metadata) - -gene_metadata <- read.csv(infile_metadata - , stringsAsFactors = F - , header = T) - -all_plot_dfs = combining_dfs_plotting(my_df_u - , gene_metadata - , lig_dist_colname = 'ligand_distance' - , lig_dist_cutoff = 10) - -merged_df3 = all_plot_dfs[[2]] +source("get_plotting_dfs.R") #=========== # output #=========== - logo_multiple_muts = "logo_multiple_muts.svg" plot_logo_multiple_muts = paste0(plotdir,"/", logo_multiple_muts) -########################################################################## - -#%%%%%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -my_df = merged_df3 -#%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -colnames(my_df) -str(my_df) - -#rownames(my_df) = my_df$mutation - -c1 = unique(my_df$position) -nrow(my_df) - -# get freq count of positions so you can subset freq<1 -#require(data.table) -setDT(my_df)[, mut_pos_occurrence := .N, by = .(position)] #189, 36 - -table(my_df$position) -table(my_df$mut_pos_occurrence) - -max_mut = max(table(my_df$position)) - -# extract freq_pos>1 -my_data_snp = my_df[my_df$mut_pos_occurrence!=1,] -u = unique(my_data_snp$position) -max_mult_mut = max(table(my_data_snp$position)) - -if (nrow(my_data_snp) == nrow(my_df) - table(my_df$mut_pos_occurrence)[[1]] ){ - - cat("PASS: positions with multiple muts extracted" - , "\nNo. of mutations:", nrow(my_data_snp) - , "\nNo. of positions:", length(u) - , "\nMax no. of muts at any position", max_mult_mut) -}else{ - cat("FAIL: positions with multiple muts could NOT be extracted" - , "\nExpected:",nrow(my_df) - table(my_df$mut_pos_occurrence)[[1]] - , "\nGot:", nrow(my_data_snp) ) -} - -cat("\nNo. of sites with only 1 mutations:", table(my_df$mut_pos_occurrence)[[1]]) - - -######################################################################## -# end of data extraction and cleaning for_mychisq plots # -######################################################################## - -#============== -# matrix for_mychisq mutant type -# frequency of mutant type by position -#============== -table(my_data_snp$mutant_type, my_data_snp$position) -tab_mt = table(my_data_snp$mutant_type, my_data_snp$position) -class(tab_mt) - -# unclass to convert to matrix -tab_mt = unclass(tab_mt) -tab_mt = as.matrix(tab_mt, rownames = T) - -#should be TRUE -is.matrix(tab_mt) - -rownames(tab_mt) #aa -colnames(tab_mt) #pos - -#************** +#********************* # Plot 1: mutant logo -#************** +#********************* p0 = ggseqlogo(tab_mt , method = 'custom' , seq_type = 'aa') + @@ -175,62 +45,43 @@ p0 = ggseqlogo(tab_mt theme(text=element_text(family="FreeSans"))+ theme_logo()+ scale_x_continuous(breaks = 1:ncol(tab_mt) - , labels = colnames(tab_mt))+ + , labels = colnames(tab_mt))+ scale_y_continuous( breaks = 1:max_mult_mut , limits = c(0, max_mult_mut)) -#p0 +p0 +cat('\nDone: p0') -cat('p0 done\n') # further customisation -p1 = p0 + theme(legend.position = "none" - , legend.title = element_blank() - , legend.text = element_text(size = 20) - , axis.text.x = element_text(size = 17, angle = 90) - , axis.text.y = element_blank()) -#p1 -cat('p0+p1 done\n') -#============== -# matrix for wild type -# frequency of wild type by position -#============== -tab_wt = table(my_data_snp$wild_type, my_data_snp$position); tab_wt -tab_wt = unclass(tab_wt) +mut_logo_p = p0 + theme(legend.position = "none" + , legend.title = element_blank() + , legend.text = element_text(size = 20) + , axis.text.x = element_text(size = 17, angle = 90) + , axis.text.y = element_blank()) +#mut_logo_p +cat('\nDone: p0+mut_logo_p') -#remove wt duplicates -wt = my_data_snp[, c("position", "wild_type")] -wt = wt[!duplicated(wt),] - -tab_wt = table(wt$wild_type, wt$position); tab_wt # should all be 1 -rownames(tab_wt) -rownames(tab_wt) - -#************** +#*********************** # Plot 2: wild_type logo -#************** -# sanity check: MUST BE TRUE - -identical(colnames(tab_mt), colnames(tab_wt)) -identical(ncol(tab_mt), ncol(tab_wt)) - +#*********************** p2 = ggseqlogo(tab_wt , method = 'custom' , seq_type = 'aa' #, col_scheme = "taylor" #, col_scheme = chemistry2 - ) + +) + #ylab('my custom height') + theme(text=element_text(family="FreeSans"))+ theme(axis.text.x = element_blank() , axis.text.y = element_blank()) + theme_logo() + - scale_x_continuous(breaks = 1:ncol(tab_wt) - , labels = colnames(tab_wt)) + scale_x_continuous(breaks = 1:ncol(tab_wt) + , labels = colnames(tab_wt)) #p2 +cat('\nDone: p2 done') # further customise -cat('p2 done\n') -p3 = p2 + +wt_logo_p = p2 + theme(legend.position = "bottom" #, legend.title = element_blank() , legend.title = element_text("Amino acid properties", size = 20) @@ -241,27 +92,27 @@ p3 = p2 + labs(x= "Position") -#p3 +#wt_logo_p +cat('\nDone: wt_logo_p') -# Now combine using cowplot, which ensures the plots are aligned -suppressMessages( require(cowplot) ) +#------------------------------------ +# Now combine using cowplot +# which ensures the plots are aligned +#------------------------------------ +#suppressMessages( require(cowplot) ) +cat('\nDone: wt_logo_p') -cat('p3 done\n') - -#plot_grid(p1, p3, ncol = 1, align = 'v') #+ -cat('p3+p2 done\n') - -#colour scheme -#https://rdrr.io/cran/ggseqlogo/src/R/col_schemes.r - -cat("Output plot:", plot_logo_multiple_muts, "\n") +#plot_grid(p1, p3, ncol = 1, align = 'v') +cat('\nDone: mut_logo_p + wt_logo_p') +# colour scheme: https://rdrr.io/cran/ggseqlogo/src/R/col_schemes.r +cat("\nOutput plot:", plot_logo_multiple_muts, "\n") svg(plot_logo_multiple_muts, width = 32, height = 10) -OutPlot1 = cowplot::plot_grid(p1, p3 - , nrow = 2 - , align = "v" - , rel_heights = c(3/4, 1/4)) +mult_muts_combined = cowplot::plot_grid(mut_logo_p, wt_logo_p + , nrow = 2 + , align = "v" + , rel_heights = c(3/4, 1/4)) -print(OutPlot1) -#dev.off() +print(mult_muts_combined) +#dev.off() \ No newline at end of file diff --git a/scripts/plotting/logo_plot.R b/scripts/plotting/logo_plot.R index 7c2d66f..f9a39fe 100755 --- a/scripts/plotting/logo_plot.R +++ b/scripts/plotting/logo_plot.R @@ -77,7 +77,7 @@ dev.off() cat("Logo plot with log10 OR as y axis:", plot_logo_logOR) svg(plot_logo_logOR, width = 30 , height = 6) -ggseqlogo(wide_df_logor_m +logo_logOR = ggseqlogo(wide_df_logor_m , method = "custom" , seq_type="aa") + ylab("my custom height") + theme(legend.position = "bottom" @@ -97,7 +97,10 @@ ggseqlogo(wide_df_logor_m , limits = factor(1:length(position_logor)))+ ylab("Log (Odds Ratio)") + scale_y_continuous(limits = c(0, 9)) + +print(logo_logOR) dev.off() + ######################################################################= # End of script ######################################################################= diff --git a/scripts/plotting/running_plotting_scripts.txt b/scripts/plotting/running_plotting_scripts.txt index 8e380f3..6d87259 100644 --- a/scripts/plotting/running_plotting_scripts.txt +++ b/scripts/plotting/running_plotting_scripts.txt @@ -29,9 +29,7 @@ sources: ./logo_plot.R -d streptomycin -g gid sources: - ## plotting_globals.R (previously dir.R) - ## plotting_data.R - ## combining_dfs_plotting.R + ## get_plotting_dfs.R outputs: plotdir ## 1 svg of OR for all positions @@ -46,9 +44,7 @@ sources: ./logo_multiple_muts.R -d streptomycin -g gid sources: - ## plotting_globals.R (previously dir.R) - ## plotting_data.R - ## combining_dfs_plotting.R + ## get_plotting_dfs.R outputs: plotdir ## 1 svg of multiple muts for all positions > 1 muts (mutations + wt) @@ -63,9 +59,7 @@ sources: #======================== sources: - ## plotting_globals.R (previously dir.R) - ## plotting_data.R - ## combining_dfs_plotting.R + ## get_plotting_dfs.R outputs: plotdir ## 1 svg combined of OR logo plot + mulitple_muts i.e output from