diff --git a/scripts/functions/combining_dfs_plotting.R b/scripts/functions/combining_dfs_plotting.R index 7d517f8..18e0374 100644 --- a/scripts/functions/combining_dfs_plotting.R +++ b/scripts/functions/combining_dfs_plotting.R @@ -2,11 +2,12 @@ ########################################################### # TASK: To combine mcsm combined file and meta data. -# This script is sourced from other .R scripts for plotting. +# This script is sourced by other .R scripts for plotting. ########################################################### - # load libraries and functions +#source("Header_TT.R") + #========================================================== # combining_dfs_plotting(): @@ -34,28 +35,7 @@ combining_dfs_plotting <- function( my_df_u , gene_metadata , lig_dist_colname = 'ligand_distance' , lig_dist_cutoff = 10){ - # #====================================== - # # 1: Read file: _meta data.csv - # #====================================== - # cat("\nReading meta data file:", df1_mcsm_comb) - # - # my_df_u <- read.csv(df1_mcsm_comb - # , stringsAsFactors = F - # , header = T) - # cat("\nDim:", dim(my_df_u)) - # - # #====================================== - # # 2: Read file: _meta data.csv - # #====================================== - # cat("\nReading meta data file:", df2_gene_metadata) - # - # gene_metadata <- read.csv(df2_gene_metadata - # , stringsAsFactors = F - # , header = T) - # cat("\nDim:", dim(gene_metadata)) - # - # table(gene_metadata$mutation_info) - + # counting NAs in AF, OR cols # or_mychisq if (identical(sum(is.na(my_df_u$or_mychisq)) @@ -219,7 +199,6 @@ combining_dfs_plotting <- function( my_df_u , "\nNA in AF:", sum(is.na(merged_df3$af_kin))) } - #=================================================== # Merge3: merged_df2_comp # same as merge 1 but excluding NAs from ORs, etc. @@ -273,6 +252,13 @@ combining_dfs_plotting <- function( my_df_u # compare dfs: foo and merged_df3_com all.equal(foo, bar) #summary(comparedf(foo, bar)) + cat("\n------------------------" + , "\nSummary of created dfs:" + , "\n------------------------" + , "\n1) Dim of merged_df2: " , nrow(merged_df2), "," , ncol(merged_df2) + , "\n2) Dim of merged_df2_comp: " , nrow(merged_df2_comp), "," , ncol(merged_df2_comp) + , "\n3) Dim of merged_df3: " , nrow(merged_df3), "," , ncol(merged_df3) + , "\n4) Dim of merged_df3_comp: " , nrow(merged_df3_comp), "," , ncol(merged_df3_comp)) ##################################################################### # Combining: LIG @@ -281,13 +267,35 @@ combining_dfs_plotting <- function( my_df_u #============ # Merges 5-8 #============ - df_lig = my_df_u[my_df_u[[lig_dist_colname]]_meta data.csv +#====================================== if (!exists("infile_metadata") && exists("gene")){ #if (!is.character(infile_params) && exists("gene")){{ in_filename_metadata = paste0(tolower(gene), "_metadata.csv") # part combined for gid @@ -56,18 +69,6 @@ if (!exists("infile_metadata") && exists("gene")){ cat("\nInput file for gene metadata not specified, assuming filename: ", infile_metadata, "\n") } -#============================ -# Input 1: plotting_data() -#============================ -#--------------------- -# call: plotting_data() -#--------------------- -pd_df = plotting_data(infile_params) -my_df = pd_df[[1]] # this forms one of the input for combining_dfs_plotting() - -#====================================== -# Input 2: read _meta data.csv -#====================================== cat("\nReading meta data file:", infile_metadata) gene_metadata <- read.csv(infile_metadata @@ -79,9 +80,11 @@ all_plot_dfs = combining_dfs_plotting(my_df_u , lig_dist_colname = 'ligand_distance' , lig_dist_cutoff = 10) -merged_df2 = all_plot_dfs[[1]] -merged_df3 = all_plot_dfs[[2]] -merged_df2_comp = all_plot_dfs[[3]] -merged_df3_comp = all_plot_dfs[[4]] -merged_df2_lig = all_plot_dfs[[5]] -merged_df3_lig = all_plot_dfs[[6]] +merged_df2 = all_plot_dfs[[1]] +merged_df3 = all_plot_dfs[[2]] +merged_df2_comp = all_plot_dfs[[3]] +merged_df3_comp = all_plot_dfs[[4]] +merged_df2_lig = all_plot_dfs[[5]] +merged_df3_lig = all_plot_dfs[[6]] +merged_df2_comp_lig = all_plot_dfs[[7]] +merged_df3_comp_lig = all_plot_dfs[[8]] \ No newline at end of file diff --git a/scripts/plotting/Header_TT.R b/scripts/plotting/Header_TT.R index 66b9a12..bd21ed8 100755 --- a/scripts/plotting/Header_TT.R +++ b/scripts/plotting/Header_TT.R @@ -7,6 +7,7 @@ # install.packages("gplots", dependencies = TRUE) # library(gplots) #} +require("getopt", quietly = TRUE) # cmd parse arguments if (!require("tidyverse")) { install.packages("tidyverse", dependencies = TRUE) diff --git a/scripts/plotting/logo_combined.R b/scripts/plotting/logo_combined.R index 24df20e..4bc2a3f 100644 --- a/scripts/plotting/logo_combined.R +++ b/scripts/plotting/logo_combined.R @@ -10,15 +10,66 @@ setwd("~/git/LSHTM_analysis/scripts/plotting") getwd() source("Header_TT.R") -#library(ggplot2) -#library(data.table) -#library(dplyr) +source("../functions/plotting_globals.R") +source("../functions/plotting_data.R") +source("../functions/combining_dfs_plotting.R") +########################################################### +# command line args +#******************** +drug = 'streptomycin' +gene = 'gid' #=========== # input #=========== -source("combining_dfs_plotting.R") +#--------------------- +# call: import_dirs() +#--------------------- +import_dirs(drug, gene) +#--------------------------- +# call: plotting_data() +#--------------------------- +if (!exists("infile_params") && exists("gene")){ + #if (!is.character(infile_params) && exists("gene")){ + #in_filename_params = paste0(tolower(gene), "_all_params.csv") + in_filename_params = paste0(tolower(gene), "_comb_afor.csv") # part combined for gid + infile_params = paste0(outdir, "/", in_filename_params) + cat("\nInput file for mcsm comb data not specified, assuming filename: ", infile_params, "\n") +} + +# Input 1: read _comb_afor.csv +pd_df = plotting_data(infile_params) +my_df_u = pd_df[[1]] # this forms one of the input for combining_dfs_plotting() + +#-------------------------------- +# call: combining_dfs_plotting() +#-------------------------------- +if (!exists("infile_metadata") && exists("gene")){ + #if (!is.character(infile_params) && exists("gene")){{ + in_filename_metadata = paste0(tolower(gene), "_metadata.csv") # part combined for gid + infile_metadata = paste0(outdir, "/", in_filename_metadata) + cat("\nInput file for gene metadata not specified, assuming filename: ", infile_metadata, "\n") +} + +# Input 2: read _meta data.csv +cat("\nReading meta data file:", infile_metadata) + +gene_metadata <- read.csv(infile_metadata + , stringsAsFactors = F + , header = T) + +all_plot_dfs = combining_dfs_plotting(my_df_u + , gene_metadata + , lig_dist_colname = 'ligand_distance' + , lig_dist_cutoff = 10) + +#merged_df2 = all_plot_dfs[[1]] +merged_df3 = all_plot_dfs[[2]] +#merged_df2_comp = all_plot_dfs[[3]] +#merged_df3_comp = all_plot_dfs[[4]] +#merged_df2_lig = all_plot_dfs[[5]] +#merged_df3_lig = all_plot_dfs[[6]] #=========== # output #=========== @@ -32,12 +83,6 @@ plot_logo_combined_labelled = paste0(plotdir,"/", logo_combined_labelled) # REASSIGNMENT my_df = merged_df3 #%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -# clear excess variables -rm(merged_df2, merged_df2_comp, merged_df2_lig, merged_df2_comp_lig - , merged_df3_comp, merged_df3_comp_lig - , my_df_u, my_df_u_lig, merged_df3_lig) - colnames(my_df) str(my_df) @@ -166,8 +211,8 @@ p3 = p2 + theme(legend.position = "bottom" #, legend.title = element_blank() , legend.title = element_text("Amino acid properties", size = 20) - , legend.text = element_text( size = 20) - , axis.text.x = element_text(size = 20, angle = 90) + , legend.text = element_text(size = 20) + , axis.text.x = element_text(size = 16, angle = 90) , axis.text.y = element_blank() , axis.title.y = element_blank() , axis.title.x = element_text(size = 22))+ @@ -237,7 +282,7 @@ position_or = as.numeric(colnames(wide_df_or)) #============================================ # custom height (OR) logo plot: yayy works logo_or = ggseqlogo(wide_df_or, method="custom", seq_type="aa") + ylab("my custom height") + - theme(axis.text.x = element_text(size = 16 + theme(axis.text.x = element_text(size = 15 , angle = 90 , hjust = 1 , vjust = 0.4) diff --git a/scripts/plotting/logo_multiple_muts.R b/scripts/plotting/logo_multiple_muts.R index fb9ec63..5a488e7 100644 --- a/scripts/plotting/logo_multiple_muts.R +++ b/scripts/plotting/logo_multiple_muts.R @@ -10,14 +10,66 @@ setwd("~/git/LSHTM_analysis/scripts/plotting") getwd() source("Header_TT.R") -#library(ggplot2) -#library(data.table) -#library(dplyr) +source("../functions/plotting_globals.R") +source("../functions/plotting_data.R") +source("../functions/combining_dfs_plotting.R") +########################################################### +# command line args +#******************** +drug = 'streptomycin' +gene = 'gid' #=========== # input #=========== -source("combining_dfs_plotting.R") +#--------------------- +# call: import_dirs() +#--------------------- +import_dirs(drug, gene) + +#--------------------------- +# call: plotting_data() +#--------------------------- +if (!exists("infile_params") && exists("gene")){ + #if (!is.character(infile_params) && exists("gene")){ + #in_filename_params = paste0(tolower(gene), "_all_params.csv") + in_filename_params = paste0(tolower(gene), "_comb_afor.csv") # part combined for gid + infile_params = paste0(outdir, "/", in_filename_params) + cat("\nInput file for mcsm comb data not specified, assuming filename: ", infile_params, "\n") +} + +# Input 1: read _comb_afor.csv +pd_df = plotting_data(infile_params) +my_df_u = pd_df[[1]] # this forms one of the input for combining_dfs_plotting() + +#-------------------------------- +# call: combining_dfs_plotting() +#-------------------------------- +if (!exists("infile_metadata") && exists("gene")){ + #if (!is.character(infile_params) && exists("gene")){{ + in_filename_metadata = paste0(tolower(gene), "_metadata.csv") # part combined for gid + infile_metadata = paste0(outdir, "/", in_filename_metadata) + cat("\nInput file for gene metadata not specified, assuming filename: ", infile_metadata, "\n") +} + +# Input 2: read _meta data.csv +cat("\nReading meta data file:", infile_metadata) + +gene_metadata <- read.csv(infile_metadata + , stringsAsFactors = F + , header = T) + +all_plot_dfs = combining_dfs_plotting(my_df_u + , gene_metadata + , lig_dist_colname = 'ligand_distance' + , lig_dist_cutoff = 10) + +#merged_df2 = all_plot_dfs[[1]] +merged_df3 = all_plot_dfs[[2]] +#merged_df2_comp = all_plot_dfs[[3]] +#merged_df3_comp = all_plot_dfs[[4]] +#merged_df2_lig = all_plot_dfs[[5]] +#merged_df3_lig = all_plot_dfs[[6]] #=========== # output @@ -32,12 +84,6 @@ plot_logo_multiple_muts = paste0(plotdir,"/", logo_multiple_muts) # REASSIGNMENT my_df = merged_df3 #%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -# clear excess variables -rm(merged_df2, merged_df2_comp, merged_df2_lig, merged_df2_comp_lig - , merged_df3_comp, merged_df3_comp_lig - , my_df_u, my_df_u_lig, merged_df3_lig) - colnames(my_df) str(my_df) @@ -117,7 +163,7 @@ p0 p1 = p0 + theme(legend.position = "none" , legend.title = element_blank() , legend.text = element_text(size = 20) - , axis.text.x = element_text(size = 20, angle = 90) + , axis.text.x = element_text(size = 17, angle = 90) , axis.text.y = element_blank()) p1 @@ -138,7 +184,6 @@ rownames(tab_wt) #************** # Plot 2: wild_type logo - #************** # sanity check: MUST BE TRUE @@ -164,8 +209,8 @@ p3 = p2 + theme(legend.position = "bottom" #, legend.title = element_blank() , legend.title = element_text("Amino acid properties", size = 20) - , legend.text = element_text( size = 20) - , axis.text.x = element_text(size = 20, angle = 90) + , legend.text = element_text(size = 20) + , axis.text.x = element_text(size = 17, angle = 90) , axis.text.y = element_blank() , axis.title.x = element_text(size = 22))+ diff --git a/scripts/plotting/logo_plot.R b/scripts/plotting/logo_plot.R old mode 100644 new mode 100755 index facf3fa..707df8a --- a/scripts/plotting/logo_plot.R +++ b/scripts/plotting/logo_plot.R @@ -15,9 +15,36 @@ source("../functions/combining_dfs_plotting.R") ########################################################### # command line args #******************** -drug = 'streptomycin' -gene = 'gid' +#drug = 'streptomycin' +#gene = 'gid' +#******************** +# !!!FUTURE TODO!!! +# Can pass additional params of output/plot dir by user. +# Not strictly required for my workflow since it is optimised +# to have a streamlined input/output flow without filename worries. +#******************** +spec = matrix(c( + "drug" , "d", 1, "character", + "gene" , "g", 1, "character", + "data_file1" , "fa", 2, "character", + "data_file2" , "fb", 2, "character" +), byrow = TRUE, ncol = 4) +opt = getopt(spec) + +#FIXME: detect if script running from cmd, then set these +drug = opt$drug +gene = opt$gene +infile_params = opt$data_file1 +infile_metadata = opt$data_file2 + +# hardcoding when not using cmd +#drug = "streptomycin" +#gene = "gid" + +if(is.null(drug)|is.null(gene)) { + stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)") +} #=========== # input #=========== @@ -29,8 +56,8 @@ import_dirs(drug, gene) #--------------------------- # call: plotting_data() #--------------------------- -if (!exists("infile_params") && exists("gene")){ - #if (!is.character(infile_params) && exists("gene")){ +#if (!exists("infile_params") && exists("gene")){ +if (!is.character(infile_params) && exists("gene")){ #in_filename_params = paste0(tolower(gene), "_all_params.csv") in_filename_params = paste0(tolower(gene), "_comb_afor.csv") # part combined for gid infile_params = paste0(outdir, "/", in_filename_params) @@ -38,14 +65,15 @@ if (!exists("infile_params") && exists("gene")){ } # Input 1: read _comb_afor.csv -pd_df = plotting_data(infile_params) +my_df = read.csv(infile_params, header = T) +pd_df = plotting_data(my_df) my_df_u = pd_df[[1]] # this forms one of the input for combining_dfs_plotting() #-------------------------------- # call: combining_dfs_plotting() #-------------------------------- -if (!exists("infile_metadata") && exists("gene")){ - #if (!is.character(infile_params) && exists("gene")){{ +#if (!exists("infile_metadata") && exists("gene")){ +if (!is.character(infile_params) && exists("gene")){ in_filename_metadata = paste0(tolower(gene), "_metadata.csv") # part combined for gid infile_metadata = paste0(outdir, "/", in_filename_metadata) cat("\nInput file for gene metadata not specified, assuming filename: ", infile_metadata, "\n") @@ -63,7 +91,7 @@ all_plot_dfs = combining_dfs_plotting(my_df_u , lig_dist_colname = 'ligand_distance' , lig_dist_cutoff = 10) -merged_df2 = all_plot_dfs[[1]] +#merged_df2 = all_plot_dfs[[1]] merged_df3 = all_plot_dfs[[2]] #merged_df2_comp = all_plot_dfs[[3]] #merged_df3_comp = all_plot_dfs[[4]] @@ -146,6 +174,7 @@ wide_df_or <- logo_data_or %>% spread(position, or_mychisq, fill = 0.0) wide_df_or = as.matrix(wide_df_or) rownames(wide_df_or) = wide_df_or[,1] +dim(wide_df_or) wide_df_or = wide_df_or[,-1] str(wide_df_or)