From 6bbc3328b9b3cfd96c0ddf572efed024870361a7 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall
Date: Thu, 24 Jun 2021 14:21:34 +0100
Subject: [PATCH] added get_plotting_dfs.R as a mother script to be sourced by all plotting scripts

---
 scripts/plotting/get_plotting_dfs.R | 155 ++++++++++++++++++++++++++++
 1 file changed, 155 insertions(+)
 create mode 100644 scripts/plotting/get_plotting_dfs.R

diff --git a/scripts/plotting/get_plotting_dfs.R b/scripts/plotting/get_plotting_dfs.R
new file mode 100644
index 0000000..32a0fbc
--- /dev/null
+++ b/scripts/plotting/get_plotting_dfs.R
@@ -0,0 +1,155 @@
+#!/usr/bin/env Rscript
+#########################################################
+# TASK: Get formatted data for plots (builds the data frames/matrices shared by the plotting scripts that source this file)
+#=======================================================================
+# working dir and loading libraries
+getwd()
+setwd("~/git/LSHTM_analysis/scripts/plotting")
+getwd()
+
+source("Header_TT.R")
+source("../functions/my_pairs_panel.R") # with lower panel turned off
+source("../functions/plotting_globals.R")
+source("../functions/plotting_data.R")
+source("../functions/combining_dfs_plotting.R")
+
+#********************
+# `drug` and `gene` must already be
+# defined by the script that sources
+# this file; uncomment to run standalone
+#********************
+#drug = 'streptomycin'
+#gene = 'gid'
+#====================
+# variables for lig (column name and cutoff passed to both helper calls below)
+#====================
+
+LigDist_colname = "ligand_distance"
+LigDist_cutoff = 20
+
+#===========
+# input
+#===========
+#---------------------
+# call: import_dirs() (NOTE(review): presumably defines `outdir` used below -- confirm)
+#---------------------
+import_dirs(drug, gene)
+
+#---------------------------
+# call: plotting_data()
+#---------------------------
+# exists() is tested first: is.character() on an undefined name stops with "object not found"
+if (!(exists("infile_params") && is.character(infile_params)) && exists("gene")) { # when running as cmd
+  #in_filename_params = paste0(tolower(gene), "_all_params.csv")
+  in_filename_params = paste0(tolower(gene), "_comb_afor.csv") # part combined for gid
+  infile_params = paste0(outdir, "/", in_filename_params)
+  cat("\nInput file for mcsm comb data not specified, assuming filename: ", infile_params, "\n")
+}
+
+# Input 1: read _comb_afor.csv
+cat("\nReading mcsm combined data file: ", infile_params)
+mcsm_df = read.csv(infile_params, header = TRUE)
+pd_df = plotting_data(mcsm_df
+                      , lig_dist_colname = LigDist_colname
+                      , lig_dist_cutoff = LigDist_cutoff)
+
+my_df = pd_df[[1]]
+my_df_u = pd_df[[2]] # this forms one of the input for combining_dfs_plotting()
+my_df_u_lig = pd_df[[3]]
+dup_muts = pd_df[[4]]
+
+#--------------------------------
+# call: combining_dfs_plotting()
+#--------------------------------
+# same guard as above: fall back to the default metadata filename when none was supplied
+if (!(exists("infile_metadata") && is.character(infile_metadata)) && exists("gene")) { # when running as cmd
+  in_filename_metadata = paste0(tolower(gene), "_metadata.csv") # part combined for gid
+  infile_metadata = paste0(outdir, "/", in_filename_metadata)
+  cat("\nInput file for gene metadata not specified, assuming filename: ", infile_metadata, "\n")
+}
+
+# Input 2: read _meta data.csv
+cat("\nReading meta data file: ", infile_metadata)
+
+gene_metadata <- read.csv(infile_metadata
+                          , stringsAsFactors = FALSE
+                          , header = TRUE)
+
+all_plot_dfs = combining_dfs_plotting(my_df_u
+                                      , gene_metadata
+                                      , lig_dist_colname = LigDist_colname
+                                      , lig_dist_cutoff = LigDist_cutoff)
+
+merged_df2 = all_plot_dfs[[1]]
+merged_df3 = all_plot_dfs[[2]]
+merged_df2_comp = all_plot_dfs[[3]]
+merged_df3_comp = all_plot_dfs[[4]]
+merged_df2_lig = all_plot_dfs[[5]]
+merged_df3_lig = all_plot_dfs[[6]]
+merged_df2_comp_lig = all_plot_dfs[[7]]
+merged_df3_comp_lig = all_plot_dfs[[8]]
+
+####################################################################
+# Data for logoplots
+####################################################################
+#-------------------------
+# choose df for logoplot
+#-------------------------
+logo_data = merged_df3
+#logo_data = merged_df3_comp
+
+# quick checks (auto-printed under Rscript; silent under source() unless echo = TRUE)
+colnames(logo_data)
+str(logo_data)
+
+c1 = unique(logo_data$position)
+nrow(logo_data)
+cat("No. of rows in my_data:", nrow(logo_data)
+    , "\nDistinct positions corresponding to snps:", length(c1)
+    , "\n===========================================================")
+#=======================================================================
+#%% logo plots from dataframe
+
+#############
+# PLOTS
+#############
+foo = logo_data[, c("position"
+                    , "mutant_type","duet_scaled", "or_mychisq"
+                    , "mut_prop_polarity", "mut_prop_water")]
+
+logo_data$log10or = log10(logo_data$or_mychisq)
+logo_data_plot = logo_data[, c("position"
+                               , "mutant_type", "or_mychisq", "log10or")]
+
+logo_data_plot_or = logo_data[, c("position", "mutant_type", "or_mychisq")]
+wide_df_or <- logo_data_plot_or %>% spread(position, or_mychisq, fill = 0.0) # one row per mutant_type, one column per position
+
+wide_df_or = as.matrix(wide_df_or) # NOTE(review): mutant_type is presumably non-numeric, making this a character matrix -- confirm
+rownames(wide_df_or) = wide_df_or[,1]
+dim(wide_df_or)
+wide_df_or = wide_df_or[,-1] # drop the mutant_type column now held in the row names
+str(wide_df_or)
+
+position_or = as.numeric(colnames(wide_df_or))
+
+#==================
+# logo data: logOR
+#==================
+# extracting data with log10 OR
+logo_data_plot_logor = logo_data[, c("position", "mutant_type", "log10or")]
+wide_df_logor <- logo_data_plot_logor %>% spread(position, log10or, fill = 0.0)
+
+wide_df_logor = as.matrix(wide_df_logor)
+
+rownames(wide_df_logor) = wide_df_logor[,1]
+wide_df_logor = subset(wide_df_logor, select = -c(1) )
+colnames(wide_df_logor)
+wide_df_logor_m = data.matrix(wide_df_logor) # NOTE(review): input is already a matrix, so this likely performs no numeric conversion -- confirm
+
+rownames(wide_df_logor_m)
+colnames(wide_df_logor_m)
+
+position_logor = as.numeric(colnames(wide_df_logor_m))
+########################################################################
+# End of script
+########################################################################
\ No newline at end of file