changes made to combining_dfs_plotting.R
This commit is contained in:
parent
c6d1260f74
commit
2aec79af31
9 changed files with 258 additions and 126 deletions
|
@ -2,11 +2,12 @@
|
||||||
|
|
||||||
###########################################################
|
###########################################################
|
||||||
# TASK: To combine mcsm combined file and meta data.
|
# TASK: To combine mcsm combined file and meta data.
|
||||||
# This script is sourced from other .R scripts for plotting.
|
# This script is sourced by other .R scripts for plotting.
|
||||||
###########################################################
|
###########################################################
|
||||||
|
|
||||||
# load libraries and functions
|
# load libraries and functions
|
||||||
|
|
||||||
|
#source("Header_TT.R")
|
||||||
|
|
||||||
#==========================================================
|
#==========================================================
|
||||||
# combining_dfs_plotting():
|
# combining_dfs_plotting():
|
||||||
|
|
||||||
|
@ -34,27 +35,6 @@ combining_dfs_plotting <- function( my_df_u
|
||||||
, gene_metadata
|
, gene_metadata
|
||||||
, lig_dist_colname = 'ligand_distance'
|
, lig_dist_colname = 'ligand_distance'
|
||||||
, lig_dist_cutoff = 10){
|
, lig_dist_cutoff = 10){
|
||||||
# #======================================
|
|
||||||
# # 1: Read file: <gene>_meta data.csv
|
|
||||||
# #======================================
|
|
||||||
# cat("\nReading meta data file:", df1_mcsm_comb)
|
|
||||||
#
|
|
||||||
# my_df_u <- read.csv(df1_mcsm_comb
|
|
||||||
# , stringsAsFactors = F
|
|
||||||
# , header = T)
|
|
||||||
# cat("\nDim:", dim(my_df_u))
|
|
||||||
#
|
|
||||||
# #======================================
|
|
||||||
# # 2: Read file: <gene>_meta data.csv
|
|
||||||
# #======================================
|
|
||||||
# cat("\nReading meta data file:", df2_gene_metadata)
|
|
||||||
#
|
|
||||||
# gene_metadata <- read.csv(df2_gene_metadata
|
|
||||||
# , stringsAsFactors = F
|
|
||||||
# , header = T)
|
|
||||||
# cat("\nDim:", dim(gene_metadata))
|
|
||||||
#
|
|
||||||
# table(gene_metadata$mutation_info)
|
|
||||||
|
|
||||||
# counting NAs in AF, OR cols
|
# counting NAs in AF, OR cols
|
||||||
# or_mychisq
|
# or_mychisq
|
||||||
|
@ -219,7 +199,6 @@ combining_dfs_plotting <- function( my_df_u
|
||||||
, "\nNA in AF:", sum(is.na(merged_df3$af_kin)))
|
, "\nNA in AF:", sum(is.na(merged_df3$af_kin)))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
#===================================================
|
#===================================================
|
||||||
# Merge3: merged_df2_comp
|
# Merge3: merged_df2_comp
|
||||||
# same as merge 1 but excluding NAs from ORs, etc.
|
# same as merge 1 but excluding NAs from ORs, etc.
|
||||||
|
@ -273,6 +252,13 @@ combining_dfs_plotting <- function( my_df_u
|
||||||
# compare dfs: foo and merged_df3_com
|
# compare dfs: foo and merged_df3_com
|
||||||
all.equal(foo, bar)
|
all.equal(foo, bar)
|
||||||
#summary(comparedf(foo, bar))
|
#summary(comparedf(foo, bar))
|
||||||
|
cat("\n------------------------"
|
||||||
|
, "\nSummary of created dfs:"
|
||||||
|
, "\n------------------------"
|
||||||
|
, "\n1) Dim of merged_df2: " , nrow(merged_df2), "," , ncol(merged_df2)
|
||||||
|
, "\n2) Dim of merged_df2_comp: " , nrow(merged_df2_comp), "," , ncol(merged_df2_comp)
|
||||||
|
, "\n3) Dim of merged_df3: " , nrow(merged_df3), "," , ncol(merged_df3)
|
||||||
|
, "\n4) Dim of merged_df3_comp: " , nrow(merged_df3_comp), "," , ncol(merged_df3_comp))
|
||||||
|
|
||||||
#####################################################################
|
#####################################################################
|
||||||
# Combining: LIG
|
# Combining: LIG
|
||||||
|
@ -281,13 +267,35 @@ combining_dfs_plotting <- function( my_df_u
|
||||||
#============
|
#============
|
||||||
# Merges 5-8
|
# Merges 5-8
|
||||||
#============
|
#============
|
||||||
|
cat("\n=========================================="
|
||||||
|
, "\nStarting filtering for mcsm ligand df"
|
||||||
|
, "\n===========================================")
|
||||||
|
|
||||||
|
if (lig_dist_colname%in%names(my_df_u)){
|
||||||
|
cat("\nFiltering column: ", lig_dist_colname
|
||||||
|
, "\nCut off criteria: ", lig_dist_cutoff, "Angstroms")
|
||||||
df_lig = my_df_u[my_df_u[[lig_dist_colname]] < lig_dist_cutoff,]
|
df_lig = my_df_u[my_df_u[[lig_dist_colname]] < lig_dist_cutoff,]
|
||||||
|
|
||||||
merged_df2_lig = merged_df2[merged_df2$ligand_distance<lig_dist_cutoff,]
|
#merged_df2_lig = merged_df2[merged_df2$ligand_distance<lig_dist_cutoff,]
|
||||||
merged_df2_comp_lig = merged_df2_comp[merged_df2_comp$ligand_distance<lig_dist_cutoff,]
|
merged_df2_lig = merged_df2[merged_df2[[lig_dist_colname]] < lig_dist_cutoff,]
|
||||||
|
dim(merged_df2_lig)
|
||||||
|
|
||||||
merged_df3_lig = merged_df3[merged_df3$ligand_distance<lig_dist_cutoff,]
|
merged_df2_comp_lig = merged_df2_comp[merged_df2_comp[[lig_dist_colname]] < lig_dist_cutoff,]
|
||||||
merged_df3_comp_lig = merged_df3_comp[merged_df3_comp$ligand_distance<lig_dist_cutoff,]
|
|
||||||
|
merged_df3_lig = merged_df3[merged_df3[[lig_dist_colname]] < lig_dist_cutoff,]
|
||||||
|
merged_df3_comp_lig = merged_df3_comp[merged_df3_comp[[lig_dist_colname]] < lig_dist_cutoff,]
|
||||||
|
|
||||||
|
cat("\n------------------------"
|
||||||
|
, "\nSummary of created ligand dfs:"
|
||||||
|
, "\n------------------------"
|
||||||
|
, "\n1) Dim of merged_df2_lig: " , nrow(merged_df2_lig), "," , ncol(merged_df2_lig)
|
||||||
|
, "\n2) Dim of merged_df2_comp_lig: " , nrow(merged_df2_comp_lig), "," , ncol(merged_df2_comp_lig)
|
||||||
|
, "\n3) Dim of merged_df3_lig: " , nrow(merged_df3_lig), "," , ncol(merged_df3_lig)
|
||||||
|
, "\n4) Dim of merged_df3_comp_lig: " , nrow(merged_df3_comp_lig), "," , ncol(merged_df3_comp_lig))
|
||||||
|
} else {
|
||||||
|
cat("\nFiltering column: ", lig_dist_colname, " not found\n")
|
||||||
|
}
|
||||||
|
#quit()
|
||||||
|
|
||||||
# sanity check
|
# sanity check
|
||||||
if (nrow(merged_df3_lig) == nrow(df_lig)){
|
if (nrow(merged_df3_lig) == nrow(df_lig)){
|
||||||
|
@ -327,8 +335,13 @@ combining_dfs_plotting <- function( my_df_u
|
||||||
# , "\nNo. of cols: ", ncol(get(i)), "\n")
|
# , "\nNo. of cols: ", ncol(get(i)), "\n")
|
||||||
#}
|
#}
|
||||||
|
|
||||||
return(list(merged_df2, merged_df3
|
return(list( merged_df2
|
||||||
, merged_df2_comp, merged_df3_comp
|
, merged_df3
|
||||||
, merged_df2_lig, merged_df3_lig))
|
, merged_df2_comp
|
||||||
|
, merged_df3_comp
|
||||||
|
, merged_df2_lig
|
||||||
|
, merged_df3_lig
|
||||||
|
, merged_df2_comp_lig
|
||||||
|
, merged_df3_comp_lig))
|
||||||
|
|
||||||
}
|
}
|
|
@ -16,20 +16,18 @@ library(dplyr)
|
||||||
## my_df_u_lig
|
## my_df_u_lig
|
||||||
## dup_muts
|
## dup_muts
|
||||||
#========================================================
|
#========================================================
|
||||||
plotting_data <- function(infile_params, mcsm_lig_cutoff = 10) {
|
plotting_data <- function(df, lig_dist_colname = 'ligand_distance', lig_dist_cutoff = 10) {
|
||||||
my_df = data.frame()
|
my_df = data.frame()
|
||||||
my_df_u = data.frame()
|
my_df_u = data.frame()
|
||||||
my_df_u_lig = data.frame()
|
my_df_u_lig = data.frame()
|
||||||
dup_muts = data.frame()
|
dup_muts = data.frame()
|
||||||
|
|
||||||
cat(paste0("\nInput file to prepare for plotting:", infile_params, "\n") )
|
|
||||||
|
|
||||||
#===========================
|
#===========================
|
||||||
# Read file: struct params
|
# Read file: struct params
|
||||||
#===========================
|
#===========================
|
||||||
my_df = read.csv(infile_params, header = T)
|
#df = read.csv(infile_params, header = T)
|
||||||
|
|
||||||
cat("\nInput dimensions:", dim(my_df))
|
cat("\nInput dimensions:", dim(df))
|
||||||
|
|
||||||
#==================================
|
#==================================
|
||||||
# add foldx outcome category
|
# add foldx outcome category
|
||||||
|
@ -43,17 +41,17 @@ cat("\nInput dimensions:", dim(my_df))
|
||||||
# adding foldx scaled values
|
# adding foldx scaled values
|
||||||
# scale data b/w -1 and 1
|
# scale data b/w -1 and 1
|
||||||
#------------------------------
|
#------------------------------
|
||||||
n = which(colnames(my_df) == "ddg"); n
|
n = which(colnames(df) == "ddg"); n
|
||||||
|
|
||||||
my_min = min(my_df[,n]); my_min
|
my_min = min(df[,n]); my_min
|
||||||
my_max = max(my_df[,n]); my_max
|
my_max = max(df[,n]); my_max
|
||||||
|
|
||||||
my_df$foldx_scaled = ifelse(my_df[,n] < 0
|
df$foldx_scaled = ifelse(df[,n] < 0
|
||||||
, my_df[,n]/abs(my_min)
|
, df[,n]/abs(my_min)
|
||||||
, my_df[,n]/my_max)
|
, df[,n]/my_max)
|
||||||
# sanity check
|
# sanity check
|
||||||
my_min = min(my_df$foldx_scaled); my_min
|
my_min = min(df$foldx_scaled); my_min
|
||||||
my_max = max(my_df$foldx_scaled); my_max
|
my_max = max(df$foldx_scaled); my_max
|
||||||
|
|
||||||
if (my_min == -1 && my_max == 1){
|
if (my_min == -1 && my_max == 1){
|
||||||
cat("\nPASS: foldx ddg successfully scaled b/w -1 and 1"
|
cat("\nPASS: foldx ddg successfully scaled b/w -1 and 1"
|
||||||
|
@ -67,9 +65,9 @@ if (my_min == -1 && my_max == 1){
|
||||||
# adding foldx outcome category
|
# adding foldx outcome category
|
||||||
# ddg<0 = "Stabilising" (-ve)
|
# ddg<0 = "Stabilising" (-ve)
|
||||||
#------------------------------
|
#------------------------------
|
||||||
c1 = table(my_df$ddg < 0)
|
c1 = table(df$ddg < 0)
|
||||||
my_df$foldx_outcome = ifelse(my_df$ddg < 0, "Stabilising", "Destabilising")
|
df$foldx_outcome = ifelse(df$ddg < 0, "Stabilising", "Destabilising")
|
||||||
c2 = table(my_df$ddg < 0)
|
c2 = table(df$ddg < 0)
|
||||||
|
|
||||||
if ( all(c1 == c2) ){
|
if ( all(c1 == c2) ){
|
||||||
cat("\nPASS: foldx outcome successfully created")
|
cat("\nPASS: foldx outcome successfully created")
|
||||||
|
@ -83,19 +81,19 @@ if ( all(c1 == c2) ){
|
||||||
#==================================
|
#==================================
|
||||||
|
|
||||||
# check for duplicate mutations
|
# check for duplicate mutations
|
||||||
if ( length(unique(my_df$mutationinformation)) != length(my_df$mutationinformation)){
|
if ( length(unique(df$mutationinformation)) != length(df$mutationinformation)){
|
||||||
cat(paste0("\nCAUTION:", " Duplicate mutations identified"
|
cat(paste0("\nCAUTION:", " Duplicate mutations identified"
|
||||||
, "\nExtracting these...\n"))
|
, "\nExtracting these...\n"))
|
||||||
#cat(my_df[duplicated(my_df$mutationinformation),])
|
#cat(my_df[duplicated(my_df$mutationinformation),])
|
||||||
dup_muts = my_df[duplicated(my_df$mutationinformation),]
|
dup_muts = df[duplicated(df$mutationinformation),]
|
||||||
dup_muts_nu = length(unique(dup_muts$mutationinformation))
|
dup_muts_nu = length(unique(dup_muts$mutationinformation))
|
||||||
cat(paste0("\nDim of duplicate mutation df:", nrow(dup_muts)
|
cat(paste0("\nDim of duplicate mutation df:", nrow(dup_muts)
|
||||||
, "\nNo. of unique duplicate mutations:", dup_muts_nu
|
, "\nNo. of unique duplicate mutations:", dup_muts_nu
|
||||||
, "\n\nExtracting df with unique mutations only\n"))
|
, "\n\nExtracting df with unique mutations only\n"))
|
||||||
my_df_u = my_df[!duplicated(my_df$mutationinformation),]
|
my_df_u = df[!duplicated(df$mutationinformation),]
|
||||||
}else{
|
}else{
|
||||||
cat(paste0("\nNo duplicate mutations detected\n"))
|
cat(paste0("\nNo duplicate mutations detected\n"))
|
||||||
my_df_u = my_df
|
my_df_u = df
|
||||||
}
|
}
|
||||||
|
|
||||||
upos = unique(my_df_u$position)
|
upos = unique(my_df_u$position)
|
||||||
|
@ -105,15 +103,14 @@ cat("\nNo. of unique mutational positions:"); cat(length(upos), "\n")
|
||||||
#===============================================
|
#===============================================
|
||||||
# extract mutations <10 Angstroms and symbol
|
# extract mutations <10 Angstroms and symbol
|
||||||
#===============================================
|
#===============================================
|
||||||
table(my_df_u$ligand_distance<mcsm_lig_cutoff)
|
table(my_df_u$ligand_distance<lig_dist_cutoff)
|
||||||
|
|
||||||
my_df_u_lig = my_df_u[my_df_u$ligand_distance <mcsm_lig_cutoff,]
|
my_df_u_lig = my_df_u[my_df_u$ligand_distance <lig_dist_cutoff,]
|
||||||
|
|
||||||
cat(paste0("There are ", nrow(my_df_u_lig), " sites lying within 10\u212b of the ligand\n"))
|
cat(paste0("There are ", nrow(my_df_u_lig), " sites lying within 10\u212b of the ligand\n"))
|
||||||
|
|
||||||
# return list of DFs
|
# return list of DFs
|
||||||
|
my_df = df
|
||||||
#return(list(my_df, my_df_u, my_df_u_lig, dup_muts))
|
|
||||||
#df_names = c("my_df", "my_df_u", "my_df_u_lig", "dup_muts")
|
#df_names = c("my_df", "my_df_u", "my_df_u_lig", "dup_muts")
|
||||||
all_df = list(my_df, my_df_u, my_df_u_lig, dup_muts)
|
all_df = list(my_df, my_df_u, my_df_u_lig, dup_muts)
|
||||||
#all_df = Map(setNames, all_df, df_names)
|
#all_df = Map(setNames, all_df, df_names)
|
||||||
|
|
|
@ -10,21 +10,21 @@
|
||||||
# input args: 'drug' and 'gene'
|
# input args: 'drug' and 'gene'
|
||||||
# output: dir names for input and output files
|
# output: dir names for input and output files
|
||||||
|
|
||||||
import_dirs <- function(drug, gene) {
|
import_dirs <- function(drug_name, gene_name) {
|
||||||
|
|
||||||
#============================
|
#============================
|
||||||
# directories and variables
|
# directories and variables
|
||||||
#============================
|
#============================
|
||||||
|
|
||||||
datadir <<- paste0("~/git/Data/")
|
datadir <<- paste0("~/git/Data/")
|
||||||
indir <<- paste0(datadir, drug, "/input")
|
indir <<- paste0(datadir, drug_name, "/input")
|
||||||
outdir <<- paste0("~/git/Data/", drug, "/output")
|
outdir <<- paste0("~/git/Data/", drug_name, "/output")
|
||||||
plotdir <<- paste0("~/git/Data/", drug, "/output/plots")
|
plotdir <<- paste0("~/git/Data/", drug_name, "/output/plots")
|
||||||
|
|
||||||
dr_muts_col <<- paste0('dr_mutations_', drug)
|
dr_muts_col <<- paste0('dr_mutations_', drug_name)
|
||||||
other_muts_col <<- paste0('other_mutations_', drug)
|
other_muts_col <<- paste0('other_mutations_', drug_name)
|
||||||
resistance_col <<- "drtype"
|
resistance_col <<- "drtype"
|
||||||
gene_match <<- paste0(gene,"_p.")
|
gene_match <<- paste0(gene_name,"_p.")
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -37,20 +37,17 @@ import_dirs <- function(drug, gene) {
|
||||||
#==================
|
#==================
|
||||||
# Angstroms symbol
|
# Angstroms symbol
|
||||||
#==================
|
#==================
|
||||||
|
|
||||||
angstroms_symbol <<- "\u212b"
|
angstroms_symbol <<- "\u212b"
|
||||||
#cat(paste0("There are ", nrow(my_df_u_lig), " sites lying within 10", angstroms_symbol, " of the ligand\n"))
|
#cat(paste0("There are ", nrow(my_df_u_lig), " sites lying within 10", angstroms_symbol, " of the ligand\n"))
|
||||||
|
|
||||||
#===============
|
#===============
|
||||||
# Delta symbol
|
# Delta symbol
|
||||||
#===============
|
#===============
|
||||||
|
|
||||||
delta_symbol <<- "\u0394"; delta_symbol
|
delta_symbol <<- "\u0394"; delta_symbol
|
||||||
|
|
||||||
#==========
|
#==========
|
||||||
# Colours
|
# Colours
|
||||||
#==========
|
#==========
|
||||||
|
|
||||||
mcsm_red2 <<- "#ae301e" # most negative
|
mcsm_red2 <<- "#ae301e" # most negative
|
||||||
mcsm_red1 <<- "#f8766d"
|
mcsm_red1 <<- "#f8766d"
|
||||||
|
|
||||||
|
@ -58,3 +55,4 @@ mcsm_mid <<- "white" # middle
|
||||||
|
|
||||||
mcsm_blue1 <<- "#00bfc4"
|
mcsm_blue1 <<- "#00bfc4"
|
||||||
mcsm_blue2 <<- "#007d85" # most positive
|
mcsm_blue2 <<- "#007d85" # most positive
|
||||||
|
#########################################################
|
||||||
|
|
|
@ -7,6 +7,7 @@ getwd()
|
||||||
#===========================================
|
#===========================================
|
||||||
source("plotting_data.R")
|
source("plotting_data.R")
|
||||||
infile = "/home/tanu/git/Data/streptomycin/output/gid_comb_stab_struc_params.csv"
|
infile = "/home/tanu/git/Data/streptomycin/output/gid_comb_stab_struc_params.csv"
|
||||||
|
|
||||||
pd_df = plotting_data(infile)
|
pd_df = plotting_data(infile)
|
||||||
my_df = pd_df[[1]]
|
my_df = pd_df[[1]]
|
||||||
my_df_u = pd_df[[2]]
|
my_df_u = pd_df[[2]]
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
# #!/usr/bin/env Rscript
|
#!/usr/bin/env Rscript
|
||||||
|
|
||||||
# working dir and loading libraries
|
# working dir and loading libraries
|
||||||
getwd()
|
getwd()
|
||||||
|
@ -33,14 +33,20 @@ source("plotting_globals.R")
|
||||||
source("plotting_data.R")
|
source("plotting_data.R")
|
||||||
source("combining_dfs_plotting.R")
|
source("combining_dfs_plotting.R")
|
||||||
|
|
||||||
gene = 'gid'
|
|
||||||
drug = 'streptomycin'
|
|
||||||
|
|
||||||
#---------------------
|
#---------------------
|
||||||
# call: import_dirs()
|
# call: import_dirs()
|
||||||
#---------------------
|
#---------------------
|
||||||
import_dirs(drug, gene)
|
gene = 'gid'
|
||||||
|
drug = 'streptomycin'
|
||||||
|
|
||||||
|
import_dirs(drug_name = drug, gene_name = gene)
|
||||||
|
|
||||||
|
#-------------------------------
|
||||||
|
# test function: plotting_data()
|
||||||
|
#-------------------------------
|
||||||
|
#============================
|
||||||
|
# Input 1: plotting_data()
|
||||||
|
#============================
|
||||||
if (!exists("infile_params") && exists("gene")){
|
if (!exists("infile_params") && exists("gene")){
|
||||||
#if (!is.character(infile_params) && exists("gene")){
|
#if (!is.character(infile_params) && exists("gene")){
|
||||||
#in_filename_params = paste0(tolower(gene), "_all_params.csv")
|
#in_filename_params = paste0(tolower(gene), "_all_params.csv")
|
||||||
|
@ -49,6 +55,13 @@ if (!exists("infile_params") && exists("gene")){
|
||||||
cat("\nInput file for mcsm comb data not specified, assuming filename: ", infile_params, "\n")
|
cat("\nInput file for mcsm comb data not specified, assuming filename: ", infile_params, "\n")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
mcsm_comb_data = read.csv(infile_params, header = T)
|
||||||
|
pd_df = plotting_data(df = mcsm_comb_data, lig_dist_cutoff = 10)
|
||||||
|
my_df_u = pd_df[[2]]
|
||||||
|
|
||||||
|
#======================================
|
||||||
|
# Input 2: read <gene>_meta data.csv
|
||||||
|
#======================================
|
||||||
if (!exists("infile_metadata") && exists("gene")){
|
if (!exists("infile_metadata") && exists("gene")){
|
||||||
#if (!is.character(infile_params) && exists("gene")){{
|
#if (!is.character(infile_params) && exists("gene")){{
|
||||||
in_filename_metadata = paste0(tolower(gene), "_metadata.csv") # part combined for gid
|
in_filename_metadata = paste0(tolower(gene), "_metadata.csv") # part combined for gid
|
||||||
|
@ -56,18 +69,6 @@ if (!exists("infile_metadata") && exists("gene")){
|
||||||
cat("\nInput file for gene metadata not specified, assuming filename: ", infile_metadata, "\n")
|
cat("\nInput file for gene metadata not specified, assuming filename: ", infile_metadata, "\n")
|
||||||
}
|
}
|
||||||
|
|
||||||
#============================
|
|
||||||
# Input 1: plotting_data()
|
|
||||||
#============================
|
|
||||||
#---------------------
|
|
||||||
# call: plotting_data()
|
|
||||||
#---------------------
|
|
||||||
pd_df = plotting_data(infile_params)
|
|
||||||
my_df = pd_df[[1]] # this forms one of the input for combining_dfs_plotting()
|
|
||||||
|
|
||||||
#======================================
|
|
||||||
# Input 2: read <gene>_meta data.csv
|
|
||||||
#======================================
|
|
||||||
cat("\nReading meta data file:", infile_metadata)
|
cat("\nReading meta data file:", infile_metadata)
|
||||||
|
|
||||||
gene_metadata <- read.csv(infile_metadata
|
gene_metadata <- read.csv(infile_metadata
|
||||||
|
@ -85,3 +86,5 @@ merged_df2_comp = all_plot_dfs[[3]]
|
||||||
merged_df3_comp = all_plot_dfs[[4]]
|
merged_df3_comp = all_plot_dfs[[4]]
|
||||||
merged_df2_lig = all_plot_dfs[[5]]
|
merged_df2_lig = all_plot_dfs[[5]]
|
||||||
merged_df3_lig = all_plot_dfs[[6]]
|
merged_df3_lig = all_plot_dfs[[6]]
|
||||||
|
merged_df2_comp_lig = all_plot_dfs[[7]]
|
||||||
|
merged_df3_comp_lig = all_plot_dfs[[8]]
|
|
@ -7,6 +7,7 @@
|
||||||
# install.packages("gplots", dependencies = TRUE)
|
# install.packages("gplots", dependencies = TRUE)
|
||||||
# library(gplots)
|
# library(gplots)
|
||||||
#}
|
#}
|
||||||
|
require("getopt", quietly = TRUE) # cmd parse arguments
|
||||||
|
|
||||||
if (!require("tidyverse")) {
|
if (!require("tidyverse")) {
|
||||||
install.packages("tidyverse", dependencies = TRUE)
|
install.packages("tidyverse", dependencies = TRUE)
|
||||||
|
|
|
@ -10,15 +10,66 @@ setwd("~/git/LSHTM_analysis/scripts/plotting")
|
||||||
getwd()
|
getwd()
|
||||||
|
|
||||||
source("Header_TT.R")
|
source("Header_TT.R")
|
||||||
#library(ggplot2)
|
source("../functions/plotting_globals.R")
|
||||||
#library(data.table)
|
source("../functions/plotting_data.R")
|
||||||
#library(dplyr)
|
source("../functions/combining_dfs_plotting.R")
|
||||||
|
###########################################################
|
||||||
|
# command line args
|
||||||
|
#********************
|
||||||
|
drug = 'streptomycin'
|
||||||
|
gene = 'gid'
|
||||||
|
|
||||||
#===========
|
#===========
|
||||||
# input
|
# input
|
||||||
#===========
|
#===========
|
||||||
source("combining_dfs_plotting.R")
|
#---------------------
|
||||||
|
# call: import_dirs()
|
||||||
|
#---------------------
|
||||||
|
import_dirs(drug, gene)
|
||||||
|
|
||||||
|
#---------------------------
|
||||||
|
# call: plotting_data()
|
||||||
|
#---------------------------
|
||||||
|
if (!exists("infile_params") && exists("gene")){
|
||||||
|
#if (!is.character(infile_params) && exists("gene")){
|
||||||
|
#in_filename_params = paste0(tolower(gene), "_all_params.csv")
|
||||||
|
in_filename_params = paste0(tolower(gene), "_comb_afor.csv") # part combined for gid
|
||||||
|
infile_params = paste0(outdir, "/", in_filename_params)
|
||||||
|
cat("\nInput file for mcsm comb data not specified, assuming filename: ", infile_params, "\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
# Input 1: read <gene>_comb_afor.csv
|
||||||
|
pd_df = plotting_data(infile_params)
|
||||||
|
my_df_u = pd_df[[1]] # this forms one of the input for combining_dfs_plotting()
|
||||||
|
|
||||||
|
#--------------------------------
|
||||||
|
# call: combining_dfs_plotting()
|
||||||
|
#--------------------------------
|
||||||
|
if (!exists("infile_metadata") && exists("gene")){
|
||||||
|
#if (!is.character(infile_params) && exists("gene")){{
|
||||||
|
in_filename_metadata = paste0(tolower(gene), "_metadata.csv") # part combined for gid
|
||||||
|
infile_metadata = paste0(outdir, "/", in_filename_metadata)
|
||||||
|
cat("\nInput file for gene metadata not specified, assuming filename: ", infile_metadata, "\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
# Input 2: read <gene>_meta data.csv
|
||||||
|
cat("\nReading meta data file:", infile_metadata)
|
||||||
|
|
||||||
|
gene_metadata <- read.csv(infile_metadata
|
||||||
|
, stringsAsFactors = F
|
||||||
|
, header = T)
|
||||||
|
|
||||||
|
all_plot_dfs = combining_dfs_plotting(my_df_u
|
||||||
|
, gene_metadata
|
||||||
|
, lig_dist_colname = 'ligand_distance'
|
||||||
|
, lig_dist_cutoff = 10)
|
||||||
|
|
||||||
|
#merged_df2 = all_plot_dfs[[1]]
|
||||||
|
merged_df3 = all_plot_dfs[[2]]
|
||||||
|
#merged_df2_comp = all_plot_dfs[[3]]
|
||||||
|
#merged_df3_comp = all_plot_dfs[[4]]
|
||||||
|
#merged_df2_lig = all_plot_dfs[[5]]
|
||||||
|
#merged_df3_lig = all_plot_dfs[[6]]
|
||||||
#===========
|
#===========
|
||||||
# output
|
# output
|
||||||
#===========
|
#===========
|
||||||
|
@ -32,12 +83,6 @@ plot_logo_combined_labelled = paste0(plotdir,"/", logo_combined_labelled)
|
||||||
# REASSIGNMENT
|
# REASSIGNMENT
|
||||||
my_df = merged_df3
|
my_df = merged_df3
|
||||||
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||||
|
|
||||||
# clear excess variables
|
|
||||||
rm(merged_df2, merged_df2_comp, merged_df2_lig, merged_df2_comp_lig
|
|
||||||
, merged_df3_comp, merged_df3_comp_lig
|
|
||||||
, my_df_u, my_df_u_lig, merged_df3_lig)
|
|
||||||
|
|
||||||
colnames(my_df)
|
colnames(my_df)
|
||||||
str(my_df)
|
str(my_df)
|
||||||
|
|
||||||
|
@ -167,7 +212,7 @@ p3 = p2 +
|
||||||
#, legend.title = element_blank()
|
#, legend.title = element_blank()
|
||||||
, legend.title = element_text("Amino acid properties", size = 20)
|
, legend.title = element_text("Amino acid properties", size = 20)
|
||||||
, legend.text = element_text(size = 20)
|
, legend.text = element_text(size = 20)
|
||||||
, axis.text.x = element_text(size = 20, angle = 90)
|
, axis.text.x = element_text(size = 16, angle = 90)
|
||||||
, axis.text.y = element_blank()
|
, axis.text.y = element_blank()
|
||||||
, axis.title.y = element_blank()
|
, axis.title.y = element_blank()
|
||||||
, axis.title.x = element_text(size = 22))+
|
, axis.title.x = element_text(size = 22))+
|
||||||
|
@ -237,7 +282,7 @@ position_or = as.numeric(colnames(wide_df_or))
|
||||||
#============================================
|
#============================================
|
||||||
# custom height (OR) logo plot: yayy works
|
# custom height (OR) logo plot: yayy works
|
||||||
logo_or = ggseqlogo(wide_df_or, method="custom", seq_type="aa") + ylab("my custom height") +
|
logo_or = ggseqlogo(wide_df_or, method="custom", seq_type="aa") + ylab("my custom height") +
|
||||||
theme(axis.text.x = element_text(size = 16
|
theme(axis.text.x = element_text(size = 15
|
||||||
, angle = 90
|
, angle = 90
|
||||||
, hjust = 1
|
, hjust = 1
|
||||||
, vjust = 0.4)
|
, vjust = 0.4)
|
||||||
|
|
|
@ -10,14 +10,66 @@ setwd("~/git/LSHTM_analysis/scripts/plotting")
|
||||||
getwd()
|
getwd()
|
||||||
|
|
||||||
source("Header_TT.R")
|
source("Header_TT.R")
|
||||||
#library(ggplot2)
|
source("../functions/plotting_globals.R")
|
||||||
#library(data.table)
|
source("../functions/plotting_data.R")
|
||||||
#library(dplyr)
|
source("../functions/combining_dfs_plotting.R")
|
||||||
|
###########################################################
|
||||||
|
# command line args
|
||||||
|
#********************
|
||||||
|
drug = 'streptomycin'
|
||||||
|
gene = 'gid'
|
||||||
|
|
||||||
#===========
|
#===========
|
||||||
# input
|
# input
|
||||||
#===========
|
#===========
|
||||||
source("combining_dfs_plotting.R")
|
#---------------------
|
||||||
|
# call: import_dirs()
|
||||||
|
#---------------------
|
||||||
|
import_dirs(drug, gene)
|
||||||
|
|
||||||
|
#---------------------------
|
||||||
|
# call: plotting_data()
|
||||||
|
#---------------------------
|
||||||
|
if (!exists("infile_params") && exists("gene")){
|
||||||
|
#if (!is.character(infile_params) && exists("gene")){
|
||||||
|
#in_filename_params = paste0(tolower(gene), "_all_params.csv")
|
||||||
|
in_filename_params = paste0(tolower(gene), "_comb_afor.csv") # part combined for gid
|
||||||
|
infile_params = paste0(outdir, "/", in_filename_params)
|
||||||
|
cat("\nInput file for mcsm comb data not specified, assuming filename: ", infile_params, "\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
# Input 1: read <gene>_comb_afor.csv
|
||||||
|
pd_df = plotting_data(infile_params)
|
||||||
|
my_df_u = pd_df[[1]] # this forms one of the input for combining_dfs_plotting()
|
||||||
|
|
||||||
|
#--------------------------------
|
||||||
|
# call: combining_dfs_plotting()
|
||||||
|
#--------------------------------
|
||||||
|
if (!exists("infile_metadata") && exists("gene")){
|
||||||
|
#if (!is.character(infile_params) && exists("gene")){{
|
||||||
|
in_filename_metadata = paste0(tolower(gene), "_metadata.csv") # part combined for gid
|
||||||
|
infile_metadata = paste0(outdir, "/", in_filename_metadata)
|
||||||
|
cat("\nInput file for gene metadata not specified, assuming filename: ", infile_metadata, "\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
# Input 2: read <gene>_meta data.csv
|
||||||
|
cat("\nReading meta data file:", infile_metadata)
|
||||||
|
|
||||||
|
gene_metadata <- read.csv(infile_metadata
|
||||||
|
, stringsAsFactors = F
|
||||||
|
, header = T)
|
||||||
|
|
||||||
|
all_plot_dfs = combining_dfs_plotting(my_df_u
|
||||||
|
, gene_metadata
|
||||||
|
, lig_dist_colname = 'ligand_distance'
|
||||||
|
, lig_dist_cutoff = 10)
|
||||||
|
|
||||||
|
#merged_df2 = all_plot_dfs[[1]]
|
||||||
|
merged_df3 = all_plot_dfs[[2]]
|
||||||
|
#merged_df2_comp = all_plot_dfs[[3]]
|
||||||
|
#merged_df3_comp = all_plot_dfs[[4]]
|
||||||
|
#merged_df2_lig = all_plot_dfs[[5]]
|
||||||
|
#merged_df3_lig = all_plot_dfs[[6]]
|
||||||
|
|
||||||
#===========
|
#===========
|
||||||
# output
|
# output
|
||||||
|
@ -32,12 +84,6 @@ plot_logo_multiple_muts = paste0(plotdir,"/", logo_multiple_muts)
|
||||||
# REASSIGNMENT
|
# REASSIGNMENT
|
||||||
my_df = merged_df3
|
my_df = merged_df3
|
||||||
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||||
|
|
||||||
# clear excess variables
|
|
||||||
rm(merged_df2, merged_df2_comp, merged_df2_lig, merged_df2_comp_lig
|
|
||||||
, merged_df3_comp, merged_df3_comp_lig
|
|
||||||
, my_df_u, my_df_u_lig, merged_df3_lig)
|
|
||||||
|
|
||||||
colnames(my_df)
|
colnames(my_df)
|
||||||
str(my_df)
|
str(my_df)
|
||||||
|
|
||||||
|
@ -117,7 +163,7 @@ p0
|
||||||
p1 = p0 + theme(legend.position = "none"
|
p1 = p0 + theme(legend.position = "none"
|
||||||
, legend.title = element_blank()
|
, legend.title = element_blank()
|
||||||
, legend.text = element_text(size = 20)
|
, legend.text = element_text(size = 20)
|
||||||
, axis.text.x = element_text(size = 20, angle = 90)
|
, axis.text.x = element_text(size = 17, angle = 90)
|
||||||
, axis.text.y = element_blank())
|
, axis.text.y = element_blank())
|
||||||
p1
|
p1
|
||||||
|
|
||||||
|
@ -138,7 +184,6 @@ rownames(tab_wt)
|
||||||
|
|
||||||
#**************
|
#**************
|
||||||
# Plot 2: wild_type logo
|
# Plot 2: wild_type logo
|
||||||
|
|
||||||
#**************
|
#**************
|
||||||
# sanity check: MUST BE TRUE
|
# sanity check: MUST BE TRUE
|
||||||
|
|
||||||
|
@ -165,7 +210,7 @@ p3 = p2 +
|
||||||
#, legend.title = element_blank()
|
#, legend.title = element_blank()
|
||||||
, legend.title = element_text("Amino acid properties", size = 20)
|
, legend.title = element_text("Amino acid properties", size = 20)
|
||||||
, legend.text = element_text(size = 20)
|
, legend.text = element_text(size = 20)
|
||||||
, axis.text.x = element_text(size = 20, angle = 90)
|
, axis.text.x = element_text(size = 17, angle = 90)
|
||||||
, axis.text.y = element_blank()
|
, axis.text.y = element_blank()
|
||||||
, axis.title.x = element_text(size = 22))+
|
, axis.title.x = element_text(size = 22))+
|
||||||
|
|
||||||
|
|
45
scripts/plotting/logo_plot.R
Normal file → Executable file
45
scripts/plotting/logo_plot.R
Normal file → Executable file
|
@ -15,9 +15,36 @@ source("../functions/combining_dfs_plotting.R")
|
||||||
###########################################################
|
###########################################################
|
||||||
# command line args
|
# command line args
|
||||||
#********************
|
#********************
|
||||||
drug = 'streptomycin'
|
#drug = 'streptomycin'
|
||||||
gene = 'gid'
|
#gene = 'gid'
|
||||||
|
#********************
|
||||||
|
# !!!FUTURE TODO!!!
|
||||||
|
# Can pass additional params of output/plot dir by user.
|
||||||
|
# Not strictly required for my workflow since it is optimised
|
||||||
|
# to have a streamlined input/output flow without filename worries.
|
||||||
|
#********************
|
||||||
|
spec = matrix(c(
|
||||||
|
"drug" , "d", 1, "character",
|
||||||
|
"gene" , "g", 1, "character",
|
||||||
|
"data_file1" , "fa", 2, "character",
|
||||||
|
"data_file2" , "fb", 2, "character"
|
||||||
|
), byrow = TRUE, ncol = 4)
|
||||||
|
|
||||||
|
opt = getopt(spec)
|
||||||
|
|
||||||
|
#FIXME: detect if script running from cmd, then set these
|
||||||
|
drug = opt$drug
|
||||||
|
gene = opt$gene
|
||||||
|
infile_params = opt$data_file1
|
||||||
|
infile_metadata = opt$data_file2
|
||||||
|
|
||||||
|
# hardcoding when not using cmd
|
||||||
|
#drug = "streptomycin"
|
||||||
|
#gene = "gid"
|
||||||
|
|
||||||
|
if(is.null(drug)|is.null(gene)) {
|
||||||
|
stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)")
|
||||||
|
}
|
||||||
#===========
|
#===========
|
||||||
# input
|
# input
|
||||||
#===========
|
#===========
|
||||||
|
@ -29,8 +56,8 @@ import_dirs(drug, gene)
|
||||||
#---------------------------
|
#---------------------------
|
||||||
# call: plotting_data()
|
# call: plotting_data()
|
||||||
#---------------------------
|
#---------------------------
|
||||||
if (!exists("infile_params") && exists("gene")){
|
#if (!exists("infile_params") && exists("gene")){
|
||||||
#if (!is.character(infile_params) && exists("gene")){
|
if (!is.character(infile_params) && exists("gene")){
|
||||||
#in_filename_params = paste0(tolower(gene), "_all_params.csv")
|
#in_filename_params = paste0(tolower(gene), "_all_params.csv")
|
||||||
in_filename_params = paste0(tolower(gene), "_comb_afor.csv") # part combined for gid
|
in_filename_params = paste0(tolower(gene), "_comb_afor.csv") # part combined for gid
|
||||||
infile_params = paste0(outdir, "/", in_filename_params)
|
infile_params = paste0(outdir, "/", in_filename_params)
|
||||||
|
@ -38,14 +65,15 @@ if (!exists("infile_params") && exists("gene")){
|
||||||
}
|
}
|
||||||
|
|
||||||
# Input 1: read <gene>_comb_afor.csv
|
# Input 1: read <gene>_comb_afor.csv
|
||||||
pd_df = plotting_data(infile_params)
|
my_df = read.csv(infile_params, header = T)
|
||||||
|
pd_df = plotting_data(my_df)
|
||||||
my_df_u = pd_df[[1]] # this forms one of the input for combining_dfs_plotting()
|
my_df_u = pd_df[[1]] # this forms one of the input for combining_dfs_plotting()
|
||||||
|
|
||||||
#--------------------------------
|
#--------------------------------
|
||||||
# call: combining_dfs_plotting()
|
# call: combining_dfs_plotting()
|
||||||
#--------------------------------
|
#--------------------------------
|
||||||
if (!exists("infile_metadata") && exists("gene")){
|
#if (!exists("infile_metadata") && exists("gene")){
|
||||||
#if (!is.character(infile_params) && exists("gene")){{
|
if (!is.character(infile_params) && exists("gene")){
|
||||||
in_filename_metadata = paste0(tolower(gene), "_metadata.csv") # part combined for gid
|
in_filename_metadata = paste0(tolower(gene), "_metadata.csv") # part combined for gid
|
||||||
infile_metadata = paste0(outdir, "/", in_filename_metadata)
|
infile_metadata = paste0(outdir, "/", in_filename_metadata)
|
||||||
cat("\nInput file for gene metadata not specified, assuming filename: ", infile_metadata, "\n")
|
cat("\nInput file for gene metadata not specified, assuming filename: ", infile_metadata, "\n")
|
||||||
|
@ -63,7 +91,7 @@ all_plot_dfs = combining_dfs_plotting(my_df_u
|
||||||
, lig_dist_colname = 'ligand_distance'
|
, lig_dist_colname = 'ligand_distance'
|
||||||
, lig_dist_cutoff = 10)
|
, lig_dist_cutoff = 10)
|
||||||
|
|
||||||
merged_df2 = all_plot_dfs[[1]]
|
#merged_df2 = all_plot_dfs[[1]]
|
||||||
merged_df3 = all_plot_dfs[[2]]
|
merged_df3 = all_plot_dfs[[2]]
|
||||||
#merged_df2_comp = all_plot_dfs[[3]]
|
#merged_df2_comp = all_plot_dfs[[3]]
|
||||||
#merged_df3_comp = all_plot_dfs[[4]]
|
#merged_df3_comp = all_plot_dfs[[4]]
|
||||||
|
@ -146,6 +174,7 @@ wide_df_or <- logo_data_or %>% spread(position, or_mychisq, fill = 0.0)
|
||||||
|
|
||||||
wide_df_or = as.matrix(wide_df_or)
|
wide_df_or = as.matrix(wide_df_or)
|
||||||
rownames(wide_df_or) = wide_df_or[,1]
|
rownames(wide_df_or) = wide_df_or[,1]
|
||||||
|
dim(wide_df_or)
|
||||||
wide_df_or = wide_df_or[,-1]
|
wide_df_or = wide_df_or[,-1]
|
||||||
str(wide_df_or)
|
str(wide_df_or)
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue