changes made to combining_dfs_plotting.R

This commit is contained in:
Tanushree Tunstall 2021-06-23 16:15:15 +01:00
parent 4f4734f565
commit 8277b489d6
9 changed files with 258 additions and 126 deletions

View file

@ -2,11 +2,12 @@
###########################################################
# TASK: To combine mcsm combined file and meta data.
# This script is sourced from other .R scripts for plotting.
# This script is sourced by other .R scripts for plotting.
###########################################################
# load libraries and functions
#source("Header_TT.R")
#==========================================================
# combining_dfs_plotting():
@ -34,28 +35,7 @@ combining_dfs_plotting <- function( my_df_u
, gene_metadata
, lig_dist_colname = 'ligand_distance'
, lig_dist_cutoff = 10){
# #======================================
# # 1: Read file: <gene>_meta data.csv
# #======================================
# cat("\nReading meta data file:", df1_mcsm_comb)
#
# my_df_u <- read.csv(df1_mcsm_comb
# , stringsAsFactors = F
# , header = T)
# cat("\nDim:", dim(my_df_u))
#
# #======================================
# # 2: Read file: <gene>_meta data.csv
# #======================================
# cat("\nReading meta data file:", df2_gene_metadata)
#
# gene_metadata <- read.csv(df2_gene_metadata
# , stringsAsFactors = F
# , header = T)
# cat("\nDim:", dim(gene_metadata))
#
# table(gene_metadata$mutation_info)
# counting NAs in AF, OR cols
# or_mychisq
if (identical(sum(is.na(my_df_u$or_mychisq))
@ -219,7 +199,6 @@ combining_dfs_plotting <- function( my_df_u
, "\nNA in AF:", sum(is.na(merged_df3$af_kin)))
}
#===================================================
# Merge3: merged_df2_comp
# same as merge 1 but excluding NAs from ORs, etc.
@ -273,6 +252,13 @@ combining_dfs_plotting <- function( my_df_u
# compare dfs: foo and merged_df3_com
all.equal(foo, bar)
#summary(comparedf(foo, bar))
cat("\n------------------------"
, "\nSummary of created dfs:"
, "\n------------------------"
, "\n1) Dim of merged_df2: " , nrow(merged_df2), "," , ncol(merged_df2)
, "\n2) Dim of merged_df2_comp: " , nrow(merged_df2_comp), "," , ncol(merged_df2_comp)
, "\n3) Dim of merged_df3: " , nrow(merged_df3), "," , ncol(merged_df3)
, "\n4) Dim of merged_df3_comp: " , nrow(merged_df3_comp), "," , ncol(merged_df3_comp))
#####################################################################
# Combining: LIG
@ -281,13 +267,35 @@ combining_dfs_plotting <- function( my_df_u
#============
# Merges 5-8
#============
df_lig = my_df_u[my_df_u[[lig_dist_colname]]<lig_dist_cutoff,]
cat("\n=========================================="
, "\nStarting filtering for mcsm ligand df"
, "\n===========================================")
merged_df2_lig = merged_df2[merged_df2$ligand_distance<lig_dist_cutoff,]
merged_df2_comp_lig = merged_df2_comp[merged_df2_comp$ligand_distance<lig_dist_cutoff,]
if (lig_dist_colname%in%names(my_df_u)){
cat("\nFiltering column: ", lig_dist_colname
, "\nCut off criteria: ", lig_dist_cutoff, "Angstroms")
df_lig = my_df_u[my_df_u[[lig_dist_colname]] < lig_dist_cutoff,]
merged_df3_lig = merged_df3[merged_df3$ligand_distance<lig_dist_cutoff,]
merged_df3_comp_lig = merged_df3_comp[merged_df3_comp$ligand_distance<lig_dist_cutoff,]
#merged_df2_lig = merged_df2[merged_df2$ligand_distance<lig_dist_cutoff,]
merged_df2_lig = merged_df2[merged_df2[[lig_dist_colname]] < lig_dist_cutoff,]
dim(merged_df2_lig)
merged_df2_comp_lig = merged_df2_comp[merged_df2_comp[[lig_dist_colname]] < lig_dist_cutoff,]
merged_df3_lig = merged_df3[merged_df3[[lig_dist_colname]] < lig_dist_cutoff,]
merged_df3_comp_lig = merged_df3_comp[merged_df3_comp[[lig_dist_colname]] < lig_dist_cutoff,]
cat("\n------------------------"
, "\nSummary of created ligand dfs:"
, "\n------------------------"
, "\n1) Dim of merged_df2_lig: " , nrow(merged_df2_lig), "," , ncol(merged_df2_lig)
, "\n2) Dim of merged_df2_comp_lig: " , nrow(merged_df2_comp_lig), "," , ncol(merged_df2_comp_lig)
, "\n3) Dim of merged_df3_lig: " , nrow(merged_df3_lig), "," , ncol(merged_df3_lig)
, "\n4) Dim of merged_df3_comp_lig: " , nrow(merged_df3_comp_lig), "," , ncol(merged_df3_comp_lig))
} else {
cat("\nFiltering column: ", lig_dist_colname, " not found\n")
}
#quit()
# sanity check
if (nrow(merged_df3_lig) == nrow(df_lig)){
@ -297,7 +305,7 @@ combining_dfs_plotting <- function( my_df_u
, "\nExpected:", nrow(df_lig)
, "\nGot:", nrow(merged_df3_lig)))
}
#==============================================================
############################################
@ -327,8 +335,13 @@ combining_dfs_plotting <- function( my_df_u
# , "\nNo. of cols: ", ncol(get(i)), "\n")
#}
return(list(merged_df2, merged_df3
, merged_df2_comp, merged_df3_comp
, merged_df2_lig, merged_df3_lig))
return(list( merged_df2
, merged_df3
, merged_df2_comp
, merged_df3_comp
, merged_df2_lig
, merged_df3_lig
, merged_df2_comp_lig
, merged_df3_comp_lig))
}

View file

@ -16,20 +16,18 @@ library(dplyr)
## my_df_u_lig
## dup_muts
#========================================================
plotting_data <- function(infile_params, mcsm_lig_cutoff = 10) {
plotting_data <- function(df, lig_dist_colname = 'ligand_distance', lig_dist_cutoff = 10) {
my_df = data.frame()
my_df_u = data.frame()
my_df_u_lig = data.frame()
dup_muts = data.frame()
cat(paste0("\nInput file to prepare for plotting:", infile_params, "\n") )
#===========================
# Read file: struct params
#===========================
my_df = read.csv(infile_params, header = T)
#df = read.csv(infile_params, header = T)
cat("\nInput dimensions:", dim(my_df))
cat("\nInput dimensions:", dim(df))
#==================================
# add foldx outcome category
@ -43,17 +41,17 @@ cat("\nInput dimensions:", dim(my_df))
# adding foldx scaled values
# scale data b/w -1 and 1
#------------------------------
n = which(colnames(my_df) == "ddg"); n
n = which(colnames(df) == "ddg"); n
my_min = min(my_df[,n]); my_min
my_max = max(my_df[,n]); my_max
my_min = min(df[,n]); my_min
my_max = max(df[,n]); my_max
my_df$foldx_scaled = ifelse(my_df[,n] < 0
, my_df[,n]/abs(my_min)
, my_df[,n]/my_max)
df$foldx_scaled = ifelse(df[,n] < 0
, df[,n]/abs(my_min)
, df[,n]/my_max)
# sanity check
my_min = min(my_df$foldx_scaled); my_min
my_max = max(my_df$foldx_scaled); my_max
my_min = min(df$foldx_scaled); my_min
my_max = max(df$foldx_scaled); my_max
if (my_min == -1 && my_max == 1){
cat("\nPASS: foldx ddg successfully scaled b/w -1 and 1"
@ -67,9 +65,9 @@ if (my_min == -1 && my_max == 1){
# adding foldx outcome category
# ddg<0 = "Stabilising" (-ve)
#------------------------------
c1 = table(my_df$ddg < 0)
my_df$foldx_outcome = ifelse(my_df$ddg < 0, "Stabilising", "Destabilising")
c2 = table(my_df$ddg < 0)
c1 = table(df$ddg < 0)
df$foldx_outcome = ifelse(df$ddg < 0, "Stabilising", "Destabilising")
c2 = table(df$ddg < 0)
if ( all(c1 == c2) ){
cat("\nPASS: foldx outcome successfully created")
@ -83,19 +81,19 @@ if ( all(c1 == c2) ){
#==================================
# check for duplicate mutations
if ( length(unique(my_df$mutationinformation)) != length(my_df$mutationinformation)){
if ( length(unique(df$mutationinformation)) != length(df$mutationinformation)){
cat(paste0("\nCAUTION:", " Duplicate mutations identified"
, "\nExtracting these...\n"))
#cat(my_df[duplicated(my_df$mutationinformation),])
dup_muts = my_df[duplicated(my_df$mutationinformation),]
dup_muts = df[duplicated(df$mutationinformation),]
dup_muts_nu = length(unique(dup_muts$mutationinformation))
cat(paste0("\nDim of duplicate mutation df:", nrow(dup_muts)
, "\nNo. of unique duplicate mutations:", dup_muts_nu
, "\n\nExtracting df with unique mutations only\n"))
my_df_u = my_df[!duplicated(my_df$mutationinformation),]
my_df_u = df[!duplicated(df$mutationinformation),]
}else{
cat(paste0("\nNo duplicate mutations detected\n"))
my_df_u = my_df
my_df_u = df
}
upos = unique(my_df_u$position)
@ -105,15 +103,14 @@ cat("\nNo. of unique mutational positions:"); cat(length(upos), "\n")
#===============================================
# extract mutations <10 Angstroms and symbol
#===============================================
table(my_df_u$ligand_distance<mcsm_lig_cutoff)
table(my_df_u$ligand_distance<lig_dist_cutoff)
my_df_u_lig = my_df_u[my_df_u$ligand_distance <mcsm_lig_cutoff,]
my_df_u_lig = my_df_u[my_df_u$ligand_distance <lig_dist_cutoff,]
cat(paste0("There are ", nrow(my_df_u_lig), " sites lying within 10\u212b of the ligand\n"))
# return list of DFs
#return(list(my_df, my_df_u, my_df_u_lig, dup_muts))
my_df = df
#df_names = c("my_df", "my_df_u", "my_df_u_lig", "dup_muts")
all_df = list(my_df, my_df_u, my_df_u_lig, dup_muts)
#all_df = Map(setNames, all_df, df_names)

View file

@ -10,21 +10,21 @@
# input args: 'drug' and 'gene'
# output: dir names for input and output files
import_dirs <- function(drug, gene) {
import_dirs <- function(drug_name, gene_name) {
#============================
# directories and variables
#============================
datadir <<- paste0("~/git/Data/")
indir <<- paste0(datadir, drug, "/input")
outdir <<- paste0("~/git/Data/", drug, "/output")
plotdir <<- paste0("~/git/Data/", drug, "/output/plots")
indir <<- paste0(datadir, drug_name, "/input")
outdir <<- paste0("~/git/Data/", drug_name, "/output")
plotdir <<- paste0("~/git/Data/", drug_name, "/output/plots")
dr_muts_col <<- paste0('dr_mutations_', drug)
other_muts_col <<- paste0('other_mutations_', drug)
dr_muts_col <<- paste0('dr_mutations_', drug_name)
other_muts_col <<- paste0('other_mutations_', drug_name)
resistance_col <<- "drtype"
gene_match <<- paste0(gene,"_p.")
gene_match <<- paste0(gene_name,"_p.")
}
@ -37,20 +37,17 @@ import_dirs <- function(drug, gene) {
#==================
# Angstroms symbol
#==================
angstroms_symbol <<- "\u212b"
#cat(paste0("There are ", nrow(my_df_u_lig), " sites lying within 10", angstroms_symbol, " of the ligand\n"))
#===============
# Delta symbol
#===============
delta_symbol <<- "\u0394"; delta_symbol
#==========
# Colours
#==========
mcsm_red2 <<- "#ae301e" # most negative
mcsm_red1 <<- "#f8766d"
@ -58,3 +55,4 @@ mcsm_mid <<- "white" # middle
mcsm_blue1 <<- "#00bfc4"
mcsm_blue2 <<- "#007d85" # most positive
#########################################################

View file

@ -7,6 +7,7 @@ getwd()
#===========================================
source("plotting_data.R")
infile = "/home/tanu/git/Data/streptomycin/output/gid_comb_stab_struc_params.csv"
pd_df = plotting_data(infile)
my_df = pd_df[[1]]
my_df_u = pd_df[[2]]

View file

@ -1,4 +1,4 @@
# #!/usr/bin/env Rscript
#!/usr/bin/env Rscript
# working dir and loading libraries
getwd()
@ -33,14 +33,20 @@ source("plotting_globals.R")
source("plotting_data.R")
source("combining_dfs_plotting.R")
gene = 'gid'
drug = 'streptomycin'
#---------------------
# call: import_dirs()
#---------------------
import_dirs(drug, gene)
gene = 'gid'
drug = 'streptomycin'
import_dirs(drug_name = drug, gene_name = gene)
#-------------------------------
# test function: plotting_data()
#-------------------------------
#============================
# Input 1: plotting_data()
#============================
if (!exists("infile_params") && exists("gene")){
#if (!is.character(infile_params) && exists("gene")){
#in_filename_params = paste0(tolower(gene), "_all_params.csv")
@ -49,6 +55,13 @@ if (!exists("infile_params") && exists("gene")){
cat("\nInput file for mcsm comb data not specified, assuming filename: ", infile_params, "\n")
}
mcsm_comb_data = read.csv(infile_params, header = T)
pd_df = plotting_data(df = mcsm_comb_data, lig_dist_cutoff = 10)
my_df_u = pd_df[[2]]
#======================================
# Input 2: read <gene>_meta data.csv
#======================================
if (!exists("infile_metadata") && exists("gene")){
#if (!is.character(infile_params) && exists("gene")){{
in_filename_metadata = paste0(tolower(gene), "_metadata.csv") # part combined for gid
@ -56,18 +69,6 @@ if (!exists("infile_metadata") && exists("gene")){
cat("\nInput file for gene metadata not specified, assuming filename: ", infile_metadata, "\n")
}
#============================
# Input 1: plotting_data()
#============================
#---------------------
# call: plotting_data()
#---------------------
pd_df = plotting_data(infile_params)
my_df = pd_df[[1]] # this forms one of the input for combining_dfs_plotting()
#======================================
# Input 2: read <gene>_meta data.csv
#======================================
cat("\nReading meta data file:", infile_metadata)
gene_metadata <- read.csv(infile_metadata
@ -79,9 +80,11 @@ all_plot_dfs = combining_dfs_plotting(my_df_u
, lig_dist_colname = 'ligand_distance'
, lig_dist_cutoff = 10)
merged_df2 = all_plot_dfs[[1]]
merged_df3 = all_plot_dfs[[2]]
merged_df2_comp = all_plot_dfs[[3]]
merged_df3_comp = all_plot_dfs[[4]]
merged_df2_lig = all_plot_dfs[[5]]
merged_df3_lig = all_plot_dfs[[6]]
merged_df2 = all_plot_dfs[[1]]
merged_df3 = all_plot_dfs[[2]]
merged_df2_comp = all_plot_dfs[[3]]
merged_df3_comp = all_plot_dfs[[4]]
merged_df2_lig = all_plot_dfs[[5]]
merged_df3_lig = all_plot_dfs[[6]]
merged_df2_comp_lig = all_plot_dfs[[7]]
merged_df3_comp_lig = all_plot_dfs[[8]]