added function lineage_plot_data.R and corresponding test script, also renamed corr_plot_df.R to corr_plot_data and corresponing test script
This commit is contained in:
parent
3d45780c1a
commit
d13484e8f5
4 changed files with 364 additions and 0 deletions
130
scripts/functions/corr_plot_data.R
Normal file
130
scripts/functions/corr_plot_data.R
Normal file
|
@ -0,0 +1,130 @@
|
|||
#!/usr/bin/env Rscript
|
||||
#########################################################
|
||||
# TASK: Script to format data for Correlation plots:
|
||||
|
||||
# corr_data_extract()
|
||||
# INPUT:
|
||||
# df: data with all parameters (my_use case)
|
||||
# merged_df3 or merged_df2!?
|
||||
# gene: [sanity check]
|
||||
# drug: relates to a column name that will need to extracted
|
||||
# ligand_dist_colname = LigDist_colname (variable from plotting_globals()
|
||||
|
||||
# colnames_to_extract = c("mutationinformation", "duet_affinity_change")
|
||||
# display_colnames_key = c(mutationinformation = "MUT" , duet_affinity_change = "DUET")
|
||||
# extract_scaled_cols = T or F, so that parameters with the _scaled suffix can be extracted.
|
||||
# NOTE*: No formatting applied to these cols i.e display name
|
||||
|
||||
# RETURNS: DF
|
||||
# containing all the columns required for generating downstream correlation plots
|
||||
|
||||
# TO DO: SHINY
|
||||
#1) Corr type?
|
||||
#2)
|
||||
##################################################################
|
||||
corr_data_extract <- function(df
|
||||
#, gene_name = gene
|
||||
, drug_name = drug
|
||||
, ligand_dist_colname = LigDist_colname
|
||||
, colnames_to_extract
|
||||
, colnames_display_key
|
||||
, extract_scaled_cols = F){
|
||||
|
||||
if ( missing(colnames_to_extract) || missing(colnames_display_key) ){
|
||||
#if ( missing(colnames_to_extract) ){
|
||||
|
||||
cat("\n=========================================="
|
||||
, "\nCORR PLOTS data: ALL params"
|
||||
, "\n=========================================")
|
||||
|
||||
cat("\nExtracting default columns for"
|
||||
#, "\nGene name:", gene
|
||||
, "\nDrug name:", drug)
|
||||
|
||||
colnames_to_extract = c(drug
|
||||
#, "mutationinformation"
|
||||
, "mutation_info_labels"
|
||||
, "duet_stability_change"
|
||||
, "ligand_affinity_change"
|
||||
#, "ligand_distance"
|
||||
, ligand_dist_colname
|
||||
, "ddg_foldx"
|
||||
, "deepddg"
|
||||
, "asa"
|
||||
, "rsa"
|
||||
, "kd_values"
|
||||
, "rd_values"
|
||||
, "af"
|
||||
, "log10_or_mychisq"
|
||||
, "neglog_pval_fisher"
|
||||
, "ddg_dynamut2"
|
||||
, "consurf_score"
|
||||
, "snap2_score"
|
||||
, "ddg_dynamut", "ddg_encom", "dds_encom", "ddg_mcsm", "ddg_sdm", "ddg_duet"
|
||||
, "mcsm_na_affinity"
|
||||
, "mcsm_ppi2_affinity"
|
||||
)
|
||||
|
||||
# [optional] arg: extract_scaled_cols
|
||||
if (extract_scaled_cols){
|
||||
cat("\nExtracting scaled columns as well...\n")
|
||||
all_scaled_cols = colnames(merged_df3)[grep(".*scaled", colnames(merged_df3))]
|
||||
colnames_to_extract = c(colnames_to_extract, all_scaled_cols)
|
||||
|
||||
}else{
|
||||
colnames_to_extract = colnames_to_extract
|
||||
}
|
||||
|
||||
corr_df = df[, colnames(df)%in%colnames_to_extract]
|
||||
|
||||
# arg: colnames_display_key
|
||||
colnames_display_key = c(duet_stability_change = "DUET"
|
||||
, ligand_affinity_change = "mCSM-lig"
|
||||
#, ligand_distance = "ligand_distance"
|
||||
#, ligand_dist_colname = "ligand_distance"
|
||||
, ddg_foldx = "FoldX"
|
||||
, deepddg = "DeepDDG"
|
||||
, asa = "ASA"
|
||||
, rsa = "RSA"
|
||||
, kd_values = "KD"
|
||||
, rd_values = "RD"
|
||||
, af = "MAF"
|
||||
, log10_or_mychisq = "Log (OR)"
|
||||
, neglog_pval_fisher = "-Log (P)"
|
||||
, ddg_dynamut2 = "Dynamut2"
|
||||
, consurf_score = "Consurf"
|
||||
, snap2_score = "SNAP2"
|
||||
, ddg_dynamut = "Dynamut"
|
||||
, ddg_encom = "ENCoM-DDG"
|
||||
, ddg_mcsm = "mCSM"
|
||||
, ddg_sdm = "SDM"
|
||||
, ddg_duet = "DUET-d"
|
||||
, dds_encom = "ENCoM-DDS"
|
||||
, mcsm_na_affinity = "mCSM-NA"
|
||||
, mcsm_ppi2_affinity = "mCSM-PPI2")
|
||||
|
||||
# COMMENT: This only works when all the columns are in the namekey vector.
|
||||
# If one is missing, there is no error, but it also renamed as "NA.
|
||||
#names(corr_df) <- colnames_display_key[names(corr_df)]
|
||||
|
||||
# Solution: to use plyr::rename()
|
||||
# Consider using requireNamespace() instead of library() so its function names doesn't collide with dplyr's.
|
||||
corr_df = plyr::rename(corr_df
|
||||
, replace = colnames_display_key
|
||||
, warn_missing = T
|
||||
, warn_duplicated = T)
|
||||
|
||||
cat("\nExtracted ncols:", ncol(corr_df)
|
||||
,"\nRenaming successful")
|
||||
|
||||
cat("\nSneak peak...")
|
||||
print(head(corr_df))
|
||||
|
||||
# Move drug column to the end
|
||||
last_col = colnames(corr_df[ncol(corr_df)])
|
||||
corr_df_f = corr_df %>% dplyr::relocate(all_of(drug), .after = last_col)
|
||||
|
||||
return(corr_df_f)
|
||||
}
|
||||
|
||||
}
|
208
scripts/functions/lineage_plot_data.R
Normal file
208
scripts/functions/lineage_plot_data.R
Normal file
|
@ -0,0 +1,208 @@
|
|||
#!/usr/bin/env Rscript
|
||||
#########################################################
|
||||
# TASK: Script to format data for lineage plots
|
||||
# Called by get_plotting_dfs.R
|
||||
|
||||
# lineage_plot_data()
|
||||
# INPUT:
|
||||
# df : merged_df2 (data with 1:many relationship b/w snp and lineage)
|
||||
# NOTE*: DO NOT use merged_df3 as it loses the 1:many relationship)
|
||||
# lineage_column_name : Column name that contains lineage info
|
||||
# remove_empty_lineage : where lineage info is missing, whether to omit those or not
|
||||
# lineage_label_col_name: Column containing pre-formatted lineage labels.
|
||||
# For my case, this is called "lineage_labels"
|
||||
# This column has short labels like L1, L2, L3, etc.
|
||||
# if this is left empty, then the lineage_column_name will be used
|
||||
# id_colname : sample-id column. Used to calculate SNP count
|
||||
# snp_colname : SNP column. Used to calculate SNP diversity
|
||||
|
||||
# RETURNS: List
|
||||
# WF and LF data for lineage-wise snp count and snp diversity
|
||||
|
||||
# TO DO: SHINY
|
||||
#1) remove empty positions
|
||||
#2) select lineages to display?
|
||||
#########################################################
|
||||
|
||||
lineage_plot_data <- function(df
|
||||
, lineage_column_name = "lineage"
|
||||
, remove_empty_lineage = T
|
||||
, lineage_label_col_name = "lineage_labels"
|
||||
, id_colname = "id"
|
||||
, snp_colname = "mutationinformation"){
|
||||
|
||||
################################################################
|
||||
# Get WF and LF data with lineage count, and snp diversity
|
||||
################################################################
|
||||
# Initialise output list
|
||||
lineage_dataL = list(
|
||||
lin_wf = data.frame()
|
||||
, lin_lf = data.frame())
|
||||
|
||||
table(df[[lineage_column_name]])
|
||||
|
||||
#------------------------
|
||||
# Check lineage counts
|
||||
# Including missing
|
||||
#------------------------
|
||||
if (missing(remove_empty_lineage)){
|
||||
|
||||
miss_ll = table(df[[lineage_column_name]] == "")[[2]]
|
||||
rm_ll = which(df[[lineage_column_name]] == "")
|
||||
|
||||
if (length(rm_ll) == miss_ll){
|
||||
cat("\nNo. of samples with missing lineage classification:"
|
||||
, miss_ll
|
||||
, "Removing these...")
|
||||
df = df[-rm_ll,]
|
||||
df = droplevels(df)
|
||||
}else{
|
||||
cat("\nSomething went wrong...numbers mismatch"
|
||||
, "samples with missing lineages:", mis_all
|
||||
, "No. of corresponding indices to remove:", rm_ll)
|
||||
}
|
||||
}else{
|
||||
df = df
|
||||
df = droplevels(df)
|
||||
}
|
||||
|
||||
#------------------------
|
||||
# Lineage labels column
|
||||
#------------------------
|
||||
if (lineage_label_col_name == ""){
|
||||
cat("\nLineage label column missing..."
|
||||
, "\nUsing the column:" , lineage_column_name, "as labels as well")
|
||||
lin_labels = lineage_column_name
|
||||
|
||||
#------------------------------------------
|
||||
if ( !is.factor((df[[lin_labels]])) ){
|
||||
df[lin_labels] = as.factor(df[lin_labels])
|
||||
df[lin_labels] = factor()
|
||||
}else{
|
||||
cat("\nLineage label column already factor")
|
||||
}
|
||||
#------------------------------------------
|
||||
}else{
|
||||
#lin_labels = "lineage_labels"
|
||||
lin_labels = lineage_label_col_name
|
||||
cat("\nLineage label column present"
|
||||
, "\nUsing it, column name:", lin_labels)
|
||||
#------------------------------------------
|
||||
if ( !is.factor((df[[lin_labels]])) ){
|
||||
df[lin_labels] = as.factor(df[lin_labels])
|
||||
}else{
|
||||
cat("\nLineage label already factor")
|
||||
}
|
||||
#------------------------------------------
|
||||
}
|
||||
|
||||
# This is how lineage labels will appear
|
||||
cat("\nLineage labels will appear as below\n")
|
||||
print( table(df[[lin_labels]]) )
|
||||
cat("\n")
|
||||
cat( "Class of", lin_labels, ":", class(df[[lin_labels]]) )
|
||||
cat("\n")
|
||||
print( "No. of levels:", nlevels(df[[lin_labels]]) )
|
||||
|
||||
#==========================================
|
||||
# WF data: lineages with
|
||||
# snp count
|
||||
# total_samples
|
||||
# snp diversity (perc)
|
||||
#==========================================
|
||||
cat("\nCreating WF Lineage data...")
|
||||
|
||||
sel_lineages = levels(df[[lin_labels]])
|
||||
|
||||
lin_wf = data.frame(sel_lineages) #4, 1
|
||||
total_snps_u = NULL
|
||||
total_samples = NULL
|
||||
|
||||
for (i in sel_lineages){
|
||||
#print(i)
|
||||
curr_total = length(unique(df[[id_colname]])[df[[lin_labels]]==i])
|
||||
#print(curr_total)
|
||||
total_samples = c(total_samples, curr_total)
|
||||
print(total_samples)
|
||||
|
||||
foo = df[df[[lin_labels]]==i,]
|
||||
print(paste0(i, "=======\n"))
|
||||
print(length(unique(foo[[snp_colname]])))
|
||||
curr_count = length(unique(foo[[snp_colname]]))
|
||||
|
||||
total_snps_u = c(total_snps_u, curr_count)
|
||||
}
|
||||
|
||||
lin_wf
|
||||
|
||||
# Add these counts as columns to the df
|
||||
lin_wf$num_snps_u = total_snps_u
|
||||
lin_wf$total_samples = total_samples
|
||||
lin_wf
|
||||
|
||||
#----------------------
|
||||
# Add SNP diversity
|
||||
#----------------------
|
||||
lin_wf$snp_diversity = lin_wf$num_snps_u/lin_wf$total_samples
|
||||
lin_wf
|
||||
|
||||
#----------------------
|
||||
# Add some formatting
|
||||
#----------------------
|
||||
# SNP diversity
|
||||
lin_wf$snp_diversity_f = round( (lin_wf$snp_diversity * 100), digits = 0)
|
||||
lin_wf$snp_diversity_f = paste0(lin_wf$snp_diversity_f, "%")
|
||||
|
||||
# should be as you like it to appear
|
||||
lin_wf$sel_lineages
|
||||
|
||||
# Important: Relevel factors so that x-axis categ appear as you want
|
||||
#lin_lf$sel_lineages = factor(lin_lf$sel_lineages, c())
|
||||
#levels(lin_lf$sel_lineages)
|
||||
|
||||
lineage_dataL[['lin_wf']] = lin_wf
|
||||
|
||||
cat("\nCOMPLETED: Successfully created WF lineage data")
|
||||
|
||||
#=================================
|
||||
# LF data: lineages with
|
||||
# snp count
|
||||
# total_samples
|
||||
# snp diversity (perc)
|
||||
#=================================
|
||||
cat("\nCreating LF Lineage data...")
|
||||
|
||||
names(lin_wf)
|
||||
tot_cols = ncol(lin_wf)
|
||||
pivot_cols = c("sel_lineages", "snp_diversity", "snp_diversity_f")
|
||||
pivot_cols_n = length(pivot_cols)
|
||||
|
||||
expected_rows = nrow(lin_wf) * ( length(lin_wf) - pivot_cols_n )
|
||||
|
||||
lin_lf <- gather(lin_wf
|
||||
, count_categ
|
||||
, p_count
|
||||
, num_snps_u:total_samples
|
||||
, factor_key = TRUE)
|
||||
lin_lf
|
||||
|
||||
# quick checks
|
||||
if ( nrow(lin_lf ) == expected_rows ){
|
||||
cat("\nPASS: Lineage LF data created"
|
||||
, "\nnrow: ", nrow(lin_lf)
|
||||
, "\nncol: ", ncol(lin_lf))
|
||||
} else {
|
||||
cat("\nFAIL: numbers mismatch"
|
||||
, "\nExpected nrow: ", expected_rows)
|
||||
}
|
||||
|
||||
# Important: Relevel factors so that x-axis categ appear as you want
|
||||
#lin_lf$sel_lineages = factor(lin_lf$sel_lineages, c())
|
||||
#levels(lin_lf$sel_lineages)
|
||||
|
||||
lineage_dataL[['lin_lf']] = lin_lf
|
||||
|
||||
cat("\nCOMPLETED: Successfully created LF lineage data")
|
||||
return(lineage_dataL)
|
||||
# end bracket
|
||||
}
|
7
scripts/functions/tests/test_corr_plot_data.R
Normal file
7
scripts/functions/tests/test_corr_plot_data.R
Normal file
|
@ -0,0 +1,7 @@
|
|||
#!/usr/bin/env Rscript
|
||||
source("~/git/LSHTM_analysis/config/gid.R")
|
||||
source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
|
||||
|
||||
m3 = corr_data_extract(merged_df3); head(m3)
|
||||
m2 = corr_data_extract(meregd_df2); head(m2)
|
||||
m3S = corr_data_extract(merged_df3, extract_scaled_cols = T); head(m3S)
|
19
scripts/functions/tests/test_lineage_plot_data.R
Normal file
19
scripts/functions/tests/test_lineage_plot_data.R
Normal file
|
@ -0,0 +1,19 @@
|
|||
#!/usr/bin/env Rscript
|
||||
#source("~/git/LSHTM_analysis/config/alr.R")
|
||||
#source("~/git/LSHTM_analysis/config/embb.R")
|
||||
source("~/git/LSHTM_analysis/config/gid.R")
|
||||
#source("~/git/LSHTM_analysis/config/katg.R")
|
||||
#source("~/git/LSHTM_analysis/config/pnca.R")
|
||||
#source("~/git/LSHTM_analysis/config/rpob.R")
|
||||
|
||||
source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
|
||||
#####################################################################
|
||||
lineage_dfL = lineage_plot_data(df = merged_df2
|
||||
, lineage_column_name = "lineage"
|
||||
, remove_empty_lineage = F
|
||||
, lineage_label_col_name = "lineage_labels"
|
||||
, id_colname = "id"
|
||||
, snp_colname = "mutationinformation"
|
||||
)
|
||||
lin_wf = lineage_dfL[['lin_wf']]
|
||||
lin_lf = lineage_dfL[['lin_lf']]
|
Loading…
Add table
Add a link
Reference in a new issue