moved all test scripts for functions to tests/

This commit is contained in:
Tanushree Tunstall 2021-09-09 13:12:07 +01:00
parent 2ee66c770b
commit 03031d2eb6
15 changed files with 162 additions and 776 deletions

View file

@ -0,0 +1,63 @@
#!/usr/bin/env Rscript
# Test harness (setup part) for the amino-acid property barplot.
# Loads packages, project helpers, the plotting data frames and a
# user-selected sample file of mutations with their properties.

library(ggplot2)
library(tidyverse)
library(data.table)

setwd("~/git/LSHTM_analysis/scripts/functions/")
getwd()

#############################################################
# Data, directories and hard-coded variables used by the tests
#############################################################
source("plotting_data.R")

# NOTE(review): a directory path is handed to plotting_data() here;
# confirm the function accepts a path rather than a data frame.
infile <- "/home/tanu/git/Data/streptomycin/output/"
pd_df <- plotting_data(infile)

# plotting_data() returns a list; unpack its first four elements
my_df       <- pd_df[[1]]
my_df_u     <- pd_df[[2]]
my_df_u_lig <- pd_df[[3]]
dup_muts    <- pd_df[[4]]

source("../plotting_globals.R")
drug <- "streptomycin"
gene <- "gid"
import_dirs(drug, gene)

#=====================
# Functions under test
#=====================
# NOTE(review): aa_prop_bp() is called later in this script; presumably
# it is defined in one of these two files -- verify.
source("stability_count_bp.R")
source("position_count_bp.R")

#################################################################
# Sample file containing mutations and their properties
#################################################################
# The file is picked by the user at run time
df <- read.csv(file.choose())

# Add the number of rows per position as a new column (by reference)
setDT(df)[, pos_count := .N, by = .(position)]
foo <- data.frame(df$position, df$pos_count)

#snpsBYpos_df <- df %>%
# group_by(position) %>%
# summarize(snpsBYpos = mean(pos_count))

# One row per position: keep only the first occurrence of each
df2 <- df[!duplicated(df$position)]
##################################################################
# ---------------------------------------
# Barplot for nsSNPs, coloured by amino-acid property
# ---------------------------------------
# Inputs for the call: column holding the position, column holding the
# property used for the fill, the fill colours and the legend title.
pos_colname <- "position"
aa_prop_colname <- "mut_prop_water"
aa_prop_colours <- c("black", "blue")
my_legname <- "aa_prop: water"

# Call the function under test.
# The fill colours are passed via aa_prop_colours (defined just above);
# the earlier call referenced aa_prop_cols, which is never defined and
# fails with "object not found" as soon as the argument is evaluated.
aa_prop_bp(plotdf = df
           , position_colname = pos_colname
           , fill_colname = aa_prop_colname
           , fill_colours = aa_prop_colours
           , leg_name = my_legname)
#===============================================================

View file

@ -0,0 +1,59 @@
#!/usr/bin/env Rscript
#########################################################
# TASK: Test my_afor(), which calculates Allele Frequency
# and Odds Ratio from the master data
#########################################################
# load libraries
#source("Header_TT.R")
require("getopt", quietly = TRUE) # cmd parse arguments

# working dir and loading libraries
getwd()
setwd("~/git/LSHTM_analysis/scripts/functions/")
getwd()

# load functions
source("plotting_globals.R")
source("mychisq_or.R")
source("myaf_or_calcs.R")

# cmd options + sensible defaults
drug <- "streptomycin"
gene <- "gid"

# call function: sets up the directory globals used below
# (datadir, outdir) and the mutation column names
import_dirs(drug, gene)

# input file 1: master data
#in_filename_master = 'original_tanushree_data_v2.csv' #19K
in_filename_master <- 'mtb_gwas_meta_v6.csv' #35k
# NOTE(review): no "/" separator is inserted here, unlike the outdir
# paths below -- presumably datadir ends with a slash; confirm.
infile_master <- paste0(datadir, in_filename_master)
# Every status message ends with a newline so that consecutive
# messages do not run together on a single output line.
cat(paste0('Reading infile1: raw data', ' ', infile_master), "\n")

# input file 2: gene associated meta data file to extract valid snps and add calcs to.
# This is outfile_metadata from data_extraction.py
in_filename_metadata <- paste0(tolower(gene), '_metadata.csv')
infile_metadata <- paste0(outdir, '/', in_filename_metadata)
cat(paste0('Reading input file 2 i.e gene associated metadata:', infile_metadata), "\n")

# out_filename_af_or = paste0(tolower(gene), '_meta_data_with_AF_OR.csv')
out_filename_af_or <- paste0(tolower(gene), '_af_or.csv')
outfile_af_or <- paste0(outdir, '/', out_filename_af_or)
cat(paste0('Output file with full path:', outfile_af_or), "\n")

cat("master data:", infile_master, "\n")
cat("gene data:", infile_metadata, "\n")

# Printing these also fails early if the globals are missing
dr_muts_col # comes from global (dr_mutations_<drug>)
other_muts_col # comes from global (other_mutations_<drug>)

#################################################
# Run the function under test
#################################################
my_afor ( infile_master
          , infile_metadata
          , outfile = outfile_af_or
          #, outfile = "FOO_TEST.csv"
          , drug
          , gene
          , idcol = "id"
          , dr_muts_col
          , other_muts_col
)

View file

@ -0,0 +1,113 @@
#!/usr/bin/env Rscript
# Test harness (setup part) for stability_count_bp() and
# site_snp_count_bp(): loads the data frames the plots are built from.
setwd("~/git/LSHTM_analysis/scripts/functions/")
getwd()

#############################################################
#===========================================
# load functions, data, dirs, hardcoded vars
# that will be used in testing the functions
#===========================================
drug <- "streptomycin"
gene <- "gid"

# The plot sections further down write to paths built from plotdir,
# which nothing else in this script defines. Load the project globals
# so the script also runs in a fresh session.
# NOTE(review): import_dirs() is expected to create plotdir -- confirm.
source("plotting_globals.R")
import_dirs(drug, gene)

source("plotting_data.R")

infile <- paste0("~/git/Data/", drug, "/output/", gene, "_comb_stab_struc_params.csv")
infile_df <- read.csv(infile)

# Ligand distance cutoff used to build the ligand subset
lig_dist <- 5

# NOTE(review): the distance-column argument is spelled
# lig_dist_colname here; confirm it matches the parameter name in
# plotting_data().
pd_df <- plotting_data(infile_df
                       , lig_dist_colname = 'ligand_distance'
                       , lig_dist_cutoff = lig_dist)

# plotting_data() returns a list; unpack its first four elements
my_df <- pd_df[[1]]
my_df_u <- pd_df[[2]]
my_df_u_lig <- pd_df[[3]]
dup_muts <- pd_df[[4]]

#=====================
# functions to test
#=====================
source("stability_count_bp.R")
source("position_count_bp.R")
##################################################################
# Tests for stability_count_bp(): one SVG file per outcome column.
# Each section builds the output path, opens an svg device, draws the
# plot and closes the device again.
##################################################################
# ------------------------------
# barplot for DUET stability outcome
# ------------------------------
basic_bp_duet <- paste0(tolower(gene), "_basic_barplot_PS.svg")
plot_basic_bp_duet <- paste0(plotdir, "/", basic_bp_duet)

svg(plot_basic_bp_duet)
# Report the full output path, as the other sections of this script do
print(paste0("plot filename:", plot_basic_bp_duet))

# function only
stability_count_bp(plotdf = my_df_u
                   , df_colname = "duet_outcome"
                   , leg_title = "DUET outcome"
                   , label_categories = c("Destabilising", "Stabilising")
                   , leg_position = "top")
dev.off()

# ------------------------------
# barplot for ligand affinity
# ------------------------------
basic_bp_ligand <- paste0(tolower(gene), "_basic_barplot_LIG.svg")
plot_basic_bp_ligand <- paste0(plotdir, "/", basic_bp_ligand)

svg(plot_basic_bp_ligand)
print(paste0("plot filename:", plot_basic_bp_ligand))

# function only
# The axis label is built from lig_dist, the cutoff that was used to
# create my_df_u_lig. It is deliberately NOT reassigned here: overriding
# it just for the label made the plot report a cutoff that differed
# from the one applied to the data.
stability_count_bp(plotdf = my_df_u_lig
                   , df_colname = "ligand_outcome"
                   , leg_title = "Ligand outcome"
                   , yaxis_title = paste0("Number of nsSNPs\nLigand dist: <", lig_dist, "\u212b")
                   #, bp_plot_title = "Sites < 10 Ang of ligand"
)
dev.off()

# ------------------------------
# barplot for foldX
# ------------------------------
basic_bp_foldx <- paste0(tolower(gene), "_basic_barplot_foldx.svg")
plot_basic_bp_foldx <- paste0(plotdir, "/", basic_bp_foldx)

svg(plot_basic_bp_foldx)
print(paste0("plot filename:", plot_basic_bp_foldx))

stability_count_bp(plotdf = my_df_u
                   , df_colname = "foldx_outcome"
                   , leg_title = "FoldX outcome")
dev.off()
#===============================================================
# Tests for site_snp_count_bp(): nsSNP site counts written as SVG,
# once for the full data set and once for the ligand subset.
#===============================================================

# ---- all sites ----
pos_count_duet <- paste0(tolower(gene), "_position_count_PS.svg")
plot_pos_count_duet <- paste0(plotdir, "/", pos_count_duet)

svg(filename = plot_pos_count_duet)
print(paste0("plot filename:", plot_pos_count_duet))
# function only
site_snp_count_bp(
  plotdf = my_df_u,
  df_colname = "position"
)
dev.off()

# ---- sites within the ligand-distance cutoff ----
pos_count_ligand <- paste0(tolower(gene), "_position_count_LIG.svg")
plot_pos_count_ligand <- paste0(plotdir, "/", pos_count_ligand)

svg(filename = plot_pos_count_ligand)
print(paste0("plot filename:", plot_pos_count_ligand))
# function only
site_snp_count_bp(
  plotdf = my_df_u_lig,
  df_colname = "position"
)
dev.off()
#===============================================================

View file

@ -0,0 +1,62 @@
# Test for lin_count_bp(): lineage vs SNP count, using the long-format
# lineage data (lin_lf) provided by get_plotting_dfs.R.
setwd("~/git/LSHTM_analysis/scripts/plotting")
source ('get_plotting_dfs.R')
source("../functions/bp_lineage.R")

#########################################
# Lineage and SNP count: lineage lf data
#########################################
#=========================
# Data: All lineages or
# selected few
#=========================
# Taking every factor level selects all lineages; replace with a
# subset of level names to plot only a few.
sel_lineages <- levels(lin_lf$sel_lineages_f)
sel_lineages

lin_lf_plot <- lin_lf[lin_lf$sel_lineages_f %in% sel_lineages, ]

# drop unused factor levels
lin_lf_plot$sel_lineages_f <- factor(lin_lf_plot$sel_lineages_f)
levels(lin_lf_plot$sel_lineages_f)

#=========================
# Lineage count plot
#=========================
# Logical flags are spelled out as FALSE: T and F are ordinary
# variables that can be reassigned, TRUE and FALSE cannot.
lin_count_bp(lin_lf_plot
             , x_categ = "sel_lineages_f"
             , y_count = "p_count"
             , bar_fill_categ = "count_categ"
             , display_label_col = "p_count"
             , bar_stat_stype = "identity"
             , x_lab_angle = 90
             , my_xats = 20
             , bar_col_labels = c("Mutations", "Total Samples")
             , bar_col_values = c("grey50", "gray75")
             , y_scale_percent = FALSE # TRUE for diversity
             , y_log10 = FALSE
             , y_label = "Count")
###############################################
# Lineage SNP diversity count: lineage wf data
###############################################
#=========================
# Data: All lineages or
# selected few
#=========================
# Taking every factor level selects all lineages; replace with a
# subset of level names to plot only a few.
sel_lineages <- levels(lin_wf$sel_lineages_f)
sel_lineages

lin_wf_plot <- lin_wf[lin_wf$sel_lineages_f %in% sel_lineages, ]

# drop unused factor levels
lin_wf_plot$sel_lineages_f <- factor(lin_wf_plot$sel_lineages_f)
levels(lin_wf_plot$sel_lineages_f)

#=========================
# Lineage Diversity plot
#=========================
# The logical flag is spelled out as TRUE: T is an ordinary variable
# that can be reassigned, TRUE cannot.
lin_count_bp(lin_wf_plot
             , x_categ = "sel_lineages_f"
             , y_count = "snp_diversity"
             , display_label_col = "snp_diversity_f"
             , bar_stat_stype = "identity"
             , x_lab_angle = 90
             , my_xats = 20
             , y_scale_percent = TRUE
             , y_label = "SNP diversity")

View file

@ -0,0 +1,100 @@
#!/usr/bin/env Rscript
# Test harness (setup part) for combining_dfs_plotting(): sets the
# working directory, loads the project helpers and the directory globals.

# working dir and loading libraries
getwd()
setwd("~/git/LSHTM_analysis/scripts/functions/")
getwd()

# ---- Earlier, file-path based variant of this test (kept for reference) ----
# infile_params = paste0(outdir, "/" , tolower(gene), "_comb_afor.csv")
# infile_metadata = paste0(outdir, "/", tolower(gene), "_metadata")
#
#
# source("combining_dfs_plotting_func.R")
#
####################################################################
# in_file_params = "~/git/Data/streptomycin/output/gid_comb_afor.csv"
# in_file_metadata = "~/git/Data/streptomycin/output/gid_metadata.csv"
#
# all_plot_dfs = combining_dfs_plotting(df1_mcsm_comb = infile_params
# , df2_gene_metadata = infile_metadata
# , lig_dist_colname = 'ligand_distance'
# , lig_dist_cutoff = 10)
#
# merged_df2 = all_plot_dfs[[1]]
# merged_df3 = all_plot_dfs[[2]]
# merged_df2_comp = all_plot_dfs[[3]]
# merged_df3_comp = all_plot_dfs[[4]]
# merged_df2_lig = all_plot_dfs[[5]]
# merged_df3_lig = all_plot_dfs[[6]]
#
# bar_colnames = data.frame(colnames(merged_df2))
###########################################################

# ---- Project helpers ----
source("plotting_globals.R")
source("plotting_data.R")
source("combining_dfs_plotting.R")

# ---- Directory globals for the chosen gene and drug ----
gene <- "gid"
drug <- "streptomycin"
import_dirs(
  drug_name = drug,
  gene_name = gene
)
#============================
# Input 1: plotting_data()
#============================
# Fall back to the default file name when the caller has not already
# set infile_params.
if (!exists("infile_params") && exists("gene")) {
  #if (!is.character(infile_params) && exists("gene")){
  #in_filename_params = paste0(tolower(gene), "_all_params.csv")
  in_filename_params <- paste0(tolower(gene), "_comb_afor.csv") # part combined for gid
  infile_params <- paste0(outdir, "/", in_filename_params)
  cat("\nInput file for mcsm comb data not specified, assuming filename: ", infile_params, "\n")
}

mcsm_comb_data <- read.csv(infile_params, header = TRUE)

#-------------------------------
# call function: plotting_data()
#-------------------------------
# The call is now closed with ")". Without it the assignment on the
# following line was parsed as part of the argument list and the whole
# script failed to parse.
pd_df <- plotting_data(df = mcsm_comb_data
                       , ligand_dist_colname = 'ligand_distance'
                       , lig_dist_cutoff = 10)

# Second element of the returned list: the data frame passed on to
# combining_dfs_plotting()
my_df_u <- pd_df[[2]]
#======================================
# Input 2: read <gene>_meta data.csv
#======================================
# Fall back to the default file name when the caller has not already
# set infile_metadata.
if (!exists("infile_metadata") && exists("gene")) {
  #if (!is.character(infile_params) && exists("gene")){{
  in_filename_metadata <- paste0(tolower(gene), "_metadata.csv") # part combined for gid
  infile_metadata <- paste0(outdir, "/", in_filename_metadata)
  cat("\nInput file for gene metadata not specified, assuming filename: ", infile_metadata, "\n")
}

cat("\nReading meta data file:", infile_metadata)

# Logical arguments are spelled out as TRUE/FALSE: T and F are
# ordinary variables that can be reassigned.
gene_metadata <- read.csv(infile_metadata
                          , stringsAsFactors = FALSE
                          , header = TRUE)

#-----------------------------------------
# test function: combining_dfs_plotting()
#-----------------------------------------
all_plot_dfs <- combining_dfs_plotting(my_df_u
                                       , gene_metadata
                                       , lig_dist_colname = 'ligand_distance'
                                       , lig_dist_cutoff = 10)

# The function returns a list; unpack its eight elements by position
merged_df2 <- all_plot_dfs[[1]]
merged_df3 <- all_plot_dfs[[2]]
merged_df2_comp <- all_plot_dfs[[3]]
merged_df3_comp <- all_plot_dfs[[4]]
merged_df2_lig <- all_plot_dfs[[5]]
merged_df3_lig <- all_plot_dfs[[6]]
merged_df2_comp_lig <- all_plot_dfs[[7]]
merged_df3_comp_lig <- all_plot_dfs[[8]]
########################################################################
# End of script
########################################################################

View file

@ -0,0 +1,58 @@
# Test for lf_bp(): plot of long-format data, grouped by mutation
# class and faceted by parameter type.
setwd("~/git/LSHTM_analysis/scripts/plotting/")
source("Header_TT.R")
source("../functions/lf_bp.R")

# ================================================
# Data: run get_plotting_dfs.R
# to get the long format data to test this function
# (lf_encomddg must exist before the call below)
# drug = "streptomycin"
# gene = "gid"
# source("get_plotting_dfs.R")
# ==================================================

######################
# Make plot: ggplot
######################
# Logical flags are spelled out as TRUE/FALSE: T and F are ordinary
# variables that can be reassigned.
lf_bp(lf_df = lf_encomddg
      , p_title = "ENCoM-DDG"
      , colour_categ = "ddg_encom_outcome"
      , x_grp = "mutation_info"
      , y_var = "param_value"
      , facet_var = "param_type"
      , n_facet_row = 1
      , y_scales = "free_y"
      , colour_bp_strip = "khaki2"
      , dot_size = 3
      , dot_transparency = 0.3
      , violin_quantiles = c(0.25, 0.5, 0.75)
      , my_ats = 22 # axis text size
      , my_als = 20 # axis label size
      , my_fls = 20 # facet label size
      , my_pts = 22 # plot title size
      , make_boxplot = FALSE
      , bp_width = "auto"
      , add_stats = TRUE
      , stat_grp_comp = c("DM", "OM")
      , stat_method = "wilcox.test"
      , my_paired = FALSE
      , stat_label = c("p.format", "p.signif") )

# Manual cross-check of the reported statistic:
#wilcox.test(wf_encomdds$`EnCOM ΔΔS`[wf_encomdds$mutation_info == "DM"]
# , wf_encomdds$`EnCOM ΔΔS`[wf_encomdds$mutation_info == "OM"])

######################
# Make plot: plotly
######################
# FIXME: These labels are not working as I want!
# lf_bp_plotly(lf_df = lf_deepddg
# , p_title = "DeepDDG"
# , colour_categ = "deepddg_outcome"
# , x_grp = "mutation_info"
# , y_var = "param_value"
# , facet_var = "param_type"
# , n_facet_row = 1
# , y_scales = "free_y"
# , colour_bp_strip = "khaki2"
# , dot_size = 3
# , dot_transparency = 0.3
# , violin_quantiles = c(0.25, 0.5, 0.75)
# )

View file

@ -0,0 +1,19 @@
# Test for lf_unpaired_stats(): a quick way to check the statistics
# computed for any long-format data frame.
setwd("~/git/LSHTM_analysis/scripts/functions")
source("lf_unpaired_stats.R")

# Data prerequisite: run other_plots_data.R first so that the
# long-format data frame lf_duet exists in the session.

# Unpaired comparison, no p-value adjustment
stat_results_df <- lf_unpaired_stats(
  lf_data = lf_duet,
  lf_stat_value = "param_value",
  lf_stat_group = "mutation_info",
  lf_col_statvars = "param_type",
  my_paired = FALSE,
  stat_adj = "none"
)

View file

@ -0,0 +1,32 @@
###############################
# TEST function lineage_dist.R
# to plot lineage
# dist plots with or without facet
##############################
getwd()
setwd("~/git/LSHTM_analysis/scripts/plotting/")
getwd()

source("Header_TT.R")
# NOTE(review): lineage_distP() is not sourced explicitly in this
# script; presumably one of the two files sourced here loads it -- verify.
source("get_plotting_dfs.R")

# Printing the colours also fails early if any of them is missing
cat("cols imported:"
    , mcsm_red2, mcsm_red1, mcsm_mid, mcsm_blue1, mcsm_blue2)

#############################################################
# Logical flags are spelled out as TRUE/FALSE: T and F are ordinary
# variables that can be reassigned.

# Without facets
lineage_distP(lin_dist_plot
              , with_facet = FALSE
              , leg_label = "Mutation Class"
)

# With facets, one panel per mutation class label
lineage_distP(lin_dist_plot
              , with_facet = TRUE
              , facet_wrap_var = "mutation_info_labels"
              , leg_label = "Mutation Class"
              , leg_pos_wf = "none"
              , leg_dir_wf = "horizontal"
)

View file

@ -0,0 +1,35 @@
#!/usr/bin/env Rscript
# Test for plotting_data(): reads the combined stability/structure
# parameter file for one gene and unpacks the returned data frames.
getwd()
setwd("~/git/LSHTM_analysis/scripts/functions/")
getwd()

#############################################################
#===========================================
# load functions, data, dirs, hardcoded vars
# that will be used in testing the functions
#===========================================
source("plotting_globals.R")

drug <- "streptomycin"
gene <- "gid"
import_dirs(drug_name = drug, gene_name = gene)

#-------------------------------
# test function: plotting_data()
#-------------------------------
source("plotting_data.R")

# Build the input path from drug and gene under the home directory
# instead of hard-coding one user's absolute path. For the original
# user this resolves to the same file.
infile_params <- paste0("~/git/Data/", drug, "/output/", gene, "_comb_stab_struc_params.csv")
mcsm_comb_data <- read.csv(infile_params, header = TRUE)

pd_df <- plotting_data(df = mcsm_comb_data
                       , ligand_dist_colname = 'ligand_distance'
                       , lig_dist_cutoff = 10)

# plotting_data() returns a list; unpack its first four elements
my_df <- pd_df[[1]]
my_df_u <- pd_df[[2]]
my_df_u_lig <- pd_df[[3]]
dup_muts <- pd_df[[4]]
########################################################################
# End of script
########################################################################