diff --git a/scripts/plotting/basic_barplots.R b/scripts/plotting/basic_barplots.R index f81160b..aaddce9 100755 --- a/scripts/plotting/basic_barplots.R +++ b/scripts/plotting/basic_barplots.R @@ -68,18 +68,18 @@ import_dirs(drug, gene) # my_df_u_lig # dup_muts #*********************************** -#infile = "/home/tanu/git/Data/streptomycin/output/gid_comb_stab_struc_params.csv" +#infile_params = "/home/tanu/git/Data/streptomycin/output/gid_comb_stab_struc_params.csv" -#if (!exists("infile") && exists("gene")){ -if (!is.character(infile) && exists("gene")){ +if (!exists("infile_params") && exists("gene")){ +#if (!is.character(infile_params) && exists("gene")){ #in_filename_params = paste0(tolower(gene), "_all_params.csv") in_filename_params = paste0(tolower(gene), "_comb_stab_struc_params.csv") # part combined for gid - infile = paste0(outdir, "/", in_filename_params) - cat("\nInput file not specified, assuming filename: ", infile, "\n") + infile_params = paste0(outdir, "/", in_filename_params) + cat("\nInput file not specified, assuming filename: ", infile_params, "\n") } # Get the DFs out of plotting_data() -pd_df = plotting_data(infile) +pd_df = plotting_data(infile_params) my_df = pd_df[[1]] my_df_u = pd_df[[2]] my_df_u_lig = pd_df[[3]] diff --git a/scripts/plotting/combining_dfs_plotting.R b/scripts/plotting/combining_dfs_plotting.R index bc5866b..7d517f8 100644 --- a/scripts/plotting/combining_dfs_plotting.R +++ b/scripts/plotting/combining_dfs_plotting.R @@ -1,433 +1,322 @@ -#!/usr/bin/env Rscript -######################################################### -# TASK: To combine struct params and meta data for plotting -# Input csv files: -# 1) _all_params.csv -# 2) _meta_data.csv +#!/usr/bin/env Rscript -# Output: -# 1) muts with opposite effects on stability -# 2) large combined df including NAs for AF, OR,etc +########################################################### +# TASK: To combine mcsm combined file and meta data. +# This script is sourced from other .R scripts for plotting. +########################################################### + +# load libraries and functions + +#========================================================== +# combining_dfs_plotting(): + +# input args + +## df1_mcsm_comb: _meta_data.csv +## df2_gene_metadata: _all_params.csv +## lig_dist_cutoff = 10, cut off distance + +# Output: returns +# 1) large combined df including NAs for AF, OR,etc # Dim: same no. of rows as gene associated meta_data_with_AFandOR -# 3) small combined df including NAs for AF, OR, etc. +# 2) small combined df including NAs for AF, OR, etc. # Dim: same as mcsm data -# 4) large combined df excluding NAs +# 3) large combined df excluding NAs # Dim: dim(#1) - na_count_df2 -# 5) small combined df excluding NAs +# 4) small combined df excluding NAs # Dim: dim(#2) - na_count_df3 -# This script is sourced from other .R scripts for plotting -######################################################### -#======================================================================= -# working dir and loading libraries -getwd() -setwd("~/git/LSHTM_analysis/scripts/plotting/") -getwd() - -require("getopt", quietly = TRUE) # cmd parse arguments - -# load functions -source("Header_TT.R") -source("../functions/plotting_globals.R") -source("../functions/plotting_data.R") - -############################################################# -# command line args -#******************** -# !!!FUTURE TODO!!! -# Can pass additional params of output/plot dir by user. -# Not strictly required for my workflow since it is optimised -# to have a streamlined input/output flow without filename worries. -#******************** -spec = matrix(c( - "drug" ,"d", 1, "character", - "gene" ,"g", 1, "character", - "data" ,"f", 2, "character" -), byrow = TRUE, ncol = 4) - -opt = getopt(spec) - -#FIXME: detect if script running from cmd, then set these -drug = opt$drug -gene = opt$gene -infile = opt$data - -# hardcoding when not using cmd -#drug = "streptomycin" -#gene = "gid" - -if(is.null(drug)|is.null(gene)) { - stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)") -} - -######################################################### -# call functions with relevant args -#*********************************** -# import_dirs(): returns - # datadir - # indir - # outdir - # plotdir - # dr_muts_col - # other_muts_col - # resistance_col -#*********************************** -import_dirs(drug, gene) -#*********************************** -# plotting_data(): returns - # my_df - # my_df_u - # my_df_u_lig - # dup_muts -#*********************************** -#infile = "/home/tanu/git/Data/streptomycin/output/gid_comb_stab_struc_params.csv" - -if (!exists("infile") && exists("gene")){ -#if (!is.character(infile) && exists("gene")){ - #in_filename_params = paste0(tolower(gene), "_all_params.csv") - #in_filename_params = paste0(tolower(gene), "_comb_stab_struc_params.csv") # part combined for gid - in_filename_params = paste0(tolower(gene), "_comb_afor.csv") # part combined for gid - infile = paste0(outdir, "/", in_filename_params) - cat("\nInput file not specified, assuming filename: ", infile, "\n") -} - -# Get the DFs out of plotting_data() -pd_df = plotting_data(infile) -my_df = pd_df[[1]] -my_df_u = pd_df[[2]] -my_df_u_lig = pd_df[[3]] -dup_muts = pd_df[[4]] - -cat(paste0("Directories imported:" - , "\ndatadir:" , datadir - , "\nindir:" , indir - , "\noutdir:" , outdir - , "\nplotdir:" , plotdir)) - -cat(paste0("\nVariables imported:" - , "\ndrug:" , drug - , "\ngene:" , gene - , "\ngene match:" , gene_match - , "\n")) -#======================================================== -#=========== -# input -#=========== -#in_file1: output of plotting_data.R -# my_df_u - -# infile 2: gene associated meta data -#in_filename_gene_metadata = paste0(tolower(gene), "_meta_data_with_AFandOR.csv") -in_filename_gene_metadata = paste0(tolower(gene), "_metadata.csv") -infile_gene_metadata = paste0(outdir, "/", in_filename_gene_metadata) -cat(paste0("Input infile 2:", infile_gene_metadata)) - -#=========== -# output -#=========== -# other variables that you can write -# primarily called by other scripts for plotting - -# PS combined: -# 1) merged_df2 -# 2) merged_df2_comp -# 3) merged_df3 -# 4) merged_df3_comp - -# LIG combined: -# 5) merged_df2_lig -# 6) merged_df2_comp_lig -# 7) merged_df3_lig -# 8) merged_df3_comp_lig - -#%%=============================================================== - -########################### -# 2: Read file: _meta data.csv -########################### -cat("Reading meta data file:", infile_gene_metadata) - -gene_metadata <- read.csv(infile_gene_metadata - , stringsAsFactors = F - , header = T) -cat("Dim:", dim(gene_metadata)) - -table(gene_metadata$mutation_info) - -# counting NAs in AF, OR cols -# or_mychisq -if (identical(sum(is.na(my_df_u$or_mychisq)) - , sum(is.na(my_df_u$pval_fisher)) - , sum(is.na(my_df_u$af)))){ - cat("\nPASS: NA count match for OR, pvalue and AF\n") - na_count = sum(is.na(my_df_u$af)) - cat("\nNo. of NAs: ", sum(is.na(my_df_u$or_mychisq))) -} else{ - cat("\nFAIL: NA count mismatch" - , "\nNA in OR: ", sum(is.na(my_df_u$or_mychisq)) - , "\nNA in pvalue: ", sum(is.na(my_df_u$pval_fisher)) - , "\nNA in AF:", sum(is.na(my_df_u$af))) -} - -# or kin -if (identical(sum(is.na(my_df_u$or_kin)) - , sum(is.na(my_df_u$pwald_kin)) - , sum(is.na(my_df_u$af_kin)))){ - cat("\nPASS: NA count match for OR, pvalue and AF\n from Kinship matrix calculations") - na_count = sum(is.na(my_df_u$af_kin)) - cat("\nNo. of NAs: ", sum(is.na(my_df_u$or_kin))) -} else{ - cat("\nFAIL: NA count mismatch" - , "\nNA in OR: ", sum(is.na(my_df_u$or_kin)) - , "\nNA in pvalue: ", sum(is.na(my_df_u$pwald_kin)) - , "\nNA in AF:", sum(is.na(my_df_u$af_kin))) -} - -str(gene_metadata) - -################################################################### -# combining: PS -################################################################### -# sort by position (same as my_df) -head(gene_metadata$position) -gene_metadata = gene_metadata[order(gene_metadata$position),] -head(gene_metadata$position) - -#========================= -# Merge 1: merged_df2 -# dfs with NAs in ORs -#========================= -head(my_df_u$mutationinformation) -head(gene_metadata$mutationinformation) - -# Find common columns b/w two df -merging_cols = intersect(colnames(my_df_u), colnames(gene_metadata)) - -cat(paste0("Merging dfs with NAs: big df (1-many relationship b/w id & mut)" - , "\nNo. of merging cols:", length(merging_cols) - , "\nMerging columns identified:")) -print(merging_cols) - -# using all common cols create confusion, so pick one! -# merging_cols = merging_cols[[1]] -merging_cols = 'mutationinformation' - -# important checks! -table(nchar(my_df_u$mutationinformation)) -table(nchar(my_df_u$wild_type)) -table(nchar(my_df_u$mutant_type)) -table(nchar(my_df_u$position)) - -# all.y because x might contain non-structural positions! -merged_df2 = merge(x = gene_metadata - , y = my_df_u - , by = merging_cols - , all.y = T) - -cat("Dim of merged_df2: ", dim(merged_df2)) - -dup_cols = names(merged_df2)[grepl("\\.x$|\\.y$", names(merged_df2))] -cat("\nNo. of duplicate cols:", length(dup_cols)) -check_df_cols = merged_df2[dup_cols] - -identical(check_df_cols$wild_type.x, check_df_cols$wild_type.y) -identical(check_df_cols$position.x, check_df_cols$position.y) -identical(check_df_cols$mutant_type.x, check_df_cols$mutant_type.y) -# False: because some of the ones with OR don't have mutation -identical(check_df_cols$mutation.x, check_df_cols$mutation.y) - -cols_to_drop = names(merged_df2)[grepl("\\.y",names(merged_df2))] -cat("\nNo. of cols to drop:", length(cols_to_drop)) - -# subset -merged_df2 = merged_df2[,!(names(merged_df2)%in%cols_to_drop)] - -# rename the cols with '.x' suffix -names(merged_df2)[grepl("\\.x$|\\.y$", names(merged_df2))] -colnames(merged_df2) <- gsub("\\.x$", "", colnames(merged_df2)) -names(merged_df2)[grepl("\\.x$|\\.y$", names(merged_df2))] - -#====================================================== -#------------- -# DEBUG -#------------- -merged_df2_g = merged_df2[,!(names(merged_df2)%in%cols_to_drop)] - -check_cols = colnames(merged_df2)[!colnames(merged_df2)%in%colnames(merged_df2_g)] -if ( identical(check_cols, cols_to_drop) ){ - cat("\nPASS: cols identified have been successfully dropped" - , "\nNo. of cols dropped: ", length(check_cols) - , "\nNo. of cols in original df: ", ncol(merged_df2) - , "\nNo. of cols in revised df: " , ncol(merged_df2_g)) -} - -#====================================================== -head(merged_df2$position) - -# sanity check -cat("Checking nrows in merged_df2") -if(nrow(gene_metadata) == nrow(merged_df2)){ - cat("PASS: nrow(merged_df2) = nrow (gene associated gene_metadata)" - ,"\nExpected no. of rows: ",nrow(gene_metadata) - ,"\nGot no. of rows: ", nrow(merged_df2)) -} else{ - cat("FAIL: nrow(merged_df2)!= nrow(gene associated gene_metadata)" - , "\nExpected no. of rows after merge: ", nrow(gene_metadata) - , "\nGot no. of rows: ", nrow(merged_df2) - , "\nFinding discrepancy") - merged_muts_u = unique(merged_df2$mutationinformation) - meta_muts_u = unique(gene_metadata$mutationinformation) - # find the index where it differs - unique(meta_muts_u[! meta_muts_u %in% merged_muts_u]) - quit() -} - -#========================= -# Merge 2: merged_df3 -# dfs with NAs in ORs -# -# Cannot trust lineage, country from this df as the same mutation -# can have many different lineages -# but this should be good for the numerical corr plots -#========================= -# remove duplicated mutations -cat("Merging dfs without NAs: small df (removing muts with no AF|OR associated)" - ,"\nCannot trust lineage info from this" - ,"\nlinking col: mutationinforamtion" - ,"\nfilename: merged_df3") - -merged_df3 = merged_df2[!duplicated(merged_df2$mutationinformation),] -head(merged_df3$position); tail(merged_df3$position) # should be sorted - -# sanity check -cat("Checking nrows in merged_df3") -if(nrow(my_df_u) == nrow(merged_df3)){ - cat("PASS: No. of rows match with my_df" - ,"\nExpected no. of rows: ", nrow(my_df_u) - ,"\nGot no. of rows: ", nrow(merged_df3)) -} else { - cat("FAIL: No. of rows mismatch" - , "\nNo. of rows my_df: ", nrow(my_df_u) - , "\nNo. of rows merged_df3: ", nrow(merged_df3)) - quit() -} - -# counting NAs in AF, OR cols in merged_df3 -# this is because mcsm has no AF, OR cols, -# so you cannot count NAs -if (identical(sum(is.na(merged_df3$or_kin)) - , sum(is.na(merged_df3$pwald_kin)) - , sum(is.na(merged_df3$af_kin)))){ - cat("PASS: NA count match for OR, pvalue and AF\n") - na_count_df3 = sum(is.na(merged_df3$af_kin)) - cat("No. of NAs: ", sum(is.na(merged_df3$or_kin))) -} else{ - cat("FAIL: NA count mismatch" - , "\nNA in OR: ", sum(is.na(merged_df3$or_kin)) - , "\nNA in pvalue: ", sum(is.na(merged_df3$pwald_kin)) - , "\nNA in AF:", sum(is.na(merged_df3$af_kin))) -} - - -#========================= -# Merge3: merged_df2_comp -# same as merge 1 but excluding NAs from ORs, etc. -#========================= -cat("Merging dfs without any NAs: big df (1-many relationship b/w id & mut)" - ,"\nlinking col: Mutationinforamtion" - ,"\nfilename: merged_df2_comp") - -na_count_df2 = sum(is.na(merged_df2$af)) -merged_df2_comp = merged_df2[!is.na(merged_df2$af),] - -# sanity check: no +-1 gymnastics -cat("Checking nrows in merged_df2_comp") -if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count_df2)){ - cat("\nPASS: No. of rows match" - ,"\nDim of merged_df2_comp: " - ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2 - , "\nNo. of rows: ", nrow(merged_df2_comp) - , "\nNo. of cols: ", ncol(merged_df2_comp)) -}else{ - cat("FAIL: No. of rows mismatch" - ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2 - ,"\nGot no. of rows: ", nrow(merged_df2_comp)) -} - -#========================= -# Merge4: merged_df3_comp -# same as merge 2 but excluding NAs from ORs, etc or -# remove duplicate mutation information -#========================= -na_count_df3 = sum(is.na(merged_df3$af)) -#merged_df3_comp = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),] # a way - -merged_df3_comp = merged_df3[!is.na(merged_df3$af),] # another way -cat("Checking nrows in merged_df3_comp") - -if(nrow(merged_df3_comp) == (nrow(merged_df3) - na_count_df3)){ - cat("\nPASS: No. of rows match" - ,"\nDim of merged_df3_comp: " - ,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3 - , "\nNo. of rows: ", nrow(merged_df3_comp) - , "\nNo. of cols: ", ncol(merged_df3_comp)) -}else{ - cat("FAIL: No. of rows mismatch" - ,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3 - ,"\nGot no. of rows: ", nrow(merged_df3_comp)) -} - -# alternate way of deriving merged_df3_comp -foo = merged_df3[!is.na(merged_df3$af),] -bar = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),] -# compare dfs: foo and merged_df3_com -all.equal(foo, bar) -#summary(comparedf(foo, bar)) - -#============================================================== - -##################################################################### -# Combining: LIG -##################################################################### - -#========================= -# Merges 5-8 -#========================= - -merged_df2_lig = merged_df2[merged_df2$ligand_distance<10,] -merged_df2_comp_lig = merged_df2_comp[merged_df2_comp$ligand_distance<10,] - -merged_df3_lig = merged_df3[merged_df3$ligand_distance<10,] -merged_df3_comp_lig = merged_df3_comp[merged_df3_comp$ligand_distance<10,] - -# sanity check -if (nrow(merged_df3_lig) == nrow(my_df_u_lig)){ - print("PASS: verified merged_df3_lig") -}else{ - cat(paste0("FAIL: nrow mismatch for merged_df3_lig" - , "\nExpected:", nrow(my_df_u_lig) - , "\nGot:", nrow(merged_df3_lig))) -} - - -#============================================================== - -################# -# OPTIONAL: write output files in one go -################# -#outvars = c(#"merged_df2", - #"merged_df2_comp", - #"merged_df2_lig", - #"merged_df2_comp_lig", - - #"meregd_df3_comp" - #"merged_df3_comp_lig", - #"merged_df3", - #"merged_df3_lig") - -#cat("Writing output files: " - #, "\nPath:", outdir) - -#for (i in outvars){ +# 5) LIGAND large combined df including NAs for AF, OR,etc +# Dim: dim() +# 6) LIGAND small combined df excluding NAs +# Dim: dim() +#========================================================== +combining_dfs_plotting <- function( my_df_u + , gene_metadata + , lig_dist_colname = 'ligand_distance' + , lig_dist_cutoff = 10){ + # #====================================== + # # 1: Read file: _meta data.csv + # #====================================== + # cat("\nReading meta data file:", df1_mcsm_comb) + # + # my_df_u <- read.csv(df1_mcsm_comb + # , stringsAsFactors = F + # , header = T) + # cat("\nDim:", dim(my_df_u)) + # + # #====================================== + # # 2: Read file: _meta data.csv + # #====================================== + # cat("\nReading meta data file:", df2_gene_metadata) + # + # gene_metadata <- read.csv(df2_gene_metadata + # , stringsAsFactors = F + # , header = T) + # cat("\nDim:", dim(gene_metadata)) + # + # table(gene_metadata$mutation_info) + + # counting NAs in AF, OR cols + # or_mychisq + if (identical(sum(is.na(my_df_u$or_mychisq)) + , sum(is.na(my_df_u$pval_fisher)) + , sum(is.na(my_df_u$af)))){ + cat("\nPASS: NA count match for OR, pvalue and AF\n") + na_count = sum(is.na(my_df_u$af)) + cat("\nNo. of NAs: ", sum(is.na(my_df_u$or_mychisq))) + } else{ + cat("\nFAIL: NA count mismatch" + , "\nNA in OR: ", sum(is.na(my_df_u$or_mychisq)) + , "\nNA in pvalue: ", sum(is.na(my_df_u$pval_fisher)) + , "\nNA in AF:", sum(is.na(my_df_u$af))) + } + + # or kin + if (identical(sum(is.na(my_df_u$or_kin)) + , sum(is.na(my_df_u$pwald_kin)) + , sum(is.na(my_df_u$af_kin)))){ + cat("\nPASS: NA count match for OR, pvalue and AF\n from Kinship matrix calculations") + na_count = sum(is.na(my_df_u$af_kin)) + cat("\nNo. of NAs: ", sum(is.na(my_df_u$or_kin))) + } else{ + cat("\nFAIL: NA count mismatch" + , "\nNA in OR: ", sum(is.na(my_df_u$or_kin)) + , "\nNA in pvalue: ", sum(is.na(my_df_u$pwald_kin)) + , "\nNA in AF:", sum(is.na(my_df_u$af_kin))) + } + + str(gene_metadata) + + ################################################################### + # combining: PS + ################################################################### + # sort by position (same as my_df) + head(gene_metadata$position) + gene_metadata = gene_metadata[order(gene_metadata$position),] + head(gene_metadata$position) + + #========================= + # Merge 1: merged_df2 + # dfs with NAs in ORs + #========================= + head(my_df_u$mutationinformation) + head(gene_metadata$mutationinformation) + + # Find common columns b/w two df + merging_cols = intersect(colnames(my_df_u), colnames(gene_metadata)) + + cat(paste0("\nMerging dfs with NAs: big df (1-many relationship b/w id & mut)" + , "\nNo. of merging cols:", length(merging_cols) + , "\nMerging columns identified:")) + + print(merging_cols) + + # using all common cols create confusion, so pick one! + # merging_cols = merging_cols[[1]] + merging_cols = 'mutationinformation' + + cat("\nLinking column being used: mutationinformation") + + # important checks! + table(nchar(my_df_u$mutationinformation)) + table(nchar(my_df_u$wild_type)) + table(nchar(my_df_u$mutant_type)) + table(nchar(my_df_u$position)) + + # all.y because x might contain non-structural positions! + merged_df2 = merge(x = gene_metadata + , y = my_df_u + , by = merging_cols + , all.y = T) + + cat("\nDim of merged_df2: ", dim(merged_df2)) + + # Remove duplicate columns + dup_cols = names(merged_df2)[grepl("\\.x$|\\.y$", names(merged_df2))] + cat("\nNo. of duplicate cols:", length(dup_cols)) + check_df_cols = merged_df2[dup_cols] + + identical(check_df_cols$wild_type.x, check_df_cols$wild_type.y) + identical(check_df_cols$position.x, check_df_cols$position.y) + identical(check_df_cols$mutant_type.x, check_df_cols$mutant_type.y) + # False: because some of the ones with OR don't have mutation + identical(check_df_cols$mutation.x, check_df_cols$mutation.y) + + cols_to_drop = names(merged_df2)[grepl("\\.y",names(merged_df2))] + cat("\nNo. of cols to drop:", length(cols_to_drop)) + + # Drop duplicate columns + merged_df2 = merged_df2[,!(names(merged_df2)%in%cols_to_drop)] + + # Drop the '.x' suffix in the colnames + names(merged_df2)[grepl("\\.x$|\\.y$", names(merged_df2))] + colnames(merged_df2) <- gsub("\\.x$", "", colnames(merged_df2)) + names(merged_df2)[grepl("\\.x$|\\.y$", names(merged_df2))] + + head(merged_df2$position) + + # sanity check + cat("\nChecking nrows in merged_df2") + if(nrow(gene_metadata) == nrow(merged_df2)){ + cat("\nPASS: nrow(merged_df2) = nrow (gene associated gene_metadata)" + ,"\nExpected no. of rows: ",nrow(gene_metadata) + ,"\nGot no. of rows: ", nrow(merged_df2)) + } else{ + cat("\nFAIL: nrow(merged_df2)!= nrow(gene associated gene_metadata)" + , "\nExpected no. of rows after merge: ", nrow(gene_metadata) + , "\nGot no. of rows: ", nrow(merged_df2) + , "\nFinding discrepancy") + merged_muts_u = unique(merged_df2$mutationinformation) + meta_muts_u = unique(gene_metadata$mutationinformation) + # find the index where it differs + unique(meta_muts_u[! meta_muts_u %in% merged_muts_u]) + quit() + } + + #================================================================= + # Merge 2: merged_df3 + # dfs with NAs in ORs + # + # Cannot trust lineage, country from this df as the same mutation + # can have many different lineages + # but this should be good for the numerical corr plots + #================================================================== + # remove duplicated mutations + cat("\nMerging dfs without NAs: small df (removing muts with no AF|OR associated)" + ,"\nCannot trust lineage info from this" + ,"\nlinking col: mutationinforamtion" + ,"\nfilename: merged_df3") + + merged_df3 = merged_df2[!duplicated(merged_df2$mutationinformation),] + head(merged_df3$position); tail(merged_df3$position) # should be sorted + + # sanity check + cat("\nChecking nrows in merged_df3") + if(nrow(my_df_u) == nrow(merged_df3)){ + cat("\nPASS: No. of rows match with my_df" + ,"\nExpected no. of rows: ", nrow(my_df_u) + ,"\nGot no. of rows: ", nrow(merged_df3)) + } else { + cat("\nFAIL: No. of rows mismatch" + , "\nNo. of rows my_df: ", nrow(my_df_u) + , "\nNo. of rows merged_df3: ", nrow(merged_df3)) + quit() + } + + # counting NAs in AF, OR cols in merged_df3 + # this is because mcsm has no AF, OR cols, + # so you cannot count NAs + if (identical(sum(is.na(merged_df3$or_kin)) + , sum(is.na(merged_df3$pwald_kin)) + , sum(is.na(merged_df3$af_kin)))){ + cat("\nPASS: NA count match for OR, pvalue and AF\n") + na_count_df3 = sum(is.na(merged_df3$af_kin)) + cat("\nNo. of NAs: ", sum(is.na(merged_df3$or_kin))) + } else{ + cat("\nFAIL: NA count mismatch" + , "\nNA in OR: ", sum(is.na(merged_df3$or_kin)) + , "\nNA in pvalue: ", sum(is.na(merged_df3$pwald_kin)) + , "\nNA in AF:", sum(is.na(merged_df3$af_kin))) + } + + + #=================================================== + # Merge3: merged_df2_comp + # same as merge 1 but excluding NAs from ORs, etc. + #==================================================== + cat("\nMerging dfs without any NAs: big df (1-many relationship b/w id & mut)" + ,"\nfilename: merged_df2_comp") + + na_count_df2 = sum(is.na(merged_df2$af)) + merged_df2_comp = merged_df2[!is.na(merged_df2$af),] + + # sanity check: no +-1 gymnastics + cat("\nChecking nrows in merged_df2_comp") + if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count_df2)){ + cat("\nPASS: No. of rows match" + ,"\nDim of merged_df2_comp: " + ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2 + , "\nNo. of rows: ", nrow(merged_df2_comp) + , "\nNo. of cols: ", ncol(merged_df2_comp)) + }else{ + cat("\nFAIL: No. of rows mismatch" + ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2 + ,"\nGot no. of rows: ", nrow(merged_df2_comp)) + } + + #====================================================== + # Merge4: merged_df3_comp + # same as merge 2 but excluding NAs from ORs, etc or + # remove duplicate mutation information + #======================================================= + na_count_df3 = sum(is.na(merged_df3$af)) + #merged_df3_comp = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),] # a way + + merged_df3_comp = merged_df3[!is.na(merged_df3$af),] # another way + cat("\nChecking nrows in merged_df3_comp") + + if(nrow(merged_df3_comp) == (nrow(merged_df3) - na_count_df3)){ + cat("\nPASS: No. of rows match" + ,"\nDim of merged_df3_comp: " + ,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3 + , "\nNo. of rows: ", nrow(merged_df3_comp) + , "\nNo. of cols: ", ncol(merged_df3_comp)) + }else{ + cat("\nFAIL: No. of rows mismatch" + ,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3 + ,"\nGot no. of rows: ", nrow(merged_df3_comp)) + } + + # alternate way of deriving merged_df3_comp + foo = merged_df3[!is.na(merged_df3$af),] + bar = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),] + # compare dfs: foo and merged_df3_com + all.equal(foo, bar) + #summary(comparedf(foo, bar)) + + ##################################################################### + # Combining: LIG + ##################################################################### + + #============ + # Merges 5-8 + #============ + df_lig = my_df_u[my_df_u[[lig_dist_colname]]_all_params.csv -# 2) _meta_data.csv - -# Output: -# 1) muts with opposite effects on stability -# 2) large combined df including NAs for AF, OR,etc -# Dim: same no. of rows as gene associated meta_data_with_AFandOR -# 3) small combined df including NAs for AF, OR, etc. -# Dim: same as mcsm data -# 4) large combined df excluding NAs -# Dim: dim(#1) - no. of NAs(AF|OR) + 1 -# 5) small combined df excluding NAs -# Dim: dim(#2) - no. of unique NAs - 1 -# This script is sourced from other .R scripts for plotting -######################################################### - -########################################################## -# Installing and loading required packages -########################################################## -source("Header_TT.R") -#require(data.table) -#require(arsenal) -#require(compare) -#library(tidyverse) - - -#%% variable assignment: input and output paths & filenames -drug = "pyrazinamide" -gene = "pncA" -gene_match = paste0(gene,"_p.") -cat(gene_match) - -#============= -# directories -#============= -datadir = paste0("~/git/Data") -indir = paste0(datadir, "/", drug, "/input") -outdir = paste0("~/git/Data", "/", drug, "/output") - -#=========== -# input -#=========== -#in_filename = "mcsm_complex1_normalised.csv" -in_filename_params = paste0(tolower(gene), "_all_params.csv") -infile_params = paste0(outdir, "/", in_filename_params) -cat(paste0("Input file 1:", infile_params) ) - -# infile 2: gene associated meta data -#in_filename_gene_metadata = paste0(tolower(gene), "_meta_data_with_AFandOR.csv") -in_filename_gene_metadata = paste0(tolower(gene), "_metadata.csv") -infile_gene_metadata = paste0(outdir, "/", in_filename_gene_metadata) -cat(paste0("Input infile 2:", infile_gene_metadata)) - -#=========== -# output -#=========== -# mutations with opposite effects -out_filename_opp_muts = paste0(tolower(gene), "_muts_opp_effects.csv") -outfile_opp_muts = paste0(outdir, "/", out_filename_opp_muts) - - -#%%=============================================================== -########################### -# Read file: struct params -########################### -cat("Reading struct params including mcsm:" - , in_filename_params) - -mcsm_data = read.csv(infile_params - #, row.names = 1 - , stringsAsFactors = F - , header = T) - -cat("Input dimensions:", dim(mcsm_data)) #416, 86 - -# clear variables -rm(in_filename_params, infile_params) - -str(mcsm_data) - -table(mcsm_data$duet_outcome); sum(table(mcsm_data$duet_outcome) ) - -# spelling Correction 1: DUET incase American spelling needed! -#mcsm_data$duet_outcome[mcsm_data$duet_outcome=="Stabilising"] <- "Stabilizing" -#mcsm_data$duet_outcome[mcsm_data$duet_outcome=="Destabilising"] <- "Destabilizing" - -# checks: should be the same as above -table(mcsm_data$duet_outcome); sum(table(mcsm_data$duet_outcome) ) -head(mcsm_data$duet_outcome); tail(mcsm_data$duet_outcome) - -# spelling Correction 2: Ligand incase American spelling needed! -table(mcsm_data$ligand_outcome); sum(table(mcsm_data$ligand_outcome) ) -#mcsm_data$ligand_outcome[mcsm_data$ligand_outcome=="Stabilising"] <- "Stabilizing" -#mcsm_data$ligand_outcome[mcsm_data$ligand_outcome=="Destabilising"] <- "Destabilizing" - -# checks: should be the same as above -table(mcsm_data$ligand_outcome); sum(table(mcsm_data$ligand_outcome) ) -head(mcsm_data$ligand_outcome); tail(mcsm_data$ligand_outcome) - -# muts with opposing effects on protomer and ligand stability -table(mcsm_data$duet_outcome != mcsm_data$ligand_outcome) -changes = mcsm_data[which(mcsm_data$duet_outcome != mcsm_data$ligand_outcome),] - -# sanity check: redundant, but uber cautious! -dl_i = which(mcsm_data$duet_outcome != mcsm_data$ligand_outcome) -ld_i = which(mcsm_data$ligand_outcome != mcsm_data$duet_outcome) - -cat("Identifying muts with opposite stability effects") -if(nrow(changes) == (table(mcsm_data$duet_outcome != mcsm_data$ligand_outcome)[[2]]) & identical(dl_i,ld_i)) { - cat("PASS: muts with opposite effects on stability and affinity correctly identified" - , "\nNo. of such muts: ", nrow(changes)) -}else { - cat("FAIL: unsuccessful in extracting muts with changed stability effects") -} - -#*************************** -# write file: changed muts -write.csv(changes, outfile_opp_muts) - -cat("Finished writing file for muts with opp effects:" - , "\nFilename: ", outfile_opp_muts - , "\nDim:", dim(changes)) - -# clear variables -rm(out_filename_opp_muts, outfile_opp_muts) -rm(changes, dl_i, ld_i) - -#*************************** -# count na in each column -na_count = sapply(mcsm_data, function(y) sum(length(which(is.na(y))))); na_count - -# sort by mutationinformation -##mcsm_data = mcsm_data[order(mcsm_data$mutationinformation),] -##head(mcsm_data$mutationinformation) - -df_ncols = ncol(mcsm_data) - -# REMOVE as this is dangerous due to dup muts -# get freq count of positions and add to the df -#setDT(mcsm_data)[, occurrence := .N, by = .(position)] - -#cat("Added 1 col: position frequency to see which posn has how many muts" -# , "\nNo. of cols now", ncol(mcsm_data) -# , "\nNo. of cols before: ", df_ncols) - -#pos_count_check = data.frame(mcsm_data$position, mcsm_data$occurrence) - -# check duplicate muts -if (length(unique(mcsm_data$mutationinformation)) == length(mcsm_data$mutationinformation)){ - cat("No duplicate mutations in mcsm data") -}else{ - dup_muts = mcsm_data[duplicated(mcsm_data$mutationinformation),] - dup_muts_nu = length(unique(dup_muts$mutationinformation)) - cat(paste0("CAUTION:", nrow(dup_muts), " Duplicate mutations identified" - , "\nOf these, no. of unique mutations are:", dup_muts_nu - , "\nExtracting df with unique mutations only")) - mcsm_data_u = mcsm_data[!duplicated(mcsm_data$mutationinformation),] -} - -if (nrow(mcsm_data_u) == length(unique(mcsm_data$mutationinformation))){ - cat("Df without duplicate mutations successfully extracted") -} else{ - cat("FAIL: could not extract clean df!") - quit() -} - -########################### -# 2: Read file: _meta data.csv -########################### -cat("Reading meta data file:", infile_gene_metadata) - -gene_metadata <- read.csv(infile_gene_metadata - , stringsAsFactors = F - , header = T) -cat("Dim:", dim(gene_metadata)) - -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -# FIXME: remove -# counting NAs in AF, OR cols: -if (identical(sum(is.na(gene_metadata$OR)) - , sum(is.na(gene_metadata$pvalue)) - , sum(is.na(gene_metadata$AF)))){ - cat("PASS: NA count match for OR, pvalue and AF\n") - na_count = sum(is.na(gene_metadata$AF)) - cat("No. of NAs: ", sum(is.na(gene_metadata$OR))) -} else{ - cat("FAIL: NA count mismatch" - , "\nNA in OR: ", sum(is.na(gene_metadata$OR)) - , "\nNA in pvalue: ", sum(is.na(gene_metadata$pvalue)) - , "\nNA in AF:", sum(is.na(gene_metadata$AF))) -} -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -# clear variables -rm(in_filename_gene_metadata, infile_gene_metadata) - -str(gene_metadata) - -# sort by position (same as mcsm_data) -# earlier it was mutationinformation -#head(gene_metadata$mutationinformation) -#gene_metadata = gene_metadata[order(gene_metadata$mutationinformation),] -##head(gene_metadata$mutationinformation) - -head(gene_metadata$position) -gene_metadata = gene_metadata[order(gene_metadata$position),] -head(gene_metadata$position) - -########################### -# Merge 1: two dfs with NA -# merged_df2 -########################### -head(mcsm_data$mutationinformation) -head(gene_metadata$mutationinformation) - -# Find common columns b/w two df -merging_cols = intersect(colnames(mcsm_data), colnames(gene_metadata)) - -cat(paste0("Merging dfs with NAs: big df (1-many relationship b/w id & mut)" - , "\nNo. of merging cols:", length(merging_cols) - , "\nMerging columns identified:")) -print(merging_cols) - -#============= -# merged_df2): gene_metadata + mcsm_data -#============== -merged_df2 = merge(x = gene_metadata - , y = mcsm_data - , by = merging_cols - , all.y = T) - -cat("Dim of merged_df2: ", dim(merged_df2) #4520, 11 - ) -head(merged_df2$position) - -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -# FIXME: count how many unique muts have entries in meta data -# sanity check -cat("Checking nrows in merged_df2") -if(nrow(gene_metadata) == nrow(merged_df2)){ - cat("nrow(merged_df2) = nrow (gene associated gene_metadata)" - ,"\nExpected no. of rows: ",nrow(gene_metadata) - ,"\nGot no. of rows: ", nrow(merged_df2)) -} else{ - cat("nrow(merged_df2)!= nrow(gene associated gene_metadata)" - , "\nExpected no. of rows after merge: ", nrow(gene_metadata) - , "\nGot no. of rows: ", nrow(merged_df2) - , "\nFinding discrepancy") - merged_muts_u = unique(merged_df2$mutationinformation) - meta_muts_u = unique(gene_metadata$mutationinformation) - # find the index where it differs - unique(meta_muts_u[! meta_muts_u %in% merged_muts_u]) -} - -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - -# sort by position -head(merged_df2$position) -merged_df2 = merged_df2[order(merged_df2$position),] -head(merged_df2$position) - -merged_df2v3 = merge(x = gene_metadata - , y = mcsm_data - , by = merging_cols - , all = T) - -merged_df2v2 = merge(x = gene_metadata - , y = mcsm_data - , by = merging_cols - , all.x = T) -#!=!=!=!=!=!=!=! -# COMMENT: used all.y since position 186 is not part of the struc, -# hence doesn"t have a mcsm value -# but 186 is associated with mutation -#!=!=!=!=!=!=!=! - -# should be False -identical(merged_df2, merged_df2v2) -table(merged_df2$position%in%merged_df2v2$position) - -rm(merged_df2v2) - -#!!!!!!!!!!! check why these differ - -######### -# merge 3b (merged_df3):remove duplicate mutation information -######### -cat("Merging dfs without NAs: small df (removing muts with no AF|OR associated)" - ,"\nCannot trust lineage info from this" - ,"\nlinking col: Mutationinforamtion" - ,"\nfilename: merged_df3") - -#==#=#=#=#=#=# -# Cannot trust lineage, country from this df as the same mutation -# can have many different lineages -# but this should be good for the numerical corr plots -#=#=#=#=#=#=#= -merged_df3 = merged_df2[!duplicated(merged_df2$mutationinformation),] -head(merged_df3$position); tail(merged_df3$position) # should be sorted - -# sanity check -cat("Checking nrows in merged_df3") -if(nrow(mcsm_data) == nrow(merged_df3)){ - cat("PASS: No. of rows match with mcsm_data" - ,"\nExpected no. of rows: ", nrow(mcsm_data) - ,"\nGot no. of rows: ", nrow(merged_df3)) -} else { - cat("FAIL: No. of rows mismatch" - , "\nNo. of rows mcsm_data: ", nrow(mcsm_data) - , "\nNo. of rows merged_df3: ", nrow(merged_df3)) -} - -# counting NAs in AF, OR cols in merged_df3 -# this is becuase mcsm has no AF, OR cols, -# so you cannot count NAs -if (identical(sum(is.na(merged_df3$OR)) - , sum(is.na(merged_df3$pvalue)) - , sum(is.na(merged_df3$AF)))){ - cat("PASS: NA count match for OR, pvalue and AF\n") - na_count_df3 = sum(is.na(merged_df3$AF)) - cat("No. of NAs: ", sum(is.na(merged_df3$OR))) -} else{ - cat("FAIL: NA count mismatch" - , "\nNA in OR: ", sum(is.na(merged_df3$OR)) - , "\nNA in pvalue: ", sum(is.na(merged_df3$pvalue)) - , "\nNA in AF:", sum(is.na(merged_df3$AF))) -} - -########################### -# 4: merging two dfs: without NA -########################### -######### -# merge 4a (merged_df2_comp): same as merge 1 but excluding NA -######### -cat("Merging dfs without any NAs: big df (1-many relationship b/w id & mut)" - ,"\nlinking col: Mutationinforamtion" - ,"\nfilename: merged_df2_comp") - -merged_df2_comp = merged_df2[!is.na(merged_df2$AF),] -#merged_df2_comp = merged_df2[!duplicated(merged_df2$mutationinformation),] - -# sanity check -cat("Checking nrows in merged_df2_comp") -if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count + 1)){ - cat("PASS: No. of rows match" - ,"\nDim of merged_df2_comp: " - ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count + 1 - , "\nNo. of rows: ", nrow(merged_df2_comp) - , "\nNo. of cols: ", ncol(merged_df2_comp)) -}else{ - cat("FAIL: No. of rows mismatch" - ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count + 1 - ,"\nGot no. of rows: ", nrow(merged_df2_comp)) -} - -######### -# merge 4b (merged_df3_comp): remove duplicate mutation information -######### -merged_df3_comp = merged_df2_comp[!duplicated(merged_df2_comp$mutationinformation),] - -cat("Dim of merged_df3_comp: " - , "\nNo. of rows: ", nrow(merged_df3_comp) - , "\nNo. of cols: ", ncol(merged_df3_comp)) - -# alternate way of deriving merged_df3_comp -foo = merged_df3[!is.na(merged_df3$AF),] -# compare dfs: foo and merged_df3_com -all.equal(foo, merged_df3) - -summary(comparedf(foo, merged_df3)) - -# sanity check -cat("Checking nrows in merged_df3_comp") -if(nrow(merged_df3_comp) == nrow(merged_df3)){ - cat("NO NAs detected in merged_df3 in AF|OR cols" - ,"\nNo. of rows are identical: ", nrow(merged_df3)) -} else{ - if(nrow(merged_df3_comp) == nrow(merged_df3) - na_count_df3) { - cat("PASS: NAs detected in merged_df3 in AF|OR cols" - , "\nNo. of NAs: ", na_count_df3 - , "\nExpected no. of rows in merged_df3_comp: ", nrow(merged_df3) - na_count_df3 - , "\nGot no. of rows: ", nrow(merged_df3_comp)) - } -} - -#=============== end of combining df -#********************* -# writing 1 file in the style of a loop: merged_df3 -# print(output dir) -#i = "merged_df3" -#out_filename = paste0(i, ".csv") -#outfile = paste0(outdir, "/", out_filename) - -#cat("Writing output file: " -# ,"\nFilename: ", out_filename -# ,"\nPath: ", outdir) - -#template: write.csv(merged_df3, "merged_df3.csv") -#write.csv(get(i), outfile, row.names = FALSE) -#cat("Finished writing: ", outfile -# , "\nNo. of rows: ", nrow(get(i)) -# , "\nNo. of cols: ", ncol(get(i))) - -#%% write_output files; all 4 files: -outvars = c("merged_df2" - , "merged_df3" - , "merged_df2_comp" - , "merged_df3_comp") - -cat("Writing output files: " - , "\nPath:", outdir) - -for (i in outvars){ -# cat(i, "\n") - out_filename = paste0(i, ".csv") -# cat(out_filename, "\n") -# cat("getting value of variable: ", get(i)) - outfile = paste0(outdir, "/", out_filename) -# cat("Full output path: ", outfile, "\n") - cat("Writing output file:" - ,"\nFilename: ", out_filename,"\n") - write.csv(get(i), outfile, row.names = FALSE) - cat("Finished writing: ", outfile - , "\nNo. of rows: ", nrow(get(i)) - , "\nNo. of cols: ", ncol(get(i)), "\n") -} - -# alternate way to replace with implicit loop -# FIXME -#sapply(outvars, function(x, y) write.csv(get(outvars), paste0(outdir, "/", outvars, ".csv"))) -#************************* -# clear variables -rm(mcsm_data, gene_metadata, foo, drug, gene, gene_match, indir, merged_muts_u, meta_muts_u, na_count, df_ncols, outdir) -rm(pos_count_check) -#============================= end of script -