LSHTM_analysis/scripts/plotting/redundant/other_dfs_data.R

117 lines
3.8 KiB
R

#!/usr/bin/env Rscript
# NOTE: this script ended up unused; the issue was sorted at the source by a
# .py script that combines all dfs to output all_params
#################################################################
# TASK: Script to add all other dfs to merged_df2 and merged_df3
#################################################################
# Combine other dfs:
# dynamut_df, dynamut2_df, mcsm_na_df,
# perhaps : deepddg and mcsm ppi (for embb)
################################################################
# Read the other parameter files for the current drug/gene.
# `drug` and `gene` are not defined in this section; they must already exist
# in the calling environment.
# Paths are assembled with file.path() so the separators are handled in one
# place; the resulting strings are the same as the earlier paste0() versions.
infilename_dynamut <- file.path(
  "~/git/Data", drug, "output", "dynamut_results",
  paste0(gene, "_complex_dynamut_norm.csv")
)
infilename_dynamut2 <- file.path(
  "~/git/Data", drug, "output", "dynamut_results", "dynamut2",
  paste0(gene, "_complex_dynamut2_norm.csv")
)
infilename_mcsm_na <- file.path(
  "~/git/Data", drug, "output", "mcsm_na_results",
  paste0(gene, "_complex_mcsm_na_norm.csv")
)
infilename_mcsm_f_snps <- file.path(
  "~/git/Data", drug, "output",
  paste0(gene, "_mcsm_formatted_snps.csv")
)

dynamut_df <- read.csv(infilename_dynamut)
dynamut2_df <- read.csv(infilename_dynamut2)
mcsm_na_df <- read.csv(infilename_mcsm_na)

# The formatted snps file is read without a header row, so the column is
# named explicitly afterwards. FALSE is spelled out because `F` is an
# ordinary variable that can be reassigned.
mcsm_f_snps <- read.csv(infilename_mcsm_f_snps, header = FALSE)
names(mcsm_f_snps) <- "mutationinformation"
#=================================
# Sanity check: use intersect() to count the columns shared between the dfs.
# Exactly one shared column is expected in each comparison; the intent is to
# merge on the mutationinformation column to be on the safe side.
c1 <- length(intersect(names(dynamut_df), names(dynamut2_df)))
c2 <- length(intersect(names(dynamut2_df), names(mcsm_na_df)))

if (c1 == 1 && c2 == 1) {
  n_common <- 1
} else {
  # This branch is reached for zero shared columns as well as for several,
  # so report the actual counts instead of claiming "more than one".
  cat("\nExpected exactly one common col, but found", c1
      , "(dynamut vs dynamut2) and", c2
      , "(dynamut2 vs mcsm_na): inspect before merging!\n")
}

# delete chain from dynamut2_df
#dynamut2_df = subset(dynamut2_df, select = -chain)
# quick checks: column count, column names and row count of each df.
# The per-df column counts are kept in ncols_comb (a list) and shown by
# evaluating it, rather than computing the same lapply() twice.
ncols_comb <- lapply(list(dynamut_df, dynamut2_df, mcsm_na_df), ncol)
ncols_comb

lapply(list(dynamut_df, dynamut2_df, mcsm_na_df), colnames)

lapply(list(dynamut_df, dynamut2_df, mcsm_na_df), nrow)
#---------------------------------
# Combine 1: all other params dfs
#---------------------------------
# Join on mutationinformation explicitly. Without `by`, inner_join() joins on
# every column the two inputs happen to share, and the second join here runs
# against the already-joined result, so an unexpected shared column would
# silently change which rows match.
# NOTE(review): assumes each of the three dfs has a mutationinformation
# column -- confirm against the input files.
combined_dfs <- Reduce(
  function(left_df, right_df) {
    inner_join(left_df, right_df, by = "mutationinformation")
  },
  list(dynamut_df, dynamut2_df, mcsm_na_df)
)
# Reduce("+", ncols_comb)
#-----------------------------------------
# Combine 2: combine1 result + merged_df2
#-----------------------------------------
# Columns present in both dfs are dropped from combined_dfs before merging,
# except for the merge key mutationinformation.
drop_cols <- setdiff(
  intersect(names(combined_dfs), names(merged_df2)),
  "mutationinformation"
)

# drop = FALSE keeps the result a data.frame even if only one column is left;
# otherwise the ncol()/merge() calls that follow would get a bare vector.
combined_dfs_f <- combined_dfs[
  , !colnames(combined_dfs) %in% drop_cols
  , drop = FALSE
]

# dimensions of both inputs to the merge, for eyeballing
nrow(combined_dfs_f)
nrow(merged_df2)
ncol(combined_dfs_f)
ncol(merged_df2)
#-----------------------------------------
# Combined merged_df2
#-----------------------------------------
# Attach the other parameter columns to merged_df2 via mutationinformation.
merged_df2_combined <- merge(
  merged_df2,
  combined_dfs_f,
  by = "mutationinformation"
)

# The key column is shared, hence the -1.
expected_ncols <- ncol(combined_dfs_f) + ncol(merged_df2) - 1

# The merge must neither lose nor duplicate merged_df2 rows, and must add
# exactly the non-key columns of combined_dfs_f.
if (nrow(merged_df2_combined) == nrow(merged_df2) &&
    ncol(merged_df2_combined) == expected_ncols) {
  cat("\nPASS: merged_df2 combined with other parameters dfs."
      , "\nUse this for lineage distribution plots")
} else {
  # Raise an error rather than quit(): quit() exits with status 0, which
  # reports a failed merge as success and also ends an interactive session.
  stop(
    "FAIL: merged_df2 didn't combine successfully with other parameters dfs",
    "\nrows: got ", nrow(merged_df2_combined),
    ", expected ", nrow(merged_df2),
    "\ncols: got ", ncol(merged_df2_combined),
    ", expected ", expected_ncols,
    call. = FALSE
  )
}

# intermediates are no longer needed
rm(combined_dfs, combined_dfs_f)
#================================
# combined data
# short_df ps: ~ merged_df3
# TODO: later integrate properly
#================================
#-----------------------------------------
# Combined merged_df3
#-----------------------------------------
# Keep only the first row for each mutationinformation value.
merged_df3_combined <- merged_df2_combined[
  !duplicated(merged_df2_combined[["mutationinformation"]]),
]