#!/usr/bin/env Rscript

# NOTE: Didn't end up using this script: sorted it at the source
# (see the .py script that combines all dfs to output all_params).

#################################################################
# TASK: Script to add all other dfs to merged_df2 and merged_df3
#################################################################
# Combine other dfs:
# dynamut_df, dynamut2_df, mcsm_na_df,
# perhaps: deepddg and mcsm ppi (for embb)
################################################################
# read other files
# Build the input paths and load the additional parameter files.
# `drug` and `gene` are not set in this file; they must already exist in the
# environment this script runs in.
infilename_dynamut = paste0("~/git/Data/", drug, "/output/dynamut_results/"
                            , gene, "_complex_dynamut_norm.csv")

infilename_dynamut2 = paste0("~/git/Data/", drug
                             , "/output/dynamut_results/dynamut2/"
                             , gene, "_complex_dynamut2_norm.csv")

infilename_mcsm_na = paste0("~/git/Data/", drug, "/output/mcsm_na_results/"
                            , gene, "_complex_mcsm_na_norm.csv")

infilename_mcsm_f_snps = paste0("~/git/Data/", drug, "/output/"
                                , gene, "_mcsm_formatted_snps.csv")

dynamut_df = read.csv(infilename_dynamut)
dynamut2_df = read.csv(infilename_dynamut2)
mcsm_na_df = read.csv(infilename_mcsm_na)

# This file is read without a header row, so its column is named explicitly.
# FALSE is spelled out because F is an ordinary variable that can be reassigned.
mcsm_f_snps = read.csv(infilename_mcsm_f_snps, header = FALSE)
names(mcsm_f_snps) = "mutationinformation"

#=================================
# check with intersect to find the common col, but use
# mutationinformation column to be on the safe side
# c1: number of column names shared by dynamut_df and dynamut2_df
# c2: number of column names shared by dynamut2_df and mcsm_na_df
c1 = length(intersect(names(dynamut_df), names(dynamut2_df)))
c2 = length(intersect(names(dynamut2_df), names(mcsm_na_df)))

if (c1 == 1 && c2 == 1) {
  n_common = 1
} else {
  # Deliberately not fatal. The actual counts are reported so that zero shared
  # columns is not misreported as "more than one".
  cat("\nExpected exactly one common col per pair of dfs, but found"
      , c1, "(dynamut vs dynamut2) and"
      , c2, "(dynamut2 vs mcsm_na): inspect before merging!\n")
}

# delete chain from dynamut2_df
#dynamut2_df = subset(dynamut2_df, select = -chain)

# quick checks
# Top-level expressions giving, for dynamut_df, dynamut2_df and mcsm_na_df (in
# that order): the column counts, the column names and the row counts. They
# are displayed wherever top-level values auto-print (e.g. under Rscript).
# The parenthesised assignment stores the column counts in ncols_comb and
# yields them visibly in the same step.
(ncols_comb = lapply(list(dynamut_df, dynamut2_df, mcsm_na_df), FUN = ncol))

lapply(list(dynamut_df, dynamut2_df, mcsm_na_df), FUN = colnames)

lapply(list(dynamut_df, dynamut2_df, mcsm_na_df), FUN = nrow)

#---------------------------------
# Combine 1: all other params dfs
#---------------------------------
# Inner-join the three dfs left to right, keyed explicitly on
# mutationinformation. Without `by`, the join key would be every column name
# the two inputs share, so an extra shared column would silently become part
# of the key.
# NOTE(review): inner_join is neither defined nor loaded in this file;
# presumably dplyr is attached by the calling script -- confirm.
combined_dfs = Reduce(
  function(left_df, right_df) {
    inner_join(left_df, right_df, by = "mutationinformation")
  }
  , list(dynamut_df, dynamut2_df, mcsm_na_df)
)
# Reduce("+", ncols_comb)

#-----------------------------------------
# Combine 2: combine1 result + merged_df2
#-----------------------------------------
# Column names present in both combined_dfs and merged_df2, excluding
# mutationinformation, which is kept in both.
drop_cols = setdiff(intersect(names(combined_dfs), names(merged_df2))
                    , "mutationinformation")

# Remove those columns from combined_dfs. drop = FALSE keeps the result a
# data frame even when only a single column survives the filter.
combined_dfs_f = combined_dfs[, !colnames(combined_dfs) %in% drop_cols
                              , drop = FALSE]

# Dimensions of both dfs, for inspection
nrow(combined_dfs_f); nrow(merged_df2)
ncol(combined_dfs_f); ncol(merged_df2)

#-----------------------------------------
# Combined merged_df2
#-----------------------------------------
# Merge merged_df2 with the filtered parameter df on mutationinformation
# (merge() is otherwise called with its defaults).
merged_df2_combined = merge(merged_df2
                            , combined_dfs_f
                            , by = "mutationinformation")

# The key column appears in both inputs but only once in the result, hence -1.
expected_ncols = ncol(combined_dfs_f) + ncol(merged_df2) - 1

# Sanity check: the merge must keep the row count of merged_df2 and produce
# exactly the expected number of columns.
if (nrow(merged_df2_combined) == nrow(merged_df2) &&
    ncol(merged_df2_combined) == expected_ncols) {
  cat("\nPASS: merged_df2 combined with other parameters dfs."
      , "\nUse this for lineage distribution plots")
} else {
  cat("\nFAIL: merged_df2 didn't combine successfully with other parameters dfs")
  # Exit with a non-zero status so callers can detect the failure; a bare
  # quit() would report success (status 0).
  quit(status = 1)
}

# The intermediate dfs are no longer needed.
rm(combined_dfs, combined_dfs_f)

#================================
# combined data
# short_df ps: ~ merged_df3
# TODO: later integrate properly
#================================
#-----------------------------------------
# Combined merged_df3
#-----------------------------------------
# Keep only the first row seen for each mutationinformation value in
# merged_df2_combined.
merged_df3_combined = merged_df2_combined[
  which(!duplicated(merged_df2_combined$mutationinformation)),
]