# LSHTM_analysis/scripts/plotting/redundant/other_dfs_data.R

#!/usr/bin/env Rscript

# This script was not used in the end: the problem was sorted at the source,
# with a .py script that combines all dfs to output all_params

#################################################################
# TASK: Script to add all other dfs to merged_df2 and merged_df3

#################################################################
# Combine other dfs:
# dynamut_df, dynamut2_df, mcsm_na_df,
# perhaps : deepddg and mcsm ppi (for embb)
################################################################
# Read the other result files that will be combined with merged_df2.
# `drug` and `gene` are not set in this script; they must already exist in
# the environment the script is run from.
infilename_dynamut = paste0("~/git/Data/", drug, "/output/dynamut_results/", gene
                            , "_complex_dynamut_norm.csv")

infilename_dynamut2 = paste0("~/git/Data/", drug, "/output/dynamut_results/dynamut2/", gene
                             , "_complex_dynamut2_norm.csv")

infilename_mcsm_na = paste0("~/git/Data/", drug, "/output/mcsm_na_results/", gene
                            , "_complex_mcsm_na_norm.csv")

infilename_mcsm_f_snps = paste0("~/git/Data/", drug, "/output/", gene
                                , "_mcsm_formatted_snps.csv")

dynamut_df   = read.csv(infilename_dynamut)
dynamut2_df  = read.csv(infilename_dynamut2)
mcsm_na_df   = read.csv(infilename_mcsm_na)

# This file is read without a header row, so the column is named by hand.
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned, whereas FALSE is a reserved word.
mcsm_f_snps  = read.csv(infilename_mcsm_f_snps, header = FALSE)
names(mcsm_f_snps) = "mutationinformation"

#=================================
# Check with intersect how many column names each pair of data frames shares.
# Exactly one shared column is expected per pair; the merge itself is meant to
# use the mutationinformation column to be on the safe side.
c1 = length(intersect(names(dynamut_df), names(dynamut2_df)))
c2 = length(intersect(names(dynamut2_df), names(mcsm_na_df)))

if (c1 == 1 && c2 == 1) {
  n_common = 1
} else {
  # Report the actual counts: this branch is reached for zero shared columns
  # as well as for more than one, so a fixed "more than one" message would
  # be misleading.
  cat("\nExpected exactly one common col per pair, but found", c1
      , "(dynamut vs dynamut2) and", c2, "(dynamut2 vs mcsm_na):"
      , "inspect before merging!\n")
}

# delete chain from dynamut2_df
#dynamut2_df = subset(dynamut2_df, select = -chain)

# Quick sanity checks on the three data frames about to be joined:
# number of columns, column names, and number of rows, in that order.
# Each call is left as a bare top-level expression, as in the original.
lapply(list(dynamut_df, dynamut2_df, mcsm_na_df), ncol)

lapply(list(dynamut_df, dynamut2_df, mcsm_na_df), colnames)

lapply(list(dynamut_df, dynamut2_df, mcsm_na_df), nrow)

# Column count of each data frame, kept as a list (one element per data frame)
ncols_comb = lapply(list(dynamut_df, dynamut2_df, mcsm_na_df), ncol)

#---------------------------------
# Combine 1: all other params dfs
#---------------------------------
# Inner-join the three data frames pairwise, left to right.
# The key is given explicitly: without `by`, inner_join() joins on every
# column name the two data frames happen to share, which is not the intended
# "join on mutationinformation only" behaviour.
combined_dfs = Reduce(function(left_df, right_df) {
  inner_join(left_df, right_df, by = "mutationinformation")
}, list(dynamut_df
        , dynamut2_df
        , mcsm_na_df))
# Reduce("+", ncols_comb)

#-----------------------------------------
# Combine 2: combine1 result + merged_df2
#-----------------------------------------
# Columns that exist in both combined_dfs and merged_df2 are removed from
# combined_dfs before merging; the merge key itself is kept.
drop_cols = setdiff(intersect(names(combined_dfs), names(merged_df2))
                    , "mutationinformation")

# drop = FALSE keeps the result a data frame even when only one column is left
combined_dfs_f = combined_dfs[, !colnames(combined_dfs) %in% drop_cols, drop = FALSE]

# Dimensions of both inputs to the merge that follows
nrow(combined_dfs_f); nrow(merged_df2)
ncol(combined_dfs_f); ncol(merged_df2)

#-----------------------------------------
# Combined merged_df2
#-----------------------------------------
# Merge merged_df2 with the other-parameter columns on mutationinformation
merged_df2_combined = merge(merged_df2
                            , combined_dfs_f
                            , by = "mutationinformation"
)

# The key column appears in both inputs but only once in the result
expected_ncols = ncol(combined_dfs_f) + ncol(merged_df2) - 1

# The merge counts as successful only if no rows of merged_df2 were lost or
# added and the column count is exactly as expected.
if ( nrow(merged_df2_combined) == nrow(merged_df2) && ncol(merged_df2_combined) == expected_ncols ){

  cat("\nPASS: merged_df2 combined with other parameters dfs."
      , "\nUse this for lineage distribution plots")
}else{

  cat("\nFAIL: merged_df2 didn't combine successfully with other parameters dfs")
  # Exit with a non-zero status: a bare quit() uses status 0, which would
  # report this failure as success to whatever invoked the script.
  quit(status = 1)

}

# Remove the intermediate data frames now that the merge is done
rm(list = c("combined_dfs", "combined_dfs_f"))

#================================
# combined data
# short_df ps: ~ merged_df3
# TODO: later integrate properly
#================================
#-----------------------------------------
# Combined merged_df3
#-----------------------------------------
# Keep only the first row for each mutationinformation value
merged_df3_combined = merged_df2_combined[
  !duplicated(merged_df2_combined[["mutationinformation"]]), ]