#!/usr/bin/env Rscript

# Didn't end up using it: sorted it at the source
# .py script to combine all dfs to output all_params

#################################################################
# TASK: Script to add all other dfs to merged_df2 and merged_df3
#
# Reads the dynamut, dynamut2 and mcsm_na result files, joins them
# on "mutationinformation", and merges the result into merged_df2
# (giving merged_df2_combined) and a de-duplicated version of it
# (merged_df3_combined).
#
# NOTE(review): `drug`, `gene`, `merged_df2` and `inner_join` are
# not defined in this file; presumably the calling script defines
# the first three and attaches dplyr -- confirm before running
# this standalone.
#################################################################

# Combine other dfs:
# dynamut_df, dynamut2_df, mcsm_na_df,
# perhaps : deepddg and mcsm ppi (for embb)
################################################################

# read other files
infilename_dynamut <- paste0("~/git/Data/", drug, "/output/dynamut_results/",
                             gene, "_complex_dynamut_norm.csv")

infilename_dynamut2 <- paste0("~/git/Data/", drug,
                              "/output/dynamut_results/dynamut2/",
                              gene, "_complex_dynamut2_norm.csv")

infilename_mcsm_na <- paste0("~/git/Data/", drug, "/output/mcsm_na_results/",
                             gene, "_complex_mcsm_na_norm.csv")

infilename_mcsm_f_snps <- paste0("~/git/Data/", drug, "/output/",
                                 gene, "_mcsm_formatted_snps.csv")

dynamut_df <- read.csv(infilename_dynamut)
dynamut2_df <- read.csv(infilename_dynamut2)
mcsm_na_df <- read.csv(infilename_mcsm_na)

# The formatted-snps file is read without a header row, so the column is
# named explicitly afterwards.
mcsm_f_snps <- read.csv(infilename_mcsm_f_snps, header = FALSE)
names(mcsm_f_snps) <- "mutationinformation"

#=================================
# Check with intersect to find the common cols, but always join on the
# mutationinformation column to be on the safe side.
join_key <- "mutationinformation"
param_dfs <- list(dynamut_df, dynamut2_df, mcsm_na_df)

c1 <- length(intersect(names(dynamut_df), names(dynamut2_df)))
c2 <- length(intersect(names(dynamut2_df), names(mcsm_na_df)))

# Every df must carry the join key, otherwise the join below cannot work.
has_key <- vapply(param_dfs, function(df) join_key %in% names(df), logical(1))
if (!all(has_key)) {
  stop("Column '", join_key, "' is missing from at least one of: ",
       "dynamut_df, dynamut2_df, mcsm_na_df",
       call. = FALSE)
}

# Any other column name occurring in more than one df (including the
# dynamut_df/mcsm_na_df pair) would be ambiguous after the join, so stop
# and let the user inspect rather than merging silently.
all_col_names <- unlist(lapply(param_dfs, names))
extra_shared <- setdiff(unique(all_col_names[duplicated(all_col_names)]),
                        join_key)
if (length(extra_shared) > 0) {
  stop("More than one common col found, inspect before merging! ",
       "Shared cols besides '", join_key, "': ",
       paste(extra_shared, collapse = ", "),
       call. = FALSE)
}
n_common <- 1

# delete chain from dynamut2_df
#dynamut2_df = subset(dynamut2_df, select = -chain)

# quick checks (auto-printed when run via Rscript)
lapply(param_dfs, ncol)
lapply(param_dfs, colnames)
lapply(param_dfs, nrow)

ncols_comb <- lapply(param_dfs, ncol)

#---------------------------------
# Combine 1: all other params dfs
#---------------------------------
# Join explicitly on the key instead of relying on a natural join over
# whatever columns happen to be shared.
combined_dfs <- Reduce(
  function(x, y) inner_join(x, y, by = join_key),
  param_dfs
)
# Reduce("+", ncols_comb)

#-----------------------------------------
# Combine 2: combine1 result + merged_df2
#-----------------------------------------
# Drop columns already present in merged_df2 (except the join key) so the
# merge below does not create duplicated columns.
drop_cols <- setdiff(intersect(names(combined_dfs), names(merged_df2)),
                     join_key)

# drop = FALSE keeps a data frame even if only the key column survives.
combined_dfs_f <- combined_dfs[, !colnames(combined_dfs) %in% drop_cols,
                               drop = FALSE]

nrow(combined_dfs_f); nrow(merged_df2)
ncol(combined_dfs_f); ncol(merged_df2)

#-----------------------------------------
# Combined merged_df2
#-----------------------------------------
merged_df2_combined <- merge(merged_df2,
                             combined_dfs_f,
                             by = join_key)

# The key column is shared, hence the "- 1".
expected_ncols <- ncol(combined_dfs_f) + ncol(merged_df2) - 1

if (nrow(merged_df2_combined) == nrow(merged_df2) &&
    ncol(merged_df2_combined) == expected_ncols) {
  cat("\nPASS: merged_df2 combined with other parameters dfs."
      , "\nUse this for lineage distribution plots")
} else {
  # stop() rather than quit(): quit() would exit with a success status and
  # would terminate the whole session if this script is source()d.
  stop("FAIL: merged_df2 didn't combine successfully with other parameters dfs",
       call. = FALSE)
}

rm(combined_dfs, combined_dfs_f,
   param_dfs, has_key, all_col_names, extra_shared, drop_cols)

#================================
# combined data
# short_df ps: ~ merged_df3
# TODO: later integrate properly
#================================

#-----------------------------------------
# Combined merged_df3
#-----------------------------------------
# Keep the first row for each mutationinformation value.
merged_df3_combined <- merged_df2_combined[
  !duplicated(merged_df2_combined$mutationinformation),
]