added shorter scripts for each different processing step for plots to make it easier to read code

2021-09-10 18:20:45 +01:00 · 2021-09-10 18:20:45 +01:00 · 3f3fe89a6b
commit 3f3fe89a6b
parent 27f0b15d4c
6 changed files with 1292 additions and 0 deletions
--- a/scripts/plotting/redundant/other_dfs_data.R
+++ b/scripts/plotting/redundant/other_dfs_data.R
@ -0,0 +1,117 @@
+#!/usr/bin/env Rscript  
+
+# Didn't end up using it: sorted it at the source
+# .py script to combine all dfs to output all_params
+
+#################################################################
+# TASK: Script to add all other dfs to merged_df2 and merged_df3
+
+#################################################################
+# Combine other dfs:
+# dynamut_df, dynamut2_df, mcsm_na_df, 
+# perhaps : deepddg and mcsm ppi (for embb)
+################################################################
+# read other files (drug and gene are not assigned in this script: set them first)
+infilename_dynamut = paste0("~/git/Data/", drug, "/output/dynamut_results/", gene
+                            , "_complex_dynamut_norm.csv")
+
+infilename_dynamut2  = paste0("~/git/Data/", drug, "/output/dynamut_results/dynamut2/", gene
+                              , "_complex_dynamut2_norm.csv")
+
+infilename_mcsm_na = paste0("~/git/Data/", drug, "/output/mcsm_na_results/", gene
+                            , "_complex_mcsm_na_norm.csv")
+
+infilename_mcsm_f_snps = paste0("~/git/Data/", drug, "/output/", gene
+                                , "_mcsm_formatted_snps.csv")
+
+dynamut_df   = read.csv(infilename_dynamut)
+dynamut2_df  = read.csv(infilename_dynamut2)
+mcsm_na_df   = read.csv(infilename_mcsm_na)
+mcsm_f_snps  = read.csv(infilename_mcsm_f_snps, header = FALSE) # file has no header row
+names(mcsm_f_snps) = "mutationinformation"
+
+#=================================
+# Check with intersect() that each consecutive pair of dfs shares exactly one
+# column; the later merge uses the mutationinformation column to be on the safe side.
+c1 = length(intersect(names(dynamut_df), names(dynamut2_df)))
+c2 = length(intersect(names(dynamut2_df), names(mcsm_na_df)))
+
+if (c1 == 1 && c2 == 1) {
+  n_common = 1
+}else{
+  # also reached when a pair shares no column at all, so report the real counts
+  cat("\nExpected one common col per pair, found", c1, "and", c2, "- inspect before merging!")
+}
+# delete chain from dynamut2_df
+#dynamut2_df = subset(dynamut2_df, select = -chain)
+
+# quick checks: column counts, column names and row counts of the three dfs,
+# auto-printed when the script runs at top level.
+# The column counts are also kept in ncols_comb.
+
+ncols_comb = lapply(list(dynamut_df
+                         , dynamut2_df
+                         , mcsm_na_df), ncol)
+ncols_comb
+
+lapply(list(dynamut_df
+            , dynamut2_df
+            , mcsm_na_df), colnames)
+
+lapply(list(dynamut_df
+            , dynamut2_df
+            , mcsm_na_df), nrow)
+
+#---------------------------------
+# Combine 1: all other params dfs
+#---------------------------------
+# inner joins chained left to right: (dynamut + dynamut2), then + mcsm_na
+combined_dfs = inner_join(inner_join(dynamut_df, dynamut2_df)
+                          , mcsm_na_df)
+# Reduce("+", ncols_comb)
+
+#-----------------------------------------
+# Combine 2: combine1 result + merged_df2
+#-----------------------------------------
+# Columns present in both dfs would come out of merge() twice (.x/.y suffixes),
+# so drop them from combined_dfs, keeping only the key mutationinformation.
+drop_cols = setdiff(intersect(names(combined_dfs), names(merged_df2))
+                    , "mutationinformation")
+
+# drop = FALSE: stay a data.frame even if only one column survives
+combined_dfs_f = combined_dfs[, !colnames(combined_dfs) %in% drop_cols, drop = FALSE]
+nrow(combined_dfs_f); nrow(merged_df2)
+ncol(combined_dfs_f); ncol(merged_df2)
+
+#-----------------------------------------
+# Combined merged_df2
+#-----------------------------------------
+# merge() keeps only rows whose key is present in both dfs (all = FALSE default);
+# the row-count comparison below checks the result against merged_df2
+merged_df2_combined = merge(merged_df2
+                            , combined_dfs_f
+                            , by = "mutationinformation")
+
+# the key column is shared by both inputs, so it is counted once
+expected_ncols = ncol(combined_dfs_f)+ ncol(merged_df2) - 1
+
+if ( nrow(merged_df2_combined) == nrow(merged_df2) && ncol(merged_df2_combined) == expected_ncols ){
+  cat("\nPASS: merged_df2 combined with other parameters dfs."
+      , "\nUse this for lineage distribution plots")
+}else{
+  # stop() instead of quit(): signals an error and does not end an interactive session
+  stop("\nFAIL: merged_df2 didn't combine successfully with other parameters dfs"
+       , call. = FALSE)
+}
+
+rm(combined_dfs, combined_dfs_f)
+
+#================================
+# combined data
+# short_df ps: ~ merged_df3
+# TODO: later integrate properly
+#================================
+#-----------------------------------------
+# Combined merged_df3: first row of merged_df2_combined per mutationinformation
+#-----------------------------------------
+merged_df3_combined = merged_df2_combined[!duplicated(merged_df2_combined[["mutationinformation"]]), ]