From 31b98fb3d37eb31b484168e353096372f3237327 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Wed, 9 Sep 2020 12:00:42 +0100
Subject: [PATCH]  plotting script with resolved gene metadata

---
 scripts/plotting/combining_dfs_plotting.R | 186 ++++++----------------
 1 file changed, 48 insertions(+), 138 deletions(-)

diff --git a/scripts/plotting/combining_dfs_plotting.R b/scripts/plotting/combining_dfs_plotting.R
index 5f5d284..7549de6 100644
--- a/scripts/plotting/combining_dfs_plotting.R
+++ b/scripts/plotting/combining_dfs_plotting.R
@@ -36,18 +36,22 @@ source("plotting_data.R")
 # my_df_u_lig
 # dup_muts
 
-cat(paste0("Directories imported:"
-           , "\ndatadir:", datadir
-           , "\nindir:", indir
-           , "\noutdir:", outdir
-           , "\nplotdir:", plotdir))
+cat("Directories imported:"
+    , "\n===================="
+    , "\ndatadir:", datadir
+    , "\nindir:", indir
+    , "\noutdir:", outdir
+    , "\nplotdir:", plotdir)
+
+cat("Variables imported:"
+    , "\n====================="
+    , "\ndrug:", drug
+    , "\ngene:", gene
+    , "\ngene_match:", gene_match
+    , "\ndr_muts_col:", dr_muts_col
+    , "\nother_muts_col:", other_muts_col
+    , "\ndrtype_col:", resistance_col)
 
-cat(paste0("Variables imported:"
-           , "\ndrug:", drug
-           , "\ngene:", gene
-           , "\ngene_match:", gene_match
-           , "\nLength of upos:", length(upos)
-           , "\nAngstrom symbol:", angstroms_symbol))       
 
 # clear excess variable
 rm(my_df, upos, dup_muts)
@@ -58,25 +62,6 @@ rm(my_df, upos, dup_muts)
 #in_file1: output of plotting_data.R
 # my_df_u
 
-# quick checks
-head(my_df_u[, c("mutation")])
-
-cols_to_extract  = c("mutationinformation", "mutation", "or_mychisq", "or_kin", "af", "af_kin")
-foo = my_df_u[, colnames(my_df_u)%in% cols_to_extract]
-
-
-table(which(is.na(my_df_u$af_kin)) == which(is.na(my_df_u$af)))
-
-baz = read.csv(file.choose())
-
-baz = cbind(my_df_u$mutation, my_df_u$or_mychisq, bar$mutation, bar$or_mychisq)
-baz = as.data.frame(baz)
-colnames(baz) = c("my_df_u_muts", "my_df_u_or", "real_muts", "real_or")
-sum(is.na(baz$my_df_u_or)) == sum(is.na(my_df_u$or_mychisq))
-
-cat("\nNo. of with NA in or_mychisq:", sum(is.na(my_df_u$or_mychisq))
-    ,"\nNo. of NA in or_kin:" , sum(is.na(my_df_u$or_kin)))
-
 # infile 2: gene associated meta data
 #in_filename_gene_metadata = paste0(tolower(gene),  "_meta_data_with_AFandOR.csv")
 in_filename_gene_metadata = paste0(tolower(gene),  "_metadata.csv")
@@ -113,6 +98,8 @@ gene_metadata <- read.csv(infile_gene_metadata
                       , header = T)
 cat("Dim:", dim(gene_metadata))
 
+table(gene_metadata$mutation_info)
+
 
 # counting NAs in AF, OR cols
 # or_mychisq
@@ -145,66 +132,6 @@ if (identical(sum(is.na(my_df_u$or_kin))
 
 str(gene_metadata)
 
-# change category of ambiguos mutations
-table(gene_metadata$mutation_info)
-
-cols_to_extract2  = c("mutationinformation", "mutation", "mutation_info")
-foo2 = gene_metadata[, colnames(gene_metadata)%in% cols_to_extract2]
-
-dr_muts = foo2[foo2$mutation_info == dr_muts_col,]
-other_muts = foo2[foo2$mutation_info == other_muts_col,]
-
-common_muts = dr_muts[dr_muts$mutation%in%other_muts$mutation,]
-#write.csv(common_muts, 'common_muts.csv')
-rm(common_muts)
-
-# FIXME read properly
-# "ambiguous_mut_names.csv"
-#"pnca_p.gly108arg", "pnca_p.gly132ala", "pnca_p.val180phe"
-ambiguous_muts = read.csv(file.choose())
-ambiguous_muts_names = ambiguous_muts$mutation
-
-common_muts_all = gene_metadata[gene_metadata$mutation%in%ambiguous_muts_names,]
-
-if (gene_metadata$mutation_info[gene_metadata$mutation%in%ambiguous_muts_names] == other_muts_col){
-  print('change me')
-}
-
-# make a copy
-gene_metadata2 = gene_metadata
-table(gene_metadata$mutation_info)
-count_check = as.data.frame(cbind(table(gene_metadata$mutationinformation, gene_metadata$mutation_info)))
-#count_check$checks = ifelse(count_check$dr_mutations_pyrazinamide&&count_check$other_mutations_pyrazinamide>0, "ambi", "pass")
-table(count_check$checks)
-
-
-poo = c("V180F", "G132A", "D49G")
-poo2 = count_check[rownames(count_check)%in%poo,]
-poo2[[dr_muts_col]]&& poo2[[other_muts_col]]>0
-poo2$checks = ifelse(poo2$checkspoo2[[dr_muts_col]]&& poo2[[other_muts_col]]>0, "ambi", "pass")
-
-# remove common_muts_all
-ids = gene_metadata$mutation%in%common_muts_all$mutation; table(ids)
-gene_metadata_unambiguous = gene_metadata2[!ids,]
-
-# sanity checks: should be true
-table(gene_metadata_unambiguous$mutation%in%common_muts_all$mutation)[[1]] == nrow(gene_metadata_unambiguous)
-nrow(gene_metadata_unambiguous) + nrow(common_muts_all) == nrow(gene_metadata)
-
-# correct common muts
-table(common_muts_all$mutation_info)
-common_muts_all$mutation_info = as.factor(common_muts_all$mutation_info)
-
-# change the other_muts to dr_muts
-common_muts_all$mutation_info[common_muts_all$mutation_info==other_muts_col] <- dr_muts_col
-
-table(common_muts_all$mutation_info)
-common_muts_all$mutation_info = factor(common_muts_all$mutation_info)
-table(common_muts_all$mutation_info)
-
-# add it back to 
-gene_meta_data
-
 ###################################################################
 #                           combining: PS
 ###################################################################
@@ -307,15 +234,6 @@ if (identical(sum(is.na(merged_df3$or_kin))
       , "\nNA in AF:", sum(is.na(merged_df3$af_kin)))
 }
 
-# check if the same or and afs are missing for 
-if ( identical( which(is.na(merged_df2$or_mychisq)), which(is.na(merged_df2$or_kin)))
-  && identical( which(is.na(merged_df2$af)), which(is.na(merged_df2$af_kin)))
-  && identical( which(is.na(merged_df2$pval_fisher)), which(is.na(merged_df2$pwald_kin))) ){
-  cat("PASS: Indices match for mychisq and kin ors missing values")
-} else{
-  cat("Index mismatch: mychisq and kin ors missing indices match")
-  quit()
-}
 
 #=========================
 # Merge3: merged_df2_comp
@@ -325,25 +243,21 @@ cat("Merging dfs without any NAs: big df (1-many relationship b/w id & mut)"
     ,"\nlinking col: Mutationinforamtion"
     ,"\nfilename: merged_df2_comp")
 
-if ( identical( which(is.na(merged_df2$af)), which(is.na(merged_df2$af_kin))) ){
-  print("mychisq and kin ors missing indices match. Procedding with omitting NAs")
-  na_count_df2 = sum(is.na(merged_df2$af))
-  merged_df2_comp = merged_df2[!is.na(merged_df2$af),] 
-  # sanity check: no +-1 gymnastics
-  cat("Checking nrows in merged_df2_comp")
-  if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count_df2)){
-    cat("\nPASS: No. of rows match"
-        ,"\nDim of merged_df2_comp: "
-        ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2
-        , "\nNo. of rows: ", nrow(merged_df2_comp)
-        , "\nNo. of cols: ", ncol(merged_df2_comp))
-  }else{
-    cat("FAIL: No. of rows mismatch"
-        ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2
-        ,"\nGot no. of rows: ", nrow(merged_df2_comp))
-  }
+na_count_df2 = sum(is.na(merged_df2$af))
+merged_df2_comp = merged_df2[!is.na(merged_df2$af),] 
+
+# sanity check: no +-1 gymnastics
+cat("Checking nrows in merged_df2_comp")
+if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count_df2)){
+  cat("\nPASS: No. of rows match"
+      ,"\nDim of merged_df2_comp: "
+      ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2
+      , "\nNo. of rows: ", nrow(merged_df2_comp)
+      , "\nNo. of cols: ", ncol(merged_df2_comp))
 }else{
-  print("Index mismatch for mychisq and kin ors. Aborting NA ommission")
+  cat("FAIL: No. of rows mismatch"
+      ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2
+      ,"\nGot no. of rows: ", nrow(merged_df2_comp))
 }
 
 #=========================
@@ -351,28 +265,24 @@ if ( identical( which(is.na(merged_df2$af)), which(is.na(merged_df2$af_kin))) ){
 # same as merge 2 but excluding NAs from ORs, etc or 
 # remove duplicate mutation information
 #=========================
+na_count_df3 = sum(is.na(merged_df3$af))
+#merged_df3_comp = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),] # a way
+
+merged_df3_comp = merged_df3[!is.na(merged_df3$af),] # another way
+cat("Checking nrows in merged_df3_comp")
+
+if(nrow(merged_df3_comp) == (nrow(merged_df3) - na_count_df3)){
+  cat("\nPASS: No. of rows match"
+      ,"\nDim of merged_df3_comp: "
+      ,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3
+      , "\nNo. of rows: ", nrow(merged_df3_comp)
+      , "\nNo. of cols: ", ncol(merged_df3_comp))
+}else{
+  cat("FAIL: No. of rows mismatch"
+      ,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3
+      ,"\nGot no. of rows: ", nrow(merged_df3_comp))
+}
 
-if ( identical( which(is.na(merged_df3$af)), which(is.na(merged_df3$af_kin))) ){
-  print("mychisq and kin ors missing indices match. Procedding with omitting NAs")
-  na_count_df3 = sum(is.na(merged_df3$af))
-  #merged_df3_comp = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),] # a way
-  merged_df3_comp = merged_df3[!is.na(merged_df3$af),] # another way
-  cat("Checking nrows in merged_df3_comp")
-  if(nrow(merged_df3_comp) == (nrow(merged_df3) - na_count_df3)){
-    cat("\nPASS: No. of rows match"
-        ,"\nDim of merged_df3_comp: "
-        ,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3
-        , "\nNo. of rows: ", nrow(merged_df3_comp)
-        , "\nNo. of cols: ", ncol(merged_df3_comp))
-    }else{
-    cat("FAIL: No. of rows mismatch"
-        ,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3
-        ,"\nGot no. of rows: ", nrow(merged_df3_comp))
-   }
-} else{
-  print("Index mismatch for mychisq and kin ors. Aborting NA ommission")
-} 
-  
 # alternate way of deriving merged_df3_comp
 foo = merged_df3[!is.na(merged_df3$af),]
 bar = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),]