saving recovered combining_dfs_plotting.R after editing

2020-09-10 15:52:22 +01:00 · 2020-09-10 15:52:22 +01:00 · 01273a8184
commit 01273a8184
parent e023472091
1 changed file with 39 additions and 130 deletions
--- a/scripts/plotting/combining_dfs_plotting.R
+++ b/scripts/plotting/combining_dfs_plotting.R
@ -6,7 +6,6 @@
 # 2) <gene>_meta_data.csv

 # Output: 
-# 1) muts with opposite effects on stability
 # 2) large combined df including NAs for AF, OR,etc
 # 		Dim: same no. of rows as gene associated meta_data_with_AFandOR
 # 3) small combined df including NAs for AF, OR, etc.
@ -36,18 +35,18 @@ source("plotting_data.R")
 # my_df_u_lig
 # dup_muts

-cat(paste0("Directories imported:"
-           , "\ndatadir:", datadir
-           , "\nindir:", indir
-           , "\noutdir:", outdir
-           , "\nplotdir:", plotdir))
+cat("Directories imported:"
+    , "\ndatadir:", datadir
+    , "\nindir:", indir
+    , "\noutdir:", outdir
+    , "\nplotdir:", plotdir)

-cat(paste0("Variables imported:"
-           , "\ndrug:", drug
-           , "\ngene:", gene
-           , "\ngene_match:", gene_match
-           , "\nLength of upos:", length(upos)
-           , "\nAngstrom symbol:", angstroms_symbol))       
+cat("Variables imported:"
+    , "\ndrug:", drug
+    , "\ngene:", gene
+    , "\ngene_match:", gene_match
+    , "\nLength of upos:", length(upos)
+    , "\nAngstrom symbol:", angstroms_symbol)     

 # clear excess variable
 rm(my_df, upos, dup_muts)
@ -58,24 +57,6 @@ rm(my_df, upos, dup_muts)
 #in_file1: output of plotting_data.R
 # my_df_u

-# quick checks
-head(my_df_u[, c("mutation")])
-
-cols_to_extract  = c("mutationinformation", "mutation", "or_mychisq", "or_kin", "af", "af_kin")
-foo = my_df_u[, colnames(my_df_u)%in% cols_to_extract]
-
-
-table(which(is.na(my_df_u$af_kin)) == which(is.na(my_df_u$af)))
-
-baz = read.csv(file.choose())
-
-baz = cbind(my_df_u$mutation, my_df_u$or_mychisq, bar$mutation, bar$or_mychisq)
-baz = as.data.frame(baz)
-colnames(baz) = c("my_df_u_muts", "my_df_u_or", "real_muts", "real_or")
-sum(is.na(baz$my_df_u_or)) == sum(is.na(my_df_u$or_mychisq))
-
-cat("\nNo. of with NA in or_mychisq:", sum(is.na(my_df_u$or_mychisq))
-    ,"\nNo. of NA in or_kin:" , sum(is.na(my_df_u$or_kin)))

 # infile 2: gene associated meta data
 #in_filename_gene_metadata = paste0(tolower(gene),  "_meta_data_with_AFandOR.csv")
@ -145,67 +126,6 @@ if (identical(sum(is.na(my_df_u$or_kin))

 str(gene_metadata)

-# change category of ambiguos mutations
-table(gene_metadata$mutation_info)
-
-cols_to_extract2  = c("mutationinformation", "mutation", "mutation_info")
-foo2 = gene_metadata[, colnames(gene_metadata)%in% cols_to_extract2]
-
-dr_muts = foo2[foo2$mutation_info == dr_muts_col,]
-other_muts = foo2[foo2$mutation_info == other_muts_col,]
-
-common_muts = dr_muts[dr_muts$mutation%in%other_muts$mutation,]
-#write.csv(common_muts, 'common_muts.csv')
-rm(common_muts)
-
-# FIXME read properly
-# "ambiguous_mut_names.csv"
-#"pnca_p.gly108arg", "pnca_p.gly132ala", "pnca_p.val180phe"
-ambiguous_muts = read.csv(file.choose())
-ambiguous_muts_names = ambiguous_muts$mutation
-
-common_muts_all = gene_metadata[gene_metadata$mutation%in%ambiguous_muts_names,]
-
-if (gene_metadata$mutation_info[gene_metadata$mutation%in%ambiguous_muts_names] == other_muts_col){
-  print('change me')
-}
-
-# make a copy
-gene_metadata2 = gene_metadata
-table(gene_metadata$mutation_info)
-count_check = as.data.frame(cbind(table(gene_metadata$mutationinformation, gene_metadata$mutation_info)))
-#count_check$checks = ifelse(count_check$dr_mutations_pyrazinamide&&count_check$other_mutations_pyrazinamide>0, "ambi", "pass")
-table(count_check$checks)
-
-
-poo = c("V180F", "G132A", "D49G")
-poo2 = count_check[rownames(count_check)%in%poo,]
-poo2[[dr_muts_col]]&& poo2[[other_muts_col]]>0
-poo2$checks = ifelse(all(poo2$checkspoo2[[dr_muts_col]]&& poo2[[other_muts_col]])>0, "ambi", "pass")
-
-# remove common_muts_all
-ids = gene_metadata$mutation%in%common_muts_all$mutation; table(ids)
-gene_metadata_unambiguous = gene_metadata2[!ids,]
-
-# sanity checks: should be true
-table(gene_metadata_unambiguous$mutation%in%common_muts_all$mutation)[[1]] == nrow(gene_metadata_unambiguous)
-nrow(gene_metadata_unambiguous) + nrow(common_muts_all) == nrow(gene_metadata)
-
-# correct common muts
-table(common_muts_all$mutation_info)
-common_muts_all$mutation_info = as.factor(common_muts_all$mutation_info)
-
-# change the other_muts to dr_muts
-common_muts_all$mutation_info[common_muts_all$mutation_info==other_muts_col] <- dr_muts_col
-
-table(common_muts_all$mutation_info)
-common_muts_all$mutation_info = factor(common_muts_all$mutation_info)
-table(common_muts_all$mutation_info)
-
-# add it back to 
-gene_metadata2 = rbind(gene_metadata_unambiguous, common_muts_all)
-nrow(gene_metadata2) == nrow(gene_metadata)
-
 ###################################################################
 #                           combining: PS
 ###################################################################
@ -326,53 +246,43 @@ cat("Merging dfs without any NAs: big df (1-many relationship b/w id & mut)"
    ,"\nlinking col: Mutationinforamtion"
    ,"\nfilename: merged_df2_comp")

-if ( identical( which(is.na(merged_df2$af)), which(is.na(merged_df2$af_kin))) ){
-  print("mychisq and kin ors missing indices match. Procedding with omitting NAs")
-  na_count_df2 = sum(is.na(merged_df2$af))
-  merged_df2_comp = merged_df2[!is.na(merged_df2$af),] 
-  # sanity check: no +-1 gymnastics
-  cat("Checking nrows in merged_df2_comp")
-  if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count_df2)){
-    cat("\nPASS: No. of rows match"
-        ,"\nDim of merged_df2_comp: "
-        ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2
-        , "\nNo. of rows: ", nrow(merged_df2_comp)
-        , "\nNo. of cols: ", ncol(merged_df2_comp))
-  }else{
+na_count_df2 = sum(is.na(merged_df2$af))
+merged_df2_comp = merged_df2[!is.na(merged_df2$af),] 
+# sanity check: no +-1 gymnastics
+cat("Checking nrows in merged_df2_comp")
+if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count_df2)){
+  cat("\nPASS: No. of rows match"
+      ,"\nDim of merged_df2_comp: "
+      ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2
+      , "\nNo. of rows: ", nrow(merged_df2_comp)
+      , "\nNo. of cols: ", ncol(merged_df2_comp))
+}else{
    cat("FAIL: No. of rows mismatch"
        ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2
        ,"\nGot no. of rows: ", nrow(merged_df2_comp))
-  }
-}else{
-  print("Index mismatch for mychisq and kin ors. Aborting NA ommission")
 }
-
 #=========================
 # Merge4: merged_df3_comp
 # same as merge 2 but excluding NAs from ORs, etc or 
 # remove duplicate mutation information
 #=========================

-if ( identical( which(is.na(merged_df3$af)), which(is.na(merged_df3$af_kin))) ){
-  print("mychisq and kin ors missing indices match. Procedding with omitting NAs")
-  na_count_df3 = sum(is.na(merged_df3$af))
-  #merged_df3_comp = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),] # a way
-  merged_df3_comp = merged_df3[!is.na(merged_df3$af),] # another way
-  cat("Checking nrows in merged_df3_comp")
-  if(nrow(merged_df3_comp) == (nrow(merged_df3) - na_count_df3)){
-    cat("\nPASS: No. of rows match"
-        ,"\nDim of merged_df3_comp: "
-        ,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3
-        , "\nNo. of rows: ", nrow(merged_df3_comp)
-        , "\nNo. of cols: ", ncol(merged_df3_comp))
-    }else{
-    cat("FAIL: No. of rows mismatch"
-        ,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3
-        ,"\nGot no. of rows: ", nrow(merged_df3_comp))
-   }
-} else{
-  print("Index mismatch for mychisq and kin ors. Aborting NA ommission")
-} 
+na_count_df3 = sum(is.na(merged_df3$af))
+#merged_df3_comp = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),] # a way
+merged_df3_comp = merged_df3[!is.na(merged_df3$af),] # another way
+cat("Checking nrows in merged_df3_comp")
+if(nrow(merged_df3_comp) == (nrow(merged_df3) - na_count_df3)){
+  cat("\nPASS: No. of rows match"
+      ,"\nDim of merged_df3_comp: "
+      ,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3
+      , "\nNo. of rows: ", nrow(merged_df3_comp)
+      , "\nNo. of cols: ", ncol(merged_df3_comp))
+}else{
+  cat("FAIL: No. of rows mismatch"
+      ,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3
+      , "\nGot no. of rows: ", nrow(merged_df3_comp))
+}
+
  
 # alternate way of deriving merged_df3_comp
 foo = merged_df3[!is.na(merged_df3$af),]
@ -408,8 +318,7 @@ all.equal(foo, bar)
 # clear variables
 rm(foo, bar, gene_metadata
   , in_filename_params, infile_params, merging_cols
-   , in_filename_gene_metadata, infile_gene_metadata
-   , merged_df2v2, merged_df2v3)
+   , in_filename_gene_metadata, infile_gene_metadata)
 #*************************
 #####################################################################
 #                       Combining: LIG