saving recovered combining_dfs_plotting.R after editing

2020-09-10 15:52:22 +01:00 · 2020-09-10 15:52:22 +01:00 · 65841e4f5b
commit 65841e4f5b
parent 68050a93b4
1 changed files with 39 additions and 130 deletions
--- a/scripts/plotting/combining_dfs_plotting.R
+++ b/scripts/plotting/combining_dfs_plotting.R
@ -6,7 +6,6 @@
 # 2) <gene>_meta_data.csv
 # Output: 
 # 1) muts with opposite effects on stability
 # 2) large combined df including NAs for AF, OR,etc
 # 		Dim: same no. of rows as gene associated meta_data_with_AFandOR
 # 3) small combined df including NAs for AF, OR, etc.
@ -36,18 +35,18 @@ source("plotting_data.R")
 # my_df_u_lig
 # dup_muts
-cat(paste0("Directories imported:"
+cat("Directories imported:"
-           , "\ndatadir:", datadir
+    , "\ndatadir:", datadir
-           , "\nindir:", indir
+    , "\nindir:", indir
-           , "\noutdir:", outdir
+    , "\noutdir:", outdir
-           , "\nplotdir:", plotdir))
+    , "\nplotdir:", plotdir)
-cat(paste0("Variables imported:"
+cat("Variables imported:"
-           , "\ndrug:", drug
+    , "\ndrug:", drug
-           , "\ngene:", gene
+    , "\ngene:", gene
-           , "\ngene_match:", gene_match
+    , "\ngene_match:", gene_match
-           , "\nLength of upos:", length(upos)
+    , "\nLength of upos:", length(upos)
-           , "\nAngstrom symbol:", angstroms_symbol))       
+    , "\nAngstrom symbol:", angstroms_symbol)     
 # clear excess variables
 rm(my_df, upos, dup_muts)
@ -58,24 +57,6 @@ rm(my_df, upos, dup_muts)
 #in_file1: output of plotting_data.R
 # my_df_u
 # quick checks
 head(my_df_u[, c("mutation")])
 cols_to_extract  = c("mutationinformation", "mutation", "or_mychisq", "or_kin", "af", "af_kin")
 foo = my_df_u[, colnames(my_df_u)%in% cols_to_extract]
 table(which(is.na(my_df_u$af_kin)) == which(is.na(my_df_u$af)))
 baz = read.csv(file.choose())
 baz = cbind(my_df_u$mutation, my_df_u$or_mychisq, bar$mutation, bar$or_mychisq)
 baz = as.data.frame(baz)
 colnames(baz) = c("my_df_u_muts", "my_df_u_or", "real_muts", "real_or")
 sum(is.na(baz$my_df_u_or)) == sum(is.na(my_df_u$or_mychisq))
 cat("\nNo. of with NA in or_mychisq:", sum(is.na(my_df_u$or_mychisq))
    ,"\nNo. of NA in or_kin:" , sum(is.na(my_df_u$or_kin)))
 # infile 2: gene associated meta data
 #in_filename_gene_metadata = paste0(tolower(gene),  "_meta_data_with_AFandOR.csv")
@ -145,67 +126,6 @@ if (identical(sum(is.na(my_df_u$or_kin))
 str(gene_metadata)
 # change category of ambiguous mutations
 table(gene_metadata$mutation_info)
 cols_to_extract2  = c("mutationinformation", "mutation", "mutation_info")
 foo2 = gene_metadata[, colnames(gene_metadata)%in% cols_to_extract2]
 dr_muts = foo2[foo2$mutation_info == dr_muts_col,]
 other_muts = foo2[foo2$mutation_info == other_muts_col,]
 common_muts = dr_muts[dr_muts$mutation%in%other_muts$mutation,]
 #write.csv(common_muts, 'common_muts.csv')
 rm(common_muts)
 # FIXME read properly
 # "ambiguous_mut_names.csv"
 #"pnca_p.gly108arg", "pnca_p.gly132ala", "pnca_p.val180phe"
 ambiguous_muts = read.csv(file.choose())
 ambiguous_muts_names = ambiguous_muts$mutation
 common_muts_all = gene_metadata[gene_metadata$mutation%in%ambiguous_muts_names,]
 if (gene_metadata$mutation_info[gene_metadata$mutation%in%ambiguous_muts_names] == other_muts_col){
  print('change me')
 }
 # make a copy
 gene_metadata2 = gene_metadata
 table(gene_metadata$mutation_info)
 count_check = as.data.frame(cbind(table(gene_metadata$mutationinformation, gene_metadata$mutation_info)))
 #count_check$checks = ifelse(count_check$dr_mutations_pyrazinamide&&count_check$other_mutations_pyrazinamide>0, "ambi", "pass")
 table(count_check$checks)
 poo = c("V180F", "G132A", "D49G")
 poo2 = count_check[rownames(count_check)%in%poo,]
 poo2[[dr_muts_col]]&& poo2[[other_muts_col]]>0
 poo2$checks = ifelse(all(poo2$checkspoo2[[dr_muts_col]]&& poo2[[other_muts_col]])>0, "ambi", "pass")
 # remove common_muts_all
 ids = gene_metadata$mutation%in%common_muts_all$mutation; table(ids)
 gene_metadata_unambiguous = gene_metadata2[!ids,]
 # sanity checks: should be true
 table(gene_metadata_unambiguous$mutation%in%common_muts_all$mutation)[[1]] == nrow(gene_metadata_unambiguous)
 nrow(gene_metadata_unambiguous) + nrow(common_muts_all) == nrow(gene_metadata)
 # correct common muts
 table(common_muts_all$mutation_info)
 common_muts_all$mutation_info = as.factor(common_muts_all$mutation_info)
 # change the other_muts to dr_muts
 common_muts_all$mutation_info[common_muts_all$mutation_info==other_muts_col] <- dr_muts_col
 table(common_muts_all$mutation_info)
 common_muts_all$mutation_info = factor(common_muts_all$mutation_info)
 table(common_muts_all$mutation_info)
 # add the corrected ambiguous mutations back to the unambiguous rows
 gene_metadata2 = rbind(gene_metadata_unambiguous, common_muts_all)
 nrow(gene_metadata2) == nrow(gene_metadata)
 ###################################################################
 #                           combining: PS
 ###################################################################
@ -326,54 +246,44 @@ cat("Merging dfs without any NAs: big df (1-many relationship b/w id & mut)"
    ,"\nlinking col: Mutationinforamtion"
    ,"\nfilename: merged_df2_comp")
-if ( identical( which(is.na(merged_df2$af)), which(is.na(merged_df2$af_kin))) ){
+na_count_df2 = sum(is.na(merged_df2$af))
-  print("mychisq and kin ors missing indices match. Procedding with omitting NAs")
+merged_df2_comp = merged_df2[!is.na(merged_df2$af),] 
-  na_count_df2 = sum(is.na(merged_df2$af))
+# sanity check: no +-1 gymnastics
-  merged_df2_comp = merged_df2[!is.na(merged_df2$af),] 
+cat("Checking nrows in merged_df2_comp")
-  # sanity check: no +-1 gymnastics
+if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count_df2)){
-  cat("Checking nrows in merged_df2_comp")
+  cat("\nPASS: No. of rows match"
-  if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count_df2)){
+      ,"\nDim of merged_df2_comp: "
-    cat("\nPASS: No. of rows match"
+      ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2
-        ,"\nDim of merged_df2_comp: "
+      , "\nNo. of rows: ", nrow(merged_df2_comp)
-        ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2
+      , "\nNo. of cols: ", ncol(merged_df2_comp))
-        , "\nNo. of rows: ", nrow(merged_df2_comp)
+}else{
        , "\nNo. of cols: ", ncol(merged_df2_comp))
  }else{
    cat("FAIL: No. of rows mismatch"
        ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2
        ,"\nGot no. of rows: ", nrow(merged_df2_comp))
  }
 }else{
  print("Index mismatch for mychisq and kin ors. Aborting NA ommission")
 }
 #=========================
 # Merge4: merged_df3_comp
 # same as merge 2 but excluding NAs from ORs, etc.; an alternative is to
 # remove duplicate mutation information
 #=========================
-if ( identical( which(is.na(merged_df3$af)), which(is.na(merged_df3$af_kin))) ){
+na_count_df3 = sum(is.na(merged_df3$af))
-  print("mychisq and kin ors missing indices match. Procedding with omitting NAs")
+#merged_df3_comp = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),] # a way
-  na_count_df3 = sum(is.na(merged_df3$af))
+merged_df3_comp = merged_df3[!is.na(merged_df3$af),] # another way
-  #merged_df3_comp = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),] # a way
+cat("Checking nrows in merged_df3_comp")
-  merged_df3_comp = merged_df3[!is.na(merged_df3$af),] # another way
+if(nrow(merged_df3_comp) == (nrow(merged_df3) - na_count_df3)){
-  cat("Checking nrows in merged_df3_comp")
+  cat("\nPASS: No. of rows match"
-  if(nrow(merged_df3_comp) == (nrow(merged_df3) - na_count_df3)){
+      ,"\nDim of merged_df3_comp: "
-    cat("\nPASS: No. of rows match"
+      ,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3
-        ,"\nDim of merged_df3_comp: "
+      , "\nNo. of rows: ", nrow(merged_df3_comp)
-        ,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3
+      , "\nNo. of cols: ", ncol(merged_df3_comp))
-        , "\nNo. of rows: ", nrow(merged_df3_comp)
+}else{
-        , "\nNo. of cols: ", ncol(merged_df3_comp))
+  cat("FAIL: No. of rows mismatch"
-    }else{
+      ,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3
-    cat("FAIL: No. of rows mismatch"
+      , "\nGot no. of rows: ", nrow(merged_df3_comp))
        ,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3
        ,"\nGot no. of rows: ", nrow(merged_df3_comp))
   }
 } else{
  print("Index mismatch for mychisq and kin ors. Aborting NA ommission")
 }
 # alternate way of deriving merged_df3_comp
 foo = merged_df3[!is.na(merged_df3$af),]
 bar = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),]
@ -408,8 +318,7 @@ all.equal(foo, bar)
 # clear variables
 rm(foo, bar, gene_metadata
   , in_filename_params, infile_params, merging_cols
-   , in_filename_gene_metadata, infile_gene_metadata
+   , in_filename_gene_metadata, infile_gene_metadata)
   , merged_df2v2, merged_df2v3)
 #*************************
 #####################################################################
 #                       Combining: LIG