plotting script with resolved gene metadata

2020-09-09 12:00:42 +01:00 · 2020-09-09 12:00:42 +01:00 · 31b98fb3d3
commit 31b98fb3d3
parent 774b34ef00
1 changed files with 48 additions and 138 deletions
--- a/scripts/plotting/combining_dfs_plotting.R
+++ b/scripts/plotting/combining_dfs_plotting.R
@ -36,18 +36,22 @@ source("plotting_data.R")
 # my_df_u_lig
 # dup_muts
-cat(paste0("Directories imported:"
+cat("Directories imported:"
-           , "\ndatadir:", datadir
+    , "\n===================="
-           , "\nindir:", indir
+    , "\ndatadir:", datadir
-           , "\noutdir:", outdir
+    , "\nindir:", indir
-           , "\nplotdir:", plotdir))
+    , "\noutdir:", outdir
    , "\nplotdir:", plotdir)
 cat("Variables imported:"
    , "\n====================="
    , "\ndrug:", drug
    , "\ngene:", gene
    , "\ngene_match:", gene_match
    , "\ndr_muts_col:", dr_muts_col
    , "\nother_muts_col:", other_muts_col
    , "\ndrtype_col:", resistance_col)
 cat(paste0("Variables imported:"
           , "\ndrug:", drug
           , "\ngene:", gene
           , "\ngene_match:", gene_match
           , "\nLength of upos:", length(upos)
           , "\nAngstrom symbol:", angstroms_symbol))       
 # clear excess variable
 rm(my_df, upos, dup_muts)
@ -58,25 +62,6 @@ rm(my_df, upos, dup_muts)
 #in_file1: output of plotting_data.R
 # my_df_u
 # quick checks
 head(my_df_u[, c("mutation")])
 cols_to_extract  = c("mutationinformation", "mutation", "or_mychisq", "or_kin", "af", "af_kin")
 foo = my_df_u[, colnames(my_df_u)%in% cols_to_extract]
 table(which(is.na(my_df_u$af_kin)) == which(is.na(my_df_u$af)))
 baz = read.csv(file.choose())
 baz = cbind(my_df_u$mutation, my_df_u$or_mychisq, bar$mutation, bar$or_mychisq)
 baz = as.data.frame(baz)
 colnames(baz) = c("my_df_u_muts", "my_df_u_or", "real_muts", "real_or")
 sum(is.na(baz$my_df_u_or)) == sum(is.na(my_df_u$or_mychisq))
 cat("\nNo. of with NA in or_mychisq:", sum(is.na(my_df_u$or_mychisq))
    ,"\nNo. of NA in or_kin:" , sum(is.na(my_df_u$or_kin)))
 # infile 2: gene associated meta data
 #in_filename_gene_metadata = paste0(tolower(gene),  "_meta_data_with_AFandOR.csv")
 in_filename_gene_metadata = paste0(tolower(gene),  "_metadata.csv")
@ -113,6 +98,8 @@ gene_metadata <- read.csv(infile_gene_metadata
                      , header = T)
 cat("Dim:", dim(gene_metadata))
 table(gene_metadata$mutation_info)
 # counting NAs in AF, OR cols
 # or_mychisq
@ -145,66 +132,6 @@ if (identical(sum(is.na(my_df_u$or_kin))
 str(gene_metadata)
 # change category of ambiguous mutations
 table(gene_metadata$mutation_info)
 cols_to_extract2  = c("mutationinformation", "mutation", "mutation_info")
 foo2 = gene_metadata[, colnames(gene_metadata)%in% cols_to_extract2]
 dr_muts = foo2[foo2$mutation_info == dr_muts_col,]
 other_muts = foo2[foo2$mutation_info == other_muts_col,]
 common_muts = dr_muts[dr_muts$mutation%in%other_muts$mutation,]
 #write.csv(common_muts, 'common_muts.csv')
 rm(common_muts)
 # FIXME read properly
 # "ambiguous_mut_names.csv"
 #"pnca_p.gly108arg", "pnca_p.gly132ala", "pnca_p.val180phe"
 ambiguous_muts = read.csv(file.choose())
 ambiguous_muts_names = ambiguous_muts$mutation
 common_muts_all = gene_metadata[gene_metadata$mutation%in%ambiguous_muts_names,]
 if (gene_metadata$mutation_info[gene_metadata$mutation%in%ambiguous_muts_names] == other_muts_col){
  print('change me')
 }
 # make a copy
 gene_metadata2 = gene_metadata
 table(gene_metadata$mutation_info)
 count_check = as.data.frame(cbind(table(gene_metadata$mutationinformation, gene_metadata$mutation_info)))
 #count_check$checks = ifelse(count_check$dr_mutations_pyrazinamide&&count_check$other_mutations_pyrazinamide>0, "ambi", "pass")
 table(count_check$checks)
 poo = c("V180F", "G132A", "D49G")
 poo2 = count_check[rownames(count_check)%in%poo,]
 poo2[[dr_muts_col]]&& poo2[[other_muts_col]]>0
 poo2$checks = ifelse(poo2$checkspoo2[[dr_muts_col]]&& poo2[[other_muts_col]]>0, "ambi", "pass")
 # remove common_muts_all
 ids = gene_metadata$mutation%in%common_muts_all$mutation; table(ids)
 gene_metadata_unambiguous = gene_metadata2[!ids,]
 # sanity checks: should be true
 table(gene_metadata_unambiguous$mutation%in%common_muts_all$mutation)[[1]] == nrow(gene_metadata_unambiguous)
 nrow(gene_metadata_unambiguous) + nrow(common_muts_all) == nrow(gene_metadata)
 # correct common muts
 table(common_muts_all$mutation_info)
 common_muts_all$mutation_info = as.factor(common_muts_all$mutation_info)
 # change the other_muts to dr_muts
 common_muts_all$mutation_info[common_muts_all$mutation_info==other_muts_col] <- dr_muts_col
 table(common_muts_all$mutation_info)
 common_muts_all$mutation_info = factor(common_muts_all$mutation_info)
 table(common_muts_all$mutation_info)
 # add it back to 
 gene_meta_data
 ###################################################################
 #                           combining: PS
 ###################################################################
@ -307,15 +234,6 @@ if (identical(sum(is.na(merged_df3$or_kin))
      , "\nNA in AF:", sum(is.na(merged_df3$af_kin)))
 }
 # check that the same rows have missing ORs, AFs and p-values for both mychisq and kin
 if ( identical( which(is.na(merged_df2$or_mychisq)), which(is.na(merged_df2$or_kin)))
  && identical( which(is.na(merged_df2$af)), which(is.na(merged_df2$af_kin)))
  && identical( which(is.na(merged_df2$pval_fisher)), which(is.na(merged_df2$pwald_kin))) ){
  cat("PASS: Indices match for mychisq and kin ors missing values")
 } else{
  cat("Index mismatch: mychisq and kin ors missing indices match")
  quit()
 }
 #=========================
 # Merge3: merged_df2_comp
@ -325,25 +243,21 @@ cat("Merging dfs without any NAs: big df (1-many relationship b/w id & mut)"
    ,"\nlinking col: Mutationinforamtion"
    ,"\nfilename: merged_df2_comp")
-if ( identical( which(is.na(merged_df2$af)), which(is.na(merged_df2$af_kin))) ){
+na_count_df2 = sum(is.na(merged_df2$af))
-  print("mychisq and kin ors missing indices match. Procedding with omitting NAs")
+merged_df2_comp = merged_df2[!is.na(merged_df2$af),] 
-  na_count_df2 = sum(is.na(merged_df2$af))
+
-  merged_df2_comp = merged_df2[!is.na(merged_df2$af),] 
+# sanity check: no +-1 gymnastics
-  # sanity check: no +-1 gymnastics
+cat("Checking nrows in merged_df2_comp")
-  cat("Checking nrows in merged_df2_comp")
+if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count_df2)){
-  if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count_df2)){
+  cat("\nPASS: No. of rows match"
-    cat("\nPASS: No. of rows match"
+      ,"\nDim of merged_df2_comp: "
-        ,"\nDim of merged_df2_comp: "
+      ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2
-        ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2
+      , "\nNo. of rows: ", nrow(merged_df2_comp)
-        , "\nNo. of rows: ", nrow(merged_df2_comp)
+      , "\nNo. of cols: ", ncol(merged_df2_comp))
        , "\nNo. of cols: ", ncol(merged_df2_comp))
  }else{
    cat("FAIL: No. of rows mismatch"
        ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2
        ,"\nGot no. of rows: ", nrow(merged_df2_comp))
  }
 }else{
-  print("Index mismatch for mychisq and kin ors. Aborting NA ommission")
+  cat("FAIL: No. of rows mismatch"
      ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2
      ,"\nGot no. of rows: ", nrow(merged_df2_comp))
 }
 #=========================
@ -351,26 +265,22 @@ if ( identical( which(is.na(merged_df2$af)), which(is.na(merged_df2$af_kin))) ){
 # same as merge 2 but excluding NAs from ORs, etc.,
 # or removing duplicate mutation information
 #=========================
 na_count_df3 = sum(is.na(merged_df3$af))
 #merged_df3_comp = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),] # a way
-if ( identical( which(is.na(merged_df3$af)), which(is.na(merged_df3$af_kin))) ){
+merged_df3_comp = merged_df3[!is.na(merged_df3$af),] # another way
-  print("mychisq and kin ors missing indices match. Procedding with omitting NAs")
+cat("Checking nrows in merged_df3_comp")
-  na_count_df3 = sum(is.na(merged_df3$af))
+
-  #merged_df3_comp = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),] # a way
+if(nrow(merged_df3_comp) == (nrow(merged_df3) - na_count_df3)){
-  merged_df3_comp = merged_df3[!is.na(merged_df3$af),] # another way
+  cat("\nPASS: No. of rows match"
-  cat("Checking nrows in merged_df3_comp")
+      ,"\nDim of merged_df3_comp: "
-  if(nrow(merged_df3_comp) == (nrow(merged_df3) - na_count_df3)){
+      ,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3
-    cat("\nPASS: No. of rows match"
+      , "\nNo. of rows: ", nrow(merged_df3_comp)
-        ,"\nDim of merged_df3_comp: "
+      , "\nNo. of cols: ", ncol(merged_df3_comp))
-        ,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3
+}else{
-        , "\nNo. of rows: ", nrow(merged_df3_comp)
+  cat("FAIL: No. of rows mismatch"
-        , "\nNo. of cols: ", ncol(merged_df3_comp))
+      ,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3
-    }else{
+      ,"\nGot no. of rows: ", nrow(merged_df3_comp))
    cat("FAIL: No. of rows mismatch"
        ,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3
        ,"\nGot no. of rows: ", nrow(merged_df3_comp))
   }
 } else{
  print("Index mismatch for mychisq and kin ors. Aborting NA ommission")
 }
 # alternate way of deriving merged_df3_comp