re-adding deleted combining_dfs_plotting.R

2020-09-10 15:28:10 +01:00 · 2020-09-10 15:28:10 +01:00 · fdecc944fc
commit fdecc944fc
parent d43ecfa1dc
2 changed files with 443 additions and 6 deletions
--- a/scripts/combining_dfs_plotting.R
+++ b/scripts/combining_dfs_plotting.R
@ -0,0 +1,439 @@
 #!/usr/bin/env Rscript
 #########################################################
 # TASK: To combine struct params and meta data for plotting
 # Input csv files:
 # 1) <gene>_all_params.csv
 # 2) <gene>_metadata.csv
 # Output: 
 # 1) muts with opposite effects on stability
 # 2) large combined df including NAs for AF, OR,etc
 # 		Dim: same no. of rows as gene associated meta_data_with_AFandOR
 # 3) small combined df including NAs for AF, OR, etc.
 # 		Dim: same as mcsm data
 # 4) large combined df excluding NAs 
 # 		Dim: dim(#2) - na_count_df2
 # 5) small combined df excluding NAs
 # 		Dim: dim(#3) - na_count_df3
 # This script is sourced from other .R scripts for plotting
 #########################################################
 #=======================================================================
 # working dir and loading libraries
 # Setup: switch into the plotting scripts directory and source the two helper
 # scripts this file depends on. getwd() is called before and after setwd() so
 # the change of directory can be seen when the script is run at top level.
 # NOTE(review): the setwd() path is hard-coded to one checkout location; both
 # source() calls below are resolved relative to it.
 getwd()
 setwd("~/git/LSHTM_analysis/scripts/plotting/")
 getwd()
 source("Header_TT.R")
 #require(data.table)
 #require(arsenal)
 #require(compare)
 #library(tidyverse)
 source("plotting_data.R")
 # should return the following dfs, directories and variables
 # my_df
 # my_df_u
 # my_df_u_lig
 # dup_muts
 # Echo the directory variables that the rest of this script relies on
 # (presumably defined by the sourced scripts above -- confirm there).
 cat(paste0("Directories imported:"
           , "\ndatadir:", datadir
           , "\nindir:", indir
           , "\noutdir:", outdir
           , "\nplotdir:", plotdir))
 # Echo the scalar variables used further down (gene is used to build the
 # metadata file name; upos is only reported by its length here).
 cat(paste0("Variables imported:"
           , "\ndrug:", drug
           , "\ngene:", gene
           , "\ngene_match:", gene_match
           , "\nLength of upos:", length(upos)
           , "\nAngstrom symbol:", angstroms_symbol))       
 # clear excess variable
 # (my_df, upos and dup_muts are not referenced again in this script)
 rm(my_df, upos, dup_muts)
 #========================================================
 #===========
 # input
 #===========
 #in_file1: output of plotting_data.R
 # my_df_u
 # quick checks
 # Quick interactive checks on my_df_u: peek at the mutation column, pull out
 # the OR/AF columns, and compare ORs side by side with a reference csv chosen
 # by the user.
 head(my_df_u[, c("mutation")])
 cols_to_extract  = c("mutationinformation", "mutation", "or_mychisq", "or_kin", "af", "af_kin")
 foo = my_df_u[, colnames(my_df_u)%in% cols_to_extract]
 # do af and af_kin have their NAs at the same row indices?
 table(which(is.na(my_df_u$af_kin)) == which(is.na(my_df_u$af)))
 # Reference data to compare against. This must be stored in `bar`: the cbind()
 # below reads bar$mutation and bar$or_mychisq (it was previously read into
 # `baz`, leaving `bar` undefined at this point).
 # NOTE(review): file.choose() needs an interactive session.
 bar = read.csv(file.choose())
 baz = cbind(my_df_u$mutation, my_df_u$or_mychisq, bar$mutation, bar$or_mychisq)
 baz = as.data.frame(baz)
 colnames(baz) = c("my_df_u_muts", "my_df_u_or", "real_muts", "real_or")
 # the NA count must survive the cbind/as.data.frame round trip
 sum(is.na(baz$my_df_u_or)) == sum(is.na(my_df_u$or_mychisq))
 cat("\nNo. of with NA in or_mychisq:", sum(is.na(my_df_u$or_mychisq))
    ,"\nNo. of NA in or_kin:" , sum(is.na(my_df_u$or_kin)))
 # infile 2: gene associated meta data, located in outdir
 # (earlier file name, no longer used: <gene>_meta_data_with_AFandOR.csv)
 in_filename_gene_metadata <- paste(tolower(gene), "_metadata.csv", sep = "")
 infile_gene_metadata <- paste(outdir, in_filename_gene_metadata, sep = "/")
 cat(paste("Input infile 2:", infile_gene_metadata, sep = ""))
 #===========
 # output
 #===========
 # other variables that you can write
 # primarily called by other scripts for plotting
 # PS combined: 
 # 1) merged_df2
 # 2) merged_df2_comp
 # 3) merged_df3
 # 4) merged_df3_comp
 # LIG combined: 
 # 5) merged_df2_lig
 # 6) merged_df2_comp_lig
 # 7) merged_df3_lig
 # 8) merged_df3_comp_lig
 #%%===============================================================
 ###########################
 # 2: Read file: <gene>_metadata.csv
 ###########################
 # Read the gene-associated metadata csv (path built in infile_gene_metadata)
 # and report its dimensions.
 cat("Reading meta data file:", infile_gene_metadata)
 gene_metadata <- read.csv(infile_gene_metadata
                      , stringsAsFactors = FALSE
                      , header = TRUE)
 cat("Dim:", dim(gene_metadata))
 # counting NAs in AF, OR cols
 # The three NA counts are compared pairwise with `==`. identical() only
 # compares its first two arguments (its third positional argument is
 # `num.eq`), so identical(a, b, c) never looked at the third count.
 # or_mychisq
 if (sum(is.na(my_df_u$or_mychisq)) == sum(is.na(my_df_u$pval_fisher))
     && sum(is.na(my_df_u$pval_fisher)) == sum(is.na(my_df_u$af))){
   cat("\nPASS: NA count match for OR, pvalue and AF\n")
   na_count = sum(is.na(my_df_u$af))
   cat("\nNo. of NAs: ", sum(is.na(my_df_u$or_mychisq)))
 } else{
   cat("\nFAIL: NA count mismatch"
       , "\nNA in OR: ", sum(is.na(my_df_u$or_mychisq))
       , "\nNA in pvalue: ", sum(is.na(my_df_u$pval_fisher))
       , "\nNA in AF:", sum(is.na(my_df_u$af)))
 }
 # or kin
 # (on PASS this overwrites na_count set by the or_mychisq check above)
 if (sum(is.na(my_df_u$or_kin)) == sum(is.na(my_df_u$pwald_kin))
     && sum(is.na(my_df_u$pwald_kin)) == sum(is.na(my_df_u$af_kin))){
   cat("\nPASS: NA count match for OR, pvalue and AF\n from Kinship matrix calculations")
   na_count = sum(is.na(my_df_u$af_kin))
   cat("\nNo. of NAs: ", sum(is.na(my_df_u$or_kin)))
 } else{
   cat("\nFAIL: NA count mismatch"
       , "\nNA in OR: ", sum(is.na(my_df_u$or_kin))
       , "\nNA in pvalue: ", sum(is.na(my_df_u$pwald_kin))
       , "\nNA in AF:", sum(is.na(my_df_u$af_kin)))
 }
 # Inspect the metadata and pull out the rows for mutations that are labelled
 # ambiguously (the same mutation appearing under both mutation_info labels).
 str(gene_metadata)
 # change category of ambiguous mutations
 table(gene_metadata$mutation_info)
 cols_to_extract2  = c("mutationinformation", "mutation", "mutation_info")
 foo2 = gene_metadata[, colnames(gene_metadata)%in% cols_to_extract2]
 dr_muts = foo2[foo2$mutation_info == dr_muts_col,]
 other_muts = foo2[foo2$mutation_info == other_muts_col,]
 # mutations present under both labels; only computed for optional export
 common_muts = dr_muts[dr_muts$mutation%in%other_muts$mutation,]
 #write.csv(common_muts, 'common_muts.csv')
 rm(common_muts)
 # FIXME read properly
 # "ambiguous_mut_names.csv"
 #"pnca_p.gly108arg", "pnca_p.gly132ala", "pnca_p.val180phe"
 # NOTE(review): file.choose() needs an interactive session.
 ambiguous_muts = read.csv(file.choose())
 ambiguous_muts_names = ambiguous_muts$mutation
 common_muts_all = gene_metadata[gene_metadata$mutation%in%ambiguous_muts_names,]
 # The selected rows form a vector of labels, so the test is reduced with any():
 # a length > 1 condition in if() is an error from R 4.2 onwards. na.rm guards
 # against NA labels turning the condition into NA.
 if (any(common_muts_all$mutation_info == other_muts_col, na.rm = TRUE)){
   print('change me')
 }
 # make a copy
 gene_metadata2 = gene_metadata
 table(gene_metadata$mutation_info)
 # Cross-tabulate mutationinformation (rows) against mutation_info (columns);
 # each cell is the number of metadata rows with that combination.
 count_check = as.data.frame(cbind(table(gene_metadata$mutationinformation, gene_metadata$mutation_info)))
 #count_check$checks = ifelse(count_check$dr_mutations_pyrazinamide&&count_check$other_mutations_pyrazinamide>0, "ambi", "pass")
 # (count_check$checks is not populated while the line above stays commented)
 table(count_check$checks)
 # spot-check a few mutations by name
 poo = c("V180F", "G132A", "D49G")
 poo2 = count_check[rownames(count_check)%in%poo,]
 # A mutation is ambiguous when it has a non-zero count under BOTH labels.
 # This is a per-row test, so it uses the elementwise `&` with an explicit
 # `> 0` on each side; scalar `&&` errors on length > 1 input in R >= 4.3.
 poo2[[dr_muts_col]] > 0 & poo2[[other_muts_col]] > 0
 poo2$checks = ifelse(poo2[[dr_muts_col]] > 0 & poo2[[other_muts_col]] > 0, "ambi", "pass")
 # Split the metadata into unambiguous rows and the ambiguous rows held in
 # common_muts_all, relabel the ambiguous ones as dr_muts_col, then bind the
 # two parts back together as gene_metadata2.
 # remove common_muts_all
 ids = gene_metadata$mutation%in%common_muts_all$mutation; table(ids)
 gene_metadata_unambiguous = gene_metadata2[!ids,]
 # sanity checks: should be true
 # (written with any() so it also works when one of the two groups is empty)
 !any(gene_metadata_unambiguous$mutation%in%common_muts_all$mutation)
 nrow(gene_metadata_unambiguous) + nrow(common_muts_all) == nrow(gene_metadata)
 # correct common muts
 table(common_muts_all$mutation_info)
 # change the other_muts to dr_muts
 # Relabel while the column is still character: assigning a value that is not
 # already a level of a factor yields NA (with a warning), which would happen
 # whenever none of the ambiguous rows carries dr_muts_col yet.
 common_muts_all$mutation_info = as.character(common_muts_all$mutation_info)
 common_muts_all$mutation_info[common_muts_all$mutation_info %in% other_muts_col] <- dr_muts_col
 table(common_muts_all$mutation_info)
 # convert to factor only after relabelling, so no unused level remains
 common_muts_all$mutation_info = factor(common_muts_all$mutation_info)
 table(common_muts_all$mutation_info)
 # add it back to the unambiguous rows
 # NOTE(review): the merges further down use gene_metadata, not gene_metadata2,
 # so this relabelling does not reach merged_df2/merged_df3 -- confirm intent.
 gene_metadata2 = rbind(gene_metadata_unambiguous, common_muts_all)
 nrow(gene_metadata2) == nrow(gene_metadata)
 ###################################################################
 #                           combining: PS
 ###################################################################
 # sort by position (same as my_df)
 head(gene_metadata$position)
 gene_metadata = gene_metadata[order(gene_metadata$position),]
 head(gene_metadata$position)
 #=========================
 # Merge 1: merged_df2
 # dfs with NAs in ORs
 #=========================
 head(my_df_u$mutationinformation)
 head(gene_metadata$mutationinformation)
 # Find common columns b/w two df
 merging_cols = intersect(colnames(my_df_u), colnames(gene_metadata))
 cat(paste0("Merging dfs with NAs: big df (1-many relationship b/w id & mut)"
           , "\nNo. of merging cols:", length(merging_cols)
           , "\nMerging columns identified:"))
 print(merging_cols)
 # important checks!
 table(nchar(my_df_u$mutationinformation))
 table(nchar(my_df_u$wild_type))
 table(nchar(my_df_u$mutant_type))
 table(nchar(my_df_u$position))
 # all.y  because x might contain non-structural positions!
 # (all.y = TRUE keeps every row of my_df_u, matched or not)
 merged_df2 = merge(x = gene_metadata
                   , y = my_df_u
                   , by = merging_cols
                   , all.y = TRUE)
 cat("Dim of merged_df2: ", dim(merged_df2))
 head(merged_df2$position)
 # sanity check 
 cat("Checking nrows in merged_df2")
 if(nrow(gene_metadata) == nrow(merged_df2)){
   cat("PASS: nrow(merged_df2) = nrow (gene associated gene_metadata)"
       ,"\nExpected no. of rows: ",nrow(gene_metadata) 
       ,"\nGot no. of rows: ", nrow(merged_df2))
 } else{
   cat("FAIL: nrow(merged_df2)!= nrow(gene associated gene_metadata)"
       , "\nExpected no. of rows after merge: ", nrow(gene_metadata)
       , "\nGot no. of rows: ", nrow(merged_df2)
       , "\nFinding discrepancy")
   merged_muts_u = unique(merged_df2$mutationinformation)
   meta_muts_u = unique(gene_metadata$mutationinformation)
   # mutations present in the metadata but missing after the merge; printed
   # explicitly because values inside a braced block are not auto-printed
   print(unique(meta_muts_u[! meta_muts_u %in% merged_muts_u]))
   # Abort with an error rather than quit(): quit() ends the whole R session
   # (including that of a script sourcing this file) and exits with status 0.
   stop("nrow(merged_df2) != nrow(gene_metadata) after merge", call. = FALSE)
 }
 #=========================
 # Merge 2: merged_df3
 # dfs with NAs in ORs
 #
 # Cannot trust lineage, country from this df as the same mutation
 # can have many different lineages
 # but this should be good for the numerical corr plots
 #=========================
 # remove duplicated mutations
 # merged_df3 keeps the first row of merged_df2 for each mutationinformation.
 cat("Merging dfs without NAs: small df (removing muts with no AF|OR associated)"
    ,"\nCannot trust lineage info from this"
    ,"\nlinking col: mutationinforamtion"
    ,"\nfilename: merged_df3")
 merged_df3 = merged_df2[!duplicated(merged_df2$mutationinformation),] 
 head(merged_df3$position); tail(merged_df3$position) # should be sorted
 # sanity check
 cat("Checking nrows in merged_df3")
 if(nrow(my_df_u) == nrow(merged_df3)){
   cat("PASS: No. of rows match with my_df"
       ,"\nExpected no. of rows: ", nrow(my_df_u)
       ,"\nGot no. of rows: ", nrow(merged_df3))
 } else {
   cat("FAIL: No. of rows mismatch"
       , "\nNo. of rows my_df: ", nrow(my_df_u)
       , "\nNo. of rows merged_df3: ", nrow(merged_df3))
   # stop() instead of quit(): quit() would end the whole R session and
   # exit with status 0
   stop("nrow(merged_df3) != nrow(my_df_u)", call. = FALSE)
 }
 # counting NAs in AF, OR cols in merged_df3
 # this is because mcsm has no AF, OR cols,
 # so you cannot count NAs
 # The three counts are compared pairwise: identical() only compares its first
 # two arguments (the third positional argument is `num.eq`).
 if (sum(is.na(merged_df3$or_kin)) == sum(is.na(merged_df3$pwald_kin))
     && sum(is.na(merged_df3$pwald_kin)) == sum(is.na(merged_df3$af_kin))){
   cat("PASS: NA count match for OR, pvalue and AF\n")
   na_count_df3 = sum(is.na(merged_df3$af_kin))
   cat("No. of NAs: ", sum(is.na(merged_df3$or_kin)))
 } else{
   cat("FAIL: NA count mismatch"
       , "\nNA in OR: ", sum(is.na(merged_df3$or_kin))
       , "\nNA in pvalue: ", sum(is.na(merged_df3$pwald_kin))
       , "\nNA in AF:", sum(is.na(merged_df3$af_kin)))
 }
 # check if the same or and afs are missing for both sets of estimates
 # (mychisq/fisher vs kinship): the NA row indices must coincide in merged_df2
 if ( identical( which(is.na(merged_df2$or_mychisq)), which(is.na(merged_df2$or_kin)))
   && identical( which(is.na(merged_df2$af)), which(is.na(merged_df2$af_kin)))
   && identical( which(is.na(merged_df2$pval_fisher)), which(is.na(merged_df2$pwald_kin))) ){
   cat("PASS: Indices match for mychisq and kin ors missing values")
 } else{
   cat("Index mismatch: mychisq and kin ors missing indices match")
   # stop() instead of quit(), for the same reason as above
   stop("NA indices differ between mychisq and kin columns in merged_df2", call. = FALSE)
 }
 #=========================
 # Merge3: merged_df2_comp
 # same as merge 1 but excluding NAs from ORs, etc.
 #=========================
 # Build merged_df2_comp: the rows of merged_df2 whose af is not NA. This is
 # only done when af and af_kin are missing at exactly the same row indices.
 cat("Merging dfs without any NAs: big df (1-many relationship b/w id & mut)",
     "\nlinking col: Mutationinforamtion",
     "\nfilename: merged_df2_comp")
 if (!identical(which(is.na(merged_df2$af)), which(is.na(merged_df2$af_kin)))) {
   print("Index mismatch for mychisq and kin ors. Aborting NA ommission")
 } else {
   print("mychisq and kin ors missing indices match. Procedding with omitting NAs")
   na_count_df2 = sum(is.na(merged_df2$af))
   merged_df2_comp = merged_df2[which(!is.na(merged_df2$af)), ]
   # the row count must drop by exactly the number of NAs removed
   cat("Checking nrows in merged_df2_comp")
   if (nrow(merged_df2_comp) != (nrow(merged_df2) - na_count_df2)) {
     cat("FAIL: No. of rows mismatch",
         "\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2,
         "\nGot no. of rows: ", nrow(merged_df2_comp))
   } else {
     cat("\nPASS: No. of rows match",
         "\nDim of merged_df2_comp: ",
         "\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2,
         "\nNo. of rows: ", nrow(merged_df2_comp),
         "\nNo. of cols: ", ncol(merged_df2_comp))
   }
 }
 #=========================
 # Merge4: merged_df3_comp
 # same as merge 2 but excluding NAs from ORs, etc or 
 # remove duplicate mutation information
 #=========================
 # Build merged_df3_comp: the rows of merged_df3 whose af is not NA. This is
 # only done when af and af_kin are missing at exactly the same row indices.
 if (!identical(which(is.na(merged_df3$af)), which(is.na(merged_df3$af_kin)))) {
   print("Index mismatch for mychisq and kin ors. Aborting NA ommission")
 } else {
   print("mychisq and kin ors missing indices match. Procedding with omitting NAs")
   na_count_df3 = sum(is.na(merged_df3$af))
   # one way would be de-duplicating an existing comp df on mutationinformation;
   # here the NA rows of merged_df3 are dropped instead
   merged_df3_comp = merged_df3[which(!is.na(merged_df3$af)), ]
   cat("Checking nrows in merged_df3_comp")
   if (nrow(merged_df3_comp) != (nrow(merged_df3) - na_count_df3)) {
     cat("FAIL: No. of rows mismatch",
         "\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3,
         "\nGot no. of rows: ", nrow(merged_df3_comp))
   } else {
     cat("\nPASS: No. of rows match",
         "\nDim of merged_df3_comp: ",
         "\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3,
         "\nNo. of rows: ", nrow(merged_df3_comp),
         "\nNo. of cols: ", ncol(merged_df3_comp))
   }
 }
 # cross-check: derive the same df two ways and compare the results
 foo = merged_df3[which(!is.na(merged_df3$af)), ]
 bar = merged_df3_comp[which(!duplicated(merged_df3_comp$mutationinformation)), ]
 # compare dfs: foo and the de-duplicated merged_df3_comp
 all.equal(foo, bar)
 #summary(comparedf(foo, bar))
 #==============================================================
 #################
 # OPTIONAL: write ALL 4 output files 
 #################
 #outvars = c("merged_df2"
 #             , "merged_df3"
 #             , "merged_df2_comp"
 #             , "merged_df3_comp")
 #cat("Writing output files: "
 #    , "\nPath:", outdir)
 #for (i in outvars){
 #  out_filename = paste0(i, ".csv")
 #  outfile = paste0(outdir, "/", out_filename)
 #  cat("Writing output file:"
 #      ,"\nFilename: ", out_filename,"\n")
 #  write.csv(get(i), outfile, row.names = FALSE)
 #  cat("Finished writing: ", outfile
 #      , "\nNo. of rows: ", nrow(get(i))
 #      , "\nNo. of cols: ", ncol(get(i)), "\n")
 #}
 #*************************
 # clear variables
 # Remove temporaries. Several of these names (in_filename_params,
 # infile_params, merged_df2v2, merged_df2v3) are not created anywhere in this
 # script, so only names that currently exist are passed to rm(); this avoids
 # an "object not found" warning for each missing one.
 rm(list = intersect(c("foo", "bar", "gene_metadata"
                       , "in_filename_params", "infile_params", "merging_cols"
                       , "in_filename_gene_metadata", "infile_gene_metadata"
                       , "merged_df2v2", "merged_df2v3")
                     , ls()))
 #*************************
 #####################################################################
 #                       Combining: LIG
 #####################################################################
 #=========================
 # Merges 5-8
 #=========================
 # Ligand subsets: keep the rows of each merged df whose ligand_distance is
 # below 10.
 merged_df2_lig      = merged_df2[merged_df2$ligand_distance < 10, ]
 merged_df2_comp_lig = merged_df2_comp[merged_df2_comp$ligand_distance < 10, ]
 merged_df3_lig      = merged_df3[merged_df3$ligand_distance < 10, ]
 merged_df3_comp_lig = merged_df3_comp[merged_df3_comp$ligand_distance < 10, ]
 # sanity check: merged_df3_lig must have as many rows as my_df_u_lig
 if (nrow(merged_df3_lig) != nrow(my_df_u_lig)) {
   cat(paste("FAIL: nrow mismatch for merged_df3_lig",
             "\nExpected:", nrow(my_df_u_lig),
             "\nGot:", nrow(merged_df3_lig),
             sep = ""))
 } else {
   print("PASS: verified merged_df3_lig")
 }
 #==========================================================================
 # end of script
 ##=========================================================================
--- a/scripts/plotting/other_plots.R
+++ b/scripts/plotting/other_plots.R
@ -18,10 +18,8 @@ source("other_plots_data.R")
 #=======
 # output
 #=======
-#dr_other_plots_combined = "dr_other_combined.svg"
+dr_other_plots_combined = "dr_other_muts.svg"
-#plot_dr_other_plots_combined  =  paste0(plotdir,"/", dr_other_plots_combined)
+plot_dr_other_plots_combined  =  paste0(plotdir,"/", dr_other_plots_combined)
 ########################################################################
 #               end of data extraction and cleaning for plots          #
@ -160,8 +158,8 @@ p3
 # combine
 #===========================
 #svg(plot_or_combined, width = 32, height = 12)
-
+svg("test.svg", width = 25, height = 12)
-theme_set(theme_gray()) # to preserve default theme
+#theme_set(theme_gray()) # to preserve default theme
 printFile = cowplot::plot_grid(plot_grid(p1, p2, p3
                                         , nrow = 3