calculating af_or using function and cmd options now

2021-06-11 15:12:08 +01:00 · 2021-06-11 15:12:08 +01:00 · f88e2665e9
commit f88e2665e9
parent 7686aa39b4
5 changed files with 156 additions and 11 deletions
--- a/scripts/af_or_calcs.R
+++ b/scripts/af_or_calcs.R
@ -0,0 +1,141 @@
 #!/usr/bin/env Rscript                                                  
 #########################################################
 # TASK: To calculate Allele Frequency and
 # Odds Ratio from master data
 #########################################################
 # working dir 
 setwd("~/git/LSHTM_analysis/scripts")
 getwd()
 # load libraries
 #source("Header_TT.R")
 require("getopt", quietly = TRUE) # cmd parse arguments
 # load functions
 source("functions/plotting_globals.R")
 source("functions/mychisq_or.R")
 source("functions/myaf_or_calcs.R")
 #############################################################
 # command line args
 #********************
 # Option table handed to getopt(): one row per option, four columns =
 # long name, short flag, argument mask, value type.
 # Argument mask (per the getopt package docs): 1 = argument required,
 # 2 = argument optional. Only --drug and --gene use mask 1 here.
 spec = matrix(c(
  "drug"          ,"d" , 1, "character",
  "gene"          ,"g" , 1, "character",
  "master_data"   ,"m", 2, "character", 
  "gene_data"     ,"G", 2, "character", 
  "outfile"       ,"o" , 2, "character",
  "idcol"         ,"I", 2, "character",
  "drmuts_col"    ,"D", 2, "character", 
  "othermuts_col" ,"O", 2, "character"
 ), byrow = TRUE, ncol = 4)
 opt = getopt(spec)
 # Unpack the parsed options; any option not supplied on the command line
 # is NULL at this point and is given a default further down the script.
 drug            = opt$drug
 gene            = opt$gene
 infile_master   = opt$master_data
 infile_metadata = opt$gene_data
 outfile         = opt$outfile
 idcol           = opt$idcol
 dr_muts_col     = opt$drmuts_col
 other_muts_col  = opt$othermuts_col
 # Both identifiers are mandatory. `||` is the scalar, short-circuiting
 # operator meant for `if` conditions (the original used elementwise `|`).
 if (is.null(drug) || is.null(gene)) {
  stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)")
 }
 # import_dirs()
 # NOTE(review): `datadir` and `outdir` used below are not assigned anywhere
 # in this script, so import_dirs() (functions/plotting_globals.R) presumably
 # creates them as globals -- confirm.
 import_dirs(drug, gene)
 # NOTE(review): if import_dirs() also assigns dr_muts_col / other_muts_col
 # globally, it would overwrite the values read from the command line before
 # this call. Re-apply any explicitly supplied names so the CLI always wins.
 if (!is.null(opt$drmuts_col)) {
   dr_muts_col = opt$drmuts_col
 }
 if (!is.null(opt$othermuts_col)) {
   other_muts_col = opt$othermuts_col
 }
 # setting sensible defaults for opt args
 #----------------------------
 # input file 1: master data
 #----------------------------
 #class(infile_master)
 if (is.null(infile_master)){
 #if (!is.character(infile) && exists("gene")){
   #in_filename_master  = 'original_tanushree_data_v2.csv' #19K
   in_filename_master  = 'mtb_gwas_meta_v6.csv' #35k
   # datadir is concatenated without a separator, unlike outdir below
   infile_master = paste0(datadir, in_filename_master)
   cat("\nInput file 1 not specified, assuming filename: ", infile_master)
   cat(paste0("\nReading infile 1 i.e raw data: ", infile_master) )
 } 
 #---------------------------------------------------
 # input file 2: gene associated meta
 # data file to extract valid snps and add calcs to.
 #---------------------------------------------------
 if (is.null(infile_metadata)){
   # This is outfile_metadata from data_extraction.py
   in_filename_metadata =  paste0(tolower(gene), '_metadata.csv')
   infile_metadata = paste0(outdir, '/', in_filename_metadata)
   cat("\nInput file 2 not specified, assuming filename: ", infile_metadata)
   cat(paste0("\nReading infile 2 i.e gene associated metadata:", infile_metadata))
 }
 #-------------------------------------------
 # outfile: csv file containing AF and OR
 #-------------------------------------------
 if (is.null(outfile)){  
   # out_filename_af_or = paste0(tolower(gene), '_meta_data_with_AF_OR.csv')
   out_filename_af_or = paste0(tolower(gene), '_af_or.csv')
   outfile = paste0(outdir, '/', out_filename_af_or)
   cat("\nOutfile not specified, assuming filename: ", outfile)
   cat(paste0('\nOutput file with full path:', outfile))
 }
 #-------------------------------------------
 # idcol: column name "id"
 #-------------------------------------------
 if (is.null(idcol)){
   idcol = "id"
 }
 #-------------------------------------------
 # dr-and-others muts cols:
 # colnames that can be constructed using drug 
 # (dr_mutations_<drug>), (other_mutations_<drug>)
 #-------------------------------------------
 if (is.null(dr_muts_col)){
   # The previous body was a bare `dr_muts_col` expression, which assigns
   # nothing: the value stayed NULL and was passed on as NULL. Build the
   # documented default name explicitly (drug is used as given).
   dr_muts_col = paste0("dr_mutations_", drug)
   cat("\ndr mutation colname not specified, constructing from drug name: "
       , dr_muts_col, "\n")
 }
 if (is.null(other_muts_col)){
   # Same fix as above for the other-mutations column.
   other_muts_col = paste0("other_mutations_", drug)
   cat("\nother mutation colname not specified, constructing from drug name: "
       , other_muts_col, "\n")
 }
 # Echo every resolved run parameter (explicit or defaulted) to stdout.
 # cat() joins its arguments with a single space, so pieces stay separate.
 cat(
   "======================",
   "\nParameters passed:",
   "\n======================",
   "\nDRUG name: ", drug, "\n",
   "\nGENE name: ", gene, "\n",
   "\nReading infile 1 i.e raw data: ", infile_master, "\n",
   "\nReading infile 2 i.e gene associated metadata:", infile_metadata, "\n",
   '\nOutput file with full path:', outfile, "\n",
   "\nColumn name of id:", idcol, "\n",
   "\ndr mutation colname:", dr_muts_col, "\n",
   "\nother mutation colname:", other_muts_col, "\n"
 )
 #=======================================================================
 #------------------------------------------------------------
 # Run my_afor() (sourced from functions/myaf_or_calcs.R).
 # All eight arguments are passed by position, in this order:
 # drug, gene, master file, metadata file, outfile, id column,
 # dr-mutations column, other-mutations column.
 #------------------------------------------------------------
 my_afor(drug, gene, infile_master, infile_metadata, outfile,
         idcol, dr_muts_col, other_muts_col)
 #=======================================================================
--- a/scripts/functions/myaf_or_calcs.R
+++ b/scripts/functions/myaf_or_calcs.R
@ -1,15 +1,16 @@
-my_afor <- function (  infile_master
+my_afor <- function (  drug
                     , gene
                     , infile_master
                     , infile_metadata
                     , outfile
-                     , drug
+                     , idcol
                     , gene
                     , idcol = "id"
                     , dr_muts_col
                     , other_muts_col){
  #===========================================
  # 1: Read master/raw data stored in Data/
  #===========================================
  cat(infile_master)
  raw_data_all = read.csv(infile_master, stringsAsFactors = F)
  cat("\nExtracting columns based on variables:\n"
--- a/scripts/mut_electrostatic_changes.py
+++ b/scripts/mut_electrostatic_changes.py
@ -69,7 +69,8 @@ if not outdir:
 # input
 #=======
 #in_filename  = 'merged_df3.csv'
-in_filename  = gene.lower() + '_complex_mcsm_norm.csv'
+#in_filename  = gene.lower() + '_complex_mcsm_norm.csv'
 in_filename  = gene.lower() + '_complex_mcsm_norm_SRY.csv' # gid
 infile_merged_df3 = outdir + '/' + in_filename
 print('Input file: ', infile_merged_df3
      , '\n============================================================')
--- a/scripts/plotting/basic_barplots.R
+++ b/scripts/plotting/basic_barplots.R
@ -69,7 +69,6 @@ import_dirs(drug, gene)
  # dup_muts
 #***********************************
 #infile = "/home/tanu/git/Data/streptomycin/output/gid_comb_stab_struc_params.csv"
 #infile = ""
 #if (!exists("infile") && exists("gene")){
 if (!is.character(infile) && exists("gene")){
@ -104,7 +103,6 @@ cat(paste0("\nVariables imported:"
 cat("plots will output to:", plotdir)
 #=======================================================================
 # begin plots
 # ------------------------------
 # barplot for mcsm stability
 # ------------------------------
--- a/scripts/running_scripts
+++ b/scripts/running_scripts
@ -34,9 +34,10 @@ In progress...
 ./rd_df.py -d <drug> -g <gene> # fixme: input tsv file is sourced manually from website!
 #==============================
-# af_or calcs: different types
+# af_or_calcs.R: calculates af and or
 # optional args fall back to sensible defaults
 #==============================
-./af_or_calcs.R -d <drug> -g <gene># fixme: No conditional dir structure
+./af_or_calcs.R -d <drug> -g <gene> 
 #==============================
 # af_or calcs: kinship
@ -62,6 +63,9 @@ USE THE BELOW from within the or_kinship_link.py script or something?! as part o
 # combining dfs: combining_dfs.py
 #==============================
 # FIXME: combining_FIXME.py
-./combining_dfs.py --d <drug> -g <gene>
+./combining_dfs.py -d <drug> -g <gene>
 #==============================
 mut_electrostatic_changes.py
 #==============================
 ./mut_electrostatic_changes.py -d <drug> -g <gene>