handling missing dir for data_extraction.py

This commit is contained in:
Tanushree Tunstall 2020-11-12 13:21:06 +00:00
parent c7194b7423
commit e67fbfd986
3 changed files with 51 additions and 17 deletions

View file

@ -59,14 +59,14 @@ import pandas as pd
import numpy as np import numpy as np
import argparse import argparse
#======================================================================= #=======================================================================
#%% homdir and curr dir and local imports #%% dir and local imports
homedir = os.path.expanduser('~') homedir = os.path.expanduser('~')
# set working dir # set working dir
os.getcwd() os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/scripts') os.chdir(homedir + '/git/LSHTM_analysis/scripts')
os.getcwd() os.getcwd()
# import aa dict # Requires
from reference_dict import my_aa_dict # CHECK DIR STRUC THERE! from reference_dict import my_aa_dict # CHECK DIR STRUC THERE!
from tidy_split import tidy_split from tidy_split import tidy_split
#======================================================================= #=======================================================================
@ -74,16 +74,52 @@ from tidy_split import tidy_split
arg_parser = argparse.ArgumentParser() arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help='drug name (case sensitive)', default = None) arg_parser.add_argument('-d', '--drug', help='drug name (case sensitive)', default = None)
arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = None) arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = None)
# Optional directory overrides. When omitted, each value is None and the
# script falls back to the defaults described in the help text.
arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assumes homedir + git/Data')
arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assumes datadir + <drug> + input')
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assumes datadir + <drug> + output')
# store_true: the flag is False unless --debug is passed on the command line
arg_parser.add_argument('--debug', action ='store_true', help = 'Debug Mode')
args = arg_parser.parse_args() args = arg_parser.parse_args()
#======================================================================= #=======================================================================
#%% variable assignment: input and output paths & filenames #%% variable assignment: input and output paths & filenames
drug = args.drug drug = args.drug
gene = args.gene gene = args.gene
# Directory overrides from the command line; each is None when the option
# was not supplied and is replaced by a default further down.
datadir = args.datadir
indir = args.input_dir
outdir = args.output_dir
#drug = 'pyrazinamide' #drug = 'pyrazinamide'
#gene = 'pncA' #gene = 'pncA'
#%% input and output dirs and files
#=======
# dirs
#=======
# Any directory not given on the command line falls back to the conventional
# layout: <homedir>/git/Data/<drug>/input and <homedir>/git/Data/<drug>/output
if not datadir:
    datadir = homedir + '/' + 'git/Data'

# The default input/output paths are built from the drug name, so it must be
# present whenever one of them has to be derived. Without this check a missing
# -d/--drug surfaces as an opaque TypeError from the string concatenation.
if not drug and not (indir and outdir):
    print('ERROR: Drug name not specified (-d/--drug)'
          , '\nPlease specify the drug, or specify both indir and outdir, and rerun')
    sys.exit(1)

if not indir:
    indir = datadir + '/' + drug + '/input'

if not outdir:
    outdir = datadir + '/' + drug + '/output'

# handle missing dirs here
# Each failure exits with a non-zero status so that calling scripts can
# detect that the extraction did not run.
if not os.path.isdir(datadir):
    print('ERROR: Data directory does not exist:', datadir
          , '\nPlease create and ensure gwas data is present and then rerun')
    sys.exit(1)

if not os.path.isdir(indir):
    print('ERROR: Input directory does not exist:', indir
          , '\nPlease either create or specify indir and rerun')
    sys.exit(1)

if not os.path.isdir(outdir):
    print('ERROR: Output directory does not exist:', outdir
          , '\nPlease create or specify outdir and rerun')
    sys.exit(1)
#=======================================================================
gene_match = gene + '_p.' gene_match = gene + '_p.'
print('mut pattern for gene', gene, ':', gene_match) print('mut pattern for gene', gene, ':', gene_match)
@ -114,13 +150,6 @@ print('Extracting columns based on variables:\n'
, resistance_col , resistance_col
, '\n===============================================================') , '\n===============================================================')
#======================================================================= #=======================================================================
#%% input and output dirs and files
#=======
# dirs
#=======
datadir = homedir + '/' + 'git/Data'
indir = datadir + '/' + drug + '/' + 'input'
outdir = datadir + '/' + drug + '/' + 'output'
#======= #=======
# input # input

View file

@ -41,9 +41,9 @@ datadir = homedir + '/' + 'git/Data'
#======= #=======
in_filename = 'aa_codes.csv' in_filename = 'aa_codes.csv'
infile = datadir + '/' + in_filename infile = datadir + '/' + in_filename
print('Input filename:', in_filename #print('Input filename:', in_filename
, '\nInput path:', datadir # , '\nInput path:', datadir
, '\n============================================================') # , '\n============================================================')
#%% Read input file #%% Read input file
aa_table = pd.read_csv(infile) #20, 6 aa_table = pd.read_csv(infile) #20, 6
@ -67,11 +67,11 @@ my_aa.index
# using 'index' creates a dict of dicts # using 'index' creates a dict of dicts
# using 'records' creates a list of dicts # using 'records' creates a list of dicts
my_aa_dict = my_aa.to_dict('index') #20, with 5 subkeys my_aa_dict = my_aa.to_dict('index') #20, with 5 subkeys
print('Printing my_aa_dict:', my_aa_dict.keys()) #print('Printing my_aa_dict:', my_aa_dict.keys())
#FIXME : use the below in all code #FIXME : use the below in all code
low_3letter_dict = my_aa.to_dict('index') #20, with 5 subkeys low_3letter_dict = my_aa.to_dict('index') #20, with 5 subkeys
print('Printing lower-case 3 letter aa dict:',low_3letter_dict.keys()) #print('Printing lower-case 3 letter aa dict:',low_3letter_dict.keys())
#------------------------ #------------------------
#2) 1-letter code as key #2) 1-letter code as key
@ -81,7 +81,7 @@ aa_1let.columns
aa_1let.index aa_1let.index
oneletter_aa_dict = aa_1let.to_dict('index') #20, with 5 subkeys oneletter_aa_dict = aa_1let.to_dict('index') #20, with 5 subkeys
print('Printing one letter aa dict:', oneletter_aa_dict.keys()) #print('Printing one letter aa dict:', oneletter_aa_dict.keys())
#------------------------ #------------------------
#3) amino acid name as key #3) amino acid name as key
@ -91,7 +91,7 @@ aa_name.columns
aa_name.index aa_name.index
aa_name_dict = aa_name.to_dict('index') #20, with 5 subkeys aa_name_dict = aa_name.to_dict('index') #20, with 5 subkeys
print('Printing amino acid names aa dict:', aa_name_dict.keys()) #print('Printing amino acid names aa dict:', aa_name_dict.keys())
#------------------------ #------------------------
#3) 3 letter uppercase as key #3) 3 letter uppercase as key
@ -101,7 +101,7 @@ aa_up3let.columns
aa_up3let.index aa_up3let.index
up_3letter_aa_dict = aa_up3let.to_dict('index') #20, with 5 subkeys up_3letter_aa_dict = aa_up3let.to_dict('index') #20, with 5 subkeys
print('Printing upper case 3 letter aa dict:', up_3letter_aa_dict.keys()) #print('Printing upper case 3 letter aa dict:', up_3letter_aa_dict.keys())
#================================================ #================================================
# dict of aa with their corresponding properties # dict of aa with their corresponding properties

View file

@ -1,3 +1,8 @@
#========
# data extraction: Must be run first to extract mutations for each drug-gene combination
#========
./data_extraction.py -d pyrazinamide -g pncA
#======== #========
# foldx # foldx
#======== #========