handling missing dir for data_extraction.py
This commit is contained in:
parent
b0b9e91af7
commit
f9fd74812a
3 changed files with 51 additions and 17 deletions
|
@ -59,14 +59,14 @@ import pandas as pd
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import argparse
|
import argparse
|
||||||
#=======================================================================
|
#=======================================================================
|
||||||
#%% homdir and curr dir and local imports
|
#%% dir and local imports
|
||||||
homedir = os.path.expanduser('~')
|
homedir = os.path.expanduser('~')
|
||||||
# set working dir
|
# set working dir
|
||||||
os.getcwd()
|
os.getcwd()
|
||||||
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
|
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
|
||||||
os.getcwd()
|
os.getcwd()
|
||||||
|
|
||||||
# import aa dict
|
# Requires
|
||||||
from reference_dict import my_aa_dict # CHECK DIR STRUC THERE!
|
from reference_dict import my_aa_dict # CHECK DIR STRUC THERE!
|
||||||
from tidy_split import tidy_split
|
from tidy_split import tidy_split
|
||||||
#=======================================================================
|
#=======================================================================
|
||||||
|
@ -74,16 +74,52 @@ from tidy_split import tidy_split
|
||||||
arg_parser = argparse.ArgumentParser()
|
arg_parser = argparse.ArgumentParser()
|
||||||
arg_parser.add_argument('-d', '--drug', help='drug name (case sensitive)', default = None)
|
arg_parser.add_argument('-d', '--drug', help='drug name (case sensitive)', default = None)
|
||||||
arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = None)
|
arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = None)
|
||||||
|
arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assumes homedir + git/Data')
|
||||||
|
arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assumes homedir + <drug> + input')
|
||||||
|
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assumes homedir + <drug> + output')
|
||||||
|
|
||||||
|
arg_parser.add_argument('--debug', action ='store_true', help = 'Debug Mode')
|
||||||
|
|
||||||
|
|
||||||
args = arg_parser.parse_args()
|
args = arg_parser.parse_args()
|
||||||
#=======================================================================
|
#=======================================================================
|
||||||
#%% variable assignment: input and output paths & filenames
|
#%% variable assignment: input and output paths & filenames
|
||||||
drug = args.drug
|
drug = args.drug
|
||||||
gene = args.gene
|
gene = args.gene
|
||||||
|
datadir = args.datadir
|
||||||
|
indir = args.input_dir
|
||||||
|
outdir = args.output_dir
|
||||||
|
|
||||||
#drug = 'pyrazinamide'
|
#drug = 'pyrazinamide'
|
||||||
#gene = 'pncA'
|
#gene = 'pncA'
|
||||||
|
|
||||||
|
#%% input and output dirs and files
|
||||||
|
#=======
|
||||||
|
# dirs
|
||||||
|
#=======
|
||||||
|
if not datadir:
|
||||||
|
datadir = homedir + '/' + 'git/Data'
|
||||||
|
|
||||||
|
if not indir:
|
||||||
|
indir = datadir + '/' + drug + '/input'
|
||||||
|
|
||||||
|
if not outdir:
|
||||||
|
outdir = datadir + '/' + drug + '/output'
|
||||||
|
|
||||||
|
# handle missing dirs here
|
||||||
|
if not os.path.isdir(datadir):
|
||||||
|
print('ERROR: Data directory does not exist:', datadir
|
||||||
|
, '\nPlease create and ensure gwas data is present and then rerun')
|
||||||
|
sys.exit()
|
||||||
|
if not os.path.isdir(indir):
|
||||||
|
print('ERROR: Input directory does not exist:', indir
|
||||||
|
, '\nPlease either create or specify indir and rerun')
|
||||||
|
sys.exit()
|
||||||
|
if not os.path.isdir(outdir):
|
||||||
|
print('ERROR: Output directory does not exist:', outdir
|
||||||
|
, '\nPlease create or specify outdir and rerun')
|
||||||
|
sys.exit()
|
||||||
|
#=======================================================================
|
||||||
gene_match = gene + '_p.'
|
gene_match = gene + '_p.'
|
||||||
print('mut pattern for gene', gene, ':', gene_match)
|
print('mut pattern for gene', gene, ':', gene_match)
|
||||||
|
|
||||||
|
@ -114,13 +150,6 @@ print('Extracting columns based on variables:\n'
|
||||||
, resistance_col
|
, resistance_col
|
||||||
, '\n===============================================================')
|
, '\n===============================================================')
|
||||||
#=======================================================================
|
#=======================================================================
|
||||||
#%% input and output dirs and files
|
|
||||||
#=======
|
|
||||||
# dirs
|
|
||||||
#=======
|
|
||||||
datadir = homedir + '/' + 'git/Data'
|
|
||||||
indir = datadir + '/' + drug + '/' + 'input'
|
|
||||||
outdir = datadir + '/' + drug + '/' + 'output'
|
|
||||||
|
|
||||||
#=======
|
#=======
|
||||||
# input
|
# input
|
||||||
|
|
|
@ -41,9 +41,9 @@ datadir = homedir + '/' + 'git/Data'
|
||||||
#=======
|
#=======
|
||||||
in_filename = 'aa_codes.csv'
|
in_filename = 'aa_codes.csv'
|
||||||
infile = datadir + '/' + in_filename
|
infile = datadir + '/' + in_filename
|
||||||
print('Input filename:', in_filename
|
#print('Input filename:', in_filename
|
||||||
, '\nInput path:', datadir
|
# , '\nInput path:', datadir
|
||||||
, '\n============================================================')
|
# , '\n============================================================')
|
||||||
|
|
||||||
#%% Read input file
|
#%% Read input file
|
||||||
aa_table = pd.read_csv(infile) #20, 6
|
aa_table = pd.read_csv(infile) #20, 6
|
||||||
|
@ -67,11 +67,11 @@ my_aa.index
|
||||||
# using 'index' creates a dict of dicts
|
# using 'index' creates a dict of dicts
|
||||||
# using 'records' creates a list of dicts
|
# using 'records' creates a list of dicts
|
||||||
my_aa_dict = my_aa.to_dict('index') #20, with 5 subkeys
|
my_aa_dict = my_aa.to_dict('index') #20, with 5 subkeys
|
||||||
print('Printing my_aa_dict:', my_aa_dict.keys())
|
#print('Printing my_aa_dict:', my_aa_dict.keys())
|
||||||
|
|
||||||
#FIXME : use the below in all code
|
#FIXME : use the below in all code
|
||||||
low_3letter_dict = my_aa.to_dict('index') #20, with 5 subkeys
|
low_3letter_dict = my_aa.to_dict('index') #20, with 5 subkeys
|
||||||
print('Printing lower-case 3 letter aa dict:',low_3letter_dict.keys())
|
#print('Printing lower-case 3 letter aa dict:',low_3letter_dict.keys())
|
||||||
|
|
||||||
#------------------------
|
#------------------------
|
||||||
#2) 1-letter code as key
|
#2) 1-letter code as key
|
||||||
|
@ -81,7 +81,7 @@ aa_1let.columns
|
||||||
aa_1let.index
|
aa_1let.index
|
||||||
|
|
||||||
oneletter_aa_dict = aa_1let.to_dict('index') #20, with 5 subkeys
|
oneletter_aa_dict = aa_1let.to_dict('index') #20, with 5 subkeys
|
||||||
print('Printing one letter aa dict:', oneletter_aa_dict.keys())
|
#print('Printing one letter aa dict:', oneletter_aa_dict.keys())
|
||||||
|
|
||||||
#------------------------
|
#------------------------
|
||||||
#3) amino acid name as key
|
#3) amino acid name as key
|
||||||
|
@ -91,7 +91,7 @@ aa_name.columns
|
||||||
aa_name.index
|
aa_name.index
|
||||||
|
|
||||||
aa_name_dict = aa_name.to_dict('index') #20, with 5 subkeys
|
aa_name_dict = aa_name.to_dict('index') #20, with 5 subkeys
|
||||||
print('Printing amino acid names aa dict:', aa_name_dict.keys())
|
#print('Printing amino acid names aa dict:', aa_name_dict.keys())
|
||||||
|
|
||||||
#------------------------
|
#------------------------
|
||||||
#3) 3 letter uppercase as key
|
#3) 3 letter uppercase as key
|
||||||
|
@ -101,7 +101,7 @@ aa_up3let.columns
|
||||||
aa_up3let.index
|
aa_up3let.index
|
||||||
|
|
||||||
up_3letter_aa_dict = aa_up3let.to_dict('index') #20, with 5 subkeys
|
up_3letter_aa_dict = aa_up3let.to_dict('index') #20, with 5 subkeys
|
||||||
print('Printing upper case 3 letter aa dict:', up_3letter_aa_dict.keys())
|
#print('Printing upper case 3 letter aa dict:', up_3letter_aa_dict.keys())
|
||||||
|
|
||||||
#================================================
|
#================================================
|
||||||
# dict of aa with their corresponding properties
|
# dict of aa with their corresponding properties
|
||||||
|
|
|
@ -1,3 +1,8 @@
|
||||||
|
#========
|
||||||
|
# data extraction: Must be run first to extract mutations for each drug-gene combination
|
||||||
|
#========
|
||||||
|
./data_extraction.py -d pyrazinamide -g pncA
|
||||||
|
|
||||||
#========
|
#========
|
||||||
# foldx
|
# foldx
|
||||||
#========
|
#========
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue