appended to sys.path to allow local imports

This commit is contained in:
Tanushree Tunstall 2022-06-24 13:41:07 +01:00
parent a15ab80bc6
commit 7dc7e25016
2 changed files with 17 additions and 115 deletions

View file

@ -5,24 +5,26 @@ Created on Mon Jun 20 13:05:23 2022
@author: tanu
"""
#%%Imports ####################################################################
import re
import argparse
###############################################################################
import os, sys
# gene = 'pncA'
# drug = 'pyrazinamide'
#total_mtblineage_uc = 8
###############################################################################
#%% command line args: case sensitive
# Optional command-line arguments; both default to the empty string.
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '')
args = arg_parser.parse_args()
# NOTE(review): the commented-out lines below duplicate the active parser
# set-up above -- keep only one copy.
# arg_parser = argparse.ArgumentParser()
# arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
# arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '')
# args = arg_parser.parse_args()
# Copy the parsed values into the module-level names used further down.
drug = args.drug
gene = args.gene
# drug = args.drug
# gene = args.gene
###############################################################################
# Add ~/git/LSHTM_analysis/scripts/ml to the module search path so that
# modules stored there can be imported from this script.
homedir = os.path.expanduser("~")
sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml')
###############################################################################
#==================
# Import data
@ -35,16 +37,16 @@ from ml_data_7030 import *
#from UQ_yc_RunAllClfs import run_all_ML
#====================
# Import ML function
# Import ML functions
#====================
# TT run all ML clfs: baseline model
from MultModelsCl import MultModelsCl
from MultClfs import *
#==================
# other vars
#==================
# Label passed to MultModelsCl as `tts_split_type` further down.
tts_split_7030 = '70_30'
# Suffix embedded in the output CSV file names.
OutFile_suffix = '7030'
#==================
# Specify outdir
#==================
@ -54,57 +56,12 @@ print('\nOutput directory:', outdir_ml)
# Output CSV paths for the baseline results, built as
# <outdir_ml><gene in lower case>_baselineC[_ext]_<suffix>.csv
# NOTE(review): "wf"/"lf" presumably stand for wide/long format -- confirm.
outFile_wf = ''.join([outdir_ml, gene.lower(), '_baselineC_', OutFile_suffix, '.csv'])
outFile_lf = ''.join([outdir_ml, gene.lower(), '_baselineC_ext_', OutFile_suffix, '.csv'])
###############################################################################
# Display order of each score type: the 1-based position in this list is the
# value stored against the name.
score_type_ordermapD = {
    score_name: position
    for position, score_name in enumerate(
        [
            'mcc',
            'fscore',
            'jcc',
            'precision',
            'recall',
            'accuracy',
            'roc_auc',
            'TN',
            'FP',
            'FN',
            'TP',
            'trainingY_neg',
            'trainingY_pos',
            'blindY_neg',
            'blindY_pos',
            'fit_time',
            'score_time',
        ],
        start=1,
    )
}

# 'test_*' score keys -> common display names.
scoreCV_mapD = {
    'test_mcc': 'MCC',
    'test_fscore': 'F1',
    'test_precision': 'Precision',
    'test_recall': 'Recall',
    'test_accuracy': 'Accuracy',
    'test_roc_auc': 'ROC_AUC',
    'test_jcc': 'JCC',
}

# 'bts_*' score keys -> the same display names as scoreCV_mapD, so that frames
# renamed with either map end up with matching column names.
scoreBT_mapD = {
    'bts_mcc': 'MCC',
    'bts_fscore': 'F1',
    'bts_precision': 'Precision',
    'bts_recall': 'Recall',
    'bts_accuracy': 'Accuracy',
    'bts_roc_auc': 'ROC_AUC',
    'bts_jcc': 'JCC',
}
# # data dependent variables but NOT dependent on resampling
# bts_size = len(X_bts)
# yc2 = Counter(y_bts)
# yc2_ratio = yc2[0]/yc2[1]
###############################################################################
#%% Running models ############################################################
# Announce the run: banner line, then the gene and drug being analysed.
print(
    '\n#####################################################################\n',
    '\nRunning ML analysis: feature groups ',
    '\nGene name:', gene,
    '\nDrug name:', drug,
)
fooD = {'baseline_paramD': {
'input_df': X
, 'target': y
@ -160,62 +117,6 @@ scores_7030D = MultModelsCl(**rouC_paramD
, add_cm = True
, add_yn = True)
###############################################################################
#%% SMOTE NC: Smote Oversampling [Numerical + categorical]
#================
# Baseline
# SMOTE NC: SMNC
#================
# Run MultModelsCl on the SMOTE-NC resampled training data (X_smnc, y_smnc),
# with X_bts / y_bts passed as the blind test set.
# NOTE(review): X_smnc, y_smnc, skf_cv, X_bts, y_bts, pd and Counter are not
# defined in this section -- presumably they arrive via the star imports
# earlier in the script; confirm.
smnc_scores_mmD = MultModelsCl(input_df = X_smnc
                               , target = y_smnc
                               , var_type = 'mixed'
                               , tts_split_type = tts_split_7030
                               , resampling_type = 'smnc'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts
                               , add_cm = True
                               , add_yn = True
                               , return_formatted_output = True
                               )
smnc_all_scores = pd.DataFrame(smnc_scores_mmD)
rs_smnc = 'smnc'

#------------------------
# WF: only CV and BTS
#-----------------------
# Transpose so that the score names become columns.
smnc_allT = smnc_all_scores.T

# CV scores: columns whose name matches 'test_'
smnc_CV = smnc_allT.filter(regex='test_', axis = 1)
# map colnames for consistency to allow concatenating
smnc_CV.columns = smnc_CV.columns.map(scoreCV_mapD)
smnc_CV['Data_source'] = 'CV'
smnc_CV['Resampling'] = rs_smnc

# Blind-test scores: columns whose name matches 'bts_'
smnc_BT = smnc_allT.filter(regex='bts_', axis = 1)
# map colnames for consistency to allow concatenating
smnc_BT.columns = smnc_BT.columns.map(scoreBT_mapD)
smnc_BT['Data_source'] = 'BT'
smnc_BT['Resampling'] = rs_smnc

# Write csv
# smnc_CV.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
# smnc_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
# smnc_CV.to_csv(outdir_ml + gene.lower() + '_smnc_CV_allF.csv')
# smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')

# other data dependent variables
training_size_smnc = len(X_smnc)
n_features = len(X_smnc.columns)
yc1_smnc = Counter(y_smnc)
# Ratio of class 0 to class 1 counts in the resampled training target.
yc1_ratio_smnc = yc1_smnc[0]/yc1_smnc[1]

# NOTE(review): `smnc_all` is never assigned in the visible code, so the three
# lines below raise NameError as written. Confirm which frame they are meant
# to annotate (smnc_all_scores? smnc_allT?) before running this section.
smnc_all['training_size'] = training_size_smnc
smnc_all['trainingY_ratio'] = round(yc1_ratio_smnc,2)
smnc_all['n_features'] = n_features
###############################################################################
###############################################################################
###############################################################################
#%% COMBINING all dfs: WF and LF

View file

@ -35,9 +35,10 @@ from ml_data_7030 import *
#from UQ_yc_RunAllClfs import run_all_ML
#====================
# Import ML function
# Import ML functions
#====================
from MultClfs import *
#==================
# other vars
#==================