optimised run_7030.py to generate output from dict, now that the process function and parameter dicts have been added
This commit is contained in:
parent 7dc7e25016
commit b37a950fec
12 changed files with 180 additions and 128408 deletions
@@ -197,35 +197,35 @@ def MultModelsCl(input_df, target, skf_cv
# Specify multiple Classification Models
#======================================================
models = [('AdaBoost Classifier' , AdaBoostClassifier(**rs) )
# , ('Bagging Classifier' , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) )
# , ('Decision Tree' , DecisionTreeClassifier(**rs) )
# , ('Extra Tree' , ExtraTreeClassifier(**rs) )
# , ('Extra Trees' , ExtraTreesClassifier(**rs) )
# , ('Gradient Boosting' , GradientBoostingClassifier(**rs) )
# , ('Gaussian NB' , GaussianNB() )
# , ('Gaussian Process' , GaussianProcessClassifier(**rs) )
# , ('K-Nearest Neighbors' , KNeighborsClassifier() )
# , ('LDA' , LinearDiscriminantAnalysis() )
, ('Bagging Classifier' , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) )
, ('Decision Tree' , DecisionTreeClassifier(**rs) )
, ('Extra Tree' , ExtraTreeClassifier(**rs) )
, ('Extra Trees' , ExtraTreesClassifier(**rs) )
, ('Gradient Boosting' , GradientBoostingClassifier(**rs) )
, ('Gaussian NB' , GaussianNB() )
, ('Gaussian Process' , GaussianProcessClassifier(**rs) )
, ('K-Nearest Neighbors' , KNeighborsClassifier() )
, ('LDA' , LinearDiscriminantAnalysis() )
, ('Logistic Regression' , LogisticRegression(**rs) )
# , ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs))
# , ('MLP' , MLPClassifier(max_iter = 500, **rs) )
# , ('Multinomial' , MultinomialNB() )
# , ('Naive Bayes' , BernoulliNB() )
# , ('Passive Aggresive' , PassiveAggressiveClassifier(**rs, **njobs) )
# , ('QDA' , QuadraticDiscriminantAnalysis() )
# , ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000 ) )
# , ('Random Forest2' , RandomForestClassifier(min_samples_leaf = 5
# , n_estimators = 1000
# , bootstrap = True
# , oob_score = True
# , **njobs
# , **rs
# , max_features = 'auto') )
# , ('Ridge Classifier' , RidgeClassifier(**rs) )
# , ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) )
# , ('SVC' , SVC(**rs) )
# , ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) )
# , ('XGBoost' , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False) )
, ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs))
, ('MLP' , MLPClassifier(max_iter = 500, **rs) )
, ('Multinomial' , MultinomialNB() )
, ('Naive Bayes' , BernoulliNB() )
, ('Passive Aggresive' , PassiveAggressiveClassifier(**rs, **njobs) )
, ('QDA' , QuadraticDiscriminantAnalysis() )
, ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000 ) )
, ('Random Forest2' , RandomForestClassifier(min_samples_leaf = 5
, n_estimators = 1000
, bootstrap = True
, oob_score = True
, **njobs
, **rs
, max_features = 'auto') )
, ('Ridge Classifier' , RidgeClassifier(**rs) )
, ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) )
, ('SVC' , SVC(**rs) )
, ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) )
, ('XGBoost' , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False) )
]

mm_skf_scoresD = {}
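For context, a minimal sketch of how a (name, estimator) list like the one above is typically consumed; the actual loop body of MultModelsCl sits outside this hunk, so the scoring details here are assumptions, not code from this commit:

from sklearn.model_selection import cross_validate

# Hypothetical consumption of the models list; X, y and skf_cv as in the script.
mm_skf_scoresD = {}
for model_name, model_fn in models:
    skf_scores = cross_validate(model_fn, X, y
                                , cv = skf_cv
                                , scoring = ['accuracy', 'f1']
                                , return_train_score = False)
    mm_skf_scoresD[model_name] = {k: round(v.mean(), 2)
                                  for k, v in skf_scores.items()}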
@@ -440,10 +440,11 @@ def ProcessMultModelsCl(inputD = {}):
, '\nCV df:', len(scoresDF_CV.columns)
, '\nBT_df:', len(scoresDF_BT.columns)
, '\nmetaDF:', len(metaDF.columns))

if len(scoresDF_CV.columns) == len(scoresDF_BT.columns):
print('\nFirst proceeding to rowbind CV and BT dfs:')
expected_ncols_out = len(scoresDF_BT.columns) + len(metaDF.columns)
print('\nFinal output should have:',expected_ncols_out, 'columns' )
print('\nFinal output should have:', expected_ncols_out, 'columns' )

#-----------------
# Combine WF

@@ -496,8 +497,7 @@ def ProcessMultModelsCl(inputD = {}):
sys.exit('\nFIRST IF FAILS')
else:
print('\nConcatenting dfs not possible [WF],check numbers ')

#-------------------------------------
# Combine WF+Metadata: Final output
#-------------------------------------

@@ -515,11 +515,15 @@ def ProcessMultModelsCl(inputD = {}):
print('\nPASS: Combined df has expected ncols')
else:
sys.exit('\nFAIL: Length mismatch for combined_df')

print('\nAdding column: Model_name')

combDF['Model_name'] = combDF.index

print('\n========================================================='
, '\nSUCCESS: Ran multiple classifiers'
, '\n=======================================================')

#resampling_methods_wf = combined_baseline_wf[['resampling']]
#resampling_methods_wf = resampling_methods_wf.drop_duplicates()
#, '\n', resampling_methods_wf)
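The hunks above show only fragments of ProcessMultModelsCl; the following is a hedged sketch of the shape checks they imply. The combining step itself is an assumption (the diff does not show how metaDF is joined):

import sys
import pandas as pd

def combine_cv_bt_meta(scoresDF_CV, scoresDF_BT, metaDF):
    # Row-bind CV and BT only when their columns line up, then
    # column-bind the metadata and verify the final width.
    if len(scoresDF_CV.columns) != len(scoresDF_BT.columns):
        sys.exit('\nConcatenating dfs not possible [WF], check numbers')
    print('\nFirst proceeding to rowbind CV and BT dfs:')
    expected_ncols_out = len(scoresDF_BT.columns) + len(metaDF.columns)
    print('\nFinal output should have:', expected_ncols_out, 'columns')
    combDF = pd.concat([scoresDF_CV, scoresDF_BT], axis = 0)
    combDF = pd.concat([combDF, metaDF.reindex(combDF.index)], axis = 1)
    if len(combDF.columns) == expected_ncols_out:
        print('\nPASS: Combined df has expected ncols')
    else:
        sys.exit('\nFAIL: Length mismatch for combined_df')
    combDF['Model_name'] = combDF.index
    return combDF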
@@ -1,72 +0,0 @@
/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_7030.py:464: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
1.22.4
1.4.1

aaindex_df contains non-numerical data

Total no. of non-numerial columns: 2

Selecting numerical data only

PASS: successfully selected numerical columns only for aaindex_df

Now checking for NA in the remaining aaindex_cols

Counting aaindex_df cols with NA
ncols with NA: 4 columns
Dropping these...
Original ncols: 127

Revised df ncols: 123

Checking NA in revised df...

PASS: cols with NA successfully dropped from aaindex_df
Proceeding with combining aa_df with other features_df

PASS: ncols match
Expected ncols: 123
Got: 123

Total no. of columns in clean aa_df: 123

Proceeding to merge, expected nrows in merged_df: 271

PASS: my_features_df and aa_df successfully combined
nrows: 271
ncols: 269
count of NULL values before imputation

or_mychisq 256
log10_or_mychisq 256
dtype: int64
count of NULL values AFTER imputation

mutationinformation 0
or_rawI 0
logorI 0
dtype: int64

PASS: OR values imputed, data ready for ML

Total no. of features for aaindex: 123

PASS: x_features has no target variable

No. of columns for x_features: 174
Traceback (most recent call last):
File "/home/tanu/git/LSHTM_analysis/scripts/ml/./run_7030.py", line 35, in <module>
setvars(gene,drug)
File "/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_7030.py", line 636, in setvars
X, X_bts, y, y_bts = train_test_split(x_features, y_target
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/sklearn/model_selection/_split.py", line 2454, in train_test_split
train, test = next(cv.split(X=arrays[0], y=stratify))
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/sklearn/model_selection/_split.py", line 1613, in split
for train, test in self._iter_indices(X, y, groups):
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/sklearn/model_selection/_split.py", line 1953, in _iter_indices
raise ValueError(
ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.
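The traceback in this deleted log is informative: stratified train_test_split needs at least two members per class, which this dataset does not satisfy. A hedged guard one could place before the split; variable names follow the log, while the split arguments are assumptions, not code from this commit:

import sys
from collections import Counter
from sklearn.model_selection import train_test_split

# Fail early with a readable message instead of the ValueError above.
class_counts = Counter(y_target)
if min(class_counts.values()) < 2:
    sys.exit('\nFAIL: cannot stratify, least populated class has <2 members: '
             + str(class_counts))

X, X_bts, y, y_bts = train_test_split(x_features, y_target
                                      , test_size = 0.3
                                      , stratify = y_target)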
File diff suppressed because it is too large (5 files)
@@ -557,7 +557,7 @@ def setvars(gene,drug):
# FG5: Genomic features
#========================
X_gn_mafor_Fnum = ['maf'
, 'logorI'
#, 'logorI'
# , 'or_rawI'
# , 'or_mychisq'
# , 'or_logistic'
635
scripts/ml/run_7030.py
Executable file → Normal file
@@ -5,28 +5,28 @@ Created on Mon Jun 20 13:05:23 2022

@author: tanu
"""
#%%Imports ####################################################################
import re
import argparse
###############################################################################
import os, sys

# gene = 'pncA'
# drug = 'pyrazinamide'
#total_mtblineage_uc = 8

###############################################################################
#%% command line args: case sensitive
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '')
args = arg_parser.parse_args()
# arg_parser = argparse.ArgumentParser()
# arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
# arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '')
# args = arg_parser.parse_args()

drug = args.drug
gene = args.gene
# drug = args.drug
# gene = args.gene

###############################################################################
#==================
# other vars
#==================
tts_split = '70/30'
OutFile_suffix = '7030'
homedir = os.path.expanduser("~")
sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml')

###############################################################################
#==================
# Import data
@@ -39,10 +39,15 @@ from ml_data_7030 import *
#from UQ_yc_RunAllClfs import run_all_ML

#====================
# Import ML function
# Import ML functions
#====================
# TT run all ML clfs: baseline model
from MultModelsCl import MultModelsCl
from MultClfs import *

#==================
# other vars
#==================
tts_split_7030 = '70_30'
OutFile_suffix = '7030'

#==================
# Specify outdir
@@ -51,563 +56,71 @@ outdir_ml = outdir + 'ml/tts_7030/'
print('\nOutput directory:', outdir_ml)

outFile_wf = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv'
outFile_lf = outdir_ml + gene.lower() + '_baselineC_ext_' + OutFile_suffix + '.csv'
#outFile_lf = outdir_ml + gene.lower() + '_baselineC_ext_' + OutFile_suffix + '.csv'

###############################################################################
score_type_ordermapD = { 'mcc' : 1
, 'fscore' : 2
, 'jcc' : 3
, 'precision' : 4
, 'recall' : 5
, 'accuracy' : 6
, 'roc_auc' : 7
, 'TN' : 8
, 'FP' : 9
, 'FN' : 10
, 'TP' : 11
, 'trainingY_neg': 12
, 'trainingY_pos': 13
, 'blindY_neg' : 14
, 'blindY_pos' : 15
, 'fit_time' : 16
, 'score_time' : 17
}

scoreCV_mapD = {'test_mcc' : 'MCC'
, 'test_fscore' : 'F1'
, 'test_precision' : 'Precision'
, 'test_recall' : 'Recall'
, 'test_accuracy' : 'Accuracy'
, 'test_roc_auc' : 'ROC_AUC'
, 'test_jcc' : 'JCC'
}

scoreBT_mapD = {'bts_mcc' : 'MCC'
, 'bts_fscore' : 'F1'
, 'bts_precision' : 'Precision'
, 'bts_recall' : 'Recall'
, 'bts_accuracy' : 'Accuracy'
, 'bts_roc_auc' : 'ROC_AUC'
, 'bts_jcc' : 'JCC'
}

# data dependent variables but NOT dependent on resampling
bts_size = len(X_bts)
yc2 = Counter(y_bts)
yc2_ratio = yc2[0]/yc2[1]

###############################################################################
#%% Running models ############################################################
print('\n#####################################################################\n'
, '\nRunning ML analysis: feature groups '
, '\nStarting--> Running ML analysis: Baseline modes (No FS)'
, '\nGene name:', gene
, '\nDrug name:', drug)
, '\nDrug name:', drug
, '\n#####################################################################\n')

#%% Basic: No Oversampling
#================
# Baseline
# No resampling
#================
scores_mmD = MultModelsCl(input_df = X
, target = y
, var_type = 'mixed'
, skf_cv = skf_cv
, blind_test_df = X_bts
, blind_test_target = y_bts
, add_cm = True
, add_yn = True)

baseline_all_scores = pd.DataFrame(scores_mmD)
rs_none = 'none'
#------------------------
# WF: only CV and BTS
#-----------------------
baseline_allT = baseline_all_scores.T
#baseline_train = baseline_all.filter(regex='train_', axis=1)

baseline_CV = baseline_allT.filter(regex='test_', axis = 1); baseline_CV.columns
# map colnames for consistency to allow concatenting
baseline_CV.columns = baseline_CV.columns.map(scoreCV_mapD); baseline_CV.columns
baseline_CV['Data_source'] = 'CV'
baseline_CV['Resampling'] = rs_none

baseline_BT = baseline_allT.filter(regex='bts_', axis = 1); baseline_BT.columns
# map colnames for consistency to allow concatenting
baseline_BT.columns = baseline_BT.columns.map(scoreBT_mapD); baseline_BT.columns
baseline_BT['Data_source'] = 'BT'
baseline_BT['Resampling'] = rs_none

# # Write csv
#baseline_CV.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
#baseline_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
# baseline_CV.to_csv(outdir_ml + gene.lower() + '_baseline_CV_allF.csv')
# baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')

#----------------------------------
# LF*: CV + BTS + Other info
#-----------------------------------
# other data dependent variables
training_size_ns = len(X)
n_features = len(X.columns)
yc1 = Counter(y)
yc1_ratio = yc1[0]/yc1[1]

baseline_all = baseline_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
baseline_all = baseline_all.reset_index()
baseline_all.rename(columns = {'index': 'original_names'}, inplace = True)

# Indicate whether BT or CV
bt_pattern = re.compile(r'bts_.*')
baseline_all['Data_source'] = baseline_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)

baseline_all['score_type'] = baseline_all['original_names'].str.replace('bts_|test_', '', regex = True)

score_type_uniqueN = set(baseline_all['score_type'])
cL1 = list(score_type_ordermapD.keys())
cL2 = list(score_type_uniqueN)

if set(cL1).issubset(cL2):
print('\nPASS: sorting df by score that is mapped onto the order I want')
baseline_all['score_order'] = baseline_all['score_type'].map(score_type_ordermapD)
baseline_all.sort_values(by = ['Data_source', 'score_order'], ascending = [True, True], inplace = True)
else:
sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')

# add cols: specific
baseline_all['Resampling'] = rs_none
baseline_all['training_size'] = training_size_ns
baseline_all['trainingY_ratio']= round(yc1_ratio,2)
baseline_all['n_features'] = n_features
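The baseline block above (and the four resampling blocks that follow it) reshape the wide score frame into long format by regex-filtering row names; a self-contained toy illustration of those same filter/label steps, using made-up scores rather than project data:

import re
import pandas as pd

scores = pd.DataFrame({'LR': [0.8, 0.7], 'RF': [0.9, 0.6]}
                      , index = ['test_mcc', 'bts_mcc'])

long_df = scores.filter(regex = 'bts_.*|test_.*', axis = 0).reset_index()
long_df.rename(columns = {'index': 'original_names'}, inplace = True)

# Label each row as blind-test (BT) or cross-validation (CV)
bt_pattern = re.compile(r'bts_.*')
long_df['Data_source'] = long_df.apply(
    lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV', axis = 1)
long_df['score_type'] = long_df['original_names'].str.replace(
    'bts_|test_', '', regex = True)
print(long_df)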
###############################################################################
#%% SMOTE NC: Smote Oversampling [Numerical + categorical]
#================
# Baseline
# SMOTE NC: SMNC
#================
smnc_scores_mmD = MultModelsCl(input_df = X_smnc
, target = y_smnc
, var_type = 'mixed'
, skf_cv = skf_cv
, blind_test_df = X_bts
, blind_test_target = y_bts
, add_cm = True
, add_yn = True)

smnc_all_scores = pd.DataFrame(smnc_scores_mmD)
rs_smnc = 'smnc'
#------------------------
# WF: only CV and BTS
#-----------------------
smnc_allT = smnc_all_scores.T

smnc_CV = smnc_allT.filter(regex='test_', axis = 1); smnc_CV.columns
# map colnames for consistency to allow concatenting
smnc_CV.columns = smnc_CV.columns.map(scoreCV_mapD); smnc_CV.columns
smnc_CV['Data_source'] = 'CV'
smnc_CV['Resampling'] = rs_smnc

smnc_BT = smnc_allT.filter(regex='bts_', axis = 1); smnc_BT.columns
# map colnames for consistency to allow concatenting
smnc_BT.columns = smnc_BT.columns.map(scoreBT_mapD); smnc_BT.columns
smnc_BT['Data_source'] = 'BT'
smnc_BT['Resampling'] = rs_smnc

# Write csv
# smnc_CV.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
# smnc_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
# smnc_CV.to_csv(outdir_ml + gene.lower() + '_smnc_CV_allF.csv')
# smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')

#----------------------------------
# LF*: CV + BTS + Other info
#-----------------------------------
# other data dependent variables
training_size_smnc = len(X_smnc)
n_features = len(X_smnc.columns)
yc1_smnc = Counter(y_smnc)
yc1_ratio_smnc = yc1_smnc[0]/yc1_smnc[1]

smnc_all = smnc_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
smnc_all = smnc_all.reset_index()
smnc_all.rename(columns = {'index': 'original_names'}, inplace = True)

# Indicate whether BT or CV
bt_pattern = re.compile(r'bts_.*')
smnc_all['Data_source'] = smnc_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)

smnc_all['score_type'] = smnc_all['original_names'].str.replace('bts_|test_', '', regex = True)

score_type_uniqueN = set(smnc_all['score_type'])
cL1 = list(score_type_ordermapD.keys())
cL2 = list(score_type_uniqueN)

if set(cL1).issubset(cL2):
print('\nPASS: sorting df by score that is mapped onto the order I want')
smnc_all['score_order'] = smnc_all['score_type'].map(score_type_ordermapD)
smnc_all.sort_values(by = ['Data_source', 'score_order'], ascending = [True, True], inplace = True)
else:
sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')

# add cols: specific
smnc_all['Resampling'] = rs_smnc
smnc_all['training_size'] = training_size_smnc
smnc_all['trainingY_ratio'] = round(yc1_ratio_smnc,2)
smnc_all['n_features'] = n_features

###############################################################################
#%% ROS: Random Over Sampling [Numerical + categorical]
#================
# Baseline
# ROS
#================
ros_scores_mmD = MultModelsCl(input_df = X_ros
, target = y_ros
, var_type = 'mixed'
, skf_cv = skf_cv
, blind_test_df = X_bts
, blind_test_target = y_bts
, add_cm = True
, add_yn = True)

ros_all_scores = pd.DataFrame(ros_scores_mmD)
rs_ros = 'ros'
#------------------------
# WF: only CV and BTS
#-----------------------
ros_allT = ros_all_scores.T

ros_CV = ros_allT.filter(regex='test_', axis = 1); ros_CV.columns
# map colnames for consistency to allow concatenting
ros_CV.columns = ros_CV.columns.map(scoreCV_mapD); ros_CV.columns
ros_CV['Data_source'] = 'CV'
ros_CV['Resampling'] = rs_ros

ros_BT = ros_allT.filter(regex='bts_', axis = 1); ros_BT.columns
# map colnames for consistency to allow concatenting
ros_BT.columns = ros_BT.columns.map(scoreBT_mapD); ros_BT.columns
ros_BT['Data_source'] = 'BT'
ros_BT['Resampling'] = rs_ros

# Write csv
# ros_CV.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
# ros_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
# ros_CV.to_csv(outdir_ml + gene.lower() + '_ros_CV_allF.csv')
# ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')
#----------------------------------
# LF*: CV + BTS + Other info
#----------------------------------
# other data dependent variables
training_size_ros = len(X_ros)
n_features = len(X_ros.columns)
yc1_ros = Counter(y_ros)
yc1_ratio_ros = yc1_ros[0]/yc1_ros[1]

ros_all = ros_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
ros_all = ros_all.reset_index()
ros_all.rename(columns = {'index': 'original_names'}, inplace = True)

# Indicate whether BT or CV
bt_pattern = re.compile(r'bts_.*')
ros_all['Data_source'] = ros_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)

ros_all['score_type'] = ros_all['original_names'].str.replace('bts_|test_', '', regex = True)

score_type_uniqueN = set(ros_all['score_type'])
cL1 = list(score_type_ordermapD.keys())
cL2 = list(score_type_uniqueN)

if set(cL1).issubset(cL2):
print('\nPASS: sorting df by score that is mapped onto the order I want')
ros_all['score_order'] = ros_all['score_type'].map(score_type_ordermapD)
ros_all.sort_values(by = ['Data_source', 'score_order'], ascending = [True, True], inplace = True)
else:
sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')

# add cols: specific
ros_all['Resampling'] = rs_ros
ros_all['training_size'] = training_size_ros
ros_all['trainingY_ratio'] = round(yc1_ratio_ros,2)
ros_all['n_features'] = n_features
###############################################################################
#%% RUS: Random Under Sampling [Numerical + categorical]
#================
# Baseline
# RUS
#================
rus_scores_mmD = MultModelsCl(input_df = X_rus
, target = y_rus
, var_type = 'mixed'
, skf_cv = skf_cv
, blind_test_df = X_bts
, blind_test_target = y_bts
, add_cm = True
, add_yn = True)

rus_all_scores = pd.DataFrame(rus_scores_mmD)
rs_rus = 'rus'
#-----------------------
# WF: only CV and BTS
#-----------------------
rus_allT = rus_all_scores.T

rus_CV = rus_allT.filter(regex='test_', axis=1); rus_CV.columns
# map colnames for consistency to allow concatenting
rus_CV.columns = rus_CV.columns.map(scoreCV_mapD); rus_CV.columns
rus_CV['Data_source'] = 'CV'
rus_CV['Resampling'] = rs_rus

rus_BT = rus_allT.filter(regex='bts_' , axis=1); rus_BT.columns
# map colnames for consistency to allow concatenting
rus_BT.columns = rus_BT.columns.map(scoreBT_mapD); rus_BT.columns
rus_BT['Data_source'] = 'BT'
rus_BT['Resampling'] = rs_rus

# # Write csv
# rus_CV.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
# rus_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
# rus_CV.to_csv(outdir_ml + gene.lower() + '_rus_CV_allF.csv')
# rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')

#----------------------------------
# LF*: CV + BTS + Other info
#----------------------------------
# other data dependent variables
training_size_rus = len(X_rus)
n_features = len(X_rus.columns)
yc1_rus = Counter(y_rus)
yc1_ratio_rus = yc1_rus[0]/yc1_rus[1]

rus_all = rus_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
rus_all = rus_all.reset_index()
rus_all.rename(columns = {'index': 'original_names'}, inplace = True)

# Indicate whether BT or CV
bt_pattern = re.compile(r'bts_.*')
rus_all['Data_source'] = rus_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)

rus_all['score_type'] = rus_all['original_names'].str.replace('bts_|test_', '', regex = True)

score_type_uniqueN = set(rus_all['score_type'])
cL1 = list(score_type_ordermapD.keys())
cL2 = list(score_type_uniqueN)

if set(cL1).issubset(cL2):
print('\nPASS: sorting df by score that is mapped onto the order I want')
rus_all['score_order'] = rus_all['score_type'].map(score_type_ordermapD)
rus_all.sort_values(by = ['Data_source', 'score_order'], ascending = [True, True], inplace = True)
else:
sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')

# add cols: specific
rus_all['Resampling'] = rs_rus
rus_all['training_size'] = training_size_rus
rus_all['trainingY_ratio'] = round(yc1_ratio_rus,2)
rus_all['n_features'] = n_features

###############################################################################
#%% ROS+RUS Combined: [Numerical + categorical]
#================
# Baseline
# ROUC
#================
rouC_scores_mmD = MultModelsCl(input_df = X_rouC
, target = y_rouC
, var_type = 'mixed'
, skf_cv = skf_cv
, blind_test_df = X_bts
, blind_test_target = y_bts
, add_cm = True
, add_yn = True)

rouC_all_scores = pd.DataFrame(rouC_scores_mmD)
rs_rouC = 'rouC'
#-----------------------
# WF: only CV and BTS
#-----------------------
rouC_allT = rouC_all_scores.T

rouC_CV = rouC_allT.filter(regex='test_', axis=1); rouC_CV.columns
# map colnames for consistency to allow concatenting
rouC_CV.columns = rouC_CV.columns.map(scoreCV_mapD); rouC_CV.columns
rouC_CV['Data_source'] = 'CV'
rouC_CV['Resampling'] = rs_rouC

rouC_BT = rouC_allT.filter(regex='bts_', axis=1); rouC_BT.columns
# map colnames for consistency to allow concatenting
rouC_BT.columns = rouC_BT.columns.map(scoreBT_mapD); rouC_BT.columns
rouC_BT['Data_source'] = 'BT'
rouC_BT['Resampling'] = rs_rouC

# Write csv
# rouC_CV.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
# rouC_CV.to_csv(outdir_ml + gene.lower() + '_rouC_CV_allF.csv')
# rouC_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
# rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')

#----------------------------------
# LF*: CV + BTS + Other info
#----------------------------------
# other data dependent variables
training_size_rouC = len(X_rouC)
n_features = len(X_rouC.columns)
yc1_rouC = Counter(y_rouC)
yc1_ratio_rouC = yc1_rouC[0]/yc1_rouC[1]

rouC_all = rouC_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
rouC_all = rouC_all.reset_index()
rouC_all.rename(columns = {'index': 'original_names'}, inplace = True)

# Indicate whether BT or CV
bt_pattern = re.compile(r'bts_.*')
rouC_all['Data_source'] = rouC_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)

rouC_all['score_type'] = rouC_all['original_names'].str.replace('bts_|test_', '', regex = True)

score_type_uniqueN = set(rouC_all['score_type'])
cL1 = list(score_type_ordermapD.keys())
cL2 = list(score_type_uniqueN)

if set(cL1).issubset(cL2):
print('\nPASS: sorting df by score that is mapped onto the order I want')
rouC_all['score_order'] = rouC_all['score_type'].map(score_type_ordermapD)
rouC_all.sort_values(by = ['Data_source', 'score_order'], ascending = [True, True], inplace = True)
else:
sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')

# add cols: specific
rouC_all['Resampling'] = rs_rouC
rouC_all['training_size'] = training_size_rouC
rouC_all['trainingY_ratio'] = round(yc1_ratio_rouC,2)
rouC_all['n_features'] = n_features

###############################################################################
#%% COMBINING all dfs: WF and LF
# https://stackoverflow.com/questions/39862654/pandas-concat-of-multiple-data-frames-using-only-common-columns
#%% Combine WF
#-----------------
# Combine WF
#-----------------
dfs_combine_wf = [baseline_BT, smnc_BT, ros_BT, rus_BT, rouC_BT,
baseline_CV, smnc_CV, ros_CV, rus_CV, rouC_CV]

dfs_nrows_wf = []
for df in dfs_combine_wf:
dfs_nrows_wf = dfs_nrows_wf + [len(df)]
dfs_nrows_wf = max(dfs_nrows_wf)
paramD = {
'baseline_paramD': { 'input_df' : X
, 'target' : y
, 'var_type' : 'mixed'
, 'resampling_type': 'none'}

, 'smnc_paramD': { 'input_df' : X_smnc
, 'target' : y_smnc
, 'var_type' : 'mixed'
, 'resampling_type' : 'smnc'}

dfs_ncols_wf = []
for df in dfs_combine_wf:
dfs_ncols_wf = dfs_ncols_wf + [len(df.columns)]
dfs_ncols_wf = max(dfs_ncols_wf)
, 'ros_paramD': { 'input_df' : X_ros
, 'target' : y_ros
, 'var_type' : 'mixed'
, 'resampling_type' : 'ros'}

expected_nrows_wf = len(dfs_combine_wf) * dfs_nrows_wf
expected_ncols_wf = dfs_ncols_wf
, 'rus_paramD' : { 'input_df' : X_rus
, 'target' : y_rus
, 'var_type' : 'mixed'
, 'resampling_type' : 'rus'}

common_cols_wf = list(set.intersection(*(set(df.columns) for df in dfs_combine_wf)))
, 'rouC_paramD' : { 'input_df' : X_rouC
, 'target' : y_rouC
, 'var_type' : 'mixed'
, 'resampling_type' : 'rouC'}
}

if len(common_cols_wf) == dfs_ncols_wf :
combined_baseline_wf = pd.concat([df[common_cols_wf] for df in dfs_combine_wf], ignore_index=False)
resampling_methods_wf = combined_baseline_wf[['Resampling']]
resampling_methods_wf = resampling_methods_wf.drop_duplicates()
print('\nConcatenating dfs with different resampling methods [WF]:', tts_split
, '\nNo. of dfs combining:', len(dfs_combine_wf)
, '\nThe sampling methods are:'
, '\n', resampling_methods_wf)
if len(combined_baseline_wf) == expected_nrows_wf and len(combined_baseline_wf.columns) == expected_ncols_wf:
print('\nPASS:', len(dfs_combine_wf), 'dfs successfully combined'
, '\nnrows in combined_df_wf:', len(combined_baseline_wf)
, '\nncols in combined_df_wf:', len(combined_baseline_wf.columns))
else:
print('\nFAIL: concatenating failed'
, '\nExpected nrows:', expected_nrows_wf
, '\nGot:', len(combined_baseline_wf)
, '\nExpected ncols:', expected_ncols_wf
, '\nGot:', len(combined_baseline_wf.columns))
sys.exit()
else:
sys.exit('\nConcatenting dfs not possible [WF],check numbers ')
# Initial run to get the dict containing CV, BT and metadata DFs
mmD = {}
for k, v in paramD.items():
# print(mmD[k])
scores_7030D = MultModelsCl(**paramD[k]
, tts_split_type = tts_split_7030
, skf_cv = skf_cv
, blind_test_df = X_bts
, blind_test_target = y_bts
, add_cm = True
, add_yn = True
, return_formatted_output = True)
mmD[k] = scores_7030D

# Extracting the dfs from within the dict and concatenating to output as one df
for k, v in mmD.items():
out_wf_7030 = pd.concat(mmD, ignore_index = True)
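Note that pd.concat accepts the dict directly, so the for loop above re-concatenates the same dict on every pass; an equivalent single call, assuming mmD maps parameter-set names to the formatted score DataFrames:

# One concat is enough; ignore_index drops the dict keys
out_wf_7030 = pd.concat(mmD.values(), ignore_index = True)

# Or keep the parameter-set name as an index level for traceability
out_wf_7030_keyed = pd.concat(mmD, names = ['param_set'])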
# Add index as a column
combined_baseline_wf.columns
combined_baseline_wf = combined_baseline_wf.reset_index()
combined_baseline_wf.rename(columns = {'index': 'Model_name'}, inplace = True)
combined_baseline_wf.head()

# sort df: Resampling, Data_source, and MCC
combined_baseline_wf.sort_values(by = ['Resampling', 'Data_source', 'MCC'], ascending = [True, True, False], inplace = True)
##############################################################################
#%% Combine LF
#-----------------
# Combine LF*
#-----------------
dfs_combine = [baseline_all, smnc_all, ros_all, rus_all, rouC_all ]

dfs_nrows = []
for df in dfs_combine:
dfs_nrows = dfs_nrows + [len(df)]
dfs_nrows = max(dfs_nrows)

dfs_ncols = []
for df in dfs_combine:
dfs_ncols = dfs_ncols + [len(df.columns)]
dfs_ncols = max(dfs_ncols)

# dfs_ncols = []
# dfs_ncols2 = mode(dfs_ncols.append(len(df.columns) for df in dfs_combine)
# dfs_ncols2

expected_nrows = len(dfs_combine) * dfs_nrows
expected_ncols = dfs_ncols

common_cols = list(set.intersection(*(set(df.columns) for df in dfs_combine)))

if len(common_cols) == dfs_ncols :
combined_baseline = pd.concat([df[common_cols] for df in dfs_combine], ignore_index=True)
resampling_methods = combined_baseline[['Resampling', 'training_size']]
resampling_methods = resampling_methods.drop_duplicates()
print('\nConcatenating dfs with different resampling methods:', tts_split
, '\nNo. of dfs combining:', len(dfs_combine)
, '\nThe sampling methods are:'
, '\n', resampling_methods)
if len(combined_baseline) == expected_nrows and len(combined_baseline.columns) == expected_ncols:
print('\nPASS:', len(dfs_combine), 'dfs successfully combined'
, '\nnrows in combined_df:', len(combined_baseline)
, '\nncols in combined_df:', len(combined_baseline.columns))
else:
print('\nFAIL: concatenating failed'
, '\nExpected nrows:', expected_nrows
, '\nGot:', len(combined_baseline)
, '\nExpected ncols:', expected_ncols
, '\nGot:', len(combined_baseline.columns))
sys.exit()
else:
sys.exit('\nConcatenting dfs not possible,check numbers ')
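Both combining blocks rely on intersecting column sets before row-binding; a self-contained sketch of that pattern with toy frames (not project data):

import pandas as pd

df_a = pd.DataFrame({'MCC': [0.5], 'F1': [0.7], 'Data_source': ['CV']})
df_b = pd.DataFrame({'MCC': [0.4], 'F1': [0.6], 'Data_source': ['BT'], 'extra': [1]})
dfs_combine = [df_a, df_b]

# Keep only the columns every frame shares, then row-bind
common_cols = list(set.intersection(*(set(df.columns) for df in dfs_combine)))
combined = pd.concat([df[common_cols] for df in dfs_combine], ignore_index = True)
print(combined)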
# Add further column indications
combined_baseline['test_size'] = bts_size
combined_baseline['tts_split'] = tts_split
combined_baseline['testY_ratio'] = round(yc2_ratio,2)
#combined_baseline.columns

# change to column names to be lower case for consistency
combined_baseline.rename(columns = {'Resampling' : 'resampling'
, 'Data_source': 'data_source'}, inplace = True)
combined_baseline.columns

# sort df: resampling, data_source, mcc
combined_baseline.sort_values(by = ['resampling', 'data_source', 'score_order'], ascending = [True, True, True], inplace = True)

# # rpow bind
# if all(XXX):
# print('\nPASS:colnames match, proceeding to rowbind')
# comb_df = pd.concat()], axis = 0, ignore_index = True ) combined_baseline
print('\n######################################################################'
, '\nEnd--> Successfully generated output DF for Multiple classifiers (baseline models)'
, '\nGene:', gene.lower()
, '\nDrug:', drug
, '\noutput file:', outFile_wf
, '\nDim of output:', out_wf_7030.shape
, '\n######################################################################')
###############################################################################
#====================
# Write output file
#====================
combined_baseline_wf.to_csv(outFile_wf, index = False)
#out_wf_7030.to_csv(outFile_wf, index = False)
print('\nFile successfully written:', outFile_wf)

combined_baseline.to_csv(outFile_lf, index = False)
print('\nFile successfully written:', outFile_lf)
###############################################################################
@@ -9,6 +9,8 @@ Created on Mon Jun 20 13:05:23 2022
import re
import argparse
import os, sys
import collections

# gene = 'pncA'
# drug = 'pyrazinamide'
#total_mtblineage_uc = 8

@@ -25,6 +27,7 @@ import os, sys
###############################################################################
homedir = os.path.expanduser("~")
sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml')

###############################################################################
#==================
# Import data
@@ -54,79 +57,70 @@ outdir_ml = outdir + 'ml/tts_7030/'
print('\nOutput directory:', outdir_ml)

outFile_wf = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv'
outFile_lf = outdir_ml + gene.lower() + '_baselineC_ext_' + OutFile_suffix + '.csv'
#outFile_lf = outdir_ml + gene.lower() + '_baselineC_ext_' + OutFile_suffix + '.csv'

#%% Running models ############################################################
print('\n#####################################################################\n'
, '\nRunning ML analysis: feature groups '
, '\nStarting--> Running ML analysis: Baseline modes (No FS)'
, '\nGene name:', gene
, '\nDrug name:', drug)
, '\nDrug name:', drug
, '\n#####################################################################\n')

fooD = {'baseline_paramD': {
'input_df': X
, 'target': y
, 'var_type': 'mixed'
, 'resampling_type': 'none'}
,
'smnc_paramD': {'input_df': X_smnc
, 'target': y_smnc
, 'var_type': 'mixed'
, 'resampling_type': 'smnc'}
}
paramD = {
'baseline_paramD': { 'input_df' : X
, 'target' : y
, 'var_type' : 'mixed'
, 'resampling_type': 'none'}

, 'smnc_paramD': { 'input_df' : X_smnc
, 'target' : y_smnc
, 'var_type' : 'mixed'
, 'resampling_type' : 'smnc'}

, 'ros_paramD': { 'input_df' : X_ros
, 'target' : y_ros
, 'var_type' : 'mixed'
, 'resampling_type' : 'ros'}

barD = {}
for k, v in fooD.items():
#print(k)
print(fooD[k])
scores_7030D = MultModelsCl(**fooD[k]
, 'rus_paramD' : { 'input_df' : X_rus
, 'target' : y_rus
, 'var_type' : 'mixed'
, 'resampling_type' : 'rus'}

, 'rouC_paramD' : { 'input_df' : X_rouC
, 'target' : y_rouC
, 'var_type' : 'mixed'
, 'resampling_type' : 'rouC'}
}

# Initial run to get the dict containing CV, BT and metadata DFs
mmD = {}
for k, v in paramD.items():
# print(fooD[k])
scores_7030D = MultModelsCl(**paramD[k]
, tts_split_type = tts_split_7030
, skf_cv = skf_cv
, blind_test_df = X_bts
, blind_test_target = y_bts
, add_cm = True
, add_yn = True)
barD[k] = scores_7030D
, add_yn = True
, return_formatted_output = True)
mmD[k] = scores_7030D

ros_paramD = {input_df = X_ros
, target = y_ros
, var_type = 'mixed'
, resampling_type = 'smnc'}

rus_paramD = {input_df = X_rus
, target = y_rus
, var_type = 'mixed'
, resampling_type = 'rus'}

rouC_paramD = {input_df = X_rouC
, target = y_rouC
, var_type = 'mixed'
, resampling_type = 'rouC'}

#====
scores_7030D = MultModelsCl(**rouC_paramD
, tts_split_type = tts_split_7030
, skf_cv = skf_cv
, blind_test_df = X_bts
, blind_test_target = y_bts
, add_cm = True
, add_yn = True)

###############################################################################
###############################################################################
#%% COMBINING all dfs: WF and LF
# https://stackoverflow.com/questions/39862654/pandas-concat-of-multiple-data-frames-using-only-common-columns

for k, v in mmD.items():
out_wf_7030 = pd.concat(mmD, ignore_index = True)

print('\n######################################################################'
, '\nEnd--> Successfully generated output DF for Multiple classifiers (baseline models)'
, '\nGene:', gene.lower()
, '\nDrug:', drug
, '\noutput file:', outFile_wf
, '\nDim of output:', out_wf_7030.shape
, '\n######################################################################')
###############################################################################
#====================
# Write output file
#====================
#combined_baseline_wf.to_csv(outFile_wf, index = False)
#print('\nFile successfully written:', outFile_wf)
out_wf_7030.to_csv(outFile_wf, index = False)
print('\nFile successfully written:', outFile_wf)
###############################################################################
@@ -11,6 +11,7 @@ time ./run_7030.py -g katG -d isoniazid 2>&1 | tee log_katg_7030.txt
time ./run_7030.py -g rpoB -d rifampicin 2>&1 | tee log_rpob_7030.txt
time ./run_7030.py -g gid -d streptomycin 2>&1 | tee log_gid_7030.txt
time ./run_7030.py -g alr -d cycloserine 2>&1 | tee log_alr_7030.txt

# alr: # ERROR, as expected, too few values!
# gid: problems
########################################################################

@@ -73,9 +74,11 @@ time ./run_7030.py -g alr -d cycloserine 2>&1 | tee log_alr_7030.txt
########################################################################
########################################################################

# running feature selection
# Split:70/30
time ./run_FS.py -g pncA -d pyrazinamide 2>&1 | tee log_FS_pnca_7030.txt

real 338m26.705s
user 1946m12.173s
sys 189m40.122s
@@ -7,21 +7,24 @@ Created on Fri Jun 24 11:07:05 2022
"""
import re
import argparse
import os, sys
###############################################################################
# gene = 'pncA'
# drug = 'pyrazinamide'
#total_mtblineage_uc = 8

#%% command line args: case sensitive
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help = 'drug name', default = 'pncA')
arg_parser.add_argument('-g', '--gene', help = 'gene name', default = 'pyrazinamide')
args = arg_parser.parse_args()
# #%% command line args: case sensitive
# arg_parser = argparse.ArgumentParser()
# arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
# arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '')
# args = arg_parser.parse_args()

drug = args.drug
gene = args.gene
# drug = args.drug
# gene = args.gene

###############################################################################
homedir = os.path.expanduser("~")
sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml')

###############################################################################
#==================

@@ -79,7 +82,7 @@ mmD = MultModelsCl(input_df = X_smnc
#================
# MultModelsCl: WITH formatted output
#================
mmDF = MultModelsCl(input_df = X_smnc
mmDF3 = MultModelsCl(input_df = X_smnc
, target = y_smnc
, var_type = 'mixed'
, tts_split_type = tts_split_7030

@@ -96,4 +99,4 @@ mmDF = MultModelsCl(input_df = X_smnc
# test function
#=================
# output from function call
ProcessMultModelCl(smnc_scores_mmD)
ProcessMultModelsCl(mmD)
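Taken together, the commit's intended flow is: build paramD, run MultModelsCl once per parameter set with return_formatted_output = True, then concatenate and write. A condensed sketch assembled from the diffs above (argument lists abbreviated; not a verbatim excerpt from any one file):

# Run every resampling variant from the parameter dict...
mmD = {}
for k in paramD:
    mmD[k] = MultModelsCl(**paramD[k]
                          , tts_split_type = tts_split_7030
                          , skf_cv = skf_cv
                          , blind_test_df = X_bts
                          , blind_test_target = y_bts
                          , add_cm = True
                          , add_yn = True
                          , return_formatted_output = True)

# ...then collapse the dict of formatted DataFrames into one output file
out_wf_7030 = pd.concat(mmD.values(), ignore_index = True)
out_wf_7030.to_csv(outFile_wf, index = False)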