Optimised run_7030.py to generate output from dict now that the process function and parameter dicts have been added
This commit is contained in:
parent
7dc7e25016
commit
b37a950fec
12 changed files with 180 additions and 128408 deletions
|
@ -197,35 +197,35 @@ def MultModelsCl(input_df, target, skf_cv
|
||||||
# Specify multiple Classification Models
|
# Specify multiple Classification Models
|
||||||
#======================================================
|
#======================================================
|
||||||
models = [('AdaBoost Classifier' , AdaBoostClassifier(**rs) )
|
models = [('AdaBoost Classifier' , AdaBoostClassifier(**rs) )
|
||||||
# , ('Bagging Classifier' , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) )
|
, ('Bagging Classifier' , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) )
|
||||||
# , ('Decision Tree' , DecisionTreeClassifier(**rs) )
|
, ('Decision Tree' , DecisionTreeClassifier(**rs) )
|
||||||
# , ('Extra Tree' , ExtraTreeClassifier(**rs) )
|
, ('Extra Tree' , ExtraTreeClassifier(**rs) )
|
||||||
# , ('Extra Trees' , ExtraTreesClassifier(**rs) )
|
, ('Extra Trees' , ExtraTreesClassifier(**rs) )
|
||||||
# , ('Gradient Boosting' , GradientBoostingClassifier(**rs) )
|
, ('Gradient Boosting' , GradientBoostingClassifier(**rs) )
|
||||||
# , ('Gaussian NB' , GaussianNB() )
|
, ('Gaussian NB' , GaussianNB() )
|
||||||
# , ('Gaussian Process' , GaussianProcessClassifier(**rs) )
|
, ('Gaussian Process' , GaussianProcessClassifier(**rs) )
|
||||||
# , ('K-Nearest Neighbors' , KNeighborsClassifier() )
|
, ('K-Nearest Neighbors' , KNeighborsClassifier() )
|
||||||
# , ('LDA' , LinearDiscriminantAnalysis() )
|
, ('LDA' , LinearDiscriminantAnalysis() )
|
||||||
, ('Logistic Regression' , LogisticRegression(**rs) )
|
, ('Logistic Regression' , LogisticRegression(**rs) )
|
||||||
# , ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs))
|
, ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs))
|
||||||
# , ('MLP' , MLPClassifier(max_iter = 500, **rs) )
|
, ('MLP' , MLPClassifier(max_iter = 500, **rs) )
|
||||||
# , ('Multinomial' , MultinomialNB() )
|
, ('Multinomial' , MultinomialNB() )
|
||||||
# , ('Naive Bayes' , BernoulliNB() )
|
, ('Naive Bayes' , BernoulliNB() )
|
||||||
# , ('Passive Aggresive' , PassiveAggressiveClassifier(**rs, **njobs) )
|
, ('Passive Aggresive' , PassiveAggressiveClassifier(**rs, **njobs) )
|
||||||
# , ('QDA' , QuadraticDiscriminantAnalysis() )
|
, ('QDA' , QuadraticDiscriminantAnalysis() )
|
||||||
# , ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000 ) )
|
, ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000 ) )
|
||||||
# , ('Random Forest2' , RandomForestClassifier(min_samples_leaf = 5
|
, ('Random Forest2' , RandomForestClassifier(min_samples_leaf = 5
|
||||||
# , n_estimators = 1000
|
, n_estimators = 1000
|
||||||
# , bootstrap = True
|
, bootstrap = True
|
||||||
# , oob_score = True
|
, oob_score = True
|
||||||
# , **njobs
|
, **njobs
|
||||||
# , **rs
|
, **rs
|
||||||
# , max_features = 'auto') )
|
, max_features = 'auto') )
|
||||||
# , ('Ridge Classifier' , RidgeClassifier(**rs) )
|
, ('Ridge Classifier' , RidgeClassifier(**rs) )
|
||||||
# , ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) )
|
, ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) )
|
||||||
# , ('SVC' , SVC(**rs) )
|
, ('SVC' , SVC(**rs) )
|
||||||
# , ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) )
|
, ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) )
|
||||||
# , ('XGBoost' , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False) )
|
, ('XGBoost' , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False) )
|
||||||
]
|
]
|
||||||
|
|
||||||
mm_skf_scoresD = {}
|
mm_skf_scoresD = {}
|
||||||
|
@ -440,10 +440,11 @@ def ProcessMultModelsCl(inputD = {}):
|
||||||
, '\nCV df:', len(scoresDF_CV.columns)
|
, '\nCV df:', len(scoresDF_CV.columns)
|
||||||
, '\nBT_df:', len(scoresDF_BT.columns)
|
, '\nBT_df:', len(scoresDF_BT.columns)
|
||||||
, '\nmetaDF:', len(metaDF.columns))
|
, '\nmetaDF:', len(metaDF.columns))
|
||||||
|
|
||||||
if len(scoresDF_CV.columns) == len(scoresDF_BT.columns):
|
if len(scoresDF_CV.columns) == len(scoresDF_BT.columns):
|
||||||
print('\nFirst proceeding to rowbind CV and BT dfs:')
|
print('\nFirst proceeding to rowbind CV and BT dfs:')
|
||||||
expected_ncols_out = len(scoresDF_BT.columns) + len(metaDF.columns)
|
expected_ncols_out = len(scoresDF_BT.columns) + len(metaDF.columns)
|
||||||
print('\nFinal output should have:',expected_ncols_out, 'columns' )
|
print('\nFinal output should have:', expected_ncols_out, 'columns' )
|
||||||
|
|
||||||
#-----------------
|
#-----------------
|
||||||
# Combine WF
|
# Combine WF
|
||||||
|
@ -496,8 +497,7 @@ def ProcessMultModelsCl(inputD = {}):
|
||||||
sys.exit('\nFIRST IF FAILS')
|
sys.exit('\nFIRST IF FAILS')
|
||||||
else:
|
else:
|
||||||
print('\nConcatenting dfs not possible [WF],check numbers ')
|
print('\nConcatenting dfs not possible [WF],check numbers ')
|
||||||
|
|
||||||
|
|
||||||
#-------------------------------------
|
#-------------------------------------
|
||||||
# Combine WF+Metadata: Final output
|
# Combine WF+Metadata: Final output
|
||||||
#-------------------------------------
|
#-------------------------------------
|
||||||
|
@ -515,11 +515,15 @@ def ProcessMultModelsCl(inputD = {}):
|
||||||
print('\nPASS: Combined df has expected ncols')
|
print('\nPASS: Combined df has expected ncols')
|
||||||
else:
|
else:
|
||||||
sys.exit('\nFAIL: Length mismatch for combined_df')
|
sys.exit('\nFAIL: Length mismatch for combined_df')
|
||||||
|
|
||||||
|
print('\nAdding column: Model_name')
|
||||||
|
|
||||||
|
combDF['Model_name'] = combDF.index
|
||||||
|
|
||||||
print('\n========================================================='
|
print('\n========================================================='
|
||||||
, '\nSUCCESS: Ran multiple classifiers'
|
, '\nSUCCESS: Ran multiple classifiers'
|
||||||
, '\n=======================================================')
|
, '\n=======================================================')
|
||||||
|
|
||||||
#resampling_methods_wf = combined_baseline_wf[['resampling']]
|
#resampling_methods_wf = combined_baseline_wf[['resampling']]
|
||||||
#resampling_methods_wf = resampling_methods_wf.drop_duplicates()
|
#resampling_methods_wf = resampling_methods_wf.drop_duplicates()
|
||||||
#, '\n', resampling_methods_wf)
|
#, '\n', resampling_methods_wf)
|
||||||
|
|
|
@ -1,72 +0,0 @@
|
||||||
/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_7030.py:464: SettingWithCopyWarning:
|
|
||||||
A value is trying to be set on a copy of a slice from a DataFrame
|
|
||||||
|
|
||||||
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
|
|
||||||
mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
|
|
||||||
1.22.4
|
|
||||||
1.4.1
|
|
||||||
|
|
||||||
aaindex_df contains non-numerical data
|
|
||||||
|
|
||||||
Total no. of non-numerial columns: 2
|
|
||||||
|
|
||||||
Selecting numerical data only
|
|
||||||
|
|
||||||
PASS: successfully selected numerical columns only for aaindex_df
|
|
||||||
|
|
||||||
Now checking for NA in the remaining aaindex_cols
|
|
||||||
|
|
||||||
Counting aaindex_df cols with NA
|
|
||||||
ncols with NA: 4 columns
|
|
||||||
Dropping these...
|
|
||||||
Original ncols: 127
|
|
||||||
|
|
||||||
Revised df ncols: 123
|
|
||||||
|
|
||||||
Checking NA in revised df...
|
|
||||||
|
|
||||||
PASS: cols with NA successfully dropped from aaindex_df
|
|
||||||
Proceeding with combining aa_df with other features_df
|
|
||||||
|
|
||||||
PASS: ncols match
|
|
||||||
Expected ncols: 123
|
|
||||||
Got: 123
|
|
||||||
|
|
||||||
Total no. of columns in clean aa_df: 123
|
|
||||||
|
|
||||||
Proceeding to merge, expected nrows in merged_df: 271
|
|
||||||
|
|
||||||
PASS: my_features_df and aa_df successfully combined
|
|
||||||
nrows: 271
|
|
||||||
ncols: 269
|
|
||||||
count of NULL values before imputation
|
|
||||||
|
|
||||||
or_mychisq 256
|
|
||||||
log10_or_mychisq 256
|
|
||||||
dtype: int64
|
|
||||||
count of NULL values AFTER imputation
|
|
||||||
|
|
||||||
mutationinformation 0
|
|
||||||
or_rawI 0
|
|
||||||
logorI 0
|
|
||||||
dtype: int64
|
|
||||||
|
|
||||||
PASS: OR values imputed, data ready for ML
|
|
||||||
|
|
||||||
Total no. of features for aaindex: 123
|
|
||||||
|
|
||||||
PASS: x_features has no target variable
|
|
||||||
|
|
||||||
No. of columns for x_features: 174
|
|
||||||
Traceback (most recent call last):
|
|
||||||
File "/home/tanu/git/LSHTM_analysis/scripts/ml/./run_7030.py", line 35, in <module>
|
|
||||||
setvars(gene,drug)
|
|
||||||
File "/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_7030.py", line 636, in setvars
|
|
||||||
X, X_bts, y, y_bts = train_test_split(x_features, y_target
|
|
||||||
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/sklearn/model_selection/_split.py", line 2454, in train_test_split
|
|
||||||
train, test = next(cv.split(X=arrays[0], y=stratify))
|
|
||||||
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/sklearn/model_selection/_split.py", line 1613, in split
|
|
||||||
for train, test in self._iter_indices(X, y, groups):
|
|
||||||
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/sklearn/model_selection/_split.py", line 1953, in _iter_indices
|
|
||||||
raise ValueError(
|
|
||||||
ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.
|
|
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
|
@ -557,7 +557,7 @@ def setvars(gene,drug):
|
||||||
# FG5: Genomic features
|
# FG5: Genomic features
|
||||||
#========================
|
#========================
|
||||||
X_gn_mafor_Fnum = ['maf'
|
X_gn_mafor_Fnum = ['maf'
|
||||||
, 'logorI'
|
#, 'logorI'
|
||||||
# , 'or_rawI'
|
# , 'or_rawI'
|
||||||
# , 'or_mychisq'
|
# , 'or_mychisq'
|
||||||
# , 'or_logistic'
|
# , 'or_logistic'
|
||||||
|
|
635
scripts/ml/run_7030.py
Executable file → Normal file
635
scripts/ml/run_7030.py
Executable file → Normal file
|
@ -5,28 +5,28 @@ Created on Mon Jun 20 13:05:23 2022
|
||||||
|
|
||||||
@author: tanu
|
@author: tanu
|
||||||
"""
|
"""
|
||||||
|
#%%Imports ####################################################################
|
||||||
import re
|
import re
|
||||||
import argparse
|
import argparse
|
||||||
###############################################################################
|
import os, sys
|
||||||
|
|
||||||
# gene = 'pncA'
|
# gene = 'pncA'
|
||||||
# drug = 'pyrazinamide'
|
# drug = 'pyrazinamide'
|
||||||
#total_mtblineage_uc = 8
|
#total_mtblineage_uc = 8
|
||||||
|
###############################################################################
|
||||||
#%% command line args: case sensitive
|
#%% command line args: case sensitive
|
||||||
arg_parser = argparse.ArgumentParser()
|
# arg_parser = argparse.ArgumentParser()
|
||||||
arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
|
# arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
|
||||||
arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '')
|
# arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '')
|
||||||
args = arg_parser.parse_args()
|
# args = arg_parser.parse_args()
|
||||||
|
|
||||||
drug = args.drug
|
# drug = args.drug
|
||||||
gene = args.gene
|
# gene = args.gene
|
||||||
|
|
||||||
###############################################################################
|
###############################################################################
|
||||||
#==================
|
homedir = os.path.expanduser("~")
|
||||||
# other vars
|
sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml')
|
||||||
#==================
|
|
||||||
tts_split = '70/30'
|
|
||||||
OutFile_suffix = '7030'
|
|
||||||
###############################################################################
|
###############################################################################
|
||||||
#==================
|
#==================
|
||||||
# Import data
|
# Import data
|
||||||
|
@ -39,10 +39,15 @@ from ml_data_7030 import *
|
||||||
#from UQ_yc_RunAllClfs import run_all_ML
|
#from UQ_yc_RunAllClfs import run_all_ML
|
||||||
|
|
||||||
#====================
|
#====================
|
||||||
# Import ML function
|
# Import ML functions
|
||||||
#====================
|
#====================
|
||||||
# TT run all ML clfs: baseline model
|
from MultClfs import *
|
||||||
from MultModelsCl import MultModelsCl
|
|
||||||
|
#==================
|
||||||
|
# other vars
|
||||||
|
#==================
|
||||||
|
tts_split_7030 = '70_30'
|
||||||
|
OutFile_suffix = '7030'
|
||||||
|
|
||||||
#==================
|
#==================
|
||||||
# Specify outdir
|
# Specify outdir
|
||||||
|
@ -51,563 +56,71 @@ outdir_ml = outdir + 'ml/tts_7030/'
|
||||||
print('\nOutput directory:', outdir_ml)
|
print('\nOutput directory:', outdir_ml)
|
||||||
|
|
||||||
outFile_wf = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv'
|
outFile_wf = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv'
|
||||||
outFile_lf = outdir_ml + gene.lower() + '_baselineC_ext_' + OutFile_suffix + '.csv'
|
#outFile_lf = outdir_ml + gene.lower() + '_baselineC_ext_' + OutFile_suffix + '.csv'
|
||||||
|
|
||||||
###############################################################################
|
#%% Running models ############################################################
|
||||||
score_type_ordermapD = { 'mcc' : 1
|
|
||||||
, 'fscore' : 2
|
|
||||||
, 'jcc' : 3
|
|
||||||
, 'precision' : 4
|
|
||||||
, 'recall' : 5
|
|
||||||
, 'accuracy' : 6
|
|
||||||
, 'roc_auc' : 7
|
|
||||||
, 'TN' : 8
|
|
||||||
, 'FP' : 9
|
|
||||||
, 'FN' : 10
|
|
||||||
, 'TP' : 11
|
|
||||||
, 'trainingY_neg': 12
|
|
||||||
, 'trainingY_pos': 13
|
|
||||||
, 'blindY_neg' : 14
|
|
||||||
, 'blindY_pos' : 15
|
|
||||||
, 'fit_time' : 16
|
|
||||||
, 'score_time' : 17
|
|
||||||
}
|
|
||||||
|
|
||||||
scoreCV_mapD = {'test_mcc' : 'MCC'
|
|
||||||
, 'test_fscore' : 'F1'
|
|
||||||
, 'test_precision' : 'Precision'
|
|
||||||
, 'test_recall' : 'Recall'
|
|
||||||
, 'test_accuracy' : 'Accuracy'
|
|
||||||
, 'test_roc_auc' : 'ROC_AUC'
|
|
||||||
, 'test_jcc' : 'JCC'
|
|
||||||
}
|
|
||||||
|
|
||||||
scoreBT_mapD = {'bts_mcc' : 'MCC'
|
|
||||||
, 'bts_fscore' : 'F1'
|
|
||||||
, 'bts_precision' : 'Precision'
|
|
||||||
, 'bts_recall' : 'Recall'
|
|
||||||
, 'bts_accuracy' : 'Accuracy'
|
|
||||||
, 'bts_roc_auc' : 'ROC_AUC'
|
|
||||||
, 'bts_jcc' : 'JCC'
|
|
||||||
}
|
|
||||||
|
|
||||||
# data dependent variables but NOT dependent on resampling
|
|
||||||
bts_size = len(X_bts)
|
|
||||||
yc2 = Counter(y_bts)
|
|
||||||
yc2_ratio = yc2[0]/yc2[1]
|
|
||||||
|
|
||||||
###############################################################################
|
|
||||||
print('\n#####################################################################\n'
|
print('\n#####################################################################\n'
|
||||||
, '\nRunning ML analysis: feature groups '
|
, '\nStarting--> Running ML analysis: Baseline modes (No FS)'
|
||||||
, '\nGene name:', gene
|
, '\nGene name:', gene
|
||||||
, '\nDrug name:', drug)
|
, '\nDrug name:', drug
|
||||||
|
, '\n#####################################################################\n')
|
||||||
|
|
||||||
#%% Basic: No Oversampling
|
paramD = {
|
||||||
#================
|
'baseline_paramD': { 'input_df' : X
|
||||||
# Baseline
|
, 'target' : y
|
||||||
# No resampling
|
, 'var_type' : 'mixed'
|
||||||
#================
|
, 'resampling_type': 'none'}
|
||||||
scores_mmD = MultModelsCl(input_df = X
|
|
||||||
, target = y
|
, 'smnc_paramD': { 'input_df' : X_smnc
|
||||||
, var_type = 'mixed'
|
, 'target' : y_smnc
|
||||||
, skf_cv = skf_cv
|
, 'var_type' : 'mixed'
|
||||||
, blind_test_df = X_bts
|
, 'resampling_type' : 'smnc'}
|
||||||
, blind_test_target = y_bts
|
|
||||||
, add_cm = True
|
|
||||||
, add_yn = True)
|
|
||||||
|
|
||||||
baseline_all_scores = pd.DataFrame(scores_mmD)
|
|
||||||
rs_none = 'none'
|
|
||||||
#------------------------
|
|
||||||
# WF: only CV and BTS
|
|
||||||
#-----------------------
|
|
||||||
baseline_allT = baseline_all_scores.T
|
|
||||||
#baseline_train = baseline_all.filter(regex='train_', axis=1)
|
|
||||||
|
|
||||||
baseline_CV = baseline_allT.filter(regex='test_', axis = 1); baseline_CV.columns
|
|
||||||
# map colnames for consistency to allow concatenting
|
|
||||||
baseline_CV.columns = baseline_CV.columns.map(scoreCV_mapD); baseline_CV.columns
|
|
||||||
baseline_CV['Data_source'] = 'CV'
|
|
||||||
baseline_CV['Resampling'] = rs_none
|
|
||||||
|
|
||||||
baseline_BT = baseline_allT.filter(regex='bts_', axis = 1); baseline_BT.columns
|
|
||||||
# map colnames for consistency to allow concatenting
|
|
||||||
baseline_BT.columns = baseline_BT.columns.map(scoreBT_mapD); baseline_BT.columns
|
|
||||||
baseline_BT['Data_source'] = 'BT'
|
|
||||||
baseline_BT['Resampling'] = rs_none
|
|
||||||
|
|
||||||
# # Write csv
|
|
||||||
#baseline_CV.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
|
|
||||||
#baseline_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
|
|
||||||
# baseline_CV.to_csv(outdir_ml + gene.lower() + '_baseline_CV_allF.csv')
|
|
||||||
# baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')
|
|
||||||
|
|
||||||
#----------------------------------
|
|
||||||
# LF*: CV + BTS + Other info
|
|
||||||
#-----------------------------------
|
|
||||||
# other data dependent variables
|
|
||||||
training_size_ns = len(X)
|
|
||||||
n_features = len(X.columns)
|
|
||||||
yc1 = Counter(y)
|
|
||||||
yc1_ratio = yc1[0]/yc1[1]
|
|
||||||
|
|
||||||
baseline_all = baseline_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
|
|
||||||
baseline_all = baseline_all.reset_index()
|
|
||||||
baseline_all.rename(columns = {'index': 'original_names'}, inplace = True)
|
|
||||||
|
|
||||||
# Indicate whether BT or CV
|
|
||||||
bt_pattern = re.compile(r'bts_.*')
|
|
||||||
baseline_all['Data_source'] = baseline_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
|
|
||||||
|
|
||||||
baseline_all['score_type'] = baseline_all['original_names'].str.replace('bts_|test_', '', regex = True)
|
|
||||||
|
|
||||||
score_type_uniqueN = set(baseline_all['score_type'])
|
|
||||||
cL1 = list(score_type_ordermapD.keys())
|
|
||||||
cL2 = list(score_type_uniqueN)
|
|
||||||
|
|
||||||
if set(cL1).issubset(cL2):
|
|
||||||
print('\nPASS: sorting df by score that is mapped onto the order I want')
|
|
||||||
baseline_all['score_order'] = baseline_all['score_type'].map(score_type_ordermapD)
|
|
||||||
baseline_all.sort_values(by = ['Data_source', 'score_order'], ascending = [True, True], inplace = True)
|
|
||||||
else:
|
|
||||||
sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
|
|
||||||
|
|
||||||
# add cols: specific
|
|
||||||
baseline_all['Resampling'] = rs_none
|
|
||||||
baseline_all['training_size'] = training_size_ns
|
|
||||||
baseline_all['trainingY_ratio']= round(yc1_ratio,2)
|
|
||||||
baseline_all['n_features'] = n_features
|
|
||||||
|
|
||||||
###############################################################################
|
|
||||||
#%% SMOTE NC: Smote Oversampling [Numerical + categorical]
|
|
||||||
#================
|
|
||||||
# Baseline
|
|
||||||
# SMOTE NC: SMNC
|
|
||||||
#================
|
|
||||||
smnc_scores_mmD = MultModelsCl(input_df = X_smnc
|
|
||||||
, target = y_smnc
|
|
||||||
, var_type = 'mixed'
|
|
||||||
, skf_cv = skf_cv
|
|
||||||
, blind_test_df = X_bts
|
|
||||||
, blind_test_target = y_bts
|
|
||||||
, add_cm = True
|
|
||||||
, add_yn = True)
|
|
||||||
|
|
||||||
smnc_all_scores = pd.DataFrame(smnc_scores_mmD)
|
|
||||||
rs_smnc = 'smnc'
|
|
||||||
#------------------------
|
|
||||||
# WF: only CV and BTS
|
|
||||||
#-----------------------
|
|
||||||
smnc_allT = smnc_all_scores.T
|
|
||||||
|
|
||||||
smnc_CV = smnc_allT.filter(regex='test_', axis = 1); smnc_CV.columns
|
|
||||||
# map colnames for consistency to allow concatenting
|
|
||||||
smnc_CV.columns = smnc_CV.columns.map(scoreCV_mapD); smnc_CV.columns
|
|
||||||
smnc_CV['Data_source'] = 'CV'
|
|
||||||
smnc_CV['Resampling'] = rs_smnc
|
|
||||||
|
|
||||||
smnc_BT = smnc_allT.filter(regex='bts_', axis = 1); smnc_BT.columns
|
|
||||||
# map colnames for consistency to allow concatenting
|
|
||||||
smnc_BT.columns = smnc_BT.columns.map(scoreBT_mapD); smnc_BT.columns
|
|
||||||
smnc_BT['Data_source'] = 'BT'
|
|
||||||
smnc_BT['Resampling'] = rs_smnc
|
|
||||||
|
|
||||||
# Write csv
|
|
||||||
# smnc_CV.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
|
|
||||||
# smnc_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
|
|
||||||
# smnc_CV.to_csv(outdir_ml + gene.lower() + '_smnc_CV_allF.csv')
|
|
||||||
# smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')
|
|
||||||
|
|
||||||
#----------------------------------
|
|
||||||
# LF*: CV + BTS + Other info
|
|
||||||
#-----------------------------------
|
|
||||||
# other data dependent variables
|
|
||||||
training_size_smnc = len(X_smnc)
|
|
||||||
n_features = len(X_smnc.columns)
|
|
||||||
yc1_smnc = Counter(y_smnc)
|
|
||||||
yc1_ratio_smnc = yc1_smnc[0]/yc1_smnc[1]
|
|
||||||
|
|
||||||
smnc_all = smnc_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
|
|
||||||
smnc_all = smnc_all.reset_index()
|
|
||||||
smnc_all.rename(columns = {'index': 'original_names'}, inplace = True)
|
|
||||||
|
|
||||||
# Indicate whether BT or CV
|
|
||||||
bt_pattern = re.compile(r'bts_.*')
|
|
||||||
smnc_all['Data_source'] = smnc_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
|
|
||||||
|
|
||||||
smnc_all['score_type'] = smnc_all['original_names'].str.replace('bts_|test_', '', regex = True)
|
|
||||||
|
|
||||||
score_type_uniqueN = set(smnc_all['score_type'])
|
|
||||||
cL1 = list(score_type_ordermapD.keys())
|
|
||||||
cL2 = list(score_type_uniqueN)
|
|
||||||
|
|
||||||
if set(cL1).issubset(cL2):
|
|
||||||
print('\nPASS: sorting df by score that is mapped onto the order I want')
|
|
||||||
smnc_all['score_order'] = smnc_all['score_type'].map(score_type_ordermapD)
|
|
||||||
smnc_all.sort_values(by = ['Data_source', 'score_order'], ascending = [True, True], inplace = True)
|
|
||||||
else:
|
|
||||||
sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
|
|
||||||
|
|
||||||
# add cols: specific
|
|
||||||
smnc_all['Resampling'] = rs_smnc
|
|
||||||
smnc_all['training_size'] = training_size_smnc
|
|
||||||
smnc_all['trainingY_ratio'] = round(yc1_ratio_smnc,2)
|
|
||||||
smnc_all['n_features'] = n_features
|
|
||||||
|
|
||||||
###############################################################################
|
|
||||||
#%% ROS: Random Over Sampling [Numerical + categorical]
|
|
||||||
#================
|
|
||||||
# Baseline
|
|
||||||
# ROS
|
|
||||||
#================
|
|
||||||
ros_scores_mmD = MultModelsCl(input_df = X_ros
|
|
||||||
, target = y_ros
|
|
||||||
, var_type = 'mixed'
|
|
||||||
, skf_cv = skf_cv
|
|
||||||
, blind_test_df = X_bts
|
|
||||||
, blind_test_target = y_bts
|
|
||||||
, add_cm = True
|
|
||||||
, add_yn = True)
|
|
||||||
|
|
||||||
ros_all_scores = pd.DataFrame(ros_scores_mmD)
|
|
||||||
rs_ros = 'ros'
|
|
||||||
#------------------------
|
|
||||||
# WF: only CV and BTS
|
|
||||||
#-----------------------
|
|
||||||
ros_allT = ros_all_scores.T
|
|
||||||
|
|
||||||
ros_CV = ros_allT.filter(regex='test_', axis = 1); ros_CV.columns
|
|
||||||
# map colnames for consistency to allow concatenting
|
|
||||||
ros_CV.columns = ros_CV.columns.map(scoreCV_mapD); ros_CV.columns
|
|
||||||
ros_CV['Data_source'] = 'CV'
|
|
||||||
ros_CV['Resampling'] = rs_ros
|
|
||||||
|
|
||||||
ros_BT = ros_allT.filter(regex='bts_', axis = 1); ros_BT.columns
|
|
||||||
# map colnames for consistency to allow concatenting
|
|
||||||
ros_BT.columns = ros_BT.columns.map(scoreBT_mapD); ros_BT.columns
|
|
||||||
ros_BT['Data_source'] = 'BT'
|
|
||||||
ros_BT['Resampling'] = rs_ros
|
|
||||||
|
|
||||||
# Write csv
|
|
||||||
# ros_CV.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
|
|
||||||
# ros_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
|
|
||||||
# ros_CV.to_csv(outdir_ml + gene.lower() + '_ros_CV_allF.csv')
|
|
||||||
# ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')
|
|
||||||
#----------------------------------
|
|
||||||
# LF*: CV + BTS + Other info
|
|
||||||
#----------------------------------
|
|
||||||
# other data dependent variables
|
|
||||||
training_size_ros = len(X_ros)
|
|
||||||
n_features = len(X_ros.columns)
|
|
||||||
yc1_ros = Counter(y_ros)
|
|
||||||
yc1_ratio_ros = yc1_ros[0]/yc1_ros[1]
|
|
||||||
|
|
||||||
ros_all = ros_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
|
|
||||||
ros_all = ros_all.reset_index()
|
|
||||||
ros_all.rename(columns = {'index': 'original_names'}, inplace = True)
|
|
||||||
|
|
||||||
# Indicate whether BT or CV
|
|
||||||
bt_pattern = re.compile(r'bts_.*')
|
|
||||||
ros_all['Data_source'] = ros_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
|
|
||||||
|
|
||||||
ros_all['score_type'] = ros_all['original_names'].str.replace('bts_|test_', '', regex = True)
|
|
||||||
|
|
||||||
score_type_uniqueN = set(ros_all['score_type'])
|
|
||||||
cL1 = list(score_type_ordermapD.keys())
|
|
||||||
cL2 = list(score_type_uniqueN)
|
|
||||||
|
|
||||||
if set(cL1).issubset(cL2):
|
|
||||||
print('\nPASS: sorting df by score that is mapped onto the order I want')
|
|
||||||
ros_all['score_order'] = ros_all['score_type'].map(score_type_ordermapD)
|
|
||||||
ros_all.sort_values(by = ['Data_source', 'score_order'], ascending = [True, True], inplace = True)
|
|
||||||
else:
|
|
||||||
sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
|
|
||||||
|
|
||||||
# add cols: specific
|
|
||||||
ros_all['Resampling'] = rs_ros
|
|
||||||
ros_all['training_size'] = training_size_ros
|
|
||||||
ros_all['trainingY_ratio'] = round(yc1_ratio_ros,2)
|
|
||||||
ros_all['n_features'] = n_features
|
|
||||||
###############################################################################
|
|
||||||
#%% RUS: Random Under Sampling [Numerical + categorical]
|
|
||||||
#================
|
|
||||||
# Baseline
|
|
||||||
# RUS
|
|
||||||
#================
|
|
||||||
rus_scores_mmD = MultModelsCl(input_df = X_rus
|
|
||||||
, target = y_rus
|
|
||||||
, var_type = 'mixed'
|
|
||||||
, skf_cv = skf_cv
|
|
||||||
, blind_test_df = X_bts
|
|
||||||
, blind_test_target = y_bts
|
|
||||||
, add_cm = True
|
|
||||||
, add_yn = True)
|
|
||||||
|
|
||||||
rus_all_scores = pd.DataFrame(rus_scores_mmD)
|
|
||||||
rs_rus = 'rus'
|
|
||||||
#-----------------------
|
|
||||||
# WF: only CV and BTS
|
|
||||||
#-----------------------
|
|
||||||
rus_allT = rus_all_scores.T
|
|
||||||
|
|
||||||
rus_CV = rus_allT.filter(regex='test_', axis=1); rus_CV.columns
|
|
||||||
# map colnames for consistency to allow concatenting
|
|
||||||
rus_CV.columns = rus_CV.columns.map(scoreCV_mapD); rus_CV.columns
|
|
||||||
rus_CV['Data_source'] = 'CV'
|
|
||||||
rus_CV['Resampling'] = rs_rus
|
|
||||||
|
|
||||||
rus_BT = rus_allT.filter(regex='bts_' , axis=1); rus_BT.columns
|
|
||||||
# map colnames for consistency to allow concatenting
|
|
||||||
rus_BT.columns = rus_BT.columns.map(scoreBT_mapD); rus_BT.columns
|
|
||||||
rus_BT['Data_source'] = 'BT'
|
|
||||||
rus_BT['Resampling'] = rs_rus
|
|
||||||
|
|
||||||
# # Write csv
|
|
||||||
# rus_CV.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
|
|
||||||
# rus_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
|
|
||||||
# rus_CV.to_csv(outdir_ml + gene.lower() + '_rus_CV_allF.csv')
|
|
||||||
# rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')
|
|
||||||
|
|
||||||
#----------------------------------
|
|
||||||
# LF*: CV + BTS + Other info
|
|
||||||
#----------------------------------
|
|
||||||
# other data dependent variables
|
|
||||||
training_size_rus = len(X_rus)
|
|
||||||
n_features = len(X_rus.columns)
|
|
||||||
yc1_rus = Counter(y_rus)
|
|
||||||
yc1_ratio_rus = yc1_rus[0]/yc1_rus[1]
|
|
||||||
|
|
||||||
rus_all = rus_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
|
|
||||||
rus_all = rus_all.reset_index()
|
|
||||||
rus_all.rename(columns = {'index': 'original_names'}, inplace = True)
|
|
||||||
|
|
||||||
# Indicate whether BT or CV
|
|
||||||
bt_pattern = re.compile(r'bts_.*')
|
|
||||||
rus_all['Data_source'] = rus_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
|
|
||||||
|
|
||||||
rus_all['score_type'] = rus_all['original_names'].str.replace('bts_|test_', '', regex = True)
|
|
||||||
|
|
||||||
score_type_uniqueN = set(rus_all['score_type'])
|
|
||||||
cL1 = list(score_type_ordermapD.keys())
|
|
||||||
cL2 = list(score_type_uniqueN)
|
|
||||||
|
|
||||||
if set(cL1).issubset(cL2):
|
|
||||||
print('\nPASS: sorting df by score that is mapped onto the order I want')
|
|
||||||
rus_all['score_order'] = rus_all['score_type'].map(score_type_ordermapD)
|
|
||||||
rus_all.sort_values(by = ['Data_source', 'score_order'], ascending = [True, True], inplace = True)
|
|
||||||
else:
|
|
||||||
sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
|
|
||||||
|
|
||||||
# add cols: specific
|
|
||||||
rus_all['Resampling'] = rs_rus
|
|
||||||
rus_all['training_size'] = training_size_rus
|
|
||||||
rus_all['trainingY_ratio'] = round(yc1_ratio_rus,2)
|
|
||||||
rus_all['n_features'] = n_features
|
|
||||||
|
|
||||||
###############################################################################
|
|
||||||
#%% ROS+RUS Combined: [Numerical + categorical]
|
|
||||||
#================
|
|
||||||
# Baseline
|
|
||||||
# ROUC
|
|
||||||
#================
|
|
||||||
rouC_scores_mmD = MultModelsCl(input_df = X_rouC
|
|
||||||
, target = y_rouC
|
|
||||||
, var_type = 'mixed'
|
|
||||||
, skf_cv = skf_cv
|
|
||||||
, blind_test_df = X_bts
|
|
||||||
, blind_test_target = y_bts
|
|
||||||
, add_cm = True
|
|
||||||
, add_yn = True)
|
|
||||||
|
|
||||||
rouC_all_scores = pd.DataFrame(rouC_scores_mmD)
|
|
||||||
rs_rouC = 'rouC'
|
|
||||||
#-----------------------
|
|
||||||
# WF: only CV and BTS
|
|
||||||
#-----------------------
|
|
||||||
rouC_allT = rouC_all_scores.T
|
|
||||||
|
|
||||||
rouC_CV = rouC_allT.filter(regex='test_', axis=1); rouC_CV.columns
|
|
||||||
# map colnames for consistency to allow concatenting
|
|
||||||
rouC_CV.columns = rouC_CV.columns.map(scoreCV_mapD); rouC_CV.columns
|
|
||||||
rouC_CV['Data_source'] = 'CV'
|
|
||||||
rouC_CV['Resampling'] = rs_rouC
|
|
||||||
|
|
||||||
rouC_BT = rouC_allT.filter(regex='bts_', axis=1); rouC_BT.columns
|
|
||||||
# map colnames for consistency to allow concatenting
|
|
||||||
rouC_BT.columns = rouC_BT.columns.map(scoreBT_mapD); rouC_BT.columns
|
|
||||||
rouC_BT['Data_source'] = 'BT'
|
|
||||||
rouC_BT['Resampling'] = rs_rouC
|
|
||||||
|
|
||||||
# Write csv
|
|
||||||
# rouC_CV.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
|
|
||||||
# rouC_CV.to_csv(outdir_ml + gene.lower() + '_rouC_CV_allF.csv')
|
|
||||||
# rouC_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
|
|
||||||
# rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')
|
|
||||||
|
|
||||||
#----------------------------------
|
|
||||||
# LF*: CV + BTS + Other info
|
|
||||||
#----------------------------------
|
|
||||||
# other data dependent variables
|
|
||||||
training_size_rouC = len(X_rouC)
|
|
||||||
n_features = len(X_rouC.columns)
|
|
||||||
yc1_rouC = Counter(y_rouC)
|
|
||||||
yc1_ratio_rouC = yc1_rouC[0]/yc1_rouC[1]
|
|
||||||
|
|
||||||
rouC_all = rouC_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
|
|
||||||
rouC_all = rouC_all.reset_index()
|
|
||||||
rouC_all.rename(columns = {'index': 'original_names'}, inplace = True)
|
|
||||||
|
|
||||||
# Indicate whether BT or CV
|
|
||||||
bt_pattern = re.compile(r'bts_.*')
|
|
||||||
rouC_all['Data_source'] = rouC_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
|
|
||||||
|
|
||||||
rouC_all['score_type'] = rouC_all['original_names'].str.replace('bts_|test_', '', regex = True)
|
|
||||||
|
|
||||||
score_type_uniqueN = set(rouC_all['score_type'])
|
|
||||||
cL1 = list(score_type_ordermapD.keys())
|
|
||||||
cL2 = list(score_type_uniqueN)
|
|
||||||
|
|
||||||
if set(cL1).issubset(cL2):
|
|
||||||
print('\nPASS: sorting df by score that is mapped onto the order I want')
|
|
||||||
rouC_all['score_order'] = rouC_all['score_type'].map(score_type_ordermapD)
|
|
||||||
rouC_all.sort_values(by = ['Data_source', 'score_order'], ascending = [True, True], inplace = True)
|
|
||||||
else:
|
|
||||||
sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
|
|
||||||
|
|
||||||
# add cols: specific
|
|
||||||
rouC_all['Resampling'] = rs_rouC
|
|
||||||
rouC_all['training_size'] = training_size_rouC
|
|
||||||
rouC_all['trainingY_ratio'] = round(yc1_ratio_rouC,2)
|
|
||||||
rouC_all['n_features'] = n_features
|
|
||||||
|
|
||||||
###############################################################################
|
|
||||||
#%% COMBINING all dfs: WF and LF
|
|
||||||
# https://stackoverflow.com/questions/39862654/pandas-concat-of-multiple-data-frames-using-only-common-columns
|
|
||||||
#%% Combine WF
|
|
||||||
#-----------------
|
|
||||||
# Combine WF
|
|
||||||
#-----------------
|
|
||||||
dfs_combine_wf = [baseline_BT, smnc_BT, ros_BT, rus_BT, rouC_BT,
|
|
||||||
baseline_CV, smnc_CV, ros_CV, rus_CV, rouC_CV]
|
|
||||||
|
|
||||||
dfs_nrows_wf = []
|
|
||||||
for df in dfs_combine_wf:
|
|
||||||
dfs_nrows_wf = dfs_nrows_wf + [len(df)]
|
|
||||||
dfs_nrows_wf = max(dfs_nrows_wf)
|
|
||||||
|
|
||||||
dfs_ncols_wf = []
|
, 'ros_paramD': { 'input_df' : X_ros
|
||||||
for df in dfs_combine_wf:
|
, 'target' : y_ros
|
||||||
dfs_ncols_wf = dfs_ncols_wf + [len(df.columns)]
|
, 'var_type' : 'mixed'
|
||||||
dfs_ncols_wf = max(dfs_ncols_wf)
|
, 'resampling_type' : 'ros'}
|
||||||
|
|
||||||
expected_nrows_wf = len(dfs_combine_wf) * dfs_nrows_wf
|
, 'rus_paramD' : { 'input_df' : X_rus
|
||||||
expected_ncols_wf = dfs_ncols_wf
|
, 'target' : y_rus
|
||||||
|
, 'var_type' : 'mixed'
|
||||||
|
, 'resampling_type' : 'rus'}
|
||||||
|
|
||||||
common_cols_wf = list(set.intersection(*(set(df.columns) for df in dfs_combine_wf)))
|
, 'rouC_paramD' : { 'input_df' : X_rouC
|
||||||
|
, 'target' : y_rouC
|
||||||
|
, 'var_type' : 'mixed'
|
||||||
|
, 'resampling_type' : 'rouC'}
|
||||||
|
}
|
||||||
|
|
||||||
if len(common_cols_wf) == dfs_ncols_wf :
|
# Initial run to get the dict containing CV, BT and metadata DFs
|
||||||
combined_baseline_wf = pd.concat([df[common_cols_wf] for df in dfs_combine_wf], ignore_index=False)
|
mmD = {}
|
||||||
resampling_methods_wf = combined_baseline_wf[['Resampling']]
|
for k, v in paramD.items():
|
||||||
resampling_methods_wf = resampling_methods_wf.drop_duplicates()
|
# print(mmD[k])
|
||||||
print('\nConcatenating dfs with different resampling methods [WF]:', tts_split
|
scores_7030D = MultModelsCl(**paramD[k]
|
||||||
, '\nNo. of dfs combining:', len(dfs_combine_wf)
|
, tts_split_type = tts_split_7030
|
||||||
, '\nThe sampling methods are:'
|
, skf_cv = skf_cv
|
||||||
, '\n', resampling_methods_wf)
|
, blind_test_df = X_bts
|
||||||
if len(combined_baseline_wf) == expected_nrows_wf and len(combined_baseline_wf.columns) == expected_ncols_wf:
|
, blind_test_target = y_bts
|
||||||
print('\nPASS:', len(dfs_combine_wf), 'dfs successfully combined'
|
, add_cm = True
|
||||||
, '\nnrows in combined_df_wf:', len(combined_baseline_wf)
|
, add_yn = True
|
||||||
, '\nncols in combined_df_wf:', len(combined_baseline_wf.columns))
|
, return_formatted_output = True)
|
||||||
else:
|
mmD[k] = scores_7030D
|
||||||
print('\nFAIL: concatenating failed'
|
|
||||||
, '\nExpected nrows:', expected_nrows_wf
|
# Extracting the dfs from within the dict and concatenating to output as one df
|
||||||
, '\nGot:', len(combined_baseline_wf)
|
for k, v in mmD.items():
|
||||||
, '\nExpected ncols:', expected_ncols_wf
|
out_wf_7030 = pd.concat(mmD, ignore_index = True)
|
||||||
, '\nGot:', len(combined_baseline_wf.columns))
|
|
||||||
sys.exit()
|
|
||||||
else:
|
|
||||||
sys.exit('\nConcatenting dfs not possible [WF],check numbers ')
|
|
||||||
|
|
||||||
# Add index as a column
|
print('\n######################################################################'
|
||||||
combined_baseline_wf.columns
|
, '\nEnd--> Successfully generated output DF for Multiple classifiers (baseline models)'
|
||||||
combined_baseline_wf = combined_baseline_wf.reset_index()
|
, '\nGene:', gene.lower()
|
||||||
combined_baseline_wf.rename(columns = {'index': 'Model_name'}, inplace = True)
|
, '\nDrug:', drug
|
||||||
combined_baseline_wf.head()
|
, '\noutput file:', outFile_wf
|
||||||
|
, '\nDim of output:', out_wf_7030.shape
|
||||||
# sort df: Resampling, Data_source, and MCC
|
, '\n######################################################################')
|
||||||
combined_baseline_wf.sort_values(by = ['Resampling', 'Data_source', 'MCC'], ascending = [True, True, False], inplace = True)
|
|
||||||
##############################################################################
|
|
||||||
#%% Combine LF
|
|
||||||
#-----------------
|
|
||||||
# Combine LF*
|
|
||||||
#-----------------
|
|
||||||
dfs_combine = [baseline_all, smnc_all, ros_all, rus_all, rouC_all ]
|
|
||||||
|
|
||||||
dfs_nrows = []
|
|
||||||
for df in dfs_combine:
|
|
||||||
dfs_nrows = dfs_nrows + [len(df)]
|
|
||||||
dfs_nrows = max(dfs_nrows)
|
|
||||||
|
|
||||||
dfs_ncols = []
|
|
||||||
for df in dfs_combine:
|
|
||||||
dfs_ncols = dfs_ncols + [len(df.columns)]
|
|
||||||
dfs_ncols = max(dfs_ncols)
|
|
||||||
|
|
||||||
# dfs_ncols = []
|
|
||||||
# dfs_ncols2 = mode(dfs_ncols.append(len(df.columns) for df in dfs_combine)
|
|
||||||
# dfs_ncols2
|
|
||||||
|
|
||||||
expected_nrows = len(dfs_combine) * dfs_nrows
|
|
||||||
expected_ncols = dfs_ncols
|
|
||||||
|
|
||||||
common_cols = list(set.intersection(*(set(df.columns) for df in dfs_combine)))
|
|
||||||
|
|
||||||
if len(common_cols) == dfs_ncols :
|
|
||||||
combined_baseline = pd.concat([df[common_cols] for df in dfs_combine], ignore_index=True)
|
|
||||||
resampling_methods = combined_baseline[['Resampling', 'training_size']]
|
|
||||||
resampling_methods = resampling_methods.drop_duplicates()
|
|
||||||
print('\nConcatenating dfs with different resampling methods:', tts_split
|
|
||||||
, '\nNo. of dfs combining:', len(dfs_combine)
|
|
||||||
, '\nThe sampling methods are:'
|
|
||||||
, '\n', resampling_methods)
|
|
||||||
if len(combined_baseline) == expected_nrows and len(combined_baseline.columns) == expected_ncols:
|
|
||||||
print('\nPASS:', len(dfs_combine), 'dfs successfully combined'
|
|
||||||
, '\nnrows in combined_df:', len(combined_baseline)
|
|
||||||
, '\nncols in combined_df:', len(combined_baseline.columns))
|
|
||||||
else:
|
|
||||||
print('\nFAIL: concatenating failed'
|
|
||||||
, '\nExpected nrows:', expected_nrows
|
|
||||||
, '\nGot:', len(combined_baseline)
|
|
||||||
, '\nExpected ncols:', expected_ncols
|
|
||||||
, '\nGot:', len(combined_baseline.columns))
|
|
||||||
sys.exit()
|
|
||||||
else:
|
|
||||||
sys.exit('\nConcatenting dfs not possible,check numbers ')
|
|
||||||
|
|
||||||
# Add further column indications
|
|
||||||
combined_baseline['test_size'] = bts_size
|
|
||||||
combined_baseline['tts_split'] = tts_split
|
|
||||||
combined_baseline['testY_ratio'] = round(yc2_ratio,2)
|
|
||||||
#combined_baseline.columns
|
|
||||||
|
|
||||||
# change column names to lower case for consistency
|
|
||||||
combined_baseline.rename(columns = {'Resampling' : 'resampling'
|
|
||||||
, 'Data_source': 'data_source'}, inplace = True)
|
|
||||||
combined_baseline.columns
|
|
||||||
|
|
||||||
# sort df: resampling, data_source, score_order
|
|
||||||
combined_baseline.sort_values(by = ['resampling', 'data_source', 'score_order'], ascending = [True, True, True], inplace = True)
|
|
||||||
|
|
||||||
# # row bind
|
|
||||||
# if all(XXX):
|
|
||||||
# print('\nPASS:colnames match, proceeding to rowbind')
|
|
||||||
# comb_df = pd.concat()], axis = 0, ignore_index = True ) combined_baseline
|
|
||||||
###############################################################################
|
###############################################################################
|
||||||
#====================
|
#====================
|
||||||
# Write output file
|
# Write output file
|
||||||
#====================
|
#====================
|
||||||
combined_baseline_wf.to_csv(outFile_wf, index = False)
|
#out_wf_7030.to_csv(outFile_wf, index = False)
|
||||||
print('\nFile successfully written:', outFile_wf)
|
print('\nFile successfully written:', outFile_wf)
|
||||||
|
|
||||||
combined_baseline.to_csv(outFile_lf, index = False)
|
|
||||||
print('\nFile successfully written:', outFile_lf)
|
|
||||||
###############################################################################
|
###############################################################################
|
|
@ -9,6 +9,8 @@ Created on Mon Jun 20 13:05:23 2022
|
||||||
import re
|
import re
|
||||||
import argparse
|
import argparse
|
||||||
import os, sys
|
import os, sys
|
||||||
|
import collections
|
||||||
|
|
||||||
# gene = 'pncA'
|
# gene = 'pncA'
|
||||||
# drug = 'pyrazinamide'
|
# drug = 'pyrazinamide'
|
||||||
#total_mtblineage_uc = 8
|
#total_mtblineage_uc = 8
|
||||||
|
@ -25,6 +27,7 @@ import os, sys
|
||||||
###############################################################################
|
###############################################################################
|
||||||
homedir = os.path.expanduser("~")
|
homedir = os.path.expanduser("~")
|
||||||
sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml')
|
sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml')
|
||||||
|
|
||||||
###############################################################################
|
###############################################################################
|
||||||
#==================
|
#==================
|
||||||
# Import data
|
# Import data
|
||||||
|
@ -54,79 +57,70 @@ outdir_ml = outdir + 'ml/tts_7030/'
|
||||||
print('\nOutput directory:', outdir_ml)
|
print('\nOutput directory:', outdir_ml)
|
||||||
|
|
||||||
outFile_wf = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv'
|
outFile_wf = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv'
|
||||||
outFile_lf = outdir_ml + gene.lower() + '_baselineC_ext_' + OutFile_suffix + '.csv'
|
#outFile_lf = outdir_ml + gene.lower() + '_baselineC_ext_' + OutFile_suffix + '.csv'
|
||||||
|
|
||||||
#%% Running models ############################################################
|
#%% Running models ############################################################
|
||||||
print('\n#####################################################################\n'
|
print('\n#####################################################################\n'
|
||||||
, '\nRunning ML analysis: feature groups '
|
, '\nStarting--> Running ML analysis: Baseline modes (No FS)'
|
||||||
, '\nGene name:', gene
|
, '\nGene name:', gene
|
||||||
, '\nDrug name:', drug)
|
, '\nDrug name:', drug
|
||||||
|
, '\n#####################################################################\n')
|
||||||
|
|
||||||
fooD = {'baseline_paramD': {
|
paramD = {
|
||||||
'input_df': X
|
'baseline_paramD': { 'input_df' : X
|
||||||
, 'target': y
|
, 'target' : y
|
||||||
, 'var_type': 'mixed'
|
, 'var_type' : 'mixed'
|
||||||
, 'resampling_type': 'none'}
|
, 'resampling_type': 'none'}
|
||||||
,
|
|
||||||
'smnc_paramD': {'input_df': X_smnc
|
, 'smnc_paramD': { 'input_df' : X_smnc
|
||||||
, 'target': y_smnc
|
, 'target' : y_smnc
|
||||||
, 'var_type': 'mixed'
|
, 'var_type' : 'mixed'
|
||||||
, 'resampling_type': 'smnc'}
|
, 'resampling_type' : 'smnc'}
|
||||||
}
|
|
||||||
|
, 'ros_paramD': { 'input_df' : X_ros
|
||||||
|
, 'target' : y_ros
|
||||||
|
, 'var_type' : 'mixed'
|
||||||
|
, 'resampling_type' : 'ros'}
|
||||||
|
|
||||||
barD = {}
|
, 'rus_paramD' : { 'input_df' : X_rus
|
||||||
for k, v in fooD.items():
|
, 'target' : y_rus
|
||||||
#print(k)
|
, 'var_type' : 'mixed'
|
||||||
print(fooD[k])
|
, 'resampling_type' : 'rus'}
|
||||||
scores_7030D = MultModelsCl(**fooD[k]
|
|
||||||
|
, 'rouC_paramD' : { 'input_df' : X_rouC
|
||||||
|
, 'target' : y_rouC
|
||||||
|
, 'var_type' : 'mixed'
|
||||||
|
, 'resampling_type' : 'rouC'}
|
||||||
|
}
|
||||||
|
|
||||||
|
# Initial run to get the dict containing CV, BT and metadata DFs
|
||||||
|
mmD = {}
|
||||||
|
for k, v in paramD.items():
|
||||||
|
# print(fooD[k])
|
||||||
|
scores_7030D = MultModelsCl(**paramD[k]
|
||||||
, tts_split_type = tts_split_7030
|
, tts_split_type = tts_split_7030
|
||||||
, skf_cv = skf_cv
|
, skf_cv = skf_cv
|
||||||
, blind_test_df = X_bts
|
, blind_test_df = X_bts
|
||||||
, blind_test_target = y_bts
|
, blind_test_target = y_bts
|
||||||
, add_cm = True
|
, add_cm = True
|
||||||
, add_yn = True)
|
, add_yn = True
|
||||||
barD[k] = scores_7030D
|
, return_formatted_output = True)
|
||||||
|
mmD[k] = scores_7030D
|
||||||
|
|
||||||
|
for k, v in mmD.items():
|
||||||
ros_paramD = {input_df = X_ros
|
out_wf_7030 = pd.concat(mmD, ignore_index = True)
|
||||||
, target = y_ros
|
|
||||||
, var_type = 'mixed'
|
print('\n######################################################################'
|
||||||
, resampling_type = 'smnc'}
|
, '\nEnd--> Successfully generated output DF for Multiple classifiers (baseline models)'
|
||||||
|
, '\nGene:', gene.lower()
|
||||||
|
, '\nDrug:', drug
|
||||||
rus_paramD = {input_df = X_rus
|
, '\noutput file:', outFile_wf
|
||||||
, target = y_rus
|
, '\nDim of output:', out_wf_7030.shape
|
||||||
, var_type = 'mixed'
|
, '\n######################################################################')
|
||||||
, resampling_type = 'rus'}
|
|
||||||
|
|
||||||
|
|
||||||
rouC_paramD = {input_df = X_rouC
|
|
||||||
, target = y_rouC
|
|
||||||
, var_type = 'mixed'
|
|
||||||
, resampling_type = 'rouC'}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#====
|
|
||||||
scores_7030D = MultModelsCl(**rouC_paramD
|
|
||||||
, tts_split_type = tts_split_7030
|
|
||||||
, skf_cv = skf_cv
|
|
||||||
, blind_test_df = X_bts
|
|
||||||
, blind_test_target = y_bts
|
|
||||||
, add_cm = True
|
|
||||||
, add_yn = True)
|
|
||||||
|
|
||||||
###############################################################################
|
|
||||||
###############################################################################
|
|
||||||
#%% COMBINING all dfs: WF and LF
|
|
||||||
# https://stackoverflow.com/questions/39862654/pandas-concat-of-multiple-data-frames-using-only-common-columns
|
|
||||||
|
|
||||||
|
|
||||||
###############################################################################
|
###############################################################################
|
||||||
#====================
|
#====================
|
||||||
# Write output file
|
# Write output file
|
||||||
#====================
|
#====================
|
||||||
#combined_baseline_wf.to_csv(outFile_wf, index = False)
|
out_wf_7030.to_csv(outFile_wf, index = False)
|
||||||
#print('\nFile successfully written:', outFile_wf)
|
print('\nFile successfully written:', outFile_wf)
|
||||||
###############################################################################
|
###############################################################################
|
|
@ -11,6 +11,7 @@ time ./run_7030.py -g katG -d isoniazid 2>&1 | tee log_katg_7030.txt
|
||||||
time ./run_7030.py -g rpoB -d rifampicin 2>&1 | tee log_rpob_7030.txt
|
time ./run_7030.py -g rpoB -d rifampicin 2>&1 | tee log_rpob_7030.txt
|
||||||
time ./run_7030.py -g gid -d streptomycin 2>&1 | tee log_gid_7030.txt
|
time ./run_7030.py -g gid -d streptomycin 2>&1 | tee log_gid_7030.txt
|
||||||
time ./run_7030.py -g alr -d cycloserine 2>&1 | tee log_alr_7030.txt
|
time ./run_7030.py -g alr -d cycloserine 2>&1 | tee log_alr_7030.txt
|
||||||
|
|
||||||
# alr: # ERROR, as expected, too few values!
|
# alr: # ERROR, as expected, too few values!
|
||||||
# gid: problems
|
# gid: problems
|
||||||
########################################################################
|
########################################################################
|
||||||
|
@ -73,9 +74,11 @@ time ./run_7030.py -g alr -d cycloserine 2>&1 | tee log_alr_7030.txt
|
||||||
########################################################################
|
########################################################################
|
||||||
########################################################################
|
########################################################################
|
||||||
|
|
||||||
|
|
||||||
# running feature selection
|
# running feature selection
|
||||||
# Split:70/30
|
# Split:70/30
|
||||||
time ./run_FS.py -g pncA -d pyrazinamide 2>&1 | tee log_FS_pnca_7030.txt
|
time ./run_FS.py -g pncA -d pyrazinamide 2>&1 | tee log_FS_pnca_7030.txt
|
||||||
|
|
||||||
|
real 338m26.705s
|
||||||
|
user 1946m12.173s
|
||||||
|
sys 189m40.122s
|
||||||
|
|
||||||
|
|
|
@ -7,21 +7,24 @@ Created on Fri Jun 24 11:07:05 2022
|
||||||
"""
|
"""
|
||||||
import re
|
import re
|
||||||
import argparse
|
import argparse
|
||||||
|
import os, sys
|
||||||
###############################################################################
|
###############################################################################
|
||||||
# gene = 'pncA'
|
# gene = 'pncA'
|
||||||
# drug = 'pyrazinamide'
|
# drug = 'pyrazinamide'
|
||||||
#total_mtblineage_uc = 8
|
#total_mtblineage_uc = 8
|
||||||
|
|
||||||
#%% command line args: case sensitive
|
# #%% command line args: case sensitive
|
||||||
arg_parser = argparse.ArgumentParser()
|
# arg_parser = argparse.ArgumentParser()
|
||||||
arg_parser.add_argument('-d', '--drug', help = 'drug name', default = 'pncA')
|
# arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
|
||||||
arg_parser.add_argument('-g', '--gene', help = 'gene name', default = 'pyrazinamide')
|
# arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '')
|
||||||
args = arg_parser.parse_args()
|
# args = arg_parser.parse_args()
|
||||||
|
|
||||||
drug = args.drug
|
# drug = args.drug
|
||||||
gene = args.gene
|
# gene = args.gene
|
||||||
|
|
||||||
###############################################################################
|
###############################################################################
|
||||||
|
homedir = os.path.expanduser("~")
|
||||||
|
sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml')
|
||||||
|
|
||||||
###############################################################################
|
###############################################################################
|
||||||
#==================
|
#==================
|
||||||
|
@ -79,7 +82,7 @@ mmD = MultModelsCl(input_df = X_smnc
|
||||||
#================
|
#================
|
||||||
# MultModelsCl: WITH formatted output
|
# MultModelsCl: WITH formatted output
|
||||||
#================
|
#================
|
||||||
mmDF = MultModelsCl(input_df = X_smnc
|
mmDF3 = MultModelsCl(input_df = X_smnc
|
||||||
, target = y_smnc
|
, target = y_smnc
|
||||||
, var_type = 'mixed'
|
, var_type = 'mixed'
|
||||||
, tts_split_type = tts_split_7030
|
, tts_split_type = tts_split_7030
|
||||||
|
@ -96,4 +99,4 @@ mmDF = MultModelsCl(input_df = X_smnc
|
||||||
# test function
|
# test function
|
||||||
#=================
|
#=================
|
||||||
# output from function call
|
# output from function call
|
||||||
ProcessMultModelCl(smnc_scores_mmD)
|
ProcessMultModelsCl(mmD)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue