optimised run_7030.py to generate output from a dict now that the process function and parameter dicts have been added

Tanushree Tunstall 2022-06-24 15:40:18 +01:00
parent 7dc7e25016
commit b37a950fec
12 changed files with 180 additions and 128408 deletions
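At its core, the commit replaces five near-identical per-resampling blocks in run_7030.py with a parameter dict driving a single loop, then concatenates the per-run outputs. A minimal sketch of that pattern, with a stub standing in for MultModelsCl (the stub and its toy scores are illustrative, not the repo's implementation):

import pandas as pd

# Hypothetical stand-in for MultModelsCl(..., return_formatted_output = True),
# which per the diff below returns one formatted scores frame per call
def run_models(input_df, target, var_type, resampling_type):
    return pd.DataFrame({'Model_name': ['LR'], 'MCC': [0.5],
                         'Resampling': [resampling_type]})

paramD = {'baseline_paramD': {'input_df': None, 'target': None,
                              'var_type': 'mixed', 'resampling_type': 'none'},
          'smnc_paramD':     {'input_df': None, 'target': None,
                              'var_type': 'mixed', 'resampling_type': 'smnc'}}

mmD = {k: run_models(**v) for k, v in paramD.items()}
out_wf_7030 = pd.concat(mmD, ignore_index = True)  # one long df across resamplings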


@@ -197,35 +197,35 @@ def MultModelsCl(input_df, target, skf_cv
# Specify multiple Classification Models
#======================================================
models = [('AdaBoost Classifier' , AdaBoostClassifier(**rs) )
# , ('Bagging Classifier' , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) )
# , ('Decision Tree' , DecisionTreeClassifier(**rs) )
# , ('Extra Tree' , ExtraTreeClassifier(**rs) )
# , ('Extra Trees' , ExtraTreesClassifier(**rs) )
# , ('Gradient Boosting' , GradientBoostingClassifier(**rs) )
# , ('Gaussian NB' , GaussianNB() )
# , ('Gaussian Process' , GaussianProcessClassifier(**rs) )
# , ('K-Nearest Neighbors' , KNeighborsClassifier() )
# , ('LDA' , LinearDiscriminantAnalysis() )
, ('Bagging Classifier' , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) )
, ('Decision Tree' , DecisionTreeClassifier(**rs) )
, ('Extra Tree' , ExtraTreeClassifier(**rs) )
, ('Extra Trees' , ExtraTreesClassifier(**rs) )
, ('Gradient Boosting' , GradientBoostingClassifier(**rs) )
, ('Gaussian NB' , GaussianNB() )
, ('Gaussian Process' , GaussianProcessClassifier(**rs) )
, ('K-Nearest Neighbors' , KNeighborsClassifier() )
, ('LDA' , LinearDiscriminantAnalysis() )
, ('Logistic Regression' , LogisticRegression(**rs) )
# , ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs))
# , ('MLP' , MLPClassifier(max_iter = 500, **rs) )
# , ('Multinomial' , MultinomialNB() )
# , ('Naive Bayes' , BernoulliNB() )
# , ('Passive Aggressive' , PassiveAggressiveClassifier(**rs, **njobs) )
# , ('QDA' , QuadraticDiscriminantAnalysis() )
# , ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000 ) )
# , ('Random Forest2' , RandomForestClassifier(min_samples_leaf = 5
# , n_estimators = 1000
# , bootstrap = True
# , oob_score = True
# , **njobs
# , **rs
# , max_features = 'auto') )
# , ('Ridge Classifier' , RidgeClassifier(**rs) )
# , ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) )
# , ('SVC' , SVC(**rs) )
# , ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) )
# , ('XGBoost' , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False) )
, ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs))
, ('MLP' , MLPClassifier(max_iter = 500, **rs) )
, ('Multinomial' , MultinomialNB() )
, ('Naive Bayes' , BernoulliNB() )
, ('Passive Aggressive' , PassiveAggressiveClassifier(**rs, **njobs) )
, ('QDA' , QuadraticDiscriminantAnalysis() )
, ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000 ) )
, ('Random Forest2' , RandomForestClassifier(min_samples_leaf = 5
, n_estimators = 1000
, bootstrap = True
, oob_score = True
, **njobs
, **rs
, max_features = 'auto') )
, ('Ridge Classifier' , RidgeClassifier(**rs) )
, ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) )
, ('SVC' , SVC(**rs) )
, ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) )
, ('XGBoost' , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False) )
]
mm_skf_scoresD = {}
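The hunk above uncomments the full classifier roster so every model runs. The loop that consumes these (name, estimator) pairs sits below this hunk; a minimal sketch of that pattern under stated assumptions (toy data, two models, sklearn's cross_validate rather than the repo's exact loop body):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples = 200, random_state = 42)
rs = {'random_state': 42}
skf_cv = StratifiedKFold(n_splits = 10, shuffle = True, **rs)

models = [('Logistic Regression', LogisticRegression(**rs))
          , ('Decision Tree', DecisionTreeClassifier(**rs))]

mm_skf_scores_sketch = {}
for model_name, model_fn in models:
    skf_scores = cross_validate(model_fn, X, y, cv = skf_cv
                                , scoring = ['accuracy', 'matthews_corrcoef'])
    # store mean CV scores keyed by model name, as mm_skf_scoresD suggests
    mm_skf_scores_sketch[model_name] = {k: v.mean() for k, v in skf_scores.items()}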
@@ -440,10 +440,11 @@ def ProcessMultModelsCl(inputD = {}):
, '\nCV df:', len(scoresDF_CV.columns)
, '\nBT_df:', len(scoresDF_BT.columns)
, '\nmetaDF:', len(metaDF.columns))
if len(scoresDF_CV.columns) == len(scoresDF_BT.columns):
print('\nFirst proceeding to rowbind CV and BT dfs:')
expected_ncols_out = len(scoresDF_BT.columns) + len(metaDF.columns)
print('\nFinal output should have:',expected_ncols_out, 'columns' )
print('\nFinal output should have:', expected_ncols_out, 'columns' )
#-----------------
# Combine WF
@@ -496,8 +497,7 @@ def ProcessMultModelsCl(inputD = {}):
sys.exit('\nFIRST IF FAILS')
else:
print('\nConcatenating dfs not possible [WF], check numbers ')
#-------------------------------------
# Combine WF+Metadata: Final output
#-------------------------------------
@@ -515,11 +515,15 @@ def ProcessMultModelsCl(inputD = {}):
print('\nPASS: Combined df has expected ncols')
else:
sys.exit('\nFAIL: Length mismatch for combined_df')
print('\nAdding column: Model_name')
combDF['Model_name'] = combDF.index
print('\n========================================================='
, '\nSUCCESS: Ran multiple classifiers'
, '\n=======================================================')
#resampling_methods_wf = combined_baseline_wf[['resampling']]
#resampling_methods_wf = resampling_methods_wf.drop_duplicates()
#, '\n', resampling_methods_wf)
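Taken together, the hunks above make ProcessMultModelsCl row-bind the CV and BT score frames, column-bind the metadata, and record each model's name from the index. A minimal sketch of that combine step, with toy frames standing in for scoresDF_CV, scoresDF_BT and metaDF (assumption: all three are indexed by model name):

import pandas as pd

scoresDF_CV = pd.DataFrame({'MCC': [0.50], 'source_data': ['CV']}, index = ['LR'])
scoresDF_BT = pd.DataFrame({'MCC': [0.45], 'source_data': ['BT']}, index = ['LR'])
metaDF      = pd.DataFrame({'n_features': [174]}, index = ['LR'])

if len(scoresDF_CV.columns) == len(scoresDF_BT.columns):
    print('\nFirst proceeding to rowbind CV and BT dfs:')
    combDF = pd.concat([scoresDF_CV, scoresDF_BT], axis = 0)  # rowbind WF
    combDF = combDF.join(metaDF)                              # add metadata per model
    expected_ncols_out = len(scoresDF_BT.columns) + len(metaDF.columns)
    assert len(combDF.columns) == expected_ncols_out
    combDF['Model_name'] = combDF.index                       # as in the hunk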


@@ -1,72 +0,0 @@
/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_7030.py:464: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
1.22.4
1.4.1
aaindex_df contains non-numerical data
Total no. of non-numerial columns: 2
Selecting numerical data only
PASS: successfully selected numerical columns only for aaindex_df
Now checking for NA in the remaining aaindex_cols
Counting aaindex_df cols with NA
ncols with NA: 4 columns
Dropping these...
Original ncols: 127
Revised df ncols: 123
Checking NA in revised df...
PASS: cols with NA successfully dropped from aaindex_df
Proceeding with combining aa_df with other features_df
PASS: ncols match
Expected ncols: 123
Got: 123
Total no. of columns in clean aa_df: 123
Proceeding to merge, expected nrows in merged_df: 271
PASS: my_features_df and aa_df successfully combined
nrows: 271
ncols: 269
count of NULL values before imputation
or_mychisq 256
log10_or_mychisq 256
dtype: int64
count of NULL values AFTER imputation
mutationinformation 0
or_rawI 0
logorI 0
dtype: int64
PASS: OR values imputed, data ready for ML
Total no. of features for aaindex: 123
PASS: x_features has no target variable
No. of columns for x_features: 174
Traceback (most recent call last):
File "/home/tanu/git/LSHTM_analysis/scripts/ml/./run_7030.py", line 35, in <module>
setvars(gene,drug)
File "/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_7030.py", line 636, in setvars
X, X_bts, y, y_bts = train_test_split(x_features, y_target
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/sklearn/model_selection/_split.py", line 2454, in train_test_split
train, test = next(cv.split(X=arrays[0], y=stratify))
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/sklearn/model_selection/_split.py", line 1613, in split
for train, test in self._iter_indices(X, y, groups):
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/sklearn/model_selection/_split.py", line 1953, in _iter_indices
raise ValueError(
ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.
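The deleted log above ends in a real failure mode worth noting: train_test_split raises this ValueError whenever stratify is given and the rarest class has fewer than two members. A small self-contained guard of the kind that could precede the split (a sketch, not the repo's actual fix):

from collections import Counter
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples = 100, weights = [0.9], random_state = 0)

rarest_n = min(Counter(y).values())
if rarest_n < 2:
    # stratified splitting needs at least 2 members per class
    raise SystemExit('Too few members in rarest class: ' + str(rarest_n))
X_tr, X_bts, y_tr, y_bts = train_test_split(X, y, test_size = 0.3
                                            , stratify = y, random_state = 42)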

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -557,7 +557,7 @@ def setvars(gene,drug):
# FG5: Genomic features
#========================
X_gn_mafor_Fnum = ['maf'
, 'logorI'
#, 'logorI'
# , 'or_rawI'
# , 'or_mychisq'
# , 'or_logistic'

635 scripts/ml/run_7030.py Executable file → Normal file

@@ -5,28 +5,28 @@ Created on Mon Jun 20 13:05:23 2022
@author: tanu
"""
#%%Imports ####################################################################
import re
import argparse
###############################################################################
import os, sys
# gene = 'pncA'
# drug = 'pyrazinamide'
#total_mtblineage_uc = 8
###############################################################################
#%% command line args: case sensitive
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '')
args = arg_parser.parse_args()
# arg_parser = argparse.ArgumentParser()
# arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
# arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '')
# args = arg_parser.parse_args()
drug = args.drug
gene = args.gene
# drug = args.drug
# gene = args.gene
###############################################################################
#==================
# other vars
#==================
tts_split = '70/30'
OutFile_suffix = '7030'
homedir = os.path.expanduser("~")
sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml')
###############################################################################
#==================
# Import data
@@ -39,10 +39,15 @@ from ml_data_7030 import *
#from UQ_yc_RunAllClfs import run_all_ML
#====================
# Import ML function
# Import ML functions
#====================
# TT run all ML clfs: baseline model
from MultModelsCl import MultModelsCl
from MultClfs import *
#==================
# other vars
#==================
tts_split_7030 = '70_30'
OutFile_suffix = '7030'
#==================
# Specify outdir
@@ -51,563 +56,71 @@ outdir_ml = outdir + 'ml/tts_7030/'
print('\nOutput directory:', outdir_ml)
outFile_wf = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv'
outFile_lf = outdir_ml + gene.lower() + '_baselineC_ext_' + OutFile_suffix + '.csv'
#outFile_lf = outdir_ml + gene.lower() + '_baselineC_ext_' + OutFile_suffix + '.csv'
###############################################################################
score_type_ordermapD = { 'mcc' : 1
, 'fscore' : 2
, 'jcc' : 3
, 'precision' : 4
, 'recall' : 5
, 'accuracy' : 6
, 'roc_auc' : 7
, 'TN' : 8
, 'FP' : 9
, 'FN' : 10
, 'TP' : 11
, 'trainingY_neg': 12
, 'trainingY_pos': 13
, 'blindY_neg' : 14
, 'blindY_pos' : 15
, 'fit_time' : 16
, 'score_time' : 17
}
scoreCV_mapD = {'test_mcc' : 'MCC'
, 'test_fscore' : 'F1'
, 'test_precision' : 'Precision'
, 'test_recall' : 'Recall'
, 'test_accuracy' : 'Accuracy'
, 'test_roc_auc' : 'ROC_AUC'
, 'test_jcc' : 'JCC'
}
scoreBT_mapD = {'bts_mcc' : 'MCC'
, 'bts_fscore' : 'F1'
, 'bts_precision' : 'Precision'
, 'bts_recall' : 'Recall'
, 'bts_accuracy' : 'Accuracy'
, 'bts_roc_auc' : 'ROC_AUC'
, 'bts_jcc' : 'JCC'
}
# data dependent variables but NOT dependent on resampling
bts_size = len(X_bts)
yc2 = Counter(y_bts)
yc2_ratio = yc2[0]/yc2[1]
###############################################################################
#%% Running models ############################################################
print('\n#####################################################################\n'
, '\nRunning ML analysis: feature groups '
, '\nStarting--> Running ML analysis: Baseline models (No FS)'
, '\nGene name:', gene
, '\nDrug name:', drug)
, '\nDrug name:', drug
, '\n#####################################################################\n')
#%% Basic: No Oversampling
#================
# Baseline
# No resampling
#================
scores_mmD = MultModelsCl(input_df = X
, target = y
, var_type = 'mixed'
, skf_cv = skf_cv
, blind_test_df = X_bts
, blind_test_target = y_bts
, add_cm = True
, add_yn = True)
baseline_all_scores = pd.DataFrame(scores_mmD)
rs_none = 'none'
#------------------------
# WF: only CV and BTS
#-----------------------
baseline_allT = baseline_all_scores.T
#baseline_train = baseline_all.filter(regex='train_', axis=1)
baseline_CV = baseline_allT.filter(regex='test_', axis = 1); baseline_CV.columns
# map colnames for consistency to allow concatenating
baseline_CV.columns = baseline_CV.columns.map(scoreCV_mapD); baseline_CV.columns
baseline_CV['Data_source'] = 'CV'
baseline_CV['Resampling'] = rs_none
baseline_BT = baseline_allT.filter(regex='bts_', axis = 1); baseline_BT.columns
# map colnames for consistency to allow concatenating
baseline_BT.columns = baseline_BT.columns.map(scoreBT_mapD); baseline_BT.columns
baseline_BT['Data_source'] = 'BT'
baseline_BT['Resampling'] = rs_none
# # Write csv
#baseline_CV.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
#baseline_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
# baseline_CV.to_csv(outdir_ml + gene.lower() + '_baseline_CV_allF.csv')
# baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')
#----------------------------------
# LF*: CV + BTS + Other info
#-----------------------------------
# other data dependent variables
training_size_ns = len(X)
n_features = len(X.columns)
yc1 = Counter(y)
yc1_ratio = yc1[0]/yc1[1]
baseline_all = baseline_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
baseline_all = baseline_all.reset_index()
baseline_all.rename(columns = {'index': 'original_names'}, inplace = True)
# Indicate whether BT or CV
bt_pattern = re.compile(r'bts_.*')
baseline_all['Data_source'] = baseline_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
baseline_all['score_type'] = baseline_all['original_names'].str.replace('bts_|test_', '', regex = True)
score_type_uniqueN = set(baseline_all['score_type'])
cL1 = list(score_type_ordermapD.keys())
cL2 = list(score_type_uniqueN)
if set(cL1).issubset(cL2):
print('\nPASS: sorting df by score that is mapped onto the order I want')
baseline_all['score_order'] = baseline_all['score_type'].map(score_type_ordermapD)
baseline_all.sort_values(by = ['Data_source', 'score_order'], ascending = [True, True], inplace = True)
else:
sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
# add cols: specific
baseline_all['Resampling'] = rs_none
baseline_all['training_size'] = training_size_ns
baseline_all['trainingY_ratio']= round(yc1_ratio,2)
baseline_all['n_features'] = n_features
###############################################################################
#%% SMOTE NC: Smote Oversampling [Numerical + categorical]
#================
# Baseline
# SMOTE NC: SMNC
#================
smnc_scores_mmD = MultModelsCl(input_df = X_smnc
, target = y_smnc
, var_type = 'mixed'
, skf_cv = skf_cv
, blind_test_df = X_bts
, blind_test_target = y_bts
, add_cm = True
, add_yn = True)
smnc_all_scores = pd.DataFrame(smnc_scores_mmD)
rs_smnc = 'smnc'
#------------------------
# WF: only CV and BTS
#-----------------------
smnc_allT = smnc_all_scores.T
smnc_CV = smnc_allT.filter(regex='test_', axis = 1); smnc_CV.columns
# map colnames for consistency to allow concatenating
smnc_CV.columns = smnc_CV.columns.map(scoreCV_mapD); smnc_CV.columns
smnc_CV['Data_source'] = 'CV'
smnc_CV['Resampling'] = rs_smnc
smnc_BT = smnc_allT.filter(regex='bts_', axis = 1); smnc_BT.columns
# map colnames for consistency to allow concatenating
smnc_BT.columns = smnc_BT.columns.map(scoreBT_mapD); smnc_BT.columns
smnc_BT['Data_source'] = 'BT'
smnc_BT['Resampling'] = rs_smnc
# Write csv
# smnc_CV.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
# smnc_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
# smnc_CV.to_csv(outdir_ml + gene.lower() + '_smnc_CV_allF.csv')
# smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')
#----------------------------------
# LF*: CV + BTS + Other info
#-----------------------------------
# other data dependent variables
training_size_smnc = len(X_smnc)
n_features = len(X_smnc.columns)
yc1_smnc = Counter(y_smnc)
yc1_ratio_smnc = yc1_smnc[0]/yc1_smnc[1]
smnc_all = smnc_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
smnc_all = smnc_all.reset_index()
smnc_all.rename(columns = {'index': 'original_names'}, inplace = True)
# Indicate whether BT or CV
bt_pattern = re.compile(r'bts_.*')
smnc_all['Data_source'] = smnc_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
smnc_all['score_type'] = smnc_all['original_names'].str.replace('bts_|test_', '', regex = True)
score_type_uniqueN = set(smnc_all['score_type'])
cL1 = list(score_type_ordermapD.keys())
cL2 = list(score_type_uniqueN)
if set(cL1).issubset(cL2):
print('\nPASS: sorting df by score that is mapped onto the order I want')
smnc_all['score_order'] = smnc_all['score_type'].map(score_type_ordermapD)
smnc_all.sort_values(by = ['Data_source', 'score_order'], ascending = [True, True], inplace = True)
else:
sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
# add cols: specific
smnc_all['Resampling'] = rs_smnc
smnc_all['training_size'] = training_size_smnc
smnc_all['trainingY_ratio'] = round(yc1_ratio_smnc,2)
smnc_all['n_features'] = n_features
###############################################################################
#%% ROS: Random Over Sampling [Numerical + categorical]
#================
# Baseline
# ROS
#================
ros_scores_mmD = MultModelsCl(input_df = X_ros
, target = y_ros
, var_type = 'mixed'
, skf_cv = skf_cv
, blind_test_df = X_bts
, blind_test_target = y_bts
, add_cm = True
, add_yn = True)
ros_all_scores = pd.DataFrame(ros_scores_mmD)
rs_ros = 'ros'
#------------------------
# WF: only CV and BTS
#-----------------------
ros_allT = ros_all_scores.T
ros_CV = ros_allT.filter(regex='test_', axis = 1); ros_CV.columns
# map colnames for consistency to allow concatenating
ros_CV.columns = ros_CV.columns.map(scoreCV_mapD); ros_CV.columns
ros_CV['Data_source'] = 'CV'
ros_CV['Resampling'] = rs_ros
ros_BT = ros_allT.filter(regex='bts_', axis = 1); ros_BT.columns
# map colnames for consistency to allow concatenating
ros_BT.columns = ros_BT.columns.map(scoreBT_mapD); ros_BT.columns
ros_BT['Data_source'] = 'BT'
ros_BT['Resampling'] = rs_ros
# Write csv
# ros_CV.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
# ros_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
# ros_CV.to_csv(outdir_ml + gene.lower() + '_ros_CV_allF.csv')
# ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')
#----------------------------------
# LF*: CV + BTS + Other info
#----------------------------------
# other data dependent variables
training_size_ros = len(X_ros)
n_features = len(X_ros.columns)
yc1_ros = Counter(y_ros)
yc1_ratio_ros = yc1_ros[0]/yc1_ros[1]
ros_all = ros_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
ros_all = ros_all.reset_index()
ros_all.rename(columns = {'index': 'original_names'}, inplace = True)
# Indicate whether BT or CV
bt_pattern = re.compile(r'bts_.*')
ros_all['Data_source'] = ros_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
ros_all['score_type'] = ros_all['original_names'].str.replace('bts_|test_', '', regex = True)
score_type_uniqueN = set(ros_all['score_type'])
cL1 = list(score_type_ordermapD.keys())
cL2 = list(score_type_uniqueN)
if set(cL1).issubset(cL2):
print('\nPASS: sorting df by score that is mapped onto the order I want')
ros_all['score_order'] = ros_all['score_type'].map(score_type_ordermapD)
ros_all.sort_values(by = ['Data_source', 'score_order'], ascending = [True, True], inplace = True)
else:
sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
# add cols: specific
ros_all['Resampling'] = rs_ros
ros_all['training_size'] = training_size_ros
ros_all['trainingY_ratio'] = round(yc1_ratio_ros,2)
ros_all['n_features'] = n_features
###############################################################################
#%% RUS: Random Under Sampling [Numerical + categorical]
#================
# Baseline
# RUS
#================
rus_scores_mmD = MultModelsCl(input_df = X_rus
, target = y_rus
, var_type = 'mixed'
, skf_cv = skf_cv
, blind_test_df = X_bts
, blind_test_target = y_bts
, add_cm = True
, add_yn = True)
rus_all_scores = pd.DataFrame(rus_scores_mmD)
rs_rus = 'rus'
#-----------------------
# WF: only CV and BTS
#-----------------------
rus_allT = rus_all_scores.T
rus_CV = rus_allT.filter(regex='test_', axis=1); rus_CV.columns
# map colnames for consistency to allow concatenating
rus_CV.columns = rus_CV.columns.map(scoreCV_mapD); rus_CV.columns
rus_CV['Data_source'] = 'CV'
rus_CV['Resampling'] = rs_rus
rus_BT = rus_allT.filter(regex='bts_' , axis=1); rus_BT.columns
# map colnames for consistency to allow concatenating
rus_BT.columns = rus_BT.columns.map(scoreBT_mapD); rus_BT.columns
rus_BT['Data_source'] = 'BT'
rus_BT['Resampling'] = rs_rus
# # Write csv
# rus_CV.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
# rus_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
# rus_CV.to_csv(outdir_ml + gene.lower() + '_rus_CV_allF.csv')
# rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')
#----------------------------------
# LF*: CV + BTS + Other info
#----------------------------------
# other data dependent variables
training_size_rus = len(X_rus)
n_features = len(X_rus.columns)
yc1_rus = Counter(y_rus)
yc1_ratio_rus = yc1_rus[0]/yc1_rus[1]
rus_all = rus_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
rus_all = rus_all.reset_index()
rus_all.rename(columns = {'index': 'original_names'}, inplace = True)
# Indicate whether BT or CV
bt_pattern = re.compile(r'bts_.*')
rus_all['Data_source'] = rus_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
rus_all['score_type'] = rus_all['original_names'].str.replace('bts_|test_', '', regex = True)
score_type_uniqueN = set(rus_all['score_type'])
cL1 = list(score_type_ordermapD.keys())
cL2 = list(score_type_uniqueN)
if set(cL1).issubset(cL2):
print('\nPASS: sorting df by score that is mapped onto the order I want')
rus_all['score_order'] = rus_all['score_type'].map(score_type_ordermapD)
rus_all.sort_values(by = ['Data_source', 'score_order'], ascending = [True, True], inplace = True)
else:
sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
# add cols: specific
rus_all['Resampling'] = rs_rus
rus_all['training_size'] = training_size_rus
rus_all['trainingY_ratio'] = round(yc1_ratio_rus,2)
rus_all['n_features'] = n_features
###############################################################################
#%% ROS+RUS Combined: [Numerical + categorical]
#================
# Baseline
# ROUC
#================
rouC_scores_mmD = MultModelsCl(input_df = X_rouC
, target = y_rouC
, var_type = 'mixed'
, skf_cv = skf_cv
, blind_test_df = X_bts
, blind_test_target = y_bts
, add_cm = True
, add_yn = True)
rouC_all_scores = pd.DataFrame(rouC_scores_mmD)
rs_rouC = 'rouC'
#-----------------------
# WF: only CV and BTS
#-----------------------
rouC_allT = rouC_all_scores.T
rouC_CV = rouC_allT.filter(regex='test_', axis=1); rouC_CV.columns
# map colnames for consistency to allow concatenating
rouC_CV.columns = rouC_CV.columns.map(scoreCV_mapD); rouC_CV.columns
rouC_CV['Data_source'] = 'CV'
rouC_CV['Resampling'] = rs_rouC
rouC_BT = rouC_allT.filter(regex='bts_', axis=1); rouC_BT.columns
# map colnames for consistency to allow concatenating
rouC_BT.columns = rouC_BT.columns.map(scoreBT_mapD); rouC_BT.columns
rouC_BT['Data_source'] = 'BT'
rouC_BT['Resampling'] = rs_rouC
# Write csv
# rouC_CV.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
# rouC_CV.to_csv(outdir_ml + gene.lower() + '_rouC_CV_allF.csv')
# rouC_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
# rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')
#----------------------------------
# LF*: CV + BTS + Other info
#----------------------------------
# other data dependent variables
training_size_rouC = len(X_rouC)
n_features = len(X_rouC.columns)
yc1_rouC = Counter(y_rouC)
yc1_ratio_rouC = yc1_rouC[0]/yc1_rouC[1]
rouC_all = rouC_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
rouC_all = rouC_all.reset_index()
rouC_all.rename(columns = {'index': 'original_names'}, inplace = True)
# Indicate whether BT or CV
bt_pattern = re.compile(r'bts_.*')
rouC_all['Data_source'] = rouC_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
rouC_all['score_type'] = rouC_all['original_names'].str.replace('bts_|test_', '', regex = True)
score_type_uniqueN = set(rouC_all['score_type'])
cL1 = list(score_type_ordermapD.keys())
cL2 = list(score_type_uniqueN)
if set(cL1).issubset(cL2):
print('\nPASS: sorting df by score that is mapped onto the order I want')
rouC_all['score_order'] = rouC_all['score_type'].map(score_type_ordermapD)
rouC_all.sort_values(by = ['Data_source', 'score_order'], ascending = [True, True], inplace = True)
else:
sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
# add cols: specific
rouC_all['Resampling'] = rs_rouC
rouC_all['training_size'] = training_size_rouC
rouC_all['trainingY_ratio'] = round(yc1_ratio_rouC,2)
rouC_all['n_features'] = n_features
###############################################################################
#%% COMBINING all dfs: WF and LF
# https://stackoverflow.com/questions/39862654/pandas-concat-of-multiple-data-frames-using-only-common-columns
#%% Combine WF
#-----------------
# Combine WF
#-----------------
dfs_combine_wf = [baseline_BT, smnc_BT, ros_BT, rus_BT, rouC_BT,
baseline_CV, smnc_CV, ros_CV, rus_CV, rouC_CV]
dfs_nrows_wf = []
for df in dfs_combine_wf:
dfs_nrows_wf = dfs_nrows_wf + [len(df)]
dfs_nrows_wf = max(dfs_nrows_wf)
paramD = {
'baseline_paramD': { 'input_df' : X
, 'target' : y
, 'var_type' : 'mixed'
, 'resampling_type': 'none'}
, 'smnc_paramD': { 'input_df' : X_smnc
, 'target' : y_smnc
, 'var_type' : 'mixed'
, 'resampling_type' : 'smnc'}
dfs_ncols_wf = []
for df in dfs_combine_wf:
dfs_ncols_wf = dfs_ncols_wf + [len(df.columns)]
dfs_ncols_wf = max(dfs_ncols_wf)
, 'ros_paramD': { 'input_df' : X_ros
, 'target' : y_ros
, 'var_type' : 'mixed'
, 'resampling_type' : 'ros'}
expected_nrows_wf = len(dfs_combine_wf) * dfs_nrows_wf
expected_ncols_wf = dfs_ncols_wf
, 'rus_paramD' : { 'input_df' : X_rus
, 'target' : y_rus
, 'var_type' : 'mixed'
, 'resampling_type' : 'rus'}
common_cols_wf = list(set.intersection(*(set(df.columns) for df in dfs_combine_wf)))
, 'rouC_paramD' : { 'input_df' : X_rouC
, 'target' : y_rouC
, 'var_type' : 'mixed'
, 'resampling_type' : 'rouC'}
}
if len(common_cols_wf) == dfs_ncols_wf :
combined_baseline_wf = pd.concat([df[common_cols_wf] for df in dfs_combine_wf], ignore_index=False)
resampling_methods_wf = combined_baseline_wf[['Resampling']]
resampling_methods_wf = resampling_methods_wf.drop_duplicates()
print('\nConcatenating dfs with different resampling methods [WF]:', tts_split
, '\nNo. of dfs combining:', len(dfs_combine_wf)
, '\nThe sampling methods are:'
, '\n', resampling_methods_wf)
if len(combined_baseline_wf) == expected_nrows_wf and len(combined_baseline_wf.columns) == expected_ncols_wf:
print('\nPASS:', len(dfs_combine_wf), 'dfs successfully combined'
, '\nnrows in combined_df_wf:', len(combined_baseline_wf)
, '\nncols in combined_df_wf:', len(combined_baseline_wf.columns))
else:
print('\nFAIL: concatenating failed'
, '\nExpected nrows:', expected_nrows_wf
, '\nGot:', len(combined_baseline_wf)
, '\nExpected ncols:', expected_ncols_wf
, '\nGot:', len(combined_baseline_wf.columns))
sys.exit()
else:
sys.exit('\nConcatenating dfs not possible [WF], check numbers ')
# Initial run to get the dict containing CV, BT and metadata DFs
mmD = {}
for k, v in paramD.items():
# print(mmD[k])
scores_7030D = MultModelsCl(**paramD[k]
, tts_split_type = tts_split_7030
, skf_cv = skf_cv
, blind_test_df = X_bts
, blind_test_target = y_bts
, add_cm = True
, add_yn = True
, return_formatted_output = True)
mmD[k] = scores_7030D
# Extracting the dfs from within the dict and concatenating to output as one df
for k, v in mmD.items():
out_wf_7030 = pd.concat(mmD, ignore_index = True)
# Add index as a column
combined_baseline_wf.columns
combined_baseline_wf = combined_baseline_wf.reset_index()
combined_baseline_wf.rename(columns = {'index': 'Model_name'}, inplace = True)
combined_baseline_wf.head()
# sort df: Resampling, Data_source, and MCC
combined_baseline_wf.sort_values(by = ['Resampling', 'Data_source', 'MCC'], ascending = [True, True, False], inplace = True)
##############################################################################
#%% Combine LF
#-----------------
# Combine LF*
#-----------------
dfs_combine = [baseline_all, smnc_all, ros_all, rus_all, rouC_all ]
dfs_nrows = []
for df in dfs_combine:
dfs_nrows = dfs_nrows + [len(df)]
dfs_nrows = max(dfs_nrows)
dfs_ncols = []
for df in dfs_combine:
dfs_ncols = dfs_ncols + [len(df.columns)]
dfs_ncols = max(dfs_ncols)
# dfs_ncols = []
# dfs_ncols2 = mode(dfs_ncols.append(len(df.columns) for df in dfs_combine)
# dfs_ncols2
expected_nrows = len(dfs_combine) * dfs_nrows
expected_ncols = dfs_ncols
common_cols = list(set.intersection(*(set(df.columns) for df in dfs_combine)))
if len(common_cols) == dfs_ncols :
combined_baseline = pd.concat([df[common_cols] for df in dfs_combine], ignore_index=True)
resampling_methods = combined_baseline[['Resampling', 'training_size']]
resampling_methods = resampling_methods.drop_duplicates()
print('\nConcatenating dfs with different resampling methods:', tts_split
, '\nNo. of dfs combining:', len(dfs_combine)
, '\nThe sampling methods are:'
, '\n', resampling_methods)
if len(combined_baseline) == expected_nrows and len(combined_baseline.columns) == expected_ncols:
print('\nPASS:', len(dfs_combine), 'dfs successfully combined'
, '\nnrows in combined_df:', len(combined_baseline)
, '\nncols in combined_df:', len(combined_baseline.columns))
else:
print('\nFAIL: concatenating failed'
, '\nExpected nrows:', expected_nrows
, '\nGot:', len(combined_baseline)
, '\nExpected ncols:', expected_ncols
, '\nGot:', len(combined_baseline.columns))
sys.exit()
else:
sys.exit('\nConcatenating dfs not possible, check numbers ')
# Add further column indications
combined_baseline['test_size'] = bts_size
combined_baseline['tts_split'] = tts_split
combined_baseline['testY_ratio'] = round(yc2_ratio,2)
#combined_baseline.columns
# change to column names to be lower case for consistency
combined_baseline.rename(columns = {'Resampling' : 'resampling'
, 'Data_source': 'data_source'}, inplace = True)
combined_baseline.columns
# sort df: resampling, data_source, mcc
combined_baseline.sort_values(by = ['resampling', 'data_source', 'score_order'], ascending = [True, True, True], inplace = True)
# # row bind
# if all(XXX):
# print('\nPASS:colnames match, proceeding to rowbind')
# comb_df = pd.concat()], axis = 0, ignore_index = True ) combined_baseline
print('\n######################################################################'
, '\nEnd--> Successfully generated output DF for Multiple classifiers (baseline models)'
, '\nGene:', gene.lower()
, '\nDrug:', drug
, '\noutput file:', outFile_wf
, '\nDim of output:', out_wf_7030.shape
, '\n######################################################################')
###############################################################################
#====================
# Write output file
#====================
combined_baseline_wf.to_csv(outFile_wf, index = False)
#out_wf_7030.to_csv(outFile_wf, index = False)
print('\nFile successfully written:', outFile_wf)
combined_baseline.to_csv(outFile_lf, index = False)
print('\nFile successfully written:', outFile_lf)
###############################################################################


@@ -9,6 +9,8 @@ Created on Mon Jun 20 13:05:23 2022
import re
import argparse
import os, sys
import collections
# gene = 'pncA'
# drug = 'pyrazinamide'
#total_mtblineage_uc = 8
@@ -25,6 +27,7 @@ import os, sys
###############################################################################
homedir = os.path.expanduser("~")
sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml')
###############################################################################
#==================
# Import data
@@ -54,79 +57,70 @@ outdir_ml = outdir + 'ml/tts_7030/'
print('\nOutput directory:', outdir_ml)
outFile_wf = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv'
outFile_lf = outdir_ml + gene.lower() + '_baselineC_ext_' + OutFile_suffix + '.csv'
#outFile_lf = outdir_ml + gene.lower() + '_baselineC_ext_' + OutFile_suffix + '.csv'
#%% Running models ############################################################
print('\n#####################################################################\n'
, '\nRunning ML analysis: feature groups '
, '\nStarting--> Running ML analysis: Baseline models (No FS)'
, '\nGene name:', gene
, '\nDrug name:', drug)
, '\nDrug name:', drug
, '\n#####################################################################\n')
fooD = {'baseline_paramD': {
'input_df': X
, 'target': y
, 'var_type': 'mixed'
, 'resampling_type': 'none'}
,
'smnc_paramD': {'input_df': X_smnc
, 'target': y_smnc
, 'var_type': 'mixed'
, 'resampling_type': 'smnc'}
}
paramD = {
'baseline_paramD': { 'input_df' : X
, 'target' : y
, 'var_type' : 'mixed'
, 'resampling_type': 'none'}
, 'smnc_paramD': { 'input_df' : X_smnc
, 'target' : y_smnc
, 'var_type' : 'mixed'
, 'resampling_type' : 'smnc'}
, 'ros_paramD': { 'input_df' : X_ros
, 'target' : y_ros
, 'var_type' : 'mixed'
, 'resampling_type' : 'ros'}
barD = {}
for k, v in fooD.items():
#print(k)
print(fooD[k])
scores_7030D = MultModelsCl(**fooD[k]
, 'rus_paramD' : { 'input_df' : X_rus
, 'target' : y_rus
, 'var_type' : 'mixed'
, 'resampling_type' : 'rus'}
, 'rouC_paramD' : { 'input_df' : X_rouC
, 'target' : y_rouC
, 'var_type' : 'mixed'
, 'resampling_type' : 'rouC'}
}
# Initial run to get the dict containing CV, BT and metadata DFs
mmD = {}
for k, v in paramD.items():
# print(fooD[k])
scores_7030D = MultModelsCl(**paramD[k]
, tts_split_type = tts_split_7030
, skf_cv = skf_cv
, blind_test_df = X_bts
, blind_test_target = y_bts
, add_cm = True
, add_yn = True)
barD[k] = scores_7030D
, add_yn = True
, return_formatted_output = True)
mmD[k] = scores_7030D
ros_paramD = {input_df = X_ros
, target = y_ros
, var_type = 'mixed'
, resampling_type = 'smnc'}
rus_paramD = {input_df = X_rus
, target = y_rus
, var_type = 'mixed'
, resampling_type = 'rus'}
rouC_paramD = {input_df = X_rouC
, target = y_rouC
, var_type = 'mixed'
, resampling_type = 'rouC'}
#====
scores_7030D = MultModelsCl(**rouC_paramD
, tts_split_type = tts_split_7030
, skf_cv = skf_cv
, blind_test_df = X_bts
, blind_test_target = y_bts
, add_cm = True
, add_yn = True)
###############################################################################
###############################################################################
#%% COMBINING all dfs: WF and LF
# https://stackoverflow.com/questions/39862654/pandas-concat-of-multiple-data-frames-using-only-common-columns
for k, v in mmD.items():
out_wf_7030 = pd.concat(mmD, ignore_index = True)
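One note on the concat just above: pd.concat accepts the dict of frames directly, so the enclosing for loop recomputes the same result on every iteration and a single call would do. A self-contained check:

import pandas as pd

mmD = {'baseline_paramD': pd.DataFrame({'MCC': [0.50]})
       , 'smnc_paramD': pd.DataFrame({'MCC': [0.55]})}

out_wf_7030 = pd.concat(mmD, ignore_index = True)  # one call, loop not needed
print(out_wf_7030)  # two rows; dict keys dropped because ignore_index = True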
print('\n######################################################################'
, '\nEnd--> Successfully generated output DF for Multiple classifiers (baseline models)'
, '\nGene:', gene.lower()
, '\nDrug:', drug
, '\noutput file:', outFile_wf
, '\nDim of output:', out_wf_7030.shape
, '\n######################################################################')
###############################################################################
#====================
# Write output file
#====================
#combined_baseline_wf.to_csv(outFile_wf, index = False)
#print('\nFile successfully written:', outFile_wf)
out_wf_7030.to_csv(outFile_wf, index = False)
print('\nFile successfully written:', outFile_wf)
###############################################################################


@@ -11,6 +11,7 @@ time ./run_7030.py -g katG -d isoniazid 2>&1 | tee log_katg_7030.txt
time ./run_7030.py -g rpoB -d rifampicin 2>&1 | tee log_rpob_7030.txt
time ./run_7030.py -g gid -d streptomycin 2>&1 | tee log_gid_7030.txt
time ./run_7030.py -g alr -d cycloserine 2>&1 | tee log_alr_7030.txt
# alr: # ERROR, as expected, too few values!
# gid: problems
########################################################################
@@ -73,9 +74,11 @@ time ./run_7030.py -g alr -d cycloserine 2>&1 | tee log_alr_7030.txt
########################################################################
########################################################################
# running feature selection
# Split: 70/30
time ./run_FS.py -g pncA -d pyrazinamide 2>&1 | tee log_FS_pnca_7030.txt
real 338m26.705s
user 1946m12.173s
sys 189m40.122s


@@ -7,21 +7,24 @@ Created on Fri Jun 24 11:07:05 2022
"""
import re
import argparse
import os, sys
###############################################################################
# gene = 'pncA'
# drug = 'pyrazinamide'
#total_mtblineage_uc = 8
#%% command line args: case sensitive
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help = 'drug name', default = 'pncA')
arg_parser.add_argument('-g', '--gene', help = 'gene name', default = 'pyrazinamide')
args = arg_parser.parse_args()
# #%% command line args: case sensitive
# arg_parser = argparse.ArgumentParser()
# arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
# arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '')
# args = arg_parser.parse_args()
drug = args.drug
gene = args.gene
# drug = args.drug
# gene = args.gene
###############################################################################
homedir = os.path.expanduser("~")
sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml')
###############################################################################
#==================
@@ -79,7 +82,7 @@ mmD = MultModelsCl(input_df = X_smnc
#================
# MultModelsCl: WITH formatted output
#================
mmDF = MultModelsCl(input_df = X_smnc
mmDF3 = MultModelsCl(input_df = X_smnc
, target = y_smnc
, var_type = 'mixed'
, tts_split_type = tts_split_7030
@@ -96,4 +99,4 @@ mmDF = MultModelsCl(input_df = X_smnc
# test function
#=================
# output from function call
ProcessMultModelCl(smnc_scores_mmD)
ProcessMultModelsCl(mmD)
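For reference, the round trip this test exercises, sketched with hypothetical stubs (per the diff, MultModelsCl returns a raw per-model dict unless return_formatted_output = True, and ProcessMultModelsCl formats such a dict into one frame; the shapes here are illustrative only):

import pandas as pd

def ProcessMultModelsCl_stub(mmD):
    return (pd.DataFrame(mmD).T.reset_index()
              .rename(columns = {'index': 'Model_name'}))

def MultModelsCl_stub(return_formatted_output = False):
    raw = {'LR': {'test_mcc': 0.50, 'bts_mcc': 0.45}}
    return ProcessMultModelsCl_stub(raw) if return_formatted_output else raw

mmD   = MultModelsCl_stub()                                # raw dict, like mmD above
mmDF3 = MultModelsCl_stub(return_formatted_output = True)  # formatted, like mmDF3
assert mmDF3.equals(ProcessMultModelsCl_stub(mmD))         # the round trip the test checks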