Optimised run_7030.py to generate output from dict now that the process function and parameter dicts have been added

This commit is contained in:
Tanushree Tunstall 2022-06-24 15:40:18 +01:00
parent 7dc7e25016
commit b37a950fec
12 changed files with 180 additions and 128408 deletions

View file

@ -197,35 +197,35 @@ def MultModelsCl(input_df, target, skf_cv
# Specify multiple Classification Models # Specify multiple Classification Models
#====================================================== #======================================================
models = [('AdaBoost Classifier' , AdaBoostClassifier(**rs) ) models = [('AdaBoost Classifier' , AdaBoostClassifier(**rs) )
# , ('Bagging Classifier' , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) ) , ('Bagging Classifier' , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) )
# , ('Decision Tree' , DecisionTreeClassifier(**rs) ) , ('Decision Tree' , DecisionTreeClassifier(**rs) )
# , ('Extra Tree' , ExtraTreeClassifier(**rs) ) , ('Extra Tree' , ExtraTreeClassifier(**rs) )
# , ('Extra Trees' , ExtraTreesClassifier(**rs) ) , ('Extra Trees' , ExtraTreesClassifier(**rs) )
# , ('Gradient Boosting' , GradientBoostingClassifier(**rs) ) , ('Gradient Boosting' , GradientBoostingClassifier(**rs) )
# , ('Gaussian NB' , GaussianNB() ) , ('Gaussian NB' , GaussianNB() )
# , ('Gaussian Process' , GaussianProcessClassifier(**rs) ) , ('Gaussian Process' , GaussianProcessClassifier(**rs) )
# , ('K-Nearest Neighbors' , KNeighborsClassifier() ) , ('K-Nearest Neighbors' , KNeighborsClassifier() )
# , ('LDA' , LinearDiscriminantAnalysis() ) , ('LDA' , LinearDiscriminantAnalysis() )
, ('Logistic Regression' , LogisticRegression(**rs) ) , ('Logistic Regression' , LogisticRegression(**rs) )
# , ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs)) , ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs))
# , ('MLP' , MLPClassifier(max_iter = 500, **rs) ) , ('MLP' , MLPClassifier(max_iter = 500, **rs) )
# , ('Multinomial' , MultinomialNB() ) , ('Multinomial' , MultinomialNB() )
# , ('Naive Bayes' , BernoulliNB() ) , ('Naive Bayes' , BernoulliNB() )
# , ('Passive Aggresive' , PassiveAggressiveClassifier(**rs, **njobs) ) , ('Passive Aggresive' , PassiveAggressiveClassifier(**rs, **njobs) )
# , ('QDA' , QuadraticDiscriminantAnalysis() ) , ('QDA' , QuadraticDiscriminantAnalysis() )
# , ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000 ) ) , ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000 ) )
# , ('Random Forest2' , RandomForestClassifier(min_samples_leaf = 5 , ('Random Forest2' , RandomForestClassifier(min_samples_leaf = 5
# , n_estimators = 1000 , n_estimators = 1000
# , bootstrap = True , bootstrap = True
# , oob_score = True , oob_score = True
# , **njobs , **njobs
# , **rs , **rs
# , max_features = 'auto') ) , max_features = 'auto') )
# , ('Ridge Classifier' , RidgeClassifier(**rs) ) , ('Ridge Classifier' , RidgeClassifier(**rs) )
# , ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) ) , ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) )
# , ('SVC' , SVC(**rs) ) , ('SVC' , SVC(**rs) )
# , ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) ) , ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) )
# , ('XGBoost' , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False) ) , ('XGBoost' , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False) )
] ]
mm_skf_scoresD = {} mm_skf_scoresD = {}
@ -440,10 +440,11 @@ def ProcessMultModelsCl(inputD = {}):
, '\nCV df:', len(scoresDF_CV.columns) , '\nCV df:', len(scoresDF_CV.columns)
, '\nBT_df:', len(scoresDF_BT.columns) , '\nBT_df:', len(scoresDF_BT.columns)
, '\nmetaDF:', len(metaDF.columns)) , '\nmetaDF:', len(metaDF.columns))
if len(scoresDF_CV.columns) == len(scoresDF_BT.columns): if len(scoresDF_CV.columns) == len(scoresDF_BT.columns):
print('\nFirst proceeding to rowbind CV and BT dfs:') print('\nFirst proceeding to rowbind CV and BT dfs:')
expected_ncols_out = len(scoresDF_BT.columns) + len(metaDF.columns) expected_ncols_out = len(scoresDF_BT.columns) + len(metaDF.columns)
print('\nFinal output should have:',expected_ncols_out, 'columns' ) print('\nFinal output should have:', expected_ncols_out, 'columns' )
#----------------- #-----------------
# Combine WF # Combine WF
@ -497,7 +498,6 @@ def ProcessMultModelsCl(inputD = {}):
else: else:
print('\nConcatenting dfs not possible [WF],check numbers ') print('\nConcatenting dfs not possible [WF],check numbers ')
#------------------------------------- #-------------------------------------
# Combine WF+Metadata: Final output # Combine WF+Metadata: Final output
#------------------------------------- #-------------------------------------
@ -516,6 +516,10 @@ def ProcessMultModelsCl(inputD = {}):
else: else:
sys.exit('\nFAIL: Length mismatch for combined_df') sys.exit('\nFAIL: Length mismatch for combined_df')
print('\nAdding column: Model_name')
combDF['Model_name'] = combDF.index
print('\n=========================================================' print('\n========================================================='
, '\nSUCCESS: Ran multiple classifiers' , '\nSUCCESS: Ran multiple classifiers'
, '\n=======================================================') , '\n=======================================================')

View file

@ -1,72 +0,0 @@
/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_7030.py:464: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
1.22.4
1.4.1
aaindex_df contains non-numerical data
Total no. of non-numerial columns: 2
Selecting numerical data only
PASS: successfully selected numerical columns only for aaindex_df
Now checking for NA in the remaining aaindex_cols
Counting aaindex_df cols with NA
ncols with NA: 4 columns
Dropping these...
Original ncols: 127
Revised df ncols: 123
Checking NA in revised df...
PASS: cols with NA successfully dropped from aaindex_df
Proceeding with combining aa_df with other features_df
PASS: ncols match
Expected ncols: 123
Got: 123
Total no. of columns in clean aa_df: 123
Proceeding to merge, expected nrows in merged_df: 271
PASS: my_features_df and aa_df successfully combined
nrows: 271
ncols: 269
count of NULL values before imputation
or_mychisq 256
log10_or_mychisq 256
dtype: int64
count of NULL values AFTER imputation
mutationinformation 0
or_rawI 0
logorI 0
dtype: int64
PASS: OR values imputed, data ready for ML
Total no. of features for aaindex: 123
PASS: x_features has no target variable
No. of columns for x_features: 174
Traceback (most recent call last):
File "/home/tanu/git/LSHTM_analysis/scripts/ml/./run_7030.py", line 35, in <module>
setvars(gene,drug)
File "/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_7030.py", line 636, in setvars
X, X_bts, y, y_bts = train_test_split(x_features, y_target
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/sklearn/model_selection/_split.py", line 2454, in train_test_split
train, test = next(cv.split(X=arrays[0], y=stratify))
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/sklearn/model_selection/_split.py", line 1613, in split
for train, test in self._iter_indices(X, y, groups):
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/sklearn/model_selection/_split.py", line 1953, in _iter_indices
raise ValueError(
ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -557,7 +557,7 @@ def setvars(gene,drug):
# FG5: Genomic features # FG5: Genomic features
#======================== #========================
X_gn_mafor_Fnum = ['maf' X_gn_mafor_Fnum = ['maf'
, 'logorI' #, 'logorI'
# , 'or_rawI' # , 'or_rawI'
# , 'or_mychisq' # , 'or_mychisq'
# , 'or_logistic' # , 'or_logistic'

647
scripts/ml/run_7030.py Executable file → Normal file
View file

@ -5,28 +5,28 @@ Created on Mon Jun 20 13:05:23 2022
@author: tanu @author: tanu
""" """
#%%Imports ####################################################################
import re import re
import argparse import argparse
############################################################################### import os, sys
# gene = 'pncA' # gene = 'pncA'
# drug = 'pyrazinamide' # drug = 'pyrazinamide'
#total_mtblineage_uc = 8 #total_mtblineage_uc = 8
###############################################################################
#%% command line args: case sensitive #%% command line args: case sensitive
arg_parser = argparse.ArgumentParser() # arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '') # arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '') # arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '')
args = arg_parser.parse_args() # args = arg_parser.parse_args()
drug = args.drug # drug = args.drug
gene = args.gene # gene = args.gene
############################################################################### ###############################################################################
#================== homedir = os.path.expanduser("~")
# other vars sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml')
#==================
tts_split = '70/30'
OutFile_suffix = '7030'
############################################################################### ###############################################################################
#================== #==================
# Import data # Import data
@ -39,10 +39,15 @@ from ml_data_7030 import *
#from UQ_yc_RunAllClfs import run_all_ML #from UQ_yc_RunAllClfs import run_all_ML
#==================== #====================
# Import ML function # Import ML functions
#==================== #====================
# TT run all ML clfs: baseline model from MultClfs import *
from MultModelsCl import MultModelsCl
#==================
# other vars
#==================
tts_split_7030 = '70_30'
OutFile_suffix = '7030'
#================== #==================
# Specify outdir # Specify outdir
@ -51,563 +56,71 @@ outdir_ml = outdir + 'ml/tts_7030/'
print('\nOutput directory:', outdir_ml) print('\nOutput directory:', outdir_ml)
outFile_wf = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv' outFile_wf = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv'
outFile_lf = outdir_ml + gene.lower() + '_baselineC_ext_' + OutFile_suffix + '.csv' #outFile_lf = outdir_ml + gene.lower() + '_baselineC_ext_' + OutFile_suffix + '.csv'
############################################################################### #%% Running models ############################################################
score_type_ordermapD = { 'mcc' : 1
, 'fscore' : 2
, 'jcc' : 3
, 'precision' : 4
, 'recall' : 5
, 'accuracy' : 6
, 'roc_auc' : 7
, 'TN' : 8
, 'FP' : 9
, 'FN' : 10
, 'TP' : 11
, 'trainingY_neg': 12
, 'trainingY_pos': 13
, 'blindY_neg' : 14
, 'blindY_pos' : 15
, 'fit_time' : 16
, 'score_time' : 17
}
scoreCV_mapD = {'test_mcc' : 'MCC'
, 'test_fscore' : 'F1'
, 'test_precision' : 'Precision'
, 'test_recall' : 'Recall'
, 'test_accuracy' : 'Accuracy'
, 'test_roc_auc' : 'ROC_AUC'
, 'test_jcc' : 'JCC'
}
scoreBT_mapD = {'bts_mcc' : 'MCC'
, 'bts_fscore' : 'F1'
, 'bts_precision' : 'Precision'
, 'bts_recall' : 'Recall'
, 'bts_accuracy' : 'Accuracy'
, 'bts_roc_auc' : 'ROC_AUC'
, 'bts_jcc' : 'JCC'
}
# data dependent variables but NOT dependent on resampling
bts_size = len(X_bts)
yc2 = Counter(y_bts)
yc2_ratio = yc2[0]/yc2[1]
###############################################################################
print('\n#####################################################################\n' print('\n#####################################################################\n'
, '\nRunning ML analysis: feature groups ' , '\nStarting--> Running ML analysis: Baseline modes (No FS)'
, '\nGene name:', gene , '\nGene name:', gene
, '\nDrug name:', drug) , '\nDrug name:', drug
, '\n#####################################################################\n')
#%% Basic: No Oversampling
#================ paramD = {
# Baseline 'baseline_paramD': { 'input_df' : X
# No resampling , 'target' : y
#================ , 'var_type' : 'mixed'
scores_mmD = MultModelsCl(input_df = X , 'resampling_type': 'none'}
, target = y
, var_type = 'mixed' , 'smnc_paramD': { 'input_df' : X_smnc
, skf_cv = skf_cv , 'target' : y_smnc
, blind_test_df = X_bts , 'var_type' : 'mixed'
, blind_test_target = y_bts , 'resampling_type' : 'smnc'}
, add_cm = True
, add_yn = True) , 'ros_paramD': { 'input_df' : X_ros
, 'target' : y_ros
baseline_all_scores = pd.DataFrame(scores_mmD) , 'var_type' : 'mixed'
rs_none = 'none' , 'resampling_type' : 'ros'}
#------------------------
# WF: only CV and BTS , 'rus_paramD' : { 'input_df' : X_rus
#----------------------- , 'target' : y_rus
baseline_allT = baseline_all_scores.T , 'var_type' : 'mixed'
#baseline_train = baseline_all.filter(regex='train_', axis=1) , 'resampling_type' : 'rus'}
baseline_CV = baseline_allT.filter(regex='test_', axis = 1); baseline_CV.columns , 'rouC_paramD' : { 'input_df' : X_rouC
# map colnames for consistency to allow concatenting , 'target' : y_rouC
baseline_CV.columns = baseline_CV.columns.map(scoreCV_mapD); baseline_CV.columns , 'var_type' : 'mixed'
baseline_CV['Data_source'] = 'CV' , 'resampling_type' : 'rouC'}
baseline_CV['Resampling'] = rs_none }
baseline_BT = baseline_allT.filter(regex='bts_', axis = 1); baseline_BT.columns # Initial run to get the dict containing CV, BT and metadata DFs
# map colnames for consistency to allow concatenting mmD = {}
baseline_BT.columns = baseline_BT.columns.map(scoreBT_mapD); baseline_BT.columns for k, v in paramD.items():
baseline_BT['Data_source'] = 'BT' # print(mmD[k])
baseline_BT['Resampling'] = rs_none scores_7030D = MultModelsCl(**paramD[k]
, tts_split_type = tts_split_7030
# # Write csv , skf_cv = skf_cv
#baseline_CV.sort_values(by = ['test_mcc'], ascending = False, inplace = True) , blind_test_df = X_bts
#baseline_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) , blind_test_target = y_bts
# baseline_CV.to_csv(outdir_ml + gene.lower() + '_baseline_CV_allF.csv') , add_cm = True
# baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv') , add_yn = True
, return_formatted_output = True)
#---------------------------------- mmD[k] = scores_7030D
# LF*: CV + BTS + Other info
#----------------------------------- # Extracting the dfs from within the dict and concatenating to output as one df
# other data dependent variables for k, v in mmD.items():
training_size_ns = len(X) out_wf_7030 = pd.concat(mmD, ignore_index = True)
n_features = len(X.columns)
yc1 = Counter(y) print('\n######################################################################'
yc1_ratio = yc1[0]/yc1[1] , '\nEnd--> Successfully generated output DF for Multiple classifiers (baseline models)'
, '\nGene:', gene.lower()
baseline_all = baseline_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0) , '\nDrug:', drug
baseline_all = baseline_all.reset_index() , '\noutput file:', outFile_wf
baseline_all.rename(columns = {'index': 'original_names'}, inplace = True) , '\nDim of output:', out_wf_7030.shape
, '\n######################################################################')
# Indicate whether BT or CV
bt_pattern = re.compile(r'bts_.*')
baseline_all['Data_source'] = baseline_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
baseline_all['score_type'] = baseline_all['original_names'].str.replace('bts_|test_', '', regex = True)
score_type_uniqueN = set(baseline_all['score_type'])
cL1 = list(score_type_ordermapD.keys())
cL2 = list(score_type_uniqueN)
if set(cL1).issubset(cL2):
print('\nPASS: sorting df by score that is mapped onto the order I want')
baseline_all['score_order'] = baseline_all['score_type'].map(score_type_ordermapD)
baseline_all.sort_values(by = ['Data_source', 'score_order'], ascending = [True, True], inplace = True)
else:
sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
# add cols: specific
baseline_all['Resampling'] = rs_none
baseline_all['training_size'] = training_size_ns
baseline_all['trainingY_ratio']= round(yc1_ratio,2)
baseline_all['n_features'] = n_features
###############################################################################
#%% SMOTE NC: Smote Oversampling [Numerical + categorical]
#================
# Baseline
# SMOTE NC: SMNC
#================
smnc_scores_mmD = MultModelsCl(input_df = X_smnc
, target = y_smnc
, var_type = 'mixed'
, skf_cv = skf_cv
, blind_test_df = X_bts
, blind_test_target = y_bts
, add_cm = True
, add_yn = True)
smnc_all_scores = pd.DataFrame(smnc_scores_mmD)
rs_smnc = 'smnc'
#------------------------
# WF: only CV and BTS
#-----------------------
smnc_allT = smnc_all_scores.T
smnc_CV = smnc_allT.filter(regex='test_', axis = 1); smnc_CV.columns
# map colnames for consistency to allow concatenting
smnc_CV.columns = smnc_CV.columns.map(scoreCV_mapD); smnc_CV.columns
smnc_CV['Data_source'] = 'CV'
smnc_CV['Resampling'] = rs_smnc
smnc_BT = smnc_allT.filter(regex='bts_', axis = 1); smnc_BT.columns
# map colnames for consistency to allow concatenting
smnc_BT.columns = smnc_BT.columns.map(scoreBT_mapD); smnc_BT.columns
smnc_BT['Data_source'] = 'BT'
smnc_BT['Resampling'] = rs_smnc
# Write csv
# smnc_CV.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
# smnc_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
# smnc_CV.to_csv(outdir_ml + gene.lower() + '_smnc_CV_allF.csv')
# smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')
#----------------------------------
# LF*: CV + BTS + Other info
#-----------------------------------
# other data dependent variables
training_size_smnc = len(X_smnc)
n_features = len(X_smnc.columns)
yc1_smnc = Counter(y_smnc)
yc1_ratio_smnc = yc1_smnc[0]/yc1_smnc[1]
smnc_all = smnc_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
smnc_all = smnc_all.reset_index()
smnc_all.rename(columns = {'index': 'original_names'}, inplace = True)
# Indicate whether BT or CV
bt_pattern = re.compile(r'bts_.*')
smnc_all['Data_source'] = smnc_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
smnc_all['score_type'] = smnc_all['original_names'].str.replace('bts_|test_', '', regex = True)
score_type_uniqueN = set(smnc_all['score_type'])
cL1 = list(score_type_ordermapD.keys())
cL2 = list(score_type_uniqueN)
if set(cL1).issubset(cL2):
print('\nPASS: sorting df by score that is mapped onto the order I want')
smnc_all['score_order'] = smnc_all['score_type'].map(score_type_ordermapD)
smnc_all.sort_values(by = ['Data_source', 'score_order'], ascending = [True, True], inplace = True)
else:
sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
# add cols: specific
smnc_all['Resampling'] = rs_smnc
smnc_all['training_size'] = training_size_smnc
smnc_all['trainingY_ratio'] = round(yc1_ratio_smnc,2)
smnc_all['n_features'] = n_features
###############################################################################
#%% ROS: Random Over Sampling [Numerical + categorical]
#================
# Baseline
# ROS
#================
ros_scores_mmD = MultModelsCl(input_df = X_ros
, target = y_ros
, var_type = 'mixed'
, skf_cv = skf_cv
, blind_test_df = X_bts
, blind_test_target = y_bts
, add_cm = True
, add_yn = True)
ros_all_scores = pd.DataFrame(ros_scores_mmD)
rs_ros = 'ros'
#------------------------
# WF: only CV and BTS
#-----------------------
ros_allT = ros_all_scores.T
ros_CV = ros_allT.filter(regex='test_', axis = 1); ros_CV.columns
# map colnames for consistency to allow concatenting
ros_CV.columns = ros_CV.columns.map(scoreCV_mapD); ros_CV.columns
ros_CV['Data_source'] = 'CV'
ros_CV['Resampling'] = rs_ros
ros_BT = ros_allT.filter(regex='bts_', axis = 1); ros_BT.columns
# map colnames for consistency to allow concatenting
ros_BT.columns = ros_BT.columns.map(scoreBT_mapD); ros_BT.columns
ros_BT['Data_source'] = 'BT'
ros_BT['Resampling'] = rs_ros
# Write csv
# ros_CV.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
# ros_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
# ros_CV.to_csv(outdir_ml + gene.lower() + '_ros_CV_allF.csv')
# ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')
#----------------------------------
# LF*: CV + BTS + Other info
#----------------------------------
# other data dependent variables
training_size_ros = len(X_ros)
n_features = len(X_ros.columns)
yc1_ros = Counter(y_ros)
yc1_ratio_ros = yc1_ros[0]/yc1_ros[1]
ros_all = ros_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
ros_all = ros_all.reset_index()
ros_all.rename(columns = {'index': 'original_names'}, inplace = True)
# Indicate whether BT or CV
bt_pattern = re.compile(r'bts_.*')
ros_all['Data_source'] = ros_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
ros_all['score_type'] = ros_all['original_names'].str.replace('bts_|test_', '', regex = True)
score_type_uniqueN = set(ros_all['score_type'])
cL1 = list(score_type_ordermapD.keys())
cL2 = list(score_type_uniqueN)
if set(cL1).issubset(cL2):
print('\nPASS: sorting df by score that is mapped onto the order I want')
ros_all['score_order'] = ros_all['score_type'].map(score_type_ordermapD)
ros_all.sort_values(by = ['Data_source', 'score_order'], ascending = [True, True], inplace = True)
else:
sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
# add cols: specific
ros_all['Resampling'] = rs_ros
ros_all['training_size'] = training_size_ros
ros_all['trainingY_ratio'] = round(yc1_ratio_ros,2)
ros_all['n_features'] = n_features
###############################################################################
#%% RUS: Random Under Sampling [Numerical + categorical]
#================
# Baseline
# RUS
#================
rus_scores_mmD = MultModelsCl(input_df = X_rus
, target = y_rus
, var_type = 'mixed'
, skf_cv = skf_cv
, blind_test_df = X_bts
, blind_test_target = y_bts
, add_cm = True
, add_yn = True)
rus_all_scores = pd.DataFrame(rus_scores_mmD)
rs_rus = 'rus'
#-----------------------
# WF: only CV and BTS
#-----------------------
rus_allT = rus_all_scores.T
rus_CV = rus_allT.filter(regex='test_', axis=1); rus_CV.columns
# map colnames for consistency to allow concatenting
rus_CV.columns = rus_CV.columns.map(scoreCV_mapD); rus_CV.columns
rus_CV['Data_source'] = 'CV'
rus_CV['Resampling'] = rs_rus
rus_BT = rus_allT.filter(regex='bts_' , axis=1); rus_BT.columns
# map colnames for consistency to allow concatenting
rus_BT.columns = rus_BT.columns.map(scoreBT_mapD); rus_BT.columns
rus_BT['Data_source'] = 'BT'
rus_BT['Resampling'] = rs_rus
# # Write csv
# rus_CV.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
# rus_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
# rus_CV.to_csv(outdir_ml + gene.lower() + '_rus_CV_allF.csv')
# rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')
#----------------------------------
# LF*: CV + BTS + Other info
#----------------------------------
# other data dependent variables
training_size_rus = len(X_rus)
n_features = len(X_rus.columns)
yc1_rus = Counter(y_rus)
yc1_ratio_rus = yc1_rus[0]/yc1_rus[1]
rus_all = rus_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
rus_all = rus_all.reset_index()
rus_all.rename(columns = {'index': 'original_names'}, inplace = True)
# Indicate whether BT or CV
bt_pattern = re.compile(r'bts_.*')
rus_all['Data_source'] = rus_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
rus_all['score_type'] = rus_all['original_names'].str.replace('bts_|test_', '', regex = True)
score_type_uniqueN = set(rus_all['score_type'])
cL1 = list(score_type_ordermapD.keys())
cL2 = list(score_type_uniqueN)
if set(cL1).issubset(cL2):
print('\nPASS: sorting df by score that is mapped onto the order I want')
rus_all['score_order'] = rus_all['score_type'].map(score_type_ordermapD)
rus_all.sort_values(by = ['Data_source', 'score_order'], ascending = [True, True], inplace = True)
else:
sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
# add cols: specific
rus_all['Resampling'] = rs_rus
rus_all['training_size'] = training_size_rus
rus_all['trainingY_ratio'] = round(yc1_ratio_rus,2)
rus_all['n_features'] = n_features
###############################################################################
#%% ROS+RUS Combined: [Numerical + categorical]
#================
# Baseline
# ROUC
#================
rouC_scores_mmD = MultModelsCl(input_df = X_rouC
, target = y_rouC
, var_type = 'mixed'
, skf_cv = skf_cv
, blind_test_df = X_bts
, blind_test_target = y_bts
, add_cm = True
, add_yn = True)
rouC_all_scores = pd.DataFrame(rouC_scores_mmD)
rs_rouC = 'rouC'
#-----------------------
# WF: only CV and BTS
#-----------------------
rouC_allT = rouC_all_scores.T
rouC_CV = rouC_allT.filter(regex='test_', axis=1); rouC_CV.columns
# map colnames for consistency to allow concatenting
rouC_CV.columns = rouC_CV.columns.map(scoreCV_mapD); rouC_CV.columns
rouC_CV['Data_source'] = 'CV'
rouC_CV['Resampling'] = rs_rouC
rouC_BT = rouC_allT.filter(regex='bts_', axis=1); rouC_BT.columns
# map colnames for consistency to allow concatenting
rouC_BT.columns = rouC_BT.columns.map(scoreBT_mapD); rouC_BT.columns
rouC_BT['Data_source'] = 'BT'
rouC_BT['Resampling'] = rs_rouC
# Write csv
# rouC_CV.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
# rouC_CV.to_csv(outdir_ml + gene.lower() + '_rouC_CV_allF.csv')
# rouC_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
# rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')
#----------------------------------
# LF*: CV + BTS + Other info
#----------------------------------
# other data dependent variables
training_size_rouC = len(X_rouC)
n_features = len(X_rouC.columns)
yc1_rouC = Counter(y_rouC)
yc1_ratio_rouC = yc1_rouC[0]/yc1_rouC[1]
rouC_all = rouC_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
rouC_all = rouC_all.reset_index()
rouC_all.rename(columns = {'index': 'original_names'}, inplace = True)
# Indicate whether BT or CV
bt_pattern = re.compile(r'bts_.*')
rouC_all['Data_source'] = rouC_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
rouC_all['score_type'] = rouC_all['original_names'].str.replace('bts_|test_', '', regex = True)
score_type_uniqueN = set(rouC_all['score_type'])
cL1 = list(score_type_ordermapD.keys())
cL2 = list(score_type_uniqueN)
if set(cL1).issubset(cL2):
print('\nPASS: sorting df by score that is mapped onto the order I want')
rouC_all['score_order'] = rouC_all['score_type'].map(score_type_ordermapD)
rouC_all.sort_values(by = ['Data_source', 'score_order'], ascending = [True, True], inplace = True)
else:
sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
# add cols: specific
rouC_all['Resampling'] = rs_rouC
rouC_all['training_size'] = training_size_rouC
rouC_all['trainingY_ratio'] = round(yc1_ratio_rouC,2)
rouC_all['n_features'] = n_features
###############################################################################
#%% COMBINING all dfs: WF and LF
# https://stackoverflow.com/questions/39862654/pandas-concat-of-multiple-data-frames-using-only-common-columns
#%% Combine WF
#-----------------
# Combine WF
#-----------------
dfs_combine_wf = [baseline_BT, smnc_BT, ros_BT, rus_BT, rouC_BT,
baseline_CV, smnc_CV, ros_CV, rus_CV, rouC_CV]
dfs_nrows_wf = []
for df in dfs_combine_wf:
dfs_nrows_wf = dfs_nrows_wf + [len(df)]
dfs_nrows_wf = max(dfs_nrows_wf)
dfs_ncols_wf = []
for df in dfs_combine_wf:
dfs_ncols_wf = dfs_ncols_wf + [len(df.columns)]
dfs_ncols_wf = max(dfs_ncols_wf)
expected_nrows_wf = len(dfs_combine_wf) * dfs_nrows_wf
expected_ncols_wf = dfs_ncols_wf
common_cols_wf = list(set.intersection(*(set(df.columns) for df in dfs_combine_wf)))
if len(common_cols_wf) == dfs_ncols_wf :
combined_baseline_wf = pd.concat([df[common_cols_wf] for df in dfs_combine_wf], ignore_index=False)
resampling_methods_wf = combined_baseline_wf[['Resampling']]
resampling_methods_wf = resampling_methods_wf.drop_duplicates()
print('\nConcatenating dfs with different resampling methods [WF]:', tts_split
, '\nNo. of dfs combining:', len(dfs_combine_wf)
, '\nThe sampling methods are:'
, '\n', resampling_methods_wf)
if len(combined_baseline_wf) == expected_nrows_wf and len(combined_baseline_wf.columns) == expected_ncols_wf:
print('\nPASS:', len(dfs_combine_wf), 'dfs successfully combined'
, '\nnrows in combined_df_wf:', len(combined_baseline_wf)
, '\nncols in combined_df_wf:', len(combined_baseline_wf.columns))
else:
print('\nFAIL: concatenating failed'
, '\nExpected nrows:', expected_nrows_wf
, '\nGot:', len(combined_baseline_wf)
, '\nExpected ncols:', expected_ncols_wf
, '\nGot:', len(combined_baseline_wf.columns))
sys.exit()
else:
sys.exit('\nConcatenting dfs not possible [WF],check numbers ')
# Add index as a column
combined_baseline_wf.columns
combined_baseline_wf = combined_baseline_wf.reset_index()
combined_baseline_wf.rename(columns = {'index': 'Model_name'}, inplace = True)
combined_baseline_wf.head()
# sort df: Resampling, Data_source, and MCC
combined_baseline_wf.sort_values(by = ['Resampling', 'Data_source', 'MCC'], ascending = [True, True, False], inplace = True)
##############################################################################
#%% Combine LF
#-----------------
# Combine LF*
#-----------------
# Score tables to stack row-wise, one per resampling strategy
# (LF presumably stands for long format -- TODO confirm).
dfs_combine = [baseline_all, smnc_all, ros_all, rus_all, rouC_all]

# Largest row/column counts over the inputs. The expected shape below is
# derived from these maxima, so the PASS check only succeeds when every df
# has the same number of rows and the same set of columns.
dfs_nrows = max(len(df) for df in dfs_combine)
dfs_ncols = max(len(df.columns) for df in dfs_combine)

expected_nrows = len(dfs_combine) * dfs_nrows
expected_ncols = dfs_ncols

# Columns present in every df, kept in the order of the first df.
# Building this from a bare set would make the column order of the written
# CSV depend on string hashing, i.e. vary from one run to the next.
other_colsets = [set(df.columns) for df in dfs_combine[1:]]
common_cols = [col for col in dfs_combine[0].columns
               if all(col in colset for colset in other_colsets)]

# Guard: refuse to concatenate unless all dfs share the full column set.
if len(common_cols) != dfs_ncols:
    sys.exit('\nConcatenting dfs not possible,check numbers ')

combined_baseline = pd.concat([df[common_cols] for df in dfs_combine], ignore_index = True)

resampling_methods = combined_baseline[['Resampling', 'training_size']].drop_duplicates()
print('\nConcatenating dfs with different resampling methods:', tts_split
      , '\nNo. of dfs combining:', len(dfs_combine)
      , '\nThe sampling methods are:'
      , '\n', resampling_methods)

if len(combined_baseline) == expected_nrows and len(combined_baseline.columns) == expected_ncols:
    print('\nPASS:', len(dfs_combine), 'dfs successfully combined'
          , '\nnrows in combined_df:', len(combined_baseline)
          , '\nncols in combined_df:', len(combined_baseline.columns))
else:
    print('\nFAIL: concatenating failed'
          , '\nExpected nrows:', expected_nrows
          , '\nGot:', len(combined_baseline)
          , '\nExpected ncols:', expected_ncols
          , '\nGot:', len(combined_baseline.columns))
    # Non-zero status so that callers/shell wrappers can detect the failure
    # (a bare sys.exit() reports success).
    sys.exit(1)

# Add further column indications (same value on every row)
combined_baseline['test_size'] = bts_size
combined_baseline['tts_split'] = tts_split
combined_baseline['testY_ratio'] = round(yc2_ratio, 2)

# Change these column names to lower case for consistency
combined_baseline.rename(columns = {'Resampling' : 'resampling'
                                    , 'Data_source': 'data_source'}, inplace = True)

# sort df: resampling, data_source, then score_order (all ascending)
combined_baseline.sort_values(by = ['resampling', 'data_source', 'score_order']
                              , ascending = [True, True, True]
                              , inplace = True)
# # row bind
# if all(XXX):
# print('\nPASS:colnames match, proceeding to rowbind')
# comb_df = pd.concat()], axis = 0, ignore_index = True ) combined_baseline
############################################################################### ###############################################################################
#==================== #====================
# Write output file # Write output file
#==================== #====================
combined_baseline_wf.to_csv(outFile_wf, index = False) #out_wf_7030.to_csv(outFile_wf, index = False)
print('\nFile successfully written:', outFile_wf) print('\nFile successfully written:', outFile_wf)
combined_baseline.to_csv(outFile_lf, index = False)
print('\nFile successfully written:', outFile_lf)
############################################################################### ###############################################################################

View file

@ -9,6 +9,8 @@ Created on Mon Jun 20 13:05:23 2022
import re import re
import argparse import argparse
import os, sys import os, sys
import collections
# gene = 'pncA' # gene = 'pncA'
# drug = 'pyrazinamide' # drug = 'pyrazinamide'
#total_mtblineage_uc = 8 #total_mtblineage_uc = 8
@ -25,6 +27,7 @@ import os, sys
############################################################################### ###############################################################################
homedir = os.path.expanduser("~") homedir = os.path.expanduser("~")
sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml') sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml')
############################################################################### ###############################################################################
#================== #==================
# Import data # Import data
@ -54,79 +57,70 @@ outdir_ml = outdir + 'ml/tts_7030/'
print('\nOutput directory:', outdir_ml) print('\nOutput directory:', outdir_ml)
outFile_wf = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv' outFile_wf = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv'
outFile_lf = outdir_ml + gene.lower() + '_baselineC_ext_' + OutFile_suffix + '.csv' #outFile_lf = outdir_ml + gene.lower() + '_baselineC_ext_' + OutFile_suffix + '.csv'
#%% Running models ############################################################ #%% Running models ############################################################
print('\n#####################################################################\n' print('\n#####################################################################\n'
, '\nRunning ML analysis: feature groups ' , '\nStarting--> Running ML analysis: Baseline modes (No FS)'
, '\nGene name:', gene , '\nGene name:', gene
, '\nDrug name:', drug) , '\nDrug name:', drug
, '\n#####################################################################\n')
fooD = {'baseline_paramD': { paramD = {
'input_df': X 'baseline_paramD': { 'input_df' : X
, 'target': y , 'target' : y
, 'var_type': 'mixed' , 'var_type' : 'mixed'
, 'resampling_type': 'none'} , 'resampling_type': 'none'}
,
'smnc_paramD': {'input_df': X_smnc
, 'target': y_smnc
, 'var_type': 'mixed'
, 'resampling_type': 'smnc'}
}
barD = {} , 'smnc_paramD': { 'input_df' : X_smnc
for k, v in fooD.items(): , 'target' : y_smnc
#print(k) , 'var_type' : 'mixed'
print(fooD[k]) , 'resampling_type' : 'smnc'}
scores_7030D = MultModelsCl(**fooD[k]
, 'ros_paramD': { 'input_df' : X_ros
, 'target' : y_ros
, 'var_type' : 'mixed'
, 'resampling_type' : 'ros'}
, 'rus_paramD' : { 'input_df' : X_rus
, 'target' : y_rus
, 'var_type' : 'mixed'
, 'resampling_type' : 'rus'}
, 'rouC_paramD' : { 'input_df' : X_rouC
, 'target' : y_rouC
, 'var_type' : 'mixed'
, 'resampling_type' : 'rouC'}
}
# Initial run to get the dict containing CV, BT and metadata DFs
mmD = {}
for k, v in paramD.items():
# print(paramD[k])
scores_7030D = MultModelsCl(**paramD[k]
, tts_split_type = tts_split_7030 , tts_split_type = tts_split_7030
, skf_cv = skf_cv , skf_cv = skf_cv
, blind_test_df = X_bts , blind_test_df = X_bts
, blind_test_target = y_bts , blind_test_target = y_bts
, add_cm = True , add_cm = True
, add_yn = True) , add_yn = True
barD[k] = scores_7030D , return_formatted_output = True)
mmD[k] = scores_7030D
# Keyword-argument dicts for MultModelsCl, one per resampling strategy.
# Keys must be strings ('key': value) so that the dict can be unpacked with
# ** into the call; the `key = value` form is not valid inside a dict literal.
ros_paramD = {'input_df'         : X_ros
              , 'target'         : y_ros
              , 'var_type'       : 'mixed'
              , 'resampling_type': 'ros'}  # label matches the ros data (was 'smnc')

rus_paramD = {'input_df'         : X_rus
              , 'target'         : y_rus
              , 'var_type'       : 'mixed'
              , 'resampling_type': 'rus'}

rouC_paramD = {'input_df'         : X_rouC
               , 'target'         : y_rouC
               , 'var_type'       : 'mixed'
               , 'resampling_type': 'rouC'}
#====
# Single run on the rouC data; the remaining arguments are passed explicitly
# alongside the unpacked parameter dict.
scores_7030D = MultModelsCl(**rouC_paramD
                            , tts_split_type = tts_split_7030
                            , skf_cv = skf_cv
                            , blind_test_df = X_bts
                            , blind_test_target = y_bts
                            , add_cm = True
                            , add_yn = True)
###############################################################################
###############################################################################
#%% COMBINING all dfs: WF and LF
# https://stackoverflow.com/questions/39862654/pandas-concat-of-multiple-data-frames-using-only-common-columns
# Stack every entry of mmD into a single output DF.
# pd.concat takes the whole dict in one call, so no loop over its items is
# needed: looping only repeated the identical concat once per key, and left
# out_wf_7030 undefined (NameError in the print below) when mmD was empty.
# ignore_index = True discards the dict keys and renumbers the rows.
out_wf_7030 = pd.concat(mmD, ignore_index = True)

print('\n######################################################################'
      , '\nEnd--> Successfully generated output DF for Multiple classifiers (baseline models)'
      , '\nGene:', gene.lower()
      , '\nDrug:', drug
      , '\noutput file:', outFile_wf
      , '\nDim of output:', out_wf_7030.shape
      , '\n######################################################################')
############################################################################### ###############################################################################
#==================== #====================
# Write output file # Write output file
#==================== #====================
#combined_baseline_wf.to_csv(outFile_wf, index = False) out_wf_7030.to_csv(outFile_wf, index = False)
#print('\nFile successfully written:', outFile_wf) print('\nFile successfully written:', outFile_wf)
############################################################################### ###############################################################################

View file

@ -11,6 +11,7 @@ time ./run_7030.py -g katG -d isoniazid 2>&1 | tee log_katg_7030.txt
time ./run_7030.py -g rpoB -d rifampicin 2>&1 | tee log_rpob_7030.txt time ./run_7030.py -g rpoB -d rifampicin 2>&1 | tee log_rpob_7030.txt
time ./run_7030.py -g gid -d streptomycin 2>&1 | tee log_gid_7030.txt time ./run_7030.py -g gid -d streptomycin 2>&1 | tee log_gid_7030.txt
time ./run_7030.py -g alr -d cycloserine 2>&1 | tee log_alr_7030.txt time ./run_7030.py -g alr -d cycloserine 2>&1 | tee log_alr_7030.txt
# alr: # ERROR, as expected, too few values! # alr: # ERROR, as expected, too few values!
# gid: problems # gid: problems
######################################################################## ########################################################################
@ -73,9 +74,11 @@ time ./run_7030.py -g alr -d cycloserine 2>&1 | tee log_alr_7030.txt
######################################################################## ########################################################################
######################################################################## ########################################################################
# running feature selection # running feature selection
# Split:70/30 # Split:70/30
time ./run_FS.py -g pncA -d pyrazinamide 2>&1 | tee log_FS_pnca_7030.txt time ./run_FS.py -g pncA -d pyrazinamide 2>&1 | tee log_FS_pnca_7030.txt
real 338m26.705s
user 1946m12.173s
sys 189m40.122s

View file

@ -7,21 +7,24 @@ Created on Fri Jun 24 11:07:05 2022
""" """
import re import re
import argparse import argparse
import os, sys
############################################################################### ###############################################################################
# gene = 'pncA' # gene = 'pncA'
# drug = 'pyrazinamide' # drug = 'pyrazinamide'
#total_mtblineage_uc = 8 #total_mtblineage_uc = 8
#%% command line args: case sensitive # #%% command line args: case sensitive
arg_parser = argparse.ArgumentParser() # arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help = 'drug name', default = 'pncA') # arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
arg_parser.add_argument('-g', '--gene', help = 'gene name', default = 'pyrazinamide') # arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '')
args = arg_parser.parse_args() # args = arg_parser.parse_args()
drug = args.drug # drug = args.drug
gene = args.gene # gene = args.gene
############################################################################### ###############################################################################
homedir = os.path.expanduser("~")
sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml')
############################################################################### ###############################################################################
#================== #==================
@ -79,7 +82,7 @@ mmD = MultModelsCl(input_df = X_smnc
#================ #================
# MultModelsCl: WITH formatted output # MultModelsCl: WITH formatted output
#================ #================
mmDF = MultModelsCl(input_df = X_smnc mmDF3 = MultModelsCl(input_df = X_smnc
, target = y_smnc , target = y_smnc
, var_type = 'mixed' , var_type = 'mixed'
, tts_split_type = tts_split_7030 , tts_split_type = tts_split_7030
@ -96,4 +99,4 @@ mmDF = MultModelsCl(input_df = X_smnc
# test function # test function
#================= #=================
# output from function call # output from function call
ProcessMultModelCl(smnc_scores_mmD) ProcessMultModelsCl(mmD)