added Mult_clfs_logo and Mult_clsf.py with consistency

This commit is contained in:
Tanushree Tunstall 2022-07-10 12:32:52 +01:00
parent 06f2ce97b6
commit de5c1270be
5 changed files with 201 additions and 115 deletions

View file

@ -105,6 +105,7 @@ def CMLogoSkf(cm_input_df
, target_var = 'dst_mode' , target_var = 'dst_mode'
, gene_group = 'gene_name' , gene_group = 'gene_name'
, std_gene_omit = [] , std_gene_omit = []
, output_dir = outdir
, file_suffix = "" , file_suffix = ""
): ):
@ -138,7 +139,7 @@ def CMLogoSkf(cm_input_df
# else: # else:
# file_suffix = file_suffix # file_suffix = file_suffix
outFile = outdir + str(n_tr_genes+1) + "genes_" + tts_split_type + '_' + file_suffix + ".csv" outFile = output_dir + str(n_tr_genes+1) + "genes_" + tts_split_type + '_' + file_suffix + ".csv"
print(outFile) print(outFile)
@ -170,7 +171,7 @@ def CMLogoSkf(cm_input_df
#cm_bts_y = cm_test_df.loc[:, 'dst_mode'] #cm_bts_y = cm_test_df.loc[:, 'dst_mode']
cm_bts_y = cm_test_df.loc[:, target_var] cm_bts_y = cm_test_df.loc[:, target_var]
print('\nTEST data dim:', cm_bts_X.shape print('\nTEST data dim:' , cm_bts_X.shape
, '\nTEST Target dim:', cm_bts_y.shape) , '\nTEST Target dim:', cm_bts_y.shape)
print("Running Multiple models on LOGO with SKF") print("Running Multiple models on LOGO with SKF")
@ -209,4 +210,4 @@ def CMLogoSkf(cm_input_df
# Actual Data # Actual Data
#=============== #===============
CMLogoSkf(cm_input_df = combined_df_actual, file_suffix = "actual") CMLogoSkf(cm_input_df = combined_df_actual, file_suffix = "actual")
CMLogoSkf(cm_input_df = combined_df_actual, std_gene_omit=['alr'], file_suffix = "actual") # CMLogoSkf(cm_input_df = combined_df_actual, std_gene_omit=['alr'], file_suffix = "actual")

View file

@ -74,10 +74,13 @@ from sklearn.impute import KNNImputer as KNN
import json import json
import argparse import argparse
import re import re
import itertools
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.decomposition import PCA from sklearn.decomposition import PCA
#%% GLOBALS #%% GLOBALS
rs = {'random_state': 42} #rs = {'random_state': 42}
njobs = {'n_jobs': os.cpu_count() } # the number of jobs should equal the number of CPU cores #njobs = {'n_jobs': os.cpu_count() } # the number of jobs should equal the number of CPU cores
scoring_fn = ({ 'mcc' : make_scorer(matthews_corrcoef) scoring_fn = ({ 'mcc' : make_scorer(matthews_corrcoef)
, 'fscore' : make_scorer(f1_score) , 'fscore' : make_scorer(f1_score)
@ -88,13 +91,13 @@ scoring_fn = ({ 'mcc' : make_scorer(matthews_corrcoef)
, 'jcc' : make_scorer(jaccard_score) , 'jcc' : make_scorer(jaccard_score)
}) })
skf_cv = StratifiedKFold(n_splits = 10 #skf_cv = StratifiedKFold(n_splits = 10
#, shuffle = False, random_state= None) # #, shuffle = False, random_state= None)
, shuffle = True,**rs) # , shuffle = True,**rs)
rskf_cv = RepeatedStratifiedKFold(n_splits = 10 #rskf_cv = RepeatedStratifiedKFold(n_splits = 10
, n_repeats = 3 # , n_repeats = 3
, **rs) # , **rs)
mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)} mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
jacc_score_fn = {'jcc': make_scorer(jaccard_score)} jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
@ -137,6 +140,7 @@ scoreBT_mapD = {'bts_mcc' : 'MCC'
, 'bts_jcc' : 'JCC' , 'bts_jcc' : 'JCC'
} }
#gene_group = 'gene_name'
#%%############################################################################ #%%############################################################################
############################ ############################
# MultModelsCl() # MultModelsCl()
@ -145,17 +149,23 @@ scoreBT_mapD = {'bts_mcc' : 'MCC'
# Multiple Classification - Model Pipeline # Multiple Classification - Model Pipeline
def MultModelsCl(input_df, target def MultModelsCl(input_df, target
, sel_cv , sel_cv
, blind_test_df
, blind_test_target
, tts_split_type , tts_split_type
, resampling_type
#, group = None
, resampling_type = 'none' # default
, add_cm = True # adds confusion matrix based on cross_val_predict , add_cm = True # adds confusion matrix based on cross_val_predict
, add_yn = True # adds target var class numbers , add_yn = True # adds target var class numbers
, var_type = ['numerical', 'categorical','mixed'] , var_type = ['numerical', 'categorical','mixed']
, scale_numeric = ['min_max', 'std', 'min_max_neg', 'none'] , scale_numeric = ['min_max', 'std', 'min_max_neg', 'none']
, run_blind_test = True , run_blind_test = True
, return_formatted_output = True): , blind_test_df = pd.DataFrame()
, blind_test_target = pd.Series(dtype = int)
, return_formatted_output = True
, random_state = 42
, n_jobs = os.cpu_count() # the number of jobs should equal the number of CPU cores
):
''' '''
@ param input_df: input features @ param input_df: input features
@ -174,6 +184,24 @@ def MultModelsCl(input_df, target
Dict containing multiple classification scores for each model and mean of each Stratified Kfold including training Dict containing multiple classification scores for each model and mean of each Stratified Kfold including training
''' '''
#%% Func globals
rs = {'random_state': random_state}
njobs = {'n_jobs': n_jobs}
skf_cv = StratifiedKFold(n_splits = 10
#, shuffle = False, random_state= None)
, shuffle = True,**rs)
rskf_cv = RepeatedStratifiedKFold(n_splits = 10
, n_repeats = 3
, **rs)
logo = LeaveOneGroupOut()
# select CV type:
# if group == None:
# sel_cv = skf_cv
# else:
# sel_cv = logo
#====================================================== #======================================================
# Determine categorical and numerical features # Determine categorical and numerical features
#====================================================== #======================================================
@ -196,8 +224,9 @@ def MultModelsCl(input_df, target
# # t = [('num', MinMaxScaler(), numerical_ix) # # t = [('num', MinMaxScaler(), numerical_ix)
# # , ('cat', OneHotEncoder(), categorical_ix) ] # # , ('cat', OneHotEncoder(), categorical_ix) ]
# if var_type == 'mixed': # col_transform = ColumnTransformer(transformers = t
# t = [('cat', OneHotEncoder(), categorical_ix) ] # , remainder='passthrough')
if type(var_type) == list: if type(var_type) == list:
var_type = str(var_type[0]) var_type = str(var_type[0])
else: else:
@ -230,36 +259,36 @@ def MultModelsCl(input_df, target
# Specify multiple Classification Models # Specify multiple Classification Models
#====================================================== #======================================================
models = [('AdaBoost Classifier' , AdaBoostClassifier(**rs) ) models = [('AdaBoost Classifier' , AdaBoostClassifier(**rs) )
# , ('Bagging Classifier' , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True, verbose = 3, n_estimators = 100) ) , ('Bagging Classifier' , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True, verbose = 3, n_estimators = 100) )
# , ('Decision Tree' , DecisionTreeClassifier(**rs) ) , ('Decision Tree' , DecisionTreeClassifier(**rs) )
# , ('Extra Tree' , ExtraTreeClassifier(**rs) ) , ('Extra Tree' , ExtraTreeClassifier(**rs) )
# , ('Extra Trees' , ExtraTreesClassifier(**rs) ) , ('Extra Trees' , ExtraTreesClassifier(**rs) )
# , ('Gradient Boosting' , GradientBoostingClassifier(**rs) ) , ('Gradient Boosting' , GradientBoostingClassifier(**rs) )
# , ('Gaussian NB' , GaussianNB() ) , ('Gaussian NB' , GaussianNB() )
# , ('Gaussian Process' , GaussianProcessClassifier(**rs) ) , ('Gaussian Process' , GaussianProcessClassifier(**rs) )
# , ('K-Nearest Neighbors' , KNeighborsClassifier() ) , ('K-Nearest Neighbors' , KNeighborsClassifier() )
, ('LDA' , LinearDiscriminantAnalysis() ) , ('LDA' , LinearDiscriminantAnalysis() )
# , ('Logistic Regression' , LogisticRegression(**rs) ) , ('Logistic Regression' , LogisticRegression(**rs) )
# , ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs)) , ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs))
# , ('MLP' , MLPClassifier(max_iter = 500, **rs) ) , ('MLP' , MLPClassifier(max_iter = 500, **rs) )
#, ('Multinomial' , MultinomialNB() ) , ('Multinomial' , MultinomialNB() )
# , ('Naive Bayes' , BernoulliNB() ) , ('Naive Bayes' , BernoulliNB() )
# , ('Passive Aggresive' , PassiveAggressiveClassifier(**rs, **njobs) ) , ('Passive Aggresive' , PassiveAggressiveClassifier(**rs, **njobs) )
# , ('QDA' , QuadraticDiscriminantAnalysis() ) , ('QDA' , QuadraticDiscriminantAnalysis() )
# , ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) ) , ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) )
# # , ('Random Forest2' , RandomForestClassifier(min_samples_leaf = 5 , ('Random Forest2' , RandomForestClassifier(min_samples_leaf = 5
# , n_estimators = 1000 , n_estimators = 1000
# , bootstrap = True , bootstrap = True
# , oob_score = True , oob_score = True
# , **njobs , **njobs
# , **rs , **rs
# , max_features = 'auto') ) , max_features = 'auto') )
# , ('Ridge Classifier' , RidgeClassifier(**rs) ) , ('Ridge Classifier' , RidgeClassifier(**rs) )
# , ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) ) , ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) )
# , ('SVC' , SVC(**rs) ) , ('SVC' , SVC(**rs) )
# , ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) ) , ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) )
# , ('XGBoost' , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False, **njobs) ) , ('XGBoost' , XGBClassifier(**rs, verbosity = 0, use_label_encoder = False, **njobs) )
#
] ]
mm_skf_scoresD = {} mm_skf_scoresD = {}
@ -289,10 +318,11 @@ def MultModelsCl(input_df, target
print('\nRunning model pipeline:', model_pipeline) print('\nRunning model pipeline:', model_pipeline)
skf_cv_modD = cross_validate(model_pipeline cv_modD = cross_validate(model_pipeline
, input_df , input_df
, target , target
, cv = sel_cv , cv = sel_cv
#, groups = group
, scoring = scoring_fn , scoring = scoring_fn
, return_train_score = True) , return_train_score = True)
#============================== #==============================
@ -300,7 +330,7 @@ def MultModelsCl(input_df, target
#============================== #==============================
mm_skf_scoresD[model_name] = {} mm_skf_scoresD[model_name] = {}
for key, value in skf_cv_modD.items(): for key, value in cv_modD.items():
print('\nkey:', key, '\nvalue:', value) print('\nkey:', key, '\nvalue:', value)
print('\nmean value:', np.mean(value)) print('\nmean value:', np.mean(value))
mm_skf_scoresD[model_name][key] = round(np.mean(value),2) mm_skf_scoresD[model_name][key] = round(np.mean(value),2)
@ -308,7 +338,7 @@ def MultModelsCl(input_df, target
# ADD more info: meta data related to input df # ADD more info: meta data related to input df
mm_skf_scoresD[model_name]['resampling'] = resampling_type mm_skf_scoresD[model_name]['resampling'] = resampling_type
mm_skf_scoresD[model_name]['n_training_size'] = len(input_df) mm_skf_scoresD[model_name]['n_training_size'] = len(input_df)
#mm_skf_scoresD[model_name]['n_trainingY_ratio'] = round(Counter(target)[0]/Counter(target)[1], 2) mm_skf_scoresD[model_name]['n_trainingY_ratio'] = round(Counter(target)[0]/Counter(target)[1], 2)
mm_skf_scoresD[model_name]['n_features'] = len(input_df.columns) mm_skf_scoresD[model_name]['n_features'] = len(input_df.columns)
mm_skf_scoresD[model_name]['tts_split'] = tts_split_type mm_skf_scoresD[model_name]['tts_split'] = tts_split_type
@ -321,7 +351,12 @@ def MultModelsCl(input_df, target
cmD = {} cmD = {}
# Calculate cm # Calculate cm
y_pred = cross_val_predict(model_pipeline, input_df, target, cv = sel_cv, **njobs) y_pred = cross_val_predict(model_pipeline
, input_df
, target
, cv = sel_cv
#, groups = group
, **njobs)
#_tn, _fp, _fn, _tp = confusion_matrix(y_pred, y).ravel() # internally #_tn, _fp, _fn, _tp = confusion_matrix(y_pred, y).ravel() # internally
tn, fp, fn, tp = confusion_matrix(y_pred, target).ravel() tn, fp, fn, tp = confusion_matrix(y_pred, target).ravel()
@ -357,7 +392,7 @@ def MultModelsCl(input_df, target
# Build bts numbers dict # Build bts numbers dict
btD = {'n_blindY_neg' : Counter(blind_test_target)[0] btD = {'n_blindY_neg' : Counter(blind_test_target)[0]
, 'n_blindY_pos' : Counter(blind_test_target)[1] , 'n_blindY_pos' : Counter(blind_test_target)[1]
#, 'n_testY_ratio' : round(Counter(blind_test_target)[0]/Counter(blind_test_target)[1], 2) , 'n_testY_ratio' : round(Counter(blind_test_target)[0]/Counter(blind_test_target)[1], 2)
, 'n_test_size' : len(blind_test_df) } , 'n_test_size' : len(blind_test_df) }
# Update cmD+tnD dicts with btD # Update cmD+tnD dicts with btD
@ -526,7 +561,8 @@ def ProcessMultModelsCl(inputD = {}, blind_test_data = True):
sys.exit('\nFAIL: Could not merge metadata with CV and BT dfs') sys.exit('\nFAIL: Could not merge metadata with CV and BT dfs')
else: else:
print('\nConcatenting dfs not possible [WF],check numbers ') # print('\nConcatenting dfs not possible [WF],check numbers ')
print('\nOnly combining CV and metadata')
#------------------------------------- #-------------------------------------
# Combine WF+Metadata: Final output # Combine WF+Metadata: Final output

View file

@ -76,7 +76,12 @@ import argparse
import re import re
import itertools import itertools
from sklearn.model_selection import LeaveOneGroupOut from sklearn.model_selection import LeaveOneGroupOut
from sklearn.decomposition import PCA
#%% GLOBALS #%% GLOBALS
#rs = {'random_state': 42}
#njobs = {'n_jobs': os.cpu_count() } # the number of jobs should equal the number of CPU cores
scoring_fn = ({ 'mcc' : make_scorer(matthews_corrcoef) scoring_fn = ({ 'mcc' : make_scorer(matthews_corrcoef)
, 'fscore' : make_scorer(f1_score) , 'fscore' : make_scorer(f1_score)
, 'precision' : make_scorer(precision_score) , 'precision' : make_scorer(precision_score)
@ -86,7 +91,13 @@ scoring_fn = ({ 'mcc' : make_scorer(matthews_corrcoef)
, 'jcc' : make_scorer(jaccard_score) , 'jcc' : make_scorer(jaccard_score)
}) })
#skf_cv = StratifiedKFold(n_splits = 10
# #, shuffle = False, random_state= None)
# , shuffle = True,**rs)
#rskf_cv = RepeatedStratifiedKFold(n_splits = 10
# , n_repeats = 3
# , **rs)
mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)} mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
jacc_score_fn = {'jcc': make_scorer(jaccard_score)} jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
@ -139,21 +150,23 @@ scoreBT_mapD = {'bts_mcc' : 'MCC'
def MultModelsCl_logo_skf(input_df def MultModelsCl_logo_skf(input_df
, target , target
, sel_cv , sel_cv
, tts_split_type
, resampling_type
#, group = None
, blind_test_df = pd.DataFrame()
, blind_test_target = pd.Series(dtype = int)
, tts_split_type = "none"
#, group = 'none'
, resampling_type = 'none' # default
, add_cm = True # adds confusion matrix based on cross_val_predict , add_cm = True # adds confusion matrix based on cross_val_predict
, add_yn = True # adds target var class numbers , add_yn = True # adds target var class numbers
, var_type = ['numerical', 'categorical','mixed'] , var_type = ['numerical', 'categorical','mixed']
, scale_numeric = ['min_max', 'std', 'min_max_neg', 'none']
, run_blind_test = True , run_blind_test = True
, blind_test_df = pd.DataFrame()
, blind_test_target = pd.Series(dtype = int)
, return_formatted_output = True , return_formatted_output = True
, random_state = 42 , random_state = 42
, n_jobs = os.cpu_count() # the number of jobs should equal the number of CPU cores , n_jobs = os.cpu_count() # the number of jobs should equal the number of CPU cores
, ): ):
''' '''
@ param input_df: input features @ param input_df: input features
@ -165,7 +178,7 @@ def MultModelsCl_logo_skf(input_df
@param sel_cv: stratifiedK fold int or object to allow shuffle and random state to pass @param sel_cv: stratifiedK fold int or object to allow shuffle and random state to pass
@type: int or StratifiedKfold() @type: int or StratifiedKfold()
@var_type: numerical, categorical and mixed to determine what col_transform to apply (MinMaxScalar and/or one-ho t encoder) @var_type: numerical, categorical and mixed to determine what col_transform to apply (MinMaxScalar and/or one-hot encoder)
@type: list @type: list
returns returns
@ -185,8 +198,8 @@ def MultModelsCl_logo_skf(input_df
, **rs) , **rs)
logo = LeaveOneGroupOut() logo = LeaveOneGroupOut()
# # select CV type: # select CV type:
# if group == 'none': # if group == None:
# sel_cv = skf_cv # sel_cv = skf_cv
# else: # else:
# sel_cv = logo # sel_cv = logo
@ -201,52 +214,81 @@ def MultModelsCl_logo_skf(input_df
#====================================================== #======================================================
# Determine preprocessing steps ~ var_type # Determine preprocessing steps ~ var_type
#====================================================== #======================================================
if var_type == 'numerical':
t = [('num', MinMaxScaler(), numerical_ix)] # if var_type == 'numerical':
# t = [('num', MinMaxScaler(), numerical_ix)]
# if var_type == 'categorical':
# t = [('cat', OneHotEncoder(), categorical_ix)]
# # if var_type == 'mixed':
# # t = [('num', MinMaxScaler(), numerical_ix)
# # , ('cat', OneHotEncoder(), categorical_ix) ]
# col_transform = ColumnTransformer(transformers = t
# , remainder='passthrough')
if type(var_type) == list:
var_type = str(var_type[0])
else:
var_type = var_type
if var_type in ['numerical','mixed']:
if scale_numeric == ['none']:
t = [('cat', OneHotEncoder(), categorical_ix)]
if scale_numeric != ['none']:
if scale_numeric == ['min_max']:
scaler = MinMaxScaler()
if scale_numeric == ['min_max_neg']:
scaler = MinMaxScaler(feature_range=(-1, 1))
if scale_numeric == ['std']:
scaler = StandardScaler()
t = [('num', scaler, numerical_ix)
, ('cat', OneHotEncoder(), categorical_ix)]
if var_type == 'categorical': if var_type == 'categorical':
t = [('cat', OneHotEncoder(), categorical_ix)] t = [('cat', OneHotEncoder(), categorical_ix)]
if var_type == 'mixed':
t = [('num', MinMaxScaler(), numerical_ix)
, ('cat', OneHotEncoder(), categorical_ix) ]
col_transform = ColumnTransformer(transformers = t col_transform = ColumnTransformer(transformers = t
, remainder='passthrough') , remainder='passthrough')
#====================================================== #======================================================
# Specify multiple Classification Models # Specify multiple Classification Models
#====================================================== #======================================================
models = [('AdaBoost Classifier' , AdaBoostClassifier(**rs) ) models = [('AdaBoost Classifier' , AdaBoostClassifier(**rs) )
, ('Bagging Classifier' , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) ) , ('Bagging Classifier' , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True, verbose = 3, n_estimators = 100) )
, ('Decision Tree' , DecisionTreeClassifier(**rs) ) # , ('Decision Tree' , DecisionTreeClassifier(**rs) )
, ('Extra Tree' , ExtraTreeClassifier(**rs) ) # , ('Extra Tree' , ExtraTreeClassifier(**rs) )
, ('Extra Trees' , ExtraTreesClassifier(**rs) ) # , ('Extra Trees' , ExtraTreesClassifier(**rs) )
, ('Gradient Boosting' , GradientBoostingClassifier(**rs) ) # , ('Gradient Boosting' , GradientBoostingClassifier(**rs) )
, ('Gaussian NB' , GaussianNB() ) # , ('Gaussian NB' , GaussianNB() )
, ('Gaussian Process' , GaussianProcessClassifier(**rs) ) # , ('Gaussian Process' , GaussianProcessClassifier(**rs) )
, ('K-Nearest Neighbors' , KNeighborsClassifier() ) # , ('K-Nearest Neighbors' , KNeighborsClassifier() )
, ('LDA' , LinearDiscriminantAnalysis() ) # , ('LDA' , LinearDiscriminantAnalysis() )
, ('Logistic Regression' , LogisticRegression(**rs) ) # , ('Logistic Regression' , LogisticRegression(**rs) )
, ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs)) # , ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs))
, ('MLP' , MLPClassifier(max_iter = 500, **rs) ) # , ('MLP' , MLPClassifier(max_iter = 500, **rs) )
, ('Multinomial' , MultinomialNB() ) # , ('Multinomial' , MultinomialNB() )
, ('Naive Bayes' , BernoulliNB() ) # , ('Naive Bayes' , BernoulliNB() )
, ('Passive Aggresive' , PassiveAggressiveClassifier(**rs, **njobs) ) # , ('Passive Aggresive' , PassiveAggressiveClassifier(**rs, **njobs) )
, ('QDA' , QuadraticDiscriminantAnalysis() ) # , ('QDA' , QuadraticDiscriminantAnalysis() )
, ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000 ) ) # , ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) )
, ('Random Forest2' , RandomForestClassifier(min_samples_leaf = 5 # , ('Random Forest2' , RandomForestClassifier(min_samples_leaf = 5
, n_estimators = 1000 # , n_estimators = 1000
, bootstrap = True # , bootstrap = True
, oob_score = True # , oob_score = True
, **njobs # , **njobs
, **rs # , **rs
, max_features = 'auto') ) # , max_features = 'auto') )
, ('Ridge Classifier' , RidgeClassifier(**rs) ) # , ('Ridge Classifier' , RidgeClassifier(**rs) )
, ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) ) # , ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) )
, ('SVC' , SVC(**rs) ) # , ('SVC' , SVC(**rs) )
, ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) ) # , ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) )
, ('XGBoost' , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False) ) # , ('XGBoost' , XGBClassifier(**rs, verbosity = 0, use_label_encoder = False, **njobs) )
] ]
mm_skf_scoresD = {} mm_skf_scoresD = {}
@ -269,6 +311,12 @@ def MultModelsCl_logo_skf(input_df
('prep' , col_transform) ('prep' , col_transform)
, ('model' , model_fn)]) , ('model' , model_fn)])
# model_pipeline = Pipeline([
# ('prep' , col_transform)
# , ('pca' , PCA(n_components = 2))
# , ('model' , model_fn)])
print('\nRunning model pipeline:', model_pipeline) print('\nRunning model pipeline:', model_pipeline)
cv_modD = cross_validate(model_pipeline cv_modD = cross_validate(model_pipeline
, input_df , input_df
@ -359,7 +407,8 @@ def MultModelsCl_logo_skf(input_df
bts_mcc_score = round(matthews_corrcoef(blind_test_target, bts_predict),2) bts_mcc_score = round(matthews_corrcoef(blind_test_target, bts_predict),2)
print('\nMCC on Blind test:' , bts_mcc_score) print('\nMCC on Blind test:' , bts_mcc_score)
print('\nAccuracy on Blind test:', round(accuracy_score(blind_test_target, bts_predict),2)) #print('\nAccuracy on Blind test:', round(accuracy_score(blind_test_target, bts_predict),2))
print('\nMCC on Training:' , mm_skf_scoresD[model_name]['test_mcc'] )
mm_skf_scoresD[model_name]['bts_mcc'] = bts_mcc_score mm_skf_scoresD[model_name]['bts_mcc'] = bts_mcc_score
mm_skf_scoresD[model_name]['bts_fscore'] = round(f1_score(blind_test_target, bts_predict),2) mm_skf_scoresD[model_name]['bts_fscore'] = round(f1_score(blind_test_target, bts_predict),2)
@ -387,8 +436,7 @@ def MultModelsCl_logo_skf(input_df
############################ ############################
#Processes the dict from above if return_formatted_output = True #Processes the dict from above if return_formatted_output = True
def ProcessMultModelsCl(inputD = {} def ProcessMultModelsCl(inputD = {}, blind_test_data = True):
, blind_test_data = True):
scoresDF = pd.DataFrame(inputD) scoresDF = pd.DataFrame(inputD)

View file

@ -77,7 +77,7 @@ fooD = MultModelsCl(input_df = df2['X_ros']
, blind_test_df = df2['X_bts'] , blind_test_df = df2['X_bts']
, blind_test_target = df2['y_bts'] , blind_test_target = df2['y_bts']
, tts_split_type = spl_type , tts_split_type = spl_type
, resampling_type = 'none' # default , resampling_type = 'XXXX' # default
, var_type = ['mixed'] , var_type = ['mixed']
, scale_numeric = ['min_max'] , scale_numeric = ['min_max']
, return_formatted_output = False , return_formatted_output = False

View file

@ -93,6 +93,7 @@ for gene, drug in ml_gene_drugD.items():
, sel_cv = skf_cv , sel_cv = skf_cv
, blind_test_df = tempD['X_bts'] , blind_test_df = tempD['X_bts']
, blind_test_target = tempD['y_bts'] , blind_test_target = tempD['y_bts']
, scale_numeric = ['min_max']
, add_cm = True , add_cm = True
, add_yn = True , add_yn = True
, return_formatted_output = True) , return_formatted_output = True)
@ -103,5 +104,5 @@ for gene, drug in ml_gene_drugD.items():
out_wf= pd.concat(mmDD, ignore_index = True) out_wf= pd.concat(mmDD, ignore_index = True)
out_wf_f = out_wf.sort_values(by = ['resampling', 'source_data', 'MCC'], ascending = [True, True, False], inplace = False) out_wf_f = out_wf.sort_values(by = ['resampling', 'source_data', 'MCC'], ascending = [True, True, False], inplace = False)
out_wf_f.to_csv(('/home/tanu/git/Data/ml_combined/genes/'+ out_filename), index = False) out_wf_f.to_csv(out_filename, index = False)