From de5c1270be1a8b5927749f21ea2749ea04f1a174 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall
Date: Sun, 10 Jul 2022 12:32:52 +0100
Subject: [PATCH] align MultClfs.py and MultClfs_logo_skf.py for consistency

Align the MultModelsCl() signatures in MultClfs.py and
MultClfs_logo_skf.py (tts_split_type and resampling_type are now
required, and random_state/n_jobs are exposed as parameters), and
parameterise the output directory in cm_logo_skf.py.
---
 scripts/ml/combined_model/cm_logo_skf.py     |   7 +-
 scripts/ml/ml_functions/MultClfs.py          | 150 ++++++++++-------
 scripts/ml/ml_functions/MultClfs_logo_skf.py | 152 ++++++++++++------
 .../ml/ml_functions/test_func_singlegene.py  |   4 +-
 scripts/ml/ml_iterator.py                    |   3 +-
 5 files changed, 201 insertions(+), 115 deletions(-)

diff --git a/scripts/ml/combined_model/cm_logo_skf.py b/scripts/ml/combined_model/cm_logo_skf.py
index 4efa0f3..0ad72a2 100755
--- a/scripts/ml/combined_model/cm_logo_skf.py
+++ b/scripts/ml/combined_model/cm_logo_skf.py
@@ -105,6 +105,7 @@ def CMLogoSkf(cm_input_df
               , target_var = 'dst_mode'
               , gene_group = 'gene_name'
               , std_gene_omit = []
+              , output_dir = outdir
               , file_suffix = ""
               ):
 
@@ -138,7 +139,7 @@ def CMLogoSkf(cm_input_df
     # else:
     #     file_suffix = file_suffix
 
-    outFile = outdir + str(n_tr_genes+1) + "genes_" + tts_split_type + '_' + file_suffix + ".csv"
+    outFile = output_dir + str(n_tr_genes+1) + "genes_" + tts_split_type + '_' + file_suffix + ".csv"
 
     print(outFile)
 
@@ -170,7 +171,7 @@ def CMLogoSkf(cm_input_df
     #cm_bts_y = cm_test_df.loc[:, 'dst_mode']
     cm_bts_y = cm_test_df.loc[:, target_var]
 
-    print('\nTEST data dim:', cm_bts_X.shape
+    print('\nTEST data dim:' , cm_bts_X.shape
           , '\nTEST Target dim:', cm_bts_y.shape)
 
     print("Running Multiple models on LOGO with SKF")
@@ -209,4 +210,4 @@ def CMLogoSkf(cm_input_df
 # Actual Data
 #===============
 CMLogoSkf(cm_input_df = combined_df_actual, file_suffix = "actual")
-CMLogoSkf(cm_input_df = combined_df_actual, std_gene_omit=['alr'], file_suffix = "actual")
+# CMLogoSkf(cm_input_df = combined_df_actual, std_gene_omit=['alr'], file_suffix = "actual")
diff --git a/scripts/ml/ml_functions/MultClfs.py b/scripts/ml/ml_functions/MultClfs.py
index 1f46df9..3e6c729 100755
--- a/scripts/ml/ml_functions/MultClfs.py
+++ b/scripts/ml/ml_functions/MultClfs.py
@@ -74,10 +74,13 @@ from sklearn.impute import KNNImputer as KNN
 import json
 import argparse
 import re
+import itertools
+from sklearn.model_selection import LeaveOneGroupOut
 from sklearn.decomposition import PCA
+
 #%% GLOBALS
-rs = {'random_state': 42}
-njobs = {'n_jobs': os.cpu_count() } # the number of jobs should equal the number of CPU cores
+#rs = {'random_state': 42}
+#njobs = {'n_jobs': os.cpu_count() } # the number of jobs should equal the number of CPU cores
 
 scoring_fn = ({ 'mcc'        : make_scorer(matthews_corrcoef)
               , 'fscore'     : make_scorer(f1_score)
@@ -88,13 +91,13 @@ scoring_fn = ({ 'mcc'        : make_scorer(matthews_corrcoef)
               , 'jcc'        : make_scorer(jaccard_score)
               })
 
-skf_cv = StratifiedKFold(n_splits = 10
-                         #, shuffle = False, random_state= None)
-                         , shuffle = True,**rs)
+#skf_cv = StratifiedKFold(n_splits = 10
+#                         #, shuffle = False, random_state= None)
+#                         , shuffle = True,**rs)
 
-rskf_cv = RepeatedStratifiedKFold(n_splits = 10
-                                  , n_repeats = 3
-                                  , **rs)
+#rskf_cv = RepeatedStratifiedKFold(n_splits = 10
+#                                  , n_repeats = 3
+#                                  , **rs)
 
 mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
 jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
@@ -137,6 +140,7 @@ scoreBT_mapD = {'bts_mcc'   : 'MCC'
                 , 'bts_jcc' : 'JCC'
                 }
 
+#gene_group = 'gene_name'
 #%%############################################################################
 ############################
 # MultModelsCl()
@@ -145,17 +149,23 @@
 ############################
 # Multiple Classification - Model Pipeline
 def MultModelsCl(input_df, target
                  , sel_cv
-                 , blind_test_df
-                 , blind_test_target
-                 , tts_split_type
-
-                 , resampling_type = 'none' # default
+                 , tts_split_type
+                 , resampling_type
+                 #, group = None
+
                  , add_cm = True # adds confusion matrix based on cross_val_predict
                  , add_yn = True # adds target var class numbers
-                 , var_type = ['numerical', 'categorical','mixed']
+                 , var_type = ['numerical', 'categorical','mixed']
                  , scale_numeric = ['min_max', 'std', 'min_max_neg', 'none']
+                 , run_blind_test = True
-                 , return_formatted_output = True):
+                 , blind_test_df = pd.DataFrame()
+                 , blind_test_target = pd.Series(dtype = int)
+                 , return_formatted_output = True
+
+                 , random_state = 42
+                 , n_jobs = os.cpu_count() # the number of jobs should equal the number of CPU cores
+                 ):
 
     '''
     @ param input_df: input features
@@ -173,7 +183,25 @@ def MultModelsCl(input_df, target
     returns
           Dict containing multiple classification scores for each model and mean of each Stratified Kfold including training
     '''
+
+#%% Func globals
+    rs = {'random_state': random_state}
+    njobs = {'n_jobs': n_jobs}
+
+    skf_cv = StratifiedKFold(n_splits = 10
+                             #, shuffle = False, random_state= None)
+                             , shuffle = True,**rs)
 
+    rskf_cv = RepeatedStratifiedKFold(n_splits = 10
+                                      , n_repeats = 3
+                                      , **rs)
+    logo = LeaveOneGroupOut()
+
+    # select CV type:
+    # if group == None:
+    #     sel_cv = skf_cv
+    # else:
+    #     sel_cv = logo
     #======================================================
     # Determine categorical and numerical features
     #======================================================
@@ -196,8 +224,9 @@ def MultModelsCl(input_df, target
     # # t = [('num', MinMaxScaler(), numerical_ix)
     # #      , ('cat', OneHotEncoder(), categorical_ix) ]
 
-    # if var_type == 'mixed':
-    #     t = [('cat', OneHotEncoder(), categorical_ix) ]
+    # col_transform = ColumnTransformer(transformers = t
+    #                                   , remainder='passthrough')
+
     if type(var_type) == list:
         var_type = str(var_type[0])
     else:
         var_type = var_type
@@ -229,37 +258,37 @@ def MultModelsCl(input_df, target
     #======================================================
     # Specify multiple Classification Models
     #======================================================
-    models = [('AdaBoost Classifier'     , AdaBoostClassifier(**rs) )
-              # , ('Bagging Classifier'  , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True, verbose = 3, n_estimators = 100) )
-              # , ('Decision Tree'       , DecisionTreeClassifier(**rs) )
-              # , ('Extra Tree'          , ExtraTreeClassifier(**rs) )
-              # , ('Extra Trees'         , ExtraTreesClassifier(**rs) )
-              # , ('Gradient Boosting'   , GradientBoostingClassifier(**rs) )
-              # , ('Gaussian NB'         , GaussianNB() )
-              # , ('Gaussian Process'    , GaussianProcessClassifier(**rs) )
-              # , ('K-Nearest Neighbors' , KNeighborsClassifier() )
-              , ('LDA'                   , LinearDiscriminantAnalysis() )
-              # , ('Logistic Regression' , LogisticRegression(**rs) )
-              # , ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs))
-              # , ('MLP'                 , MLPClassifier(max_iter = 500, **rs) )
-              #, ('Multinomial'          , MultinomialNB() )
-              # , ('Naive Bayes'         , BernoulliNB() )
-              # , ('Passive Aggresive'   , PassiveAggressiveClassifier(**rs, **njobs) )
-              # , ('QDA'                 , QuadraticDiscriminantAnalysis() )
-              # , ('Random Forest'       , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) )
-              # # , ('Random Forest2'    , RandomForestClassifier(min_samples_leaf = 5
-              #                                                   , n_estimators = 1000
-              #                                                   , bootstrap = True
-              #                                                   , oob_score = True
-              #                                                   , **njobs
-              #                                                   , **rs
-              #                                                   , max_features = 'auto') )
-              # , ('Ridge Classifier'    , RidgeClassifier(**rs) )
-              # , ('Ridge ClassifierCV'  , RidgeClassifierCV(cv = 3) )
-              # , ('SVC'                 , SVC(**rs) )
-              # , ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) )
-              # , ('XGBoost'             , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False, **njobs) )
-              #
+    models = [('AdaBoost Classifier'    , AdaBoostClassifier(**rs) )
+              , ('Bagging Classifier'   , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True, verbose = 3, n_estimators = 100) )
+              , ('Decision Tree'        , DecisionTreeClassifier(**rs) )
+              , ('Extra Tree'           , ExtraTreeClassifier(**rs) )
+              , ('Extra Trees'          , ExtraTreesClassifier(**rs) )
+              , ('Gradient Boosting'    , GradientBoostingClassifier(**rs) )
+              , ('Gaussian NB'          , GaussianNB() )
+              , ('Gaussian Process'     , GaussianProcessClassifier(**rs) )
+              , ('K-Nearest Neighbors'  , KNeighborsClassifier() )
+              , ('LDA'                  , LinearDiscriminantAnalysis() )
+              , ('Logistic Regression'  , LogisticRegression(**rs) )
+              , ('Logistic RegressionCV', LogisticRegressionCV(cv = 3, **rs))
+              , ('MLP'                  , MLPClassifier(max_iter = 500, **rs) )
+              , ('Multinomial'          , MultinomialNB() )
+              , ('Naive Bayes'          , BernoulliNB() )
+              , ('Passive Aggressive'   , PassiveAggressiveClassifier(**rs, **njobs) )
+              , ('QDA'                  , QuadraticDiscriminantAnalysis() )
+              , ('Random Forest'        , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) )
+              , ('Random Forest2'       , RandomForestClassifier(min_samples_leaf = 5
+                                                                 , n_estimators = 1000
+                                                                 , bootstrap = True
+                                                                 , oob_score = True
+                                                                 , **njobs
+                                                                 , **rs
+                                                                 , max_features = 'auto') )
+              , ('Ridge Classifier'     , RidgeClassifier(**rs) )
+              , ('Ridge ClassifierCV'   , RidgeClassifierCV(cv = 3) )
+              , ('SVC'                  , SVC(**rs) )
+              , ('Stochastic GDescent'  , SGDClassifier(**rs, **njobs) )
+              , ('XGBoost'              , XGBClassifier(**rs, verbosity = 0, use_label_encoder = False, **njobs) )
+
               ]
 
     mm_skf_scoresD = {}
@@ -289,10 +318,11 @@ def MultModelsCl(input_df, target
 
         print('\nRunning model pipeline:', model_pipeline)
 
-        skf_cv_modD = cross_validate(model_pipeline
+        cv_modD = cross_validate(model_pipeline
                                      , input_df
                                      , target
                                      , cv = sel_cv
+                                     #, groups = group
                                      , scoring = scoring_fn
                                      , return_train_score = True)
         #==============================
@@ -300,7 +330,7 @@ def MultModelsCl(input_df, target
         #==============================
         mm_skf_scoresD[model_name] = {}
 
-        for key, value in skf_cv_modD.items():
+        for key, value in cv_modD.items():
             print('\nkey:', key, '\nvalue:', value)
             print('\nmean value:', np.mean(value))
             mm_skf_scoresD[model_name][key] = round(np.mean(value),2)
@@ -308,7 +338,7 @@ def MultModelsCl(input_df, target
         # ADD more info: meta data related to input df
         mm_skf_scoresD[model_name]['resampling']      = resampling_type
         mm_skf_scoresD[model_name]['n_training_size'] = len(input_df)
-        #mm_skf_scoresD[model_name]['n_trainingY_ratio'] = round(Counter(target)[0]/Counter(target)[1], 2)
+        mm_skf_scoresD[model_name]['n_trainingY_ratio'] = round(Counter(target)[0]/Counter(target)[1], 2)
         mm_skf_scoresD[model_name]['n_features']      = len(input_df.columns)
         mm_skf_scoresD[model_name]['tts_split']       = tts_split_type
 
@@ -321,7 +351,12 @@ def MultModelsCl(input_df, target
             cmD = {}
 
             # Calculate cm
-            y_pred = cross_val_predict(model_pipeline, input_df, target, cv = sel_cv, **njobs)
+            y_pred = cross_val_predict(model_pipeline
+                                       , input_df
+                                       , target
+                                       , cv = sel_cv
+                                       #, groups = group
+                                       , **njobs)
             #_tn, _fp, _fn, _tp = confusion_matrix(y_pred, y).ravel() # internally
             tn, fp, fn, tp = confusion_matrix(y_pred, target).ravel()
@@ -357,7 +392,7 @@ def MultModelsCl(input_df, target
             # Build bts numbers dict
             btD = {'n_blindY_neg'    : Counter(blind_test_target)[0]
                    , 'n_blindY_pos'  : Counter(blind_test_target)[1]
-                   #, 'n_testY_ratio' : round(Counter(blind_test_target)[0]/Counter(blind_test_target)[1], 2)
+                   , 'n_testY_ratio'  : round(Counter(blind_test_target)[0]/Counter(blind_test_target)[1], 2)
                    , 'n_test_size'    : len(blind_test_df) }
 
             # Update cmD+tnD dicts with btD
@@ -371,9 +406,9 @@ def MultModelsCl(input_df, target
             bts_predict = model_pipeline.predict(blind_test_df)
 
             bts_mcc_score = round(matthews_corrcoef(blind_test_target, bts_predict),2)
-            print('\nMCC on Blind test:' , bts_mcc_score)
+            print('\nMCC on Blind test:'  , bts_mcc_score)
             #print('\nAccuracy on Blind test:', round(accuracy_score(blind_test_target, bts_predict),2))
-            print('\nMCC on Training:' , mm_skf_scoresD[model_name]['test_mcc'] )
+            print('\nMCC on Training:'    , mm_skf_scoresD[model_name]['test_mcc'] )
 
             mm_skf_scoresD[model_name]['bts_mcc']    = bts_mcc_score
             mm_skf_scoresD[model_name]['bts_fscore'] = round(f1_score(blind_test_target, bts_predict),2)
@@ -384,7 +419,7 @@ def MultModelsCl(input_df, target
             mm_skf_scoresD[model_name]['bts_jcc']    = round(jaccard_score(blind_test_target, bts_predict),2)
             #mm_skf_scoresD[model_name]['diff_mcc'] = train_test_diff_MCC
-
+    #return(mm_skf_scoresD)
     #============================
     # Process the dict to have WF
@@ -526,7 +561,8 @@ def ProcessMultModelsCl(inputD = {}, blind_test_data = True):
             sys.exit('\nFAIL: Could not merge metadata with CV and BT dfs')
 
     else:
-        print('\nConcatenting dfs not possible [WF],check numbers ')
+        # print('\nConcatenting dfs not possible [WF],check numbers ')
+        print('\nOnly combining CV and metadata')
 
     #-------------------------------------
     # Combine WF+Metadata: Final output
diff --git a/scripts/ml/ml_functions/MultClfs_logo_skf.py b/scripts/ml/ml_functions/MultClfs_logo_skf.py
index 68eb906..1b4c2ff 100755
--- a/scripts/ml/ml_functions/MultClfs_logo_skf.py
+++ b/scripts/ml/ml_functions/MultClfs_logo_skf.py
@@ -76,7 +76,12 @@ import argparse
 import re
 import itertools
 from sklearn.model_selection import LeaveOneGroupOut
+from sklearn.decomposition import PCA
+
 #%% GLOBALS
+#rs = {'random_state': 42}
+#njobs = {'n_jobs': os.cpu_count() } # the number of jobs should equal the number of CPU cores
+
 scoring_fn = ({ 'mcc'        : make_scorer(matthews_corrcoef)
               , 'fscore'     : make_scorer(f1_score)
               , 'precision'  : make_scorer(precision_score)
@@ -86,7 +91,13 @@ scoring_fn = ({ 'mcc'        : make_scorer(matthews_corrcoef)
               , 'jcc'        : make_scorer(jaccard_score)
               })
 
+#skf_cv = StratifiedKFold(n_splits = 10
+#                         #, shuffle = False, random_state= None)
+#                         , shuffle = True,**rs)
 
+#rskf_cv = RepeatedStratifiedKFold(n_splits = 10
+#                                  , n_repeats = 3
+#                                  , **rs)
 mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
 jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
 
@@ -139,21 +150,23 @@
 def MultModelsCl_logo_skf(input_df
                           , target
                           , sel_cv
-
-                          , blind_test_df = pd.DataFrame()
-                          , blind_test_target = pd.Series(dtype = int)
-                          , tts_split_type = "none"
-                          #, group = 'none'
-
-                          , resampling_type = 'none' # default
+                          , tts_split_type
+                          , resampling_type
+                          #, group = None
+
                           , add_cm = True # adds confusion matrix based on cross_val_predict
                           , add_yn = True # adds target var class numbers
                           , var_type = ['numerical', 'categorical','mixed']
+                          , scale_numeric = ['min_max', 'std', 'min_max_neg', 'none']
+                          , run_blind_test = True
+                          , blind_test_df = pd.DataFrame()
+                          , blind_test_target = pd.Series(dtype = int)
                           , return_formatted_output = True
+                          , random_state = 42
                           , n_jobs = os.cpu_count() # the number of jobs should equal the number of CPU cores
-                          , ):
+                          ):
 
     '''
     @ param input_df: input features
@@ -165,7 +178,7 @@ def MultModelsCl_logo_skf(input_df
     @param skv_cv: stratifiedK fold int or object to allow shuffle and random state to pass
     @type: int or StratifiedKfold()
 
-    @var_type: numerical, categorical and mixed to determine what col_transform to apply (MinMaxScalar and/or one-ho t encoder)
+    @var_type: numerical, categorical and mixed to determine what col_transform to apply (MinMaxScaler and/or one-hot encoder)
     @type: list
 
     returns
@@ -185,8 +198,8 @@ def MultModelsCl_logo_skf(input_df
                                       , **rs)
     logo = LeaveOneGroupOut()
 
-    # # select CV type:
-    # if group == 'none':
+    # select CV type:
+    # if group == None:
     #     sel_cv = skf_cv
     # else:
     #     sel_cv = logo
@@ -201,52 +214,81 @@ def MultModelsCl_logo_skf(input_df
     #======================================================
     # Determine preprocessing steps ~ var_type
    #======================================================
-    if var_type == 'numerical':
-        t = [('num', MinMaxScaler(), numerical_ix)]
+
+    # if var_type == 'numerical':
+    #     t = [('num', MinMaxScaler(), numerical_ix)]
+    # if var_type == 'categorical':
+    #     t = [('cat', OneHotEncoder(), categorical_ix)]
+
+    # # if var_type == 'mixed':
+    # #     t = [('num', MinMaxScaler(), numerical_ix)
+    # #          , ('cat', OneHotEncoder(), categorical_ix) ]
+
+    # col_transform = ColumnTransformer(transformers = t
+    #                                   , remainder='passthrough')
+
+    if type(var_type) == list:
+        var_type = str(var_type[0])
+    else:
+        var_type = var_type
+
+    if var_type in ['numerical','mixed']:
+        if scale_numeric == ['none']:
+            t = [('cat', OneHotEncoder(), categorical_ix)]
+        if scale_numeric != ['none']:
+            if scale_numeric == ['min_max']:
+                scaler = MinMaxScaler()
+            if scale_numeric == ['min_max_neg']:
+                scaler = MinMaxScaler(feature_range=(-1, 1))
+            if scale_numeric == ['std']:
+                scaler = StandardScaler()
+
+            t = [('num', scaler, numerical_ix)
+                 , ('cat', OneHotEncoder(), categorical_ix)]
 
     if var_type == 'categorical':
         t = [('cat', OneHotEncoder(), categorical_ix)]
-
-    if var_type == 'mixed':
-        t = [('num', MinMaxScaler(), numerical_ix)
-             , ('cat', OneHotEncoder(), categorical_ix) ]
 
     col_transform = ColumnTransformer(transformers = t
                                       , remainder='passthrough')
+
     #======================================================
     # Specify multiple Classification Models
     #======================================================
-    models = [('AdaBoost Classifier'    , AdaBoostClassifier(**rs) )
-              , ('Bagging Classifier'   , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) )
-              , ('Decision Tree'        , DecisionTreeClassifier(**rs) )
-              , ('Extra Tree'           , ExtraTreeClassifier(**rs) )
-              , ('Extra Trees'          , ExtraTreesClassifier(**rs) )
-              , ('Gradient Boosting'    , GradientBoostingClassifier(**rs) )
-              , ('Gaussian NB'          , GaussianNB() )
-              , ('Gaussian Process'     , GaussianProcessClassifier(**rs) )
-              , ('K-Nearest Neighbors'  , KNeighborsClassifier() )
-              , ('LDA'                  , LinearDiscriminantAnalysis() )
-              , ('Logistic Regression'  , LogisticRegression(**rs) )
-              , ('Logistic RegressionCV', LogisticRegressionCV(cv = 3, **rs))
-              , ('MLP'                  , MLPClassifier(max_iter = 500, **rs) )
-              , ('Multinomial'          , MultinomialNB() )
-              , ('Naive Bayes'          , BernoulliNB() )
-              , ('Passive Aggresive'    , PassiveAggressiveClassifier(**rs, **njobs) )
-              , ('QDA'                  , QuadraticDiscriminantAnalysis() )
-              , ('Random Forest'        , RandomForestClassifier(**rs, n_estimators = 1000 ) )
-              , ('Random Forest2'       , RandomForestClassifier(min_samples_leaf = 5
-                                                                 , n_estimators = 1000
-                                                                 , bootstrap = True
-                                                                 , oob_score = True
-                                                                 , **njobs
-                                                                 , **rs
-                                                                 , max_features = 'auto') )
-              , ('Ridge Classifier'     , RidgeClassifier(**rs) )
-              , ('Ridge ClassifierCV'   , RidgeClassifierCV(cv = 3) )
-              , ('SVC'                  , SVC(**rs) )
-              , ('Stochastic GDescent'  , SGDClassifier(**rs, **njobs) )
-              , ('XGBoost'              , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False) )
+    models = [('AdaBoost Classifier'    , AdaBoostClassifier(**rs) )
+              , ('Bagging Classifier'   , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True, verbose = 3, n_estimators = 100) )
+              # , ('Decision Tree'      , DecisionTreeClassifier(**rs) )
+              # , ('Extra Tree'         , ExtraTreeClassifier(**rs) )
+              # , ('Extra Trees'        , ExtraTreesClassifier(**rs) )
+              # , ('Gradient Boosting'  , GradientBoostingClassifier(**rs) )
+              # , ('Gaussian NB'        , GaussianNB() )
+              # , ('Gaussian Process'   , GaussianProcessClassifier(**rs) )
+              # , ('K-Nearest Neighbors', KNeighborsClassifier() )
+              # , ('LDA'                , LinearDiscriminantAnalysis() )
+              # , ('Logistic Regression', LogisticRegression(**rs) )
+              # , ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs))
+              # , ('MLP'                , MLPClassifier(max_iter = 500, **rs) )
+              # , ('Multinomial'        , MultinomialNB() )
+              # , ('Naive Bayes'        , BernoulliNB() )
+              # , ('Passive Aggressive' , PassiveAggressiveClassifier(**rs, **njobs) )
+              # , ('QDA'                , QuadraticDiscriminantAnalysis() )
+              # , ('Random Forest'      , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) )
+              # , ('Random Forest2'     , RandomForestClassifier(min_samples_leaf = 5
+              #                                                  , n_estimators = 1000
+              #                                                  , bootstrap = True
+              #                                                  , oob_score = True
+              #                                                  , **njobs
+              #                                                  , **rs
+              #                                                  , max_features = 'auto') )
+              # , ('Ridge Classifier'   , RidgeClassifier(**rs) )
+              # , ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) )
+              # , ('SVC'                , SVC(**rs) )
+              # , ('Stochastic GDescent', SGDClassifier(**rs, **njobs) )
+              # , ('XGBoost'            , XGBClassifier(**rs, verbosity = 0, use_label_encoder = False, **njobs) )
               ]
 
     mm_skf_scoresD = {}
@@ -268,6 +310,12 @@ def MultModelsCl_logo_skf(input_df
         model_pipeline = Pipeline([
             ('prep'    , col_transform)
             , ('model' , model_fn)])
+
+        # model_pipeline = Pipeline([
+        #     ('prep'    , col_transform)
+        #     , ('pca'   , PCA(n_components = 2))
+        #     , ('model' , model_fn)])
+
         print('\nRunning model pipeline:', model_pipeline)
 
         cv_modD = cross_validate(model_pipeline
@@ -358,9 +406,10 @@ def MultModelsCl_logo_skf(input_df
             bts_predict = model_pipeline.predict(blind_test_df)
 
             bts_mcc_score = round(matthews_corrcoef(blind_test_target, bts_predict),2)
-            print('\nMCC on Blind test:' , bts_mcc_score)
-            print('\nAccuracy on Blind test:', round(accuracy_score(blind_test_target, bts_predict),2))
-
+            print('\nMCC on Blind test:'  , bts_mcc_score)
+            #print('\nAccuracy on Blind test:', round(accuracy_score(blind_test_target, bts_predict),2))
+            print('\nMCC on Training:'    , mm_skf_scoresD[model_name]['test_mcc'] )
+
             mm_skf_scoresD[model_name]['bts_mcc']       = bts_mcc_score
             mm_skf_scoresD[model_name]['bts_fscore']    = round(f1_score(blind_test_target, bts_predict),2)
             mm_skf_scoresD[model_name]['bts_precision'] = round(precision_score(blind_test_target, bts_predict),2)
@@ -387,8 +436,7 @@ def MultModelsCl_logo_skf(input_df
 ############################
 #Processes the dict from above if use_formatted_output = True
-def ProcessMultModelsCl(inputD = {}
-                        , blind_test_data = True):
+def ProcessMultModelsCl(inputD = {}, blind_test_data = True):
 
     scoresDF = pd.DataFrame(inputD)
diff --git a/scripts/ml/ml_functions/test_func_singlegene.py b/scripts/ml/ml_functions/test_func_singlegene.py
index 6abccb4..729fafe 100644
--- a/scripts/ml/ml_functions/test_func_singlegene.py
+++ b/scripts/ml/ml_functions/test_func_singlegene.py
@@ -26,7 +26,7 @@ skf_cv = StratifiedKFold(n_splits = 10
 #                         , n_repeats = 3
 #                         , **rs)
 # param dict for getmldata()
-gene_model_paramD = {'data_combined_model' : False
+gene_model_paramD = {'data_combined_model'        : False
                      , 'use_or'                   : False
                      , 'omit_all_genomic_features': False
                      , 'write_maskfile'           : False
@@ -77,7 +77,7 @@ fooD = MultModelsCl(input_df = df2['X_ros']
                     , blind_test_df = df2['X_bts']
                     , blind_test_target = df2['y_bts']
                     , tts_split_type = spl_type
-                    , resampling_type = 'none' # default
+                    , resampling_type = 'XXXX' # placeholder
                     , var_type = ['mixed']
                     , scale_numeric = ['min_max']
                     , return_formatted_output = False
diff --git a/scripts/ml/ml_iterator.py b/scripts/ml/ml_iterator.py
index 7f0aafb..8ebb88c 100755
--- a/scripts/ml/ml_iterator.py
+++ b/scripts/ml/ml_iterator.py
@@ -93,6 +93,7 @@ for gene, drug in ml_gene_drugD.items():
                            , sel_cv = skf_cv
                            , blind_test_df = tempD['X_bts']
                            , blind_test_target = tempD['y_bts']
+                           , scale_numeric = ['min_max']
                            , add_cm = True
                            , add_yn = True
                            , return_formatted_output = True)
@@ -103,5 +104,5 @@ for gene, drug in ml_gene_drugD.items():
 
 out_wf= pd.concat(mmDD, ignore_index = True)
 
 out_wf_f = out_wf.sort_values(by = ['resampling', 'source_data', 'MCC'], ascending = [True, True, False], inplace = False)
-out_wf_f.to_csv(('/home/tanu/git/Data/ml_combined/genes/'+ out_filename), index = False)
+out_wf_f.to_csv(out_filename, index = False)
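
Usage sketch (not part of the patch): a minimal, hypothetical call against the
aligned MultModelsCl() signature, with synthetic data standing in for the real
gene feature tables. It assumes scripts/ml/ml_functions is on the import path;
the column names and the '70_30' split label below are illustrative only:

    import numpy as np
    import pandas as pd
    from sklearn.model_selection import StratifiedKFold

    from MultClfs import MultModelsCl  # assumed import path

    rng = np.random.default_rng(42)

    # synthetic stand-ins for the training and blind-test feature tables
    X     = pd.DataFrame({'feat1': rng.normal(size = 100), 'feat2': rng.normal(size = 100)})
    y     = pd.Series(rng.integers(0, 2, size = 100))
    X_bts = pd.DataFrame({'feat1': rng.normal(size = 40), 'feat2': rng.normal(size = 40)})
    y_bts = pd.Series(rng.integers(0, 2, size = 40))

    skf_cv = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 42)

    # tts_split_type and resampling_type are now required, and the blind-test
    # arguments sit behind run_blind_test; note this runs every enabled model,
    # so expect it to take a while.
    scoresD = MultModelsCl(input_df = X
                           , target = y
                           , sel_cv = skf_cv
                           , tts_split_type = '70_30'   # illustrative label
                           , resampling_type = 'none'
                           , var_type = ['numerical']
                           , scale_numeric = ['min_max']
                           , run_blind_test = True
                           , blind_test_df = X_bts
                           , blind_test_target = y_bts
                           , return_formatted_output = False)  # raw per-model dict

With return_formatted_output = True the result would instead be passed through
ProcessMultModelsCl(), as ml_iterator.py does.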