diff --git a/scripts/ml/combined_model/cm_logo_skf.py b/scripts/ml/combined_model/cm_logo_skf.py old mode 100644 new mode 100755 index 551d9a1..f4cf311 --- a/scripts/ml/combined_model/cm_logo_skf.py +++ b/scripts/ml/combined_model/cm_logo_skf.py @@ -9,6 +9,72 @@ import sys, os import pandas as pd import numpy as np import re +from copy import deepcopy +from sklearn import linear_model +from sklearn import datasets +from collections import Counter + +from sklearn.linear_model import LogisticRegression, LogisticRegressionCV +from sklearn.linear_model import RidgeClassifier, RidgeClassifierCV, SGDClassifier, PassiveAggressiveClassifier + +from sklearn.naive_bayes import BernoulliNB +from sklearn.neighbors import KNeighborsClassifier +from sklearn.svm import SVC +from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier +from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier +from sklearn.naive_bayes import GaussianNB +from sklearn.gaussian_process import GaussianProcessClassifier, kernels +from sklearn.gaussian_process.kernels import RBF, DotProduct, Matern, RationalQuadratic, WhiteKernel + +from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis +from sklearn.neural_network import MLPClassifier + +from sklearn.svm import SVC +from xgboost import XGBClassifier +from sklearn.naive_bayes import MultinomialNB +from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder + +from sklearn.compose import ColumnTransformer +from sklearn.compose import make_column_transformer + +from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score +from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report + +# added +from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, LeaveOneOut, KFold, RepeatedKFold, cross_val_predict + +from sklearn.model_selection import train_test_split, cross_validate, cross_val_score +from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold + +from sklearn.pipeline import Pipeline, make_pipeline + +from sklearn.feature_selection import RFE, RFECV + +import itertools +import seaborn as sns +import matplotlib.pyplot as plt + +from statistics import mean, stdev, median, mode + +from imblearn.over_sampling import RandomOverSampler +from imblearn.under_sampling import RandomUnderSampler +from imblearn.over_sampling import SMOTE +from sklearn.datasets import make_classification +from imblearn.combine import SMOTEENN +from imblearn.combine import SMOTETomek + +from imblearn.over_sampling import SMOTENC +from imblearn.under_sampling import EditedNearestNeighbours +from imblearn.under_sampling import RepeatedEditedNearestNeighbours + +from sklearn.model_selection import GridSearchCV +from sklearn.base import BaseEstimator +from sklearn.impute import KNNImputer as KNN +import json +import argparse +import re +import itertools +from sklearn.model_selection import LeaveOneGroupOut ############################################################################### homedir = os.path.expanduser("~") sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml/ml_functions') @@ -22,7 +88,7 @@ from MultClfs_logo_skf import * #from GetMLData import * #from SplitTTS import * -skf_cv = StratifiedKFold(n_splits = 10 , shuffle = True,**rs) +skf_cv = StratifiedKFold(n_splits = 10 , shuffle = True, random_state = 42) #logo = LeaveOneGroupOut() @@ -38,13 +104,17 @@ def CMLogoSkf(combined_df for bts_gene in bts_genes: print('\n BTS gene:', bts_gene) + if not std_gene_omit: + training_genesL = ['alr'] + else: + training_genesL = [] tr_gene_omit = std_gene_omit + [bts_gene] n_tr_genes = (len(bts_genes) - (len(std_gene_omit))) #n_total_genes = (len(bts_genes) - len(std_gene_omit)) n_total_genes = len(all_genes) - training_genesL = std_gene_omit + list(set(bts_genes) - set(tr_gene_omit)) + training_genesL = training_genesL + list(set(bts_genes) - set(tr_gene_omit)) #training_genesL = [element for element in bts_genes if element not in tr_gene_omit] print('\nTotal genes: ', n_total_genes @@ -53,7 +123,7 @@ def CMLogoSkf(combined_df , '\nOmitted genes:', tr_gene_omit , '\nBlind test gene:', bts_gene) - tts_split_type = "logoBT_" + bts_gene + tts_split_type = "logo_skf_BT_" + bts_gene outFile = "/home/tanu/git/Data/ml_combined/" + str(n_tr_genes+1) + "genes_" + tts_split_type + ".csv" print(outFile) @@ -67,7 +137,6 @@ def CMLogoSkf(combined_df #cm_y = cm_training_df.loc[:,'dst_mode'] cm_y = cm_training_df.loc[:, target_var] - gene_group = cm_training_df.loc[:,'gene_name'] print('\nTraining data dim:', cm_X.shape @@ -87,14 +156,14 @@ def CMLogoSkf(combined_df #cm_bts_y = cm_test_df.loc[:, 'dst_mode'] cm_bts_y = cm_test_df.loc[:, target_var] - print('\nTraining data dim:', cm_bts_X.shape - , '\nTraining Target dim:', cm_bts_y.shape) + print('\nTEST data dim:', cm_bts_X.shape + , '\nTEST Target dim:', cm_bts_y.shape) #%%:Running Multiple models on LOGO with SKF cD3_v2 = MultModelsCl_logo_skf(input_df = cm_X , target = cm_y - , group = 'none' + #, group = 'none' , sel_cv = skf_cv , blind_test_df = cm_bts_X @@ -116,5 +185,5 @@ def CMLogoSkf(combined_df cD3_v2.to_csv(outFile) #%% -CMLogoSkf(combined_df) +#CMLogoSkf(combined_df) CMLogoSkf(combined_df, std_gene_omit=['alr']) diff --git a/scripts/ml/ml_functions/FS.py b/scripts/ml/ml_functions/FS.py index ede1880..b07b7ba 100755 --- a/scripts/ml/ml_functions/FS.py +++ b/scripts/ml/ml_functions/FS.py @@ -77,6 +77,7 @@ import re ##################################### rs = {'random_state': 42} njobs = {'n_jobs': 10} + scoring_fn = ({ 'mcc' : make_scorer(matthews_corrcoef) , 'fscore' : make_scorer(f1_score) @@ -87,6 +88,9 @@ scoring_fn = ({ 'mcc' : make_scorer(matthews_corrcoef) , 'jcc' : make_scorer(jaccard_score) }) + +mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)} +jacc_score_fn = {'jcc': make_scorer(jaccard_score)} skf_cv = StratifiedKFold(n_splits = 10 #, shuffle = False, random_state= None) , shuffle = True,**rs) @@ -95,9 +99,6 @@ rskf_cv = RepeatedStratifiedKFold(n_splits = 10 , n_repeats = 3 , **rs) -mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)} -jacc_score_fn = {'jcc': make_scorer(jaccard_score)} - ############################################################################### def fsgs_rfecv(input_df , target @@ -109,7 +110,10 @@ def fsgs_rfecv(input_df , custom_fs = RFECV(DecisionTreeClassifier(**rs) , cv = skf_cv, scoring = 'matthews_corrcoef') , cv_method = skf_cv , var_type = ['numerical', 'categorical' , 'mixed'] + , resampling_type = 'none' , verbose = 3 + , random_state = 42 + , n_jobs = 10 ): ''' returns @@ -120,6 +124,10 @@ def fsgs_rfecv(input_df optimised/selected based on mcc ''' + rs = {'random_state': random_state} + njobs = {'n_jobs': n_jobs} + + ########################################################################### #================================================ # Determine categorical and numerical features @@ -375,6 +383,8 @@ def fsgs_rfecv(input_df output_modelD['train_score (MCC)'] = train_bscore output_modelD['bts_mcc'] = bts_mcc_score output_modelD['train_bts_diff'] = round(train_test_diff,2) + output_modelD['resampling'] = resampling_type + print(output_modelD) nlen = len(output_modelD) diff --git a/scripts/ml/ml_functions/MultClfs_logo_skf.py b/scripts/ml/ml_functions/MultClfs_logo_skf.py index 8e2cbf7..e18c4c8 100755 --- a/scripts/ml/ml_functions/MultClfs_logo_skf.py +++ b/scripts/ml/ml_functions/MultClfs_logo_skf.py @@ -77,9 +77,6 @@ import re import itertools from sklearn.model_selection import LeaveOneGroupOut #%% GLOBALS -rs = {'random_state': 42} -njobs = {'n_jobs': 10} - scoring_fn = ({ 'mcc' : make_scorer(matthews_corrcoef) , 'fscore' : make_scorer(f1_score) , 'precision' : make_scorer(precision_score) @@ -146,7 +143,7 @@ def MultModelsCl_logo_skf(input_df , blind_test_df = pd.DataFrame() , blind_test_target = pd.Series(dtype = int) , tts_split_type = "none" - , group = 'none' + #, group = 'none' , resampling_type = 'none' # default , add_cm = True # adds confusion matrix based on cross_val_predict @@ -188,11 +185,11 @@ def MultModelsCl_logo_skf(input_df , **rs) logo = LeaveOneGroupOut() - # select CV type: - if group == 'none': - sel_cv = skf_cv - else: - sel_cv = logo + # # select CV type: + # if group == 'none': + # sel_cv = skf_cv + # else: + # sel_cv = logo #====================================================== # Determine categorical and numerical features #====================================================== @@ -277,7 +274,7 @@ def MultModelsCl_logo_skf(input_df , input_df , target , cv = sel_cv - , groups = group + #, groups = group , scoring = scoring_fn , return_train_score = True) #============================== @@ -306,7 +303,12 @@ def MultModelsCl_logo_skf(input_df cmD = {} # Calculate cm - y_pred = cross_val_predict(model_pipeline, input_df, target, cv = sel_cv, groups = group, **njobs) + y_pred = cross_val_predict(model_pipeline + , input_df + , target + , cv = sel_cv + #, groups = group + , **njobs) #_tn, _fp, _fn, _tp = confusion_matrix(y_pred, y).ravel() # internally tn, fp, fn, tp = confusion_matrix(y_pred, target).ravel()