saving work

This commit is contained in:
Tanushree Tunstall 2022-07-01 20:37:41 +01:00
parent d812835713
commit b5777a17c9
3 changed files with 103 additions and 22 deletions

85
scripts/ml/combined_model/cm_logo_skf.py Normal file → Executable file
View file

@ -9,6 +9,72 @@ import sys, os
import pandas as pd import pandas as pd
import numpy as np import numpy as np
import re import re
from copy import deepcopy
from sklearn import linear_model
from sklearn import datasets
from collections import Counter
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.linear_model import RidgeClassifier, RidgeClassifierCV, SGDClassifier, PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessClassifier, kernels
from sklearn.gaussian_process.kernels import RBF, DotProduct, Matern, RationalQuadratic, WhiteKernel
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report
# added
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, LeaveOneOut, KFold, RepeatedKFold, cross_val_predict
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_selection import RFE, RFECV
import itertools
import seaborn as sns
import matplotlib.pyplot as plt
from statistics import mean, stdev, median, mode
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTENC
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.under_sampling import RepeatedEditedNearestNeighbours
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator
from sklearn.impute import KNNImputer as KNN
import json
import argparse
import re
import itertools
from sklearn.model_selection import LeaveOneGroupOut
############################################################################### ###############################################################################
homedir = os.path.expanduser("~") homedir = os.path.expanduser("~")
sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml/ml_functions') sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml/ml_functions')
@ -22,7 +88,7 @@ from MultClfs_logo_skf import *
#from GetMLData import * #from GetMLData import *
#from SplitTTS import * #from SplitTTS import *
skf_cv = StratifiedKFold(n_splits = 10 , shuffle = True,**rs) skf_cv = StratifiedKFold(n_splits = 10 , shuffle = True, random_state = 42)
#logo = LeaveOneGroupOut() #logo = LeaveOneGroupOut()
@ -38,13 +104,17 @@ def CMLogoSkf(combined_df
for bts_gene in bts_genes: for bts_gene in bts_genes:
print('\n BTS gene:', bts_gene) print('\n BTS gene:', bts_gene)
if not std_gene_omit:
training_genesL = ['alr']
else:
training_genesL = []
tr_gene_omit = std_gene_omit + [bts_gene] tr_gene_omit = std_gene_omit + [bts_gene]
n_tr_genes = (len(bts_genes) - (len(std_gene_omit))) n_tr_genes = (len(bts_genes) - (len(std_gene_omit)))
#n_total_genes = (len(bts_genes) - len(std_gene_omit)) #n_total_genes = (len(bts_genes) - len(std_gene_omit))
n_total_genes = len(all_genes) n_total_genes = len(all_genes)
training_genesL = std_gene_omit + list(set(bts_genes) - set(tr_gene_omit)) training_genesL = training_genesL + list(set(bts_genes) - set(tr_gene_omit))
#training_genesL = [element for element in bts_genes if element not in tr_gene_omit] #training_genesL = [element for element in bts_genes if element not in tr_gene_omit]
print('\nTotal genes: ', n_total_genes print('\nTotal genes: ', n_total_genes
@ -53,7 +123,7 @@ def CMLogoSkf(combined_df
, '\nOmitted genes:', tr_gene_omit , '\nOmitted genes:', tr_gene_omit
, '\nBlind test gene:', bts_gene) , '\nBlind test gene:', bts_gene)
tts_split_type = "logoBT_" + bts_gene tts_split_type = "logo_skf_BT_" + bts_gene
outFile = "/home/tanu/git/Data/ml_combined/" + str(n_tr_genes+1) + "genes_" + tts_split_type + ".csv" outFile = "/home/tanu/git/Data/ml_combined/" + str(n_tr_genes+1) + "genes_" + tts_split_type + ".csv"
print(outFile) print(outFile)
@ -67,7 +137,6 @@ def CMLogoSkf(combined_df
#cm_y = cm_training_df.loc[:,'dst_mode'] #cm_y = cm_training_df.loc[:,'dst_mode']
cm_y = cm_training_df.loc[:, target_var] cm_y = cm_training_df.loc[:, target_var]
gene_group = cm_training_df.loc[:,'gene_name'] gene_group = cm_training_df.loc[:,'gene_name']
print('\nTraining data dim:', cm_X.shape print('\nTraining data dim:', cm_X.shape
@ -87,14 +156,14 @@ def CMLogoSkf(combined_df
#cm_bts_y = cm_test_df.loc[:, 'dst_mode'] #cm_bts_y = cm_test_df.loc[:, 'dst_mode']
cm_bts_y = cm_test_df.loc[:, target_var] cm_bts_y = cm_test_df.loc[:, target_var]
print('\nTraining data dim:', cm_bts_X.shape print('\nTEST data dim:', cm_bts_X.shape
, '\nTraining Target dim:', cm_bts_y.shape) , '\nTEST Target dim:', cm_bts_y.shape)
#%%:Running Multiple models on LOGO with SKF #%%:Running Multiple models on LOGO with SKF
cD3_v2 = MultModelsCl_logo_skf(input_df = cm_X cD3_v2 = MultModelsCl_logo_skf(input_df = cm_X
, target = cm_y , target = cm_y
, group = 'none' #, group = 'none'
, sel_cv = skf_cv , sel_cv = skf_cv
, blind_test_df = cm_bts_X , blind_test_df = cm_bts_X
@ -116,5 +185,5 @@ def CMLogoSkf(combined_df
cD3_v2.to_csv(outFile) cD3_v2.to_csv(outFile)
#%% #%%
CMLogoSkf(combined_df) #CMLogoSkf(combined_df)
CMLogoSkf(combined_df, std_gene_omit=['alr']) CMLogoSkf(combined_df, std_gene_omit=['alr'])

View file

@ -77,6 +77,7 @@ import re
##################################### #####################################
rs = {'random_state': 42} rs = {'random_state': 42}
njobs = {'n_jobs': 10} njobs = {'n_jobs': 10}
scoring_fn = ({ 'mcc' : make_scorer(matthews_corrcoef) scoring_fn = ({ 'mcc' : make_scorer(matthews_corrcoef)
, 'fscore' : make_scorer(f1_score) , 'fscore' : make_scorer(f1_score)
@ -87,6 +88,9 @@ scoring_fn = ({ 'mcc' : make_scorer(matthews_corrcoef)
, 'jcc' : make_scorer(jaccard_score) , 'jcc' : make_scorer(jaccard_score)
}) })
mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
skf_cv = StratifiedKFold(n_splits = 10 skf_cv = StratifiedKFold(n_splits = 10
#, shuffle = False, random_state= None) #, shuffle = False, random_state= None)
, shuffle = True,**rs) , shuffle = True,**rs)
@ -95,9 +99,6 @@ rskf_cv = RepeatedStratifiedKFold(n_splits = 10
, n_repeats = 3 , n_repeats = 3
, **rs) , **rs)
mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
############################################################################### ###############################################################################
def fsgs_rfecv(input_df def fsgs_rfecv(input_df
, target , target
@ -109,7 +110,10 @@ def fsgs_rfecv(input_df
, custom_fs = RFECV(DecisionTreeClassifier(**rs) , cv = skf_cv, scoring = 'matthews_corrcoef') , custom_fs = RFECV(DecisionTreeClassifier(**rs) , cv = skf_cv, scoring = 'matthews_corrcoef')
, cv_method = skf_cv , cv_method = skf_cv
, var_type = ['numerical', 'categorical' , 'mixed'] , var_type = ['numerical', 'categorical' , 'mixed']
, resampling_type = 'none'
, verbose = 3 , verbose = 3
, random_state = 42
, n_jobs = 10
): ):
''' '''
returns returns
@ -120,6 +124,10 @@ def fsgs_rfecv(input_df
optimised/selected based on mcc optimised/selected based on mcc
''' '''
rs = {'random_state': random_state}
njobs = {'n_jobs': n_jobs}
########################################################################### ###########################################################################
#================================================ #================================================
# Determine categorical and numerical features # Determine categorical and numerical features
@ -375,6 +383,8 @@ def fsgs_rfecv(input_df
output_modelD['train_score (MCC)'] = train_bscore output_modelD['train_score (MCC)'] = train_bscore
output_modelD['bts_mcc'] = bts_mcc_score output_modelD['bts_mcc'] = bts_mcc_score
output_modelD['train_bts_diff'] = round(train_test_diff,2) output_modelD['train_bts_diff'] = round(train_test_diff,2)
output_modelD['resampling'] = resampling_type
print(output_modelD) print(output_modelD)
nlen = len(output_modelD) nlen = len(output_modelD)

View file

@ -77,9 +77,6 @@ import re
import itertools import itertools
from sklearn.model_selection import LeaveOneGroupOut from sklearn.model_selection import LeaveOneGroupOut
#%% GLOBALS #%% GLOBALS
rs = {'random_state': 42}
njobs = {'n_jobs': 10}
scoring_fn = ({ 'mcc' : make_scorer(matthews_corrcoef) scoring_fn = ({ 'mcc' : make_scorer(matthews_corrcoef)
, 'fscore' : make_scorer(f1_score) , 'fscore' : make_scorer(f1_score)
, 'precision' : make_scorer(precision_score) , 'precision' : make_scorer(precision_score)
@ -146,7 +143,7 @@ def MultModelsCl_logo_skf(input_df
, blind_test_df = pd.DataFrame() , blind_test_df = pd.DataFrame()
, blind_test_target = pd.Series(dtype = int) , blind_test_target = pd.Series(dtype = int)
, tts_split_type = "none" , tts_split_type = "none"
, group = 'none' #, group = 'none'
, resampling_type = 'none' # default , resampling_type = 'none' # default
, add_cm = True # adds confusion matrix based on cross_val_predict , add_cm = True # adds confusion matrix based on cross_val_predict
@ -188,11 +185,11 @@ def MultModelsCl_logo_skf(input_df
, **rs) , **rs)
logo = LeaveOneGroupOut() logo = LeaveOneGroupOut()
# select CV type: # # select CV type:
if group == 'none': # if group == 'none':
sel_cv = skf_cv # sel_cv = skf_cv
else: # else:
sel_cv = logo # sel_cv = logo
#====================================================== #======================================================
# Determine categorical and numerical features # Determine categorical and numerical features
#====================================================== #======================================================
@ -277,7 +274,7 @@ def MultModelsCl_logo_skf(input_df
, input_df , input_df
, target , target
, cv = sel_cv , cv = sel_cv
, groups = group #, groups = group
, scoring = scoring_fn , scoring = scoring_fn
, return_train_score = True) , return_train_score = True)
#============================== #==============================
@ -306,7 +303,12 @@ def MultModelsCl_logo_skf(input_df
cmD = {} cmD = {}
# Calculate cm # Calculate cm
y_pred = cross_val_predict(model_pipeline, input_df, target, cv = sel_cv, groups = group, **njobs) y_pred = cross_val_predict(model_pipeline
, input_df
, target
, cv = sel_cv
#, groups = group
, **njobs)
#_tn, _fp, _fn, _tp = confusion_matrix(y_pred, y).ravel() # internally #_tn, _fp, _fn, _tp = confusion_matrix(y_pred, y).ravel() # internally
tn, fp, fn, tp = confusion_matrix(y_pred, target).ravel() tn, fp, fn, tp = confusion_matrix(y_pred, target).ravel()