#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 6 13:41:54 2022

@author: tanu
"""
#https://stackoverflow.com/questions/51695322/compare-multiple-algorithms-with-sklearn-pipeline
import os, sys
import pandas as pd
import numpy as np
import pprint as pp
from copy import deepcopy

from sklearn import linear_model
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process import kernels
from sklearn.gaussian_process.kernels import RBF
from sklearn.gaussian_process.kernels import DotProduct
from sklearn.gaussian_process.kernels import Matern
from sklearn.gaussian_process.kernels import RationalQuadratic
from sklearn.gaussian_process.kernels import WhiteKernel
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score
from sklearn.metrics import make_scorer
from sklearn.metrics import classification_report
from sklearn.metrics import average_precision_score

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV

from sklearn.base import BaseEstimator

import itertools
#import seaborn as sns
import matplotlib.pyplot as plt

from statistics import mean, stdev, median, mode

#from imblearn.over_sampling import RandomOverSampler
#from imblearn.over_sampling import SMOTE
#from imblearn.pipeline import Pipeline
#from imblearn.combine import SMOTEENN
#from imblearn.under_sampling import EditedNearestNeighbours
#from sklearn.datasets import make_classification

print(np.__version__)
print(pd.__version__)

#%% scoring metrics, random state and CV splitters
scoring_fn = {'accuracy'    : make_scorer(accuracy_score)
              , 'fscore'    : make_scorer(f1_score)
              , 'mcc'       : make_scorer(matthews_corrcoef)
              , 'precision' : make_scorer(precision_score)
              , 'recall'    : make_scorer(recall_score)
              , 'roc_auc'   : make_scorer(roc_auc_score)
              , 'jcc'       : make_scorer(jaccard_score)
              }

rs = {'random_state': 42}
njobs = {'n_jobs': 10}

skf_cv = StratifiedKFold(n_splits = 10
                         #, shuffle = False, random_state= None)
                         , shuffle = True, **rs)

rskf_cv = RepeatedStratifiedKFold(n_splits = 10
                                  , n_repeats = 3
                                  #, shuffle = False, random_state= None)
                                  #, shuffle = True
                                  , **rs)

mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
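#%% Quick self-contained check (illustrative addition, not part of the
# drug/gene workflow below): confirm that the multi-metric scoring dict and
# the stratified CV splitter plug into cross_validate as expected, using
# synthetic data from make_classification rather than the real input.
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples = 200
                                     , n_features = 10
                                     , n_informative = 5
                                     , **rs)
demo_scores = cross_validate(LogisticRegression(max_iter = 1000, **rs)
                             , X_demo, y_demo
                             , cv = skf_cv
                             , scoring = scoring_fn)
# mean test score per metric across the 10 folds
print({k: round(np.mean(v), 3) for k, v in demo_scores.items() if k.startswith('test_')})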
#%%
homedir = os.path.expanduser("~")
os.chdir(homedir + "/git/ML_AI_training/")

# my functions
#from MultClassPipe import MultClassPipeline
from MultClassPipe2 import MultClassPipeline2
from loopity_loop import MultClassPipeSKFLoop
from MultClassPipe3 import MultClassPipeSKFCV

gene = 'pncA'
drug = 'pyrazinamide'

#==============
# directories
#==============
datadir = homedir + '/git/Data/'
indir = datadir + drug + '/input/'
outdir = datadir + drug + '/output/'

#=======
# input
#=======
infile_ml1 = outdir + gene.lower() + '_merged_df3.csv'
#infile_ml2 = outdir + gene.lower() + '_merged_df2.csv'

my_df = pd.read_csv(infile_ml1, index_col = 0)
my_df.dtypes
my_df_cols = my_df.columns

geneL_basic = ['pnca']

# -- CHECK script -- imports.py

#%% get cols
mycols = my_df.columns
mycols

# change from numeric to categorical
num_type = ['int64', 'float64']
cat_type = ['object', 'bool']

# TODO:
# Treat active site aa pos as category and not numerical: This needs to be part of merged_df3!
#if my_df['active_aa_pos'].dtype in num_type:
#    my_df['active_aa_pos'] = my_df['active_aa_pos'].astype(object)
#    my_df['active_aa_pos'].dtype

# -- CHECK script -- imports.py
#%%============================================================================
#%% IMPUTE values for OR
#%% Combine mmCSM_lig Data
#%% Combine PROVEAN data
#%% Combine ED logo data
#%% Masking columns (mCSM-lig, mCSM-NA, mCSM-ppi2) values for lig_dist > 10
# get logic from upstream!
my_df_ml = my_df.copy()

my_df_ml['mutationinformation'][my_df['ligand_distance'] > 10].value_counts()
my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x > 10)).value_counts()
my_df_ml.groupby(['mutationinformation'])['ligand_distance'].apply(lambda x: (x > 10)).value_counts()

my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), 'ligand_affinity_change'] = 0
(my_df_ml['ligand_affinity_change'] == 0).sum()

#%%============================================================================
# Separate blind test set
my_df_ml[drug].isna().sum()

blind_test_df = my_df_ml[my_df_ml[drug].isna()]
blind_test_df.shape

training_df = my_df_ml[my_df_ml[drug].notna()]
training_df.shape

# Target1: dst
training_df[drug].value_counts()
training_df['dst_mode'].value_counts()

#%% Build X
common_cols_stabilityN = ['ligand_distance'
                          , 'ligand_affinity_change'
                          , 'duet_stability_change'
                          , 'ddg_foldx'
                          , 'deepddg'
                          , 'ddg_dynamut2']

foldX_cols = ['contacts'
              #, 'electro_rr', 'electro_mm', 'electro_sm', 'electro_ss'
              #, 'disulfide_rr', 'disulfide_mm', 'disulfide_sm', 'disulfide_ss'
              #, 'hbonds_rr', 'hbonds_mm', 'hbonds_sm', 'hbonds_ss'
              #, 'partcov_rr', 'partcov_mm', 'partcov_sm', 'partcov_ss'
              #, 'vdwclashes_rr', 'vdwclashes_mm', 'vdwclashes_sm', 'vdwclashes_ss'
              #, 'volumetric_rr', 'volumetric_mm', 'volumetric_ss'
              ]

X_strFN = ['rsa'
           #, 'asa'
           , 'kd_values'
           , 'rd_values']

X_evolFN = ['consurf_score'
            , 'snap2_score']

# quick inspection of which lineage column to use:
#foo = my_df_ml[['lineage', 'lineage_count_all', 'lineage_count_unique']]
X_genomicFN = ['maf'
               #, 'or_mychisq'
               #, 'or_logistic'
               #, 'or_fisher'
               #, 'pval_fisher'
               #, 'lineage'
               #, 'lineage_count_all'
               #, 'lineage_count_unique'
               ]

#%% Construct numerical and categorical column names
# numerical feature names
numerical_FN = common_cols_stabilityN + foldX_cols + X_strFN + X_evolFN + X_genomicFN
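#%% Sanity check (illustrative addition): make sure every hand-listed
# numerical feature name above is actually a column of training_df, so a
# typo in the lists fails loudly here rather than as a KeyError further down.
missing_numerical_FN = set(numerical_FN) - set(training_df.columns)
if missing_numerical_FN:
    print('WARNING: numerical features missing from training_df:', missing_numerical_FN)
else:
    print('All', len(numerical_FN), 'numerical feature names found in training_df')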
# categorical feature names
categorical_FN = ['ss_class'
                  , 'wt_prop_water'
                  #, 'lineage_labels' # misleading if using merged_df3
                  , 'mut_prop_water'
                  , 'wt_prop_polarity'
                  , 'mut_prop_polarity'
                  , 'wt_calcprop'
                  , 'mut_calcprop'
                  #, 'active_aa_pos'
                  ]

#%% extracting dfs based on numerical, categorical column names
#----------------------------------
# WITHOUT the target var included
#----------------------------------
num_df = training_df[numerical_FN]
num_df.shape

cat_df = training_df[categorical_FN]
cat_df.shape

all_df = training_df[numerical_FN + categorical_FN]
all_df.shape

#------------------------------
# WITH the target var included:
# 'wtgt': with target
#------------------------------
# drug and dst_mode should be the same thing
num_df_wtgt = training_df[numerical_FN + ['dst_mode']]
num_df_wtgt.shape

cat_df_wtgt = training_df[categorical_FN + ['dst_mode']]
cat_df_wtgt.shape

all_df_wtgt = training_df[numerical_FN + categorical_FN + ['dst_mode']]
all_df_wtgt.shape

#%%================================================================
#%% Apply ML
#TODO: A

#%% Data
#X = all_df_wtgt[numerical_FN + categorical_FN]
X = all_df_wtgt[numerical_FN]
y = all_df_wtgt['dst_mode']

# Blind test data (same format)
X_bts = blind_test_df[numerical_FN]
y_bts = blind_test_df['dst_mode']
X_bts_wt = blind_test_df[numerical_FN + ['dst_mode']]

# Quick check
(X['ligand_affinity_change'] == 0).sum() == (X['ligand_distance'] > 10).sum()

#%% MultClassPipeSKFCV: function call()
# mm_skf_scoresD = MultClassPipeSKFCV(input_df = X
#                                     , target = y
#                                     , var_type = 'numerical'
#                                     , skf_cv = skf_cv)

# mm_skf_scores_df_all = pd.DataFrame(mm_skf_scoresD)
# mm_skf_scores_df_all
# mm_skf_scores_df_test = mm_skf_scores_df_all.filter(like='test_', axis=0)
# mm_skf_scores_df_train = mm_skf_scores_df_all.filter(like='train_', axis=0) # helps to see if you trust the results
# print(mm_skf_scores_df_train)
# print(mm_skf_scores_df_test)
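#%% Inline sketch (illustrative addition): the kind of scaler + classifier
# comparison that MultClassPipeSKFCV presumably wraps, written out with plain
# sklearn so the CV splitter and scoring dict above can be checked directly.
# The median imputer, MinMaxScaler and the particular classifiers are
# assumptions made for this sketch, not choices taken from MultClassPipe3.
from sklearn.impute import SimpleImputer

example_clfs = {'Logistic Regression'   : LogisticRegression(max_iter = 1000, **rs)
                , 'K-Nearest Neighbors' : KNeighborsClassifier()
                , 'Random Forest'       : RandomForestClassifier(**rs)}

example_scoresD = {}
for clf_name, clf in example_clfs.items():
    clf_pipe = Pipeline([('imputer', SimpleImputer(strategy = 'median'))  # defensive: in case any numerical feature has NAs
                         , ('scaler', MinMaxScaler())
                         , ('clf', clf)])
    cv_out = cross_validate(clf_pipe, X, y
                            , cv = skf_cv
                            , scoring = scoring_fn
                            , **njobs)
    # keep the mean of each test_* metric across folds
    example_scoresD[clf_name] = {k: round(np.mean(v), 3)
                                 for k, v in cv_out.items() if k.startswith('test_')}

example_scores_df = pd.DataFrame(example_scoresD).T
print(example_scores_df)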