#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 24 10:48:10 2022

@author: tanu
"""
###############################################################################
# questions:
# which data to use: merged_df3 or merged_df2?
# which is the target: the or_mychisq or the drtype col?
# scaling: can it be from -1 to 1?
# how to include the mutation information?
#   'wild_type', 'mutant', 'position'
# whether to log transform the af and or cols
# whether to allow mean/mode values to be imputed for the validation set
# whether to calculate mean/median accounting for NAs or after removing them?

# strategy:
# available data = X_train
# available data but NaN outcome = validation_test
# test data: mutations generated but not in mcsm
###############################################################################
import os, sys
import re

from sklearn import datasets
from sklearn import linear_model
from sklearn import preprocessing

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

print(np.__version__)
print(pd.__version__)

from statistics import mean, stdev

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.metrics import (make_scorer, accuracy_score, confusion_matrix,
                             precision_score, recall_score, roc_auc_score,
                             roc_curve, f1_score)

#%% read data
homedir = os.path.expanduser("~")
os.chdir(homedir + "/git/ML_AI_training/test_data")

# this needs to be merged_df2 or merged_df3?
my_df = pd.read_csv("pnca_all_params.csv")
my_df.dtypes
my_df_cols = my_df.columns

omit_cols1 = ['pdb_file'
              , 'seq_offset4pdb'
              , 'mut_3upper'
              , 'wild_pos'
              , 'wild_chain_pos'
              , 'chain'
              , 'wt_3upper'
              , 'consurf_colour'
              , 'consurf_colour_rev'
              , 'consurf_msa_data'
              , 'consurf_aa_variety'
              , 'snap2_accuracy_pc'
              , 'beta_logistic'
              , 'se_logistic'
              , 'zval_logisitc'
              , 'pval_chisq'
              , 'log10_or_mychisq'
              , 'neglog_pval_fisher'
              , 'or_fisher'
              , 'wild_type'
              , 'mutant_type'
              , 'position'
              , 'ligand_id'
              , 'mutation'
              , 'ss'
              , 'ss_class' # include it later?
              , 'contacts'
              ]

omit_cols2 = list(my_df.columns[my_df.columns.str.contains(".*ci_.*")
                                | my_df.columns.str.contains(".*_scaled*")
                                | my_df.columns.str.contains(".*_outcome*")])

# [WATCH:] just to test since these have negative values!
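# On the "scaling: can it be from -1 to 1?" question in the header: yes,
# MinMaxScaler takes a feature_range argument. A minimal sketch on the numeric
# columns (illustrative only, not wired into the analysis below):
scaler_sym = preprocessing.MinMaxScaler(feature_range = (-1, 1))
num_demo = my_df.select_dtypes(include = [np.number])
num_demo_scaled = scaler_sym.fit_transform(num_demo.fillna(num_demo.median()))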
omit_cols3 = list(my_df.columns[my_df.columns.str.contains("electro_.*")
                                | my_df.columns.str.contains("disulfide_.*")
                                | my_df.columns.str.contains("hbonds_.*")
                                | my_df.columns.str.contains("partcov_.*")
                                | my_df.columns.str.contains("vdwclashes.*")
                                | my_df.columns.str.contains("volumetric.*")])

omit_cols = omit_cols1 + omit_cols2 + omit_cols3

# Filter df: keep only the selected columns
my_df_filt = my_df.loc[:, ~my_df.columns.isin(omit_cols)]
my_df_filt_cols = my_df_filt.columns

# Fill NAs of filtered df: fill NaNs with column means/medians
my_df_filt2 = my_df_filt.fillna(my_df_filt.mean())
my_df_filt3 = my_df_filt.fillna(my_df_filt.median())
#my_df_filt_noNA = my_df_filt.fillna(0)

summ = my_df_filt.describe()
summ2 = my_df_filt2.describe()
summ3 = my_df_filt3.describe()
#summ_noNA = my_df_filt_noNA.describe()

########################
# [WATCH]: Drop NA
# Get Y
my_df2 = my_df_filt.dropna().copy() # .copy() avoids SettingWithCopyWarning on the next line
my_df2['resistance'] = my_df2['or_mychisq'].apply(lambda x: 0 if x <= 1 else 1)
my_df2['resistance'].value_counts()

Y = my_df2['resistance']
Y = np.array(Y)
#Y = Y.reset_index()
#Y = Y.drop(['index'], axis = 1)
#Y.value_counts()
#Y = np.array(Y)

# Get X
omit_cols_y = ['or_mychisq', 'resistance']
my_df_ml = my_df2.loc[:, ~my_df2.columns.isin(omit_cols_y)]
#my_df_ml = my_df_ml.set_index('mutationinformation')
X = my_df_ml
X = X.drop(['mutationinformation'], axis = 1)
X_features = X.columns.to_list() # feature names, captured before the numpy conversion
X = np.array(X)
#X = X.reset_index()

# check dims
X.shape
Y.shape

my_df2 = my_df2.reset_index()
#####################
# https://stackoverflow.com/questions/49134338/kfolds-cross-validation-vs-train-test-split
rf = RandomForestClassifier(n_estimators = 100, random_state = 42)

# https://towardsdatascience.com/stratified-k-fold-what-it-is-how-to-use-it-cf3d107d3ea2
# k-FOLD
print('Class Ratio:', sum(Y)/len(Y))
print('Class Ratio:', sum(my_df2['resistance'])/len(my_df2['resistance']))

skf = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 42)
target = my_df2.loc[:, 'resistance']

fold_no = 1
for train_index, test_index in skf.split(my_df2, target):
    train = my_df2.loc[train_index, :]
    test = my_df2.loc[test_index, :]
    print('Fold', str(fold_no), 'Class Ratio:',
          sum(test['resistance'])/len(test['resistance']))
    fold_no += 1

model_logisP = Pipeline(steps = [('preprocess', preprocessing.MinMaxScaler())
                                 # 'unbalanced' is not a valid class_weight; 'balanced' is
                                 , ('logis', LogisticRegression(class_weight = 'balanced'))
                                 ])

def train_model(train, test, fold_no):
    X_cols = X_features
    y_col = 'resistance' # single column name, so y_train/y_test are Series, not DataFrames
    X_train = train[X_cols]
    y_train = train[y_col]
    X_test = test[X_cols]
    y_test = test[y_col]
    model_logisP.fit(X_train, y_train)
    predictions = model_logisP.predict(X_test)
    print('Fold', str(fold_no), 'Accuracy:', accuracy_score(y_test, predictions))

fold_no = 1
for train_index, test_index in skf.split(my_df2, target):
    train = my_df2.loc[train_index, :]
    test = my_df2.loc[test_index, :]
    train_model(train, test, fold_no)
    fold_no += 1

#%%
skf = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 20)
lst_accu_stratified = []

scaler = preprocessing.MinMaxScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = X_scaled[:, [1, 2, 3, 15, 16]] # hard-coded feature positions, just to test

#lr = linear_model.LogisticRegression(class_weight = 'balanced')
lr = linear_model.LogisticRegression()

for train_index1, test_index1 in skf.split(X, Y):
    #print(train_index1)
    #print(test_index1)
    x_train_fold1, x_test_fold1 = X_scaled[train_index1], X_scaled[test_index1]
    y_train_fold1, y_test_fold1 = Y[train_index1], Y[test_index1]
    lr.fit(x_train_fold1, y_train_fold1)
    lst_accu_stratified.append(lr.score(x_test_fold1, y_test_fold1))

# print output
print('List of possible accuracy', lst_accu_stratified)
print('Max accuracy:', max(lst_accu_stratified)*100, "%")
print('Min accuracy:', min(lst_accu_stratified)*100, "%")
print('Mean accuracy:', mean(lst_accu_stratified)*100, "%")
print('St Dev:', stdev(lst_accu_stratified)*100, "%")
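# The manual fold loop above can be reproduced in one call with
# cross_val_score (a sketch; it clones lr and reuses the same skf splits,
# so the scores should match lst_accu_stratified):
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(lr, X_scaled, Y, cv = skf)
print('cross_val_score accuracies:', cv_scores)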
# cancer data
cancer = datasets.load_breast_cancer()
x = cancer.data
y = cancer.target

skf = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 42)
lst_accu_stratifiedC = []

scaler = preprocessing.MinMaxScaler()
x_scaled = scaler.fit_transform(x)
x_scaled = x_scaled[:, [1, 2, 3, 15, 16]]

for train_index, test_index in skf.split(x, y):
    #print(train_index)
    #print(test_index)
    x_train_fold, x_test_fold = x_scaled[train_index], x_scaled[test_index]
    y_train_fold, y_test_fold = y[train_index], y[test_index]
    lr.fit(x_train_fold, y_train_fold)
    lst_accu_stratifiedC.append(lr.score(x_test_fold, y_test_fold))

# print output
print('List of possible accuracy', lst_accu_stratifiedC)
print('Max accuracy:', max(lst_accu_stratifiedC)*100, "%")
print('Min accuracy:', min(lst_accu_stratifiedC)*100, "%")
print('Mean accuracy:', mean(lst_accu_stratifiedC)*100, "%")
print('St Dev:', stdev(lst_accu_stratifiedC)*100, "%")

#%%
##
# https://towardsdatascience.com/my-random-forest-classifier-cheat-sheet-in-python-fedb84f8cf4f
# [WATCH]: rows with NaN in or_mychisq get labelled 1 here, since NaN <= 1 is False
y_all = my_df_filt['or_mychisq'].apply(lambda x: 0 if x <= 1 else 1)
X_all = my_df_filt.drop(['mutationinformation', 'or_mychisq'], axis = 1)
X_all = X_all.iloc[:, :6]

seed = 20 # so that the result is reproducible

X_train, X_test, y_train, y_test = train_test_split(X_all, y_all
                                                    , test_size = 0.333
                                                    , random_state = seed)

# Now it is time to make NA a category.
# In Python, NaN values are treated as NAs; when one-hot encoded, they would
# simply be ignored. Hence it is useful to replace NaN with the string 'na',
# which then becomes a category in its own right and is taken into account
# when encoding later on.
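# A sketch of that NaN-as-category idea, assuming it is only wanted for the
# categorical (object) columns. The '_nacat' copies are hypothetical names,
# kept separate so X_train/X_test stay untouched for the median fill below:
cat_cols_demo = list(X_train.select_dtypes(include = ['object']).columns)
X_train_nacat = X_train.copy()
X_train_nacat[cat_cols_demo] = X_train_nacat[cat_cols_demo].fillna('na')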
#X_train = X_train.fillna('na')
#X_test = X_test.fillna('na')
X_train = X_train.fillna(X_train.median())
X_test = X_test.fillna(X_test.median())

X_train.dtypes
features_to_encode = list(X_train.select_dtypes(include = ['object']).columns)

col_trans = make_column_transformer(
    (OneHotEncoder(), features_to_encode),
    remainder = "passthrough"
    )

rf_classifier = RandomForestClassifier(
    min_samples_leaf = 50,
    n_estimators = 150,
    bootstrap = True,
    oob_score = True,
    n_jobs = -1,
    random_state = seed,
    max_features = 'sqrt') # 'auto' was removed in scikit-learn 1.3; 'sqrt' is the classifier equivalent

pipe = make_pipeline(col_trans, rf_classifier)
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

accuracy_score(y_test, y_pred)
print(f"The accuracy of the model is {accuracy_score(y_test, y_pred)*100:.1f} %")

recall_score(y_test, y_pred)
precision_score(y_test, y_pred)
f1_score(y_test, y_pred)
roc_auc_score(y_test, y_pred)
roc_curve(y_test, y_pred)

train_probs = pipe.predict_proba(X_train)[:, 1]
probs = pipe.predict_proba(X_test)[:, 1]
train_predictions = pipe.predict(X_train)

print(f'Train ROC AUC Score: {roc_auc_score(y_train, train_probs)}')
print(f'Test ROC AUC Score: {roc_auc_score(y_test, probs)}')

def evaluate_model(y_pred, probs, train_predictions, train_probs):
    # uses y_train and y_test from the enclosing scope
    baseline = {}
    baseline['recall'] = recall_score(y_test, [1 for _ in range(len(y_test))])
    baseline['precision'] = precision_score(y_test, [1 for _ in range(len(y_test))])
    baseline['roc'] = 0.5
    results = {}
    results['recall'] = recall_score(y_test, y_pred)
    results['precision'] = precision_score(y_test, y_pred)
    results['roc'] = roc_auc_score(y_test, probs)
    train_results = {}
    train_results['recall'] = recall_score(y_train, train_predictions)
    train_results['precision'] = precision_score(y_train, train_predictions)
    train_results['roc'] = roc_auc_score(y_train, train_probs)
    # for metric in ['recall', 'precision', 'roc']:
    #     print(f"Baseline: {round(baseline[metric], 2)} Test: {round(results[metric], 2)} Train: {round(train_results[metric], 2)}")

    # Calculate false positive rates and true positive rates
    base_fpr, base_tpr, _ = roc_curve(y_test, [1 for _ in range(len(y_test))])
    model_fpr, model_tpr, _ = roc_curve(y_test, probs)

    plt.figure(figsize = (8, 6))
    plt.rcParams['font.size'] = 16
    # Plot both curves
    plt.plot(base_fpr, base_tpr, 'b', label = 'baseline')
    plt.plot(model_fpr, model_tpr, 'r', label = 'model')
    plt.legend()
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves')
    plt.show()

# Recall    Baseline: 1.0  Test: 0.92 Train: 0.93
# Precision Baseline: 0.48 Test: 0.9  Train: 0.91
# Roc       Baseline: 0.5  Test: 0.97 Train: 0.97
evaluate_model(y_pred, probs, train_predictions, train_probs)
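# Since predict() uses a fixed 0.5 cutoff, a sketch of choosing the threshold
# from the ROC curve instead, via Youden's J (tpr - fpr); illustrative, not a
# tuned choice:
fpr_t, tpr_t, thresholds_t = roc_curve(y_test, probs)
best_thresh = thresholds_t[np.argmax(tpr_t - fpr_t)]
y_pred_thresh = (probs >= best_thresh).astype(int)
print('Youden-J threshold:', best_thresh,
      '| accuracy at that threshold:', accuracy_score(y_test, y_pred_thresh))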
#%%
import itertools

def plot_confusion_matrix(cm, classes,
                          normalize = False,
                          title = 'Confusion matrix',
                          cmap = plt.cm.Greens): # can change color
    plt.figure(figsize = (10, 10))
    plt.imshow(cm, interpolation = 'nearest', cmap = cmap)
    plt.title(title, size = 24)
    plt.colorbar(aspect = 4)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation = 45, size = 14)
    plt.yticks(tick_marks, classes, size = 14)
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.

    # Label the plot
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 fontsize = 20,
                 horizontalalignment = "center",
                 color = "white" if cm[i, j] > thresh else "black")

    plt.grid(None)
    plt.tight_layout()
    plt.ylabel('True label', size = 18)
    plt.xlabel('Predicted label', size = 18)

# Let's plot it out
cm = confusion_matrix(y_test, y_pred)
plot_confusion_matrix(cm, classes = ['0 - Susceptible', '1 - Resistant'],
                      title = 'R/S Confusion Matrix')

print(rf_classifier.feature_importances_)
print(f"There are {len(rf_classifier.feature_importances_)} features in total")
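# A sketch pairing each importance with its post-encoding feature name
# (assumes scikit-learn >= 1.0, where ColumnTransformer has
# get_feature_names_out):
feat_names = col_trans.get_feature_names_out()
importances = pd.Series(rf_classifier.feature_importances_, index = feat_names)
print(importances.sort_values(ascending = False).head(10))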