diff --git a/my_datap2.py b/my_datap2.py new file mode 100644 index 0000000..9f2d861 --- /dev/null +++ b/my_datap2.py @@ -0,0 +1,179 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Thu Feb 24 10:48:10 2022 + +@author: tanu +""" +############################################################################### +# questions: + # which data to use: merged_df3 or merged_df2 + # which is the target? or_mychisq or drtype col + # scaling: can it be from -1 to 1? + # how to include the mutation information? + # 'wild_type', 'mutant', 'postion' + # whether to log transform the af and or cols + # to allow mean mode values to be imputed for validation set + # whether to calculate mean, median accounting for NA or removing them? + +# strategy: + # available data = X_train + # available data but NAN = validation_test + # test data: mut generated not in mcsm + +############################################################################### +import os, sys +import re +from sklearn.datasets import load_boston +from sklearn import linear_model +from sklearn import preprocessing +import pandas as pd +import seaborn as sns +import matplotlib.pyplot as plt +import numpy as np +print(np.__version__) +print(pd.__version__) +#%% read data +homedir = os.path.expanduser("~") +os.chdir(homedir + "/git/ML_AI_training/test_data") + +# this needs to be merged_df2 or merged_df3? +my_df = pd.read_csv("pnca_all_params.csv") + +my_df.dtypes +my_df_cols = my_df.columns + +omit_cols1 = ['pdb_file' + , 'seq_offset4pdb' + , 'mut_3upper' + , 'wild_pos' + , 'wild_chain_pos' + , 'chain' + , 'wt_3upper' + , 'consurf_colour' + , 'consurf_colour_rev' + , 'consurf_msa_data' + , 'consurf_aa_variety' + , 'snap2_accuracy_pc' + , 'beta_logistic' + , 'se_logistic' + , 'zval_logisitc' + , 'pval_chisq' + , 'log10_or_mychisq' + , 'neglog_pval_fisher' + , 'or_fisher' + , 'wild_type' + , 'mutant_type' + , 'position' + , 'ligand_id' + , 'mutation' + , 'ss' + , 'ss_class' # include it later? + , 'contacts' + ] + +omit_cols2 = list(my_df.columns[my_df.columns.str.contains(".*ci_.*") | my_df.columns.str.contains(".*_scaled*") | my_df.columns.str.contains(".*_outcome*")]) + +# [WATCH:] just to test since these have negative values! +omit_cols3 = list(my_df.columns[my_df.columns.str.contains("electro_.*") | my_df.columns.str.contains("disulfide_.*") | my_df.columns.str.contains("hbonds_.*") | my_df.columns.str.contains("partcov_.*") | my_df.columns.str.contains("vdwclashes.*") | my_df.columns.str.contains("volumetric.*")]) + +omit_cols = omit_cols1 + omit_cols2 + omit_cols3 + +my_df_filt = my_df.loc[:, ~my_df.columns.isin(omit_cols)] +my_df_filt_cols = my_df_filt.columns + +#fill NaNs with column means in each column +my_df_filt2 = my_df_filt.fillna(my_df_filt.mean()) +my_df_filt3 = my_df_filt.fillna(my_df_filt.median()) + +my_df_filt_noNA = my_df_filt.fillna(0) + +summ = my_df_filt.describe() +summ_noNA = my_df_filt_noNA.describe() + +foo = my_df_filt['or_mychisq'].value_counts() +foo = foo.to_frame() + +######################## +# [WATCH]: Drop na +my_df2 = my_df_filt3.dropna() +my_df2['resistance'] = my_df2['or_mychisq'].apply(lambda x: 0 if x <=1 else 1) +my_df2['resistance'].value_counts() +y = my_df2['resistance'] +y.value_counts() + + +#%%============================================================================ +X_validation_muts = my_df['mutationinformation'][~my_df['mutationinformation'].isin(my_df2['mutationinformation'])] +X_validation_all = my_df_filt3[~my_df_filt3['mutationinformation'].isin(my_df2['mutationinformation'])] +X_validation_f = X_validation_all.loc[:, ~X_validation_all.columns.isin(['or_mychisq', 'resistance'])] +X_validation = X_validation_f.set_index('mutationinformation') + +#%% fill na in cols with mean value +X_validation.info() +X_validation.isna().any() + +na_df = X_validation_f[X_validation_f.columns[X_validation_f.isna().any()]] +na_colnames = X_validation_f.columns[X_validation_f.isna().any()] +na_colsL = list(na_colnames) + +#============================================================================== +omit_cols_y = ['or_mychisq', 'resistance'] +my_df_ml = my_df2.loc[:, ~my_df2.columns.isin(omit_cols_y)] +#%%############################################################################ +X_train = my_df_ml.set_index('mutationinformation') +#X_train = X_train.iloc[:,:4] +y_train = y +#X_train = X_train.dropna() +#y_train = y.dropna() + +# check dim +X_train.shape +y_train.shape + + +############################################################################### +from sklearn.pipeline import Pipeline +from sklearn.linear_model import LogisticRegression +from sklearn.model_selection import cross_validate +from sklearn.metrics import make_scorer +from sklearn.metrics import accuracy_score, precision_score, recall_score + +model_logisP = Pipeline(steps = [('preprocess', preprocessing.MinMaxScaler()) + , ('logis', LogisticRegression(class_weight = 'unbalanced')) + ]) + +model_logisP.fit(X_train, y_train) +fitted_vals = model_logisP.predict(X_train) +fitted_vals + +# gives the array of predictions +model_logisP.predict(X_train) +model_logisP.predict(X_validation) +y_pred = model_logisP.predict(X_train) +y_pred2 = model_logisP.predict(X_validation) + +accuracy_score(y_train,y_pred2) +precision_score(y_train,y_pred2, pos_label = 1)# tp/(tp + fp) +recall_score(y_train,y_pred2, pos_label = 1) # tp/(tp + fn) + + +################ +acc = make_scorer(accuracy_score) +def precision(y_true,y_pred): + return precision_score(y_true,y_pred,pos_label = 1) #0 + +def recall(y_true,y_pred): + return recall_score(y_true, y_pred, pos_label = 1) #0 + +prec = make_scorer(precision) +rec = make_scorer(recall) +output = cross_validate(model_logisP + , X_train + , y + , scoring = {'acc' : acc + ,'prec' : prec + ,'rec' : rec} + , cv = 10, return_train_score = False) + +pd.DataFrame(output).mean() \ No newline at end of file diff --git a/my_datap3.py b/my_datap3.py new file mode 100644 index 0000000..cfe5abd --- /dev/null +++ b/my_datap3.py @@ -0,0 +1,376 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Thu Feb 24 10:48:10 2022 + +@author: tanu +""" +############################################################################### +# questions: + # which data to use: merged_df3 or merged_df2 + # which is the target? or_mychisq or drtype col + # scaling: can it be from -1 to 1? + # how to include the mutation information? + # 'wild_type', 'mutant', 'postion' + # whether to log transform the af and or cols + # to allow mean mode values to be imputed for validation set + # whether to calculate mean, median accounting for NA or removing them? + +# strategy: + # available data = X_train + # available data but NAN = validation_test + # test data: mut generated not in mcsm + +############################################################################### +import os, sys +import re +from sklearn.datasets import load_boston +from sklearn import datasets +from sklearn import linear_model +from sklearn import preprocessing +import pandas as pd +import seaborn as sns +import matplotlib.pyplot as plt +import numpy as np +print(np.__version__) +print(pd.__version__) +from statistics import mean, stdev + +from sklearn.linear_model import LogisticRegression +from sklearn.model_selection import cross_validate +from sklearn.metrics import make_scorer + +from sklearn.ensemble import RandomForestClassifier +from sklearn.pipeline import Pipeline +from sklearn.pipeline import make_pipeline +from sklearn.datasets import load_digits +from sklearn.model_selection import train_test_split +from sklearn.model_selection import GridSearchCV +from sklearn.metrics import confusion_matrix +from sklearn.model_selection import StratifiedKFold + +from sklearn.preprocessing import OneHotEncoder +from sklearn.compose import make_column_transformer +from sklearn.ensemble import RandomForestClassifier + +from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, f1_score + +#%% read data +homedir = os.path.expanduser("~") +os.chdir(homedir + "/git/ML_AI_training/test_data") + +# this needs to be merged_df2 or merged_df3? +my_df = pd.read_csv("pnca_all_params.csv") + +my_df.dtypes +my_df_cols = my_df.columns + +omit_cols1 = ['pdb_file' + , 'seq_offset4pdb' + , 'mut_3upper' + , 'wild_pos' + , 'wild_chain_pos' + , 'chain' + , 'wt_3upper' + , 'consurf_colour' + , 'consurf_colour_rev' + , 'consurf_msa_data' + , 'consurf_aa_variety' + , 'snap2_accuracy_pc' + , 'beta_logistic' + , 'se_logistic' + , 'zval_logisitc' + , 'pval_chisq' + , 'log10_or_mychisq' + , 'neglog_pval_fisher' + , 'or_fisher' + , 'wild_type' + , 'mutant_type' + , 'position' + , 'ligand_id' + , 'mutation' + , 'ss' + , 'ss_class' # include it later? + , 'contacts' + ] + +omit_cols2 = list(my_df.columns[my_df.columns.str.contains(".*ci_.*") | my_df.columns.str.contains(".*_scaled*") | my_df.columns.str.contains(".*_outcome*")]) + +# [WATCH:] just to test since these have negative values! +omit_cols3 = list(my_df.columns[my_df.columns.str.contains("electro_.*") | my_df.columns.str.contains("disulfide_.*") | my_df.columns.str.contains("hbonds_.*") | my_df.columns.str.contains("partcov_.*") | my_df.columns.str.contains("vdwclashes.*") | my_df.columns.str.contains("volumetric.*")]) + +omit_cols = omit_cols1 + omit_cols2 + omit_cols3 + +# Filter df: Filter columns to focus on my selected ones +my_df_filt = my_df.loc[:, ~my_df.columns.isin(omit_cols)] +my_df_filt_cols = my_df_filt.columns + +#Fill na of filtered df: fill NaNs with column means/medians in each column +my_df_filt2 = my_df_filt.fillna(my_df_filt.mean()) +my_df_filt3 = my_df_filt.fillna(my_df_filt.median()) +#my_df_filt_noNA = my_df_filt.fillna(0) + +summ = my_df_filt.describe() +summ2 = my_df_filt2.describe() +summ3 = my_df_filt3.describe() +#summ_noNA = my_df_filt_noNA.describe() + +######################## +# [WATCH]: Drop na +# Get Y +my_df2 = my_df_filt.dropna() +my_df2['resistance'] = my_df2['or_mychisq'].apply(lambda x: 0 if x <=1 else 1) +my_df2['resistance'].value_counts() +Y = my_df2['resistance'] +Y = np.array(Y) +#Y = Y.reset_index() +#Y = Y.drop(['index'], axis = 1) +#Y.value_counts() +#Y = np.array(Y) + +# GET X +omit_cols_y = ['or_mychisq', 'resistance'] +my_df_ml = my_df2.loc[:, ~my_df2.columns.isin(omit_cols_y)] +#my_df_ml = my_df_ml.set_index('mutationinformation') +X = my_df_ml +X = X.drop(['mutationinformation'], axis = 1) +X = np.array(X) + +#X = X.reset_index() + + +# check dim +X.shape +Y.shape +my_df2 = my_df2.reset_index() + +##################### +#https://stackoverflow.com/questions/49134338/kfolds-cross-validation-vs-train-test-split +rf = RandomForestClassifier(n_estimators=100, random_state=42) + +#https://towardsdatascience.com/stratified-k-fold-what-it-is-how-to-use-it-cf3d107d3ea2 +# k-FOLD +print('Class Ratio:', + sum(Y)/len(Y)) + +print('Class Ratio:', + sum(my_df2['resistance'])/len(my_df2['resistance']) + ) + +target = my_df2.loc[:,'resistance'] + +skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42) +fold_no = 1 +for train_index, test_index in skf.split(my_df2, target): + train = my_df2.loc[train_index,:] + test = my_df2.loc[test_index,:] + print('Fold',str(fold_no), + 'Class Ratio:', + sum(test['resistance'])/len(test['resistance'])) + fold_no += 1 + +model_logisP = Pipeline(steps = [('preprocess', preprocessing.MinMaxScaler()) + , ('logis', LogisticRegression(class_weight = 'unbalanced')) + ]) + +X_features = X_train.columns.to_list() + +def train_model(train, test, fold_no): + X = X_features + y = ['resistance'] + X_train = train[X] + y_train = train[y] + X_test = test[X] + y_test = test[y] + model_logisP.fit(X_train,y_train) + predictions = model_logisP.predict(X_test) + print('Fold',str(fold_no), + 'Accuracy:', + accuracy_score(y_test,predictions)) + + +fold_no = 1 +for train_index, test_index in skf.split(my_df2, target): + train = my_df2.loc[train_index,:] + test = my_df2.loc[test_index,:] + train_model(train,test,fold_no) + fold_no += 1 +#%% +skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=20) +lst_accu_stratified = [] +scaler = preprocessing.MinMaxScaler() +X_scaled = scaler.fit_transform(X) +X_scaled = X_scaled[:,[1,2,3,15,16]] + +#lr = linear_model.LogisticRegression(class_weight = 'unbalanced') +lr = linear_model.LogisticRegression() + +for train_index1, test_index1 in skf.split(X, Y): + #print(train_index) + #print(test_index) + x_train_fold1, x_test_fold1 = X_scaled[train_index1], X_scaled[test_index1] + y_train_fold1, y_test_fold1 = Y[train_index1], Y[test_index1] + lr.fit(x_train_fold1, y_train_fold1) + lst_accu_stratified.append(lr.score(x_test_fold1, y_test_fold1)) + +# print output +print('List of possible accuracy', lst_accu_stratified) +print('Max accuracy:', max(lst_accu_stratified)*100, "%") +print('Min accuracy:', min(lst_accu_stratified)*100, "%") +print('Mean accuracy:', mean(lst_accu_stratified)*100,"%") +print('St Dev:', stdev(lst_accu_stratified)*100,"%") + + +# cancer data +cancer = datasets.load_breast_cancer() +x = cancer.data +y = cancer.target +skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42) +lst_accu_stratifiedC = [] +scaler = preprocessing.MinMaxScaler() +x_scaled = scaler.fit_transform(x) +x_scaled = x_scaled[:,[1,2,3, 15, 16]] + +for train_index, test_index in skf.split(x, y): + #print(train_index) + #print(test_index) + x_train_fold, x_test_fold = x_scaled[train_index], x_scaled[test_index] + y_train_fold, y_test_fold = y[train_index], y[test_index] + lr.fit(x_train_fold, y_train_fold) + lst_accu_stratifiedC.append(lr.score(x_test_fold, y_test_fold)) + +# print output +print('List of possible accuracy', lst_accu_stratifiedC) +print('Max accuracy:', max(lst_accu_stratifiedC)*100, "%") +print('Min accuracy:', min(lst_accu_stratifiedC)*100, "%") +print('Mean accuracy:', mean(lst_accu_stratifiedC)*100,"%") +print('St Dev:', stdev(lst_accu_stratifiedC)*100,"%") + +#%% +## +# https://towardsdatascience.com/my-random-forest-classifier-cheat-sheet-in-python-fedb84f8cf4f +y_all = my_df_filt['or_mychisq'].apply(lambda x: 0 if x <=1 else 1) +X_all = my_df_filt.drop(['mutationinformation', 'or_mychisq'], axis = 1) +seed = 20 # so that the result is reproducible + +X_all = my_df_filt.drop(['mutationinformation', 'or_mychisq'], axis = 1) +X_all = X_all.iloc[:,:6] + +X_train, X_test, y_train, y_test = train_test_split(X_all,y_all + , test_size=0.333 + , random_state = seed) +# Now, it is time to make NA a category. +# In Python, NaN is considered NAs. +# When encoded, those NaN will be ignored. +# Hence, it is useful to replace NaN with na, which is now a category called ‘na’. +# This will be taken into account when encoding later on. +#X_train = X_train.fillna('na') +#X_test = X_test.fillna('na') + +X_train = X_train.fillna(X_train.median()) +X_test = X_test.fillna(X_test.median()) + +X_train.dtypes + +features_to_encode = list(X_train.select_dtypes(include = ['object']).columns) + +col_trans = make_column_transformer( + (OneHotEncoder(),features_to_encode), + remainder = "passthrough" + ) + +rf_classifier = RandomForestClassifier( + min_samples_leaf=50, + n_estimators=150, + bootstrap=True, + oob_score=True, + n_jobs=-1, + random_state=seed, + max_features='auto') + +pipe = make_pipeline(col_trans, rf_classifier) +pipe.fit(X_train, y_train) +y_pred = pipe.predict(X_test) + +accuracy_score(y_test, y_pred) +print(f"The accuracy of the model is {round(accuracy_score(y_test,y_pred),3)*100} %") + +recall_score(y_test, y_pred) +precision_score(y_test, y_pred) +f1_score(y_test, y_pred) +roc_auc_score (y_test, y_pred) +roc_curve(y_test, y_pred) + +train_probs = pipe.predict_proba(X_train)[:,1] +probs = pipe.predict_proba(X_test)[:, 1] +train_predictions = pipe.predict(X_train) + +print(f'Train ROC AUC Score: {roc_auc_score(y_train, train_probs)}') +print(f'Test ROC AUC Score: {roc_auc_score(y_test, probs)}') + +def evaluate_model(y_pred, probs,train_predictions, train_probs): + baseline = {} + baseline['recall']=recall_score(y_test, + [1 for _ in range(len(y_test))]) + baseline['precision'] = precision_score(y_test, + [1 for _ in range(len(y_test))]) + baseline['roc'] = 0.5 + results = {} + results['recall'] = recall_score(y_test, y_pred) + results['precision'] = precision_score(y_test, y_pred) + results['roc'] = roc_auc_score(y_test, probs) + train_results = {} + train_results['recall'] = recall_score(y_train, + train_predictions) + train_results['precision'] = precision_score(y_train, train_predictions) + train_results['roc'] = roc_auc_score(y_train, train_probs) + # for metric in ['recall', 'precision', 'roc']: + # print(f"Baseline: {round(baseline[metric], 2)}Test: {round(results[metric], 2)} Train: {round(train_results[metric], 2)}") + + # Calculate false positive rates and true positive rates + base_fpr, base_tpr, _ = roc_curve(y_test, [1 for _ in range(len(y_test))]) + model_fpr, model_tpr, _ = roc_curve(y_test, probs) + plt.figure(figsize = (8, 6)) + plt.rcParams['font.size'] = 16 + # Plot both curves + plt.plot(base_fpr, base_tpr, 'b', label = 'baseline') + plt.plot(model_fpr, model_tpr, 'r', label = 'model') + plt.legend(); plt.xlabel('False Positive Rate'); + plt.ylabel('True Positive Rate'); plt.title('ROC Curves'); + plt.show() + +# Recall Baseline: 1.0 Test: 0.92 Train: 0.93 +# Precision Baseline: 0.48 Test: 0.9 Train: 0.91 +# Roc Baseline: 0.5 Test: 0.97 Train: 0.97 + +evaluate_model(y_pred,probs,train_predictions,train_probs) +#%% +import itertools +def plot_confusion_matrix(cm, classes, normalize = False, + title='Confusion matrix', + cmap=plt.cm.Greens): # can change color + plt.figure(figsize = (10, 10)) + plt.imshow(cm, interpolation='nearest', cmap=cmap) + plt.title(title, size = 24) + plt.colorbar(aspect=4) + tick_marks = np.arange(len(classes)) + plt.xticks(tick_marks, classes, rotation=45, size = 14) + plt.yticks(tick_marks, classes, size = 14) + fmt = '.2f' if normalize else 'd' + thresh = cm.max() / 2. + # Label the plot + for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): plt.text(j, i, format(cm[i, j], fmt), + fontsize = 20, + horizontalalignment="center", + color="white" if cm[i, j] > thresh else "black") + plt.grid(None) + plt.tight_layout() + plt.ylabel('True label', size = 18) + plt.xlabel('Predicted label', size = 18) +# Let's plot it out +cm = confusion_matrix(y_test, y_pred) +plot_confusion_matrix(cm, classes = ['0 - Susceptible', '1 - Resistant'], + title = 'R/S Confusion Matrix') + +print(rf_classifier.feature_importances_) +print(f" There are {len(rf_classifier.feature_importances_)} features in total") \ No newline at end of file