diff --git a/imports.py b/imports.py
index ab3606c..454601d 100644
--- a/imports.py
+++ b/imports.py
@@ -27,6 +27,8 @@ from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef
 from sklearn.metrics import make_scorer
 from sklearn.metrics import classification_report
+from sklearn.metrics import average_precision_score
+
 from sklearn.model_selection import cross_validate
 from sklearn.model_selection import train_test_split
 from sklearn.model_selection import StratifiedKFold
@@ -43,6 +45,16 @@ import numpy as np
 print(np.__version__)
 print(pd.__version__)
 from statistics import mean, stdev, median, mode
+
+from imblearn.over_sampling import RandomOverSampler
+from imblearn.over_sampling import SMOTE
+from imblearn.pipeline import Pipeline
+#from sklearn.datasets import make_classification
+from sklearn.model_selection import cross_validate
+from sklearn.model_selection import RepeatedStratifiedKFold
+from sklearn.ensemble import AdaBoostClassifier
+from imblearn.combine import SMOTEENN
+from imblearn.under_sampling import EditedNearestNeighbours
 #%%
 homedir = os.path.expanduser("~")
 os.chdir(homedir + "/git/ML_AI_training/")
@@ -52,8 +64,8 @@ from MultClassPipe import MultClassPipeline
 from MultClassPipe2 import MultClassPipeline2
 from loopity_loop import MultClassPipeSKF
 
-gene = 'rpoB'
-drug = 'rifampicin'
+gene = 'pncA'
+drug = 'pyrazinamide'
 
 #==============
 # directories
@@ -79,64 +91,33 @@ geneL_ppi2 = ['alr', 'embb', 'katg']
 #%% get cols
 mycols = my_df.columns
 
-my_df['active_aa_pos'].dtype
-my_df['active_aa_pos'] = my_df['active_aa_pos'].astype(object)
+# change active_aa_pos from numeric to categorical (object) if needed
+num_type = ['int64', 'float64']
+cat_type = ['object', 'bool']
+if my_df['active_aa_pos'].dtype in num_type:
+    my_df['active_aa_pos'] = my_df['active_aa_pos'].astype(object)
+    my_df['active_aa_pos'].dtype
+
+# FIXME: if this is not structural, remove from source..
+# Drop NA where numerical cols have them
 if gene.lower() in geneL_na_ppi2:
-    x_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist']
     #D1148 get rid of
     na_index = my_df['mutationinformation'].index[my_df['mcsm_na_affinity'].apply(np.isnan)]
     my_df = my_df.drop(index=na_index)
+# FIXME: either impute or remove!
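+# (sketch) the imputation route this FIXME mentions could look like the
+# commented lines below; median-filling via sklearn's SimpleImputer is an
+# assumption, not a decision made here:
+# from sklearn.impute import SimpleImputer
+# imp = SimpleImputer(strategy = 'median')
+# my_df[['mcsm_na_affinity']] = imp.fit_transform(my_df[['mcsm_na_affinity']])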
+# for embb (L114M, F115L, V123L, V125I, V131M) delete for now
+if gene.lower() in ['embb']:
+    na_index = my_df['mutationinformation'].index[my_df['ligand_distance'].apply(np.isnan)]
+    my_df = my_df.drop(index=na_index)
 
 #%%============================================================================
-# GET Y
-# Target1: mutation_info_labels
-dm_om_map = {'DM': 1, 'OM': 0}
-target1 = my_df['mutation_info_labels'].map(dm_om_map)
-target1.value_counts()
-
-# Target2: drug
-drug_labels = drug + '_labels'
-drug_labels
-my_df[drug_labels] = my_df[drug].map({1: 'resistant', 0: 'sensitive'})
-my_df[drug_labels].value_counts()
-my_df[drug_labels] = my_df[drug_labels].fillna('unknown')
-my_df[drug_labels].value_counts()
-target2 = my_df[drug_labels]
-
-# Target3: drtype [Binary]
-drtype_labels = 'drtype_labels'
-my_df[drtype_labels] = my_df['drtype'].map({'Sensitive' : 0
-                                            , 'Other' : 0
-                                            , 'Pre-MDR' : 1
-                                            , 'MDR' : 1
-                                            , 'Pre-XDR' : 1
-                                            , 'XDR' : 1})
-# target3 = 'drtype' [Multinomial]
-target3 = my_df[drtype_labels]
-
-# target4
-drtype_labels2 = 'drtype_labels2'
-my_df[drtype_labels2] = my_df['drtype'].map({'Sensitive' : 0
-                                             , 'Other' : 0
-                                             , 'Pre-MDR' : 1
-                                             , 'MDR' : 1
-                                             , 'Pre-XDR' : 2
-                                             , 'XDR' : 2})
-target4 = my_df[drtype_labels2]
-
-# sanity checks
-target1.value_counts()
-my_df['mutation_info_labels'].value_counts()
-
-target2.value_counts()
-my_df[drug_labels].value_counts()
-
-target3.value_counts()
-my_df['drtype'].value_counts()
-target4.value_counts()
-my_df['drtype'].value_counts()
+# Target1: mutation_info_labels, converted to binary (DM = 1, OM = 0)
+dm_om_map = {'DM': 1, 'OM': 0} # pnca: OM is minority; other genes: DM is minority
+my_df['mutation_class'] = my_df['mutation_info_labels'].map(dm_om_map)
+my_df['mutation_class'].value_counts()
+my_df['mutation_info_labels'].value_counts()
 
 #%%
 # GET X
@@ -159,10 +140,7 @@ if gene.lower() in geneL_na:
 if gene.lower() in geneL_na_ppi2:
     x_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist']
-    #D1148 get rid of
-    na_index = my_df['mutationinformation'].index[my_df['mcsm_na_affinity'].apply(np.isnan)]
-    my_df = my_df.drop(index=na_index)
-
+
 X_strFN = ['asa'
            , 'rsa'
            , 'kd_values'
@@ -172,53 +150,48 @@ X_evolFN = ['consurf_score'
             , 'snap2_score'
             , 'snap2_accuracy_pc']
 
-# TODO: ADD ED values
-# Problematic due to NA: filling NA with unknown or string will make it categorical
-# OPTIONS
-# 1. Imputing: KNN or MICE or from distribution
-# 2. Fill na with median or mode
-# 3. Separate datset without including genomic features AT ALL for ML, then using this as a 'blind test set'
-    # this means the size of the training data gets reduced!
-# 4. Remove genomic features from ML COMPLETELEY!
-
 # X_genomicFN = ['af'
 #                , 'or_mychisq'
 #                , 'or_logistic'
 #                , 'or_fisher'
 #                , 'pval_fisher']
 
-#%% try combinations
-X_vars1 = my_df[x_stabilityN]
-X_vars2 = my_df[X_strFN]
-X_vars3 = my_df[X_evolFN]
-
-X_vars5 = my_df[x_stabilityN + X_strFN]
-X_vars6 = my_df[x_stabilityN + X_evolFN]
-#X_vars7 = my_df[x_stabilityN + X_genomicFN]
-X_vars8 = my_df[X_strFN + X_evolFN]
-#X_vars9 = my_df[X_strFN + X_genomicFN]
-#X_vars10 = my_df[X_evolFN + X_genomicFN]
-X_vars11 = my_df[x_stabilityN + X_strFN + X_evolFN]
-#X_vars12 = my_df[x_stabilityN + X_strFN + X_evolFN + X_genomicFN]
-
-numerical_features_names = x_stabilityN + X_strFN + X_evolFN
+#%% Construct numerical and categorical column names
+numerical_FN = x_stabilityN + X_strFN + X_evolFN # separate ones for foldx?
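+
+# (sketch) how the imblearn imports at the top are meant to be wired up:
+# resampling has to sit inside an imblearn Pipeline so SMOTE only touches
+# the training folds during CV. Kept commented out; the classifier and
+# scoring choices here are illustrative assumptions:
+# model = Pipeline([('sampler', SMOTE(random_state = 42))
+#                   , ('clf', AdaBoostClassifier(random_state = 42))])
+# cv = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 42)
+# scores = cross_validate(model, my_df[numerical_FN], my_df['mutation_class']
+#                         , cv = cv, scoring = 'f1')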
-categorical_features_names = ['ss_class' - , 'wt_prop_water' - # , 'lineage_labels' # misleading if using merged_df3 - , 'mut_prop_water' - , 'wt_prop_polarity' - , 'mut_prop_polarity' - , 'wt_calcprop' - , 'mut_calcprop' - , 'active_aa_pos'] +categorical_FN = ['ss_class' + , 'wt_prop_water' + # , 'lineage_labels' # misleading if using merged_df3 + , 'mut_prop_water' + , 'wt_prop_polarity' + , 'mut_prop_polarity' + , 'wt_calcprop' + , 'mut_calcprop' + , 'active_aa_pos'] -numerical_features_df = my_df[numerical_features_names] -numerical_features_df.shape +#%% extracting dfs based on numerical, categorical column names +#---------------------------------- +# WITHOUT the target var included +#---------------------------------- +num_df = my_df[numerical_FN] +num_df.shape -categorical_features_df = my_df[categorical_features_names] -categorical_features_df.shape +cat_df = my_df[categorical_FN] +cat_df.shape -all_features_df = my_df[numerical_features_names + categorical_features_names] -all_features_df.shape \ No newline at end of file +all_df = my_df[numerical_FN + categorical_FN] +all_df.shape + +#------------------------------ +# WITH the target var included: + #'wtgt': with target +#------------------------------ +num_df_wtgt = my_df[numerical_FN + ['mutation_class']] +num_df_wtgt.shape + +cat_df_wtgt = my_df[categorical_FN + ['mutation_class']] +cat_df_wtgt.shape + +all_df_wtgt = my_df[numerical_FN + categorical_FN + ['mutation_class']] +all_df_wtgt.shape diff --git a/loopity_loop_CALL b/loopity_loop_CALL deleted file mode 100644 index 4916d2b..0000000 --- a/loopity_loop_CALL +++ /dev/null @@ -1,90 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Fri Mar 11 11:15:50 2022 - -@author: tanu -""" -#%% -del(t3_res) -t3_res = MultClassPipeSKF(input_df = numerical_features_df - , y_targetF = target1 - , var_type = 'numerical' - , skf_splits = 10) -pp.pprint(t3_res) -#print(t3_res) - -#%% Manually: mean for each model, each metric -model_name = 'Logistic Regression' -model_name = 'Naive Bayes' -model_name = 'K-Nearest Neighbors' -model_name = 'SVM' - -#%% -model_metric = 'F1_score' - -log_reg_f1 = [] -for key in t3_res[model_name]: - log_reg_f1.append(t3_res[model_name][key][model_metric]) - log_reg_f1M = mean(log_reg_f1) - print('key:', key, model_metric, ':', log_reg_f1) -print(log_reg_f1M) - -log_reg_f1df = pd.DataFrame({model_name: [log_reg_f1M]}, index = [model_metric]) -log_reg_f1df - -#%% -model_metric = 'MCC' -log_reg_mcc = [] -for key in t3_res[model_name]: - log_reg_mcc.append(t3_res[model_name][key][model_metric]) - log_reg_mccM = mean(log_reg_mcc) - print('key:', key, model_metric, ':', log_reg_mcc) -print(log_reg_mccM) - -log_reg_mccdf = pd.DataFrame({model_name: [log_reg_mccM]}, index = [model_metric]) -log_reg_mccdf -#%% -model_metric = 'Precision' -log_reg_pres = [] -for key in t3_res[model_name]: - log_reg_pres.append(t3_res[model_name][key][model_metric]) - log_reg_presM = mean(log_reg_pres) - print('key:', key, model_metric, ':', log_reg_pres) -print(log_reg_presM) - -log_reg_presdf = pd.DataFrame({model_name: [log_reg_presM]}, index = [model_metric]) -log_reg_presdf -#%% -model_metric = 'Recall' -log_reg_recall = [] -for key in t3_res[model_name]: - log_reg_recall.append(t3_res[model_name][key][model_metric]) - log_reg_recallM = mean(log_reg_recall) - print('key:', key, model_metric, ':', log_reg_recall) -print(log_reg_recallM) - -log_reg_recalldf = pd.DataFrame({model_name: [log_reg_recallM]}, index = [model_metric]) -log_reg_recalldf -#%% 
-model_metric = 'Accuracy'
-log_reg_accu = []
-for key in t3_res[model_name]:
-    log_reg_accu.append(t3_res[model_name][key][model_metric])
-    log_reg_accuM = mean(log_reg_accu)
-    print('key:', key, model_metric, ':', log_reg_accu)
-print(log_reg_accuM)
-
-log_reg_accudf = pd.DataFrame({model_name: [log_reg_accuM]}, index = [model_metric])
-log_reg_accudf
-#%%
-model_metric = 'ROC_AUC'
-log_reg_roc_auc = []
-for key in t3_res[model_name]:
-    log_reg_roc_auc.append(t3_res[model_name][key][model_metric])
-    log_reg_roc_aucM = mean(log_reg_roc_auc)
-    print('key:', key, model_metric, ':', log_reg_roc_auc)
-print(log_reg_roc_aucM)
-
-log_reg_roc_aucdf = pd.DataFrame({model_name: [log_reg_roc_aucM]}, index = [model_metric])
-log_reg_roc_aucdf
\ No newline at end of file
diff --git a/loopity_loop_CALL.py b/loopity_loop_CALL.py
new file mode 100644
index 0000000..72d33b0
--- /dev/null
+++ b/loopity_loop_CALL.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Mar 11 11:15:50 2022

+@author: tanu
+"""
+#%%
+del t3_res
+# num_df and my_df['mutation_class'] are defined in imports.py
+t3_res = MultClassPipeSKF(input_df = num_df
+                          , y_targetF = my_df['mutation_class']
+                          , var_type = 'numerical'
+                          , skf_splits = 10)
+pp.pprint(t3_res)
+#print(t3_res)
+
+#%% Manually: mean for each model, each metric
+# run ONE of these assignments to pick the model to inspect
+model_name = 'Logistic Regression'
+model_name = 'Naive Bayes'
+model_name = 'K-Nearest Neighbors'
+model_name = 'SVM'
+
+#%%
+model_metric = 'F1_score'
+
+log_reg_f1 = []
+for key in t3_res[model_name]:
+    log_reg_f1.append(t3_res[model_name][key][model_metric])
+    log_reg_f1M = mean(log_reg_f1)
+    print('key:', key, model_metric, ':', log_reg_f1)
+print(log_reg_f1M)
+
+log_reg_f1df = pd.DataFrame({model_name: [log_reg_f1M]}, index = [model_metric])
+log_reg_f1df
+
+#%%
+model_metric = 'MCC'
+log_reg_mcc = []
+for key in t3_res[model_name]:
+    log_reg_mcc.append(t3_res[model_name][key][model_metric])
+    log_reg_mccM = mean(log_reg_mcc)
+    print('key:', key, model_metric, ':', log_reg_mcc)
+print(log_reg_mccM)
+
+log_reg_mccdf = pd.DataFrame({model_name: [log_reg_mccM]}, index = [model_metric])
+log_reg_mccdf
+
+#%% Classification metrics we need to mean()
+classification_metrics = {
+    'F1_score': []
+    , 'MCC': []
+    , 'Precision': []
+    , 'Recall': []
+    , 'Accuracy': []
+    }
+# "mean() of the current metric across all folds for this model"
+
+# the output containing all the metrics across all folds for each model
+out = {}
+# just the mean() of each of the above metrics per model
+out_means = {}
+
+# Build up out{} from t3_res, which came from loopity_loop
+for model in t3_res:
+    # NOTE: don't reuse one dict object across models
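+    # e.g. `out[model] = classification_metrics` would merely alias the
+    # same shared lists for every model; a fresh dict per model avoids that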
+    out[model] = {metric: [] for metric in classification_metrics}
+    out_means[model] = {} # just to make life easier
+    print(model)
+    for fold in t3_res[model]:
+        for metric in classification_metrics:
+            metric_value = t3_res[model][fold][metric]
+            out[model][metric].append(metric_value)
+
+# now that we've built out{}, let's mean() each metric
+for model in out:
+    for metric in classification_metrics:
+        metric_mean = mean(out[model][metric])
+        # just some debug output
+        # print('model:', model
+        #       , 'metric: ', metric
+        #       , metric_mean
+        #       )
+        out[model].update({(metric + '_mean'): metric_mean})
+        out_means[model].update({(metric + '_mean'): metric_mean})
+
+out_scores = pd.DataFrame(out_means)
diff --git a/my_datap10.py b/my_datap10.py
index 43148f1..7637502 100644
--- a/my_datap10.py
+++ b/my_datap10.py
@@ -165,8 +165,8 @@ nb = BernoulliNB()
 rf = RandomForestClassifier(**rs)
 
 clfs = [('Logistic Regression', log_reg)
-        ,('Naive Bayes', nb)
-        , ('Random Forest' , rf)
+        #,('Naive Bayes', nb)
+        #, ('Random Forest' , rf)
         ]
 
 #seed_skf = 42
diff --git a/my_datap11.py b/my_datap11.py
index e007cd1..5c6a003 100644
--- a/my_datap11.py
+++ b/my_datap11.py
@@ -163,7 +163,7 @@ for train_index, test_index in skf.split(input_df, y_targetF):
     pres = precision_score(y_test_fold, y_pred_fold)
     presL.append(pres)
     presM = mean(presL)
-
+
     # Recall
     recall = recall_score(y_test_fold, y_pred_fold)
     recallL.append(recall)
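+    # (sketch) an alternative to the manual per-metric lists above:
+    # sklearn's cross_validate can collect every fold score in one call.
+    # Kept commented out; `model` and the metric names are illustrative:
+    # scoring = {'Accuracy': 'accuracy', 'F1_score': 'f1'
+    #            , 'Precision': 'precision', 'Recall': 'recall'
+    #            , 'ROC_AUC': 'roc_auc'
+    #            , 'MCC': make_scorer(matthews_corrcoef)}
+    # cv_res = cross_validate(model, input_df, y_targetF, cv = skf, scoring = scoring)
+    # pd.DataFrame(cv_res).mean() # mean of each metric across the folds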