loopity_loop_CALL

2022-03-14 18:36:23 +00:00 · 2022-03-14 18:36:23 +00:00 · 160053d361
commit 160053d361
parent 7aead2d4f4
5 changed files with 163 additions and 188 deletions
--- a/imports.py
+++ b/imports.py
@ -27,6 +27,8 @@ from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoe
 from sklearn.metrics import make_scorer
 from sklearn.metrics import classification_report

+from sklearn.metrics import average_precision_score
+
 from sklearn.model_selection import cross_validate
 from sklearn.model_selection import train_test_split
 from sklearn.model_selection import StratifiedKFold
@ -43,6 +45,16 @@ import numpy as np
 print(np.__version__)
 print(pd.__version__)
 from statistics import mean, stdev, median, mode
+
+from imblearn.over_sampling import RandomOverSampler
+from imblearn.over_sampling import SMOTE
+from imblearn.pipeline import Pipeline
+#from sklearn.datasets import make_classification
+from sklearn.model_selection import cross_validate
+from sklearn.model_selection import RepeatedStratifiedKFold
+from sklearn.ensemble import AdaBoostClassifier
+from imblearn.combine import SMOTEENN
+from imblearn.under_sampling import EditedNearestNeighbours
 #%%
 homedir = os.path.expanduser("~")
 os.chdir(homedir + "/git/ML_AI_training/")
@ -52,8 +64,8 @@ from MultClassPipe import MultClassPipeline
 from MultClassPipe2 import MultClassPipeline2
 from loopity_loop import MultClassPipeSKF

-gene = 'rpoB'
-drug = 'rifampicin'
+gene = 'pncA'
+drug = 'pyrazinamide'

 #==============
 # directories
@ -79,64 +91,33 @@ geneL_ppi2      = ['alr', 'embb', 'katg']
 #%% get cols
 mycols = my_df.columns

-my_df['active_aa_pos'].dtype
-my_df['active_aa_pos'] = my_df['active_aa_pos'].astype(object)
+# change 'active_aa_pos' from numeric to object (categorical) dtype
+num_type = ['int64', 'float64']
+cat_type = ['object', 'bool']

+if my_df['active_aa_pos'].dtype in num_type:
+    my_df['active_aa_pos'] = my_df['active_aa_pos'].astype(object)
+    my_df['active_aa_pos'].dtype
+
+# FIXME: if this is not structural, remove from source..
+# Drop NA where numerical cols have them
 if gene.lower() in geneL_na_ppi2:
-    x_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] 
    #D1148 get rid of
    na_index = my_df['mutationinformation'].index[my_df['mcsm_na_affinity'].apply(np.isnan)]
    my_df = my_df.drop(index=na_index)

+# FIXME: either impute or remove!
+# for embb (L114M, F115L, V123L, V125I, V131M) delete for now
+if gene.lower() in ['embb']:
+    na_index = my_df['mutationinformation'].index[my_df['ligand_distance'].apply(np.isnan)]
+    my_df = my_df.drop(index=na_index)
 #%%============================================================================
-# GET Y

-# Target1: mutation_info_labels
-dm_om_map = {'DM': 1, 'OM': 0}
-target1 = my_df['mutation_info_labels'].map(dm_om_map)
-target1.value_counts()
-
-# Target2: drug
-drug_labels = drug + '_labels'
-drug_labels
-my_df[drug_labels] = my_df[drug].map({1: 'resistant', 0: 'sensitive'})
-my_df[drug_labels].value_counts()
-my_df[drug_labels] = my_df[drug_labels].fillna('unknown')
-my_df[drug_labels].value_counts()
-target2 = my_df[drug_labels]
-
-# Target3: drtype [Binary]
-drtype_labels = 'drtype_labels'
-my_df[drtype_labels] = my_df['drtype'].map({'Sensitive'      : 0
-                                                 , 'Other'   : 0
-                                                 , 'Pre-MDR' : 1
-                                                 , 'MDR'     : 1
-                                                 , 'Pre-XDR' : 1
-                                                 , 'XDR'     : 1})
-# target3 = 'drtype' [Multinomial]
-target3 = my_df[drtype_labels]
-
-# target4
-drtype_labels2 = 'drtype_labels2'
-my_df[drtype_labels2] = my_df['drtype'].map({'Sensitive'     : 0
-                                                 , 'Other'   : 0
-                                                 , 'Pre-MDR' : 1
-                                                 , 'MDR'     : 1
-                                                 , 'Pre-XDR' : 2
-                                                 , 'XDR'     : 2})
-target4 = my_df[drtype_labels2]
-
-# sanity checks
-target1.value_counts()
-my_df['mutation_info_labels'].value_counts()
-
-target2.value_counts()
-my_df[drug_labels].value_counts()
-
-target3.value_counts()
-my_df['drtype'].value_counts()
-target4.value_counts()
-my_df['drtype'].value_counts()
+# Target1: mutation_info_labels, convert to binary 'mutation_class' (DM: 1, OM: 0)
+dm_om_map = {'DM': 1, 'OM': 0} # pnca, OM is minority, other genes: DM is minority
+my_df['mutation_class'] = my_df['mutation_info_labels'].map(dm_om_map)
+my_df['mutation_class'].value_counts()
+my_df['mutation_info_labels']. value_counts()

 #%%
 # GET X
@ -159,10 +140,7 @@ if gene.lower() in geneL_na:

 if gene.lower() in geneL_na_ppi2:
    x_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] 
-    #D1148 get rid of
-    na_index = my_df['mutationinformation'].index[my_df['mcsm_na_affinity'].apply(np.isnan)]
-    my_df = my_df.drop(index=na_index)
-    
+
 X_strFN =  ['asa'
           , 'rsa'
           , 'kd_values'
@ -172,53 +150,48 @@ X_evolFN =  ['consurf_score'
           , 'snap2_score'
           , 'snap2_accuracy_pc']

-# TODO: ADD ED values
-# Problematic due to NA: filling NA with unknown or string will make it categorical
-# OPTIONS
-# 1. Imputing: KNN or MICE or from distribution
-# 2. Fill na with median or mode
-# 3. Separate datset without including genomic features AT ALL for ML, then using this as a 'blind test set'
-    # this means the size of the training data gets reduced!
-# 4. Remove genomic features from ML COMPLETELEY!
-
 # X_genomicFN =  ['af'
 #            , 'or_mychisq'
 #            , 'or_logistic'
 #            , 'or_fisher'
 #            , 'pval_fisher']

-#%% try combinations
-X_vars1 = my_df[x_stabilityN] 
-X_vars2 = my_df[X_strFN] 
-X_vars3 = my_df[X_evolFN] 
-
-X_vars5  = my_df[x_stabilityN + X_strFN]
-X_vars6  = my_df[x_stabilityN + X_evolFN]
-#X_vars7  = my_df[x_stabilityN + X_genomicFN]
-X_vars8  = my_df[X_strFN + X_evolFN]
-#X_vars9  = my_df[X_strFN + X_genomicFN]
-#X_vars10 = my_df[X_evolFN + X_genomicFN]
-X_vars11 = my_df[x_stabilityN + X_strFN + X_evolFN]
-#X_vars12 = my_df[x_stabilityN + X_strFN + X_evolFN + X_genomicFN]
-
-numerical_features_names = x_stabilityN + X_strFN + X_evolFN
+#%% Construct numerical and categorical column names
+numerical_FN = x_stabilityN + X_strFN + X_evolFN

 # separate ones for foldx?
-categorical_features_names = ['ss_class'
-                           , 'wt_prop_water'
-                          # , 'lineage_labels' # misleading if using merged_df3
-                           , 'mut_prop_water'
-                           , 'wt_prop_polarity'
-                           , 'mut_prop_polarity'
-                           , 'wt_calcprop'
-                           , 'mut_calcprop'
-                           , 'active_aa_pos']
+categorical_FN = ['ss_class'
+             , 'wt_prop_water'
+            # , 'lineage_labels' # misleading if using merged_df3
+             , 'mut_prop_water'
+             , 'wt_prop_polarity'
+             , 'mut_prop_polarity'
+             , 'wt_calcprop'
+             , 'mut_calcprop'
+             , 'active_aa_pos']

-numerical_features_df = my_df[numerical_features_names]
-numerical_features_df.shape
+#%% extracting dfs based on numerical, categorical column names
+#----------------------------------
+# WITHOUT the target var included
+#----------------------------------
+num_df = my_df[numerical_FN]
+num_df.shape

-categorical_features_df = my_df[categorical_features_names]
-categorical_features_df.shape
+cat_df = my_df[categorical_FN]
+cat_df.shape

-all_features_df = my_df[numerical_features_names + categorical_features_names]
-all_features_df.shape
+all_df = my_df[numerical_FN + categorical_FN]
+all_df.shape
+
+#------------------------------
+# WITH the target var included:
+    #'wtgt': with target
+#------------------------------
+num_df_wtgt = my_df[numerical_FN + ['mutation_class']]
+num_df_wtgt.shape
+
+cat_df_wtgt = my_df[categorical_FN + ['mutation_class']]
+cat_df_wtgt.shape
+
+all_df_wtgt = my_df[numerical_FN + categorical_FN + ['mutation_class']]
+all_df_wtgt.shape
--- a/90
+++ b/90
@ -1,90 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Fri Mar 11 11:15:50 2022
-
-@author: tanu
-"""
-#%%
-del(t3_res)
-t3_res = MultClassPipeSKF(input_df = numerical_features_df
-                          , y_targetF = target1
-                          , var_type = 'numerical'
-                          , skf_splits = 10)
-pp.pprint(t3_res)
-#print(t3_res)
-
-#%% Manually: mean for each model, each metric
-model_name = 'Logistic Regression'
-model_name = 'Naive Bayes'
-model_name = 'K-Nearest Neighbors'
-model_name = 'SVM'
-
-#%%
-model_metric = 'F1_score'
-
-log_reg_f1 = []
-for key in t3_res[model_name]:
-    log_reg_f1.append(t3_res[model_name][key][model_metric])
-    log_reg_f1M = mean(log_reg_f1)
-    print('key:', key, model_metric, ':', log_reg_f1)
-print(log_reg_f1M)
-
-log_reg_f1df = pd.DataFrame({model_name: [log_reg_f1M]}, index = [model_metric])
-log_reg_f1df
-
-#%%
-model_metric = 'MCC'
-log_reg_mcc = []
-for key in t3_res[model_name]:
-    log_reg_mcc.append(t3_res[model_name][key][model_metric])
-    log_reg_mccM = mean(log_reg_mcc)
-    print('key:', key, model_metric, ':', log_reg_mcc)
-print(log_reg_mccM)
-
-log_reg_mccdf = pd.DataFrame({model_name: [log_reg_mccM]}, index = [model_metric])
-log_reg_mccdf
-#%%
-model_metric = 'Precision'
-log_reg_pres = []
-for key in t3_res[model_name]:
-    log_reg_pres.append(t3_res[model_name][key][model_metric])
-    log_reg_presM = mean(log_reg_pres)
-    print('key:', key, model_metric, ':', log_reg_pres)
-print(log_reg_presM)
-
-log_reg_presdf = pd.DataFrame({model_name: [log_reg_presM]}, index = [model_metric])
-log_reg_presdf
-#%%
-model_metric = 'Recall'
-log_reg_recall = []
-for key in t3_res[model_name]:
-    log_reg_recall.append(t3_res[model_name][key][model_metric])
-    log_reg_recallM = mean(log_reg_recall)
-    print('key:', key, model_metric, ':', log_reg_recall)
-print(log_reg_recallM)
-
-log_reg_recalldf = pd.DataFrame({model_name: [log_reg_recallM]}, index = [model_metric])
-log_reg_recalldf
-#%%
-model_metric = 'Accuracy'
-log_reg_accu = []
-for key in t3_res[model_name]:
-    log_reg_accu.append(t3_res[model_name][key][model_metric])
-    log_reg_accuM = mean(log_reg_accu)
-    print('key:', key, model_metric, ':', log_reg_accu)
-print(log_reg_accuM)
-
-log_reg_accudf = pd.DataFrame({model_name: [log_reg_accuM]}, index = [model_metric])
-log_reg_accudf
-#%% 
-model_metric = 'ROC_AUC'
-log_reg_roc_auc = []
-for key in t3_res[model_name]:
-    log_reg_roc_auc.append(t3_res[model_name][key][model_metric])
-    log_reg_roc_aucM = mean(log_reg_roc_auc)
-    print('key:', key, model_metric, ':', log_reg_roc_auc)
-print(log_reg_roc_aucM)
-
-log_reg_roc_aucdf = pd.DataFrame({model_name: [log_reg_roc_aucM]}, index = [model_metric])
-log_reg_roc_aucdf
--- a/loopity_loop_CALL.py
+++ b/loopity_loop_CALL.py
@ -0,0 +1,92 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Mar 11 11:15:50 2022
+
+@author: tanu
+"""
+#%%
+# (no del(t3_res) needed: the assignment below rebinds it, and del raised NameError on a fresh session)
+t3_res = MultClassPipeSKF(input_df = num_df # was numerical_features_df (renamed in imports.py)
+                          , y_targetF = my_df['mutation_class'] # was target1 (removed from imports.py)
+                          , var_type = 'numerical'
+                          , skf_splits = 10)
+pp.pprint(t3_res)
+#print(t3_res)
+
+#%% Manually: mean for each model, each metric
+model_name = 'Logistic Regression'
+model_name = 'Naive Bayes'
+model_name = 'K-Nearest Neighbors'
+model_name = 'SVM'
+
+
+
+
+
+#%%
+model_metric = 'F1_score'
+# NOTE: the log_reg_* names are historical; the values belong to whichever model_name is set
+log_reg_f1 = []
+for key in t3_res[model_name]:
+    log_reg_f1.append(t3_res[model_name][key][model_metric])
+    print('key:', key, model_metric, ':', log_reg_f1)
+log_reg_f1M = mean(log_reg_f1) # once, after the loop: only the final mean is used
+print(log_reg_f1M)
+
+log_reg_f1df = pd.DataFrame({model_name: [log_reg_f1M]}, index = [model_metric])
+log_reg_f1df
+
+#%%
+model_metric = 'MCC'
+log_reg_mcc = [] # per-fold MCC for the model selected via model_name
+for key in t3_res[model_name]:
+    log_reg_mcc.append(t3_res[model_name][key][model_metric])
+    print('key:', key, model_metric, ':', log_reg_mcc)
+log_reg_mccM = mean(log_reg_mcc) # once, after the loop: only the final mean is used
+print(log_reg_mccM)
+
+log_reg_mccdf = pd.DataFrame({model_name: [log_reg_mccM]}, index = [model_metric])
+log_reg_mccdf
+
+#%%
+
+
+#%% Classification metrics to average over the folds, per model
+classification_metrics = {
+    'F1_score': []
+    ,'MCC': []
+    ,'Precision': []
+    ,'Recall': []
+    ,'Accuracy': []
+    }
+# Only the KEYS above are used below, as the list of metric names; every model
+# gets its own fresh lists so no list object is shared between models.
+
+# out: per model, each metric's per-fold values (later also '<metric>_mean')
+out={}
+# out_means: per model, only the '<metric>_mean' entries
+out_means={}
+
+# Collect the per-fold values from t3_res (returned by MultClassPipeSKF)
+for model in t3_res:
+    print(model)
+    out[model] = {metric: [] for metric in classification_metrics}
+    out_means[model] = {}
+    for fold in t3_res[model]:
+        fold_scores = t3_res[model][fold]
+        for metric in classification_metrics:
+            out[model][metric].append(fold_scores[metric])
+
+# Now average every metric over the folds and store it in both dicts
+for model in out:
+    for metric in classification_metrics:
+        mean_key = metric + '_mean'
+        metric_mean = mean(out[model][metric])
+        # debug output, enable when needed:
+        # print('model:', model, 'metric: ', metric, metric_mean)
+        out[model][mean_key] = metric_mean
+        out_means[model][mean_key] = metric_mean
+
+# One column per model, one row per '<metric>_mean'
+out_scores = pd.DataFrame(out_means)
--- a/my_datap10.py
+++ b/my_datap10.py
@ -165,8 +165,8 @@ nb = BernoulliNB()
 rf = RandomForestClassifier(**rs)

 clfs = [('Logistic Regression', log_reg)
-        ,('Naive Bayes', nb)
-        , ('Random Forest'      , rf) 
+        #,('Naive Bayes', nb)
+        #, ('Random Forest'      , rf) 
        ]

 #seed_skf = 42
--- a/my_datap11.py
+++ b/my_datap11.py
@ -163,7 +163,7 @@ for train_index, test_index in skf.split(input_df, y_targetF):
    pres = precision_score(y_test_fold, y_pred_fold)
    presL.append(pres)
    presM = mean(presL)
-    
+
    # Recall
    recall = recall_score(y_test_fold, y_pred_fold)
    recallL.append(recall)