loopity_loop_CALL

This commit is contained in:
Tanushree Tunstall 2022-03-14 18:36:23 +00:00
parent 7aead2d4f4
commit 160053d361
5 changed files with 163 additions and 188 deletions

View file

@ -27,6 +27,8 @@ from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoe
from sklearn.metrics import make_scorer
from sklearn.metrics import classification_report
from sklearn.metrics import average_precision_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
@ -43,6 +45,16 @@ import numpy as np
print(np.__version__)
print(pd.__version__)
from statistics import mean, stdev, median, mode
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
#from sklearn.datasets import make_classification
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import AdaBoostClassifier
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import EditedNearestNeighbours
#%%
homedir = os.path.expanduser("~")
os.chdir(homedir + "/git/ML_AI_training/")
@ -52,8 +64,8 @@ from MultClassPipe import MultClassPipeline
from MultClassPipe2 import MultClassPipeline2
from loopity_loop import MultClassPipeSKF
gene = 'rpoB'
drug = 'rifampicin'
gene = 'pncA'
drug = 'pyrazinamide'
#==============
# directories
@ -79,64 +91,33 @@ geneL_ppi2 = ['alr', 'embb', 'katg']
#%% get cols
mycols = my_df.columns
my_df['active_aa_pos'].dtype
my_df['active_aa_pos'] = my_df['active_aa_pos'].astype(object)
# change from numeric to object (categorical) dtype
num_type = ['int64', 'float64']
cat_type = ['object', 'bool']
if my_df['active_aa_pos'].dtype in num_type:
my_df['active_aa_pos'] = my_df['active_aa_pos'].astype(object)
my_df['active_aa_pos'].dtype
# FIXME: if this is not structural, remove from source..
# Drop NA where numerical cols have them
if gene.lower() in geneL_na_ppi2:
x_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist']
#D1148 get rid of
na_index = my_df['mutationinformation'].index[my_df['mcsm_na_affinity'].apply(np.isnan)]
my_df = my_df.drop(index=na_index)
# FIXME: either impute or remove!
# for embb (L114M, F115L, V123L, V125I, V131M) delete for now
if gene.lower() in ['embb']:
na_index = my_df['mutationinformation'].index[my_df['ligand_distance'].apply(np.isnan)]
my_df = my_df.drop(index=na_index)
#%%============================================================================
# GET Y
# Target1: mutation_info_labels
dm_om_map = {'DM': 1, 'OM': 0}
target1 = my_df['mutation_info_labels'].map(dm_om_map)
target1.value_counts()
# Target2: drug
drug_labels = drug + '_labels'
drug_labels
my_df[drug_labels] = my_df[drug].map({1: 'resistant', 0: 'sensitive'})
my_df[drug_labels].value_counts()
my_df[drug_labels] = my_df[drug_labels].fillna('unknown')
my_df[drug_labels].value_counts()
target2 = my_df[drug_labels]
# Target3: drtype [Binary]
drtype_labels = 'drtype_labels'
my_df[drtype_labels] = my_df['drtype'].map({'Sensitive' : 0
, 'Other' : 0
, 'Pre-MDR' : 1
, 'MDR' : 1
, 'Pre-XDR' : 1
, 'XDR' : 1})
# target3 = 'drtype' [Multinomial]
target3 = my_df[drtype_labels]
# target4
drtype_labels2 = 'drtype_labels2'
my_df[drtype_labels2] = my_df['drtype'].map({'Sensitive' : 0
, 'Other' : 0
, 'Pre-MDR' : 1
, 'MDR' : 1
, 'Pre-XDR' : 2
, 'XDR' : 2})
target4 = my_df[drtype_labels2]
# sanity checks
target1.value_counts()
my_df['mutation_info_labels'].value_counts()
target2.value_counts()
my_df[drug_labels].value_counts()
target3.value_counts()
my_df['drtype'].value_counts()
target4.value_counts()
my_df['drtype'].value_counts()
# Target1: mutation_info_labels, convert to binary class labels (DM: 1, OM: 0)
dm_om_map = {'DM': 1, 'OM': 0} # pnca, OM is minority, other genes: DM is minority
my_df['mutation_class'] = my_df['mutation_info_labels'].map(dm_om_map)
my_df['mutation_class'].value_counts()
my_df['mutation_info_labels']. value_counts()
#%%
# GET X
@ -159,9 +140,6 @@ if gene.lower() in geneL_na:
if gene.lower() in geneL_na_ppi2:
x_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist']
#D1148 get rid of
na_index = my_df['mutationinformation'].index[my_df['mcsm_na_affinity'].apply(np.isnan)]
my_df = my_df.drop(index=na_index)
X_strFN = ['asa'
, 'rsa'
@ -172,39 +150,17 @@ X_evolFN = ['consurf_score'
, 'snap2_score'
, 'snap2_accuracy_pc']
# TODO: ADD ED values
# Problematic due to NA: filling NA with unknown or string will make it categorical
# OPTIONS
# 1. Imputing: KNN or MICE or from distribution
# 2. Fill na with median or mode
# 3. Separate dataset without including genomic features AT ALL for ML, then using this as a 'blind test set'
# this means the size of the training data gets reduced!
# 4. Remove genomic features from ML COMPLETELY!
# X_genomicFN = ['af'
# , 'or_mychisq'
# , 'or_logistic'
# , 'or_fisher'
# , 'pval_fisher']
#%% try combinations
X_vars1 = my_df[x_stabilityN]
X_vars2 = my_df[X_strFN]
X_vars3 = my_df[X_evolFN]
X_vars5 = my_df[x_stabilityN + X_strFN]
X_vars6 = my_df[x_stabilityN + X_evolFN]
#X_vars7 = my_df[x_stabilityN + X_genomicFN]
X_vars8 = my_df[X_strFN + X_evolFN]
#X_vars9 = my_df[X_strFN + X_genomicFN]
#X_vars10 = my_df[X_evolFN + X_genomicFN]
X_vars11 = my_df[x_stabilityN + X_strFN + X_evolFN]
#X_vars12 = my_df[x_stabilityN + X_strFN + X_evolFN + X_genomicFN]
numerical_features_names = x_stabilityN + X_strFN + X_evolFN
#%% Construct numerical and categorical column names
numerical_FN = x_stabilityN + X_strFN + X_evolFN
# separate ones for foldx?
categorical_features_names = ['ss_class'
categorical_FN = ['ss_class'
, 'wt_prop_water'
# , 'lineage_labels' # misleading if using merged_df3
, 'mut_prop_water'
@ -214,11 +170,28 @@ categorical_features_names = ['ss_class'
, 'mut_calcprop'
, 'active_aa_pos']
numerical_features_df = my_df[numerical_features_names]
numerical_features_df.shape
#%% extracting dfs based on numerical, categorical column names
#----------------------------------
# WITHOUT the target var included
#----------------------------------
num_df = my_df[numerical_FN]
num_df.shape
categorical_features_df = my_df[categorical_features_names]
categorical_features_df.shape
cat_df = my_df[categorical_FN]
cat_df.shape
all_features_df = my_df[numerical_features_names + categorical_features_names]
all_features_df.shape
all_df = my_df[numerical_FN + categorical_FN]
all_df.shape
#------------------------------
# WITH the target var included:
#'wtgt': with target
#------------------------------
num_df_wtgt = my_df[numerical_FN + ['mutation_class']]
num_df_wtgt.shape
cat_df_wtgt = my_df[categorical_FN + ['mutation_class']]
cat_df_wtgt.shape
all_df_wtgt = my_df[numerical_FN + categorical_FN + ['mutation_class']]
all_df_wtgt.shape

View file

@ -1,90 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 11 11:15:50 2022
@author: tanu
"""
#%%
del(t3_res)
t3_res = MultClassPipeSKF(input_df = numerical_features_df
, y_targetF = target1
, var_type = 'numerical'
, skf_splits = 10)
pp.pprint(t3_res)
#print(t3_res)
#%% Manually: mean for each model, each metric
model_name = 'Logistic Regression'
model_name = 'Naive Bayes'
model_name = 'K-Nearest Neighbors'
model_name = 'SVM'
#%%
model_metric = 'F1_score'
log_reg_f1 = []
for key in t3_res[model_name]:
log_reg_f1.append(t3_res[model_name][key][model_metric])
log_reg_f1M = mean(log_reg_f1)
print('key:', key, model_metric, ':', log_reg_f1)
print(log_reg_f1M)
log_reg_f1df = pd.DataFrame({model_name: [log_reg_f1M]}, index = [model_metric])
log_reg_f1df
#%%
model_metric = 'MCC'
log_reg_mcc = []
for key in t3_res[model_name]:
log_reg_mcc.append(t3_res[model_name][key][model_metric])
log_reg_mccM = mean(log_reg_mcc)
print('key:', key, model_metric, ':', log_reg_mcc)
print(log_reg_mccM)
log_reg_mccdf = pd.DataFrame({model_name: [log_reg_mccM]}, index = [model_metric])
log_reg_mccdf
#%%
model_metric = 'Precision'
log_reg_pres = []
for key in t3_res[model_name]:
log_reg_pres.append(t3_res[model_name][key][model_metric])
log_reg_presM = mean(log_reg_pres)
print('key:', key, model_metric, ':', log_reg_pres)
print(log_reg_presM)
log_reg_presdf = pd.DataFrame({model_name: [log_reg_presM]}, index = [model_metric])
log_reg_presdf
#%%
model_metric = 'Recall'
log_reg_recall = []
for key in t3_res[model_name]:
log_reg_recall.append(t3_res[model_name][key][model_metric])
log_reg_recallM = mean(log_reg_recall)
print('key:', key, model_metric, ':', log_reg_recall)
print(log_reg_recallM)
log_reg_recalldf = pd.DataFrame({model_name: [log_reg_recallM]}, index = [model_metric])
log_reg_recalldf
#%%
model_metric = 'Accuracy'
log_reg_accu = []
for key in t3_res[model_name]:
log_reg_accu.append(t3_res[model_name][key][model_metric])
log_reg_accuM = mean(log_reg_accu)
print('key:', key, model_metric, ':', log_reg_accu)
print(log_reg_accuM)
log_reg_accudf = pd.DataFrame({model_name: [log_reg_accuM]}, index = [model_metric])
log_reg_accudf
#%%
model_metric = 'ROC_AUC'
log_reg_roc_auc = []
for key in t3_res[model_name]:
log_reg_roc_auc.append(t3_res[model_name][key][model_metric])
log_reg_roc_aucM = mean(log_reg_roc_auc)
print('key:', key, model_metric, ':', log_reg_roc_auc)
print(log_reg_roc_aucM)
log_reg_roc_aucdf = pd.DataFrame({model_name: [log_reg_roc_aucM]}, index = [model_metric])
log_reg_roc_aucdf

92
loopity_loop_CALL.py Normal file
View file

@ -0,0 +1,92 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 11 11:15:50 2022
@author: tanu
"""
#%% Run the stratified k-fold pipeline on the numerical features
# NOTE: this script uses names it does not define itself (MultClassPipeSKF,
# pp, my_df, num_df, mean, pd); it is meant to be run in a session where the
# data-preparation script has already been executed.

# Drop any result left over from a previous run. A bare `del(t3_res)` raises
# NameError the first time this cell is run in a fresh session, so guard it.
try:
    del t3_res
except NameError:
    pass

# NOTE(review): the data-preparation script changed in this commit exposes the
# numerical features as `num_df` and the DM/OM target as
# my_df['mutation_class'] (same mapping as the former `target1`); the older
# names `numerical_features_df` and `target1` appear to have been removed.
# Confirm against the data-preparation script.
t3_res = MultClassPipeSKF(input_df = num_df
                          , y_targetF = my_df['mutation_class']
                          , var_type = 'numerical'
                          , skf_splits = 10)

pp.pprint(t3_res)
#print(t3_res)
#%% Manually: mean for each model, each metric
# Choose the model whose per-fold scores are summarised in the cells below.
# When this cell is run top to bottom only the LAST assignment is in effect;
# comment lines in/out to pick a different model.
# The result variables below keep the 'log_reg_' prefix whichever model is
# selected.
model_name = 'Logistic Regression'
model_name = 'Naive Bayes'
model_name = 'K-Nearest Neighbors'
model_name = 'SVM'

#%% F1 score: collect the value from every fold, then average
model_metric = 'F1_score'
log_reg_f1 = []
for key in t3_res[model_name]:
    log_reg_f1.append(t3_res[model_name][key][model_metric])

log_reg_f1M = mean(log_reg_f1)
# 'key' here is whatever the loop left behind, i.e. the last fold visited
print('key:', key, model_metric, ':', log_reg_f1)
print(log_reg_f1M)

# one-cell frame: row = metric, column = model
log_reg_f1df = pd.DataFrame({model_name: [log_reg_f1M]}, index = [model_metric])
log_reg_f1df

#%% MCC: same procedure as for the F1 score
model_metric = 'MCC'
log_reg_mcc = []
for key in t3_res[model_name]:
    log_reg_mcc.append(t3_res[model_name][key][model_metric])

log_reg_mccM = mean(log_reg_mcc)
# 'key' here is whatever the loop left behind, i.e. the last fold visited
print('key:', key, model_metric, ':', log_reg_mcc)
print(log_reg_mccM)

# one-cell frame: row = metric, column = model
log_reg_mccdf = pd.DataFrame({model_name: [log_reg_mccM]}, index = [model_metric])
log_reg_mccdf
#%%
#%% Classification Metrics we need to mean()
# Only the KEYS of this dict are used below (as the ordered list of metric
# names). The empty lists are never filled: every model gets its own fresh
# lists, because re-using these list objects would make all models share
# (and append to) the same lists.
classification_metrics = {
    'F1_score': []
    , 'MCC': []
    , 'Precision': []
    , 'Recall': []
    , 'Accuracy': []
    }

# out: per model, every metric's values across all folds
#      (plus '<metric>_mean' entries added in the second loop)
out = {}

# out_means: per model, just the mean() of each metric
out_means = {}

# Build up out{} from t3_res, which came from loopity_loop
for model in t3_res:
    # fresh, independent list per metric for this model
    out[model] = {metric: [] for metric in classification_metrics}
    out_means[model] = {} # just to make life easier
    print(model)
    for fold in t3_res[model]:
        for metric in classification_metrics:
            metric_value = t3_res[model][fold][metric]
            out[model][metric].append(metric_value)

# now that we've built out{}, let's mean() each metric
for model in out:
    # iterate the fixed metric names, not out[model], since out[model]
    # gains '<metric>_mean' keys inside this loop
    for metric in classification_metrics:
        metric_mean = mean(out[model][metric])
        # just some debug output
        # print('model:', model
        #       , 'metric: ', metric
        #       , metric_mean
        #       )
        out[model][metric + '_mean'] = metric_mean
        out_means[model][metric + '_mean'] = metric_mean

# columns = models, rows = '<metric>_mean'
out_scores = pd.DataFrame(out_means)

View file

@ -165,8 +165,8 @@ nb = BernoulliNB()
rf = RandomForestClassifier(**rs)
clfs = [('Logistic Regression', log_reg)
,('Naive Bayes', nb)
, ('Random Forest' , rf)
#,('Naive Bayes', nb)
#, ('Random Forest' , rf)
]
#seed_skf = 42