diff --git a/imports_v1.py b/imports_v1.py
new file mode 100644
index 0000000..aea85bc
--- /dev/null
+++ b/imports_v1.py
@@ -0,0 +1,229 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Sun Mar 6 13:41:54 2022
+
+@author: tanu
+"""
+import os, sys
+import pandas as pd
+import numpy as np
+#from copy import deepcopy
+from sklearn import linear_model
+from sklearn.linear_model import LogisticRegression, LinearRegression
+from sklearn.naive_bayes import BernoulliNB
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.svm import SVC
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
+from sklearn.neural_network import MLPClassifier
+from xgboost import XGBClassifier
+from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
+
+from sklearn.compose import ColumnTransformer
+from sklearn.compose import make_column_transformer
+
+from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
+from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef
+from sklearn.metrics import make_scorer
+from sklearn.metrics import classification_report
+
+from sklearn.metrics import average_precision_score
+
+from sklearn.model_selection import cross_validate
+from sklearn.model_selection import train_test_split
+from sklearn.model_selection import StratifiedKFold
+
+from sklearn.pipeline import Pipeline
+from sklearn.pipeline import make_pipeline
+
+from sklearn.feature_selection import RFE
+from sklearn.feature_selection import RFECV
+
+import itertools
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+print(np.__version__)
+print(pd.__version__)
+from statistics import mean, stdev, median, mode
+
+#%%
+homedir = os.path.expanduser("~")
+os.chdir(homedir + "/git/ML_AI_training/")
+
+# my functions
+from MultClassPipe import MultClassPipeline
+from MultClassPipe2 import MultClassPipeline2
+from loopity_loop import MultClassPipeSKF
+
+gene = 'pncA'
+drug = 'pyrazinamide'
+
+#==============
+# directories
+#==============
+datadir = homedir + '/git/Data/'
+indir = datadir + drug + '/input/'
+outdir = datadir + drug + '/output/'
+
+#=======
+# input
+#=======
+infile_ml1 = outdir + gene.lower() + '_merged_df3.csv'
+#infile_ml2 = outdir + gene.lower() + '_merged_df2.csv'
+
+my_df = pd.read_csv(infile_ml1)
+my_df.dtypes
+my_df_cols = my_df.columns
+
+geneL_basic = ['pnca']
+geneL_na = ['gid']
+geneL_na_ppi2 = ['rpob']
+geneL_ppi2 = ['alr', 'embb', 'katg']
+#%% get cols
+mycols = my_df.columns
+
+my_df['active_aa_pos'].dtype
+my_df['active_aa_pos'] = my_df['active_aa_pos'].astype(object)
+
+if gene.lower() in geneL_na_ppi2:
+    # D1148: drop the record with missing mcsm_na_affinity
+    na_index = my_df['mutationinformation'].index[my_df['mcsm_na_affinity'].apply(np.isnan)]
+    my_df = my_df.drop(index=na_index)
+
+#%%============================================================================
+# GET Y
+
+# Target1: mutation_info_labels
+dm_om_map = {'DM': 1, 'OM': 0}
+target1 = my_df['mutation_info_labels'].map(dm_om_map)
+target1.value_counts()
+
+# Target2: drug
+drug_labels = drug + '_labels'
+drug_labels
+my_df[drug_labels] = my_df[drug].map({1: 'resistant', 0: 'sensitive'})
+my_df[drug_labels].value_counts()
+my_df[drug_labels] = my_df[drug_labels].fillna('unknown')
+my_df[drug_labels].value_counts()
+target2 = my_df[drug_labels]
+
+# Target3: drtype [Binary]
+drtype_labels = 'drtype_labels'
+my_df[drtype_labels] = my_df['drtype'].map({'Sensitive' : 0
+                                            , 'Other'   : 0
+                                            , 'Pre-MDR' : 1
+                                            , 'MDR'     : 1
+                                            , 'Pre-XDR' : 1
+                                            , 'XDR'     : 1})
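+# Illustrative check ONLY (not part of the original script): .map() silently
+# yields NaN for any drtype category missing from the dict above, so it is
+# worth confirming nothing was left unmapped.
+my_df['drtype'][my_df[drtype_labels].isna()].unique()   # expect an empty array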
+# target3 = 'drtype' [Multinomial]
+target3 = my_df[drtype_labels]
+
+# target4
+drtype_labels2 = 'drtype_labels2'
+my_df[drtype_labels2] = my_df['drtype'].map({'Sensitive' : 0
+                                             , 'Other'   : 0
+                                             , 'Pre-MDR' : 1
+                                             , 'MDR'     : 1
+                                             , 'Pre-XDR' : 2
+                                             , 'XDR'     : 2})
+target4 = my_df[drtype_labels2]
+
+# sanity checks
+target1.value_counts()
+my_df['mutation_info_labels'].value_counts()
+
+target2.value_counts()
+my_df[drug_labels].value_counts()
+
+target3.value_counts()
+my_df['drtype'].value_counts()
+target4.value_counts()
+my_df['drtype'].value_counts()
+
+#%%
+# GET X
+common_cols_stabiltyN = ['ligand_distance'
+                         , 'ligand_affinity_change'
+                         , 'duet_stability_change'
+                         , 'ddg_foldx'
+                         , 'deepddg'
+                         , 'ddg_dynamut2']
+
+# Build stability columns ~ gene
+if gene.lower() in geneL_basic:
+    x_stabilityN = common_cols_stabiltyN
+
+if gene.lower() in geneL_ppi2:
+    x_stabilityN = common_cols_stabiltyN + ['mcsm_ppi2_affinity'
+                                            , 'interface_dist']
+if gene.lower() in geneL_na:
+    x_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity']
+
+if gene.lower() in geneL_na_ppi2:
+    x_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist']
+    # D1148 already dropped above
+    #na_index = my_df['mutationinformation'].index[my_df['mcsm_na_affinity'].apply(np.isnan)]
+    #my_df = my_df.drop(index=na_index)
+
+X_strFN = ['asa'
+           , 'rsa'
+           , 'kd_values'
+           , 'rd_values']
+
+X_evolFN = ['consurf_score'
+            , 'snap2_score'
+            , 'snap2_accuracy_pc']
+
+# TODO: ADD ED values
+# Problematic due to NAs: filling NA with 'unknown' or another string makes the column categorical
+# OPTIONS
+# 1. Impute: KNN or MICE, or draw from the distribution
+# 2. Fill NA with the median or mode
+# 3. Build a separate dataset without any genomic features for ML, then use it as a 'blind test set'
+#    (this means the size of the training data gets reduced!)
+# 4. Remove genomic features from ML COMPLETELY!
+# (a minimal imputation sketch illustrating options 1-2 follows the numerical feature list below)
+
+# X_genomicFN = ['af'
+#                , 'or_mychisq'
+#                , 'or_logistic'
+#                , 'or_fisher'
+#                , 'pval_fisher']
+
+#%% try combinations
+X_vars1 = my_df[x_stabilityN]
+X_vars2 = my_df[X_strFN]
+X_vars3 = my_df[X_evolFN]
+
+X_vars5 = my_df[x_stabilityN + X_strFN]
+X_vars6 = my_df[x_stabilityN + X_evolFN]
+#X_vars7 = my_df[x_stabilityN + X_genomicFN]
+X_vars8 = my_df[X_strFN + X_evolFN]
+#X_vars9 = my_df[X_strFN + X_genomicFN]
+#X_vars10 = my_df[X_evolFN + X_genomicFN]
+X_vars11 = my_df[x_stabilityN + X_strFN + X_evolFN]
+#X_vars12 = my_df[x_stabilityN + X_strFN + X_evolFN + X_genomicFN]
+
+numerical_features_names = x_stabilityN + X_strFN + X_evolFN
+
+# separate ones for foldx?
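+
+#%% Illustrative sketch ONLY (not part of the original pipeline): options 1-2
+# from the NA-handling notes above, i.e. imputing missing numerical values
+# instead of dropping rows or excluding features. Demonstrated on the
+# numerical feature columns; the same idea would apply to the genomic
+# features. All demo_* names are hypothetical.
+from sklearn.impute import SimpleImputer   # KNNImputer would cover option 1
+demo_imputer = SimpleImputer(strategy = 'median')
+demo_num_imputed = pd.DataFrame(demo_imputer.fit_transform(my_df[numerical_features_names])
+                                , columns = numerical_features_names
+                                , index = my_df.index)
+demo_num_imputed.isna().sum()   # expect all zeros after imputation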
+categorical_features_names = ['ss_class'
+                              , 'wt_prop_water'
+                              # , 'lineage_labels' # misleading if using merged_df3
+                              , 'mut_prop_water'
+                              , 'wt_prop_polarity'
+                              , 'mut_prop_polarity'
+                              , 'wt_calcprop'
+                              , 'mut_calcprop'
+                              , 'active_aa_pos']
+
+numerical_features_df = my_df[numerical_features_names]
+numerical_features_df.shape
+
+categorical_features_df = my_df[categorical_features_names]
+categorical_features_df.shape
+
+all_features_df = my_df[numerical_features_names + categorical_features_names]
+all_features_df.shape
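+
+#%% Illustrative sketch ONLY (not part of the original pipeline): one way the
+# feature name lists above could feed the ColumnTransformer imported at the
+# top, scaling numerical columns and one-hot encoding categorical ones ahead
+# of a classifier. All demo_* names are hypothetical.
+demo_preprocessor = ColumnTransformer(
+    transformers = [('num', StandardScaler(), numerical_features_names)
+                    , ('cat', OneHotEncoder(handle_unknown = 'ignore'), categorical_features_names)])
+demo_pipe = Pipeline([('prep', demo_preprocessor)
+                      , ('clf', LogisticRegression(max_iter = 1000))])
+# demo_pipe.fit(all_features_df, target1)   # assumes complete cases and a binary target1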
diff --git a/loopity_loop_CALL.py b/loopity_loop_CALL.py
index 72d33b0..5f8833a 100644
--- a/loopity_loop_CALL.py
+++ b/loopity_loop_CALL.py
@@ -19,11 +19,6 @@ model_name = 'Logistic Regression'
 model_name = 'Naive Bayes'
 model_name = 'K-Nearest Neighbors'
 model_name = 'SVM'
-
-
-
-
-
 #%%
 
 model_metric = 'F1_score'
@@ -48,10 +43,9 @@ print(log_reg_mccM)
 
 log_reg_mccdf = pd.DataFrame({model_name: [log_reg_mccM]}, index = [model_metric])
 log_reg_mccdf
-
 #%%
-
-
+################################################################
+# extract items from within a nested dict
 #%% Classification Metrics we need to mean()
 classification_metrics = {
     'F1_score': []
diff --git a/loopity_loop_CALL_single b/loopity_loop_CALL_single
new file mode 100644
index 0000000..4916d2b
--- /dev/null
+++ b/loopity_loop_CALL_single
@@ -0,0 +1,90 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Mar 11 11:15:50 2022
+
+@author: tanu
+"""
+#%%
+del(t3_res)   # clear any previous run
+t3_res = MultClassPipeSKF(input_df = numerical_features_df
+                          , y_targetF = target1
+                          , var_type = 'numerical'
+                          , skf_splits = 10)
+pp.pprint(t3_res)
+#print(t3_res)
+
+#%% Manually: mean for each model, each metric
+# (set model_name to the model of interest; only the last assignment takes
+#  effect. A collapsed, loop-based version of the per-metric cells below is
+#  sketched at the end of this file.)
+model_name = 'Logistic Regression'
+model_name = 'Naive Bayes'
+model_name = 'K-Nearest Neighbors'
+model_name = 'SVM'
+
+#%%
+model_metric = 'F1_score'
+
+log_reg_f1 = []
+for key in t3_res[model_name]:
+    log_reg_f1.append(t3_res[model_name][key][model_metric])
+    log_reg_f1M = mean(log_reg_f1)
+    print('key:', key, model_metric, ':', log_reg_f1)
+print(log_reg_f1M)
+
+log_reg_f1df = pd.DataFrame({model_name: [log_reg_f1M]}, index = [model_metric])
+log_reg_f1df
+
+#%%
+model_metric = 'MCC'
+log_reg_mcc = []
+for key in t3_res[model_name]:
+    log_reg_mcc.append(t3_res[model_name][key][model_metric])
+    log_reg_mccM = mean(log_reg_mcc)
+    print('key:', key, model_metric, ':', log_reg_mcc)
+print(log_reg_mccM)
+
+log_reg_mccdf = pd.DataFrame({model_name: [log_reg_mccM]}, index = [model_metric])
+log_reg_mccdf
+#%%
+model_metric = 'Precision'
+log_reg_pres = []
+for key in t3_res[model_name]:
+    log_reg_pres.append(t3_res[model_name][key][model_metric])
+    log_reg_presM = mean(log_reg_pres)
+    print('key:', key, model_metric, ':', log_reg_pres)
+print(log_reg_presM)
+
+log_reg_presdf = pd.DataFrame({model_name: [log_reg_presM]}, index = [model_metric])
+log_reg_presdf
+#%%
+model_metric = 'Recall'
+log_reg_recall = []
+for key in t3_res[model_name]:
+    log_reg_recall.append(t3_res[model_name][key][model_metric])
+    log_reg_recallM = mean(log_reg_recall)
+    print('key:', key, model_metric, ':', log_reg_recall)
+print(log_reg_recallM)
+
+log_reg_recalldf = pd.DataFrame({model_name: [log_reg_recallM]}, index = [model_metric])
+log_reg_recalldf
+#%%
+model_metric = 'Accuracy'
+log_reg_accu = []
+for key in t3_res[model_name]:
+    log_reg_accu.append(t3_res[model_name][key][model_metric])
+    log_reg_accuM = mean(log_reg_accu)
+    print('key:', key, model_metric, ':', log_reg_accu)
+print(log_reg_accuM)
+
+log_reg_accudf = pd.DataFrame({model_name: [log_reg_accuM]}, index = [model_metric])
+log_reg_accudf
+#%%
+model_metric = 'ROC_AUC'
+log_reg_roc_auc = []
+for key in t3_res[model_name]:
+    log_reg_roc_auc.append(t3_res[model_name][key][model_metric])
+    log_reg_roc_aucM = mean(log_reg_roc_auc)
+    print('key:', key, model_metric, ':', log_reg_roc_auc)
+print(log_reg_roc_aucM)
+
+log_reg_roc_aucdf = pd.DataFrame({model_name: [log_reg_roc_aucM]}, index = [model_metric])
+log_reg_roc_aucdf
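+#%% Illustrative sketch ONLY (not part of the original script): the per-metric
+# cells above could be collapsed into a single loop over the nested t3_res
+# dict, giving one models x metrics table of fold means. All demo_* names are
+# hypothetical.
+demo_metrics = ['F1_score', 'MCC', 'Precision', 'Recall', 'Accuracy', 'ROC_AUC']
+demo_summary = {}
+for demo_model in t3_res:
+    demo_summary[demo_model] = {m: mean(fold[m] for fold in t3_res[demo_model].values())
+                                for m in demo_metrics}
+demo_summary_df = pd.DataFrame(demo_summary)   # rows = metrics, columns = models
+demo_summary_df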