From 3ed7840f60bd06a30c213ad5b9c27090372de617 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall
Date: Mon, 16 May 2022 08:06:56 +0100
Subject: [PATCH] added UQ import and ml call scripts

---
 UQ_imports_pnca.py | 257 +++++++++++++++++++++++++++++++++++++++++++++
 UQ_pnca_ml_CALL.py |  56 ++++++++++
 2 files changed, 313 insertions(+)
 create mode 100644 UQ_imports_pnca.py
 create mode 100644 UQ_pnca_ml_CALL.py

diff --git a/UQ_imports_pnca.py b/UQ_imports_pnca.py
new file mode 100644
index 0000000..908dc15
--- /dev/null
+++ b/UQ_imports_pnca.py
@@ -0,0 +1,257 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Sun Mar 6 13:41:54 2022
+
+@author: tanu
+"""
+import os, sys
+import pandas as pd
+import numpy as np
+import pprint as pp
+from copy import deepcopy
+from sklearn import linear_model
+from sklearn.linear_model import LogisticRegression, LinearRegression
+from sklearn.linear_model import SGDClassifier
+from sklearn.naive_bayes import BernoulliNB
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.svm import SVC
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
+from sklearn.ensemble import AdaBoostClassifier
+from sklearn.ensemble import GradientBoostingClassifier
+from sklearn.neural_network import MLPClassifier
+from xgboost import XGBClassifier
+from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
+
+from sklearn.compose import ColumnTransformer
+from sklearn.compose import make_column_transformer
+
+from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
+from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef
+from sklearn.metrics import jaccard_score
+from sklearn.metrics import make_scorer
+from sklearn.metrics import classification_report
+from sklearn.metrics import average_precision_score
+
+from sklearn.model_selection import cross_validate, cross_val_score
+from sklearn.model_selection import train_test_split
+from sklearn.model_selection import StratifiedKFold
+from sklearn.model_selection import RepeatedStratifiedKFold
+from sklearn.model_selection import GridSearchCV
+
+from sklearn.pipeline import Pipeline
+from sklearn.pipeline import make_pipeline
+
+from sklearn.feature_selection import RFE
+from sklearn.feature_selection import RFECV
+
+from sklearn.base import BaseEstimator
+
+import itertools
+import seaborn as sns
+import matplotlib.pyplot as plt
+print(np.__version__)
+print(pd.__version__)
+from statistics import mean, stdev, median, mode
+
+from imblearn.over_sampling import RandomOverSampler
+from imblearn.over_sampling import SMOTE
+from imblearn.pipeline import Pipeline
+#from sklearn.datasets import make_classification
+from imblearn.combine import SMOTEENN
+from imblearn.under_sampling import EditedNearestNeighbours
+
+scoring_fn = {'accuracy'  : make_scorer(accuracy_score)
+              , 'fscore'    : make_scorer(f1_score)
+              , 'mcc'       : make_scorer(matthews_corrcoef)
+              , 'precision' : make_scorer(precision_score)
+              , 'recall'    : make_scorer(recall_score)
+              , 'roc_auc'   : make_scorer(roc_auc_score)
+              , 'jcc'       : make_scorer(jaccard_score)
+              }
+
+rs = {'random_state': 42}
+njobs = {'n_jobs': 10}
+
+skf_cv = StratifiedKFold(n_splits = 10
+                         #, shuffle = False, random_state = None
+                         , shuffle = True, **rs)
+
+rskf_cv = RepeatedStratifiedKFold(n_splits = 10
+                                  , n_repeats = 3
+                                  #, shuffle = False, random_state = None
+                                  #, shuffle = True
+                                  , **rs)
+
+mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
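+
+# -----------------------------------------------------------------------------
+# Illustrative sketch only (NOT part of the original pipeline): how scoring_fn
+# and skf_cv are meant to be consumed via cross_validate(). It runs on a small
+# synthetic dataset purely as a stand-in; the real X and y are built further
+# down from the pncA training data and are scored inside MultClassPipeSKFCV.
+# -----------------------------------------------------------------------------
+from sklearn.datasets import make_classification
+
+X_demo, y_demo = make_classification(n_samples = 200
+                                     , n_features = 10
+                                     , weights = [0.75]   # mildly imbalanced two-class toy data
+                                     , **rs)
+demo_scores = cross_validate(LogisticRegression(max_iter = 1000, **rs)
+                             , X_demo, y_demo
+                             , cv = skf_cv
+                             , scoring = scoring_fn
+                             , return_train_score = True)
+# mean of each train/test metric across the 10 folds
+pp.pprint({k: round(np.mean(v), 3) for k, v in demo_scores.items()})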
+
+#%%
+homedir = os.path.expanduser("~")
+os.chdir(homedir + "/git/ML_AI_training/")
+
+# my functions
+#from MultClassPipe import MultClassPipeline
+from MultClassPipe2 import MultClassPipeline2
+from loopity_loop import MultClassPipeSKFLoop
+from MultClassPipe3 import MultClassPipeSKFCV
+
+gene = 'pncA'
+drug = 'pyrazinamide'
+
+#==============
+# directories
+#==============
+datadir = homedir + '/git/Data/'
+indir = datadir + drug + '/input/'
+outdir = datadir + drug + '/output/'
+
+#=======
+# input
+#=======
+infile_ml1 = outdir + gene.lower() + '_merged_df3.csv'
+#infile_ml2 = outdir + gene.lower() + '_merged_df2.csv'
+
+my_df = pd.read_csv(infile_ml1, index_col = 0)
+my_df.dtypes
+my_df_cols = my_df.columns
+
+geneL_basic = ['pnca']
+
+# -- CHECK script -- imports.py
+#%% get cols
+mycols = my_df.columns
+mycols
+
+# dtype groups used to split columns into numerical vs categorical
+num_type = ['int64', 'float64']
+cat_type = ['object', 'bool']
+
+# TODO:
+# Treat active site aa pos as a category and not as numerical: this needs to be part of merged_df3!
+#if my_df['active_aa_pos'].dtype in num_type:
+#    my_df['active_aa_pos'] = my_df['active_aa_pos'].astype(object)
+#    my_df['active_aa_pos'].dtype
+
+# -- CHECK script -- imports.py
+#%%============================================================================
+#%% IMPUTE values for OR
+
+#%% Combine mmCSM_lig Data
+
+#%% Combine PROVEAN data
+
+#%% Combine ED logo data
+
+#%% Masking columns (mCSM-lig, mCSM-NA, mCSM-ppi2) values for lig_dist > 10
+
+# get logic from upstream!
+my_df_ml = my_df.copy()
+
+# inspect how many mutations have ligand_distance > 10
+my_df_ml['mutationinformation'][my_df['ligand_distance'] > 10].value_counts()
+my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x > 10)).value_counts()
+my_df_ml.groupby(['mutationinformation'])['ligand_distance'].apply(lambda x: (x > 10)).value_counts()
+
+# mask the affinity change where the ligand is too far away to be meaningful
+my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), 'ligand_affinity_change'] = 0
+(my_df_ml['ligand_affinity_change'] == 0).sum()
+
+#%%============================================================================
+# Separate blind test set: samples with a missing drug phenotype
+my_df_ml[drug].isna().sum()
+
+blind_test_df = my_df_ml[my_df_ml[drug].isna()]
+
+training_df = my_df_ml[my_df_ml[drug].notna()]
+
+# Target1: dst
+training_df[drug].value_counts()
+training_df['dst_mode'].value_counts()
+
+#%% Build X
+common_cols_stabiltyN = ['ligand_distance'
+                         , 'ligand_affinity_change'
+                         , 'duet_stability_change'
+                         , 'ddg_foldx'
+                         , 'deepddg'
+                         , 'ddg_dynamut2']
+
+foldX_cols = ['contacts'
+#, 'electro_rr', 'electro_mm', 'electro_sm', 'electro_ss'
+#, 'disulfide_rr', 'disulfide_mm', 'disulfide_sm', 'disulfide_ss'
+#, 'hbonds_rr', 'hbonds_mm', 'hbonds_sm', 'hbonds_ss'
+#, 'partcov_rr', 'partcov_mm', 'partcov_sm', 'partcov_ss'
+#, 'vdwclashes_rr', 'vdwclashes_mm', 'vdwclashes_sm', 'vdwclashes_ss'
+#, 'volumetric_rr', 'volumetric_mm', 'volumetric_ss'
+]
+
+X_strFN = ['rsa'
+           #, 'asa'
+           , 'kd_values'
+           , 'rd_values']
+
+X_evolFN = ['consurf_score'
+            , 'snap2_score']
+
+# quick inspection of which lineage column to use:
+#foo = my_df_ml[['lineage', 'lineage_count_all', 'lineage_count_unique']]
+
+X_genomicFN = ['maf'
+#              , 'or_mychisq'
+#              , 'or_logistic'
+#              , 'or_fisher'
+#              , 'pval_fisher'
+               #, 'lineage'
+               , 'lineage_count_all'
+               , 'lineage_count_unique'
+]
+
+#%% Construct numerical and categorical column names
+
+# numerical feature names
+numerical_FN = common_cols_stabiltyN + foldX_cols + X_strFN + X_evolFN + X_genomicFN
+
+# categorical feature names
+categorical_FN = ['ss_class'
+                  , 'wt_prop_water'
+                  # , 'lineage_labels' # misleading if using merged_df3
+                  , 'mut_prop_water'
+                  , 'wt_prop_polarity'
+                  , 'mut_prop_polarity'
+                  , 'wt_calcprop'
+                  , 'mut_calcprop'
+                  #, 'active_aa_pos'
+                  ]
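+
+# -----------------------------------------------------------------------------
+# Illustrative sketch only (assumption -- the real preprocessing lives inside
+# the MultClassPipe* modules): how the numerical/categorical split defined
+# above would typically feed a ColumnTransformer, scaling numerical_FN and
+# one-hot encoding categorical_FN ahead of a classifier. Nothing is fitted here.
+# -----------------------------------------------------------------------------
+demo_col_transform = ColumnTransformer(transformers = [
+    ('num', MinMaxScaler(), numerical_FN)
+    , ('cat', OneHotEncoder(handle_unknown = 'ignore'), categorical_FN)  # ignore unseen categories at predict time
+    ])
+demo_prep_pipe = make_pipeline(demo_col_transform, LogisticRegression(**rs))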
+
+#%% Extract dfs based on the numerical and categorical column names
+#----------------------------------
+# WITHOUT the target var included
+#----------------------------------
+num_df = training_df[numerical_FN]
+num_df.shape
+
+cat_df = training_df[categorical_FN]
+cat_df.shape
+
+all_df = training_df[numerical_FN + categorical_FN]
+all_df.shape
+
+#------------------------------
+# WITH the target var included:
+# 'wtgt': with target
+#------------------------------
+# drug and dst_mode should be the same thing
+num_df_wtgt = training_df[numerical_FN + ['dst_mode']]
+num_df_wtgt.shape
+
+cat_df_wtgt = training_df[categorical_FN + ['dst_mode']]
+cat_df_wtgt.shape
+
+all_df_wtgt = training_df[numerical_FN + categorical_FN + ['dst_mode']]
+all_df_wtgt.shape
+#%%================================================================
+#%% Apply ML
+
diff --git a/UQ_pnca_ml_CALL.py b/UQ_pnca_ml_CALL.py
new file mode 100644
index 0000000..9032012
--- /dev/null
+++ b/UQ_pnca_ml_CALL.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon May 16 05:59:12 2022
+
+@author: tanu
+"""
+#%% Data
+# all features vs numerical features only; the second assignment is the one in
+# effect and matches var_type = 'numerical' in the call below
+X = all_df_wtgt[numerical_FN + categorical_FN]
+X = all_df_wtgt[numerical_FN]
+
+y = all_df_wtgt['dst_mode']
+#%% variables
+
+#%% MultClassPipeSKFCV: function call()
+mm_skf_scoresD = MultClassPipeSKFCV(input_df = X
+                                    , target = y
+                                    , var_type = 'numerical'
+                                    , skf_cv = skf_cv)
+
+mm_skf_scores_df_all = pd.DataFrame(mm_skf_scoresD)
+mm_skf_scores_df_all
+mm_skf_scores_df_test = mm_skf_scores_df_all.filter(like='test_', axis=0)
+mm_skf_scores_df_train = mm_skf_scores_df_all.filter(like='train_', axis=0) # compare with test scores to spot overfitting
+
+#%% CHECK with BLIND test
+#%% Correlation between features
+import plotly.express as px
+
+corr = X.corr(method = 'spearman')
+corr.head()
+
+#p = corr.style.background_gradient(cmap='coolwarm')
+p = corr.style.background_gradient(cmap='coolwarm').set_precision(2)
+p
+
+fig = px.imshow(corr)
+fig.show()
+
+#%% TODO:
+# Add correlation plot
+# Remove low variance features
+# Add feature selection
+# Then run your models on BLIND test WITHOUT CV
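+
+#%% Illustrative sketch only (assumption, not part of the original scripts):
+# one possible shape for the TODO items above -- drop near-constant features
+# with VarianceThreshold, refit a single model on the full training data and
+# generate predictions for the blind test set WITHOUT CV. The 0.01 cut-off and
+# the LogisticRegression model are placeholders, not choices made in this repo.
+from sklearn.feature_selection import VarianceThreshold
+
+var_sel = VarianceThreshold(threshold = 0.01)
+var_sel.fit(X)
+X_var = X.loc[:, var_sel.get_support()]   # keep only features above the variance cut-off
+
+blind_model = make_pipeline(MinMaxScaler(), LogisticRegression(**rs))
+blind_model.fit(X_var, y)
+
+# blind test features, built the same way as X (numerical features only here)
+X_bts = blind_test_df[X_var.columns]
+y_bts_pred = blind_model.predict(X_bts)
+pd.Series(y_bts_pred).value_counts()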