From 3bf63c522c1905186b4003244e242d355bc942f3 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall
Date: Sun, 6 Mar 2022 14:49:51 +0000
Subject: [PATCH] trying one_hot encoder for categ vars, which was successful
 but not rfecv

---
 imports.py  | 209 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 my_data8.py | 118 +++++++++++++++++++++++++++++
 2 files changed, 327 insertions(+)
 create mode 100644 imports.py
 create mode 100644 my_data8.py

diff --git a/imports.py b/imports.py
new file mode 100644
index 0000000..e1d03a2
--- /dev/null
+++ b/imports.py
@@ -0,0 +1,209 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Sun Mar 6 13:41:54 2022
+
+@author: tanu
+"""
+import os, sys
+import pandas as pd
+import numpy as np
+from sklearn import linear_model
+from sklearn.linear_model import LogisticRegression, LinearRegression
+from sklearn.naive_bayes import BernoulliNB
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.svm import SVC
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
+from sklearn.neural_network import MLPClassifier
+from xgboost import XGBClassifier
+from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
+
+from sklearn.compose import ColumnTransformer
+from sklearn.compose import make_column_transformer
+
+from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, f1_score
+from sklearn.metrics import make_scorer
+from sklearn.metrics import classification_report
+
+from sklearn.model_selection import cross_validate
+from sklearn.model_selection import train_test_split
+
+from sklearn.pipeline import Pipeline
+from sklearn.pipeline import make_pipeline
+
+from sklearn.feature_selection import RFE
+from sklearn.feature_selection import RFECV
+
+import itertools
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+print(np.__version__)
+print(pd.__version__)
+from statistics import mean, stdev
+#%%
+homedir = os.path.expanduser("~")
+os.chdir(homedir + "/git/ML_AI_training/")
+
+# my function
+from MultClassPipe import MultClassPipeline
+
+gene = 'pncA'
+drug = 'pyrazinamide'
+
+#==============
+# directories
+#==============
+datadir = homedir + '/git/Data/'
+indir = datadir + drug + '/input/'
+outdir = datadir + drug + '/output/'
+
+#=======
+# input
+#=======
+infile_ml1 = outdir + gene.lower() + '_merged_df3.csv'
+#infile_ml2 = outdir + gene.lower() + '_merged_df2.csv'
+
+my_df = pd.read_csv(infile_ml1)
+my_df.dtypes
+my_df_cols = my_df.columns
+
+geneL_basic = ['pnca']
+geneL_na = ['gid']
+geneL_na_ppi2 = ['rpob']
+geneL_ppi2 = ['alr', 'embb', 'katg']
+#%% get cols
+mycols = my_df.columns
+
+#%%============================================================================
+# GET Y
+
+# Target1: mutation_info_labels
+dm_om_map = {'DM': 1, 'OM': 0}
+target1 = my_df['mutation_info_labels'].map(dm_om_map)
+
+# Target2: drug
+drug_labels = drug + '_labels'
+drug_labels
+my_df[drug_labels] = my_df[drug].map({1: 'resistant', 0: 'sensitive'})
+my_df[drug_labels].value_counts()
+my_df[drug_labels] = my_df[drug_labels].fillna('unknown')
+my_df[drug_labels].value_counts()
+target2 = my_df[drug_labels]
+
+# Target3: drtype [Binary]
+drtype_labels = 'drtype_labels'
+my_df[drtype_labels] = my_df['drtype'].map({'Sensitive' : 0
+                                            , 'Other'   : 0
+                                            , 'Pre-MDR' : 1
+                                            , 'MDR'     : 1
+                                            , 'Pre-XDR' : 1
+                                            , 'XDR'     : 1})
+# target3 = 'drtype' [Multinomial]
+target3 = my_df[drtype_labels]
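+
+# sanity guard (sketch): Series.map() returns NaN for any 'drtype' value
+# missing from the dict above, so new categories would vanish silently.
+# Assuming the mapping is meant to be exhaustive, flag leftovers early:
+if my_df[drtype_labels].isna().any():
+    print('WARNING: unmapped drtype values:'
+          , my_df.loc[my_df[drtype_labels].isna(), 'drtype'].unique())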
+
+# Target4: drtype [three classes]
+drtype_labels2 = 'drtype_labels2'
+my_df[drtype_labels2] = my_df['drtype'].map({'Sensitive' : 0
+                                             , 'Other'   : 0
+                                             , 'Pre-MDR' : 1
+                                             , 'MDR'     : 1
+                                             , 'Pre-XDR' : 2
+                                             , 'XDR'     : 2})
+target4 = my_df[drtype_labels2]
+
+# sanity checks
+target1.value_counts()
+my_df['mutation_info_labels'].value_counts()
+
+target2.value_counts()
+my_df[drug_labels].value_counts()
+
+target3.value_counts()
+my_df['drtype'].value_counts()
+
+target4.value_counts()
+my_df['drtype'].value_counts()
+
+#%%
+# GET X
+common_cols_stabilityN = ['ligand_distance'
+                          , 'ligand_affinity_change'
+                          , 'duet_stability_change'
+                          , 'ddg_foldx'
+                          , 'deepddg'
+                          , 'ddg_dynamut2']
+
+# Build stability columns ~ gene
+if gene.lower() in geneL_basic:
+    x_stabilityN = common_cols_stabilityN
+
+if gene.lower() in geneL_ppi2:
+    x_stabilityN = common_cols_stabilityN + ['mcsm_ppi2_affinity'
+                                             , 'interface_dist']
+
+if gene.lower() in geneL_na:
+    x_stabilityN = common_cols_stabilityN + ['mcsm_na_affinity']
+
+if gene.lower() in geneL_na_ppi2:
+    x_stabilityN = common_cols_stabilityN + ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist']
+    # drop D1148: its mcsm_na_affinity is NA
+    na_index = my_df['mutationinformation'].index[my_df['mcsm_na_affinity'].apply(np.isnan)]
+    my_df = my_df.drop(index=na_index)
+
+X_strFN = ['asa'
+           , 'rsa'
+           , 'kd_values'
+           , 'rd_values']
+
+X_evolFN = ['consurf_score'
+            , 'snap2_score'
+            , 'snap2_accuracy_pc']
+
+# TODO: add ED values
+# Problematic due to NAs: filling NAs with 'unknown' or any string makes the column categorical
+# OPTIONS:
+# 1. Impute: KNN or MICE, or draw from the distribution
+# 2. Fill NAs with the median or mode
+# 3. Make a separate dataset without genomic features at all and use it as a
+#    'blind test set' -- this shrinks the training data!
+# 4. Remove genomic features from the ML completely
+
+# X_genomicFN = ['af'
+#                , 'or_mychisq'
+#                , 'or_logistic'
+#                , 'or_fisher'
+#                , 'pval_fisher']
+
+#%% try combinations
+X_vars1 = my_df[x_stabilityN]
+X_vars2 = my_df[X_strFN]
+X_vars3 = my_df[X_evolFN]
+
+X_vars5 = my_df[x_stabilityN + X_strFN]
+X_vars6 = my_df[x_stabilityN + X_evolFN]
+#X_vars7 = my_df[x_stabilityN + X_genomicFN]
+X_vars8 = my_df[X_strFN + X_evolFN]
+#X_vars9 = my_df[X_strFN + X_genomicFN]
+#X_vars10 = my_df[X_evolFN + X_genomicFN]
+X_vars11 = my_df[x_stabilityN + X_strFN + X_evolFN]
+#X_vars12 = my_df[x_stabilityN + X_strFN + X_evolFN + X_genomicFN]
+
+numerical_features_names = x_stabilityN + X_strFN + X_evolFN
+
+# separate ones for foldx?
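+
+# sketch: the X_vars* frames above can also be built programmatically with
+# itertools (already imported). Assumes only the three name lists above;
+# genomic features stay excluded, as in the commented-out X_vars lines:
+feature_groups = {'stability': x_stabilityN, 'str': X_strFN, 'evol': X_evolFN}
+X_combos = {}
+for r in range(1, len(feature_groups) + 1):
+    for combo in itertools.combinations(feature_groups, r):
+        cols = list(itertools.chain.from_iterable(feature_groups[g] for g in combo))
+        X_combos['+'.join(combo)] = my_df[cols]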
+categorical_features_names = ['ss_class'
+                              , 'wt_prop_water'
+                              # , 'lineage_labels'  # misleading if using merged_df3
+                              , 'mut_prop_water'
+                              , 'wt_prop_polarity'
+                              , 'mut_prop_polarity'
+                              , 'wt_calcprop'
+                              , 'mut_calcprop'
+                              , 'active_aa_pos']
+
+numerical_features_df = my_df[numerical_features_names]
+numerical_features_df.shape
+
+categorical_features_df = my_df[categorical_features_names]
+categorical_features_df.shape
+
+all_features_df = my_df[numerical_features_names + categorical_features_names]
+all_features_df.shape

diff --git a/my_data8.py b/my_data8.py
new file mode 100644
index 0000000..966f2bb
--- /dev/null
+++ b/my_data8.py
@@ -0,0 +1,118 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Sat Mar 5 12:57:32 2022
+
+@author: tanu
+"""
+#%%
+# data, etc. for now comes from my_data6.py and/or my_data5.py
+
+#%% try combinations
+#import sys, os
+#os.system("imports.py")
+
+#%%
+seed = 42
+
+# NOTE: X_train/X_test/y_train/y_test are assumed to exist already
+# (from my_data5/6, or from the train_test_split in the next cell)
+features_to_encode = list(X_train.select_dtypes(include = ['object']).columns)
+
+col_trans = make_column_transformer(
+    (OneHotEncoder(), features_to_encode),
+    remainder = "passthrough"
+    )
+
+rf_classifier = RandomForestClassifier(
+    min_samples_leaf=50,
+    n_estimators=150,
+    bootstrap=True,
+    oob_score=True,
+    n_jobs=-1,
+    random_state=seed,
+    max_features='auto')
+
+pipe = make_pipeline(col_trans, rf_classifier)
+pipe.fit(X_train, y_train)
+y_pred = pipe.predict(X_test)
+#%%
+all_features_df.shape
+
+X_train, X_test, y_train, y_test = train_test_split(all_features_df,
+                                                    target1,
+                                                    test_size = 0.33,
+                                                    random_state = 42)
+
+# ColumnTransformer needs the column *name lists*, not the DataFrames
+preprocessor = ColumnTransformer(
+    transformers=[
+        ('num', MinMaxScaler() , numerical_features_names)
+        ,('cat', OneHotEncoder(), categorical_features_names)])
+
+seed = 42
+rf_classifier = RandomForestClassifier(
+    min_samples_leaf=50,
+    n_estimators=150,
+    bootstrap=True,
+    oob_score=True,
+    n_jobs=-1,
+    random_state=seed,
+    max_features='auto')
+
+preprocessor.fit(all_features_df)
+preprocessor.transform(all_features_df)
+
+model = Pipeline(steps = [
+    ('preprocess', preprocessor)
+    ,('regression', linear_model.LogisticRegression())
+    ])
+
+model.fit(X_train, y_train)
+y_pred = model.predict(X_test)
+y_pred
+
+def precision(y_true, y_pred):
+    return precision_score(y_true, y_pred, pos_label = 1)
+def recall(y_true, y_pred):
+    return recall_score(y_true, y_pred, pos_label = 1)
+def f1(y_true, y_pred):
+    return f1_score(y_true, y_pred, pos_label = 1)
+
+acc = make_scorer(accuracy_score)
+prec = make_scorer(precision)
+rec = make_scorer(recall)
+f1_scorer = make_scorer(f1)  # separate name so the f1() helper isn't shadowed
+
+output = cross_validate(model, X_train, y_train
+                        , scoring = {'acc' : acc
+                                     ,'prec': prec
+                                     ,'rec' : rec
+                                     ,'f1'  : f1_scorer}
+                        , cv = 10
+                        , return_train_score = False)
+pd.DataFrame(output).mean()
+#%% with feature selection
+# NOTE: this preprocessor still has the 'cat' OneHotEncoder branch, so
+# fitting it on the numeric-only frame raises; see the note below the patch
+preprocessor.fit(numerical_features_df)
+preprocessor.transform(numerical_features_df)
+
+model = Pipeline(steps = [
+    ('preprocess', preprocessor)
+    ,('regression', linear_model.LogisticRegression())
+    ])
+
+selector_logistic = RFECV(estimator = model
+                          , cv = 10
+                          , step = 1)
+
+X_trainN, X_testN, y_trainN, y_testN = train_test_split(numerical_features_df
+                                                        , target1
+                                                        , test_size = 0.33
+                                                        , random_state = 42)
+
+selector_logistic_xtrain = selector_logistic.fit_transform(X_trainN, y_trainN)
+print(selector_logistic.get_support())
+X_trainN.columns
+
+print(selector_logistic.ranking_)
\ No newline at end of file
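
Note on the failing RFECV cell in my_data8.py: RFECV needs an estimator that
exposes coef_ or feature_importances_ after fitting (or an explicit
importance_getter), and a Pipeline exposes neither, so wrapping
preprocessor + LogisticRegression fails. A minimal sketch that does run on
the numeric-only features, assuming numerical_features_df and target1 from
imports.py carry no NaNs, is to scale up front and hand RFECV the bare
estimator:

    from sklearn.feature_selection import RFECV
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import MinMaxScaler

    X_trainN, X_testN, y_trainN, y_testN = train_test_split(
        numerical_features_df, target1, test_size=0.33, random_state=42)

    # scale outside the selector so RFECV sees an estimator with coef_
    scaler = MinMaxScaler().fit(X_trainN)
    selector = RFECV(estimator=LogisticRegression(max_iter=1000), cv=10, step=1)
    selector.fit(scaler.transform(X_trainN), y_trainN)

    # map the boolean support mask back to the column names
    print(X_trainN.columns[selector.get_support()])
    print(selector.ranking_)

Alternatively, scikit-learn >= 0.24 accepts
importance_getter='named_steps.regression.coef_' so the Pipeline can stay
inside RFECV, but the coefficients must then line up one-to-one with the
input columns, which OneHotEncoder (and a name-based ColumnTransformer)
breaks; the pre-scaling route above sidesteps that.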