From 3bf63c522c1905186b4003244e242d355bc942f3 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall
Date: Sun, 6 Mar 2022 14:49:51 +0000
Subject: [PATCH] trying one_hot encoder for categ vars, which was successful
 but not rfecv

---
 imports.py  | 209 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 my_data8.py | 118 +++++++++++++++++++++++++++++
 2 files changed, 327 insertions(+)
 create mode 100644 imports.py
 create mode 100644 my_data8.py

diff --git a/imports.py b/imports.py
new file mode 100644
index 0000000..e1d03a2
--- /dev/null
+++ b/imports.py
@@ -0,0 +1,209 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Sun Mar 6 13:41:54 2022
+
+@author: tanu
+"""
+import os, sys
+import pandas as pd
+import numpy as np
+from sklearn import linear_model
+from sklearn.linear_model import LogisticRegression, LinearRegression
+from sklearn.naive_bayes import BernoulliNB
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.svm import SVC
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
+from sklearn.neural_network import MLPClassifier
+from xgboost import XGBClassifier
+from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
+
+from sklearn.compose import ColumnTransformer
+from sklearn.compose import make_column_transformer
+
+from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, f1_score
+from sklearn.metrics import make_scorer
+from sklearn.metrics import classification_report
+
+from sklearn.model_selection import cross_validate
+from sklearn.model_selection import train_test_split
+
+from sklearn.pipeline import Pipeline
+from sklearn.pipeline import make_pipeline
+
+from sklearn.feature_selection import RFE
+from sklearn.feature_selection import RFECV
+
+import itertools
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+print(np.__version__)
+print(pd.__version__)
+from statistics import mean, stdev
+#%%
+homedir = os.path.expanduser("~")
+os.chdir(homedir + "/git/ML_AI_training/")
+
+# my function
+from MultClassPipe import MultClassPipeline
+
+gene = 'pncA'
+drug = 'pyrazinamide'
+
+#==============
+# directories
+#==============
+datadir = homedir + '/git/Data/'
+indir = datadir + drug + '/input/'
+outdir = datadir + drug + '/output/'
+
+#=======
+# input
+#=======
+infile_ml1 = outdir + gene.lower() + '_merged_df3.csv'
+#infile_ml2 = outdir + gene.lower() + '_merged_df2.csv'
+
+my_df = pd.read_csv(infile_ml1)
+my_df.dtypes
+my_df_cols = my_df.columns
+
+geneL_basic = ['pnca']
+geneL_na = ['gid']
+geneL_na_ppi2 = ['rpob']
+geneL_ppi2 = ['alr', 'embb', 'katg']
+#%% get cols
+mycols = my_df.columns
+
+#%%============================================================================
+# GET Y
+
+# Target1: mutation_info_labels
+dm_om_map = {'DM': 1, 'OM': 0}
+target1 = my_df['mutation_info_labels'].map(dm_om_map)
+
+# Target2: drug
+drug_labels = drug + '_labels'
+drug_labels
+my_df[drug_labels] = my_df[drug].map({1: 'resistant', 0: 'sensitive'})
+my_df[drug_labels].value_counts()
+my_df[drug_labels] = my_df[drug_labels].fillna('unknown')
+my_df[drug_labels].value_counts()
+target2 = my_df[drug_labels]
+
+# Target3: drtype [Binary]
+drtype_labels = 'drtype_labels'
+my_df[drtype_labels] = my_df['drtype'].map({'Sensitive' : 0
+                                            , 'Other'   : 0
+                                            , 'Pre-MDR' : 1
+                                            , 'MDR'     : 1
+                                            , 'Pre-XDR' : 1
+                                            , 'XDR'     : 1})
+# target3 = 'drtype' [Multinomial]
+target3 = my_df[drtype_labels]
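+
+# sanity guard (sketch): Series.map() returns NaN for any 'drtype' value
+# missing from the dict above, so new categories would vanish silently.
+# Assuming the mapping is meant to be exhaustive, flag leftovers early:
+if my_df[drtype_labels].isna().any():
+    print('WARNING: unmapped drtype values:'
+          , my_df.loc[my_df[drtype_labels].isna(), 'drtype'].unique())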
+
+# Target4: drtype [three classes]
+drtype_labels2 = 'drtype_labels2'
+my_df[drtype_labels2] = my_df['drtype'].map({'Sensitive' : 0
+                                             , 'Other'   : 0
+                                             , 'Pre-MDR' : 1
+                                             , 'MDR'     : 1
+                                             , 'Pre-XDR' : 2
+                                             , 'XDR'     : 2})
+target4 = my_df[drtype_labels2]
+
+# sanity checks
+target1.value_counts()
+my_df['mutation_info_labels'].value_counts()
+
+target2.value_counts()
+my_df[drug_labels].value_counts()
+
+target3.value_counts()
+my_df['drtype'].value_counts()
+
+target4.value_counts()
+my_df['drtype'].value_counts()
+
+#%%
+# GET X
+common_cols_stabilityN = ['ligand_distance'
+                          , 'ligand_affinity_change'
+                          , 'duet_stability_change'
+                          , 'ddg_foldx'
+                          , 'deepddg'
+                          , 'ddg_dynamut2']
+
+# Build stability columns ~ gene
+if gene.lower() in geneL_basic:
+    x_stabilityN = common_cols_stabilityN
+
+if gene.lower() in geneL_ppi2:
+    x_stabilityN = common_cols_stabilityN + ['mcsm_ppi2_affinity'
+                                             , 'interface_dist']
+
+if gene.lower() in geneL_na:
+    x_stabilityN = common_cols_stabilityN + ['mcsm_na_affinity']
+
+if gene.lower() in geneL_na_ppi2:
+    x_stabilityN = common_cols_stabilityN + ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist']
+    # drop D1148: its mcsm_na_affinity is NA
+    na_index = my_df['mutationinformation'].index[my_df['mcsm_na_affinity'].apply(np.isnan)]
+    my_df = my_df.drop(index=na_index)
+
+X_strFN = ['asa'
+           , 'rsa'
+           , 'kd_values'
+           , 'rd_values']
+
+X_evolFN = ['consurf_score'
+            , 'snap2_score'
+            , 'snap2_accuracy_pc']
+
+# TODO: add ED values
+# Problematic due to NAs: filling NAs with 'unknown' or any string makes the column categorical
+# OPTIONS:
+# 1. Impute: KNN or MICE, or draw from the distribution
+# 2. Fill NAs with the median or mode
+# 3. Make a separate dataset without genomic features at all and use it as a
+#    'blind test set' -- this shrinks the training data!
+# 4. Remove genomic features from the ML completely
+
+# X_genomicFN = ['af'
+#                , 'or_mychisq'
+#                , 'or_logistic'
+#                , 'or_fisher'
+#                , 'pval_fisher']
+
+#%% try combinations
+X_vars1 = my_df[x_stabilityN]
+X_vars2 = my_df[X_strFN]
+X_vars3 = my_df[X_evolFN]
+
+X_vars5 = my_df[x_stabilityN + X_strFN]
+X_vars6 = my_df[x_stabilityN + X_evolFN]
+#X_vars7 = my_df[x_stabilityN + X_genomicFN]
+X_vars8 = my_df[X_strFN + X_evolFN]
+#X_vars9 = my_df[X_strFN + X_genomicFN]
+#X_vars10 = my_df[X_evolFN + X_genomicFN]
+X_vars11 = my_df[x_stabilityN + X_strFN + X_evolFN]
+#X_vars12 = my_df[x_stabilityN + X_strFN + X_evolFN + X_genomicFN]
+
+numerical_features_names = x_stabilityN + X_strFN + X_evolFN
+
+# separate ones for foldx?
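+
+# sketch: the X_vars* frames above can also be built programmatically with
+# itertools (already imported). Assumes only the three name lists above;
+# genomic features stay excluded, as in the commented-out X_vars lines:
+feature_groups = {'stability': x_stabilityN, 'str': X_strFN, 'evol': X_evolFN}
+X_combos = {}
+for r in range(1, len(feature_groups) + 1):
+    for combo in itertools.combinations(feature_groups, r):
+        cols = list(itertools.chain.from_iterable(feature_groups[g] for g in combo))
+        X_combos['+'.join(combo)] = my_df[cols]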
+categorical_features_names = ['ss_class'
+                              , 'wt_prop_water'
+                              # , 'lineage_labels'  # misleading if using merged_df3
+                              , 'mut_prop_water'
+                              , 'wt_prop_polarity'
+                              , 'mut_prop_polarity'
+                              , 'wt_calcprop'
+                              , 'mut_calcprop'
+                              , 'active_aa_pos']
+
+numerical_features_df = my_df[numerical_features_names]
+numerical_features_df.shape
+
+categorical_features_df = my_df[categorical_features_names]
+categorical_features_df.shape
+
+all_features_df = my_df[numerical_features_names + categorical_features_names]
+all_features_df.shape

diff --git a/my_data8.py b/my_data8.py
new file mode 100644
index 0000000..966f2bb
--- /dev/null
+++ b/my_data8.py
@@ -0,0 +1,118 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Sat Mar 5 12:57:32 2022
+
+@author: tanu
+"""
+#%%
+# data, etc. for now comes from my_data6.py and/or my_data5.py
+
+#%% try combinations
+#import sys, os
+#os.system("imports.py")
+
+#%%
+seed = 42
+
+# NOTE: X_train/X_test/y_train/y_test are assumed to exist already
+# (from my_data5/6, or from the train_test_split in the next cell)
+features_to_encode = list(X_train.select_dtypes(include = ['object']).columns)
+
+col_trans = make_column_transformer(
+    (OneHotEncoder(), features_to_encode),
+    remainder = "passthrough"
+    )
+
+rf_classifier = RandomForestClassifier(
+    min_samples_leaf=50,
+    n_estimators=150,
+    bootstrap=True,
+    oob_score=True,
+    n_jobs=-1,
+    random_state=seed,
+    max_features='auto')
+
+pipe = make_pipeline(col_trans, rf_classifier)
+pipe.fit(X_train, y_train)
+y_pred = pipe.predict(X_test)
+#%%
+all_features_df.shape
+
+X_train, X_test, y_train, y_test = train_test_split(all_features_df,
+                                                    target1,
+                                                    test_size = 0.33,
+                                                    random_state = 42)
+
+# ColumnTransformer needs the column *name lists*, not the DataFrames
+preprocessor = ColumnTransformer(
+    transformers=[
+        ('num', MinMaxScaler() , numerical_features_names)
+        ,('cat', OneHotEncoder(), categorical_features_names)])
+
+seed = 42
+rf_classifier = RandomForestClassifier(
+    min_samples_leaf=50,
+    n_estimators=150,
+    bootstrap=True,
+    oob_score=True,
+    n_jobs=-1,
+    random_state=seed,
+    max_features='auto')
+
+preprocessor.fit(all_features_df)
+preprocessor.transform(all_features_df)
+
+model = Pipeline(steps = [
+    ('preprocess', preprocessor)
+    ,('regression', linear_model.LogisticRegression())
+    ])
+
+model.fit(X_train, y_train)
+y_pred = model.predict(X_test)
+y_pred
+
+def precision(y_true, y_pred):
+    return precision_score(y_true, y_pred, pos_label = 1)
+def recall(y_true, y_pred):
+    return recall_score(y_true, y_pred, pos_label = 1)
+def f1(y_true, y_pred):
+    return f1_score(y_true, y_pred, pos_label = 1)
+
+acc = make_scorer(accuracy_score)
+prec = make_scorer(precision)
+rec = make_scorer(recall)
+f1_scorer = make_scorer(f1)  # separate name so the f1() helper isn't shadowed
+
+output = cross_validate(model, X_train, y_train
+                        , scoring = {'acc' : acc
+                                     ,'prec': prec
+                                     ,'rec' : rec
+                                     ,'f1'  : f1_scorer}
+                        , cv = 10
+                        , return_train_score = False)
+pd.DataFrame(output).mean()
+#%% with feature selection
+# NOTE: this preprocessor still has the 'cat' OneHotEncoder branch, so
+# fitting it on the numeric-only frame raises; see the note below the patch
+preprocessor.fit(numerical_features_df)
+preprocessor.transform(numerical_features_df)
+
+model = Pipeline(steps = [
+    ('preprocess', preprocessor)
+    ,('regression', linear_model.LogisticRegression())
+    ])
+
+selector_logistic = RFECV(estimator = model
+                          , cv = 10
+                          , step = 1)
+
+X_trainN, X_testN, y_trainN, y_testN = train_test_split(numerical_features_df
+                                                        , target1
+                                                        , test_size = 0.33
+                                                        , random_state = 42)
+
+selector_logistic_xtrain = selector_logistic.fit_transform(X_trainN, y_trainN)
+print(selector_logistic.get_support())
+X_trainN.columns
+
+print(selector_logistic.ranking_)
\ No newline at end of file
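
Note on the failing RFECV cell in my_data8.py: RFECV needs an estimator that
exposes coef_ or feature_importances_ after fitting (or an explicit
importance_getter), and a Pipeline exposes neither, so wrapping
preprocessor + LogisticRegression fails. A minimal sketch that does run on
the numeric-only features, assuming numerical_features_df and target1 from
imports.py carry no NaNs, is to scale up front and hand RFECV the bare
estimator:

    from sklearn.feature_selection import RFECV
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import MinMaxScaler

    X_trainN, X_testN, y_trainN, y_testN = train_test_split(
        numerical_features_df, target1, test_size=0.33, random_state=42)

    # scale outside the selector so RFECV sees an estimator with coef_
    scaler = MinMaxScaler().fit(X_trainN)
    selector = RFECV(estimator=LogisticRegression(max_iter=1000), cv=10, step=1)
    selector.fit(scaler.transform(X_trainN), y_trainN)

    # map the boolean support mask back to the column names
    print(X_trainN.columns[selector.get_support()])
    print(selector.ranking_)

Alternatively, scikit-learn >= 0.24 accepts
importance_getter='named_steps.regression.coef_' so the Pipeline can stay
inside RFECV, but the coefficients must then line up one-to-one with the
input columns, which OneHotEncoder (and a name-based ColumnTransformer)
breaks; the pre-scaling route above sidesteps that.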