# Patch summary (reviewer commentary; the patch text below is unchanged).
#
# Target file: UQ_pnca_ML.py. The hunks, in order:
#   1. Add `from collections import Counter`.
#   2. Un-comment the `RFE` and `RFECV` imports from sklearn.feature_selection.
#   3. Drop one blank line; add `from sklearn.impute import KNNImputer as KNN`
#      and a `# My functions and globals` comment above `scoring_fn`.
#   4. Comment out the `gene = 'pncA'` / `drug = 'pyrazinamide'` assignments.
#   5. Extend the "IMPUTE values for OR" cell: run
#      KNN(n_neighbors=5, weights="uniform").fit_transform on
#      my_df[['or_mychisq', 'log10_or_mychisq']], name the two result columns
#      'or_rawI' and 'logorI', merge them back onto my_df using the
#      'mutationinformation' column, and restore the original row index
#      through the temporary 'index_bm' column (result: `mydf_imputed`).
#
# NOTE(review): line breaks in this patch appear to have been collapsed to
#   spaces. The @@ hunk counts are larger than the number of visible non-blank
#   lines, so blank context lines were lost -- restore the original line
#   structure before trying to apply it.
# NOTE(review): hunk 4 leaves no active `gene` / `drug` assignment in the
#   visible context -- presumably they are now supplied elsewhere; confirm
#   before merging.
# NOTE(review): hunk 5, `my_df[or_cols].isnull().sum()` is a bare expression;
#   its value is neither printed nor stored, so only the header text from the
#   preceding print() is emitted.
# NOTE(review): hunk 5, the first `my_dfI = pd.DataFrame(index = ...)` is
#   overwritten by the very next assignment to `my_dfI`; `sel_cols` is
#   assigned but not used within this hunk.
# NOTE(review): hunk 5, the comment on `reset_index(drop = False)` reads
#   "prevents old index from being added as a column", but drop=False KEEPS
#   the old index as a column; the later merge on 'mutationinformation'
#   relies on exactly that column being present in `my_dfI`.
# NOTE(review): hunk 5, the comment "merge with original based on index" does
#   not match the call: the merge key is the 'mutationinformation' column, and
#   the index is only re-attached afterwards from 'index_bm'.
# NOTE(review): hunk 5, if 'mutationinformation' is not unique in my_df, the
#   merge yields one row per matching pair and will duplicate rows -- verify
#   uniqueness (or assign the imputed columns positionally instead).
diff --git a/UQ_pnca_ML.py b/UQ_pnca_ML.py index 9ba28e3..9041cca 100644 --- a/UQ_pnca_ML.py +++ b/UQ_pnca_ML.py @@ -14,6 +14,7 @@ import pprint as pp from copy import deepcopy from sklearn import linear_model from sklearn import datasets +from collections import Counter from sklearn.linear_model import LogisticRegression, LinearRegression from sklearn.naive_bayes import BernoulliNB @@ -62,8 +63,8 @@ from sklearn.model_selection import StratifiedKFold from sklearn.pipeline import Pipeline from sklearn.pipeline import make_pipeline -#from sklearn.feature_selection import RFE -#from sklearn.feature_selection import RFECV +from sklearn.feature_selection import RFE +from sklearn.feature_selection import RFECV import itertools #import seaborn as sns import matplotlib.pyplot as plt @@ -87,11 +88,12 @@ from imblearn.over_sampling import SMOTENC from imblearn.under_sampling import EditedNearestNeighbours from imblearn.under_sampling import RepeatedEditedNearestNeighbours - from sklearn.model_selection import GridSearchCV from sklearn.base import BaseEstimator import json +from sklearn.impute import KNNImputer as KNN +# My functions and globals scoring_fn = ({'accuracy' : make_scorer(accuracy_score) , 'fscore' : make_scorer(f1_score) , 'mcc' : make_scorer(matthews_corrcoef) @@ -124,8 +126,8 @@ from loopity_loop import MultClassPipeSKFLoop #from MultClassPipe3 import MultClassPipeSKFCV from UQ_MultClassPipe4 import MultClassPipeSKFCV -gene = 'pncA' -drug = 'pyrazinamide' +#gene = 'pncA' +#drug = 'pyrazinamide' #gene = 'katG' #drug = 'isoniazid' @@ -167,7 +169,32 @@ cat_type = ['object', 'bool'] # -- CHECK script -- imports.py #%%============================================================================ -#%% IMPUTE values for OR +#%% IMPUTE values for OR [check script for exploration: UQ_or_imputer] +#or_cols = ['or_mychisq', 'log10_or_mychisq', 'or_fisher'] +sel_cols = ['mutationinformation', 'or_mychisq', 'log10_or_mychisq'] +or_cols = ['or_mychisq', 'log10_or_mychisq'] + 
+print("count of NULL values before imputation\n") +my_df[or_cols].isnull().sum() + +my_dfI = pd.DataFrame(index = my_df['mutationinformation'] ) + + +my_dfI = pd.DataFrame(KNN(n_neighbors= 5, weights="uniform").fit_transform(my_df[or_cols]) + , index = my_df['mutationinformation'] + , columns = or_cols ) +my_dfI.columns = ['or_rawI', 'logorI'] +my_dfI.columns +my_dfI = my_dfI.reset_index(drop = False) # prevents old index from being added as a column +my_dfI.head() + +# merge with original based on index +my_df['index_bm'] = my_df.index +mydf_imputed = pd.merge(my_df + , my_dfI + , on = 'mutationinformation') +mydf_imputed = mydf_imputed.set_index(['index_bm']) + #%% Combine mmCSM_lig Data