made var names more meaningful
This commit is contained in:
parent
e2b997badf
commit
6160d943f5
2 changed files with 85 additions and 8 deletions
46
my_data6.py
46
my_data6.py
|
@ -28,8 +28,8 @@ os.chdir(homedir + "/git/ML_AI_training/")
|
||||||
# my function
|
# my function
|
||||||
from MultClassPipe import MultClassPipeline
|
from MultClassPipe import MultClassPipeline
|
||||||
|
|
||||||
#gene = 'pncA'
|
gene = 'pncA'
|
||||||
#drug = 'pyrazinamide'
|
drug = 'pyrazinamide'
|
||||||
|
|
||||||
#==============
|
#==============
|
||||||
# directories
|
# directories
|
||||||
|
@ -48,10 +48,10 @@ my_df = pd.read_csv(infile_ml1)
|
||||||
my_df.dtypes
|
my_df.dtypes
|
||||||
my_df_cols = my_df.columns
|
my_df_cols = my_df.columns
|
||||||
|
|
||||||
geneL_basic = ['pnca']
|
geneL_basic = ['pnca']
|
||||||
geneL_na = ['gid']
|
geneL_na = ['gid']
|
||||||
geneL_na_ppi2 = ['rpob']
|
geneL_na_ppi2 = ['rpob']
|
||||||
geneL_ppi2 = ['alr', 'embb', 'katg']
|
geneL_ppi2 = ['alr', 'embb', 'katg']
|
||||||
#%% get cols
|
#%% get cols
|
||||||
mycols = my_df.columns
|
mycols = my_df.columns
|
||||||
|
|
||||||
|
@ -82,6 +82,17 @@ my_df[drtype_labels] = my_df['drtype'].map({'Sensitive' : 0
|
||||||
# target3 = my_df['drtype']
|
# target3 = my_df['drtype']
|
||||||
target3 = my_df[drtype_labels]
|
target3 = my_df[drtype_labels]
|
||||||
|
|
||||||
|
# target4
|
||||||
|
drtype_labels2 = 'drtype_labels2'
|
||||||
|
my_df[drtype_labels2] = my_df['drtype'].map({'Sensitive' : 0
|
||||||
|
, 'Other' : 0
|
||||||
|
, 'Pre-MDR' : 1
|
||||||
|
, 'MDR' : 1
|
||||||
|
, 'Pre-XDR' : 2
|
||||||
|
, 'XDR' : 2})
|
||||||
|
|
||||||
|
target4 = my_df[drtype_labels2]
|
||||||
|
|
||||||
# sanity checks
|
# sanity checks
|
||||||
target1.value_counts()
|
target1.value_counts()
|
||||||
my_df['mutation_info_labels'].value_counts()
|
my_df['mutation_info_labels'].value_counts()
|
||||||
|
@ -91,6 +102,8 @@ my_df[drug_labels].value_counts()
|
||||||
|
|
||||||
target3.value_counts()
|
target3.value_counts()
|
||||||
my_df['drtype'].value_counts()
|
my_df['drtype'].value_counts()
|
||||||
|
target4.value_counts()
|
||||||
|
my_df['drtype'].value_counts()
|
||||||
|
|
||||||
#%%
|
#%%
|
||||||
# GET X
|
# GET X
|
||||||
|
@ -147,9 +160,30 @@ X_vars6 = my_df[x_stability_cols + X_evolF]
|
||||||
X_vars8 = my_df[X_strF + X_evolF]
|
X_vars8 = my_df[X_strF + X_evolF]
|
||||||
#X_vars9 = my_df[X_strF + X_genomicF]
|
#X_vars9 = my_df[X_strF + X_genomicF]
|
||||||
#X_vars10 = my_df[X_evolF + X_genomicF]
|
#X_vars10 = my_df[X_evolF + X_genomicF]
|
||||||
X_vars11 = my_df[x_stability_cols + X_strF + X_evolF ]
|
X_vars11 = my_df[x_stability_cols + X_strF + X_evolF]
|
||||||
#X_vars12 = my_df[x_stability_cols + X_strF + X_evolF + X_genomicF]
|
#X_vars12 = my_df[x_stability_cols + X_strF + X_evolF + X_genomicF]
|
||||||
|
|
||||||
|
numerical_features_names = x_stability_cols + X_strF + X_evolF
|
||||||
|
|
||||||
|
# separate ones for foldx?
|
||||||
|
categorical_features_names = ['ss_class'
|
||||||
|
, 'wt_prop_water'
|
||||||
|
# , 'lineage_labels' # misleading if using merged_df3
|
||||||
|
, 'mut_prop_water'
|
||||||
|
, 'wt_prop_polarity'
|
||||||
|
, 'mut_prop_polarity'
|
||||||
|
, 'wt_calcprop'
|
||||||
|
, 'mut_calcprop'
|
||||||
|
, 'active_aa_pos']
|
||||||
|
|
||||||
|
numerical_features_df = my_df[numerical_features_names]
|
||||||
|
numerical_features_df.shape
|
||||||
|
|
||||||
|
categorical_features_df = my_df[categorical_features_names]
|
||||||
|
categorical_features_df.shape
|
||||||
|
|
||||||
|
all_features_df = my_df[numerical_features_names + categorical_features_names]
|
||||||
|
all_features_df.shape
|
||||||
#%%
|
#%%
|
||||||
X_vars1.shape[1]
|
X_vars1.shape[1]
|
||||||
X_vars5.shape[1]
|
X_vars5.shape[1]
|
||||||
|
|
47
my_data7.py
47
my_data7.py
|
@ -29,6 +29,9 @@ from sklearn.metrics import classification_report
|
||||||
from sklearn.feature_selection import RFE
|
from sklearn.feature_selection import RFE
|
||||||
from sklearn.feature_selection import RFECV
|
from sklearn.feature_selection import RFECV
|
||||||
|
|
||||||
|
#############################
|
||||||
|
# trying feature selection
|
||||||
|
#############################
|
||||||
#%%
|
#%%
|
||||||
model= Pipeline(steps = [
|
model= Pipeline(steps = [
|
||||||
('pre', MinMaxScaler()),
|
('pre', MinMaxScaler()),
|
||||||
|
@ -103,6 +106,11 @@ https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.Sequ
|
||||||
https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html
|
https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html
|
||||||
https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_regression.html
|
https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_regression.html
|
||||||
https://www.scikit-yb.org/en/latest/api/model_selection/rfecv.html
|
https://www.scikit-yb.org/en/latest/api/model_selection/rfecv.html
|
||||||
|
|
||||||
|
#GridSearchCV
|
||||||
|
#ParameterGrid
|
||||||
|
#RandomizedSearchCV
|
||||||
|
#https://medium.com/analytics-vidhya/hyper-parameter-tuning-gridsearchcv-vs-randomizedsearchcv-499862e3ca5
|
||||||
#%% RFE: Feature selection in classification
|
#%% RFE: Feature selection in classification
|
||||||
# others in example
|
# others in example
|
||||||
# https://towardsdatascience.com/feature-selection-techniques-for-classification-and-python-tips-for-their-application-10c0ddd7918b
|
# https://towardsdatascience.com/feature-selection-techniques-for-classification-and-python-tips-for-their-application-10c0ddd7918b
|
||||||
|
@ -152,8 +160,8 @@ X_train, X_test, y_train, y_test = train_test_split(X_vars11,
|
||||||
test_size = 0.33,
|
test_size = 0.33,
|
||||||
random_state = 42)
|
random_state = 42)
|
||||||
|
|
||||||
model_logistic = LogisticRegression()
|
model_logistic2 = LogisticRegression()
|
||||||
sel_rfe_logistic = RFECV(estimator = model_logistic
|
sel_rfe_logistic = RFECV(estimator = model_logistic2
|
||||||
, cv = 10
|
, cv = 10
|
||||||
, step = 1)
|
, step = 1)
|
||||||
|
|
||||||
|
@ -162,3 +170,38 @@ print(sel_rfe_logistic.get_support())
|
||||||
X_train.columns
|
X_train.columns
|
||||||
|
|
||||||
print(sel_rfe_logistic.ranking_)
|
print(sel_rfe_logistic.ranking_)
|
||||||
|
#%%
|
||||||
|
# TODO: imputation
|
||||||
|
# Find out the best way to impute values!
|
||||||
|
#from sklearn.impute import SimpleImputer
|
||||||
|
# https://towardsdatascience.com/whats-the-best-way-to-handle-nan-values-62d50f738fc
|
||||||
|
#KNN and MICE
|
||||||
|
my_df2 = pd.read_csv(infile_ml1)
|
||||||
|
|
||||||
|
genomicF = ['af'
|
||||||
|
, 'beta_logistic'
|
||||||
|
, 'or_logistic'
|
||||||
|
, 'pval_logistic'
|
||||||
|
, 'se_logistic'
|
||||||
|
, 'zval_logistic'
|
||||||
|
, 'ci_low_logistic'
|
||||||
|
, 'ci_hi_logistic'
|
||||||
|
, 'or_mychisq'
|
||||||
|
, 'log10_or_mychisq'
|
||||||
|
, 'or_fisher'
|
||||||
|
, 'pval_fisher'
|
||||||
|
, 'neglog_pval_fisher'
|
||||||
|
, 'ci_low_fisher'
|
||||||
|
, 'ci_hi_fisher'
|
||||||
|
, 'est_chisq'
|
||||||
|
, 'pval_chisq']
|
||||||
|
|
||||||
|
# X_genomicF = ['af'
|
||||||
|
# , 'or_mychisq'
|
||||||
|
# , 'or_logistic'
|
||||||
|
# , 'or_fisher'
|
||||||
|
# , 'pval_fisher']
|
||||||
|
|
||||||
|
my_df2[genomicF].isna().sum()
|
||||||
|
|
||||||
|
my_df2[genomicF] = my_df2[genomicF].fillna(value='unknown')
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue