diff --git a/my_data6.py b/my_data6.py index afc887c..1ebaa95 100644 --- a/my_data6.py +++ b/my_data6.py @@ -28,8 +28,8 @@ os.chdir(homedir + "/git/ML_AI_training/") # my function from MultClassPipe import MultClassPipeline -#gene = 'pncA' -#drug = 'pyrazinamide' +gene = 'pncA' +drug = 'pyrazinamide' #============== # directories @@ -48,10 +48,10 @@ my_df = pd.read_csv(infile_ml1) my_df.dtypes my_df_cols = my_df.columns -geneL_basic = ['pnca'] -geneL_na = ['gid'] +geneL_basic = ['pnca'] +geneL_na = ['gid'] geneL_na_ppi2 = ['rpob'] -geneL_ppi2 = ['alr', 'embb', 'katg'] +geneL_ppi2 = ['alr', 'embb', 'katg'] #%% get cols mycols = my_df.columns @@ -82,6 +82,17 @@ my_df[drtype_labels] = my_df['drtype'].map({'Sensitive' : 0 # target3 = my_df['drtype'] target3 = my_df[drtype_labels] +# target4 +drtype_labels2 = 'drtype_labels2' +my_df[drtype_labels2] = my_df['drtype'].map({'Sensitive' : 0 + , 'Other' : 0 + , 'Pre-MDR' : 1 + , 'MDR' : 1 + , 'Pre-XDR' : 2 + , 'XDR' : 2}) + +target4 = my_df[drtype_labels2] + # sanity checks target1.value_counts() my_df['mutation_info_labels'].value_counts() @@ -91,6 +102,8 @@ my_df[drug_labels].value_counts() target3.value_counts() my_df['drtype'].value_counts() +target4.value_counts() +my_df['drtype'].value_counts() #%% # GET X @@ -147,9 +160,30 @@ X_vars6 = my_df[x_stability_cols + X_evolF] X_vars8 = my_df[X_strF + X_evolF] #X_vars9 = my_df[X_strF + X_genomicF] #X_vars10 = my_df[X_evolF + X_genomicF] -X_vars11 = my_df[x_stability_cols + X_strF + X_evolF ] +X_vars11 = my_df[x_stability_cols + X_strF + X_evolF] #X_vars12 = my_df[x_stability_cols + X_strF + X_evolF + X_genomicF] +numerical_features_names = x_stability_cols + X_strF + X_evolF + +# separate ones for foldx? +categorical_features_names = ['ss_class' + , 'wt_prop_water' + # , 'lineage_labels' # misleading if using merged_df3 + , 'mut_prop_water' + , 'wt_prop_polarity' + , 'mut_prop_polarity' + , 'wt_calcprop' + , 'mut_calcprop' + , 'active_aa_pos'] + +numerical_features_df = my_df[numerical_features_names] +numerical_features_df.shape + +categorical_features_df = my_df[categorical_features_names] +categorical_features_df.shape + +all_features_df = my_df[numerical_features_names + categorical_features_names] +all_features_df.shape #%% X_vars1.shape[1] X_vars5.shape[1] diff --git a/my_data7.py b/my_data7.py index 220148d..8746174 100644 --- a/my_data7.py +++ b/my_data7.py @@ -29,6 +29,9 @@ from sklearn.metrics import classification_report from sklearn.feature_selection import RFE from sklearn.feature_selection import RFECV +############################# +# trying feature selection +############################# #%% model= Pipeline(steps = [ ('pre', MinMaxScaler()), @@ -103,6 +106,11 @@ https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.Sequ https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_regression.html https://www.scikit-yb.org/en/latest/api/model_selection/rfecv.html + +#GridSearchCV +#ParameterGrid +#RandomizedSearchCV +#https://medium.com/analytics-vidhya/hyper-parameter-tuning-gridsearchcv-vs-randomizedsearchcv-499862e3ca5 #%% RFE: Feature selection in classification # others in example # https://towardsdatascience.com/feature-selection-techniques-for-classification-and-python-tips-for-their-application-10c0ddd7918b @@ -152,8 +160,8 @@ X_train, X_test, y_train, y_test = train_test_split(X_vars11, test_size = 0.33, random_state = 42) -model_logistic = LogisticRegression() -sel_rfe_logistic = RFECV(estimator = model_logistic +model_logistic2 = LogisticRegression() +sel_rfe_logistic = RFECV(estimator = model_logistic2 , cv = 10 , step = 1) @@ -162,3 +170,38 @@ print(sel_rfe_logistic.get_support()) X_train.columns print(sel_rfe_logistic.ranking_) +#%% +# TODO: imputation +# Find out the best way to impute values! +#from sklearn.impute import SimpleImputer +# https://towardsdatascience.com/whats-the-best-way-to-handle-nan-values-62d50f738fc +#KNN and MICE +my_df2 = pd.read_csv(infile_ml1) + +genomicF = ['af' + , 'beta_logistic' + , 'or_logistic' + , 'pval_logistic' + , 'se_logistic' + , 'zval_logistic' + , 'ci_low_logistic' + , 'ci_hi_logistic' + , 'or_mychisq' + , 'log10_or_mychisq' + , 'or_fisher' + , 'pval_fisher' + , 'neglog_pval_fisher' + , 'ci_low_fisher' + , 'ci_hi_fisher' + , 'est_chisq' + , 'pval_chisq'] + +# X_genomicF = ['af' +# , 'or_mychisq' +# , 'or_logistic' +# , 'or_fisher' +# , 'pval_fisher'] + +my_df2[genomicF].isna().sum() + +my_df2[genomicF] = my_df2[genomicF].fillna(value='unknown')