Patch for my_data5.py: scale features with MinMaxScaler instead of
StandardScaler, and read the merged input CSV from the per-drug output
directory under ~/git/Data/ instead of a hard-coded file name.

Review changes relative to the submitted patch:
  * `drug = 'pyrazinamide'` stays active. The new `indir`/`outdir` lines read
    `drug`, so commenting it out makes them fail with NameError unless `drug`
    is defined above the lines shown here.
  * `#gene 'pncA'` becomes `gene = 'pncA'`, because the new input path calls
    `gene.lower()`; 'pncA'.lower() reproduces the old "pnca_" file prefix.
  * Hunk line counts are unchanged by the two edits above. The `index` line
    is carried over from the submitted patch, so its post-image hash no
    longer matches the resulting file.

NOTE(review): the patch arrived with its line breaks collapsed. Blank context
lines and leading indentation below are reconstructed from the hunk counts --
confirm with `git apply --check` before applying.
NOTE(review): `gene_baiscL` looks like a misspelling of `gene_basicL`; it is
left unchanged here in case other code already refers to it -- confirm.

diff --git a/my_data5.py b/my_data5.py
index 500e6ba..4a54136 100644
--- a/my_data5.py
+++ b/my_data5.py
@@ -15,6 +15,7 @@ from sklearn.neural_network import MLPClassifier
 from sklearn.pipeline import Pipeline
 from xgboost import XGBClassifier
 from sklearn.preprocessing import StandardScaler
+from sklearn.preprocessing import MinMaxScaler
 from sklearn.model_selection import train_test_split
 import os
 from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, f1_score
@@ -23,15 +24,30 @@ import pandas as pd
 homedir = os.path.expanduser("~")
 os.chdir(homedir + "/git/ML_AI_training/test_data")
 
-# this needs to be merged_df2 or merged_df3?
-#gene 'pncA'
+gene = 'pncA'
 drug = 'pyrazinamide'
 
-my_df = pd.read_csv("pnca_merged_df3.csv")
+#==============
+# directories
+#==============
+datadir = homedir + '/git/Data/'
+indir = datadir + drug + '/input/'
+outdir = datadir + drug + '/output/'
 
+#=======
+# input
+#=======
+# this needs to be merged_df2 or merged_df3?
+infile_ml1 = outdir + gene.lower() + '_merged_df3.csv'
+#infile_ml2 = outdir + gene.lower() + '_merged_df2.csv'
+
+my_df = pd.read_csv(infile_ml1)
 my_df.dtypes
 my_df_cols = my_df.columns
 
+gene_baiscL = ['pnca']
+geneL_naL = ['gid', 'rpob']
+geneL_ppi2L = ['alr', 'embb', 'katg', 'rpob']
 #%%============================================================================
 # GET Y
 # Y = my_df.loc[:,drug] #has NA
@@ -116,7 +132,7 @@ def modelPipeline(X_train, X_test, y_train, y_test):
     for clf_name, clf in clfs:
 
         pipeline = Pipeline(steps=[
-            ('scaler', StandardScaler()),
+            ('scaler', MinMaxScaler()),
             ('classifier', clf)
             ]
         )