Reran to generate merged_df3 with correct dst for dst muts; modified combining_dfs_plotting.R.

2022-07-08 21:33:57 +01:00 · 2022-07-08 21:33:57 +01:00 · 8079dd7b6c
commit 8079dd7b6c
parent 289c8913d0
6 changed files with 148 additions and 211 deletions
--- a/scripts/ml/ml_functions/MultClfs.py
+++ b/scripts/ml/ml_functions/MultClfs.py
@ -168,7 +168,7 @@ def MultModelsCl(input_df, target
    @param skv_cv: stratifiedK fold int or object to allow shuffle and random state to pass
    @type: int or StratifiedKfold()
    
-    @var_type: numerical, categorical and mixed to determine what col_transform to apply (MinMaxScalar and/or one-ho    t encoder)
+    @var_type: numerical, categorical and mixed to determine what col_transform to apply (MinMaxScalar and/or one-hot encoder)
    @type: list

    returns
--- a/scripts/ml/ml_functions/MultClfs_SIMPLE.py
+++ b/scripts/ml/ml_functions/MultClfs_SIMPLE.py
@ -168,7 +168,7 @@ def MultModelsCl(input_df, target
    @param skv_cv: stratifiedK fold int or object to allow shuffle and random state to pass
    @type: int or StratifiedKfold()
    
-    @var_type: numerical, categorical and mixed to determine what col_transform to apply (MinMaxScalar and/or one-ho    t encoder)
+    @var_type: numerical, categorical and mixed to determine what col_transform to apply (MinMaxScalar and/or one-hot encoder)
    @type: list

    returns
@ -239,8 +239,8 @@ def MultModelsCl(input_df, target
               #  , ('Gaussian NB'               , GaussianNB() )
               #  , ('Gaussian Process'          , GaussianProcessClassifier(**rs) )
               #  , ('K-Nearest Neighbors'       , KNeighborsClassifier() ) 
-               #  , ('LDA'                       , LinearDiscriminantAnalysis() )
-               # , ('Logistic Regression'       , LogisticRegression(**rs) )
+                 , ('LDA'                       , LinearDiscriminantAnalysis() )
+                 , ('Logistic Regression'       , LogisticRegression(**rs) )
               # , ('Logistic RegressionCV'     , LogisticRegressionCV(cv = 3, **rs))
               # , ('MLP'                       , MLPClassifier(max_iter = 500, **rs) ) 
                #, ('Multinomial'               , MultinomialNB() )
@ -259,7 +259,7 @@ def MultModelsCl(input_df, target
             #     , ('Ridge ClassifierCV'        , RidgeClassifierCV(cv = 3) )          
             #     , ('SVC'                       , SVC(**rs) ) 
             #     , ('Stochastic GDescent'       , SGDClassifier(**rs, **njobs) )
-             #     , ('XGBoost'                   , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False, **njobs) )
+                  , ('XGBoost'                   , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False, **njobs) )
             # 
             ]
                
--- a/scripts/ml/ml_functions/test_func_singlegene.py
+++ b/scripts/ml/ml_functions/test_func_singlegene.py
@ -14,23 +14,8 @@ sys.path
 # import
 from GetMLData import *
 from SplitTTS import *
-#from MultClfs_fi import *
-from MultClfs import *
-
-#%%
-# X,y = load_boston(return_X_y=True) 
-# features = load_boston()['feature_names']
-
-# X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.33, random_state=42)
-
-# rf = RandomForestRegressor(random_state=0) 
-# rf.fit(X_train,y_train)
-
-
-# f_i = list(zip(features,rf.feature_importances_)) 
-# f_i.sort(key = lambda x : x[1])
-# plt.barh([x[0] for x in f_i],[x[1] for x in f_i]) 
-# plt.show()
+#from MultClfs import *
+from MultClfs_SIMPLE import *

 #%%

@ -54,9 +39,8 @@ df = getmldata('katG', 'isoniazid'    , **gene_model_paramD)
 df = getmldata('rpoB', 'rifampicin'   , **gene_model_paramD)
 df  = getmldata('gid' , 'streptomycin' , **gene_model_paramD)
 #df  = getmldata('alr' , 'cycloserine'  , **combined_model_paramD)
+
 all(df.columns.isin(['gene_name'])) # should be False
-
-
 spl_type = '70_30'
 #spl_type = '80_20'
 #spl_type = 'sl'
@ -73,6 +57,16 @@ df2 = split_tts(df

 all(df2['X'].columns.isin(['gene_name'])) # should be False

+df['dst'].value_counts()
+df['dst'].isna().sum()
+df['dst_mode'].value_counts()
+
+len(df)
+
+Counter(df2['y'])
+Counter(df2['y_bts'])
+
+
 fooD = MultModelsCl(input_df = df2['X']
                , target = df2['y']
                , sel_cv = skf_cv