From d0c329a1d9d8e1a82ee985dc4f4b9b9e7fed7067 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Thu, 17 Mar 2022 18:17:58 +0000
Subject: [PATCH] modified loopity and multclass3 to take skf_cv as a
 parameter for cv

---
 MultClassPipe3.py                         | 125 ++++++++++++----------
 MultClassPipe3_CALL.py                    |  36 +++----
 __pycache__/MultClassPipe3.cpython-37.pyc | Bin 4083 -> 4697 bytes
 __pycache__/loopity_loop.cpython-37.pyc   | Bin 4052 -> 4323 bytes
 base_estimator.py                         |   8 ++
 imports.py                                |  41 ++++++-
 loopity_loop.py                           |  57 +++++-----
 loopity_loop_CALL.py                      |  21 ++--
 8 files changed, 161 insertions(+), 127 deletions(-)

diff --git a/MultClassPipe3.py b/MultClassPipe3.py
index 4dfdc5b..aa161ab 100644
--- a/MultClassPipe3.py
+++ b/MultClassPipe3.py
@@ -61,23 +61,39 @@ from imblearn.combine import SMOTEENN
 from imblearn.under_sampling import EditedNearestNeighbours
 
 #%%
-rs = {'random_state': 42}
-# Done: add preprocessing step with one hot encoder
-# Done: get accuracy and other scores through K-fold stratified cv
+# rs = {'random_state': 42}
+# njobs = {'n_jobs': 10}
 
-scoring_fn =  ({ 'fscore'     : make_scorer(f1_score)
-                 , 'mcc'        : make_scorer(matthews_corrcoef)
-                 , 'precision' : make_scorer(precision_score)
-                 , 'recall'    : make_scorer(recall_score)
-                 , 'accuracy'      : make_scorer(accuracy_score)
-                 ,  'roc_auc'   : make_scorer(roc_auc_score)
-                 #,  'jaccard'   : make_scorer(jaccard_score)
+scoring_fn =  ({ 'fscore'       : make_scorer(f1_score)
+                  , 'mcc'        : make_scorer(matthews_corrcoef)
+                  , 'precision'  : make_scorer(precision_score)
+                  , 'recall'     : make_scorer(recall_score)
+                  , 'accuracy'   : make_scorer(accuracy_score)
+                  ,  'roc_auc'   : make_scorer(roc_auc_score)
+                  #,  'jaccard'   : make_scorer(jaccard_score)
             })    
 
 
 # Multiple Classification - Model Pipeline
-def MultClassPipelineCV(X_train, X_test, y_train, y_test, input_df, var_type = ['numerical', 'categorical','mixed']):
+def MultClassPipeSKFCV(input_df, target, skf_cv, var_type = ['numerical', 'categorical','mixed']):
 
+    '''
+    @ param input_df: input features 
+    @ type: df with input features WITHOUT the target variable
+    
+    @param target: target (or output) feature
+    @type: df or np.array or Series
+    
+    @param skf_cv: number of stratified K-fold splits, or a splitter object so that shuffle and random state can be passed
+    @type: int or StratifiedKFold()
+    
+    @param var_type: numerical, categorical or mixed; determines which col_transform to apply (MinMaxScaler and/or one-hot encoder)
+    @type: list
+
+    returns
+    Dict mapping each model name to the mean (rounded to 2 d.p.) of every cross_validate output across the stratified K-fold splits, including training scores
+       
+    '''
     # determine categorical and numerical features
     numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
     numerical_ix
@@ -98,66 +114,61 @@ def MultClassPipelineCV(X_train, X_test, y_train, y_test, input_df, var_type = [
     col_transform = ColumnTransformer(transformers = t
                                        , remainder='passthrough')
     
-    #%%
+    #%% Specify multiple Classification models
     log_reg = LogisticRegression(**rs)
-    nb = BernoulliNB()
-    knn = KNeighborsClassifier()
-    svm = SVC(**rs)
-    mlp = MLPClassifier(max_iter=500, **rs)
-    dt = DecisionTreeClassifier(**rs)
-    et = ExtraTreesClassifier(**rs)
-    rf = RandomForestClassifier(**rs)
-    rf2 = RandomForestClassifier(
-                          min_samples_leaf=50,
-                          n_estimators=150,
-                          bootstrap=True,
-                          oob_score=True,
-                          n_jobs=-1,
-                          random_state=42,
-                          max_features='auto')
-    
-    xgb = XGBClassifier(**rs, verbosity=0)
+    nb      = BernoulliNB()
+    knn     = KNeighborsClassifier()
+    svm     = SVC(**rs)
+    mlp     = MLPClassifier(max_iter = 500, **rs)
+    dt      = DecisionTreeClassifier(**rs)
+    et      = ExtraTreesClassifier(**rs)
+    rf      = RandomForestClassifier(**rs)
+    rf2     = RandomForestClassifier(
+                          min_samples_leaf = 50
+                          , n_estimators     = 150
+                          , bootstrap        = True
+                          , oob_score        = True
+                          , **njobs
+                          , **rs
+                          , max_features     = 'auto')
+    xgb = XGBClassifier(**rs
+                        , verbosity = 0, use_label_encoder =False)
 
-    models = [
-            ('Logistic Regression', log_reg), 
-            ('Naive Bayes', nb),
-            ('K-Nearest Neighbors', knn), 
-            ('SVM', svm), 
-            ('MLP', mlp), 
-            ('Decision Tree', dt), 
-            ('Extra Trees', et), 
-            ('Random Forest', rf), 
-            ('Random Forest2', rf2), 
-            #('XGBoost', xgb)
-            ]
-            
-    skf_cv_scores = {}
+    models = [('Logistic Regression', log_reg)
+            , ('Naive Bayes'        , nb)
+            , ('K-Nearest Neighbors', knn) 
+            , ('SVM'                , svm) 
+            , ('MLP'                , mlp) 
+            , ('Decision Tree'      , dt) 
+            , ('Extra Trees'        , et) 
+            , ('Random Forest'      , rf) 
+            , ('Naive Bayes'        , nb)
+            , ('Random Forest2'     , rf2) 
+            , ('XGBoost'            , xgb)]
+        
+    mm_skf_scoresD = {}
      
     for model_name, model_fn in models:
         print('\nModel_name:', model_name
         , '\nModel func:'    , model_fn
         , '\nList of models:', models)
     
-    #    model_pipeline = Pipeline([
-    #        ('pre'     , MinMaxScaler())
-    #        , ('model'  , model_fn)])
-            
         model_pipeline = Pipeline([
             ('prep'     , col_transform)
-            , ('model' , model_fn)])
+            , ('model'  , model_fn)])
             
         print('Running model pipeline:', model_pipeline)
-        skf_cv = cross_validate(model_pipeline
-                              , X_train
-                              , y_train
-                              , cv = 10
+        skf_cv_mod = cross_validate(model_pipeline
+                              , input_df
+                              , target
+                              , cv = skf_cv
                               , scoring = scoring_fn
                               , return_train_score = True)
-        skf_cv_scores[model_name] = {}
-        for key, value in skf_cv.items():
+        mm_skf_scoresD[model_name] = {}
+        for key, value in skf_cv_mod.items():
             print('\nkey:', key, '\nvalue:', value)
             print('\nmean value:', mean(value))
-            skf_cv_scores[model_name][key] = round(mean(value),2)
-            #pp.pprint(skf_cv_scores)
-    return(skf_cv_scores)
+            mm_skf_scoresD[model_name][key] = round(mean(value),2)
+            #pp.pprint(mm_skf_scoresD)
+    return(mm_skf_scoresD)
 
diff --git a/MultClassPipe3_CALL.py b/MultClassPipe3_CALL.py
index 6699707..c1d3808 100644
--- a/MultClassPipe3_CALL.py
+++ b/MultClassPipe3_CALL.py
@@ -5,29 +5,19 @@ Created on Tue Mar 15 11:09:50 2022
 
 @author: tanu
 """
-# stratified shuffle split
-X_train, X_test, y_train, y_test = train_test_split(num_df_wtgt[numerical_FN]
-                                                    , num_df_wtgt['mutation_class']
-                                                    , test_size = 0.33
-                                                    , **rs
-                                                    , shuffle = True
-                                                    , stratify = num_df_wtgt['mutation_class'])
+#%% Data
+X = all_df_wtgt[numerical_FN+categorical_FN]
+y = all_df_wtgt['mutation_class']
+#%% variables
 
-y_train.to_frame().value_counts().plot(kind = 'bar')
-y_test.to_frame().value_counts().plot(kind = 'bar')
-
-MultClassPipelineCV(X_train, X_test, y_train, y_test
-         , input_df = num_df_wtgt[numerical_FN]
-         , var_type = 'numerical')
+#%% MultClassPipeSKFCV: function call()
+mm_skf_scoresD = MultClassPipeSKFCV(input_df = X
+                                        , target = y
+                                        , var_type = 'mixed'
+                                        , skf_cv = skf_cv)
 
 
-skf_cv_scores = MultClassPipelineCV(X_train, X_test, y_train, y_test
-         , input_df = num_df_wtgt[numerical_FN]
-         , var_type = 'numerical')
-
-pp.pprint(skf_cv_scores)
-# construct a df
-skf_cv_scores_df = pd.DataFrame(skf_cv_scores)
-skf_cv_scores_df
-skf_cv_scores_df_test = skf_cv_scores_df.filter(like='test_', axis=0)
-skf_cv_scores_df_train = skf_cv_scores_df.filter(like='train_', axis=0)
+mm_skf_scores_df_all = pd.DataFrame(mm_skf_scoresD)
+mm_skf_scores_df_all
+mm_skf_scores_df_test = mm_skf_scores_df_all.filter(like='test_', axis=0)
+mm_skf_scores_df_train = mm_skf_scores_df_all.filter(like='train_', axis=0) # helps to see if you trust the results
diff --git a/__pycache__/MultClassPipe3.cpython-37.pyc b/__pycache__/MultClassPipe3.cpython-37.pyc
index f6693e841a0e02cfff93c1269f9c2151f8670707..c225213e9151606d369d26526629a2801f0f5fc0 100644
GIT binary patch
delta 1666
zcmZWp-EZ4e6t{02CvlUsO}{s`UAA^@#s&{;LsSJ#ZP|vZ^091dsX*a2_d0Rw*ug&O
zQt6sJ1YVGehWs!d`mjA90dJ^?SN;WF@xL+g&as_+Fl_nw+@H_+o!>e4%1_VzJzrie
zl?(;1pL%caj?Mj0o>lp8i;E<@PZrLcKh}3X<rnxoUub4bm0#qG->5rxIH@VsrIR1Y
zLMHqxd#|s>`FNeer-;uQKE6T}MJjs=S2<}Co;gzY)IgJ(nxY!dK1Z`Mi)e=H`@~!2
zIUoZ?YG$UX20Eg7M7MbXGKglMqXweA6z#|GOKBv~L6bTk;l(2o<REJ*pntChdB_cv
zpa6Na1{8+q2pFcqOT28Upg$rQSA>z_w-OYGUKvV5Z#4G!C?DhFcvp@!=$Ap~6MWKA
zA8D9A2BUE(pW@Se=9nau<9t@;O~U770qrSipgk=|(7q;%XwS$J+Ox8Z_M99=n?_PO
zhW_hv+$7DBpkgV(MDJ%=Sy#Sz{5c6GgDDu}=lgP!UyxILURDOWs=zp3kU31dD5o%u
z4Afv6!xrWAfZ*4PoR*4IC4mXlzjUMrGnOJJ<m8IdyDeuPPd-tQeg)}ggE`Er%2|F%
z&JA$maP<aEp|2)P)2Yw<mj@&!m{a8Jit<qX>W{71Zf_wi9+WF=508RD@w!1f%wuh8
zIh`)lxw)Kr)D#T5p77}~3JyDBnR1gpu%Pv7%+`l@Z-0F6E`^q$z`UmD13h4##dd9x
z<fkNQr6d5B(?Fn0u1DQ2VD;*=qG|N=iWuN@-ejJ~4k`LI;aS4}AFaQCP;VS8Qy)AA
zeao~2-=L;za}*14p1Zpz8bHCN%(mSJ)Ngf7(-xFDoO)>7Hs&(G4qTjxpH{e$su)t#
zV(yeWW?rhkqyqb`r@>CQ-A9cLW?!ZaBy76LA}Q3idV<FlxB%g`El1D?Ee5pV+I1v%
zeAD&XiN;RHKBP;Vmb1xvHRNJmOz>9Rh$F7F+<5bV3a8<6;iW=`Vr|RsgONz{1a6{}
zwz6(v_YKzpW;vGAr0uQ^RtE>ru(4b03bk6UL;Z#u?>h>if;C#SjRovfykHJ>%@lqy
z3%W5JbHncPm>Il@v__%<btdnyn!hx475_s~1}EfQOFXRUT@PFN4@gxD|IEMhZn4dJ
z^=F4K(pVQ<PsPxiRT32rgtzPZ792+7U0>8~wkz!Va0DSO$U>CczP0MQK7_}GyWu~D
zTk1^so>9w1S!d7P^}~n8_0y-K{IkOz>FHsNbbsH((-S^5;Lc9D-LA*$bP(&2kxZcO
zu(sIAr@rY#<<#p~9bw}(?ieXW9pNamFAk&Z0kgZ}7sZ3{_mOL7E1Nj)wRqH@U`lP{
z=Gy&_;-B)@Xq3q?U%YfxQ%Q2`Xf@SPb+t@%w6ER<(MTClP0f?=%i=lx04b`qlW&Vx
oi27N0QvP)*Mm+eh>EbfVub{xLE#A-J%ID#CqdO)2MS-IJFVJJ!u>b%7

delta 1048
zcmY+CO>fjj7{_PEUfb)nz1i1nHlcx#x7{|h7eWtFRS{5ofdYXLg`ky_$BvxrOKQ(X
z8)Zxm+f$`fRIbDYA-MJBRH1&1o=^|?6Ob$Y0Pr}wyv=C-`p<9X|IEmqm(%}GT9-{z
zC)jrW`)cz+`B$q#n}5&EF%72Ryhqn>M%UTQnd!&M`Yo7&S(w|FTnh70`-!fv09zro
z`tyfuMk>UHZ?~b-Y6RTo&Pg3hM*pPl94Se0Y!f(=gO+gwuP7lx-XTDNZ8MOL=?)DQ
zq0j-9A@!cBLPeSaWtaJvLFKGZgyKruG}Mr4NY^0EC8X+msw3?U=uv_{im_1Rvd|y{
z#xV=iJmnIu{6a&Wt9=q?xQ1%*^pKidcL|tad6X-U81A!pX1JE+#?Xy$bLev1ge;6e
z4tpL7P=uq>u^Q&Nm27}AjC%C6OaUrFcNvU{3~E&vsN=#!oe&o48IeW(OpKsTiX7@$
zk$2g4IxKo5EcJdD#Vh3g(+w7u!%?1vbD|7WVieAcVjsUEAAxC+#?;J_7{wUt)3Acu
zW<{mXa5*O`f(Qn4JdZb>KUTsqkBE{eFOlB&V(e+*8Nu>PSiTyLW3DEuP#5EU8M9gV
zoEOp4(@Ld*hcEgp$&V3HT_O+Zk5BF<`t4l}%i<$hxE}qc*Gr4|^JA&qZTXi@-ImS$
zucH_G)%?jDZ2p%CmF+I?)YPb&xl~u$jh)VB;9p1<jF#PNxQ@-cz7xdheaGMI1Rg)|
z@imd?=ZuK5#_iNe^2(wu<2zc8{xVjgl)1Pt5vywr?%Q5FR?s*BkJW>B08Iikuf5mh
z4RE7XlV6L?VApMI?Kgt0PI8*w>VVU1wC$D?YXk4LV{72{ygjGswVhZQgkoveIfzsH
zcC+jJLHxsL!n!zFxYlj*<)$43H-_7nSAW2(20auaMIEbtfzgk<#`2OvKkd3KGYgqa
zH6{Ajnp8pz*KR+5Z7nc*FFKq1b72FCpJ4y+QxQ0gwI#fBx9NPN;w*5Z*SYnqq{$yn
GF_gE`W)j^1

diff --git a/__pycache__/loopity_loop.cpython-37.pyc b/__pycache__/loopity_loop.cpython-37.pyc
index 54395659a2afb9c3a9d2481d7a70dd6ea54d304d..f1efd32f1fdfba334e5bcbbe9f04bfab4418c46e 100644
GIT binary patch
delta 1529
zcmZuxNlzR{6z=M2W`h9(voNg1&IDr!M#iz?#KHt80^~@8gM;FXYNo1Z8hW}X)io?~
zwK!l!xx`A;he#1pK#G)e#1F{Hr(AQ0<`~C^9Q+^hdY0HIlvJ<Z`}!^4t5>Q&MqZA_
zR${TJ0*i>7`QN^erO}taPCr0{2HLqr)3l!s6azY<gLLQ_+F7P*R>=%sJVO&|BxvP!
z?Y#3()cZLwmMUm|5?Ir)J7C|3?JRnK1s3K4G`Oeo6&e!ii6S*UP(*G}251=2$21}W
zfVQ<z??deye5#2~5v08|ZlE8u=g7S#x^xAQ1WnS^sp`GbwvOqT?AFy{yW300-MFE+
zi9;-VSCuDyU#M=<P07UBHJLON1DzwJe22cWwm}}AN%gE-DiWQmt6)EQ4&HrkS}MCe
zvM;UFzv;B1DsI0!AW;+bzYTQI9fB+y@)$}%_)4Rg=%o`fNhf9eL_>;*(<vF!RXQzG
zKvAVL-y(Nd4xc4tzZ^VQkpg;B4xA%*M5N@9963>4Ec%+-s7N=pG11@D#>GHWn-GJ(
zb_^ktG$V$fBuXjnluXEET7jOYT0H})cixa{{FhU8pC12A-bU!G#Ep}>gkG1USD<rp
z%%JsSz2tmDj$h$CFDI@*7vy9Eg~9x}t;jeXUsl{WSU-Z5fP3~XSy#Q8(CxJYLzJ*#
zRjLB(jEE{{4&MG`>*I%ywy-EMEC??$f%geFNZw>o4682gRR|~L=6~<+(5+)^bDU+|
zV9x)%oW1>AVgC+x1Si7K4Mx|oZkrS;3qQ=x@3Mlx!p6ij?E~zTs=976Oe~5y2)hiS
zL;xKdSBTRna8Ddn*qv62?D3o2e(IUbTP%QnuHkJ}%RsR}%<H%S35#~!s23_5hm86a
zC=-mA4U6G}5)rsyn>omBIl9fEVc=M)m`8YK)37$lVHR=`4hh!iyx)n%=1R7}%qrNF
z@dk+|Q`2xnw9bk%IBBcV#;O6G7i>!q!!oR*w-8RxI|bVx1Qf)K6iT=Zxy(Arx|#j2
zRlM%^>OBfCC&uAXs0E-uK2g`8JwEB3hd)v?n)f=q{9Z?y9Oha>u4$wyY>vQnEu+>V
z`;6yp#}G%g?yAFbCdo51*KERj7`celLXRJ;*tR3QRP;rp)@Jz+6mR!lMn9XJu7&GY
zC`a{LuyJE*L1#~gnb*Rtv-ep0iTAqw1$ydTblk-8&8jKxn=s5zjS9=IuWi_NW!8V#
zPC+PfPmL``f=ERgimGw%ZfrP(f+!l&kfwH`c%UC8Q8LhjI+3>@%Y+ekAu4m*`z3Z4
z?Rc+ZOS3!O&YsB#w{BR>D3<azcewwO?zFdXXTQwbe8gMr{2HQu@7x5L?)rRj9Z<gE
g3qLJ?dRa&?YvK1{Dpiwx5CY3t@_y=C3ZQW4U!1+Mz5oCK

delta 1277
zcmZux&2Jk;6rY)0e<pE0ZO4w2tn+P>hES>+38+w8QlTaiQV}hP%GzXholVxeW@a44
zXtq8CA%sK)nnN!=Bp1Y?5<-=@^bG$%;j&k_^iYWdh!cXhw%dTzk>>61{pP*joA=i1
zU&}ww4qYA^N+Vc@zijM0{d%Z?Z+*Xf4d-Z~uF;}R=oBq|ia)$a$qiJQe)t7mAnK7e
zQ+g65?dnk`tQFV;u&==8m(=gt1}3VkXY+(64+y(OQ$jmNGG=Ra9K<D2+xq7-bo_oy
z#OVahI{2P;7Y8FEV<TX4H1A;br~Y22o({&vu#KcfN9o*sBI5T51WwTTAnTwYcVx)%
ztLXORM<mDxlQMTQBJ&P%@F~XVEBx^%H^CmA$cd9ti6lNHSHZu17orP6QKJ1ZSty|H
zvsgd`1yeyu;yxQaJ3V?jm;srss1&4M?iCZ`bV26nqRbxa7>NnGBoi>MWjSe+I-x6H
z;$T+Jo@C{eoIWKO0nW+NDGm&gmow5hCP7(D_R*Xu^wGR1_R)fv>h$5FDD}}2t%zwT
ziSsB}mN}U(An1AdS<i5(m(DKLJD1Ddn6YRk|7GZ^EcYh$KXy&dJ!d;0-%F?G<ot8`
zj9r%tFJNDmi!Wd|<Wi4?7xUXqB(rqBih?Y7-+-5cpVuqqYZ#%l0qZTJX);q)6RVw-
zx+S==>wAY56BaduZ+OjhlQFYx*giA3V5V?v2ac}WzDw5()1$_o<u<wDP|Jg}w&9r#
zi$|$uv+1@$&22S212R1h8VwNey-K%ouY6R!VYF+0D<+yPv8ldITztKE-nGmUE#~nq
zZnRwCG+oQsbxqFUCc9lZqv2D_HTbU2EUxxb)odBQj!^Sj4K~+B5Xa`#52?2kPyT^b
ziK)L+)z!FH+xK^PIM^_cYPMxUA&ZB)*%E%3f*I`i+!1XClR~PIeweM0FzMBJ({%(7
z6MU~_+peW513xBM^hvTY_1D1rYb#;W@t~6$wZl}qCQJrn-3*h5CaZ~d(+bo4!0xLk
zHh8RS;Hy6df5K8d8hW)fvJG8ci5h&@X<9d~Z&pB?pMXKbDw8>%A~=q9oF-%Hoy^P>
zN$YVSlA4ZnErTbtBF^EQHik1q-N{sv8oL9Tm8$wK^Ck|})69j{aFidomdU(z&vNQ}
zJ3iwq`hN)rp7H#!!D11kwuV20tjELKV9O&PoZAAntH@-8&P))6C*Ocq(Q>Uz3Gn!N
K^<d-zj{ghw>SCt=

diff --git a/base_estimator.py b/base_estimator.py
index de9ddbb..275bb50 100644
--- a/base_estimator.py
+++ b/base_estimator.py
@@ -138,6 +138,14 @@ parameters = [
         #'tfidf__stop_words': [None],
         'clf__estimator__alpha': (1e-2, 1e-3, 1e-1),
     },
+    
+    {
+        'clf__estimator': [LogisticRegression()],
+        'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
+        'penalty': ['none', 'l1', 'l2', 'elasticnet'],
+        'max_iter': list(range(100,800,100)),
+        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
+    },
 ]
 
 pipeline = Pipeline([
diff --git a/imports.py b/imports.py
index 928f59e..62ba294 100644
--- a/imports.py
+++ b/imports.py
@@ -17,8 +17,12 @@ from sklearn.neighbors import KNeighborsClassifier
 from sklearn.svm import SVC
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
+from sklearn.ensemble import AdaBoostClassifier
+from sklearn.ensemble import GradientBoostingClassifier
 from sklearn.neural_network import MLPClassifier
 from xgboost import XGBClassifier
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.linear_model import SGDClassifier
 from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
 
 from sklearn.compose import ColumnTransformer
@@ -52,11 +56,29 @@ from imblearn.over_sampling import RandomOverSampler
 from imblearn.over_sampling import SMOTE
 from imblearn.pipeline import Pipeline
 #from sklearn.datasets import make_classification
-from sklearn.model_selection import cross_validate
+from sklearn.model_selection import cross_validate, cross_val_score
 from sklearn.model_selection import RepeatedStratifiedKFold
 from sklearn.ensemble import AdaBoostClassifier
 from imblearn.combine import SMOTEENN
 from imblearn.under_sampling import EditedNearestNeighbours
+
+from sklearn.model_selection import GridSearchCV
+from sklearn.base import BaseEstimator
+
+scoring_fn =  ({'accuracy'      : make_scorer(accuracy_score)
+                 , 'fscore'     : make_scorer(f1_score)
+                 , 'mcc'        : make_scorer(matthews_corrcoef)
+                 ,  'precision' : make_scorer(precision_score)
+                 ,  'recall'    : make_scorer(recall_score)
+                 ,  'roc_auc'   : make_scorer(roc_auc_score)
+            }) 
+  
+rs = {'random_state': 42}
+njobs = {'n_jobs': 10}
+skf_cv = StratifiedKFold(n_splits = 10
+                          #, shuffle = False, random_state= None)
+                           , shuffle = True,**rs)
+
 #%%
 homedir = os.path.expanduser("~")
 os.chdir(homedir + "/git/ML_AI_training/")
@@ -64,8 +86,8 @@ os.chdir(homedir + "/git/ML_AI_training/")
 # my function
 from MultClassPipe import MultClassPipeline
 from MultClassPipe2 import MultClassPipeline2
-from loopity_loop import MultClassPipeSKF
-from MultClassPipe3 import MultClassPipelineCV
+from loopity_loop import MultClassPipeSKFLoop
+from MultClassPipe3 import MultClassPipeSKFCV
 
 
 gene = 'pncA'
@@ -199,3 +221,16 @@ cat_df_wtgt.shape
 
 all_df_wtgt = my_df[numerical_FN + categorical_FN + ['mutation_class']]
 all_df_wtgt.shape
+
+#%%
+#%% Get train-test split and scoring functions
+X = num_df_wtgt[numerical_FN]
+y = num_df_wtgt['mutation_class']
+
+X_train, X_test, y_train, y_test = train_test_split(X
+                                            ,y
+                                            , test_size    = 0.33
+                                            , random_state = 2
+                                            , shuffle      = True
+                                            , stratify     = y)
+ 
\ No newline at end of file
diff --git a/loopity_loop.py b/loopity_loop.py
index b4f00e7..a0afc35 100644
--- a/loopity_loop.py
+++ b/loopity_loop.py
@@ -33,23 +33,30 @@ from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoe
 from statistics import mean, stdev, median, mode
 #%%
 rs = {'random_state': 42}
+njobs = {'n_jobs': 10}
+   
 # Done: add preprocessing step with one hot encoder
-# TODO: supply stratified K-fold cv train and test data
+# TODO: supply stratified K-fold cv train and test data
 # TODO: get accuracy and other scores through K-fold cv
 
 # Multiple Classification - Model Pipeline
-def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical','mixed'], skf_splits = 10):
+def MultClassPipeSKFLoop(input_df, target, skf_cv, var_type = ['numerical','categorical','mixed']):
 
     '''
     @ param input_df: input features 
-    @ type: df (gets converted to np.array for stratified Kfold, and helps identify names to apply column transformation)
+    @ type: df with input features WITHOUT the target variable
     
-    @param y_outputF: target (or output) feature
-    @type: df or np.array
+    @param target: target (or output) feature
+    @type: df or np.array or Series
     
+    @param skf_cv: stratified K-fold splitter object, so that shuffle and random state can be passed (its .split() method is called)
+    @type: StratifiedKFold()
+    
+    @param var_type: numerical, categorical or mixed; determines which col_transform to apply (MinMaxScaler and/or one-hot encoder)
+    @type: list
 
     returns
-    multiple classification model scores
+    Dict containing multiple classification scores for each model and each Stratified Kfold
        
     '''
     # Determine categorical and numerical features
@@ -86,17 +93,17 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
                           , n_estimators     = 150
                           , bootstrap        = True
                           , oob_score        = True
-                          , n_jobs           = -1
+                          , **njobs
                           , **rs
                           , max_features     = 'auto')
     
-    xgb = XGBClassifier(**rs, verbosity = 0)
+    xgb = XGBClassifier(**rs, verbosity = 0, use_label_encoder = False)
     classification_metrics = {
         'F1_score': []
         ,'MCC': []
         ,'Precision': []
         ,'Recall': []
-        ,'Accuracy': []
+        , 'Accuracy': []
         ,'ROC_AUC': []
         }
     models = [
@@ -109,33 +116,29 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
              , ('Extra Trees'        , et) 
              , ('Random Forest'      , rf) 
              , ('Naive Bayes'        , nb)
-
-            , ('Random Forest2'     , rf2) 
-            #, ('XGBoost'            , xgb)
+             , ('Random Forest2'     , rf2) 
+             , ('XGBoost'            , xgb)
             ]
 
-    skf = StratifiedKFold(n_splits = skf_splits
-                          , shuffle = True
-                          , **rs)
+    # skf = StratifiedKFold(n_splits = 10
+    #                       #, shuffle = False, random_state= None)
+    #                       , shuffle = True,**rs)
 
-#    skf_dict = {}
     fold_no = 1
     fold_dict={}
 
-
     for model_name, model in models:
         fold_dict.update({ model_name: {}})
 
     #scores_df = pd.DataFrame()
-    for train_index, test_index in skf.split(input_df, y_targetF):
+    for train_index, test_index in skf_cv.split(input_df, target):
         x_train_fold, x_test_fold = input_df.iloc[train_index], input_df.iloc[test_index]
-        y_train_fold, y_test_fold = y_targetF.iloc[train_index], y_targetF.iloc[test_index]
+        y_train_fold, y_test_fold = target.iloc[train_index], target.iloc[test_index]
         #print("Fold: ", fold_no, len(train_index), len(test_index))
 
         for model_name, model in models:
             print("\nStart of model", model_name, "\nLoop no.", fold_no)
-            #skf_dict.update({model_name: classification_metrics })
-            model_pipeline = Pipeline(steps=[('prep'         , col_transform)
+            model_pipeline = Pipeline(steps=[('prep'          , col_transform)
                                               , ('classifier' , model)])
             model_pipeline.fit(x_train_fold, y_train_fold)
             y_pred_fold  = model_pipeline.predict(x_test_fold)
@@ -168,14 +171,4 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
             fold_dict[model_name][fold].update({'ROC_AUC'   : roc_auc})
             
         fold_no +=1
-        #pp.pprint(skf_dict)
-
-    return(fold_dict)
-
-#%% CAll function 
-# t3_res = MultClassPipeSKF(input_df = numerical_features_df
-#                           , y_targetF = target1
-#                           , var_type = 'numerical'
-#                           , skf_splits = 10)
-# pp.pprint(t3_res)
-# #print(t3_res)
+    return(fold_dict)
\ No newline at end of file
diff --git a/loopity_loop_CALL.py b/loopity_loop_CALL.py
index 00e33b1..e70763e 100644
--- a/loopity_loop_CALL.py
+++ b/loopity_loop_CALL.py
@@ -5,22 +5,19 @@ Created on Fri Mar 11 11:15:50 2022
 
 @author: tanu
 """
-#%%
-del(t3_res)
-# t3_res = MultClassPipeSKF(input_df = numerical_features_df
-#                           , y_targetF = target1
-#                           , var_type = 'numerical'
-#                           , skf_splits = 10)
-# pp.pprint(t3_res)
-# #print(t3_res)
+#%% variables
+rs = {'random_state': 42}
 
-t3_res = MultClassPipeSKF(input_df = num_df_wtgt[numerical_FN]
-                          , y_targetF = num_df_wtgt['mutation_class']
+skf_cv = StratifiedKFold(n_splits = 10
+                          #, shuffle = False, random_state= None)
+                          , shuffle = True,**rs)
+#%% MultClassPipeSKFLoop: function call()
+t3_res = MultClassPipeSKFLoop(input_df = num_df_wtgt[numerical_FN]
+                          , target = num_df_wtgt['mutation_class']
                           , var_type = 'numerical'
-                          , skf_splits = 10)
+                          , skf_cv = skf_cv)
 pp.pprint(t3_res)
 #print(t3_res)
-
 ################################################################
 # extract items from wwithin a nested dict
 #%% Classification Metrics we need to mean()