From e28a296d98d27f12272b9f5b2689332859e75fa9 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall
Date: Wed, 16 Mar 2022 10:11:13 +0000
Subject: [PATCH] saving work: multi-metric CV scoring via cross_validate

---
 MultClassPipe2.py                         |   6 -
 MultClassPipe3.py                         | 250 ++++++++++------------
 __pycache__/MultClassPipe2.cpython-37.pyc | Bin 2884 -> 2884 bytes
 __pycache__/MultClassPipe3.cpython-37.pyc | Bin 3790 -> 4083 bytes
 __pycache__/loopity_loop.cpython-37.pyc   | Bin 3913 -> 4052 bytes
 imports.py                                |   3 +
 loopity_loop.py                           |  42 ++--
 loopity_loop_CALL.py                      |  64 ++----
 8 files changed, 153 insertions(+), 212 deletions(-)

diff --git a/MultClassPipe2.py b/MultClassPipe2.py
index 9fe4619..20261c2 100644
--- a/MultClassPipe2.py
+++ b/MultClassPipe2.py
@@ -77,12 +77,6 @@ def MultClassPipeline2(X_train, X_test, y_train, y_test, input_df):
 
     for clf_name, clf in clfs:
 #%%
-        # pipeline = Pipeline(steps=[
-        #     ('scaler', MinMaxScaler()),
-        #     #('scaler', StandardScaler()),
-        #     ('classifier', clf)
-        #     ]
-        # )
         # define the data preparation for the columns
         t = [('cat', OneHotEncoder(), categorical_ix)
              , ('num', MinMaxScaler(), numerical_ix)]

diff --git a/MultClassPipe3.py b/MultClassPipe3.py
index b5570ae..4dfdc5b 100644
--- a/MultClassPipe3.py
+++ b/MultClassPipe3.py
@@ -6,51 +6,79 @@ Created on Fri Mar 4 15:25:33 2022
 @author: tanu
 """
 #%%
+
 import os, sys
 import pandas as pd
 import numpy as np
-from sklearn.linear_model import LogisticRegression
+import pprint as pp
+#from copy import deepcopy
+from sklearn import linear_model
+from sklearn.linear_model import LogisticRegression, LinearRegression
 from sklearn.naive_bayes import BernoulliNB
 from sklearn.neighbors import KNeighborsClassifier
 from sklearn.svm import SVC
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
 from sklearn.neural_network import MLPClassifier
-from sklearn.pipeline import Pipeline
 from xgboost import XGBClassifier
+from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
+
 from sklearn.compose import ColumnTransformer
-from sklearn.preprocessing import StandardScaler
-from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
+from sklearn.compose import make_column_transformer
+
+from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
+from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef
+from sklearn.metrics import make_scorer
+from sklearn.metrics import classification_report
+
+from sklearn.metrics import average_precision_score
 from sklearn.model_selection import cross_validate
 from sklearn.model_selection import train_test_split
 from sklearn.model_selection import StratifiedKFold
-from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
-from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef
+from sklearn.pipeline import Pipeline
+from sklearn.pipeline import make_pipeline
+
+from sklearn.feature_selection import RFE
+from sklearn.feature_selection import RFECV
+import itertools
+import seaborn as sns
+import matplotlib.pyplot as plt
+import numpy as np  # duplicate import: numpy is already imported at the top
+print(np.__version__)
+print(pd.__version__)
 from statistics import mean, stdev, median, mode
+
+from imblearn.over_sampling import RandomOverSampler
+from imblearn.over_sampling import SMOTE
+from imblearn.pipeline import Pipeline  # note: shadows the sklearn Pipeline imported above (imblearn's is API-compatible)
+#from sklearn.datasets import make_classification
+from sklearn.model_selection import cross_validate  # duplicate import: see above
+from sklearn.model_selection import RepeatedStratifiedKFold
+from sklearn.ensemble import AdaBoostClassifier
+from imblearn.combine import SMOTEENN
+from imblearn.under_sampling import EditedNearestNeighbours
+
 #%%
 rs = {'random_state': 42}
 # Done: add preprocessing step with one hot encoder
-# TODO: supply stratified K-fold cv train and test data
-# TODO: get accuracy and other scores through K-fold cv
+# Done: get accuracy and other scores through K-fold stratified cv
+
+scoring_fn = ({ 'fscore'    : make_scorer(f1_score)
+              , 'mcc'       : make_scorer(matthews_corrcoef)
+              , 'precision' : make_scorer(precision_score)
+              , 'recall'    : make_scorer(recall_score)
+              , 'accuracy'  : make_scorer(accuracy_score)
+              , 'roc_auc'   : make_scorer(roc_auc_score)  # scores hard 0/1 predictions; pass needs_threshold=True for probability-based AUC
+              #, 'jaccard'  : make_scorer(jaccard_score)
+              })
+
 # Multiple Classification - Model Pipeline
-def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical','mixed'], skf_splits = 10):
+def MultClassPipelineCV(X_train, X_test, y_train, y_test, input_df, var_type = ['numerical', 'categorical','mixed']):
 
-    '''
-    @ param input_df: input features
-    @ type: df (gets converted to np.array for stratified Kfold, and helps identify names to apply column transformation)
-
-    @param y_outputF: target (or output) feature
-    @type: df or np.array
-
-    returns
-    multiple classification model scores
-
-    '''
-    # Determine categorical and numerical features
+    # determine categorical and numerical features
     numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
     numerical_ix
     categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
@@ -69,129 +97,67 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
     col_transform = ColumnTransformer(transformers = t
                                       , remainder='passthrough')
 
-
-#%% Define classification models to run
+
+    #%%
     log_reg = LogisticRegression(**rs)
-    nb = BernoulliNB()
-    knn = KNeighborsClassifier()
-    svm = SVC(**rs)
-    mlp = MLPClassifier(max_iter = 500, **rs)
-    dt = DecisionTreeClassifier(**rs)
-    et = ExtraTreesClassifier(**rs)
-    rf = RandomForestClassifier(**rs)
-    rf2 = RandomForestClassifier(
-        min_samples_leaf = 50,
-        n_estimators = 150,
-        bootstrap = True,
-        oob_score = True,
-        n_jobs = -1,
-        random_state = 42,
-        max_features = 'auto')
+    nb = BernoulliNB()
+    knn = KNeighborsClassifier()
+    svm = SVC(**rs)
+    mlp = MLPClassifier(max_iter=500, **rs)
+    dt = DecisionTreeClassifier(**rs)
+    et = ExtraTreesClassifier(**rs)
+    rf = RandomForestClassifier(**rs)
+    rf2 = RandomForestClassifier(
+        min_samples_leaf=50,
+        n_estimators=150,
+        bootstrap=True,
+        oob_score=True,
+        n_jobs=-1,
+        random_state=42,
+        max_features='auto')  # 'auto' is deprecated in newer sklearn; for classifiers it equals 'sqrt'
 
-    xgb = XGBClassifier(**rs, verbosity = 0)
+    xgb = XGBClassifier(**rs, verbosity=0)
 
-    clfs = [
-        ('Logistic Regression' , log_reg)
-        #, ('Naive Bayes' , nb)
-        , ('K-Nearest Neighbors', knn)
-        , ('SVM' , svm)
-        , ('MLP' , mlp)
-        , ('Decision Tree' , dt)
-        , ('Extra Trees' , et)
-        , ('Random Forest' , rf)
-        , ('Naive Bayes' , nb)
-
-        #, ('Random Forest2' , rf2)
-        #, ('XGBoost' , xgb)
+    models = [
+        ('Logistic Regression', log_reg),
+        ('Naive Bayes', nb),
+        ('K-Nearest Neighbors', knn),
+        ('SVM', svm),
+        ('MLP', mlp),
+        ('Decision Tree', dt),
+        ('Extra Trees', et),
+        ('Random Forest', rf),
+        ('Random Forest2', rf2),
+        #('XGBoost', xgb)
     ]
-
-    skf = StratifiedKFold(n_splits = skf_splits
-                          , shuffle = True
-                          #, random_state = seed_skf
-                          , **rs)
-
-    X_array = np.array(input_df)
-    Y = y_targetF
-
-    # Initialise score metrics list to store skf results
-    # fscoreL = []
-    # mccL = []
-    # presL = []
-    # recallL = []
-    # accuL = []
-    # roc_aucL = []
-    skf_dict = {}
+
+    skf_cv_scores = {}
+
+    for model_name, model_fn in models:
+        print('\nModel_name:', model_name
+              , '\nModel func:' , model_fn
+              , '\nList of models:', models)
-        #scores_df = pd.DataFrame()
-    for train_index, test_index in skf.split(input_df, y_targetF):
-        x_train_fold, x_test_fold = input_df.iloc[train_index], input_df.iloc[test_index]
-        y_train_fold, y_test_fold = y_targetF.iloc[train_index], y_targetF.iloc[test_index]
-        #fscoreL = {}
 
+        # model_pipeline = Pipeline([
+        #     ('pre' , MinMaxScaler())
+        #     , ('model' , model_fn)])
+
+        model_pipeline = Pipeline([
+            ('prep' , col_transform)
+            , ('model' , model_fn)])
+
+        print('Running model pipeline:', model_pipeline)
+        skf_cv = cross_validate(model_pipeline
+                                , X_train
+                                , y_train
+                                , cv = 10  # integer cv uses StratifiedKFold for classifiers
+                                , scoring = scoring_fn
+                                , return_train_score = True)
+        skf_cv_scores[model_name] = {}
+        for key, value in skf_cv.items():
+            print('\nkey:', key, '\nvalue:', value)
+            print('\nmean value:', mean(value))
+            skf_cv_scores[model_name][key] = round(mean(value),2)
+    #pp.pprint(skf_cv_scores)
+    return(skf_cv_scores)
-    # for train_index, test_index in skf.split(X_array, Y):
-    #     print('\nSKF train index:', train_index
-    #           , '\nSKF test index:', test_index)
-    #     x_train_fold, x_test_fold = X_array[train_index], X_array[test_index]
-    #     y_train_fold, y_test_fold = Y[train_index], Y[test_index]
-
-        clf_scores_df = pd.DataFrame()
-
-        for clf_name, clf in clfs:
-            print('\nRunning the following classification models'
-                  , clf_name)
-
-            model_pipeline = Pipeline(steps=[('prep' , col_transform)
-                                             , ('classifier' , clf)])
-
-            # model_pipeline = Pipeline(steps=[('prep' , MinMaxScaler())
-            #                                  , ('classifier' , clf)])
-
-
-            model_pipeline.fit(x_train_fold, y_train_fold)
-            y_pred_fold = model_pipeline.predict(x_test_fold)
-
-            #----------------
-            # Model metrics
-            #----------------
-            # F1-Score
-            fscore = f1_score(y_test_fold, y_pred_fold)
-            fscoreL[clf_name].append(fscore)
-            print('fscoreL Len: ', len(fscoreL))
-            #fscoreM = mean(fscoreL[clf])
-
-            # Matthews correlation coefficient
-            mcc = matthews_corrcoef(y_test_fold, y_pred_fold)
-            mccL[clf_name].append(mcc)
-            mccM = mean(mccL)
-
-            # # Precision
-            # pres = precision_score(y_test_fold, y_pred_fold)
-            # presL.append(pres)
-            # presM = mean(presL)
-
-            # # Recall
-            # recall = recall_score(y_test_fold, y_pred_fold)
-            # recallL.append(recall)
-            # recallM = mean(recallL)
-
-            # # Accuracy
-            # accu = accuracy_score(y_test_fold, y_pred_fold)
-            # accuL.append(accu)
-            # accuM = mean(accuL)
-
-            # # ROC_AUC
-            # roc_auc = roc_auc_score(y_test_fold, y_pred_fold)
-            # roc_aucL.append(roc_auc)
-            # roc_aucM = mean(roc_aucL)
-
-            clf_scores_df = clf_scores_df.append({'Model'    : clf_name
-                                                  ,'F1_score' : fscoreM
-                                                  , 'MCC'      : mccM
-                                                  , 'Precision': presM
-                                                  , 'Recall'   : recallM
-                                                  , 'Accuracy' : accuM
-                                                  , 'ROC_curve': roc_aucM}
-                                                  , ignore_index = True)
-    return(clf_scores_df)
-    #scores_df = scores_df.append(clf_scores_df)
-# return clf_scores_df
\ No newline at end of file
diff --git a/__pycache__/MultClassPipe2.cpython-37.pyc b/__pycache__/MultClassPipe2.cpython-37.pyc
index 1cb8e8b5a76b4bb4ab20671f5f475d253bb5bcc7..581275298e03563a94d7aef5bad576be8b668d18 100644
Binary files a/__pycache__/MultClassPipe2.cpython-37.pyc and b/__pycache__/MultClassPipe2.cpython-37.pyc differ
diff --git a/__pycache__/MultClassPipe3.cpython-37.pyc b/__pycache__/MultClassPipe3.cpython-37.pyc
Binary files a/__pycache__/MultClassPipe3.cpython-37.pyc and b/__pycache__/MultClassPipe3.cpython-37.pyc differ
diff --git a/__pycache__/loopity_loop.cpython-37.pyc b/__pycache__/loopity_loop.cpython-37.pyc
index effcb8e2a71fc3a22cb525e22f0b56b6584a7a2b..54395659a2afb9c3a9d2481d7a70dd6ea54d304d 100644
Binary files a/__pycache__/loopity_loop.cpython-37.pyc and b/__pycache__/loopity_loop.cpython-37.pyc differ
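Editor's aside, not part of the patch: the new MultClassPipelineCV above hinges on sklearn's multi-metric cross_validate. Below is a minimal, self-contained sketch of that pattern under synthetic stand-in data; every name in it is illustrative, not taken from the repository.

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, matthews_corrcoef, make_scorer
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# synthetic mixed-type data standing in for the real feature table
rng = np.random.default_rng(42)
X = pd.DataFrame({'num1': rng.normal(size=100),
                  'cat1': rng.choice(['a', 'b'], size=100)})
y = rng.integers(0, 2, size=100)

# preprocessing + model in one Pipeline, as MultClassPipelineCV does
prep = ColumnTransformer([('num', MinMaxScaler(), ['num1']),
                          ('cat', OneHotEncoder(), ['cat1'])])
pipe = Pipeline([('prep', prep), ('model', LogisticRegression())])

scoring = {'fscore': make_scorer(f1_score),
           'mcc': make_scorer(matthews_corrcoef)}
cv_out = cross_validate(pipe, X, y, cv=10, scoring=scoring,
                        return_train_score=True)
# cv_out keys: fit_time, score_time, test_fscore, train_fscore, test_mcc, train_mcc
print({k: round(v.mean(), 2) for k, v in cv_out.items()})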
diff --git a/imports.py b/imports.py
index a4c029e..928f59e 100644
--- a/imports.py
+++ b/imports.py
@@ -8,6 +8,7 @@ Created on Sun Mar 6 13:41:54 2022
 import os, sys
 import pandas as pd
 import numpy as np
+import pprint as pp
 #from copy import deepcopy
 from sklearn import linear_model
 from sklearn.linear_model import LogisticRegression, LinearRegression
@@ -64,6 +65,8 @@ os.chdir(homedir + "/git/ML_AI_training/")
 from MultClassPipe import MultClassPipeline
 from MultClassPipe2 import MultClassPipeline2
 from loopity_loop import MultClassPipeSKF
+from MultClassPipe3 import MultClassPipelineCV
+
 gene = 'pncA'
 drug = 'pyrazinamide'
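With imports.py now exposing MultClassPipelineCV, a hedged sketch of a call site follows. It is hypothetical, not part of the patch: num_df_wtgt and numerical_FN are assumed to come from the surrounding analysis scripts (they appear in loopity_loop_CALL.py below), and the split parameters are illustrative.

import pprint as pp
from sklearn.model_selection import train_test_split
from MultClassPipe3 import MultClassPipelineCV

# num_df_wtgt / numerical_FN are assumed to be defined by the calling script
X_train, X_test, y_train, y_test = train_test_split(
    num_df_wtgt[numerical_FN],
    num_df_wtgt['mutation_class'],
    test_size=0.33,
    stratify=num_df_wtgt['mutation_class'],
    random_state=42)

# CV scores come from X_train/y_train; X_test/y_test are currently unused
cv_scores = MultClassPipelineCV(X_train, X_test, y_train, y_test,
                                input_df=num_df_wtgt[numerical_FN],
                                var_type='numerical')
pp.pprint(cv_scores)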
diff --git a/loopity_loop.py b/loopity_loop.py
index 17fd851..b4f00e7 100644
--- a/loopity_loop.py
+++ b/loopity_loop.py
@@ -82,13 +82,13 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
     et = ExtraTreesClassifier(**rs)
     rf = RandomForestClassifier(**rs)
     rf2 = RandomForestClassifier(
-        min_samples_leaf = 50,
-        n_estimators = 150,
-        bootstrap = True,
-        oob_score = True,
-        n_jobs = -1,
-        random_state = 42,
-        max_features = 'auto')
+        min_samples_leaf = 50
+        , n_estimators = 150
+        , bootstrap = True
+        , oob_score = True
+        , n_jobs = -1
+        , **rs
+        , max_features = 'auto')
 
     xgb = XGBClassifier(**rs, verbosity = 0)
     classification_metrics = {
         'F1_score': []
         ,'MCC': []
         ,'Precision': []
         ,'Recall': []
         ,'Accuracy': []
-        #,'ROC_AUC': []
+        ,'ROC_AUC': []
         }
     models = [
         ('Logistic Regression' , log_reg)
         , ('Naive Bayes' , nb)
         , ('K-Nearest Neighbors', knn)
         , ('SVM' , svm)
-        # , ('MLP' , mlp)
-        # , ('Decision Tree' , dt)
-        # , ('Extra Trees' , et)
-        # , ('Random Forest' , rf)
-        # , ('Naive Bayes' , nb)
+        , ('MLP' , mlp)
+        , ('Decision Tree' , dt)
+        , ('Extra Trees' , et)
+        , ('Random Forest' , rf)
+        , ('Naive Bayes' , nb)  # note: 'Naive Bayes' now appears twice in this list, so it runs twice
 
-        #, ('Random Forest2' , rf2)
+        , ('Random Forest2' , rf2)
         #, ('XGBoost' , xgb)
     ]
 
@@ -118,7 +118,7 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
                           , shuffle = True
                           , **rs)
 
-    skf_dict = {}
+# skf_dict = {}
 
     fold_no = 1
     fold_dict={}
@@ -145,12 +145,12 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
         #----------------
         fscore = f1_score(y_test_fold, y_pred_fold)
         mcc = matthews_corrcoef(y_test_fold, y_pred_fold)
-        #pres = precision_score(y_test_fold, y_pred_fold)
-        #recall = recall_score(y_test_fold, y_pred_fold)
-        pres = precision_score(y_test_fold, y_pred_fold, zero_division=0)
-        recall = recall_score(y_test_fold, y_pred_fold, zero_division=0)
+        pres = precision_score(y_test_fold, y_pred_fold)  # note: dropping zero_division=0 re-enables UndefinedMetricWarning on one-class folds
+        recall = recall_score(y_test_fold, y_pred_fold)
+        #pres = precision_score(y_test_fold, y_pred_fold, zero_division=0)
+        #recall = recall_score(y_test_fold, y_pred_fold, zero_division=0)
         accu = accuracy_score(y_test_fold, y_pred_fold)
-        #roc_auc = roc_auc_score(y_test_fold, y_pred_fold)
+        roc_auc = roc_auc_score(y_test_fold, y_pred_fold)
 
         fold=("fold_"+str(fold_no))
 
@@ -165,7 +165,7 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
         fold_dict[model_name][fold].update({'Precision' : pres})
         fold_dict[model_name][fold].update({'Recall' : recall})
         fold_dict[model_name][fold].update({'Accuracy' : accu})
-        #fold_dict[model_name][fold].update({'ROC_AUC' : roc_auc})
+        fold_dict[model_name][fold].update({'ROC_AUC' : roc_auc})
 
         fold_no +=1
     #pp.pprint(skf_dict)
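For orientation, a self-contained sketch (not part of the patch) of the manual per-fold pattern that MultClassPipeSKF implements: a StratifiedKFold loop collecting per-fold metrics into a nested dict. The data and the single toy model are synthetic placeholders.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, matthews_corrcoef
from sklearn.model_selection import StratifiedKFold

X, y = make_classification(n_samples=200, n_features=5, random_state=42)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# nested shape mirrors fold_dict: {model: {fold: {metric: value}}}
fold_dict = {'toy model': {}}
clf = LogisticRegression(random_state=42)
for fold_no, (train_ix, test_ix) in enumerate(skf.split(X, y), start=1):
    clf.fit(X[train_ix], y[train_ix])
    y_pred = clf.predict(X[test_ix])
    fold_dict['toy model'][f'fold_{fold_no}'] = {
        'F1_score': f1_score(y[test_ix], y_pred),
        'MCC': matthews_corrcoef(y[test_ix], y_pred),
    }
print(fold_dict['toy model']['fold_1'])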
diff --git a/loopity_loop_CALL.py b/loopity_loop_CALL.py
index 5f8833a..00e33b1 100644
--- a/loopity_loop_CALL.py
+++ b/loopity_loop_CALL.py
@@ -7,55 +7,32 @@ Created on Fri Mar 11 11:15:50 2022
 """
 #%%
 del(t3_res)
-t3_res = MultClassPipeSKF(input_df = numerical_features_df
-                          , y_targetF = target1
+# t3_res = MultClassPipeSKF(input_df = numerical_features_df
+#                           , y_targetF = target1
+#                           , var_type = 'numerical'
+#                           , skf_splits = 10)
+# pp.pprint(t3_res)
+# #print(t3_res)
+
+t3_res = MultClassPipeSKF(input_df = num_df_wtgt[numerical_FN]
+                          , y_targetF = num_df_wtgt['mutation_class']
                           , var_type = 'numerical'
                           , skf_splits = 10)
 pp.pprint(t3_res)
 #print(t3_res)
-#%% Manually: mean for each model, each metric
-model_name = 'Logistic Regression'
-model_name = 'Naive Bayes'
-model_name = 'K-Nearest Neighbors'
-model_name = 'SVM'
-#%%
-model_metric = 'F1_score'
-
-log_reg_f1 = []
-for key in t3_res[model_name]:
-    log_reg_f1.append(t3_res[model_name][key][model_metric])
-    log_reg_f1M = mean(log_reg_f1)
-    print('key:', key, model_metric, ':', log_reg_f1)
-print(log_reg_f1M)
-
-log_reg_f1df = pd.DataFrame({model_name: [log_reg_f1M]}, index = [model_metric])
-log_reg_f1df
-
-#%%
-model_metric = 'MCC'
-log_reg_mcc = []
-for key in t3_res[model_name]:
-    log_reg_mcc.append(t3_res[model_name][key][model_metric])
-    log_reg_mccM = mean(log_reg_mcc)
-    print('key:', key, model_metric, ':', log_reg_mcc)
-print(log_reg_mccM)
-
-log_reg_mccdf = pd.DataFrame({model_name: [log_reg_mccM]}, index = [model_metric])
-log_reg_mccdf
-#%%
 ################################################################
 # extract items from within a nested dict
 #%% Classification Metrics we need to mean()
-classification_metrics = {
-    'F1_score': []
-    ,'MCC': []
-    ,'Precision': []
-    ,'Recall': []
-    ,'Accuracy': []
-    }
+# classification_metrics = {
+#     'F1_score': []
+#     ,'MCC': []
+#     ,'Precision': []
+#     ,'Recall': []
+#     ,'Accuracy': []
+#     ,'ROC_AUC':[]
+#     }
 
 # "mean() of the current metric across all folds for this model"
-
 # the output containing all the metrics across all folds for this model
 out={}
 # Just the mean() for each of the above metrics-per-model
 out_means={}
 
 # Build up out{} from t3_res, which came from loopity_loop
 for model in t3_res:
     # NOTE: plain assignment only aliases a dict in Python, so build a fresh one per model
-    out[model]={'F1_score': [], 'MCC': [], 'Precision': [], 'Recall': [], 'Accuracy': []}
+    out[model]={'F1_score': [], 'MCC': [], 'Precision': [], 'Recall': [], 'Accuracy': [], 'ROC_AUC':[]}
     out_means[model]={}  # just to make life easier
     print(model)
     for fold in t3_res[model]:
-        for metric in {'F1_score': [], 'MCC': [], 'Precision': [], 'Recall': [], 'Accuracy': []}:
+        for metric in {'F1_score': [], 'MCC': [], 'Precision': [], 'Recall': [], 'Accuracy': [], 'ROC_AUC':[]}:  # dict literal iterated here only for its keys
             metric_value = t3_res[model][fold][metric]
             out[model][metric].append(metric_value)
 
 # now that we've built out{}, let's mean() each metric
 for model in out:
-    for metric in {'F1_score': [], 'MCC': [], 'Precision': [], 'Recall': [], 'Accuracy': []}:
+    for metric in {'F1_score': [], 'MCC': [], 'Precision': [], 'Recall': [], 'Accuracy': [], 'ROC_AUC':[]}:
         metric_mean = mean(out[model][metric])
         # just some debug output
         # print('model:', model
         #       , 'metric:', metric
         #       , 'metric_mean:', metric_mean
         #       )
         out_means[model].update({(metric+'_mean'): metric_mean })
 
 out_scores = pd.DataFrame(out_means)
+out_scores2 = round(out_scores, 2)
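The nested aggregation loops above could also be collapsed with pandas. An alternative sketch, not part of the patch, assuming the same {model: {fold: {metric: value}}} shape that MultClassPipeSKF returns; the example values are made up.

import pandas as pd

# stand-in for t3_res, with fabricated numbers purely for illustration
t3_res_example = {
    'Logistic Regression': {'fold_1': {'F1_score': 0.8, 'MCC': 0.6},
                            'fold_2': {'F1_score': 0.9, 'MCC': 0.7}},
}

# one frame per model: rows = folds, columns = metrics
frames = {model: pd.DataFrame.from_dict(folds, orient='index')
          for model, folds in t3_res_example.items()}

# column-wise means per model, matching the out_means/out_scores layout
out_means = {model: df.mean().add_suffix('_mean').round(2)
             for model, df in frames.items()}
out_scores = pd.DataFrame(out_means)  # metrics as rows, models as columns
print(out_scores)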