saving work from thinkpad

2022-07-08 13:53:17 +01:00 · 2022-07-08 13:53:17 +01:00 · 23799275a0
commit 23799275a0
parent 5577f5b195
3 changed files with 49 additions and 31 deletions
--- a/scripts/count_vars_ML.R
+++ b/scripts/count_vars_ML.R
@@ -4,7 +4,7 @@
 #source("~/git/LSHTM_analysis/config/embb.R")
 #source("~/git/LSHTM_analysis/config/gid.R")
 #source("~/git/LSHTM_analysis/config/katg.R")
-#source("~/git/LSHTM_analysis/config/pnca.R")
+source("~/git/LSHTM_analysis/config/pnca.R")
 source("~/git/LSHTM_analysis/config/rpob.R")

 #############################
@@ -55,7 +55,7 @@ if (check12) {
  cat('\nPASS: dst mode labels verified. merged_df3 CAN be trusted! ')
 }else{
  stop('FAIL: Something is wrong with the dst_mode column. Quitting!')
-``}
+}

 #==========================
 # CHECK: active site labels
@@ -189,6 +189,24 @@ if ( all( check12 && aa_check1 && aa_check2 && a1 && b1 && a2 && b2 && l1 && l2
    #quit()
 }

+#%%###################################################################
+# check merged_df3
+check_mdf3 = merged_df3[, cols_sel]
+  
+check_mdf3T = table(check_mdf3$mutationinformation, check_mdf3$dst_mode)
+ft_mdf3 = as.data.frame.matrix(check_mdf3T)
+
+#==================
+# CHECK: dst mode
+#===================
+dst_check_mdf3 = all((ft_mdf3[,1]==0)==(ft_mdf3[,2]!=0)); dst_check_mdf3
+
+sel = c("mutationinformation", "dst", "dst_mode")
+
+a = merged_df3[, sel]
+str(a)
+
+
 # write file
 # outfile_merged_df3 = paste0(outdir, '/', tolower(gene), '_merged_df3.csv')
 # outfile_merged_df3
--- a/scripts/ml/ml_functions/MultClfs.py
+++ b/scripts/ml/ml_functions/MultClfs.py
@@ -146,8 +146,8 @@ scoreBT_mapD = {'bts_mcc'          : 'MCC'
 def MultModelsCl(input_df, target
                       #, skf_cv
                       , sel_cv
-                       , blind_test_df
-                       , blind_test_target
+                       #, blind_test_df
+                       #, blind_test_target
                       , tts_split_type 

                       , resampling_type = 'none' # default
@@ -231,35 +231,36 @@ def MultModelsCl(input_df, target
    # Specify multiple Classification Models  
    #======================================================
    models = [('AdaBoost Classifier'     , AdaBoostClassifier(**rs) )
-               , ('Bagging Classifier'        , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True, verbose = 3, n_estimators = 100) )
-               , ('Decision Tree'             , DecisionTreeClassifier(**rs) ) 
-                , ('Extra Tree'                , ExtraTreeClassifier(**rs) )
-                , ('Extra Trees'               , ExtraTreesClassifier(**rs) ) 
-                , ('Gradient Boosting'         , GradientBoostingClassifier(**rs) )
-                , ('Gaussian NB'               , GaussianNB() )
-                , ('Gaussian Process'          , GaussianProcessClassifier(**rs) )
-                , ('K-Nearest Neighbors'       , KNeighborsClassifier() ) 
-                , ('LDA'                       , LinearDiscriminantAnalysis() )
-                , ('Logistic Regression'       , LogisticRegression(**rs) )
-                , ('Logistic RegressionCV'     , LogisticRegressionCV(cv = 3, **rs))
-                , ('MLP'                       , MLPClassifier(max_iter = 500, **rs) ) 
+               # , ('Bagging Classifier'        , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True, verbose = 3, n_estimators = 100) )
+               # , ('Decision Tree'             , DecisionTreeClassifier(**rs) ) 
+               #  , ('Extra Tree'                , ExtraTreeClassifier(**rs) )
+               #  , ('Extra Trees'               , ExtraTreesClassifier(**rs) ) 
+               #  , ('Gradient Boosting'         , GradientBoostingClassifier(**rs) )
+               #  , ('Gaussian NB'               , GaussianNB() )
+               #  , ('Gaussian Process'          , GaussianProcessClassifier(**rs) )
+               #  , ('K-Nearest Neighbors'       , KNeighborsClassifier() ) 
+                 , ('LDA'                       , LinearDiscriminantAnalysis() )
+               # , ('Logistic Regression'       , LogisticRegression(**rs) )
+               # , ('Logistic RegressionCV'     , LogisticRegressionCV(cv = 3, **rs))
+               # , ('MLP'                       , MLPClassifier(max_iter = 500, **rs) ) 
                #, ('Multinomial'               , MultinomialNB() )
-                , ('Naive Bayes'               , BernoulliNB() )
-                , ('Passive Aggresive'         , PassiveAggressiveClassifier(**rs, **njobs) )
-                , ('QDA'                       , QuadraticDiscriminantAnalysis() )
-                , ('Random Forest'             , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) ) 
-                # , ('Random Forest2'            , RandomForestClassifier(min_samples_leaf = 5
+                # , ('Naive Bayes'               , BernoulliNB() )
+                # , ('Passive Aggresive'         , PassiveAggressiveClassifier(**rs, **njobs) )
+                # , ('QDA'                       , QuadraticDiscriminantAnalysis() )
+                # , ('Random Forest'             , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) ) 
+                # # , ('Random Forest2'            , RandomForestClassifier(min_samples_leaf = 5
                #                                                         , n_estimators     = 1000
                #                                                         , bootstrap        = True
                #                                                         , oob_score        = True
                #                                                         , **njobs
                #                                                         , **rs
                #                                                         , max_features     = 'auto') ) 
-                 , ('Ridge Classifier'          , RidgeClassifier(**rs)  )
-                 , ('Ridge ClassifierCV'        , RidgeClassifierCV(cv = 3) )          
-                 , ('SVC'                       , SVC(**rs) ) 
-                 , ('Stochastic GDescent'       , SGDClassifier(**rs, **njobs) )
-                 , ('XGBoost'                   , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False, **njobs) )
+             #     , ('Ridge Classifier'          , RidgeClassifier(**rs)  )
+             #     , ('Ridge ClassifierCV'        , RidgeClassifierCV(cv = 3) )          
+             #     , ('SVC'                       , SVC(**rs) ) 
+             #     , ('Stochastic GDescent'       , SGDClassifier(**rs, **njobs) )
+             #     , ('XGBoost'                   , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False, **njobs) )
+             # 
             ]
                
    mm_skf_scoresD = {}
@@ -308,7 +309,7 @@ def MultModelsCl(input_df, target
        # ADD more info: meta data related to input df
        mm_skf_scoresD[model_name]['resampling']        = resampling_type
        mm_skf_scoresD[model_name]['n_training_size']   = len(input_df)
-        mm_skf_scoresD[model_name]['n_trainingY_ratio'] = round(Counter(target)[0]/Counter(target)[1], 2)
+        #mm_skf_scoresD[model_name]['n_trainingY_ratio'] = round(Counter(target)[0]/Counter(target)[1], 2)
        mm_skf_scoresD[model_name]['n_features']        = len(input_df.columns)
        mm_skf_scoresD[model_name]['tts_split']         = tts_split_type
        
@@ -357,7 +358,7 @@ def MultModelsCl(input_df, target
           # Build bts numbers dict
           btD = {'n_blindY_neg'    : Counter(blind_test_target)[0]
                  , 'n_blindY_pos'  : Counter(blind_test_target)[1]
-                  , 'n_testY_ratio' : round(Counter(blind_test_target)[0]/Counter(blind_test_target)[1], 2)
+                  #, 'n_testY_ratio' : round(Counter(blind_test_target)[0]/Counter(blind_test_target)[1], 2)
                  , 'n_test_size'   : len(blind_test_df) }
           
           # Update cmD+tnD dicts with btD
--- a/scripts/ml/ml_functions/test_func_singlegene.py
+++ b/scripts/ml/ml_functions/test_func_singlegene.py
@@ -58,8 +58,8 @@ all(df.columns.isin(['gene_name'])) # should be False


 spl_type = '70_30'
-spl_type = '80_20'
-spl_type = 'sl'
+#spl_type = '80_20'
+#spl_type = 'sl'

 df2 = split_tts(df
          , data_type = 'actual'
@@ -84,7 +84,6 @@ fooD = MultModelsCl(input_df = df2['X']
                , var_type = ['mixed']
                , scale_numeric = ['min_max']
                , return_formatted_output = False
-
                )

 for k, v in fooD.items():