From 23799275a0d1585f469e461a6cec9ca6aad573a6 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Fri, 8 Jul 2022 13:53:17 +0100
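Subject: [PATCH] saving work from thinkpad

Work-in-progress snapshot; summary of what this patch changes:

* scripts/count_vars_ML.R: source the pnca config (previously commented
  out), remove the stray backticks before a closing brace, and add a
  cross-tabulation check of mutationinformation against dst_mode on
  merged_df3.
* scripts/ml/ml_functions/MultClfs.py: comment out the blind_test_df and
  blind_test_target parameters of MultModelsCl, comment out every
  previously active classifier except AdaBoost and LDA, and comment out
  the n_trainingY_ratio and n_testY_ratio entries.
* scripts/ml/ml_functions/test_func_singlegene.py: keep only the '70_30'
  value for spl_type and drop a blank line in the MultModelsCl call.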

---
 scripts/count_vars_ML.R                       | 22 +++++++-
 scripts/ml/ml_functions/MultClfs.py           | 53 ++++++++++---------
 .../ml/ml_functions/test_func_singlegene.py   |  5 +-
 3 files changed, 49 insertions(+), 31 deletions(-)

diff --git a/scripts/count_vars_ML.R b/scripts/count_vars_ML.R
index 228d462..ec13f53 100644
--- a/scripts/count_vars_ML.R
+++ b/scripts/count_vars_ML.R
@@ -4,7 +4,7 @@
 #source("~/git/LSHTM_analysis/config/embb.R")
 #source("~/git/LSHTM_analysis/config/gid.R")
 #source("~/git/LSHTM_analysis/config/katg.R")
-#source("~/git/LSHTM_analysis/config/pnca.R")
+source("~/git/LSHTM_analysis/config/pnca.R")
 source("~/git/LSHTM_analysis/config/rpob.R")
 
 #############################
@@ -55,7 +55,7 @@ if (check12) {
   cat('\nPASS: dst mode labels verified. merged_df3 CAN be trusted! ')
 }else{
   stop('FAIL: Something is wrong with the dst_mode column. Quitting!')
-``}
+}
 
 #==========================
 # CHECK: active site labels
@@ -189,6 +189,24 @@ if ( all( check12 && aa_check1 && aa_check2 && a1 && b1 && a2 && b2 && l1 && l2
     #quit()
 }
 
+#%%###################################################################
+# check merged_df3
+check_mdf3 = merged_df3[, cols_sel]
+  
+check_mdf3T = table(check_mdf3$mutationinformation, check_mdf3$dst_mode)
+ft_mdf3 = as.data.frame.matrix(check_mdf3T)
+
+#==================
+# CHECK: dst mode
+#===================
+dst_check_mdf3 = all((ft_mdf3[,1]==0)==(ft_mdf3[,2]!=0)); dst_check_mdf3
+
+sel = c("mutationinformation", "dst", "dst_mode")
+
+a = merged_df3[, sel]
+str(a)
+
+
 # write file
 # outfile_merged_df3 = paste0(outdir, '/', tolower(gene), '_merged_df3.csv')
 # outfile_merged_df3
diff --git a/scripts/ml/ml_functions/MultClfs.py b/scripts/ml/ml_functions/MultClfs.py
index c703a6a..46be467 100755
--- a/scripts/ml/ml_functions/MultClfs.py
+++ b/scripts/ml/ml_functions/MultClfs.py
@@ -146,8 +146,8 @@ scoreBT_mapD = {'bts_mcc'          : 'MCC'
 def MultModelsCl(input_df, target
                        #, skf_cv
                        , sel_cv
-                       , blind_test_df
-                       , blind_test_target
+                       #, blind_test_df
+                       #, blind_test_target
                        , tts_split_type 
 
                        , resampling_type = 'none' # default
@@ -231,35 +231,36 @@ def MultModelsCl(input_df, target
     # Specify multiple Classification Models  
     #======================================================
     models = [('AdaBoost Classifier'     , AdaBoostClassifier(**rs) )
-               , ('Bagging Classifier'        , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True, verbose = 3, n_estimators = 100) )
-               , ('Decision Tree'             , DecisionTreeClassifier(**rs) ) 
-                , ('Extra Tree'                , ExtraTreeClassifier(**rs) )
-                , ('Extra Trees'               , ExtraTreesClassifier(**rs) ) 
-                , ('Gradient Boosting'         , GradientBoostingClassifier(**rs) )
-                , ('Gaussian NB'               , GaussianNB() )
-                , ('Gaussian Process'          , GaussianProcessClassifier(**rs) )
-                , ('K-Nearest Neighbors'       , KNeighborsClassifier() ) 
-                , ('LDA'                       , LinearDiscriminantAnalysis() )
-                , ('Logistic Regression'       , LogisticRegression(**rs) )
-                , ('Logistic RegressionCV'     , LogisticRegressionCV(cv = 3, **rs))
-                , ('MLP'                       , MLPClassifier(max_iter = 500, **rs) ) 
+               # , ('Bagging Classifier'        , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True, verbose = 3, n_estimators = 100) )
+               # , ('Decision Tree'             , DecisionTreeClassifier(**rs) ) 
+               #  , ('Extra Tree'                , ExtraTreeClassifier(**rs) )
+               #  , ('Extra Trees'               , ExtraTreesClassifier(**rs) ) 
+               #  , ('Gradient Boosting'         , GradientBoostingClassifier(**rs) )
+               #  , ('Gaussian NB'               , GaussianNB() )
+               #  , ('Gaussian Process'          , GaussianProcessClassifier(**rs) )
+               #  , ('K-Nearest Neighbors'       , KNeighborsClassifier() ) 
+                 , ('LDA'                       , LinearDiscriminantAnalysis() )
+               # , ('Logistic Regression'       , LogisticRegression(**rs) )
+               # , ('Logistic RegressionCV'     , LogisticRegressionCV(cv = 3, **rs))
+               # , ('MLP'                       , MLPClassifier(max_iter = 500, **rs) ) 
                 #, ('Multinomial'               , MultinomialNB() )
-                , ('Naive Bayes'               , BernoulliNB() )
-                , ('Passive Aggresive'         , PassiveAggressiveClassifier(**rs, **njobs) )
-                , ('QDA'                       , QuadraticDiscriminantAnalysis() )
-                , ('Random Forest'             , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) ) 
-                # , ('Random Forest2'            , RandomForestClassifier(min_samples_leaf = 5
+                # , ('Naive Bayes'               , BernoulliNB() )
+                # , ('Passive Aggresive'         , PassiveAggressiveClassifier(**rs, **njobs) )
+                # , ('QDA'                       , QuadraticDiscriminantAnalysis() )
+                # , ('Random Forest'             , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) ) 
+                # # , ('Random Forest2'            , RandomForestClassifier(min_samples_leaf = 5
                 #                                                         , n_estimators     = 1000
                 #                                                         , bootstrap        = True
                 #                                                         , oob_score        = True
                 #                                                         , **njobs
                 #                                                         , **rs
                 #                                                         , max_features     = 'auto') ) 
-                 , ('Ridge Classifier'          , RidgeClassifier(**rs)  )
-                 , ('Ridge ClassifierCV'        , RidgeClassifierCV(cv = 3) )          
-                 , ('SVC'                       , SVC(**rs) ) 
-                 , ('Stochastic GDescent'       , SGDClassifier(**rs, **njobs) )
-                 , ('XGBoost'                   , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False, **njobs) )
+             #     , ('Ridge Classifier'          , RidgeClassifier(**rs)  )
+             #     , ('Ridge ClassifierCV'        , RidgeClassifierCV(cv = 3) )          
+             #     , ('SVC'                       , SVC(**rs) ) 
+             #     , ('Stochastic GDescent'       , SGDClassifier(**rs, **njobs) )
+             #     , ('XGBoost'                   , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False, **njobs) )
+             # 
              ]
                 
     mm_skf_scoresD = {}
@@ -308,7 +309,7 @@ def MultModelsCl(input_df, target
         # ADD more info: meta data related to input df
         mm_skf_scoresD[model_name]['resampling']        = resampling_type
         mm_skf_scoresD[model_name]['n_training_size']   = len(input_df)
-        mm_skf_scoresD[model_name]['n_trainingY_ratio'] = round(Counter(target)[0]/Counter(target)[1], 2)
+        #mm_skf_scoresD[model_name]['n_trainingY_ratio'] = round(Counter(target)[0]/Counter(target)[1], 2)
         mm_skf_scoresD[model_name]['n_features']        = len(input_df.columns)
         mm_skf_scoresD[model_name]['tts_split']         = tts_split_type
         
@@ -357,7 +358,7 @@ def MultModelsCl(input_df, target
            # Build bts numbers dict
            btD = {'n_blindY_neg'    : Counter(blind_test_target)[0]
                   , 'n_blindY_pos'  : Counter(blind_test_target)[1]
-                  , 'n_testY_ratio' : round(Counter(blind_test_target)[0]/Counter(blind_test_target)[1], 2)
+                  #, 'n_testY_ratio' : round(Counter(blind_test_target)[0]/Counter(blind_test_target)[1], 2)
                   , 'n_test_size'   : len(blind_test_df) }
            
            # Update cmD+tnD dicts with btD
diff --git a/scripts/ml/ml_functions/test_func_singlegene.py b/scripts/ml/ml_functions/test_func_singlegene.py
index f340ef8..ea80074 100644
--- a/scripts/ml/ml_functions/test_func_singlegene.py
+++ b/scripts/ml/ml_functions/test_func_singlegene.py
@@ -58,8 +58,8 @@ all(df.columns.isin(['gene_name'])) # should be False
 
 
 spl_type = '70_30'
-spl_type = '80_20'
-spl_type = 'sl'
+#spl_type = '80_20'
+#spl_type = 'sl'
 
 df2 = split_tts(df
           , data_type = 'actual'
@@ -84,7 +84,6 @@ fooD = MultModelsCl(input_df = df2['X']
                 , var_type = ['mixed']
                 , scale_numeric = ['min_max']
                 , return_formatted_output = False
-
                 )
 
 for k, v in fooD.items():