various edits

This commit is contained in:
Tanushree Tunstall 2022-07-28 13:19:30 +01:00
parent 90b9477520
commit 8d8a61675f
3 changed files with 601 additions and 1 deletions

View file

@ -15,7 +15,8 @@ sys.path
from GetMLData import *
from SplitTTS import *
from MultClfs import *
#from MultClfs_SIMPLE import *
from MultClfs_noBTS import *
#%%
# Shared keyword arguments: one fixed seed so every estimator/split in
# this script is reproducible (splatted as **rs at call sites).
rs = dict(random_state=42)
@ -69,6 +70,8 @@ len(df)
Counter(df2['y'])      # class counts of the training target (balance sanity check)
Counter(df2['y_bts'])  # class counts of the second target split — presumably 'bts' = blind test set; confirm against SplitTTS
#%% Run Multiple models
fooD = MultModelsCl(input_df = df2['X']
, target = df2['y']
, sel_cv = skf_cv
@ -140,3 +143,106 @@ from sklearn.utils import all_estimators
# Dump the names of every sklearn classifier to CSV for reference.
all_clfs = all_estimators(type_filter="classifier")
# Use a dedicated name here: the original rebound `df`, which clobbered
# the input dataset DataFrame that split_tts(df, ...) consumes further
# down this script.
clf_names_df = pd.DataFrame(all_clfs, columns=['classifier_name', 'classifier_fn'])
clf_names_df.to_csv("Model_names_ALL.csv")
#%% TEST different CV Thresholds for split_type = NONE
Counter(df2['y'])      # re-check class balance before re-splitting
Counter(df2['y_bts'])
spl_type = 'none'
data_type = "complete"
# Rebuild the train/test dict with no explicit TTS split; keyword
# arguments gathered once so the call reads as a single config block.
split_kwargs = dict(data_type=data_type,
                    split_type=spl_type,
                    oversampling=True,
                    dst_colname='dst',
                    target_colname='dst_mode',
                    include_gene_name=True,
                    random_state=42)  # default
df2 = split_tts(df, **split_kwargs)
# Run every model with a 10-fold CV threshold on the unsplit data; raw
# (unformatted) per-model score dict is returned.
fooD = MultModelsCl_noBTS(
    input_df=df2['X'],
    target=df2['y'],
    skf_cv_threshold=10,       # IMP to change
    tts_split_type=spl_type,
    resampling_type='XXXX',    # default
    add_cm=True,               # adds confusion matrix based on cross_val_predict
    add_yn=True,               # adds target var class numbers
    var_type=['mixed'],
    scale_numeric=['min_max'],
    random_state=42,
    n_jobs=os.cpu_count(),
    return_formatted_output=False,
)
# Report the cross-validated MCC for every fitted model.
# NOTE(review): the label says TRAIN but the key is 'test_mcc' — with
# sklearn cross_validate naming that is the score on the held-out CV
# folds of the training data; confirm against MultClfs_noBTS.
for model_name, model_scores in fooD.items():
    print('\nModel:', model_name
          , '\nTRAIN MCC:', model_scores['test_mcc']
          )
# formatted df: same run as above but with a 5-fold CV threshold and the
# output returned pre-formatted for the row-bind below.
foo_df3 = MultModelsCl_noBTS(
    input_df=df2['X'],
    target=df2['y'],
    skf_cv_threshold=5,        # IMP to change
    tts_split_type=spl_type,
    resampling_type='XXXX',    # default
    add_cm=True,               # adds confusion matrix based on cross_val_predict
    add_yn=True,               # adds target var class numbers
    var_type=['mixed'],
    scale_numeric=['min_max'],
    random_state=42,
    n_jobs=os.cpu_count(),
    return_formatted_output=True,
)
# Row-bind the formatted score dfs from the different CV-threshold runs
# and verify the combined shape.
dfs_combine_wf = [foo_df, foo_df2, foo_df3]

# Columns shared by every df; sorted so the column order of the
# concatenated result is deterministic (raw set-intersection order is
# arbitrary across runs).
common_cols_wf = sorted(set.intersection(*(set(df.columns) for df in dfs_combine_wf)))

print('\nCombining', len(dfs_combine_wf), 'using pd.concat by row ~ rowbind'
      , '\nChecking Dims of df to combine:'
      # NOTE(review): scoresDF_CV/scoresDF_BT are defined elsewhere in
      # the file and look stale here — the dfs actually combined below
      # are foo_df/foo_df2/foo_df3; confirm which dims should be shown.
      , '\nDim of CV:', scoresDF_CV.shape
      , '\nDim of BT:', scoresDF_BT.shape)
#print(scoresDF_CV)
#print(scoresDF_BT)

# Expected shape after row-binding: rows add up across dfs, common
# column count must equal every df's column count.
dfs_nrows_wf = max(len(df) for df in dfs_combine_wf)
dfs_ncols_wf = max(len(df.columns) for df in dfs_combine_wf)
print(dfs_ncols_wf)

expected_nrows_wf = len(dfs_combine_wf) * dfs_nrows_wf
expected_ncols_wf = dfs_ncols_wf

# NOTE(review): indentation was lost in the diff rendering; structure
# reconstructed so that sys.exit('\nFIRST IF FAILS') is the else-branch
# of this first if, matching its message — confirm against the original.
if len(common_cols_wf) == dfs_ncols_wf:
    combined_baseline_wf = pd.concat([df[common_cols_wf] for df in dfs_combine_wf]
                                     , ignore_index=False)
    print('\nConcatenating dfs with different resampling methods [WF]:'
          , '\nSplit type:', spl_type
          , '\nNo. of dfs combining:', len(dfs_combine_wf))
    if len(combined_baseline_wf) == expected_nrows_wf and len(combined_baseline_wf.columns) == expected_ncols_wf:
        print('\nPASS:', len(dfs_combine_wf), 'dfs successfully combined'
              , '\nnrows in combined_df_wf:', len(combined_baseline_wf)
              , '\nncols in combined_df_wf:', len(combined_baseline_wf.columns))
    else:
        print('\nFAIL: concatenating failed'
              , '\nExpected nrows:', expected_nrows_wf
              , '\nGot:', len(combined_baseline_wf)
              , '\nExpected ncols:', expected_ncols_wf
              , '\nGot:', len(combined_baseline_wf.columns))
else:
    sys.exit('\nFIRST IF FAILS')