optimised run_7030.py to generate output from dict now that the process function and parameter dicts have been added

2022-06-24 15:40:18 +01:00 · 2022-06-24 15:40:18 +01:00 · b37a950fec
commit b37a950fec
parent 7dc7e25016
12 changed files with 180 additions and 128408 deletions
--- a/scripts/ml/run_7030_LOOP.py
+++ b/scripts/ml/run_7030_LOOP.py
@@ -9,6 +9,8 @@ Created on Mon Jun 20 13:05:23 2022
 import re
 import argparse
 import os, sys
+import collections
+
 # gene  = 'pncA'
 # drug  = 'pyrazinamide'
 #total_mtblineage_uc = 8
@@ -25,6 +27,7 @@ import os, sys
 ###############################################################################
 homedir = os.path.expanduser("~")
 sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml')
+
 ###############################################################################
 #==================
 # Import data
@@ -54,79 +57,70 @@ outdir_ml = outdir + 'ml/tts_7030/'
 print('\nOutput directory:', outdir_ml)

 outFile_wf = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv'
-outFile_lf = outdir_ml + gene.lower() + '_baselineC_ext_' + OutFile_suffix + '.csv'
+#outFile_lf = outdir_ml + gene.lower() + '_baselineC_ext_' + OutFile_suffix + '.csv'

 #%% Running models ############################################################
 print('\n#####################################################################\n'
-      , '\nRunning ML analysis: feature groups '
+      , '\nStarting--> Running ML analysis: Baseline modes (No FS)'
      , '\nGene name:', gene
-      , '\nDrug name:', drug)
+      , '\nDrug name:', drug
+      , '\n#####################################################################\n')

-fooD = {'baseline_paramD': {
-                   'input_df': X
-                   , 'target': y
-                   , 'var_type': 'mixed'
-                   , 'resampling_type': 'none'}
-        ,
-        'smnc_paramD': {'input_df': X_smnc
-                   , 'target': y_smnc
-                   , 'var_type': 'mixed'
-                   , 'resampling_type': 'smnc'}
-}
+paramD = {
+        'baseline_paramD': { 'input_df'        : X
+                            , 'target'         : y
+                            , 'var_type'       : 'mixed'
+                            , 'resampling_type': 'none'}
+        
+        , 'smnc_paramD': { 'input_df'          : X_smnc
+                          , 'target'           : y_smnc
+                          , 'var_type'         : 'mixed'
+                          , 'resampling_type'  : 'smnc'}
+    
+        , 'ros_paramD': { 'input_df'           : X_ros
+                        , 'target'             : y_ros
+                        , 'var_type'           : 'mixed'
+                        , 'resampling_type'    : 'ros'}

-barD = {}
-for k, v in fooD.items():
-    #print(k)
-    print(fooD[k])
-    scores_7030D = MultModelsCl(**fooD[k]
+        , 'rus_paramD' : { 'input_df'          : X_rus
+                          , 'target'           : y_rus
+                          , 'var_type'         : 'mixed'
+                          , 'resampling_type'  : 'rus'}
+
+        , 'rouC_paramD' : { 'input_df'         : X_rouC
+                            , 'target'          : y_rouC
+                            , 'var_type'        : 'mixed'
+                            , 'resampling_type' : 'rouC'}
+        }
+
+# Initial run to get the dict containing CV, BT and metadata DFs 
+mmD = {}
+for k, v in paramD.items():
+#    print(fooD[k])
+    scores_7030D = MultModelsCl(**paramD[k]
                        , tts_split_type = tts_split_7030
                        , skf_cv = skf_cv
                        , blind_test_df = X_bts
                        , blind_test_target = y_bts
                        , add_cm = True 
-                        , add_yn = True)
-    barD[k] = scores_7030D
+                        , add_yn = True
+                        , return_formatted_output = True)
+    mmD[k] = scores_7030D
    
-
-ros_paramD = {input_df = X_ros
-                   , target = y_ros
-                   , var_type = 'mixed'
-                   , resampling_type = 'smnc'}
-
-
-rus_paramD = {input_df = X_rus
-                   , target = y_rus
-                   , var_type = 'mixed'
-                   , resampling_type = 'rus'}
-
-
-rouC_paramD = {input_df = X_rouC
-                   , target = y_rouC
-                   , var_type = 'mixed'
-                   , resampling_type = 'rouC'}
-
-
-
-
-#====
-scores_7030D = MultModelsCl(**rouC_paramD
-                    , tts_split_type = tts_split_7030
-                    , skf_cv = skf_cv
-                    , blind_test_df = X_bts
-                    , blind_test_target = y_bts
-                    , add_cm = True 
-                    , add_yn = True)
-
-###############################################################################
-###############################################################################
-#%% COMBINING all dfs: WF and LF
-# https://stackoverflow.com/questions/39862654/pandas-concat-of-multiple-data-frames-using-only-common-columns
-
-
+for k, v in mmD.items():
+    out_wf_7030 = pd.concat(mmD, ignore_index = True)
+    
+print('\n######################################################################'
+      , '\nEnd--> Successfully generated output DF for Multiple classifiers (baseline models)'
+      , '\nGene:', gene.lower()
+      , '\nDrug:', drug
+      , '\noutput file:', outFile_wf
+      , '\nDim of output:', out_wf_7030.shape
+      , '\n######################################################################')
 ###############################################################################
 #====================
 # Write output file
 #====================
-#combined_baseline_wf.to_csv(outFile_wf, index = False)
-#print('\nFile successfully written:', outFile_wf)
+out_wf_7030.to_csv(outFile_wf, index = False)
+print('\nFile successfully written:', outFile_wf)
 ###############################################################################