Added FS to MultClfs.py and modified the data used for the different splits for consistency

2022-06-24 20:35:53 +01:00 · 2022-06-24 20:35:53 +01:00 · e2bc384155
commit e2bc384155
parent edb7aebd6a
12 changed files with 1585 additions and 994 deletions
--- a/scripts/ml/run_7030.py
+++ b/scripts/ml/run_7030.py
@@ -55,9 +55,8 @@ OutFile_suffix  = '7030'
 outdir_ml = outdir + 'ml/tts_7030/'
 print('\nOutput directory:', outdir_ml)

-outFile_wf = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv'
-#outFile_lf = outdir_ml + gene.lower() + '_baselineC_ext_' + OutFile_suffix + '.csv'
-
+#outFile_wf = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv'
+outFile_wf = outdir_ml + gene.lower() + '_baselineC_noOR' + OutFile_suffix + '.csv'
 #%% Running models ############################################################
 print('\n#####################################################################\n'
      , '\nStarting--> Running ML analysis: Baseline modes (No FS)'
@@ -92,10 +91,24 @@ paramD = {
                            , 'resampling_type' : 'rouC'}
        }

-# Initial run to get the dict containing CV, BT and metadata DFs 
-mmD = {}
+##==============================================================================
+## Dict of unformatted output, i.e. without the CV/BT formatted df (return_formatted_output = False)
+## mmD = {}
+## for k, v in paramD.items():
+## #    print(mmD[k])
+##     scores_7030D = MultModelsCl(**paramD[k]
+##                         , tts_split_type = tts_split_7030
+##                         , skf_cv = skf_cv
+##                         , blind_test_df = X_bts
+##                         , blind_test_target = y_bts
+##                         , add_cm = True 
+##                         , add_yn = True
+##                         , return_formatted_output = False)
+##     mmD[k] = scores_7030D
+##==============================================================================
+## Initial run to get the dict of dicts for each sampling type containing CV, BT and metadata DFs 
+mmDD = {}
 for k, v in paramD.items():
-#    print(mmD[k])
    scores_7030D = MultModelsCl(**paramD[k]
                        , tts_split_type = tts_split_7030
                        , skf_cv = skf_cv
@@ -104,23 +117,25 @@ for k, v in paramD.items():
                        , add_cm = True 
                        , add_yn = True
                        , return_formatted_output = True)
-    mmD[k] = scores_7030D
+    mmDD[k] = scores_7030D

 # Extracting the dfs from within the dict and concatenating to output as one df
-for k, v in mmD.items():
-    out_wf_7030 = pd.concat(mmD, ignore_index = True)
+for k, v in mmDD.items():
+    out_wf_7030 = pd.concat(mmDD, ignore_index = True)
+
+out_wf_7030f = out_wf_7030.sort_values(by = ['resampling', 'source_data', 'MCC'], ascending = [True, True, False], inplace = False)
    
 print('\n######################################################################'
      , '\nEnd--> Successfully generated output DF for Multiple classifiers (baseline models)'
      , '\nGene:', gene.lower()
      , '\nDrug:', drug
      , '\noutput file:', outFile_wf
-      , '\nDim of output:', out_wf_7030.shape
+      , '\nDim of output:', out_wf_7030f.shape
      , '\n######################################################################')
 ###############################################################################
 #====================
 # Write output file
 #====================
-#out_wf_7030.to_csv(outFile_wf, index = False)
+out_wf_7030f.to_csv(outFile_wf, index = False)
 print('\nFile successfully written:', outFile_wf)
 ###############################################################################