added all run scripts for different splits

2022-06-24 20:39:50 +01:00 · 2022-06-24 20:39:50 +01:00 · 5d38cde912
commit 5d38cde912
parent e2bc384155
6 changed files with 948 additions and 0 deletions
--- a/scripts/ml/run_cd_8020.py
+++ b/scripts/ml/run_cd_8020.py
@ -0,0 +1,141 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Jun 20 13:05:23 2022
+
+@author: tanu
+"""
+#%%Imports ####################################################################
+import re
+import argparse
+import os, sys
+
+# gene  = 'pncA'
+# drug  = 'pyrazinamide'
+#total_mtblineage_uc = 8
+###############################################################################
+#%% command line args: case sensitive
+arg_parser = argparse.ArgumentParser()
+arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')  # falls back to '' when -d/--drug is omitted
+arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '')  # falls back to '' when -g/--gene is omitted
+args = arg_parser.parse_args()  # NOTE(review): neither option is required, so the script proceeds with '' values — confirm this is intended
+
+drug    = args.drug  # passed to setvars() and echoed in the log output
+gene    = args.gene  # passed to setvars(); lower-cased when building the output file name
+
+###############################################################################
+homedir = os.path.expanduser("~")
+sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml')  # make modules in the ml scripts dir importable
+
+###############################################################################
+#==================
+# Import data
+#==================
+from ml_data_cd_8020 import *  # setvars is not defined in this script; presumably it is provided by this import — confirm
+setvars(gene,drug)
+from ml_data_cd_8020 import *  # NOTE(review): repeated after setvars(), presumably to re-bind the names it populates (X, y, outdir, ...) — confirm
+
+# from YC run_all_ML: run locally
+#from UQ_yc_RunAllClfs import run_all_ML
+
+#====================
+# Import ML functions 
+#====================
+from MultClfs import *
+
+#==================
+# other vars
+#==================
+tts_split_cd_8020    = 'cd_80_20'  # passed to MultModelsCl as tts_split_type
+OutFile_suffix  = '_cd_8020'  # appended to the output file name
+
+#==================
+# Specify outdir 
+#==================
+outdir_ml = outdir + 'ml/tts_cd_8020/'  # outdir is not defined in this script; presumably set via ml_data_cd_8020 — confirm
+print('\nOutput directory:', outdir_ml)
+
+#outFile_wf = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv'
+outFile_wf = outdir_ml + gene.lower() + '_baselineC_noOR' + OutFile_suffix + '.csv'  # i.e. <outdir_ml><gene>_baselineC_noOR_cd_8020.csv
+#%% Running models ############################################################
+print('\n#####################################################################\n',
+      '\nStarting--> Running ML analysis: Baseline modes (No FS)',
+      '\nGene name:', gene,
+      '\nDrug name:', drug,
+      '\n#####################################################################\n')
+
+paramD = {  # one kwargs dict per resampling type; each is unpacked into MultModelsCl
+    'baseline_paramD': {'input_df': X,
+                        'target': y,
+                        'var_type': 'mixed',
+                        'resampling_type': 'none'},
+
+    'smnc_paramD': {'input_df': X_smnc,
+                    'target': y_smnc,
+                    'var_type': 'mixed',
+                    'resampling_type': 'smnc'},
+
+    'ros_paramD': {'input_df': X_ros,
+                   'target': y_ros,
+                   'var_type': 'mixed',
+                   'resampling_type': 'ros'},
+
+    'rus_paramD': {'input_df': X_rus,
+                   'target': y_rus,
+                   'var_type': 'mixed',
+                   'resampling_type': 'rus'},
+
+    'rouC_paramD': {'input_df': X_rouC,
+                    'target': y_rouC,
+                    'var_type': 'mixed',
+                    'resampling_type': 'rouC'},
+}
+
+##==============================================================================
+## Dict with no CV BT formatted df
+## mmD = {}
+## for k, v in paramD.items():
+## #    print(mmD[k])
+##     scores_cd_8020D = MultModelsCl(**paramD[k]
+##                         , tts_split_type = tts_split_cd_8020
+##                         , skf_cv = skf_cv
+##                         , blind_test_df = X_bts
+##                         , blind_test_target = y_bts
+##                         , add_cm = True 
+##                         , add_yn = True
+##                         , return_formatted_output = False)
+##     mmD[k] = scores_cd_8020D
+##==============================================================================
+## Run MultModelsCl once per resampling type in paramD; mmDD maps each type to its formatted output (CV, BT and metadata, per the original note)
+mmDD = {}
+for k, v in paramD.items():
+    scores_cd_8020D = MultModelsCl(**v  # v holds this type's input_df, target, var_type and resampling_type
+                        , tts_split_type = tts_split_cd_8020
+                        , skf_cv = skf_cv
+                        , blind_test_df = X_bts
+                        , blind_test_target = y_bts
+                        , add_cm = True
+                        , add_yn = True
+                        , return_formatted_output = True)
+    mmDD[k] = scores_cd_8020D
+
+# Combine the per-type results into one df: pd.concat accepts the dict directly, so it is
+# called once (not once per key). Rows are then sorted by resampling, source_data, and MCC descending.
+out_wf_cd_8020 = pd.concat(mmDD, ignore_index = True)
+
+out_wf_cd_8020f = out_wf_cd_8020.sort_values(by = ['resampling', 'source_data', 'MCC'], ascending = [True, True, False], inplace = False)
+    
+print('\n######################################################################'
+      , '\nEnd--> Successfully generated output DF for Multiple classifiers (baseline models)'
+      , '\nGene:', gene.lower(), '\nDrug:', drug
+      , '\noutput file:', outFile_wf
+      , '\nDim of output:', out_wf_cd_8020f.shape
+      , '\n######################################################################')
+###############################################################################
+#====================
+# Write output file (create the output dir first so a finished run is not lost to a failed write)
+#====================
+os.makedirs(outdir_ml, exist_ok = True)
+out_wf_cd_8020f.to_csv(outFile_wf, index = False)
+print('\nFile successfully written:', outFile_wf)
+###############################################################################