added all run scripts for different splits
This commit is contained in:
parent
e2bc384155
commit
5d38cde912
6 changed files with 948 additions and 0 deletions
141
scripts/ml/run_cd_8020.py
Executable file
141
scripts/ml/run_cd_8020.py
Executable file
|
@ -0,0 +1,141 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Mon Jun 20 13:05:23 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
#%%Imports ####################################################################
|
||||
import re
|
||||
import argparse
|
||||
import os, sys
|
||||
|
||||
# gene = 'pncA'
|
||||
# drug = 'pyrazinamide'
|
||||
#total_mtblineage_uc = 8
|
||||
###############################################################################
|
||||
#%% command line args: case sensitive
|
||||
# Two optional string options; each falls back to an empty string when omitted.
arg_parser = argparse.ArgumentParser()
for short_opt, long_opt, help_text in (
        ('-d', '--drug', 'drug name'),
        ('-g', '--gene', 'gene name'),
):
    arg_parser.add_argument(short_opt, long_opt, help = help_text, default = '')

args = arg_parser.parse_args()

# Module-level names used throughout the rest of the script.
drug, gene = args.drug, args.gene
|
||||
|
||||
###############################################################################
|
||||
homedir = os.path.expanduser("~")
|
||||
sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml')
|
||||
|
||||
###############################################################################
|
||||
#==================
|
||||
# Import data
|
||||
#==================
|
||||
# First star import: makes setvars() available (it is not defined in this
# script, and this is the only project module imported so far).
from ml_data_cd_8020 import *
|
||||
# Configure the data module for the gene/drug given on the command line.
setvars(gene,drug)
|
||||
# Same star import repeated after setvars().
# NOTE(review): presumably setvars() (re)binds module-level data in
# ml_data_cd_8020 and this second import pulls those names (e.g. X, y, outdir,
# which this script uses but never defines) into scope -- confirm against
# ml_data_cd_8020.
from ml_data_cd_8020 import *
|
||||
|
||||
# from YC run_all_ML: run locally
|
||||
#from UQ_yc_RunAllClfs import run_all_ML
|
||||
|
||||
#====================
|
||||
# Import ML functions
|
||||
#====================
|
||||
from MultClfs import *
|
||||
|
||||
#==================
|
||||
# other vars
|
||||
#==================
|
||||
# Split label passed to MultModelsCl as tts_split_type, and the suffix that
# tags the output file written by this script.
tts_split_cd_8020 = 'cd_80_20'
OutFile_suffix = '_cd_8020'

#==================
# Specify outdir
#==================
# outdir is not defined in this script; it is expected to come from the star
# imports above.
outdir_ml = outdir + 'ml/tts_cd_8020/'
print('\nOutput directory:', outdir_ml)

#outFile_wf = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv'
# File name: <gene in lower case>_baselineC_noOR<suffix>.csv inside outdir_ml.
out_basename = gene.lower() + '_baselineC_noOR' + OutFile_suffix + '.csv'
outFile_wf = outdir_ml + out_basename
|
||||
#%% Running models ############################################################
|
||||
# Announce the start of the run, echoing the gene and drug being analysed.
# The pieces are printed space-separated, exactly as positional print() args.
start_banner = (
    '\n#####################################################################\n',
    '\nStarting--> Running ML analysis: Baseline modes (No FS)',
    '\nGene name:', gene,
    '\nDrug name:', drug,
    '\n#####################################################################\n',
)
print(*start_banner)
|
||||
|
||||
# One row per resampling strategy:
#   (key in paramD, resampling_type label, input df, target).
# The data objects (X, y, X_smnc, ...) are not defined in this script; they
# are expected to come from the star imports above.
resampling_table = (
    ('baseline_paramD', 'none', X, y),
    ('smnc_paramD', 'smnc', X_smnc, y_smnc),
    ('ros_paramD', 'ros', X_ros, y_ros),
    ('rus_paramD', 'rus', X_rus, y_rus),
    ('rouC_paramD', 'rouC', X_rouC, y_rouC),
)

# Keyword arguments for MultModelsCl, keyed by strategy. var_type is 'mixed'
# for every strategy; insertion order follows the table above.
paramD = {
    param_key: {
        'input_df': input_data,
        'target': target_data,
        'var_type': 'mixed',
        'resampling_type': resampling_label,
    }
    for param_key, resampling_label, input_data, target_data in resampling_table
}
|
||||
|
||||
##==============================================================================
|
||||
## Dict with no CV BT formatted df
|
||||
## mmD = {}
|
||||
## for k, v in paramD.items():
|
||||
## # print(mmD[k])
|
||||
## scores_cd_8020D = MultModelsCl(**paramD[k]
|
||||
## , tts_split_type = tts_split_cd_8020
|
||||
## , skf_cv = skf_cv
|
||||
## , blind_test_df = X_bts
|
||||
## , blind_test_target = y_bts
|
||||
## , add_cm = True
|
||||
## , add_yn = True
|
||||
## , return_formatted_output = False)
|
||||
## mmD[k] = scores_cd_8020D
|
||||
##==============================================================================
|
||||
## Initial run to get the dict of dicts for each sampling type containing CV, BT and metadata DFs
|
||||
# Settings shared by every MultModelsCl call; only the per-strategy entries of
# paramD (input df, target, var_type, resampling_type) differ between calls.
# skf_cv, X_bts and y_bts are not defined in this script; they are expected to
# come from the star imports above.
common_kwargs = dict(tts_split_type = tts_split_cd_8020
                     , skf_cv = skf_cv
                     , blind_test_df = X_bts
                     , blind_test_target = y_bts
                     , add_cm = True
                     , add_yn = True
                     , return_formatted_output = True)

# One MultModelsCl run per resampling strategy, keyed like paramD.
# NOTE(review): with return_formatted_output = True each result is presumably
# a df (the results are handed to pd.concat further down) -- confirm in MultClfs.
mmDD = {resampling_key: MultModelsCl(**resampling_kwargs, **common_kwargs)
        for resampling_key, resampling_kwargs in paramD.items()}
|
||||
|
||||
# Combine the per-resampling outputs stored in mmDD into one df.
# pd.concat takes the whole mapping in a single call, so no loop over its
# items is needed (looping rebuilt the identical df once per key).
# ignore_index = True discards the dict keys and renumbers the rows 0..n-1.
out_wf_cd_8020 = pd.concat(mmDD, ignore_index = True)

# Order rows by resampling type, then data source, with the highest MCC first
# within each group. inplace = False returns a new, sorted df.
out_wf_cd_8020f = out_wf_cd_8020.sort_values(by = ['resampling', 'source_data', 'MCC']
                                             , ascending = [True, True, False]
                                             , inplace = False)
|
||||
|
||||
# Summary of the combined results; printed before the file is written.
# The pieces are printed space-separated, exactly as positional print() args.
end_banner = (
    '\n######################################################################',
    '\nEnd--> Successfully generated output DF for Multiple classifiers (baseline models)',
    '\nGene:', gene.lower(),
    '\nDrug:', drug,
    '\noutput file:', outFile_wf,
    '\nDim of output:', out_wf_cd_8020f.shape,
    '\n######################################################################',
)
print(*end_banner)
###############################################################################
#====================
# Write output file
#====================
# index = False keeps the df's row index out of the csv.
out_wf_cd_8020f.to_csv(outFile_wf, index = False)
print('\nFile successfully written:', outFile_wf)
|
||||
###############################################################################
|
Loading…
Add table
Add a link
Reference in a new issue