diff --git a/scripts/ml/run_8020.py b/scripts/ml/run_8020.py
new file mode 100755
index 0000000..75f932c
--- /dev/null
+++ b/scripts/ml/run_8020.py
@@ -0,0 +1,141 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Jun 20 13:05:23 2022
+Run the baseline classifiers (no feature selection) on the ml_data_8020 data and write one CSV.
+@author: tanu
+"""
+#%%Imports ####################################################################
+import re
+import argparse
+import os, sys
+
+# gene = 'pncA'
+# drug = 'pyrazinamide'
+#total_mtblineage_uc = 8
+###############################################################################
+#%% command line args: case sensitive
+arg_parser = argparse.ArgumentParser()
+arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
+arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '')
+args = arg_parser.parse_args()
+
+drug = args.drug
+gene = args.gene
+
+###############################################################################
+homedir = os.path.expanduser("~")
+sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml')
+
+###############################################################################
+#==================
+# Import data
+#==================
+from ml_data_8020 import *
+setvars(gene,drug)
+from ml_data_8020 import * # NOTE(review): re-imported after setvars(), presumably to bind the names it sets -- confirm
+
+# from YC run_all_ML: run locally
+#from UQ_yc_RunAllClfs import run_all_ML
+
+#====================
+# Import ML functions
+#====================
+from MultClfs import *
+
+#==================
+# other vars
+#==================
+tts_split_8020 = '80_20'
+OutFile_suffix = '8020'
+
+#==================
+# Specify outdir
+#==================
+outdir_ml = outdir + 'ml/tts_8020/'
+print('\nOutput directory:', outdir_ml)
+os.makedirs(outdir_ml, exist_ok = True) # to_csv() does not create missing directories
+#outFile_wf = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv'
+outFile_wf = outdir_ml + gene.lower() + '_baselineC_noOR' + OutFile_suffix + '.csv'
+#%% Running models ############################################################
+print('\n#####################################################################\n'
+      , '\nStarting--> Running ML analysis: Baseline modes (No FS)'
+      , '\nGene name:', gene
+      , '\nDrug name:', drug
+      , '\n#####################################################################\n')
+
+paramD = {
+    'baseline_paramD': { 'input_df' : X
+                       , 'target' : y
+                       , 'var_type' : 'mixed'
+                       , 'resampling_type': 'none'}
+
+    , 'smnc_paramD': { 'input_df' : X_smnc
+                     , 'target' : y_smnc
+                     , 'var_type' : 'mixed'
+                     , 'resampling_type' : 'smnc'}
+
+    , 'ros_paramD': { 'input_df' : X_ros
+                    , 'target' : y_ros
+                    , 'var_type' : 'mixed'
+                    , 'resampling_type' : 'ros'}
+
+    , 'rus_paramD' : { 'input_df' : X_rus
+                     , 'target' : y_rus
+                     , 'var_type' : 'mixed'
+                     , 'resampling_type' : 'rus'}
+
+    , 'rouC_paramD' : { 'input_df' : X_rouC
+                      , 'target' : y_rouC
+                      , 'var_type' : 'mixed'
+                      , 'resampling_type' : 'rouC'}
+    }
+
+##==============================================================================
+## Dict with no CV BT formatted df
+## mmD = {}
+## for k, v in paramD.items():
+## # print(mmD[k])
+## scores_8020D = MultModelsCl(**paramD[k]
+## , tts_split_type = tts_split_8020
+## , skf_cv = skf_cv
+## , blind_test_df = X_bts
+## , blind_test_target = y_bts
+## , add_cm = True
+## , add_yn = True
+## , return_formatted_output = False)
+## mmD[k] = scores_8020D
+##==============================================================================
+## Initial run to get the dict of dicts for each sampling type containing CV, BT and metadata DFs
+mmDD = {}
+for k, v in paramD.items():
+    scores_8020D = MultModelsCl(**v # v is paramD[k]: the kwargs for this resampling type
+                                , tts_split_type = tts_split_8020
+                                , skf_cv = skf_cv
+                                , blind_test_df = X_bts
+                                , blind_test_target = y_bts
+                                , add_cm = True
+                                , add_yn = True
+                                , return_formatted_output = True)
+    mmDD[k] = scores_8020D
+
+# Extracting the dfs from within the dict and concatenating to output as one df
+# (one pd.concat call consumes the whole dict, so no per-key loop is needed)
+out_wf_8020 = pd.concat(mmDD, ignore_index = True)
+
+out_wf_8020f = out_wf_8020.sort_values(by = ['resampling', 'source_data', 'MCC'], ascending = [True, True, False], inplace = False)
+
+print('\n######################################################################'
+      , '\nEnd--> Successfully generated output DF for Multiple classifiers (baseline models)'
+      , '\nGene:', gene.lower()
+      , '\nDrug:', drug
+      , '\noutput file:', outFile_wf
+      , '\nDim of output:', out_wf_8020f.shape
+      , '\n######################################################################')
+###############################################################################
+#====================
+# Write output file
+#====================
+out_wf_8020f.to_csv(outFile_wf, index = False)
+print('\nFile successfully written:', outFile_wf)
+###############################################################################
diff --git a/scripts/ml/run_FS_7030.py b/scripts/ml/run_FS_7030.py
new file mode 100644
index 0000000..3483ff0
--- /dev/null
+++ b/scripts/ml/run_FS_7030.py
@@ -0,0 +1,242 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue May 24 08:11:05 2022
+Run fsgs_rfecv() feature selection for a list of classifiers on the ml_data_7030 data; write JSON.
+@author: tanu
+"""
+#%%
+import os, sys
+import pandas as pd
+import numpy as np
+import pprint as pp
+from copy import deepcopy
+from sklearn import linear_model
+from sklearn import datasets
+from collections import Counter
+
+from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
+from sklearn.linear_model import RidgeClassifier, RidgeClassifierCV, SGDClassifier, PassiveAggressiveClassifier
+
+from sklearn.naive_bayes import BernoulliNB
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.svm import SVC
+from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
+from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier
+from sklearn.naive_bayes import GaussianNB
+from sklearn.gaussian_process import GaussianProcessClassifier, kernels
+from sklearn.gaussian_process.kernels import RBF, DotProduct, Matern, RationalQuadratic, WhiteKernel
+
+from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
+from sklearn.neural_network import MLPClassifier
+
+from sklearn.svm import SVC
+from xgboost import XGBClassifier
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
+
+from sklearn.compose import ColumnTransformer
+from sklearn.compose import make_column_transformer
+
+from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score
+from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report
+
+# added
+from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, LeaveOneOut, KFold, RepeatedKFold, cross_val_predict
+
+from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
+from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold
+
+from sklearn.pipeline import Pipeline, make_pipeline
+
+from sklearn.feature_selection import RFE, RFECV
+
+import itertools
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+from statistics import mean, stdev, median, mode
+
+from imblearn.over_sampling import RandomOverSampler
+from imblearn.under_sampling import RandomUnderSampler
+from imblearn.over_sampling import SMOTE
+from sklearn.datasets import make_classification
+from imblearn.combine import SMOTEENN
+from imblearn.combine import SMOTETomek
+
+from imblearn.over_sampling import SMOTENC
+from imblearn.under_sampling import EditedNearestNeighbours
+from imblearn.under_sampling import RepeatedEditedNearestNeighbours
+
+from sklearn.model_selection import GridSearchCV
+from sklearn.base import BaseEstimator
+from sklearn.impute import KNNImputer as KNN
+import json
+import argparse
+import re
+###############################################################################
+#gene = 'pncA'
+#drug = 'pyrazinamide'
+#total_mtblineage_uc = 8
+
+#%% command line args: case sensitive
+arg_parser = argparse.ArgumentParser()
+arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
+arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '')
+args = arg_parser.parse_args()
+
+drug = args.drug
+gene = args.gene
+
+###############################################################################
+#==================
+# other vars
+#==================
+tts_split = '70_30'
+OutFile_suffix = '7030_FS'
+###############################################################################
+homedir = os.path.expanduser("~")
+sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml')
+
+###############################################################################
+#==================
+# Import data
+#==================
+from ml_data_7030 import *
+setvars(gene,drug)
+from ml_data_7030 import * # NOTE(review): re-imported after setvars(), presumably to bind the names it sets -- confirm
+
+# from YC run_all_ML: run locally
+#from UQ_yc_RunAllClfs import run_all_ML
+
+#==========================================
+# Import ML functions:
+# fsgs_rfecv(): RFECV for Feature selection
+#==========================================
+from MultClfs import *
+
+#==================
+# Specify outdir
+#==================
+outdir_ml = outdir + 'ml/tts_7030/fs/'
+print('\nOutput directory:', outdir_ml)
+#OutFileFS = outdir_ml + gene.lower() + '_FS' + OutFile_suffix + '.json'
+OutFileFS = outdir_ml + gene.lower() + '_FS_noOR' + OutFile_suffix + '.json'
+os.makedirs(outdir_ml, exist_ok = True) # open(OutFileFS, 'w') below fails if the directory is missing
+############################################################################
+
+###############################################################################
+#====================
+# single model CALL
+#====================
+# aFS = fsgs(input_df = X
+# , target = y
+# , param_gridLd = [{'fs__min_features_to_select': [1]}]
+# , blind_test_df = X_bts
+# , blind_test_target = y_bts
+# , estimator = LogisticRegression(**rs)
+# , use_fs = False # uses estimator as the RFECV parameter for fs. Set to TRUE if you want to supply custom_fs as shown below
+# , custom_fs = RFECV(DecisionTreeClassifier(**rs) , cv = skf_cv, scoring = 'matthews_corrcoef')
+# , cv_method = skf_cv
+# , var_type = 'mixed'
+# )
+#############
+# Loop
+############
+#models_fs = [('Decision Tree' , DecisionTreeClassifier(**rs)) ]
+
+models_fs = [('AdaBoost Classifier' , AdaBoostClassifier(**rs) )
+             , ('Decision Tree' , DecisionTreeClassifier(**rs) )
+             , ('Extra Tree' , ExtraTreeClassifier(**rs) )
+             , ('Extra Trees' , ExtraTreesClassifier(**rs) )
+             , ('Gradient Boosting' , GradientBoostingClassifier(**rs) )
+             , ('LDA' , LinearDiscriminantAnalysis() )
+             , ('Logistic Regression' , LogisticRegression(**rs) )
+             , ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs))
+             , ('Passive Aggresive' , PassiveAggressiveClassifier(**rs, **njobs) )
+             , ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000 ) )
+             , ('Random Forest2' , RandomForestClassifier(min_samples_leaf = 5
+                                                          , n_estimators = 1000
+                                                          , bootstrap = True
+                                                          , oob_score = True
+                                                          , **njobs
+                                                          , **rs
+                                                          , max_features = 'sqrt') ) # 'auto' meant sqrt for classifiers and was removed in scikit-learn 1.3
+             , ('Ridge Classifier' , RidgeClassifier(**rs) )
+             , ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) )
+             , ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) )
+             ## , ('XGBoost' , XGBClassifier(**rs, **njobs, verbosity = 3 , use_label_encoder = False) )
+             ]
+
+print('\n#####################################################################'
+      , '\nRunning Feature Selection using classfication models_fs (n):', len(models_fs)
+      , '\nGene:' , gene.lower()
+      , '\nDrug:' , drug
+      , '\nSplit:' , tts_split
+      ,'\n####################################################################')
+
+for m in models_fs:
+    print(m)
+print('\n====================================================================\n')
+
+out_fsD = {}
+index = 1
+for model_name, model_fn in models_fs:
+    print('\nRunning classifier with FS:', index
+          , '\nModel_name:' , model_name
+          , '\nModel func:' , model_fn)
+    #, '\nList of models_fs:', models_fs)
+    index = index+1
+
+    out_fsD[model_name] = fsgs_rfecv(input_df = X
+                                     , target = y
+                                     , param_gridLd = [{'fs__min_features_to_select': [1]}]
+                                     , blind_test_df = X_bts
+                                     , blind_test_target = y_bts
+                                     , estimator = model_fn
+                                     , use_fs = False # uses estimator as the RFECV parameter for fs. Set to TRUE if you want to supply custom_fs as shown below
+                                     , custom_fs = RFECV(DecisionTreeClassifier(**rs) , cv = skf_cv, scoring = 'matthews_corrcoef')
+                                     , cv_method = skf_cv
+                                     , var_type = 'mixed'
+                                     )
+out_fsD
+#%% Checking results dict
+tot_Ditems = sum(len(v) for v in out_fsD.values())
+
+# Size of each per-model result (one entry per classifier in out_fsD)
+checkL = [len(v) for v in out_fsD.values()]
+n_sD = len(checkL) # no. of subDicts
+l_sD = list(set(checkL)) # length of each subDict
+# l_sD holds exactly one value when every classifier returned the same number of items;
+# the PASS message below is printed only in that case
+
+print('\nTotal no.of subdicts:', n_sD)
+if len(l_sD) == 1 and tot_Ditems == n_sD*l_sD[0]:
+    print('\nPASS: successful run for all Classifiers'
+          , '\nLength of each subdict:', l_sD)
+
+print('\nSuccessfully ran Feature selection on', len(models_fs), 'classifiers'
+      , '\nGene:', gene.lower()
+      , '\nDrug:', drug
+      , '\nSplit type:', tts_split
+      , '\nTotal fs models results:', len(out_fsD)
+      , '\nTotal items in output:', sum(len(v) for v in out_fsD.values()) )
+
+
+##############################################################################
+#%% json output
+#========================================
+# Write final output file
+# https://stackoverflow.com/questions/19201290/how-to-save-a-dictionary-to-a-file
+#========================================
+# Output final dict as a json
+print('\nWriting Final output file (json):', OutFileFS)
+with open(OutFileFS, 'w') as f:
+    f.write(json.dumps(out_fsD
+# , cls = NpEncoder
+))
+
+# read json
+with open(OutFileFS, 'r') as f:data = json.load(f)
+#############################################################################
+
diff --git a/scripts/ml/run_cd_7030.py b/scripts/ml/run_cd_7030.py
new file mode 100755
index 0000000..2ee6ce1
--- /dev/null
+++ b/scripts/ml/run_cd_7030.py
@@ -0,0 +1,142 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Jun 20 13:05:23 2022
+Run the baseline classifiers (no feature selection) on the ml_data_cd_7030 data and write one CSV.
+@author: tanu
+"""
+#%%Imports ####################################################################
+import re
+import argparse
+import os, sys
+
+# gene = 'pncA'
+# drug = 'pyrazinamide'
+#total_mtblineage_uc = 8
+###############################################################################
+#%% command line args: case sensitive
+arg_parser = argparse.ArgumentParser()
+arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
+arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '')
+args = arg_parser.parse_args()
+
+drug = args.drug
+gene = args.gene
+
+###############################################################################
+homedir = os.path.expanduser("~")
+sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml')
+
+###############################################################################
+#==================
+# Import data
+#==================
+from ml_data_cd_7030 import *
+setvars(gene,drug)
+from ml_data_cd_7030 import * # NOTE(review): re-imported after setvars(), presumably to bind the names it sets -- confirm
+
+# from YC run_all_ML: run locally
+#from UQ_yc_RunAllClfs import run_all_ML
+
+#====================
+# Import ML functions
+#====================
+from MultClfs import *
+
+#==================
+# other vars
+#==================
+tts_split_cd_7030 = 'cd_7030'
+OutFile_suffix = '_cd_7030'
+
+#==================
+# Specify outdir
+#==================
+outdir_ml = outdir + 'ml/tts_cd_7030/'
+print('\nOutput directory:', outdir_ml)
+os.makedirs(outdir_ml, exist_ok = True) # to_csv() does not create missing directories
+#outFile_wf = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv'
+outFile_wf = outdir_ml + gene.lower() + '_baselineC_noOR' + OutFile_suffix + '.csv'
+
+#%% Running models ############################################################
+print('\n#####################################################################\n'
+      , '\nStarting--> Running ML analysis: Baseline modes (No FS)'
+      , '\nGene name:', gene
+      , '\nDrug name:', drug
+      , '\n#####################################################################\n')
+
+paramD = {
+    'baseline_paramD': { 'input_df' : X
+                       , 'target' : y
+                       , 'var_type' : 'mixed'
+                       , 'resampling_type': 'none'}
+
+    , 'smnc_paramD': { 'input_df' : X_smnc
+                     , 'target' : y_smnc
+                     , 'var_type' : 'mixed'
+                     , 'resampling_type' : 'smnc'}
+
+    , 'ros_paramD': { 'input_df' : X_ros
+                    , 'target' : y_ros
+                    , 'var_type' : 'mixed'
+                    , 'resampling_type' : 'ros'}
+
+    , 'rus_paramD' : { 'input_df' : X_rus
+                     , 'target' : y_rus
+                     , 'var_type' : 'mixed'
+                     , 'resampling_type' : 'rus'}
+
+    , 'rouC_paramD' : { 'input_df' : X_rouC
+                      , 'target' : y_rouC
+                      , 'var_type' : 'mixed'
+                      , 'resampling_type' : 'rouC'}
+    }
+
+##==============================================================================
+## Dict with no CV BT formatted df
+## mmD = {}
+## for k, v in paramD.items():
+## # print(mmD[k])
+## scores_cd_7030D = MultModelsCl(**paramD[k]
+## , tts_split_type = tts_split_cd_7030
+## , skf_cv = skf_cv
+## , blind_test_df = X_bts
+## , blind_test_target = y_bts
+## , add_cm = True
+## , add_yn = True
+## , return_formatted_output = False)
+## mmD[k] = scores_cd_7030D
+##==============================================================================
+## Initial run to get the dict of dicts for each sampling type containing CV, BT and metadata DFs
+mmDD = {}
+for k, v in paramD.items():
+    scores_cd_7030D = MultModelsCl(**v # v is paramD[k]: the kwargs for this resampling type
+                                   , tts_split_type = tts_split_cd_7030
+                                   , skf_cv = skf_cv
+                                   , blind_test_df = X_bts
+                                   , blind_test_target = y_bts
+                                   , add_cm = True
+                                   , add_yn = True
+                                   , return_formatted_output = True)
+    mmDD[k] = scores_cd_7030D
+
+# Extracting the dfs from within the dict and concatenating to output as one df
+# (one pd.concat call consumes the whole dict, so no per-key loop is needed)
+out_wf_cd_7030 = pd.concat(mmDD, ignore_index = True)
+
+out_wf_cd_7030f = out_wf_cd_7030.sort_values(['resampling', 'source_data', 'MCC'], ascending = [True, True, False])
+
+print('\n######################################################################',
+      '\nEnd--> Successfully generated output DF for Multiple classifiers (baseline models)',
+      '\nGene:', gene.lower(),
+      '\nDrug:', drug,
+      '\noutput file:', outFile_wf,
+      '\nDim of output:', out_wf_cd_7030f.shape,
+      '\n######################################################################')
+###############################################################################
+#====================
+# Save the sorted results table as CSV
+#====================
+out_wf_cd_7030f.to_csv(outFile_wf, index = False)
+print('\nFile successfully written:', outFile_wf)
+###############################################################################
diff --git a/scripts/ml/run_cd_8020.py b/scripts/ml/run_cd_8020.py
new file mode 100755
index 0000000..25ef324
--- /dev/null
+++ b/scripts/ml/run_cd_8020.py
@@ -0,0 +1,141 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Jun 20 13:05:23 2022
+Run the baseline classifiers (no feature selection) on the ml_data_cd_8020 data and write one CSV.
+@author: tanu
+"""
+#%%Imports ####################################################################
+import re
+import argparse
+import os, sys
+
+# gene = 'pncA'
+# drug = 'pyrazinamide'
+#total_mtblineage_uc = 8
+###############################################################################
+#%% command line args (values are case sensitive)
+cli_parser = argparse.ArgumentParser()
+cli_parser.add_argument('-d', '--drug', default = '', help = 'drug name')
+cli_parser.add_argument('-g', '--gene', default = '', help = 'gene name')
+cli_args = cli_parser.parse_args()
+
+drug = cli_args.drug
+gene = cli_args.gene
+
+###############################################################################
+homedir = os.path.expanduser("~")
+sys.path.append(f'{homedir}/git/LSHTM_analysis/scripts/ml')
+
+###############################################################################
+#==================
+# Import data
+#==================
+from ml_data_cd_8020 import *
+setvars(gene,drug)
+from ml_data_cd_8020 import * # NOTE(review): re-imported after setvars(), presumably to bind the names it sets -- confirm
+
+# from YC run_all_ML: run locally
+#from UQ_yc_RunAllClfs import run_all_ML
+
+#====================
+# Import ML functions
+#====================
+from MultClfs import *
+
+#==================
+# other vars
+#==================
+tts_split_cd_8020 = 'cd_80_20'
+OutFile_suffix = '_cd_8020'
+
+#==================
+# Specify outdir
+#==================
+outdir_ml = outdir + 'ml/tts_cd_8020/'
+print('\nOutput directory:', outdir_ml)
+os.makedirs(outdir_ml, exist_ok = True) # to_csv() does not create missing directories
+#outFile_wf = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv'
+outFile_wf = outdir_ml + gene.lower() + '_baselineC_noOR' + OutFile_suffix + '.csv'
+#%% Running models ############################################################
+print('\n#####################################################################\n'
+      , '\nStarting--> Running ML analysis: Baseline modes (No FS)'
+      , '\nGene name:', gene
+      , '\nDrug name:', drug
+      , '\n#####################################################################\n')
+
+paramD = {
+    'baseline_paramD': { 'input_df' : X
+                       , 'target' : y
+                       , 'var_type' : 'mixed'
+                       , 'resampling_type': 'none'}
+
+    , 'smnc_paramD': { 'input_df' : X_smnc
+                     , 'target' : y_smnc
+                     , 'var_type' : 'mixed'
+                     , 'resampling_type' : 'smnc'}
+
+    , 'ros_paramD': { 'input_df' : X_ros
+                    , 'target' : y_ros
+                    , 'var_type' : 'mixed'
+                    , 'resampling_type' : 'ros'}
+
+    , 'rus_paramD' : { 'input_df' : X_rus
+                     , 'target' : y_rus
+                     , 'var_type' : 'mixed'
+                     , 'resampling_type' : 'rus'}
+
+    , 'rouC_paramD' : { 'input_df' : X_rouC
+                      , 'target' : y_rouC
+                      , 'var_type' : 'mixed'
+                      , 'resampling_type' : 'rouC'}
+    }
+
+##==============================================================================
+## Dict with no CV BT formatted df
+## mmD = {}
+## for k, v in paramD.items():
+## # print(mmD[k])
+## scores_cd_8020D = MultModelsCl(**paramD[k]
+## , tts_split_type = tts_split_cd_8020
+## , skf_cv = skf_cv
+## , blind_test_df = X_bts
+## , blind_test_target = y_bts
+## , add_cm = True
+## , add_yn = True
+## , return_formatted_output = False)
+## mmD[k] = scores_cd_8020D
+##==============================================================================
+## Initial run to get the dict of dicts for each sampling type containing CV, BT and metadata DFs
+mmDD = {}
+for k, v in paramD.items():
+    scores_cd_8020D = MultModelsCl(**v # v is paramD[k]: the kwargs for this resampling type
+                                   , tts_split_type = tts_split_cd_8020
+                                   , skf_cv = skf_cv
+                                   , blind_test_df = X_bts
+                                   , blind_test_target = y_bts
+                                   , add_cm = True
+                                   , add_yn = True
+                                   , return_formatted_output = True)
+    mmDD[k] = scores_cd_8020D
+
+# Extracting the dfs from within the dict and concatenating to output as one df
+# (one pd.concat call consumes the whole dict, so no per-key loop is needed)
+out_wf_cd_8020 = pd.concat(mmDD, ignore_index = True)
+
+out_wf_cd_8020f = out_wf_cd_8020.sort_values(by = ['resampling', 'source_data', 'MCC'], ascending = [True, True, False], inplace = False)
+
+print('\n######################################################################'
+      , '\nEnd--> Successfully generated output DF for Multiple classifiers (baseline models)'
+      , '\nGene:', gene.lower()
+      , '\nDrug:', drug
+      , '\noutput file:', outFile_wf
+      , '\nDim of output:', out_wf_cd_8020f.shape
+      , '\n######################################################################')
+###############################################################################
+#====================
+# Write output file
+#====================
+out_wf_cd_8020f.to_csv(outFile_wf, index = False)
+print('\nFile successfully written:', outFile_wf)
+###############################################################################
diff --git a/scripts/ml/run_cd_sl.py b/scripts/ml/run_cd_sl.py
new file mode 100755
index 0000000..fbd943a
--- /dev/null
+++ b/scripts/ml/run_cd_sl.py
@@ -0,0 +1,141 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Jun 20 13:05:23 2022
+Run the baseline classifiers (no feature selection) on the ml_data_cd_sl data and write one CSV.
+@author: tanu
+"""
+#%%Imports ####################################################################
+import re
+import argparse
+import os, sys
+
+# gene = 'pncA'
+# drug = 'pyrazinamide'
+#total_mtblineage_uc = 8
+###############################################################################
+#%% command line args: case sensitive
+arg_parser = argparse.ArgumentParser()
+arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
+arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '')
+args = arg_parser.parse_args()
+
+drug = args.drug
+gene = args.gene
+
+###############################################################################
+homedir = os.path.expanduser("~")
+sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml')
+
+###############################################################################
+#==================
+# Import data
+#==================
+from ml_data_cd_sl import *
+setvars(gene,drug)
+from ml_data_cd_sl import * # NOTE(review): re-imported after setvars(), presumably to bind the names it sets -- confirm
+
+# from YC run_all_ML: run locally
+#from UQ_yc_RunAllClfs import run_all_ML
+
+#====================
+# Import ML functions
+#====================
+from MultClfs import *
+
+#==================
+# other vars
+#==================
+tts_split_cd_sl = 'cd_sl'
+OutFile_suffix = '_cd_sl'
+
+#==================
+# Specify outdir
+#==================
+outdir_ml = outdir + 'ml/tts_cd_sl/'
+print('\nOutput directory:', outdir_ml)
+os.makedirs(outdir_ml, exist_ok = True) # to_csv() does not create missing directories
+#outFile_wf = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv'
+outFile_wf = outdir_ml + gene.lower() + '_baselineC_noOR' + OutFile_suffix + '.csv'
+#%% Running models ############################################################
+print('\n#####################################################################\n'
+      , '\nStarting--> Running ML analysis: Baseline modes (No FS)'
+      , '\nGene name:', gene
+      , '\nDrug name:', drug
+      , '\n#####################################################################\n')
+
+paramD = {
+    'baseline_paramD': { 'input_df' : X
+                       , 'target' : y
+                       , 'var_type' : 'mixed'
+                       , 'resampling_type': 'none'}
+
+    , 'smnc_paramD': { 'input_df' : X_smnc
+                     , 'target' : y_smnc
+                     , 'var_type' : 'mixed'
+                     , 'resampling_type' : 'smnc'}
+
+    , 'ros_paramD': { 'input_df' : X_ros
+                    , 'target' : y_ros
+                    , 'var_type' : 'mixed'
+                    , 'resampling_type' : 'ros'}
+
+    , 'rus_paramD' : { 'input_df' : X_rus
+                     , 'target' : y_rus
+                     , 'var_type' : 'mixed'
+                     , 'resampling_type' : 'rus'}
+
+    , 'rouC_paramD' : { 'input_df' : X_rouC
+                      , 'target' : y_rouC
+                      , 'var_type' : 'mixed'
+                      , 'resampling_type' : 'rouC'}
+    }
+
+##==============================================================================
+## Dict with no CV BT formatted df
+## mmD = {}
+## for k, v in paramD.items():
+## # print(mmD[k])
+## scores_cd_slD = MultModelsCl(**paramD[k]
+## , tts_split_type = tts_split_cd_sl
+## , skf_cv = skf_cv
+## , blind_test_df = X_bts
+## , blind_test_target = y_bts
+## , add_cm = True
+## , add_yn = True
+## , return_formatted_output = False)
+## mmD[k] = scores_cd_slD
+##==============================================================================
+## Initial run to get the dict of dicts for each sampling type containing CV, BT and metadata DFs
+mmDD = {}
+for k, v in paramD.items():
+    scores_cd_slD = MultModelsCl(**v # v is paramD[k]: the kwargs for this resampling type
+                                 , tts_split_type = tts_split_cd_sl
+                                 , skf_cv = skf_cv
+                                 , blind_test_df = X_bts
+                                 , blind_test_target = y_bts
+                                 , add_cm = True
+                                 , add_yn = True
+                                 , return_formatted_output = True)
+    mmDD[k] = scores_cd_slD
+
+# Extracting the dfs from within the dict and concatenating to output as one df
+# (one pd.concat call consumes the whole dict, so no per-key loop is needed)
+out_wf_cd_sl = pd.concat(mmDD, ignore_index = True)
+
+out_wf_cd_slf = out_wf_cd_sl.sort_values(by = ['resampling', 'source_data', 'MCC'], ascending = [True, True, False], inplace = False)
+
+print('\n######################################################################'
+      , '\nEnd--> Successfully generated output DF for Multiple classifiers (baseline models)'
+      , '\nGene:', gene.lower()
+      , '\nDrug:', drug
+      , '\noutput file:', outFile_wf
+      , '\nDim of output:', out_wf_cd_slf.shape
+      , '\n######################################################################')
+###############################################################################
+#====================
+# Write output file
+#====================
+out_wf_cd_slf.to_csv(outFile_wf, index = False)
+print('\nFile successfully written:', outFile_wf)
+###############################################################################
diff --git a/scripts/ml/run_sl.py b/scripts/ml/run_sl.py
new file mode 100755
index 0000000..6e325f4
--- /dev/null
+++ b/scripts/ml/run_sl.py
@@ -0,0 +1,141 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Jun 20 13:05:23 2022
+Run the baseline classifiers (no feature selection) on the ml_data_sl data and write one CSV.
+@author: tanu
+"""
+#%%Imports ####################################################################
+import re
+import argparse
+import os, sys
+
+# gene = 'pncA'
+# drug = 'pyrazinamide'
+#total_mtblineage_uc = 8
+###############################################################################
+#%% command line args: case sensitive
+arg_parser = argparse.ArgumentParser()
+arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
+arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '')
+args = arg_parser.parse_args()
+
+drug = args.drug
+gene = args.gene
+
+###############################################################################
+homedir = os.path.expanduser("~")
+sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml')
+
+###############################################################################
+#==================
+# Import data
+#==================
+from ml_data_sl import *
+setvars(gene,drug)
+from ml_data_sl import * # NOTE(review): re-imported after setvars(), presumably to bind the names it sets -- confirm
+
+# from YC run_all_ML: run locally
+#from UQ_yc_RunAllClfs import run_all_ML
+
+#====================
+# Import ML functions
+#====================
+from MultClfs import *
+
+#==================
+# other vars
+#==================
+tts_split_sl = 'sl'
+OutFile_suffix = 'sl'
+
+#==================
+# Specify outdir
+#==================
+outdir_ml = outdir + 'ml/tts_sl/'
+print('\nOutput directory:', outdir_ml)
+os.makedirs(outdir_ml, exist_ok = True) # to_csv() does not create missing directories
+#outFile_wf = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv'
+outFile_wf = outdir_ml + gene.lower() + '_baselineC_noOR' + OutFile_suffix + '.csv'
+#%% Running models ############################################################
+print('\n#####################################################################\n'
+      , '\nStarting--> Running ML analysis: Baseline modes (No FS)'
+      , '\nGene name:', gene
+      , '\nDrug name:', drug
+      , '\n#####################################################################\n')
+
+paramD = {
+    'baseline_paramD': { 'input_df' : X
+                       , 'target' : y
+                       , 'var_type' : 'mixed'
+                       , 'resampling_type': 'none'}
+
+    , 'smnc_paramD': { 'input_df' : X_smnc
+                     , 'target' : y_smnc
+                     , 'var_type' : 'mixed'
+                     , 'resampling_type' : 'smnc'}
+
+    , 'ros_paramD': { 'input_df' : X_ros
+                    , 'target' : y_ros
+                    , 'var_type' : 'mixed'
+                    , 'resampling_type' : 'ros'}
+
+    , 'rus_paramD' : { 'input_df' : X_rus
+                     , 'target' : y_rus
+                     , 'var_type' : 'mixed'
+                     , 'resampling_type' : 'rus'}
+
+    , 'rouC_paramD' : { 'input_df' : X_rouC
+                      , 'target' : y_rouC
+                      , 'var_type' : 'mixed'
+                      , 'resampling_type' : 'rouC'}
+    }
+
+##==============================================================================
+## Dict with no CV BT formatted df
+## mmD = {}
+## for k, v in paramD.items():
+## # print(mmD[k])
+## scores_slD = MultModelsCl(**paramD[k]
+## , tts_split_type = tts_split_sl
+## , skf_cv = skf_cv
+## , blind_test_df = X_bts
+## , blind_test_target = y_bts
+## , add_cm = True
+## , add_yn = True
+## , return_formatted_output = False)
+## mmD[k] = scores_slD
+##==============================================================================
+## Initial run to get the dict of dicts for each sampling type containing CV, BT and metadata DFs
+mmDD = {}
+for k, v in paramD.items():
+    scores_slD = MultModelsCl(**v # v is paramD[k]: the kwargs for this resampling type
+                              , tts_split_type = tts_split_sl
+                              , skf_cv = skf_cv
+                              , blind_test_df = X_bts
+                              , blind_test_target = y_bts
+                              , add_cm = True
+                              , add_yn = True
+                              , return_formatted_output = True)
+    mmDD[k] = scores_slD
+
+# Extracting the dfs from within the dict and concatenating to output as one df
+# (one pd.concat call consumes the whole dict, so no per-key loop is needed)
+out_wf_sl = pd.concat(mmDD, ignore_index = True)
+
+out_wf_slf = out_wf_sl.sort_values(by = ['resampling', 'source_data', 'MCC'], ascending = [True, True, False], inplace = False)
+
+print('\n######################################################################'
+      , '\nEnd--> Successfully generated output DF for Multiple classifiers (baseline models)'
+      , '\nGene:', gene.lower()
+      , '\nDrug:', drug
+      , '\noutput file:', outFile_wf
+      , '\nDim of output:', out_wf_slf.shape
+      , '\n######################################################################')
+###############################################################################
+#====================
+# Write output file
+#====================
+out_wf_slf.to_csv(outFile_wf, index = False)
+print('\nFile successfully written:', outFile_wf)
+###############################################################################