diff --git a/scripts/ml/alr_config.py b/scripts/ml/alr_config.py new file mode 100755 index 0000000..fa3d91e --- /dev/null +++ b/scripts/ml/alr_config.py @@ -0,0 +1,194 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Sat May 28 05:25:30 2022 + +@author: tanu +""" + +import os + +gene = 'alr' +drug = 'cycloserine' +#total_mtblineage_uc = 8 + +homedir = os.path.expanduser("~") +os.chdir( homedir + '/git/ML_AI_training/') + +#--------------------------- +# Version 1: no AAindex +#from UQ_ML_data import * +#setvars(gene,drug) +#from UQ_ML_data import * +#--------------------------- + +from ml_data import * +setvars(gene,drug) +from ml_data import * + +# from YC run_all_ML: run locally +#from UQ_yc_RunAllClfs import run_all_ML + +# TT run all ML clfs: baseline mode +from UQ_MultModelsCl import MultModelsCl + +#%%########################################################################### + +print('\n#####################################################################\n') + +print('TESTING cmd:' + , '\nGene name:', gene + , '\nDrug name:', drug + , '\nTotal input features:', X.shape + , '\n', Counter(y)) + +print('Strucutral features (n):' + , len(X_ssFN) + , '\nThese are:' + , '\nCommon stablity features:', X_stabilityN + , '\nFoldX columns:', X_foldX_cols + , '\nOther struc columns:', X_str + , '\n================================================================\n') + +print('AAindex features (n):' + , len(X_aaindexFN) + , '\nThese are:\n' + , X_aaindexFN + , '\n================================================================\n') + +print('Evolutionary features (n):' + , len(X_evolFN) + , '\nThese are:\n' + , X_evolFN + , '\n================================================================\n') + +print('Genomic features (n):' + , len(X_genomicFN) + , '\nThese are:\n' + , X_genomic_mafor, '\n' + , X_genomic_linegae + , '\n================================================================\n') + +print('Categorical features (n):' + , len(categorical_FN) + , '\nThese are:\n' + , categorical_FN + , '\n================================================================\n') + +if ( len(X.columns) == len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ): + print('\nPass: No. of features match') +else: + sys.exit('\nFail: Count of feature mismatch') + +print('\n#####################################################################\n') +################################################################################ +#================== +# Specify outdir +#================== + +outdir_ml = outdir + 'ml/v2/' + +################################################################################ +#================== +# Baseline models +#================== +mm_skf_scoresD = MultModelsCl(input_df = X + , target = y + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) + +baseline_all = pd.DataFrame(mm_skf_scoresD) +baseline_all = baseline_all.T +#baseline_train = baseline_all.filter(like='train_', axis=1) +baseline_CT = baseline_all.filter(like='test_', axis=1) +baseline_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +baseline_BT = baseline_all.filter(like='bts_', axis=1) +baseline_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv') +baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv') + + +#%% SMOTE NC: Oversampling [Numerical + categorical] +mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc + , target = y_smnc + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) +smnc_all = pd.DataFrame(mm_skf_scoresD7) +smnc_all = smnc_all.T + +smnc_CT = smnc_all.filter(like='test_', axis=1) +smnc_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +smnc_BT = smnc_all.filter(like='bts_', axis=1) +smnc_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv') +smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv') + +#%% ROS: Numerical + categorical +mm_skf_scoresD3 = MultModelsCl(input_df = X_ros + , target = y_ros + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) +ros_all = pd.DataFrame(mm_skf_scoresD3) +ros_all = ros_all.T + +ros_CT = ros_all.filter(like='test_', axis=1) +ros_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +ros_BT = ros_all.filter(like='bts_', axis=1) +ros_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv') +ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv') + +#%% RUS: Numerical + categorical +mm_skf_scoresD4 = MultModelsCl(input_df = X_rus + , target = y_rus + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) +rus_all = pd.DataFrame(mm_skf_scoresD4) +rus_all = rus_all.T + +rus_CT = rus_all.filter(like='test_', axis=1) +rus_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +rus_BT = rus_all.filter(like='bts_' , axis=1) +rus_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv') +rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv') + +#%% ROS + RUS Combined: Numerical + categorical +mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC + , target = y_rouC + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) +rouC_all = pd.DataFrame(mm_skf_scoresD8) +rouC_all = rouC_all.T + +rouC_CT = rouC_all.filter(like='test_', axis=1) +rouC_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +rouC_BT = rouC_all.filter(like='bts_', axis=1) +rouC_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv') +rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv') diff --git a/scripts/ml/embb_config.py b/scripts/ml/embb_config.py new file mode 100755 index 0000000..57193c8 --- /dev/null +++ b/scripts/ml/embb_config.py @@ -0,0 +1,194 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Sat May 28 05:25:30 2022 + +@author: tanu +""" + +import os + +gene = 'embB' +drug = 'ethambutol' +#total_mtblineage_uc = 8 + +homedir = os.path.expanduser("~") +os.chdir( homedir + '/git/ML_AI_training/') + +#--------------------------- +# Version 1: no AAindex +#from UQ_ML_data import * +#setvars(gene,drug) +#from UQ_ML_data import * +#--------------------------- + +from ml_data import * +setvars(gene,drug) +from ml_data import * + +# from YC run_all_ML: run locally +#from UQ_yc_RunAllClfs import run_all_ML + +# TT run all ML clfs: baseline mode +from UQ_MultModelsCl import MultModelsCl + +#%%########################################################################### + +print('\n#####################################################################\n') + +print('TESTING cmd:' + , '\nGene name:', gene + , '\nDrug name:', drug + , '\nTotal input features:', X.shape + , '\n', Counter(y)) + +print('Strucutral features (n):' + , len(X_ssFN) + , '\nThese are:' + , '\nCommon stablity features:', X_stabilityN + , '\nFoldX columns:', X_foldX_cols + , '\nOther struc columns:', X_str + , '\n================================================================\n') + +print('AAindex features (n):' + , len(X_aaindexFN) + , '\nThese are:\n' + , X_aaindexFN + , '\n================================================================\n') + +print('Evolutionary features (n):' + , len(X_evolFN) + , '\nThese are:\n' + , X_evolFN + , '\n================================================================\n') + +print('Genomic features (n):' + , len(X_genomicFN) + , '\nThese are:\n' + , X_genomic_mafor, '\n' + , X_genomic_linegae + , '\n================================================================\n') + +print('Categorical features (n):' + , len(categorical_FN) + , '\nThese are:\n' + , categorical_FN + , '\n================================================================\n') + +if ( len(X.columns) == len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ): + print('\nPass: No. of features match') +else: + sys.exit('\nFail: Count of feature mismatch') + +print('\n#####################################################################\n') +################################################################################ +#================== +# Specify outdir +#================== + +outdir_ml = outdir + 'ml/v2/' + +################################################################################ +#================== +# Baseline models +#================== +mm_skf_scoresD = MultModelsCl(input_df = X + , target = y + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) + +baseline_all = pd.DataFrame(mm_skf_scoresD) +baseline_all = baseline_all.T +#baseline_train = baseline_all.filter(like='train_', axis=1) +baseline_CT = baseline_all.filter(like='test_', axis=1) +baseline_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +baseline_BT = baseline_all.filter(like='bts_', axis=1) +baseline_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv') +baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv') + + +#%% SMOTE NC: Oversampling [Numerical + categorical] +mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc + , target = y_smnc + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) +smnc_all = pd.DataFrame(mm_skf_scoresD7) +smnc_all = smnc_all.T + +smnc_CT = smnc_all.filter(like='test_', axis=1) +smnc_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +smnc_BT = smnc_all.filter(like='bts_', axis=1) +smnc_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv') +smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv') + +#%% ROS: Numerical + categorical +mm_skf_scoresD3 = MultModelsCl(input_df = X_ros + , target = y_ros + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) +ros_all = pd.DataFrame(mm_skf_scoresD3) +ros_all = ros_all.T + +ros_CT = ros_all.filter(like='test_', axis=1) +ros_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +ros_BT = ros_all.filter(like='bts_', axis=1) +ros_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv') +ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv') + +#%% RUS: Numerical + categorical +mm_skf_scoresD4 = MultModelsCl(input_df = X_rus + , target = y_rus + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) +rus_all = pd.DataFrame(mm_skf_scoresD4) +rus_all = rus_all.T + +rus_CT = rus_all.filter(like='test_', axis=1) +rus_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +rus_BT = rus_all.filter(like='bts_' , axis=1) +rus_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv') +rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv') + +#%% ROS + RUS Combined: Numerical + categorical +mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC + , target = y_rouC + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) +rouC_all = pd.DataFrame(mm_skf_scoresD8) +rouC_all = rouC_all.T + +rouC_CT = rouC_all.filter(like='test_', axis=1) +rouC_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +rouC_BT = rouC_all.filter(like='bts_', axis=1) +rouC_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv') +rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv') diff --git a/scripts/ml/gid_config.py b/scripts/ml/gid_config.py new file mode 100755 index 0000000..dfdc157 --- /dev/null +++ b/scripts/ml/gid_config.py @@ -0,0 +1,194 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Sat May 28 05:25:30 2022 + +@author: tanu +""" + +import os + +gene = 'gid' +drug = 'streptomycin' +#total_mtblineage_uc = 8 + +homedir = os.path.expanduser("~") +os.chdir( homedir + '/git/ML_AI_training/') + +#--------------------------- +# Version 1: no AAindex +#from UQ_ML_data import * +#setvars(gene,drug) +#from UQ_ML_data import * +#--------------------------- + +from ml_data import * +setvars(gene,drug) +from ml_data import * + +# from YC run_all_ML: run locally +#from UQ_yc_RunAllClfs import run_all_ML + +# TT run all ML clfs: baseline mode +from UQ_MultModelsCl import MultModelsCl + +#%%########################################################################### + +print('\n#####################################################################\n') + +print('TESTING cmd:' + , '\nGene name:', gene + , '\nDrug name:', drug + , '\nTotal input features:', X.shape + , '\n', Counter(y)) + +print('Strucutral features (n):' + , len(X_ssFN) + , '\nThese are:' + , '\nCommon stablity features:', X_stabilityN + , '\nFoldX columns:', X_foldX_cols + , '\nOther struc columns:', X_str + , '\n================================================================\n') + +print('AAindex features (n):' + , len(X_aaindexFN) + , '\nThese are:\n' + , X_aaindexFN + , '\n================================================================\n') + +print('Evolutionary features (n):' + , len(X_evolFN) + , '\nThese are:\n' + , X_evolFN + , '\n================================================================\n') + +print('Genomic features (n):' + , len(X_genomicFN) + , '\nThese are:\n' + , X_genomic_mafor, '\n' + , X_genomic_linegae + , '\n================================================================\n') + +print('Categorical features (n):' + , len(categorical_FN) + , '\nThese are:\n' + , categorical_FN + , '\n================================================================\n') + +if ( len(X.columns) == len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ): + print('\nPass: No. of features match') +else: + sys.exit('\nFail: Count of feature mismatch') + +print('\n#####################################################################\n') +################################################################################ +#================== +# Specify outdir +#================== + +outdir_ml = outdir + 'ml/v2/' + +################################################################################ +#================== +# Baseline models +#================== +mm_skf_scoresD = MultModelsCl(input_df = X + , target = y + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) + +baseline_all = pd.DataFrame(mm_skf_scoresD) +baseline_all = baseline_all.T +#baseline_train = baseline_all.filter(like='train_', axis=1) +baseline_CT = baseline_all.filter(like='test_', axis=1) +baseline_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +baseline_BT = baseline_all.filter(like='bts_', axis=1) +baseline_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv') +baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv') + + +#%% SMOTE NC: Oversampling [Numerical + categorical] +mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc + , target = y_smnc + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) +smnc_all = pd.DataFrame(mm_skf_scoresD7) +smnc_all = smnc_all.T + +smnc_CT = smnc_all.filter(like='test_', axis=1) +smnc_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +smnc_BT = smnc_all.filter(like='bts_', axis=1) +smnc_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv') +smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv') + +#%% ROS: Numerical + categorical +mm_skf_scoresD3 = MultModelsCl(input_df = X_ros + , target = y_ros + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) +ros_all = pd.DataFrame(mm_skf_scoresD3) +ros_all = ros_all.T + +ros_CT = ros_all.filter(like='test_', axis=1) +ros_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +ros_BT = ros_all.filter(like='bts_', axis=1) +ros_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv') +ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv') + +#%% RUS: Numerical + categorical +mm_skf_scoresD4 = MultModelsCl(input_df = X_rus + , target = y_rus + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) +rus_all = pd.DataFrame(mm_skf_scoresD4) +rus_all = rus_all.T + +rus_CT = rus_all.filter(like='test_', axis=1) +rus_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +rus_BT = rus_all.filter(like='bts_' , axis=1) +rus_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv') +rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv') + +#%% ROS + RUS Combined: Numerical + categorical +mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC + , target = y_rouC + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) +rouC_all = pd.DataFrame(mm_skf_scoresD8) +rouC_all = rouC_all.T + +rouC_CT = rouC_all.filter(like='test_', axis=1) +rouC_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +rouC_BT = rouC_all.filter(like='bts_', axis=1) +rouC_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv') +rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv') diff --git a/scripts/ml/katg_config.py b/scripts/ml/katg_config.py new file mode 100755 index 0000000..cfe9a84 --- /dev/null +++ b/scripts/ml/katg_config.py @@ -0,0 +1,194 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Sat May 28 05:25:30 2022 + +@author: tanu +""" + +import os + +gene = 'katG' +drug = 'isoniazid' +#total_mtblineage_uc = 8 + +homedir = os.path.expanduser("~") +os.chdir( homedir + '/git/ML_AI_training/') + +#--------------------------- +# Version 1: no AAindex +#from UQ_ML_data import * +#setvars(gene,drug) +#from UQ_ML_data import * +#--------------------------- + +from ml_data import * +setvars(gene,drug) +from ml_data import * + +# from YC run_all_ML: run locally +#from UQ_yc_RunAllClfs import run_all_ML + +# TT run all ML clfs: baseline mode +from UQ_MultModelsCl import MultModelsCl + +#%%########################################################################### + +print('\n#####################################################################\n') + +print('TESTING cmd:' + , '\nGene name:', gene + , '\nDrug name:', drug + , '\nTotal input features:', X.shape + , '\n', Counter(y)) + +print('Strucutral features (n):' + , len(X_ssFN) + , '\nThese are:' + , '\nCommon stablity features:', X_stabilityN + , '\nFoldX columns:', X_foldX_cols + , '\nOther struc columns:', X_str + , '\n================================================================\n') + +print('AAindex features (n):' + , len(X_aaindexFN) + , '\nThese are:\n' + , X_aaindexFN + , '\n================================================================\n') + +print('Evolutionary features (n):' + , len(X_evolFN) + , '\nThese are:\n' + , X_evolFN + , '\n================================================================\n') + +print('Genomic features (n):' + , len(X_genomicFN) + , '\nThese are:\n' + , X_genomic_mafor, '\n' + , X_genomic_linegae + , '\n================================================================\n') + +print('Categorical features (n):' + , len(categorical_FN) + , '\nThese are:\n' + , categorical_FN + , '\n================================================================\n') + +if ( len(X.columns) == len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ): + print('\nPass: No. of features match') +else: + sys.exit('\nFail: Count of feature mismatch') + +print('\n#####################################################################\n') +################################################################################ +#================== +# Specify outdir +#================== + +outdir_ml = outdir + 'ml/v2/' + +################################################################################ +#================== +# Baseline models +#================== +mm_skf_scoresD = MultModelsCl(input_df = X + , target = y + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) + +baseline_all = pd.DataFrame(mm_skf_scoresD) +baseline_all = baseline_all.T +#baseline_train = baseline_all.filter(like='train_', axis=1) +baseline_CT = baseline_all.filter(like='test_', axis=1) +baseline_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +baseline_BT = baseline_all.filter(like='bts_', axis=1) +baseline_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv') +baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv') + + +#%% SMOTE NC: Oversampling [Numerical + categorical] +mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc + , target = y_smnc + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) +smnc_all = pd.DataFrame(mm_skf_scoresD7) +smnc_all = smnc_all.T + +smnc_CT = smnc_all.filter(like='test_', axis=1) +smnc_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +smnc_BT = smnc_all.filter(like='bts_', axis=1) +smnc_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv') +smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv') + +#%% ROS: Numerical + categorical +mm_skf_scoresD3 = MultModelsCl(input_df = X_ros + , target = y_ros + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) +ros_all = pd.DataFrame(mm_skf_scoresD3) +ros_all = ros_all.T + +ros_CT = ros_all.filter(like='test_', axis=1) +ros_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +ros_BT = ros_all.filter(like='bts_', axis=1) +ros_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv') +ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv') + +#%% RUS: Numerical + categorical +mm_skf_scoresD4 = MultModelsCl(input_df = X_rus + , target = y_rus + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) +rus_all = pd.DataFrame(mm_skf_scoresD4) +rus_all = rus_all.T + +rus_CT = rus_all.filter(like='test_', axis=1) +rus_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +rus_BT = rus_all.filter(like='bts_' , axis=1) +rus_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv') +rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv') + +#%% ROS + RUS Combined: Numerical + categorical +mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC + , target = y_rouC + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) +rouC_all = pd.DataFrame(mm_skf_scoresD8) +rouC_all = rouC_all.T + +rouC_CT = rouC_all.filter(like='test_', axis=1) +rouC_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +rouC_BT = rouC_all.filter(like='bts_', axis=1) +rouC_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv') +rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv') diff --git a/scripts/ml/pnca_config.py b/scripts/ml/pnca_config.py index d200adb..fa2d158 100755 --- a/scripts/ml/pnca_config.py +++ b/scripts/ml/pnca_config.py @@ -22,9 +22,9 @@ os.chdir( homedir + '/git/ML_AI_training/') #from UQ_ML_data import * #--------------------------- -from UQ_ML_data2 import * +from ml_data import * setvars(gene,drug) -from UQ_ML_data2 import * +from ml_data import * # from YC run_all_ML: run locally #from UQ_yc_RunAllClfs import run_all_ML @@ -54,7 +54,7 @@ print('AAindex features (n):' , len(X_aaindexFN) , '\nThese are:\n' , X_aaindexFN - , '\n================================================================\n') + , '\n================================================================\n') print('Evolutionary features (n):' , len(X_evolFN) @@ -81,6 +81,12 @@ else: sys.exit('\nFail: Count of feature mismatch') print('\n#####################################################################\n') +################################################################################ +#================== +# Specify outdir +#================== + +outdir_ml = outdir + 'ml/v2/' ################################################################################ #================== @@ -103,8 +109,8 @@ baseline_BT = baseline_all.filter(like='bts_', axis=1) baseline_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) # Write csv -baseline_CT.to_csv(outdir + 'ml/' + gene.lower() + '_baseline_CT_allF.csv') -baseline_BT.to_csv(outdir + 'ml/' + gene.lower() + '_baseline_BT_allF.csv') +baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv') +baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv') #%% SMOTE NC: Oversampling [Numerical + categorical] @@ -124,8 +130,8 @@ smnc_BT = smnc_all.filter(like='bts_', axis=1) smnc_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) # Write csv -smnc_CT.to_csv(outdir + 'ml/' + gene.lower() + '_smnc_CT_allF.csv') -smnc_BT.to_csv(outdir + 'ml/' + gene.lower() + '_smnc_BT_allF.csv') +smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv') +smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv') #%% ROS: Numerical + categorical mm_skf_scoresD3 = MultModelsCl(input_df = X_ros @@ -144,8 +150,8 @@ ros_BT = ros_all.filter(like='bts_', axis=1) ros_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) # Write csv -ros_CT.to_csv(outdir + 'ml/' + gene.lower() + '_ros_CT_allF.csv') -ros_BT.to_csv(outdir + 'ml/' + gene.lower() + '_ros_BT_allF.csv') +ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv') +ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv') #%% RUS: Numerical + categorical mm_skf_scoresD4 = MultModelsCl(input_df = X_rus @@ -164,8 +170,8 @@ rus_BT = rus_all.filter(like='bts_' , axis=1) rus_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) # Write csv -rus_CT.to_csv(outdir + 'ml/' + gene.lower() + '_rus_CT_allF.csv') -rus_BT.to_csv(outdir + 'ml/' + gene.lower() + '_rus_BT_allF.csv') +rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv') +rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv') #%% ROS + RUS Combined: Numerical + categorical mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC @@ -184,5 +190,5 @@ rouC_BT = rouC_all.filter(like='bts_', axis=1) rouC_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) # Write csv -rouC_CT.to_csv(outdir + 'ml/' + gene.lower() + '_rouC_CT_allF.csv') -rouC_BT.to_csv(outdir + 'ml/' + gene.lower() + '_rouC_BT_allF.csv') +rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv') +rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv') diff --git a/scripts/ml/rpob_config.py b/scripts/ml/rpob_config.py new file mode 100755 index 0000000..d5b77d0 --- /dev/null +++ b/scripts/ml/rpob_config.py @@ -0,0 +1,194 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Sat May 28 05:25:30 2022 + +@author: tanu +""" + +import os + +gene = 'rpoB' +drug = 'rifampicin' +#total_mtblineage_uc = 8 + +homedir = os.path.expanduser("~") +os.chdir( homedir + '/git/ML_AI_training/') + +#--------------------------- +# Version 1: no AAindex +#from UQ_ML_data import * +#setvars(gene,drug) +#from UQ_ML_data import * +#--------------------------- + +from ml_data import * +setvars(gene,drug) +from ml_data import * + +# from YC run_all_ML: run locally +#from UQ_yc_RunAllClfs import run_all_ML + +# TT run all ML clfs: baseline mode +from UQ_MultModelsCl import MultModelsCl + +#%%########################################################################### + +print('\n#####################################################################\n') + +print('TESTING cmd:' + , '\nGene name:', gene + , '\nDrug name:', drug + , '\nTotal input features:', X.shape + , '\n', Counter(y)) + +print('Strucutral features (n):' + , len(X_ssFN) + , '\nThese are:' + , '\nCommon stablity features:', X_stabilityN + , '\nFoldX columns:', X_foldX_cols + , '\nOther struc columns:', X_str + , '\n================================================================\n') + +print('AAindex features (n):' + , len(X_aaindexFN) + , '\nThese are:\n' + , X_aaindexFN + , '\n================================================================\n') + +print('Evolutionary features (n):' + , len(X_evolFN) + , '\nThese are:\n' + , X_evolFN + , '\n================================================================\n') + +print('Genomic features (n):' + , len(X_genomicFN) + , '\nThese are:\n' + , X_genomic_mafor, '\n' + , X_genomic_linegae + , '\n================================================================\n') + +print('Categorical features (n):' + , len(categorical_FN) + , '\nThese are:\n' + , categorical_FN + , '\n================================================================\n') + +if ( len(X.columns) == len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ): + print('\nPass: No. of features match') +else: + sys.exit('\nFail: Count of feature mismatch') + +print('\n#####################################################################\n') +################################################################################ +#================== +# Specify outdir +#================== + +outdir_ml = outdir + 'ml/v2/' + +################################################################################ +#================== +# Baseline models +#================== +mm_skf_scoresD = MultModelsCl(input_df = X + , target = y + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) + +baseline_all = pd.DataFrame(mm_skf_scoresD) +baseline_all = baseline_all.T +#baseline_train = baseline_all.filter(like='train_', axis=1) +baseline_CT = baseline_all.filter(like='test_', axis=1) +baseline_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +baseline_BT = baseline_all.filter(like='bts_', axis=1) +baseline_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv') +baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv') + + +#%% SMOTE NC: Oversampling [Numerical + categorical] +mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc + , target = y_smnc + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) +smnc_all = pd.DataFrame(mm_skf_scoresD7) +smnc_all = smnc_all.T + +smnc_CT = smnc_all.filter(like='test_', axis=1) +smnc_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +smnc_BT = smnc_all.filter(like='bts_', axis=1) +smnc_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv') +smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv') + +#%% ROS: Numerical + categorical +mm_skf_scoresD3 = MultModelsCl(input_df = X_ros + , target = y_ros + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) +ros_all = pd.DataFrame(mm_skf_scoresD3) +ros_all = ros_all.T + +ros_CT = ros_all.filter(like='test_', axis=1) +ros_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +ros_BT = ros_all.filter(like='bts_', axis=1) +ros_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv') +ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv') + +#%% RUS: Numerical + categorical +mm_skf_scoresD4 = MultModelsCl(input_df = X_rus + , target = y_rus + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) +rus_all = pd.DataFrame(mm_skf_scoresD4) +rus_all = rus_all.T + +rus_CT = rus_all.filter(like='test_', axis=1) +rus_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +rus_BT = rus_all.filter(like='bts_' , axis=1) +rus_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv') +rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv') + +#%% ROS + RUS Combined: Numerical + categorical +mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC + , target = y_rouC + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) +rouC_all = pd.DataFrame(mm_skf_scoresD8) +rouC_all = rouC_all.T + +rouC_CT = rouC_all.filter(like='test_', axis=1) +rouC_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +rouC_BT = rouC_all.filter(like='bts_', axis=1) +rouC_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv') +rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')