changed blind_test_input_df to blind_test_df in MultModelsCl

This commit is contained in:
Tanushree Tunstall 2022-06-22 16:42:04 +01:00
parent bc12dbd7c2
commit 0350784d52
114 changed files with 107251 additions and 863011 deletions

View file

@ -101,7 +101,7 @@ jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
#%%
# Multiple Classification - Model Pipeline
def MultModelsCl(input_df, target, skf_cv
, blind_test_input_df
, blind_test_df
, blind_test_target
, add_cm = True # adds confusion matrix based on cross_val_predict
, add_yn = True # adds target var class numbers
@ -155,32 +155,32 @@ def MultModelsCl(input_df, target, skf_cv
, ('Logistic RegressionCV' , LogisticRegressionCV(**rs) )
, ('Gaussian NB' , GaussianNB() )
, ('Naive Bayes' , BernoulliNB() )
# , ('K-Nearest Neighbors' , KNeighborsClassifier() )
# , ('SVC' , SVC(**rs) )
# , ('MLP' , MLPClassifier(max_iter = 500, **rs) )
# , ('Decision Tree' , DecisionTreeClassifier(**rs) )
# , ('Extra Trees' , ExtraTreesClassifier(**rs) )
# , ('Extra Tree' , ExtraTreeClassifier(**rs) )
# , ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000 ) )
# , ('Random Forest2' , RandomForestClassifier(min_samples_leaf = 5
# , n_estimators = 1000
# , bootstrap = True
# , oob_score = True
# , **njobs
# , **rs
# , max_features = 'auto') )
# , ('XGBoost' , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False) )
# , ('LDA' , LinearDiscriminantAnalysis() )
# , ('Multinomial' , MultinomialNB() )
# , ('Passive Aggresive' , PassiveAggressiveClassifier(**rs, **njobs) )
# , ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) )
# , ('AdaBoost Classifier' , AdaBoostClassifier(**rs) )
# , ('Bagging Classifier' , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) )
# , ('Gaussian Process' , GaussianProcessClassifier(**rs) )
# , ('Gradient Boosting' , GradientBoostingClassifier(**rs) )
# , ('QDA' , QuadraticDiscriminantAnalysis() )
# , ('Ridge Classifier' , RidgeClassifier(**rs) )
# , ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 10) )
, ('K-Nearest Neighbors' , KNeighborsClassifier() )
, ('SVC' , SVC(**rs) )
, ('MLP' , MLPClassifier(max_iter = 500, **rs) )
, ('Decision Tree' , DecisionTreeClassifier(**rs) )
, ('Extra Trees' , ExtraTreesClassifier(**rs) )
, ('Extra Tree' , ExtraTreeClassifier(**rs) )
, ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000 ) )
, ('Random Forest2' , RandomForestClassifier(min_samples_leaf = 5
, n_estimators = 1000
, bootstrap = True
, oob_score = True
, **njobs
, **rs
, max_features = 'auto') )
, ('XGBoost' , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False) )
, ('LDA' , LinearDiscriminantAnalysis() )
, ('Multinomial' , MultinomialNB() )
, ('Passive Aggresive' , PassiveAggressiveClassifier(**rs, **njobs) )
, ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) )
, ('AdaBoost Classifier' , AdaBoostClassifier(**rs) )
, ('Bagging Classifier' , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) )
, ('Gaussian Process' , GaussianProcessClassifier(**rs) )
, ('Gradient Boosting' , GradientBoostingClassifier(**rs) )
, ('QDA' , QuadraticDiscriminantAnalysis() )
, ('Ridge Classifier' , RidgeClassifier(**rs) )
, ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 10) )
]
mm_skf_scoresD = {}
@ -293,9 +293,9 @@ def MultModelsCl(input_df, target, skf_cv
# Blind test: BTS results
#=========================
# Build the final results with all scores for the model
#bts_predict = gscv_fs.predict(blind_test_input_df)
#bts_predict = gscv_fs.predict(blind_test_df)
model_pipeline.fit(input_df, target)
bts_predict = model_pipeline.predict(blind_test_input_df)
bts_predict = model_pipeline.predict(blind_test_df)
bts_mcc_score = round(matthews_corrcoef(blind_test_target, bts_predict),2)
print('\nMCC on Blind test:' , bts_mcc_score)

View file

@ -1,203 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022
@author: tanu
"""
import os
# Gene/drug pair analysed by this script; both are handed to setvars() below
# and echoed in the run banner.
gene = 'alr'
drug = 'cycloserine'
#total_mtblineage_uc = 8
homedir = os.path.expanduser("~")
# Switch the working directory to the ML scripts folder under the user's home.
# NOTE(review): presumably required so the local modules imported below can be
# found — confirm.
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
# NOTE(review): the star-import is deliberately repeated around setvars().
# Presumably the first import exposes setvars() and the second one pulls in the
# names that setvars() creates (X, y, X_bts, outdir, ...) — confirm against
# ml_data_7030 before reordering or de-duplicating these lines.
from ml_data_7030 import *
setvars(gene,drug)
from ml_data_7030 import *
# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML
# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl
################################################################################
# Run banner: identifies the split, gene and drug in the console output.
print('\n#####################################################################\n'
      , '\nRunning ML analysis: 70/30 split'
      , '\nGene name:', gene
      , '\nDrug name:', drug)
#==================
# Specify outdir
#==================
# `outdir` is not defined in this script.
# NOTE(review): presumably it is provided by the star-import above — confirm.
# The directory is not created here; the to_csv() calls later in the script
# write into it.
outdir_ml = outdir + 'ml/tts_7030/'
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
# Sanity checks: report the size and class balance of the training and
# blind-test data, list each feature group, then abort if the feature-group
# sizes do not add up to the number of columns in X.
# NOTE(review): Counter, sys and every X_*/y* name used below are not imported
# or defined in this script; presumably they come from the star-import of the
# ml_data module — confirm.
print('\nSanity checks:'
      #, '\nML source data size:', x_features.shape
      , '\nTotal input features:', len(X.columns)
      , '\n'
      , '\nTraining data size:', X.shape
      , '\nTest data size:', X_bts.shape
      , '\n'
      , '\nTarget feature numbers (training data):', Counter(y)
      , '\nTarget features ratio (training data:', yc1_ratio
      , '\n'
      , '\nTarget feature numbers (test data):', Counter(y_bts)
      , '\nTarget features ratio (test data):', yc2_ratio
      , '\n\n#####################################################################\n')
print('\n================================================================\n')
# Structural feature group: total count followed by its three sub-lists.
print('Strucutral features (n):'
      , len(X_ssFN)
      , '\nThese are:'
      , '\nCommon stablity features:', X_stabilityN
      , '\nFoldX columns:', X_foldX_cols
      , '\nOther struc columns:', X_str
      , '\n================================================================\n')
print('AAindex features (n):'
      , len(X_aaindexFN)
      , '\nThese are:\n'
      , X_aaindexFN
      , '\n================================================================\n')
print('Evolutionary features (n):'
      , len(X_evolFN)
      , '\nThese are:\n'
      , X_evolFN
      , '\n================================================================\n')
# NOTE(review): the count shown is len(X_genomicFN) but the lists printed are
# X_genomic_mafor and X_genomic_linegae; presumably these two together make up
# X_genomicFN — confirm.
print('Genomic features (n):'
      , len(X_genomicFN)
      , '\nThese are:\n'
      , X_genomic_mafor, '\n'
      , X_genomic_linegae
      , '\n================================================================\n')
print('Categorical features (n):'
      , len(categorical_FN)
      , '\nThese are:\n'
      , categorical_FN
      , '\n================================================================\n')
# Guard: the five feature groups must account for every column of X; otherwise
# stop the script before any model is run.
if ( len(X.columns) == len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
    print('\nPass: No. of features match')
else:
    sys.exit('\nFail: Count of feature mismatch')
print('\n#####################################################################\n')
################################################################################
#==================
# Helper
#==================
def _split_and_rank(scores_dict):
    """Turn a MultModelsCl score dictionary into ranked result tables.

    Parameters
    ----------
    scores_dict : dict
        Value returned by MultModelsCl; it is passed straight to pd.DataFrame.

    Returns
    -------
    tuple
        (all_scores, cv_scores, bts_scores), where all_scores is the
        transposed score table, cv_scores holds its 'test_*' columns sorted by
        'test_mcc' (descending) and bts_scores holds its 'bts_*' columns
        sorted by 'bts_mcc' (descending).
    """
    all_scores = pd.DataFrame(scores_dict).T
    # filter() yields a derived frame; chaining sort_values() returns the
    # sorted result directly instead of sorting that derived frame in place
    # (which can trigger pandas' SettingWithCopyWarning).
    cv_scores = all_scores.filter(like='test_', axis=1).sort_values(
        by=['test_mcc'], ascending=False)
    bts_scores = all_scores.filter(like='bts_', axis=1).sort_values(
        by=['bts_mcc'], ascending=False)
    return all_scores, cv_scores, bts_scores

# NOTE: MultModelsCl takes the blind-test features as `blind_test_df`
# (renamed from `blind_test_input_df`); every call below uses the new keyword.

#==================
# Baseline models
#==================
mm_skf_scoresD = MultModelsCl(input_df=X,
                              target=y,
                              var_type='mixed',
                              skf_cv=skf_cv,
                              blind_test_df=X_bts,
                              blind_test_target=y_bts)
baseline_all, baseline_CT, baseline_BT = _split_and_rank(mm_skf_scoresD)

# Write csv
baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv')
baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')

#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7 = MultModelsCl(input_df=X_smnc,
                               target=y_smnc,
                               var_type='mixed',
                               skf_cv=skf_cv,
                               blind_test_df=X_bts,
                               blind_test_target=y_bts)
smnc_all, smnc_CT, smnc_BT = _split_and_rank(mm_skf_scoresD7)

# Write csv
smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv')
smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')

#%% ROS: Numerical + categorical
mm_skf_scoresD3 = MultModelsCl(input_df=X_ros,
                               target=y_ros,
                               var_type='mixed',
                               skf_cv=skf_cv,
                               blind_test_df=X_bts,
                               blind_test_target=y_bts)
ros_all, ros_CT, ros_BT = _split_and_rank(mm_skf_scoresD3)

# Write csv
ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv')
ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')

#%% RUS: Numerical + categorical
mm_skf_scoresD4 = MultModelsCl(input_df=X_rus,
                               target=y_rus,
                               var_type='mixed',
                               skf_cv=skf_cv,
                               blind_test_df=X_bts,
                               blind_test_target=y_bts)
rus_all, rus_CT, rus_BT = _split_and_rank(mm_skf_scoresD4)

# Write csv
rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv')
rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')

#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8 = MultModelsCl(input_df=X_rouC,
                               target=y_rouC,
                               var_type='mixed',
                               skf_cv=skf_cv,
                               blind_test_df=X_bts,
                               blind_test_target=y_bts)
rouC_all, rouC_CT, rouC_BT = _split_and_rank(mm_skf_scoresD8)

# Write csv
rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv')
rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')

View file

@ -1,203 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022
@author: tanu
"""
import os
# Gene/drug pair analysed by this script; both are handed to setvars() below
# and echoed in the run banner.
gene = 'alr'
drug = 'cycloserine'
#total_mtblineage_uc = 8
homedir = os.path.expanduser("~")
# Switch the working directory to the ML scripts folder under the user's home.
# NOTE(review): presumably required so the local modules imported below can be
# found — confirm.
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
# NOTE(review): the star-import is deliberately repeated around setvars().
# Presumably the first import exposes setvars() and the second one pulls in the
# names that setvars() creates (X, y, X_bts, outdir, ...) — confirm against
# ml_data_8020 before reordering or de-duplicating these lines.
from ml_data_8020 import *
setvars(gene,drug)
from ml_data_8020 import *
# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML
# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl
################################################################################
# Run banner: identifies the split, gene and drug in the console output.
print('\n#####################################################################\n'
      , '\nRunning ML analysis: 80/20 split'
      , '\nGene name:', gene
      , '\nDrug name:', drug)
#==================
# Specify outdir
#==================
# `outdir` is not defined in this script.
# NOTE(review): presumably it is provided by the star-import above — confirm.
# The directory is not created here; the to_csv() calls later in the script
# write into it.
outdir_ml = outdir + 'ml/tts_8020/'
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
# Sanity checks: report the size and class balance of the training and
# blind-test data, list each feature group, then abort if the feature-group
# sizes do not add up to the number of columns in X.
# NOTE(review): Counter, sys and every X_*/y* name used below are not imported
# or defined in this script; presumably they come from the star-import of the
# ml_data module — confirm.
print('\nSanity checks:'
      #, '\nML source data size:', x_features.shape
      , '\nTotal input features:', len(X.columns)
      , '\n'
      , '\nTraining data size:', X.shape
      , '\nTest data size:', X_bts.shape
      , '\n'
      , '\nTarget feature numbers (training data):', Counter(y)
      , '\nTarget features ratio (training data:', yc1_ratio
      , '\n'
      , '\nTarget feature numbers (test data):', Counter(y_bts)
      , '\nTarget features ratio (test data):', yc2_ratio
      , '\n\n#####################################################################\n')
print('\n================================================================\n')
# Structural feature group: total count followed by its three sub-lists.
print('Strucutral features (n):'
      , len(X_ssFN)
      , '\nThese are:'
      , '\nCommon stablity features:', X_stabilityN
      , '\nFoldX columns:', X_foldX_cols
      , '\nOther struc columns:', X_str
      , '\n================================================================\n')
print('AAindex features (n):'
      , len(X_aaindexFN)
      , '\nThese are:\n'
      , X_aaindexFN
      , '\n================================================================\n')
print('Evolutionary features (n):'
      , len(X_evolFN)
      , '\nThese are:\n'
      , X_evolFN
      , '\n================================================================\n')
# NOTE(review): the count shown is len(X_genomicFN) but the lists printed are
# X_genomic_mafor and X_genomic_linegae; presumably these two together make up
# X_genomicFN — confirm.
print('Genomic features (n):'
      , len(X_genomicFN)
      , '\nThese are:\n'
      , X_genomic_mafor, '\n'
      , X_genomic_linegae
      , '\n================================================================\n')
print('Categorical features (n):'
      , len(categorical_FN)
      , '\nThese are:\n'
      , categorical_FN
      , '\n================================================================\n')
# Guard: the five feature groups must account for every column of X; otherwise
# stop the script before any model is run.
if ( len(X.columns) == len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
    print('\nPass: No. of features match')
else:
    sys.exit('\nFail: Count of feature mismatch')
print('\n#####################################################################\n')
################################################################################
#==================
# Helper
#==================
def _split_and_rank(scores_dict):
    """Turn a MultModelsCl score dictionary into ranked result tables.

    Parameters
    ----------
    scores_dict : dict
        Value returned by MultModelsCl; it is passed straight to pd.DataFrame.

    Returns
    -------
    tuple
        (all_scores, cv_scores, bts_scores), where all_scores is the
        transposed score table, cv_scores holds its 'test_*' columns sorted by
        'test_mcc' (descending) and bts_scores holds its 'bts_*' columns
        sorted by 'bts_mcc' (descending).
    """
    all_scores = pd.DataFrame(scores_dict).T
    # filter() yields a derived frame; chaining sort_values() returns the
    # sorted result directly instead of sorting that derived frame in place
    # (which can trigger pandas' SettingWithCopyWarning).
    cv_scores = all_scores.filter(like='test_', axis=1).sort_values(
        by=['test_mcc'], ascending=False)
    bts_scores = all_scores.filter(like='bts_', axis=1).sort_values(
        by=['bts_mcc'], ascending=False)
    return all_scores, cv_scores, bts_scores

# NOTE: MultModelsCl takes the blind-test features as `blind_test_df`
# (renamed from `blind_test_input_df`); every call below uses the new keyword.

#==================
# Baseline models
#==================
mm_skf_scoresD = MultModelsCl(input_df=X,
                              target=y,
                              var_type='mixed',
                              skf_cv=skf_cv,
                              blind_test_df=X_bts,
                              blind_test_target=y_bts)
baseline_all, baseline_CT, baseline_BT = _split_and_rank(mm_skf_scoresD)

# Write csv
baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv')
baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')

#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7 = MultModelsCl(input_df=X_smnc,
                               target=y_smnc,
                               var_type='mixed',
                               skf_cv=skf_cv,
                               blind_test_df=X_bts,
                               blind_test_target=y_bts)
smnc_all, smnc_CT, smnc_BT = _split_and_rank(mm_skf_scoresD7)

# Write csv
smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv')
smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')

#%% ROS: Numerical + categorical
mm_skf_scoresD3 = MultModelsCl(input_df=X_ros,
                               target=y_ros,
                               var_type='mixed',
                               skf_cv=skf_cv,
                               blind_test_df=X_bts,
                               blind_test_target=y_bts)
ros_all, ros_CT, ros_BT = _split_and_rank(mm_skf_scoresD3)

# Write csv
ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv')
ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')

#%% RUS: Numerical + categorical
mm_skf_scoresD4 = MultModelsCl(input_df=X_rus,
                               target=y_rus,
                               var_type='mixed',
                               skf_cv=skf_cv,
                               blind_test_df=X_bts,
                               blind_test_target=y_bts)
rus_all, rus_CT, rus_BT = _split_and_rank(mm_skf_scoresD4)

# Write csv
rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv')
rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')

#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8 = MultModelsCl(input_df=X_rouC,
                               target=y_rouC,
                               var_type='mixed',
                               skf_cv=skf_cv,
                               blind_test_df=X_bts,
                               blind_test_target=y_bts)
rouC_all, rouC_CT, rouC_BT = _split_and_rank(mm_skf_scoresD8)

# Write csv
rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv')
rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')

View file

@ -1,203 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022
@author: tanu
"""
import os
# Gene/drug pair analysed by this script; both are handed to setvars() below
# and echoed in the run banner.
gene = 'alr'
drug = 'cycloserine'
#total_mtblineage_uc = 8
homedir = os.path.expanduser("~")
# Switch the working directory to the ML scripts folder under the user's home.
# NOTE(review): presumably required so the local modules imported below can be
# found — confirm.
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
# NOTE(review): the star-import is deliberately repeated around setvars().
# Presumably the first import exposes setvars() and the second one pulls in the
# names that setvars() creates (X, y, X_bts, outdir, ...) — confirm against
# ml_data_cd_7030 before reordering or de-duplicating these lines.
from ml_data_cd_7030 import *
setvars(gene,drug)
from ml_data_cd_7030 import *
# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML
# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl
################################################################################
# Run banner: identifies the dataset variant, split, gene and drug in the
# console output.
print('\n#####################################################################\n'
      , '\nRunning ML analysis [COMPLETE DATA]: 70/30 split'
      , '\nGene name:', gene
      , '\nDrug name:', drug)
#==================
# Specify outdir
#==================
# `outdir` is not defined in this script.
# NOTE(review): presumably it is provided by the star-import above — confirm.
# The directory is not created here; the to_csv() calls later in the script
# write into it.
outdir_ml = outdir + 'ml/tts_cd_7030/'
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
# Sanity checks: report the size and class balance of the training and
# blind-test data, list each feature group, then abort if the feature-group
# sizes do not add up to the number of columns in X.
# NOTE(review): Counter, sys and every X_*/y* name used below are not imported
# or defined in this script; presumably they come from the star-import of the
# ml_data module — confirm.
print('\nSanity checks:'
      #, '\nML source data size:', x_features.shape
      , '\nTotal input features:', len(X.columns)
      , '\n'
      , '\nTraining data size:', X.shape
      , '\nTest data size:', X_bts.shape
      , '\n'
      , '\nTarget feature numbers (training data):', Counter(y)
      , '\nTarget features ratio (training data:', yc1_ratio
      , '\n'
      , '\nTarget feature numbers (test data):', Counter(y_bts)
      , '\nTarget features ratio (test data):', yc2_ratio
      , '\n\n#####################################################################\n')
print('\n================================================================\n')
# Structural feature group: total count followed by its three sub-lists.
print('Strucutral features (n):'
      , len(X_ssFN)
      , '\nThese are:'
      , '\nCommon stablity features:', X_stabilityN
      , '\nFoldX columns:', X_foldX_cols
      , '\nOther struc columns:', X_str
      , '\n================================================================\n')
print('AAindex features (n):'
      , len(X_aaindexFN)
      , '\nThese are:\n'
      , X_aaindexFN
      , '\n================================================================\n')
print('Evolutionary features (n):'
      , len(X_evolFN)
      , '\nThese are:\n'
      , X_evolFN
      , '\n================================================================\n')
# NOTE(review): the count shown is len(X_genomicFN) but the lists printed are
# X_genomic_mafor and X_genomic_linegae; presumably these two together make up
# X_genomicFN — confirm.
print('Genomic features (n):'
      , len(X_genomicFN)
      , '\nThese are:\n'
      , X_genomic_mafor, '\n'
      , X_genomic_linegae
      , '\n================================================================\n')
print('Categorical features (n):'
      , len(categorical_FN)
      , '\nThese are:\n'
      , categorical_FN
      , '\n================================================================\n')
# Guard: the five feature groups must account for every column of X; otherwise
# stop the script before any model is run.
if ( len(X.columns) == len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
    print('\nPass: No. of features match')
else:
    sys.exit('\nFail: Count of feature mismatch')
print('\n#####################################################################\n')
################################################################################
#==================
# Helper
#==================
def _split_and_rank(scores_dict):
    """Turn a MultModelsCl score dictionary into ranked result tables.

    Parameters
    ----------
    scores_dict : dict
        Value returned by MultModelsCl; it is passed straight to pd.DataFrame.

    Returns
    -------
    tuple
        (all_scores, cv_scores, bts_scores), where all_scores is the
        transposed score table, cv_scores holds its 'test_*' columns sorted by
        'test_mcc' (descending) and bts_scores holds its 'bts_*' columns
        sorted by 'bts_mcc' (descending).
    """
    all_scores = pd.DataFrame(scores_dict).T
    # filter() yields a derived frame; chaining sort_values() returns the
    # sorted result directly instead of sorting that derived frame in place
    # (which can trigger pandas' SettingWithCopyWarning).
    cv_scores = all_scores.filter(like='test_', axis=1).sort_values(
        by=['test_mcc'], ascending=False)
    bts_scores = all_scores.filter(like='bts_', axis=1).sort_values(
        by=['bts_mcc'], ascending=False)
    return all_scores, cv_scores, bts_scores

# NOTE: MultModelsCl takes the blind-test features as `blind_test_df`
# (renamed from `blind_test_input_df`); every call below uses the new keyword.

#==================
# Baseline models
#==================
mm_skf_scoresD = MultModelsCl(input_df=X,
                              target=y,
                              var_type='mixed',
                              skf_cv=skf_cv,
                              blind_test_df=X_bts,
                              blind_test_target=y_bts)
baseline_all, baseline_CT, baseline_BT = _split_and_rank(mm_skf_scoresD)

# Write csv
baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv')
baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')

#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7 = MultModelsCl(input_df=X_smnc,
                               target=y_smnc,
                               var_type='mixed',
                               skf_cv=skf_cv,
                               blind_test_df=X_bts,
                               blind_test_target=y_bts)
smnc_all, smnc_CT, smnc_BT = _split_and_rank(mm_skf_scoresD7)

# Write csv
smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv')
smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')

#%% ROS: Numerical + categorical
mm_skf_scoresD3 = MultModelsCl(input_df=X_ros,
                               target=y_ros,
                               var_type='mixed',
                               skf_cv=skf_cv,
                               blind_test_df=X_bts,
                               blind_test_target=y_bts)
ros_all, ros_CT, ros_BT = _split_and_rank(mm_skf_scoresD3)

# Write csv
ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv')
ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')

#%% RUS: Numerical + categorical
mm_skf_scoresD4 = MultModelsCl(input_df=X_rus,
                               target=y_rus,
                               var_type='mixed',
                               skf_cv=skf_cv,
                               blind_test_df=X_bts,
                               blind_test_target=y_bts)
rus_all, rus_CT, rus_BT = _split_and_rank(mm_skf_scoresD4)

# Write csv
rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv')
rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')

#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8 = MultModelsCl(input_df=X_rouC,
                               target=y_rouC,
                               var_type='mixed',
                               skf_cv=skf_cv,
                               blind_test_df=X_bts,
                               blind_test_target=y_bts)
rouC_all, rouC_CT, rouC_BT = _split_and_rank(mm_skf_scoresD8)

# Write csv
rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv')
rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')

View file

@ -1,203 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022
@author: tanu
"""
import os
# Gene/drug pair analysed by this script; both are handed to setvars() below
# and echoed in the run banner.
gene = 'alr'
drug = 'cycloserine'
#total_mtblineage_uc = 8
homedir = os.path.expanduser("~")
# Switch the working directory to the ML scripts folder under the user's home.
# NOTE(review): presumably required so the local modules imported below can be
# found — confirm.
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
# NOTE(review): the star-import is deliberately repeated around setvars().
# Presumably the first import exposes setvars() and the second one pulls in the
# names that setvars() creates (X, y, X_bts, outdir, ...) — confirm against
# ml_data_cd_8020 before reordering or de-duplicating these lines.
from ml_data_cd_8020 import *
setvars(gene,drug)
from ml_data_cd_8020 import *
# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML
# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl
################################################################################
# Run banner: identifies the dataset variant, split, gene and drug in the
# console output.
print('\n#####################################################################\n'
      , '\nRunning ML analysis [COMPLETE DATA]: 80/20 split'
      , '\nGene name:', gene
      , '\nDrug name:', drug)
#==================
# Specify outdir
#==================
# `outdir` is not defined in this script.
# NOTE(review): presumably it is provided by the star-import above — confirm.
# The directory is not created here; the to_csv() calls later in the script
# write into it.
outdir_ml = outdir + 'ml/tts_cd_8020/'
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
# Sanity checks: report the size and class balance of the training and
# blind-test data, list each feature group, then abort if the feature-group
# sizes do not add up to the number of columns in X.
# NOTE(review): Counter, sys and every X_*/y* name used below are not imported
# or defined in this script; presumably they come from the star-import of the
# ml_data module — confirm.
print('\nSanity checks:'
      #, '\nML source data size:', x_features.shape
      , '\nTotal input features:', len(X.columns)
      , '\n'
      , '\nTraining data size:', X.shape
      , '\nTest data size:', X_bts.shape
      , '\n'
      , '\nTarget feature numbers (training data):', Counter(y)
      , '\nTarget features ratio (training data:', yc1_ratio
      , '\n'
      , '\nTarget feature numbers (test data):', Counter(y_bts)
      , '\nTarget features ratio (test data):', yc2_ratio
      , '\n\n#####################################################################\n')
print('\n================================================================\n')
# Structural feature group: total count followed by its three sub-lists.
print('Strucutral features (n):'
      , len(X_ssFN)
      , '\nThese are:'
      , '\nCommon stablity features:', X_stabilityN
      , '\nFoldX columns:', X_foldX_cols
      , '\nOther struc columns:', X_str
      , '\n================================================================\n')
print('AAindex features (n):'
      , len(X_aaindexFN)
      , '\nThese are:\n'
      , X_aaindexFN
      , '\n================================================================\n')
print('Evolutionary features (n):'
      , len(X_evolFN)
      , '\nThese are:\n'
      , X_evolFN
      , '\n================================================================\n')
# NOTE(review): the count shown is len(X_genomicFN) but the lists printed are
# X_genomic_mafor and X_genomic_linegae; presumably these two together make up
# X_genomicFN — confirm.
print('Genomic features (n):'
      , len(X_genomicFN)
      , '\nThese are:\n'
      , X_genomic_mafor, '\n'
      , X_genomic_linegae
      , '\n================================================================\n')
print('Categorical features (n):'
      , len(categorical_FN)
      , '\nThese are:\n'
      , categorical_FN
      , '\n================================================================\n')
# Guard: the five feature groups must account for every column of X; otherwise
# stop the script before any model is run.
if ( len(X.columns) == len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
    print('\nPass: No. of features match')
else:
    sys.exit('\nFail: Count of feature mismatch')
print('\n#####################################################################\n')
################################################################################
#==================
# Helper
#==================
def _split_and_rank(scores_dict):
    """Turn a MultModelsCl score dictionary into ranked result tables.

    Parameters
    ----------
    scores_dict : dict
        Value returned by MultModelsCl; it is passed straight to pd.DataFrame.

    Returns
    -------
    tuple
        (all_scores, cv_scores, bts_scores), where all_scores is the
        transposed score table, cv_scores holds its 'test_*' columns sorted by
        'test_mcc' (descending) and bts_scores holds its 'bts_*' columns
        sorted by 'bts_mcc' (descending).
    """
    all_scores = pd.DataFrame(scores_dict).T
    # filter() yields a derived frame; chaining sort_values() returns the
    # sorted result directly instead of sorting that derived frame in place
    # (which can trigger pandas' SettingWithCopyWarning).
    cv_scores = all_scores.filter(like='test_', axis=1).sort_values(
        by=['test_mcc'], ascending=False)
    bts_scores = all_scores.filter(like='bts_', axis=1).sort_values(
        by=['bts_mcc'], ascending=False)
    return all_scores, cv_scores, bts_scores

# NOTE: MultModelsCl takes the blind-test features as `blind_test_df`
# (renamed from `blind_test_input_df`); every call below uses the new keyword.

#==================
# Baseline models
#==================
mm_skf_scoresD = MultModelsCl(input_df=X,
                              target=y,
                              var_type='mixed',
                              skf_cv=skf_cv,
                              blind_test_df=X_bts,
                              blind_test_target=y_bts)
baseline_all, baseline_CT, baseline_BT = _split_and_rank(mm_skf_scoresD)

# Write csv
baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv')
baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')

#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7 = MultModelsCl(input_df=X_smnc,
                               target=y_smnc,
                               var_type='mixed',
                               skf_cv=skf_cv,
                               blind_test_df=X_bts,
                               blind_test_target=y_bts)
smnc_all, smnc_CT, smnc_BT = _split_and_rank(mm_skf_scoresD7)

# Write csv
smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv')
smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')

#%% ROS: Numerical + categorical
mm_skf_scoresD3 = MultModelsCl(input_df=X_ros,
                               target=y_ros,
                               var_type='mixed',
                               skf_cv=skf_cv,
                               blind_test_df=X_bts,
                               blind_test_target=y_bts)
ros_all, ros_CT, ros_BT = _split_and_rank(mm_skf_scoresD3)

# Write csv
ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv')
ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')

#%% RUS: Numerical + categorical
mm_skf_scoresD4 = MultModelsCl(input_df=X_rus,
                               target=y_rus,
                               var_type='mixed',
                               skf_cv=skf_cv,
                               blind_test_df=X_bts,
                               blind_test_target=y_bts)
rus_all, rus_CT, rus_BT = _split_and_rank(mm_skf_scoresD4)

# Write csv
rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv')
rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')

#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8 = MultModelsCl(input_df=X_rouC,
                               target=y_rouC,
                               var_type='mixed',
                               skf_cv=skf_cv,
                               blind_test_df=X_bts,
                               blind_test_target=y_bts)
rouC_all, rouC_CT, rouC_BT = _split_and_rank(mm_skf_scoresD8)

# Write csv
rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv')
rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')

View file

@ -1,203 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022
@author: tanu
"""
import os
# Gene/drug pair analysed by this script; both are handed to setvars() below
# and echoed in the run banner.
gene = 'alr'
drug = 'cycloserine'
#total_mtblineage_uc = 8
homedir = os.path.expanduser("~")
# Switch the working directory to the ML scripts folder under the user's home.
# NOTE(review): presumably required so the local modules imported below can be
# found — confirm.
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
# NOTE(review): the star-import is deliberately repeated around setvars().
# Presumably the first import exposes setvars() and the second one pulls in the
# names that setvars() creates (X, y, X_bts, outdir, ...) — confirm against
# ml_data_cd_sl before reordering or de-duplicating these lines.
from ml_data_cd_sl import *
setvars(gene,drug)
from ml_data_cd_sl import *
# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML
# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl
################################################################################
# Run banner: identifies the dataset variant, split, gene and drug in the
# console output.
# NOTE(review): the banner says "70/30 split" although the data come from
# ml_data_cd_sl and results go to 'ml/tts_cd_sl/'; possibly a copy-paste
# leftover — confirm which split this script actually uses.
print('\n#####################################################################\n'
      , '\nRunning ML analysis [COMPLETE DATA]: 70/30 split'
      , '\nGene name:', gene
      , '\nDrug name:', drug)
#==================
# Specify outdir
#==================
# `outdir` is not defined in this script.
# NOTE(review): presumably it is provided by the star-import above — confirm.
# The directory is not created here; the to_csv() calls later in the script
# write into it.
outdir_ml = outdir + 'ml/tts_cd_sl/'
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
# Sanity checks: report the size and class balance of the training and
# blind-test data, list each feature group, then abort if the feature-group
# sizes do not add up to the number of columns in X.
# NOTE(review): Counter, sys and every X_*/y* name used below are not imported
# or defined in this script; presumably they come from the star-import of the
# ml_data module — confirm.
print('\nSanity checks:'
      #, '\nML source data size:', x_features.shape
      , '\nTotal input features:', len(X.columns)
      , '\n'
      , '\nTraining data size:', X.shape
      , '\nTest data size:', X_bts.shape
      , '\n'
      , '\nTarget feature numbers (training data):', Counter(y)
      , '\nTarget features ratio (training data:', yc1_ratio
      , '\n'
      , '\nTarget feature numbers (test data):', Counter(y_bts)
      , '\nTarget features ratio (test data):', yc2_ratio
      , '\n\n#####################################################################\n')
print('\n================================================================\n')
# Structural feature group: total count followed by its three sub-lists.
print('Strucutral features (n):'
      , len(X_ssFN)
      , '\nThese are:'
      , '\nCommon stablity features:', X_stabilityN
      , '\nFoldX columns:', X_foldX_cols
      , '\nOther struc columns:', X_str
      , '\n================================================================\n')
print('AAindex features (n):'
      , len(X_aaindexFN)
      , '\nThese are:\n'
      , X_aaindexFN
      , '\n================================================================\n')
print('Evolutionary features (n):'
      , len(X_evolFN)
      , '\nThese are:\n'
      , X_evolFN
      , '\n================================================================\n')
# NOTE(review): the count shown is len(X_genomicFN) but the lists printed are
# X_genomic_mafor and X_genomic_linegae; presumably these two together make up
# X_genomicFN — confirm.
print('Genomic features (n):'
      , len(X_genomicFN)
      , '\nThese are:\n'
      , X_genomic_mafor, '\n'
      , X_genomic_linegae
      , '\n================================================================\n')
print('Categorical features (n):'
      , len(categorical_FN)
      , '\nThese are:\n'
      , categorical_FN
      , '\n================================================================\n')
# Guard: the five feature groups must account for every column of X; otherwise
# stop the script before any model is run.
if ( len(X.columns) == len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
    print('\nPass: No. of features match')
else:
    sys.exit('\nFail: Count of feature mismatch')
print('\n#####################################################################\n')
################################################################################
#==================
# Baseline models
#==================
def _score_and_save(input_df, target, tag):
    """Score one training set with MultModelsCl and write its result tables.

    Parameters
    ----------
    input_df, target
        Training features and labels handed to MultModelsCl.
    tag : str
        Label of the training-set variant; used in the output file names
        (<gene>_<tag>_CT_allF.csv and <gene>_<tag>_BT_allF.csv in outdir_ml).

    Returns
    -------
    tuple
        (scores returned by MultModelsCl, transposed score frame,
         'test_*' columns sorted by test_mcc, 'bts_*' columns sorted by bts_mcc).

    Reads skf_cv, X_bts, y_bts, outdir_ml and gene from module scope.
    """
    # MultModelsCl names this parameter blind_test_df (renamed from
    # blind_test_input_df); the old keyword raises TypeError.
    scores = MultModelsCl(input_df = input_df
                          , target = target
                          , var_type = 'mixed'
                          , skf_cv = skf_cv
                          , blind_test_df = X_bts
                          , blind_test_target = y_bts)
    all_df = pd.DataFrame(scores).T
    # Sort into new frames instead of sorting the filtered frames in place.
    ct_df = all_df.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
    bt_df = all_df.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
    # Write csv
    ct_df.to_csv(outdir_ml + gene.lower() + '_' + tag + '_CT_allF.csv')
    bt_df.to_csv(outdir_ml + gene.lower() + '_' + tag + '_BT_allF.csv')
    return scores, all_df, ct_df, bt_df

# Baseline models: training data as is
mm_skf_scoresD, baseline_all, baseline_CT, baseline_BT = _score_and_save(X, y, 'baseline')
#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7, smnc_all, smnc_CT, smnc_BT = _score_and_save(X_smnc, y_smnc, 'smnc')
#%% ROS: Numerical + categorical
mm_skf_scoresD3, ros_all, ros_CT, ros_BT = _score_and_save(X_ros, y_ros, 'ros')
#%% RUS: Numerical + categorical
mm_skf_scoresD4, rus_all, rus_CT, rus_BT = _score_and_save(X_rus, y_rus, 'rus')
#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8, rouC_all, rouC_CT, rouC_BT = _score_and_save(X_rouC, y_rouC, 'rouC')

View file

@ -1,210 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022
@author: tanu
"""
import os
# Gene / drug pair analysed by this script; both are passed to setvars() below.
gene = 'alr'
drug = 'cycloserine'
#total_mtblineage_uc = 8
homedir = os.path.expanduser("~")
# Switch to the ml scripts directory under the user's home.
# NOTE(review): presumably needed so the local modules imported below resolve - confirm.
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
#---------------------------
# Version 1: no AAindex
#from UQ_ML_data import *
#setvars(gene,drug)
#from UQ_ML_data import *
#---------------------------
# NOTE(review): star-import, setvars(), star-import again. Presumably the second
# import re-binds names that setvars() populates for this gene/drug, so keep this
# exact order - confirm against ml_data.
from ml_data import *
setvars(gene,drug)
from ml_data import *
# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML
# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl
############################################################################
# Banner identifying this run variant and the gene/drug being processed.
print('\n#####################################################################\n'
, '\nRunning ML analysis: UQ [without AA index but with active site annotations]'
, '\nGene name:', gene
, '\nDrug name:', drug)
#==================
# Specify outdir
#==================
# outdir is not defined in this script; presumably it comes from the
# star-import above - confirm.
outdir_ml = outdir + 'ml/uq_v1/'
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
# Summarise the training and blind-test data, list every feature group, then
# confirm that the groups together account for all columns of X.
_rule = '\n================================================================\n'

print('\nSanity checks:',
      '\nTotal input features:', len(X.columns),
      '\n',
      '\nTraining data size:', X.shape,
      '\nTest data size:', X_bts.shape,
      '\n',
      '\nTarget feature numbers (training data):', Counter(y),
      '\nTarget features ratio (training data:', yc1_ratio,
      '\n',
      '\nTarget feature numbers (test data):', Counter(y_bts),
      '\nTarget features ratio (test data):', yc2_ratio,
      '\n\n#####################################################################\n')

print(_rule)
print('Strucutral features (n):', len(X_ssFN),
      '\nThese are:',
      '\nCommon stablity features:', X_stabilityN,
      '\nFoldX columns:', X_foldX_cols,
      '\nOther struc columns:', X_str,
      _rule)
# AAindex features are left out of this variant: they are neither listed
# nor included in the column-count check below.
print('Evolutionary features (n):', len(X_evolFN),
      '\nThese are:\n', X_evolFN,
      _rule)
# The genomic count comes from X_genomicFN, but the names are listed from two sub-lists.
print('Genomic features (n):', len(X_genomicFN),
      '\nThese are:\n', X_genomic_mafor, '\n', X_genomic_linegae,
      _rule)
print('Categorical features (n):', len(categorical_FN),
      '\nThese are:\n', categorical_FN,
      _rule)

# Abort the script when the group sizes do not add up to the column count of X.
_n_grouped = sum(map(len, (X_ssFN, X_evolFN, X_genomicFN, categorical_FN)))
if len(X.columns) != _n_grouped:
    sys.exit('\nFail: Count of feature mismatch')
print('\nPass: No. of features match')
print('\n#####################################################################\n')
###############################################################################
#==================
# Baseline models
#==================
def _score_and_save(input_df, target, tag):
    """Score one training set with MultModelsCl and write its result tables.

    Parameters
    ----------
    input_df, target
        Training features and labels handed to MultModelsCl.
    tag : str
        Label of the training-set variant; used in the output file names
        (<gene>_<tag>_CT_allF.csv and <gene>_<tag>_BT_allF.csv in outdir_ml).

    Returns
    -------
    tuple
        (scores returned by MultModelsCl, transposed score frame,
         'test_*' columns sorted by test_mcc, 'bts_*' columns sorted by bts_mcc).

    Reads skf_cv, X_bts, y_bts, outdir_ml and gene from module scope.
    """
    # MultModelsCl names this parameter blind_test_df (renamed from
    # blind_test_input_df); the old keyword raises TypeError.
    scores = MultModelsCl(input_df = input_df
                          , target = target
                          , var_type = 'mixed'
                          , skf_cv = skf_cv
                          , blind_test_df = X_bts
                          , blind_test_target = y_bts)
    all_df = pd.DataFrame(scores).T
    # Sort into new frames instead of sorting the filtered frames in place.
    ct_df = all_df.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
    bt_df = all_df.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
    # Write csv
    ct_df.to_csv(outdir_ml + gene.lower() + '_' + tag + '_CT_allF.csv')
    bt_df.to_csv(outdir_ml + gene.lower() + '_' + tag + '_BT_allF.csv')
    return scores, all_df, ct_df, bt_df

# Baseline models: training data as is
mm_skf_scoresD, baseline_all, baseline_CT, baseline_BT = _score_and_save(X, y, 'baseline')
#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7, smnc_all, smnc_CT, smnc_BT = _score_and_save(X_smnc, y_smnc, 'smnc')
#%% ROS: Numerical + categorical
mm_skf_scoresD3, ros_all, ros_CT, ros_BT = _score_and_save(X_ros, y_ros, 'ros')
#%% RUS: Numerical + categorical
mm_skf_scoresD4, rus_all, rus_CT, rus_BT = _score_and_save(X_rus, y_rus, 'rus')
#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8, rouC_all, rouC_CT, rouC_BT = _score_and_save(X_rouC, y_rouC, 'rouC')

View file

@ -1,202 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022
@author: tanu
"""
import os
# Gene / drug pair analysed by this script; both are passed to setvars() below.
gene = 'alr'
drug = 'cycloserine'
#total_mtblineage_uc = 8
homedir = os.path.expanduser("~")
# Switch to the ml scripts directory under the user's home.
# NOTE(review): presumably needed so the local modules imported below resolve - confirm.
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
# NOTE(review): star-import, setvars(), star-import again. Presumably the second
# import re-binds names that setvars() populates for this gene/drug, so keep this
# exact order - confirm against ml_data_orig.
from ml_data_orig import *
setvars(gene,drug)
from ml_data_orig import *
# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML
# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl
############################################################################
# Banner identifying this run variant and the gene/drug being processed.
print('\n#####################################################################\n'
, '\nRunning ML analysis: ORIGINAL'
, '\nGene name:', gene
, '\nDrug name:', drug)
#==================
# Specify outdir
#==================
# outdir is not defined in this script; presumably it comes from the
# star-import above - confirm.
outdir_ml = outdir + 'ml/tts_orig/'
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
# Summarise the training and blind-test data, list every feature group, then
# confirm that the groups together account for all columns of X.
_rule = '\n================================================================\n'

print('\nSanity checks:',
      '\nTotal input features:', len(X.columns),
      '\n',
      '\nTraining data size:', X.shape,
      '\nTest data size:', X_bts.shape,
      '\n',
      '\nTarget feature numbers (training data):', Counter(y),
      '\nTarget features ratio (training data:', yc1_ratio,
      '\n',
      '\nTarget feature numbers (test data):', Counter(y_bts),
      '\nTarget features ratio (test data):', yc2_ratio,
      '\n\n#####################################################################\n')

print(_rule)
print('Strucutral features (n):', len(X_ssFN),
      '\nThese are:',
      '\nCommon stablity features:', X_stabilityN,
      '\nFoldX columns:', X_foldX_cols,
      '\nOther struc columns:', X_str,
      _rule)

# These two groups share one report layout: heading, count, then the names.
for _heading, _names in (('AAindex features (n):', X_aaindexFN),
                         ('Evolutionary features (n):', X_evolFN)):
    print(_heading, len(_names), '\nThese are:\n', _names, _rule)

# The genomic count comes from X_genomicFN, but the names are listed from two sub-lists.
print('Genomic features (n):', len(X_genomicFN),
      '\nThese are:\n', X_genomic_mafor, '\n', X_genomic_linegae,
      _rule)
print('Categorical features (n):', len(categorical_FN),
      '\nThese are:\n', categorical_FN,
      _rule)

# Abort the script when the group sizes do not add up to the column count of X.
_n_grouped = sum(map(len, (X_ssFN, X_aaindexFN, X_evolFN, X_genomicFN, categorical_FN)))
if len(X.columns) != _n_grouped:
    sys.exit('\nFail: Count of feature mismatch')
print('\nPass: No. of features match')
print('\n#####################################################################\n')
###############################################################################
#==================
# Baseline models
#==================
def _score_and_save(input_df, target, tag):
    """Score one training set with MultModelsCl and write its result tables.

    Parameters
    ----------
    input_df, target
        Training features and labels handed to MultModelsCl.
    tag : str
        Label of the training-set variant; used in the output file names
        (<gene>_<tag>_CT_allF.csv and <gene>_<tag>_BT_allF.csv in outdir_ml).

    Returns
    -------
    tuple
        (scores returned by MultModelsCl, transposed score frame,
         'test_*' columns sorted by test_mcc, 'bts_*' columns sorted by bts_mcc).

    Reads skf_cv, X_bts, y_bts, outdir_ml and gene from module scope.
    """
    # MultModelsCl names this parameter blind_test_df (renamed from
    # blind_test_input_df); the old keyword raises TypeError.
    scores = MultModelsCl(input_df = input_df
                          , target = target
                          , var_type = 'mixed'
                          , skf_cv = skf_cv
                          , blind_test_df = X_bts
                          , blind_test_target = y_bts)
    all_df = pd.DataFrame(scores).T
    # Sort into new frames instead of sorting the filtered frames in place.
    ct_df = all_df.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
    bt_df = all_df.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
    # Write csv
    ct_df.to_csv(outdir_ml + gene.lower() + '_' + tag + '_CT_allF.csv')
    bt_df.to_csv(outdir_ml + gene.lower() + '_' + tag + '_BT_allF.csv')
    return scores, all_df, ct_df, bt_df

# Baseline models: training data as is
mm_skf_scoresD, baseline_all, baseline_CT, baseline_BT = _score_and_save(X, y, 'baseline')
#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7, smnc_all, smnc_CT, smnc_BT = _score_and_save(X_smnc, y_smnc, 'smnc')
#%% ROS: Numerical + categorical
mm_skf_scoresD3, ros_all, ros_CT, ros_BT = _score_and_save(X_ros, y_ros, 'ros')
#%% RUS: Numerical + categorical
mm_skf_scoresD4, rus_all, rus_CT, rus_BT = _score_and_save(X_rus, y_rus, 'rus')
#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8, rouC_all, rouC_CT, rouC_BT = _score_and_save(X_rouC, y_rouC, 'rouC')

View file

@ -1,202 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022
@author: tanu
"""
import os
# Gene / drug pair analysed by this script; both are passed to setvars() below.
gene = 'alr'
drug = 'cycloserine'
#total_mtblineage_uc = 8
homedir = os.path.expanduser("~")
# Switch to the ml scripts directory under the user's home.
# NOTE(review): presumably needed so the local modules imported below resolve - confirm.
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
# NOTE(review): star-import, setvars(), star-import again. Presumably the second
# import re-binds names that setvars() populates for this gene/drug, so keep this
# exact order - confirm against ml_data_rt.
from ml_data_rt import *
setvars(gene,drug)
from ml_data_rt import *
# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML
# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl
############################################################################
# Banner identifying this run variant and the gene/drug being processed.
print('\n#####################################################################\n'
, '\nRunning ML analysis: REVERSE training\n'
, '\nGene name:', gene
, '\nDrug name:', drug)
#==================
# Specify outdir
#==================
# outdir is not defined in this script; presumably it comes from the
# star-import above - confirm.
outdir_ml = outdir + 'ml/tts_rt/'
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
# Summarise the training and blind-test data, list every feature group, then
# confirm that the groups together account for all columns of X.
_rule = '\n================================================================\n'

print('\nSanity checks:',
      '\nTotal input features:', len(X.columns),
      '\n',
      '\nTraining data size:', X.shape,
      '\nTest data size:', X_bts.shape,
      '\n',
      '\nTarget feature numbers (training data):', Counter(y),
      '\nTarget features ratio (training data:', yc1_ratio,
      '\n',
      '\nTarget feature numbers (test data):', Counter(y_bts),
      '\nTarget features ratio (test data):', yc2_ratio,
      '\n\n#####################################################################\n')

print(_rule)
print('Strucutral features (n):', len(X_ssFN),
      '\nThese are:',
      '\nCommon stablity features:', X_stabilityN,
      '\nFoldX columns:', X_foldX_cols,
      '\nOther struc columns:', X_str,
      _rule)

# These two groups share one report layout: heading, count, then the names.
for _heading, _names in (('AAindex features (n):', X_aaindexFN),
                         ('Evolutionary features (n):', X_evolFN)):
    print(_heading, len(_names), '\nThese are:\n', _names, _rule)

# The genomic count comes from X_genomicFN, but the names are listed from two sub-lists.
print('Genomic features (n):', len(X_genomicFN),
      '\nThese are:\n', X_genomic_mafor, '\n', X_genomic_linegae,
      _rule)
print('Categorical features (n):', len(categorical_FN),
      '\nThese are:\n', categorical_FN,
      _rule)

# Abort the script when the group sizes do not add up to the column count of X.
_n_grouped = sum(map(len, (X_ssFN, X_aaindexFN, X_evolFN, X_genomicFN, categorical_FN)))
if len(X.columns) != _n_grouped:
    sys.exit('\nFail: Count of feature mismatch')
print('\nPass: No. of features match')
print('\n#####################################################################\n')
###############################################################################
#==================
# Baseline models
#==================
def _score_and_save(input_df, target, tag):
    """Score one training set with MultModelsCl and write its result tables.

    Parameters
    ----------
    input_df, target
        Training features and labels handed to MultModelsCl.
    tag : str
        Label of the training-set variant; used in the output file names
        (<gene>_<tag>_CT_allF.csv and <gene>_<tag>_BT_allF.csv in outdir_ml).

    Returns
    -------
    tuple
        (scores returned by MultModelsCl, transposed score frame,
         'test_*' columns sorted by test_mcc, 'bts_*' columns sorted by bts_mcc).

    Reads skf_cv, X_bts, y_bts, outdir_ml and gene from module scope.
    """
    # MultModelsCl names this parameter blind_test_df (renamed from
    # blind_test_input_df); the old keyword raises TypeError.
    scores = MultModelsCl(input_df = input_df
                          , target = target
                          , var_type = 'mixed'
                          , skf_cv = skf_cv
                          , blind_test_df = X_bts
                          , blind_test_target = y_bts)
    all_df = pd.DataFrame(scores).T
    # Sort into new frames instead of sorting the filtered frames in place.
    ct_df = all_df.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
    bt_df = all_df.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
    # Write csv
    ct_df.to_csv(outdir_ml + gene.lower() + '_' + tag + '_CT_allF.csv')
    bt_df.to_csv(outdir_ml + gene.lower() + '_' + tag + '_BT_allF.csv')
    return scores, all_df, ct_df, bt_df

# Baseline models: training data as is
mm_skf_scoresD, baseline_all, baseline_CT, baseline_BT = _score_and_save(X, y, 'baseline')
#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7, smnc_all, smnc_CT, smnc_BT = _score_and_save(X_smnc, y_smnc, 'smnc')
#%% ROS: Numerical + categorical
mm_skf_scoresD3, ros_all, ros_CT, ros_BT = _score_and_save(X_ros, y_ros, 'ros')
#%% RUS: Numerical + categorical
mm_skf_scoresD4, rus_all, rus_CT, rus_BT = _score_and_save(X_rus, y_rus, 'rus')
#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8, rouC_all, rouC_CT, rouC_BT = _score_and_save(X_rouC, y_rouC, 'rouC')

View file

@ -1,203 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022
@author: tanu
"""
import os
# Gene / drug pair analysed by this script; both are passed to setvars() below.
gene = 'alr'
drug = 'cycloserine'
#total_mtblineage_uc = 8
homedir = os.path.expanduser("~")
# Switch to the ml scripts directory under the user's home.
# NOTE(review): presumably needed so the local modules imported below resolve - confirm.
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
# NOTE(review): star-import, setvars(), star-import again. Presumably the second
# import re-binds names that setvars() populates for this gene/drug, so keep this
# exact order - confirm against ml_data_sl.
from ml_data_sl import *
setvars(gene,drug)
from ml_data_sl import *
# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML
# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl
################################################################################
# Banner identifying this run variant and the gene/drug being processed.
print('\n#####################################################################\n'
, '\nRunning ML analysis: scaling law split'
, '\nGene name:', gene
, '\nDrug name:', drug)
#==================
# Specify outdir
#==================
# outdir is not defined in this script; presumably it comes from the
# star-import above - confirm.
outdir_ml = outdir + 'ml/tts_sl/'
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
# Summarise the training and blind-test data, list every feature group, then
# confirm that the groups together account for all columns of X.
_rule = '\n================================================================\n'

print('\nSanity checks:',
      #'\nML source data size:', x_features.shape,
      '\nTotal input features:', len(X.columns),
      '\n',
      '\nTraining data size:', X.shape,
      '\nTest data size:', X_bts.shape,
      '\n',
      '\nTarget feature numbers (training data):', Counter(y),
      '\nTarget features ratio (training data:', yc1_ratio,
      '\n',
      '\nTarget feature numbers (test data):', Counter(y_bts),
      '\nTarget features ratio (test data):', yc2_ratio,
      '\n\n#####################################################################\n')

print(_rule)
print('Strucutral features (n):', len(X_ssFN),
      '\nThese are:',
      '\nCommon stablity features:', X_stabilityN,
      '\nFoldX columns:', X_foldX_cols,
      '\nOther struc columns:', X_str,
      _rule)

# These two groups share one report layout: heading, count, then the names.
for _heading, _names in (('AAindex features (n):', X_aaindexFN),
                         ('Evolutionary features (n):', X_evolFN)):
    print(_heading, len(_names), '\nThese are:\n', _names, _rule)

# The genomic count comes from X_genomicFN, but the names are listed from two sub-lists.
print('Genomic features (n):', len(X_genomicFN),
      '\nThese are:\n', X_genomic_mafor, '\n', X_genomic_linegae,
      _rule)
print('Categorical features (n):', len(categorical_FN),
      '\nThese are:\n', categorical_FN,
      _rule)

# Abort the script when the group sizes do not add up to the column count of X.
_n_grouped = sum(map(len, (X_ssFN, X_aaindexFN, X_evolFN, X_genomicFN, categorical_FN)))
if len(X.columns) != _n_grouped:
    sys.exit('\nFail: Count of feature mismatch')
print('\nPass: No. of features match')
print('\n#####################################################################\n')
################################################################################
#==================
# Baseline models
#==================
def _score_and_save(input_df, target, tag):
    """Score one training set with MultModelsCl and write its result tables.

    Parameters
    ----------
    input_df, target
        Training features and labels handed to MultModelsCl.
    tag : str
        Label of the training-set variant; used in the output file names
        (<gene>_<tag>_CT_allF.csv and <gene>_<tag>_BT_allF.csv in outdir_ml).

    Returns
    -------
    tuple
        (scores returned by MultModelsCl, transposed score frame,
         'test_*' columns sorted by test_mcc, 'bts_*' columns sorted by bts_mcc).

    Reads skf_cv, X_bts, y_bts, outdir_ml and gene from module scope.
    """
    # MultModelsCl names this parameter blind_test_df (renamed from
    # blind_test_input_df); the old keyword raises TypeError.
    scores = MultModelsCl(input_df = input_df
                          , target = target
                          , var_type = 'mixed'
                          , skf_cv = skf_cv
                          , blind_test_df = X_bts
                          , blind_test_target = y_bts)
    all_df = pd.DataFrame(scores).T
    # Sort into new frames instead of sorting the filtered frames in place.
    ct_df = all_df.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
    bt_df = all_df.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
    # Write csv
    ct_df.to_csv(outdir_ml + gene.lower() + '_' + tag + '_CT_allF.csv')
    bt_df.to_csv(outdir_ml + gene.lower() + '_' + tag + '_BT_allF.csv')
    return scores, all_df, ct_df, bt_df

# Baseline models: training data as is
mm_skf_scoresD, baseline_all, baseline_CT, baseline_BT = _score_and_save(X, y, 'baseline')
#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7, smnc_all, smnc_CT, smnc_BT = _score_and_save(X_smnc, y_smnc, 'smnc')
#%% ROS: Numerical + categorical
mm_skf_scoresD3, ros_all, ros_CT, ros_BT = _score_and_save(X_ros, y_ros, 'ros')
#%% RUS: Numerical + categorical
mm_skf_scoresD4, rus_all, rus_CT, rus_BT = _score_and_save(X_rus, y_rus, 'rus')
#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8, rouC_all, rouC_CT, rouC_BT = _score_and_save(X_rouC, y_rouC, 'rouC')

View file

@ -1,203 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022
@author: tanu
"""
import os
# Gene / drug pair analysed by this script; both are passed to setvars() below.
gene = 'embB'
drug = 'ethambutol'
#total_mtblineage_uc = 8
homedir = os.path.expanduser("~")
# Switch to the ml scripts directory under the user's home.
# NOTE(review): presumably needed so the local modules imported below resolve - confirm.
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
# NOTE(review): star-import, setvars(), star-import again. Presumably the second
# import re-binds names that setvars() populates for this gene/drug, so keep this
# exact order - confirm against ml_data_7030.
from ml_data_7030 import *
setvars(gene,drug)
from ml_data_7030 import *
# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML
# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl
################################################################################
# Banner identifying this run variant and the gene/drug being processed.
print('\n#####################################################################\n'
, '\nRunning ML analysis: 70/30 split'
, '\nGene name:', gene
, '\nDrug name:', drug)
#==================
# Specify outdir
#==================
# outdir is not defined in this script; presumably it comes from the
# star-import above - confirm.
outdir_ml = outdir + 'ml/tts_7030/'
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
# Summarise the training and blind-test data, list every feature group, then
# confirm that the groups together account for all columns of X.
_rule = '\n================================================================\n'

print('\nSanity checks:',
      #'\nML source data size:', x_features.shape,
      '\nTotal input features:', len(X.columns),
      '\n',
      '\nTraining data size:', X.shape,
      '\nTest data size:', X_bts.shape,
      '\n',
      '\nTarget feature numbers (training data):', Counter(y),
      '\nTarget features ratio (training data:', yc1_ratio,
      '\n',
      '\nTarget feature numbers (test data):', Counter(y_bts),
      '\nTarget features ratio (test data):', yc2_ratio,
      '\n\n#####################################################################\n')

print(_rule)
print('Strucutral features (n):', len(X_ssFN),
      '\nThese are:',
      '\nCommon stablity features:', X_stabilityN,
      '\nFoldX columns:', X_foldX_cols,
      '\nOther struc columns:', X_str,
      _rule)

# These two groups share one report layout: heading, count, then the names.
for _heading, _names in (('AAindex features (n):', X_aaindexFN),
                         ('Evolutionary features (n):', X_evolFN)):
    print(_heading, len(_names), '\nThese are:\n', _names, _rule)

# The genomic count comes from X_genomicFN, but the names are listed from two sub-lists.
print('Genomic features (n):', len(X_genomicFN),
      '\nThese are:\n', X_genomic_mafor, '\n', X_genomic_linegae,
      _rule)
print('Categorical features (n):', len(categorical_FN),
      '\nThese are:\n', categorical_FN,
      _rule)

# Abort the script when the group sizes do not add up to the column count of X.
_n_grouped = sum(map(len, (X_ssFN, X_aaindexFN, X_evolFN, X_genomicFN, categorical_FN)))
if len(X.columns) != _n_grouped:
    sys.exit('\nFail: Count of feature mismatch')
print('\nPass: No. of features match')
print('\n#####################################################################\n')
################################################################################
#==================
# Baseline models
#==================
def _score_and_save(input_df, target, tag):
    """Score one training set with MultModelsCl and write its result tables.

    Parameters
    ----------
    input_df, target
        Training features and labels handed to MultModelsCl.
    tag : str
        Label of the training-set variant; used in the output file names
        (<gene>_<tag>_CT_allF.csv and <gene>_<tag>_BT_allF.csv in outdir_ml).

    Returns
    -------
    tuple
        (scores returned by MultModelsCl, transposed score frame,
         'test_*' columns sorted by test_mcc, 'bts_*' columns sorted by bts_mcc).

    Reads skf_cv, X_bts, y_bts, outdir_ml and gene from module scope.
    """
    # MultModelsCl names this parameter blind_test_df (renamed from
    # blind_test_input_df); the old keyword raises TypeError.
    scores = MultModelsCl(input_df = input_df
                          , target = target
                          , var_type = 'mixed'
                          , skf_cv = skf_cv
                          , blind_test_df = X_bts
                          , blind_test_target = y_bts)
    all_df = pd.DataFrame(scores).T
    # Sort into new frames instead of sorting the filtered frames in place.
    ct_df = all_df.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
    bt_df = all_df.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
    # Write csv
    ct_df.to_csv(outdir_ml + gene.lower() + '_' + tag + '_CT_allF.csv')
    bt_df.to_csv(outdir_ml + gene.lower() + '_' + tag + '_BT_allF.csv')
    return scores, all_df, ct_df, bt_df

# Baseline models: training data as is
mm_skf_scoresD, baseline_all, baseline_CT, baseline_BT = _score_and_save(X, y, 'baseline')
#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7, smnc_all, smnc_CT, smnc_BT = _score_and_save(X_smnc, y_smnc, 'smnc')
#%% ROS: Numerical + categorical
mm_skf_scoresD3, ros_all, ros_CT, ros_BT = _score_and_save(X_ros, y_ros, 'ros')
#%% RUS: Numerical + categorical
mm_skf_scoresD4 = MultModelsCl(input_df = X_rus
, target = y_rus
, var_type = 'mixed'
, skf_cv = skf_cv
, blind_test_input_df = X_bts
, blind_test_target = y_bts)
rus_all = pd.DataFrame(mm_skf_scoresD4)
rus_all = rus_all.T
rus_CT = rus_all.filter(like='test_', axis=1)
rus_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
rus_BT = rus_all.filter(like='bts_' , axis=1)
rus_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
# Write csv
rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv')
rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')
#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC
, target = y_rouC
, var_type = 'mixed'
, skf_cv = skf_cv
, blind_test_input_df = X_bts
, blind_test_target = y_bts)
rouC_all = pd.DataFrame(mm_skf_scoresD8)
rouC_all = rouC_all.T
rouC_CT = rouC_all.filter(like='test_', axis=1)
rouC_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
rouC_BT = rouC_all.filter(like='bts_', axis=1)
rouC_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
# Write csv
rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv')
rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')

View file

@ -1,203 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022
@author: tanu
"""
import os
# Gene/drug pair analysed by this script; both are handed to setvars() below.
gene = 'embB'
drug = 'ethambutol'
#total_mtblineage_uc = 8
homedir = os.path.expanduser("~")
# Work from the ML scripts directory -- presumably so the local modules
# imported below resolve; TODO confirm.
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
# The star import is deliberately repeated around setvars(): the first import
# brings setvars() into scope, the second presumably re-binds the names that
# setvars() populates (X, y, X_bts, ... used further down) -- verify against
# ml_data_8020.
from ml_data_8020 import *
setvars(gene,drug)
from ml_data_8020 import *
# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML
# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl
################################################################################
# Banner identifying this run in the console output.
print('\n#####################################################################\n'
, '\nRunning ML analysis: 80/20 split'
, '\nGene name:', gene
, '\nDrug name:', drug)
#==================
# Specify outdir
#==================
# `outdir` is not assigned in this script; it is expected to come from the
# star import above. The result csv files are written under outdir_ml.
outdir_ml = outdir + 'ml/tts_8020/'
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
# Print the size and class balance of the training and blind-test data, list
# every feature group, then stop the run if the group sizes do not add up to
# the number of columns in X.
print(
    '\nSanity checks:',
    # '\nML source data size:', x_features.shape,
    '\nTotal input features:', len(X.columns),
    '\n',
    '\nTraining data size:', X.shape,
    '\nTest data size:', X_bts.shape,
    '\n',
    '\nTarget feature numbers (training data):', Counter(y),
    '\nTarget features ratio (training data:', yc1_ratio,
    '\n',
    '\nTarget feature numbers (test data):', Counter(y_bts),
    '\nTarget features ratio (test data):', yc2_ratio,
    '\n\n#####################################################################\n',
)
print('\n================================================================\n')
print(
    'Strucutral features (n):',
    len(X_ssFN),
    '\nThese are:',
    '\nCommon stablity features:', X_stabilityN,
    '\nFoldX columns:', X_foldX_cols,
    '\nOther struc columns:', X_str,
    '\n================================================================\n',
)
print(
    'AAindex features (n):',
    len(X_aaindexFN),
    '\nThese are:\n',
    X_aaindexFN,
    '\n================================================================\n',
)
print(
    'Evolutionary features (n):',
    len(X_evolFN),
    '\nThese are:\n',
    X_evolFN,
    '\n================================================================\n',
)
print(
    'Genomic features (n):',
    len(X_genomicFN),
    '\nThese are:\n',
    X_genomic_mafor, '\n',
    X_genomic_linegae,
    '\n================================================================\n',
)
print(
    'Categorical features (n):',
    len(categorical_FN),
    '\nThese are:\n',
    categorical_FN,
    '\n================================================================\n',
)
# Guard clause: every column of X must belong to exactly one of the groups.
if len(X.columns) != sum(map(len, (X_ssFN, X_aaindexFN, X_evolFN, X_genomicFN, categorical_FN))):
    sys.exit('\nFail: Count of feature mismatch')
print('\nPass: No. of features match')
print('\n#####################################################################\n')
################################################################################
#==================
# Baseline models
#==================
# Each cell below calls MultModelsCl on one version of the training data
# (unsampled, SMOTE-NC, ROS, RUS, ROS+RUS) and always scores against the same
# blind test set (X_bts, y_bts). Columns whose name contains 'test_' go to the
# *_CT_* csv and those containing 'bts_' to the *_BT_* csv, each sorted in
# descending order of its 'test_mcc' / 'bts_mcc' column.
# The blind-test keyword is `blind_test_df`: MultModelsCl renamed it from
# `blind_test_input_df`, so the old keyword no longer matches the signature.
# NOTE(review): `var_type` is not among the MultModelsCl parameters visible in
# this commit's diff -- confirm the function still accepts it.
# sort_values() is chained (not inplace) because the frame being sorted is the
# fresh object returned by filter(); an in-place sort on it can trigger
# pandas' SettingWithCopyWarning.
mm_skf_scoresD = MultModelsCl(
    input_df=X,
    target=y,
    var_type='mixed',
    skf_cv=skf_cv,
    blind_test_df=X_bts,
    blind_test_target=y_bts,
)
baseline_all = pd.DataFrame(mm_skf_scoresD)
baseline_all = baseline_all.T  # presumably one row per model after the transpose -- verify
#baseline_train = baseline_all.filter(like='train_', axis=1)
baseline_CT = baseline_all.filter(like='test_', axis=1).sort_values(by=['test_mcc'], ascending=False)
baseline_BT = baseline_all.filter(like='bts_', axis=1).sort_values(by=['bts_mcc'], ascending=False)
# Write csv
baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv')
baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')
#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7 = MultModelsCl(
    input_df=X_smnc,
    target=y_smnc,
    var_type='mixed',
    skf_cv=skf_cv,
    blind_test_df=X_bts,
    blind_test_target=y_bts,
)
smnc_all = pd.DataFrame(mm_skf_scoresD7)
smnc_all = smnc_all.T
smnc_CT = smnc_all.filter(like='test_', axis=1).sort_values(by=['test_mcc'], ascending=False)
smnc_BT = smnc_all.filter(like='bts_', axis=1).sort_values(by=['bts_mcc'], ascending=False)
# Write csv
smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv')
smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')
#%% ROS: Numerical + categorical
mm_skf_scoresD3 = MultModelsCl(
    input_df=X_ros,
    target=y_ros,
    var_type='mixed',
    skf_cv=skf_cv,
    blind_test_df=X_bts,
    blind_test_target=y_bts,
)
ros_all = pd.DataFrame(mm_skf_scoresD3)
ros_all = ros_all.T
ros_CT = ros_all.filter(like='test_', axis=1).sort_values(by=['test_mcc'], ascending=False)
ros_BT = ros_all.filter(like='bts_', axis=1).sort_values(by=['bts_mcc'], ascending=False)
# Write csv
ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv')
ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')
#%% RUS: Numerical + categorical
mm_skf_scoresD4 = MultModelsCl(
    input_df=X_rus,
    target=y_rus,
    var_type='mixed',
    skf_cv=skf_cv,
    blind_test_df=X_bts,
    blind_test_target=y_bts,
)
rus_all = pd.DataFrame(mm_skf_scoresD4)
rus_all = rus_all.T
rus_CT = rus_all.filter(like='test_', axis=1).sort_values(by=['test_mcc'], ascending=False)
rus_BT = rus_all.filter(like='bts_', axis=1).sort_values(by=['bts_mcc'], ascending=False)
# Write csv
rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv')
rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')
#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8 = MultModelsCl(
    input_df=X_rouC,
    target=y_rouC,
    var_type='mixed',
    skf_cv=skf_cv,
    blind_test_df=X_bts,
    blind_test_target=y_bts,
)
rouC_all = pd.DataFrame(mm_skf_scoresD8)
rouC_all = rouC_all.T
rouC_CT = rouC_all.filter(like='test_', axis=1).sort_values(by=['test_mcc'], ascending=False)
rouC_BT = rouC_all.filter(like='bts_', axis=1).sort_values(by=['bts_mcc'], ascending=False)
# Write csv
rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv')
rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')

View file

@ -1,203 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022
@author: tanu
"""
import os
# Gene/drug pair analysed by this script; both are handed to setvars() below.
gene = 'embB'
drug = 'ethambutol'
#total_mtblineage_uc = 8
homedir = os.path.expanduser("~")
# Work from the ML scripts directory -- presumably so the local modules
# imported below resolve; TODO confirm.
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
# The star import is deliberately repeated around setvars(): the first import
# brings setvars() into scope, the second presumably re-binds the names that
# setvars() populates (X, y, X_bts, ... used further down) -- verify against
# ml_data_cd_7030.
from ml_data_cd_7030 import *
setvars(gene,drug)
from ml_data_cd_7030 import *
# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML
# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl
################################################################################
# Banner identifying this run in the console output.
print('\n#####################################################################\n'
, '\nRunning ML analysis [COMPLETE DATA]: 70/30 split'
, '\nGene name:', gene
, '\nDrug name:', drug)
#==================
# Specify outdir
#==================
# `outdir` is not assigned in this script; it is expected to come from the
# star import above. The result csv files are written under outdir_ml.
outdir_ml = outdir + 'ml/tts_cd_7030/'
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
# Print the size and class balance of the training and blind-test data, list
# every feature group, then stop the run if the group sizes do not add up to
# the number of columns in X.
print(
    '\nSanity checks:',
    # '\nML source data size:', x_features.shape,
    '\nTotal input features:', len(X.columns),
    '\n',
    '\nTraining data size:', X.shape,
    '\nTest data size:', X_bts.shape,
    '\n',
    '\nTarget feature numbers (training data):', Counter(y),
    '\nTarget features ratio (training data:', yc1_ratio,
    '\n',
    '\nTarget feature numbers (test data):', Counter(y_bts),
    '\nTarget features ratio (test data):', yc2_ratio,
    '\n\n#####################################################################\n',
)
print('\n================================================================\n')
print(
    'Strucutral features (n):',
    len(X_ssFN),
    '\nThese are:',
    '\nCommon stablity features:', X_stabilityN,
    '\nFoldX columns:', X_foldX_cols,
    '\nOther struc columns:', X_str,
    '\n================================================================\n',
)
print(
    'AAindex features (n):',
    len(X_aaindexFN),
    '\nThese are:\n',
    X_aaindexFN,
    '\n================================================================\n',
)
print(
    'Evolutionary features (n):',
    len(X_evolFN),
    '\nThese are:\n',
    X_evolFN,
    '\n================================================================\n',
)
print(
    'Genomic features (n):',
    len(X_genomicFN),
    '\nThese are:\n',
    X_genomic_mafor, '\n',
    X_genomic_linegae,
    '\n================================================================\n',
)
print(
    'Categorical features (n):',
    len(categorical_FN),
    '\nThese are:\n',
    categorical_FN,
    '\n================================================================\n',
)
# Guard clause: every column of X must belong to exactly one of the groups.
if len(X.columns) != sum(map(len, (X_ssFN, X_aaindexFN, X_evolFN, X_genomicFN, categorical_FN))):
    sys.exit('\nFail: Count of feature mismatch')
print('\nPass: No. of features match')
print('\n#####################################################################\n')
################################################################################
#==================
# Baseline models
#==================
# Each cell below calls MultModelsCl on one version of the training data
# (unsampled, SMOTE-NC, ROS, RUS, ROS+RUS) and always scores against the same
# blind test set (X_bts, y_bts). Columns whose name contains 'test_' go to the
# *_CT_* csv and those containing 'bts_' to the *_BT_* csv, each sorted in
# descending order of its 'test_mcc' / 'bts_mcc' column.
# The blind-test keyword is `blind_test_df`: MultModelsCl renamed it from
# `blind_test_input_df`, so the old keyword no longer matches the signature.
# NOTE(review): `var_type` is not among the MultModelsCl parameters visible in
# this commit's diff -- confirm the function still accepts it.
# sort_values() is chained (not inplace) because the frame being sorted is the
# fresh object returned by filter(); an in-place sort on it can trigger
# pandas' SettingWithCopyWarning.
mm_skf_scoresD = MultModelsCl(
    input_df=X,
    target=y,
    var_type='mixed',
    skf_cv=skf_cv,
    blind_test_df=X_bts,
    blind_test_target=y_bts,
)
baseline_all = pd.DataFrame(mm_skf_scoresD)
baseline_all = baseline_all.T  # presumably one row per model after the transpose -- verify
#baseline_train = baseline_all.filter(like='train_', axis=1)
baseline_CT = baseline_all.filter(like='test_', axis=1).sort_values(by=['test_mcc'], ascending=False)
baseline_BT = baseline_all.filter(like='bts_', axis=1).sort_values(by=['bts_mcc'], ascending=False)
# Write csv
baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv')
baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')
#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7 = MultModelsCl(
    input_df=X_smnc,
    target=y_smnc,
    var_type='mixed',
    skf_cv=skf_cv,
    blind_test_df=X_bts,
    blind_test_target=y_bts,
)
smnc_all = pd.DataFrame(mm_skf_scoresD7)
smnc_all = smnc_all.T
smnc_CT = smnc_all.filter(like='test_', axis=1).sort_values(by=['test_mcc'], ascending=False)
smnc_BT = smnc_all.filter(like='bts_', axis=1).sort_values(by=['bts_mcc'], ascending=False)
# Write csv
smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv')
smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')
#%% ROS: Numerical + categorical
mm_skf_scoresD3 = MultModelsCl(
    input_df=X_ros,
    target=y_ros,
    var_type='mixed',
    skf_cv=skf_cv,
    blind_test_df=X_bts,
    blind_test_target=y_bts,
)
ros_all = pd.DataFrame(mm_skf_scoresD3)
ros_all = ros_all.T
ros_CT = ros_all.filter(like='test_', axis=1).sort_values(by=['test_mcc'], ascending=False)
ros_BT = ros_all.filter(like='bts_', axis=1).sort_values(by=['bts_mcc'], ascending=False)
# Write csv
ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv')
ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')
#%% RUS: Numerical + categorical
mm_skf_scoresD4 = MultModelsCl(
    input_df=X_rus,
    target=y_rus,
    var_type='mixed',
    skf_cv=skf_cv,
    blind_test_df=X_bts,
    blind_test_target=y_bts,
)
rus_all = pd.DataFrame(mm_skf_scoresD4)
rus_all = rus_all.T
rus_CT = rus_all.filter(like='test_', axis=1).sort_values(by=['test_mcc'], ascending=False)
rus_BT = rus_all.filter(like='bts_', axis=1).sort_values(by=['bts_mcc'], ascending=False)
# Write csv
rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv')
rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')
#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8 = MultModelsCl(
    input_df=X_rouC,
    target=y_rouC,
    var_type='mixed',
    skf_cv=skf_cv,
    blind_test_df=X_bts,
    blind_test_target=y_bts,
)
rouC_all = pd.DataFrame(mm_skf_scoresD8)
rouC_all = rouC_all.T
rouC_CT = rouC_all.filter(like='test_', axis=1).sort_values(by=['test_mcc'], ascending=False)
rouC_BT = rouC_all.filter(like='bts_', axis=1).sort_values(by=['bts_mcc'], ascending=False)
# Write csv
rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv')
rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')

View file

@ -1,203 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022
@author: tanu
"""
import os
# Gene/drug pair analysed by this script; both are handed to setvars() below.
gene = 'embB'
drug = 'ethambutol'
#total_mtblineage_uc = 8
homedir = os.path.expanduser("~")
# Work from the ML scripts directory -- presumably so the local modules
# imported below resolve; TODO confirm.
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
# The star import is deliberately repeated around setvars(): the first import
# brings setvars() into scope, the second presumably re-binds the names that
# setvars() populates (X, y, X_bts, ... used further down) -- verify against
# ml_data_cd_8020.
from ml_data_cd_8020 import *
setvars(gene,drug)
from ml_data_cd_8020 import *
# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML
# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl
################################################################################
# Banner identifying this run in the console output.
print('\n#####################################################################\n'
, '\nRunning ML analysis [COMPLETE DATA]: 80/20 split'
, '\nGene name:', gene
, '\nDrug name:', drug)
#==================
# Specify outdir
#==================
# `outdir` is not assigned in this script; it is expected to come from the
# star import above. The result csv files are written under outdir_ml.
outdir_ml = outdir + 'ml/tts_cd_8020/'
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
# Print the size and class balance of the training and blind-test data, list
# every feature group, then stop the run if the group sizes do not add up to
# the number of columns in X.
print(
    '\nSanity checks:',
    # '\nML source data size:', x_features.shape,
    '\nTotal input features:', len(X.columns),
    '\n',
    '\nTraining data size:', X.shape,
    '\nTest data size:', X_bts.shape,
    '\n',
    '\nTarget feature numbers (training data):', Counter(y),
    '\nTarget features ratio (training data:', yc1_ratio,
    '\n',
    '\nTarget feature numbers (test data):', Counter(y_bts),
    '\nTarget features ratio (test data):', yc2_ratio,
    '\n\n#####################################################################\n',
)
print('\n================================================================\n')
print(
    'Strucutral features (n):',
    len(X_ssFN),
    '\nThese are:',
    '\nCommon stablity features:', X_stabilityN,
    '\nFoldX columns:', X_foldX_cols,
    '\nOther struc columns:', X_str,
    '\n================================================================\n',
)
print(
    'AAindex features (n):',
    len(X_aaindexFN),
    '\nThese are:\n',
    X_aaindexFN,
    '\n================================================================\n',
)
print(
    'Evolutionary features (n):',
    len(X_evolFN),
    '\nThese are:\n',
    X_evolFN,
    '\n================================================================\n',
)
print(
    'Genomic features (n):',
    len(X_genomicFN),
    '\nThese are:\n',
    X_genomic_mafor, '\n',
    X_genomic_linegae,
    '\n================================================================\n',
)
print(
    'Categorical features (n):',
    len(categorical_FN),
    '\nThese are:\n',
    categorical_FN,
    '\n================================================================\n',
)
# Guard clause: every column of X must belong to exactly one of the groups.
if len(X.columns) != sum(map(len, (X_ssFN, X_aaindexFN, X_evolFN, X_genomicFN, categorical_FN))):
    sys.exit('\nFail: Count of feature mismatch')
print('\nPass: No. of features match')
print('\n#####################################################################\n')
################################################################################
#==================
# Baseline models
#==================
# Each cell below calls MultModelsCl on one version of the training data
# (unsampled, SMOTE-NC, ROS, RUS, ROS+RUS) and always scores against the same
# blind test set (X_bts, y_bts). Columns whose name contains 'test_' go to the
# *_CT_* csv and those containing 'bts_' to the *_BT_* csv, each sorted in
# descending order of its 'test_mcc' / 'bts_mcc' column.
# The blind-test keyword is `blind_test_df`: MultModelsCl renamed it from
# `blind_test_input_df`, so the old keyword no longer matches the signature.
# NOTE(review): `var_type` is not among the MultModelsCl parameters visible in
# this commit's diff -- confirm the function still accepts it.
# sort_values() is chained (not inplace) because the frame being sorted is the
# fresh object returned by filter(); an in-place sort on it can trigger
# pandas' SettingWithCopyWarning.
mm_skf_scoresD = MultModelsCl(
    input_df=X,
    target=y,
    var_type='mixed',
    skf_cv=skf_cv,
    blind_test_df=X_bts,
    blind_test_target=y_bts,
)
baseline_all = pd.DataFrame(mm_skf_scoresD)
baseline_all = baseline_all.T  # presumably one row per model after the transpose -- verify
#baseline_train = baseline_all.filter(like='train_', axis=1)
baseline_CT = baseline_all.filter(like='test_', axis=1).sort_values(by=['test_mcc'], ascending=False)
baseline_BT = baseline_all.filter(like='bts_', axis=1).sort_values(by=['bts_mcc'], ascending=False)
# Write csv
baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv')
baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')
#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7 = MultModelsCl(
    input_df=X_smnc,
    target=y_smnc,
    var_type='mixed',
    skf_cv=skf_cv,
    blind_test_df=X_bts,
    blind_test_target=y_bts,
)
smnc_all = pd.DataFrame(mm_skf_scoresD7)
smnc_all = smnc_all.T
smnc_CT = smnc_all.filter(like='test_', axis=1).sort_values(by=['test_mcc'], ascending=False)
smnc_BT = smnc_all.filter(like='bts_', axis=1).sort_values(by=['bts_mcc'], ascending=False)
# Write csv
smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv')
smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')
#%% ROS: Numerical + categorical
mm_skf_scoresD3 = MultModelsCl(
    input_df=X_ros,
    target=y_ros,
    var_type='mixed',
    skf_cv=skf_cv,
    blind_test_df=X_bts,
    blind_test_target=y_bts,
)
ros_all = pd.DataFrame(mm_skf_scoresD3)
ros_all = ros_all.T
ros_CT = ros_all.filter(like='test_', axis=1).sort_values(by=['test_mcc'], ascending=False)
ros_BT = ros_all.filter(like='bts_', axis=1).sort_values(by=['bts_mcc'], ascending=False)
# Write csv
ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv')
ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')
#%% RUS: Numerical + categorical
mm_skf_scoresD4 = MultModelsCl(
    input_df=X_rus,
    target=y_rus,
    var_type='mixed',
    skf_cv=skf_cv,
    blind_test_df=X_bts,
    blind_test_target=y_bts,
)
rus_all = pd.DataFrame(mm_skf_scoresD4)
rus_all = rus_all.T
rus_CT = rus_all.filter(like='test_', axis=1).sort_values(by=['test_mcc'], ascending=False)
rus_BT = rus_all.filter(like='bts_', axis=1).sort_values(by=['bts_mcc'], ascending=False)
# Write csv
rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv')
rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')
#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8 = MultModelsCl(
    input_df=X_rouC,
    target=y_rouC,
    var_type='mixed',
    skf_cv=skf_cv,
    blind_test_df=X_bts,
    blind_test_target=y_bts,
)
rouC_all = pd.DataFrame(mm_skf_scoresD8)
rouC_all = rouC_all.T
rouC_CT = rouC_all.filter(like='test_', axis=1).sort_values(by=['test_mcc'], ascending=False)
rouC_BT = rouC_all.filter(like='bts_', axis=1).sort_values(by=['bts_mcc'], ascending=False)
# Write csv
rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv')
rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')

View file

@ -1,203 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022
@author: tanu
"""
import os
# Gene/drug pair analysed by this script; both are handed to setvars() below.
gene = 'embB'
drug = 'ethambutol'
#total_mtblineage_uc = 8
homedir = os.path.expanduser("~")
# Work from the ML scripts directory -- presumably so the local modules
# imported below resolve; TODO confirm.
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
# The star import is deliberately repeated around setvars(): the first import
# brings setvars() into scope, the second presumably re-binds the names that
# setvars() populates (X, y, X_bts, ... used further down) -- verify against
# ml_data_cd_sl.
from ml_data_cd_sl import *
setvars(gene,drug)
from ml_data_cd_sl import *
# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML
# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl
################################################################################
# Banner identifying this run in the console output.
# NOTE(review): the banner says "70/30 split" while the data module and the
# output directory are both named `cd_sl` -- looks like it may have been
# copied from the 70/30 script; confirm the wording is intended.
print('\n#####################################################################\n'
, '\nRunning ML analysis [COMPLETE DATA]: 70/30 split'
, '\nGene name:', gene
, '\nDrug name:', drug)
#==================
# Specify outdir
#==================
# `outdir` is not assigned in this script; it is expected to come from the
# star import above. The result csv files are written under outdir_ml.
outdir_ml = outdir + 'ml/tts_cd_sl/'
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
# Print the size and class balance of the training and blind-test data, list
# every feature group, then stop the run if the group sizes do not add up to
# the number of columns in X.
print(
    '\nSanity checks:',
    # '\nML source data size:', x_features.shape,
    '\nTotal input features:', len(X.columns),
    '\n',
    '\nTraining data size:', X.shape,
    '\nTest data size:', X_bts.shape,
    '\n',
    '\nTarget feature numbers (training data):', Counter(y),
    '\nTarget features ratio (training data:', yc1_ratio,
    '\n',
    '\nTarget feature numbers (test data):', Counter(y_bts),
    '\nTarget features ratio (test data):', yc2_ratio,
    '\n\n#####################################################################\n',
)
print('\n================================================================\n')
print(
    'Strucutral features (n):',
    len(X_ssFN),
    '\nThese are:',
    '\nCommon stablity features:', X_stabilityN,
    '\nFoldX columns:', X_foldX_cols,
    '\nOther struc columns:', X_str,
    '\n================================================================\n',
)
print(
    'AAindex features (n):',
    len(X_aaindexFN),
    '\nThese are:\n',
    X_aaindexFN,
    '\n================================================================\n',
)
print(
    'Evolutionary features (n):',
    len(X_evolFN),
    '\nThese are:\n',
    X_evolFN,
    '\n================================================================\n',
)
print(
    'Genomic features (n):',
    len(X_genomicFN),
    '\nThese are:\n',
    X_genomic_mafor, '\n',
    X_genomic_linegae,
    '\n================================================================\n',
)
print(
    'Categorical features (n):',
    len(categorical_FN),
    '\nThese are:\n',
    categorical_FN,
    '\n================================================================\n',
)
# Guard clause: every column of X must belong to exactly one of the groups.
if len(X.columns) != sum(map(len, (X_ssFN, X_aaindexFN, X_evolFN, X_genomicFN, categorical_FN))):
    sys.exit('\nFail: Count of feature mismatch')
print('\nPass: No. of features match')
print('\n#####################################################################\n')
################################################################################
#==================
# Baseline models
#==================
# Each cell below calls MultModelsCl on one version of the training data
# (unsampled, SMOTE-NC, ROS, RUS, ROS+RUS) and always scores against the same
# blind test set (X_bts, y_bts). Columns whose name contains 'test_' go to the
# *_CT_* csv and those containing 'bts_' to the *_BT_* csv, each sorted in
# descending order of its 'test_mcc' / 'bts_mcc' column.
# The blind-test keyword is `blind_test_df`: MultModelsCl renamed it from
# `blind_test_input_df`, so the old keyword no longer matches the signature.
# NOTE(review): `var_type` is not among the MultModelsCl parameters visible in
# this commit's diff -- confirm the function still accepts it.
# sort_values() is chained (not inplace) because the frame being sorted is the
# fresh object returned by filter(); an in-place sort on it can trigger
# pandas' SettingWithCopyWarning.
mm_skf_scoresD = MultModelsCl(
    input_df=X,
    target=y,
    var_type='mixed',
    skf_cv=skf_cv,
    blind_test_df=X_bts,
    blind_test_target=y_bts,
)
baseline_all = pd.DataFrame(mm_skf_scoresD)
baseline_all = baseline_all.T  # presumably one row per model after the transpose -- verify
#baseline_train = baseline_all.filter(like='train_', axis=1)
baseline_CT = baseline_all.filter(like='test_', axis=1).sort_values(by=['test_mcc'], ascending=False)
baseline_BT = baseline_all.filter(like='bts_', axis=1).sort_values(by=['bts_mcc'], ascending=False)
# Write csv
baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv')
baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')
#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7 = MultModelsCl(
    input_df=X_smnc,
    target=y_smnc,
    var_type='mixed',
    skf_cv=skf_cv,
    blind_test_df=X_bts,
    blind_test_target=y_bts,
)
smnc_all = pd.DataFrame(mm_skf_scoresD7)
smnc_all = smnc_all.T
smnc_CT = smnc_all.filter(like='test_', axis=1).sort_values(by=['test_mcc'], ascending=False)
smnc_BT = smnc_all.filter(like='bts_', axis=1).sort_values(by=['bts_mcc'], ascending=False)
# Write csv
smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv')
smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')
#%% ROS: Numerical + categorical
mm_skf_scoresD3 = MultModelsCl(
    input_df=X_ros,
    target=y_ros,
    var_type='mixed',
    skf_cv=skf_cv,
    blind_test_df=X_bts,
    blind_test_target=y_bts,
)
ros_all = pd.DataFrame(mm_skf_scoresD3)
ros_all = ros_all.T
ros_CT = ros_all.filter(like='test_', axis=1).sort_values(by=['test_mcc'], ascending=False)
ros_BT = ros_all.filter(like='bts_', axis=1).sort_values(by=['bts_mcc'], ascending=False)
# Write csv
ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv')
ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')
#%% RUS: Numerical + categorical
mm_skf_scoresD4 = MultModelsCl(
    input_df=X_rus,
    target=y_rus,
    var_type='mixed',
    skf_cv=skf_cv,
    blind_test_df=X_bts,
    blind_test_target=y_bts,
)
rus_all = pd.DataFrame(mm_skf_scoresD4)
rus_all = rus_all.T
rus_CT = rus_all.filter(like='test_', axis=1).sort_values(by=['test_mcc'], ascending=False)
rus_BT = rus_all.filter(like='bts_', axis=1).sort_values(by=['bts_mcc'], ascending=False)
# Write csv
rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv')
rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')
#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8 = MultModelsCl(
    input_df=X_rouC,
    target=y_rouC,
    var_type='mixed',
    skf_cv=skf_cv,
    blind_test_df=X_bts,
    blind_test_target=y_bts,
)
rouC_all = pd.DataFrame(mm_skf_scoresD8)
rouC_all = rouC_all.T
rouC_CT = rouC_all.filter(like='test_', axis=1).sort_values(by=['test_mcc'], ascending=False)
rouC_BT = rouC_all.filter(like='bts_', axis=1).sort_values(by=['bts_mcc'], ascending=False)
# Write csv
rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv')
rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')

View file

@ -1,210 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022
@author: tanu
"""
import os
# Gene/drug pair analysed by this script; both are handed to setvars() below.
gene = 'embB'
drug = 'ethambutol'
#total_mtblineage_uc = 8
homedir = os.path.expanduser("~")
# Work from the ML scripts directory -- presumably so the local modules
# imported below resolve; TODO confirm.
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
#---------------------------
# Version 1: no AAindex
#from UQ_ML_data import *
#setvars(gene,drug)
#from UQ_ML_data import *
#---------------------------
# The star import is deliberately repeated around setvars(): the first import
# brings setvars() into scope, the second presumably re-binds the names that
# setvars() populates (X, y, X_bts, ... used further down) -- verify against
# ml_data.
from ml_data import *
setvars(gene,drug)
from ml_data import *
# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML
# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl
############################################################################
# Banner identifying this run in the console output.
print('\n#####################################################################\n'
, '\nRunning ML analysis: UQ [without AA index but with active site annotations]'
, '\nGene name:', gene
, '\nDrug name:', drug)
#==================
# Specify outdir
#==================
# `outdir` is not assigned in this script; it is expected to come from the
# star import above. The result csv files are written under outdir_ml.
outdir_ml = outdir + 'ml/uq_v1/'
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
# Print the size and class balance of the training and blind-test data, list
# the feature groups used in this version (AAindex is left out), then stop the
# run if the group sizes do not add up to the number of columns in X.
print(
    '\nSanity checks:',
    '\nTotal input features:', len(X.columns),
    '\n',
    '\nTraining data size:', X.shape,
    '\nTest data size:', X_bts.shape,
    '\n',
    '\nTarget feature numbers (training data):', Counter(y),
    '\nTarget features ratio (training data:', yc1_ratio,
    '\n',
    '\nTarget feature numbers (test data):', Counter(y_bts),
    '\nTarget features ratio (test data):', yc2_ratio,
    '\n\n#####################################################################\n',
)
print('\n================================================================\n')
print(
    'Strucutral features (n):',
    len(X_ssFN),
    '\nThese are:',
    '\nCommon stablity features:', X_stabilityN,
    '\nFoldX columns:', X_foldX_cols,
    '\nOther struc columns:', X_str,
    '\n================================================================\n',
)
# AAindex report disabled for this version:
# print(
#     'AAindex features (n):',
#     len(X_aaindexFN),
#     '\nThese are:\n',
#     X_aaindexFN,
#     '\n================================================================\n',
# )
print(
    'Evolutionary features (n):',
    len(X_evolFN),
    '\nThese are:\n',
    X_evolFN,
    '\n================================================================\n',
)
print(
    'Genomic features (n):',
    len(X_genomicFN),
    '\nThese are:\n',
    X_genomic_mafor, '\n',
    X_genomic_linegae,
    '\n================================================================\n',
)
print(
    'Categorical features (n):',
    len(categorical_FN),
    '\nThese are:\n',
    categorical_FN,
    '\n================================================================\n',
)
# Guard clause: every column of X must belong to exactly one of the groups
# (X_aaindexFN is intentionally not part of the sum in this version).
if len(X.columns) != sum(map(len, (X_ssFN, X_evolFN, X_genomicFN, categorical_FN))):
    sys.exit('\nFail: Count of feature mismatch')
print('\nPass: No. of features match')
print('\n#####################################################################\n')
###############################################################################
#==================
# Baseline models
#==================
# Score the MultModelsCl models on the training set (X, y), with
# (X_bts, y_bts) passed as the blind test set.
# NOTE: the blind-test keyword is `blind_test_df`; MultModelsCl no longer
# accepts the old `blind_test_input_df` name.
mm_skf_scoresD = MultModelsCl(input_df = X
                              , target = y
                              , var_type = 'mixed'
                              , skf_cv = skf_cv
                              , blind_test_df = X_bts
                              , blind_test_target = y_bts)
# Transposed score table (presumably one row per model -- confirm against
# the structure MultModelsCl returns).
baseline_all = pd.DataFrame(mm_skf_scoresD)
baseline_all = baseline_all.T
#baseline_train = baseline_all.filter(like='train_', axis=1)
# Columns containing 'test_', ranked by test_mcc (highest first)
baseline_CT = baseline_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
# Columns containing 'bts_', ranked by bts_mcc (highest first)
baseline_BT = baseline_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv')
baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')
#%% SMOTE NC: Oversampling [Numerical + categorical]
# Score the MultModelsCl models on the resampled training set
# (X_smnc, y_smnc), with (X_bts, y_bts) passed as the blind test set.
# NOTE: the blind-test keyword is `blind_test_df`; MultModelsCl no longer
# accepts the old `blind_test_input_df` name.
mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc
                               , target = y_smnc
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
# Transposed score table (presumably one row per model -- confirm)
smnc_all = pd.DataFrame(mm_skf_scoresD7)
smnc_all = smnc_all.T
# Columns containing 'test_', ranked by test_mcc (highest first)
smnc_CT = smnc_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
# Columns containing 'bts_', ranked by bts_mcc (highest first)
smnc_BT = smnc_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv')
smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')
#%% ROS: Numerical + categorical
# Score the MultModelsCl models on the resampled training set
# (X_ros, y_ros), with (X_bts, y_bts) passed as the blind test set.
# NOTE: the blind-test keyword is `blind_test_df`; MultModelsCl no longer
# accepts the old `blind_test_input_df` name.
mm_skf_scoresD3 = MultModelsCl(input_df = X_ros
                               , target = y_ros
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
# Transposed score table (presumably one row per model -- confirm)
ros_all = pd.DataFrame(mm_skf_scoresD3)
ros_all = ros_all.T
# Columns containing 'test_', ranked by test_mcc (highest first)
ros_CT = ros_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
# Columns containing 'bts_', ranked by bts_mcc (highest first)
ros_BT = ros_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv')
ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')
#%% RUS: Numerical + categorical
# Score the MultModelsCl models on the resampled training set
# (X_rus, y_rus), with (X_bts, y_bts) passed as the blind test set.
# NOTE: the blind-test keyword is `blind_test_df`; MultModelsCl no longer
# accepts the old `blind_test_input_df` name.
mm_skf_scoresD4 = MultModelsCl(input_df = X_rus
                               , target = y_rus
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
# Transposed score table (presumably one row per model -- confirm)
rus_all = pd.DataFrame(mm_skf_scoresD4)
rus_all = rus_all.T
# Columns containing 'test_', ranked by test_mcc (highest first)
rus_CT = rus_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
# Columns containing 'bts_', ranked by bts_mcc (highest first)
rus_BT = rus_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv')
rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')
#%% ROS + RUS Combined: Numerical + categorical
# Score the MultModelsCl models on the resampled training set
# (X_rouC, y_rouC), with (X_bts, y_bts) passed as the blind test set.
# NOTE: the blind-test keyword is `blind_test_df`; MultModelsCl no longer
# accepts the old `blind_test_input_df` name.
mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC
                               , target = y_rouC
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
# Transposed score table (presumably one row per model -- confirm)
rouC_all = pd.DataFrame(mm_skf_scoresD8)
rouC_all = rouC_all.T
# Columns containing 'test_', ranked by test_mcc (highest first)
rouC_CT = rouC_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
# Columns containing 'bts_', ranked by bts_mcc (highest first)
rouC_BT = rouC_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv')
rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')

View file

@ -1,202 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022
@author: tanu
"""
import os
# Gene/drug pair analysed by this script; both are passed to setvars() below.
gene = 'embB'
drug = 'ethambutol'
#total_mtblineage_uc = 8
homedir = os.path.expanduser("~")
# Switch to the ML scripts directory before the local imports below
# (presumably so those modules resolve from the working directory -- confirm).
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
# The first star-import brings setvars() into scope. The import is repeated
# after the call, presumably to pick up the names setvars() defines inside
# ml_data_orig -- TODO confirm against that module.
from ml_data_orig import *
setvars(gene,drug)
from ml_data_orig import *
# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML
# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl
############################################################################
# Run banner: which analysis variant, gene and drug this script covers.
print('\n#####################################################################\n'
, '\nRunning ML analysis: ORIGINAL'
, '\nGene name:', gene
, '\nDrug name:', drug)
#==================
# Specify outdir
#==================
# `outdir` is not defined in this script; it is expected to come from the
# star-import above. All CSV results are written under this sub-directory.
outdir_ml = outdir + 'ml/tts_orig/'
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
# ---- Data sizes and class balance of the training / blind-test sets ----
print('\nSanity checks:',
      '\nTotal input features:', len(X.columns),
      '\n',
      '\nTraining data size:', X.shape,
      '\nTest data size:', X_bts.shape,
      '\n',
      '\nTarget feature numbers (training data):', Counter(y),
      '\nTarget features ratio (training data:', yc1_ratio,
      '\n',
      '\nTarget feature numbers (test data):', Counter(y_bts),
      '\nTarget features ratio (test data):', yc2_ratio,
      '\n\n#####################################################################\n')
# ---- Feature-group report ----
print('\n================================================================\n')
print('Strucutral features (n):',
      len(X_ssFN),
      '\nThese are:',
      '\nCommon stablity features:', X_stabilityN,
      '\nFoldX columns:', X_foldX_cols,
      '\nOther struc columns:', X_str,
      '\n================================================================\n')
print('AAindex features (n):',
      len(X_aaindexFN),
      '\nThese are:\n',
      X_aaindexFN,
      '\n================================================================\n')
print('Evolutionary features (n):',
      len(X_evolFN),
      '\nThese are:\n',
      X_evolFN,
      '\n================================================================\n')
print('Genomic features (n):',
      len(X_genomicFN),
      '\nThese are:\n',
      X_genomic_mafor, '\n',
      X_genomic_linegae,
      '\n================================================================\n')
print('Categorical features (n):',
      len(categorical_FN),
      '\nThese are:\n',
      categorical_FN,
      '\n================================================================\n')
# The group sizes must add up to the number of columns in X; abort otherwise.
if len(X.columns) != sum(map(len, (X_ssFN, X_aaindexFN, X_evolFN, X_genomicFN, categorical_FN))):
    sys.exit('\nFail: Count of feature mismatch')
print('\nPass: No. of features match')
print('\n#####################################################################\n')
###############################################################################
#==================
# Baseline models
#==================
# Score the MultModelsCl models on the training set (X, y), with
# (X_bts, y_bts) passed as the blind test set.
# NOTE: the blind-test keyword is `blind_test_df`; MultModelsCl no longer
# accepts the old `blind_test_input_df` name.
mm_skf_scoresD = MultModelsCl(input_df = X
                              , target = y
                              , var_type = 'mixed'
                              , skf_cv = skf_cv
                              , blind_test_df = X_bts
                              , blind_test_target = y_bts)
# Transposed score table (presumably one row per model -- confirm against
# the structure MultModelsCl returns).
baseline_all = pd.DataFrame(mm_skf_scoresD)
baseline_all = baseline_all.T
#baseline_train = baseline_all.filter(like='train_', axis=1)
# Columns containing 'test_', ranked by test_mcc (highest first)
baseline_CT = baseline_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
# Columns containing 'bts_', ranked by bts_mcc (highest first)
baseline_BT = baseline_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv')
baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')
#%% SMOTE NC: Oversampling [Numerical + categorical]
# Score the MultModelsCl models on the resampled training set
# (X_smnc, y_smnc), with (X_bts, y_bts) passed as the blind test set.
# NOTE: the blind-test keyword is `blind_test_df`; MultModelsCl no longer
# accepts the old `blind_test_input_df` name.
mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc
                               , target = y_smnc
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
# Transposed score table (presumably one row per model -- confirm)
smnc_all = pd.DataFrame(mm_skf_scoresD7)
smnc_all = smnc_all.T
# Columns containing 'test_', ranked by test_mcc (highest first)
smnc_CT = smnc_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
# Columns containing 'bts_', ranked by bts_mcc (highest first)
smnc_BT = smnc_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv')
smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')
#%% ROS: Numerical + categorical
# Score the MultModelsCl models on the resampled training set
# (X_ros, y_ros), with (X_bts, y_bts) passed as the blind test set.
# NOTE: the blind-test keyword is `blind_test_df`; MultModelsCl no longer
# accepts the old `blind_test_input_df` name.
mm_skf_scoresD3 = MultModelsCl(input_df = X_ros
                               , target = y_ros
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
# Transposed score table (presumably one row per model -- confirm)
ros_all = pd.DataFrame(mm_skf_scoresD3)
ros_all = ros_all.T
# Columns containing 'test_', ranked by test_mcc (highest first)
ros_CT = ros_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
# Columns containing 'bts_', ranked by bts_mcc (highest first)
ros_BT = ros_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv')
ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')
#%% RUS: Numerical + categorical
# Score the MultModelsCl models on the resampled training set
# (X_rus, y_rus), with (X_bts, y_bts) passed as the blind test set.
# NOTE: the blind-test keyword is `blind_test_df`; MultModelsCl no longer
# accepts the old `blind_test_input_df` name.
mm_skf_scoresD4 = MultModelsCl(input_df = X_rus
                               , target = y_rus
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
# Transposed score table (presumably one row per model -- confirm)
rus_all = pd.DataFrame(mm_skf_scoresD4)
rus_all = rus_all.T
# Columns containing 'test_', ranked by test_mcc (highest first)
rus_CT = rus_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
# Columns containing 'bts_', ranked by bts_mcc (highest first)
rus_BT = rus_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv')
rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')
#%% ROS + RUS Combined: Numerical + categorical
# Score the MultModelsCl models on the resampled training set
# (X_rouC, y_rouC), with (X_bts, y_bts) passed as the blind test set.
# NOTE: the blind-test keyword is `blind_test_df`; MultModelsCl no longer
# accepts the old `blind_test_input_df` name.
mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC
                               , target = y_rouC
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
# Transposed score table (presumably one row per model -- confirm)
rouC_all = pd.DataFrame(mm_skf_scoresD8)
rouC_all = rouC_all.T
# Columns containing 'test_', ranked by test_mcc (highest first)
rouC_CT = rouC_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
# Columns containing 'bts_', ranked by bts_mcc (highest first)
rouC_BT = rouC_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv')
rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')

View file

@ -1,202 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022
@author: tanu
"""
import os
# Gene/drug pair analysed by this script; both are passed to setvars() below.
gene = 'embB'
drug = 'ethambutol'
#total_mtblineage_uc = 8
homedir = os.path.expanduser("~")
# Switch to the ML scripts directory before the local imports below
# (presumably so those modules resolve from the working directory -- confirm).
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
# The first star-import brings setvars() into scope. The import is repeated
# after the call, presumably to pick up the names setvars() defines inside
# ml_data_rt -- TODO confirm against that module.
from ml_data_rt import *
setvars(gene,drug)
from ml_data_rt import *
# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML
# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl
############################################################################
# Run banner: which analysis variant, gene and drug this script covers.
print('\n#####################################################################\n'
, '\nRunning ML analysis: REVERSE training\n'
, '\nGene name:', gene
, '\nDrug name:', drug)
#==================
# Specify outdir
#==================
# `outdir` is not defined in this script; it is expected to come from the
# star-import above. All CSV results are written under this sub-directory.
outdir_ml = outdir + 'ml/tts_rt/'
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
# ---- Data sizes and class balance of the training / blind-test sets ----
print('\nSanity checks:',
      '\nTotal input features:', len(X.columns),
      '\n',
      '\nTraining data size:', X.shape,
      '\nTest data size:', X_bts.shape,
      '\n',
      '\nTarget feature numbers (training data):', Counter(y),
      '\nTarget features ratio (training data:', yc1_ratio,
      '\n',
      '\nTarget feature numbers (test data):', Counter(y_bts),
      '\nTarget features ratio (test data):', yc2_ratio,
      '\n\n#####################################################################\n')
# ---- Feature-group report ----
print('\n================================================================\n')
print('Strucutral features (n):',
      len(X_ssFN),
      '\nThese are:',
      '\nCommon stablity features:', X_stabilityN,
      '\nFoldX columns:', X_foldX_cols,
      '\nOther struc columns:', X_str,
      '\n================================================================\n')
print('AAindex features (n):',
      len(X_aaindexFN),
      '\nThese are:\n',
      X_aaindexFN,
      '\n================================================================\n')
print('Evolutionary features (n):',
      len(X_evolFN),
      '\nThese are:\n',
      X_evolFN,
      '\n================================================================\n')
print('Genomic features (n):',
      len(X_genomicFN),
      '\nThese are:\n',
      X_genomic_mafor, '\n',
      X_genomic_linegae,
      '\n================================================================\n')
print('Categorical features (n):',
      len(categorical_FN),
      '\nThese are:\n',
      categorical_FN,
      '\n================================================================\n')
# The group sizes must add up to the number of columns in X; abort otherwise.
if len(X.columns) != sum(map(len, (X_ssFN, X_aaindexFN, X_evolFN, X_genomicFN, categorical_FN))):
    sys.exit('\nFail: Count of feature mismatch')
print('\nPass: No. of features match')
print('\n#####################################################################\n')
###############################################################################
#==================
# Baseline models
#==================
# Score the MultModelsCl models on the training set (X, y), with
# (X_bts, y_bts) passed as the blind test set.
# NOTE: the blind-test keyword is `blind_test_df`; MultModelsCl no longer
# accepts the old `blind_test_input_df` name.
mm_skf_scoresD = MultModelsCl(input_df = X
                              , target = y
                              , var_type = 'mixed'
                              , skf_cv = skf_cv
                              , blind_test_df = X_bts
                              , blind_test_target = y_bts)
# Transposed score table (presumably one row per model -- confirm against
# the structure MultModelsCl returns).
baseline_all = pd.DataFrame(mm_skf_scoresD)
baseline_all = baseline_all.T
#baseline_train = baseline_all.filter(like='train_', axis=1)
# Columns containing 'test_', ranked by test_mcc (highest first)
baseline_CT = baseline_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
# Columns containing 'bts_', ranked by bts_mcc (highest first)
baseline_BT = baseline_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv')
baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')
#%% SMOTE NC: Oversampling [Numerical + categorical]
# Score the MultModelsCl models on the resampled training set
# (X_smnc, y_smnc), with (X_bts, y_bts) passed as the blind test set.
# NOTE: the blind-test keyword is `blind_test_df`; MultModelsCl no longer
# accepts the old `blind_test_input_df` name.
mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc
                               , target = y_smnc
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
# Transposed score table (presumably one row per model -- confirm)
smnc_all = pd.DataFrame(mm_skf_scoresD7)
smnc_all = smnc_all.T
# Columns containing 'test_', ranked by test_mcc (highest first)
smnc_CT = smnc_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
# Columns containing 'bts_', ranked by bts_mcc (highest first)
smnc_BT = smnc_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv')
smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')
#%% ROS: Numerical + categorical
# Score the MultModelsCl models on the resampled training set
# (X_ros, y_ros), with (X_bts, y_bts) passed as the blind test set.
# NOTE: the blind-test keyword is `blind_test_df`; MultModelsCl no longer
# accepts the old `blind_test_input_df` name.
mm_skf_scoresD3 = MultModelsCl(input_df = X_ros
                               , target = y_ros
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
# Transposed score table (presumably one row per model -- confirm)
ros_all = pd.DataFrame(mm_skf_scoresD3)
ros_all = ros_all.T
# Columns containing 'test_', ranked by test_mcc (highest first)
ros_CT = ros_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
# Columns containing 'bts_', ranked by bts_mcc (highest first)
ros_BT = ros_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv')
ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')
#%% RUS: Numerical + categorical
# Score the MultModelsCl models on the resampled training set
# (X_rus, y_rus), with (X_bts, y_bts) passed as the blind test set.
# NOTE: the blind-test keyword is `blind_test_df`; MultModelsCl no longer
# accepts the old `blind_test_input_df` name.
mm_skf_scoresD4 = MultModelsCl(input_df = X_rus
                               , target = y_rus
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
# Transposed score table (presumably one row per model -- confirm)
rus_all = pd.DataFrame(mm_skf_scoresD4)
rus_all = rus_all.T
# Columns containing 'test_', ranked by test_mcc (highest first)
rus_CT = rus_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
# Columns containing 'bts_', ranked by bts_mcc (highest first)
rus_BT = rus_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv')
rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')
#%% ROS + RUS Combined: Numerical + categorical
# Score the MultModelsCl models on the resampled training set
# (X_rouC, y_rouC), with (X_bts, y_bts) passed as the blind test set.
# NOTE: the blind-test keyword is `blind_test_df`; MultModelsCl no longer
# accepts the old `blind_test_input_df` name.
mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC
                               , target = y_rouC
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
# Transposed score table (presumably one row per model -- confirm)
rouC_all = pd.DataFrame(mm_skf_scoresD8)
rouC_all = rouC_all.T
# Columns containing 'test_', ranked by test_mcc (highest first)
rouC_CT = rouC_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
# Columns containing 'bts_', ranked by bts_mcc (highest first)
rouC_BT = rouC_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv')
rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')

View file

@ -1,203 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022
@author: tanu
"""
import os
# Gene/drug pair analysed by this script; both are passed to setvars() below.
gene = 'embB'
drug = 'ethambutol'
#total_mtblineage_uc = 8
homedir = os.path.expanduser("~")
# Switch to the ML scripts directory before the local imports below
# (presumably so those modules resolve from the working directory -- confirm).
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
# The first star-import brings setvars() into scope. The import is repeated
# after the call, presumably to pick up the names setvars() defines inside
# ml_data_sl -- TODO confirm against that module.
from ml_data_sl import *
setvars(gene,drug)
from ml_data_sl import *
# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML
# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl
################################################################################
# Run banner: which analysis variant, gene and drug this script covers.
print('\n#####################################################################\n'
, '\nRunning ML analysis: scaling law split'
, '\nGene name:', gene
, '\nDrug name:', drug)
#==================
# Specify outdir
#==================
# `outdir` is not defined in this script; it is expected to come from the
# star-import above. All CSV results are written under this sub-directory.
outdir_ml = outdir + 'ml/tts_sl/'
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
# ---- Data sizes and class balance of the training / blind-test sets ----
# (disabled item: '\nML source data size:', x_features.shape)
print('\nSanity checks:',
      '\nTotal input features:', len(X.columns),
      '\n',
      '\nTraining data size:', X.shape,
      '\nTest data size:', X_bts.shape,
      '\n',
      '\nTarget feature numbers (training data):', Counter(y),
      '\nTarget features ratio (training data:', yc1_ratio,
      '\n',
      '\nTarget feature numbers (test data):', Counter(y_bts),
      '\nTarget features ratio (test data):', yc2_ratio,
      '\n\n#####################################################################\n')
# ---- Feature-group report ----
print('\n================================================================\n')
print('Strucutral features (n):',
      len(X_ssFN),
      '\nThese are:',
      '\nCommon stablity features:', X_stabilityN,
      '\nFoldX columns:', X_foldX_cols,
      '\nOther struc columns:', X_str,
      '\n================================================================\n')
print('AAindex features (n):',
      len(X_aaindexFN),
      '\nThese are:\n',
      X_aaindexFN,
      '\n================================================================\n')
print('Evolutionary features (n):',
      len(X_evolFN),
      '\nThese are:\n',
      X_evolFN,
      '\n================================================================\n')
print('Genomic features (n):',
      len(X_genomicFN),
      '\nThese are:\n',
      X_genomic_mafor, '\n',
      X_genomic_linegae,
      '\n================================================================\n')
print('Categorical features (n):',
      len(categorical_FN),
      '\nThese are:\n',
      categorical_FN,
      '\n================================================================\n')
# The group sizes must add up to the number of columns in X; abort otherwise.
if len(X.columns) != sum(map(len, (X_ssFN, X_aaindexFN, X_evolFN, X_genomicFN, categorical_FN))):
    sys.exit('\nFail: Count of feature mismatch')
print('\nPass: No. of features match')
print('\n#####################################################################\n')
################################################################################
#==================
# Baseline models
#==================
# Score the MultModelsCl models on the training set (X, y), with
# (X_bts, y_bts) passed as the blind test set.
# NOTE: the blind-test keyword is `blind_test_df`; MultModelsCl no longer
# accepts the old `blind_test_input_df` name.
mm_skf_scoresD = MultModelsCl(input_df = X
                              , target = y
                              , var_type = 'mixed'
                              , skf_cv = skf_cv
                              , blind_test_df = X_bts
                              , blind_test_target = y_bts)
# Transposed score table (presumably one row per model -- confirm against
# the structure MultModelsCl returns).
baseline_all = pd.DataFrame(mm_skf_scoresD)
baseline_all = baseline_all.T
#baseline_train = baseline_all.filter(like='train_', axis=1)
# Columns containing 'test_', ranked by test_mcc (highest first)
baseline_CT = baseline_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
# Columns containing 'bts_', ranked by bts_mcc (highest first)
baseline_BT = baseline_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv')
baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')
#%% SMOTE NC: Oversampling [Numerical + categorical]
# Score the MultModelsCl models on the resampled training set
# (X_smnc, y_smnc), with (X_bts, y_bts) passed as the blind test set.
# NOTE: the blind-test keyword is `blind_test_df`; MultModelsCl no longer
# accepts the old `blind_test_input_df` name.
mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc
                               , target = y_smnc
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
# Transposed score table (presumably one row per model -- confirm)
smnc_all = pd.DataFrame(mm_skf_scoresD7)
smnc_all = smnc_all.T
# Columns containing 'test_', ranked by test_mcc (highest first)
smnc_CT = smnc_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
# Columns containing 'bts_', ranked by bts_mcc (highest first)
smnc_BT = smnc_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv')
smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')
#%% ROS: Numerical + categorical
# Score the MultModelsCl models on the resampled training set
# (X_ros, y_ros), with (X_bts, y_bts) passed as the blind test set.
# NOTE: the blind-test keyword is `blind_test_df`; MultModelsCl no longer
# accepts the old `blind_test_input_df` name.
mm_skf_scoresD3 = MultModelsCl(input_df = X_ros
                               , target = y_ros
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
# Transposed score table (presumably one row per model -- confirm)
ros_all = pd.DataFrame(mm_skf_scoresD3)
ros_all = ros_all.T
# Columns containing 'test_', ranked by test_mcc (highest first)
ros_CT = ros_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
# Columns containing 'bts_', ranked by bts_mcc (highest first)
ros_BT = ros_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv')
ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')
#%% RUS: Numerical + categorical
# Score the MultModelsCl models on the resampled training set
# (X_rus, y_rus), with (X_bts, y_bts) passed as the blind test set.
# NOTE: the blind-test keyword is `blind_test_df`; MultModelsCl no longer
# accepts the old `blind_test_input_df` name.
mm_skf_scoresD4 = MultModelsCl(input_df = X_rus
                               , target = y_rus
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
# Transposed score table (presumably one row per model -- confirm)
rus_all = pd.DataFrame(mm_skf_scoresD4)
rus_all = rus_all.T
# Columns containing 'test_', ranked by test_mcc (highest first)
rus_CT = rus_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
# Columns containing 'bts_', ranked by bts_mcc (highest first)
rus_BT = rus_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv')
rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')
#%% ROS + RUS Combined: Numerical + categorical
# Score the MultModelsCl models on the resampled training set
# (X_rouC, y_rouC), with (X_bts, y_bts) passed as the blind test set.
# NOTE: the blind-test keyword is `blind_test_df`; MultModelsCl no longer
# accepts the old `blind_test_input_df` name.
mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC
                               , target = y_rouC
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
# Transposed score table (presumably one row per model -- confirm)
rouC_all = pd.DataFrame(mm_skf_scoresD8)
rouC_all = rouC_all.T
# Columns containing 'test_', ranked by test_mcc (highest first)
rouC_CT = rouC_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
# Columns containing 'bts_', ranked by bts_mcc (highest first)
rouC_BT = rouC_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv')
rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')

View file

@ -1,203 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022
@author: tanu
"""
import os
# Gene/drug pair analysed by this script; both are passed to setvars() below.
gene = 'gid'
drug = 'streptomycin'
#total_mtblineage_uc = 8
homedir = os.path.expanduser("~")
# Switch to the ML scripts directory before the local imports below
# (presumably so those modules resolve from the working directory -- confirm).
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
# The first star-import brings setvars() into scope. The import is repeated
# after the call, presumably to pick up the names setvars() defines inside
# ml_data_7030 -- TODO confirm against that module.
from ml_data_7030 import *
setvars(gene,drug)
from ml_data_7030 import *
# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML
# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl
################################################################################
# Run banner: which analysis variant, gene and drug this script covers.
print('\n#####################################################################\n'
, '\nRunning ML analysis: 70/30 split'
, '\nGene name:', gene
, '\nDrug name:', drug)
#==================
# Specify outdir
#==================
# `outdir` is not defined in this script; it is expected to come from the
# star-import above. All CSV results are written under this sub-directory.
outdir_ml = outdir + 'ml/tts_7030/'
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
# ---- Data sizes and class balance of the training / blind-test sets ----
# (disabled item: '\nML source data size:', x_features.shape)
print('\nSanity checks:',
      '\nTotal input features:', len(X.columns),
      '\n',
      '\nTraining data size:', X.shape,
      '\nTest data size:', X_bts.shape,
      '\n',
      '\nTarget feature numbers (training data):', Counter(y),
      '\nTarget features ratio (training data:', yc1_ratio,
      '\n',
      '\nTarget feature numbers (test data):', Counter(y_bts),
      '\nTarget features ratio (test data):', yc2_ratio,
      '\n\n#####################################################################\n')
# ---- Feature-group report ----
print('\n================================================================\n')
print('Strucutral features (n):',
      len(X_ssFN),
      '\nThese are:',
      '\nCommon stablity features:', X_stabilityN,
      '\nFoldX columns:', X_foldX_cols,
      '\nOther struc columns:', X_str,
      '\n================================================================\n')
print('AAindex features (n):',
      len(X_aaindexFN),
      '\nThese are:\n',
      X_aaindexFN,
      '\n================================================================\n')
print('Evolutionary features (n):',
      len(X_evolFN),
      '\nThese are:\n',
      X_evolFN,
      '\n================================================================\n')
print('Genomic features (n):',
      len(X_genomicFN),
      '\nThese are:\n',
      X_genomic_mafor, '\n',
      X_genomic_linegae,
      '\n================================================================\n')
print('Categorical features (n):',
      len(categorical_FN),
      '\nThese are:\n',
      categorical_FN,
      '\n================================================================\n')
# The group sizes must add up to the number of columns in X; abort otherwise.
if len(X.columns) != sum(map(len, (X_ssFN, X_aaindexFN, X_evolFN, X_genomicFN, categorical_FN))):
    sys.exit('\nFail: Count of feature mismatch')
print('\nPass: No. of features match')
print('\n#####################################################################\n')
################################################################################
#==================
# Baseline models
#==================
# Score the MultModelsCl models on the training set (X, y), with
# (X_bts, y_bts) passed as the blind test set.
# NOTE: the blind-test keyword is `blind_test_df`; MultModelsCl no longer
# accepts the old `blind_test_input_df` name.
mm_skf_scoresD = MultModelsCl(input_df = X
                              , target = y
                              , var_type = 'mixed'
                              , skf_cv = skf_cv
                              , blind_test_df = X_bts
                              , blind_test_target = y_bts)
# Transposed score table (presumably one row per model -- confirm against
# the structure MultModelsCl returns).
baseline_all = pd.DataFrame(mm_skf_scoresD)
baseline_all = baseline_all.T
#baseline_train = baseline_all.filter(like='train_', axis=1)
# Columns containing 'test_', ranked by test_mcc (highest first)
baseline_CT = baseline_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
# Columns containing 'bts_', ranked by bts_mcc (highest first)
baseline_BT = baseline_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv')
baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')
#%% SMOTE NC: Oversampling [Numerical + categorical]
# Score the MultModelsCl models on the resampled training set
# (X_smnc, y_smnc), with (X_bts, y_bts) passed as the blind test set.
# NOTE: the blind-test keyword is `blind_test_df`; MultModelsCl no longer
# accepts the old `blind_test_input_df` name.
mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc
                               , target = y_smnc
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
# Transposed score table (presumably one row per model -- confirm)
smnc_all = pd.DataFrame(mm_skf_scoresD7)
smnc_all = smnc_all.T
# Columns containing 'test_', ranked by test_mcc (highest first)
smnc_CT = smnc_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
# Columns containing 'bts_', ranked by bts_mcc (highest first)
smnc_BT = smnc_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv')
smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')
#%% ROS: Numerical + categorical
# Score the MultModelsCl models on the resampled training set
# (X_ros, y_ros), with (X_bts, y_bts) passed as the blind test set.
# NOTE: the blind-test keyword is `blind_test_df`; MultModelsCl no longer
# accepts the old `blind_test_input_df` name.
mm_skf_scoresD3 = MultModelsCl(input_df = X_ros
                               , target = y_ros
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
# Transposed score table (presumably one row per model -- confirm)
ros_all = pd.DataFrame(mm_skf_scoresD3)
ros_all = ros_all.T
# Columns containing 'test_', ranked by test_mcc (highest first)
ros_CT = ros_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
# Columns containing 'bts_', ranked by bts_mcc (highest first)
ros_BT = ros_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv')
ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')
#%% RUS: Numerical + categorical
# Score the MultModelsCl models on the resampled training set
# (X_rus, y_rus), with (X_bts, y_bts) passed as the blind test set.
# NOTE: the blind-test keyword is `blind_test_df`; MultModelsCl no longer
# accepts the old `blind_test_input_df` name.
mm_skf_scoresD4 = MultModelsCl(input_df = X_rus
                               , target = y_rus
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
# Transposed score table (presumably one row per model -- confirm)
rus_all = pd.DataFrame(mm_skf_scoresD4)
rus_all = rus_all.T
# Columns containing 'test_', ranked by test_mcc (highest first)
rus_CT = rus_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
# Columns containing 'bts_', ranked by bts_mcc (highest first)
rus_BT = rus_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv')
rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')
#%% ROS + RUS Combined: Numerical + categorical
# Score the MultModelsCl models on the resampled training set
# (X_rouC, y_rouC), with (X_bts, y_bts) passed as the blind test set.
# NOTE: the blind-test keyword is `blind_test_df`; MultModelsCl no longer
# accepts the old `blind_test_input_df` name.
mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC
                               , target = y_rouC
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
# Transposed score table (presumably one row per model -- confirm)
rouC_all = pd.DataFrame(mm_skf_scoresD8)
rouC_all = rouC_all.T
# Columns containing 'test_', ranked by test_mcc (highest first)
rouC_CT = rouC_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
# Columns containing 'bts_', ranked by bts_mcc (highest first)
rouC_BT = rouC_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv')
rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')

View file

@ -1,203 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022
@author: tanu
"""
import os
# Gene/drug pair analysed by this script; both are passed to setvars() below.
gene = 'gid'
drug = 'streptomycin'
#total_mtblineage_uc = 8
homedir = os.path.expanduser("~")
# Switch to the ML scripts directory before the local imports below
# (presumably so those modules resolve from the working directory -- confirm).
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
# The first star-import brings setvars() into scope. The import is repeated
# after the call, presumably to pick up the names setvars() defines inside
# ml_data_8020 -- TODO confirm against that module.
from ml_data_8020 import *
setvars(gene,drug)
from ml_data_8020 import *
# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML
# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl
################################################################################
# Run banner: which analysis variant, gene and drug this script covers.
print('\n#####################################################################\n'
, '\nRunning ML analysis: 80/20 split'
, '\nGene name:', gene
, '\nDrug name:', drug)
#==================
# Specify outdir
#==================
# `outdir` is not defined in this script; it is expected to come from the
# star-import above. All CSV results are written under this sub-directory.
outdir_ml = outdir + 'ml/tts_8020/'
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
# ---- Data sizes and class balance of the training / blind-test sets ----
# (disabled item: '\nML source data size:', x_features.shape)
print('\nSanity checks:',
      '\nTotal input features:', len(X.columns),
      '\n',
      '\nTraining data size:', X.shape,
      '\nTest data size:', X_bts.shape,
      '\n',
      '\nTarget feature numbers (training data):', Counter(y),
      '\nTarget features ratio (training data:', yc1_ratio,
      '\n',
      '\nTarget feature numbers (test data):', Counter(y_bts),
      '\nTarget features ratio (test data):', yc2_ratio,
      '\n\n#####################################################################\n')
# ---- Feature-group report ----
print('\n================================================================\n')
print('Strucutral features (n):',
      len(X_ssFN),
      '\nThese are:',
      '\nCommon stablity features:', X_stabilityN,
      '\nFoldX columns:', X_foldX_cols,
      '\nOther struc columns:', X_str,
      '\n================================================================\n')
print('AAindex features (n):',
      len(X_aaindexFN),
      '\nThese are:\n',
      X_aaindexFN,
      '\n================================================================\n')
print('Evolutionary features (n):',
      len(X_evolFN),
      '\nThese are:\n',
      X_evolFN,
      '\n================================================================\n')
print('Genomic features (n):',
      len(X_genomicFN),
      '\nThese are:\n',
      X_genomic_mafor, '\n',
      X_genomic_linegae,
      '\n================================================================\n')
print('Categorical features (n):',
      len(categorical_FN),
      '\nThese are:\n',
      categorical_FN,
      '\n================================================================\n')
# The group sizes must add up to the number of columns in X; abort otherwise.
if len(X.columns) != sum(map(len, (X_ssFN, X_aaindexFN, X_evolFN, X_genomicFN, categorical_FN))):
    sys.exit('\nFail: Count of feature mismatch')
print('\nPass: No. of features match')
print('\n#####################################################################\n')
################################################################################
#==================
# Baseline models
#==================
# Score the MultModelsCl models on the training set (X, y), with
# (X_bts, y_bts) passed as the blind test set.
# NOTE: the blind-test keyword is `blind_test_df`; MultModelsCl no longer
# accepts the old `blind_test_input_df` name.
mm_skf_scoresD = MultModelsCl(input_df = X
                              , target = y
                              , var_type = 'mixed'
                              , skf_cv = skf_cv
                              , blind_test_df = X_bts
                              , blind_test_target = y_bts)
# Transposed score table (presumably one row per model -- confirm against
# the structure MultModelsCl returns).
baseline_all = pd.DataFrame(mm_skf_scoresD)
baseline_all = baseline_all.T
#baseline_train = baseline_all.filter(like='train_', axis=1)
# Columns containing 'test_', ranked by test_mcc (highest first)
baseline_CT = baseline_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
# Columns containing 'bts_', ranked by bts_mcc (highest first)
baseline_BT = baseline_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv')
baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')
#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc
, target = y_smnc
, var_type = 'mixed'
, skf_cv = skf_cv
, blind_test_input_df = X_bts
, blind_test_target = y_bts)
smnc_all = pd.DataFrame(mm_skf_scoresD7)
smnc_all = smnc_all.T
smnc_CT = smnc_all.filter(like='test_', axis=1)
smnc_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
smnc_BT = smnc_all.filter(like='bts_', axis=1)
smnc_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
# Write csv
smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv')
smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')
#%% ROS: Numerical + categorical
mm_skf_scoresD3 = MultModelsCl(input_df = X_ros
, target = y_ros
, var_type = 'mixed'
, skf_cv = skf_cv
, blind_test_input_df = X_bts
, blind_test_target = y_bts)
ros_all = pd.DataFrame(mm_skf_scoresD3)
ros_all = ros_all.T
ros_CT = ros_all.filter(like='test_', axis=1)
ros_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
ros_BT = ros_all.filter(like='bts_', axis=1)
ros_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
# Write csv
ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv')
ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')
#%% RUS: Numerical + categorical
mm_skf_scoresD4 = MultModelsCl(input_df = X_rus
, target = y_rus
, var_type = 'mixed'
, skf_cv = skf_cv
, blind_test_input_df = X_bts
, blind_test_target = y_bts)
rus_all = pd.DataFrame(mm_skf_scoresD4)
rus_all = rus_all.T
rus_CT = rus_all.filter(like='test_', axis=1)
rus_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
rus_BT = rus_all.filter(like='bts_' , axis=1)
rus_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
# Write csv
rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv')
rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')
#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC
, target = y_rouC
, var_type = 'mixed'
, skf_cv = skf_cv
, blind_test_input_df = X_bts
, blind_test_target = y_bts)
rouC_all = pd.DataFrame(mm_skf_scoresD8)
rouC_all = rouC_all.T
rouC_CT = rouC_all.filter(like='test_', axis=1)
rouC_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
rouC_BT = rouC_all.filter(like='bts_', axis=1)
rouC_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
# Write csv
rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv')
rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')

View file

@ -1,203 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022
@author: tanu
"""
import os
# Gene/drug pair for this run; both are passed to setvars() below.
gene = 'gid'
drug = 'streptomycin'
#total_mtblineage_uc = 8
# Switch the working directory to the ML scripts folder under the user's home.
# NOTE(review): presumably needed so the local modules imported below can be
# found when the script is started from elsewhere — confirm.
homedir = os.path.expanduser("~")
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
# The same star-import appears before and after setvars().
# NOTE(review): presumably the first one exposes setvars() and the second
# re-binds whatever names setvars() (re)defines for this gene/drug — confirm
# in ml_data_cd_7030 before collapsing the two.
from ml_data_cd_7030 import *
setvars(gene,drug)
from ml_data_cd_7030 import *
# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML
# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl
################################################################################
# Run banner: names the data split, gene and drug in the log output.
print('\n#####################################################################\n'
, '\nRunning ML analysis [COMPLETE DATA]: 70/30 split'
, '\nGene name:', gene
, '\nDrug name:', drug)
#==================
# Specify outdir
#==================
# Result CSVs are written under this sub-directory of `outdir`. `outdir` is not
# defined in this script; presumably it is supplied by the star-imports above.
outdir_ml = outdir + 'ml/tts_cd_7030/'
print('\nOutput directory:', outdir_ml)
#%% Sanity checks: data shapes, class balance and feature-group inventory
_HASH_RULE = '\n\n#####################################################################\n'
_EQ_RULE = '\n================================================================\n'


def _print_feature_group(title, names):
    """Print a group's title, its size and its members, followed by a rule line."""
    print(title, len(names), '\nThese are:\n', names, _EQ_RULE)


print(
    '\nSanity checks:',
    '\nTotal input features:', len(X.columns),
    '\n',
    '\nTraining data size:', X.shape,
    '\nTest data size:', X_bts.shape,
    '\n',
    '\nTarget feature numbers (training data):', Counter(y),
    '\nTarget features ratio (training data:', yc1_ratio,
    '\n',
    '\nTarget feature numbers (test data):', Counter(y_bts),
    '\nTarget features ratio (test data):', yc2_ratio,
    _HASH_RULE,
)
print(_EQ_RULE)
print(
    'Strucutral features (n):', len(X_ssFN),
    '\nThese are:',
    '\nCommon stablity features:', X_stabilityN,
    '\nFoldX columns:', X_foldX_cols,
    '\nOther struc columns:', X_str,
    _EQ_RULE,
)
_print_feature_group('AAindex features (n):', X_aaindexFN)
_print_feature_group('Evolutionary features (n):', X_evolFN)
print(
    'Genomic features (n):', len(X_genomicFN),
    '\nThese are:\n',
    X_genomic_mafor, '\n',
    X_genomic_linegae,
    _EQ_RULE,
)
_print_feature_group('Categorical features (n):', categorical_FN)

# The group sizes must add up to the number of input columns; abort otherwise.
_n_grouped_features = (
    len(X_ssFN)
    + len(X_aaindexFN)
    + len(X_evolFN)
    + len(X_genomicFN)
    + len(categorical_FN)
)
if len(X.columns) != _n_grouped_features:
    sys.exit('\nFail: Count of feature mismatch')
print('\nPass: No. of features match')
print('\n#####################################################################\n')
################################################################################
#==================
# Model runs: baseline + resampled training sets
#==================
def _score_and_write(input_df, target, tag):
    """Score one training set with MultModelsCl and write its two result tables.

    Parameters
    ----------
    input_df, target
        Training features and labels, forwarded unchanged to MultModelsCl.
    tag
        Label inserted into the output file names:
        <gene>_<tag>_CT_allF.csv and <gene>_<tag>_BT_allF.csv under outdir_ml.

    Returns
    -------
    tuple
        (raw MultModelsCl result, its transposed DataFrame,
         columns containing 'test_' sorted by test_mcc descending,
         columns containing 'bts_' sorted by bts_mcc descending).
    """
    # The blind-test frame is passed as `blind_test_df`: that is the parameter
    # name in the current MultModelsCl signature.
    scores = MultModelsCl(input_df = input_df
                          , target = target
                          , var_type = 'mixed'
                          , skf_cv = skf_cv
                          , blind_test_df = X_bts
                          , blind_test_target = y_bts)
    all_df = pd.DataFrame(scores).T
    # Sort by assignment rather than in place: filter() hands back a derived
    # frame, and an in-place sort on it can trigger chained-assignment warnings.
    cv_df = all_df.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
    bts_df = all_df.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
    # Write csv
    cv_df.to_csv(outdir_ml + gene.lower() + '_' + tag + '_CT_allF.csv')
    bts_df.to_csv(outdir_ml + gene.lower() + '_' + tag + '_BT_allF.csv')
    return scores, all_df, cv_df, bts_df


# Baseline models
mm_skf_scoresD, baseline_all, baseline_CT, baseline_BT = _score_and_write(X, y, 'baseline')

#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7, smnc_all, smnc_CT, smnc_BT = _score_and_write(X_smnc, y_smnc, 'smnc')

#%% ROS: Numerical + categorical
mm_skf_scoresD3, ros_all, ros_CT, ros_BT = _score_and_write(X_ros, y_ros, 'ros')

#%% RUS: Numerical + categorical
mm_skf_scoresD4, rus_all, rus_CT, rus_BT = _score_and_write(X_rus, y_rus, 'rus')

#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8, rouC_all, rouC_CT, rouC_BT = _score_and_write(X_rouC, y_rouC, 'rouC')

View file

@ -1,203 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022
@author: tanu
"""
import os
# Gene/drug pair for this run; both are passed to setvars() below.
gene = 'gid'
drug = 'streptomycin'
#total_mtblineage_uc = 8
# Switch the working directory to the ML scripts folder under the user's home.
# NOTE(review): presumably needed so the local modules imported below can be
# found when the script is started from elsewhere — confirm.
homedir = os.path.expanduser("~")
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
# The same star-import appears before and after setvars().
# NOTE(review): presumably the first one exposes setvars() and the second
# re-binds whatever names setvars() (re)defines for this gene/drug — confirm
# in ml_data_cd_8020 before collapsing the two.
from ml_data_cd_8020 import *
setvars(gene,drug)
from ml_data_cd_8020 import *
# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML
# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl
################################################################################
# Run banner: names the data split, gene and drug in the log output.
print('\n#####################################################################\n'
, '\nRunning ML analysis [COMPLETE DATA]: 80/20 split'
, '\nGene name:', gene
, '\nDrug name:', drug)
#==================
# Specify outdir
#==================
# Result CSVs are written under this sub-directory of `outdir`. `outdir` is not
# defined in this script; presumably it is supplied by the star-imports above.
outdir_ml = outdir + 'ml/tts_cd_8020/'
print('\nOutput directory:', outdir_ml)
#%% Sanity checks: data shapes, class balance and feature-group inventory
_HASH_RULE = '\n\n#####################################################################\n'
_EQ_RULE = '\n================================================================\n'


def _print_feature_group(title, names):
    """Print a group's title, its size and its members, followed by a rule line."""
    print(title, len(names), '\nThese are:\n', names, _EQ_RULE)


print(
    '\nSanity checks:',
    '\nTotal input features:', len(X.columns),
    '\n',
    '\nTraining data size:', X.shape,
    '\nTest data size:', X_bts.shape,
    '\n',
    '\nTarget feature numbers (training data):', Counter(y),
    '\nTarget features ratio (training data:', yc1_ratio,
    '\n',
    '\nTarget feature numbers (test data):', Counter(y_bts),
    '\nTarget features ratio (test data):', yc2_ratio,
    _HASH_RULE,
)
print(_EQ_RULE)
print(
    'Strucutral features (n):', len(X_ssFN),
    '\nThese are:',
    '\nCommon stablity features:', X_stabilityN,
    '\nFoldX columns:', X_foldX_cols,
    '\nOther struc columns:', X_str,
    _EQ_RULE,
)
_print_feature_group('AAindex features (n):', X_aaindexFN)
_print_feature_group('Evolutionary features (n):', X_evolFN)
print(
    'Genomic features (n):', len(X_genomicFN),
    '\nThese are:\n',
    X_genomic_mafor, '\n',
    X_genomic_linegae,
    _EQ_RULE,
)
_print_feature_group('Categorical features (n):', categorical_FN)

# The group sizes must add up to the number of input columns; abort otherwise.
_n_grouped_features = (
    len(X_ssFN)
    + len(X_aaindexFN)
    + len(X_evolFN)
    + len(X_genomicFN)
    + len(categorical_FN)
)
if len(X.columns) != _n_grouped_features:
    sys.exit('\nFail: Count of feature mismatch')
print('\nPass: No. of features match')
print('\n#####################################################################\n')
################################################################################
#==================
# Model runs: baseline + resampled training sets
#==================
def _score_and_write(input_df, target, tag):
    """Score one training set with MultModelsCl and write its two result tables.

    Parameters
    ----------
    input_df, target
        Training features and labels, forwarded unchanged to MultModelsCl.
    tag
        Label inserted into the output file names:
        <gene>_<tag>_CT_allF.csv and <gene>_<tag>_BT_allF.csv under outdir_ml.

    Returns
    -------
    tuple
        (raw MultModelsCl result, its transposed DataFrame,
         columns containing 'test_' sorted by test_mcc descending,
         columns containing 'bts_' sorted by bts_mcc descending).
    """
    # The blind-test frame is passed as `blind_test_df`: that is the parameter
    # name in the current MultModelsCl signature.
    scores = MultModelsCl(input_df = input_df
                          , target = target
                          , var_type = 'mixed'
                          , skf_cv = skf_cv
                          , blind_test_df = X_bts
                          , blind_test_target = y_bts)
    all_df = pd.DataFrame(scores).T
    # Sort by assignment rather than in place: filter() hands back a derived
    # frame, and an in-place sort on it can trigger chained-assignment warnings.
    cv_df = all_df.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
    bts_df = all_df.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
    # Write csv
    cv_df.to_csv(outdir_ml + gene.lower() + '_' + tag + '_CT_allF.csv')
    bts_df.to_csv(outdir_ml + gene.lower() + '_' + tag + '_BT_allF.csv')
    return scores, all_df, cv_df, bts_df


# Baseline models
mm_skf_scoresD, baseline_all, baseline_CT, baseline_BT = _score_and_write(X, y, 'baseline')

#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7, smnc_all, smnc_CT, smnc_BT = _score_and_write(X_smnc, y_smnc, 'smnc')

#%% ROS: Numerical + categorical
mm_skf_scoresD3, ros_all, ros_CT, ros_BT = _score_and_write(X_ros, y_ros, 'ros')

#%% RUS: Numerical + categorical
mm_skf_scoresD4, rus_all, rus_CT, rus_BT = _score_and_write(X_rus, y_rus, 'rus')

#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8, rouC_all, rouC_CT, rouC_BT = _score_and_write(X_rouC, y_rouC, 'rouC')

View file

@ -1,203 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022
@author: tanu
"""
import os
# Gene/drug pair for this run; both are passed to setvars() below.
gene = 'gid'
drug = 'streptomycin'
#total_mtblineage_uc = 8
# Switch the working directory to the ML scripts folder under the user's home.
# NOTE(review): presumably needed so the local modules imported below can be
# found when the script is started from elsewhere — confirm.
homedir = os.path.expanduser("~")
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
# The same star-import appears before and after setvars().
# NOTE(review): presumably the first one exposes setvars() and the second
# re-binds whatever names setvars() (re)defines for this gene/drug — confirm
# in ml_data_cd_sl before collapsing the two.
from ml_data_cd_sl import *
setvars(gene,drug)
from ml_data_cd_sl import *
# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML
# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl
################################################################################
# Run banner: names the data split, gene and drug in the log output.
# NOTE(review): the banner says "70/30 split", but this script loads
# ml_data_cd_sl and writes to ml/tts_cd_sl/ — looks like a copy-paste leftover;
# confirm which split label is correct.
print('\n#####################################################################\n'
, '\nRunning ML analysis [COMPLETE DATA]: 70/30 split'
, '\nGene name:', gene
, '\nDrug name:', drug)
#==================
# Specify outdir
#==================
# Result CSVs are written under this sub-directory of `outdir`. `outdir` is not
# defined in this script; presumably it is supplied by the star-imports above.
outdir_ml = outdir + 'ml/tts_cd_sl/'
print('\nOutput directory:', outdir_ml)
#%% Sanity checks: data shapes, class balance and feature-group inventory
_HASH_RULE = '\n\n#####################################################################\n'
_EQ_RULE = '\n================================================================\n'


def _print_feature_group(title, names):
    """Print a group's title, its size and its members, followed by a rule line."""
    print(title, len(names), '\nThese are:\n', names, _EQ_RULE)


print(
    '\nSanity checks:',
    '\nTotal input features:', len(X.columns),
    '\n',
    '\nTraining data size:', X.shape,
    '\nTest data size:', X_bts.shape,
    '\n',
    '\nTarget feature numbers (training data):', Counter(y),
    '\nTarget features ratio (training data:', yc1_ratio,
    '\n',
    '\nTarget feature numbers (test data):', Counter(y_bts),
    '\nTarget features ratio (test data):', yc2_ratio,
    _HASH_RULE,
)
print(_EQ_RULE)
print(
    'Strucutral features (n):', len(X_ssFN),
    '\nThese are:',
    '\nCommon stablity features:', X_stabilityN,
    '\nFoldX columns:', X_foldX_cols,
    '\nOther struc columns:', X_str,
    _EQ_RULE,
)
_print_feature_group('AAindex features (n):', X_aaindexFN)
_print_feature_group('Evolutionary features (n):', X_evolFN)
print(
    'Genomic features (n):', len(X_genomicFN),
    '\nThese are:\n',
    X_genomic_mafor, '\n',
    X_genomic_linegae,
    _EQ_RULE,
)
_print_feature_group('Categorical features (n):', categorical_FN)

# The group sizes must add up to the number of input columns; abort otherwise.
_n_grouped_features = (
    len(X_ssFN)
    + len(X_aaindexFN)
    + len(X_evolFN)
    + len(X_genomicFN)
    + len(categorical_FN)
)
if len(X.columns) != _n_grouped_features:
    sys.exit('\nFail: Count of feature mismatch')
print('\nPass: No. of features match')
print('\n#####################################################################\n')
################################################################################
#==================
# Model runs: baseline + resampled training sets
#==================
def _score_and_write(input_df, target, tag):
    """Score one training set with MultModelsCl and write its two result tables.

    Parameters
    ----------
    input_df, target
        Training features and labels, forwarded unchanged to MultModelsCl.
    tag
        Label inserted into the output file names:
        <gene>_<tag>_CT_allF.csv and <gene>_<tag>_BT_allF.csv under outdir_ml.

    Returns
    -------
    tuple
        (raw MultModelsCl result, its transposed DataFrame,
         columns containing 'test_' sorted by test_mcc descending,
         columns containing 'bts_' sorted by bts_mcc descending).
    """
    # The blind-test frame is passed as `blind_test_df`: that is the parameter
    # name in the current MultModelsCl signature.
    scores = MultModelsCl(input_df = input_df
                          , target = target
                          , var_type = 'mixed'
                          , skf_cv = skf_cv
                          , blind_test_df = X_bts
                          , blind_test_target = y_bts)
    all_df = pd.DataFrame(scores).T
    # Sort by assignment rather than in place: filter() hands back a derived
    # frame, and an in-place sort on it can trigger chained-assignment warnings.
    cv_df = all_df.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
    bts_df = all_df.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
    # Write csv
    cv_df.to_csv(outdir_ml + gene.lower() + '_' + tag + '_CT_allF.csv')
    bts_df.to_csv(outdir_ml + gene.lower() + '_' + tag + '_BT_allF.csv')
    return scores, all_df, cv_df, bts_df


# Baseline models
mm_skf_scoresD, baseline_all, baseline_CT, baseline_BT = _score_and_write(X, y, 'baseline')

#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7, smnc_all, smnc_CT, smnc_BT = _score_and_write(X_smnc, y_smnc, 'smnc')

#%% ROS: Numerical + categorical
mm_skf_scoresD3, ros_all, ros_CT, ros_BT = _score_and_write(X_ros, y_ros, 'ros')

#%% RUS: Numerical + categorical
mm_skf_scoresD4, rus_all, rus_CT, rus_BT = _score_and_write(X_rus, y_rus, 'rus')

#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8, rouC_all, rouC_CT, rouC_BT = _score_and_write(X_rouC, y_rouC, 'rouC')

View file

@ -1,210 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022
@author: tanu
"""
import os
# Gene/drug pair for this run; both are passed to setvars() below.
gene = 'gid'
drug = 'streptomycin'
#total_mtblineage_uc = 8
# Switch the working directory to the ML scripts folder under the user's home.
# NOTE(review): presumably needed so the local modules imported below can be
# found when the script is started from elsewhere — confirm.
homedir = os.path.expanduser("~")
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
#---------------------------
# Version 1: no AAindex
#from UQ_ML_data import *
#setvars(gene,drug)
#from UQ_ML_data import *
#---------------------------
# The same star-import appears before and after setvars().
# NOTE(review): presumably the first one exposes setvars() and the second
# re-binds whatever names setvars() (re)defines for this gene/drug — confirm
# in ml_data before collapsing the two.
# NOTE(review): the active data module is ml_data, while the "no AAindex"
# UQ_ML_data import above is commented out; the banner and the checks further
# down still describe a run without AAindex — confirm the intended data source.
from ml_data import *
setvars(gene,drug)
from ml_data import *
# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML
# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl
############################################################################
# Run banner: names the analysis variant, gene and drug in the log output.
print('\n#####################################################################\n'
, '\nRunning ML analysis: UQ [without AA index but with active site annotations]'
, '\nGene name:', gene
, '\nDrug name:', drug)
#==================
# Specify outdir
#==================
# Result CSVs are written under this sub-directory of `outdir`. `outdir` is not
# defined in this script; presumably it is supplied by the star-imports above.
outdir_ml = outdir + 'ml/uq_v1/'
print('\nOutput directory:', outdir_ml)
#%% Sanity checks: data shapes, class balance and feature-group inventory
_HASH_RULE = '\n\n#####################################################################\n'
_EQ_RULE = '\n================================================================\n'


def _print_feature_group(title, names):
    """Print a group's title, its size and its members, followed by a rule line."""
    print(title, len(names), '\nThese are:\n', names, _EQ_RULE)


print(
    '\nSanity checks:',
    '\nTotal input features:', len(X.columns),
    '\n',
    '\nTraining data size:', X.shape,
    '\nTest data size:', X_bts.shape,
    '\n',
    '\nTarget feature numbers (training data):', Counter(y),
    '\nTarget features ratio (training data:', yc1_ratio,
    '\n',
    '\nTarget feature numbers (test data):', Counter(y_bts),
    '\nTarget features ratio (test data):', yc2_ratio,
    _HASH_RULE,
)
print(_EQ_RULE)
print(
    'Strucutral features (n):', len(X_ssFN),
    '\nThese are:',
    '\nCommon stablity features:', X_stabilityN,
    '\nFoldX columns:', X_foldX_cols,
    '\nOther struc columns:', X_str,
    _EQ_RULE,
)
# The AAindex group (X_aaindexFN) is neither reported nor counted in this
# variant of the script.
_print_feature_group('Evolutionary features (n):', X_evolFN)
print(
    'Genomic features (n):', len(X_genomicFN),
    '\nThese are:\n',
    X_genomic_mafor, '\n',
    X_genomic_linegae,
    _EQ_RULE,
)
_print_feature_group('Categorical features (n):', categorical_FN)

# The group sizes (AAindex excluded) must add up to the number of input
# columns; abort otherwise.
_n_grouped_features = (
    len(X_ssFN)
    + len(X_evolFN)
    + len(X_genomicFN)
    + len(categorical_FN)
)
if len(X.columns) != _n_grouped_features:
    sys.exit('\nFail: Count of feature mismatch')
print('\nPass: No. of features match')
print('\n#####################################################################\n')
###############################################################################
#==================
# Model runs: baseline + resampled training sets
#==================
def _score_and_write(input_df, target, tag):
    """Score one training set with MultModelsCl and write its two result tables.

    Parameters
    ----------
    input_df, target
        Training features and labels, forwarded unchanged to MultModelsCl.
    tag
        Label inserted into the output file names:
        <gene>_<tag>_CT_allF.csv and <gene>_<tag>_BT_allF.csv under outdir_ml.

    Returns
    -------
    tuple
        (raw MultModelsCl result, its transposed DataFrame,
         columns containing 'test_' sorted by test_mcc descending,
         columns containing 'bts_' sorted by bts_mcc descending).
    """
    # The blind-test frame is passed as `blind_test_df`: that is the parameter
    # name in the current MultModelsCl signature.
    scores = MultModelsCl(input_df = input_df
                          , target = target
                          , var_type = 'mixed'
                          , skf_cv = skf_cv
                          , blind_test_df = X_bts
                          , blind_test_target = y_bts)
    all_df = pd.DataFrame(scores).T
    # Sort by assignment rather than in place: filter() hands back a derived
    # frame, and an in-place sort on it can trigger chained-assignment warnings.
    cv_df = all_df.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
    bts_df = all_df.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
    # Write csv
    cv_df.to_csv(outdir_ml + gene.lower() + '_' + tag + '_CT_allF.csv')
    bts_df.to_csv(outdir_ml + gene.lower() + '_' + tag + '_BT_allF.csv')
    return scores, all_df, cv_df, bts_df


# Baseline models
mm_skf_scoresD, baseline_all, baseline_CT, baseline_BT = _score_and_write(X, y, 'baseline')

#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7, smnc_all, smnc_CT, smnc_BT = _score_and_write(X_smnc, y_smnc, 'smnc')

#%% ROS: Numerical + categorical
mm_skf_scoresD3, ros_all, ros_CT, ros_BT = _score_and_write(X_ros, y_ros, 'ros')

#%% RUS: Numerical + categorical
mm_skf_scoresD4, rus_all, rus_CT, rus_BT = _score_and_write(X_rus, y_rus, 'rus')

#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8, rouC_all, rouC_CT, rouC_BT = _score_and_write(X_rouC, y_rouC, 'rouC')

View file

@ -1,202 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022
@author: tanu
"""
import os
# Gene/drug pair for this run; both are passed to setvars() below.
gene = 'gid'
drug = 'streptomycin'
#total_mtblineage_uc = 8
# Switch the working directory to the ML scripts folder under the user's home.
# NOTE(review): presumably needed so the local modules imported below can be
# found when the script is started from elsewhere — confirm.
homedir = os.path.expanduser("~")
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
# The same star-import appears before and after setvars().
# NOTE(review): presumably the first one exposes setvars() and the second
# re-binds whatever names setvars() (re)defines for this gene/drug — confirm
# in ml_data_orig before collapsing the two.
from ml_data_orig import *
setvars(gene,drug)
from ml_data_orig import *
# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML
# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl
############################################################################
# Run banner: names the analysis variant, gene and drug in the log output.
print('\n#####################################################################\n'
, '\nRunning ML analysis: ORIGINAL'
, '\nGene name:', gene
, '\nDrug name:', drug)
#==================
# Specify outdir
#==================
# Result CSVs are written under this sub-directory of `outdir`. `outdir` is not
# defined in this script; presumably it is supplied by the star-imports above.
outdir_ml = outdir + 'ml/tts_orig/'
print('\nOutput directory:', outdir_ml)
#%% Sanity checks: data shapes, class balance and feature-group inventory
_HASH_RULE = '\n\n#####################################################################\n'
_EQ_RULE = '\n================================================================\n'


def _print_feature_group(title, names):
    """Print a group's title, its size and its members, followed by a rule line."""
    print(title, len(names), '\nThese are:\n', names, _EQ_RULE)


print(
    '\nSanity checks:',
    '\nTotal input features:', len(X.columns),
    '\n',
    '\nTraining data size:', X.shape,
    '\nTest data size:', X_bts.shape,
    '\n',
    '\nTarget feature numbers (training data):', Counter(y),
    '\nTarget features ratio (training data:', yc1_ratio,
    '\n',
    '\nTarget feature numbers (test data):', Counter(y_bts),
    '\nTarget features ratio (test data):', yc2_ratio,
    _HASH_RULE,
)
print(_EQ_RULE)
print(
    'Strucutral features (n):', len(X_ssFN),
    '\nThese are:',
    '\nCommon stablity features:', X_stabilityN,
    '\nFoldX columns:', X_foldX_cols,
    '\nOther struc columns:', X_str,
    _EQ_RULE,
)
_print_feature_group('AAindex features (n):', X_aaindexFN)
_print_feature_group('Evolutionary features (n):', X_evolFN)
print(
    'Genomic features (n):', len(X_genomicFN),
    '\nThese are:\n',
    X_genomic_mafor, '\n',
    X_genomic_linegae,
    _EQ_RULE,
)
_print_feature_group('Categorical features (n):', categorical_FN)

# The group sizes must add up to the number of input columns; abort otherwise.
_n_grouped_features = (
    len(X_ssFN)
    + len(X_aaindexFN)
    + len(X_evolFN)
    + len(X_genomicFN)
    + len(categorical_FN)
)
if len(X.columns) != _n_grouped_features:
    sys.exit('\nFail: Count of feature mismatch')
print('\nPass: No. of features match')
print('\n#####################################################################\n')
###############################################################################
#==================
# Model runs: baseline + resampled training sets
#==================
def _score_and_write(input_df, target, tag):
    """Score one training set with MultModelsCl and write its two result tables.

    Parameters
    ----------
    input_df, target
        Training features and labels, forwarded unchanged to MultModelsCl.
    tag
        Label inserted into the output file names:
        <gene>_<tag>_CT_allF.csv and <gene>_<tag>_BT_allF.csv under outdir_ml.

    Returns
    -------
    tuple
        (raw MultModelsCl result, its transposed DataFrame,
         columns containing 'test_' sorted by test_mcc descending,
         columns containing 'bts_' sorted by bts_mcc descending).
    """
    # The blind-test frame is passed as `blind_test_df`: that is the parameter
    # name in the current MultModelsCl signature.
    scores = MultModelsCl(input_df = input_df
                          , target = target
                          , var_type = 'mixed'
                          , skf_cv = skf_cv
                          , blind_test_df = X_bts
                          , blind_test_target = y_bts)
    all_df = pd.DataFrame(scores).T
    # Sort by assignment rather than in place: filter() hands back a derived
    # frame, and an in-place sort on it can trigger chained-assignment warnings.
    cv_df = all_df.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
    bts_df = all_df.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
    # Write csv
    cv_df.to_csv(outdir_ml + gene.lower() + '_' + tag + '_CT_allF.csv')
    bts_df.to_csv(outdir_ml + gene.lower() + '_' + tag + '_BT_allF.csv')
    return scores, all_df, cv_df, bts_df


# Baseline models
mm_skf_scoresD, baseline_all, baseline_CT, baseline_BT = _score_and_write(X, y, 'baseline')

#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7, smnc_all, smnc_CT, smnc_BT = _score_and_write(X_smnc, y_smnc, 'smnc')

#%% ROS: Numerical + categorical
mm_skf_scoresD3, ros_all, ros_CT, ros_BT = _score_and_write(X_ros, y_ros, 'ros')
#%% RUS: Numerical + categorical
mm_skf_scoresD4 = MultModelsCl(input_df = X_rus
, target = y_rus
, var_type = 'mixed'
, skf_cv = skf_cv
, blind_test_input_df = X_bts
, blind_test_target = y_bts)
rus_all = pd.DataFrame(mm_skf_scoresD4)
rus_all = rus_all.T
rus_CT = rus_all.filter(like='test_', axis=1)
rus_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
rus_BT = rus_all.filter(like='bts_' , axis=1)
rus_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
# Write csv
rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv')
rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')
#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC
, target = y_rouC
, var_type = 'mixed'
, skf_cv = skf_cv
, blind_test_input_df = X_bts
, blind_test_target = y_bts)
rouC_all = pd.DataFrame(mm_skf_scoresD8)
rouC_all = rouC_all.T
rouC_CT = rouC_all.filter(like='test_', axis=1)
rouC_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
rouC_BT = rouC_all.filter(like='bts_', axis=1)
rouC_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
# Write csv
rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv')
rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')

View file

@ -1,202 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022
@author: tanu
"""
import os
# Gene/drug pair analysed by this script
gene = 'gid'
drug = 'streptomycin'
#total_mtblineage_uc = 8
homedir = os.path.expanduser("~")
# Run from the ml scripts directory (presumably so the local modules imported below resolve — TODO confirm)
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
# First star import: presumably what brings setvars() into scope — confirm against ml_data_rt
from ml_data_rt import *
setvars(gene,drug)
# Second star import, deliberately AFTER setvars(): presumably picks up the names setvars()
# creates inside ml_data_rt (X, y, X_bts, outdir, ... are not defined in this script) — TODO confirm
from ml_data_rt import *
# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML
# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl
############################################################################
# Banner identifying the run
print('\n#####################################################################\n'
, '\nRunning ML analysis: REVERSE training\n'
, '\nGene name:', gene
, '\nDrug name:', drug)
#==================
# Specify outdir
#==================
# Result CSVs are written under this directory; `outdir` is not defined in this script
# (presumably supplied by the star import above — verify)
outdir_ml = outdir + 'ml/tts_rt/'
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
# Report feature count, train/test shapes and target counts/ratios
print(
    '\nSanity checks:',
    '\nTotal input features:', len(X.columns),
    '\n',
    '\nTraining data size:', X.shape,
    '\nTest data size:', X_bts.shape,
    '\n',
    '\nTarget feature numbers (training data):', Counter(y),
    '\nTarget features ratio (training data:', yc1_ratio,
    '\n',
    '\nTarget feature numbers (test data):', Counter(y_bts),
    '\nTarget features ratio (test data):', yc2_ratio,
    '\n\n#####################################################################\n',
)

print('\n================================================================\n')

# Structural feature group: size, then the three sub-lists
print(
    'Strucutral features (n):',
    len(X_ssFN),
    '\nThese are:',
    '\nCommon stablity features:', X_stabilityN,
    '\nFoldX columns:', X_foldX_cols,
    '\nOther struc columns:', X_str,
    '\n================================================================\n',
)

# AAindex feature group
print(
    'AAindex features (n):',
    len(X_aaindexFN),
    '\nThese are:\n',
    X_aaindexFN,
    '\n================================================================\n',
)

# Evolutionary feature group
print(
    'Evolutionary features (n):',
    len(X_evolFN),
    '\nThese are:\n',
    X_evolFN,
    '\n================================================================\n',
)

# Genomic feature group (printed as its two sub-lists)
print(
    'Genomic features (n):',
    len(X_genomicFN),
    '\nThese are:\n',
    X_genomic_mafor, '\n',
    X_genomic_linegae,
    '\n================================================================\n',
)

# Categorical feature group
print(
    'Categorical features (n):',
    len(categorical_FN),
    '\nThese are:\n',
    categorical_FN,
    '\n================================================================\n',
)

# The group sizes must add up to the number of columns in X; abort otherwise.
if len(X.columns) != sum(map(len, (X_ssFN, X_aaindexFN, X_evolFN, X_genomicFN, categorical_FN))):
    sys.exit('\nFail: Count of feature mismatch')
print('\nPass: No. of features match')
print('\n#####################################################################\n')
###############################################################################
#==================
# Baseline models
#==================
def _run_models(input_df, target, tag):
    """Score all MultModelsCl classifiers on one training set and write the results.

    Reads the module-level names skf_cv, X_bts, y_bts, outdir_ml and gene.

    Parameters
    ----------
    input_df : training features, forwarded to MultModelsCl as ``input_df``
    target : training labels, forwarded to MultModelsCl as ``target``
    tag : str
        Label inserted into the output file names
        (outdir_ml + <gene>_<tag>_CT_allF.csv / <gene>_<tag>_BT_allF.csv).

    Returns
    -------
    tuple
        (scoresD, scores_all, scores_CT, scores_BT): the raw MultModelsCl
        result, its transposed DataFrame, the 'test_' columns sorted by
        test_mcc and the 'bts_' columns sorted by bts_mcc (both descending).
    """
    # MultModelsCl names its blind-test argument `blind_test_df`; the former
    # keyword `blind_test_input_df` is no longer accepted.
    scoresD = MultModelsCl(input_df = input_df
                           , target = target
                           , var_type = 'mixed'
                           , skf_cv = skf_cv
                           , blind_test_df = X_bts
                           , blind_test_target = y_bts)
    # Transpose so each top-level key of scoresD becomes a row
    scores_all = pd.DataFrame(scoresD).T
    # Sort through the returned frame instead of mutating the filter() result in place
    scores_CT = scores_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
    scores_BT = scores_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
    # Write csv
    scores_CT.to_csv(outdir_ml + gene.lower() + '_' + tag + '_CT_allF.csv')
    scores_BT.to_csv(outdir_ml + gene.lower() + '_' + tag + '_BT_allF.csv')
    return scoresD, scores_all, scores_CT, scores_BT

mm_skf_scoresD, baseline_all, baseline_CT, baseline_BT = _run_models(X, y, 'baseline')
#baseline_train = baseline_all.filter(like='train_', axis=1)

#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7, smnc_all, smnc_CT, smnc_BT = _run_models(X_smnc, y_smnc, 'smnc')

#%% ROS: Numerical + categorical
mm_skf_scoresD3, ros_all, ros_CT, ros_BT = _run_models(X_ros, y_ros, 'ros')

#%% RUS: Numerical + categorical
mm_skf_scoresD4, rus_all, rus_CT, rus_BT = _run_models(X_rus, y_rus, 'rus')

#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8, rouC_all, rouC_CT, rouC_BT = _run_models(X_rouC, y_rouC, 'rouC')

View file

@ -1,203 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022
@author: tanu
"""
import os
# Gene/drug pair analysed by this script
gene = 'gid'
drug = 'streptomycin'
#total_mtblineage_uc = 8
homedir = os.path.expanduser("~")
# Run from the ml scripts directory (presumably so the local modules imported below resolve — TODO confirm)
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
# First star import: presumably what brings setvars() into scope — confirm against ml_data_sl
from ml_data_sl import *
setvars(gene,drug)
# Second star import, deliberately AFTER setvars(): presumably picks up the names setvars()
# creates inside ml_data_sl (X, y, X_bts, outdir, ... are not defined in this script) — TODO confirm
from ml_data_sl import *
# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML
# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl
################################################################################
# Banner identifying the run
print('\n#####################################################################\n'
, '\nRunning ML analysis: scaling law split'
, '\nGene name:', gene
, '\nDrug name:', drug)
#==================
# Specify outdir
#==================
# Result CSVs are written under this directory; `outdir` is not defined in this script
# (presumably supplied by the star import above — verify)
outdir_ml = outdir + 'ml/tts_sl/'
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
# Report feature count, train/test shapes and target counts/ratios
print(
    '\nSanity checks:',
    # '\nML source data size:', x_features.shape,
    '\nTotal input features:', len(X.columns),
    '\n',
    '\nTraining data size:', X.shape,
    '\nTest data size:', X_bts.shape,
    '\n',
    '\nTarget feature numbers (training data):', Counter(y),
    '\nTarget features ratio (training data:', yc1_ratio,
    '\n',
    '\nTarget feature numbers (test data):', Counter(y_bts),
    '\nTarget features ratio (test data):', yc2_ratio,
    '\n\n#####################################################################\n',
)

print('\n================================================================\n')

# Structural feature group: size, then the three sub-lists
print(
    'Strucutral features (n):',
    len(X_ssFN),
    '\nThese are:',
    '\nCommon stablity features:', X_stabilityN,
    '\nFoldX columns:', X_foldX_cols,
    '\nOther struc columns:', X_str,
    '\n================================================================\n',
)

# AAindex feature group
print(
    'AAindex features (n):',
    len(X_aaindexFN),
    '\nThese are:\n',
    X_aaindexFN,
    '\n================================================================\n',
)

# Evolutionary feature group
print(
    'Evolutionary features (n):',
    len(X_evolFN),
    '\nThese are:\n',
    X_evolFN,
    '\n================================================================\n',
)

# Genomic feature group (printed as its two sub-lists)
print(
    'Genomic features (n):',
    len(X_genomicFN),
    '\nThese are:\n',
    X_genomic_mafor, '\n',
    X_genomic_linegae,
    '\n================================================================\n',
)

# Categorical feature group
print(
    'Categorical features (n):',
    len(categorical_FN),
    '\nThese are:\n',
    categorical_FN,
    '\n================================================================\n',
)

# The group sizes must add up to the number of columns in X; abort otherwise.
if len(X.columns) != sum(map(len, (X_ssFN, X_aaindexFN, X_evolFN, X_genomicFN, categorical_FN))):
    sys.exit('\nFail: Count of feature mismatch')
print('\nPass: No. of features match')
print('\n#####################################################################\n')
################################################################################
#==================
# Baseline models
#==================
def _run_models(input_df, target, tag):
    """Score all MultModelsCl classifiers on one training set and write the results.

    Reads the module-level names skf_cv, X_bts, y_bts, outdir_ml and gene.

    Parameters
    ----------
    input_df : training features, forwarded to MultModelsCl as ``input_df``
    target : training labels, forwarded to MultModelsCl as ``target``
    tag : str
        Label inserted into the output file names
        (outdir_ml + <gene>_<tag>_CT_allF.csv / <gene>_<tag>_BT_allF.csv).

    Returns
    -------
    tuple
        (scoresD, scores_all, scores_CT, scores_BT): the raw MultModelsCl
        result, its transposed DataFrame, the 'test_' columns sorted by
        test_mcc and the 'bts_' columns sorted by bts_mcc (both descending).
    """
    # MultModelsCl names its blind-test argument `blind_test_df`; the former
    # keyword `blind_test_input_df` is no longer accepted.
    scoresD = MultModelsCl(input_df = input_df
                           , target = target
                           , var_type = 'mixed'
                           , skf_cv = skf_cv
                           , blind_test_df = X_bts
                           , blind_test_target = y_bts)
    # Transpose so each top-level key of scoresD becomes a row
    scores_all = pd.DataFrame(scoresD).T
    # Sort through the returned frame instead of mutating the filter() result in place
    scores_CT = scores_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
    scores_BT = scores_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
    # Write csv
    scores_CT.to_csv(outdir_ml + gene.lower() + '_' + tag + '_CT_allF.csv')
    scores_BT.to_csv(outdir_ml + gene.lower() + '_' + tag + '_BT_allF.csv')
    return scoresD, scores_all, scores_CT, scores_BT

mm_skf_scoresD, baseline_all, baseline_CT, baseline_BT = _run_models(X, y, 'baseline')
#baseline_train = baseline_all.filter(like='train_', axis=1)

#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7, smnc_all, smnc_CT, smnc_BT = _run_models(X_smnc, y_smnc, 'smnc')

#%% ROS: Numerical + categorical
mm_skf_scoresD3, ros_all, ros_CT, ros_BT = _run_models(X_ros, y_ros, 'ros')

#%% RUS: Numerical + categorical
mm_skf_scoresD4, rus_all, rus_CT, rus_BT = _run_models(X_rus, y_rus, 'rus')

#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8, rouC_all, rouC_CT, rouC_BT = _run_models(X_rouC, y_rouC, 'rouC')

View file

@ -1,203 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022
@author: tanu
"""
import os
# Gene/drug pair analysed by this script
gene = 'katG'
drug = 'isoniazid'
#total_mtblineage_uc = 8
homedir = os.path.expanduser("~")
# Run from the ml scripts directory (presumably so the local modules imported below resolve — TODO confirm)
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
# First star import: presumably what brings setvars() into scope — confirm against ml_data_7030
from ml_data_7030 import *
setvars(gene,drug)
# Second star import, deliberately AFTER setvars(): presumably picks up the names setvars()
# creates inside ml_data_7030 (X, y, X_bts, outdir, ... are not defined in this script) — TODO confirm
from ml_data_7030 import *
# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML
# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl
################################################################################
# Banner identifying the run
print('\n#####################################################################\n'
, '\nRunning ML analysis: 70/30 split'
, '\nGene name:', gene
, '\nDrug name:', drug)
#==================
# Specify outdir
#==================
# Result CSVs are written under this directory; `outdir` is not defined in this script
# (presumably supplied by the star import above — verify)
outdir_ml = outdir + 'ml/tts_7030/'
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
# Report feature count, train/test shapes and target counts/ratios
print(
    '\nSanity checks:',
    # '\nML source data size:', x_features.shape,
    '\nTotal input features:', len(X.columns),
    '\n',
    '\nTraining data size:', X.shape,
    '\nTest data size:', X_bts.shape,
    '\n',
    '\nTarget feature numbers (training data):', Counter(y),
    '\nTarget features ratio (training data:', yc1_ratio,
    '\n',
    '\nTarget feature numbers (test data):', Counter(y_bts),
    '\nTarget features ratio (test data):', yc2_ratio,
    '\n\n#####################################################################\n',
)

print('\n================================================================\n')

# Structural feature group: size, then the three sub-lists
print(
    'Strucutral features (n):',
    len(X_ssFN),
    '\nThese are:',
    '\nCommon stablity features:', X_stabilityN,
    '\nFoldX columns:', X_foldX_cols,
    '\nOther struc columns:', X_str,
    '\n================================================================\n',
)

# AAindex feature group
print(
    'AAindex features (n):',
    len(X_aaindexFN),
    '\nThese are:\n',
    X_aaindexFN,
    '\n================================================================\n',
)

# Evolutionary feature group
print(
    'Evolutionary features (n):',
    len(X_evolFN),
    '\nThese are:\n',
    X_evolFN,
    '\n================================================================\n',
)

# Genomic feature group (printed as its two sub-lists)
print(
    'Genomic features (n):',
    len(X_genomicFN),
    '\nThese are:\n',
    X_genomic_mafor, '\n',
    X_genomic_linegae,
    '\n================================================================\n',
)

# Categorical feature group
print(
    'Categorical features (n):',
    len(categorical_FN),
    '\nThese are:\n',
    categorical_FN,
    '\n================================================================\n',
)

# The group sizes must add up to the number of columns in X; abort otherwise.
if len(X.columns) != sum(map(len, (X_ssFN, X_aaindexFN, X_evolFN, X_genomicFN, categorical_FN))):
    sys.exit('\nFail: Count of feature mismatch')
print('\nPass: No. of features match')
print('\n#####################################################################\n')
################################################################################
#==================
# Baseline models
#==================
def _run_models(input_df, target, tag):
    """Score all MultModelsCl classifiers on one training set and write the results.

    Reads the module-level names skf_cv, X_bts, y_bts, outdir_ml and gene.

    Parameters
    ----------
    input_df : training features, forwarded to MultModelsCl as ``input_df``
    target : training labels, forwarded to MultModelsCl as ``target``
    tag : str
        Label inserted into the output file names
        (outdir_ml + <gene>_<tag>_CT_allF.csv / <gene>_<tag>_BT_allF.csv).

    Returns
    -------
    tuple
        (scoresD, scores_all, scores_CT, scores_BT): the raw MultModelsCl
        result, its transposed DataFrame, the 'test_' columns sorted by
        test_mcc and the 'bts_' columns sorted by bts_mcc (both descending).
    """
    # MultModelsCl names its blind-test argument `blind_test_df`; the former
    # keyword `blind_test_input_df` is no longer accepted.
    scoresD = MultModelsCl(input_df = input_df
                           , target = target
                           , var_type = 'mixed'
                           , skf_cv = skf_cv
                           , blind_test_df = X_bts
                           , blind_test_target = y_bts)
    # Transpose so each top-level key of scoresD becomes a row
    scores_all = pd.DataFrame(scoresD).T
    # Sort through the returned frame instead of mutating the filter() result in place
    scores_CT = scores_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
    scores_BT = scores_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
    # Write csv
    scores_CT.to_csv(outdir_ml + gene.lower() + '_' + tag + '_CT_allF.csv')
    scores_BT.to_csv(outdir_ml + gene.lower() + '_' + tag + '_BT_allF.csv')
    return scoresD, scores_all, scores_CT, scores_BT

mm_skf_scoresD, baseline_all, baseline_CT, baseline_BT = _run_models(X, y, 'baseline')
#baseline_train = baseline_all.filter(like='train_', axis=1)

#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7, smnc_all, smnc_CT, smnc_BT = _run_models(X_smnc, y_smnc, 'smnc')

#%% ROS: Numerical + categorical
mm_skf_scoresD3, ros_all, ros_CT, ros_BT = _run_models(X_ros, y_ros, 'ros')

#%% RUS: Numerical + categorical
mm_skf_scoresD4, rus_all, rus_CT, rus_BT = _run_models(X_rus, y_rus, 'rus')

#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8, rouC_all, rouC_CT, rouC_BT = _run_models(X_rouC, y_rouC, 'rouC')

View file

@ -1,203 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022
@author: tanu
"""
import os
# Gene/drug pair analysed by this script
gene = 'katG'
drug = 'isoniazid'
#total_mtblineage_uc = 8
homedir = os.path.expanduser("~")
# Run from the ml scripts directory (presumably so the local modules imported below resolve — TODO confirm)
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
# First star import: presumably what brings setvars() into scope — confirm against ml_data_8020
from ml_data_8020 import *
setvars(gene,drug)
# Second star import, deliberately AFTER setvars(): presumably picks up the names setvars()
# creates inside ml_data_8020 (X, y, X_bts, outdir, ... are not defined in this script) — TODO confirm
from ml_data_8020 import *
# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML
# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl
################################################################################
# Banner identifying the run
print('\n#####################################################################\n'
, '\nRunning ML analysis: 80/20 split'
, '\nGene name:', gene
, '\nDrug name:', drug)
#==================
# Specify outdir
#==================
# Result CSVs are written under this directory; `outdir` is not defined in this script
# (presumably supplied by the star import above — verify)
outdir_ml = outdir + 'ml/tts_8020/'
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
# Report feature count, train/test shapes and target counts/ratios
print(
    '\nSanity checks:',
    # '\nML source data size:', x_features.shape,
    '\nTotal input features:', len(X.columns),
    '\n',
    '\nTraining data size:', X.shape,
    '\nTest data size:', X_bts.shape,
    '\n',
    '\nTarget feature numbers (training data):', Counter(y),
    '\nTarget features ratio (training data:', yc1_ratio,
    '\n',
    '\nTarget feature numbers (test data):', Counter(y_bts),
    '\nTarget features ratio (test data):', yc2_ratio,
    '\n\n#####################################################################\n',
)

print('\n================================================================\n')

# Structural feature group: size, then the three sub-lists
print(
    'Strucutral features (n):',
    len(X_ssFN),
    '\nThese are:',
    '\nCommon stablity features:', X_stabilityN,
    '\nFoldX columns:', X_foldX_cols,
    '\nOther struc columns:', X_str,
    '\n================================================================\n',
)

# AAindex feature group
print(
    'AAindex features (n):',
    len(X_aaindexFN),
    '\nThese are:\n',
    X_aaindexFN,
    '\n================================================================\n',
)

# Evolutionary feature group
print(
    'Evolutionary features (n):',
    len(X_evolFN),
    '\nThese are:\n',
    X_evolFN,
    '\n================================================================\n',
)

# Genomic feature group (printed as its two sub-lists)
print(
    'Genomic features (n):',
    len(X_genomicFN),
    '\nThese are:\n',
    X_genomic_mafor, '\n',
    X_genomic_linegae,
    '\n================================================================\n',
)

# Categorical feature group
print(
    'Categorical features (n):',
    len(categorical_FN),
    '\nThese are:\n',
    categorical_FN,
    '\n================================================================\n',
)

# The group sizes must add up to the number of columns in X; abort otherwise.
if len(X.columns) != sum(map(len, (X_ssFN, X_aaindexFN, X_evolFN, X_genomicFN, categorical_FN))):
    sys.exit('\nFail: Count of feature mismatch')
print('\nPass: No. of features match')
print('\n#####################################################################\n')
################################################################################
#==================
# Baseline models
#==================
def _run_models(input_df, target, tag):
    """Score all MultModelsCl classifiers on one training set and write the results.

    Reads the module-level names skf_cv, X_bts, y_bts, outdir_ml and gene.

    Parameters
    ----------
    input_df : training features, forwarded to MultModelsCl as ``input_df``
    target : training labels, forwarded to MultModelsCl as ``target``
    tag : str
        Label inserted into the output file names
        (outdir_ml + <gene>_<tag>_CT_allF.csv / <gene>_<tag>_BT_allF.csv).

    Returns
    -------
    tuple
        (scoresD, scores_all, scores_CT, scores_BT): the raw MultModelsCl
        result, its transposed DataFrame, the 'test_' columns sorted by
        test_mcc and the 'bts_' columns sorted by bts_mcc (both descending).
    """
    # MultModelsCl names its blind-test argument `blind_test_df`; the former
    # keyword `blind_test_input_df` is no longer accepted.
    scoresD = MultModelsCl(input_df = input_df
                           , target = target
                           , var_type = 'mixed'
                           , skf_cv = skf_cv
                           , blind_test_df = X_bts
                           , blind_test_target = y_bts)
    # Transpose so each top-level key of scoresD becomes a row
    scores_all = pd.DataFrame(scoresD).T
    # Sort through the returned frame instead of mutating the filter() result in place
    scores_CT = scores_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
    scores_BT = scores_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
    # Write csv
    scores_CT.to_csv(outdir_ml + gene.lower() + '_' + tag + '_CT_allF.csv')
    scores_BT.to_csv(outdir_ml + gene.lower() + '_' + tag + '_BT_allF.csv')
    return scoresD, scores_all, scores_CT, scores_BT

mm_skf_scoresD, baseline_all, baseline_CT, baseline_BT = _run_models(X, y, 'baseline')
#baseline_train = baseline_all.filter(like='train_', axis=1)

#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7, smnc_all, smnc_CT, smnc_BT = _run_models(X_smnc, y_smnc, 'smnc')

#%% ROS: Numerical + categorical
mm_skf_scoresD3, ros_all, ros_CT, ros_BT = _run_models(X_ros, y_ros, 'ros')

#%% RUS: Numerical + categorical
mm_skf_scoresD4, rus_all, rus_CT, rus_BT = _run_models(X_rus, y_rus, 'rus')

#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8, rouC_all, rouC_CT, rouC_BT = _run_models(X_rouC, y_rouC, 'rouC')

View file

@ -1,203 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022
@author: tanu
"""
import os
# Gene/drug pair analysed by this script
gene = 'katG'
drug = 'isoniazid'
#total_mtblineage_uc = 8
homedir = os.path.expanduser("~")
# Run from the ml scripts directory (presumably so the local modules imported below resolve — TODO confirm)
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
# First star import: presumably what brings setvars() into scope — confirm against ml_data_cd_7030
from ml_data_cd_7030 import *
setvars(gene,drug)
# Second star import, deliberately AFTER setvars(): presumably picks up the names setvars()
# creates inside ml_data_cd_7030 (X, y, X_bts, outdir, ... are not defined in this script) — TODO confirm
from ml_data_cd_7030 import *
# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML
# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl
################################################################################
# Banner identifying the run
print('\n#####################################################################\n'
, '\nRunning ML analysis [COMPLETE DATA]: 70/30 split'
, '\nGene name:', gene
, '\nDrug name:', drug)
#==================
# Specify outdir
#==================
# Result CSVs are written under this directory; `outdir` is not defined in this script
# (presumably supplied by the star import above — verify)
outdir_ml = outdir + 'ml/tts_cd_7030/'
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
# Report feature count, train/test shapes and target counts/ratios
print(
    '\nSanity checks:',
    # '\nML source data size:', x_features.shape,
    '\nTotal input features:', len(X.columns),
    '\n',
    '\nTraining data size:', X.shape,
    '\nTest data size:', X_bts.shape,
    '\n',
    '\nTarget feature numbers (training data):', Counter(y),
    '\nTarget features ratio (training data:', yc1_ratio,
    '\n',
    '\nTarget feature numbers (test data):', Counter(y_bts),
    '\nTarget features ratio (test data):', yc2_ratio,
    '\n\n#####################################################################\n',
)

print('\n================================================================\n')

# Structural feature group: size, then the three sub-lists
print(
    'Strucutral features (n):',
    len(X_ssFN),
    '\nThese are:',
    '\nCommon stablity features:', X_stabilityN,
    '\nFoldX columns:', X_foldX_cols,
    '\nOther struc columns:', X_str,
    '\n================================================================\n',
)

# AAindex feature group
print(
    'AAindex features (n):',
    len(X_aaindexFN),
    '\nThese are:\n',
    X_aaindexFN,
    '\n================================================================\n',
)

# Evolutionary feature group
print(
    'Evolutionary features (n):',
    len(X_evolFN),
    '\nThese are:\n',
    X_evolFN,
    '\n================================================================\n',
)

# Genomic feature group (printed as its two sub-lists)
print(
    'Genomic features (n):',
    len(X_genomicFN),
    '\nThese are:\n',
    X_genomic_mafor, '\n',
    X_genomic_linegae,
    '\n================================================================\n',
)

# Categorical feature group
print(
    'Categorical features (n):',
    len(categorical_FN),
    '\nThese are:\n',
    categorical_FN,
    '\n================================================================\n',
)

# The group sizes must add up to the number of columns in X; abort otherwise.
if len(X.columns) != sum(map(len, (X_ssFN, X_aaindexFN, X_evolFN, X_genomicFN, categorical_FN))):
    sys.exit('\nFail: Count of feature mismatch')
print('\nPass: No. of features match')
print('\n#####################################################################\n')
################################################################################
#==================
# Baseline models
#==================
def _run_models(input_df, target, tag):
    """Score all MultModelsCl classifiers on one training set and write the results.

    Reads the module-level names skf_cv, X_bts, y_bts, outdir_ml and gene.

    Parameters
    ----------
    input_df : training features, forwarded to MultModelsCl as ``input_df``
    target : training labels, forwarded to MultModelsCl as ``target``
    tag : str
        Label inserted into the output file names
        (outdir_ml + <gene>_<tag>_CT_allF.csv / <gene>_<tag>_BT_allF.csv).

    Returns
    -------
    tuple
        (scoresD, scores_all, scores_CT, scores_BT): the raw MultModelsCl
        result, its transposed DataFrame, the 'test_' columns sorted by
        test_mcc and the 'bts_' columns sorted by bts_mcc (both descending).
    """
    # MultModelsCl names its blind-test argument `blind_test_df`; the former
    # keyword `blind_test_input_df` is no longer accepted.
    scoresD = MultModelsCl(input_df = input_df
                           , target = target
                           , var_type = 'mixed'
                           , skf_cv = skf_cv
                           , blind_test_df = X_bts
                           , blind_test_target = y_bts)
    # Transpose so each top-level key of scoresD becomes a row
    scores_all = pd.DataFrame(scoresD).T
    # Sort through the returned frame instead of mutating the filter() result in place
    scores_CT = scores_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
    scores_BT = scores_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
    # Write csv
    scores_CT.to_csv(outdir_ml + gene.lower() + '_' + tag + '_CT_allF.csv')
    scores_BT.to_csv(outdir_ml + gene.lower() + '_' + tag + '_BT_allF.csv')
    return scoresD, scores_all, scores_CT, scores_BT

mm_skf_scoresD, baseline_all, baseline_CT, baseline_BT = _run_models(X, y, 'baseline')
#baseline_train = baseline_all.filter(like='train_', axis=1)

#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7, smnc_all, smnc_CT, smnc_BT = _run_models(X_smnc, y_smnc, 'smnc')

#%% ROS: Numerical + categorical
mm_skf_scoresD3, ros_all, ros_CT, ros_BT = _run_models(X_ros, y_ros, 'ros')
#%% RUS: Numerical + categorical
mm_skf_scoresD4 = MultModelsCl(input_df = X_rus
, target = y_rus
, var_type = 'mixed'
, skf_cv = skf_cv
, blind_test_input_df = X_bts
, blind_test_target = y_bts)
rus_all = pd.DataFrame(mm_skf_scoresD4)
rus_all = rus_all.T
rus_CT = rus_all.filter(like='test_', axis=1)
rus_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
rus_BT = rus_all.filter(like='bts_' , axis=1)
rus_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
# Write csv
rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv')
rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')
#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC
, target = y_rouC
, var_type = 'mixed'
, skf_cv = skf_cv
, blind_test_input_df = X_bts
, blind_test_target = y_bts)
rouC_all = pd.DataFrame(mm_skf_scoresD8)
rouC_all = rouC_all.T
rouC_CT = rouC_all.filter(like='test_', axis=1)
rouC_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
rouC_BT = rouC_all.filter(like='bts_', axis=1)
rouC_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
# Write csv
rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv')
rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')

View file

@ -1,203 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022
@author: tanu
"""
import os

# Gene/drug pair analysed by this run; both are handed to setvars() below.
gene = 'katG'
drug = 'isoniazid'
#total_mtblineage_uc = 8

# Work from the ml scripts directory -- presumably so the sibling modules
# imported below resolve by bare name (TODO confirm).
homedir = os.path.expanduser("~")
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')

# The data module is star-imported on both sides of setvars(): the first import
# brings setvars into scope; the second presumably picks up the names that
# setvars(gene, drug) defines (X, y, X_bts, outdir, ...) -- TODO confirm.
from ml_data_cd_8020 import *
setvars(gene,drug)
from ml_data_cd_8020 import *

# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML

# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl
################################################################################
print('\n#####################################################################\n'
      , '\nRunning ML analysis [COMPLETE DATA]: 80/20 split'
      , '\nGene name:', gene
      , '\nDrug name:', drug)

#==================
# Specify outdir
#==================
outdir_ml = outdir + 'ml/tts_cd_8020/'
# DataFrame.to_csv does not create missing directories, so make sure the
# target exists before any result file is written.
os.makedirs(outdir_ml, exist_ok = True)
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
# Report dataset dimensions and class balance, list each feature group with
# its size, then verify that the groups account for every column of X.
print('\nSanity checks:',
      '\nTotal input features:', len(X.columns),
      '\n',
      '\nTraining data size:', X.shape,
      '\nTest data size:', X_bts.shape,
      '\n',
      '\nTarget feature numbers (training data):', Counter(y),
      '\nTarget features ratio (training data:', yc1_ratio,
      '\n',
      '\nTarget feature numbers (test data):', Counter(y_bts),
      '\nTarget features ratio (test data):', yc2_ratio,
      '\n\n#####################################################################\n')

section_rule = '\n================================================================\n'

print(section_rule)
print('Strucutral features (n):',
      len(X_ssFN),
      '\nThese are:',
      '\nCommon stablity features:', X_stabilityN,
      '\nFoldX columns:', X_foldX_cols,
      '\nOther struc columns:', X_str,
      section_rule)
print('AAindex features (n):',
      len(X_aaindexFN),
      '\nThese are:\n',
      X_aaindexFN,
      section_rule)
print('Evolutionary features (n):',
      len(X_evolFN),
      '\nThese are:\n',
      X_evolFN,
      section_rule)
print('Genomic features (n):',
      len(X_genomicFN),
      '\nThese are:\n',
      X_genomic_mafor, '\n',
      X_genomic_linegae,
      section_rule)
print('Categorical features (n):',
      len(categorical_FN),
      '\nThese are:\n',
      categorical_FN,
      section_rule)

# Abort early if any column of X is unaccounted for (or counted twice).
feature_groups = (X_ssFN, X_aaindexFN, X_evolFN, X_genomicFN, categorical_FN)
if len(X.columns) != sum(len(group) for group in feature_groups):
    sys.exit('\nFail: Count of feature mismatch')
print('\nPass: No. of features match')
print('\n#####################################################################\n')
################################################################################
#==================
# Baseline models
#==================
# Every section below runs the MultModelsCl pipeline on one version of the
# training data, transposes the returned scores into a table, and splits it
# into the 'test_*' columns (*_CT) and the 'bts_*' columns (*_BT), each sorted
# by its MCC column in descending order before being written to csv.
# The blind test frame is passed as `blind_test_df`, the name MultModelsCl
# uses for that parameter (formerly `blind_test_input_df`).
mm_skf_scoresD = MultModelsCl(input_df = X
                              , target = y
                              , var_type = 'mixed'
                              , skf_cv = skf_cv
                              , blind_test_df = X_bts
                              , blind_test_target = y_bts)
baseline_all = pd.DataFrame(mm_skf_scoresD).T
baseline_CT = baseline_all.filter(like = 'test_', axis = 1).sort_values(by = ['test_mcc'], ascending = False)
baseline_BT = baseline_all.filter(like = 'bts_', axis = 1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv')
baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')

#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc
                               , target = y_smnc
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
smnc_all = pd.DataFrame(mm_skf_scoresD7).T
smnc_CT = smnc_all.filter(like = 'test_', axis = 1).sort_values(by = ['test_mcc'], ascending = False)
smnc_BT = smnc_all.filter(like = 'bts_', axis = 1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv')
smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')

#%% ROS: Numerical + categorical
mm_skf_scoresD3 = MultModelsCl(input_df = X_ros
                               , target = y_ros
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
ros_all = pd.DataFrame(mm_skf_scoresD3).T
ros_CT = ros_all.filter(like = 'test_', axis = 1).sort_values(by = ['test_mcc'], ascending = False)
ros_BT = ros_all.filter(like = 'bts_', axis = 1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv')
ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')

#%% RUS: Numerical + categorical
mm_skf_scoresD4 = MultModelsCl(input_df = X_rus
                               , target = y_rus
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
rus_all = pd.DataFrame(mm_skf_scoresD4).T
rus_CT = rus_all.filter(like = 'test_', axis = 1).sort_values(by = ['test_mcc'], ascending = False)
rus_BT = rus_all.filter(like = 'bts_', axis = 1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv')
rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')

#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC
                               , target = y_rouC
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
rouC_all = pd.DataFrame(mm_skf_scoresD8).T
rouC_CT = rouC_all.filter(like = 'test_', axis = 1).sort_values(by = ['test_mcc'], ascending = False)
rouC_BT = rouC_all.filter(like = 'bts_', axis = 1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv')
rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')

View file

@ -1,203 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022
@author: tanu
"""
import os

# Gene/drug pair analysed by this run; both are handed to setvars() below.
gene = 'katG'
drug = 'isoniazid'
#total_mtblineage_uc = 8

# Work from the ml scripts directory -- presumably so the sibling modules
# imported below resolve by bare name (TODO confirm).
homedir = os.path.expanduser("~")
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')

# The data module is star-imported on both sides of setvars(): the first import
# brings setvars into scope; the second presumably picks up the names that
# setvars(gene, drug) defines (X, y, X_bts, outdir, ...) -- TODO confirm.
from ml_data_cd_sl import *
setvars(gene,drug)
from ml_data_cd_sl import *

# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML

# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl
################################################################################
# NOTE(review): the banner announces a 70/30 split while the data module is
# ml_data_cd_sl -- confirm the label matches what that module actually does.
print('\n#####################################################################\n'
      , '\nRunning ML analysis [COMPLETE DATA]: 70/30 split'
      , '\nGene name:', gene
      , '\nDrug name:', drug)

#==================
# Specify outdir
#==================
outdir_ml = outdir + 'ml/tts_cd_sl/'
# DataFrame.to_csv does not create missing directories, so make sure the
# target exists before any result file is written.
os.makedirs(outdir_ml, exist_ok = True)
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
# Report dataset dimensions and class balance, list each feature group with
# its size, then verify that the groups account for every column of X.
print('\nSanity checks:',
      '\nTotal input features:', len(X.columns),
      '\n',
      '\nTraining data size:', X.shape,
      '\nTest data size:', X_bts.shape,
      '\n',
      '\nTarget feature numbers (training data):', Counter(y),
      '\nTarget features ratio (training data:', yc1_ratio,
      '\n',
      '\nTarget feature numbers (test data):', Counter(y_bts),
      '\nTarget features ratio (test data):', yc2_ratio,
      '\n\n#####################################################################\n')

section_rule = '\n================================================================\n'

print(section_rule)
print('Strucutral features (n):',
      len(X_ssFN),
      '\nThese are:',
      '\nCommon stablity features:', X_stabilityN,
      '\nFoldX columns:', X_foldX_cols,
      '\nOther struc columns:', X_str,
      section_rule)
print('AAindex features (n):',
      len(X_aaindexFN),
      '\nThese are:\n',
      X_aaindexFN,
      section_rule)
print('Evolutionary features (n):',
      len(X_evolFN),
      '\nThese are:\n',
      X_evolFN,
      section_rule)
print('Genomic features (n):',
      len(X_genomicFN),
      '\nThese are:\n',
      X_genomic_mafor, '\n',
      X_genomic_linegae,
      section_rule)
print('Categorical features (n):',
      len(categorical_FN),
      '\nThese are:\n',
      categorical_FN,
      section_rule)

# Abort early if any column of X is unaccounted for (or counted twice).
feature_groups = (X_ssFN, X_aaindexFN, X_evolFN, X_genomicFN, categorical_FN)
if len(X.columns) != sum(len(group) for group in feature_groups):
    sys.exit('\nFail: Count of feature mismatch')
print('\nPass: No. of features match')
print('\n#####################################################################\n')
################################################################################
#==================
# Baseline models
#==================
# Every section below runs the MultModelsCl pipeline on one version of the
# training data, transposes the returned scores into a table, and splits it
# into the 'test_*' columns (*_CT) and the 'bts_*' columns (*_BT), each sorted
# by its MCC column in descending order before being written to csv.
# The blind test frame is passed as `blind_test_df`, the name MultModelsCl
# uses for that parameter (formerly `blind_test_input_df`).
mm_skf_scoresD = MultModelsCl(input_df = X
                              , target = y
                              , var_type = 'mixed'
                              , skf_cv = skf_cv
                              , blind_test_df = X_bts
                              , blind_test_target = y_bts)
baseline_all = pd.DataFrame(mm_skf_scoresD).T
baseline_CT = baseline_all.filter(like = 'test_', axis = 1).sort_values(by = ['test_mcc'], ascending = False)
baseline_BT = baseline_all.filter(like = 'bts_', axis = 1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv')
baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')

#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc
                               , target = y_smnc
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
smnc_all = pd.DataFrame(mm_skf_scoresD7).T
smnc_CT = smnc_all.filter(like = 'test_', axis = 1).sort_values(by = ['test_mcc'], ascending = False)
smnc_BT = smnc_all.filter(like = 'bts_', axis = 1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv')
smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')

#%% ROS: Numerical + categorical
mm_skf_scoresD3 = MultModelsCl(input_df = X_ros
                               , target = y_ros
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
ros_all = pd.DataFrame(mm_skf_scoresD3).T
ros_CT = ros_all.filter(like = 'test_', axis = 1).sort_values(by = ['test_mcc'], ascending = False)
ros_BT = ros_all.filter(like = 'bts_', axis = 1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv')
ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')

#%% RUS: Numerical + categorical
mm_skf_scoresD4 = MultModelsCl(input_df = X_rus
                               , target = y_rus
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
rus_all = pd.DataFrame(mm_skf_scoresD4).T
rus_CT = rus_all.filter(like = 'test_', axis = 1).sort_values(by = ['test_mcc'], ascending = False)
rus_BT = rus_all.filter(like = 'bts_', axis = 1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv')
rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')

#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC
                               , target = y_rouC
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
rouC_all = pd.DataFrame(mm_skf_scoresD8).T
rouC_CT = rouC_all.filter(like = 'test_', axis = 1).sort_values(by = ['test_mcc'], ascending = False)
rouC_BT = rouC_all.filter(like = 'bts_', axis = 1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv')
rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')

View file

@ -1,210 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022
@author: tanu
"""
import os

# Gene/drug pair analysed by this run; both are handed to setvars() below.
gene = 'katG'
drug = 'isoniazid'
#total_mtblineage_uc = 8

# Work from the ml scripts directory -- presumably so the sibling modules
# imported below resolve by bare name (TODO confirm).
homedir = os.path.expanduser("~")
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')

#---------------------------
# Version 1: no AAindex
#from UQ_ML_data import *
#setvars(gene,drug)
#from UQ_ML_data import *
#---------------------------
# The data module is star-imported on both sides of setvars(): the first import
# brings setvars into scope; the second presumably picks up the names that
# setvars(gene, drug) defines (X, y, X_bts, outdir, ...) -- TODO confirm.
from ml_data import *
setvars(gene,drug)
from ml_data import *

# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML

# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl
############################################################################
print('\n#####################################################################\n'
      , '\nRunning ML analysis: UQ [without AA index but with active site annotations]'
      , '\nGene name:', gene
      , '\nDrug name:', drug)

#==================
# Specify outdir
#==================
outdir_ml = outdir + 'ml/uq_v1/'
# DataFrame.to_csv does not create missing directories, so make sure the
# target exists before any result file is written.
os.makedirs(outdir_ml, exist_ok = True)
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
# Report dataset dimensions and class balance, list each feature group with
# its size, then verify that the groups account for every column of X.
print('\nSanity checks:',
      '\nTotal input features:', len(X.columns),
      '\n',
      '\nTraining data size:', X.shape,
      '\nTest data size:', X_bts.shape,
      '\n',
      '\nTarget feature numbers (training data):', Counter(y),
      '\nTarget features ratio (training data:', yc1_ratio,
      '\n',
      '\nTarget feature numbers (test data):', Counter(y_bts),
      '\nTarget features ratio (test data):', yc2_ratio,
      '\n\n#####################################################################\n')

section_rule = '\n================================================================\n'

print(section_rule)
print('Strucutral features (n):',
      len(X_ssFN),
      '\nThese are:',
      '\nCommon stablity features:', X_stabilityN,
      '\nFoldX columns:', X_foldX_cols,
      '\nOther struc columns:', X_str,
      section_rule)
# AAindex features are left out of this version of the analysis, so they are
# neither listed here nor included in the column count below.
print('Evolutionary features (n):',
      len(X_evolFN),
      '\nThese are:\n',
      X_evolFN,
      section_rule)
print('Genomic features (n):',
      len(X_genomicFN),
      '\nThese are:\n',
      X_genomic_mafor, '\n',
      X_genomic_linegae,
      section_rule)
print('Categorical features (n):',
      len(categorical_FN),
      '\nThese are:\n',
      categorical_FN,
      section_rule)

# Abort early if any column of X is unaccounted for (or counted twice).
feature_groups = (X_ssFN, X_evolFN, X_genomicFN, categorical_FN)
if len(X.columns) != sum(len(group) for group in feature_groups):
    sys.exit('\nFail: Count of feature mismatch')
print('\nPass: No. of features match')
print('\n#####################################################################\n')
###############################################################################
#==================
# Baseline models
#==================
# Every section below runs the MultModelsCl pipeline on one version of the
# training data, transposes the returned scores into a table, and splits it
# into the 'test_*' columns (*_CT) and the 'bts_*' columns (*_BT), each sorted
# by its MCC column in descending order before being written to csv.
# The blind test frame is passed as `blind_test_df`, the name MultModelsCl
# uses for that parameter (formerly `blind_test_input_df`).
mm_skf_scoresD = MultModelsCl(input_df = X
                              , target = y
                              , var_type = 'mixed'
                              , skf_cv = skf_cv
                              , blind_test_df = X_bts
                              , blind_test_target = y_bts)
baseline_all = pd.DataFrame(mm_skf_scoresD).T
baseline_CT = baseline_all.filter(like = 'test_', axis = 1).sort_values(by = ['test_mcc'], ascending = False)
baseline_BT = baseline_all.filter(like = 'bts_', axis = 1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv')
baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')

#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc
                               , target = y_smnc
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
smnc_all = pd.DataFrame(mm_skf_scoresD7).T
smnc_CT = smnc_all.filter(like = 'test_', axis = 1).sort_values(by = ['test_mcc'], ascending = False)
smnc_BT = smnc_all.filter(like = 'bts_', axis = 1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv')
smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')

#%% ROS: Numerical + categorical
mm_skf_scoresD3 = MultModelsCl(input_df = X_ros
                               , target = y_ros
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
ros_all = pd.DataFrame(mm_skf_scoresD3).T
ros_CT = ros_all.filter(like = 'test_', axis = 1).sort_values(by = ['test_mcc'], ascending = False)
ros_BT = ros_all.filter(like = 'bts_', axis = 1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv')
ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')

#%% RUS: Numerical + categorical
mm_skf_scoresD4 = MultModelsCl(input_df = X_rus
                               , target = y_rus
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
rus_all = pd.DataFrame(mm_skf_scoresD4).T
rus_CT = rus_all.filter(like = 'test_', axis = 1).sort_values(by = ['test_mcc'], ascending = False)
rus_BT = rus_all.filter(like = 'bts_', axis = 1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv')
rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')

#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC
                               , target = y_rouC
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
rouC_all = pd.DataFrame(mm_skf_scoresD8).T
rouC_CT = rouC_all.filter(like = 'test_', axis = 1).sort_values(by = ['test_mcc'], ascending = False)
rouC_BT = rouC_all.filter(like = 'bts_', axis = 1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv')
rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')

View file

@ -1,202 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022
@author: tanu
"""
import os

# Gene/drug pair analysed by this run; both are handed to setvars() below.
gene = 'katG'
drug = 'isoniazid'
#total_mtblineage_uc = 8

# Work from the ml scripts directory -- presumably so the sibling modules
# imported below resolve by bare name (TODO confirm).
homedir = os.path.expanduser("~")
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')

# The data module is star-imported on both sides of setvars(): the first import
# brings setvars into scope; the second presumably picks up the names that
# setvars(gene, drug) defines (X, y, X_bts, outdir, ...) -- TODO confirm.
from ml_data_orig import *
setvars(gene,drug)
from ml_data_orig import *

# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML

# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl
############################################################################
print('\n#####################################################################\n'
      , '\nRunning ML analysis: ORIGINAL'
      , '\nGene name:', gene
      , '\nDrug name:', drug)

#==================
# Specify outdir
#==================
outdir_ml = outdir + 'ml/tts_orig/'
# DataFrame.to_csv does not create missing directories, so make sure the
# target exists before any result file is written.
os.makedirs(outdir_ml, exist_ok = True)
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
# Report dataset dimensions and class balance, list each feature group with
# its size, then verify that the groups account for every column of X.
print('\nSanity checks:',
      '\nTotal input features:', len(X.columns),
      '\n',
      '\nTraining data size:', X.shape,
      '\nTest data size:', X_bts.shape,
      '\n',
      '\nTarget feature numbers (training data):', Counter(y),
      '\nTarget features ratio (training data:', yc1_ratio,
      '\n',
      '\nTarget feature numbers (test data):', Counter(y_bts),
      '\nTarget features ratio (test data):', yc2_ratio,
      '\n\n#####################################################################\n')

section_rule = '\n================================================================\n'

print(section_rule)
print('Strucutral features (n):',
      len(X_ssFN),
      '\nThese are:',
      '\nCommon stablity features:', X_stabilityN,
      '\nFoldX columns:', X_foldX_cols,
      '\nOther struc columns:', X_str,
      section_rule)
print('AAindex features (n):',
      len(X_aaindexFN),
      '\nThese are:\n',
      X_aaindexFN,
      section_rule)
print('Evolutionary features (n):',
      len(X_evolFN),
      '\nThese are:\n',
      X_evolFN,
      section_rule)
print('Genomic features (n):',
      len(X_genomicFN),
      '\nThese are:\n',
      X_genomic_mafor, '\n',
      X_genomic_linegae,
      section_rule)
print('Categorical features (n):',
      len(categorical_FN),
      '\nThese are:\n',
      categorical_FN,
      section_rule)

# Abort early if any column of X is unaccounted for (or counted twice).
feature_groups = (X_ssFN, X_aaindexFN, X_evolFN, X_genomicFN, categorical_FN)
if len(X.columns) != sum(len(group) for group in feature_groups):
    sys.exit('\nFail: Count of feature mismatch')
print('\nPass: No. of features match')
print('\n#####################################################################\n')
###############################################################################
#==================
# Baseline models
#==================
# Every section below runs the MultModelsCl pipeline on one version of the
# training data, transposes the returned scores into a table, and splits it
# into the 'test_*' columns (*_CT) and the 'bts_*' columns (*_BT), each sorted
# by its MCC column in descending order before being written to csv.
# The blind test frame is passed as `blind_test_df`, the name MultModelsCl
# uses for that parameter (formerly `blind_test_input_df`).
mm_skf_scoresD = MultModelsCl(input_df = X
                              , target = y
                              , var_type = 'mixed'
                              , skf_cv = skf_cv
                              , blind_test_df = X_bts
                              , blind_test_target = y_bts)
baseline_all = pd.DataFrame(mm_skf_scoresD).T
baseline_CT = baseline_all.filter(like = 'test_', axis = 1).sort_values(by = ['test_mcc'], ascending = False)
baseline_BT = baseline_all.filter(like = 'bts_', axis = 1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv')
baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')

#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc
                               , target = y_smnc
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
smnc_all = pd.DataFrame(mm_skf_scoresD7).T
smnc_CT = smnc_all.filter(like = 'test_', axis = 1).sort_values(by = ['test_mcc'], ascending = False)
smnc_BT = smnc_all.filter(like = 'bts_', axis = 1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv')
smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')

#%% ROS: Numerical + categorical
mm_skf_scoresD3 = MultModelsCl(input_df = X_ros
                               , target = y_ros
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
ros_all = pd.DataFrame(mm_skf_scoresD3).T
ros_CT = ros_all.filter(like = 'test_', axis = 1).sort_values(by = ['test_mcc'], ascending = False)
ros_BT = ros_all.filter(like = 'bts_', axis = 1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv')
ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')

#%% RUS: Numerical + categorical
mm_skf_scoresD4 = MultModelsCl(input_df = X_rus
                               , target = y_rus
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
rus_all = pd.DataFrame(mm_skf_scoresD4).T
rus_CT = rus_all.filter(like = 'test_', axis = 1).sort_values(by = ['test_mcc'], ascending = False)
rus_BT = rus_all.filter(like = 'bts_', axis = 1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv')
rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')

#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC
                               , target = y_rouC
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
rouC_all = pd.DataFrame(mm_skf_scoresD8).T
rouC_CT = rouC_all.filter(like = 'test_', axis = 1).sort_values(by = ['test_mcc'], ascending = False)
rouC_BT = rouC_all.filter(like = 'bts_', axis = 1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv')
rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')

View file

@ -1,202 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022
@author: tanu
"""
import os

# Gene/drug pair analysed by this run; both are handed to setvars() below.
gene = 'katG'
drug = 'isoniazid'
#total_mtblineage_uc = 8

# Work from the ml scripts directory -- presumably so the sibling modules
# imported below resolve by bare name (TODO confirm).
homedir = os.path.expanduser("~")
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')

# The data module is star-imported on both sides of setvars(): the first import
# brings setvars into scope; the second presumably picks up the names that
# setvars(gene, drug) defines (X, y, X_bts, outdir, ...) -- TODO confirm.
from ml_data_rt import *
setvars(gene,drug)
from ml_data_rt import *

# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML

# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl
############################################################################
print('\n#####################################################################\n'
      , '\nRunning ML analysis: REVERSE training\n'
      , '\nGene name:', gene
      , '\nDrug name:', drug)

#==================
# Specify outdir
#==================
outdir_ml = outdir + 'ml/tts_rt/'
# DataFrame.to_csv does not create missing directories, so make sure the
# target exists before any result file is written.
os.makedirs(outdir_ml, exist_ok = True)
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
# Report dataset dimensions and class balance, list each feature group with
# its size, then verify that the groups account for every column of X.
print('\nSanity checks:',
      '\nTotal input features:', len(X.columns),
      '\n',
      '\nTraining data size:', X.shape,
      '\nTest data size:', X_bts.shape,
      '\n',
      '\nTarget feature numbers (training data):', Counter(y),
      '\nTarget features ratio (training data:', yc1_ratio,
      '\n',
      '\nTarget feature numbers (test data):', Counter(y_bts),
      '\nTarget features ratio (test data):', yc2_ratio,
      '\n\n#####################################################################\n')

section_rule = '\n================================================================\n'

print(section_rule)
print('Strucutral features (n):',
      len(X_ssFN),
      '\nThese are:',
      '\nCommon stablity features:', X_stabilityN,
      '\nFoldX columns:', X_foldX_cols,
      '\nOther struc columns:', X_str,
      section_rule)
print('AAindex features (n):',
      len(X_aaindexFN),
      '\nThese are:\n',
      X_aaindexFN,
      section_rule)
print('Evolutionary features (n):',
      len(X_evolFN),
      '\nThese are:\n',
      X_evolFN,
      section_rule)
print('Genomic features (n):',
      len(X_genomicFN),
      '\nThese are:\n',
      X_genomic_mafor, '\n',
      X_genomic_linegae,
      section_rule)
print('Categorical features (n):',
      len(categorical_FN),
      '\nThese are:\n',
      categorical_FN,
      section_rule)

# Abort early if any column of X is unaccounted for (or counted twice).
feature_groups = (X_ssFN, X_aaindexFN, X_evolFN, X_genomicFN, categorical_FN)
if len(X.columns) != sum(len(group) for group in feature_groups):
    sys.exit('\nFail: Count of feature mismatch')
print('\nPass: No. of features match')
print('\n#####################################################################\n')
###############################################################################
#==================
# Baseline models
#==================
# Run the MultModelsCl pipeline on the unresampled training data (X, y) with
# X_bts / y_bts as the blind test set.
# NOTE: the blind-test frame is passed as `blind_test_df`, the keyword that
# MultModelsCl declares after its rename from `blind_test_input_df`; the old
# keyword would raise a TypeError at call time.
mm_skf_scoresD = MultModelsCl(input_df = X
                              , target = y
                              , var_type = 'mixed'
                              , skf_cv = skf_cv
                              , blind_test_df = X_bts
                              , blind_test_target = y_bts)

# Transpose so each key of the returned mapping becomes a row
# (presumably one row per model -- confirm against MultModelsCl's return value).
baseline_all = pd.DataFrame(mm_skf_scoresD).T
#baseline_train = baseline_all.filter(like='train_', axis=1)

# Keep the 'test_*' columns and rank by MCC. The sort is chained (not
# in-place) so the outcome never depends on whether filter() handed back a
# view or a copy of baseline_all.
baseline_CT = (baseline_all
               .filter(like = 'test_', axis = 1)
               .sort_values(by = ['test_mcc'], ascending = False))

# Same for the 'bts_*' (blind test set) columns.
baseline_BT = (baseline_all
               .filter(like = 'bts_', axis = 1)
               .sort_values(by = ['bts_mcc'], ascending = False))

# Write csv
baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv')
baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')
#%% SMOTE NC: Oversampling [Numerical + categorical]
# Run the MultModelsCl pipeline on the SMOTE-NC resampled training data
# (X_smnc, y_smnc); the blind test set (X_bts, y_bts) is left untouched.
# NOTE: `blind_test_df` is the keyword MultModelsCl declares after its rename
# from `blind_test_input_df`; the old keyword would raise a TypeError.
mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc
                               , target = y_smnc
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)

# Transpose so each key of the returned mapping becomes a row
# (presumably one row per model -- confirm against MultModelsCl's return value).
smnc_all = pd.DataFrame(mm_skf_scoresD7).T

# Rank the 'test_*' and 'bts_*' score columns by MCC. The sorts are chained
# (not in-place) so they never act on a possible view of smnc_all.
smnc_CT = (smnc_all
           .filter(like = 'test_', axis = 1)
           .sort_values(by = ['test_mcc'], ascending = False))
smnc_BT = (smnc_all
           .filter(like = 'bts_', axis = 1)
           .sort_values(by = ['bts_mcc'], ascending = False))

# Write csv
smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv')
smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')
#%% ROS: Numerical + categorical
# Run the MultModelsCl pipeline on the randomly oversampled training data
# (X_ros, y_ros); the blind test set (X_bts, y_bts) is left untouched.
# NOTE: `blind_test_df` is the keyword MultModelsCl declares after its rename
# from `blind_test_input_df`; the old keyword would raise a TypeError.
mm_skf_scoresD3 = MultModelsCl(input_df = X_ros
                               , target = y_ros
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)

# Transpose so each key of the returned mapping becomes a row
# (presumably one row per model -- confirm against MultModelsCl's return value).
ros_all = pd.DataFrame(mm_skf_scoresD3).T

# Rank the 'test_*' and 'bts_*' score columns by MCC. The sorts are chained
# (not in-place) so they never act on a possible view of ros_all.
ros_CT = (ros_all
          .filter(like = 'test_', axis = 1)
          .sort_values(by = ['test_mcc'], ascending = False))
ros_BT = (ros_all
          .filter(like = 'bts_', axis = 1)
          .sort_values(by = ['bts_mcc'], ascending = False))

# Write csv
ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv')
ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')
#%% RUS: Numerical + categorical
# Run the MultModelsCl pipeline on the randomly undersampled training data
# (X_rus, y_rus); the blind test set (X_bts, y_bts) is left untouched.
# NOTE: `blind_test_df` is the keyword MultModelsCl declares after its rename
# from `blind_test_input_df`; the old keyword would raise a TypeError.
mm_skf_scoresD4 = MultModelsCl(input_df = X_rus
                               , target = y_rus
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)

# Transpose so each key of the returned mapping becomes a row
# (presumably one row per model -- confirm against MultModelsCl's return value).
rus_all = pd.DataFrame(mm_skf_scoresD4).T

# Rank the 'test_*' and 'bts_*' score columns by MCC. The sorts are chained
# (not in-place) so they never act on a possible view of rus_all.
rus_CT = (rus_all
          .filter(like = 'test_', axis = 1)
          .sort_values(by = ['test_mcc'], ascending = False))
rus_BT = (rus_all
          .filter(like = 'bts_', axis = 1)
          .sort_values(by = ['bts_mcc'], ascending = False))

# Write csv
rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv')
rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')
#%% ROS + RUS Combined: Numerical + categorical
# Run the MultModelsCl pipeline on the combined over/under-sampled training
# data (X_rouC, y_rouC); the blind test set (X_bts, y_bts) is left untouched.
# NOTE: `blind_test_df` is the keyword MultModelsCl declares after its rename
# from `blind_test_input_df`; the old keyword would raise a TypeError.
mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC
                               , target = y_rouC
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)

# Transpose so each key of the returned mapping becomes a row
# (presumably one row per model -- confirm against MultModelsCl's return value).
rouC_all = pd.DataFrame(mm_skf_scoresD8).T

# Rank the 'test_*' and 'bts_*' score columns by MCC. The sorts are chained
# (not in-place) so they never act on a possible view of rouC_all.
rouC_CT = (rouC_all
           .filter(like = 'test_', axis = 1)
           .sort_values(by = ['test_mcc'], ascending = False))
rouC_BT = (rouC_all
           .filter(like = 'bts_', axis = 1)
           .sort_values(by = ['bts_mcc'], ascending = False))

# Write csv
rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv')
rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')

View file

@ -1,203 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022
@author: tanu
"""
# Script set-up: choose the gene/drug pair, move into the ML scripts
# directory, build the ML data via setvars(), and derive the output directory.
import os
# Gene/drug pair analysed by this run; both are passed to setvars() below and
# `gene` is also used to name the output CSV files.
gene = 'katG'
drug = 'isoniazid'
#total_mtblineage_uc = 8
homedir = os.path.expanduser("~")
# Work from the ML scripts directory (presumably so that the local modules
# imported below resolve -- confirm).
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
# First star-import brings setvars() into scope.
from ml_data_sl import *
setvars(gene,drug)
# NOTE(review): the star-import is repeated after setvars(); presumably
# setvars() creates module-level names (X, y, X_bts, ...) that only exist in
# ml_data_sl once it has run, and this second import pulls them in -- confirm.
from ml_data_sl import *
# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML
# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl
################################################################################
# Banner identifying the run.
print('\n#####################################################################\n'
, '\nRunning ML analysis: scaling law split'
, '\nGene name:', gene
, '\nDrug name:', drug)
#==================
# Specify outdir
#==================
# `outdir` is not defined in this script; it is expected to come from the
# star-import above. The path is built by plain string concatenation, so
# `outdir` must already end with a path separator.
outdir_ml = outdir + 'ml/tts_sl/'
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
# Summarise the train / blind-test split: feature count, matrix shapes, and
# the class counts and class ratios of both target vectors.
print(
    '\nSanity checks:',
    #'\nML source data size:', x_features.shape,
    '\nTotal input features:', len(X.columns),
    '\n',
    '\nTraining data size:', X.shape,
    '\nTest data size:', X_bts.shape,
    '\n',
    '\nTarget feature numbers (training data):', Counter(y),
    '\nTarget features ratio (training data:', yc1_ratio,
    '\n',
    '\nTarget feature numbers (test data):', Counter(y_bts),
    '\nTarget features ratio (test data):', yc2_ratio,
    '\n\n#####################################################################\n',
)
# Report every feature group (its size and its members), then confirm that
# the groups together account for each column of the training matrix X.
print('\n================================================================\n')
print(
    'Strucutral features (n):',
    len(X_ssFN),
    '\nThese are:',
    '\nCommon stablity features:', X_stabilityN,
    '\nFoldX columns:', X_foldX_cols,
    '\nOther struc columns:', X_str,
    '\n================================================================\n',
)
print(
    'AAindex features (n):',
    len(X_aaindexFN),
    '\nThese are:\n',
    X_aaindexFN,
    '\n================================================================\n',
)
print(
    'Evolutionary features (n):',
    len(X_evolFN),
    '\nThese are:\n',
    X_evolFN,
    '\n================================================================\n',
)
print(
    'Genomic features (n):',
    len(X_genomicFN),
    '\nThese are:\n',
    X_genomic_mafor, '\n',
    X_genomic_linegae,
    '\n================================================================\n',
)
print(
    'Categorical features (n):',
    len(categorical_FN),
    '\nThese are:\n',
    categorical_FN,
    '\n================================================================\n',
)
# Guard clause: stop the run as soon as the group sizes fail to add up to the
# number of input columns; otherwise report success and carry on.
if len(X.columns) != sum(map(len, (X_ssFN, X_aaindexFN, X_evolFN, X_genomicFN, categorical_FN))):
    sys.exit('\nFail: Count of feature mismatch')
print('\nPass: No. of features match')
print('\n#####################################################################\n')
################################################################################
#==================
# Baseline models
#==================
# Run the MultModelsCl pipeline on the unresampled training data (X, y) with
# X_bts / y_bts as the blind test set.
# NOTE: the blind-test frame is passed as `blind_test_df`, the keyword that
# MultModelsCl declares after its rename from `blind_test_input_df`; the old
# keyword would raise a TypeError at call time.
mm_skf_scoresD = MultModelsCl(input_df = X
                              , target = y
                              , var_type = 'mixed'
                              , skf_cv = skf_cv
                              , blind_test_df = X_bts
                              , blind_test_target = y_bts)

# Transpose so each key of the returned mapping becomes a row
# (presumably one row per model -- confirm against MultModelsCl's return value).
baseline_all = pd.DataFrame(mm_skf_scoresD).T
#baseline_train = baseline_all.filter(like='train_', axis=1)

# Keep the 'test_*' columns and rank by MCC. The sort is chained (not
# in-place) so the outcome never depends on whether filter() handed back a
# view or a copy of baseline_all.
baseline_CT = (baseline_all
               .filter(like = 'test_', axis = 1)
               .sort_values(by = ['test_mcc'], ascending = False))

# Same for the 'bts_*' (blind test set) columns.
baseline_BT = (baseline_all
               .filter(like = 'bts_', axis = 1)
               .sort_values(by = ['bts_mcc'], ascending = False))

# Write csv
baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv')
baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')
#%% SMOTE NC: Oversampling [Numerical + categorical]
# Run the MultModelsCl pipeline on the SMOTE-NC resampled training data
# (X_smnc, y_smnc); the blind test set (X_bts, y_bts) is left untouched.
# NOTE: `blind_test_df` is the keyword MultModelsCl declares after its rename
# from `blind_test_input_df`; the old keyword would raise a TypeError.
mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc
                               , target = y_smnc
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)

# Transpose so each key of the returned mapping becomes a row
# (presumably one row per model -- confirm against MultModelsCl's return value).
smnc_all = pd.DataFrame(mm_skf_scoresD7).T

# Rank the 'test_*' and 'bts_*' score columns by MCC. The sorts are chained
# (not in-place) so they never act on a possible view of smnc_all.
smnc_CT = (smnc_all
           .filter(like = 'test_', axis = 1)
           .sort_values(by = ['test_mcc'], ascending = False))
smnc_BT = (smnc_all
           .filter(like = 'bts_', axis = 1)
           .sort_values(by = ['bts_mcc'], ascending = False))

# Write csv
smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv')
smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')
#%% ROS: Numerical + categorical
# Run the MultModelsCl pipeline on the randomly oversampled training data
# (X_ros, y_ros); the blind test set (X_bts, y_bts) is left untouched.
# NOTE: `blind_test_df` is the keyword MultModelsCl declares after its rename
# from `blind_test_input_df`; the old keyword would raise a TypeError.
mm_skf_scoresD3 = MultModelsCl(input_df = X_ros
                               , target = y_ros
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)

# Transpose so each key of the returned mapping becomes a row
# (presumably one row per model -- confirm against MultModelsCl's return value).
ros_all = pd.DataFrame(mm_skf_scoresD3).T

# Rank the 'test_*' and 'bts_*' score columns by MCC. The sorts are chained
# (not in-place) so they never act on a possible view of ros_all.
ros_CT = (ros_all
          .filter(like = 'test_', axis = 1)
          .sort_values(by = ['test_mcc'], ascending = False))
ros_BT = (ros_all
          .filter(like = 'bts_', axis = 1)
          .sort_values(by = ['bts_mcc'], ascending = False))

# Write csv
ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv')
ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')
#%% RUS: Numerical + categorical
# Run the MultModelsCl pipeline on the randomly undersampled training data
# (X_rus, y_rus); the blind test set (X_bts, y_bts) is left untouched.
# NOTE: `blind_test_df` is the keyword MultModelsCl declares after its rename
# from `blind_test_input_df`; the old keyword would raise a TypeError.
mm_skf_scoresD4 = MultModelsCl(input_df = X_rus
                               , target = y_rus
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)

# Transpose so each key of the returned mapping becomes a row
# (presumably one row per model -- confirm against MultModelsCl's return value).
rus_all = pd.DataFrame(mm_skf_scoresD4).T

# Rank the 'test_*' and 'bts_*' score columns by MCC. The sorts are chained
# (not in-place) so they never act on a possible view of rus_all.
rus_CT = (rus_all
          .filter(like = 'test_', axis = 1)
          .sort_values(by = ['test_mcc'], ascending = False))
rus_BT = (rus_all
          .filter(like = 'bts_', axis = 1)
          .sort_values(by = ['bts_mcc'], ascending = False))

# Write csv
rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv')
rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')
#%% ROS + RUS Combined: Numerical + categorical
# Run the MultModelsCl pipeline on the combined over/under-sampled training
# data (X_rouC, y_rouC); the blind test set (X_bts, y_bts) is left untouched.
# NOTE: `blind_test_df` is the keyword MultModelsCl declares after its rename
# from `blind_test_input_df`; the old keyword would raise a TypeError.
mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC
                               , target = y_rouC
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)

# Transpose so each key of the returned mapping becomes a row
# (presumably one row per model -- confirm against MultModelsCl's return value).
rouC_all = pd.DataFrame(mm_skf_scoresD8).T

# Rank the 'test_*' and 'bts_*' score columns by MCC. The sorts are chained
# (not in-place) so they never act on a possible view of rouC_all.
rouC_CT = (rouC_all
           .filter(like = 'test_', axis = 1)
           .sort_values(by = ['test_mcc'], ascending = False))
rouC_BT = (rouC_all
           .filter(like = 'bts_', axis = 1)
           .sort_values(by = ['bts_mcc'], ascending = False))

# Write csv
rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv')
rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')

View file

@ -1,4 +1,4 @@
/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_7030.py:548: SettingWithCopyWarning:
/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_7030.py:464: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
@ -55,16 +55,13 @@ PASS: OR values imputed, data ready for ML
Total no. of features for aaindex: 123
No. of numerical features: 168
No. of categorical features: 7
PASS: x_features has no target variable
No. of columns for x_features: 175
No. of columns for x_features: 174
Traceback (most recent call last):
File "/home/tanu/git/LSHTM_analysis/scripts/ml/./alr_7030.py", line 19, in <module>
File "/home/tanu/git/LSHTM_analysis/scripts/ml/./run_7030.py", line 35, in <module>
setvars(gene,drug)
File "/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_7030.py", line 658, in setvars
File "/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_7030.py", line 636, in setvars
X, X_bts, y, y_bts = train_test_split(x_features, y_target
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/sklearn/model_selection/_split.py", line 2454, in train_test_split
train, test = next(cv.split(X=arrays[0], y=stratify))

View file

@ -1,75 +0,0 @@
/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_8020.py:549: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
1.22.4
1.4.1
aaindex_df contains non-numerical data
Total no. of non-numerial columns: 2
Selecting numerical data only
PASS: successfully selected numerical columns only for aaindex_df
Now checking for NA in the remaining aaindex_cols
Counting aaindex_df cols with NA
ncols with NA: 4 columns
Dropping these...
Original ncols: 127
Revised df ncols: 123
Checking NA in revised df...
PASS: cols with NA successfully dropped from aaindex_df
Proceeding with combining aa_df with other features_df
PASS: ncols match
Expected ncols: 123
Got: 123
Total no. of columns in clean aa_df: 123
Proceeding to merge, expected nrows in merged_df: 271
PASS: my_features_df and aa_df successfully combined
nrows: 271
ncols: 269
count of NULL values before imputation
or_mychisq 256
log10_or_mychisq 256
dtype: int64
count of NULL values AFTER imputation
mutationinformation 0
or_rawI 0
logorI 0
dtype: int64
PASS: OR values imputed, data ready for ML
Total no. of features for aaindex: 123
No. of numerical features: 168
No. of categorical features: 7
PASS: x_features has no target variable
No. of columns for x_features: 175
Traceback (most recent call last):
File "/home/tanu/git/LSHTM_analysis/scripts/ml/./alr_8020.py", line 19, in <module>
setvars(gene,drug)
File "/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_8020.py", line 656, in setvars
X, X_bts, y, y_bts = train_test_split(x_features, y_target
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/sklearn/model_selection/_split.py", line 2454, in train_test_split
train, test = next(cv.split(X=arrays[0], y=stratify))
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/sklearn/model_selection/_split.py", line 1613, in split
for train, test in self._iter_indices(X, y, groups):
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/sklearn/model_selection/_split.py", line 1953, in _iter_indices
raise ValueError(
ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

View file

@ -1,113 +0,0 @@
/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_cd_7030.py:548: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
1.22.4
1.4.1
aaindex_df contains non-numerical data
Total no. of non-numerial columns: 2
Selecting numerical data only
PASS: successfully selected numerical columns only for aaindex_df
Now checking for NA in the remaining aaindex_cols
Counting aaindex_df cols with NA
ncols with NA: 4 columns
Dropping these...
Original ncols: 127
Revised df ncols: 123
Checking NA in revised df...
PASS: cols with NA successfully dropped from aaindex_df
Proceeding with combining aa_df with other features_df
PASS: ncols match
Expected ncols: 123
Got: 123
Total no. of columns in clean aa_df: 123
Proceeding to merge, expected nrows in merged_df: 271
PASS: my_features_df and aa_df successfully combined
nrows: 271
ncols: 269
count of NULL values before imputation
or_mychisq 256
log10_or_mychisq 256
dtype: int64
count of NULL values AFTER imputation
mutationinformation 0
or_rawI 0
logorI 0
dtype: int64
PASS: OR values imputed, data ready for ML
Total no. of features for aaindex: 123
No. of numerical features: 168
No. of categorical features: 7
PASS: x_features has no target variable
No. of columns for x_features: 175
-------------------------------------------------------------
Successfully split data with stratification [COMPLETE data]: 70/30
Original data size: (271, 175)
Train data size: (181, 175)
Test data size: (90, 175)
y_train numbers: Counter({0: 180, 1: 1})
y_train ratio: 180.0
y_test_numbers: Counter({0: 89, 1: 1})
y_test ratio: 89.0
-------------------------------------------------------------
index: 0
ind: 1
Mask count check: True
index: 1
ind: 2
Mask count check: True
Original Data
Counter({0: 180, 1: 1}) Data dim: (181, 175)
Simple Random OverSampling
Counter({0: 180, 1: 180})
(360, 175)
Simple Random UnderSampling
Counter({0: 1, 1: 1})
(2, 175)
Simple Combined Over and UnderSampling
Counter({0: 180, 1: 180})
(360, 175)
Traceback (most recent call last):
File "/home/tanu/git/LSHTM_analysis/scripts/ml/./alr_cd_7030.py", line 19, in <module>
setvars(gene,drug)
File "/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_cd_7030.py", line 745, in setvars
X_smnc, y_smnc = sm_nc.fit_resample(X, y)
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/imblearn/base.py", line 83, in fit_resample
output = self._fit_resample(X, y)
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/imblearn/over_sampling/_smote/base.py", line 533, in _fit_resample
X_resampled, y_resampled = super()._fit_resample(X_encoded, y)
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/imblearn/over_sampling/_smote/base.py", line 324, in _fit_resample
nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:]
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/sklearn/neighbors/_base.py", line 749, in kneighbors
raise ValueError(
ValueError: Expected n_neighbors <= n_samples, but n_samples = 1, n_neighbors = 6

View file

@ -1,69 +0,0 @@
/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_cd_8020.py:548: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
1.22.4
1.4.1
aaindex_df contains non-numerical data
Total no. of non-numerial columns: 2
Selecting numerical data only
PASS: successfully selected numerical columns only for aaindex_df
Now checking for NA in the remaining aaindex_cols
Counting aaindex_df cols with NA
ncols with NA: 4 columns
Dropping these...
Original ncols: 127
Revised df ncols: 123
Checking NA in revised df...
PASS: cols with NA successfully dropped from aaindex_df
Proceeding with combining aa_df with other features_df
PASS: ncols match
Expected ncols: 123
Got: 123
Total no. of columns in clean aa_df: 123
Proceeding to merge, expected nrows in merged_df: 271
PASS: my_features_df and aa_df successfully combined
nrows: 271
ncols: 269
count of NULL values before imputation
or_mychisq 256
log10_or_mychisq 256
dtype: int64
count of NULL values AFTER imputation
mutationinformation 0
or_rawI 0
logorI 0
dtype: int64
PASS: OR values imputed, data ready for ML
Total no. of features for aaindex: 123
No. of numerical features: 168
No. of categorical features: 7
PASS: x_features has no target variable
No. of columns for x_features: 175
Traceback (most recent call last):
File "/home/tanu/git/LSHTM_analysis/scripts/ml/./alr_cd_8020.py", line 19, in <module>
setvars(gene,drug)
File "/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_cd_8020.py", line 666, in setvars
yc2_ratio = yc2[0]/yc2[1]
ZeroDivisionError: division by zero

View file

@ -1,69 +0,0 @@
/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_cd_sl.py:548: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
1.22.4
1.4.1
aaindex_df contains non-numerical data
Total no. of non-numerial columns: 2
Selecting numerical data only
PASS: successfully selected numerical columns only for aaindex_df
Now checking for NA in the remaining aaindex_cols
Counting aaindex_df cols with NA
ncols with NA: 4 columns
Dropping these...
Original ncols: 127
Revised df ncols: 123
Checking NA in revised df...
PASS: cols with NA successfully dropped from aaindex_df
Proceeding with combining aa_df with other features_df
PASS: ncols match
Expected ncols: 123
Got: 123
Total no. of columns in clean aa_df: 123
Proceeding to merge, expected nrows in merged_df: 271
PASS: my_features_df and aa_df successfully combined
nrows: 271
ncols: 269
count of NULL values before imputation
or_mychisq 256
log10_or_mychisq 256
dtype: int64
count of NULL values AFTER imputation
mutationinformation 0
or_rawI 0
logorI 0
dtype: int64
PASS: OR values imputed, data ready for ML
Total no. of features for aaindex: 123
No. of numerical features: 168
No. of categorical features: 7
PASS: x_features has no target variable
No. of columns for x_features: 175
Traceback (most recent call last):
File "/home/tanu/git/LSHTM_analysis/scripts/ml/./alr_cd_sl.py", line 19, in <module>
setvars(gene,drug)
File "/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_cd_sl.py", line 669, in setvars
yc2_ratio = yc2[0]/yc2[1]
ZeroDivisionError: division by zero

View file

@ -1,105 +0,0 @@
/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data.py:550: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
1.22.4
1.4.1
aaindex_df contains non-numerical data
Total no. of non-numerial columns: 2
Selecting numerical data only
PASS: successfully selected numerical columns only for aaindex_df
Now checking for NA in the remaining aaindex_cols
Counting aaindex_df cols with NA
ncols with NA: 4 columns
Dropping these...
Original ncols: 127
Revised df ncols: 123
Checking NA in revised df...
PASS: cols with NA successfully dropped from aaindex_df
Proceeding with combining aa_df with other features_df
PASS: ncols match
Expected ncols: 123
Got: 123
Total no. of columns in clean aa_df: 123
Proceeding to merge, expected nrows in merged_df: 271
PASS: my_features_df and aa_df successfully combined
nrows: 271
ncols: 269
count of NULL values before imputation
or_mychisq 256
log10_or_mychisq 256
dtype: int64
count of NULL values AFTER imputation
mutationinformation 0
or_rawI 0
logorI 0
dtype: int64
PASS: OR values imputed, data ready for ML
No. of numerical features: 45
No. of categorical features: 7
index: 0
ind: 1
Mask count check: True
index: 1
ind: 2
Mask count check: True
Original Data
Counter({0: 7, 1: 1}) Data dim: (8, 52)
-------------------------------------------------------------
Successfully split data: UQ [no aa_index but active site included] training
actual values: training set
imputed values: blind test set
Train data size: (8, 52)
Test data size: (263, 52)
y_train numbers: Counter({0: 7, 1: 1})
y_train ratio: 7.0
y_test_numbers: Counter({0: 262, 1: 1})
y_test ratio: 262.0
-------------------------------------------------------------
Simple Random OverSampling
Counter({0: 7, 1: 7})
(14, 52)
Simple Random UnderSampling
Counter({0: 1, 1: 1})
(2, 52)
Simple Combined Over and UnderSampling
Counter({0: 7, 1: 7})
(14, 52)
Traceback (most recent call last):
File "/home/tanu/git/LSHTM_analysis/scripts/ml/./alr_config.py", line 26, in <module>
setvars(gene,drug)
File "/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data.py", line 701, in setvars
X_smnc, y_smnc = sm_nc.fit_resample(X, y)
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/imblearn/base.py", line 83, in fit_resample
output = self._fit_resample(X, y)
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/imblearn/over_sampling/_smote/base.py", line 533, in _fit_resample
X_resampled, y_resampled = super()._fit_resample(X_encoded, y)
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/imblearn/over_sampling/_smote/base.py", line 324, in _fit_resample
nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:]
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/sklearn/neighbors/_base.py", line 749, in kneighbors
raise ValueError(
ValueError: Expected n_neighbors <= n_samples, but n_samples = 1, n_neighbors = 6

View file

@ -1,107 +0,0 @@
/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_orig.py:550: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
1.22.4
1.4.1
aaindex_df contains non-numerical data
Total no. of non-numerial columns: 2
Selecting numerical data only
PASS: successfully selected numerical columns only for aaindex_df
Now checking for NA in the remaining aaindex_cols
Counting aaindex_df cols with NA
ncols with NA: 4 columns
Dropping these...
Original ncols: 127
Revised df ncols: 123
Checking NA in revised df...
PASS: cols with NA successfully dropped from aaindex_df
Proceeding with combining aa_df with other features_df
PASS: ncols match
Expected ncols: 123
Got: 123
Total no. of columns in clean aa_df: 123
Proceeding to merge, expected nrows in merged_df: 271
PASS: my_features_df and aa_df successfully combined
nrows: 271
ncols: 269
count of NULL values before imputation
or_mychisq 256
log10_or_mychisq 256
dtype: int64
count of NULL values AFTER imputation
mutationinformation 0
or_rawI 0
logorI 0
dtype: int64
PASS: OR values imputed, data ready for ML
Total no. of features for aaindex: 123
No. of numerical features: 168
No. of categorical features: 7
index: 0
ind: 1
Mask count check: True
index: 1
ind: 2
Mask count check: True
Original Data
Counter({0: 7, 1: 1}) Data dim: (8, 175)
-------------------------------------------------------------
Successfully split data: ORIGINAL training
actual values: training set
imputed values: blind test set
Train data size: (8, 175)
Test data size: (263, 175)
y_train numbers: Counter({0: 7, 1: 1})
y_train ratio: 7.0
y_test_numbers: Counter({0: 262, 1: 1})
y_test ratio: 262.0
-------------------------------------------------------------
Simple Random OverSampling
Counter({0: 7, 1: 7})
(14, 175)
Simple Random UnderSampling
Counter({0: 1, 1: 1})
(2, 175)
Simple Combined Over and UnderSampling
Counter({0: 7, 1: 7})
(14, 175)
Traceback (most recent call last):
File "/home/tanu/git/LSHTM_analysis/scripts/ml/./alr_orig.py", line 19, in <module>
setvars(gene,drug)
File "/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_orig.py", line 701, in setvars
X_smnc, y_smnc = sm_nc.fit_resample(X, y)
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/imblearn/base.py", line 83, in fit_resample
output = self._fit_resample(X, y)
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/imblearn/over_sampling/_smote/base.py", line 533, in _fit_resample
X_resampled, y_resampled = super()._fit_resample(X_encoded, y)
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/imblearn/over_sampling/_smote/base.py", line 324, in _fit_resample
nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:]
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/sklearn/neighbors/_base.py", line 749, in kneighbors
raise ValueError(
ValueError: Expected n_neighbors <= n_samples, but n_samples = 1, n_neighbors = 6

View file

@ -1,107 +0,0 @@
/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_rt.py:550: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
1.22.4
1.4.1
aaindex_df contains non-numerical data
Total no. of non-numerial columns: 2
Selecting numerical data only
PASS: successfully selected numerical columns only for aaindex_df
Now checking for NA in the remaining aaindex_cols
Counting aaindex_df cols with NA
ncols with NA: 4 columns
Dropping these...
Original ncols: 127
Revised df ncols: 123
Checking NA in revised df...
PASS: cols with NA successfully dropped from aaindex_df
Proceeding with combining aa_df with other features_df
PASS: ncols match
Expected ncols: 123
Got: 123
Total no. of columns in clean aa_df: 123
Proceeding to merge, expected nrows in merged_df: 271
PASS: my_features_df and aa_df successfully combined
nrows: 271
ncols: 269
count of NULL values before imputation
or_mychisq 256
log10_or_mychisq 256
dtype: int64
count of NULL values AFTER imputation
mutationinformation 0
or_rawI 0
logorI 0
dtype: int64
PASS: OR values imputed, data ready for ML
Total no. of features for aaindex: 123
No. of numerical features: 168
No. of categorical features: 7
index: 0
ind: 1
Mask count check: True
index: 1
ind: 2
Mask count check: True
Original Data
Counter({0: 262, 1: 1}) Data dim: (263, 175)
-------------------------------------------------------------
Successfully split data: REVERSE training
imputed values: training set
actual values: blind test set
Train data size: (263, 175)
Test data size: (8, 175)
y_train numbers: Counter({0: 262, 1: 1})
y_train ratio: 262.0
y_test_numbers: Counter({0: 7, 1: 1})
y_test ratio: 7.0
-------------------------------------------------------------
Simple Random OverSampling
Counter({0: 262, 1: 262})
(524, 175)
Simple Random UnderSampling
Counter({0: 1, 1: 1})
(2, 175)
Simple Combined Over and UnderSampling
Counter({0: 262, 1: 262})
(524, 175)
Traceback (most recent call last):
File "/home/tanu/git/LSHTM_analysis/scripts/ml/./alr_rt.py", line 19, in <module>
setvars(gene,drug)
File "/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_rt.py", line 701, in setvars
X_smnc, y_smnc = sm_nc.fit_resample(X, y)
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/imblearn/base.py", line 83, in fit_resample
output = self._fit_resample(X, y)
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/imblearn/over_sampling/_smote/base.py", line 533, in _fit_resample
X_resampled, y_resampled = super()._fit_resample(X_encoded, y)
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/imblearn/over_sampling/_smote/base.py", line 324, in _fit_resample
nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:]
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/sklearn/neighbors/_base.py", line 749, in kneighbors
raise ValueError(
ValueError: Expected n_neighbors <= n_samples, but n_samples = 1, n_neighbors = 6

View file

@ -1,75 +0,0 @@
/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_sl.py:549: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
1.22.4
1.4.1
aaindex_df contains non-numerical data
Total no. of non-numerial columns: 2
Selecting numerical data only
PASS: successfully selected numerical columns only for aaindex_df
Now checking for NA in the remaining aaindex_cols
Counting aaindex_df cols with NA
ncols with NA: 4 columns
Dropping these...
Original ncols: 127
Revised df ncols: 123
Checking NA in revised df...
PASS: cols with NA successfully dropped from aaindex_df
Proceeding with combining aa_df with other features_df
PASS: ncols match
Expected ncols: 123
Got: 123
Total no. of columns in clean aa_df: 123
Proceeding to merge, expected nrows in merged_df: 271
PASS: my_features_df and aa_df successfully combined
nrows: 271
ncols: 269
count of NULL values before imputation
or_mychisq 256
log10_or_mychisq 256
dtype: int64
count of NULL values AFTER imputation
mutationinformation 0
or_rawI 0
logorI 0
dtype: int64
PASS: OR values imputed, data ready for ML
Total no. of features for aaindex: 123
No. of numerical features: 168
No. of categorical features: 7
PASS: x_features has no target variable
No. of columns for x_features: 175
Traceback (most recent call last):
File "/home/tanu/git/LSHTM_analysis/scripts/ml/./alr_sl.py", line 19, in <module>
setvars(gene,drug)
File "/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_sl.py", line 660, in setvars
X, X_bts, y, y_bts = train_test_split(x_features, y_target
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/sklearn/model_selection/_split.py", line 2454, in train_test_split
train, test = next(cv.split(X=arrays[0], y=stratify))
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/sklearn/model_selection/_split.py", line 1613, in split
for train, test in self._iter_indices(X, y, groups):
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/sklearn/model_selection/_split.py", line 1953, in _iter_indices
raise ValueError(
ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -1,107 +0,0 @@
/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_rt.py:550: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
1.22.4
1.4.1
aaindex_df contains non-numerical data
Total no. of non-numerial columns: 2
Selecting numerical data only
PASS: successfully selected numerical columns only for aaindex_df
Now checking for NA in the remaining aaindex_cols
Counting aaindex_df cols with NA
ncols with NA: 4 columns
Dropping these...
Original ncols: 127
Revised df ncols: 123
Checking NA in revised df...
PASS: cols with NA successfully dropped from aaindex_df
Proceeding with combining aa_df with other features_df
PASS: ncols match
Expected ncols: 123
Got: 123
Total no. of columns in clean aa_df: 123
Proceeding to merge, expected nrows in merged_df: 531
PASS: my_features_df and aa_df successfully combined
nrows: 531
ncols: 286
count of NULL values before imputation
or_mychisq 263
log10_or_mychisq 263
dtype: int64
count of NULL values AFTER imputation
mutationinformation 0
or_rawI 0
logorI 0
dtype: int64
PASS: OR values imputed, data ready for ML
Total no. of features for aaindex: 123
No. of numerical features: 167
No. of categorical features: 7
index: 0
ind: 1
Mask count check: True
index: 1
ind: 2
Mask count check: True
Original Data
Counter({0: 409, 1: 3}) Data dim: (412, 174)
-------------------------------------------------------------
Successfully split data: REVERSE training
imputed values: training set
actual values: blind test set
Train data size: (412, 174)
Test data size: (119, 174)
y_train numbers: Counter({0: 409, 1: 3})
y_train ratio: 136.33333333333334
y_test_numbers: Counter({0: 76, 1: 43})
y_test ratio: 1.7674418604651163
-------------------------------------------------------------
Simple Random OverSampling
Counter({0: 409, 1: 409})
(818, 174)
Simple Random UnderSampling
Counter({0: 3, 1: 3})
(6, 174)
Simple Combined Over and UnderSampling
Counter({0: 409, 1: 409})
(818, 174)
Traceback (most recent call last):
File "/home/tanu/git/LSHTM_analysis/scripts/ml/./gid_rt.py", line 19, in <module>
setvars(gene,drug)
File "/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_rt.py", line 701, in setvars
X_smnc, y_smnc = sm_nc.fit_resample(X, y)
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/imblearn/base.py", line 83, in fit_resample
output = self._fit_resample(X, y)
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/imblearn/over_sampling/_smote/base.py", line 533, in _fit_resample
X_resampled, y_resampled = super()._fit_resample(X_encoded, y)
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/imblearn/over_sampling/_smote/base.py", line 324, in _fit_resample
nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:]
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/sklearn/neighbors/_base.py", line 749, in kneighbors
raise ValueError(
ValueError: Expected n_neighbors <= n_samples, but n_samples = 3, n_neighbors = 6

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -572,9 +572,10 @@ def setvars(gene,drug):
, 'lineage_count_unique'
]
X_gn_Fcat = ['drtype_mode_labels' # beware then you can't use it to predict [USED it for uq_v1, not v2]
#, 'gene_name' # will be required for the combined stuff
]
# X_gn_Fcat = ['drtype_mode_labels' # beware then you can't use it to predict [USED it for uq_v1, not v2]
# #, 'gene_name' # will be required for the combined stuff
# ]
X_gn_Fcat = []
X_genomicFN = X_gn_mafor_Fnum + X_gn_linegae_Fnum + X_gn_Fcat
###############################################################################

View file

@ -1,203 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022

@author: tanu

Driver script: 70/30 train/test split.

Loads the ML data for one gene/drug pair, prints sanity checks on the
feature groups, then runs MultModelsCl on the baseline training data and on
each resampled training set, writing the cross-validation ('test_') and
blind-test ('bts_') score tables to csv.
"""
import os
import sys
from collections import Counter

import pandas as pd

gene = 'pncA'
drug = 'pyrazinamide'
#total_mtblineage_uc = 8

homedir = os.path.expanduser("~")
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')

from ml_data_7030 import *
setvars(gene,drug)
# Star-import a second time on purpose: presumably setvars() creates the
# module-level data variables (X, y, X_bts, ...) that did not exist at the
# time of the first import -- TODO confirm against ml_data_7030
from ml_data_7030 import *

# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML

# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl

################################################################################
print('\n#####################################################################\n'
      , '\nRunning ML analysis: 70/30 split'
      , '\nGene name:', gene
      , '\nDrug name:', drug)

#==================
# Specify outdir
#==================
outdir_ml = outdir + 'ml/tts_7030/'
print('\nOutput directory:', outdir_ml)

#%%###########################################################################
print('\nSanity checks:'
      #, '\nML source data size:', x_features.shape
      , '\nTotal input features:', len(X.columns)
      , '\n'
      , '\nTraining data size:', X.shape
      , '\nTest data size:', X_bts.shape
      , '\n'
      , '\nTarget feature numbers (training data):', Counter(y)
      , '\nTarget features ratio (training data:', yc1_ratio
      , '\n'
      , '\nTarget feature numbers (test data):', Counter(y_bts)
      , '\nTarget features ratio (test data):', yc2_ratio
      , '\n\n#####################################################################\n')

print('\n================================================================\n')

print('Strucutral features (n):'
      , len(X_ssFN)
      , '\nThese are:'
      , '\nCommon stablity features:', X_stabilityN
      , '\nFoldX columns:', X_foldX_cols
      , '\nOther struc columns:', X_str
      , '\n================================================================\n')

print('AAindex features (n):'
      , len(X_aaindexFN)
      , '\nThese are:\n'
      , X_aaindexFN
      , '\n================================================================\n')

print('Evolutionary features (n):'
      , len(X_evolFN)
      , '\nThese are:\n'
      , X_evolFN
      , '\n================================================================\n')

# NOTE(review): X_genomic_mafor / X_genomic_linegae must be exported by the
# data module's star import -- confirm these names still exist there
print('Genomic features (n):'
      , len(X_genomicFN)
      , '\nThese are:\n'
      , X_genomic_mafor, '\n'
      , X_genomic_linegae
      , '\n================================================================\n')

print('Categorical features (n):'
      , len(categorical_FN)
      , '\nThese are:\n'
      , categorical_FN
      , '\n================================================================\n')

# Every column of X must belong to exactly one of the feature groups above
if ( len(X.columns) == len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
    print('\nPass: No. of features match')
else:
    sys.exit('\nFail: Count of feature mismatch')

print('\n#####################################################################\n')
################################################################################
# Create the output directory up front so the csv writes cannot fail on a
# missing path after the (long) model runs have finished
os.makedirs(outdir_ml, exist_ok = True)


def _run_models(X_train, y_train, tag):
    """Run MultModelsCl on one training set and write its score tables.

    Parameters
    ----------
    X_train : training features passed to MultModelsCl as input_df
    y_train : training target passed to MultModelsCl as target
    tag     : short label used in the output csv file names

    Returns
    -------
    (scoresD, all_df, ct_df, bt_df) : the raw MultModelsCl result, its
    transposed DataFrame, the 'test_' columns sorted by 'test_mcc' and the
    'bts_' columns sorted by 'bts_mcc' (both descending).
    """
    # keyword is blind_test_df: MultModelsCl renamed it from blind_test_input_df
    scoresD = MultModelsCl(input_df = X_train
                           , target = y_train
                           , var_type = 'mixed'
                           , skf_cv = skf_cv
                           , blind_test_df = X_bts
                           , blind_test_target = y_bts)

    all_df = pd.DataFrame(scoresD).T

    # Chain the sort instead of sorting the filtered frame in place: an
    # in-place sort on a derived frame can raise SettingWithCopyWarning
    ct_df = all_df.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
    bt_df = all_df.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

    # Write csv
    ct_df.to_csv(outdir_ml + gene.lower() + '_' + tag + '_CT_allF.csv')
    bt_df.to_csv(outdir_ml + gene.lower() + '_' + tag + '_BT_allF.csv')

    return scoresD, all_df, ct_df, bt_df


#==================
# Baseline models
#==================
mm_skf_scoresD, baseline_all, baseline_CT, baseline_BT = _run_models(X, y, 'baseline')

#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7, smnc_all, smnc_CT, smnc_BT = _run_models(X_smnc, y_smnc, 'smnc')

#%% ROS: Numerical + categorical
mm_skf_scoresD3, ros_all, ros_CT, ros_BT = _run_models(X_ros, y_ros, 'ros')

#%% RUS: Numerical + categorical
mm_skf_scoresD4, rus_all, rus_CT, rus_BT = _run_models(X_rus, y_rus, 'rus')

#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8, rouC_all, rouC_CT, rouC_BT = _run_models(X_rouC, y_rouC, 'rouC')

View file

@ -1,203 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022

@author: tanu

Driver script: 80/20 train/test split.

Loads the ML data for one gene/drug pair, prints sanity checks on the
feature groups, then runs MultModelsCl on the baseline training data and on
each resampled training set, writing the cross-validation ('test_') and
blind-test ('bts_') score tables to csv.
"""
import os
import sys
from collections import Counter

import pandas as pd

gene = 'pncA'
drug = 'pyrazinamide'
#total_mtblineage_uc = 8

homedir = os.path.expanduser("~")
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')

from ml_data_8020 import *
setvars(gene,drug)
# Star-import a second time on purpose: presumably setvars() creates the
# module-level data variables (X, y, X_bts, ...) that did not exist at the
# time of the first import -- TODO confirm against ml_data_8020
from ml_data_8020 import *

# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML

# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl

################################################################################
print('\n#####################################################################\n'
      , '\nRunning ML analysis: 80/20 split'
      , '\nGene name:', gene
      , '\nDrug name:', drug)

#==================
# Specify outdir
#==================
outdir_ml = outdir + 'ml/tts_8020/'
print('\nOutput directory:', outdir_ml)

#%%###########################################################################
print('\nSanity checks:'
      #, '\nML source data size:', x_features.shape
      , '\nTotal input features:', len(X.columns)
      , '\n'
      , '\nTraining data size:', X.shape
      , '\nTest data size:', X_bts.shape
      , '\n'
      , '\nTarget feature numbers (training data):', Counter(y)
      , '\nTarget features ratio (training data:', yc1_ratio
      , '\n'
      , '\nTarget feature numbers (test data):', Counter(y_bts)
      , '\nTarget features ratio (test data):', yc2_ratio
      , '\n\n#####################################################################\n')

print('\n================================================================\n')

print('Strucutral features (n):'
      , len(X_ssFN)
      , '\nThese are:'
      , '\nCommon stablity features:', X_stabilityN
      , '\nFoldX columns:', X_foldX_cols
      , '\nOther struc columns:', X_str
      , '\n================================================================\n')

print('AAindex features (n):'
      , len(X_aaindexFN)
      , '\nThese are:\n'
      , X_aaindexFN
      , '\n================================================================\n')

print('Evolutionary features (n):'
      , len(X_evolFN)
      , '\nThese are:\n'
      , X_evolFN
      , '\n================================================================\n')

# NOTE(review): X_genomic_mafor / X_genomic_linegae must be exported by the
# data module's star import -- confirm these names still exist there
print('Genomic features (n):'
      , len(X_genomicFN)
      , '\nThese are:\n'
      , X_genomic_mafor, '\n'
      , X_genomic_linegae
      , '\n================================================================\n')

print('Categorical features (n):'
      , len(categorical_FN)
      , '\nThese are:\n'
      , categorical_FN
      , '\n================================================================\n')

# Every column of X must belong to exactly one of the feature groups above
if ( len(X.columns) == len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
    print('\nPass: No. of features match')
else:
    sys.exit('\nFail: Count of feature mismatch')

print('\n#####################################################################\n')
################################################################################
# Create the output directory up front so the csv writes cannot fail on a
# missing path after the (long) model runs have finished
os.makedirs(outdir_ml, exist_ok = True)


def _run_models(X_train, y_train, tag):
    """Run MultModelsCl on one training set and write its score tables.

    Parameters
    ----------
    X_train : training features passed to MultModelsCl as input_df
    y_train : training target passed to MultModelsCl as target
    tag     : short label used in the output csv file names

    Returns
    -------
    (scoresD, all_df, ct_df, bt_df) : the raw MultModelsCl result, its
    transposed DataFrame, the 'test_' columns sorted by 'test_mcc' and the
    'bts_' columns sorted by 'bts_mcc' (both descending).
    """
    # keyword is blind_test_df: MultModelsCl renamed it from blind_test_input_df
    scoresD = MultModelsCl(input_df = X_train
                           , target = y_train
                           , var_type = 'mixed'
                           , skf_cv = skf_cv
                           , blind_test_df = X_bts
                           , blind_test_target = y_bts)

    all_df = pd.DataFrame(scoresD).T

    # Chain the sort instead of sorting the filtered frame in place: an
    # in-place sort on a derived frame can raise SettingWithCopyWarning
    ct_df = all_df.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
    bt_df = all_df.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

    # Write csv
    ct_df.to_csv(outdir_ml + gene.lower() + '_' + tag + '_CT_allF.csv')
    bt_df.to_csv(outdir_ml + gene.lower() + '_' + tag + '_BT_allF.csv')

    return scoresD, all_df, ct_df, bt_df


#==================
# Baseline models
#==================
mm_skf_scoresD, baseline_all, baseline_CT, baseline_BT = _run_models(X, y, 'baseline')

#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7, smnc_all, smnc_CT, smnc_BT = _run_models(X_smnc, y_smnc, 'smnc')

#%% ROS: Numerical + categorical
mm_skf_scoresD3, ros_all, ros_CT, ros_BT = _run_models(X_ros, y_ros, 'ros')

#%% RUS: Numerical + categorical
mm_skf_scoresD4, rus_all, rus_CT, rus_BT = _run_models(X_rus, y_rus, 'rus')

#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8, rouC_all, rouC_CT, rouC_BT = _run_models(X_rouC, y_rouC, 'rouC')

View file

@ -1,203 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022

@author: tanu

Driver script: complete data, 70/30 train/test split.

Loads the ML data for one gene/drug pair, prints sanity checks on the
feature groups, then runs MultModelsCl on the baseline training data and on
each resampled training set, writing the cross-validation ('test_') and
blind-test ('bts_') score tables to csv.
"""
import os
import sys
from collections import Counter

import pandas as pd

gene = 'pncA'
drug = 'pyrazinamide'
#total_mtblineage_uc = 8

homedir = os.path.expanduser("~")
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')

from ml_data_cd_7030 import *
setvars(gene,drug)
# Star-import a second time on purpose: presumably setvars() creates the
# module-level data variables (X, y, X_bts, ...) that did not exist at the
# time of the first import -- TODO confirm against ml_data_cd_7030
from ml_data_cd_7030 import *

# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML

# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl

################################################################################
print('\n#####################################################################\n'
      , '\nRunning ML analysis [COMPLETE DATA]: 70/30 split'
      , '\nGene name:', gene
      , '\nDrug name:', drug)

#==================
# Specify outdir
#==================
outdir_ml = outdir + 'ml/tts_cd_7030/'
print('\nOutput directory:', outdir_ml)

#%%###########################################################################
print('\nSanity checks:'
      #, '\nML source data size:', x_features.shape
      , '\nTotal input features:', len(X.columns)
      , '\n'
      , '\nTraining data size:', X.shape
      , '\nTest data size:', X_bts.shape
      , '\n'
      , '\nTarget feature numbers (training data):', Counter(y)
      , '\nTarget features ratio (training data:', yc1_ratio
      , '\n'
      , '\nTarget feature numbers (test data):', Counter(y_bts)
      , '\nTarget features ratio (test data):', yc2_ratio
      , '\n\n#####################################################################\n')

print('\n================================================================\n')

print('Strucutral features (n):'
      , len(X_ssFN)
      , '\nThese are:'
      , '\nCommon stablity features:', X_stabilityN
      , '\nFoldX columns:', X_foldX_cols
      , '\nOther struc columns:', X_str
      , '\n================================================================\n')

print('AAindex features (n):'
      , len(X_aaindexFN)
      , '\nThese are:\n'
      , X_aaindexFN
      , '\n================================================================\n')

print('Evolutionary features (n):'
      , len(X_evolFN)
      , '\nThese are:\n'
      , X_evolFN
      , '\n================================================================\n')

# NOTE(review): X_genomic_mafor / X_genomic_linegae must be exported by the
# data module's star import -- confirm these names still exist there
print('Genomic features (n):'
      , len(X_genomicFN)
      , '\nThese are:\n'
      , X_genomic_mafor, '\n'
      , X_genomic_linegae
      , '\n================================================================\n')

print('Categorical features (n):'
      , len(categorical_FN)
      , '\nThese are:\n'
      , categorical_FN
      , '\n================================================================\n')

# Every column of X must belong to exactly one of the feature groups above
if ( len(X.columns) == len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
    print('\nPass: No. of features match')
else:
    sys.exit('\nFail: Count of feature mismatch')

print('\n#####################################################################\n')
################################################################################
# Create the output directory up front so the csv writes cannot fail on a
# missing path after the (long) model runs have finished
os.makedirs(outdir_ml, exist_ok = True)


def _run_models(X_train, y_train, tag):
    """Run MultModelsCl on one training set and write its score tables.

    Parameters
    ----------
    X_train : training features passed to MultModelsCl as input_df
    y_train : training target passed to MultModelsCl as target
    tag     : short label used in the output csv file names

    Returns
    -------
    (scoresD, all_df, ct_df, bt_df) : the raw MultModelsCl result, its
    transposed DataFrame, the 'test_' columns sorted by 'test_mcc' and the
    'bts_' columns sorted by 'bts_mcc' (both descending).
    """
    # keyword is blind_test_df: MultModelsCl renamed it from blind_test_input_df
    scoresD = MultModelsCl(input_df = X_train
                           , target = y_train
                           , var_type = 'mixed'
                           , skf_cv = skf_cv
                           , blind_test_df = X_bts
                           , blind_test_target = y_bts)

    all_df = pd.DataFrame(scoresD).T

    # Chain the sort instead of sorting the filtered frame in place: an
    # in-place sort on a derived frame can raise SettingWithCopyWarning
    ct_df = all_df.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
    bt_df = all_df.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

    # Write csv
    ct_df.to_csv(outdir_ml + gene.lower() + '_' + tag + '_CT_allF.csv')
    bt_df.to_csv(outdir_ml + gene.lower() + '_' + tag + '_BT_allF.csv')

    return scoresD, all_df, ct_df, bt_df


#==================
# Baseline models
#==================
mm_skf_scoresD, baseline_all, baseline_CT, baseline_BT = _run_models(X, y, 'baseline')

#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7, smnc_all, smnc_CT, smnc_BT = _run_models(X_smnc, y_smnc, 'smnc')

#%% ROS: Numerical + categorical
mm_skf_scoresD3, ros_all, ros_CT, ros_BT = _run_models(X_ros, y_ros, 'ros')

#%% RUS: Numerical + categorical
mm_skf_scoresD4, rus_all, rus_CT, rus_BT = _run_models(X_rus, y_rus, 'rus')

#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8, rouC_all, rouC_CT, rouC_BT = _run_models(X_rouC, y_rouC, 'rouC')

View file

@ -1,203 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022

@author: tanu

Driver script: complete data, 80/20 train/test split.

Loads the ML data for one gene/drug pair, prints sanity checks on the
feature groups, then runs MultModelsCl on the baseline training data and on
each resampled training set, writing the cross-validation ('test_') and
blind-test ('bts_') score tables to csv.
"""
import os
import sys
from collections import Counter

import pandas as pd

gene = 'pncA'
drug = 'pyrazinamide'
#total_mtblineage_uc = 8

homedir = os.path.expanduser("~")
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')

from ml_data_cd_8020 import *
setvars(gene,drug)
# Star-import a second time on purpose: presumably setvars() creates the
# module-level data variables (X, y, X_bts, ...) that did not exist at the
# time of the first import -- TODO confirm against ml_data_cd_8020
from ml_data_cd_8020 import *

# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML

# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl

################################################################################
print('\n#####################################################################\n'
      , '\nRunning ML analysis [COMPLETE DATA]: 80/20 split'
      , '\nGene name:', gene
      , '\nDrug name:', drug)

#==================
# Specify outdir
#==================
outdir_ml = outdir + 'ml/tts_cd_8020/'
print('\nOutput directory:', outdir_ml)

#%%###########################################################################
print('\nSanity checks:'
      #, '\nML source data size:', x_features.shape
      , '\nTotal input features:', len(X.columns)
      , '\n'
      , '\nTraining data size:', X.shape
      , '\nTest data size:', X_bts.shape
      , '\n'
      , '\nTarget feature numbers (training data):', Counter(y)
      , '\nTarget features ratio (training data:', yc1_ratio
      , '\n'
      , '\nTarget feature numbers (test data):', Counter(y_bts)
      , '\nTarget features ratio (test data):', yc2_ratio
      , '\n\n#####################################################################\n')

print('\n================================================================\n')

print('Strucutral features (n):'
      , len(X_ssFN)
      , '\nThese are:'
      , '\nCommon stablity features:', X_stabilityN
      , '\nFoldX columns:', X_foldX_cols
      , '\nOther struc columns:', X_str
      , '\n================================================================\n')

print('AAindex features (n):'
      , len(X_aaindexFN)
      , '\nThese are:\n'
      , X_aaindexFN
      , '\n================================================================\n')

print('Evolutionary features (n):'
      , len(X_evolFN)
      , '\nThese are:\n'
      , X_evolFN
      , '\n================================================================\n')

# NOTE(review): X_genomic_mafor / X_genomic_linegae must be exported by the
# data module's star import -- confirm these names still exist there
print('Genomic features (n):'
      , len(X_genomicFN)
      , '\nThese are:\n'
      , X_genomic_mafor, '\n'
      , X_genomic_linegae
      , '\n================================================================\n')

print('Categorical features (n):'
      , len(categorical_FN)
      , '\nThese are:\n'
      , categorical_FN
      , '\n================================================================\n')

# Every column of X must belong to exactly one of the feature groups above
if ( len(X.columns) == len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
    print('\nPass: No. of features match')
else:
    sys.exit('\nFail: Count of feature mismatch')

print('\n#####################################################################\n')
################################################################################
# Create the output directory up front so the csv writes cannot fail on a
# missing path after the (long) model runs have finished
os.makedirs(outdir_ml, exist_ok = True)


def _run_models(X_train, y_train, tag):
    """Run MultModelsCl on one training set and write its score tables.

    Parameters
    ----------
    X_train : training features passed to MultModelsCl as input_df
    y_train : training target passed to MultModelsCl as target
    tag     : short label used in the output csv file names

    Returns
    -------
    (scoresD, all_df, ct_df, bt_df) : the raw MultModelsCl result, its
    transposed DataFrame, the 'test_' columns sorted by 'test_mcc' and the
    'bts_' columns sorted by 'bts_mcc' (both descending).
    """
    # keyword is blind_test_df: MultModelsCl renamed it from blind_test_input_df
    scoresD = MultModelsCl(input_df = X_train
                           , target = y_train
                           , var_type = 'mixed'
                           , skf_cv = skf_cv
                           , blind_test_df = X_bts
                           , blind_test_target = y_bts)

    all_df = pd.DataFrame(scoresD).T

    # Chain the sort instead of sorting the filtered frame in place: an
    # in-place sort on a derived frame can raise SettingWithCopyWarning
    ct_df = all_df.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
    bt_df = all_df.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

    # Write csv
    ct_df.to_csv(outdir_ml + gene.lower() + '_' + tag + '_CT_allF.csv')
    bt_df.to_csv(outdir_ml + gene.lower() + '_' + tag + '_BT_allF.csv')

    return scoresD, all_df, ct_df, bt_df


#==================
# Baseline models
#==================
mm_skf_scoresD, baseline_all, baseline_CT, baseline_BT = _run_models(X, y, 'baseline')

#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7, smnc_all, smnc_CT, smnc_BT = _run_models(X_smnc, y_smnc, 'smnc')

#%% ROS: Numerical + categorical
mm_skf_scoresD3, ros_all, ros_CT, ros_BT = _run_models(X_ros, y_ros, 'ros')

#%% RUS: Numerical + categorical
mm_skf_scoresD4, rus_all, rus_CT, rus_BT = _run_models(X_rus, y_rus, 'rus')

#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8, rouC_all, rouC_CT, rouC_BT = _run_models(X_rouC, y_rouC, 'rouC')

View file

@ -1,203 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022
@author: tanu
"""
import os
# Gene/drug pair for this run; both are passed to setvars() below
gene = 'pncA'
drug = 'pyrazinamide'
#total_mtblineage_uc = 8
homedir = os.path.expanduser("~")
# Work from the ml scripts directory (presumably so the local modules imported
# below can be found -- confirm how this script is launched)
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
# NOTE(review): the star-import is repeated on purpose around setvars().
# Presumably setvars() creates the module-level data (X, y, X_bts, ...) and
# only the second import can bring those names into this namespace -- confirm
# against ml_data_cd_sl before reordering or removing either import.
from ml_data_cd_sl import *
setvars(gene,drug)
from ml_data_cd_sl import *
# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML
# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl
################################################################################
# Banner identifying this run
print('\n#####################################################################\n'
      , '\nRunning ML analysis [COMPLETE DATA]: 70/30 split'
      , '\nGene name:', gene
      , '\nDrug name:', drug)
#==================
# Specify outdir
#==================
outdir_ml = outdir + 'ml/tts_cd_sl/'
# DataFrame.to_csv() does not create missing directories, so make sure the
# results folder exists before the sections below write into it
os.makedirs(outdir_ml, exist_ok = True)
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
# Report data dimensions and target class counts for training and test data
print('\nSanity checks:'
      #, '\nML source data size:', x_features.shape
      , '\nTotal input features:', len(X.columns)
      , '\n'
      , '\nTraining data size:', X.shape
      , '\nTest data size:', X_bts.shape
      , '\n'
      , '\nTarget feature numbers (training data):', Counter(y)
      , '\nTarget features ratio (training data:', yc1_ratio
      , '\n'
      , '\nTarget feature numbers (test data):', Counter(y_bts)
      , '\nTarget features ratio (test data):', yc2_ratio
      , '\n\n#####################################################################\n')
print('\n================================================================\n')
# List each feature group together with its size
print('Strucutral features (n):'
      , len(X_ssFN)
      , '\nThese are:'
      , '\nCommon stablity features:', X_stabilityN
      , '\nFoldX columns:', X_foldX_cols
      , '\nOther struc columns:', X_str
      , '\n================================================================\n')
print('AAindex features (n):'
      , len(X_aaindexFN)
      , '\nThese are:\n'
      , X_aaindexFN
      , '\n================================================================\n')
print('Evolutionary features (n):'
      , len(X_evolFN)
      , '\nThese are:\n'
      , X_evolFN
      , '\n================================================================\n')
print('Genomic features (n):'
      , len(X_genomicFN)
      , '\nThese are:\n'
      , X_genomic_mafor, '\n'
      , X_genomic_linegae
      , '\n================================================================\n')
print('Categorical features (n):'
      , len(categorical_FN)
      , '\nThese are:\n'
      , categorical_FN
      , '\n================================================================\n')
# The feature groups together must account for every column of X; stop otherwise
if ( len(X.columns) == len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
    print('\nPass: No. of features match')
else:
    # Same effect as sys.exit(msg), but does not depend on 'sys' having been
    # pulled in through one of the star-imports
    raise SystemExit('\nFail: Count of feature mismatch')
print('\n#####################################################################\n')
################################################################################
#==================
# Baseline models
#==================
# Run MultModelsCl on the unresampled training data (X, y) and evaluate on the
# blind test data (X_bts, y_bts).
# NOTE: the blind-test keyword is 'blind_test_df'; MultModelsCl no longer
# accepts the old name 'blind_test_input_df'.
mm_skf_scoresD = MultModelsCl(input_df = X
                              , target = y
                              , var_type = 'mixed'
                              , skf_cv = skf_cv
                              , blind_test_df = X_bts
                              , blind_test_target = y_bts)
# Transposed so that each entry returned by MultModelsCl becomes a row
# (presumably one row per model -- confirm against MultModelsCl)
baseline_all = pd.DataFrame(mm_skf_scoresD).T
#baseline_train = baseline_all.filter(like='train_', axis=1)
# Columns containing 'test_', sorted by 'test_mcc' (highest first).
# The sorted frame is assigned instead of sorting in place: filter() returns a
# derived frame and in-place edits on it can raise SettingWithCopyWarning.
baseline_CT = baseline_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
# Columns containing 'bts_', sorted by 'bts_mcc' (highest first)
baseline_BT = baseline_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv')
baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')
#%% SMOTE NC: Oversampling [Numerical + categorical]
# Run MultModelsCl on the SMOTE NC training data (X_smnc, y_smnc) and evaluate
# on the blind test data (X_bts, y_bts).
# NOTE: the blind-test keyword is 'blind_test_df'; MultModelsCl no longer
# accepts the old name 'blind_test_input_df'.
mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc
                               , target = y_smnc
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
# Transposed so that each entry returned by MultModelsCl becomes a row
# (presumably one row per model -- confirm against MultModelsCl)
smnc_all = pd.DataFrame(mm_skf_scoresD7).T
# Columns containing 'test_', sorted by 'test_mcc' (highest first).
# The sorted frame is assigned instead of sorting in place: filter() returns a
# derived frame and in-place edits on it can raise SettingWithCopyWarning.
smnc_CT = smnc_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
# Columns containing 'bts_', sorted by 'bts_mcc' (highest first)
smnc_BT = smnc_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv')
smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')
#%% ROS: Numerical + categorical
# Run MultModelsCl on the ROS training data (X_ros, y_ros) and evaluate on the
# blind test data (X_bts, y_bts).
# NOTE: the blind-test keyword is 'blind_test_df'; MultModelsCl no longer
# accepts the old name 'blind_test_input_df'.
mm_skf_scoresD3 = MultModelsCl(input_df = X_ros
                               , target = y_ros
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
# Transposed so that each entry returned by MultModelsCl becomes a row
# (presumably one row per model -- confirm against MultModelsCl)
ros_all = pd.DataFrame(mm_skf_scoresD3).T
# Columns containing 'test_', sorted by 'test_mcc' (highest first).
# The sorted frame is assigned instead of sorting in place: filter() returns a
# derived frame and in-place edits on it can raise SettingWithCopyWarning.
ros_CT = ros_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
# Columns containing 'bts_', sorted by 'bts_mcc' (highest first)
ros_BT = ros_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv')
ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')
#%% RUS: Numerical + categorical
# Run MultModelsCl on the RUS training data (X_rus, y_rus) and evaluate on the
# blind test data (X_bts, y_bts).
# NOTE: the blind-test keyword is 'blind_test_df'; MultModelsCl no longer
# accepts the old name 'blind_test_input_df'.
mm_skf_scoresD4 = MultModelsCl(input_df = X_rus
                               , target = y_rus
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
# Transposed so that each entry returned by MultModelsCl becomes a row
# (presumably one row per model -- confirm against MultModelsCl)
rus_all = pd.DataFrame(mm_skf_scoresD4).T
# Columns containing 'test_', sorted by 'test_mcc' (highest first).
# The sorted frame is assigned instead of sorting in place: filter() returns a
# derived frame and in-place edits on it can raise SettingWithCopyWarning.
rus_CT = rus_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
# Columns containing 'bts_', sorted by 'bts_mcc' (highest first)
rus_BT = rus_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv')
rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')
#%% ROS + RUS Combined: Numerical + categorical
# Run MultModelsCl on the combined ROS + RUS training data (X_rouC, y_rouC)
# and evaluate on the blind test data (X_bts, y_bts).
# NOTE: the blind-test keyword is 'blind_test_df'; MultModelsCl no longer
# accepts the old name 'blind_test_input_df'.
mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC
                               , target = y_rouC
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
# Transposed so that each entry returned by MultModelsCl becomes a row
# (presumably one row per model -- confirm against MultModelsCl)
rouC_all = pd.DataFrame(mm_skf_scoresD8).T
# Columns containing 'test_', sorted by 'test_mcc' (highest first).
# The sorted frame is assigned instead of sorting in place: filter() returns a
# derived frame and in-place edits on it can raise SettingWithCopyWarning.
rouC_CT = rouC_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
# Columns containing 'bts_', sorted by 'bts_mcc' (highest first)
rouC_BT = rouC_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv')
rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')

View file

@ -1,210 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022
@author: tanu
"""
import os
# Gene/drug pair for this run; both are passed to setvars() below
gene = 'pncA'
drug = 'pyrazinamide'
#total_mtblineage_uc = 8
homedir = os.path.expanduser("~")
# Work from the ml scripts directory (presumably so the local modules imported
# below can be found -- confirm how this script is launched)
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
#---------------------------
# Version 1: no AAindex
#from UQ_ML_data import *
#setvars(gene,drug)
#from UQ_ML_data import *
#---------------------------
# NOTE(review): the star-import is repeated on purpose around setvars().
# Presumably setvars() creates the module-level data (X, y, X_bts, ...) and
# only the second import can bring those names into this namespace -- confirm
# against ml_data before reordering or removing either import.
from ml_data import *
setvars(gene,drug)
from ml_data import *
# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML
# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl
############################################################################
# Banner identifying this run
print('\n#####################################################################\n'
      , '\nRunning ML analysis: UQ [without AA index but with active site annotations]'
      , '\nGene name:', gene
      , '\nDrug name:', drug)
#==================
# Specify outdir
#==================
outdir_ml = outdir + 'ml/uq_v1/'
# DataFrame.to_csv() does not create missing directories, so make sure the
# results folder exists before the sections below write into it
os.makedirs(outdir_ml, exist_ok = True)
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
# Report data dimensions and target class counts for training and test data
print('\nSanity checks:'
      , '\nTotal input features:', len(X.columns)
      , '\n'
      , '\nTraining data size:', X.shape
      , '\nTest data size:', X_bts.shape
      , '\n'
      , '\nTarget feature numbers (training data):', Counter(y)
      , '\nTarget features ratio (training data:', yc1_ratio
      , '\n'
      , '\nTarget feature numbers (test data):', Counter(y_bts)
      , '\nTarget features ratio (test data):', yc2_ratio
      , '\n\n#####################################################################\n')
print('\n================================================================\n')
# List each feature group together with its size
print('Strucutral features (n):'
      , len(X_ssFN)
      , '\nThese are:'
      , '\nCommon stablity features:', X_stabilityN
      , '\nFoldX columns:', X_foldX_cols
      , '\nOther struc columns:', X_str
      , '\n================================================================\n')
# AAindex features are not part of this variant, so their report stays disabled
# print('AAindex features (n):'
#       , len(X_aaindexFN)
#       , '\nThese are:\n'
#       , X_aaindexFN
#       , '\n================================================================\n')
print('Evolutionary features (n):'
      , len(X_evolFN)
      , '\nThese are:\n'
      , X_evolFN
      , '\n================================================================\n')
print('Genomic features (n):'
      , len(X_genomicFN)
      , '\nThese are:\n'
      , X_genomic_mafor, '\n'
      , X_genomic_linegae
      , '\n================================================================\n')
print('Categorical features (n):'
      , len(categorical_FN)
      , '\nThese are:\n'
      , categorical_FN
      , '\n================================================================\n')
# The feature groups together must account for every column of X; stop otherwise.
# X_aaindexFN is left out of the sum on purpose for this variant.
#if ( len(X.columns) == len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
if ( len(X.columns) == len(X_ssFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
    print('\nPass: No. of features match')
else:
    # Same effect as sys.exit(msg), but does not depend on 'sys' having been
    # pulled in through one of the star-imports
    raise SystemExit('\nFail: Count of feature mismatch')
print('\n#####################################################################\n')
###############################################################################
#==================
# Baseline models
#==================
# Run MultModelsCl on the unresampled training data (X, y) and evaluate on the
# blind test data (X_bts, y_bts).
# NOTE: the blind-test keyword is 'blind_test_df'; MultModelsCl no longer
# accepts the old name 'blind_test_input_df'.
mm_skf_scoresD = MultModelsCl(input_df = X
                              , target = y
                              , var_type = 'mixed'
                              , skf_cv = skf_cv
                              , blind_test_df = X_bts
                              , blind_test_target = y_bts)
# Transposed so that each entry returned by MultModelsCl becomes a row
# (presumably one row per model -- confirm against MultModelsCl)
baseline_all = pd.DataFrame(mm_skf_scoresD).T
#baseline_train = baseline_all.filter(like='train_', axis=1)
# Columns containing 'test_', sorted by 'test_mcc' (highest first).
# The sorted frame is assigned instead of sorting in place: filter() returns a
# derived frame and in-place edits on it can raise SettingWithCopyWarning.
baseline_CT = baseline_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
# Columns containing 'bts_', sorted by 'bts_mcc' (highest first)
baseline_BT = baseline_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv')
baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')
#%% SMOTE NC: Oversampling [Numerical + categorical]
# Run MultModelsCl on the SMOTE NC training data (X_smnc, y_smnc) and evaluate
# on the blind test data (X_bts, y_bts).
# NOTE: the blind-test keyword is 'blind_test_df'; MultModelsCl no longer
# accepts the old name 'blind_test_input_df'.
mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc
                               , target = y_smnc
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
# Transposed so that each entry returned by MultModelsCl becomes a row
# (presumably one row per model -- confirm against MultModelsCl)
smnc_all = pd.DataFrame(mm_skf_scoresD7).T
# Columns containing 'test_', sorted by 'test_mcc' (highest first).
# The sorted frame is assigned instead of sorting in place: filter() returns a
# derived frame and in-place edits on it can raise SettingWithCopyWarning.
smnc_CT = smnc_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
# Columns containing 'bts_', sorted by 'bts_mcc' (highest first)
smnc_BT = smnc_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv')
smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')
#%% ROS: Numerical + categorical
# Run MultModelsCl on the ROS training data (X_ros, y_ros) and evaluate on the
# blind test data (X_bts, y_bts).
# NOTE: the blind-test keyword is 'blind_test_df'; MultModelsCl no longer
# accepts the old name 'blind_test_input_df'.
mm_skf_scoresD3 = MultModelsCl(input_df = X_ros
                               , target = y_ros
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
# Transposed so that each entry returned by MultModelsCl becomes a row
# (presumably one row per model -- confirm against MultModelsCl)
ros_all = pd.DataFrame(mm_skf_scoresD3).T
# Columns containing 'test_', sorted by 'test_mcc' (highest first).
# The sorted frame is assigned instead of sorting in place: filter() returns a
# derived frame and in-place edits on it can raise SettingWithCopyWarning.
ros_CT = ros_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
# Columns containing 'bts_', sorted by 'bts_mcc' (highest first)
ros_BT = ros_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv')
ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')
#%% RUS: Numerical + categorical
# Run MultModelsCl on the RUS training data (X_rus, y_rus) and evaluate on the
# blind test data (X_bts, y_bts).
# NOTE: the blind-test keyword is 'blind_test_df'; MultModelsCl no longer
# accepts the old name 'blind_test_input_df'.
mm_skf_scoresD4 = MultModelsCl(input_df = X_rus
                               , target = y_rus
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
# Transposed so that each entry returned by MultModelsCl becomes a row
# (presumably one row per model -- confirm against MultModelsCl)
rus_all = pd.DataFrame(mm_skf_scoresD4).T
# Columns containing 'test_', sorted by 'test_mcc' (highest first).
# The sorted frame is assigned instead of sorting in place: filter() returns a
# derived frame and in-place edits on it can raise SettingWithCopyWarning.
rus_CT = rus_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
# Columns containing 'bts_', sorted by 'bts_mcc' (highest first)
rus_BT = rus_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv')
rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')
#%% ROS + RUS Combined: Numerical + categorical
# Run MultModelsCl on the combined ROS + RUS training data (X_rouC, y_rouC)
# and evaluate on the blind test data (X_bts, y_bts).
# NOTE: the blind-test keyword is 'blind_test_df'; MultModelsCl no longer
# accepts the old name 'blind_test_input_df'.
mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC
                               , target = y_rouC
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
# Transposed so that each entry returned by MultModelsCl becomes a row
# (presumably one row per model -- confirm against MultModelsCl)
rouC_all = pd.DataFrame(mm_skf_scoresD8).T
# Columns containing 'test_', sorted by 'test_mcc' (highest first).
# The sorted frame is assigned instead of sorting in place: filter() returns a
# derived frame and in-place edits on it can raise SettingWithCopyWarning.
rouC_CT = rouC_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
# Columns containing 'bts_', sorted by 'bts_mcc' (highest first)
rouC_BT = rouC_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv')
rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')

View file

@ -1,202 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022
@author: tanu
"""
import os
# Gene/drug pair for this run; both are passed to setvars() below
gene = 'pncA'
drug = 'pyrazinamide'
#total_mtblineage_uc = 8
homedir = os.path.expanduser("~")
# Work from the ml scripts directory (presumably so the local modules imported
# below can be found -- confirm how this script is launched)
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
# NOTE(review): the star-import is repeated on purpose around setvars().
# Presumably setvars() creates the module-level data (X, y, X_bts, ...) and
# only the second import can bring those names into this namespace -- confirm
# against ml_data_orig before reordering or removing either import.
from ml_data_orig import *
setvars(gene,drug)
from ml_data_orig import *
# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML
# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl
############################################################################
# Banner identifying this run
print('\n#####################################################################\n'
      , '\nRunning ML analysis: ORIGINAL'
      , '\nGene name:', gene
      , '\nDrug name:', drug)
#==================
# Specify outdir
#==================
outdir_ml = outdir + 'ml/tts_orig/'
# DataFrame.to_csv() does not create missing directories, so make sure the
# results folder exists before the sections below write into it
os.makedirs(outdir_ml, exist_ok = True)
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
# Report data dimensions and target class counts for training and test data
print('\nSanity checks:'
      , '\nTotal input features:', len(X.columns)
      , '\n'
      , '\nTraining data size:', X.shape
      , '\nTest data size:', X_bts.shape
      , '\n'
      , '\nTarget feature numbers (training data):', Counter(y)
      , '\nTarget features ratio (training data:', yc1_ratio
      , '\n'
      , '\nTarget feature numbers (test data):', Counter(y_bts)
      , '\nTarget features ratio (test data):', yc2_ratio
      , '\n\n#####################################################################\n')
print('\n================================================================\n')
# List each feature group together with its size
print('Strucutral features (n):'
      , len(X_ssFN)
      , '\nThese are:'
      , '\nCommon stablity features:', X_stabilityN
      , '\nFoldX columns:', X_foldX_cols
      , '\nOther struc columns:', X_str
      , '\n================================================================\n')
print('AAindex features (n):'
      , len(X_aaindexFN)
      , '\nThese are:\n'
      , X_aaindexFN
      , '\n================================================================\n')
print('Evolutionary features (n):'
      , len(X_evolFN)
      , '\nThese are:\n'
      , X_evolFN
      , '\n================================================================\n')
print('Genomic features (n):'
      , len(X_genomicFN)
      , '\nThese are:\n'
      , X_genomic_mafor, '\n'
      , X_genomic_linegae
      , '\n================================================================\n')
print('Categorical features (n):'
      , len(categorical_FN)
      , '\nThese are:\n'
      , categorical_FN
      , '\n================================================================\n')
# The feature groups together must account for every column of X; stop otherwise
if ( len(X.columns) == len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
    print('\nPass: No. of features match')
else:
    # Same effect as sys.exit(msg), but does not depend on 'sys' having been
    # pulled in through one of the star-imports
    raise SystemExit('\nFail: Count of feature mismatch')
print('\n#####################################################################\n')
###############################################################################
#==================
# Baseline models
#==================
# Run MultModelsCl on the unresampled training data (X, y) and evaluate on the
# blind test data (X_bts, y_bts).
# NOTE: the blind-test keyword is 'blind_test_df'; MultModelsCl no longer
# accepts the old name 'blind_test_input_df'.
mm_skf_scoresD = MultModelsCl(input_df = X
                              , target = y
                              , var_type = 'mixed'
                              , skf_cv = skf_cv
                              , blind_test_df = X_bts
                              , blind_test_target = y_bts)
# Transposed so that each entry returned by MultModelsCl becomes a row
# (presumably one row per model -- confirm against MultModelsCl)
baseline_all = pd.DataFrame(mm_skf_scoresD).T
#baseline_train = baseline_all.filter(like='train_', axis=1)
# Columns containing 'test_', sorted by 'test_mcc' (highest first).
# The sorted frame is assigned instead of sorting in place: filter() returns a
# derived frame and in-place edits on it can raise SettingWithCopyWarning.
baseline_CT = baseline_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
# Columns containing 'bts_', sorted by 'bts_mcc' (highest first)
baseline_BT = baseline_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv')
baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')
#%% SMOTE NC: Oversampling [Numerical + categorical]
# Run MultModelsCl on the SMOTE NC training data (X_smnc, y_smnc) and evaluate
# on the blind test data (X_bts, y_bts).
# NOTE: the blind-test keyword is 'blind_test_df'; MultModelsCl no longer
# accepts the old name 'blind_test_input_df'.
mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc
                               , target = y_smnc
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
# Transposed so that each entry returned by MultModelsCl becomes a row
# (presumably one row per model -- confirm against MultModelsCl)
smnc_all = pd.DataFrame(mm_skf_scoresD7).T
# Columns containing 'test_', sorted by 'test_mcc' (highest first).
# The sorted frame is assigned instead of sorting in place: filter() returns a
# derived frame and in-place edits on it can raise SettingWithCopyWarning.
smnc_CT = smnc_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
# Columns containing 'bts_', sorted by 'bts_mcc' (highest first)
smnc_BT = smnc_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv')
smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')
#%% ROS: Numerical + categorical
# Run MultModelsCl on the ROS training data (X_ros, y_ros) and evaluate on the
# blind test data (X_bts, y_bts).
# NOTE: the blind-test keyword is 'blind_test_df'; MultModelsCl no longer
# accepts the old name 'blind_test_input_df'.
mm_skf_scoresD3 = MultModelsCl(input_df = X_ros
                               , target = y_ros
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
# Transposed so that each entry returned by MultModelsCl becomes a row
# (presumably one row per model -- confirm against MultModelsCl)
ros_all = pd.DataFrame(mm_skf_scoresD3).T
# Columns containing 'test_', sorted by 'test_mcc' (highest first).
# The sorted frame is assigned instead of sorting in place: filter() returns a
# derived frame and in-place edits on it can raise SettingWithCopyWarning.
ros_CT = ros_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
# Columns containing 'bts_', sorted by 'bts_mcc' (highest first)
ros_BT = ros_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv')
ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')
#%% RUS: Numerical + categorical
# Run MultModelsCl on the RUS training data (X_rus, y_rus) and evaluate on the
# blind test data (X_bts, y_bts).
# NOTE: the blind-test keyword is 'blind_test_df'; MultModelsCl no longer
# accepts the old name 'blind_test_input_df'.
mm_skf_scoresD4 = MultModelsCl(input_df = X_rus
                               , target = y_rus
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
# Transposed so that each entry returned by MultModelsCl becomes a row
# (presumably one row per model -- confirm against MultModelsCl)
rus_all = pd.DataFrame(mm_skf_scoresD4).T
# Columns containing 'test_', sorted by 'test_mcc' (highest first).
# The sorted frame is assigned instead of sorting in place: filter() returns a
# derived frame and in-place edits on it can raise SettingWithCopyWarning.
rus_CT = rus_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
# Columns containing 'bts_', sorted by 'bts_mcc' (highest first)
rus_BT = rus_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv')
rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')
#%% ROS + RUS Combined: Numerical + categorical
# Run MultModelsCl on the combined ROS + RUS training data (X_rouC, y_rouC)
# and evaluate on the blind test data (X_bts, y_bts).
# NOTE: the blind-test keyword is 'blind_test_df'; MultModelsCl no longer
# accepts the old name 'blind_test_input_df'.
mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC
                               , target = y_rouC
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
# Transposed so that each entry returned by MultModelsCl becomes a row
# (presumably one row per model -- confirm against MultModelsCl)
rouC_all = pd.DataFrame(mm_skf_scoresD8).T
# Columns containing 'test_', sorted by 'test_mcc' (highest first).
# The sorted frame is assigned instead of sorting in place: filter() returns a
# derived frame and in-place edits on it can raise SettingWithCopyWarning.
rouC_CT = rouC_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
# Columns containing 'bts_', sorted by 'bts_mcc' (highest first)
rouC_BT = rouC_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)
# Write csv
rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv')
rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')

Some files were not shown because too many files have changed in this diff Show more