changed blind_test_input_df to blind_test_df in MultModelsCl
This commit is contained in:
parent
bc12dbd7c2
commit
0350784d52
114 changed files with 107251 additions and 863011 deletions
|
@ -101,7 +101,7 @@ jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
|
|||
#%%
|
||||
# Multiple Classification - Model Pipeline
|
||||
def MultModelsCl(input_df, target, skf_cv
|
||||
, blind_test_input_df
|
||||
, blind_test_df
|
||||
, blind_test_target
|
||||
, add_cm = True # adds confusion matrix based on cross_val_predict
|
||||
, add_yn = True # adds target var class numbers
|
||||
|
@ -155,32 +155,32 @@ def MultModelsCl(input_df, target, skf_cv
|
|||
, ('Logistic RegressionCV' , LogisticRegressionCV(**rs) )
|
||||
, ('Gaussian NB' , GaussianNB() )
|
||||
, ('Naive Bayes' , BernoulliNB() )
|
||||
# , ('K-Nearest Neighbors' , KNeighborsClassifier() )
|
||||
# , ('SVC' , SVC(**rs) )
|
||||
# , ('MLP' , MLPClassifier(max_iter = 500, **rs) )
|
||||
# , ('Decision Tree' , DecisionTreeClassifier(**rs) )
|
||||
# , ('Extra Trees' , ExtraTreesClassifier(**rs) )
|
||||
# , ('Extra Tree' , ExtraTreeClassifier(**rs) )
|
||||
# , ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000 ) )
|
||||
# , ('Random Forest2' , RandomForestClassifier(min_samples_leaf = 5
|
||||
# , n_estimators = 1000
|
||||
# , bootstrap = True
|
||||
# , oob_score = True
|
||||
# , **njobs
|
||||
# , **rs
|
||||
# , max_features = 'auto') )
|
||||
# , ('XGBoost' , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False) )
|
||||
# , ('LDA' , LinearDiscriminantAnalysis() )
|
||||
# , ('Multinomial' , MultinomialNB() )
|
||||
# , ('Passive Aggresive' , PassiveAggressiveClassifier(**rs, **njobs) )
|
||||
# , ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) )
|
||||
# , ('AdaBoost Classifier' , AdaBoostClassifier(**rs) )
|
||||
# , ('Bagging Classifier' , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) )
|
||||
# , ('Gaussian Process' , GaussianProcessClassifier(**rs) )
|
||||
# , ('Gradient Boosting' , GradientBoostingClassifier(**rs) )
|
||||
# , ('QDA' , QuadraticDiscriminantAnalysis() )
|
||||
# , ('Ridge Classifier' , RidgeClassifier(**rs) )
|
||||
# , ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 10) )
|
||||
, ('K-Nearest Neighbors' , KNeighborsClassifier() )
|
||||
, ('SVC' , SVC(**rs) )
|
||||
, ('MLP' , MLPClassifier(max_iter = 500, **rs) )
|
||||
, ('Decision Tree' , DecisionTreeClassifier(**rs) )
|
||||
, ('Extra Trees' , ExtraTreesClassifier(**rs) )
|
||||
, ('Extra Tree' , ExtraTreeClassifier(**rs) )
|
||||
, ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000 ) )
|
||||
, ('Random Forest2' , RandomForestClassifier(min_samples_leaf = 5
|
||||
, n_estimators = 1000
|
||||
, bootstrap = True
|
||||
, oob_score = True
|
||||
, **njobs
|
||||
, **rs
|
||||
, max_features = 'auto') )
|
||||
, ('XGBoost' , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False) )
|
||||
, ('LDA' , LinearDiscriminantAnalysis() )
|
||||
, ('Multinomial' , MultinomialNB() )
|
||||
, ('Passive Aggresive' , PassiveAggressiveClassifier(**rs, **njobs) )
|
||||
, ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) )
|
||||
, ('AdaBoost Classifier' , AdaBoostClassifier(**rs) )
|
||||
, ('Bagging Classifier' , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) )
|
||||
, ('Gaussian Process' , GaussianProcessClassifier(**rs) )
|
||||
, ('Gradient Boosting' , GradientBoostingClassifier(**rs) )
|
||||
, ('QDA' , QuadraticDiscriminantAnalysis() )
|
||||
, ('Ridge Classifier' , RidgeClassifier(**rs) )
|
||||
, ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 10) )
|
||||
]
|
||||
|
||||
mm_skf_scoresD = {}
|
||||
|
@ -293,9 +293,9 @@ def MultModelsCl(input_df, target, skf_cv
|
|||
# Blind test: BTS results
|
||||
#=========================
|
||||
# Build the final results with all scores for the model
|
||||
#bts_predict = gscv_fs.predict(blind_test_input_df)
|
||||
#bts_predict = gscv_fs.predict(blind_test_df)
|
||||
model_pipeline.fit(input_df, target)
|
||||
bts_predict = model_pipeline.predict(blind_test_input_df)
|
||||
bts_predict = model_pipeline.predict(blind_test_df)
|
||||
|
||||
bts_mcc_score = round(matthews_corrcoef(blind_test_target, bts_predict),2)
|
||||
print('\nMCC on Blind test:' , bts_mcc_score)
|
||||
|
|
|
@ -1,203 +0,0 @@
|
|||
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Run MultModelsCl on the 70/30 train/test split for one gene/drug pair.

Created on Sat May 28 05:25:30 2022

@author: tanu

The script loads the prepared data from ``ml_data_7030``, prints sanity checks
on the data and the feature groups, then runs ``MultModelsCl`` on the original
training data and on each resampled training set (SMOTE NC, ROS, RUS,
ROS + RUS combined).  For every run the 'test_*' score columns and the
'bts_*' (blind test) score columns are written to separate CSV files.
"""

# Explicit imports for names this script uses directly.  They are placed before
# the star imports so that, if the data module exports the same names, those
# bindings still win and behaviour is unchanged.
import os
import sys
from collections import Counter

import pandas as pd

gene = 'alr'
drug = 'cycloserine'
#total_mtblineage_uc = 8

homedir = os.path.expanduser("~")
os.chdir(os.path.join(homedir, 'git/LSHTM_analysis/scripts/ml'))

# The data module is star-imported before and after setvars().
# NOTE(review): presumably setvars() creates the module-level data variables
# (X, y, X_bts, ...) that the second star import then pulls in - confirm
# against ml_data_7030 before collapsing these into one import.
from ml_data_7030 import *
setvars(gene, drug)
from ml_data_7030 import *

# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML

# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl

# Separator strings reused by the report below (kept byte-identical).
_HASH_RULE = '\n#####################################################################\n'
_EQ_RULE = '\n================================================================\n'


def _run_and_save(input_df, target, label):
    """Run MultModelsCl on one training set and write its score tables.

    Parameters
    ----------
    input_df, target
        Training features and labels passed straight to MultModelsCl.
    label
        Tag used in the output file names
        (``<gene>_<label>_CT_allF.csv`` and ``<gene>_<label>_BT_allF.csv``).

    Returns
    -------
    tuple
        ``(scores, all_scores, ct_scores, bt_scores)``: the raw MultModelsCl
        result, that result as a transposed DataFrame, and the 'test_*' and
        'bts_*' column subsets sorted by descending MCC.
    """
    # The first five arguments are passed positionally: the 4th parameter was
    # renamed from blind_test_input_df to blind_test_df, and its position did
    # not change, so a positional call works with either signature.
    scores = MultModelsCl(input_df, target, skf_cv, X_bts, y_bts
                          , var_type = 'mixed')

    all_scores = pd.DataFrame(scores).T

    # sort_values() returns a new frame; no in-place mutation of a filter()
    # result is needed.
    ct_scores = (all_scores.filter(like = 'test_', axis = 1)
                 .sort_values(by = ['test_mcc'], ascending = False))
    bt_scores = (all_scores.filter(like = 'bts_', axis = 1)
                 .sort_values(by = ['bts_mcc'], ascending = False))

    # Write csv
    out_prefix = outdir_ml + gene.lower() + '_' + label
    ct_scores.to_csv(out_prefix + '_CT_allF.csv')
    bt_scores.to_csv(out_prefix + '_BT_allF.csv')

    return scores, all_scores, ct_scores, bt_scores


################################################################################
print(_HASH_RULE
      , '\nRunning ML analysis: 70/30 split'
      , '\nGene name:', gene
      , '\nDrug name:', drug)

#==================
# Specify outdir
#==================
outdir_ml = outdir + 'ml/tts_7030/'

# Create the output directory up front so a missing folder cannot make
# to_csv() fail after the models have already been trained.
os.makedirs(outdir_ml, exist_ok = True)

print('\nOutput directory:', outdir_ml)

#%%###########################################################################
print('\nSanity checks:'
      , '\nTotal input features:', len(X.columns)
      , '\n'
      , '\nTraining data size:', X.shape
      , '\nTest data size:', X_bts.shape
      , '\n'
      , '\nTarget feature numbers (training data):', Counter(y)
      , '\nTarget features ratio (training data:', yc1_ratio
      , '\n'
      , '\nTarget feature numbers (test data):', Counter(y_bts)
      , '\nTarget features ratio (test data):', yc2_ratio
      , '\n' + _HASH_RULE)

print(_EQ_RULE)

print('Strucutral features (n):'
      , len(X_ssFN)
      , '\nThese are:'
      , '\nCommon stablity features:', X_stabilityN
      , '\nFoldX columns:', X_foldX_cols
      , '\nOther struc columns:', X_str
      , _EQ_RULE)

print('AAindex features (n):'
      , len(X_aaindexFN)
      , '\nThese are:\n'
      , X_aaindexFN
      , _EQ_RULE)

print('Evolutionary features (n):'
      , len(X_evolFN)
      , '\nThese are:\n'
      , X_evolFN
      , _EQ_RULE)

print('Genomic features (n):'
      , len(X_genomicFN)
      , '\nThese are:\n'
      , X_genomic_mafor, '\n'
      , X_genomic_linegae
      , _EQ_RULE)

print('Categorical features (n):'
      , len(categorical_FN)
      , '\nThese are:\n'
      , categorical_FN
      , _EQ_RULE)

# The feature groups must account for every input column; stop otherwise.
n_grouped_features = sum(len(group) for group in (X_ssFN
                                                  , X_aaindexFN
                                                  , X_evolFN
                                                  , X_genomicFN
                                                  , categorical_FN))
if len(X.columns) == n_grouped_features:
    print('\nPass: No. of features match')
else:
    sys.exit('\nFail: Count of feature mismatch')

print(_HASH_RULE)

################################################################################
#==================
# Baseline models
#==================
mm_skf_scoresD, baseline_all, baseline_CT, baseline_BT = _run_and_save(X, y, 'baseline')

#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7, smnc_all, smnc_CT, smnc_BT = _run_and_save(X_smnc, y_smnc, 'smnc')

#%% ROS: Numerical + categorical
mm_skf_scoresD3, ros_all, ros_CT, ros_BT = _run_and_save(X_ros, y_ros, 'ros')

#%% RUS: Numerical + categorical
mm_skf_scoresD4, rus_all, rus_CT, rus_BT = _run_and_save(X_rus, y_rus, 'rus')

#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8, rouC_all, rouC_CT, rouC_BT = _run_and_save(X_rouC, y_rouC, 'rouC')
|
@ -1,203 +0,0 @@
|
|||
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Run MultModelsCl on the 80/20 train/test split for one gene/drug pair.

Created on Sat May 28 05:25:30 2022

@author: tanu

The script loads the prepared data from ``ml_data_8020``, prints sanity checks
on the data and the feature groups, then runs ``MultModelsCl`` on the original
training data and on each resampled training set (SMOTE NC, ROS, RUS,
ROS + RUS combined).  For every run the 'test_*' score columns and the
'bts_*' (blind test) score columns are written to separate CSV files.
"""

# Explicit imports for names this script uses directly.  They are placed before
# the star imports so that, if the data module exports the same names, those
# bindings still win and behaviour is unchanged.
import os
import sys
from collections import Counter

import pandas as pd

gene = 'alr'
drug = 'cycloserine'
#total_mtblineage_uc = 8

homedir = os.path.expanduser("~")
os.chdir(os.path.join(homedir, 'git/LSHTM_analysis/scripts/ml'))

# The data module is star-imported before and after setvars().
# NOTE(review): presumably setvars() creates the module-level data variables
# (X, y, X_bts, ...) that the second star import then pulls in - confirm
# against ml_data_8020 before collapsing these into one import.
from ml_data_8020 import *
setvars(gene, drug)
from ml_data_8020 import *

# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML

# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl

# Separator strings reused by the report below (kept byte-identical).
_HASH_RULE = '\n#####################################################################\n'
_EQ_RULE = '\n================================================================\n'


def _run_and_save(input_df, target, label):
    """Run MultModelsCl on one training set and write its score tables.

    Parameters
    ----------
    input_df, target
        Training features and labels passed straight to MultModelsCl.
    label
        Tag used in the output file names
        (``<gene>_<label>_CT_allF.csv`` and ``<gene>_<label>_BT_allF.csv``).

    Returns
    -------
    tuple
        ``(scores, all_scores, ct_scores, bt_scores)``: the raw MultModelsCl
        result, that result as a transposed DataFrame, and the 'test_*' and
        'bts_*' column subsets sorted by descending MCC.
    """
    # The first five arguments are passed positionally: the 4th parameter was
    # renamed from blind_test_input_df to blind_test_df, and its position did
    # not change, so a positional call works with either signature.
    scores = MultModelsCl(input_df, target, skf_cv, X_bts, y_bts
                          , var_type = 'mixed')

    all_scores = pd.DataFrame(scores).T

    # sort_values() returns a new frame; no in-place mutation of a filter()
    # result is needed.
    ct_scores = (all_scores.filter(like = 'test_', axis = 1)
                 .sort_values(by = ['test_mcc'], ascending = False))
    bt_scores = (all_scores.filter(like = 'bts_', axis = 1)
                 .sort_values(by = ['bts_mcc'], ascending = False))

    # Write csv
    out_prefix = outdir_ml + gene.lower() + '_' + label
    ct_scores.to_csv(out_prefix + '_CT_allF.csv')
    bt_scores.to_csv(out_prefix + '_BT_allF.csv')

    return scores, all_scores, ct_scores, bt_scores


################################################################################
print(_HASH_RULE
      , '\nRunning ML analysis: 80/20 split'
      , '\nGene name:', gene
      , '\nDrug name:', drug)

#==================
# Specify outdir
#==================
outdir_ml = outdir + 'ml/tts_8020/'

# Create the output directory up front so a missing folder cannot make
# to_csv() fail after the models have already been trained.
os.makedirs(outdir_ml, exist_ok = True)

print('\nOutput directory:', outdir_ml)

#%%###########################################################################
print('\nSanity checks:'
      , '\nTotal input features:', len(X.columns)
      , '\n'
      , '\nTraining data size:', X.shape
      , '\nTest data size:', X_bts.shape
      , '\n'
      , '\nTarget feature numbers (training data):', Counter(y)
      , '\nTarget features ratio (training data:', yc1_ratio
      , '\n'
      , '\nTarget feature numbers (test data):', Counter(y_bts)
      , '\nTarget features ratio (test data):', yc2_ratio
      , '\n' + _HASH_RULE)

print(_EQ_RULE)

print('Strucutral features (n):'
      , len(X_ssFN)
      , '\nThese are:'
      , '\nCommon stablity features:', X_stabilityN
      , '\nFoldX columns:', X_foldX_cols
      , '\nOther struc columns:', X_str
      , _EQ_RULE)

print('AAindex features (n):'
      , len(X_aaindexFN)
      , '\nThese are:\n'
      , X_aaindexFN
      , _EQ_RULE)

print('Evolutionary features (n):'
      , len(X_evolFN)
      , '\nThese are:\n'
      , X_evolFN
      , _EQ_RULE)

print('Genomic features (n):'
      , len(X_genomicFN)
      , '\nThese are:\n'
      , X_genomic_mafor, '\n'
      , X_genomic_linegae
      , _EQ_RULE)

print('Categorical features (n):'
      , len(categorical_FN)
      , '\nThese are:\n'
      , categorical_FN
      , _EQ_RULE)

# The feature groups must account for every input column; stop otherwise.
n_grouped_features = sum(len(group) for group in (X_ssFN
                                                  , X_aaindexFN
                                                  , X_evolFN
                                                  , X_genomicFN
                                                  , categorical_FN))
if len(X.columns) == n_grouped_features:
    print('\nPass: No. of features match')
else:
    sys.exit('\nFail: Count of feature mismatch')

print(_HASH_RULE)

################################################################################
#==================
# Baseline models
#==================
mm_skf_scoresD, baseline_all, baseline_CT, baseline_BT = _run_and_save(X, y, 'baseline')

#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7, smnc_all, smnc_CT, smnc_BT = _run_and_save(X_smnc, y_smnc, 'smnc')

#%% ROS: Numerical + categorical
mm_skf_scoresD3, ros_all, ros_CT, ros_BT = _run_and_save(X_ros, y_ros, 'ros')

#%% RUS: Numerical + categorical
mm_skf_scoresD4, rus_all, rus_CT, rus_BT = _run_and_save(X_rus, y_rus, 'rus')

#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8, rouC_all, rouC_CT, rouC_BT = _run_and_save(X_rouC, y_rouC, 'rouC')
|
|
@ -1,203 +0,0 @@
|
|||
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Run MultModelsCl on the complete-data 70/30 split for one gene/drug pair.

Created on Sat May 28 05:25:30 2022

@author: tanu

The script loads the prepared data from ``ml_data_cd_7030``, prints sanity
checks on the data and the feature groups, then runs ``MultModelsCl`` on the
original training data and on each resampled training set (SMOTE NC, ROS, RUS,
ROS + RUS combined).  For every run the 'test_*' score columns and the
'bts_*' (blind test) score columns are written to separate CSV files.
"""

# Explicit imports for names this script uses directly.  They are placed before
# the star imports so that, if the data module exports the same names, those
# bindings still win and behaviour is unchanged.
import os
import sys
from collections import Counter

import pandas as pd

gene = 'alr'
drug = 'cycloserine'
#total_mtblineage_uc = 8

homedir = os.path.expanduser("~")
os.chdir(os.path.join(homedir, 'git/LSHTM_analysis/scripts/ml'))

# The data module is star-imported before and after setvars().
# NOTE(review): presumably setvars() creates the module-level data variables
# (X, y, X_bts, ...) that the second star import then pulls in - confirm
# against ml_data_cd_7030 before collapsing these into one import.
from ml_data_cd_7030 import *
setvars(gene, drug)
from ml_data_cd_7030 import *

# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML

# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl

# Separator strings reused by the report below (kept byte-identical).
_HASH_RULE = '\n#####################################################################\n'
_EQ_RULE = '\n================================================================\n'


def _run_and_save(input_df, target, label):
    """Run MultModelsCl on one training set and write its score tables.

    Parameters
    ----------
    input_df, target
        Training features and labels passed straight to MultModelsCl.
    label
        Tag used in the output file names
        (``<gene>_<label>_CT_allF.csv`` and ``<gene>_<label>_BT_allF.csv``).

    Returns
    -------
    tuple
        ``(scores, all_scores, ct_scores, bt_scores)``: the raw MultModelsCl
        result, that result as a transposed DataFrame, and the 'test_*' and
        'bts_*' column subsets sorted by descending MCC.
    """
    # The first five arguments are passed positionally: the 4th parameter was
    # renamed from blind_test_input_df to blind_test_df, and its position did
    # not change, so a positional call works with either signature.
    scores = MultModelsCl(input_df, target, skf_cv, X_bts, y_bts
                          , var_type = 'mixed')

    all_scores = pd.DataFrame(scores).T

    # sort_values() returns a new frame; no in-place mutation of a filter()
    # result is needed.
    ct_scores = (all_scores.filter(like = 'test_', axis = 1)
                 .sort_values(by = ['test_mcc'], ascending = False))
    bt_scores = (all_scores.filter(like = 'bts_', axis = 1)
                 .sort_values(by = ['bts_mcc'], ascending = False))

    # Write csv
    out_prefix = outdir_ml + gene.lower() + '_' + label
    ct_scores.to_csv(out_prefix + '_CT_allF.csv')
    bt_scores.to_csv(out_prefix + '_BT_allF.csv')

    return scores, all_scores, ct_scores, bt_scores


################################################################################
print(_HASH_RULE
      , '\nRunning ML analysis [COMPLETE DATA]: 70/30 split'
      , '\nGene name:', gene
      , '\nDrug name:', drug)

#==================
# Specify outdir
#==================
outdir_ml = outdir + 'ml/tts_cd_7030/'

# Create the output directory up front so a missing folder cannot make
# to_csv() fail after the models have already been trained.
os.makedirs(outdir_ml, exist_ok = True)

print('\nOutput directory:', outdir_ml)

#%%###########################################################################
print('\nSanity checks:'
      , '\nTotal input features:', len(X.columns)
      , '\n'
      , '\nTraining data size:', X.shape
      , '\nTest data size:', X_bts.shape
      , '\n'
      , '\nTarget feature numbers (training data):', Counter(y)
      , '\nTarget features ratio (training data:', yc1_ratio
      , '\n'
      , '\nTarget feature numbers (test data):', Counter(y_bts)
      , '\nTarget features ratio (test data):', yc2_ratio
      , '\n' + _HASH_RULE)

print(_EQ_RULE)

print('Strucutral features (n):'
      , len(X_ssFN)
      , '\nThese are:'
      , '\nCommon stablity features:', X_stabilityN
      , '\nFoldX columns:', X_foldX_cols
      , '\nOther struc columns:', X_str
      , _EQ_RULE)

print('AAindex features (n):'
      , len(X_aaindexFN)
      , '\nThese are:\n'
      , X_aaindexFN
      , _EQ_RULE)

print('Evolutionary features (n):'
      , len(X_evolFN)
      , '\nThese are:\n'
      , X_evolFN
      , _EQ_RULE)

print('Genomic features (n):'
      , len(X_genomicFN)
      , '\nThese are:\n'
      , X_genomic_mafor, '\n'
      , X_genomic_linegae
      , _EQ_RULE)

print('Categorical features (n):'
      , len(categorical_FN)
      , '\nThese are:\n'
      , categorical_FN
      , _EQ_RULE)

# The feature groups must account for every input column; stop otherwise.
n_grouped_features = sum(len(group) for group in (X_ssFN
                                                  , X_aaindexFN
                                                  , X_evolFN
                                                  , X_genomicFN
                                                  , categorical_FN))
if len(X.columns) == n_grouped_features:
    print('\nPass: No. of features match')
else:
    sys.exit('\nFail: Count of feature mismatch')

print(_HASH_RULE)

################################################################################
#==================
# Baseline models
#==================
mm_skf_scoresD, baseline_all, baseline_CT, baseline_BT = _run_and_save(X, y, 'baseline')

#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7, smnc_all, smnc_CT, smnc_BT = _run_and_save(X_smnc, y_smnc, 'smnc')

#%% ROS: Numerical + categorical
mm_skf_scoresD3, ros_all, ros_CT, ros_BT = _run_and_save(X_ros, y_ros, 'ros')

#%% RUS: Numerical + categorical
mm_skf_scoresD4, rus_all, rus_CT, rus_BT = _run_and_save(X_rus, y_rus, 'rus')

#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8, rouC_all, rouC_CT, rouC_BT = _run_and_save(X_rouC, y_rouC, 'rouC')
|
|
@ -1,203 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Sat May 28 05:25:30 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
gene = 'alr'
|
||||
drug = 'cycloserine'
|
||||
#total_mtblineage_uc = 8
|
||||
|
||||
homedir = os.path.expanduser("~")
|
||||
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
|
||||
|
||||
from ml_data_cd_8020 import *
|
||||
setvars(gene,drug)
|
||||
from ml_data_cd_8020 import *
|
||||
|
||||
# from YC run_all_ML: run locally
|
||||
#from UQ_yc_RunAllClfs import run_all_ML
|
||||
|
||||
# TT run all ML clfs: baseline mode
|
||||
from MultModelsCl import MultModelsCl
|
||||
|
||||
################################################################################
|
||||
print('\n#####################################################################\n'
|
||||
, '\nRunning ML analysis [COMPLETE DATA]: 80/20 split'
|
||||
, '\nGene name:', gene
|
||||
, '\nDrug name:', drug)
|
||||
|
||||
#==================
|
||||
# Specify outdir
|
||||
#==================
|
||||
|
||||
outdir_ml = outdir + 'ml/tts_cd_8020/'
|
||||
|
||||
print('\nOutput directory:', outdir_ml)
|
||||
|
||||
#%%###########################################################################
|
||||
print('\nSanity checks:'
|
||||
#, '\nML source data size:', x_features.shape
|
||||
, '\nTotal input features:', len(X.columns)
|
||||
, '\n'
|
||||
, '\nTraining data size:', X.shape
|
||||
, '\nTest data size:', X_bts.shape
|
||||
, '\n'
|
||||
, '\nTarget feature numbers (training data):', Counter(y)
|
||||
, '\nTarget features ratio (training data:', yc1_ratio
|
||||
, '\n'
|
||||
, '\nTarget feature numbers (test data):', Counter(y_bts)
|
||||
, '\nTarget features ratio (test data):', yc2_ratio
|
||||
|
||||
, '\n\n#####################################################################\n')
|
||||
|
||||
print('\n================================================================\n')
|
||||
|
||||
print('Strucutral features (n):'
|
||||
, len(X_ssFN)
|
||||
, '\nThese are:'
|
||||
, '\nCommon stablity features:', X_stabilityN
|
||||
, '\nFoldX columns:', X_foldX_cols
|
||||
, '\nOther struc columns:', X_str
|
||||
, '\n================================================================\n')
|
||||
|
||||
print('AAindex features (n):'
|
||||
, len(X_aaindexFN)
|
||||
, '\nThese are:\n'
|
||||
, X_aaindexFN
|
||||
, '\n================================================================\n')
|
||||
|
||||
print('Evolutionary features (n):'
|
||||
, len(X_evolFN)
|
||||
, '\nThese are:\n'
|
||||
, X_evolFN
|
||||
, '\n================================================================\n')
|
||||
|
||||
print('Genomic features (n):'
|
||||
, len(X_genomicFN)
|
||||
, '\nThese are:\n'
|
||||
, X_genomic_mafor, '\n'
|
||||
, X_genomic_linegae
|
||||
, '\n================================================================\n')
|
||||
|
||||
print('Categorical features (n):'
|
||||
, len(categorical_FN)
|
||||
, '\nThese are:\n'
|
||||
, categorical_FN
|
||||
, '\n================================================================\n')
|
||||
|
||||
if ( len(X.columns) == len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
|
||||
print('\nPass: No. of features match')
|
||||
else:
|
||||
sys.exit('\nFail: Count of feature mismatch')
|
||||
|
||||
print('\n#####################################################################\n')
|
||||
|
||||
################################################################################
|
||||
#==================
|
||||
# Baseline models
|
||||
#==================
|
||||
def _run_models_and_save(train_df, train_target, tag):
    """Run MultModelsCl on one training set and write its two score tables.

    Parameters
    ----------
    train_df, train_target
        Training features and labels passed to MultModelsCl.
    tag : str
        Label embedded in the output file names (e.g. 'baseline', 'ros').

    Returns
    -------
    tuple
        (object returned by MultModelsCl,
         that object as a transposed DataFrame,
         its 'test_*' columns sorted by test_mcc, descending,
         its 'bts_*' columns sorted by bts_mcc, descending)
    """
    scoresD = MultModelsCl(input_df = train_df
                           , target = train_target
                           , var_type = 'mixed'
                           , skf_cv = skf_cv
                           # keyword matches the renamed MultModelsCl parameter
                           # (was blind_test_input_df)
                           , blind_test_df = X_bts
                           , blind_test_target = y_bts)

    all_df = pd.DataFrame(scoresD).T

    # sort_values returns a new frame; the filter() result is not sorted in
    # place, which can raise pandas' SettingWithCopyWarning.
    ct_df = all_df.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
    bt_df = all_df.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

    # Write csv
    out_prefix = outdir_ml + gene.lower() + '_' + tag
    ct_df.to_csv(out_prefix + '_CT_allF.csv')
    bt_df.to_csv(out_prefix + '_BT_allF.csv')

    return scoresD, all_df, ct_df, bt_df


# Baseline: training data as provided (X, y)
mm_skf_scoresD, baseline_all, baseline_CT, baseline_BT = _run_models_and_save(X, y, 'baseline')

#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7, smnc_all, smnc_CT, smnc_BT = _run_models_and_save(X_smnc, y_smnc, 'smnc')

#%% ROS: Numerical + categorical
mm_skf_scoresD3, ros_all, ros_CT, ros_BT = _run_models_and_save(X_ros, y_ros, 'ros')

#%% RUS: Numerical + categorical
mm_skf_scoresD4, rus_all, rus_CT, rus_BT = _run_models_and_save(X_rus, y_rus, 'rus')

#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8, rouC_all, rouC_CT, rouC_BT = _run_models_and_save(X_rouC, y_rouC, 'rouC')
|
|
@ -1,203 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Sat May 28 05:25:30 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
gene = 'alr'
|
||||
drug = 'cycloserine'
|
||||
#total_mtblineage_uc = 8
|
||||
|
||||
homedir = os.path.expanduser("~")
|
||||
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
|
||||
|
||||
from ml_data_cd_sl import *
|
||||
# setvars is not defined in this script; it is expected to arrive via the star
# import just above. The same module is star-imported again right after this
# call, presumably to re-bind the names that setvars(gene, drug) populates
# -- TODO confirm against the data module.
setvars(gene,drug)
|
||||
from ml_data_cd_sl import *
|
||||
|
||||
# from YC run_all_ML: run locally
|
||||
#from UQ_yc_RunAllClfs import run_all_ML
|
||||
|
||||
# TT run all ML clfs: baseline mode
|
||||
from MultModelsCl import MultModelsCl
|
||||
|
||||
################################################################################
|
||||
print('\n#####################################################################\n'
|
||||
, '\nRunning ML analysis [COMPLETE DATA]: 70/30 split'
|
||||
, '\nGene name:', gene
|
||||
, '\nDrug name:', drug)
|
||||
|
||||
#==================
|
||||
# Specify outdir
|
||||
#==================
|
||||
|
||||
outdir_ml = outdir + 'ml/tts_cd_sl/'
|
||||
|
||||
print('\nOutput directory:', outdir_ml)
|
||||
|
||||
#%%###########################################################################
|
||||
print('\nSanity checks:'
|
||||
#, '\nML source data size:', x_features.shape
|
||||
, '\nTotal input features:', len(X.columns)
|
||||
, '\n'
|
||||
, '\nTraining data size:', X.shape
|
||||
, '\nTest data size:', X_bts.shape
|
||||
, '\n'
|
||||
, '\nTarget feature numbers (training data):', Counter(y)
|
||||
, '\nTarget features ratio (training data:', yc1_ratio
|
||||
, '\n'
|
||||
, '\nTarget feature numbers (test data):', Counter(y_bts)
|
||||
, '\nTarget features ratio (test data):', yc2_ratio
|
||||
|
||||
, '\n\n#####################################################################\n')
|
||||
|
||||
print('\n================================================================\n')
|
||||
|
||||
print('Strucutral features (n):'
|
||||
, len(X_ssFN)
|
||||
, '\nThese are:'
|
||||
, '\nCommon stablity features:', X_stabilityN
|
||||
, '\nFoldX columns:', X_foldX_cols
|
||||
, '\nOther struc columns:', X_str
|
||||
, '\n================================================================\n')
|
||||
|
||||
print('AAindex features (n):'
|
||||
, len(X_aaindexFN)
|
||||
, '\nThese are:\n'
|
||||
, X_aaindexFN
|
||||
, '\n================================================================\n')
|
||||
|
||||
print('Evolutionary features (n):'
|
||||
, len(X_evolFN)
|
||||
, '\nThese are:\n'
|
||||
, X_evolFN
|
||||
, '\n================================================================\n')
|
||||
|
||||
print('Genomic features (n):'
|
||||
, len(X_genomicFN)
|
||||
, '\nThese are:\n'
|
||||
, X_genomic_mafor, '\n'
|
||||
, X_genomic_linegae
|
||||
, '\n================================================================\n')
|
||||
|
||||
print('Categorical features (n):'
|
||||
, len(categorical_FN)
|
||||
, '\nThese are:\n'
|
||||
, categorical_FN
|
||||
, '\n================================================================\n')
|
||||
|
||||
# Sanity check: the number of columns in X must equal the summed sizes of the
# feature-name groups printed above; otherwise stop the script.
import sys  # imported here so sys.exit does not depend on a star import leaking it

n_expected_features = (len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN)
                       + len(X_genomicFN) + len(categorical_FN))

if len(X.columns) == n_expected_features:
    print('\nPass: No. of features match')
else:
    # Report both counts so the mismatch can be diagnosed from the exit message.
    sys.exit('\nFail: Count of feature mismatch'
             + '\nSum of feature groups: ' + str(n_expected_features)
             + '\nColumns in X: ' + str(len(X.columns)))
|
||||
|
||||
print('\n#####################################################################\n')
|
||||
|
||||
################################################################################
|
||||
#==================
|
||||
# Baseline models
|
||||
#==================
|
||||
def _run_models_and_save(train_df, train_target, tag):
    """Run MultModelsCl on one training set and write its two score tables.

    Parameters
    ----------
    train_df, train_target
        Training features and labels passed to MultModelsCl.
    tag : str
        Label embedded in the output file names (e.g. 'baseline', 'ros').

    Returns
    -------
    tuple
        (object returned by MultModelsCl,
         that object as a transposed DataFrame,
         its 'test_*' columns sorted by test_mcc, descending,
         its 'bts_*' columns sorted by bts_mcc, descending)
    """
    scoresD = MultModelsCl(input_df = train_df
                           , target = train_target
                           , var_type = 'mixed'
                           , skf_cv = skf_cv
                           # keyword matches the renamed MultModelsCl parameter
                           # (was blind_test_input_df)
                           , blind_test_df = X_bts
                           , blind_test_target = y_bts)

    all_df = pd.DataFrame(scoresD).T

    # sort_values returns a new frame; the filter() result is not sorted in
    # place, which can raise pandas' SettingWithCopyWarning.
    ct_df = all_df.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
    bt_df = all_df.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

    # Write csv
    out_prefix = outdir_ml + gene.lower() + '_' + tag
    ct_df.to_csv(out_prefix + '_CT_allF.csv')
    bt_df.to_csv(out_prefix + '_BT_allF.csv')

    return scoresD, all_df, ct_df, bt_df


# Baseline: training data as provided (X, y)
mm_skf_scoresD, baseline_all, baseline_CT, baseline_BT = _run_models_and_save(X, y, 'baseline')

#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7, smnc_all, smnc_CT, smnc_BT = _run_models_and_save(X_smnc, y_smnc, 'smnc')

#%% ROS: Numerical + categorical
mm_skf_scoresD3, ros_all, ros_CT, ros_BT = _run_models_and_save(X_ros, y_ros, 'ros')

#%% RUS: Numerical + categorical
mm_skf_scoresD4, rus_all, rus_CT, rus_BT = _run_models_and_save(X_rus, y_rus, 'rus')

#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8, rouC_all, rouC_CT, rouC_BT = _run_models_and_save(X_rouC, y_rouC, 'rouC')
|
|
@ -1,210 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Sat May 28 05:25:30 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
gene = 'alr'
|
||||
drug = 'cycloserine'
|
||||
#total_mtblineage_uc = 8
|
||||
|
||||
homedir = os.path.expanduser("~")
|
||||
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
|
||||
|
||||
#---------------------------
|
||||
# Version 1: no AAindex
|
||||
#from UQ_ML_data import *
|
||||
#setvars(gene,drug)
|
||||
#from UQ_ML_data import *
|
||||
#---------------------------
|
||||
|
||||
from ml_data import *
|
||||
# setvars is not defined in this script; it is expected to arrive via the star
# import just above. The same module is star-imported again right after this
# call, presumably to re-bind the names that setvars(gene, drug) populates
# -- TODO confirm against the data module.
setvars(gene,drug)
|
||||
from ml_data import *
|
||||
|
||||
# from YC run_all_ML: run locally
|
||||
#from UQ_yc_RunAllClfs import run_all_ML
|
||||
|
||||
# TT run all ML clfs: baseline mode
|
||||
from MultModelsCl import MultModelsCl
|
||||
|
||||
############################################################################
|
||||
print('\n#####################################################################\n'
|
||||
, '\nRunning ML analysis: UQ [without AA index but with active site annotations]'
|
||||
, '\nGene name:', gene
|
||||
, '\nDrug name:', drug)
|
||||
|
||||
#==================
|
||||
# Specify outdir
|
||||
#==================
|
||||
|
||||
outdir_ml = outdir + 'ml/uq_v1/'
|
||||
|
||||
print('\nOutput directory:', outdir_ml)
|
||||
|
||||
#%%###########################################################################
|
||||
print('\nSanity checks:'
|
||||
, '\nTotal input features:', len(X.columns)
|
||||
, '\n'
|
||||
, '\nTraining data size:', X.shape
|
||||
, '\nTest data size:', X_bts.shape
|
||||
, '\n'
|
||||
, '\nTarget feature numbers (training data):', Counter(y)
|
||||
, '\nTarget features ratio (training data:', yc1_ratio
|
||||
, '\n'
|
||||
, '\nTarget feature numbers (test data):', Counter(y_bts)
|
||||
, '\nTarget features ratio (test data):', yc2_ratio
|
||||
|
||||
, '\n\n#####################################################################\n')
|
||||
|
||||
print('\n================================================================\n')
|
||||
|
||||
print('Strucutral features (n):'
|
||||
, len(X_ssFN)
|
||||
, '\nThese are:'
|
||||
, '\nCommon stablity features:', X_stabilityN
|
||||
, '\nFoldX columns:', X_foldX_cols
|
||||
, '\nOther struc columns:', X_str
|
||||
, '\n================================================================\n')
|
||||
|
||||
# print('AAindex features (n):'
|
||||
# , len(X_aaindexFN)
|
||||
# , '\nThese are:\n'
|
||||
# , X_aaindexFN
|
||||
# , '\n================================================================\n')
|
||||
|
||||
print('Evolutionary features (n):'
|
||||
, len(X_evolFN)
|
||||
, '\nThese are:\n'
|
||||
, X_evolFN
|
||||
, '\n================================================================\n')
|
||||
|
||||
print('Genomic features (n):'
|
||||
, len(X_genomicFN)
|
||||
, '\nThese are:\n'
|
||||
, X_genomic_mafor, '\n'
|
||||
, X_genomic_linegae
|
||||
, '\n================================================================\n')
|
||||
|
||||
print('Categorical features (n):'
|
||||
, len(categorical_FN)
|
||||
, '\nThese are:\n'
|
||||
, categorical_FN
|
||||
, '\n================================================================\n')
|
||||
|
||||
#if ( len(X.columns) == len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
|
||||
# Sanity check: the number of columns in X must equal the summed sizes of the
# feature-name groups used by this variant (AAindex group is not counted here);
# otherwise stop the script.
import sys  # imported here so sys.exit does not depend on a star import leaking it

n_expected_features = (len(X_ssFN) + len(X_evolFN)
                       + len(X_genomicFN) + len(categorical_FN))

if len(X.columns) == n_expected_features:
    print('\nPass: No. of features match')
else:
    # Report both counts so the mismatch can be diagnosed from the exit message.
    sys.exit('\nFail: Count of feature mismatch'
             + '\nSum of feature groups: ' + str(n_expected_features)
             + '\nColumns in X: ' + str(len(X.columns)))
|
||||
|
||||
print('\n#####################################################################\n')
|
||||
|
||||
###############################################################################
|
||||
#==================
|
||||
# Baseline models
|
||||
#==================
|
||||
def _run_models_and_save(train_df, train_target, tag):
    """Run MultModelsCl on one training set and write its two score tables.

    Parameters
    ----------
    train_df, train_target
        Training features and labels passed to MultModelsCl.
    tag : str
        Label embedded in the output file names (e.g. 'baseline', 'ros').

    Returns
    -------
    tuple
        (object returned by MultModelsCl,
         that object as a transposed DataFrame,
         its 'test_*' columns sorted by test_mcc, descending,
         its 'bts_*' columns sorted by bts_mcc, descending)
    """
    scoresD = MultModelsCl(input_df = train_df
                           , target = train_target
                           , var_type = 'mixed'
                           , skf_cv = skf_cv
                           # keyword matches the renamed MultModelsCl parameter
                           # (was blind_test_input_df)
                           , blind_test_df = X_bts
                           , blind_test_target = y_bts)

    all_df = pd.DataFrame(scoresD).T

    # sort_values returns a new frame; the filter() result is not sorted in
    # place, which can raise pandas' SettingWithCopyWarning.
    ct_df = all_df.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
    bt_df = all_df.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

    # Write csv
    out_prefix = outdir_ml + gene.lower() + '_' + tag
    ct_df.to_csv(out_prefix + '_CT_allF.csv')
    bt_df.to_csv(out_prefix + '_BT_allF.csv')

    return scoresD, all_df, ct_df, bt_df


# Baseline: training data as provided (X, y)
mm_skf_scoresD, baseline_all, baseline_CT, baseline_BT = _run_models_and_save(X, y, 'baseline')

#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7, smnc_all, smnc_CT, smnc_BT = _run_models_and_save(X_smnc, y_smnc, 'smnc')

#%% ROS: Numerical + categorical
mm_skf_scoresD3, ros_all, ros_CT, ros_BT = _run_models_and_save(X_ros, y_ros, 'ros')

#%% RUS: Numerical + categorical
mm_skf_scoresD4, rus_all, rus_CT, rus_BT = _run_models_and_save(X_rus, y_rus, 'rus')

#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8, rouC_all, rouC_CT, rouC_BT = _run_models_and_save(X_rouC, y_rouC, 'rouC')
|
|
@ -1,202 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Sat May 28 05:25:30 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
gene = 'alr'
|
||||
drug = 'cycloserine'
|
||||
#total_mtblineage_uc = 8
|
||||
|
||||
homedir = os.path.expanduser("~")
|
||||
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
|
||||
|
||||
from ml_data_orig import *
|
||||
# setvars is not defined in this script; it is expected to arrive via the star
# import just above. The same module is star-imported again right after this
# call, presumably to re-bind the names that setvars(gene, drug) populates
# -- TODO confirm against the data module.
setvars(gene,drug)
|
||||
from ml_data_orig import *
|
||||
|
||||
# from YC run_all_ML: run locally
|
||||
#from UQ_yc_RunAllClfs import run_all_ML
|
||||
|
||||
# TT run all ML clfs: baseline mode
|
||||
from MultModelsCl import MultModelsCl
|
||||
|
||||
############################################################################
|
||||
print('\n#####################################################################\n'
|
||||
, '\nRunning ML analysis: ORIGINAL'
|
||||
, '\nGene name:', gene
|
||||
, '\nDrug name:', drug)
|
||||
|
||||
#==================
|
||||
# Specify outdir
|
||||
#==================
|
||||
|
||||
outdir_ml = outdir + 'ml/tts_orig/'
|
||||
|
||||
print('\nOutput directory:', outdir_ml)
|
||||
|
||||
#%%###########################################################################
|
||||
print('\nSanity checks:'
|
||||
, '\nTotal input features:', len(X.columns)
|
||||
, '\n'
|
||||
, '\nTraining data size:', X.shape
|
||||
, '\nTest data size:', X_bts.shape
|
||||
, '\n'
|
||||
, '\nTarget feature numbers (training data):', Counter(y)
|
||||
, '\nTarget features ratio (training data:', yc1_ratio
|
||||
, '\n'
|
||||
, '\nTarget feature numbers (test data):', Counter(y_bts)
|
||||
, '\nTarget features ratio (test data):', yc2_ratio
|
||||
|
||||
, '\n\n#####################################################################\n')
|
||||
|
||||
print('\n================================================================\n')
|
||||
|
||||
print('Strucutral features (n):'
|
||||
, len(X_ssFN)
|
||||
, '\nThese are:'
|
||||
, '\nCommon stablity features:', X_stabilityN
|
||||
, '\nFoldX columns:', X_foldX_cols
|
||||
, '\nOther struc columns:', X_str
|
||||
, '\n================================================================\n')
|
||||
|
||||
print('AAindex features (n):'
|
||||
, len(X_aaindexFN)
|
||||
, '\nThese are:\n'
|
||||
, X_aaindexFN
|
||||
, '\n================================================================\n')
|
||||
|
||||
print('Evolutionary features (n):'
|
||||
, len(X_evolFN)
|
||||
, '\nThese are:\n'
|
||||
, X_evolFN
|
||||
, '\n================================================================\n')
|
||||
|
||||
print('Genomic features (n):'
|
||||
, len(X_genomicFN)
|
||||
, '\nThese are:\n'
|
||||
, X_genomic_mafor, '\n'
|
||||
, X_genomic_linegae
|
||||
, '\n================================================================\n')
|
||||
|
||||
print('Categorical features (n):'
|
||||
, len(categorical_FN)
|
||||
, '\nThese are:\n'
|
||||
, categorical_FN
|
||||
, '\n================================================================\n')
|
||||
|
||||
# Sanity check: the number of columns in X must equal the summed sizes of the
# feature-name groups printed above; otherwise stop the script.
import sys  # imported here so sys.exit does not depend on a star import leaking it

n_expected_features = (len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN)
                       + len(X_genomicFN) + len(categorical_FN))

if len(X.columns) == n_expected_features:
    print('\nPass: No. of features match')
else:
    # Report both counts so the mismatch can be diagnosed from the exit message.
    sys.exit('\nFail: Count of feature mismatch'
             + '\nSum of feature groups: ' + str(n_expected_features)
             + '\nColumns in X: ' + str(len(X.columns)))
|
||||
|
||||
print('\n#####################################################################\n')
|
||||
|
||||
###############################################################################
|
||||
#==================
|
||||
# Baseline models
|
||||
#==================
|
||||
def _run_models_and_save(train_df, train_target, tag):
    """Run MultModelsCl on one training set and write its two score tables.

    Parameters
    ----------
    train_df, train_target
        Training features and labels passed to MultModelsCl.
    tag : str
        Label embedded in the output file names (e.g. 'baseline', 'ros').

    Returns
    -------
    tuple
        (object returned by MultModelsCl,
         that object as a transposed DataFrame,
         its 'test_*' columns sorted by test_mcc, descending,
         its 'bts_*' columns sorted by bts_mcc, descending)
    """
    scoresD = MultModelsCl(input_df = train_df
                           , target = train_target
                           , var_type = 'mixed'
                           , skf_cv = skf_cv
                           # keyword matches the renamed MultModelsCl parameter
                           # (was blind_test_input_df)
                           , blind_test_df = X_bts
                           , blind_test_target = y_bts)

    all_df = pd.DataFrame(scoresD).T

    # sort_values returns a new frame; the filter() result is not sorted in
    # place, which can raise pandas' SettingWithCopyWarning.
    ct_df = all_df.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
    bt_df = all_df.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

    # Write csv
    out_prefix = outdir_ml + gene.lower() + '_' + tag
    ct_df.to_csv(out_prefix + '_CT_allF.csv')
    bt_df.to_csv(out_prefix + '_BT_allF.csv')

    return scoresD, all_df, ct_df, bt_df


# Baseline: training data as provided (X, y)
mm_skf_scoresD, baseline_all, baseline_CT, baseline_BT = _run_models_and_save(X, y, 'baseline')

#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7, smnc_all, smnc_CT, smnc_BT = _run_models_and_save(X_smnc, y_smnc, 'smnc')

#%% ROS: Numerical + categorical
mm_skf_scoresD3, ros_all, ros_CT, ros_BT = _run_models_and_save(X_ros, y_ros, 'ros')

#%% RUS: Numerical + categorical
mm_skf_scoresD4, rus_all, rus_CT, rus_BT = _run_models_and_save(X_rus, y_rus, 'rus')

#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8, rouC_all, rouC_CT, rouC_BT = _run_models_and_save(X_rouC, y_rouC, 'rouC')
|
|
@ -1,202 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Sat May 28 05:25:30 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
gene = 'alr'
|
||||
drug = 'cycloserine'
|
||||
#total_mtblineage_uc = 8
|
||||
|
||||
homedir = os.path.expanduser("~")
|
||||
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
|
||||
|
||||
from ml_data_rt import *
|
||||
# setvars is not defined in this script; it is expected to arrive via the star
# import just above. The same module is star-imported again right after this
# call, presumably to re-bind the names that setvars(gene, drug) populates
# -- TODO confirm against the data module.
setvars(gene,drug)
|
||||
from ml_data_rt import *
|
||||
|
||||
# from YC run_all_ML: run locally
|
||||
#from UQ_yc_RunAllClfs import run_all_ML
|
||||
|
||||
# TT run all ML clfs: baseline mode
|
||||
from MultModelsCl import MultModelsCl
|
||||
|
||||
############################################################################
|
||||
print('\n#####################################################################\n'
|
||||
, '\nRunning ML analysis: REVERSE training\n'
|
||||
, '\nGene name:', gene
|
||||
, '\nDrug name:', drug)
|
||||
|
||||
#==================
|
||||
# Specify outdir
|
||||
#==================
|
||||
|
||||
outdir_ml = outdir + 'ml/tts_rt/'
|
||||
|
||||
print('\nOutput directory:', outdir_ml)
|
||||
|
||||
#%%###########################################################################
|
||||
print('\nSanity checks:'
|
||||
, '\nTotal input features:', len(X.columns)
|
||||
, '\n'
|
||||
, '\nTraining data size:', X.shape
|
||||
, '\nTest data size:', X_bts.shape
|
||||
, '\n'
|
||||
, '\nTarget feature numbers (training data):', Counter(y)
|
||||
, '\nTarget features ratio (training data:', yc1_ratio
|
||||
, '\n'
|
||||
, '\nTarget feature numbers (test data):', Counter(y_bts)
|
||||
, '\nTarget features ratio (test data):', yc2_ratio
|
||||
|
||||
, '\n\n#####################################################################\n')
|
||||
|
||||
print('\n================================================================\n')
|
||||
|
||||
print('Strucutral features (n):'
|
||||
, len(X_ssFN)
|
||||
, '\nThese are:'
|
||||
, '\nCommon stablity features:', X_stabilityN
|
||||
, '\nFoldX columns:', X_foldX_cols
|
||||
, '\nOther struc columns:', X_str
|
||||
, '\n================================================================\n')
|
||||
|
||||
print('AAindex features (n):'
|
||||
, len(X_aaindexFN)
|
||||
, '\nThese are:\n'
|
||||
, X_aaindexFN
|
||||
, '\n================================================================\n')
|
||||
|
||||
print('Evolutionary features (n):'
|
||||
, len(X_evolFN)
|
||||
, '\nThese are:\n'
|
||||
, X_evolFN
|
||||
, '\n================================================================\n')
|
||||
|
||||
print('Genomic features (n):'
|
||||
, len(X_genomicFN)
|
||||
, '\nThese are:\n'
|
||||
, X_genomic_mafor, '\n'
|
||||
, X_genomic_linegae
|
||||
, '\n================================================================\n')
|
||||
|
||||
print('Categorical features (n):'
|
||||
, len(categorical_FN)
|
||||
, '\nThese are:\n'
|
||||
, categorical_FN
|
||||
, '\n================================================================\n')
|
||||
|
||||
# Sanity check: the number of columns in X must equal the summed sizes of the
# feature-name groups printed above; otherwise stop the script.
import sys  # imported here so sys.exit does not depend on a star import leaking it

n_expected_features = (len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN)
                       + len(X_genomicFN) + len(categorical_FN))

if len(X.columns) == n_expected_features:
    print('\nPass: No. of features match')
else:
    # Report both counts so the mismatch can be diagnosed from the exit message.
    sys.exit('\nFail: Count of feature mismatch'
             + '\nSum of feature groups: ' + str(n_expected_features)
             + '\nColumns in X: ' + str(len(X.columns)))
|
||||
|
||||
print('\n#####################################################################\n')
|
||||
|
||||
###############################################################################
#==================
# Baseline models
#==================
# Run MultModelsCl on the original training data (X, y) and score it against
# the blind test set (X_bts, y_bts).
# MultModelsCl names the blind-test feature parameter `blind_test_df`
# (renamed from `blind_test_input_df`), so that keyword is used here.
mm_skf_scoresD = MultModelsCl(input_df = X
                              , target = y
                              , var_type = 'mixed'
                              , skf_cv = skf_cv
                              , blind_test_df = X_bts
                              , blind_test_target = y_bts)

# Transpose so the score names (test_*, bts_*) become columns.
baseline_all = pd.DataFrame(mm_skf_scoresD).T
#baseline_train = baseline_all.filter(like='train_', axis=1)

# test_* columns, sorted by test_mcc (descending). The sorted frame is
# assigned rather than sorting the filter() result in place.
baseline_CT = (baseline_all.filter(like='test_', axis=1)
               .sort_values(by = ['test_mcc'], ascending = False))

# bts_* (blind test) columns, sorted by bts_mcc (descending).
baseline_BT = (baseline_all.filter(like='bts_', axis=1)
               .sort_values(by = ['bts_mcc'], ascending = False))

# Write csv
baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv')
baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')
|
||||
|
||||
|
||||
#%% SMOTE NC: Oversampling [Numerical + categorical]
# Run MultModelsCl on the SMOTE NC training data (X_smnc, y_smnc) and score
# it against the blind test set (X_bts, y_bts).
# MultModelsCl names the blind-test feature parameter `blind_test_df`
# (renamed from `blind_test_input_df`), so that keyword is used here.
mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc
                               , target = y_smnc
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)

# Transpose so the score names (test_*, bts_*) become columns.
smnc_all = pd.DataFrame(mm_skf_scoresD7).T

# test_* columns, sorted by test_mcc (descending). The sorted frame is
# assigned rather than sorting the filter() result in place.
smnc_CT = (smnc_all.filter(like='test_', axis=1)
           .sort_values(by = ['test_mcc'], ascending = False))

# bts_* (blind test) columns, sorted by bts_mcc (descending).
smnc_BT = (smnc_all.filter(like='bts_', axis=1)
           .sort_values(by = ['bts_mcc'], ascending = False))

# Write csv
smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv')
smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')
|
||||
|
||||
#%% ROS: Numerical + categorical
# Run MultModelsCl on the ROS training data (X_ros, y_ros) and score it
# against the blind test set (X_bts, y_bts).
# MultModelsCl names the blind-test feature parameter `blind_test_df`
# (renamed from `blind_test_input_df`), so that keyword is used here.
mm_skf_scoresD3 = MultModelsCl(input_df = X_ros
                               , target = y_ros
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)

# Transpose so the score names (test_*, bts_*) become columns.
ros_all = pd.DataFrame(mm_skf_scoresD3).T

# test_* columns, sorted by test_mcc (descending). The sorted frame is
# assigned rather than sorting the filter() result in place.
ros_CT = (ros_all.filter(like='test_', axis=1)
          .sort_values(by = ['test_mcc'], ascending = False))

# bts_* (blind test) columns, sorted by bts_mcc (descending).
ros_BT = (ros_all.filter(like='bts_', axis=1)
          .sort_values(by = ['bts_mcc'], ascending = False))

# Write csv
ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv')
ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')
|
||||
|
||||
#%% RUS: Numerical + categorical
# Run MultModelsCl on the RUS training data (X_rus, y_rus) and score it
# against the blind test set (X_bts, y_bts).
# MultModelsCl names the blind-test feature parameter `blind_test_df`
# (renamed from `blind_test_input_df`), so that keyword is used here.
mm_skf_scoresD4 = MultModelsCl(input_df = X_rus
                               , target = y_rus
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)

# Transpose so the score names (test_*, bts_*) become columns.
rus_all = pd.DataFrame(mm_skf_scoresD4).T

# test_* columns, sorted by test_mcc (descending). The sorted frame is
# assigned rather than sorting the filter() result in place.
rus_CT = (rus_all.filter(like='test_', axis=1)
          .sort_values(by = ['test_mcc'], ascending = False))

# bts_* (blind test) columns, sorted by bts_mcc (descending).
rus_BT = (rus_all.filter(like='bts_', axis=1)
          .sort_values(by = ['bts_mcc'], ascending = False))

# Write csv
rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv')
rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')
|
||||
|
||||
#%% ROS + RUS Combined: Numerical + categorical
# Run MultModelsCl on the combined ROS + RUS training data (X_rouC, y_rouC)
# and score it against the blind test set (X_bts, y_bts).
# MultModelsCl names the blind-test feature parameter `blind_test_df`
# (renamed from `blind_test_input_df`), so that keyword is used here.
mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC
                               , target = y_rouC
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)

# Transpose so the score names (test_*, bts_*) become columns.
rouC_all = pd.DataFrame(mm_skf_scoresD8).T

# test_* columns, sorted by test_mcc (descending). The sorted frame is
# assigned rather than sorting the filter() result in place.
rouC_CT = (rouC_all.filter(like='test_', axis=1)
           .sort_values(by = ['test_mcc'], ascending = False))

# bts_* (blind test) columns, sorted by bts_mcc (descending).
rouC_BT = (rouC_all.filter(like='bts_', axis=1)
           .sort_values(by = ['bts_mcc'], ascending = False))

# Write csv
rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv')
rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')
|
|
@ -1,203 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Sat May 28 05:25:30 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
# Gene / drug pair analysed by this script.
gene = 'alr'
drug = 'cycloserine'
#total_mtblineage_uc = 8

# Switch to the ML scripts directory before the project-local imports below.
homedir = os.path.expanduser("~")
os.chdir(os.path.join(homedir, 'git', 'LSHTM_analysis', 'scripts', 'ml'))

# The data module is star-imported both before and after setvars().
# NOTE(review): presumably the second import picks up names that setvars()
# (re)binds for this gene/drug - keep the order as is.
from ml_data_sl import *
setvars(gene, drug)
from ml_data_sl import *

# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML

# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl

################################################################################
print('\n#####################################################################\n',
      '\nRunning ML analysis: scaling law split',
      '\nGene name:', gene,
      '\nDrug name:', drug)

#==================
# Specify outdir
#==================
# `outdir` is not defined in this script; it is expected to come from the
# star import above.
outdir_ml = outdir + 'ml/tts_sl/'

print('\nOutput directory:', outdir_ml)
|
||||
|
||||
#%%###########################################################################
# Report the sizes and target-class counts of the training data (X, y) and
# the blind test data (X_bts, y_bts).
print('\nSanity checks:',
      #'\nML source data size:', x_features.shape,
      '\nTotal input features:', len(X.columns),
      '\n',
      '\nTraining data size:', X.shape,
      '\nTest data size:', X_bts.shape,
      '\n',
      '\nTarget feature numbers (training data):', Counter(y),
      '\nTarget features ratio (training data:', yc1_ratio,
      '\n',
      '\nTarget feature numbers (test data):', Counter(y_bts),
      '\nTarget features ratio (test data):', yc2_ratio,
      '\n\n#####################################################################\n')

print('\n================================================================\n')

# One summary per feature group: the group size followed by its members.
print('Strucutral features (n):', len(X_ssFN),
      '\nThese are:',
      '\nCommon stablity features:', X_stabilityN,
      '\nFoldX columns:', X_foldX_cols,
      '\nOther struc columns:', X_str,
      '\n================================================================\n')

print('AAindex features (n):', len(X_aaindexFN),
      '\nThese are:\n', X_aaindexFN,
      '\n================================================================\n')

print('Evolutionary features (n):', len(X_evolFN),
      '\nThese are:\n', X_evolFN,
      '\n================================================================\n')

print('Genomic features (n):', len(X_genomicFN),
      '\nThese are:\n', X_genomic_mafor, '\n', X_genomic_linegae,
      '\n================================================================\n')

print('Categorical features (n):', len(categorical_FN),
      '\nThese are:\n', categorical_FN,
      '\n================================================================\n')

# Stop the run unless the five feature groups together account for every
# column of the training frame X.
if len(X.columns) != sum(map(len, (X_ssFN, X_aaindexFN, X_evolFN, X_genomicFN, categorical_FN))):
    sys.exit('\nFail: Count of feature mismatch')
print('\nPass: No. of features match')

print('\n#####################################################################\n')
|
||||
|
||||
################################################################################
#==================
# Baseline models
#==================
# Run MultModelsCl on the original training data (X, y) and score it against
# the blind test set (X_bts, y_bts).
# MultModelsCl names the blind-test feature parameter `blind_test_df`
# (renamed from `blind_test_input_df`), so that keyword is used here.
mm_skf_scoresD = MultModelsCl(input_df = X
                              , target = y
                              , var_type = 'mixed'
                              , skf_cv = skf_cv
                              , blind_test_df = X_bts
                              , blind_test_target = y_bts)

# Transpose so the score names (test_*, bts_*) become columns.
baseline_all = pd.DataFrame(mm_skf_scoresD).T
#baseline_train = baseline_all.filter(like='train_', axis=1)

# test_* columns, sorted by test_mcc (descending). The sorted frame is
# assigned rather than sorting the filter() result in place.
baseline_CT = (baseline_all.filter(like='test_', axis=1)
               .sort_values(by = ['test_mcc'], ascending = False))

# bts_* (blind test) columns, sorted by bts_mcc (descending).
baseline_BT = (baseline_all.filter(like='bts_', axis=1)
               .sort_values(by = ['bts_mcc'], ascending = False))

# Write csv
baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv')
baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')
|
||||
|
||||
|
||||
#%% SMOTE NC: Oversampling [Numerical + categorical]
# Run MultModelsCl on the SMOTE NC training data (X_smnc, y_smnc) and score
# it against the blind test set (X_bts, y_bts).
# MultModelsCl names the blind-test feature parameter `blind_test_df`
# (renamed from `blind_test_input_df`), so that keyword is used here.
mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc
                               , target = y_smnc
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)

# Transpose so the score names (test_*, bts_*) become columns.
smnc_all = pd.DataFrame(mm_skf_scoresD7).T

# test_* columns, sorted by test_mcc (descending). The sorted frame is
# assigned rather than sorting the filter() result in place.
smnc_CT = (smnc_all.filter(like='test_', axis=1)
           .sort_values(by = ['test_mcc'], ascending = False))

# bts_* (blind test) columns, sorted by bts_mcc (descending).
smnc_BT = (smnc_all.filter(like='bts_', axis=1)
           .sort_values(by = ['bts_mcc'], ascending = False))

# Write csv
smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv')
smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')
|
||||
|
||||
#%% ROS: Numerical + categorical
# Run MultModelsCl on the ROS training data (X_ros, y_ros) and score it
# against the blind test set (X_bts, y_bts).
# MultModelsCl names the blind-test feature parameter `blind_test_df`
# (renamed from `blind_test_input_df`), so that keyword is used here.
mm_skf_scoresD3 = MultModelsCl(input_df = X_ros
                               , target = y_ros
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)

# Transpose so the score names (test_*, bts_*) become columns.
ros_all = pd.DataFrame(mm_skf_scoresD3).T

# test_* columns, sorted by test_mcc (descending). The sorted frame is
# assigned rather than sorting the filter() result in place.
ros_CT = (ros_all.filter(like='test_', axis=1)
          .sort_values(by = ['test_mcc'], ascending = False))

# bts_* (blind test) columns, sorted by bts_mcc (descending).
ros_BT = (ros_all.filter(like='bts_', axis=1)
          .sort_values(by = ['bts_mcc'], ascending = False))

# Write csv
ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv')
ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')
|
||||
|
||||
#%% RUS: Numerical + categorical
# Run MultModelsCl on the RUS training data (X_rus, y_rus) and score it
# against the blind test set (X_bts, y_bts).
# MultModelsCl names the blind-test feature parameter `blind_test_df`
# (renamed from `blind_test_input_df`), so that keyword is used here.
mm_skf_scoresD4 = MultModelsCl(input_df = X_rus
                               , target = y_rus
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)

# Transpose so the score names (test_*, bts_*) become columns.
rus_all = pd.DataFrame(mm_skf_scoresD4).T

# test_* columns, sorted by test_mcc (descending). The sorted frame is
# assigned rather than sorting the filter() result in place.
rus_CT = (rus_all.filter(like='test_', axis=1)
          .sort_values(by = ['test_mcc'], ascending = False))

# bts_* (blind test) columns, sorted by bts_mcc (descending).
rus_BT = (rus_all.filter(like='bts_', axis=1)
          .sort_values(by = ['bts_mcc'], ascending = False))

# Write csv
rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv')
rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')
|
||||
|
||||
#%% ROS + RUS Combined: Numerical + categorical
# Run MultModelsCl on the combined ROS + RUS training data (X_rouC, y_rouC)
# and score it against the blind test set (X_bts, y_bts).
# MultModelsCl names the blind-test feature parameter `blind_test_df`
# (renamed from `blind_test_input_df`), so that keyword is used here.
mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC
                               , target = y_rouC
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)

# Transpose so the score names (test_*, bts_*) become columns.
rouC_all = pd.DataFrame(mm_skf_scoresD8).T

# test_* columns, sorted by test_mcc (descending). The sorted frame is
# assigned rather than sorting the filter() result in place.
rouC_CT = (rouC_all.filter(like='test_', axis=1)
           .sort_values(by = ['test_mcc'], ascending = False))

# bts_* (blind test) columns, sorted by bts_mcc (descending).
rouC_BT = (rouC_all.filter(like='bts_', axis=1)
           .sort_values(by = ['bts_mcc'], ascending = False))

# Write csv
rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv')
rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')
|
|
@ -1,203 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Sat May 28 05:25:30 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
# Gene / drug pair analysed by this script.
gene = 'embB'
drug = 'ethambutol'
#total_mtblineage_uc = 8

# Switch to the ML scripts directory before the project-local imports below.
homedir = os.path.expanduser("~")
os.chdir(os.path.join(homedir, 'git', 'LSHTM_analysis', 'scripts', 'ml'))

# The data module is star-imported both before and after setvars().
# NOTE(review): presumably the second import picks up names that setvars()
# (re)binds for this gene/drug - keep the order as is.
from ml_data_7030 import *
setvars(gene, drug)
from ml_data_7030 import *

# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML

# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl

################################################################################
print('\n#####################################################################\n',
      '\nRunning ML analysis: 70/30 split',
      '\nGene name:', gene,
      '\nDrug name:', drug)

#==================
# Specify outdir
#==================
# `outdir` is not defined in this script; it is expected to come from the
# star import above.
outdir_ml = outdir + 'ml/tts_7030/'

print('\nOutput directory:', outdir_ml)
|
||||
|
||||
#%%###########################################################################
# Report the sizes and target-class counts of the training data (X, y) and
# the blind test data (X_bts, y_bts).
print('\nSanity checks:',
      #'\nML source data size:', x_features.shape,
      '\nTotal input features:', len(X.columns),
      '\n',
      '\nTraining data size:', X.shape,
      '\nTest data size:', X_bts.shape,
      '\n',
      '\nTarget feature numbers (training data):', Counter(y),
      '\nTarget features ratio (training data:', yc1_ratio,
      '\n',
      '\nTarget feature numbers (test data):', Counter(y_bts),
      '\nTarget features ratio (test data):', yc2_ratio,
      '\n\n#####################################################################\n')

print('\n================================================================\n')

# One summary per feature group: the group size followed by its members.
print('Strucutral features (n):', len(X_ssFN),
      '\nThese are:',
      '\nCommon stablity features:', X_stabilityN,
      '\nFoldX columns:', X_foldX_cols,
      '\nOther struc columns:', X_str,
      '\n================================================================\n')

print('AAindex features (n):', len(X_aaindexFN),
      '\nThese are:\n', X_aaindexFN,
      '\n================================================================\n')

print('Evolutionary features (n):', len(X_evolFN),
      '\nThese are:\n', X_evolFN,
      '\n================================================================\n')

print('Genomic features (n):', len(X_genomicFN),
      '\nThese are:\n', X_genomic_mafor, '\n', X_genomic_linegae,
      '\n================================================================\n')

print('Categorical features (n):', len(categorical_FN),
      '\nThese are:\n', categorical_FN,
      '\n================================================================\n')

# Stop the run unless the five feature groups together account for every
# column of the training frame X.
if len(X.columns) != sum(map(len, (X_ssFN, X_aaindexFN, X_evolFN, X_genomicFN, categorical_FN))):
    sys.exit('\nFail: Count of feature mismatch')
print('\nPass: No. of features match')

print('\n#####################################################################\n')
|
||||
|
||||
################################################################################
#==================
# Baseline models
#==================
# Run MultModelsCl on the original training data (X, y) and score it against
# the blind test set (X_bts, y_bts).
# MultModelsCl names the blind-test feature parameter `blind_test_df`
# (renamed from `blind_test_input_df`), so that keyword is used here.
mm_skf_scoresD = MultModelsCl(input_df = X
                              , target = y
                              , var_type = 'mixed'
                              , skf_cv = skf_cv
                              , blind_test_df = X_bts
                              , blind_test_target = y_bts)

# Transpose so the score names (test_*, bts_*) become columns.
baseline_all = pd.DataFrame(mm_skf_scoresD).T
#baseline_train = baseline_all.filter(like='train_', axis=1)

# test_* columns, sorted by test_mcc (descending). The sorted frame is
# assigned rather than sorting the filter() result in place.
baseline_CT = (baseline_all.filter(like='test_', axis=1)
               .sort_values(by = ['test_mcc'], ascending = False))

# bts_* (blind test) columns, sorted by bts_mcc (descending).
baseline_BT = (baseline_all.filter(like='bts_', axis=1)
               .sort_values(by = ['bts_mcc'], ascending = False))

# Write csv
baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv')
baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')
|
||||
|
||||
|
||||
#%% SMOTE NC: Oversampling [Numerical + categorical]
# Run MultModelsCl on the SMOTE NC training data (X_smnc, y_smnc) and score
# it against the blind test set (X_bts, y_bts).
# MultModelsCl names the blind-test feature parameter `blind_test_df`
# (renamed from `blind_test_input_df`), so that keyword is used here.
mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc
                               , target = y_smnc
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)

# Transpose so the score names (test_*, bts_*) become columns.
smnc_all = pd.DataFrame(mm_skf_scoresD7).T

# test_* columns, sorted by test_mcc (descending). The sorted frame is
# assigned rather than sorting the filter() result in place.
smnc_CT = (smnc_all.filter(like='test_', axis=1)
           .sort_values(by = ['test_mcc'], ascending = False))

# bts_* (blind test) columns, sorted by bts_mcc (descending).
smnc_BT = (smnc_all.filter(like='bts_', axis=1)
           .sort_values(by = ['bts_mcc'], ascending = False))

# Write csv
smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv')
smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')
|
||||
|
||||
#%% ROS: Numerical + categorical
# Run MultModelsCl on the ROS training data (X_ros, y_ros) and score it
# against the blind test set (X_bts, y_bts).
# MultModelsCl names the blind-test feature parameter `blind_test_df`
# (renamed from `blind_test_input_df`), so that keyword is used here.
mm_skf_scoresD3 = MultModelsCl(input_df = X_ros
                               , target = y_ros
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)

# Transpose so the score names (test_*, bts_*) become columns.
ros_all = pd.DataFrame(mm_skf_scoresD3).T

# test_* columns, sorted by test_mcc (descending). The sorted frame is
# assigned rather than sorting the filter() result in place.
ros_CT = (ros_all.filter(like='test_', axis=1)
          .sort_values(by = ['test_mcc'], ascending = False))

# bts_* (blind test) columns, sorted by bts_mcc (descending).
ros_BT = (ros_all.filter(like='bts_', axis=1)
          .sort_values(by = ['bts_mcc'], ascending = False))

# Write csv
ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv')
ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')
|
||||
|
||||
#%% RUS: Numerical + categorical
# Run MultModelsCl on the RUS training data (X_rus, y_rus) and score it
# against the blind test set (X_bts, y_bts).
# MultModelsCl names the blind-test feature parameter `blind_test_df`
# (renamed from `blind_test_input_df`), so that keyword is used here.
mm_skf_scoresD4 = MultModelsCl(input_df = X_rus
                               , target = y_rus
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)

# Transpose so the score names (test_*, bts_*) become columns.
rus_all = pd.DataFrame(mm_skf_scoresD4).T

# test_* columns, sorted by test_mcc (descending). The sorted frame is
# assigned rather than sorting the filter() result in place.
rus_CT = (rus_all.filter(like='test_', axis=1)
          .sort_values(by = ['test_mcc'], ascending = False))

# bts_* (blind test) columns, sorted by bts_mcc (descending).
rus_BT = (rus_all.filter(like='bts_', axis=1)
          .sort_values(by = ['bts_mcc'], ascending = False))

# Write csv
rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv')
rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')
|
||||
|
||||
#%% ROS + RUS Combined: Numerical + categorical
# Run MultModelsCl on the combined ROS + RUS training data (X_rouC, y_rouC)
# and score it against the blind test set (X_bts, y_bts).
# MultModelsCl names the blind-test feature parameter `blind_test_df`
# (renamed from `blind_test_input_df`), so that keyword is used here.
mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC
                               , target = y_rouC
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)

# Transpose so the score names (test_*, bts_*) become columns.
rouC_all = pd.DataFrame(mm_skf_scoresD8).T

# test_* columns, sorted by test_mcc (descending). The sorted frame is
# assigned rather than sorting the filter() result in place.
rouC_CT = (rouC_all.filter(like='test_', axis=1)
           .sort_values(by = ['test_mcc'], ascending = False))

# bts_* (blind test) columns, sorted by bts_mcc (descending).
rouC_BT = (rouC_all.filter(like='bts_', axis=1)
           .sort_values(by = ['bts_mcc'], ascending = False))

# Write csv
rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv')
rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')
|
|
@ -1,203 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Sat May 28 05:25:30 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
# Gene / drug pair analysed by this script.
gene = 'embB'
drug = 'ethambutol'
#total_mtblineage_uc = 8

# Switch to the ML scripts directory before the project-local imports below.
homedir = os.path.expanduser("~")
os.chdir(os.path.join(homedir, 'git', 'LSHTM_analysis', 'scripts', 'ml'))

# The data module is star-imported both before and after setvars().
# NOTE(review): presumably the second import picks up names that setvars()
# (re)binds for this gene/drug - keep the order as is.
from ml_data_8020 import *
setvars(gene, drug)
from ml_data_8020 import *

# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML

# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl

################################################################################
print('\n#####################################################################\n',
      '\nRunning ML analysis: 80/20 split',
      '\nGene name:', gene,
      '\nDrug name:', drug)

#==================
# Specify outdir
#==================
# `outdir` is not defined in this script; it is expected to come from the
# star import above.
outdir_ml = outdir + 'ml/tts_8020/'

print('\nOutput directory:', outdir_ml)
|
||||
|
||||
#%%###########################################################################
# Report the sizes and target-class counts of the training data (X, y) and
# the blind test data (X_bts, y_bts).
print('\nSanity checks:',
      #'\nML source data size:', x_features.shape,
      '\nTotal input features:', len(X.columns),
      '\n',
      '\nTraining data size:', X.shape,
      '\nTest data size:', X_bts.shape,
      '\n',
      '\nTarget feature numbers (training data):', Counter(y),
      '\nTarget features ratio (training data:', yc1_ratio,
      '\n',
      '\nTarget feature numbers (test data):', Counter(y_bts),
      '\nTarget features ratio (test data):', yc2_ratio,
      '\n\n#####################################################################\n')

print('\n================================================================\n')

# One summary per feature group: the group size followed by its members.
print('Strucutral features (n):', len(X_ssFN),
      '\nThese are:',
      '\nCommon stablity features:', X_stabilityN,
      '\nFoldX columns:', X_foldX_cols,
      '\nOther struc columns:', X_str,
      '\n================================================================\n')

print('AAindex features (n):', len(X_aaindexFN),
      '\nThese are:\n', X_aaindexFN,
      '\n================================================================\n')

print('Evolutionary features (n):', len(X_evolFN),
      '\nThese are:\n', X_evolFN,
      '\n================================================================\n')

print('Genomic features (n):', len(X_genomicFN),
      '\nThese are:\n', X_genomic_mafor, '\n', X_genomic_linegae,
      '\n================================================================\n')

print('Categorical features (n):', len(categorical_FN),
      '\nThese are:\n', categorical_FN,
      '\n================================================================\n')

# Stop the run unless the five feature groups together account for every
# column of the training frame X.
if len(X.columns) != sum(map(len, (X_ssFN, X_aaindexFN, X_evolFN, X_genomicFN, categorical_FN))):
    sys.exit('\nFail: Count of feature mismatch')
print('\nPass: No. of features match')

print('\n#####################################################################\n')
|
||||
|
||||
################################################################################
#==================
# Baseline models
#==================
# Run MultModelsCl on the original training data (X, y) and score it against
# the blind test set (X_bts, y_bts).
# MultModelsCl names the blind-test feature parameter `blind_test_df`
# (renamed from `blind_test_input_df`), so that keyword is used here.
mm_skf_scoresD = MultModelsCl(input_df = X
                              , target = y
                              , var_type = 'mixed'
                              , skf_cv = skf_cv
                              , blind_test_df = X_bts
                              , blind_test_target = y_bts)

# Transpose so the score names (test_*, bts_*) become columns.
baseline_all = pd.DataFrame(mm_skf_scoresD).T
#baseline_train = baseline_all.filter(like='train_', axis=1)

# test_* columns, sorted by test_mcc (descending). The sorted frame is
# assigned rather than sorting the filter() result in place.
baseline_CT = (baseline_all.filter(like='test_', axis=1)
               .sort_values(by = ['test_mcc'], ascending = False))

# bts_* (blind test) columns, sorted by bts_mcc (descending).
baseline_BT = (baseline_all.filter(like='bts_', axis=1)
               .sort_values(by = ['bts_mcc'], ascending = False))

# Write csv
baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv')
baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')
|
||||
|
||||
|
||||
#%% SMOTE NC: Oversampling [Numerical + categorical]
# Run MultModelsCl on the SMOTE NC training data (X_smnc, y_smnc) and score
# it against the blind test set (X_bts, y_bts).
# MultModelsCl names the blind-test feature parameter `blind_test_df`
# (renamed from `blind_test_input_df`), so that keyword is used here.
mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc
                               , target = y_smnc
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)

# Transpose so the score names (test_*, bts_*) become columns.
smnc_all = pd.DataFrame(mm_skf_scoresD7).T

# test_* columns, sorted by test_mcc (descending). The sorted frame is
# assigned rather than sorting the filter() result in place.
smnc_CT = (smnc_all.filter(like='test_', axis=1)
           .sort_values(by = ['test_mcc'], ascending = False))

# bts_* (blind test) columns, sorted by bts_mcc (descending).
smnc_BT = (smnc_all.filter(like='bts_', axis=1)
           .sort_values(by = ['bts_mcc'], ascending = False))

# Write csv
smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv')
smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')
|
||||
|
||||
#%% ROS: Numerical + categorical
# Run MultModelsCl on the ROS training data (X_ros, y_ros) and score it
# against the blind test set (X_bts, y_bts).
# MultModelsCl names the blind-test feature parameter `blind_test_df`
# (renamed from `blind_test_input_df`), so that keyword is used here.
mm_skf_scoresD3 = MultModelsCl(input_df = X_ros
                               , target = y_ros
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)

# Transpose so the score names (test_*, bts_*) become columns.
ros_all = pd.DataFrame(mm_skf_scoresD3).T

# test_* columns, sorted by test_mcc (descending). The sorted frame is
# assigned rather than sorting the filter() result in place.
ros_CT = (ros_all.filter(like='test_', axis=1)
          .sort_values(by = ['test_mcc'], ascending = False))

# bts_* (blind test) columns, sorted by bts_mcc (descending).
ros_BT = (ros_all.filter(like='bts_', axis=1)
          .sort_values(by = ['bts_mcc'], ascending = False))

# Write csv
ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv')
ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')
|
||||
|
||||
#%% RUS: Numerical + categorical
# Run MultModelsCl on the RUS training data (X_rus, y_rus) and score it
# against the blind test set (X_bts, y_bts).
# MultModelsCl names the blind-test feature parameter `blind_test_df`
# (renamed from `blind_test_input_df`), so that keyword is used here.
mm_skf_scoresD4 = MultModelsCl(input_df = X_rus
                               , target = y_rus
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)

# Transpose so the score names (test_*, bts_*) become columns.
rus_all = pd.DataFrame(mm_skf_scoresD4).T

# test_* columns, sorted by test_mcc (descending). The sorted frame is
# assigned rather than sorting the filter() result in place.
rus_CT = (rus_all.filter(like='test_', axis=1)
          .sort_values(by = ['test_mcc'], ascending = False))

# bts_* (blind test) columns, sorted by bts_mcc (descending).
rus_BT = (rus_all.filter(like='bts_', axis=1)
          .sort_values(by = ['bts_mcc'], ascending = False))

# Write csv
rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv')
rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')
|
||||
|
||||
#%% ROS + RUS Combined: Numerical + categorical
# Run MultModelsCl on the combined ROS + RUS training data (X_rouC, y_rouC)
# and score it against the blind test set (X_bts, y_bts).
# MultModelsCl names the blind-test feature parameter `blind_test_df`
# (renamed from `blind_test_input_df`), so that keyword is used here.
mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC
                               , target = y_rouC
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)

# Transpose so the score names (test_*, bts_*) become columns.
rouC_all = pd.DataFrame(mm_skf_scoresD8).T

# test_* columns, sorted by test_mcc (descending). The sorted frame is
# assigned rather than sorting the filter() result in place.
rouC_CT = (rouC_all.filter(like='test_', axis=1)
           .sort_values(by = ['test_mcc'], ascending = False))

# bts_* (blind test) columns, sorted by bts_mcc (descending).
rouC_BT = (rouC_all.filter(like='bts_', axis=1)
           .sort_values(by = ['bts_mcc'], ascending = False))

# Write csv
rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv')
rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')
|
|
@ -1,203 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Sat May 28 05:25:30 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
# Gene / drug pair analysed by this script.
gene = 'embB'
drug = 'ethambutol'
#total_mtblineage_uc = 8

# Switch to the ML scripts directory before the project-local imports below.
homedir = os.path.expanduser("~")
os.chdir(os.path.join(homedir, 'git', 'LSHTM_analysis', 'scripts', 'ml'))

# The data module is star-imported both before and after setvars().
# NOTE(review): presumably the second import picks up names that setvars()
# (re)binds for this gene/drug - keep the order as is.
from ml_data_cd_7030 import *
setvars(gene, drug)
from ml_data_cd_7030 import *

# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML

# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl

################################################################################
print('\n#####################################################################\n',
      '\nRunning ML analysis [COMPLETE DATA]: 70/30 split',
      '\nGene name:', gene,
      '\nDrug name:', drug)

#==================
# Specify outdir
#==================
# `outdir` is not defined in this script; it is expected to come from the
# star import above.
outdir_ml = outdir + 'ml/tts_cd_7030/'

print('\nOutput directory:', outdir_ml)
|
||||
|
||||
#%%###########################################################################
# Report the sizes and target-class counts of the training data (X, y) and
# the blind test data (X_bts, y_bts).
print('\nSanity checks:',
      #'\nML source data size:', x_features.shape,
      '\nTotal input features:', len(X.columns),
      '\n',
      '\nTraining data size:', X.shape,
      '\nTest data size:', X_bts.shape,
      '\n',
      '\nTarget feature numbers (training data):', Counter(y),
      '\nTarget features ratio (training data:', yc1_ratio,
      '\n',
      '\nTarget feature numbers (test data):', Counter(y_bts),
      '\nTarget features ratio (test data):', yc2_ratio,
      '\n\n#####################################################################\n')

print('\n================================================================\n')

# One summary per feature group: the group size followed by its members.
print('Strucutral features (n):', len(X_ssFN),
      '\nThese are:',
      '\nCommon stablity features:', X_stabilityN,
      '\nFoldX columns:', X_foldX_cols,
      '\nOther struc columns:', X_str,
      '\n================================================================\n')

print('AAindex features (n):', len(X_aaindexFN),
      '\nThese are:\n', X_aaindexFN,
      '\n================================================================\n')

print('Evolutionary features (n):', len(X_evolFN),
      '\nThese are:\n', X_evolFN,
      '\n================================================================\n')

print('Genomic features (n):', len(X_genomicFN),
      '\nThese are:\n', X_genomic_mafor, '\n', X_genomic_linegae,
      '\n================================================================\n')

print('Categorical features (n):', len(categorical_FN),
      '\nThese are:\n', categorical_FN,
      '\n================================================================\n')

# Stop the run unless the five feature groups together account for every
# column of the training frame X.
if len(X.columns) != sum(map(len, (X_ssFN, X_aaindexFN, X_evolFN, X_genomicFN, categorical_FN))):
    sys.exit('\nFail: Count of feature mismatch')
print('\nPass: No. of features match')

print('\n#####################################################################\n')
|
||||
|
||||
################################################################################
#==================
# Baseline models
#==================
# Run MultModelsCl on the original training data (X, y) and score it against
# the blind test set (X_bts, y_bts).
# MultModelsCl names the blind-test feature parameter `blind_test_df`
# (renamed from `blind_test_input_df`), so that keyword is used here.
mm_skf_scoresD = MultModelsCl(input_df = X
                              , target = y
                              , var_type = 'mixed'
                              , skf_cv = skf_cv
                              , blind_test_df = X_bts
                              , blind_test_target = y_bts)

# Transpose so the score names (test_*, bts_*) become columns.
baseline_all = pd.DataFrame(mm_skf_scoresD).T
#baseline_train = baseline_all.filter(like='train_', axis=1)

# test_* columns, sorted by test_mcc (descending). The sorted frame is
# assigned rather than sorting the filter() result in place.
baseline_CT = (baseline_all.filter(like='test_', axis=1)
               .sort_values(by = ['test_mcc'], ascending = False))

# bts_* (blind test) columns, sorted by bts_mcc (descending).
baseline_BT = (baseline_all.filter(like='bts_', axis=1)
               .sort_values(by = ['bts_mcc'], ascending = False))

# Write csv
baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv')
baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')
|
||||
|
||||
|
||||
#%% SMOTE NC: Oversampling [Numerical + categorical]
|
||||
mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc
|
||||
, target = y_smnc
|
||||
, var_type = 'mixed'
|
||||
, skf_cv = skf_cv
|
||||
, blind_test_input_df = X_bts
|
||||
, blind_test_target = y_bts)
|
||||
smnc_all = pd.DataFrame(mm_skf_scoresD7)
|
||||
smnc_all = smnc_all.T
|
||||
|
||||
smnc_CT = smnc_all.filter(like='test_', axis=1)
|
||||
smnc_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
|
||||
|
||||
smnc_BT = smnc_all.filter(like='bts_', axis=1)
|
||||
smnc_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
|
||||
|
||||
# Write csv
|
||||
smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv')
|
||||
smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')
|
||||
|
||||
#%% ROS: Numerical + categorical
|
||||
mm_skf_scoresD3 = MultModelsCl(input_df = X_ros
|
||||
, target = y_ros
|
||||
, var_type = 'mixed'
|
||||
, skf_cv = skf_cv
|
||||
, blind_test_input_df = X_bts
|
||||
, blind_test_target = y_bts)
|
||||
ros_all = pd.DataFrame(mm_skf_scoresD3)
|
||||
ros_all = ros_all.T
|
||||
|
||||
ros_CT = ros_all.filter(like='test_', axis=1)
|
||||
ros_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
|
||||
|
||||
ros_BT = ros_all.filter(like='bts_', axis=1)
|
||||
ros_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
|
||||
|
||||
# Write csv
|
||||
ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv')
|
||||
ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')
|
||||
|
||||
#%% RUS: Numerical + categorical
|
||||
mm_skf_scoresD4 = MultModelsCl(input_df = X_rus
|
||||
, target = y_rus
|
||||
, var_type = 'mixed'
|
||||
, skf_cv = skf_cv
|
||||
, blind_test_input_df = X_bts
|
||||
, blind_test_target = y_bts)
|
||||
rus_all = pd.DataFrame(mm_skf_scoresD4)
|
||||
rus_all = rus_all.T
|
||||
|
||||
rus_CT = rus_all.filter(like='test_', axis=1)
|
||||
rus_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
|
||||
|
||||
rus_BT = rus_all.filter(like='bts_' , axis=1)
|
||||
rus_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
|
||||
|
||||
# Write csv
|
||||
rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv')
|
||||
rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')
|
||||
|
||||
#%% ROS + RUS Combined: Numerical + categorical
|
||||
mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC
|
||||
, target = y_rouC
|
||||
, var_type = 'mixed'
|
||||
, skf_cv = skf_cv
|
||||
, blind_test_input_df = X_bts
|
||||
, blind_test_target = y_bts)
|
||||
rouC_all = pd.DataFrame(mm_skf_scoresD8)
|
||||
rouC_all = rouC_all.T
|
||||
|
||||
rouC_CT = rouC_all.filter(like='test_', axis=1)
|
||||
rouC_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
|
||||
|
||||
rouC_BT = rouC_all.filter(like='bts_', axis=1)
|
||||
rouC_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
|
||||
|
||||
# Write csv
|
||||
rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv')
|
||||
rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')
|
|
@ -1,203 +0,0 @@
|
|||
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022

@author: tanu

Run the MultModelsCl classifier comparison for one gene/drug pair using the
data prepared by ml_data_cd_8020: once on the original training data and once
per resampled training set. For every run the cross-validation ('test_*') and
blind-test ('bts_*') score tables are sorted by MCC and written to csv.
"""

import os
import sys  # sys.exit() is used below; do not rely on the star-import exporting it

gene = 'embB'
drug = 'ethambutol'
#total_mtblineage_uc = 8

homedir = os.path.expanduser("~")
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')

# The data module is star-imported, setvars() is called, and the module is
# star-imported again.
# NOTE(review): the second import presumably refreshes the names that
# setvars() populates -- confirm against ml_data_cd_8020 before simplifying.
from ml_data_cd_8020 import *
setvars(gene,drug)
from ml_data_cd_8020 import *

# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML

# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl

################################################################################
print('\n#####################################################################\n'
      , '\nRunning ML analysis [COMPLETE DATA]: 80/20 split'
      , '\nGene name:', gene
      , '\nDrug name:', drug)

#==================
# Specify outdir
#==================

outdir_ml = outdir + 'ml/tts_cd_8020/'

print('\nOutput directory:', outdir_ml)

#%%###########################################################################
print('\nSanity checks:'
      , '\nTotal input features:', len(X.columns)
      , '\n'
      , '\nTraining data size:', X.shape
      , '\nTest data size:', X_bts.shape
      , '\n'
      , '\nTarget feature numbers (training data):', Counter(y)
      , '\nTarget features ratio (training data:', yc1_ratio
      , '\n'
      , '\nTarget feature numbers (test data):', Counter(y_bts)
      , '\nTarget features ratio (test data):', yc2_ratio

      , '\n\n#####################################################################\n')

print('\n================================================================\n')

print('Strucutral features (n):'
      , len(X_ssFN)
      , '\nThese are:'
      , '\nCommon stablity features:', X_stabilityN
      , '\nFoldX columns:', X_foldX_cols
      , '\nOther struc columns:', X_str
      , '\n================================================================\n')

print('AAindex features (n):'
      , len(X_aaindexFN)
      , '\nThese are:\n'
      , X_aaindexFN
      , '\n================================================================\n')

print('Evolutionary features (n):'
      , len(X_evolFN)
      , '\nThese are:\n'
      , X_evolFN
      , '\n================================================================\n')

print('Genomic features (n):'
      , len(X_genomicFN)
      , '\nThese are:\n'
      , X_genomic_mafor, '\n'
      , X_genomic_linegae
      , '\n================================================================\n')

print('Categorical features (n):'
      , len(categorical_FN)
      , '\nThese are:\n'
      , categorical_FN
      , '\n================================================================\n')

# Abort early if the feature groups do not add up to the columns of X.
n_features_expected = ( len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN)
                        + len(X_genomicFN) + len(categorical_FN) )
if len(X.columns) != n_features_expected:
    sys.exit('\nFail: Count of feature mismatch')
print('\nPass: No. of features match')

print('\n#####################################################################\n')

################################################################################
def _run_and_write(tag, train_df, train_target):
    """Run MultModelsCl on one training set and write its score tables.

    tag          : label used in the output file names
                   (<gene>_<tag>_CT_allF.csv and <gene>_<tag>_BT_allF.csv)
    train_df     : training features passed as input_df
    train_target : training target passed as target

    The blind-test data is always X_bts / y_bts. Returns a tuple of
    (raw MultModelsCl result, transposed score table, 'test_*' columns sorted
    by test_mcc descending, 'bts_*' columns sorted by bts_mcc descending).
    """
    scoresD = MultModelsCl(input_df = train_df
                           , target = train_target
                           , var_type = 'mixed'
                           , skf_cv = skf_cv
                           # keyword follows the renamed MultModelsCl parameter
                           # (previously blind_test_input_df)
                           , blind_test_df = X_bts
                           , blind_test_target = y_bts)

    scores_all = pd.DataFrame(scoresD).T

    # Chained sort (no inplace) so the filtered frame is never mutated in place.
    scores_CT = scores_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
    scores_BT = scores_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

    # Write csv
    scores_CT.to_csv(outdir_ml + gene.lower() + '_' + tag + '_CT_allF.csv')
    scores_BT.to_csv(outdir_ml + gene.lower() + '_' + tag + '_BT_allF.csv')

    return scoresD, scores_all, scores_CT, scores_BT

#==================
# Baseline models
#==================
mm_skf_scoresD, baseline_all, baseline_CT, baseline_BT = _run_and_write('baseline', X, y)

#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7, smnc_all, smnc_CT, smnc_BT = _run_and_write('smnc', X_smnc, y_smnc)

#%% ROS: Numerical + categorical
mm_skf_scoresD3, ros_all, ros_CT, ros_BT = _run_and_write('ros', X_ros, y_ros)

#%% RUS: Numerical + categorical
mm_skf_scoresD4, rus_all, rus_CT, rus_BT = _run_and_write('rus', X_rus, y_rus)

#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8, rouC_all, rouC_CT, rouC_BT = _run_and_write('rouC', X_rouC, y_rouC)
|
|
@ -1,203 +0,0 @@
|
|||
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022

@author: tanu

Run the MultModelsCl classifier comparison for one gene/drug pair using the
data prepared by ml_data_cd_sl: once on the original training data and once
per resampled training set. For every run the cross-validation ('test_*') and
blind-test ('bts_*') score tables are sorted by MCC and written to csv.
"""

import os
import sys  # sys.exit() is used below; do not rely on the star-import exporting it

gene = 'embB'
drug = 'ethambutol'
#total_mtblineage_uc = 8

homedir = os.path.expanduser("~")
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')

# The data module is star-imported, setvars() is called, and the module is
# star-imported again.
# NOTE(review): the second import presumably refreshes the names that
# setvars() populates -- confirm against ml_data_cd_sl before simplifying.
from ml_data_cd_sl import *
setvars(gene,drug)
from ml_data_cd_sl import *

# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML

# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl

################################################################################
print('\n#####################################################################\n'
      , '\nRunning ML analysis [COMPLETE DATA]: 70/30 split'
      , '\nGene name:', gene
      , '\nDrug name:', drug)

#==================
# Specify outdir
#==================

outdir_ml = outdir + 'ml/tts_cd_sl/'

print('\nOutput directory:', outdir_ml)

#%%###########################################################################
print('\nSanity checks:'
      , '\nTotal input features:', len(X.columns)
      , '\n'
      , '\nTraining data size:', X.shape
      , '\nTest data size:', X_bts.shape
      , '\n'
      , '\nTarget feature numbers (training data):', Counter(y)
      , '\nTarget features ratio (training data:', yc1_ratio
      , '\n'
      , '\nTarget feature numbers (test data):', Counter(y_bts)
      , '\nTarget features ratio (test data):', yc2_ratio

      , '\n\n#####################################################################\n')

print('\n================================================================\n')

print('Strucutral features (n):'
      , len(X_ssFN)
      , '\nThese are:'
      , '\nCommon stablity features:', X_stabilityN
      , '\nFoldX columns:', X_foldX_cols
      , '\nOther struc columns:', X_str
      , '\n================================================================\n')

print('AAindex features (n):'
      , len(X_aaindexFN)
      , '\nThese are:\n'
      , X_aaindexFN
      , '\n================================================================\n')

print('Evolutionary features (n):'
      , len(X_evolFN)
      , '\nThese are:\n'
      , X_evolFN
      , '\n================================================================\n')

print('Genomic features (n):'
      , len(X_genomicFN)
      , '\nThese are:\n'
      , X_genomic_mafor, '\n'
      , X_genomic_linegae
      , '\n================================================================\n')

print('Categorical features (n):'
      , len(categorical_FN)
      , '\nThese are:\n'
      , categorical_FN
      , '\n================================================================\n')

# Abort early if the feature groups do not add up to the columns of X.
n_features_expected = ( len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN)
                        + len(X_genomicFN) + len(categorical_FN) )
if len(X.columns) != n_features_expected:
    sys.exit('\nFail: Count of feature mismatch')
print('\nPass: No. of features match')

print('\n#####################################################################\n')

################################################################################
def _run_and_write(tag, train_df, train_target):
    """Run MultModelsCl on one training set and write its score tables.

    tag          : label used in the output file names
                   (<gene>_<tag>_CT_allF.csv and <gene>_<tag>_BT_allF.csv)
    train_df     : training features passed as input_df
    train_target : training target passed as target

    The blind-test data is always X_bts / y_bts. Returns a tuple of
    (raw MultModelsCl result, transposed score table, 'test_*' columns sorted
    by test_mcc descending, 'bts_*' columns sorted by bts_mcc descending).
    """
    scoresD = MultModelsCl(input_df = train_df
                           , target = train_target
                           , var_type = 'mixed'
                           , skf_cv = skf_cv
                           # keyword follows the renamed MultModelsCl parameter
                           # (previously blind_test_input_df)
                           , blind_test_df = X_bts
                           , blind_test_target = y_bts)

    scores_all = pd.DataFrame(scoresD).T

    # Chained sort (no inplace) so the filtered frame is never mutated in place.
    scores_CT = scores_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
    scores_BT = scores_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

    # Write csv
    scores_CT.to_csv(outdir_ml + gene.lower() + '_' + tag + '_CT_allF.csv')
    scores_BT.to_csv(outdir_ml + gene.lower() + '_' + tag + '_BT_allF.csv')

    return scoresD, scores_all, scores_CT, scores_BT

#==================
# Baseline models
#==================
mm_skf_scoresD, baseline_all, baseline_CT, baseline_BT = _run_and_write('baseline', X, y)

#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7, smnc_all, smnc_CT, smnc_BT = _run_and_write('smnc', X_smnc, y_smnc)

#%% ROS: Numerical + categorical
mm_skf_scoresD3, ros_all, ros_CT, ros_BT = _run_and_write('ros', X_ros, y_ros)

#%% RUS: Numerical + categorical
mm_skf_scoresD4, rus_all, rus_CT, rus_BT = _run_and_write('rus', X_rus, y_rus)

#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8, rouC_all, rouC_CT, rouC_BT = _run_and_write('rouC', X_rouC, y_rouC)
|
|
@ -1,210 +0,0 @@
|
|||
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022

@author: tanu

Run the MultModelsCl classifier comparison for one gene/drug pair using the
data prepared by ml_data: once on the original training data and once per
resampled training set. For every run the cross-validation ('test_*') and
blind-test ('bts_*') score tables are sorted by MCC and written to csv.
AAindex features are not reported and not counted in the feature check here.
"""

import os
import sys  # sys.exit() is used below; do not rely on the star-import exporting it

gene = 'embB'
drug = 'ethambutol'
#total_mtblineage_uc = 8

homedir = os.path.expanduser("~")
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')

#---------------------------
# Version 1: no AAindex
#from UQ_ML_data import *
#setvars(gene,drug)
#from UQ_ML_data import *
#---------------------------

# The data module is star-imported, setvars() is called, and the module is
# star-imported again.
# NOTE(review): the second import presumably refreshes the names that
# setvars() populates -- confirm against ml_data before simplifying.
from ml_data import *
setvars(gene,drug)
from ml_data import *

# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML

# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl

############################################################################
print('\n#####################################################################\n'
      , '\nRunning ML analysis: UQ [without AA index but with active site annotations]'
      , '\nGene name:', gene
      , '\nDrug name:', drug)

#==================
# Specify outdir
#==================

outdir_ml = outdir + 'ml/uq_v1/'

print('\nOutput directory:', outdir_ml)

#%%###########################################################################
print('\nSanity checks:'
      , '\nTotal input features:', len(X.columns)
      , '\n'
      , '\nTraining data size:', X.shape
      , '\nTest data size:', X_bts.shape
      , '\n'
      , '\nTarget feature numbers (training data):', Counter(y)
      , '\nTarget features ratio (training data:', yc1_ratio
      , '\n'
      , '\nTarget feature numbers (test data):', Counter(y_bts)
      , '\nTarget features ratio (test data):', yc2_ratio

      , '\n\n#####################################################################\n')

print('\n================================================================\n')

print('Strucutral features (n):'
      , len(X_ssFN)
      , '\nThese are:'
      , '\nCommon stablity features:', X_stabilityN
      , '\nFoldX columns:', X_foldX_cols
      , '\nOther struc columns:', X_str
      , '\n================================================================\n')

# AAindex features are deliberately not printed in this version.

print('Evolutionary features (n):'
      , len(X_evolFN)
      , '\nThese are:\n'
      , X_evolFN
      , '\n================================================================\n')

print('Genomic features (n):'
      , len(X_genomicFN)
      , '\nThese are:\n'
      , X_genomic_mafor, '\n'
      , X_genomic_linegae
      , '\n================================================================\n')

print('Categorical features (n):'
      , len(categorical_FN)
      , '\nThese are:\n'
      , categorical_FN
      , '\n================================================================\n')

# Abort early if the feature groups do not add up to the columns of X.
# X_aaindexFN is left out of the expected count in this version.
n_features_expected = ( len(X_ssFN) + len(X_evolFN)
                        + len(X_genomicFN) + len(categorical_FN) )
if len(X.columns) != n_features_expected:
    sys.exit('\nFail: Count of feature mismatch')
print('\nPass: No. of features match')

print('\n#####################################################################\n')

###############################################################################
def _run_and_write(tag, train_df, train_target):
    """Run MultModelsCl on one training set and write its score tables.

    tag          : label used in the output file names
                   (<gene>_<tag>_CT_allF.csv and <gene>_<tag>_BT_allF.csv)
    train_df     : training features passed as input_df
    train_target : training target passed as target

    The blind-test data is always X_bts / y_bts. Returns a tuple of
    (raw MultModelsCl result, transposed score table, 'test_*' columns sorted
    by test_mcc descending, 'bts_*' columns sorted by bts_mcc descending).
    """
    scoresD = MultModelsCl(input_df = train_df
                           , target = train_target
                           , var_type = 'mixed'
                           , skf_cv = skf_cv
                           # keyword follows the renamed MultModelsCl parameter
                           # (previously blind_test_input_df)
                           , blind_test_df = X_bts
                           , blind_test_target = y_bts)

    scores_all = pd.DataFrame(scoresD).T

    # Chained sort (no inplace) so the filtered frame is never mutated in place.
    scores_CT = scores_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
    scores_BT = scores_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

    # Write csv
    scores_CT.to_csv(outdir_ml + gene.lower() + '_' + tag + '_CT_allF.csv')
    scores_BT.to_csv(outdir_ml + gene.lower() + '_' + tag + '_BT_allF.csv')

    return scoresD, scores_all, scores_CT, scores_BT

#==================
# Baseline models
#==================
mm_skf_scoresD, baseline_all, baseline_CT, baseline_BT = _run_and_write('baseline', X, y)

#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7, smnc_all, smnc_CT, smnc_BT = _run_and_write('smnc', X_smnc, y_smnc)

#%% ROS: Numerical + categorical
mm_skf_scoresD3, ros_all, ros_CT, ros_BT = _run_and_write('ros', X_ros, y_ros)

#%% RUS: Numerical + categorical
mm_skf_scoresD4, rus_all, rus_CT, rus_BT = _run_and_write('rus', X_rus, y_rus)

#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8, rouC_all, rouC_CT, rouC_BT = _run_and_write('rouC', X_rouC, y_rouC)
|
|
@ -1,202 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Sat May 28 05:25:30 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
gene = 'embB'
|
||||
drug = 'ethambutol'
|
||||
#total_mtblineage_uc = 8
|
||||
|
||||
homedir = os.path.expanduser("~")
|
||||
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
|
||||
|
||||
from ml_data_orig import *
|
||||
setvars(gene,drug)
|
||||
from ml_data_orig import *
|
||||
|
||||
# from YC run_all_ML: run locally
|
||||
#from UQ_yc_RunAllClfs import run_all_ML
|
||||
|
||||
# TT run all ML clfs: baseline mode
|
||||
from MultModelsCl import MultModelsCl
|
||||
|
||||
############################################################################
|
||||
print('\n#####################################################################\n'
|
||||
, '\nRunning ML analysis: ORIGINAL'
|
||||
, '\nGene name:', gene
|
||||
, '\nDrug name:', drug)
|
||||
|
||||
#==================
|
||||
# Specify outdir
|
||||
#==================
|
||||
|
||||
outdir_ml = outdir + 'ml/tts_orig/'
|
||||
|
||||
print('\nOutput directory:', outdir_ml)
|
||||
|
||||
#%%###########################################################################
|
||||
print('\nSanity checks:'
|
||||
, '\nTotal input features:', len(X.columns)
|
||||
, '\n'
|
||||
, '\nTraining data size:', X.shape
|
||||
, '\nTest data size:', X_bts.shape
|
||||
, '\n'
|
||||
, '\nTarget feature numbers (training data):', Counter(y)
|
||||
, '\nTarget features ratio (training data:', yc1_ratio
|
||||
, '\n'
|
||||
, '\nTarget feature numbers (test data):', Counter(y_bts)
|
||||
, '\nTarget features ratio (test data):', yc2_ratio
|
||||
|
||||
, '\n\n#####################################################################\n')
|
||||
|
||||
print('\n================================================================\n')
|
||||
|
||||
print('Strucutral features (n):'
|
||||
, len(X_ssFN)
|
||||
, '\nThese are:'
|
||||
, '\nCommon stablity features:', X_stabilityN
|
||||
, '\nFoldX columns:', X_foldX_cols
|
||||
, '\nOther struc columns:', X_str
|
||||
, '\n================================================================\n')
|
||||
|
||||
print('AAindex features (n):'
|
||||
, len(X_aaindexFN)
|
||||
, '\nThese are:\n'
|
||||
, X_aaindexFN
|
||||
, '\n================================================================\n')
|
||||
|
||||
print('Evolutionary features (n):'
|
||||
, len(X_evolFN)
|
||||
, '\nThese are:\n'
|
||||
, X_evolFN
|
||||
, '\n================================================================\n')
|
||||
|
||||
print('Genomic features (n):'
|
||||
, len(X_genomicFN)
|
||||
, '\nThese are:\n'
|
||||
, X_genomic_mafor, '\n'
|
||||
, X_genomic_linegae
|
||||
, '\n================================================================\n')
|
||||
|
||||
print('Categorical features (n):'
|
||||
, len(categorical_FN)
|
||||
, '\nThese are:\n'
|
||||
, categorical_FN
|
||||
, '\n================================================================\n')
|
||||
|
||||
if ( len(X.columns) == len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
|
||||
print('\nPass: No. of features match')
|
||||
else:
|
||||
sys.exit('\nFail: Count of feature mismatch')
|
||||
|
||||
print('\n#####################################################################\n')
|
||||
|
||||
###############################################################################
#==================
# Baseline models
#==================
# Run the MultModelsCl pipeline on the training data as-is (X, y) together with
# the blind test set (X_bts, y_bts).
mm_skf_scoresD = MultModelsCl(input_df = X
                              , target = y
                              , var_type = 'mixed'
                              , skf_cv = skf_cv
                              , blind_test_df = X_bts # keyword renamed from blind_test_input_df in MultModelsCl
                              , blind_test_target = y_bts)

# After the transpose the metric names (test_*, bts_*) are the columns;
# presumably each row is one model -- confirm against MultModelsCl's return value.
baseline_all = pd.DataFrame(mm_skf_scoresD).T
#baseline_train = baseline_all.filter(like='train_', axis=1)

# Sort by reassignment rather than inplace=True: filter() hands back a derived
# frame, and sorting that in place can trigger pandas' SettingWithCopyWarning.
baseline_CT = baseline_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
baseline_BT = baseline_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv')
baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')
|
||||
|
||||
|
||||
#%% SMOTE NC: Oversampling [Numerical + categorical]
# Same pipeline as the baseline run, fed with X_smnc / y_smnc
# (presumably the SMOTE-NC resampled training data -- defined outside this script).
mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc
                               , target = y_smnc
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts # keyword renamed from blind_test_input_df in MultModelsCl
                               , blind_test_target = y_bts)

# Transpose so the metric names (test_*, bts_*) become the columns.
smnc_all = pd.DataFrame(mm_skf_scoresD7).T

# Non-inplace sort: avoids mutating the frame derived by filter() in place.
smnc_CT = smnc_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
smnc_BT = smnc_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv')
smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')
|
||||
|
||||
#%% ROS: Numerical + categorical
# Same pipeline as the baseline run, fed with X_ros / y_ros
# (presumably the randomly oversampled training data -- defined outside this script).
mm_skf_scoresD3 = MultModelsCl(input_df = X_ros
                               , target = y_ros
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts # keyword renamed from blind_test_input_df in MultModelsCl
                               , blind_test_target = y_bts)

# Transpose so the metric names (test_*, bts_*) become the columns.
ros_all = pd.DataFrame(mm_skf_scoresD3).T

# Non-inplace sort: avoids mutating the frame derived by filter() in place.
ros_CT = ros_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
ros_BT = ros_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv')
ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')
|
||||
|
||||
#%% RUS: Numerical + categorical
# Same pipeline as the baseline run, fed with X_rus / y_rus
# (presumably the randomly undersampled training data -- defined outside this script).
mm_skf_scoresD4 = MultModelsCl(input_df = X_rus
                               , target = y_rus
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts # keyword renamed from blind_test_input_df in MultModelsCl
                               , blind_test_target = y_bts)

# Transpose so the metric names (test_*, bts_*) become the columns.
rus_all = pd.DataFrame(mm_skf_scoresD4).T

# Non-inplace sort: avoids mutating the frame derived by filter() in place.
rus_CT = rus_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
rus_BT = rus_all.filter(like='bts_' , axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv')
rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')
|
||||
|
||||
#%% ROS + RUS Combined: Numerical + categorical
# Same pipeline as the baseline run, fed with X_rouC / y_rouC
# (presumably the combined over/undersampled training data -- defined outside this script).
mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC
                               , target = y_rouC
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts # keyword renamed from blind_test_input_df in MultModelsCl
                               , blind_test_target = y_bts)

# Transpose so the metric names (test_*, bts_*) become the columns.
rouC_all = pd.DataFrame(mm_skf_scoresD8).T

# Non-inplace sort: avoids mutating the frame derived by filter() in place.
rouC_CT = rouC_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
rouC_BT = rouC_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv')
rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')
|
|
@ -1,202 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Sat May 28 05:25:30 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
# Gene/drug pair analysed by this script.
gene = 'embB'
drug = 'ethambutol'
#total_mtblineage_uc = 8

# Switch to the ML scripts directory before importing the project modules
# (presumably so the imports below resolve from the working directory).
homedir = os.path.expanduser("~")
os.chdir(f'{homedir}/git/LSHTM_analysis/scripts/ml/')

# NOTE(review): the star import is repeated after setvars(); presumably the
# second import re-binds names that setvars() populates inside ml_data_rt --
# confirm before removing either line.
from ml_data_rt import *
setvars(gene,drug)
from ml_data_rt import *

# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML

# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl

############################################################################
# Banner identifying this run.
print(
    '\n#####################################################################\n',
    '\nRunning ML analysis: REVERSE training\n',
    '\nGene name:', gene,
    '\nDrug name:', drug,
)

#==================
# Specify outdir
#==================
# `outdir` is not defined in this script; it is expected to come from the star import.
outdir_ml = outdir + 'ml/tts_rt/'
print('\nOutput directory:', outdir_ml)
|
||||
|
||||
#%%###########################################################################
# Sanity checks: report data sizes and class balance, list the members of each
# feature group, and abort if the group sizes do not add up to X's column count.
import sys  # explicit import: sys.exit() below must not depend on a star import leaking `sys`

print(
    '\nSanity checks:',
    '\nTotal input features:', len(X.columns),
    '\n',
    '\nTraining data size:', X.shape,
    '\nTest data size:', X_bts.shape,
    '\n',
    '\nTarget feature numbers (training data):', Counter(y),
    '\nTarget features ratio (training data:', yc1_ratio,
    '\n',
    '\nTarget feature numbers (test data):', Counter(y_bts),
    '\nTarget features ratio (test data):', yc2_ratio,
    '\n\n#####################################################################\n',
)

print('\n================================================================\n')

print(
    'Strucutral features (n):',
    len(X_ssFN),
    '\nThese are:',
    '\nCommon stablity features:', X_stabilityN,
    '\nFoldX columns:', X_foldX_cols,
    '\nOther struc columns:', X_str,
    '\n================================================================\n',
)

print(
    'AAindex features (n):',
    len(X_aaindexFN),
    '\nThese are:\n',
    X_aaindexFN,
    '\n================================================================\n',
)

print(
    'Evolutionary features (n):',
    len(X_evolFN),
    '\nThese are:\n',
    X_evolFN,
    '\n================================================================\n',
)

print(
    'Genomic features (n):',
    len(X_genomicFN),
    '\nThese are:\n',
    X_genomic_mafor, '\n',
    X_genomic_linegae,
    '\n================================================================\n',
)

print(
    'Categorical features (n):',
    len(categorical_FN),
    '\nThese are:\n',
    categorical_FN,
    '\n================================================================\n',
)

# Guard clause: every column of X must belong to exactly one of the groups above.
_n_grouped = sum(len(_grp) for _grp in (X_ssFN, X_aaindexFN, X_evolFN, X_genomicFN, categorical_FN))
if len(X.columns) != _n_grouped:
    sys.exit('\nFail: Count of feature mismatch')
print('\nPass: No. of features match')

print('\n#####################################################################\n')
|
||||
|
||||
###############################################################################
#==================
# Baseline models
#==================
# Run the MultModelsCl pipeline on the training data as-is (X, y) together with
# the blind test set (X_bts, y_bts).
mm_skf_scoresD = MultModelsCl(input_df = X
                              , target = y
                              , var_type = 'mixed'
                              , skf_cv = skf_cv
                              , blind_test_df = X_bts # keyword renamed from blind_test_input_df in MultModelsCl
                              , blind_test_target = y_bts)

# After the transpose the metric names (test_*, bts_*) are the columns;
# presumably each row is one model -- confirm against MultModelsCl's return value.
baseline_all = pd.DataFrame(mm_skf_scoresD).T
#baseline_train = baseline_all.filter(like='train_', axis=1)

# Sort by reassignment rather than inplace=True: filter() hands back a derived
# frame, and sorting that in place can trigger pandas' SettingWithCopyWarning.
baseline_CT = baseline_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
baseline_BT = baseline_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv')
baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')
|
||||
|
||||
|
||||
#%% SMOTE NC: Oversampling [Numerical + categorical]
# Same pipeline as the baseline run, fed with X_smnc / y_smnc
# (presumably the SMOTE-NC resampled training data -- defined outside this script).
mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc
                               , target = y_smnc
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts # keyword renamed from blind_test_input_df in MultModelsCl
                               , blind_test_target = y_bts)

# Transpose so the metric names (test_*, bts_*) become the columns.
smnc_all = pd.DataFrame(mm_skf_scoresD7).T

# Non-inplace sort: avoids mutating the frame derived by filter() in place.
smnc_CT = smnc_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
smnc_BT = smnc_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv')
smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')
|
||||
|
||||
#%% ROS: Numerical + categorical
# Same pipeline as the baseline run, fed with X_ros / y_ros
# (presumably the randomly oversampled training data -- defined outside this script).
mm_skf_scoresD3 = MultModelsCl(input_df = X_ros
                               , target = y_ros
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts # keyword renamed from blind_test_input_df in MultModelsCl
                               , blind_test_target = y_bts)

# Transpose so the metric names (test_*, bts_*) become the columns.
ros_all = pd.DataFrame(mm_skf_scoresD3).T

# Non-inplace sort: avoids mutating the frame derived by filter() in place.
ros_CT = ros_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
ros_BT = ros_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv')
ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')
|
||||
|
||||
#%% RUS: Numerical + categorical
# Same pipeline as the baseline run, fed with X_rus / y_rus
# (presumably the randomly undersampled training data -- defined outside this script).
mm_skf_scoresD4 = MultModelsCl(input_df = X_rus
                               , target = y_rus
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts # keyword renamed from blind_test_input_df in MultModelsCl
                               , blind_test_target = y_bts)

# Transpose so the metric names (test_*, bts_*) become the columns.
rus_all = pd.DataFrame(mm_skf_scoresD4).T

# Non-inplace sort: avoids mutating the frame derived by filter() in place.
rus_CT = rus_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
rus_BT = rus_all.filter(like='bts_' , axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv')
rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')
|
||||
|
||||
#%% ROS + RUS Combined: Numerical + categorical
# Same pipeline as the baseline run, fed with X_rouC / y_rouC
# (presumably the combined over/undersampled training data -- defined outside this script).
mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC
                               , target = y_rouC
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts # keyword renamed from blind_test_input_df in MultModelsCl
                               , blind_test_target = y_bts)

# Transpose so the metric names (test_*, bts_*) become the columns.
rouC_all = pd.DataFrame(mm_skf_scoresD8).T

# Non-inplace sort: avoids mutating the frame derived by filter() in place.
rouC_CT = rouC_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
rouC_BT = rouC_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv')
rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')
|
|
@ -1,203 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Sat May 28 05:25:30 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
# Gene/drug pair analysed by this script.
gene = 'embB'
drug = 'ethambutol'
#total_mtblineage_uc = 8

# Switch to the ML scripts directory before importing the project modules
# (presumably so the imports below resolve from the working directory).
homedir = os.path.expanduser("~")
os.chdir(f'{homedir}/git/LSHTM_analysis/scripts/ml/')

# NOTE(review): the star import is repeated after setvars(); presumably the
# second import re-binds names that setvars() populates inside ml_data_sl --
# confirm before removing either line.
from ml_data_sl import *
setvars(gene,drug)
from ml_data_sl import *

# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML

# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl

################################################################################
# Banner identifying this run.
print(
    '\n#####################################################################\n',
    '\nRunning ML analysis: scaling law split',
    '\nGene name:', gene,
    '\nDrug name:', drug,
)

#==================
# Specify outdir
#==================
# `outdir` is not defined in this script; it is expected to come from the star import.
outdir_ml = outdir + 'ml/tts_sl/'
print('\nOutput directory:', outdir_ml)
|
||||
|
||||
#%%###########################################################################
# Sanity checks: report data sizes and class balance, list the members of each
# feature group, and abort if the group sizes do not add up to X's column count.
import sys  # explicit import: sys.exit() below must not depend on a star import leaking `sys`

print(
    '\nSanity checks:',
    #'\nML source data size:', x_features.shape,
    '\nTotal input features:', len(X.columns),
    '\n',
    '\nTraining data size:', X.shape,
    '\nTest data size:', X_bts.shape,
    '\n',
    '\nTarget feature numbers (training data):', Counter(y),
    '\nTarget features ratio (training data:', yc1_ratio,
    '\n',
    '\nTarget feature numbers (test data):', Counter(y_bts),
    '\nTarget features ratio (test data):', yc2_ratio,
    '\n\n#####################################################################\n',
)

print('\n================================================================\n')

print(
    'Strucutral features (n):',
    len(X_ssFN),
    '\nThese are:',
    '\nCommon stablity features:', X_stabilityN,
    '\nFoldX columns:', X_foldX_cols,
    '\nOther struc columns:', X_str,
    '\n================================================================\n',
)

print(
    'AAindex features (n):',
    len(X_aaindexFN),
    '\nThese are:\n',
    X_aaindexFN,
    '\n================================================================\n',
)

print(
    'Evolutionary features (n):',
    len(X_evolFN),
    '\nThese are:\n',
    X_evolFN,
    '\n================================================================\n',
)

print(
    'Genomic features (n):',
    len(X_genomicFN),
    '\nThese are:\n',
    X_genomic_mafor, '\n',
    X_genomic_linegae,
    '\n================================================================\n',
)

print(
    'Categorical features (n):',
    len(categorical_FN),
    '\nThese are:\n',
    categorical_FN,
    '\n================================================================\n',
)

# Guard clause: every column of X must belong to exactly one of the groups above.
_n_grouped = sum(len(_grp) for _grp in (X_ssFN, X_aaindexFN, X_evolFN, X_genomicFN, categorical_FN))
if len(X.columns) != _n_grouped:
    sys.exit('\nFail: Count of feature mismatch')
print('\nPass: No. of features match')

print('\n#####################################################################\n')
|
||||
|
||||
################################################################################
#==================
# Baseline models
#==================
# Run the MultModelsCl pipeline on the training data as-is (X, y) together with
# the blind test set (X_bts, y_bts).
mm_skf_scoresD = MultModelsCl(input_df = X
                              , target = y
                              , var_type = 'mixed'
                              , skf_cv = skf_cv
                              , blind_test_df = X_bts # keyword renamed from blind_test_input_df in MultModelsCl
                              , blind_test_target = y_bts)

# After the transpose the metric names (test_*, bts_*) are the columns;
# presumably each row is one model -- confirm against MultModelsCl's return value.
baseline_all = pd.DataFrame(mm_skf_scoresD).T
#baseline_train = baseline_all.filter(like='train_', axis=1)

# Sort by reassignment rather than inplace=True: filter() hands back a derived
# frame, and sorting that in place can trigger pandas' SettingWithCopyWarning.
baseline_CT = baseline_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
baseline_BT = baseline_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv')
baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')
|
||||
|
||||
|
||||
#%% SMOTE NC: Oversampling [Numerical + categorical]
# Same pipeline as the baseline run, fed with X_smnc / y_smnc
# (presumably the SMOTE-NC resampled training data -- defined outside this script).
mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc
                               , target = y_smnc
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts # keyword renamed from blind_test_input_df in MultModelsCl
                               , blind_test_target = y_bts)

# Transpose so the metric names (test_*, bts_*) become the columns.
smnc_all = pd.DataFrame(mm_skf_scoresD7).T

# Non-inplace sort: avoids mutating the frame derived by filter() in place.
smnc_CT = smnc_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
smnc_BT = smnc_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv')
smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')
|
||||
|
||||
#%% ROS: Numerical + categorical
# Same pipeline as the baseline run, fed with X_ros / y_ros
# (presumably the randomly oversampled training data -- defined outside this script).
mm_skf_scoresD3 = MultModelsCl(input_df = X_ros
                               , target = y_ros
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts # keyword renamed from blind_test_input_df in MultModelsCl
                               , blind_test_target = y_bts)

# Transpose so the metric names (test_*, bts_*) become the columns.
ros_all = pd.DataFrame(mm_skf_scoresD3).T

# Non-inplace sort: avoids mutating the frame derived by filter() in place.
ros_CT = ros_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
ros_BT = ros_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv')
ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')
|
||||
|
||||
#%% RUS: Numerical + categorical
# Same pipeline as the baseline run, fed with X_rus / y_rus
# (presumably the randomly undersampled training data -- defined outside this script).
mm_skf_scoresD4 = MultModelsCl(input_df = X_rus
                               , target = y_rus
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts # keyword renamed from blind_test_input_df in MultModelsCl
                               , blind_test_target = y_bts)

# Transpose so the metric names (test_*, bts_*) become the columns.
rus_all = pd.DataFrame(mm_skf_scoresD4).T

# Non-inplace sort: avoids mutating the frame derived by filter() in place.
rus_CT = rus_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
rus_BT = rus_all.filter(like='bts_' , axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv')
rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')
|
||||
|
||||
#%% ROS + RUS Combined: Numerical + categorical
# Same pipeline as the baseline run, fed with X_rouC / y_rouC
# (presumably the combined over/undersampled training data -- defined outside this script).
mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC
                               , target = y_rouC
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts # keyword renamed from blind_test_input_df in MultModelsCl
                               , blind_test_target = y_bts)

# Transpose so the metric names (test_*, bts_*) become the columns.
rouC_all = pd.DataFrame(mm_skf_scoresD8).T

# Non-inplace sort: avoids mutating the frame derived by filter() in place.
rouC_CT = rouC_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
rouC_BT = rouC_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv')
rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')
|
|
@ -1,203 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Sat May 28 05:25:30 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
# Gene/drug pair analysed by this script.
gene = 'gid'
drug = 'streptomycin'
#total_mtblineage_uc = 8

# Switch to the ML scripts directory before importing the project modules
# (presumably so the imports below resolve from the working directory).
homedir = os.path.expanduser("~")
os.chdir(f'{homedir}/git/LSHTM_analysis/scripts/ml/')

# NOTE(review): the star import is repeated after setvars(); presumably the
# second import re-binds names that setvars() populates inside ml_data_7030 --
# confirm before removing either line.
from ml_data_7030 import *
setvars(gene,drug)
from ml_data_7030 import *

# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML

# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl

################################################################################
# Banner identifying this run.
print(
    '\n#####################################################################\n',
    '\nRunning ML analysis: 70/30 split',
    '\nGene name:', gene,
    '\nDrug name:', drug,
)

#==================
# Specify outdir
#==================
# `outdir` is not defined in this script; it is expected to come from the star import.
outdir_ml = outdir + 'ml/tts_7030/'
print('\nOutput directory:', outdir_ml)
|
||||
|
||||
#%%###########################################################################
# Sanity checks: report data sizes and class balance, list the members of each
# feature group, and abort if the group sizes do not add up to X's column count.
import sys  # explicit import: sys.exit() below must not depend on a star import leaking `sys`

print(
    '\nSanity checks:',
    #'\nML source data size:', x_features.shape,
    '\nTotal input features:', len(X.columns),
    '\n',
    '\nTraining data size:', X.shape,
    '\nTest data size:', X_bts.shape,
    '\n',
    '\nTarget feature numbers (training data):', Counter(y),
    '\nTarget features ratio (training data:', yc1_ratio,
    '\n',
    '\nTarget feature numbers (test data):', Counter(y_bts),
    '\nTarget features ratio (test data):', yc2_ratio,
    '\n\n#####################################################################\n',
)

print('\n================================================================\n')

print(
    'Strucutral features (n):',
    len(X_ssFN),
    '\nThese are:',
    '\nCommon stablity features:', X_stabilityN,
    '\nFoldX columns:', X_foldX_cols,
    '\nOther struc columns:', X_str,
    '\n================================================================\n',
)

print(
    'AAindex features (n):',
    len(X_aaindexFN),
    '\nThese are:\n',
    X_aaindexFN,
    '\n================================================================\n',
)

print(
    'Evolutionary features (n):',
    len(X_evolFN),
    '\nThese are:\n',
    X_evolFN,
    '\n================================================================\n',
)

print(
    'Genomic features (n):',
    len(X_genomicFN),
    '\nThese are:\n',
    X_genomic_mafor, '\n',
    X_genomic_linegae,
    '\n================================================================\n',
)

print(
    'Categorical features (n):',
    len(categorical_FN),
    '\nThese are:\n',
    categorical_FN,
    '\n================================================================\n',
)

# Guard clause: every column of X must belong to exactly one of the groups above.
_n_grouped = sum(len(_grp) for _grp in (X_ssFN, X_aaindexFN, X_evolFN, X_genomicFN, categorical_FN))
if len(X.columns) != _n_grouped:
    sys.exit('\nFail: Count of feature mismatch')
print('\nPass: No. of features match')

print('\n#####################################################################\n')
|
||||
|
||||
################################################################################
#==================
# Baseline models
#==================
# Run the MultModelsCl pipeline on the training data as-is (X, y) together with
# the blind test set (X_bts, y_bts).
mm_skf_scoresD = MultModelsCl(input_df = X
                              , target = y
                              , var_type = 'mixed'
                              , skf_cv = skf_cv
                              , blind_test_df = X_bts # keyword renamed from blind_test_input_df in MultModelsCl
                              , blind_test_target = y_bts)

# After the transpose the metric names (test_*, bts_*) are the columns;
# presumably each row is one model -- confirm against MultModelsCl's return value.
baseline_all = pd.DataFrame(mm_skf_scoresD).T
#baseline_train = baseline_all.filter(like='train_', axis=1)

# Sort by reassignment rather than inplace=True: filter() hands back a derived
# frame, and sorting that in place can trigger pandas' SettingWithCopyWarning.
baseline_CT = baseline_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
baseline_BT = baseline_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv')
baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')
|
||||
|
||||
|
||||
#%% SMOTE NC: Oversampling [Numerical + categorical]
# Same pipeline as the baseline run, fed with X_smnc / y_smnc
# (presumably the SMOTE-NC resampled training data -- defined outside this script).
mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc
                               , target = y_smnc
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts # keyword renamed from blind_test_input_df in MultModelsCl
                               , blind_test_target = y_bts)

# Transpose so the metric names (test_*, bts_*) become the columns.
smnc_all = pd.DataFrame(mm_skf_scoresD7).T

# Non-inplace sort: avoids mutating the frame derived by filter() in place.
smnc_CT = smnc_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
smnc_BT = smnc_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv')
smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')
|
||||
|
||||
#%% ROS: Numerical + categorical
# Same pipeline as the baseline run, fed with X_ros / y_ros
# (presumably the randomly oversampled training data -- defined outside this script).
mm_skf_scoresD3 = MultModelsCl(input_df = X_ros
                               , target = y_ros
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts # keyword renamed from blind_test_input_df in MultModelsCl
                               , blind_test_target = y_bts)

# Transpose so the metric names (test_*, bts_*) become the columns.
ros_all = pd.DataFrame(mm_skf_scoresD3).T

# Non-inplace sort: avoids mutating the frame derived by filter() in place.
ros_CT = ros_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
ros_BT = ros_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv')
ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')
|
||||
|
||||
#%% RUS: Numerical + categorical
# Same pipeline as the baseline run, fed with X_rus / y_rus
# (presumably the randomly undersampled training data -- defined outside this script).
mm_skf_scoresD4 = MultModelsCl(input_df = X_rus
                               , target = y_rus
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts # keyword renamed from blind_test_input_df in MultModelsCl
                               , blind_test_target = y_bts)

# Transpose so the metric names (test_*, bts_*) become the columns.
rus_all = pd.DataFrame(mm_skf_scoresD4).T

# Non-inplace sort: avoids mutating the frame derived by filter() in place.
rus_CT = rus_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
rus_BT = rus_all.filter(like='bts_' , axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv')
rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')
|
||||
|
||||
#%% ROS + RUS Combined: Numerical + categorical
# Same pipeline as the baseline run, fed with X_rouC / y_rouC
# (presumably the combined over/undersampled training data -- defined outside this script).
mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC
                               , target = y_rouC
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts # keyword renamed from blind_test_input_df in MultModelsCl
                               , blind_test_target = y_bts)

# Transpose so the metric names (test_*, bts_*) become the columns.
rouC_all = pd.DataFrame(mm_skf_scoresD8).T

# Non-inplace sort: avoids mutating the frame derived by filter() in place.
rouC_CT = rouC_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
rouC_BT = rouC_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv')
rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')
|
|
@ -1,203 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Sat May 28 05:25:30 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
# Gene/drug pair analysed by this script.
gene = 'gid'
drug = 'streptomycin'
#total_mtblineage_uc = 8

# Switch to the ML scripts directory before importing the project modules
# (presumably so the imports below resolve from the working directory).
homedir = os.path.expanduser("~")
os.chdir(f'{homedir}/git/LSHTM_analysis/scripts/ml/')

# NOTE(review): the star import is repeated after setvars(); presumably the
# second import re-binds names that setvars() populates inside ml_data_8020 --
# confirm before removing either line.
from ml_data_8020 import *
setvars(gene,drug)
from ml_data_8020 import *

# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML

# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl

################################################################################
# Banner identifying this run.
print(
    '\n#####################################################################\n',
    '\nRunning ML analysis: 80/20 split',
    '\nGene name:', gene,
    '\nDrug name:', drug,
)

#==================
# Specify outdir
#==================
# `outdir` is not defined in this script; it is expected to come from the star import.
outdir_ml = outdir + 'ml/tts_8020/'
print('\nOutput directory:', outdir_ml)
|
||||
|
||||
#%%###########################################################################
# Sanity checks: report data sizes and class balance, list the members of each
# feature group, and abort if the group sizes do not add up to X's column count.
import sys  # explicit import: sys.exit() below must not depend on a star import leaking `sys`

print(
    '\nSanity checks:',
    #'\nML source data size:', x_features.shape,
    '\nTotal input features:', len(X.columns),
    '\n',
    '\nTraining data size:', X.shape,
    '\nTest data size:', X_bts.shape,
    '\n',
    '\nTarget feature numbers (training data):', Counter(y),
    '\nTarget features ratio (training data:', yc1_ratio,
    '\n',
    '\nTarget feature numbers (test data):', Counter(y_bts),
    '\nTarget features ratio (test data):', yc2_ratio,
    '\n\n#####################################################################\n',
)

print('\n================================================================\n')

print(
    'Strucutral features (n):',
    len(X_ssFN),
    '\nThese are:',
    '\nCommon stablity features:', X_stabilityN,
    '\nFoldX columns:', X_foldX_cols,
    '\nOther struc columns:', X_str,
    '\n================================================================\n',
)

print(
    'AAindex features (n):',
    len(X_aaindexFN),
    '\nThese are:\n',
    X_aaindexFN,
    '\n================================================================\n',
)

print(
    'Evolutionary features (n):',
    len(X_evolFN),
    '\nThese are:\n',
    X_evolFN,
    '\n================================================================\n',
)

print(
    'Genomic features (n):',
    len(X_genomicFN),
    '\nThese are:\n',
    X_genomic_mafor, '\n',
    X_genomic_linegae,
    '\n================================================================\n',
)

print(
    'Categorical features (n):',
    len(categorical_FN),
    '\nThese are:\n',
    categorical_FN,
    '\n================================================================\n',
)

# Guard clause: every column of X must belong to exactly one of the groups above.
_n_grouped = sum(len(_grp) for _grp in (X_ssFN, X_aaindexFN, X_evolFN, X_genomicFN, categorical_FN))
if len(X.columns) != _n_grouped:
    sys.exit('\nFail: Count of feature mismatch')
print('\nPass: No. of features match')

print('\n#####################################################################\n')
|
||||
|
||||
################################################################################
#==================
# Baseline models
#==================
# Run the MultModelsCl pipeline on the training data as-is (X, y) together with
# the blind test set (X_bts, y_bts).
mm_skf_scoresD = MultModelsCl(input_df = X
                              , target = y
                              , var_type = 'mixed'
                              , skf_cv = skf_cv
                              , blind_test_df = X_bts # keyword renamed from blind_test_input_df in MultModelsCl
                              , blind_test_target = y_bts)

# After the transpose the metric names (test_*, bts_*) are the columns;
# presumably each row is one model -- confirm against MultModelsCl's return value.
baseline_all = pd.DataFrame(mm_skf_scoresD).T
#baseline_train = baseline_all.filter(like='train_', axis=1)

# Sort by reassignment rather than inplace=True: filter() hands back a derived
# frame, and sorting that in place can trigger pandas' SettingWithCopyWarning.
baseline_CT = baseline_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
baseline_BT = baseline_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv')
baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')
|
||||
|
||||
|
||||
#%% SMOTE NC: Oversampling [Numerical + categorical]
# Same classifier panel, fitted on the SMOTE-NC resampled training data; the
# blind-test data stays the untouched X_bts / y_bts.
# `blind_test_df` is the current MultModelsCl name for this parameter
# (formerly `blind_test_input_df`).
mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc
                               , target = y_smnc
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
smnc_all = pd.DataFrame(mm_skf_scoresD7)
smnc_all = smnc_all.T

# 'test_*' and 'bts_*' score columns, each ranked by MCC (best first).
# Sorting by reassignment avoids mutating a derived frame in place.
smnc_CT = smnc_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
smnc_BT = smnc_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv')
smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')
|
||||
|
||||
#%% ROS: Numerical + categorical
# Same classifier panel, fitted on the ROS resampled training data; the
# blind-test data stays the untouched X_bts / y_bts.
# `blind_test_df` is the current MultModelsCl name for this parameter
# (formerly `blind_test_input_df`).
mm_skf_scoresD3 = MultModelsCl(input_df = X_ros
                               , target = y_ros
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
ros_all = pd.DataFrame(mm_skf_scoresD3)
ros_all = ros_all.T

# 'test_*' and 'bts_*' score columns, each ranked by MCC (best first).
# Sorting by reassignment avoids mutating a derived frame in place.
ros_CT = ros_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
ros_BT = ros_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv')
ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')
|
||||
|
||||
#%% RUS: Numerical + categorical
# Same classifier panel, fitted on the RUS resampled training data; the
# blind-test data stays the untouched X_bts / y_bts.
# `blind_test_df` is the current MultModelsCl name for this parameter
# (formerly `blind_test_input_df`).
mm_skf_scoresD4 = MultModelsCl(input_df = X_rus
                               , target = y_rus
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
rus_all = pd.DataFrame(mm_skf_scoresD4)
rus_all = rus_all.T

# 'test_*' and 'bts_*' score columns, each ranked by MCC (best first).
# Sorting by reassignment avoids mutating a derived frame in place.
rus_CT = rus_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
rus_BT = rus_all.filter(like='bts_' , axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv')
rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')
|
||||
|
||||
#%% ROS + RUS Combined: Numerical + categorical
# Same classifier panel, fitted on the combined ROS + RUS resampled training
# data; the blind-test data stays the untouched X_bts / y_bts.
# `blind_test_df` is the current MultModelsCl name for this parameter
# (formerly `blind_test_input_df`).
mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC
                               , target = y_rouC
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
rouC_all = pd.DataFrame(mm_skf_scoresD8)
rouC_all = rouC_all.T

# 'test_*' and 'bts_*' score columns, each ranked by MCC (best first).
# Sorting by reassignment avoids mutating a derived frame in place.
rouC_CT = rouC_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
rouC_BT = rouC_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv')
rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')
|
|
@ -1,203 +0,0 @@
|
|||
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022

@author: tanu

Baseline ML run on the complete data, 70/30 split (data module
ml_data_cd_7030).

Fits the MultModelsCl classifier panel on the original training data and on
four resampled variants (SMOTE NC, ROS, RUS, ROS + RUS combined). For each
run the 'test_*' and 'bts_*' score columns are ranked by MCC and written to
CSV files under <outdir>/ml/tts_cd_7030/.
"""

import os
# Imported explicitly instead of relying on the star imports below to bring
# these names into scope.
from collections import Counter

import pandas as pd

gene = 'gid'
drug = 'streptomycin'
#total_mtblineage_uc = 8

homedir = os.path.expanduser("~")
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')

# NOTE(review): the data module is star-imported before and after setvars();
# presumably the first import exposes setvars() and the second picks up the
# names it populates (X, y, X_bts, ...) -- confirm against ml_data_cd_7030.
from ml_data_cd_7030 import *
setvars(gene,drug)
from ml_data_cd_7030 import *

# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML

# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl

################################################################################
print('\n#####################################################################\n'
      , '\nRunning ML analysis [COMPLETE DATA]: 70/30 split'
      , '\nGene name:', gene
      , '\nDrug name:', drug)

#==================
# Specify outdir
#==================

outdir_ml = outdir + 'ml/tts_cd_7030/'
# Create the directory up front so the CSV writes at the end of each run
# cannot fail on a missing path after the models have been fitted.
os.makedirs(outdir_ml, exist_ok = True)

print('\nOutput directory:', outdir_ml)

#%%###########################################################################
print('\nSanity checks:'
      #, '\nML source data size:', x_features.shape
      , '\nTotal input features:', len(X.columns)
      , '\n'
      , '\nTraining data size:', X.shape
      , '\nTest data size:', X_bts.shape
      , '\n'
      , '\nTarget feature numbers (training data):', Counter(y)
      , '\nTarget features ratio (training data:', yc1_ratio
      , '\n'
      , '\nTarget feature numbers (test data):', Counter(y_bts)
      , '\nTarget features ratio (test data):', yc2_ratio

      , '\n\n#####################################################################\n')

print('\n================================================================\n')

print('Strucutral features (n):'
      , len(X_ssFN)
      , '\nThese are:'
      , '\nCommon stablity features:', X_stabilityN
      , '\nFoldX columns:', X_foldX_cols
      , '\nOther struc columns:', X_str
      , '\n================================================================\n')

print('AAindex features (n):'
      , len(X_aaindexFN)
      , '\nThese are:\n'
      , X_aaindexFN
      , '\n================================================================\n')

print('Evolutionary features (n):'
      , len(X_evolFN)
      , '\nThese are:\n'
      , X_evolFN
      , '\n================================================================\n')

print('Genomic features (n):'
      , len(X_genomicFN)
      , '\nThese are:\n'
      , X_genomic_mafor, '\n'
      , X_genomic_linegae
      , '\n================================================================\n')

print('Categorical features (n):'
      , len(categorical_FN)
      , '\nThese are:\n'
      , categorical_FN
      , '\n================================================================\n')

# Abort before any model is fitted if the per-group counts do not add up to
# the number of columns in X.
if ( len(X.columns) == len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
    print('\nPass: No. of features match')
else:
    # Same effect as sys.exit(msg), without needing `sys` in scope.
    raise SystemExit('\nFail: Count of feature mismatch')

print('\n#####################################################################\n')

################################################################################
#==================
# Baseline models
#==================
# In every run below the blind-test frame is passed as `blind_test_df`, the
# name this parameter now has in the MultModelsCl signature (formerly
# `blind_test_input_df`). Score tables are transposed and then split into
# 'test_*' and 'bts_*' columns, each ranked by MCC (best first); sorting by
# reassignment avoids mutating a derived frame in place.
mm_skf_scoresD = MultModelsCl(input_df = X
                              , target = y
                              , var_type = 'mixed'
                              , skf_cv = skf_cv
                              , blind_test_df = X_bts
                              , blind_test_target = y_bts)

baseline_all = pd.DataFrame(mm_skf_scoresD)
baseline_all = baseline_all.T
#baseline_train = baseline_all.filter(like='train_', axis=1)
baseline_CT = baseline_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
baseline_BT = baseline_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv')
baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')


#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc
                               , target = y_smnc
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
smnc_all = pd.DataFrame(mm_skf_scoresD7)
smnc_all = smnc_all.T

smnc_CT = smnc_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
smnc_BT = smnc_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv')
smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')

#%% ROS: Numerical + categorical
mm_skf_scoresD3 = MultModelsCl(input_df = X_ros
                               , target = y_ros
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
ros_all = pd.DataFrame(mm_skf_scoresD3)
ros_all = ros_all.T

ros_CT = ros_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
ros_BT = ros_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv')
ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')

#%% RUS: Numerical + categorical
mm_skf_scoresD4 = MultModelsCl(input_df = X_rus
                               , target = y_rus
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
rus_all = pd.DataFrame(mm_skf_scoresD4)
rus_all = rus_all.T

rus_CT = rus_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
rus_BT = rus_all.filter(like='bts_' , axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv')
rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')

#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC
                               , target = y_rouC
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
rouC_all = pd.DataFrame(mm_skf_scoresD8)
rouC_all = rouC_all.T

rouC_CT = rouC_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
rouC_BT = rouC_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv')
rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')
|
|
@ -1,203 +0,0 @@
|
|||
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022

@author: tanu

Baseline ML run on the complete data, 80/20 split (data module
ml_data_cd_8020).

Fits the MultModelsCl classifier panel on the original training data and on
four resampled variants (SMOTE NC, ROS, RUS, ROS + RUS combined). For each
run the 'test_*' and 'bts_*' score columns are ranked by MCC and written to
CSV files under <outdir>/ml/tts_cd_8020/.
"""

import os
# Imported explicitly instead of relying on the star imports below to bring
# these names into scope.
from collections import Counter

import pandas as pd

gene = 'gid'
drug = 'streptomycin'
#total_mtblineage_uc = 8

homedir = os.path.expanduser("~")
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')

# NOTE(review): the data module is star-imported before and after setvars();
# presumably the first import exposes setvars() and the second picks up the
# names it populates (X, y, X_bts, ...) -- confirm against ml_data_cd_8020.
from ml_data_cd_8020 import *
setvars(gene,drug)
from ml_data_cd_8020 import *

# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML

# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl

################################################################################
print('\n#####################################################################\n'
      , '\nRunning ML analysis [COMPLETE DATA]: 80/20 split'
      , '\nGene name:', gene
      , '\nDrug name:', drug)

#==================
# Specify outdir
#==================

outdir_ml = outdir + 'ml/tts_cd_8020/'
# Create the directory up front so the CSV writes at the end of each run
# cannot fail on a missing path after the models have been fitted.
os.makedirs(outdir_ml, exist_ok = True)

print('\nOutput directory:', outdir_ml)

#%%###########################################################################
print('\nSanity checks:'
      #, '\nML source data size:', x_features.shape
      , '\nTotal input features:', len(X.columns)
      , '\n'
      , '\nTraining data size:', X.shape
      , '\nTest data size:', X_bts.shape
      , '\n'
      , '\nTarget feature numbers (training data):', Counter(y)
      , '\nTarget features ratio (training data:', yc1_ratio
      , '\n'
      , '\nTarget feature numbers (test data):', Counter(y_bts)
      , '\nTarget features ratio (test data):', yc2_ratio

      , '\n\n#####################################################################\n')

print('\n================================================================\n')

print('Strucutral features (n):'
      , len(X_ssFN)
      , '\nThese are:'
      , '\nCommon stablity features:', X_stabilityN
      , '\nFoldX columns:', X_foldX_cols
      , '\nOther struc columns:', X_str
      , '\n================================================================\n')

print('AAindex features (n):'
      , len(X_aaindexFN)
      , '\nThese are:\n'
      , X_aaindexFN
      , '\n================================================================\n')

print('Evolutionary features (n):'
      , len(X_evolFN)
      , '\nThese are:\n'
      , X_evolFN
      , '\n================================================================\n')

print('Genomic features (n):'
      , len(X_genomicFN)
      , '\nThese are:\n'
      , X_genomic_mafor, '\n'
      , X_genomic_linegae
      , '\n================================================================\n')

print('Categorical features (n):'
      , len(categorical_FN)
      , '\nThese are:\n'
      , categorical_FN
      , '\n================================================================\n')

# Abort before any model is fitted if the per-group counts do not add up to
# the number of columns in X.
if ( len(X.columns) == len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
    print('\nPass: No. of features match')
else:
    # Same effect as sys.exit(msg), without needing `sys` in scope.
    raise SystemExit('\nFail: Count of feature mismatch')

print('\n#####################################################################\n')

################################################################################
#==================
# Baseline models
#==================
# In every run below the blind-test frame is passed as `blind_test_df`, the
# name this parameter now has in the MultModelsCl signature (formerly
# `blind_test_input_df`). Score tables are transposed and then split into
# 'test_*' and 'bts_*' columns, each ranked by MCC (best first); sorting by
# reassignment avoids mutating a derived frame in place.
mm_skf_scoresD = MultModelsCl(input_df = X
                              , target = y
                              , var_type = 'mixed'
                              , skf_cv = skf_cv
                              , blind_test_df = X_bts
                              , blind_test_target = y_bts)

baseline_all = pd.DataFrame(mm_skf_scoresD)
baseline_all = baseline_all.T
#baseline_train = baseline_all.filter(like='train_', axis=1)
baseline_CT = baseline_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
baseline_BT = baseline_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv')
baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')


#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc
                               , target = y_smnc
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
smnc_all = pd.DataFrame(mm_skf_scoresD7)
smnc_all = smnc_all.T

smnc_CT = smnc_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
smnc_BT = smnc_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv')
smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')

#%% ROS: Numerical + categorical
mm_skf_scoresD3 = MultModelsCl(input_df = X_ros
                               , target = y_ros
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
ros_all = pd.DataFrame(mm_skf_scoresD3)
ros_all = ros_all.T

ros_CT = ros_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
ros_BT = ros_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv')
ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')

#%% RUS: Numerical + categorical
mm_skf_scoresD4 = MultModelsCl(input_df = X_rus
                               , target = y_rus
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
rus_all = pd.DataFrame(mm_skf_scoresD4)
rus_all = rus_all.T

rus_CT = rus_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
rus_BT = rus_all.filter(like='bts_' , axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv')
rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')

#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC
                               , target = y_rouC
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
rouC_all = pd.DataFrame(mm_skf_scoresD8)
rouC_all = rouC_all.T

rouC_CT = rouC_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
rouC_BT = rouC_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv')
rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')
|
|
@ -1,203 +0,0 @@
|
|||
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022

@author: tanu

Baseline ML run on the complete data, using the ml_data_cd_sl data module.

Fits the MultModelsCl classifier panel on the original training data and on
four resampled variants (SMOTE NC, ROS, RUS, ROS + RUS combined). For each
run the 'test_*' and 'bts_*' score columns are ranked by MCC and written to
CSV files under <outdir>/ml/tts_cd_sl/.
"""

import os
# Imported explicitly instead of relying on the star imports below to bring
# these names into scope.
from collections import Counter

import pandas as pd

gene = 'gid'
drug = 'streptomycin'
#total_mtblineage_uc = 8

homedir = os.path.expanduser("~")
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')

# NOTE(review): the data module is star-imported before and after setvars();
# presumably the first import exposes setvars() and the second picks up the
# names it populates (X, y, X_bts, ...) -- confirm against ml_data_cd_sl.
from ml_data_cd_sl import *
setvars(gene,drug)
from ml_data_cd_sl import *

# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML

# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl

################################################################################
# NOTE(review): the banner below says '70/30 split' although this script
# loads ml_data_cd_sl and writes to tts_cd_sl -- confirm the text is intended.
print('\n#####################################################################\n'
      , '\nRunning ML analysis [COMPLETE DATA]: 70/30 split'
      , '\nGene name:', gene
      , '\nDrug name:', drug)

#==================
# Specify outdir
#==================

outdir_ml = outdir + 'ml/tts_cd_sl/'
# Create the directory up front so the CSV writes at the end of each run
# cannot fail on a missing path after the models have been fitted.
os.makedirs(outdir_ml, exist_ok = True)

print('\nOutput directory:', outdir_ml)

#%%###########################################################################
print('\nSanity checks:'
      #, '\nML source data size:', x_features.shape
      , '\nTotal input features:', len(X.columns)
      , '\n'
      , '\nTraining data size:', X.shape
      , '\nTest data size:', X_bts.shape
      , '\n'
      , '\nTarget feature numbers (training data):', Counter(y)
      , '\nTarget features ratio (training data:', yc1_ratio
      , '\n'
      , '\nTarget feature numbers (test data):', Counter(y_bts)
      , '\nTarget features ratio (test data):', yc2_ratio

      , '\n\n#####################################################################\n')

print('\n================================================================\n')

print('Strucutral features (n):'
      , len(X_ssFN)
      , '\nThese are:'
      , '\nCommon stablity features:', X_stabilityN
      , '\nFoldX columns:', X_foldX_cols
      , '\nOther struc columns:', X_str
      , '\n================================================================\n')

print('AAindex features (n):'
      , len(X_aaindexFN)
      , '\nThese are:\n'
      , X_aaindexFN
      , '\n================================================================\n')

print('Evolutionary features (n):'
      , len(X_evolFN)
      , '\nThese are:\n'
      , X_evolFN
      , '\n================================================================\n')

print('Genomic features (n):'
      , len(X_genomicFN)
      , '\nThese are:\n'
      , X_genomic_mafor, '\n'
      , X_genomic_linegae
      , '\n================================================================\n')

print('Categorical features (n):'
      , len(categorical_FN)
      , '\nThese are:\n'
      , categorical_FN
      , '\n================================================================\n')

# Abort before any model is fitted if the per-group counts do not add up to
# the number of columns in X.
if ( len(X.columns) == len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
    print('\nPass: No. of features match')
else:
    # Same effect as sys.exit(msg), without needing `sys` in scope.
    raise SystemExit('\nFail: Count of feature mismatch')

print('\n#####################################################################\n')

################################################################################
#==================
# Baseline models
#==================
# In every run below the blind-test frame is passed as `blind_test_df`, the
# name this parameter now has in the MultModelsCl signature (formerly
# `blind_test_input_df`). Score tables are transposed and then split into
# 'test_*' and 'bts_*' columns, each ranked by MCC (best first); sorting by
# reassignment avoids mutating a derived frame in place.
mm_skf_scoresD = MultModelsCl(input_df = X
                              , target = y
                              , var_type = 'mixed'
                              , skf_cv = skf_cv
                              , blind_test_df = X_bts
                              , blind_test_target = y_bts)

baseline_all = pd.DataFrame(mm_skf_scoresD)
baseline_all = baseline_all.T
#baseline_train = baseline_all.filter(like='train_', axis=1)
baseline_CT = baseline_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
baseline_BT = baseline_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv')
baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')


#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc
                               , target = y_smnc
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
smnc_all = pd.DataFrame(mm_skf_scoresD7)
smnc_all = smnc_all.T

smnc_CT = smnc_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
smnc_BT = smnc_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv')
smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')

#%% ROS: Numerical + categorical
mm_skf_scoresD3 = MultModelsCl(input_df = X_ros
                               , target = y_ros
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
ros_all = pd.DataFrame(mm_skf_scoresD3)
ros_all = ros_all.T

ros_CT = ros_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
ros_BT = ros_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv')
ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')

#%% RUS: Numerical + categorical
mm_skf_scoresD4 = MultModelsCl(input_df = X_rus
                               , target = y_rus
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
rus_all = pd.DataFrame(mm_skf_scoresD4)
rus_all = rus_all.T

rus_CT = rus_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
rus_BT = rus_all.filter(like='bts_' , axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv')
rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')

#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC
                               , target = y_rouC
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)
rouC_all = pd.DataFrame(mm_skf_scoresD8)
rouC_all = rouC_all.T

rouC_CT = rouC_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
rouC_BT = rouC_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv')
rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')
|
|
@ -1,210 +0,0 @@
|
|||
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022

@author: tanu

Setup for the baseline ML run that uses the ml_data data module and writes
its results under <outdir>/ml/uq_v1/.
"""

import os
# Imported explicitly instead of relying on the star imports below to bring
# these names into scope for the rest of the script.
from collections import Counter

import pandas as pd

gene = 'gid'
drug = 'streptomycin'
#total_mtblineage_uc = 8

homedir = os.path.expanduser("~")
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')

#---------------------------
# Version 1: no AAindex
#from UQ_ML_data import *
#setvars(gene,drug)
#from UQ_ML_data import *
#---------------------------

# NOTE(review): the data module is star-imported before and after setvars();
# presumably the first import exposes setvars() and the second picks up the
# names it populates (X, y, X_bts, ...) -- confirm against ml_data.
from ml_data import *
setvars(gene,drug)
from ml_data import *

# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML

# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl

############################################################################
print('\n#####################################################################\n'
      , '\nRunning ML analysis: UQ [without AA index but with active site annotations]'
      , '\nGene name:', gene
      , '\nDrug name:', drug)

#==================
# Specify outdir
#==================

outdir_ml = outdir + 'ml/uq_v1/'
# Create the directory up front so later CSV writes cannot fail on a missing
# path after the models have been fitted.
os.makedirs(outdir_ml, exist_ok = True)

print('\nOutput directory:', outdir_ml)
|
||||
|
||||
#%%###########################################################################
# Print data sizes and class balance, list the input features by group, then
# verify that the groups together account for every column of X.
print('\nSanity checks:'
      , '\nTotal input features:', len(X.columns)
      , '\n'
      , '\nTraining data size:', X.shape
      , '\nTest data size:', X_bts.shape
      , '\n'
      , '\nTarget feature numbers (training data):', Counter(y)
      , '\nTarget features ratio (training data:', yc1_ratio
      , '\n'
      , '\nTarget feature numbers (test data):', Counter(y_bts)
      , '\nTarget features ratio (test data):', yc2_ratio

      , '\n\n#####################################################################\n')

print('\n================================================================\n')

print('Strucutral features (n):'
      , len(X_ssFN)
      , '\nThese are:'
      , '\nCommon stablity features:', X_stabilityN
      , '\nFoldX columns:', X_foldX_cols
      , '\nOther struc columns:', X_str
      , '\n================================================================\n')

# AAindex features are not part of this run, so their report stays disabled.
# print('AAindex features (n):'
#       , len(X_aaindexFN)
#       , '\nThese are:\n'
#       , X_aaindexFN
#       , '\n================================================================\n')

print('Evolutionary features (n):'
      , len(X_evolFN)
      , '\nThese are:\n'
      , X_evolFN
      , '\n================================================================\n')

print('Genomic features (n):'
      , len(X_genomicFN)
      , '\nThese are:\n'
      , X_genomic_mafor, '\n'
      , X_genomic_linegae
      , '\n================================================================\n')

print('Categorical features (n):'
      , len(categorical_FN)
      , '\nThese are:\n'
      , categorical_FN
      , '\n================================================================\n')

# The expected total deliberately leaves out the AAindex group (previous form
# of the check kept for reference).
#if ( len(X.columns) == len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
if ( len(X.columns) == len(X_ssFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
    print('\nPass: No. of features match')
else:
    # Same effect as sys.exit(msg), without depending on `sys` having been
    # brought into scope by a star import.
    raise SystemExit('\nFail: Count of feature mismatch')

print('\n#####################################################################\n')
|
||||
|
||||
###############################################################################
#==================
# Baseline models
#==================
# Fit the classifier panel on the original (unresampled) training data.
# The blind-test frame is passed as `blind_test_df`, the name this parameter
# now has in the MultModelsCl signature (formerly `blind_test_input_df`).
mm_skf_scoresD = MultModelsCl(input_df = X
                              , target = y
                              , var_type = 'mixed'
                              , skf_cv = skf_cv
                              , blind_test_df = X_bts
                              , blind_test_target = y_bts)

# Transpose the score table (presumably one column per model -- TODO confirm)
# so that the score names become the columns filtered below.
baseline_all = pd.DataFrame(mm_skf_scoresD)
baseline_all = baseline_all.T
#baseline_train = baseline_all.filter(like='train_', axis=1)

# Split into 'test_*' and 'bts_*' score columns, each ranked by MCC (best
# first). Sorting by reassignment avoids mutating a derived frame in place.
baseline_CT = baseline_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
baseline_BT = baseline_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv')
baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')
|
||||
|
||||
|
||||
#%% SMOTE NC: Oversampling [Numerical + categorical]
|
||||
mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc
|
||||
, target = y_smnc
|
||||
, var_type = 'mixed'
|
||||
, skf_cv = skf_cv
|
||||
, blind_test_input_df = X_bts
|
||||
, blind_test_target = y_bts)
|
||||
smnc_all = pd.DataFrame(mm_skf_scoresD7)
|
||||
smnc_all = smnc_all.T
|
||||
|
||||
smnc_CT = smnc_all.filter(like='test_', axis=1)
|
||||
smnc_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
|
||||
|
||||
smnc_BT = smnc_all.filter(like='bts_', axis=1)
|
||||
smnc_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
|
||||
|
||||
# Write csv
|
||||
smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv')
|
||||
smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')
|
||||
|
||||
#%% ROS: Numerical + categorical
|
||||
mm_skf_scoresD3 = MultModelsCl(input_df = X_ros
|
||||
, target = y_ros
|
||||
, var_type = 'mixed'
|
||||
, skf_cv = skf_cv
|
||||
, blind_test_input_df = X_bts
|
||||
, blind_test_target = y_bts)
|
||||
ros_all = pd.DataFrame(mm_skf_scoresD3)
|
||||
ros_all = ros_all.T
|
||||
|
||||
ros_CT = ros_all.filter(like='test_', axis=1)
|
||||
ros_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
|
||||
|
||||
ros_BT = ros_all.filter(like='bts_', axis=1)
|
||||
ros_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
|
||||
|
||||
# Write csv
|
||||
ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv')
|
||||
ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')
|
||||
|
||||
#%% RUS: Numerical + categorical
|
||||
mm_skf_scoresD4 = MultModelsCl(input_df = X_rus
|
||||
, target = y_rus
|
||||
, var_type = 'mixed'
|
||||
, skf_cv = skf_cv
|
||||
, blind_test_input_df = X_bts
|
||||
, blind_test_target = y_bts)
|
||||
rus_all = pd.DataFrame(mm_skf_scoresD4)
|
||||
rus_all = rus_all.T
|
||||
|
||||
rus_CT = rus_all.filter(like='test_', axis=1)
|
||||
rus_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
|
||||
|
||||
rus_BT = rus_all.filter(like='bts_' , axis=1)
|
||||
rus_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
|
||||
|
||||
# Write csv
|
||||
rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv')
|
||||
rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')
|
||||
|
||||
#%% ROS + RUS Combined: Numerical + categorical
|
||||
mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC
|
||||
, target = y_rouC
|
||||
, var_type = 'mixed'
|
||||
, skf_cv = skf_cv
|
||||
, blind_test_input_df = X_bts
|
||||
, blind_test_target = y_bts)
|
||||
rouC_all = pd.DataFrame(mm_skf_scoresD8)
|
||||
rouC_all = rouC_all.T
|
||||
|
||||
rouC_CT = rouC_all.filter(like='test_', axis=1)
|
||||
rouC_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
|
||||
|
||||
rouC_BT = rouC_all.filter(like='bts_', axis=1)
|
||||
rouC_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
|
||||
|
||||
# Write csv
|
||||
rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv')
|
||||
rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')
|
|
@ -1,202 +0,0 @@
|
|||
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022

@author: tanu

Baseline multi-classifier run for one gene/drug pair on the ORIGINAL split.

The script loads the ML data via ml_data_orig, prints sanity checks on the
feature groups, then calls MultModelsCl on the training data as provided and
on the four resampled training sets (X_smnc, X_ros, X_rus, X_rouC), always
against the same blind test set (X_bts, y_bts).  For every run the score
columns containing 'test_' and 'bts_' are written to separate CSV files in
outdir + 'ml/tts_orig/'.
"""

import os
import sys
from collections import Counter

import pandas as pd

gene = 'gid'
drug = 'streptomycin'
#total_mtblineage_uc = 8

homedir = os.path.expanduser("~")
os.chdir(homedir + '/git/LSHTM_analysis/scripts/ml/')

from ml_data_orig import *
setvars(gene, drug)
# NOTE(review): the second star-import is deliberate in the original script;
# presumably setvars() creates the module-level data (X, y, X_bts, ...) that
# the first import could not yet see -- confirm against ml_data_orig.
from ml_data_orig import *

# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML

# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl


def _run_models(train_X, train_y):
    """Run MultModelsCl on one training set and split the score table.

    Returns a 4-tuple: the raw scores returned by MultModelsCl, the
    transposed score DataFrame, the 'test_' columns sorted by 'test_mcc'
    (descending) and the 'bts_' columns sorted by 'bts_mcc' (descending).
    """
    scores = MultModelsCl(input_df = train_X
                          , target = train_y
                          , var_type = 'mixed'
                          , skf_cv = skf_cv
                          # keyword renamed from blind_test_input_df in MultModelsCl
                          , blind_test_df = X_bts
                          , blind_test_target = y_bts)

    all_scores = pd.DataFrame(scores).T

    # sort_values() without inplace: the filtered frame is a slice of
    # all_scores, and sorting it in place can emit SettingWithCopyWarning.
    cv_scores = (all_scores.filter(like = 'test_', axis = 1)
                 .sort_values(by = ['test_mcc'], ascending = False))
    bts_scores = (all_scores.filter(like = 'bts_', axis = 1)
                  .sort_values(by = ['bts_mcc'], ascending = False))
    return scores, all_scores, cv_scores, bts_scores


def _write_scores(cv_scores, bts_scores, tag):
    """Write the 'test_' and 'bts_' score tables for the run named *tag*."""
    cv_scores.to_csv(outdir_ml + gene.lower() + '_' + tag + '_CT_allF.csv')
    bts_scores.to_csv(outdir_ml + gene.lower() + '_' + tag + '_BT_allF.csv')


############################################################################
print('\n#####################################################################\n'
      , '\nRunning ML analysis: ORIGINAL'
      , '\nGene name:', gene
      , '\nDrug name:', drug)

#==================
# Specify outdir
#==================

outdir_ml = outdir + 'ml/tts_orig/'

print('\nOutput directory:', outdir_ml)

#%%###########################################################################
print('\nSanity checks:'
      , '\nTotal input features:', len(X.columns)
      , '\n'
      , '\nTraining data size:', X.shape
      , '\nTest data size:', X_bts.shape
      , '\n'
      , '\nTarget feature numbers (training data):', Counter(y)
      , '\nTarget features ratio (training data:', yc1_ratio
      , '\n'
      , '\nTarget feature numbers (test data):', Counter(y_bts)
      , '\nTarget features ratio (test data):', yc2_ratio

      , '\n\n#####################################################################\n')

print('\n================================================================\n')

print('Strucutral features (n):'
      , len(X_ssFN)
      , '\nThese are:'
      , '\nCommon stablity features:', X_stabilityN
      , '\nFoldX columns:', X_foldX_cols
      , '\nOther struc columns:', X_str
      , '\n================================================================\n')

print('AAindex features (n):'
      , len(X_aaindexFN)
      , '\nThese are:\n'
      , X_aaindexFN
      , '\n================================================================\n')

print('Evolutionary features (n):'
      , len(X_evolFN)
      , '\nThese are:\n'
      , X_evolFN
      , '\n================================================================\n')

print('Genomic features (n):'
      , len(X_genomicFN)
      , '\nThese are:\n'
      , X_genomic_mafor, '\n'
      , X_genomic_linegae
      , '\n================================================================\n')

print('Categorical features (n):'
      , len(categorical_FN)
      , '\nThese are:\n'
      , categorical_FN
      , '\n================================================================\n')

# Abort before any model is trained if the feature groups do not add up to
# the columns actually present in X.
n_grouped_features = (len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN)
                      + len(X_genomicFN) + len(categorical_FN))
if len(X.columns) == n_grouped_features:
    print('\nPass: No. of features match')
else:
    sys.exit('\nFail: Count of feature mismatch')

print('\n#####################################################################\n')

###############################################################################
#==================
# Baseline models
#==================
mm_skf_scoresD, baseline_all, baseline_CT, baseline_BT = _run_models(X, y)
#baseline_train = baseline_all.filter(like='train_', axis=1)

# Write csv
_write_scores(baseline_CT, baseline_BT, 'baseline')

#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7, smnc_all, smnc_CT, smnc_BT = _run_models(X_smnc, y_smnc)

# Write csv
_write_scores(smnc_CT, smnc_BT, 'smnc')

#%% ROS: Numerical + categorical
mm_skf_scoresD3, ros_all, ros_CT, ros_BT = _run_models(X_ros, y_ros)

# Write csv
_write_scores(ros_CT, ros_BT, 'ros')

#%% RUS: Numerical + categorical
mm_skf_scoresD4, rus_all, rus_CT, rus_BT = _run_models(X_rus, y_rus)

# Write csv
_write_scores(rus_CT, rus_BT, 'rus')

#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8, rouC_all, rouC_CT, rouC_BT = _run_models(X_rouC, y_rouC)

# Write csv
_write_scores(rouC_CT, rouC_BT, 'rouC')
|
|
@ -1,202 +0,0 @@
|
|||
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022

@author: tanu

Baseline multi-classifier run for one gene/drug pair, REVERSE training split.

The script loads the ML data via ml_data_rt, prints sanity checks on the
feature groups, then calls MultModelsCl on the training data as provided and
on the four resampled training sets (X_smnc, X_ros, X_rus, X_rouC), always
against the same blind test set (X_bts, y_bts).  For every run the score
columns containing 'test_' and 'bts_' are written to separate CSV files in
outdir + 'ml/tts_rt/'.
"""

import os
import sys
from collections import Counter

import pandas as pd

gene = 'gid'
drug = 'streptomycin'
#total_mtblineage_uc = 8

homedir = os.path.expanduser("~")
os.chdir(homedir + '/git/LSHTM_analysis/scripts/ml/')

from ml_data_rt import *
setvars(gene, drug)
# NOTE(review): the second star-import is deliberate in the original script;
# presumably setvars() creates the module-level data (X, y, X_bts, ...) that
# the first import could not yet see -- confirm against ml_data_rt.
from ml_data_rt import *

# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML

# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl


def _run_models(train_X, train_y):
    """Run MultModelsCl on one training set and split the score table.

    Returns a 4-tuple: the raw scores returned by MultModelsCl, the
    transposed score DataFrame, the 'test_' columns sorted by 'test_mcc'
    (descending) and the 'bts_' columns sorted by 'bts_mcc' (descending).
    """
    scores = MultModelsCl(input_df = train_X
                          , target = train_y
                          , var_type = 'mixed'
                          , skf_cv = skf_cv
                          # keyword renamed from blind_test_input_df in MultModelsCl
                          , blind_test_df = X_bts
                          , blind_test_target = y_bts)

    all_scores = pd.DataFrame(scores).T

    # sort_values() without inplace: the filtered frame is a slice of
    # all_scores, and sorting it in place can emit SettingWithCopyWarning.
    cv_scores = (all_scores.filter(like = 'test_', axis = 1)
                 .sort_values(by = ['test_mcc'], ascending = False))
    bts_scores = (all_scores.filter(like = 'bts_', axis = 1)
                  .sort_values(by = ['bts_mcc'], ascending = False))
    return scores, all_scores, cv_scores, bts_scores


def _write_scores(cv_scores, bts_scores, tag):
    """Write the 'test_' and 'bts_' score tables for the run named *tag*."""
    cv_scores.to_csv(outdir_ml + gene.lower() + '_' + tag + '_CT_allF.csv')
    bts_scores.to_csv(outdir_ml + gene.lower() + '_' + tag + '_BT_allF.csv')


############################################################################
print('\n#####################################################################\n'
      , '\nRunning ML analysis: REVERSE training\n'
      , '\nGene name:', gene
      , '\nDrug name:', drug)

#==================
# Specify outdir
#==================

outdir_ml = outdir + 'ml/tts_rt/'

print('\nOutput directory:', outdir_ml)

#%%###########################################################################
print('\nSanity checks:'
      , '\nTotal input features:', len(X.columns)
      , '\n'
      , '\nTraining data size:', X.shape
      , '\nTest data size:', X_bts.shape
      , '\n'
      , '\nTarget feature numbers (training data):', Counter(y)
      , '\nTarget features ratio (training data:', yc1_ratio
      , '\n'
      , '\nTarget feature numbers (test data):', Counter(y_bts)
      , '\nTarget features ratio (test data):', yc2_ratio

      , '\n\n#####################################################################\n')

print('\n================================================================\n')

print('Strucutral features (n):'
      , len(X_ssFN)
      , '\nThese are:'
      , '\nCommon stablity features:', X_stabilityN
      , '\nFoldX columns:', X_foldX_cols
      , '\nOther struc columns:', X_str
      , '\n================================================================\n')

print('AAindex features (n):'
      , len(X_aaindexFN)
      , '\nThese are:\n'
      , X_aaindexFN
      , '\n================================================================\n')

print('Evolutionary features (n):'
      , len(X_evolFN)
      , '\nThese are:\n'
      , X_evolFN
      , '\n================================================================\n')

print('Genomic features (n):'
      , len(X_genomicFN)
      , '\nThese are:\n'
      , X_genomic_mafor, '\n'
      , X_genomic_linegae
      , '\n================================================================\n')

print('Categorical features (n):'
      , len(categorical_FN)
      , '\nThese are:\n'
      , categorical_FN
      , '\n================================================================\n')

# Abort before any model is trained if the feature groups do not add up to
# the columns actually present in X.
n_grouped_features = (len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN)
                      + len(X_genomicFN) + len(categorical_FN))
if len(X.columns) == n_grouped_features:
    print('\nPass: No. of features match')
else:
    sys.exit('\nFail: Count of feature mismatch')

print('\n#####################################################################\n')

###############################################################################
#==================
# Baseline models
#==================
mm_skf_scoresD, baseline_all, baseline_CT, baseline_BT = _run_models(X, y)
#baseline_train = baseline_all.filter(like='train_', axis=1)

# Write csv
_write_scores(baseline_CT, baseline_BT, 'baseline')

#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7, smnc_all, smnc_CT, smnc_BT = _run_models(X_smnc, y_smnc)

# Write csv
_write_scores(smnc_CT, smnc_BT, 'smnc')

#%% ROS: Numerical + categorical
mm_skf_scoresD3, ros_all, ros_CT, ros_BT = _run_models(X_ros, y_ros)

# Write csv
_write_scores(ros_CT, ros_BT, 'ros')

#%% RUS: Numerical + categorical
mm_skf_scoresD4, rus_all, rus_CT, rus_BT = _run_models(X_rus, y_rus)

# Write csv
_write_scores(rus_CT, rus_BT, 'rus')

#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8, rouC_all, rouC_CT, rouC_BT = _run_models(X_rouC, y_rouC)

# Write csv
_write_scores(rouC_CT, rouC_BT, 'rouC')
|
|
@ -1,203 +0,0 @@
|
|||
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022

@author: tanu

Baseline multi-classifier run for one gene/drug pair, scaling law split.

The script loads the ML data via ml_data_sl, prints sanity checks on the
feature groups, then calls MultModelsCl on the training data as provided and
on the four resampled training sets (X_smnc, X_ros, X_rus, X_rouC), always
against the same blind test set (X_bts, y_bts).  For every run the score
columns containing 'test_' and 'bts_' are written to separate CSV files in
outdir + 'ml/tts_sl/'.
"""

import os
import sys
from collections import Counter

import pandas as pd

gene = 'gid'
drug = 'streptomycin'
#total_mtblineage_uc = 8

homedir = os.path.expanduser("~")
os.chdir(homedir + '/git/LSHTM_analysis/scripts/ml/')

from ml_data_sl import *
setvars(gene, drug)
# NOTE(review): the second star-import is deliberate in the original script;
# presumably setvars() creates the module-level data (X, y, X_bts, ...) that
# the first import could not yet see -- confirm against ml_data_sl.
from ml_data_sl import *

# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML

# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl


def _run_models(train_X, train_y):
    """Run MultModelsCl on one training set and split the score table.

    Returns a 4-tuple: the raw scores returned by MultModelsCl, the
    transposed score DataFrame, the 'test_' columns sorted by 'test_mcc'
    (descending) and the 'bts_' columns sorted by 'bts_mcc' (descending).
    """
    scores = MultModelsCl(input_df = train_X
                          , target = train_y
                          , var_type = 'mixed'
                          , skf_cv = skf_cv
                          # keyword renamed from blind_test_input_df in MultModelsCl
                          , blind_test_df = X_bts
                          , blind_test_target = y_bts)

    all_scores = pd.DataFrame(scores).T

    # sort_values() without inplace: the filtered frame is a slice of
    # all_scores, and sorting it in place can emit SettingWithCopyWarning.
    cv_scores = (all_scores.filter(like = 'test_', axis = 1)
                 .sort_values(by = ['test_mcc'], ascending = False))
    bts_scores = (all_scores.filter(like = 'bts_', axis = 1)
                  .sort_values(by = ['bts_mcc'], ascending = False))
    return scores, all_scores, cv_scores, bts_scores


def _write_scores(cv_scores, bts_scores, tag):
    """Write the 'test_' and 'bts_' score tables for the run named *tag*."""
    cv_scores.to_csv(outdir_ml + gene.lower() + '_' + tag + '_CT_allF.csv')
    bts_scores.to_csv(outdir_ml + gene.lower() + '_' + tag + '_BT_allF.csv')


################################################################################
print('\n#####################################################################\n'
      , '\nRunning ML analysis: scaling law split'
      , '\nGene name:', gene
      , '\nDrug name:', drug)

#==================
# Specify outdir
#==================

outdir_ml = outdir + 'ml/tts_sl/'

print('\nOutput directory:', outdir_ml)

#%%###########################################################################
print('\nSanity checks:'
      #, '\nML source data size:', x_features.shape
      , '\nTotal input features:', len(X.columns)
      , '\n'
      , '\nTraining data size:', X.shape
      , '\nTest data size:', X_bts.shape
      , '\n'
      , '\nTarget feature numbers (training data):', Counter(y)
      , '\nTarget features ratio (training data:', yc1_ratio
      , '\n'
      , '\nTarget feature numbers (test data):', Counter(y_bts)
      , '\nTarget features ratio (test data):', yc2_ratio

      , '\n\n#####################################################################\n')

print('\n================================================================\n')

print('Strucutral features (n):'
      , len(X_ssFN)
      , '\nThese are:'
      , '\nCommon stablity features:', X_stabilityN
      , '\nFoldX columns:', X_foldX_cols
      , '\nOther struc columns:', X_str
      , '\n================================================================\n')

print('AAindex features (n):'
      , len(X_aaindexFN)
      , '\nThese are:\n'
      , X_aaindexFN
      , '\n================================================================\n')

print('Evolutionary features (n):'
      , len(X_evolFN)
      , '\nThese are:\n'
      , X_evolFN
      , '\n================================================================\n')

print('Genomic features (n):'
      , len(X_genomicFN)
      , '\nThese are:\n'
      , X_genomic_mafor, '\n'
      , X_genomic_linegae
      , '\n================================================================\n')

print('Categorical features (n):'
      , len(categorical_FN)
      , '\nThese are:\n'
      , categorical_FN
      , '\n================================================================\n')

# Abort before any model is trained if the feature groups do not add up to
# the columns actually present in X.
n_grouped_features = (len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN)
                      + len(X_genomicFN) + len(categorical_FN))
if len(X.columns) == n_grouped_features:
    print('\nPass: No. of features match')
else:
    sys.exit('\nFail: Count of feature mismatch')

print('\n#####################################################################\n')

################################################################################
#==================
# Baseline models
#==================
mm_skf_scoresD, baseline_all, baseline_CT, baseline_BT = _run_models(X, y)
#baseline_train = baseline_all.filter(like='train_', axis=1)

# Write csv
_write_scores(baseline_CT, baseline_BT, 'baseline')

#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7, smnc_all, smnc_CT, smnc_BT = _run_models(X_smnc, y_smnc)

# Write csv
_write_scores(smnc_CT, smnc_BT, 'smnc')

#%% ROS: Numerical + categorical
mm_skf_scoresD3, ros_all, ros_CT, ros_BT = _run_models(X_ros, y_ros)

# Write csv
_write_scores(ros_CT, ros_BT, 'ros')

#%% RUS: Numerical + categorical
mm_skf_scoresD4, rus_all, rus_CT, rus_BT = _run_models(X_rus, y_rus)

# Write csv
_write_scores(rus_CT, rus_BT, 'rus')

#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8, rouC_all, rouC_CT, rouC_BT = _run_models(X_rouC, y_rouC)

# Write csv
_write_scores(rouC_CT, rouC_BT, 'rouC')
|
|
@ -1,203 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Sat May 28 05:25:30 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
gene = 'katG'
|
||||
drug = 'isoniazid'
|
||||
#total_mtblineage_uc = 8
|
||||
|
||||
homedir = os.path.expanduser("~")
|
||||
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
|
||||
|
||||
from ml_data_7030 import *
|
||||
setvars(gene,drug)
|
||||
from ml_data_7030 import *
|
||||
|
||||
# from YC run_all_ML: run locally
|
||||
#from UQ_yc_RunAllClfs import run_all_ML
|
||||
|
||||
# TT run all ML clfs: baseline mode
|
||||
from MultModelsCl import MultModelsCl
|
||||
|
||||
################################################################################
|
||||
print('\n#####################################################################\n'
|
||||
, '\nRunning ML analysis: 70/30 split'
|
||||
, '\nGene name:', gene
|
||||
, '\nDrug name:', drug)
|
||||
|
||||
#==================
|
||||
# Specify outdir
|
||||
#==================
|
||||
|
||||
outdir_ml = outdir + 'ml/tts_7030/'
|
||||
|
||||
print('\nOutput directory:', outdir_ml)
|
||||
|
||||
#%%###########################################################################
|
||||
print('\nSanity checks:'
|
||||
#, '\nML source data size:', x_features.shape
|
||||
, '\nTotal input features:', len(X.columns)
|
||||
, '\n'
|
||||
, '\nTraining data size:', X.shape
|
||||
, '\nTest data size:', X_bts.shape
|
||||
, '\n'
|
||||
, '\nTarget feature numbers (training data):', Counter(y)
|
||||
, '\nTarget features ratio (training data:', yc1_ratio
|
||||
, '\n'
|
||||
, '\nTarget feature numbers (test data):', Counter(y_bts)
|
||||
, '\nTarget features ratio (test data):', yc2_ratio
|
||||
|
||||
, '\n\n#####################################################################\n')
|
||||
|
||||
print('\n================================================================\n')
|
||||
|
||||
print('Strucutral features (n):'
|
||||
, len(X_ssFN)
|
||||
, '\nThese are:'
|
||||
, '\nCommon stablity features:', X_stabilityN
|
||||
, '\nFoldX columns:', X_foldX_cols
|
||||
, '\nOther struc columns:', X_str
|
||||
, '\n================================================================\n')
|
||||
|
||||
print('AAindex features (n):'
|
||||
, len(X_aaindexFN)
|
||||
, '\nThese are:\n'
|
||||
, X_aaindexFN
|
||||
, '\n================================================================\n')
|
||||
|
||||
print('Evolutionary features (n):'
|
||||
, len(X_evolFN)
|
||||
, '\nThese are:\n'
|
||||
, X_evolFN
|
||||
, '\n================================================================\n')
|
||||
|
||||
print('Genomic features (n):'
|
||||
, len(X_genomicFN)
|
||||
, '\nThese are:\n'
|
||||
, X_genomic_mafor, '\n'
|
||||
, X_genomic_linegae
|
||||
, '\n================================================================\n')
|
||||
|
||||
print('Categorical features (n):'
|
||||
, len(categorical_FN)
|
||||
, '\nThese are:\n'
|
||||
, categorical_FN
|
||||
, '\n================================================================\n')
|
||||
|
||||
if ( len(X.columns) == len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
|
||||
print('\nPass: No. of features match')
|
||||
else:
|
||||
sys.exit('\nFail: Count of feature mismatch')
|
||||
|
||||
print('\n#####################################################################\n')
|
||||
|
||||
################################################################################
|
||||
#==================
|
||||
# Baseline models
|
||||
#==================
|
||||
# Baseline run: the MultModelsCl pipeline on the training data (X, y),
# with X_bts / y_bts passed as the blind test set.
mm_skf_scoresD = MultModelsCl(input_df = X
                              , target = y
                              , var_type = 'mixed'
                              , skf_cv = skf_cv
                              # MultModelsCl renamed this parameter from blind_test_input_df
                              , blind_test_df = X_bts
                              , blind_test_target = y_bts)

# Transposed score table built from the MultModelsCl result.
baseline_all = pd.DataFrame(mm_skf_scoresD).T
#baseline_train = baseline_all.filter(like='train_', axis=1)

# Columns whose name contains 'test_', rows ranked by 'test_mcc' (highest first).
# The sort is chained instead of being run in place on the filter() result.
baseline_CT = baseline_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)

# Columns whose name contains 'bts_', rows ranked by 'bts_mcc' (highest first).
baseline_BT = baseline_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv')
baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')
|
||||
|
||||
|
||||
#%% SMOTE NC: Oversampling [Numerical + categorical]
|
||||
# SMOTE-NC run: the MultModelsCl pipeline on the resampled training data
# (X_smnc, y_smnc), with X_bts / y_bts passed as the blind test set.
mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc
                               , target = y_smnc
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               # MultModelsCl renamed this parameter from blind_test_input_df
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)

# Transposed score table built from the MultModelsCl result.
smnc_all = pd.DataFrame(mm_skf_scoresD7).T

# Columns whose name contains 'test_', rows ranked by 'test_mcc' (highest first).
# The sort is chained instead of being run in place on the filter() result.
smnc_CT = smnc_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)

# Columns whose name contains 'bts_', rows ranked by 'bts_mcc' (highest first).
smnc_BT = smnc_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv')
smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')
|
||||
|
||||
#%% ROS: Numerical + categorical
|
||||
# ROS run: the MultModelsCl pipeline on the resampled training data
# (X_ros, y_ros), with X_bts / y_bts passed as the blind test set.
mm_skf_scoresD3 = MultModelsCl(input_df = X_ros
                               , target = y_ros
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               # MultModelsCl renamed this parameter from blind_test_input_df
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)

# Transposed score table built from the MultModelsCl result.
ros_all = pd.DataFrame(mm_skf_scoresD3).T

# Columns whose name contains 'test_', rows ranked by 'test_mcc' (highest first).
# The sort is chained instead of being run in place on the filter() result.
ros_CT = ros_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)

# Columns whose name contains 'bts_', rows ranked by 'bts_mcc' (highest first).
ros_BT = ros_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv')
ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')
|
||||
|
||||
#%% RUS: Numerical + categorical
|
||||
# RUS run: the MultModelsCl pipeline on the resampled training data
# (X_rus, y_rus), with X_bts / y_bts passed as the blind test set.
mm_skf_scoresD4 = MultModelsCl(input_df = X_rus
                               , target = y_rus
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               # MultModelsCl renamed this parameter from blind_test_input_df
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)

# Transposed score table built from the MultModelsCl result.
rus_all = pd.DataFrame(mm_skf_scoresD4).T

# Columns whose name contains 'test_', rows ranked by 'test_mcc' (highest first).
# The sort is chained instead of being run in place on the filter() result.
rus_CT = rus_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)

# Columns whose name contains 'bts_', rows ranked by 'bts_mcc' (highest first).
rus_BT = rus_all.filter(like='bts_' , axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv')
rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')
|
||||
|
||||
#%% ROS + RUS Combined: Numerical + categorical
|
||||
# Combined ROS + RUS run: the MultModelsCl pipeline on the resampled training
# data (X_rouC, y_rouC), with X_bts / y_bts passed as the blind test set.
mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC
                               , target = y_rouC
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               # MultModelsCl renamed this parameter from blind_test_input_df
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)

# Transposed score table built from the MultModelsCl result.
rouC_all = pd.DataFrame(mm_skf_scoresD8).T

# Columns whose name contains 'test_', rows ranked by 'test_mcc' (highest first).
# The sort is chained instead of being run in place on the filter() result.
rouC_CT = rouC_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)

# Columns whose name contains 'bts_', rows ranked by 'bts_mcc' (highest first).
rouC_BT = rouC_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv')
rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')
|
|
@ -1,203 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Sat May 28 05:25:30 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
gene = 'katG'
|
||||
drug = 'isoniazid'
|
||||
#total_mtblineage_uc = 8
|
||||
|
||||
homedir = os.path.expanduser("~")
|
||||
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
|
||||
|
||||
from ml_data_8020 import *
|
||||
setvars(gene,drug)
|
||||
from ml_data_8020 import *
|
||||
|
||||
# from YC run_all_ML: run locally
|
||||
#from UQ_yc_RunAllClfs import run_all_ML
|
||||
|
||||
# TT run all ML clfs: baseline mode
|
||||
from MultModelsCl import MultModelsCl
|
||||
|
||||
################################################################################
|
||||
print('\n#####################################################################\n'
|
||||
, '\nRunning ML analysis: 80/20 split'
|
||||
, '\nGene name:', gene
|
||||
, '\nDrug name:', drug)
|
||||
|
||||
#==================
|
||||
# Specify outdir
|
||||
#==================
|
||||
|
||||
# All result files of this run are written below this directory.
outdir_ml = outdir + 'ml/tts_8020/'

# DataFrame.to_csv() does not create missing directories, so make sure the
# target exists before the (long) model runs start.
os.makedirs(outdir_ml, exist_ok = True)

print('\nOutput directory:', outdir_ml)
|
||||
|
||||
#%%###########################################################################
|
||||
# Report the size and class balance of the training and blind-test data.
print('\nSanity checks:'
      #, '\nML source data size:', x_features.shape
      , '\nTotal input features:', len(X.columns)
      , '\n'
      , '\nTraining data size:', X.shape
      , '\nTest data size:', X_bts.shape
      , '\n'
      , '\nTarget feature numbers (training data):', Counter(y)
      , '\nTarget features ratio (training data:', yc1_ratio
      , '\n'
      , '\nTarget feature numbers (test data):', Counter(y_bts)
      , '\nTarget features ratio (test data):', yc2_ratio

      , '\n\n#####################################################################\n')

print('\n================================================================\n')

# One report per feature group: its size followed by the member columns.
print('Strucutral features (n):'
      , len(X_ssFN)
      , '\nThese are:'
      , '\nCommon stablity features:', X_stabilityN
      , '\nFoldX columns:', X_foldX_cols
      , '\nOther struc columns:', X_str
      , '\n================================================================\n')

print('AAindex features (n):'
      , len(X_aaindexFN)
      , '\nThese are:\n'
      , X_aaindexFN
      , '\n================================================================\n')

print('Evolutionary features (n):'
      , len(X_evolFN)
      , '\nThese are:\n'
      , X_evolFN
      , '\n================================================================\n')

print('Genomic features (n):'
      , len(X_genomicFN)
      , '\nThese are:\n'
      , X_genomic_mafor, '\n'
      , X_genomic_linegae
      , '\n================================================================\n')

print('Categorical features (n):'
      , len(categorical_FN)
      , '\nThese are:\n'
      , categorical_FN
      , '\n================================================================\n')

# The feature-group sizes must add up to the number of columns in X;
# stop the run otherwise.
if len(X.columns) == (len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN)
                      + len(X_genomicFN) + len(categorical_FN)):
    print('\nPass: No. of features match')
else:
    # SystemExit is exactly what sys.exit() raises; raising it directly
    # avoids depending on `sys` being in scope.
    raise SystemExit('\nFail: Count of feature mismatch')

print('\n#####################################################################\n')
|
||||
|
||||
################################################################################
|
||||
#==================
|
||||
# Baseline models
|
||||
#==================
|
||||
# Baseline run: the MultModelsCl pipeline on the training data (X, y),
# with X_bts / y_bts passed as the blind test set.
mm_skf_scoresD = MultModelsCl(input_df = X
                              , target = y
                              , var_type = 'mixed'
                              , skf_cv = skf_cv
                              # MultModelsCl renamed this parameter from blind_test_input_df
                              , blind_test_df = X_bts
                              , blind_test_target = y_bts)

# Transposed score table built from the MultModelsCl result.
baseline_all = pd.DataFrame(mm_skf_scoresD).T
#baseline_train = baseline_all.filter(like='train_', axis=1)

# Columns whose name contains 'test_', rows ranked by 'test_mcc' (highest first).
# The sort is chained instead of being run in place on the filter() result.
baseline_CT = baseline_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)

# Columns whose name contains 'bts_', rows ranked by 'bts_mcc' (highest first).
baseline_BT = baseline_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv')
baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')
|
||||
|
||||
|
||||
#%% SMOTE NC: Oversampling [Numerical + categorical]
|
||||
# SMOTE-NC run: the MultModelsCl pipeline on the resampled training data
# (X_smnc, y_smnc), with X_bts / y_bts passed as the blind test set.
mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc
                               , target = y_smnc
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               # MultModelsCl renamed this parameter from blind_test_input_df
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)

# Transposed score table built from the MultModelsCl result.
smnc_all = pd.DataFrame(mm_skf_scoresD7).T

# Columns whose name contains 'test_', rows ranked by 'test_mcc' (highest first).
# The sort is chained instead of being run in place on the filter() result.
smnc_CT = smnc_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)

# Columns whose name contains 'bts_', rows ranked by 'bts_mcc' (highest first).
smnc_BT = smnc_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv')
smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')
|
||||
|
||||
#%% ROS: Numerical + categorical
|
||||
# ROS run: the MultModelsCl pipeline on the resampled training data
# (X_ros, y_ros), with X_bts / y_bts passed as the blind test set.
mm_skf_scoresD3 = MultModelsCl(input_df = X_ros
                               , target = y_ros
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               # MultModelsCl renamed this parameter from blind_test_input_df
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)

# Transposed score table built from the MultModelsCl result.
ros_all = pd.DataFrame(mm_skf_scoresD3).T

# Columns whose name contains 'test_', rows ranked by 'test_mcc' (highest first).
# The sort is chained instead of being run in place on the filter() result.
ros_CT = ros_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)

# Columns whose name contains 'bts_', rows ranked by 'bts_mcc' (highest first).
ros_BT = ros_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv')
ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')
|
||||
|
||||
#%% RUS: Numerical + categorical
|
||||
# RUS run: the MultModelsCl pipeline on the resampled training data
# (X_rus, y_rus), with X_bts / y_bts passed as the blind test set.
mm_skf_scoresD4 = MultModelsCl(input_df = X_rus
                               , target = y_rus
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               # MultModelsCl renamed this parameter from blind_test_input_df
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)

# Transposed score table built from the MultModelsCl result.
rus_all = pd.DataFrame(mm_skf_scoresD4).T

# Columns whose name contains 'test_', rows ranked by 'test_mcc' (highest first).
# The sort is chained instead of being run in place on the filter() result.
rus_CT = rus_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)

# Columns whose name contains 'bts_', rows ranked by 'bts_mcc' (highest first).
rus_BT = rus_all.filter(like='bts_' , axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv')
rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')
|
||||
|
||||
#%% ROS + RUS Combined: Numerical + categorical
|
||||
# Combined ROS + RUS run: the MultModelsCl pipeline on the resampled training
# data (X_rouC, y_rouC), with X_bts / y_bts passed as the blind test set.
mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC
                               , target = y_rouC
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               # MultModelsCl renamed this parameter from blind_test_input_df
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)

# Transposed score table built from the MultModelsCl result.
rouC_all = pd.DataFrame(mm_skf_scoresD8).T

# Columns whose name contains 'test_', rows ranked by 'test_mcc' (highest first).
# The sort is chained instead of being run in place on the filter() result.
rouC_CT = rouC_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)

# Columns whose name contains 'bts_', rows ranked by 'bts_mcc' (highest first).
rouC_BT = rouC_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv')
rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')
|
|
@ -1,203 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Sat May 28 05:25:30 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
gene = 'katG'
|
||||
drug = 'isoniazid'
|
||||
#total_mtblineage_uc = 8
|
||||
|
||||
homedir = os.path.expanduser("~")
|
||||
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
|
||||
|
||||
from ml_data_cd_7030 import *
|
||||
setvars(gene,drug)
|
||||
from ml_data_cd_7030 import *
|
||||
|
||||
# from YC run_all_ML: run locally
|
||||
#from UQ_yc_RunAllClfs import run_all_ML
|
||||
|
||||
# TT run all ML clfs: baseline mode
|
||||
from MultModelsCl import MultModelsCl
|
||||
|
||||
################################################################################
|
||||
print('\n#####################################################################\n'
|
||||
, '\nRunning ML analysis [COMPLETE DATA]: 70/30 split'
|
||||
, '\nGene name:', gene
|
||||
, '\nDrug name:', drug)
|
||||
|
||||
#==================
|
||||
# Specify outdir
|
||||
#==================
|
||||
|
||||
# All result files of this run are written below this directory.
outdir_ml = outdir + 'ml/tts_cd_7030/'

# DataFrame.to_csv() does not create missing directories, so make sure the
# target exists before the (long) model runs start.
os.makedirs(outdir_ml, exist_ok = True)

print('\nOutput directory:', outdir_ml)
|
||||
|
||||
#%%###########################################################################
|
||||
# Report the size and class balance of the training and blind-test data.
print('\nSanity checks:'
      #, '\nML source data size:', x_features.shape
      , '\nTotal input features:', len(X.columns)
      , '\n'
      , '\nTraining data size:', X.shape
      , '\nTest data size:', X_bts.shape
      , '\n'
      , '\nTarget feature numbers (training data):', Counter(y)
      , '\nTarget features ratio (training data:', yc1_ratio
      , '\n'
      , '\nTarget feature numbers (test data):', Counter(y_bts)
      , '\nTarget features ratio (test data):', yc2_ratio

      , '\n\n#####################################################################\n')

print('\n================================================================\n')

# One report per feature group: its size followed by the member columns.
print('Strucutral features (n):'
      , len(X_ssFN)
      , '\nThese are:'
      , '\nCommon stablity features:', X_stabilityN
      , '\nFoldX columns:', X_foldX_cols
      , '\nOther struc columns:', X_str
      , '\n================================================================\n')

print('AAindex features (n):'
      , len(X_aaindexFN)
      , '\nThese are:\n'
      , X_aaindexFN
      , '\n================================================================\n')

print('Evolutionary features (n):'
      , len(X_evolFN)
      , '\nThese are:\n'
      , X_evolFN
      , '\n================================================================\n')

print('Genomic features (n):'
      , len(X_genomicFN)
      , '\nThese are:\n'
      , X_genomic_mafor, '\n'
      , X_genomic_linegae
      , '\n================================================================\n')

print('Categorical features (n):'
      , len(categorical_FN)
      , '\nThese are:\n'
      , categorical_FN
      , '\n================================================================\n')

# The feature-group sizes must add up to the number of columns in X;
# stop the run otherwise.
if len(X.columns) == (len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN)
                      + len(X_genomicFN) + len(categorical_FN)):
    print('\nPass: No. of features match')
else:
    # SystemExit is exactly what sys.exit() raises; raising it directly
    # avoids depending on `sys` being in scope.
    raise SystemExit('\nFail: Count of feature mismatch')

print('\n#####################################################################\n')
|
||||
|
||||
################################################################################
|
||||
#==================
|
||||
# Baseline models
|
||||
#==================
|
||||
# Baseline run: the MultModelsCl pipeline on the training data (X, y),
# with X_bts / y_bts passed as the blind test set.
mm_skf_scoresD = MultModelsCl(input_df = X
                              , target = y
                              , var_type = 'mixed'
                              , skf_cv = skf_cv
                              # MultModelsCl renamed this parameter from blind_test_input_df
                              , blind_test_df = X_bts
                              , blind_test_target = y_bts)

# Transposed score table built from the MultModelsCl result.
baseline_all = pd.DataFrame(mm_skf_scoresD).T
#baseline_train = baseline_all.filter(like='train_', axis=1)

# Columns whose name contains 'test_', rows ranked by 'test_mcc' (highest first).
# The sort is chained instead of being run in place on the filter() result.
baseline_CT = baseline_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)

# Columns whose name contains 'bts_', rows ranked by 'bts_mcc' (highest first).
baseline_BT = baseline_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv')
baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')
|
||||
|
||||
|
||||
#%% SMOTE NC: Oversampling [Numerical + categorical]
|
||||
# SMOTE-NC run: the MultModelsCl pipeline on the resampled training data
# (X_smnc, y_smnc), with X_bts / y_bts passed as the blind test set.
mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc
                               , target = y_smnc
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               # MultModelsCl renamed this parameter from blind_test_input_df
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)

# Transposed score table built from the MultModelsCl result.
smnc_all = pd.DataFrame(mm_skf_scoresD7).T

# Columns whose name contains 'test_', rows ranked by 'test_mcc' (highest first).
# The sort is chained instead of being run in place on the filter() result.
smnc_CT = smnc_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)

# Columns whose name contains 'bts_', rows ranked by 'bts_mcc' (highest first).
smnc_BT = smnc_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv')
smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')
|
||||
|
||||
#%% ROS: Numerical + categorical
|
||||
# ROS run: the MultModelsCl pipeline on the resampled training data
# (X_ros, y_ros), with X_bts / y_bts passed as the blind test set.
mm_skf_scoresD3 = MultModelsCl(input_df = X_ros
                               , target = y_ros
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               # MultModelsCl renamed this parameter from blind_test_input_df
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)

# Transposed score table built from the MultModelsCl result.
ros_all = pd.DataFrame(mm_skf_scoresD3).T

# Columns whose name contains 'test_', rows ranked by 'test_mcc' (highest first).
# The sort is chained instead of being run in place on the filter() result.
ros_CT = ros_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)

# Columns whose name contains 'bts_', rows ranked by 'bts_mcc' (highest first).
ros_BT = ros_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv')
ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')
|
||||
|
||||
#%% RUS: Numerical + categorical
|
||||
# RUS run: the MultModelsCl pipeline on the resampled training data
# (X_rus, y_rus), with X_bts / y_bts passed as the blind test set.
mm_skf_scoresD4 = MultModelsCl(input_df = X_rus
                               , target = y_rus
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               # MultModelsCl renamed this parameter from blind_test_input_df
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)

# Transposed score table built from the MultModelsCl result.
rus_all = pd.DataFrame(mm_skf_scoresD4).T

# Columns whose name contains 'test_', rows ranked by 'test_mcc' (highest first).
# The sort is chained instead of being run in place on the filter() result.
rus_CT = rus_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)

# Columns whose name contains 'bts_', rows ranked by 'bts_mcc' (highest first).
rus_BT = rus_all.filter(like='bts_' , axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv')
rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')
|
||||
|
||||
#%% ROS + RUS Combined: Numerical + categorical
|
||||
# Combined ROS + RUS run: the MultModelsCl pipeline on the resampled training
# data (X_rouC, y_rouC), with X_bts / y_bts passed as the blind test set.
mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC
                               , target = y_rouC
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               # MultModelsCl renamed this parameter from blind_test_input_df
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)

# Transposed score table built from the MultModelsCl result.
rouC_all = pd.DataFrame(mm_skf_scoresD8).T

# Columns whose name contains 'test_', rows ranked by 'test_mcc' (highest first).
# The sort is chained instead of being run in place on the filter() result.
rouC_CT = rouC_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)

# Columns whose name contains 'bts_', rows ranked by 'bts_mcc' (highest first).
rouC_BT = rouC_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv')
rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')
|
|
@ -1,203 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Sat May 28 05:25:30 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
gene = 'katG'
|
||||
drug = 'isoniazid'
|
||||
#total_mtblineage_uc = 8
|
||||
|
||||
homedir = os.path.expanduser("~")
|
||||
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
|
||||
|
||||
from ml_data_cd_8020 import *
|
||||
setvars(gene,drug)
|
||||
from ml_data_cd_8020 import *
|
||||
|
||||
# from YC run_all_ML: run locally
|
||||
#from UQ_yc_RunAllClfs import run_all_ML
|
||||
|
||||
# TT run all ML clfs: baseline mode
|
||||
from MultModelsCl import MultModelsCl
|
||||
|
||||
################################################################################
|
||||
print('\n#####################################################################\n'
|
||||
, '\nRunning ML analysis [COMPLETE DATA]: 80/20 split'
|
||||
, '\nGene name:', gene
|
||||
, '\nDrug name:', drug)
|
||||
|
||||
#==================
|
||||
# Specify outdir
|
||||
#==================
|
||||
|
||||
# All result files of this run are written below this directory.
outdir_ml = outdir + 'ml/tts_cd_8020/'

# DataFrame.to_csv() does not create missing directories, so make sure the
# target exists before the (long) model runs start.
os.makedirs(outdir_ml, exist_ok = True)

print('\nOutput directory:', outdir_ml)
|
||||
|
||||
#%%###########################################################################
|
||||
# Report the size and class balance of the training and blind-test data.
print('\nSanity checks:'
      #, '\nML source data size:', x_features.shape
      , '\nTotal input features:', len(X.columns)
      , '\n'
      , '\nTraining data size:', X.shape
      , '\nTest data size:', X_bts.shape
      , '\n'
      , '\nTarget feature numbers (training data):', Counter(y)
      , '\nTarget features ratio (training data:', yc1_ratio
      , '\n'
      , '\nTarget feature numbers (test data):', Counter(y_bts)
      , '\nTarget features ratio (test data):', yc2_ratio

      , '\n\n#####################################################################\n')

print('\n================================================================\n')

# One report per feature group: its size followed by the member columns.
print('Strucutral features (n):'
      , len(X_ssFN)
      , '\nThese are:'
      , '\nCommon stablity features:', X_stabilityN
      , '\nFoldX columns:', X_foldX_cols
      , '\nOther struc columns:', X_str
      , '\n================================================================\n')

print('AAindex features (n):'
      , len(X_aaindexFN)
      , '\nThese are:\n'
      , X_aaindexFN
      , '\n================================================================\n')

print('Evolutionary features (n):'
      , len(X_evolFN)
      , '\nThese are:\n'
      , X_evolFN
      , '\n================================================================\n')

print('Genomic features (n):'
      , len(X_genomicFN)
      , '\nThese are:\n'
      , X_genomic_mafor, '\n'
      , X_genomic_linegae
      , '\n================================================================\n')

print('Categorical features (n):'
      , len(categorical_FN)
      , '\nThese are:\n'
      , categorical_FN
      , '\n================================================================\n')

# The feature-group sizes must add up to the number of columns in X;
# stop the run otherwise.
if len(X.columns) == (len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN)
                      + len(X_genomicFN) + len(categorical_FN)):
    print('\nPass: No. of features match')
else:
    # SystemExit is exactly what sys.exit() raises; raising it directly
    # avoids depending on `sys` being in scope.
    raise SystemExit('\nFail: Count of feature mismatch')

print('\n#####################################################################\n')
|
||||
|
||||
################################################################################
|
||||
#==================
|
||||
# Baseline models
|
||||
#==================
|
||||
# Baseline run: the MultModelsCl pipeline on the training data (X, y),
# with X_bts / y_bts passed as the blind test set.
mm_skf_scoresD = MultModelsCl(input_df = X
                              , target = y
                              , var_type = 'mixed'
                              , skf_cv = skf_cv
                              # MultModelsCl renamed this parameter from blind_test_input_df
                              , blind_test_df = X_bts
                              , blind_test_target = y_bts)

# Transposed score table built from the MultModelsCl result.
baseline_all = pd.DataFrame(mm_skf_scoresD).T
#baseline_train = baseline_all.filter(like='train_', axis=1)

# Columns whose name contains 'test_', rows ranked by 'test_mcc' (highest first).
# The sort is chained instead of being run in place on the filter() result.
baseline_CT = baseline_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)

# Columns whose name contains 'bts_', rows ranked by 'bts_mcc' (highest first).
baseline_BT = baseline_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv')
baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')
|
||||
|
||||
|
||||
#%% SMOTE NC: Oversampling [Numerical + categorical]
|
||||
# SMOTE-NC run: the MultModelsCl pipeline on the resampled training data
# (X_smnc, y_smnc), with X_bts / y_bts passed as the blind test set.
mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc
                               , target = y_smnc
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               # MultModelsCl renamed this parameter from blind_test_input_df
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)

# Transposed score table built from the MultModelsCl result.
smnc_all = pd.DataFrame(mm_skf_scoresD7).T

# Columns whose name contains 'test_', rows ranked by 'test_mcc' (highest first).
# The sort is chained instead of being run in place on the filter() result.
smnc_CT = smnc_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)

# Columns whose name contains 'bts_', rows ranked by 'bts_mcc' (highest first).
smnc_BT = smnc_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv')
smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')
|
||||
|
||||
#%% ROS: Numerical + categorical
|
||||
# ROS run: the MultModelsCl pipeline on the resampled training data
# (X_ros, y_ros), with X_bts / y_bts passed as the blind test set.
mm_skf_scoresD3 = MultModelsCl(input_df = X_ros
                               , target = y_ros
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               # MultModelsCl renamed this parameter from blind_test_input_df
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)

# Transposed score table built from the MultModelsCl result.
ros_all = pd.DataFrame(mm_skf_scoresD3).T

# Columns whose name contains 'test_', rows ranked by 'test_mcc' (highest first).
# The sort is chained instead of being run in place on the filter() result.
ros_CT = ros_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)

# Columns whose name contains 'bts_', rows ranked by 'bts_mcc' (highest first).
ros_BT = ros_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv')
ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')
|
||||
|
||||
#%% RUS: Numerical + categorical
|
||||
# RUS run: the MultModelsCl pipeline on the resampled training data
# (X_rus, y_rus), with X_bts / y_bts passed as the blind test set.
mm_skf_scoresD4 = MultModelsCl(input_df = X_rus
                               , target = y_rus
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               # MultModelsCl renamed this parameter from blind_test_input_df
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)

# Transposed score table built from the MultModelsCl result.
rus_all = pd.DataFrame(mm_skf_scoresD4).T

# Columns whose name contains 'test_', rows ranked by 'test_mcc' (highest first).
# The sort is chained instead of being run in place on the filter() result.
rus_CT = rus_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)

# Columns whose name contains 'bts_', rows ranked by 'bts_mcc' (highest first).
rus_BT = rus_all.filter(like='bts_' , axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv')
rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')
|
||||
|
||||
#%% ROS + RUS Combined: Numerical + categorical
|
||||
# Combined ROS + RUS run: the MultModelsCl pipeline on the resampled training
# data (X_rouC, y_rouC), with X_bts / y_bts passed as the blind test set.
mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC
                               , target = y_rouC
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               # MultModelsCl renamed this parameter from blind_test_input_df
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)

# Transposed score table built from the MultModelsCl result.
rouC_all = pd.DataFrame(mm_skf_scoresD8).T

# Columns whose name contains 'test_', rows ranked by 'test_mcc' (highest first).
# The sort is chained instead of being run in place on the filter() result.
rouC_CT = rouC_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)

# Columns whose name contains 'bts_', rows ranked by 'bts_mcc' (highest first).
rouC_BT = rouC_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv')
rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')
|
|
@ -1,203 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Sat May 28 05:25:30 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
gene = 'katG'
|
||||
drug = 'isoniazid'
|
||||
#total_mtblineage_uc = 8
|
||||
|
||||
homedir = os.path.expanduser("~")
|
||||
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
|
||||
|
||||
from ml_data_cd_sl import *
|
||||
setvars(gene,drug)
|
||||
from ml_data_cd_sl import *
|
||||
|
||||
# from YC run_all_ML: run locally
|
||||
#from UQ_yc_RunAllClfs import run_all_ML
|
||||
|
||||
# TT run all ML clfs: baseline mode
|
||||
from MultModelsCl import MultModelsCl
|
||||
|
||||
################################################################################
|
||||
print('\n#####################################################################\n'
|
||||
, '\nRunning ML analysis [COMPLETE DATA]: 70/30 split'
|
||||
, '\nGene name:', gene
|
||||
, '\nDrug name:', drug)
|
||||
|
||||
#==================
|
||||
# Specify outdir
|
||||
#==================
|
||||
|
||||
# All result files of this run are written below this directory.
outdir_ml = outdir + 'ml/tts_cd_sl/'

# DataFrame.to_csv() does not create missing directories, so make sure the
# target exists before the (long) model runs start.
os.makedirs(outdir_ml, exist_ok = True)

print('\nOutput directory:', outdir_ml)
|
||||
|
||||
#%%###########################################################################
|
||||
# Report the size and class balance of the training and blind-test data.
print('\nSanity checks:'
      #, '\nML source data size:', x_features.shape
      , '\nTotal input features:', len(X.columns)
      , '\n'
      , '\nTraining data size:', X.shape
      , '\nTest data size:', X_bts.shape
      , '\n'
      , '\nTarget feature numbers (training data):', Counter(y)
      , '\nTarget features ratio (training data:', yc1_ratio
      , '\n'
      , '\nTarget feature numbers (test data):', Counter(y_bts)
      , '\nTarget features ratio (test data):', yc2_ratio

      , '\n\n#####################################################################\n')

print('\n================================================================\n')

# One report per feature group: its size followed by the member columns.
print('Strucutral features (n):'
      , len(X_ssFN)
      , '\nThese are:'
      , '\nCommon stablity features:', X_stabilityN
      , '\nFoldX columns:', X_foldX_cols
      , '\nOther struc columns:', X_str
      , '\n================================================================\n')

print('AAindex features (n):'
      , len(X_aaindexFN)
      , '\nThese are:\n'
      , X_aaindexFN
      , '\n================================================================\n')

print('Evolutionary features (n):'
      , len(X_evolFN)
      , '\nThese are:\n'
      , X_evolFN
      , '\n================================================================\n')

print('Genomic features (n):'
      , len(X_genomicFN)
      , '\nThese are:\n'
      , X_genomic_mafor, '\n'
      , X_genomic_linegae
      , '\n================================================================\n')

print('Categorical features (n):'
      , len(categorical_FN)
      , '\nThese are:\n'
      , categorical_FN
      , '\n================================================================\n')

# The feature-group sizes must add up to the number of columns in X;
# stop the run otherwise.
if len(X.columns) == (len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN)
                      + len(X_genomicFN) + len(categorical_FN)):
    print('\nPass: No. of features match')
else:
    # SystemExit is exactly what sys.exit() raises; raising it directly
    # avoids depending on `sys` being in scope.
    raise SystemExit('\nFail: Count of feature mismatch')

print('\n#####################################################################\n')
|
||||
|
||||
################################################################################
|
||||
#==================
|
||||
# Baseline models
|
||||
#==================
|
||||
# Baseline run: the MultModelsCl pipeline on the training data (X, y),
# with X_bts / y_bts passed as the blind test set.
mm_skf_scoresD = MultModelsCl(input_df = X
                              , target = y
                              , var_type = 'mixed'
                              , skf_cv = skf_cv
                              # MultModelsCl renamed this parameter from blind_test_input_df
                              , blind_test_df = X_bts
                              , blind_test_target = y_bts)

# Transposed score table built from the MultModelsCl result.
baseline_all = pd.DataFrame(mm_skf_scoresD).T
#baseline_train = baseline_all.filter(like='train_', axis=1)

# Columns whose name contains 'test_', rows ranked by 'test_mcc' (highest first).
# The sort is chained instead of being run in place on the filter() result.
baseline_CT = baseline_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)

# Columns whose name contains 'bts_', rows ranked by 'bts_mcc' (highest first).
baseline_BT = baseline_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv')
baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')
|
||||
|
||||
|
||||
#%% SMOTE NC: Oversampling [Numerical + categorical]
|
||||
mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc
|
||||
, target = y_smnc
|
||||
, var_type = 'mixed'
|
||||
, skf_cv = skf_cv
|
||||
, blind_test_input_df = X_bts
|
||||
, blind_test_target = y_bts)
|
||||
smnc_all = pd.DataFrame(mm_skf_scoresD7)
|
||||
smnc_all = smnc_all.T
|
||||
|
||||
smnc_CT = smnc_all.filter(like='test_', axis=1)
|
||||
smnc_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
|
||||
|
||||
smnc_BT = smnc_all.filter(like='bts_', axis=1)
|
||||
smnc_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
|
||||
|
||||
# Write csv
|
||||
smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv')
|
||||
smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')
|
||||
|
||||
#%% ROS: Numerical + categorical
|
||||
mm_skf_scoresD3 = MultModelsCl(input_df = X_ros
|
||||
, target = y_ros
|
||||
, var_type = 'mixed'
|
||||
, skf_cv = skf_cv
|
||||
, blind_test_input_df = X_bts
|
||||
, blind_test_target = y_bts)
|
||||
ros_all = pd.DataFrame(mm_skf_scoresD3)
|
||||
ros_all = ros_all.T
|
||||
|
||||
ros_CT = ros_all.filter(like='test_', axis=1)
|
||||
ros_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
|
||||
|
||||
ros_BT = ros_all.filter(like='bts_', axis=1)
|
||||
ros_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
|
||||
|
||||
# Write csv
|
||||
ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv')
|
||||
ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')
|
||||
|
||||
#%% RUS: Numerical + categorical
|
||||
mm_skf_scoresD4 = MultModelsCl(input_df = X_rus
|
||||
, target = y_rus
|
||||
, var_type = 'mixed'
|
||||
, skf_cv = skf_cv
|
||||
, blind_test_input_df = X_bts
|
||||
, blind_test_target = y_bts)
|
||||
rus_all = pd.DataFrame(mm_skf_scoresD4)
|
||||
rus_all = rus_all.T
|
||||
|
||||
rus_CT = rus_all.filter(like='test_', axis=1)
|
||||
rus_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
|
||||
|
||||
rus_BT = rus_all.filter(like='bts_' , axis=1)
|
||||
rus_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
|
||||
|
||||
# Write csv
|
||||
rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv')
|
||||
rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')
|
||||
|
||||
#%% ROS + RUS Combined: Numerical + categorical
|
||||
mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC
|
||||
, target = y_rouC
|
||||
, var_type = 'mixed'
|
||||
, skf_cv = skf_cv
|
||||
, blind_test_input_df = X_bts
|
||||
, blind_test_target = y_bts)
|
||||
rouC_all = pd.DataFrame(mm_skf_scoresD8)
|
||||
rouC_all = rouC_all.T
|
||||
|
||||
rouC_CT = rouC_all.filter(like='test_', axis=1)
|
||||
rouC_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
|
||||
|
||||
rouC_BT = rouC_all.filter(like='bts_', axis=1)
|
||||
rouC_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
|
||||
|
||||
# Write csv
|
||||
rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv')
|
||||
rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')
|
|
@ -1,210 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Sat May 28 05:25:30 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
gene = 'katG'
|
||||
drug = 'isoniazid'
|
||||
#total_mtblineage_uc = 8
|
||||
|
||||
homedir = os.path.expanduser("~")
|
||||
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
|
||||
|
||||
#---------------------------
|
||||
# Version 1: no AAindex
|
||||
#from UQ_ML_data import *
|
||||
#setvars(gene,drug)
|
||||
#from UQ_ML_data import *
|
||||
#---------------------------
|
||||
|
||||
from ml_data import *
|
||||
setvars(gene,drug)
|
||||
from ml_data import *
|
||||
|
||||
# from YC run_all_ML: run locally
|
||||
#from UQ_yc_RunAllClfs import run_all_ML
|
||||
|
||||
# TT run all ML clfs: baseline mode
|
||||
from MultModelsCl import MultModelsCl
|
||||
|
||||
############################################################################
|
||||
# Run banner: which analysis variant, gene and drug this script is processing.
print(
    '\n#####################################################################\n',
    '\nRunning ML analysis: UQ [without AA index but with active site annotations]',
    '\nGene name:', gene,
    '\nDrug name:', drug,
)

# Results of this variant are written below outdir, in 'ml/uq_v1/'.
outdir_ml = outdir + 'ml/uq_v1/'
print('\nOutput directory:', outdir_ml)

#%%###########################################################################
# Sizes and class balance of the training and blind-test data.
print(
    '\nSanity checks:',
    '\nTotal input features:', len(X.columns),
    '\n',
    '\nTraining data size:', X.shape,
    '\nTest data size:', X_bts.shape,
    '\n',
    '\nTarget feature numbers (training data):', Counter(y),
    '\nTarget features ratio (training data:', yc1_ratio,
    '\n',
    '\nTarget feature numbers (test data):', Counter(y_bts),
    '\nTarget features ratio (test data):', yc2_ratio,
    '\n\n#####################################################################\n',
)

# Feature-group report. AAindex features are neither listed nor counted in
# this variant (see the banner above).
_rule = '\n================================================================\n'

print(_rule)

print(
    'Strucutral features (n):', len(X_ssFN),
    '\nThese are:',
    '\nCommon stablity features:', X_stabilityN,
    '\nFoldX columns:', X_foldX_cols,
    '\nOther struc columns:', X_str,
    _rule,
)

print('Evolutionary features (n):', len(X_evolFN),
      '\nThese are:\n', X_evolFN, _rule)

# The genomic group is listed as two sub-lists separated by a newline.
print(
    'Genomic features (n):', len(X_genomicFN),
    '\nThese are:\n',
    X_genomic_mafor, '\n',
    X_genomic_linegae,
    _rule,
)

print('Categorical features (n):', len(categorical_FN),
      '\nThese are:\n', categorical_FN, _rule)

# Abort the run if the per-group counts (AAindex excluded) do not add up to
# the column count of X.
_n_grouped = sum(map(len, (X_ssFN, X_evolFN, X_genomicFN, categorical_FN)))
if len(X.columns) != _n_grouped:
    sys.exit('\nFail: Count of feature mismatch')
print('\nPass: No. of features match')

print('\n#####################################################################\n')
|
||||
|
||||
###############################################################################
|
||||
#==================
|
||||
# Baseline models
|
||||
#==================
|
||||
def _score_and_save(train_X, train_y, tag):
    """Run MultModelsCl on one training set and write its two score tables.

    Parameters
    ----------
    train_X, train_y
        Training features and target passed to MultModelsCl as ``input_df``
        and ``target``. The blind-test data is always the module-level
        ``X_bts`` / ``y_bts``.
    tag : str
        Resampling label used in the output file names, e.g. 'baseline'.

    Returns
    -------
    tuple
        ``(scores, all_scores, cv_scores, bt_scores)``: the raw MultModelsCl
        result, its transposed DataFrame, the 'test_*' columns sorted by
        'test_mcc' (descending) and the 'bts_*' columns sorted by 'bts_mcc'
        (descending).
    """
    # NOTE: the keyword is blind_test_df, not blind_test_input_df; the
    # parameter was renamed in MultModelsCl and the old name is rejected.
    scores = MultModelsCl(input_df=train_X,
                          target=train_y,
                          var_type='mixed',
                          skf_cv=skf_cv,
                          blind_test_df=X_bts,
                          blind_test_target=y_bts)

    # presumably one row per classifier after the transpose -- confirm
    # against the structure returned by MultModelsCl
    all_scores = pd.DataFrame(scores).T

    # Sort by chaining rather than inplace=True so the filtered frame is
    # never mutated in place. 'train_*' columns are deliberately not exported.
    cv_scores = (all_scores.filter(like='test_', axis=1)
                 .sort_values(by=['test_mcc'], ascending=False))
    bt_scores = (all_scores.filter(like='bts_', axis=1)
                 .sort_values(by=['bts_mcc'], ascending=False))

    # Write csv
    prefix = outdir_ml + gene.lower() + '_' + tag
    cv_scores.to_csv(prefix + '_CT_allF.csv')
    bt_scores.to_csv(prefix + '_BT_allF.csv')
    return scores, all_scores, cv_scores, bt_scores


mm_skf_scoresD, baseline_all, baseline_CT, baseline_BT = _score_and_save(X, y, 'baseline')

#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7, smnc_all, smnc_CT, smnc_BT = _score_and_save(X_smnc, y_smnc, 'smnc')

#%% ROS: Numerical + categorical
mm_skf_scoresD3, ros_all, ros_CT, ros_BT = _score_and_save(X_ros, y_ros, 'ros')

#%% RUS: Numerical + categorical
mm_skf_scoresD4, rus_all, rus_CT, rus_BT = _score_and_save(X_rus, y_rus, 'rus')

#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8, rouC_all, rouC_CT, rouC_BT = _score_and_save(X_rouC, y_rouC, 'rouC')
|
|
@ -1,202 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Sat May 28 05:25:30 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
gene = 'katG'
|
||||
drug = 'isoniazid'
|
||||
#total_mtblineage_uc = 8
|
||||
|
||||
homedir = os.path.expanduser("~")
|
||||
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
|
||||
|
||||
from ml_data_orig import *
|
||||
setvars(gene,drug)
|
||||
from ml_data_orig import *
|
||||
|
||||
# from YC run_all_ML: run locally
|
||||
#from UQ_yc_RunAllClfs import run_all_ML
|
||||
|
||||
# TT run all ML clfs: baseline mode
|
||||
from MultModelsCl import MultModelsCl
|
||||
|
||||
############################################################################
|
||||
# Run banner: which analysis variant, gene and drug this script is processing.
print(
    '\n#####################################################################\n',
    '\nRunning ML analysis: ORIGINAL',
    '\nGene name:', gene,
    '\nDrug name:', drug,
)

# Results of this variant are written below outdir, in 'ml/tts_orig/'.
outdir_ml = outdir + 'ml/tts_orig/'
print('\nOutput directory:', outdir_ml)

#%%###########################################################################
# Sizes and class balance of the training and blind-test data.
print(
    '\nSanity checks:',
    '\nTotal input features:', len(X.columns),
    '\n',
    '\nTraining data size:', X.shape,
    '\nTest data size:', X_bts.shape,
    '\n',
    '\nTarget feature numbers (training data):', Counter(y),
    '\nTarget features ratio (training data:', yc1_ratio,
    '\n',
    '\nTarget feature numbers (test data):', Counter(y_bts),
    '\nTarget features ratio (test data):', yc2_ratio,
    '\n\n#####################################################################\n',
)

# Feature-group report: count and list the names in every predictor group.
_rule = '\n================================================================\n'

print(_rule)

print(
    'Strucutral features (n):', len(X_ssFN),
    '\nThese are:',
    '\nCommon stablity features:', X_stabilityN,
    '\nFoldX columns:', X_foldX_cols,
    '\nOther struc columns:', X_str,
    _rule,
)

# These two groups are reported with exactly the same layout.
for _label, _names in (
    ('AAindex features (n):', X_aaindexFN),
    ('Evolutionary features (n):', X_evolFN),
):
    print(_label, len(_names), '\nThese are:\n', _names, _rule)

# The genomic group is listed as two sub-lists separated by a newline.
print(
    'Genomic features (n):', len(X_genomicFN),
    '\nThese are:\n',
    X_genomic_mafor, '\n',
    X_genomic_linegae,
    _rule,
)

print('Categorical features (n):', len(categorical_FN),
      '\nThese are:\n', categorical_FN, _rule)

# Abort the run if the per-group counts do not add up to the column count of X.
_n_grouped = sum(map(len, (X_ssFN, X_aaindexFN, X_evolFN, X_genomicFN, categorical_FN)))
if len(X.columns) != _n_grouped:
    sys.exit('\nFail: Count of feature mismatch')
print('\nPass: No. of features match')

print('\n#####################################################################\n')
|
||||
|
||||
###############################################################################
|
||||
#==================
|
||||
# Baseline models
|
||||
#==================
|
||||
def _score_and_save(train_X, train_y, tag):
    """Run MultModelsCl on one training set and write its two score tables.

    Parameters
    ----------
    train_X, train_y
        Training features and target passed to MultModelsCl as ``input_df``
        and ``target``. The blind-test data is always the module-level
        ``X_bts`` / ``y_bts``.
    tag : str
        Resampling label used in the output file names, e.g. 'baseline'.

    Returns
    -------
    tuple
        ``(scores, all_scores, cv_scores, bt_scores)``: the raw MultModelsCl
        result, its transposed DataFrame, the 'test_*' columns sorted by
        'test_mcc' (descending) and the 'bts_*' columns sorted by 'bts_mcc'
        (descending).
    """
    # NOTE: the keyword is blind_test_df, not blind_test_input_df; the
    # parameter was renamed in MultModelsCl and the old name is rejected.
    scores = MultModelsCl(input_df=train_X,
                          target=train_y,
                          var_type='mixed',
                          skf_cv=skf_cv,
                          blind_test_df=X_bts,
                          blind_test_target=y_bts)

    # presumably one row per classifier after the transpose -- confirm
    # against the structure returned by MultModelsCl
    all_scores = pd.DataFrame(scores).T

    # Sort by chaining rather than inplace=True so the filtered frame is
    # never mutated in place. 'train_*' columns are deliberately not exported.
    cv_scores = (all_scores.filter(like='test_', axis=1)
                 .sort_values(by=['test_mcc'], ascending=False))
    bt_scores = (all_scores.filter(like='bts_', axis=1)
                 .sort_values(by=['bts_mcc'], ascending=False))

    # Write csv
    prefix = outdir_ml + gene.lower() + '_' + tag
    cv_scores.to_csv(prefix + '_CT_allF.csv')
    bt_scores.to_csv(prefix + '_BT_allF.csv')
    return scores, all_scores, cv_scores, bt_scores


mm_skf_scoresD, baseline_all, baseline_CT, baseline_BT = _score_and_save(X, y, 'baseline')

#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7, smnc_all, smnc_CT, smnc_BT = _score_and_save(X_smnc, y_smnc, 'smnc')

#%% ROS: Numerical + categorical
mm_skf_scoresD3, ros_all, ros_CT, ros_BT = _score_and_save(X_ros, y_ros, 'ros')

#%% RUS: Numerical + categorical
mm_skf_scoresD4, rus_all, rus_CT, rus_BT = _score_and_save(X_rus, y_rus, 'rus')

#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8, rouC_all, rouC_CT, rouC_BT = _score_and_save(X_rouC, y_rouC, 'rouC')
|
|
@ -1,202 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Sat May 28 05:25:30 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
gene = 'katG'
|
||||
drug = 'isoniazid'
|
||||
#total_mtblineage_uc = 8
|
||||
|
||||
homedir = os.path.expanduser("~")
|
||||
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
|
||||
|
||||
from ml_data_rt import *
|
||||
setvars(gene,drug)
|
||||
from ml_data_rt import *
|
||||
|
||||
# from YC run_all_ML: run locally
|
||||
#from UQ_yc_RunAllClfs import run_all_ML
|
||||
|
||||
# TT run all ML clfs: baseline mode
|
||||
from MultModelsCl import MultModelsCl
|
||||
|
||||
############################################################################
|
||||
# Run banner: which analysis variant, gene and drug this script is processing.
print(
    '\n#####################################################################\n',
    '\nRunning ML analysis: REVERSE training\n',
    '\nGene name:', gene,
    '\nDrug name:', drug,
)

# Results of this variant are written below outdir, in 'ml/tts_rt/'.
outdir_ml = outdir + 'ml/tts_rt/'
print('\nOutput directory:', outdir_ml)

#%%###########################################################################
# Sizes and class balance of the training and blind-test data.
print(
    '\nSanity checks:',
    '\nTotal input features:', len(X.columns),
    '\n',
    '\nTraining data size:', X.shape,
    '\nTest data size:', X_bts.shape,
    '\n',
    '\nTarget feature numbers (training data):', Counter(y),
    '\nTarget features ratio (training data:', yc1_ratio,
    '\n',
    '\nTarget feature numbers (test data):', Counter(y_bts),
    '\nTarget features ratio (test data):', yc2_ratio,
    '\n\n#####################################################################\n',
)

# Feature-group report: count and list the names in every predictor group.
_rule = '\n================================================================\n'

print(_rule)

print(
    'Strucutral features (n):', len(X_ssFN),
    '\nThese are:',
    '\nCommon stablity features:', X_stabilityN,
    '\nFoldX columns:', X_foldX_cols,
    '\nOther struc columns:', X_str,
    _rule,
)

# These two groups are reported with exactly the same layout.
for _label, _names in (
    ('AAindex features (n):', X_aaindexFN),
    ('Evolutionary features (n):', X_evolFN),
):
    print(_label, len(_names), '\nThese are:\n', _names, _rule)

# The genomic group is listed as two sub-lists separated by a newline.
print(
    'Genomic features (n):', len(X_genomicFN),
    '\nThese are:\n',
    X_genomic_mafor, '\n',
    X_genomic_linegae,
    _rule,
)

print('Categorical features (n):', len(categorical_FN),
      '\nThese are:\n', categorical_FN, _rule)

# Abort the run if the per-group counts do not add up to the column count of X.
_n_grouped = sum(map(len, (X_ssFN, X_aaindexFN, X_evolFN, X_genomicFN, categorical_FN)))
if len(X.columns) != _n_grouped:
    sys.exit('\nFail: Count of feature mismatch')
print('\nPass: No. of features match')

print('\n#####################################################################\n')
|
||||
|
||||
###############################################################################
|
||||
#==================
|
||||
# Baseline models
|
||||
#==================
|
||||
def _score_and_save(train_X, train_y, tag):
    """Run MultModelsCl on one training set and write its two score tables.

    Parameters
    ----------
    train_X, train_y
        Training features and target passed to MultModelsCl as ``input_df``
        and ``target``. The blind-test data is always the module-level
        ``X_bts`` / ``y_bts``.
    tag : str
        Resampling label used in the output file names, e.g. 'baseline'.

    Returns
    -------
    tuple
        ``(scores, all_scores, cv_scores, bt_scores)``: the raw MultModelsCl
        result, its transposed DataFrame, the 'test_*' columns sorted by
        'test_mcc' (descending) and the 'bts_*' columns sorted by 'bts_mcc'
        (descending).
    """
    # NOTE: the keyword is blind_test_df, not blind_test_input_df; the
    # parameter was renamed in MultModelsCl and the old name is rejected.
    scores = MultModelsCl(input_df=train_X,
                          target=train_y,
                          var_type='mixed',
                          skf_cv=skf_cv,
                          blind_test_df=X_bts,
                          blind_test_target=y_bts)

    # presumably one row per classifier after the transpose -- confirm
    # against the structure returned by MultModelsCl
    all_scores = pd.DataFrame(scores).T

    # Sort by chaining rather than inplace=True so the filtered frame is
    # never mutated in place. 'train_*' columns are deliberately not exported.
    cv_scores = (all_scores.filter(like='test_', axis=1)
                 .sort_values(by=['test_mcc'], ascending=False))
    bt_scores = (all_scores.filter(like='bts_', axis=1)
                 .sort_values(by=['bts_mcc'], ascending=False))

    # Write csv
    prefix = outdir_ml + gene.lower() + '_' + tag
    cv_scores.to_csv(prefix + '_CT_allF.csv')
    bt_scores.to_csv(prefix + '_BT_allF.csv')
    return scores, all_scores, cv_scores, bt_scores


mm_skf_scoresD, baseline_all, baseline_CT, baseline_BT = _score_and_save(X, y, 'baseline')

#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7, smnc_all, smnc_CT, smnc_BT = _score_and_save(X_smnc, y_smnc, 'smnc')

#%% ROS: Numerical + categorical
mm_skf_scoresD3, ros_all, ros_CT, ros_BT = _score_and_save(X_ros, y_ros, 'ros')

#%% RUS: Numerical + categorical
mm_skf_scoresD4, rus_all, rus_CT, rus_BT = _score_and_save(X_rus, y_rus, 'rus')

#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8, rouC_all, rouC_CT, rouC_BT = _score_and_save(X_rouC, y_rouC, 'rouC')
|
|
@ -1,203 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Sat May 28 05:25:30 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
gene = 'katG'
|
||||
drug = 'isoniazid'
|
||||
#total_mtblineage_uc = 8
|
||||
|
||||
homedir = os.path.expanduser("~")
|
||||
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
|
||||
|
||||
from ml_data_sl import *
|
||||
setvars(gene,drug)
|
||||
from ml_data_sl import *
|
||||
|
||||
# from YC run_all_ML: run locally
|
||||
#from UQ_yc_RunAllClfs import run_all_ML
|
||||
|
||||
# TT run all ML clfs: baseline mode
|
||||
from MultModelsCl import MultModelsCl
|
||||
|
||||
################################################################################
|
||||
# Run banner: which analysis variant, gene and drug this script is processing.
print(
    '\n#####################################################################\n',
    '\nRunning ML analysis: scaling law split',
    '\nGene name:', gene,
    '\nDrug name:', drug,
)

# Results of this variant are written below outdir, in 'ml/tts_sl/'.
outdir_ml = outdir + 'ml/tts_sl/'
print('\nOutput directory:', outdir_ml)

#%%###########################################################################
# Sizes and class balance of the training and blind-test data.
print(
    '\nSanity checks:',
    # disabled: '\nML source data size:', x_features.shape
    '\nTotal input features:', len(X.columns),
    '\n',
    '\nTraining data size:', X.shape,
    '\nTest data size:', X_bts.shape,
    '\n',
    '\nTarget feature numbers (training data):', Counter(y),
    '\nTarget features ratio (training data:', yc1_ratio,
    '\n',
    '\nTarget feature numbers (test data):', Counter(y_bts),
    '\nTarget features ratio (test data):', yc2_ratio,
    '\n\n#####################################################################\n',
)

# Feature-group report: count and list the names in every predictor group.
_rule = '\n================================================================\n'

print(_rule)

print(
    'Strucutral features (n):', len(X_ssFN),
    '\nThese are:',
    '\nCommon stablity features:', X_stabilityN,
    '\nFoldX columns:', X_foldX_cols,
    '\nOther struc columns:', X_str,
    _rule,
)

# These two groups are reported with exactly the same layout.
for _label, _names in (
    ('AAindex features (n):', X_aaindexFN),
    ('Evolutionary features (n):', X_evolFN),
):
    print(_label, len(_names), '\nThese are:\n', _names, _rule)

# The genomic group is listed as two sub-lists separated by a newline.
print(
    'Genomic features (n):', len(X_genomicFN),
    '\nThese are:\n',
    X_genomic_mafor, '\n',
    X_genomic_linegae,
    _rule,
)

print('Categorical features (n):', len(categorical_FN),
      '\nThese are:\n', categorical_FN, _rule)

# Abort the run if the per-group counts do not add up to the column count of X.
_n_grouped = sum(map(len, (X_ssFN, X_aaindexFN, X_evolFN, X_genomicFN, categorical_FN)))
if len(X.columns) != _n_grouped:
    sys.exit('\nFail: Count of feature mismatch')
print('\nPass: No. of features match')

print('\n#####################################################################\n')
|
||||
|
||||
################################################################################
#==================
# Baseline models
#==================
# Fit/score all models on the unresampled training data (X, y) and on the
# blind test set (X_bts, y_bts).
# NOTE: the keyword is `blind_test_df`; MultModelsCl no longer accepts
# `blind_test_input_df`, so the old spelling raises TypeError.
mm_skf_scoresD = MultModelsCl(input_df = X
                              , target = y
                              , var_type = 'mixed'
                              , skf_cv = skf_cv
                              , blind_test_df = X_bts
                              , blind_test_target = y_bts)

# Transpose so the outer keys of the returned scores become the rows
# (presumably one row per model -- confirm against MultModelsCl).
baseline_all = pd.DataFrame(mm_skf_scoresD).T
#baseline_train = baseline_all.filter(like='train_', axis=1)

# Columns whose name contains 'test_', ranked by MCC (best first).
# sort_values returns a new frame, which avoids the SettingWithCopyWarning
# that an inplace sort on a filtered frame can trigger.
baseline_CT = (baseline_all
               .filter(like = 'test_', axis = 1)
               .sort_values(by = ['test_mcc'], ascending = False))

# Columns whose name contains 'bts_' (blind test set), ranked by MCC.
baseline_BT = (baseline_all
               .filter(like = 'bts_', axis = 1)
               .sort_values(by = ['bts_mcc'], ascending = False))

# Write csv
baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv')
baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')
|
||||
|
||||
|
||||
#%% SMOTE NC: Oversampling [Numerical + categorical]
# Same model run as the baseline, but trained on the SMOTE-NC resampled data
# (X_smnc, y_smnc); the blind test set is left unresampled.
# NOTE: the keyword is `blind_test_df`; MultModelsCl no longer accepts
# `blind_test_input_df`, so the old spelling raises TypeError.
mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc
                               , target = y_smnc
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)

# Transpose so the outer keys of the returned scores become the rows
# (presumably one row per model -- confirm against MultModelsCl).
smnc_all = pd.DataFrame(mm_skf_scoresD7).T

# Columns whose name contains 'test_', ranked by MCC (best first).
# sort_values returns a new frame, which avoids the SettingWithCopyWarning
# that an inplace sort on a filtered frame can trigger.
smnc_CT = (smnc_all
           .filter(like = 'test_', axis = 1)
           .sort_values(by = ['test_mcc'], ascending = False))

# Columns whose name contains 'bts_' (blind test set), ranked by MCC.
smnc_BT = (smnc_all
           .filter(like = 'bts_', axis = 1)
           .sort_values(by = ['bts_mcc'], ascending = False))

# Write csv
smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv')
smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')
|
||||
|
||||
#%% ROS: Numerical + categorical
# Same model run as the baseline, but trained on the randomly oversampled
# data (X_ros, y_ros); the blind test set is left unresampled.
# NOTE: the keyword is `blind_test_df`; MultModelsCl no longer accepts
# `blind_test_input_df`, so the old spelling raises TypeError.
mm_skf_scoresD3 = MultModelsCl(input_df = X_ros
                               , target = y_ros
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)

# Transpose so the outer keys of the returned scores become the rows
# (presumably one row per model -- confirm against MultModelsCl).
ros_all = pd.DataFrame(mm_skf_scoresD3).T

# Columns whose name contains 'test_', ranked by MCC (best first).
# sort_values returns a new frame, which avoids the SettingWithCopyWarning
# that an inplace sort on a filtered frame can trigger.
ros_CT = (ros_all
          .filter(like = 'test_', axis = 1)
          .sort_values(by = ['test_mcc'], ascending = False))

# Columns whose name contains 'bts_' (blind test set), ranked by MCC.
ros_BT = (ros_all
          .filter(like = 'bts_', axis = 1)
          .sort_values(by = ['bts_mcc'], ascending = False))

# Write csv
ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv')
ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')
|
||||
|
||||
#%% RUS: Numerical + categorical
# Same model run as the baseline, but trained on the randomly undersampled
# data (X_rus, y_rus); the blind test set is left unresampled.
# NOTE: the keyword is `blind_test_df`; MultModelsCl no longer accepts
# `blind_test_input_df`, so the old spelling raises TypeError.
mm_skf_scoresD4 = MultModelsCl(input_df = X_rus
                               , target = y_rus
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)

# Transpose so the outer keys of the returned scores become the rows
# (presumably one row per model -- confirm against MultModelsCl).
rus_all = pd.DataFrame(mm_skf_scoresD4).T

# Columns whose name contains 'test_', ranked by MCC (best first).
# sort_values returns a new frame, which avoids the SettingWithCopyWarning
# that an inplace sort on a filtered frame can trigger.
rus_CT = (rus_all
          .filter(like = 'test_', axis = 1)
          .sort_values(by = ['test_mcc'], ascending = False))

# Columns whose name contains 'bts_' (blind test set), ranked by MCC.
rus_BT = (rus_all
          .filter(like = 'bts_', axis = 1)
          .sort_values(by = ['bts_mcc'], ascending = False))

# Write csv
rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv')
rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')
|
||||
|
||||
#%% ROS + RUS Combined: Numerical + categorical
# Same model run as the baseline, but trained on the combined over- and
# undersampled data (X_rouC, y_rouC); the blind test set is left unresampled.
# NOTE: the keyword is `blind_test_df`; MultModelsCl no longer accepts
# `blind_test_input_df`, so the old spelling raises TypeError.
mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC
                               , target = y_rouC
                               , var_type = 'mixed'
                               , skf_cv = skf_cv
                               , blind_test_df = X_bts
                               , blind_test_target = y_bts)

# Transpose so the outer keys of the returned scores become the rows
# (presumably one row per model -- confirm against MultModelsCl).
rouC_all = pd.DataFrame(mm_skf_scoresD8).T

# Columns whose name contains 'test_', ranked by MCC (best first).
# sort_values returns a new frame, which avoids the SettingWithCopyWarning
# that an inplace sort on a filtered frame can trigger.
rouC_CT = (rouC_all
           .filter(like = 'test_', axis = 1)
           .sort_values(by = ['test_mcc'], ascending = False))

# Columns whose name contains 'bts_' (blind test set), ranked by MCC.
rouC_BT = (rouC_all
           .filter(like = 'bts_', axis = 1)
           .sort_values(by = ['bts_mcc'], ascending = False))

# Write csv
rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv')
rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')
|
|
@ -1,4 +1,4 @@
|
|||
/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_7030.py:548: SettingWithCopyWarning:
|
||||
/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_7030.py:464: SettingWithCopyWarning:
|
||||
A value is trying to be set on a copy of a slice from a DataFrame
|
||||
|
||||
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
|
||||
|
@ -55,16 +55,13 @@ PASS: OR values imputed, data ready for ML
|
|||
|
||||
Total no. of features for aaindex: 123
|
||||
|
||||
No. of numerical features: 168
|
||||
No. of categorical features: 7
|
||||
|
||||
PASS: x_features has no target variable
|
||||
|
||||
No. of columns for x_features: 175
|
||||
No. of columns for x_features: 174
|
||||
Traceback (most recent call last):
|
||||
File "/home/tanu/git/LSHTM_analysis/scripts/ml/./alr_7030.py", line 19, in <module>
|
||||
File "/home/tanu/git/LSHTM_analysis/scripts/ml/./run_7030.py", line 35, in <module>
|
||||
setvars(gene,drug)
|
||||
File "/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_7030.py", line 658, in setvars
|
||||
File "/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_7030.py", line 636, in setvars
|
||||
X, X_bts, y, y_bts = train_test_split(x_features, y_target
|
||||
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/sklearn/model_selection/_split.py", line 2454, in train_test_split
|
||||
train, test = next(cv.split(X=arrays[0], y=stratify))
|
||||
|
|
|
@ -1,75 +0,0 @@
|
|||
/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_8020.py:549: SettingWithCopyWarning:
|
||||
A value is trying to be set on a copy of a slice from a DataFrame
|
||||
|
||||
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
|
||||
mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
|
||||
1.22.4
|
||||
1.4.1
|
||||
|
||||
aaindex_df contains non-numerical data
|
||||
|
||||
Total no. of non-numerial columns: 2
|
||||
|
||||
Selecting numerical data only
|
||||
|
||||
PASS: successfully selected numerical columns only for aaindex_df
|
||||
|
||||
Now checking for NA in the remaining aaindex_cols
|
||||
|
||||
Counting aaindex_df cols with NA
|
||||
ncols with NA: 4 columns
|
||||
Dropping these...
|
||||
Original ncols: 127
|
||||
|
||||
Revised df ncols: 123
|
||||
|
||||
Checking NA in revised df...
|
||||
|
||||
PASS: cols with NA successfully dropped from aaindex_df
|
||||
Proceeding with combining aa_df with other features_df
|
||||
|
||||
PASS: ncols match
|
||||
Expected ncols: 123
|
||||
Got: 123
|
||||
|
||||
Total no. of columns in clean aa_df: 123
|
||||
|
||||
Proceeding to merge, expected nrows in merged_df: 271
|
||||
|
||||
PASS: my_features_df and aa_df successfully combined
|
||||
nrows: 271
|
||||
ncols: 269
|
||||
count of NULL values before imputation
|
||||
|
||||
or_mychisq 256
|
||||
log10_or_mychisq 256
|
||||
dtype: int64
|
||||
count of NULL values AFTER imputation
|
||||
|
||||
mutationinformation 0
|
||||
or_rawI 0
|
||||
logorI 0
|
||||
dtype: int64
|
||||
|
||||
PASS: OR values imputed, data ready for ML
|
||||
|
||||
Total no. of features for aaindex: 123
|
||||
|
||||
No. of numerical features: 168
|
||||
No. of categorical features: 7
|
||||
|
||||
PASS: x_features has no target variable
|
||||
|
||||
No. of columns for x_features: 175
|
||||
Traceback (most recent call last):
|
||||
File "/home/tanu/git/LSHTM_analysis/scripts/ml/./alr_8020.py", line 19, in <module>
|
||||
setvars(gene,drug)
|
||||
File "/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_8020.py", line 656, in setvars
|
||||
X, X_bts, y, y_bts = train_test_split(x_features, y_target
|
||||
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/sklearn/model_selection/_split.py", line 2454, in train_test_split
|
||||
train, test = next(cv.split(X=arrays[0], y=stratify))
|
||||
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/sklearn/model_selection/_split.py", line 1613, in split
|
||||
for train, test in self._iter_indices(X, y, groups):
|
||||
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/sklearn/model_selection/_split.py", line 1953, in _iter_indices
|
||||
raise ValueError(
|
||||
ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.
|
|
@ -1,113 +0,0 @@
|
|||
/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_cd_7030.py:548: SettingWithCopyWarning:
|
||||
A value is trying to be set on a copy of a slice from a DataFrame
|
||||
|
||||
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
|
||||
mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
|
||||
1.22.4
|
||||
1.4.1
|
||||
|
||||
aaindex_df contains non-numerical data
|
||||
|
||||
Total no. of non-numerial columns: 2
|
||||
|
||||
Selecting numerical data only
|
||||
|
||||
PASS: successfully selected numerical columns only for aaindex_df
|
||||
|
||||
Now checking for NA in the remaining aaindex_cols
|
||||
|
||||
Counting aaindex_df cols with NA
|
||||
ncols with NA: 4 columns
|
||||
Dropping these...
|
||||
Original ncols: 127
|
||||
|
||||
Revised df ncols: 123
|
||||
|
||||
Checking NA in revised df...
|
||||
|
||||
PASS: cols with NA successfully dropped from aaindex_df
|
||||
Proceeding with combining aa_df with other features_df
|
||||
|
||||
PASS: ncols match
|
||||
Expected ncols: 123
|
||||
Got: 123
|
||||
|
||||
Total no. of columns in clean aa_df: 123
|
||||
|
||||
Proceeding to merge, expected nrows in merged_df: 271
|
||||
|
||||
PASS: my_features_df and aa_df successfully combined
|
||||
nrows: 271
|
||||
ncols: 269
|
||||
count of NULL values before imputation
|
||||
|
||||
or_mychisq 256
|
||||
log10_or_mychisq 256
|
||||
dtype: int64
|
||||
count of NULL values AFTER imputation
|
||||
|
||||
mutationinformation 0
|
||||
or_rawI 0
|
||||
logorI 0
|
||||
dtype: int64
|
||||
|
||||
PASS: OR values imputed, data ready for ML
|
||||
|
||||
Total no. of features for aaindex: 123
|
||||
|
||||
No. of numerical features: 168
|
||||
No. of categorical features: 7
|
||||
|
||||
PASS: x_features has no target variable
|
||||
|
||||
No. of columns for x_features: 175
|
||||
|
||||
-------------------------------------------------------------
|
||||
Successfully split data with stratification [COMPLETE data]: 70/30
|
||||
Original data size: (271, 175)
|
||||
Train data size: (181, 175)
|
||||
Test data size: (90, 175)
|
||||
y_train numbers: Counter({0: 180, 1: 1})
|
||||
y_train ratio: 180.0
|
||||
|
||||
y_test_numbers: Counter({0: 89, 1: 1})
|
||||
y_test ratio: 89.0
|
||||
-------------------------------------------------------------
|
||||
|
||||
index: 0
|
||||
ind: 1
|
||||
|
||||
Mask count check: True
|
||||
|
||||
index: 1
|
||||
ind: 2
|
||||
|
||||
Mask count check: True
|
||||
Original Data
|
||||
Counter({0: 180, 1: 1}) Data dim: (181, 175)
|
||||
|
||||
Simple Random OverSampling
|
||||
Counter({0: 180, 1: 180})
|
||||
(360, 175)
|
||||
|
||||
Simple Random UnderSampling
|
||||
Counter({0: 1, 1: 1})
|
||||
(2, 175)
|
||||
|
||||
Simple Combined Over and UnderSampling
|
||||
Counter({0: 180, 1: 180})
|
||||
(360, 175)
|
||||
Traceback (most recent call last):
|
||||
File "/home/tanu/git/LSHTM_analysis/scripts/ml/./alr_cd_7030.py", line 19, in <module>
|
||||
setvars(gene,drug)
|
||||
File "/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_cd_7030.py", line 745, in setvars
|
||||
X_smnc, y_smnc = sm_nc.fit_resample(X, y)
|
||||
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/imblearn/base.py", line 83, in fit_resample
|
||||
output = self._fit_resample(X, y)
|
||||
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/imblearn/over_sampling/_smote/base.py", line 533, in _fit_resample
|
||||
X_resampled, y_resampled = super()._fit_resample(X_encoded, y)
|
||||
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/imblearn/over_sampling/_smote/base.py", line 324, in _fit_resample
|
||||
nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:]
|
||||
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/sklearn/neighbors/_base.py", line 749, in kneighbors
|
||||
raise ValueError(
|
||||
ValueError: Expected n_neighbors <= n_samples, but n_samples = 1, n_neighbors = 6
|
|
@ -1,69 +0,0 @@
|
|||
/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_cd_8020.py:548: SettingWithCopyWarning:
|
||||
A value is trying to be set on a copy of a slice from a DataFrame
|
||||
|
||||
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
|
||||
mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
|
||||
1.22.4
|
||||
1.4.1
|
||||
|
||||
aaindex_df contains non-numerical data
|
||||
|
||||
Total no. of non-numerial columns: 2
|
||||
|
||||
Selecting numerical data only
|
||||
|
||||
PASS: successfully selected numerical columns only for aaindex_df
|
||||
|
||||
Now checking for NA in the remaining aaindex_cols
|
||||
|
||||
Counting aaindex_df cols with NA
|
||||
ncols with NA: 4 columns
|
||||
Dropping these...
|
||||
Original ncols: 127
|
||||
|
||||
Revised df ncols: 123
|
||||
|
||||
Checking NA in revised df...
|
||||
|
||||
PASS: cols with NA successfully dropped from aaindex_df
|
||||
Proceeding with combining aa_df with other features_df
|
||||
|
||||
PASS: ncols match
|
||||
Expected ncols: 123
|
||||
Got: 123
|
||||
|
||||
Total no. of columns in clean aa_df: 123
|
||||
|
||||
Proceeding to merge, expected nrows in merged_df: 271
|
||||
|
||||
PASS: my_features_df and aa_df successfully combined
|
||||
nrows: 271
|
||||
ncols: 269
|
||||
count of NULL values before imputation
|
||||
|
||||
or_mychisq 256
|
||||
log10_or_mychisq 256
|
||||
dtype: int64
|
||||
count of NULL values AFTER imputation
|
||||
|
||||
mutationinformation 0
|
||||
or_rawI 0
|
||||
logorI 0
|
||||
dtype: int64
|
||||
|
||||
PASS: OR values imputed, data ready for ML
|
||||
|
||||
Total no. of features for aaindex: 123
|
||||
|
||||
No. of numerical features: 168
|
||||
No. of categorical features: 7
|
||||
|
||||
PASS: x_features has no target variable
|
||||
|
||||
No. of columns for x_features: 175
|
||||
Traceback (most recent call last):
|
||||
File "/home/tanu/git/LSHTM_analysis/scripts/ml/./alr_cd_8020.py", line 19, in <module>
|
||||
setvars(gene,drug)
|
||||
File "/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_cd_8020.py", line 666, in setvars
|
||||
yc2_ratio = yc2[0]/yc2[1]
|
||||
ZeroDivisionError: division by zero
|
|
@ -1,69 +0,0 @@
|
|||
/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_cd_sl.py:548: SettingWithCopyWarning:
|
||||
A value is trying to be set on a copy of a slice from a DataFrame
|
||||
|
||||
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
|
||||
mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
|
||||
1.22.4
|
||||
1.4.1
|
||||
|
||||
aaindex_df contains non-numerical data
|
||||
|
||||
Total no. of non-numerial columns: 2
|
||||
|
||||
Selecting numerical data only
|
||||
|
||||
PASS: successfully selected numerical columns only for aaindex_df
|
||||
|
||||
Now checking for NA in the remaining aaindex_cols
|
||||
|
||||
Counting aaindex_df cols with NA
|
||||
ncols with NA: 4 columns
|
||||
Dropping these...
|
||||
Original ncols: 127
|
||||
|
||||
Revised df ncols: 123
|
||||
|
||||
Checking NA in revised df...
|
||||
|
||||
PASS: cols with NA successfully dropped from aaindex_df
|
||||
Proceeding with combining aa_df with other features_df
|
||||
|
||||
PASS: ncols match
|
||||
Expected ncols: 123
|
||||
Got: 123
|
||||
|
||||
Total no. of columns in clean aa_df: 123
|
||||
|
||||
Proceeding to merge, expected nrows in merged_df: 271
|
||||
|
||||
PASS: my_features_df and aa_df successfully combined
|
||||
nrows: 271
|
||||
ncols: 269
|
||||
count of NULL values before imputation
|
||||
|
||||
or_mychisq 256
|
||||
log10_or_mychisq 256
|
||||
dtype: int64
|
||||
count of NULL values AFTER imputation
|
||||
|
||||
mutationinformation 0
|
||||
or_rawI 0
|
||||
logorI 0
|
||||
dtype: int64
|
||||
|
||||
PASS: OR values imputed, data ready for ML
|
||||
|
||||
Total no. of features for aaindex: 123
|
||||
|
||||
No. of numerical features: 168
|
||||
No. of categorical features: 7
|
||||
|
||||
PASS: x_features has no target variable
|
||||
|
||||
No. of columns for x_features: 175
|
||||
Traceback (most recent call last):
|
||||
File "/home/tanu/git/LSHTM_analysis/scripts/ml/./alr_cd_sl.py", line 19, in <module>
|
||||
setvars(gene,drug)
|
||||
File "/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_cd_sl.py", line 669, in setvars
|
||||
yc2_ratio = yc2[0]/yc2[1]
|
||||
ZeroDivisionError: division by zero
|
|
@ -1,105 +0,0 @@
|
|||
/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data.py:550: SettingWithCopyWarning:
|
||||
A value is trying to be set on a copy of a slice from a DataFrame
|
||||
|
||||
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
|
||||
mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
|
||||
1.22.4
|
||||
1.4.1
|
||||
|
||||
aaindex_df contains non-numerical data
|
||||
|
||||
Total no. of non-numerial columns: 2
|
||||
|
||||
Selecting numerical data only
|
||||
|
||||
PASS: successfully selected numerical columns only for aaindex_df
|
||||
|
||||
Now checking for NA in the remaining aaindex_cols
|
||||
|
||||
Counting aaindex_df cols with NA
|
||||
ncols with NA: 4 columns
|
||||
Dropping these...
|
||||
Original ncols: 127
|
||||
|
||||
Revised df ncols: 123
|
||||
|
||||
Checking NA in revised df...
|
||||
|
||||
PASS: cols with NA successfully dropped from aaindex_df
|
||||
Proceeding with combining aa_df with other features_df
|
||||
|
||||
PASS: ncols match
|
||||
Expected ncols: 123
|
||||
Got: 123
|
||||
|
||||
Total no. of columns in clean aa_df: 123
|
||||
|
||||
Proceeding to merge, expected nrows in merged_df: 271
|
||||
|
||||
PASS: my_features_df and aa_df successfully combined
|
||||
nrows: 271
|
||||
ncols: 269
|
||||
count of NULL values before imputation
|
||||
|
||||
or_mychisq 256
|
||||
log10_or_mychisq 256
|
||||
dtype: int64
|
||||
count of NULL values AFTER imputation
|
||||
|
||||
mutationinformation 0
|
||||
or_rawI 0
|
||||
logorI 0
|
||||
dtype: int64
|
||||
|
||||
PASS: OR values imputed, data ready for ML
|
||||
|
||||
No. of numerical features: 45
|
||||
No. of categorical features: 7
|
||||
|
||||
index: 0
|
||||
ind: 1
|
||||
|
||||
Mask count check: True
|
||||
|
||||
index: 1
|
||||
ind: 2
|
||||
|
||||
Mask count check: True
|
||||
Original Data
|
||||
Counter({0: 7, 1: 1}) Data dim: (8, 52)
|
||||
|
||||
-------------------------------------------------------------
|
||||
Successfully split data: UQ [no aa_index but active site included] training
|
||||
actual values: training set
|
||||
imputed values: blind test set
|
||||
Train data size: (8, 52)
|
||||
Test data size: (263, 52)
|
||||
y_train numbers: Counter({0: 7, 1: 1})
|
||||
y_train ratio: 7.0
|
||||
|
||||
y_test_numbers: Counter({0: 262, 1: 1})
|
||||
y_test ratio: 262.0
|
||||
-------------------------------------------------------------
|
||||
Simple Random OverSampling
|
||||
Counter({0: 7, 1: 7})
|
||||
(14, 52)
|
||||
Simple Random UnderSampling
|
||||
Counter({0: 1, 1: 1})
|
||||
(2, 52)
|
||||
Simple Combined Over and UnderSampling
|
||||
Counter({0: 7, 1: 7})
|
||||
(14, 52)
|
||||
Traceback (most recent call last):
|
||||
File "/home/tanu/git/LSHTM_analysis/scripts/ml/./alr_config.py", line 26, in <module>
|
||||
setvars(gene,drug)
|
||||
File "/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data.py", line 701, in setvars
|
||||
X_smnc, y_smnc = sm_nc.fit_resample(X, y)
|
||||
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/imblearn/base.py", line 83, in fit_resample
|
||||
output = self._fit_resample(X, y)
|
||||
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/imblearn/over_sampling/_smote/base.py", line 533, in _fit_resample
|
||||
X_resampled, y_resampled = super()._fit_resample(X_encoded, y)
|
||||
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/imblearn/over_sampling/_smote/base.py", line 324, in _fit_resample
|
||||
nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:]
|
||||
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/sklearn/neighbors/_base.py", line 749, in kneighbors
|
||||
raise ValueError(
|
||||
ValueError: Expected n_neighbors <= n_samples, but n_samples = 1, n_neighbors = 6
|
|
@ -1,107 +0,0 @@
|
|||
/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_orig.py:550: SettingWithCopyWarning:
|
||||
A value is trying to be set on a copy of a slice from a DataFrame
|
||||
|
||||
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
|
||||
mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
|
||||
1.22.4
|
||||
1.4.1
|
||||
|
||||
aaindex_df contains non-numerical data
|
||||
|
||||
Total no. of non-numerial columns: 2
|
||||
|
||||
Selecting numerical data only
|
||||
|
||||
PASS: successfully selected numerical columns only for aaindex_df
|
||||
|
||||
Now checking for NA in the remaining aaindex_cols
|
||||
|
||||
Counting aaindex_df cols with NA
|
||||
ncols with NA: 4 columns
|
||||
Dropping these...
|
||||
Original ncols: 127
|
||||
|
||||
Revised df ncols: 123
|
||||
|
||||
Checking NA in revised df...
|
||||
|
||||
PASS: cols with NA successfully dropped from aaindex_df
|
||||
Proceeding with combining aa_df with other features_df
|
||||
|
||||
PASS: ncols match
|
||||
Expected ncols: 123
|
||||
Got: 123
|
||||
|
||||
Total no. of columns in clean aa_df: 123
|
||||
|
||||
Proceeding to merge, expected nrows in merged_df: 271
|
||||
|
||||
PASS: my_features_df and aa_df successfully combined
|
||||
nrows: 271
|
||||
ncols: 269
|
||||
count of NULL values before imputation
|
||||
|
||||
or_mychisq 256
|
||||
log10_or_mychisq 256
|
||||
dtype: int64
|
||||
count of NULL values AFTER imputation
|
||||
|
||||
mutationinformation 0
|
||||
or_rawI 0
|
||||
logorI 0
|
||||
dtype: int64
|
||||
|
||||
PASS: OR values imputed, data ready for ML
|
||||
|
||||
Total no. of features for aaindex: 123
|
||||
|
||||
No. of numerical features: 168
|
||||
No. of categorical features: 7
|
||||
|
||||
index: 0
|
||||
ind: 1
|
||||
|
||||
Mask count check: True
|
||||
|
||||
index: 1
|
||||
ind: 2
|
||||
|
||||
Mask count check: True
|
||||
Original Data
|
||||
Counter({0: 7, 1: 1}) Data dim: (8, 175)
|
||||
|
||||
-------------------------------------------------------------
|
||||
Successfully split data: ORIGINAL training
|
||||
actual values: training set
|
||||
imputed values: blind test set
|
||||
Train data size: (8, 175)
|
||||
Test data size: (263, 175)
|
||||
y_train numbers: Counter({0: 7, 1: 1})
|
||||
y_train ratio: 7.0
|
||||
|
||||
y_test_numbers: Counter({0: 262, 1: 1})
|
||||
y_test ratio: 262.0
|
||||
-------------------------------------------------------------
|
||||
Simple Random OverSampling
|
||||
Counter({0: 7, 1: 7})
|
||||
(14, 175)
|
||||
Simple Random UnderSampling
|
||||
Counter({0: 1, 1: 1})
|
||||
(2, 175)
|
||||
Simple Combined Over and UnderSampling
|
||||
Counter({0: 7, 1: 7})
|
||||
(14, 175)
|
||||
Traceback (most recent call last):
|
||||
File "/home/tanu/git/LSHTM_analysis/scripts/ml/./alr_orig.py", line 19, in <module>
|
||||
setvars(gene,drug)
|
||||
File "/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_orig.py", line 701, in setvars
|
||||
X_smnc, y_smnc = sm_nc.fit_resample(X, y)
|
||||
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/imblearn/base.py", line 83, in fit_resample
|
||||
output = self._fit_resample(X, y)
|
||||
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/imblearn/over_sampling/_smote/base.py", line 533, in _fit_resample
|
||||
X_resampled, y_resampled = super()._fit_resample(X_encoded, y)
|
||||
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/imblearn/over_sampling/_smote/base.py", line 324, in _fit_resample
|
||||
nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:]
|
||||
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/sklearn/neighbors/_base.py", line 749, in kneighbors
|
||||
raise ValueError(
|
||||
ValueError: Expected n_neighbors <= n_samples, but n_samples = 1, n_neighbors = 6
|
|
@ -1,107 +0,0 @@
|
|||
/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_rt.py:550: SettingWithCopyWarning:
|
||||
A value is trying to be set on a copy of a slice from a DataFrame
|
||||
|
||||
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
|
||||
mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
|
||||
1.22.4
|
||||
1.4.1
|
||||
|
||||
aaindex_df contains non-numerical data
|
||||
|
||||
Total no. of non-numerial columns: 2
|
||||
|
||||
Selecting numerical data only
|
||||
|
||||
PASS: successfully selected numerical columns only for aaindex_df
|
||||
|
||||
Now checking for NA in the remaining aaindex_cols
|
||||
|
||||
Counting aaindex_df cols with NA
|
||||
ncols with NA: 4 columns
|
||||
Dropping these...
|
||||
Original ncols: 127
|
||||
|
||||
Revised df ncols: 123
|
||||
|
||||
Checking NA in revised df...
|
||||
|
||||
PASS: cols with NA successfully dropped from aaindex_df
|
||||
Proceeding with combining aa_df with other features_df
|
||||
|
||||
PASS: ncols match
|
||||
Expected ncols: 123
|
||||
Got: 123
|
||||
|
||||
Total no. of columns in clean aa_df: 123
|
||||
|
||||
Proceeding to merge, expected nrows in merged_df: 271
|
||||
|
||||
PASS: my_features_df and aa_df successfully combined
|
||||
nrows: 271
|
||||
ncols: 269
|
||||
count of NULL values before imputation
|
||||
|
||||
or_mychisq 256
|
||||
log10_or_mychisq 256
|
||||
dtype: int64
|
||||
count of NULL values AFTER imputation
|
||||
|
||||
mutationinformation 0
|
||||
or_rawI 0
|
||||
logorI 0
|
||||
dtype: int64
|
||||
|
||||
PASS: OR values imputed, data ready for ML
|
||||
|
||||
Total no. of features for aaindex: 123
|
||||
|
||||
No. of numerical features: 168
|
||||
No. of categorical features: 7
|
||||
|
||||
index: 0
|
||||
ind: 1
|
||||
|
||||
Mask count check: True
|
||||
|
||||
index: 1
|
||||
ind: 2
|
||||
|
||||
Mask count check: True
|
||||
Original Data
|
||||
Counter({0: 262, 1: 1}) Data dim: (263, 175)
|
||||
|
||||
-------------------------------------------------------------
|
||||
Successfully split data: REVERSE training
|
||||
imputed values: training set
|
||||
actual values: blind test set
|
||||
Train data size: (263, 175)
|
||||
Test data size: (8, 175)
|
||||
y_train numbers: Counter({0: 262, 1: 1})
|
||||
y_train ratio: 262.0
|
||||
|
||||
y_test_numbers: Counter({0: 7, 1: 1})
|
||||
y_test ratio: 7.0
|
||||
-------------------------------------------------------------
|
||||
Simple Random OverSampling
|
||||
Counter({0: 262, 1: 262})
|
||||
(524, 175)
|
||||
Simple Random UnderSampling
|
||||
Counter({0: 1, 1: 1})
|
||||
(2, 175)
|
||||
Simple Combined Over and UnderSampling
|
||||
Counter({0: 262, 1: 262})
|
||||
(524, 175)
|
||||
Traceback (most recent call last):
|
||||
File "/home/tanu/git/LSHTM_analysis/scripts/ml/./alr_rt.py", line 19, in <module>
|
||||
setvars(gene,drug)
|
||||
File "/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_rt.py", line 701, in setvars
|
||||
X_smnc, y_smnc = sm_nc.fit_resample(X, y)
|
||||
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/imblearn/base.py", line 83, in fit_resample
|
||||
output = self._fit_resample(X, y)
|
||||
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/imblearn/over_sampling/_smote/base.py", line 533, in _fit_resample
|
||||
X_resampled, y_resampled = super()._fit_resample(X_encoded, y)
|
||||
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/imblearn/over_sampling/_smote/base.py", line 324, in _fit_resample
|
||||
nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:]
|
||||
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/sklearn/neighbors/_base.py", line 749, in kneighbors
|
||||
raise ValueError(
|
||||
ValueError: Expected n_neighbors <= n_samples, but n_samples = 1, n_neighbors = 6
|
|
@ -1,75 +0,0 @@
|
|||
/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_sl.py:549: SettingWithCopyWarning:
|
||||
A value is trying to be set on a copy of a slice from a DataFrame
|
||||
|
||||
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
|
||||
mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
|
||||
1.22.4
|
||||
1.4.1
|
||||
|
||||
aaindex_df contains non-numerical data
|
||||
|
||||
Total no. of non-numerial columns: 2
|
||||
|
||||
Selecting numerical data only
|
||||
|
||||
PASS: successfully selected numerical columns only for aaindex_df
|
||||
|
||||
Now checking for NA in the remaining aaindex_cols
|
||||
|
||||
Counting aaindex_df cols with NA
|
||||
ncols with NA: 4 columns
|
||||
Dropping these...
|
||||
Original ncols: 127
|
||||
|
||||
Revised df ncols: 123
|
||||
|
||||
Checking NA in revised df...
|
||||
|
||||
PASS: cols with NA successfully dropped from aaindex_df
|
||||
Proceeding with combining aa_df with other features_df
|
||||
|
||||
PASS: ncols match
|
||||
Expected ncols: 123
|
||||
Got: 123
|
||||
|
||||
Total no. of columns in clean aa_df: 123
|
||||
|
||||
Proceeding to merge, expected nrows in merged_df: 271
|
||||
|
||||
PASS: my_features_df and aa_df successfully combined
|
||||
nrows: 271
|
||||
ncols: 269
|
||||
count of NULL values before imputation
|
||||
|
||||
or_mychisq 256
|
||||
log10_or_mychisq 256
|
||||
dtype: int64
|
||||
count of NULL values AFTER imputation
|
||||
|
||||
mutationinformation 0
|
||||
or_rawI 0
|
||||
logorI 0
|
||||
dtype: int64
|
||||
|
||||
PASS: OR values imputed, data ready for ML
|
||||
|
||||
Total no. of features for aaindex: 123
|
||||
|
||||
No. of numerical features: 168
|
||||
No. of categorical features: 7
|
||||
|
||||
PASS: x_features has no target variable
|
||||
|
||||
No. of columns for x_features: 175
|
||||
Traceback (most recent call last):
|
||||
File "/home/tanu/git/LSHTM_analysis/scripts/ml/./alr_sl.py", line 19, in <module>
|
||||
setvars(gene,drug)
|
||||
File "/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_sl.py", line 660, in setvars
|
||||
X, X_bts, y, y_bts = train_test_split(x_features, y_target
|
||||
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/sklearn/model_selection/_split.py", line 2454, in train_test_split
|
||||
train, test = next(cv.split(X=arrays[0], y=stratify))
|
||||
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/sklearn/model_selection/_split.py", line 1613, in split
|
||||
for train, test in self._iter_indices(X, y, groups):
|
||||
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/sklearn/model_selection/_split.py", line 1953, in _iter_indices
|
||||
raise ValueError(
|
||||
ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.
|
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
|
@ -1,107 +0,0 @@
|
|||
/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_rt.py:550: SettingWithCopyWarning:
|
||||
A value is trying to be set on a copy of a slice from a DataFrame
|
||||
|
||||
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
|
||||
mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
|
||||
1.22.4
|
||||
1.4.1
|
||||
|
||||
aaindex_df contains non-numerical data
|
||||
|
||||
Total no. of non-numerial columns: 2
|
||||
|
||||
Selecting numerical data only
|
||||
|
||||
PASS: successfully selected numerical columns only for aaindex_df
|
||||
|
||||
Now checking for NA in the remaining aaindex_cols
|
||||
|
||||
Counting aaindex_df cols with NA
|
||||
ncols with NA: 4 columns
|
||||
Dropping these...
|
||||
Original ncols: 127
|
||||
|
||||
Revised df ncols: 123
|
||||
|
||||
Checking NA in revised df...
|
||||
|
||||
PASS: cols with NA successfully dropped from aaindex_df
|
||||
Proceeding with combining aa_df with other features_df
|
||||
|
||||
PASS: ncols match
|
||||
Expected ncols: 123
|
||||
Got: 123
|
||||
|
||||
Total no. of columns in clean aa_df: 123
|
||||
|
||||
Proceeding to merge, expected nrows in merged_df: 531
|
||||
|
||||
PASS: my_features_df and aa_df successfully combined
|
||||
nrows: 531
|
||||
ncols: 286
|
||||
count of NULL values before imputation
|
||||
|
||||
or_mychisq 263
|
||||
log10_or_mychisq 263
|
||||
dtype: int64
|
||||
count of NULL values AFTER imputation
|
||||
|
||||
mutationinformation 0
|
||||
or_rawI 0
|
||||
logorI 0
|
||||
dtype: int64
|
||||
|
||||
PASS: OR values imputed, data ready for ML
|
||||
|
||||
Total no. of features for aaindex: 123
|
||||
|
||||
No. of numerical features: 167
|
||||
No. of categorical features: 7
|
||||
|
||||
index: 0
|
||||
ind: 1
|
||||
|
||||
Mask count check: True
|
||||
|
||||
index: 1
|
||||
ind: 2
|
||||
|
||||
Mask count check: True
|
||||
Original Data
|
||||
Counter({0: 409, 1: 3}) Data dim: (412, 174)
|
||||
|
||||
-------------------------------------------------------------
|
||||
Successfully split data: REVERSE training
|
||||
imputed values: training set
|
||||
actual values: blind test set
|
||||
Train data size: (412, 174)
|
||||
Test data size: (119, 174)
|
||||
y_train numbers: Counter({0: 409, 1: 3})
|
||||
y_train ratio: 136.33333333333334
|
||||
|
||||
y_test_numbers: Counter({0: 76, 1: 43})
|
||||
y_test ratio: 1.7674418604651163
|
||||
-------------------------------------------------------------
|
||||
Simple Random OverSampling
|
||||
Counter({0: 409, 1: 409})
|
||||
(818, 174)
|
||||
Simple Random UnderSampling
|
||||
Counter({0: 3, 1: 3})
|
||||
(6, 174)
|
||||
Simple Combined Over and UnderSampling
|
||||
Counter({0: 409, 1: 409})
|
||||
(818, 174)
|
||||
Traceback (most recent call last):
|
||||
File "/home/tanu/git/LSHTM_analysis/scripts/ml/./gid_rt.py", line 19, in <module>
|
||||
setvars(gene,drug)
|
||||
File "/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_rt.py", line 701, in setvars
|
||||
X_smnc, y_smnc = sm_nc.fit_resample(X, y)
|
||||
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/imblearn/base.py", line 83, in fit_resample
|
||||
output = self._fit_resample(X, y)
|
||||
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/imblearn/over_sampling/_smote/base.py", line 533, in _fit_resample
|
||||
X_resampled, y_resampled = super()._fit_resample(X_encoded, y)
|
||||
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/imblearn/over_sampling/_smote/base.py", line 324, in _fit_resample
|
||||
nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:]
|
||||
File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/sklearn/neighbors/_base.py", line 749, in kneighbors
|
||||
raise ValueError(
|
||||
ValueError: Expected n_neighbors <= n_samples, but n_samples = 3, n_neighbors = 6
|
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
|
@ -572,9 +572,10 @@ def setvars(gene,drug):
|
|||
, 'lineage_count_unique'
|
||||
]
|
||||
|
||||
X_gn_Fcat = ['drtype_mode_labels' # beware then you can't use it to predict [USED it for uq_v1, not v2]
|
||||
#, 'gene_name' # will be required for the combined stuff
|
||||
]
|
||||
# X_gn_Fcat = ['drtype_mode_labels' # beware then you can't use it to predict [USED it for uq_v1, not v2]
|
||||
# #, 'gene_name' # will be required for the combined stuff
|
||||
# ]
|
||||
X_gn_Fcat = []
|
||||
|
||||
X_genomicFN = X_gn_mafor_Fnum + X_gn_linegae_Fnum + X_gn_Fcat
|
||||
###############################################################################
|
||||
|
|
|
@ -1,203 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Sat May 28 05:25:30 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
# Target gene and drug for this run; both are handed to setvars() below.
gene = 'pncA'
drug = 'pyrazinamide'
#total_mtblineage_uc = 8

# Work from the ML scripts directory under the user's home so the local
# modules imported below can be found.
homedir = os.path.expanduser('~')
os.chdir(f'{homedir}/git/LSHTM_analysis/scripts/ml/')

from ml_data_7030 import *
setvars(gene, drug)
# NOTE(review): the star import is repeated after setvars(); presumably this
# picks up the names setvars() creates at module level -- confirm before removing.
from ml_data_7030 import *
|
||||
|
||||
# from YC run_all_ML: run locally
|
||||
#from UQ_yc_RunAllClfs import run_all_ML
|
||||
|
||||
# TT run all ML clfs: baseline mode
|
||||
from MultModelsCl import MultModelsCl
|
||||
|
||||
################################################################################
|
||||
# Banner naming the train/test split, gene and drug for this run.
print(
    '\n#####################################################################\n',
    '\nRunning ML analysis: 70/30 split',
    '\nGene name:', gene,
    '\nDrug name:', drug,
)

#==================
# Specify outdir
#==================
# Split-specific output directory built on `outdir` (not defined in this
# script; presumably supplied via the star import / setvars -- verify).
outdir_ml = outdir + 'ml/tts_7030/'
print('\nOutput directory:', outdir_ml)

#%%###########################################################################
# Shapes and class counts of the training (X, y) and blind-test (X_bts, y_bts) data.
print(
    '\nSanity checks:',
    #'\nML source data size:', x_features.shape,
    '\nTotal input features:', len(X.columns),
    '\n',
    '\nTraining data size:', X.shape,
    '\nTest data size:', X_bts.shape,
    '\n',
    '\nTarget feature numbers (training data):', Counter(y),
    '\nTarget features ratio (training data:', yc1_ratio,
    '\n',
    '\nTarget feature numbers (test data):', Counter(y_bts),
    '\nTarget features ratio (test data):', yc2_ratio,
    '\n\n#####################################################################\n',
)

print('\n================================================================\n')

# One report per feature group: the group size followed by the feature names.
print(
    'Strucutral features (n):',
    len(X_ssFN),
    '\nThese are:',
    '\nCommon stablity features:', X_stabilityN,
    '\nFoldX columns:', X_foldX_cols,
    '\nOther struc columns:', X_str,
    '\n================================================================\n',
)

print(
    'AAindex features (n):',
    len(X_aaindexFN),
    '\nThese are:\n',
    X_aaindexFN,
    '\n================================================================\n',
)

print(
    'Evolutionary features (n):',
    len(X_evolFN),
    '\nThese are:\n',
    X_evolFN,
    '\n================================================================\n',
)

print(
    'Genomic features (n):',
    len(X_genomicFN),
    '\nThese are:\n',
    X_genomic_mafor, '\n',
    X_genomic_linegae,
    '\n================================================================\n',
)

print(
    'Categorical features (n):',
    len(categorical_FN),
    '\nThese are:\n',
    categorical_FN,
    '\n================================================================\n',
)

# Stop the run unless X has exactly as many columns as the feature-group
# lists add up to.
if len(X.columns) != (len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN)
                      + len(X_genomicFN) + len(categorical_FN)):
    sys.exit('\nFail: Count of feature mismatch')
print('\nPass: No. of features match')

print('\n#####################################################################\n')
|
||||
|
||||
################################################################################
|
||||
#==================
# Model runs: baseline + resampled training sets
#==================
def _run_and_save(train_X, train_y, tag):
    """Score all classifiers on one training set and write the two score tables.

    Calls MultModelsCl with the given training data and the shared blind-test
    set (X_bts, y_bts), transposes the returned scores into a DataFrame, and
    splits it into the columns whose names contain 'test_' and those whose
    names contain 'bts_' (presumably cross-validation vs blind-test scores --
    confirm against MultModelsCl). Each table is sorted by its MCC column in
    descending order and written to
    outdir_ml + gene.lower() + '_<tag>_CT_allF.csv' / '_<tag>_BT_allF.csv'.

    Parameters
    ----------
    train_X, train_y : training features and target passed to MultModelsCl.
    tag : str, label used in the output file names (e.g. 'baseline', 'ros').

    Returns
    -------
    (scoresD, all_df, ct_df, bt_df): the raw MultModelsCl result, the
    transposed full table, and the two sorted sub-tables.
    """
    # The blind-test features go in as `blind_test_df`, which is the parameter
    # name MultModelsCl declares; the older `blind_test_input_df` keyword is
    # not accepted by that signature.
    scoresD = MultModelsCl(input_df = train_X
                           , target = train_y
                           , var_type = 'mixed'
                           , skf_cv = skf_cv
                           , blind_test_df = X_bts
                           , blind_test_target = y_bts)

    all_df = pd.DataFrame(scoresD).T

    # Sort by reassignment rather than inplace=True: the filtered frames are
    # derived objects, and mutating them in place is the pattern pandas may
    # flag with SettingWithCopyWarning.
    ct_df = all_df.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
    bt_df = all_df.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

    # Write csv
    ct_df.to_csv(outdir_ml + gene.lower() + '_' + tag + '_CT_allF.csv')
    bt_df.to_csv(outdir_ml + gene.lower() + '_' + tag + '_BT_allF.csv')

    return scoresD, all_df, ct_df, bt_df


# The module-level result names of the original script are kept so that any
# interactive use of them after a run continues to work.

#%% Baseline models
mm_skf_scoresD, baseline_all, baseline_CT, baseline_BT = _run_and_save(X, y, 'baseline')

#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7, smnc_all, smnc_CT, smnc_BT = _run_and_save(X_smnc, y_smnc, 'smnc')

#%% ROS: Numerical + categorical
mm_skf_scoresD3, ros_all, ros_CT, ros_BT = _run_and_save(X_ros, y_ros, 'ros')

#%% RUS: Numerical + categorical
mm_skf_scoresD4, rus_all, rus_CT, rus_BT = _run_and_save(X_rus, y_rus, 'rus')

#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8, rouC_all, rouC_CT, rouC_BT = _run_and_save(X_rouC, y_rouC, 'rouC')
|
|
@ -1,203 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Sat May 28 05:25:30 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
# Target gene and drug for this run; both are handed to setvars() below.
gene = 'pncA'
drug = 'pyrazinamide'
#total_mtblineage_uc = 8

# Work from the ML scripts directory under the user's home so the local
# modules imported below can be found.
homedir = os.path.expanduser('~')
os.chdir(f'{homedir}/git/LSHTM_analysis/scripts/ml/')

from ml_data_8020 import *
setvars(gene, drug)
# NOTE(review): the star import is repeated after setvars(); presumably this
# picks up the names setvars() creates at module level -- confirm before removing.
from ml_data_8020 import *
|
||||
|
||||
# from YC run_all_ML: run locally
|
||||
#from UQ_yc_RunAllClfs import run_all_ML
|
||||
|
||||
# TT run all ML clfs: baseline mode
|
||||
from MultModelsCl import MultModelsCl
|
||||
|
||||
################################################################################
|
||||
# Banner naming the train/test split, gene and drug for this run.
print(
    '\n#####################################################################\n',
    '\nRunning ML analysis: 80/20 split',
    '\nGene name:', gene,
    '\nDrug name:', drug,
)

#==================
# Specify outdir
#==================
# Split-specific output directory built on `outdir` (not defined in this
# script; presumably supplied via the star import / setvars -- verify).
outdir_ml = outdir + 'ml/tts_8020/'
print('\nOutput directory:', outdir_ml)

#%%###########################################################################
# Shapes and class counts of the training (X, y) and blind-test (X_bts, y_bts) data.
print(
    '\nSanity checks:',
    #'\nML source data size:', x_features.shape,
    '\nTotal input features:', len(X.columns),
    '\n',
    '\nTraining data size:', X.shape,
    '\nTest data size:', X_bts.shape,
    '\n',
    '\nTarget feature numbers (training data):', Counter(y),
    '\nTarget features ratio (training data:', yc1_ratio,
    '\n',
    '\nTarget feature numbers (test data):', Counter(y_bts),
    '\nTarget features ratio (test data):', yc2_ratio,
    '\n\n#####################################################################\n',
)

print('\n================================================================\n')

# One report per feature group: the group size followed by the feature names.
print(
    'Strucutral features (n):',
    len(X_ssFN),
    '\nThese are:',
    '\nCommon stablity features:', X_stabilityN,
    '\nFoldX columns:', X_foldX_cols,
    '\nOther struc columns:', X_str,
    '\n================================================================\n',
)

print(
    'AAindex features (n):',
    len(X_aaindexFN),
    '\nThese are:\n',
    X_aaindexFN,
    '\n================================================================\n',
)

print(
    'Evolutionary features (n):',
    len(X_evolFN),
    '\nThese are:\n',
    X_evolFN,
    '\n================================================================\n',
)

print(
    'Genomic features (n):',
    len(X_genomicFN),
    '\nThese are:\n',
    X_genomic_mafor, '\n',
    X_genomic_linegae,
    '\n================================================================\n',
)

print(
    'Categorical features (n):',
    len(categorical_FN),
    '\nThese are:\n',
    categorical_FN,
    '\n================================================================\n',
)

# Stop the run unless X has exactly as many columns as the feature-group
# lists add up to.
if len(X.columns) != (len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN)
                      + len(X_genomicFN) + len(categorical_FN)):
    sys.exit('\nFail: Count of feature mismatch')
print('\nPass: No. of features match')

print('\n#####################################################################\n')
|
||||
|
||||
################################################################################
|
||||
#==================
# Model runs: baseline + resampled training sets
#==================
def _run_and_save(train_X, train_y, tag):
    """Score all classifiers on one training set and write the two score tables.

    Calls MultModelsCl with the given training data and the shared blind-test
    set (X_bts, y_bts), transposes the returned scores into a DataFrame, and
    splits it into the columns whose names contain 'test_' and those whose
    names contain 'bts_' (presumably cross-validation vs blind-test scores --
    confirm against MultModelsCl). Each table is sorted by its MCC column in
    descending order and written to
    outdir_ml + gene.lower() + '_<tag>_CT_allF.csv' / '_<tag>_BT_allF.csv'.

    Parameters
    ----------
    train_X, train_y : training features and target passed to MultModelsCl.
    tag : str, label used in the output file names (e.g. 'baseline', 'ros').

    Returns
    -------
    (scoresD, all_df, ct_df, bt_df): the raw MultModelsCl result, the
    transposed full table, and the two sorted sub-tables.
    """
    # The blind-test features go in as `blind_test_df`, which is the parameter
    # name MultModelsCl declares; the older `blind_test_input_df` keyword is
    # not accepted by that signature.
    scoresD = MultModelsCl(input_df = train_X
                           , target = train_y
                           , var_type = 'mixed'
                           , skf_cv = skf_cv
                           , blind_test_df = X_bts
                           , blind_test_target = y_bts)

    all_df = pd.DataFrame(scoresD).T

    # Sort by reassignment rather than inplace=True: the filtered frames are
    # derived objects, and mutating them in place is the pattern pandas may
    # flag with SettingWithCopyWarning.
    ct_df = all_df.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
    bt_df = all_df.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

    # Write csv
    ct_df.to_csv(outdir_ml + gene.lower() + '_' + tag + '_CT_allF.csv')
    bt_df.to_csv(outdir_ml + gene.lower() + '_' + tag + '_BT_allF.csv')

    return scoresD, all_df, ct_df, bt_df


# The module-level result names of the original script are kept so that any
# interactive use of them after a run continues to work.

#%% Baseline models
mm_skf_scoresD, baseline_all, baseline_CT, baseline_BT = _run_and_save(X, y, 'baseline')

#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7, smnc_all, smnc_CT, smnc_BT = _run_and_save(X_smnc, y_smnc, 'smnc')

#%% ROS: Numerical + categorical
mm_skf_scoresD3, ros_all, ros_CT, ros_BT = _run_and_save(X_ros, y_ros, 'ros')

#%% RUS: Numerical + categorical
mm_skf_scoresD4, rus_all, rus_CT, rus_BT = _run_and_save(X_rus, y_rus, 'rus')

#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8, rouC_all, rouC_CT, rouC_BT = _run_and_save(X_rouC, y_rouC, 'rouC')
|
|
@ -1,203 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Sat May 28 05:25:30 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
# Target gene and drug for this run; both are handed to setvars() below.
gene = 'pncA'
drug = 'pyrazinamide'
#total_mtblineage_uc = 8

# Work from the ML scripts directory under the user's home so the local
# modules imported below can be found.
homedir = os.path.expanduser('~')
os.chdir(f'{homedir}/git/LSHTM_analysis/scripts/ml/')

from ml_data_cd_7030 import *
setvars(gene, drug)
# NOTE(review): the star import is repeated after setvars(); presumably this
# picks up the names setvars() creates at module level -- confirm before removing.
from ml_data_cd_7030 import *
|
||||
|
||||
# from YC run_all_ML: run locally
|
||||
#from UQ_yc_RunAllClfs import run_all_ML
|
||||
|
||||
# TT run all ML clfs: baseline mode
|
||||
from MultModelsCl import MultModelsCl
|
||||
|
||||
################################################################################
|
||||
# Banner naming the train/test split, gene and drug for this run.
print(
    '\n#####################################################################\n',
    '\nRunning ML analysis [COMPLETE DATA]: 70/30 split',
    '\nGene name:', gene,
    '\nDrug name:', drug,
)

#==================
# Specify outdir
#==================
# Split-specific output directory built on `outdir` (not defined in this
# script; presumably supplied via the star import / setvars -- verify).
outdir_ml = outdir + 'ml/tts_cd_7030/'
print('\nOutput directory:', outdir_ml)

#%%###########################################################################
# Shapes and class counts of the training (X, y) and blind-test (X_bts, y_bts) data.
print(
    '\nSanity checks:',
    #'\nML source data size:', x_features.shape,
    '\nTotal input features:', len(X.columns),
    '\n',
    '\nTraining data size:', X.shape,
    '\nTest data size:', X_bts.shape,
    '\n',
    '\nTarget feature numbers (training data):', Counter(y),
    '\nTarget features ratio (training data:', yc1_ratio,
    '\n',
    '\nTarget feature numbers (test data):', Counter(y_bts),
    '\nTarget features ratio (test data):', yc2_ratio,
    '\n\n#####################################################################\n',
)

print('\n================================================================\n')

# One report per feature group: the group size followed by the feature names.
print(
    'Strucutral features (n):',
    len(X_ssFN),
    '\nThese are:',
    '\nCommon stablity features:', X_stabilityN,
    '\nFoldX columns:', X_foldX_cols,
    '\nOther struc columns:', X_str,
    '\n================================================================\n',
)

print(
    'AAindex features (n):',
    len(X_aaindexFN),
    '\nThese are:\n',
    X_aaindexFN,
    '\n================================================================\n',
)

print(
    'Evolutionary features (n):',
    len(X_evolFN),
    '\nThese are:\n',
    X_evolFN,
    '\n================================================================\n',
)

print(
    'Genomic features (n):',
    len(X_genomicFN),
    '\nThese are:\n',
    X_genomic_mafor, '\n',
    X_genomic_linegae,
    '\n================================================================\n',
)

print(
    'Categorical features (n):',
    len(categorical_FN),
    '\nThese are:\n',
    categorical_FN,
    '\n================================================================\n',
)

# Stop the run unless X has exactly as many columns as the feature-group
# lists add up to.
if len(X.columns) != (len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN)
                      + len(X_genomicFN) + len(categorical_FN)):
    sys.exit('\nFail: Count of feature mismatch')
print('\nPass: No. of features match')

print('\n#####################################################################\n')
|
||||
|
||||
################################################################################
|
||||
#==================
# Model runs: baseline + resampled training sets
#==================
def _run_and_save(train_X, train_y, tag):
    """Score all classifiers on one training set and write the two score tables.

    Calls MultModelsCl with the given training data and the shared blind-test
    set (X_bts, y_bts), transposes the returned scores into a DataFrame, and
    splits it into the columns whose names contain 'test_' and those whose
    names contain 'bts_' (presumably cross-validation vs blind-test scores --
    confirm against MultModelsCl). Each table is sorted by its MCC column in
    descending order and written to
    outdir_ml + gene.lower() + '_<tag>_CT_allF.csv' / '_<tag>_BT_allF.csv'.

    Parameters
    ----------
    train_X, train_y : training features and target passed to MultModelsCl.
    tag : str, label used in the output file names (e.g. 'baseline', 'ros').

    Returns
    -------
    (scoresD, all_df, ct_df, bt_df): the raw MultModelsCl result, the
    transposed full table, and the two sorted sub-tables.
    """
    # The blind-test features go in as `blind_test_df`, which is the parameter
    # name MultModelsCl declares; the older `blind_test_input_df` keyword is
    # not accepted by that signature.
    scoresD = MultModelsCl(input_df = train_X
                           , target = train_y
                           , var_type = 'mixed'
                           , skf_cv = skf_cv
                           , blind_test_df = X_bts
                           , blind_test_target = y_bts)

    all_df = pd.DataFrame(scoresD).T

    # Sort by reassignment rather than inplace=True: the filtered frames are
    # derived objects, and mutating them in place is the pattern pandas may
    # flag with SettingWithCopyWarning.
    ct_df = all_df.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
    bt_df = all_df.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

    # Write csv
    ct_df.to_csv(outdir_ml + gene.lower() + '_' + tag + '_CT_allF.csv')
    bt_df.to_csv(outdir_ml + gene.lower() + '_' + tag + '_BT_allF.csv')

    return scoresD, all_df, ct_df, bt_df


# The module-level result names of the original script are kept so that any
# interactive use of them after a run continues to work.

#%% Baseline models
mm_skf_scoresD, baseline_all, baseline_CT, baseline_BT = _run_and_save(X, y, 'baseline')

#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7, smnc_all, smnc_CT, smnc_BT = _run_and_save(X_smnc, y_smnc, 'smnc')

#%% ROS: Numerical + categorical
mm_skf_scoresD3, ros_all, ros_CT, ros_BT = _run_and_save(X_ros, y_ros, 'ros')

#%% RUS: Numerical + categorical
mm_skf_scoresD4, rus_all, rus_CT, rus_BT = _run_and_save(X_rus, y_rus, 'rus')

#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8, rouC_all, rouC_CT, rouC_BT = _run_and_save(X_rouC, y_rouC, 'rouC')
|
|
@ -1,203 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Sat May 28 05:25:30 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
# Target gene and drug for this run; both are handed to setvars() below.
gene = 'pncA'
drug = 'pyrazinamide'
#total_mtblineage_uc = 8

# Work from the ML scripts directory under the user's home so the local
# modules imported below can be found.
homedir = os.path.expanduser('~')
os.chdir(f'{homedir}/git/LSHTM_analysis/scripts/ml/')

from ml_data_cd_8020 import *
setvars(gene, drug)
# NOTE(review): the star import is repeated after setvars(); presumably this
# picks up the names setvars() creates at module level -- confirm before removing.
from ml_data_cd_8020 import *
|
||||
|
||||
# from YC run_all_ML: run locally
|
||||
#from UQ_yc_RunAllClfs import run_all_ML
|
||||
|
||||
# TT run all ML clfs: baseline mode
|
||||
from MultModelsCl import MultModelsCl
|
||||
|
||||
################################################################################
|
||||
# Banner naming the train/test split, gene and drug for this run.
print(
    '\n#####################################################################\n',
    '\nRunning ML analysis [COMPLETE DATA]: 80/20 split',
    '\nGene name:', gene,
    '\nDrug name:', drug,
)

#==================
# Specify outdir
#==================
# Split-specific output directory built on `outdir` (not defined in this
# script; presumably supplied via the star import / setvars -- verify).
outdir_ml = outdir + 'ml/tts_cd_8020/'
print('\nOutput directory:', outdir_ml)

#%%###########################################################################
# Shapes and class counts of the training (X, y) and blind-test (X_bts, y_bts) data.
print(
    '\nSanity checks:',
    #'\nML source data size:', x_features.shape,
    '\nTotal input features:', len(X.columns),
    '\n',
    '\nTraining data size:', X.shape,
    '\nTest data size:', X_bts.shape,
    '\n',
    '\nTarget feature numbers (training data):', Counter(y),
    '\nTarget features ratio (training data:', yc1_ratio,
    '\n',
    '\nTarget feature numbers (test data):', Counter(y_bts),
    '\nTarget features ratio (test data):', yc2_ratio,
    '\n\n#####################################################################\n',
)

print('\n================================================================\n')

# One report per feature group: the group size followed by the feature names.
print(
    'Strucutral features (n):',
    len(X_ssFN),
    '\nThese are:',
    '\nCommon stablity features:', X_stabilityN,
    '\nFoldX columns:', X_foldX_cols,
    '\nOther struc columns:', X_str,
    '\n================================================================\n',
)

print(
    'AAindex features (n):',
    len(X_aaindexFN),
    '\nThese are:\n',
    X_aaindexFN,
    '\n================================================================\n',
)

print(
    'Evolutionary features (n):',
    len(X_evolFN),
    '\nThese are:\n',
    X_evolFN,
    '\n================================================================\n',
)

print(
    'Genomic features (n):',
    len(X_genomicFN),
    '\nThese are:\n',
    X_genomic_mafor, '\n',
    X_genomic_linegae,
    '\n================================================================\n',
)

print(
    'Categorical features (n):',
    len(categorical_FN),
    '\nThese are:\n',
    categorical_FN,
    '\n================================================================\n',
)

# Stop the run unless X has exactly as many columns as the feature-group
# lists add up to.
if len(X.columns) != (len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN)
                      + len(X_genomicFN) + len(categorical_FN)):
    sys.exit('\nFail: Count of feature mismatch')
print('\nPass: No. of features match')

print('\n#####################################################################\n')
|
||||
|
||||
################################################################################
|
||||
#==================
# Baseline models
#==================
# Score every classifier on the training data as-is (X, y) and on the
# blind-test set. The blind-test features are passed as `blind_test_df`,
# which is the parameter name MultModelsCl declares; the older
# `blind_test_input_df` keyword is not accepted by that signature.
mm_skf_scoresD = MultModelsCl(input_df = X
                              , target = y
                              , var_type = 'mixed'
                              , skf_cv = skf_cv
                              , blind_test_df = X_bts
                              , blind_test_target = y_bts)

# Transposed score table (transposed from what MultModelsCl returns).
baseline_all = pd.DataFrame(mm_skf_scoresD).T
#baseline_train = baseline_all.filter(like='train_', axis=1)

# Columns containing 'test_' / 'bts_' (presumably cross-validation vs
# blind-test scores -- confirm), each sorted by MCC, best first. Sorting by
# reassignment avoids mutating a derived frame in place, the pattern pandas
# may flag with SettingWithCopyWarning.
baseline_CT = baseline_all.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
baseline_BT = baseline_all.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

# Write csv
baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv')
baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')
|
||||
|
||||
|
||||
#%% SMOTE NC: Oversampling [Numerical + categorical]
|
||||
mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc
|
||||
, target = y_smnc
|
||||
, var_type = 'mixed'
|
||||
, skf_cv = skf_cv
|
||||
, blind_test_input_df = X_bts
|
||||
, blind_test_target = y_bts)
|
||||
smnc_all = pd.DataFrame(mm_skf_scoresD7)
|
||||
smnc_all = smnc_all.T
|
||||
|
||||
smnc_CT = smnc_all.filter(like='test_', axis=1)
|
||||
smnc_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
|
||||
|
||||
smnc_BT = smnc_all.filter(like='bts_', axis=1)
|
||||
smnc_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
|
||||
|
||||
# Write csv
|
||||
smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv')
|
||||
smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')
|
||||
|
||||
#%% ROS: Numerical + categorical
|
||||
mm_skf_scoresD3 = MultModelsCl(input_df = X_ros
|
||||
, target = y_ros
|
||||
, var_type = 'mixed'
|
||||
, skf_cv = skf_cv
|
||||
, blind_test_input_df = X_bts
|
||||
, blind_test_target = y_bts)
|
||||
ros_all = pd.DataFrame(mm_skf_scoresD3)
|
||||
ros_all = ros_all.T
|
||||
|
||||
ros_CT = ros_all.filter(like='test_', axis=1)
|
||||
ros_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
|
||||
|
||||
ros_BT = ros_all.filter(like='bts_', axis=1)
|
||||
ros_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
|
||||
|
||||
# Write csv
|
||||
ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv')
|
||||
ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')
|
||||
|
||||
#%% RUS: Numerical + categorical
|
||||
mm_skf_scoresD4 = MultModelsCl(input_df = X_rus
|
||||
, target = y_rus
|
||||
, var_type = 'mixed'
|
||||
, skf_cv = skf_cv
|
||||
, blind_test_input_df = X_bts
|
||||
, blind_test_target = y_bts)
|
||||
rus_all = pd.DataFrame(mm_skf_scoresD4)
|
||||
rus_all = rus_all.T
|
||||
|
||||
rus_CT = rus_all.filter(like='test_', axis=1)
|
||||
rus_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
|
||||
|
||||
rus_BT = rus_all.filter(like='bts_' , axis=1)
|
||||
rus_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
|
||||
|
||||
# Write csv
|
||||
rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv')
|
||||
rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')
|
||||
|
||||
#%% ROS + RUS Combined: Numerical + categorical
|
||||
mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC
|
||||
, target = y_rouC
|
||||
, var_type = 'mixed'
|
||||
, skf_cv = skf_cv
|
||||
, blind_test_input_df = X_bts
|
||||
, blind_test_target = y_bts)
|
||||
rouC_all = pd.DataFrame(mm_skf_scoresD8)
|
||||
rouC_all = rouC_all.T
|
||||
|
||||
rouC_CT = rouC_all.filter(like='test_', axis=1)
|
||||
rouC_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
|
||||
|
||||
rouC_BT = rouC_all.filter(like='bts_', axis=1)
|
||||
rouC_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
|
||||
|
||||
# Write csv
|
||||
rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv')
|
||||
rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')
|
|
@ -1,203 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Sat May 28 05:25:30 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
|
||||
import os
import sys  # sys.exit() is called by the feature-count check below
from collections import Counter  # class counts in the sanity report

# Set-up and sanity report for the pncA / pyrazinamide ML run that reads its
# data from ml_data_cd_sl. sys and Counter are imported explicitly (and before
# the star-imports, so anything ml_data_cd_sl exports still takes precedence)
# instead of relying on them leaking through `from ml_data_cd_sl import *`.

gene = 'pncA'
drug = 'pyrazinamide'
#total_mtblineage_uc = 8

homedir = os.path.expanduser("~")
# presumably needed so the project modules imported below can be found - confirm
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')

# NOTE(review): the module is star-imported twice on purpose, it seems: the
# first import provides setvars(); the second presumably pulls in the names
# that setvars(gene, drug) creates - confirm against ml_data_cd_sl.
from ml_data_cd_sl import *
setvars(gene,drug)
from ml_data_cd_sl import *

# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML

# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl

################################################################################
print('\n#####################################################################\n'
      , '\nRunning ML analysis [COMPLETE DATA]: 70/30 split'
      , '\nGene name:', gene
      , '\nDrug name:', drug)

#==================
# Specify outdir
#==================

outdir_ml = outdir + 'ml/tts_cd_sl/'

print('\nOutput directory:', outdir_ml)

#%%###########################################################################
# Report the size and class balance of the training and blind-test data.
print('\nSanity checks:'
      #, '\nML source data size:', x_features.shape
      , '\nTotal input features:', len(X.columns)
      , '\n'
      , '\nTraining data size:', X.shape
      , '\nTest data size:', X_bts.shape
      , '\n'
      , '\nTarget feature numbers (training data):', Counter(y)
      , '\nTarget features ratio (training data:', yc1_ratio
      , '\n'
      , '\nTarget feature numbers (test data):', Counter(y_bts)
      , '\nTarget features ratio (test data):', yc2_ratio

      , '\n\n#####################################################################\n')

print('\n================================================================\n')

# One report per feature group: group size followed by the member names.
print('Strucutral features (n):'
      , len(X_ssFN)
      , '\nThese are:'
      , '\nCommon stablity features:', X_stabilityN
      , '\nFoldX columns:', X_foldX_cols
      , '\nOther struc columns:', X_str
      , '\n================================================================\n')

print('AAindex features (n):'
      , len(X_aaindexFN)
      , '\nThese are:\n'
      , X_aaindexFN
      , '\n================================================================\n')

print('Evolutionary features (n):'
      , len(X_evolFN)
      , '\nThese are:\n'
      , X_evolFN
      , '\n================================================================\n')

print('Genomic features (n):'
      , len(X_genomicFN)
      , '\nThese are:\n'
      , X_genomic_mafor, '\n'
      , X_genomic_linegae
      , '\n================================================================\n')

print('Categorical features (n):'
      , len(categorical_FN)
      , '\nThese are:\n'
      , categorical_FN
      , '\n================================================================\n')

# Abort unless the sizes of the feature groups add up to the number of
# columns in X.
if ( len(X.columns) == len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
    print('\nPass: No. of features match')
else:
    sys.exit('\nFail: Count of feature mismatch')

print('\n#####################################################################\n')
|
||||
|
||||
################################################################################
#==================
# Baseline models
#==================
# Create the output directory up front so that a missing folder cannot make
# to_csv() fail after the models have already been fitted.
os.makedirs(outdir_ml, exist_ok = True)


def _score_and_save(tag, input_df, target):
    """Run MultModelsCl on one training set and write its score tables.

    The blind-test data (X_bts, y_bts), the CV splitter (skf_cv), the output
    directory (outdir_ml) and the gene name are read from module scope.

    Parameters
    ----------
    tag : str
        Label used in the output file names, e.g. 'baseline' or 'smnc'.
    input_df, target
        Training features and target passed straight to MultModelsCl.

    Returns
    -------
    tuple
        (scores returned by MultModelsCl, transposed scores DataFrame,
         'test_*' columns sorted by test_mcc, 'bts_*' columns sorted by bts_mcc)
    """
    # The blind-test frame is passed as `blind_test_df`: MultModelsCl no
    # longer accepts the old keyword `blind_test_input_df`.
    scoresD = MultModelsCl(input_df = input_df
                           , target = target
                           , var_type = 'mixed'
                           , skf_cv = skf_cv
                           , blind_test_df = X_bts
                           , blind_test_target = y_bts)

    # presumably one row per model after transposing - confirm against MultModelsCl
    all_df = pd.DataFrame(scoresD).T

    # sort_values() returns the sorted frame; no in-place sort on a filter() result
    cv_df = all_df.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
    bts_df = all_df.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

    # Write csv
    cv_df.to_csv(outdir_ml + gene.lower() + '_' + tag + '_CT_allF.csv')
    bts_df.to_csv(outdir_ml + gene.lower() + '_' + tag + '_BT_allF.csv')

    return scoresD, all_df, cv_df, bts_df


# The module-level result names of the original script are kept so that
# interactive use of the cells below still finds them.
mm_skf_scoresD, baseline_all, baseline_CT, baseline_BT = _score_and_save('baseline', X, y)

#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7, smnc_all, smnc_CT, smnc_BT = _score_and_save('smnc', X_smnc, y_smnc)

#%% ROS: Numerical + categorical
mm_skf_scoresD3, ros_all, ros_CT, ros_BT = _score_and_save('ros', X_ros, y_ros)

#%% RUS: Numerical + categorical
mm_skf_scoresD4, rus_all, rus_CT, rus_BT = _score_and_save('rus', X_rus, y_rus)

#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8, rouC_all, rouC_CT, rouC_BT = _score_and_save('rouC', X_rouC, y_rouC)
|
|
@ -1,210 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Sat May 28 05:25:30 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
|
||||
import os
import sys  # sys.exit() is called by the feature-count check below
from collections import Counter  # class counts in the sanity report

# Set-up and sanity report for the pncA / pyrazinamide ML run that reads its
# data from ml_data (no AAindex feature group is reported or counted here).
# sys and Counter are imported explicitly (and before the star-imports, so
# anything ml_data exports still takes precedence) instead of relying on them
# leaking through `from ml_data import *`.

gene = 'pncA'
drug = 'pyrazinamide'
#total_mtblineage_uc = 8

homedir = os.path.expanduser("~")
# presumably needed so the project modules imported below can be found - confirm
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')

#---------------------------
# Version 1: no AAindex
#from UQ_ML_data import *
#setvars(gene,drug)
#from UQ_ML_data import *
#---------------------------

# NOTE(review): the module is star-imported twice on purpose, it seems: the
# first import provides setvars(); the second presumably pulls in the names
# that setvars(gene, drug) creates - confirm against ml_data.
from ml_data import *
setvars(gene,drug)
from ml_data import *

# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML

# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl

############################################################################
print('\n#####################################################################\n'
      , '\nRunning ML analysis: UQ [without AA index but with active site annotations]'
      , '\nGene name:', gene
      , '\nDrug name:', drug)

#==================
# Specify outdir
#==================

outdir_ml = outdir + 'ml/uq_v1/'

print('\nOutput directory:', outdir_ml)

#%%###########################################################################
# Report the size and class balance of the training and blind-test data.
print('\nSanity checks:'
      , '\nTotal input features:', len(X.columns)
      , '\n'
      , '\nTraining data size:', X.shape
      , '\nTest data size:', X_bts.shape
      , '\n'
      , '\nTarget feature numbers (training data):', Counter(y)
      , '\nTarget features ratio (training data:', yc1_ratio
      , '\n'
      , '\nTarget feature numbers (test data):', Counter(y_bts)
      , '\nTarget features ratio (test data):', yc2_ratio

      , '\n\n#####################################################################\n')

print('\n================================================================\n')

# One report per feature group: group size followed by the member names.
print('Strucutral features (n):'
      , len(X_ssFN)
      , '\nThese are:'
      , '\nCommon stablity features:', X_stabilityN
      , '\nFoldX columns:', X_foldX_cols
      , '\nOther struc columns:', X_str
      , '\n================================================================\n')

# print('AAindex features (n):'
#       , len(X_aaindexFN)
#       , '\nThese are:\n'
#       , X_aaindexFN
#       , '\n================================================================\n')

print('Evolutionary features (n):'
      , len(X_evolFN)
      , '\nThese are:\n'
      , X_evolFN
      , '\n================================================================\n')

print('Genomic features (n):'
      , len(X_genomicFN)
      , '\nThese are:\n'
      , X_genomic_mafor, '\n'
      , X_genomic_linegae
      , '\n================================================================\n')

print('Categorical features (n):'
      , len(categorical_FN)
      , '\nThese are:\n'
      , categorical_FN
      , '\n================================================================\n')

# Abort unless the sizes of the feature groups add up to the number of
# columns in X. The AAindex group is deliberately left out of the sum here.
#if ( len(X.columns) == len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
if ( len(X.columns) == len(X_ssFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
    print('\nPass: No. of features match')
else:
    sys.exit('\nFail: Count of feature mismatch')

print('\n#####################################################################\n')
|
||||
|
||||
###############################################################################
#==================
# Baseline models
#==================
# Create the output directory up front so that a missing folder cannot make
# to_csv() fail after the models have already been fitted.
os.makedirs(outdir_ml, exist_ok = True)


def _score_and_save(tag, input_df, target):
    """Run MultModelsCl on one training set and write its score tables.

    The blind-test data (X_bts, y_bts), the CV splitter (skf_cv), the output
    directory (outdir_ml) and the gene name are read from module scope.

    Parameters
    ----------
    tag : str
        Label used in the output file names, e.g. 'baseline' or 'smnc'.
    input_df, target
        Training features and target passed straight to MultModelsCl.

    Returns
    -------
    tuple
        (scores returned by MultModelsCl, transposed scores DataFrame,
         'test_*' columns sorted by test_mcc, 'bts_*' columns sorted by bts_mcc)
    """
    # The blind-test frame is passed as `blind_test_df`: MultModelsCl no
    # longer accepts the old keyword `blind_test_input_df`.
    scoresD = MultModelsCl(input_df = input_df
                           , target = target
                           , var_type = 'mixed'
                           , skf_cv = skf_cv
                           , blind_test_df = X_bts
                           , blind_test_target = y_bts)

    # presumably one row per model after transposing - confirm against MultModelsCl
    all_df = pd.DataFrame(scoresD).T

    # sort_values() returns the sorted frame; no in-place sort on a filter() result
    cv_df = all_df.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
    bts_df = all_df.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

    # Write csv
    cv_df.to_csv(outdir_ml + gene.lower() + '_' + tag + '_CT_allF.csv')
    bts_df.to_csv(outdir_ml + gene.lower() + '_' + tag + '_BT_allF.csv')

    return scoresD, all_df, cv_df, bts_df


# The module-level result names of the original script are kept so that
# interactive use of the cells below still finds them.
mm_skf_scoresD, baseline_all, baseline_CT, baseline_BT = _score_and_save('baseline', X, y)

#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7, smnc_all, smnc_CT, smnc_BT = _score_and_save('smnc', X_smnc, y_smnc)

#%% ROS: Numerical + categorical
mm_skf_scoresD3, ros_all, ros_CT, ros_BT = _score_and_save('ros', X_ros, y_ros)

#%% RUS: Numerical + categorical
mm_skf_scoresD4, rus_all, rus_CT, rus_BT = _score_and_save('rus', X_rus, y_rus)

#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8, rouC_all, rouC_CT, rouC_BT = _score_and_save('rouC', X_rouC, y_rouC)
|
|
@ -1,202 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Sat May 28 05:25:30 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
|
||||
import os
import sys  # sys.exit() is called by the feature-count check below
from collections import Counter  # class counts in the sanity report

# Set-up and sanity report for the pncA / pyrazinamide ML run that reads its
# data from ml_data_orig. sys and Counter are imported explicitly (and before
# the star-imports, so anything ml_data_orig exports still takes precedence)
# instead of relying on them leaking through `from ml_data_orig import *`.

gene = 'pncA'
drug = 'pyrazinamide'
#total_mtblineage_uc = 8

homedir = os.path.expanduser("~")
# presumably needed so the project modules imported below can be found - confirm
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')

# NOTE(review): the module is star-imported twice on purpose, it seems: the
# first import provides setvars(); the second presumably pulls in the names
# that setvars(gene, drug) creates - confirm against ml_data_orig.
from ml_data_orig import *
setvars(gene,drug)
from ml_data_orig import *

# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML

# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl

############################################################################
print('\n#####################################################################\n'
      , '\nRunning ML analysis: ORIGINAL'
      , '\nGene name:', gene
      , '\nDrug name:', drug)

#==================
# Specify outdir
#==================

outdir_ml = outdir + 'ml/tts_orig/'

print('\nOutput directory:', outdir_ml)

#%%###########################################################################
# Report the size and class balance of the training and blind-test data.
print('\nSanity checks:'
      , '\nTotal input features:', len(X.columns)
      , '\n'
      , '\nTraining data size:', X.shape
      , '\nTest data size:', X_bts.shape
      , '\n'
      , '\nTarget feature numbers (training data):', Counter(y)
      , '\nTarget features ratio (training data:', yc1_ratio
      , '\n'
      , '\nTarget feature numbers (test data):', Counter(y_bts)
      , '\nTarget features ratio (test data):', yc2_ratio

      , '\n\n#####################################################################\n')

print('\n================================================================\n')

# One report per feature group: group size followed by the member names.
print('Strucutral features (n):'
      , len(X_ssFN)
      , '\nThese are:'
      , '\nCommon stablity features:', X_stabilityN
      , '\nFoldX columns:', X_foldX_cols
      , '\nOther struc columns:', X_str
      , '\n================================================================\n')

print('AAindex features (n):'
      , len(X_aaindexFN)
      , '\nThese are:\n'
      , X_aaindexFN
      , '\n================================================================\n')

print('Evolutionary features (n):'
      , len(X_evolFN)
      , '\nThese are:\n'
      , X_evolFN
      , '\n================================================================\n')

print('Genomic features (n):'
      , len(X_genomicFN)
      , '\nThese are:\n'
      , X_genomic_mafor, '\n'
      , X_genomic_linegae
      , '\n================================================================\n')

print('Categorical features (n):'
      , len(categorical_FN)
      , '\nThese are:\n'
      , categorical_FN
      , '\n================================================================\n')

# Abort unless the sizes of the feature groups add up to the number of
# columns in X.
if ( len(X.columns) == len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
    print('\nPass: No. of features match')
else:
    sys.exit('\nFail: Count of feature mismatch')

print('\n#####################################################################\n')
|
||||
|
||||
###############################################################################
#==================
# Baseline models
#==================
# Create the output directory up front so that a missing folder cannot make
# to_csv() fail after the models have already been fitted.
os.makedirs(outdir_ml, exist_ok = True)


def _score_and_save(tag, input_df, target):
    """Run MultModelsCl on one training set and write its score tables.

    The blind-test data (X_bts, y_bts), the CV splitter (skf_cv), the output
    directory (outdir_ml) and the gene name are read from module scope.

    Parameters
    ----------
    tag : str
        Label used in the output file names, e.g. 'baseline' or 'smnc'.
    input_df, target
        Training features and target passed straight to MultModelsCl.

    Returns
    -------
    tuple
        (scores returned by MultModelsCl, transposed scores DataFrame,
         'test_*' columns sorted by test_mcc, 'bts_*' columns sorted by bts_mcc)
    """
    # The blind-test frame is passed as `blind_test_df`: MultModelsCl no
    # longer accepts the old keyword `blind_test_input_df`.
    scoresD = MultModelsCl(input_df = input_df
                           , target = target
                           , var_type = 'mixed'
                           , skf_cv = skf_cv
                           , blind_test_df = X_bts
                           , blind_test_target = y_bts)

    # presumably one row per model after transposing - confirm against MultModelsCl
    all_df = pd.DataFrame(scoresD).T

    # sort_values() returns the sorted frame; no in-place sort on a filter() result
    cv_df = all_df.filter(like='test_', axis=1).sort_values(by = ['test_mcc'], ascending = False)
    bts_df = all_df.filter(like='bts_', axis=1).sort_values(by = ['bts_mcc'], ascending = False)

    # Write csv
    cv_df.to_csv(outdir_ml + gene.lower() + '_' + tag + '_CT_allF.csv')
    bts_df.to_csv(outdir_ml + gene.lower() + '_' + tag + '_BT_allF.csv')

    return scoresD, all_df, cv_df, bts_df


# The module-level result names of the original script are kept so that
# interactive use of the cells below still finds them.
mm_skf_scoresD, baseline_all, baseline_CT, baseline_BT = _score_and_save('baseline', X, y)

#%% SMOTE NC: Oversampling [Numerical + categorical]
mm_skf_scoresD7, smnc_all, smnc_CT, smnc_BT = _score_and_save('smnc', X_smnc, y_smnc)

#%% ROS: Numerical + categorical
mm_skf_scoresD3, ros_all, ros_CT, ros_BT = _score_and_save('ros', X_ros, y_ros)

#%% RUS: Numerical + categorical
mm_skf_scoresD4, rus_all, rus_CT, rus_BT = _score_and_save('rus', X_rus, y_rus)

#%% ROS + RUS Combined: Numerical + categorical
mm_skf_scoresD8, rouC_all, rouC_CT, rouC_BT = _score_and_save('rouC', X_rouC, y_rouC)
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Add a link
Reference in a new issue