saving work for yesterday where uq runs were repeated

This commit is contained in:
Tanushree Tunstall 2022-06-20 14:57:11 +01:00
parent efeaf52cde
commit 4ab99dcbd2
11 changed files with 10993 additions and 10660 deletions

View file

@ -147,7 +147,7 @@ def MultModelsCl(input_df, target, skf_cv
mlp = MLPClassifier(max_iter = 500, **rs)
dt = DecisionTreeClassifier(**rs)
ets = ExtraTreesClassifier(**rs)
et = ExtraTreeClassifier(**rs)
rf = RandomForestClassifier(**rs, n_estimators = 1000 )
rf2 = RandomForestClassifier(
min_samples_leaf = 5
@ -169,7 +169,6 @@ def MultModelsCl(input_df, target, skf_cv
abc = AdaBoostClassifier(**rs)
bc = BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True)
et = ExtraTreeClassifier(**rs)
gpc = GaussianProcessClassifier(**rs)
gbc = GradientBoostingClassifier(**rs)
qda = QuadraticDiscriminantAnalysis()
@ -181,14 +180,13 @@ def MultModelsCl(input_df, target, skf_cv
, ('Gaussian NB' , gnb)
, ('Naive Bayes' , nb)
, ('K-Nearest Neighbors' , knn)
, ('SVM' , svc)
, ('SVC' , svc)
, ('MLP' , mlp)
, ('Decision Tree' , dt)
, ('Extra Trees' , ets)
, ('Extra Tree' , et)
, ('Random Forest' , rf)
, ('Random Forest2' , rf2)
, ('Naive Bayes' , nb)
, ('XGBoost' , xgb)
, ('LDA' , lda)
, ('Multinomial' , mnb)

View file

@ -32,15 +32,36 @@ from ml_data import *
# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl
#%%###########################################################################
print('\n#####################################################################\n')
print('TESTING cmd:'
############################################################################
print('\n#####################################################################\n'
, '\nRunning ML analysis: UQ [without AA index but with active site annotations]'
, '\nGene name:', gene
, '\nDrug name:', drug
, '\nTotal input features:', X.shape
, '\n', Counter(y))
, '\nDrug name:', drug)
#==================
# Specify outdir
#==================
outdir_ml = outdir + 'ml/uq_v1/'
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
print('\nSanity checks:'
, '\nTotal input features:', len(X.columns)
, '\n'
, '\nTraining data size:', X.shape
, '\nTest data size:', X_bts.shape
, '\n'
, '\nTarget feature numbers (training data):', Counter(y)
, '\nTarget features ratio (training data:', yc1_ratio
, '\n'
, '\nTarget feature numbers (test data):', Counter(y_bts)
, '\nTarget features ratio (test data):', yc2_ratio
, '\n\n#####################################################################\n')
print('\n================================================================\n')
print('Strucutral features (n):'
, len(X_ssFN)
@ -50,11 +71,11 @@ print('Strucutral features (n):'
, '\nOther struc columns:', X_str
, '\n================================================================\n')
print('AAindex features (n):'
, len(X_aaindexFN)
, '\nThese are:\n'
, X_aaindexFN
, '\n================================================================\n')
# print('AAindex features (n):'
# , len(X_aaindexFN)
# , '\nThese are:\n'
# , X_aaindexFN
# , '\n================================================================\n')
print('Evolutionary features (n):'
, len(X_evolFN)
@ -75,20 +96,15 @@ print('Categorical features (n):'
, categorical_FN
, '\n================================================================\n')
if ( len(X.columns) == len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
#if ( len(X.columns) == len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
if ( len(X.columns) == len(X_ssFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
print('\nPass: No. of features match')
else:
sys.exit('\nFail: Count of feature mismatch')
print('\n#####################################################################\n')
################################################################################
#==================
# Specify outdir
#==================
outdir_ml = outdir + 'ml/v2/'
################################################################################
###############################################################################
#==================
# Baseline models
#==================

View file

@ -32,15 +32,36 @@ from ml_data import *
# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl
#%%###########################################################################
print('\n#####################################################################\n')
print('TESTING cmd:'
############################################################################
print('\n#####################################################################\n'
, '\nRunning ML analysis: UQ [without AA index but with active site annotations]'
, '\nGene name:', gene
, '\nDrug name:', drug
, '\nTotal input features:', X.shape
, '\n', Counter(y))
, '\nDrug name:', drug)
#==================
# Specify outdir
#==================
outdir_ml = outdir + 'ml/uq_v1/'
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
print('\nSanity checks:'
, '\nTotal input features:', len(X.columns)
, '\n'
, '\nTraining data size:', X.shape
, '\nTest data size:', X_bts.shape
, '\n'
, '\nTarget feature numbers (training data):', Counter(y)
, '\nTarget features ratio (training data:', yc1_ratio
, '\n'
, '\nTarget feature numbers (test data):', Counter(y_bts)
, '\nTarget features ratio (test data):', yc2_ratio
, '\n\n#####################################################################\n')
print('\n================================================================\n')
print('Strucutral features (n):'
, len(X_ssFN)
@ -50,11 +71,11 @@ print('Strucutral features (n):'
, '\nOther struc columns:', X_str
, '\n================================================================\n')
print('AAindex features (n):'
, len(X_aaindexFN)
, '\nThese are:\n'
, X_aaindexFN
, '\n================================================================\n')
# print('AAindex features (n):'
# , len(X_aaindexFN)
# , '\nThese are:\n'
# , X_aaindexFN
# , '\n================================================================\n')
print('Evolutionary features (n):'
, len(X_evolFN)
@ -75,20 +96,15 @@ print('Categorical features (n):'
, categorical_FN
, '\n================================================================\n')
if ( len(X.columns) == len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
#if ( len(X.columns) == len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
if ( len(X.columns) == len(X_ssFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
print('\nPass: No. of features match')
else:
sys.exit('\nFail: Count of feature mismatch')
print('\n#####################################################################\n')
################################################################################
#==================
# Specify outdir
#==================
outdir_ml = outdir + 'ml/v2/'
################################################################################
###############################################################################
#==================
# Baseline models
#==================

View file

@ -32,15 +32,36 @@ from ml_data import *
# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl
#%%###########################################################################
print('\n#####################################################################\n')
print('TESTING cmd:'
############################################################################
print('\n#####################################################################\n'
, '\nRunning ML analysis: UQ [without AA index but with active site annotations]'
, '\nGene name:', gene
, '\nDrug name:', drug
, '\nTotal input features:', X.shape
, '\n', Counter(y))
, '\nDrug name:', drug)
#==================
# Specify outdir
#==================
outdir_ml = outdir + 'ml/uq_v1/'
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
print('\nSanity checks:'
, '\nTotal input features:', len(X.columns)
, '\n'
, '\nTraining data size:', X.shape
, '\nTest data size:', X_bts.shape
, '\n'
, '\nTarget feature numbers (training data):', Counter(y)
, '\nTarget features ratio (training data:', yc1_ratio
, '\n'
, '\nTarget feature numbers (test data):', Counter(y_bts)
, '\nTarget features ratio (test data):', yc2_ratio
, '\n\n#####################################################################\n')
print('\n================================================================\n')
print('Strucutral features (n):'
, len(X_ssFN)
@ -50,11 +71,11 @@ print('Strucutral features (n):'
, '\nOther struc columns:', X_str
, '\n================================================================\n')
print('AAindex features (n):'
, len(X_aaindexFN)
, '\nThese are:\n'
, X_aaindexFN
, '\n================================================================\n')
# print('AAindex features (n):'
# , len(X_aaindexFN)
# , '\nThese are:\n'
# , X_aaindexFN
# , '\n================================================================\n')
print('Evolutionary features (n):'
, len(X_evolFN)
@ -75,20 +96,15 @@ print('Categorical features (n):'
, categorical_FN
, '\n================================================================\n')
if ( len(X.columns) == len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
#if ( len(X.columns) == len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
if ( len(X.columns) == len(X_ssFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
print('\nPass: No. of features match')
else:
sys.exit('\nFail: Count of feature mismatch')
print('\n#####################################################################\n')
################################################################################
#==================
# Specify outdir
#==================
outdir_ml = outdir + 'ml/v2/'
################################################################################
###############################################################################
#==================
# Baseline models
#==================

View file

@ -32,15 +32,36 @@ from ml_data import *
# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl
#%%###########################################################################
print('\n#####################################################################\n')
print('TESTING cmd:'
############################################################################
print('\n#####################################################################\n'
, '\nRunning ML analysis: UQ [without AA index but with active site annotations]'
, '\nGene name:', gene
, '\nDrug name:', drug
, '\nTotal input features:', X.shape
, '\n', Counter(y))
, '\nDrug name:', drug)
#==================
# Specify outdir
#==================
outdir_ml = outdir + 'ml/uq_v1/'
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
print('\nSanity checks:'
, '\nTotal input features:', len(X.columns)
, '\n'
, '\nTraining data size:', X.shape
, '\nTest data size:', X_bts.shape
, '\n'
, '\nTarget feature numbers (training data):', Counter(y)
, '\nTarget features ratio (training data:', yc1_ratio
, '\n'
, '\nTarget feature numbers (test data):', Counter(y_bts)
, '\nTarget features ratio (test data):', yc2_ratio
, '\n\n#####################################################################\n')
print('\n================================================================\n')
print('Strucutral features (n):'
, len(X_ssFN)
@ -50,11 +71,11 @@ print('Strucutral features (n):'
, '\nOther struc columns:', X_str
, '\n================================================================\n')
print('AAindex features (n):'
, len(X_aaindexFN)
, '\nThese are:\n'
, X_aaindexFN
, '\n================================================================\n')
# print('AAindex features (n):'
# , len(X_aaindexFN)
# , '\nThese are:\n'
# , X_aaindexFN
# , '\n================================================================\n')
print('Evolutionary features (n):'
, len(X_evolFN)
@ -75,20 +96,15 @@ print('Categorical features (n):'
, categorical_FN
, '\n================================================================\n')
if ( len(X.columns) == len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
#if ( len(X.columns) == len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
if ( len(X.columns) == len(X_ssFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
print('\nPass: No. of features match')
else:
sys.exit('\nFail: Count of feature mismatch')
print('\n#####################################################################\n')
################################################################################
#==================
# Specify outdir
#==================
outdir_ml = outdir + 'ml/v2/'
################################################################################
###############################################################################
#==================
# Baseline models
#==================

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -552,18 +552,18 @@ def setvars(gene,drug):
#=================================================
# Training and BLIND test set: imputed vs actual
# BUT in REVERSE i.e
# BUT in REVERSE i.e.
# dst with actual values : blind test
# dst with imputed values : training set
#==================================================
my_df_ml[drug].isna().sum() #'na' ones are now training set
blind_test_df = my_df_ml[my_df_ml[drug].notna()]
blind_test_df.shape
training_df = my_df_ml[my_df_ml[drug].isna()]
training_df.shape
blind_test_df = my_df_ml[my_df_ml[drug].notna()]
blind_test_df.shape
# Target 1: dst_mode
training_df[drug].value_counts()
training_df['dst_mode'].value_counts()

View file

@ -32,15 +32,36 @@ from ml_data import *
# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl
#%%###########################################################################
print('\n#####################################################################\n')
print('TESTING cmd:'
############################################################################
print('\n#####################################################################\n'
, '\nRunning ML analysis: UQ [without AA index but with active site annotations]'
, '\nGene name:', gene
, '\nDrug name:', drug
, '\nTotal input features:', X.shape
, '\n', Counter(y))
, '\nDrug name:', drug)
#==================
# Specify outdir
#==================
outdir_ml = outdir + 'ml/uq_v1/'
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
print('\nSanity checks:'
, '\nTotal input features:', len(X.columns)
, '\n'
, '\nTraining data size:', X.shape
, '\nTest data size:', X_bts.shape
, '\n'
, '\nTarget feature numbers (training data):', Counter(y)
, '\nTarget features ratio (training data:', yc1_ratio
, '\n'
, '\nTarget feature numbers (test data):', Counter(y_bts)
, '\nTarget features ratio (test data):', yc2_ratio
, '\n\n#####################################################################\n')
print('\n================================================================\n')
print('Strucutral features (n):'
, len(X_ssFN)
@ -50,11 +71,11 @@ print('Strucutral features (n):'
, '\nOther struc columns:', X_str
, '\n================================================================\n')
print('AAindex features (n):'
, len(X_aaindexFN)
, '\nThese are:\n'
, X_aaindexFN
, '\n================================================================\n')
# print('AAindex features (n):'
# , len(X_aaindexFN)
# , '\nThese are:\n'
# , X_aaindexFN
# , '\n================================================================\n')
print('Evolutionary features (n):'
, len(X_evolFN)
@ -75,20 +96,15 @@ print('Categorical features (n):'
, categorical_FN
, '\n================================================================\n')
if ( len(X.columns) == len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
#if ( len(X.columns) == len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
if ( len(X.columns) == len(X_ssFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
print('\nPass: No. of features match')
else:
sys.exit('\nFail: Count of feature mismatch')
print('\n#####################################################################\n')
################################################################################
#==================
# Specify outdir
#==================
outdir_ml = outdir + 'ml/v2/'
################################################################################
###############################################################################
#==================
# Baseline models
#==================

View file

@ -32,15 +32,36 @@ from ml_data import *
# TT run all ML clfs: baseline mode
from MultModelsCl import MultModelsCl
#%%###########################################################################
print('\n#####################################################################\n')
print('TESTING cmd:'
############################################################################
print('\n#####################################################################\n'
, '\nRunning ML analysis: UQ [without AA index but with active site annotations]'
, '\nGene name:', gene
, '\nDrug name:', drug
, '\nTotal input features:', X.shape
, '\n', Counter(y))
, '\nDrug name:', drug)
#==================
# Specify outdir
#==================
outdir_ml = outdir + 'ml/uq_v1/'
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
print('\nSanity checks:'
, '\nTotal input features:', len(X.columns)
, '\n'
, '\nTraining data size:', X.shape
, '\nTest data size:', X_bts.shape
, '\n'
, '\nTarget feature numbers (training data):', Counter(y)
, '\nTarget features ratio (training data:', yc1_ratio
, '\n'
, '\nTarget feature numbers (test data):', Counter(y_bts)
, '\nTarget features ratio (test data):', yc2_ratio
, '\n\n#####################################################################\n')
print('\n================================================================\n')
print('Strucutral features (n):'
, len(X_ssFN)
@ -50,11 +71,11 @@ print('Strucutral features (n):'
, '\nOther struc columns:', X_str
, '\n================================================================\n')
print('AAindex features (n):'
, len(X_aaindexFN)
, '\nThese are:\n'
, X_aaindexFN
, '\n================================================================\n')
# print('AAindex features (n):'
# , len(X_aaindexFN)
# , '\nThese are:\n'
# , X_aaindexFN
# , '\n================================================================\n')
print('Evolutionary features (n):'
, len(X_evolFN)
@ -75,20 +96,15 @@ print('Categorical features (n):'
, categorical_FN
, '\n================================================================\n')
if ( len(X.columns) == len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
#if ( len(X.columns) == len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
if ( len(X.columns) == len(X_ssFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
print('\nPass: No. of features match')
else:
sys.exit('\nFail: Count of feature mismatch')
print('\n#####################################################################\n')
################################################################################
#==================
# Specify outdir
#==================
outdir_ml = outdir + 'ml/v2/'
################################################################################
###############################################################################
#==================
# Baseline models
#==================

View file

@ -1,49 +1,51 @@
==================================
# BTS data: UQ
# Features NOT including AA index
# Date: 30/05/2022, but reran with active_site indication column that was added lateron
# TODO: put in folder called v1, and UQ in folder called UQ_v0 (will need to rename!)
# UQ run: same as ORIGINAL except
# Features NOT including AA index, but includes active site annotations
# Date: 30/05/2022, but reran from my desktop on 19/05/2022 to capture log and include active_site indication column
# captures error: 2>&1
=================================
./pnca_config.py
./embb_config.py
./gid_config.py
./katg_config.py
./rpob_config.py
./pnca_config.py 2>&1 | tee log_pnca_config.txt
./embb_config.py 2>&1 | tee log_embb_config.txt
./gid_config.py 2>&1 | tee log_gid_config.txt
./katg_config.py 2>&1 | tee log_katg_config.txt
./rpob_config.py 2>&1 | tee log_rpob_config.txt
./alr_config.py 2>&1 | tee log_alr_config.txt
# ERROR, as expected, too few values!
## ./alr_config.py = NA
# ^^^^^^^^ FIXME!
########################################################################
==================================
# BTS data: imputed values
# All features including AA index
# Date: 16/05/2022
=================================
# Split: ORIGINAL
# actual values: training set
# imputed values: blind set
# All features including AA index
# Date: 18/05/2022 # reran with minor formatting and log capture
# captures error: 2>&1
=================================
./pnca_orig.py 2>&1 | tee log_pnca_orig.txt
./embb_orig.py 2>&1 | tee log_embb_orig.txt
./gid_orig.py 2>&1 | tee log_gid_orig.txt
./katg_orig.py 2>&1 | tee log_katg_orig.txt
./rpob_orig.py 2>&1 | tee log_rpob_orig.txt
./alr_orig.py 2>&1 | tee log_alr_orig.txt # ERROR, as expected, too few values!
./pnca_config.py
./embb_config.py
./gid_config.py
./katg_config.py
./rpob_config.py
##./alr_config.py = NA
########################################################################
=================================
# Split: 70/30
# All features including AA index
# Date: 17/05/2022 and 18/05/2022
# Date: 17/05/2022 and 18/05/2022, reran with minor formatting and log capture
# captures error: 2>&1
=================================
./pnca_7030.py
./embb_7030.py
# ./gid_7030.py: problems, CT values are non existing except for rouC
./pnca_7030.py 2>&1 | tee log_pnca_7030.txt
./embb_7030.py 2>&1 | tee log_embb_7030.txt
./gid_7030.py 2>&1 | tee log_gid_7030.txt
./katg_7030.py
./rpob_7030.py
./katg_7030.py 2>&1 | tee log_katg_7030.txt
./rpob_7030.py 2>&1 | tee log_rpob_7030.txt
./alr_7030.py 2>&1 | tee log_alr_7030.txt # ERROR, as expected, too few values!
##./alr_7030.py = NA
problems, CT values are non existing except for rouC
########################################################################
=================================
@ -57,7 +59,7 @@
./gid_8020.py 2>&1 | tee log_gid_8020.txt
./katg_8020.py 2>&1 | tee log_katg_8020.txt
./rpob_8020.py 2>&1 | tee log_rpob_8020.txt
./alr_8020.py 2>&1 | tee log_alr_8020.txt
./alr_8020.py 2>&1 | tee log_alr_8020.txt # ERROR, as expected, too few values!
########################################################################
@ -72,11 +74,11 @@
./gid_sl.py 2>&1 | tee log_gid_sl.txt
./katg_sl.py 2>&1 | tee log_katg_sl.txt
./rpob_sl.py 2>&1 | tee log_rpob_sl.txt
./alr_sl.py 2>&1 | tee log_alr_sl.txt
./alr_sl.py 2>&1 | tee log_alr_sl.txt # ERROR, as expected, too few values!
########################################################################
=================================
# Split: Reverse training
# Split: REVERSE training
# imputed values: training set
# actual values: blind set
# All features including AA index
@ -88,7 +90,7 @@
./gid_rt.py 2>&1 | tee log_gid_rt.txt
./katg_rt.py 2>&1 | tee log_katg_rt.txt
./rpob_rt.py 2>&1 | tee log_rpob_rt.txt
./alr_rt.py 2>&1 | tee log_alr_rt.txt
./alr_rt.py 2>&1 | tee log_alr_rt.txt # ERROR, as expected, too few values!
########################################################################
# COMPLETE Data: actual + na i.e. imputed
@ -105,7 +107,7 @@
./gid_cd_7030.py 2>&1 | tee log_gid_cd_7030.txt
./katg_cd_7030.py 2>&1 | tee log_katg_cd_7030.txt
./rpob_cd_7030.py 2>&1 | tee log_rpob_cd_7030.txt
./alr_cd_7030.py 2>&1 | tee log_alr_cd_7030.txt
./alr_cd_7030.py 2>&1 | tee log_alr_cd_7030.txt # ERROR, as expected, too few values!
########################################################################
=================================
@ -119,5 +121,18 @@
./gid_cd_8020.py 2>&1 | tee log_gid_cd_8020.txt
./katg_cd_8020.py 2>&1 | tee log_katg_cd_8020.txt
./rpob_cd_8020.py 2>&1 | tee log_rpob_cd_8020.txt
./alr_cd_8020.py 2>&1 | tee log_alr_cd_8020.txt
./alr_cd_8020.py 2>&1 | tee log_alr_cd_8020.txt # ERROR, as expected, too few values!
=================================
# Split: scaling law [COMPLETE DATA]
# All features including AA index
# Date: 18/05/2022
# captures error: 2>&1
=================================
./pnca_cd_sl.py 2>&1 | tee log_pnca_cd_sl.txt
./embb_cd_sl.py 2>&1 | tee log_embb_cd_sl.txt
./gid_cd_sl.py 2>&1 | tee log_gid_cd_sl.txt
./katg_cd_sl.py 2>&1 | tee log_katg_cd_sl.txt
./rpob_cd_sl.py 2>&1 | tee log_rpob_cd_sl.txt
./alr_cd_sl.py 2>&1 | tee log_alr_cd_sl.txt # ERROR, as expected, too few values!