Fixed the masking condition for the gene ML training data and wrote out the revised mask files
This commit is contained in:
parent
0adf69f75a
commit
f4cab1fdfb
3 changed files with 46 additions and 26 deletions
|
@ -101,14 +101,15 @@ def getmldata(gene, drug
|
||||||
datadir = homedir + '/git/Data/'
|
datadir = homedir + '/git/Data/'
|
||||||
indir = datadir + drug + '/input/'
|
indir = datadir + drug + '/input/'
|
||||||
outdir = datadir + drug + '/output/'
|
outdir = datadir + drug + '/output/'
|
||||||
outdir_ml = outdir + 'ml/'
|
#outdir_ml = outdir + 'ml/'
|
||||||
|
outdir_ml = homedir + '/git/LSHTM_ML/output/'
|
||||||
|
|
||||||
#==========================
|
#==========================
|
||||||
# outfile for ML training:
|
# outfile for ML training:
|
||||||
#==========================
|
#==========================
|
||||||
outFile_ml = outdir_ml + gene.lower() + '_training_data.csv'
|
outFile_ml = outdir_ml + gene.lower() + '_training_data.csv'
|
||||||
|
|
||||||
outFile_mask_ml = outdir_ml + gene.lower() + '_mask_check.csv'
|
outFile_mask_ml = outdir_ml + 'genes/mask_check/' + gene.lower() + '_mask_check.csv'
|
||||||
|
|
||||||
#=======
|
#=======
|
||||||
# input
|
# input
|
||||||
|
@ -436,41 +437,58 @@ def getmldata(gene, drug
|
||||||
#X_stabilityN = common_cols_stabiltyN
|
#X_stabilityN = common_cols_stabiltyN
|
||||||
gene_affinity_colnames = []# not needed as its the common ones
|
gene_affinity_colnames = []# not needed as its the common ones
|
||||||
cols_to_mask = ['ligand_affinity_change']
|
cols_to_mask = ['ligand_affinity_change']
|
||||||
|
cols_to_mask_ppi2 = []
|
||||||
|
|
||||||
if gene.lower() in geneL_ppi2:
|
if gene.lower() in geneL_ppi2:
|
||||||
gene_affinity_colnames = ['mcsm_ppi2_affinity', 'interface_dist']
|
gene_affinity_colnames = ['mcsm_ppi2_affinity', 'interface_dist']
|
||||||
#X_stabilityN = common_cols_stabiltyN + geneL_ppi2_st_cols
|
#X_stabilityN = common_cols_stabiltyN + geneL_ppi2_st_cols
|
||||||
cols_to_mask = ['ligand_affinity_change', 'mcsm_ppi2_affinity']
|
#cols_to_mask = ['ligand_affinity_change', 'mcsm_ppi2_affinity']
|
||||||
|
cols_to_mask = ['ligand_affinity_change']
|
||||||
|
cols_to_mask_ppi2 = ['mcsm_ppi2_affinity']
|
||||||
|
|
||||||
|
|
||||||
if gene.lower() in geneL_na:
|
if gene.lower() in geneL_na:
|
||||||
gene_affinity_colnames = ['mcsm_na_affinity']
|
gene_affinity_colnames = ['mcsm_na_affinity']
|
||||||
#X_stabilityN = common_cols_stabiltyN + geneL_na_st_cols
|
#X_stabilityN = common_cols_stabiltyN + geneL_na_st_cols
|
||||||
cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity']
|
cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity']
|
||||||
|
cols_to_mask_ppi2 = []
|
||||||
|
|
||||||
|
|
||||||
if gene.lower() in geneL_na_ppi2:
|
if gene.lower() in geneL_na_ppi2:
|
||||||
gene_affinity_colnames = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist']
|
gene_affinity_colnames = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist']
|
||||||
#X_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols
|
#X_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols
|
||||||
cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity', 'mcsm_ppi2_affinity']
|
#cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity', 'mcsm_ppi2_affinity']
|
||||||
|
cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity']
|
||||||
|
cols_to_mask_ppi2 = ['mcsm_ppi2_affinity']
|
||||||
|
|
||||||
#=======================
|
#=======================
|
||||||
# Masking columns:
|
# Masking columns:
|
||||||
# (mCSM-lig, mCSM-NA, mCSM-ppi2) values for lig_dist >10
|
# lig_dist >10 ==> mCSM-lig AND mCSM-NA col values == 0
|
||||||
|
# interface_dist >10 ==> mCSM-ppi2 col values == 0
|
||||||
#=======================
|
#=======================
|
||||||
my_df_ml['mutationinformation'][my_df_ml['ligand_distance']>10].value_counts()
|
my_df_ml['mutationinformation'][my_df_ml['ligand_distance']>10].value_counts()
|
||||||
my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts()
|
my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts()
|
||||||
my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask].value_counts()
|
my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask].value_counts()
|
||||||
|
|
||||||
# mask the mcsm affinity related columns where ligand distance > 10
|
# mask the mcsm ligand affinity AND mcsm_na affinity columns where ligand distance > 10
|
||||||
my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask] = 0
|
my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask] = 0
|
||||||
(my_df_ml['ligand_affinity_change'] == 0).sum()
|
|
||||||
|
|
||||||
|
# mask the mcsm_ppi2_affinity column where interface_dist > 10
|
||||||
|
if len(cols_to_mask_ppi2) > 0:
|
||||||
|
my_df_ml.loc[(my_df_ml['interface_dist'] > 10), cols_to_mask_ppi2] = 0
|
||||||
|
add_cols_mask = ['interface_dist'] + cols_to_mask_ppi2
|
||||||
|
mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask + add_cols_mask]
|
||||||
|
else:
|
||||||
mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask ]
|
mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask ]
|
||||||
|
|
||||||
#===================================================
|
# sanity check: check script SANITY_CHECK_mask.py
|
||||||
# write file for check
|
|
||||||
|
if write_maskfile:
|
||||||
|
# write mask file for sanity check
|
||||||
#mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
|
#mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
|
||||||
#mask_check.to_csv(outdir + 'ml/' + gene.lower() + '_mask_check.csv')
|
|
||||||
#===================================================
|
mask_check.to_csv(outdir_ml + gene.lower() + '_mask_check.csv')
|
||||||
|
|
||||||
###############################################################################
|
###############################################################################
|
||||||
#%% Feature groups (FG): Build X for Input ML
|
#%% Feature groups (FG): Build X for Input ML
|
||||||
############################################################################
|
############################################################################
|
||||||
|
|
|
@ -77,6 +77,7 @@ import re
|
||||||
import itertools
|
import itertools
|
||||||
from sklearn.model_selection import LeaveOneGroupOut
|
from sklearn.model_selection import LeaveOneGroupOut
|
||||||
from sklearn.decomposition import PCA
|
from sklearn.decomposition import PCA
|
||||||
|
from sklearn.naive_bayes import ComplementNB
|
||||||
|
|
||||||
#%% GLOBALS
|
#%% GLOBALS
|
||||||
#rs = {'random_state': 42}
|
#rs = {'random_state': 42}
|
||||||
|
@ -260,6 +261,8 @@ def MultModelsCl(input_df, target
|
||||||
#======================================================
|
#======================================================
|
||||||
models = [('AdaBoost Classifier' , AdaBoostClassifier(**rs) )
|
models = [('AdaBoost Classifier' , AdaBoostClassifier(**rs) )
|
||||||
, ('Bagging Classifier' , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True, verbose = 3, n_estimators = 100) )
|
, ('Bagging Classifier' , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True, verbose = 3, n_estimators = 100) )
|
||||||
|
#, ('Bernoulli NB' , BernoulliNB() ) # pks Naive Bayes, CAUTION
|
||||||
|
, ('Complement NB' , ComplementNB() )
|
||||||
, ('Decision Tree' , DecisionTreeClassifier(**rs) )
|
, ('Decision Tree' , DecisionTreeClassifier(**rs) )
|
||||||
, ('Extra Tree' , ExtraTreeClassifier(**rs) )
|
, ('Extra Tree' , ExtraTreeClassifier(**rs) )
|
||||||
, ('Extra Trees' , ExtraTreesClassifier(**rs) )
|
, ('Extra Trees' , ExtraTreesClassifier(**rs) )
|
||||||
|
@ -271,8 +274,8 @@ def MultModelsCl(input_df, target
|
||||||
, ('Logistic Regression' , LogisticRegression(**rs) )
|
, ('Logistic Regression' , LogisticRegression(**rs) )
|
||||||
, ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs))
|
, ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs))
|
||||||
, ('MLP' , MLPClassifier(max_iter = 500, **rs) )
|
, ('MLP' , MLPClassifier(max_iter = 500, **rs) )
|
||||||
, ('Multinomial' , MultinomialNB() )
|
, ('Multinomial NB' , MultinomialNB() )
|
||||||
, ('Naive Bayes' , BernoulliNB() )
|
|
||||||
, ('Passive Aggresive' , PassiveAggressiveClassifier(**rs, **njobs) )
|
, ('Passive Aggresive' , PassiveAggressiveClassifier(**rs, **njobs) )
|
||||||
, ('QDA' , QuadraticDiscriminantAnalysis() )
|
, ('QDA' , QuadraticDiscriminantAnalysis() )
|
||||||
, ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) )
|
, ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) )
|
||||||
|
|
|
@ -18,7 +18,6 @@ from SplitTTS import *
|
||||||
from MultClfs_SIMPLE import *
|
from MultClfs_SIMPLE import *
|
||||||
|
|
||||||
#%%
|
#%%
|
||||||
|
|
||||||
skf_cv = StratifiedKFold(n_splits = 10
|
skf_cv = StratifiedKFold(n_splits = 10
|
||||||
, shuffle = True,**rs)
|
, shuffle = True,**rs)
|
||||||
#sel_cv = logo
|
#sel_cv = logo
|
||||||
|
@ -29,16 +28,16 @@ skf_cv = StratifiedKFold(n_splits = 10
|
||||||
gene_model_paramD = {'data_combined_model' : False
|
gene_model_paramD = {'data_combined_model' : False
|
||||||
, 'use_or' : False
|
, 'use_or' : False
|
||||||
, 'omit_all_genomic_features': False
|
, 'omit_all_genomic_features': False
|
||||||
, 'write_maskfile' : False
|
, 'write_maskfile' : True
|
||||||
, 'write_outfile' : False }
|
, 'write_outfile' : False }
|
||||||
|
|
||||||
#df = getmldata(gene, drug, **gene_model_paramD)
|
#df = getmldata(gene, drug, **gene_model_paramD)
|
||||||
df = getmldata('pncA', 'pyrazinamide', **gene_model_paramD)
|
df = getmldata('pncA', 'pyrazinamide', **gene_model_paramD)
|
||||||
df = getmldata('embB', 'ethambutol' , **gene_model_paramD)
|
#df = getmldata('embB', 'ethambutol' , **gene_model_paramD)
|
||||||
df = getmldata('katG', 'isoniazid' , **gene_model_paramD)
|
#df = getmldata('katG', 'isoniazid' , **gene_model_paramD)
|
||||||
df = getmldata('rpoB', 'rifampicin' , **gene_model_paramD)
|
#df = getmldata('rpoB', 'rifampicin' , **gene_model_paramD)
|
||||||
df = getmldata('gid' , 'streptomycin' , **gene_model_paramD)
|
#df = getmldata('gid' , 'streptomycin' , **gene_model_paramD)
|
||||||
#df = getmldata('alr' , 'cycloserine' , **combined_model_paramD)
|
#df = getmldata('alr' , 'cycloserine' , **gene_model_paramD)
|
||||||
|
|
||||||
all(df.columns.isin(['gene_name'])) # should be False
|
all(df.columns.isin(['gene_name'])) # should be False
|
||||||
spl_type = '70_30'
|
spl_type = '70_30'
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue