working on dissected model, testing different feature groups

Tanushree Tunstall 2022-06-20 21:51:07 +01:00
parent 135efcee41
commit e68a153883
4 changed files with 270 additions and 161 deletions

View file

@@ -74,11 +74,11 @@ import json
rs = {'random_state': 42}
njobs = {'n_jobs': 10}
scoring_fn = ({ 'mcc' : make_scorer(matthews_corrcoef)
-              , 'accuracy' : make_scorer(accuracy_score)
              , 'fscore' : make_scorer(f1_score)
              , 'precision' : make_scorer(precision_score)
              , 'recall' : make_scorer(recall_score)
+              , 'accuracy' : make_scorer(accuracy_score)
              , 'roc_auc' : make_scorer(roc_auc_score)
              , 'jcc' : make_scorer(jaccard_score)
              })
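The only change in this hunk is moving the 'accuracy' scorer further down the scoring_fn dict; dict key order is cosmetic and does not affect evaluation. A minimal, self-contained sketch (toy data, not this repo's) of how such a multi-metric scoring dict is typically consumed by sklearn's cross_validate:

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, matthews_corrcoef, accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold, cross_validate

# Toy data standing in for the real input_df/target
X_demo, y_demo = make_classification(n_samples=200, n_features=10, random_state=42)

scoring_demo = {'mcc'      : make_scorer(matthews_corrcoef),
                'accuracy' : make_scorer(accuracy_score),
                'fscore'   : make_scorer(f1_score)}

cv_out = cross_validate(LogisticRegression(random_state=42),
                        X_demo, y_demo,
                        cv=StratifiedKFold(n_splits=5),
                        scoring=scoring_demo,
                        return_train_score=True)

# One 'test_<name>' (and 'train_<name>') array per scorer, one entry per fold
print(cv_out['test_mcc'].mean(), cv_out['test_accuracy'].mean())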
@@ -137,7 +137,9 @@ def MultModelsCl(input_df, target, skf_cv
    col_transform = ColumnTransformer(transformers = t
                                      , remainder='passthrough')

+    #======================================================
    # Specify multiple Classification models
+    #======================================================
    models = [('Logistic Regression' , LogisticRegression(**rs) )
              , ('Logistic RegressionCV' , LogisticRegressionCV(**rs) )
              , ('Gaussian NB' , GaussianNB() )
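The hunk above only adds banner comments around the model list; the surrounding function builds a ColumnTransformer plus a list of (name, estimator) pairs. A hedged sketch of the presumed pattern (not copied from this repo) for chaining each estimator with the transformer and cross-validating it:

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

def run_models_sketch(input_df, target, skf_cv, scoring_fn):
    # Split columns by dtype (mirrors the select_dtypes calls shown in this commit)
    numerical_ix   = input_df.select_dtypes(include=['int64', 'float64']).columns
    categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
    t = [('num', MinMaxScaler(), numerical_ix),
         ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_ix)]
    col_transform = ColumnTransformer(transformers=t, remainder='passthrough')

    models = [('Logistic Regression', LogisticRegression(random_state=42)),
              ('Random Forest'      , RandomForestClassifier(random_state=42))]

    scores = {}
    for name, clf in models:
        # Preprocessing + estimator are fitted together inside each CV fold
        pipe = Pipeline([('prep', col_transform), ('model', clf)])
        scores[name] = cross_validate(pipe, input_df, target,
                                      cv=skf_cv, scoring=scoring_fn,
                                      return_train_score=True)
    return scores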

View file

@@ -78,10 +78,10 @@ rs = {'random_state': 42}
njobs = {'n_jobs': 10}
scoring_fn = ({ 'mcc' : make_scorer(matthews_corrcoef)
-              , 'accuracy' : make_scorer(accuracy_score)
              , 'fscore' : make_scorer(f1_score)
              , 'precision' : make_scorer(precision_score)
              , 'recall' : make_scorer(recall_score)
+              , 'accuracy' : make_scorer(accuracy_score)
              , 'roc_auc' : make_scorer(roc_auc_score)
              , 'jcc' : make_scorer(jaccard_score)
              })
@@ -103,7 +103,6 @@ def MultModelsCl_dissected(input_df, target, skf_cv
                           , blind_test_target
                           , add_cm = True # adds confusion matrix based on cross_val_predict
                           , add_yn = True # adds target var class numbers
-                           , feature_groups = ['']
                           , var_type = ['numerical', 'categorical','mixed']):
    '''
@ -122,14 +121,18 @@ def MultModelsCl_dissected(input_df, target, skf_cv
returns returns
Dict containing multiple classification scores for each model and mean of each Stratified Kfold including training Dict containing multiple classification scores for each model and mean of each Stratified Kfold including training
''' '''
#======================================================
# Determine categorical and numerical features # Determine categorical and numerical features
#======================================================
numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
numerical_ix numerical_ix
categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
categorical_ix categorical_ix
#======================================================
# Determine preprocessing steps ~ var_type # Determine preprocessing steps ~ var_type
#======================================================
if var_type == 'numerical': if var_type == 'numerical':
t = [('num', MinMaxScaler(), numerical_ix)] t = [('num', MinMaxScaler(), numerical_ix)]
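This hunk only adds banner comments around the dtype detection and the var_type switch. A standalone toy illustration (invented mini-DataFrame) of the select_dtypes() split used above:

import pandas as pd

toy = pd.DataFrame({'rsa'         : [0.1, 0.5, 0.9],
                    'contacts'    : [3, 7, 2],
                    'ss_class'    : ['helix', 'sheet', 'loop'],
                    'active_site' : [True, False, True]})

numerical_ix   = toy.select_dtypes(include=['int64', 'float64']).columns
categorical_ix = toy.select_dtypes(include=['object', 'bool']).columns
print(list(numerical_ix))    # ['rsa', 'contacts']
print(list(categorical_ix))  # ['ss_class', 'active_site']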
@@ -143,7 +146,9 @@ def MultModelsCl_dissected(input_df, target, skf_cv
    col_transform = ColumnTransformer(transformers = t
                                      , remainder='passthrough')
-    # Specify multiple Classification models
+    #======================================================
+    # Specify multiple Classification Models
+    #======================================================
    models = [('Logistic Regression' , LogisticRegression(**rs) )
              , ('Logistic RegressionCV' , LogisticRegressionCV(**rs) )
              , ('Gaussian NB' , GaussianNB() )
@ -206,7 +211,7 @@ def MultModelsCl_dissected(input_df, target, skf_cv
####################################################################### #######################################################################
#====================================================== #======================================================
# Option 1: Add confusion matrix from cross_val_predict # Option: Add confusion matrix from cross_val_predict
# Understand and USE with caution # Understand and USE with caution
# cross_val_score, cross_val_predict, "Passing these predictions into an evaluation metric may not be a valid way to measure generalization performance. Results can differ from cross_validate and cross_val_score unless all tests sets have equal size and the metric decomposes over samples." # cross_val_score, cross_val_predict, "Passing these predictions into an evaluation metric may not be a valid way to measure generalization performance. Results can differ from cross_validate and cross_val_score unless all tests sets have equal size and the metric decomposes over samples."
# https://stackoverflow.com/questions/65645125/producing-a-confusion-matrix-with-cross-validate # https://stackoverflow.com/questions/65645125/producing-a-confusion-matrix-with-cross-validate
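The renamed 'Option' block wraps exactly the pattern the comment warns about. A minimal standalone sketch (toy data, not this repo's code) of building a confusion matrix from cross_val_predict:

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_val_predict

X_demo, y_demo = make_classification(n_samples=300, n_features=8, random_state=42)

y_pred = cross_val_predict(LogisticRegression(random_state=42),
                           X_demo, y_demo,
                           cv=StratifiedKFold(n_splits=10))

# Each sample is predicted once, by the model that did not train on it;
# pooling all predictions into one matrix is convenient but, as the comment
# above cautions, is not equivalent to averaging per-fold metrics.
tn, fp, fn, tp = confusion_matrix(y_demo, y_pred).ravel()
print(tn, fp, fn, tp)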
@ -237,7 +242,7 @@ def MultModelsCl_dissected(input_df, target, skf_cv
skf_cv_modD = skf_cv_modD skf_cv_modD = skf_cv_modD
####################################################################### #######################################################################
#============================================= #=============================================
# Option 2: Add targety numbers for data # Option: Add targety numbers for data
#============================================= #=============================================
if add_yn: if add_yn:
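The add_yn branch appears to record target class counts alongside the CV scores. A tiny hypothetical sketch of that bookkeeping (dictionary keys and values invented here, not taken from the source):

from collections import Counter
import numpy as np

target            = np.array([1, 0, 1, 1, 0, 0, 1])    # toy training target
blind_test_target = np.array([0, 0, 1, 0, 1])           # toy blind-test target

yn_info = {'n_trainingY_pos' : int((target == 1).sum()),
           'n_trainingY_neg' : int((target == 0).sum()),
           'n_blindY_pos'    : int((blind_test_target == 1).sum()),
           'n_blindY_neg'    : int((blind_test_target == 0).sum())}
print(yn_info, Counter(target.tolist()))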

View file

@@ -417,125 +417,37 @@ else:
#---------------------------------------
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-#%%########################################################################
+#%% Data for ML ###############################################################
#==========================
# Data for ML
#==========================
my_df_ml = my_df.copy()

-#%% Build X: input for ML
-common_cols_stabiltyN = ['ligand_distance'
-                         , 'ligand_affinity_change'
-                         , 'duet_stability_change'
-                         , 'ddg_foldx'
-                         , 'deepddg'
-                         , 'ddg_dynamut2'
-                         , 'mmcsm_lig'
-                         , 'contacts']
-# Build stability columns ~ gene
+# Build column names to mask for affinity chanhes
if gene.lower() in geneL_basic:
-    X_stabilityN = common_cols_stabiltyN
+    #X_stabilityN = common_cols_stabiltyN
+    gene_affinity_colnames = []# not needed as its a common one
    cols_to_mask = ['ligand_affinity_change']
if gene.lower() in geneL_ppi2:
-    # X_stabilityN = common_cols_stabiltyN + ['mcsm_ppi2_affinity' , 'interface_dist']
-    geneL_ppi2_st_cols = ['mcsm_ppi2_affinity', 'interface_dist']
-    X_stabilityN = common_cols_stabiltyN + geneL_ppi2_st_cols
+    gene_affinity_colnames = ['mcsm_ppi2_affinity', 'interface_dist']
+    #X_stabilityN = common_cols_stabiltyN + geneL_ppi2_st_cols
    cols_to_mask = ['ligand_affinity_change', 'mcsm_ppi2_affinity']
if gene.lower() in geneL_na:
-    # X_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity']
-    geneL_na_st_cols = ['mcsm_na_affinity']
-    X_stabilityN = common_cols_stabiltyN + geneL_na_st_cols
+    gene_affinity_colnames = ['mcsm_na_affinity']
+    #X_stabilityN = common_cols_stabiltyN + geneL_na_st_cols
    cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity']
if gene.lower() in geneL_na_ppi2:
-    # X_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist']
-    geneL_na_ppi2_st_cols = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist']
-    X_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols
+    gene_affinity_colnames = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist']
+    #X_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols
    cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity', 'mcsm_ppi2_affinity']

-X_foldX_cols = [ 'electro_rr', 'electro_mm', 'electro_sm', 'electro_ss'
-                , 'disulfide_rr', 'disulfide_mm', 'disulfide_sm', 'disulfide_ss'
-                , 'hbonds_rr', 'hbonds_mm', 'hbonds_sm', 'hbonds_ss'
-                , 'partcov_rr', 'partcov_mm', 'partcov_sm', 'partcov_ss'
-                , 'vdwclashes_rr', 'vdwclashes_mm', 'vdwclashes_sm', 'vdwclashes_ss'
-                , 'volumetric_rr', 'volumetric_mm', 'volumetric_ss'
-                ]
-X_str = ['rsa'
-         #, 'asa'
-         , 'kd_values'
-         , 'rd_values']
-X_ssFN = X_stabilityN + X_str + X_foldX_cols
-X_evolFN = ['consurf_score'
-            , 'snap2_score'
-            , 'provean_score']
-X_genomic_mafor = ['maf'
-                   , 'logorI'
-                   # , 'or_rawI'
-                   # , 'or_mychisq'
-                   # , 'or_logistic'
-                   # , 'or_fisher'
-                   # , 'pval_fisher'
-                   ]
-X_genomic_linegae = ['lineage_proportion'
-                     , 'dist_lineage_proportion'
-                     #, 'lineage' # could be included as a category but it has L2;L4 formatting
-                     , 'lineage_count_all'
-                     , 'lineage_count_unique'
-                     ]
-X_genomicFN = X_genomic_mafor + X_genomic_linegae
-#X_aaindexFN = list(aa_df_cols)
-#print('\nTotal no. of features for aaindex:', len(X_aaindexFN))
-# numerical feature names [NO aa_index]
-numerical_FN = X_ssFN + X_evolFN + X_genomicFN
-# categorical feature names
-categorical_FN = ['ss_class'
-                  # , 'wt_prop_water'
-                  # , 'mut_prop_water'
-                  # , 'wt_prop_polarity'
-                  # , 'mut_prop_polarity'
-                  # , 'wt_calcprop'
-                  # , 'mut_calcprop'
-                  , 'aa_prop_change'
-                  , 'electrostatics_change'
-                  , 'polarity_change'
-                  , 'water_change'
-                  , 'drtype_mode_labels' # beware then you can't use it to predict [USED it for uq_v1, not v2]
-                  , 'active_site' #[didn't use it for uq_v1]
-                  #, 'gene_name' # will be required for the combined stuff
-                  ]
-#----------------------------------------------
-# count numerical and categorical features
-#----------------------------------------------
-print('\nNo. of numerical features:', len(numerical_FN)
-      , '\nNo. of categorical features:', len(categorical_FN))
-###########################################################################
#=======================
# Masking columns:
# (mCSM-lig, mCSM-NA, mCSM-ppi2) values for lig_dist >10
#=======================
-# my_df_ml['mutationinformation'][my_df['ligand_distance']>10].value_counts()
-# my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts()
-# my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), 'ligand_affinity_change'] = 0
-# (my_df_ml['ligand_affinity_change'] == 0).sum()
my_df_ml['mutationinformation'][my_df_ml['ligand_distance']>10].value_counts()
my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts()
my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask].value_counts()
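The masking step referenced in the next hunk's header zeroes the affinity columns listed in cols_to_mask wherever ligand_distance exceeds 10. A toy illustration (invented values) of that .loc assignment:

import pandas as pd

demo = pd.DataFrame({'mutationinformation'    : ['A10T', 'L55P', 'G99D'],
                     'ligand_distance'        : [3.2, 14.7, 25.0],
                     'ligand_affinity_change' : [0.8, -1.2, 0.4]})
cols_to_mask = ['ligand_affinity_change']

demo.loc[demo['ligand_distance'] > 10, cols_to_mask] = 0
print(demo)
# Rows with ligand_distance > 10 now carry 0 in the masked affinity column(s).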
@@ -546,16 +458,139 @@ my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask] = 0
mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask]

+#===================================================
# write file for check
mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
mask_check.to_csv(outdir + 'ml/' + gene.lower() + '_mask_check.csv')
#===================================================
+###############################################################################
+#%% Feature groups (FG): Build X for Input ML
+############################################################################
+#===========================
+# FG1: Evolutionary features
+#===========================
+X_evolFN = ['consurf_score'
+            , 'snap2_score'
+            , 'provean_score']
+###############################################################################
+#========================
+# FG2: Stability features
+#========================
+#--------
+# common
+#--------
+X_common_stability_Fnum = [
+    'duet_stability_change'
+    , 'ddg_foldx'
+    , 'deepddg'
+    , 'ddg_dynamut2'
+    , 'mmcsm_lig'
+    , 'contacts']
+#--------
+# FoldX
+#--------
+X_foldX_Fnum = [ 'electro_rr', 'electro_mm', 'electro_sm', 'electro_ss'
+               , 'disulfide_rr', 'disulfide_mm', 'disulfide_sm', 'disulfide_ss'
+               , 'hbonds_rr', 'hbonds_mm', 'hbonds_sm', 'hbonds_ss'
+               , 'partcov_rr', 'partcov_mm', 'partcov_sm', 'partcov_ss'
+               , 'vdwclashes_rr', 'vdwclashes_mm', 'vdwclashes_sm', 'vdwclashes_ss'
+               , 'volumetric_rr', 'volumetric_mm', 'volumetric_ss']
+X_stability_FN = X_common_stability_Fnum + X_foldX_Fnum
+###############################################################################
+#===================
+# FG3: Affinity features
+#===================
+common_affinity_Fnum = ['ligand_distance'
+                        , 'ligand_affinity_change']
+# if gene.lower() in geneL_basic:
+#     X_affinityFN = common_affinity_Fnum
+# else:
+#     X_affinityFN = common_affinity_Fnum + gene_affinity_colnames
+X_affinityFN = common_affinity_Fnum + gene_affinity_colnames
+###############################################################################
+#============================
+# FG4: Residue level features
+#============================
+#-----------
+# AA index
+#-----------
+X_aaindex_Fnum = list(aa_df_cols)
+print('\nTotal no. of features for aaindex:', len(X_aaindex_Fnum))
+#-----------------
+# surface area
+# depth
+# hydrophobicity
+#-----------------
+X_str_Fnum = ['rsa'
+              #, 'asa'
+              , 'kd_values'
+              , 'rd_values']
+#---------------------------
+# Other aa properties
+# active site indication
+#---------------------------
+X_aap_Fcat = ['ss_class'
+              # , 'wt_prop_water'
+              # , 'mut_prop_water'
+              # , 'wt_prop_polarity'
+              # , 'mut_prop_polarity'
+              # , 'wt_calcprop'
+              # , 'mut_calcprop'
+              , 'aa_prop_change'
+              , 'electrostatics_change'
+              , 'polarity_change'
+              , 'water_change'
+              , 'active_site']
+X_resprop_FN = X_aaindex_Fnum + X_str_Fnum + X_aap_Fcat
+###############################################################################
+#========================
+# FG5: Genomic features
+#========================
+X_gn_mafor_Fnum = ['maf'
+                   , 'logorI'
+                   # , 'or_rawI'
+                   # , 'or_mychisq'
+                   # , 'or_logistic'
+                   # , 'or_fisher'
+                   # , 'pval_fisher'
+                   ]
+X_gn_linegae_Fnum = ['lineage_proportion'
+                     , 'dist_lineage_proportion'
+                     #, 'lineage' # could be included as a category but it has L2;L4 formatting
+                     , 'lineage_count_all'
+                     , 'lineage_count_unique'
+                     ]
+X_gn_Fcat = ['drtype_mode_labels' # beware then you can't use it to predict [USED it for uq_v1, not v2]
+             #, 'gene_name' # will be required for the combined stuff
+             ]
+X_genomicFN = X_gn_mafor_Fnum + X_gn_linegae_Fnum + X_gn_Fcat
+###############################################################################
+# Feature groups further collaps:
+X_structural_FN = X_stability_FN + X_affinityFN + X_resprop_FN
+all_featuresN = X_evolFN + X_structural_FN + X_genomicFN
+###############################################################################
+#%% Define training and test data
+#======================================================
# Training and BLIND test set [UQ]: actual vs imputed
# No aa index but active_site included
# dst with actual values : training set
# dst with imputed values : blind test
-#==================================================
+#======================================================
my_df_ml[drug].isna().sum() #'na' ones are the blind_test set
blind_test_df = my_df_ml[my_df_ml[drug].isna()]
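The training/blind-test split above keys off missing drug phenotypes (the complementary notna() selection for training_df is assumed; it is not visible in this hunk). A standalone toy sketch of the same split, with an example drug name used only for illustration:

import numpy as np
import pandas as pd

drug = 'pyrazinamide'   # assumed example; the script's drug comes from its own config
demo = pd.DataFrame({drug      : [1.0, np.nan, 0.0, np.nan],
                     'feature1': [0.2, 0.5, 0.1, 0.9]})

blind_test_df = demo[demo[drug].isna()]
training_df   = demo[demo[drug].notna()]
print(training_df.shape, blind_test_df.shape)   # (2, 2) (2, 2)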
@@ -567,6 +602,7 @@ training_df.shape
# Target 1: dst_mode
training_df[drug].value_counts()
training_df['dst_mode'].value_counts()
####################################################################
#============
# ML data
@ -574,8 +610,8 @@ training_df['dst_mode'].value_counts()
#------ #------
# X: Training and Blind test (BTS) # X: Training and Blind test (BTS)
#------ #------
X = training_df[numerical_FN + categorical_FN] X = training_df[all_featuresN]
X_bts = blind_test_df[numerical_FN + categorical_FN] X_bts = blind_test_df[all_featuresN]
#------ #------
# y # y
@ -601,19 +637,67 @@ yc1_ratio = yc1[0]/yc1[1]
yc2 = Counter(y_bts) yc2 = Counter(y_bts)
yc2_ratio = yc2[0]/yc2[1] yc2_ratio = yc2[0]/yc2[1]
###############################################################################
#======================================================
# Determine categorical and numerical features
#======================================================
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
numerical_cols
categorical_cols = X.select_dtypes(include=['object', 'bool']).columns
categorical_cols
################################################################################
# IMPORTANT sanity checks
if len(X.columns) == len(X_evolFN) + len(X_stability_FN) + len(X_affinityFN) + len(X_resprop_FN) + len(X_genomicFN):
print('\nPASS: ML data with input features, training and test generated...'
, '\n\nTotal no. of input features:' , len(X.columns)
, '\n--------No. of numerical features:' , len(numerical_cols)
, '\n--------No. of categorical features:' , len(categorical_cols)
, '\n\nTotal no. of evolutionary features:' , len(X_evolFN)
, '\n\nTotal no. of stability features:' , len(X_stability_FN)
, '\n--------Common stabilty cols:' , len(X_common_stability_Fnum)
, '\n--------Foldx cols:' , len(X_foldX_Fnum)
, '\n\nTotal no. of affinity features:' , len(X_affinityFN)
, '\n--------Common affinity cols:' , len(common_affinity_Fnum)
, '\n--------Gene specific affinity cols:' , len(gene_affinity_colnames)
, '\n\nTotal no. of residue level features:', len(X_resprop_FN)
, '\n--------AA index cols:' , len(X_aaindex_Fnum)
, '\n--------Residue Prop cols:' , len(X_str_Fnum)
, '\n--------AA change Prop cols:' , len(X_aap_Fcat)
, '\n\nTotal no. of genomic features:' , len(X_genomicFN)
, '\n--------MAF+OR cols:' , len(X_gn_mafor_Fnum)
, '\n--------Lineage cols:' , len(X_gn_linegae_Fnum)
, '\n--------Other cols:' , len(X_gn_Fcat)
)
else:
print('\nFAIL: numbers mismatch'
, '\nExpected:',len(X_evolFN) + len(X_stability_FN) + len(X_affinityFN) + len(X_resprop_FN) + len(X_genomicFN)
, '\nGot:', len(X.columns))
sys.exit()
###############################################################################
print('\n-------------------------------------------------------------' print('\n-------------------------------------------------------------'
, '\nSuccessfully split data: UQ [no aa_index but active site included] training' , '\nSuccessfully split data: ALL features'
, '\nactual values: training set' , '\nactual values: training set'
, '\nimputed values: blind test set' , '\nimputed values: blind test set'
, '\nTrain data size:', X.shape
, '\nTest data size:', X_bts.shape , '\n\nTotal data size:', len(X) + len(X_bts)
, '\n\nTrain data size:', X.shape
, '\ny_train numbers:', yc1 , '\ny_train numbers:', yc1
, '\ny_train ratio:',yc1_ratio
, '\n' , '\n\nTest data size:', X_bts.shape
, '\ny_test_numbers:', yc2 , '\ny_test_numbers:', yc2
, '\n\ny_train ratio:',yc1_ratio
, '\ny_test ratio:', yc2_ratio , '\ny_test ratio:', yc2_ratio
, '\n-------------------------------------------------------------' , '\n-------------------------------------------------------------'
) )
########################################################################### ###########################################################################
#%% #%%
########################################################################### ###########################################################################
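The added sanity check compares feature-group lengths against X.columns. A hedged companion sketch (not part of this commit) that also checks set membership, which would catch misspelled or duplicated column names rather than just count mismatches:

def check_feature_groups(X_columns, feature_groups):
    # Flatten the groups, then compare by membership as well as by count
    expected = [c for group in feature_groups for c in group]
    missing    = set(expected) - set(X_columns)
    unexpected = set(X_columns) - set(expected)
    n_duplicated = len(expected) - len(set(expected))
    return missing, unexpected, n_duplicated

# Example with a tiny invented feature set
missing, unexpected, dups = check_feature_groups(
    ['consurf_score', 'rsa', 'maf'],
    [['consurf_score'], ['rsa'], ['maf']])
print(missing, unexpected, dups)   # set() set() 0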

View file

@@ -47,60 +47,78 @@ outdir_ml = outdir + 'ml/uq_v1/dissected'
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
-print('\nSanity checks:'
-      , '\nTotal input features:', len(X.columns)
-      , '\n'
-      , '\nTraining data size:', X.shape
-      , '\nTest data size:', X_bts.shape
-      , '\n'
-      , '\nTarget feature numbers (training data):', Counter(y)
-      , '\nTarget features ratio (training data:', yc1_ratio
-      , '\n'
-      , '\nTarget feature numbers (test data):', Counter(y_bts)
-      , '\nTarget features ratio (test data):', yc2_ratio
-      , '\n\n#####################################################################\n')
print('\n================================================================\n')
-print('Strucutral features (n):'
-      , len(X_ssFN)
-      , '\nThese are:'
-      , '\nCommon stablity features:', X_stabilityN
-      , '\nFoldX columns:', X_foldX_cols
-      , '\nOther struc columns:', X_str
-      , '\n================================================================\n')
-# print('AAindex features (n):'
-#       , len(X_aaindexFN)
-#       , '\nThese are:\n'
-#       , X_aaindexFN
-#       , '\n================================================================\n')
-print('Evolutionary features (n):'
-      , len(X_evolFN)
-      , '\nThese are:\n'
-      , X_evolFN
-      , '\n================================================================\n')
-print('Genomic features (n):'
-      , len(X_genomicFN)
-      , '\nThese are:\n'
-      , X_genomic_mafor, '\n'
-      , X_genomic_linegae
-      , '\n================================================================\n')
-print('Categorical features (n):'
-      , len(categorical_FN)
-      , '\nThese are:\n'
-      , categorical_FN
-      , '\n================================================================\n')
-#if ( len(X.columns) == len(X_ssFN) + len(X_aaindexFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
-if ( len(X.columns) == len(X_ssFN) + len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ):
+      , '\n\nTotal no. of evolutionary features:' , len(X_evolFN)
+      , '\n\nTotal no. of stability features:' , len(X_stability_FN)
+      , '\n--------Common stabilty cols:' , len(X_common_stability_Fnum)
+      , '\n--------Foldx cols:' , len(X_foldX_Fnum)
+      , '\n\nTotal no. of affinity features:' , len(X_affinityFN)
+      , '\n--------Common affinity cols:' , len(common_affinity_Fnum)
+      , '\n--------Gene specific affinity cols:' , len(gene_affinity_colnames)
+      , '\n\nTotal no. of residue level features:', len(X_resprop_FN)
+      , '\n--------AA index cols:' , len(X_aaindex_Fnum)
+      , '\n--------Residue Prop cols:' , len(X_str_Fnum)
+      , '\n--------AA change Prop cols:' , len(X_aap_Fcat)
+      , '\n\nTotal no. of genomic features:' , len(X_genomicFN)
+      , '\n--------MAF+OR cols:' , len(X_gn_mafor_Fnum)
+      , '\n--------Lineage cols:' , len(X_gn_linegae_Fnum)
+      , '\n--------Other cols:' , len(X_gn_Fcat)
+X_structural_FN = X_stability_FN + X_affinityFN + X_resprop_FN
+X_aaindex_Fnum + X_str_Fnum + X_aap_Fcat
+all_featuresN = X_evolFN + X_structural_FN + X_genomicFN
+###############################################################################
+print('\n================================================================'
+      , '\nTotal Evolutionary features (n):' , len(X_evolFN)
+      , '\n--------------Evol. feature colnames:', X_evolFN
+      , '\n================================================================'
+      , '\n\nTotal structural features (n):', len(X_structural_FN)
+      , '\n--------Stability ncols:' , len(X_stability_FN)
+      , '\n--------------Common stability colnames:' , X_common_stability_Fnum
+      , '\n--------------Foldx colnames:' , X_foldX_Fnum
+      , '\n--------Affinity ncols:' , len(X_affinityFN)
+      , '\n--------------Common affinity colnames:' , common_affinity_Fnum
+      , '\n--------------Gene specific affinity colnames:', gene_affinity_colnames
+      , '\n--------Residue prop ncols:' , len(X_resprop_FN)
+      , '\n--------------Residue Prop cols:' , X_str_Fnum
+      , '\n--------------AA change Prop cols:' , X_aap_Fcat
+      , '\n--------------AA index cols:' , X_aaindex_Fnum
+      , '\n================================================================'
+      , '\n\nTotal Genomic features (n):' , len(X_genomicFN)
+      , '\n--------MAF+OR cols:' , len(X_gn_mafor_Fnum)
+      , '\n--------------MAF+OR colnames:' , X_gn_mafor_Fnum
+      , '\n--------Lineage cols:' , len(X_gn_linegae_Fnum)
+      , '\n--------------Lineage cols:' , X_gn_linegae_Fnum
+      , '\n--------Other cols:' , len(X_gn_Fcat)
+      , '\n--------------Other cols:' , X_gn_Fcat
+      , '\n================================================================')
+# Sanity check
+if ( len(X.columns) == len(X_evolFN) + len(X_structural_FN) + len(X_genomicFN)):
    print('\nPass: No. of features match')
else:
-    sys.exit('\nFail: Count of feature mismatch')
+    print('\nFail: Count of feature mismatch'
+          , '\nExpected:', len(X_evolFN) + len(X_structural_FN) + len(X_genomicFN)
+          , '\nGot:', len(X.columns))
+    sys.exit()
print('\n#####################################################################\n')
@@ -108,7 +126,7 @@ print('\n#####################################################################\n
# #==================
# # Baseline models
# #==================
-# mm_skf_scoresD = MultModelsCl(input_df = X
+# mm_skf_scoresD = MultModelsCl_dissected(input_df = X
# , target = y
# , var_type = 'mixed'
# , skf_cv = skf_cv
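For reference, a hedged sketch of what un-commenting this baseline call might look like, based on the MultModelsCl_dissected signature shown earlier in this commit; the blind-test keyword name (blind_test_input_df) and the StratifiedKFold settings are assumptions, not taken from the source:

from sklearn.model_selection import StratifiedKFold

skf_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

mm_skf_scoresD = MultModelsCl_dissected(input_df = X
                                        , target = y
                                        , var_type = 'mixed'
                                        , skf_cv = skf_cv
                                        , blind_test_input_df = X_bts   # assumed parameter name
                                        , blind_test_target = y_bts
                                        , add_cm = True
                                        , add_yn = True)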