diff --git a/scripts/ml/ml_data.py b/scripts/ml/ml_data.py index d5fbe11..e63da15 100644 --- a/scripts/ml/ml_data.py +++ b/scripts/ml/ml_data.py @@ -423,9 +423,9 @@ def setvars(gene,drug): #========================== my_df_ml = my_df.copy() - #========================== - # BLIND test set - #========================== + #=============================== + # Training and BLIND test set + #=============================== # Separate blind test set my_df_ml[drug].isna().sum() @@ -435,7 +435,7 @@ def setvars(gene,drug): training_df = my_df_ml[my_df_ml[drug].notna()] training_df.shape - # Target1: dst + # Target1: dst_mode training_df[drug].value_counts() training_df['dst_mode'].value_counts() @@ -514,15 +514,11 @@ def setvars(gene,drug): print('\nTotal no. of features for aaindex:', len(X_aaindexFN)) - #%% Construct numerical and categorical column names # numerical feature names - # numerical_FN = common_cols_stabiltyN + foldX_cols + X_strFN + X_evolFN + X_genomicFN - - #numerical_FN = X_ssFN + X_evolFN + X_genomicFN numerical_FN = X_ssFN + X_evolFN + X_genomicFN + X_aaindexFN - #categorical feature names + # categorical feature names categorical_FN = ['ss_class' # , 'wt_prop_water' # , 'mut_prop_water' @@ -534,8 +530,8 @@ def setvars(gene,drug): , 'electrostatics_change' , 'polarity_change' , 'water_change' - #, 'drtype_mode_labels' # beware then you can't use it to predict [USED it for uq_v1] - , 'active_site' + , 'drtype_mode_labels' # beware then you can't use it to predict [USED it for uq_v1, not v2] + , 'active_site' #[didn't use it for uq_v1] #, 'gene_name' # will be required for the combined stuff ] #---------------------------------------------- @@ -561,7 +557,7 @@ def setvars(gene,drug): my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts() my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask].value_counts() - # mask the column ligand distance > 10 + # mask the mcsm affinity related columns where ligand distance > 10 my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask] = 0 (my_df_ml['ligand_affinity_change'] == 0).sum() diff --git a/scripts/ml/pnca_config.py b/scripts/ml/pnca_config.py index fa2d158..ecc34f3 100755 --- a/scripts/ml/pnca_config.py +++ b/scripts/ml/pnca_config.py @@ -13,7 +13,7 @@ drug = 'pyrazinamide' #total_mtblineage_uc = 8 homedir = os.path.expanduser("~") -os.chdir( homedir + '/git/ML_AI_training/') +os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/') #--------------------------- # Version 1: no AAindex @@ -30,7 +30,7 @@ from ml_data import * #from UQ_yc_RunAllClfs import run_all_ML # TT run all ML clfs: baseline mode -from UQ_MultModelsCl import MultModelsCl +from MultModelsCl import MultModelsCl #%%###########################################################################