From e2bc384155f6483bf18d5d2fedfa65b7c4ded317 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Fri, 24 Jun 2022 20:35:53 +0100 Subject: [PATCH] added FS to MultClfs.py and modified data for different splits for consistency --- scripts/ml/FS.py | 2 +- scripts/ml/MultClfs.py | 304 +++++++++++++++++++++- scripts/ml/ml_data_7030.py | 17 +- scripts/ml/ml_data_8020.py | 392 +++++++++++++++------------- scripts/ml/ml_data_cd_7030.py | 402 +++++++++++++++-------------- scripts/ml/ml_data_cd_8020.py | 402 +++++++++++++++-------------- scripts/ml/ml_data_cd_sl.py | 410 ++++++++++++++++-------------- scripts/ml/ml_data_sl.py | 404 +++++++++++++++-------------- scripts/ml/run_7030.py | 37 ++- scripts/ml/run_FS.py | 25 +- scripts/ml/running_ml_scripts.txt | 182 +++++++++---- scripts/ml/test_MultClfs.py | 2 + 12 files changed, 1585 insertions(+), 994 deletions(-) diff --git a/scripts/ml/FS.py b/scripts/ml/FS.py index 9eba852..f35731b 100755 --- a/scripts/ml/FS.py +++ b/scripts/ml/FS.py @@ -98,7 +98,7 @@ mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)} jacc_score_fn = {'jcc': make_scorer(jaccard_score)} ############################################################################### -def fsgs(input_df +def fsgs_rfecv(input_df , target , param_gridLd = [{'fs__min_features_to_select' : [1]}] , blind_test_df = pd.DataFrame() diff --git a/scripts/ml/MultClfs.py b/scripts/ml/MultClfs.py index 39377fa..4e56ab3 100644 --- a/scripts/ml/MultClfs.py +++ b/scripts/ml/MultClfs.py @@ -502,10 +502,11 @@ def ProcessMultModelsCl(inputD = {}): # Combine WF+Metadata: Final output #------------------------------------- # checking indices for the dfs to combine: - c1 = list(set(combined_baseline_wf.index)) - c2 = list(metaDF.index) + c1L = list(set(combined_baseline_wf.index)) + c2L = list(metaDF.index) - if c1 == c2: + #if set(c1L) == set(c2L): + if set(c1L) == set(c2L) and all(x in c2L for x in c1L) and all(x in c1L for x in c2L): print('\nPASS: proceeding to merge metadata with CV and BT dfs') combDF = pd.merge(combined_baseline_wf, metaDF, left_index = True, right_index = True) else: @@ -531,5 +532,302 @@ def ProcessMultModelsCl(inputD = {}): return combDF ############################################################################### +#%% Feature selection function ################################################ +############################ +# fsgs_rfecv() +############################ +# Run FS using some classifier models +# +def fsgs_rfecv(input_df + , target + , param_gridLd = [{'fs__min_features_to_select' : [1]}] + , blind_test_df = pd.DataFrame() + , blind_test_target = pd.Series(dtype = 'int64') + , estimator = LogisticRegression(**rs) # placeholder + , use_fs = False # uses estimator as the RFECV parameter for fs. Set to TRUE if you want to supply custom_fs as shown below + , custom_fs = RFECV(DecisionTreeClassifier(**rs) , cv = skf_cv, scoring = 'matthews_corrcoef') + , cv_method = skf_cv + , var_type = ['numerical', 'categorical' , 'mixed'] + , verbose = 3 + ): + ''' + returns + Dict containing results from FS and hyperparam tuning for a given estiamtor + + >>> ADD MORE <<< + + optimised/selected based on mcc + + ''' + ########################################################################### + #================================================ + # Determine categorical and numerical features + #================================================ + numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns + numerical_ix + categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns + categorical_ix + + #================================================ + # Determine preprocessing steps ~ var_type + #================================================ + if var_type == 'numerical': + t = [('num', MinMaxScaler(), numerical_ix)] + + if var_type == 'categorical': + t = [('cat', OneHotEncoder(), categorical_ix)] + + if var_type == 'mixed': + t = [('cat', OneHotEncoder(), categorical_ix) + , ('num', MinMaxScaler(), numerical_ix)] + + col_transform = ColumnTransformer(transformers = t + , remainder='passthrough') + + ########################################################################### + #================================================== + # Create var_type ~ column names + # using one hot encoder with RFECV means + # the names internally are lost. Hence + # fit col_transformeer to my input_df and get + # all the column names out and stored in a var + # to allow the 'selected features' to be subsetted + # from the numpy boolean array + #================================================= + col_transform.fit(input_df) + col_transform.get_feature_names_out() + + var_type_colnames = col_transform.get_feature_names_out() + var_type_colnames = pd.Index(var_type_colnames) + + if var_type == 'mixed': + print('\nVariable type is:', var_type + , '\nNo. of columns in input_df:', len(input_df.columns) + , '\nNo. of columns post one hot encoder:', len(var_type_colnames)) + else: + print('\nNo. of columns in input_df:', len(input_df.columns)) + + #================================== + # Build FS with supplied estimator + #================================== + if use_fs: + fs = custom_fs + else: + fs = RFECV(estimator, cv = skf_cv, scoring = 'matthews_corrcoef') + + #================================== + # Build basic param grid + #================================== + # param_gridD = [ + # {'fs__min_features_to_select' : [1] + # }] + + ############################################################################ + # Create Pipeline object + pipe = Pipeline([ + ('pre', col_transform), + ('fs', fs), + ('clf', estimator)]) + ############################################################################ + # Define GridSearchCV + gscv_fs = GridSearchCV(pipe + #, param_gridLd = param_gridD + , param_gridLd + , cv = cv_method + , scoring = scoring_fn + , refit = 'mcc' + , verbose = 3 + , return_train_score = True + , **njobs) + + gscv_fs.fit(input_df, target) + + ########################################################################### + # Get best param and scores out + gscv_fs.best_params_ + gscv_fs.best_score_ + + # Training best score corresponds to the max of the mean_test + train_bscore = round(gscv_fs.best_score_, 2); train_bscore + print('\nTraining best score (MCC):', train_bscore) + gscv_fs.cv_results_['mean_test_mcc'] + round(gscv_fs.cv_results_['mean_test_mcc'].max(),2) + round(np.nanmax(gscv_fs.cv_results_['mean_test_mcc']),2) + + check_train_score = [round(gscv_fs.cv_results_['mean_test_mcc'].max(),2) + , round(np.nanmax(gscv_fs.cv_results_['mean_test_mcc']),2)] + + check_train_score = np.nanmax(check_train_score) + + # Training results + gscv_tr_resD = gscv_fs.cv_results_ + mod_refit_param = gscv_fs.refit + + # sanity check + if train_bscore == check_train_score: + print('\nVerified training score (MCC):', train_bscore ) + else: + sys.exit('\nTraining score could not be internatlly verified. Please check training results dict') + + #------------------------- + # Dict of CV results + #------------------------- + cv_allD = gscv_fs.cv_results_ + cvdf0 = pd.DataFrame(cv_allD) + cvdf = cvdf0.filter(regex='mean_test', axis = 1) + cvdfT = cvdf.T + cvdfT.columns = ['cv_score'] + cvdfTr = cvdfT.loc[:,'cv_score'].round(decimals = 2) # round values + cvD = cvdfTr.to_dict() + print('\n CV results dict generated for:', len(scoring_fn), 'scores' + , '\nThese are:', scoring_fn.keys()) + + #------------------------- + # Blind test: REAL check! + #------------------------- + #tp = gscv_fs.predict(X_bts) + tp = gscv_fs.predict(blind_test_df) + print('\nMCC on Blind test:' , round(matthews_corrcoef(blind_test_target, tp),2)) + print('\nAccuracy on Blind test:', round(accuracy_score(blind_test_target, tp),2)) + + #================= + # info extraction + #================= + # gives input vals?? + gscv_fs._check_n_features + + # gives gscv params used + gscv_fs._get_param_names() + + # gives ?? + gscv_fs.best_estimator_ + gscv_fs.best_params_ # gives best estimator params as a dict + gscv_fs.best_estimator_._final_estimator # similar to above, doesn't contain max_iter + gscv_fs.best_estimator_.named_steps['fs'].get_support() + gscv_fs.best_estimator_.named_steps['fs'].ranking_ # array of ranks for the features + + gscv_fs.best_estimator_.named_steps['fs'].grid_scores_.mean() + gscv_fs.best_estimator_.named_steps['fs'].grid_scores_.max() + #gscv_fs.best_estimator_.named_steps['fs'].grid_scores_ + + estimator_mask = gscv_fs.best_estimator_.named_steps['fs'].get_support() + + + ############################################################################ + #============ + # FS results + #============ + # Now get the features out + + #-------------- + # All features + #-------------- + all_features = gscv_fs.feature_names_in_ + n_all_features = gscv_fs.n_features_in_ + #all_features = gsfit.feature_names_in_ + + #-------------- + # Selected features by the classifier + # Important to have var_type_colnames here + #---------------- + #sel_features = X.columns[gscv_fs.best_estimator_.named_steps['fs'].get_support()] 3 only for numerical df + sel_features = var_type_colnames[gscv_fs.best_estimator_.named_steps['fs'].get_support()] + n_sf = gscv_fs.best_estimator_.named_steps['fs'].n_features_ + + #-------------- + # Get model name + #-------------- + model_name = gscv_fs.best_estimator_.named_steps['clf'] + b_model_params = gscv_fs.best_params_ + + print('\n========================================' + , '\nRunning model:' + , '\nModel name:', model_name + , '\n===============================================' + , '\nRunning feature selection with RFECV for model' + , '\nTotal no. of features in model:', len(all_features) + , '\nThese are:\n', all_features, '\n\n' + , '\nNo of features for best model: ', n_sf + , '\nThese are:', sel_features, '\n\n' + , '\nBest Model hyperparams:', b_model_params + ) + + ########################################################################### + ############################## OUTPUT ##################################### + ########################################################################### + #========================= + # Blind test: BTS results + #========================= + # Build the final results with all scores for a feature selected model + #bts_predict = gscv_fs.predict(X_bts) + bts_predict = gscv_fs.predict(blind_test_df) + + print('\nMCC on Blind test:' , round(matthews_corrcoef(blind_test_target, bts_predict),2)) + print('\nAccuracy on Blind test:', round(accuracy_score(blind_test_target, bts_predict),2)) + bts_mcc_score = round(matthews_corrcoef(blind_test_target, bts_predict),2) + + # Diff b/w train and bts test scores + train_test_diff = train_bscore - bts_mcc_score + print('\nDiff b/w train and blind test score (MCC):', train_test_diff) + + lr_btsD ={} + #lr_btsD['bts_mcc'] = bts_mcc_score + lr_btsD['bts_fscore'] = round(f1_score(blind_test_target, bts_predict),2) + lr_btsD['bts_precision'] = round(precision_score(blind_test_target, bts_predict),2) + lr_btsD['bts_recall'] = round(recall_score(blind_test_target, bts_predict),2) + lr_btsD['bts_accuracy'] = round(accuracy_score(blind_test_target, bts_predict),2) + lr_btsD['bts_roc_auc'] = round(roc_auc_score(blind_test_target, bts_predict),2) + lr_btsD['bts_jcc'] = round(jaccard_score(blind_test_target, bts_predict),2) + lr_btsD + + #=========================== + # Add FS related model info + #=========================== + model_namef = str(model_name) + # FIXME: doesn't tell you which it has chosen + fs_methodf = str(gscv_fs.best_estimator_.named_steps['fs']) + all_featuresL = list(all_features) + fs_res_arrayf = str(list( gscv_fs.best_estimator_.named_steps['fs'].get_support())) + fs_res_array_rankf = str(list( gscv_fs.best_estimator_.named_steps['fs'].ranking_)) + sel_featuresf = list(sel_features) + n_sf = int(n_sf) + + output_modelD = {'model_name': model_namef + , 'model_refit_param': mod_refit_param + , 'Best_model_params': b_model_params + , 'n_all_features': n_all_features + , 'fs_method': fs_methodf + , 'fs_res_array': fs_res_arrayf + , 'fs_res_array_rank': fs_res_array_rankf + , 'all_feature_names': all_featuresL + , 'n_sel_features': n_sf + , 'sel_features_names': sel_featuresf} + #output_modelD + + #======================================== + # Update output_modelD with bts_results + #======================================== + output_modelD.update(lr_btsD) + output_modelD + + output_modelD['train_score (MCC)'] = train_bscore + output_modelD['bts_mcc'] = bts_mcc_score + output_modelD['train_bts_diff'] = round(train_test_diff,2) + print(output_modelD) + + nlen = len(output_modelD) + + #======================================== + # Update output_modelD with cv_results + #======================================== + output_modelD.update(cvD) + + if (len(output_modelD) == nlen + len(cvD)): + print('\nFS run complete for model:', estimator + , '\nFS using:', fs + , '\nOutput dict size:', len(output_modelD)) + return(output_modelD) + else: + sys.exit('\nFAIL:numbers mismatch output dict length not as expected. Please check') \ No newline at end of file diff --git a/scripts/ml/ml_data_7030.py b/scripts/ml/ml_data_7030.py index 560d06f..879dad6 100644 --- a/scripts/ml/ml_data_7030.py +++ b/scripts/ml/ml_data_7030.py @@ -37,7 +37,7 @@ def setvars(gene,drug): import argparse import re #%% GLOBALS - tts_split = "70/30" + tts_split = "70_30" rs = {'random_state': 42} njobs = {'n_jobs': 10} @@ -727,7 +727,7 @@ def setvars(gene,drug): #------------------------------ oversample = RandomOverSampler(sampling_strategy='minority') X_ros, y_ros = oversample.fit_resample(X, y) - print('Simple Random OverSampling\n', Counter(y_ros)) + print('\nSimple Random OverSampling\n', Counter(y_ros)) print(X_ros.shape) #------------------------------ @@ -736,7 +736,7 @@ def setvars(gene,drug): #------------------------------ undersample = RandomUnderSampler(sampling_strategy='majority') X_rus, y_rus = undersample.fit_resample(X, y) - print('Simple Random UnderSampling\n', Counter(y_rus)) + print('\nSimple Random UnderSampling\n', Counter(y_rus)) print(X_rus.shape) #------------------------------ @@ -747,7 +747,7 @@ def setvars(gene,drug): X_ros, y_ros = oversample.fit_resample(X, y) undersample = RandomUnderSampler(sampling_strategy='majority') X_rouC, y_rouC = undersample.fit_resample(X_ros, y_ros) - print('Simple Combined Over and UnderSampling\n', Counter(y_rouC)) + print('\nSimple Combined Over and UnderSampling\n', Counter(y_rouC)) print(X_rouC.shape) #------------------------------ @@ -767,7 +767,7 @@ def setvars(gene,drug): categorical_colind = X.columns.get_indexer(list(categorical_ix)) categorical_colind - k_sm = 5 # 5 is deafult + k_sm = 5 # 5 is default sm_nc = SMOTENC(categorical_features=categorical_colind, k_neighbors = k_sm, **rs, **njobs) X_smnc, y_smnc = sm_nc.fit_resample(X, y) print('\nSMOTE_NC OverSampling\n', Counter(y_smnc)) @@ -797,5 +797,10 @@ def setvars(gene,drug): # print(X_enn.shape) # print('\nSMOTE Over+Under Sampling combined\n', Counter(y_enn)) - ############################################################################### + ########################################################################### # TODO: Find over and undersampling JUST for categorical data + ########################################################################### + + print('\n#################################################################' + , '\nDim of X for gene:', gene.lower(), '\n', X.shape + , '\n###############################################################') \ No newline at end of file diff --git a/scripts/ml/ml_data_8020.py b/scripts/ml/ml_data_8020.py index 4928839..1099106 100644 --- a/scripts/ml/ml_data_8020.py +++ b/scripts/ml/ml_data_8020.py @@ -34,7 +34,11 @@ def setvars(gene,drug): from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold from sklearn.pipeline import Pipeline, make_pipeline + import argparse + import re #%% GLOBALS + tts_split = "80_20" + rs = {'random_state': 42} njobs = {'n_jobs': 10} @@ -56,12 +60,10 @@ def setvars(gene,drug): , **rs) mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)} - jacc_score_fn = {'jcc': make_scorer(jaccard_score)} - + jacc_score_fn = {'jcc': make_scorer(jaccard_score)} #%% FOR LATER: Combine ED logo data ########################################################################### - rs = {'random_state': 42} - njobs = {'n_jobs': 10} + homedir = os.path.expanduser("~") geneL_basic = ['pnca'] @@ -422,118 +424,31 @@ def setvars(gene,drug): #========================== my_df_ml = my_df.copy() - #%% Build X: input for ML - common_cols_stabiltyN = ['ligand_distance' - , 'ligand_affinity_change' - , 'duet_stability_change' - , 'ddg_foldx' - , 'deepddg' - , 'ddg_dynamut2' - , 'mmcsm_lig' - , 'contacts'] - - # Build stability columns ~ gene + # Build column names to mask for affinity chanhes if gene.lower() in geneL_basic: - X_stabilityN = common_cols_stabiltyN + #X_stabilityN = common_cols_stabiltyN + gene_affinity_colnames = []# not needed as its the common ones cols_to_mask = ['ligand_affinity_change'] if gene.lower() in geneL_ppi2: - # X_stabilityN = common_cols_stabiltyN + ['mcsm_ppi2_affinity' , 'interface_dist'] - geneL_ppi2_st_cols = ['mcsm_ppi2_affinity', 'interface_dist'] - X_stabilityN = common_cols_stabiltyN + geneL_ppi2_st_cols + gene_affinity_colnames = ['mcsm_ppi2_affinity', 'interface_dist'] + #X_stabilityN = common_cols_stabiltyN + geneL_ppi2_st_cols cols_to_mask = ['ligand_affinity_change', 'mcsm_ppi2_affinity'] if gene.lower() in geneL_na: - # X_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] - geneL_na_st_cols = ['mcsm_na_affinity'] - X_stabilityN = common_cols_stabiltyN + geneL_na_st_cols + gene_affinity_colnames = ['mcsm_na_affinity'] + #X_stabilityN = common_cols_stabiltyN + geneL_na_st_cols cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity'] if gene.lower() in geneL_na_ppi2: - # X_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] - geneL_na_ppi2_st_cols = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] - X_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols + gene_affinity_colnames = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] + #X_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity', 'mcsm_ppi2_affinity'] - - X_foldX_cols = [ 'electro_rr', 'electro_mm', 'electro_sm', 'electro_ss' - , 'disulfide_rr', 'disulfide_mm', 'disulfide_sm', 'disulfide_ss' - , 'hbonds_rr', 'hbonds_mm', 'hbonds_sm', 'hbonds_ss' - , 'partcov_rr', 'partcov_mm', 'partcov_sm', 'partcov_ss' - , 'vdwclashes_rr', 'vdwclashes_mm', 'vdwclashes_sm', 'vdwclashes_ss' - , 'volumetric_rr', 'volumetric_mm', 'volumetric_ss' - ] - - X_str = ['rsa' - #, 'asa' - , 'kd_values' - , 'rd_values'] - - X_ssFN = X_stabilityN + X_str + X_foldX_cols - - X_evolFN = ['consurf_score' - , 'snap2_score' - , 'provean_score'] - - X_genomic_mafor = ['maf' - , 'logorI' - # , 'or_rawI' - # , 'or_mychisq' - # , 'or_logistic' - # , 'or_fisher' - # , 'pval_fisher' - ] - - X_genomic_linegae = ['lineage_proportion' - , 'dist_lineage_proportion' - #, 'lineage' # could be included as a category but it has L2;L4 formatting - , 'lineage_count_all' - , 'lineage_count_unique' - ] - - X_genomicFN = X_genomic_mafor + X_genomic_linegae - - X_aaindexFN = list(aa_df_cols) - - print('\nTotal no. of features for aaindex:', len(X_aaindexFN)) - - # numerical feature names - numerical_FN = X_ssFN + X_evolFN + X_genomicFN + X_aaindexFN - - # categorical feature names - categorical_FN = ['ss_class' - # , 'wt_prop_water' - # , 'mut_prop_water' - # , 'wt_prop_polarity' - # , 'mut_prop_polarity' - # , 'wt_calcprop' - # , 'mut_calcprop' - , 'aa_prop_change' - , 'electrostatics_change' - , 'polarity_change' - , 'water_change' - , 'drtype_mode_labels' # beware then you can't use it to predict [USED it for uq_v1, not v2] - , 'active_site' #[didn't use it for uq_v1] - #, 'gene_name' # will be required for the combined stuff - ] - - #---------------------------------------------- - # count numerical and categorical features - #---------------------------------------------- - - print('\nNo. of numerical features:', len(numerical_FN) - , '\nNo. of categorical features:', len(categorical_FN)) - #======================= # Masking columns: # (mCSM-lig, mCSM-NA, mCSM-ppi2) values for lig_dist >10 #======================= - # my_df_ml['mutationinformation'][my_df['ligand_distance']>10].value_counts() - # my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts() - - # my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), 'ligand_affinity_change'] = 0 - # (my_df_ml['ligand_affinity_change'] == 0).sum() - my_df_ml['mutationinformation'][my_df_ml['ligand_distance']>10].value_counts() my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts() my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask].value_counts() @@ -544,92 +459,165 @@ def setvars(gene,drug): mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask] + #=================================================== # write file for check mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True) - mask_check.to_csv(outdir + 'ml/' + gene.lower() + '_mask_check.csv') + mask_check.to_csv(outdir + 'ml/' + gene.lower() + '_mask_check.csv') + #=================================================== + ############################################################################### + #%% Feature groups (FG): Build X for Input ML + ############################################################################ + #=========================== + # FG1: Evolutionary features + #=========================== + X_evolFN = ['consurf_score' + , 'snap2_score' + , 'provean_score'] + + ############################################################################### + #======================== + # FG2: Stability features + #======================== + #-------- + # common + #-------- + X_common_stability_Fnum = [ + 'duet_stability_change' + , 'ddg_foldx' + , 'deepddg' + , 'ddg_dynamut2' + , 'contacts'] + #-------- + # FoldX + #-------- + X_foldX_Fnum = [ 'electro_rr', 'electro_mm', 'electro_sm', 'electro_ss' + , 'disulfide_rr', 'disulfide_mm', 'disulfide_sm', 'disulfide_ss' + , 'hbonds_rr', 'hbonds_mm', 'hbonds_sm', 'hbonds_ss' + , 'partcov_rr', 'partcov_mm', 'partcov_sm', 'partcov_ss' + , 'vdwclashes_rr', 'vdwclashes_mm', 'vdwclashes_sm', 'vdwclashes_ss' + , 'volumetric_rr', 'volumetric_mm', 'volumetric_ss'] + + X_stability_FN = X_common_stability_Fnum + X_foldX_Fnum + + ############################################################################### + #=================== + # FG3: Affinity features + #=================== + common_affinity_Fnum = ['ligand_distance' + , 'ligand_affinity_change' + , 'mmcsm_lig'] + + # if gene.lower() in geneL_basic: + # X_affinityFN = common_affinity_Fnum + # else: + # X_affinityFN = common_affinity_Fnum + gene_affinity_colnames + + X_affinityFN = common_affinity_Fnum + gene_affinity_colnames + + ############################################################################### + #============================ + # FG4: Residue level features + #============================ + #----------- + # AA index + #----------- + X_aaindex_Fnum = list(aa_df_cols) + print('\nTotal no. of features for aaindex:', len(X_aaindex_Fnum)) + + #----------------- + # surface area + # depth + # hydrophobicity + #----------------- + X_str_Fnum = ['rsa' + #, 'asa' + , 'kd_values' + , 'rd_values'] + + #--------------------------- + # Other aa properties + # active site indication + #--------------------------- + X_aap_Fcat = ['ss_class' + # , 'wt_prop_water' + # , 'mut_prop_water' + # , 'wt_prop_polarity' + # , 'mut_prop_polarity' + # , 'wt_calcprop' + # , 'mut_calcprop' + , 'aa_prop_change' + , 'electrostatics_change' + , 'polarity_change' + , 'water_change' + , 'active_site'] - ##################################################################### + X_resprop_FN = X_aaindex_Fnum + X_str_Fnum + X_aap_Fcat + ############################################################################### + #======================== + # FG5: Genomic features + #======================== + X_gn_mafor_Fnum = ['maf' + #, 'logorI' + # , 'or_rawI' + # , 'or_mychisq' + # , 'or_logistic' + # , 'or_fisher' + # , 'pval_fisher' + ] + + X_gn_linegae_Fnum = ['lineage_proportion' + , 'dist_lineage_proportion' + #, 'lineage' # could be included as a category but it has L2;L4 formatting + , 'lineage_count_all' + , 'lineage_count_unique' + ] + + # X_gn_Fcat = ['drtype_mode_labels' # beware then you can't use it to predict [USED it for uq_v1, not v2] + # #, 'gene_name' # will be required for the combined stuff + # ] + X_gn_Fcat = [] + + X_genomicFN = X_gn_mafor_Fnum + X_gn_linegae_Fnum + X_gn_Fcat + ############################################################################### + #======================== + # FG6 collapsed: Structural : Atability + Affinity + ResidueProp + #======================== + X_structural_FN = X_stability_FN + X_affinityFN + X_resprop_FN + + ############################################################################### + #======================== + # BUILDING all features + #======================== + all_featuresN = X_evolFN + X_structural_FN + X_genomicFN + + ############################################################################### + #%% Define training and test data #================================================================ # Training and BLIND test set: 80/20 - - # Throw away previous blind_test_df, and call the 30% data as blind_test - # as these were imputed values and initial analysis shows that this - # is not very representative + # dst with actual values : training set + # dst with imputed values : THROW AWAY [unrepresentative] #================================================================ my_df_ml[drug].isna().sum() + # blind_test_df = my_df_ml[my_df_ml[drug].isna()] # blind_test_df.shape - training_df = my_df_ml[my_df_ml[drug].notna()] + training_df = my_df_ml[my_df_ml[drug].notna()] training_df.shape # Target 1: dst_mode training_df[drug].value_counts() training_df['dst_mode'].value_counts() - #################################################################### - -############################################################################### -############################################################################### - # #%% extracting dfs based on numerical, categorical column names - # #---------------------------------- - # # WITHOUT the target var included - # #---------------------------------- - # num_df = training_df[numerical_FN] - # num_df.shape - - # cat_df = training_df[categorical_FN] - # cat_df.shape - - # all_df = training_df[numerical_FN + categorical_FN] - # all_df.shape - - # #------------------------------ - # # WITH the target var included: - # #'wtgt': with target - # #------------------------------ - # # drug and dst_mode should be the same thing - # num_df_wtgt = training_df[numerical_FN + ['dst_mode']] - # num_df_wtgt.shape - - # cat_df_wtgt = training_df[categorical_FN + ['dst_mode']] - # cat_df_wtgt.shape - - # all_df_wtgt = training_df[numerical_FN + categorical_FN + ['dst_mode']] - # all_df_wtgt.shape - - #%%######################################################################## - # #============ - # # ML data: OLD - # #============ - # #------ - # # X: Training and Blind test (BTS) - # #------ - # X = all_df_wtgt[numerical_FN + categorical_FN] # training data ALL - # X_bts = blind_test_df[numerical_FN + categorical_FN] # blind test data ALL - # #X = all_df_wtgt[numerical_FN] # training numerical only - # #X_bts = blind_test_df[numerical_FN] # blind test data numerical - - # #------ - # # y - # #------ - # y = all_df_wtgt['dst_mode'] # training data y - # y_bts = blind_test_df['dst_mode'] # blind data test y - - # #X_bts_wt = blind_test_df[numerical_FN + ['dst_mode']] - -############################################################################### -############################################################################### - #====================================== + #################################################################### + #==================================== # ML data: Train test split: 80/20 # with stratification # 80% : training_data for CV # 20% : blind test - #====================================== - - # features: all_df or - x_features = training_df[numerical_FN + categorical_FN] - y_target = training_df['dst_mode'] + #===================================== + x_features = training_df[all_featuresN] + y_target = training_df['dst_mode'] # sanity check if not 'dst_mode' in x_features.columns: @@ -640,7 +628,9 @@ def setvars(gene,drug): #https://towardsdatascience.com/finally-why-we-use-an-80-20-split-for-training-and-test-data-plus-an-alternative-method-oh-yes-edc77e96295d else: sys.exit('\nFAIL: x_features has target variable included. FIX it and rerun!') - + #------------------- + # train-test split + #------------------- #x_train, x_test, y_train, y_test # traditional var_names # so my downstream code doesn't need to change X, X_bts, y, y_bts = train_test_split(x_features, y_target @@ -652,16 +642,65 @@ def setvars(gene,drug): yc2 = Counter(y_bts) yc2_ratio = yc2[0]/yc2[1] - + + ############################################################################### + #====================================================== + # Determine categorical and numerical features + #====================================================== + numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns + numerical_cols + categorical_cols = X.select_dtypes(include=['object', 'bool']).columns + categorical_cols + + ################################################################################ + # IMPORTANT sanity checks + if len(X.columns) == len(X_evolFN) + len(X_stability_FN) + len(X_affinityFN) + len(X_resprop_FN) + len(X_genomicFN): + print('\nPASS: ML data with input features, training and test generated...' + , '\n\nTotal no. of input features:' , len(X.columns) + , '\n--------No. of numerical features:' , len(numerical_cols) + , '\n--------No. of categorical features:' , len(categorical_cols) + + , '\n\nTotal no. of evolutionary features:' , len(X_evolFN) + + , '\n\nTotal no. of stability features:' , len(X_stability_FN) + , '\n--------Common stabilty cols:' , len(X_common_stability_Fnum) + , '\n--------Foldx cols:' , len(X_foldX_Fnum) + + , '\n\nTotal no. of affinity features:' , len(X_affinityFN) + , '\n--------Common affinity cols:' , len(common_affinity_Fnum) + , '\n--------Gene specific affinity cols:' , len(gene_affinity_colnames) + + , '\n\nTotal no. of residue level features:', len(X_resprop_FN) + , '\n--------AA index cols:' , len(X_aaindex_Fnum) + , '\n--------Residue Prop cols:' , len(X_str_Fnum) + , '\n--------AA change Prop cols:' , len(X_aap_Fcat) + + , '\n\nTotal no. of genomic features:' , len(X_genomicFN) + , '\n--------MAF+OR cols:' , len(X_gn_mafor_Fnum) + , '\n--------Lineage cols:' , len(X_gn_linegae_Fnum) + , '\n--------Other cols:' , len(X_gn_Fcat) + ) + else: + print('\nFAIL: numbers mismatch' + , '\nExpected:',len(X_evolFN) + len(X_stability_FN) + len(X_affinityFN) + len(X_resprop_FN) + len(X_genomicFN) + , '\nGot:', len(X.columns)) + sys.exit() + ############################################################################### print('\n-------------------------------------------------------------' - , '\nSuccessfully split data with stratification: 80/20 ' - , '\nInput features data size:', x_features.shape - , '\nTrain data size:', X.shape - , '\nTest data size:', X_bts.shape + , '\nSuccessfully split data: ALL features' + , '\nactual values: training set' + , '\nSplit:', tts_split + #, '\nimputed values: blind test set' + + , '\n\nTotal data size:', len(X) + len(X_bts) + + , '\n\nTrain data size:', X.shape , '\ny_train numbers:', yc1 - , '\ny_train ratio:',yc1_ratio - , '\n' + + , '\n\nTest data size:', X_bts.shape , '\ny_test_numbers:', yc2 + + , '\n\ny_train ratio:',yc1_ratio , '\ny_test ratio:', yc2_ratio , '\n-------------------------------------------------------------' ) @@ -760,3 +799,8 @@ def setvars(gene,drug): ############################################################################### # TODO: Find over and undersampling JUST for categorical data + ########################################################################### + + print('\n#################################################################' + , '\nDim of X for gene:', gene.lower(), '\n', X.shape + , '\n###############################################################') diff --git a/scripts/ml/ml_data_cd_7030.py b/scripts/ml/ml_data_cd_7030.py index 398307f..65af7f4 100644 --- a/scripts/ml/ml_data_cd_7030.py +++ b/scripts/ml/ml_data_cd_7030.py @@ -34,7 +34,11 @@ def setvars(gene,drug): from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold from sklearn.pipeline import Pipeline, make_pipeline + import argparse + import re #%% GLOBALS + tts_split = "70_30" + rs = {'random_state': 42} njobs = {'n_jobs': 10} @@ -56,12 +60,10 @@ def setvars(gene,drug): , **rs) mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)} - jacc_score_fn = {'jcc': make_scorer(jaccard_score)} - + jacc_score_fn = {'jcc': make_scorer(jaccard_score)} #%% FOR LATER: Combine ED logo data ########################################################################### - rs = {'random_state': 42} - njobs = {'n_jobs': 10} + homedir = os.path.expanduser("~") geneL_basic = ['pnca'] @@ -422,118 +424,31 @@ def setvars(gene,drug): #========================== my_df_ml = my_df.copy() - #%% Build X: input for ML - common_cols_stabiltyN = ['ligand_distance' - , 'ligand_affinity_change' - , 'duet_stability_change' - , 'ddg_foldx' - , 'deepddg' - , 'ddg_dynamut2' - , 'mmcsm_lig' - , 'contacts'] - - # Build stability columns ~ gene + # Build column names to mask for affinity chanhes if gene.lower() in geneL_basic: - X_stabilityN = common_cols_stabiltyN + #X_stabilityN = common_cols_stabiltyN + gene_affinity_colnames = []# not needed as its the common ones cols_to_mask = ['ligand_affinity_change'] if gene.lower() in geneL_ppi2: - # X_stabilityN = common_cols_stabiltyN + ['mcsm_ppi2_affinity' , 'interface_dist'] - geneL_ppi2_st_cols = ['mcsm_ppi2_affinity', 'interface_dist'] - X_stabilityN = common_cols_stabiltyN + geneL_ppi2_st_cols + gene_affinity_colnames = ['mcsm_ppi2_affinity', 'interface_dist'] + #X_stabilityN = common_cols_stabiltyN + geneL_ppi2_st_cols cols_to_mask = ['ligand_affinity_change', 'mcsm_ppi2_affinity'] if gene.lower() in geneL_na: - # X_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] - geneL_na_st_cols = ['mcsm_na_affinity'] - X_stabilityN = common_cols_stabiltyN + geneL_na_st_cols + gene_affinity_colnames = ['mcsm_na_affinity'] + #X_stabilityN = common_cols_stabiltyN + geneL_na_st_cols cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity'] if gene.lower() in geneL_na_ppi2: - # X_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] - geneL_na_ppi2_st_cols = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] - X_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols + gene_affinity_colnames = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] + #X_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity', 'mcsm_ppi2_affinity'] - - X_foldX_cols = [ 'electro_rr', 'electro_mm', 'electro_sm', 'electro_ss' - , 'disulfide_rr', 'disulfide_mm', 'disulfide_sm', 'disulfide_ss' - , 'hbonds_rr', 'hbonds_mm', 'hbonds_sm', 'hbonds_ss' - , 'partcov_rr', 'partcov_mm', 'partcov_sm', 'partcov_ss' - , 'vdwclashes_rr', 'vdwclashes_mm', 'vdwclashes_sm', 'vdwclashes_ss' - , 'volumetric_rr', 'volumetric_mm', 'volumetric_ss' - ] - - X_str = ['rsa' - #, 'asa' - , 'kd_values' - , 'rd_values'] - - X_ssFN = X_stabilityN + X_str + X_foldX_cols - - X_evolFN = ['consurf_score' - , 'snap2_score' - , 'provean_score'] - - X_genomic_mafor = ['maf' - , 'logorI' - # , 'or_rawI' - # , 'or_mychisq' - # , 'or_logistic' - # , 'or_fisher' - # , 'pval_fisher' - ] - - X_genomic_linegae = ['lineage_proportion' - , 'dist_lineage_proportion' - #, 'lineage' # could be included as a category but it has L2;L4 formatting - , 'lineage_count_all' - , 'lineage_count_unique' - ] - - X_genomicFN = X_genomic_mafor + X_genomic_linegae - - X_aaindexFN = list(aa_df_cols) - - print('\nTotal no. of features for aaindex:', len(X_aaindexFN)) - - # numerical feature names - numerical_FN = X_ssFN + X_evolFN + X_genomicFN + X_aaindexFN - - # categorical feature names - categorical_FN = ['ss_class' - # , 'wt_prop_water' - # , 'mut_prop_water' - # , 'wt_prop_polarity' - # , 'mut_prop_polarity' - # , 'wt_calcprop' - # , 'mut_calcprop' - , 'aa_prop_change' - , 'electrostatics_change' - , 'polarity_change' - , 'water_change' - , 'drtype_mode_labels' # beware then you can't use it to predict [USED it for uq_v1, not v2] - , 'active_site' #[didn't use it for uq_v1] - #, 'gene_name' # will be required for the combined stuff - ] - - #---------------------------------------------- - # count numerical and categorical features - #---------------------------------------------- - - print('\nNo. of numerical features:', len(numerical_FN) - , '\nNo. of categorical features:', len(categorical_FN)) - #======================= # Masking columns: # (mCSM-lig, mCSM-NA, mCSM-ppi2) values for lig_dist >10 #======================= - # my_df_ml['mutationinformation'][my_df['ligand_distance']>10].value_counts() - # my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts() - - # my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), 'ligand_affinity_change'] = 0 - # (my_df_ml['ligand_affinity_change'] == 0).sum() - my_df_ml['mutationinformation'][my_df_ml['ligand_distance']>10].value_counts() my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts() my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask].value_counts() @@ -544,21 +459,150 @@ def setvars(gene,drug): mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask] + #=================================================== # write file for check mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True) - mask_check.to_csv(outdir + 'ml/' + gene.lower() + '_mask_check.csv') - - ##################################################################### - #================================================================ - # Training and Blind test [COMPLETE data]: 70/30 + mask_check.to_csv(outdir + 'ml/' + gene.lower() + '_mask_check.csv') + #=================================================== + ############################################################################### + #%% Feature groups (FG): Build X for Input ML + ############################################################################ + #=========================== + # FG1: Evolutionary features + #=========================== + X_evolFN = ['consurf_score' + , 'snap2_score' + , 'provean_score'] - # Use complete data, call the 30% as blind test + ############################################################################### + #======================== + # FG2: Stability features + #======================== + #-------- + # common + #-------- + X_common_stability_Fnum = [ + 'duet_stability_change' + , 'ddg_foldx' + , 'deepddg' + , 'ddg_dynamut2' + , 'contacts'] + #-------- + # FoldX + #-------- + X_foldX_Fnum = [ 'electro_rr', 'electro_mm', 'electro_sm', 'electro_ss' + , 'disulfide_rr', 'disulfide_mm', 'disulfide_sm', 'disulfide_ss' + , 'hbonds_rr', 'hbonds_mm', 'hbonds_sm', 'hbonds_ss' + , 'partcov_rr', 'partcov_mm', 'partcov_sm', 'partcov_ss' + , 'vdwclashes_rr', 'vdwclashes_mm', 'vdwclashes_sm', 'vdwclashes_ss' + , 'volumetric_rr', 'volumetric_mm', 'volumetric_ss'] + + X_stability_FN = X_common_stability_Fnum + X_foldX_Fnum + + ############################################################################### + #=================== + # FG3: Affinity features + #=================== + common_affinity_Fnum = ['ligand_distance' + , 'ligand_affinity_change' + , 'mmcsm_lig'] + + # if gene.lower() in geneL_basic: + # X_affinityFN = common_affinity_Fnum + # else: + # X_affinityFN = common_affinity_Fnum + gene_affinity_colnames + + X_affinityFN = common_affinity_Fnum + gene_affinity_colnames + + ############################################################################### + #============================ + # FG4: Residue level features + #============================ + #----------- + # AA index + #----------- + X_aaindex_Fnum = list(aa_df_cols) + print('\nTotal no. of features for aaindex:', len(X_aaindex_Fnum)) + + #----------------- + # surface area + # depth + # hydrophobicity + #----------------- + X_str_Fnum = ['rsa' + #, 'asa' + , 'kd_values' + , 'rd_values'] + + #--------------------------- + # Other aa properties + # active site indication + #--------------------------- + X_aap_Fcat = ['ss_class' + # , 'wt_prop_water' + # , 'mut_prop_water' + # , 'wt_prop_polarity' + # , 'mut_prop_polarity' + # , 'wt_calcprop' + # , 'mut_calcprop' + , 'aa_prop_change' + , 'electrostatics_change' + , 'polarity_change' + , 'water_change' + , 'active_site'] + + X_resprop_FN = X_aaindex_Fnum + X_str_Fnum + X_aap_Fcat + ############################################################################### + #======================== + # FG5: Genomic features + #======================== + X_gn_mafor_Fnum = ['maf' + #, 'logorI' + # , 'or_rawI' + # , 'or_mychisq' + # , 'or_logistic' + # , 'or_fisher' + # , 'pval_fisher' + ] + + X_gn_linegae_Fnum = ['lineage_proportion' + , 'dist_lineage_proportion' + #, 'lineage' # could be included as a category but it has L2;L4 formatting + , 'lineage_count_all' + , 'lineage_count_unique' + ] + + # X_gn_Fcat = ['drtype_mode_labels' # beware then you can't use it to predict [USED it for uq_v1, not v2] + # #, 'gene_name' # will be required for the combined stuff + # ] + X_gn_Fcat = [] + + X_genomicFN = X_gn_mafor_Fnum + X_gn_linegae_Fnum + X_gn_Fcat + ############################################################################### + #======================== + # FG6 collapsed: Structural : Atability + Affinity + ResidueProp + #======================== + X_structural_FN = X_stability_FN + X_affinityFN + X_resprop_FN + + ############################################################################### + #======================== + # BUILDING all features + #======================== + all_featuresN = X_evolFN + X_structural_FN + X_genomicFN + + ############################################################################### + #%% Define training and test data + #================================================================ + # Training and BLIND test set: 70/30 + # dst with actual values : training set + # dst with imputed values : THROW AWAY [unrepresentative] #================================================================ my_df_ml[drug].isna().sum() + # blind_test_df = my_df_ml[my_df_ml[drug].isna()] # blind_test_df.shape - #training_df = my_df_ml[my_df_ml[drug].notna()] + #training_df = my_df_ml[my_df_ml[drug].notna()] #training_df.shape training_df = my_df_ml.copy() @@ -568,80 +612,14 @@ def setvars(gene,drug): training_df['dst_mode'].value_counts() #################################################################### - -############################################################################### -############################################################################### - # #%% extracting dfs based on numerical, categorical column names - # #---------------------------------- - # # WITHOUT the target var included - # #---------------------------------- - # num_df = training_df[numerical_FN] - # num_df.shape - - # cat_df = training_df[categorical_FN] - # cat_df.shape - - # all_df = training_df[numerical_FN + categorical_FN] - # all_df.shape - - # #------------------------------ - # # WITH the target var included: - # #'wtgt': with target - # #------------------------------ - # # drug and dst_mode should be the same thing - # num_df_wtgt = training_df[numerical_FN + ['dst_mode']] - # num_df_wtgt.shape - - # cat_df_wtgt = training_df[categorical_FN + ['dst_mode']] - # cat_df_wtgt.shape - - # all_df_wtgt = training_df[numerical_FN + categorical_FN + ['dst_mode']] - # all_df_wtgt.shape - - #%%######################################################################## - # #============ - # # ML data: OLD - # #============ - # #------ - # # X: Training and Blind test (BTS) - # #------ - # X = all_df_wtgt[numerical_FN + categorical_FN] # training data ALL - # X_bts = blind_test_df[numerical_FN + categorical_FN] # blind test data ALL - # #X = all_df_wtgt[numerical_FN] # training numerical only - # #X_bts = blind_test_df[numerical_FN] # blind test data numerical - - # #------ - # # y - # #------ - # y = all_df_wtgt['dst_mode'] # training data y - # y_bts = blind_test_df['dst_mode'] # blind data test y - - # #X_bts_wt = blind_test_df[numerical_FN + ['dst_mode']] - - # # Quick check - # #(X['ligand_affinity_change']==0).sum() == (X['ligand_distance']>10).sum() - # for i in range(len(cols_to_mask)): - # ind = i+1 - # print('\nindex:', i, '\nind:', ind) - # print('\nMask count check:' - # , (my_df_ml[cols_to_mask[i]]==0).sum() == (my_df_ml['ligand_distance']>10).sum() - # ) - - # print('Original Data\n', Counter(y) - # , 'Data dim:', X.shape) - -############################################################################### -############################################################################### #==================================== - # ML data: Train test split [COMPLETE data]: 70/30 + # ML data: Train test split: 70/30 # with stratification # 70% : training_data for CV # 30% : blind test #===================================== - - # features: all_df or - x_features = training_df[numerical_FN + categorical_FN] - y_target = training_df['dst_mode'] + x_features = training_df[all_featuresN] + y_target = training_df['dst_mode'] # sanity check if not 'dst_mode' in x_features.columns: @@ -652,7 +630,9 @@ def setvars(gene,drug): #https://towardsdatascience.com/finally-why-we-use-an-80-20-split-for-training-and-test-data-plus-an-alternative-method-oh-yes-edc77e96295d else: sys.exit('\nFAIL: x_features has target variable included. FIX it and rerun!') - + #------------------- + # train-test split + #------------------- #x_train, x_test, y_train, y_test # traditional var_names # so my downstream code doesn't need to change X, X_bts, y, y_bts = train_test_split(x_features, y_target @@ -664,16 +644,65 @@ def setvars(gene,drug): yc2 = Counter(y_bts) yc2_ratio = yc2[0]/yc2[1] - + + ############################################################################### + #====================================================== + # Determine categorical and numerical features + #====================================================== + numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns + numerical_cols + categorical_cols = X.select_dtypes(include=['object', 'bool']).columns + categorical_cols + + ################################################################################ + # IMPORTANT sanity checks + if len(X.columns) == len(X_evolFN) + len(X_stability_FN) + len(X_affinityFN) + len(X_resprop_FN) + len(X_genomicFN): + print('\nPASS: ML data with input features, training and test generated...' + , '\n\nTotal no. of input features:' , len(X.columns) + , '\n--------No. of numerical features:' , len(numerical_cols) + , '\n--------No. of categorical features:' , len(categorical_cols) + + , '\n\nTotal no. of evolutionary features:' , len(X_evolFN) + + , '\n\nTotal no. of stability features:' , len(X_stability_FN) + , '\n--------Common stabilty cols:' , len(X_common_stability_Fnum) + , '\n--------Foldx cols:' , len(X_foldX_Fnum) + + , '\n\nTotal no. of affinity features:' , len(X_affinityFN) + , '\n--------Common affinity cols:' , len(common_affinity_Fnum) + , '\n--------Gene specific affinity cols:' , len(gene_affinity_colnames) + + , '\n\nTotal no. of residue level features:', len(X_resprop_FN) + , '\n--------AA index cols:' , len(X_aaindex_Fnum) + , '\n--------Residue Prop cols:' , len(X_str_Fnum) + , '\n--------AA change Prop cols:' , len(X_aap_Fcat) + + , '\n\nTotal no. of genomic features:' , len(X_genomicFN) + , '\n--------MAF+OR cols:' , len(X_gn_mafor_Fnum) + , '\n--------Lineage cols:' , len(X_gn_linegae_Fnum) + , '\n--------Other cols:' , len(X_gn_Fcat) + ) + else: + print('\nFAIL: numbers mismatch' + , '\nExpected:',len(X_evolFN) + len(X_stability_FN) + len(X_affinityFN) + len(X_resprop_FN) + len(X_genomicFN) + , '\nGot:', len(X.columns)) + sys.exit() + ############################################################################### print('\n-------------------------------------------------------------' - , '\nSuccessfully split data with stratification [COMPLETE data]: 70/30' - , '\nInput features data size:', x_features.shape - , '\nTrain data size:', X.shape - , '\nTest data size:', X_bts.shape + , '\nSuccessfully split data: ALL features' + , '\nactual values: training set' + , '\nSplit:', tts_split + #, '\nimputed values: blind test set' + + , '\n\nTotal data size:', len(X) + len(X_bts) + + , '\n\nTrain data size:', X.shape , '\ny_train numbers:', yc1 - , '\ny_train ratio:',yc1_ratio - , '\n' + + , '\n\nTest data size:', X_bts.shape , '\ny_test_numbers:', yc2 + + , '\n\ny_train ratio:',yc1_ratio , '\ny_test ratio:', yc2_ratio , '\n-------------------------------------------------------------' ) @@ -772,3 +801,8 @@ def setvars(gene,drug): ############################################################################### # TODO: Find over and undersampling JUST for categorical data + ########################################################################### + + print('\n#################################################################' + , '\nDim of X for gene:', gene.lower(), '\n', X.shape + , '\n###############################################################') \ No newline at end of file diff --git a/scripts/ml/ml_data_cd_8020.py b/scripts/ml/ml_data_cd_8020.py index 295ed18..c95d856 100644 --- a/scripts/ml/ml_data_cd_8020.py +++ b/scripts/ml/ml_data_cd_8020.py @@ -34,7 +34,11 @@ def setvars(gene,drug): from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold from sklearn.pipeline import Pipeline, make_pipeline + import argparse + import re #%% GLOBALS + tts_split = "80_20" + rs = {'random_state': 42} njobs = {'n_jobs': 10} @@ -56,12 +60,10 @@ def setvars(gene,drug): , **rs) mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)} - jacc_score_fn = {'jcc': make_scorer(jaccard_score)} - + jacc_score_fn = {'jcc': make_scorer(jaccard_score)} #%% FOR LATER: Combine ED logo data ########################################################################### - rs = {'random_state': 42} - njobs = {'n_jobs': 10} + homedir = os.path.expanduser("~") geneL_basic = ['pnca'] @@ -422,118 +424,31 @@ def setvars(gene,drug): #========================== my_df_ml = my_df.copy() - #%% Build X: input for ML - common_cols_stabiltyN = ['ligand_distance' - , 'ligand_affinity_change' - , 'duet_stability_change' - , 'ddg_foldx' - , 'deepddg' - , 'ddg_dynamut2' - , 'mmcsm_lig' - , 'contacts'] - - # Build stability columns ~ gene + # Build column names to mask for affinity chanhes if gene.lower() in geneL_basic: - X_stabilityN = common_cols_stabiltyN + #X_stabilityN = common_cols_stabiltyN + gene_affinity_colnames = []# not needed as its the common ones cols_to_mask = ['ligand_affinity_change'] if gene.lower() in geneL_ppi2: - # X_stabilityN = common_cols_stabiltyN + ['mcsm_ppi2_affinity' , 'interface_dist'] - geneL_ppi2_st_cols = ['mcsm_ppi2_affinity', 'interface_dist'] - X_stabilityN = common_cols_stabiltyN + geneL_ppi2_st_cols + gene_affinity_colnames = ['mcsm_ppi2_affinity', 'interface_dist'] + #X_stabilityN = common_cols_stabiltyN + geneL_ppi2_st_cols cols_to_mask = ['ligand_affinity_change', 'mcsm_ppi2_affinity'] if gene.lower() in geneL_na: - # X_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] - geneL_na_st_cols = ['mcsm_na_affinity'] - X_stabilityN = common_cols_stabiltyN + geneL_na_st_cols + gene_affinity_colnames = ['mcsm_na_affinity'] + #X_stabilityN = common_cols_stabiltyN + geneL_na_st_cols cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity'] if gene.lower() in geneL_na_ppi2: - # X_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] - geneL_na_ppi2_st_cols = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] - X_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols + gene_affinity_colnames = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] + #X_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity', 'mcsm_ppi2_affinity'] - - X_foldX_cols = [ 'electro_rr', 'electro_mm', 'electro_sm', 'electro_ss' - , 'disulfide_rr', 'disulfide_mm', 'disulfide_sm', 'disulfide_ss' - , 'hbonds_rr', 'hbonds_mm', 'hbonds_sm', 'hbonds_ss' - , 'partcov_rr', 'partcov_mm', 'partcov_sm', 'partcov_ss' - , 'vdwclashes_rr', 'vdwclashes_mm', 'vdwclashes_sm', 'vdwclashes_ss' - , 'volumetric_rr', 'volumetric_mm', 'volumetric_ss' - ] - - X_str = ['rsa' - #, 'asa' - , 'kd_values' - , 'rd_values'] - - X_ssFN = X_stabilityN + X_str + X_foldX_cols - - X_evolFN = ['consurf_score' - , 'snap2_score' - , 'provean_score'] - - X_genomic_mafor = ['maf' - , 'logorI' - # , 'or_rawI' - # , 'or_mychisq' - # , 'or_logistic' - # , 'or_fisher' - # , 'pval_fisher' - ] - - X_genomic_linegae = ['lineage_proportion' - , 'dist_lineage_proportion' - #, 'lineage' # could be included as a category but it has L2;L4 formatting - , 'lineage_count_all' - , 'lineage_count_unique' - ] - - X_genomicFN = X_genomic_mafor + X_genomic_linegae - - X_aaindexFN = list(aa_df_cols) - - print('\nTotal no. of features for aaindex:', len(X_aaindexFN)) - - # numerical feature names - numerical_FN = X_ssFN + X_evolFN + X_genomicFN + X_aaindexFN - - # categorical feature names - categorical_FN = ['ss_class' - # , 'wt_prop_water' - # , 'mut_prop_water' - # , 'wt_prop_polarity' - # , 'mut_prop_polarity' - # , 'wt_calcprop' - # , 'mut_calcprop' - , 'aa_prop_change' - , 'electrostatics_change' - , 'polarity_change' - , 'water_change' - , 'drtype_mode_labels' # beware then you can't use it to predict [USED it for uq_v1, not v2] - , 'active_site' #[didn't use it for uq_v1] - #, 'gene_name' # will be required for the combined stuff - ] - - #---------------------------------------------- - # count numerical and categorical features - #---------------------------------------------- - - print('\nNo. of numerical features:', len(numerical_FN) - , '\nNo. of categorical features:', len(categorical_FN)) - #======================= # Masking columns: # (mCSM-lig, mCSM-NA, mCSM-ppi2) values for lig_dist >10 #======================= - # my_df_ml['mutationinformation'][my_df['ligand_distance']>10].value_counts() - # my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts() - - # my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), 'ligand_affinity_change'] = 0 - # (my_df_ml['ligand_affinity_change'] == 0).sum() - my_df_ml['mutationinformation'][my_df_ml['ligand_distance']>10].value_counts() my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts() my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask].value_counts() @@ -544,21 +459,150 @@ def setvars(gene,drug): mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask] + #=================================================== # write file for check mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True) - mask_check.to_csv(outdir + 'ml/' + gene.lower() + '_mask_check.csv') - - ##################################################################### - #================================================================ - # Training and Blind test [COMPLETE data]: 80/20 + mask_check.to_csv(outdir + 'ml/' + gene.lower() + '_mask_check.csv') + #=================================================== + ############################################################################### + #%% Feature groups (FG): Build X for Input ML + ############################################################################ + #=========================== + # FG1: Evolutionary features + #=========================== + X_evolFN = ['consurf_score' + , 'snap2_score' + , 'provean_score'] - # Use complete data, call the 20% as blind test + ############################################################################### + #======================== + # FG2: Stability features + #======================== + #-------- + # common + #-------- + X_common_stability_Fnum = [ + 'duet_stability_change' + , 'ddg_foldx' + , 'deepddg' + , 'ddg_dynamut2' + , 'contacts'] + #-------- + # FoldX + #-------- + X_foldX_Fnum = [ 'electro_rr', 'electro_mm', 'electro_sm', 'electro_ss' + , 'disulfide_rr', 'disulfide_mm', 'disulfide_sm', 'disulfide_ss' + , 'hbonds_rr', 'hbonds_mm', 'hbonds_sm', 'hbonds_ss' + , 'partcov_rr', 'partcov_mm', 'partcov_sm', 'partcov_ss' + , 'vdwclashes_rr', 'vdwclashes_mm', 'vdwclashes_sm', 'vdwclashes_ss' + , 'volumetric_rr', 'volumetric_mm', 'volumetric_ss'] + + X_stability_FN = X_common_stability_Fnum + X_foldX_Fnum + + ############################################################################### + #=================== + # FG3: Affinity features + #=================== + common_affinity_Fnum = ['ligand_distance' + , 'ligand_affinity_change' + , 'mmcsm_lig'] + + # if gene.lower() in geneL_basic: + # X_affinityFN = common_affinity_Fnum + # else: + # X_affinityFN = common_affinity_Fnum + gene_affinity_colnames + + X_affinityFN = common_affinity_Fnum + gene_affinity_colnames + + ############################################################################### + #============================ + # FG4: Residue level features + #============================ + #----------- + # AA index + #----------- + X_aaindex_Fnum = list(aa_df_cols) + print('\nTotal no. of features for aaindex:', len(X_aaindex_Fnum)) + + #----------------- + # surface area + # depth + # hydrophobicity + #----------------- + X_str_Fnum = ['rsa' + #, 'asa' + , 'kd_values' + , 'rd_values'] + + #--------------------------- + # Other aa properties + # active site indication + #--------------------------- + X_aap_Fcat = ['ss_class' + # , 'wt_prop_water' + # , 'mut_prop_water' + # , 'wt_prop_polarity' + # , 'mut_prop_polarity' + # , 'wt_calcprop' + # , 'mut_calcprop' + , 'aa_prop_change' + , 'electrostatics_change' + , 'polarity_change' + , 'water_change' + , 'active_site'] + + X_resprop_FN = X_aaindex_Fnum + X_str_Fnum + X_aap_Fcat + ############################################################################### + #======================== + # FG5: Genomic features + #======================== + X_gn_mafor_Fnum = ['maf' + #, 'logorI' + # , 'or_rawI' + # , 'or_mychisq' + # , 'or_logistic' + # , 'or_fisher' + # , 'pval_fisher' + ] + + X_gn_linegae_Fnum = ['lineage_proportion' + , 'dist_lineage_proportion' + #, 'lineage' # could be included as a category but it has L2;L4 formatting + , 'lineage_count_all' + , 'lineage_count_unique' + ] + + # X_gn_Fcat = ['drtype_mode_labels' # beware then you can't use it to predict [USED it for uq_v1, not v2] + # #, 'gene_name' # will be required for the combined stuff + # ] + X_gn_Fcat = [] + + X_genomicFN = X_gn_mafor_Fnum + X_gn_linegae_Fnum + X_gn_Fcat + ############################################################################### + #======================== + # FG6 collapsed: Structural : Atability + Affinity + ResidueProp + #======================== + X_structural_FN = X_stability_FN + X_affinityFN + X_resprop_FN + + ############################################################################### + #======================== + # BUILDING all features + #======================== + all_featuresN = X_evolFN + X_structural_FN + X_genomicFN + + ############################################################################### + #%% Define training and test data + #================================================================ + # Training and BLIND test set: 80/20 + # dst with actual values : training set + # dst with imputed values : THROW AWAY [unrepresentative] #================================================================ my_df_ml[drug].isna().sum() + # blind_test_df = my_df_ml[my_df_ml[drug].isna()] # blind_test_df.shape - #training_df = my_df_ml[my_df_ml[drug].notna()] + #training_df = my_df_ml[my_df_ml[drug].notna()] #training_df.shape training_df = my_df_ml.copy() @@ -568,80 +612,14 @@ def setvars(gene,drug): training_df['dst_mode'].value_counts() #################################################################### - -############################################################################### -############################################################################### - # #%% extracting dfs based on numerical, categorical column names - # #---------------------------------- - # # WITHOUT the target var included - # #---------------------------------- - # num_df = training_df[numerical_FN] - # num_df.shape - - # cat_df = training_df[categorical_FN] - # cat_df.shape - - # all_df = training_df[numerical_FN + categorical_FN] - # all_df.shape - - # #------------------------------ - # # WITH the target var included: - # #'wtgt': with target - # #------------------------------ - # # drug and dst_mode should be the same thing - # num_df_wtgt = training_df[numerical_FN + ['dst_mode']] - # num_df_wtgt.shape - - # cat_df_wtgt = training_df[categorical_FN + ['dst_mode']] - # cat_df_wtgt.shape - - # all_df_wtgt = training_df[numerical_FN + categorical_FN + ['dst_mode']] - # all_df_wtgt.shape - - #%%######################################################################## - # #============ - # # ML data: OLD - # #============ - # #------ - # # X: Training and Blind test (BTS) - # #------ - # X = all_df_wtgt[numerical_FN + categorical_FN] # training data ALL - # X_bts = blind_test_df[numerical_FN + categorical_FN] # blind test data ALL - # #X = all_df_wtgt[numerical_FN] # training numerical only - # #X_bts = blind_test_df[numerical_FN] # blind test data numerical - - # #------ - # # y - # #------ - # y = all_df_wtgt['dst_mode'] # training data y - # y_bts = blind_test_df['dst_mode'] # blind data test y - - # #X_bts_wt = blind_test_df[numerical_FN + ['dst_mode']] - - # # Quick check - # #(X['ligand_affinity_change']==0).sum() == (X['ligand_distance']>10).sum() - # for i in range(len(cols_to_mask)): - # ind = i+1 - # print('\nindex:', i, '\nind:', ind) - # print('\nMask count check:' - # , (my_df_ml[cols_to_mask[i]]==0).sum() == (my_df_ml['ligand_distance']>10).sum() - # ) - - # print('Original Data\n', Counter(y) - # , 'Data dim:', X.shape) - -############################################################################### -############################################################################### #==================================== - # ML data: Train test split [COMPLETE data]: 80/20 + # ML data: Train test split: 80/20 # with stratification # 80% : training_data for CV # 20% : blind test #===================================== - - # features: all_df or - x_features = training_df[numerical_FN + categorical_FN] - y_target = training_df['dst_mode'] + x_features = training_df[all_featuresN] + y_target = training_df['dst_mode'] # sanity check if not 'dst_mode' in x_features.columns: @@ -652,7 +630,9 @@ def setvars(gene,drug): #https://towardsdatascience.com/finally-why-we-use-an-80-20-split-for-training-and-test-data-plus-an-alternative-method-oh-yes-edc77e96295d else: sys.exit('\nFAIL: x_features has target variable included. FIX it and rerun!') - + #------------------- + # train-test split + #------------------- #x_train, x_test, y_train, y_test # traditional var_names # so my downstream code doesn't need to change X, X_bts, y, y_bts = train_test_split(x_features, y_target @@ -664,16 +644,65 @@ def setvars(gene,drug): yc2 = Counter(y_bts) yc2_ratio = yc2[0]/yc2[1] - + + ############################################################################### + #====================================================== + # Determine categorical and numerical features + #====================================================== + numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns + numerical_cols + categorical_cols = X.select_dtypes(include=['object', 'bool']).columns + categorical_cols + + ################################################################################ + # IMPORTANT sanity checks + if len(X.columns) == len(X_evolFN) + len(X_stability_FN) + len(X_affinityFN) + len(X_resprop_FN) + len(X_genomicFN): + print('\nPASS: ML data with input features, training and test generated...' + , '\n\nTotal no. of input features:' , len(X.columns) + , '\n--------No. of numerical features:' , len(numerical_cols) + , '\n--------No. of categorical features:' , len(categorical_cols) + + , '\n\nTotal no. of evolutionary features:' , len(X_evolFN) + + , '\n\nTotal no. of stability features:' , len(X_stability_FN) + , '\n--------Common stabilty cols:' , len(X_common_stability_Fnum) + , '\n--------Foldx cols:' , len(X_foldX_Fnum) + + , '\n\nTotal no. of affinity features:' , len(X_affinityFN) + , '\n--------Common affinity cols:' , len(common_affinity_Fnum) + , '\n--------Gene specific affinity cols:' , len(gene_affinity_colnames) + + , '\n\nTotal no. of residue level features:', len(X_resprop_FN) + , '\n--------AA index cols:' , len(X_aaindex_Fnum) + , '\n--------Residue Prop cols:' , len(X_str_Fnum) + , '\n--------AA change Prop cols:' , len(X_aap_Fcat) + + , '\n\nTotal no. of genomic features:' , len(X_genomicFN) + , '\n--------MAF+OR cols:' , len(X_gn_mafor_Fnum) + , '\n--------Lineage cols:' , len(X_gn_linegae_Fnum) + , '\n--------Other cols:' , len(X_gn_Fcat) + ) + else: + print('\nFAIL: numbers mismatch' + , '\nExpected:',len(X_evolFN) + len(X_stability_FN) + len(X_affinityFN) + len(X_resprop_FN) + len(X_genomicFN) + , '\nGot:', len(X.columns)) + sys.exit() + ############################################################################### print('\n-------------------------------------------------------------' - , '\nSuccessfully split data with stratification [COMPLETE data]: 80/20' - , '\nInput features data size:', x_features.shape - , '\nTrain data size:', X.shape - , '\nTest data size:', X_bts.shape + , '\nSuccessfully split data: ALL features' + , '\nactual values: training set' + , '\nSplit:', tts_split + #, '\nimputed values: blind test set' + + , '\n\nTotal data size:', len(X) + len(X_bts) + + , '\n\nTrain data size:', X.shape , '\ny_train numbers:', yc1 - , '\ny_train ratio:',yc1_ratio - , '\n' + + , '\n\nTest data size:', X_bts.shape , '\ny_test_numbers:', yc2 + + , '\n\ny_train ratio:',yc1_ratio , '\ny_test ratio:', yc2_ratio , '\n-------------------------------------------------------------' ) @@ -772,3 +801,8 @@ def setvars(gene,drug): ############################################################################### # TODO: Find over and undersampling JUST for categorical data + ########################################################################### + + print('\n#################################################################' + , '\nDim of X for gene:', gene.lower(), '\n', X.shape + , '\n###############################################################') diff --git a/scripts/ml/ml_data_cd_sl.py b/scripts/ml/ml_data_cd_sl.py index 922eaa5..5941c30 100644 --- a/scripts/ml/ml_data_cd_sl.py +++ b/scripts/ml/ml_data_cd_sl.py @@ -34,7 +34,11 @@ def setvars(gene,drug): from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold from sklearn.pipeline import Pipeline, make_pipeline + import argparse + import re #%% GLOBALS + tts_split = "sl" + rs = {'random_state': 42} njobs = {'n_jobs': 10} @@ -56,12 +60,10 @@ def setvars(gene,drug): , **rs) mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)} - jacc_score_fn = {'jcc': make_scorer(jaccard_score)} - + jacc_score_fn = {'jcc': make_scorer(jaccard_score)} #%% FOR LATER: Combine ED logo data ########################################################################### - rs = {'random_state': 42} - njobs = {'n_jobs': 10} + homedir = os.path.expanduser("~") geneL_basic = ['pnca'] @@ -422,118 +424,31 @@ def setvars(gene,drug): #========================== my_df_ml = my_df.copy() - #%% Build X: input for ML - common_cols_stabiltyN = ['ligand_distance' - , 'ligand_affinity_change' - , 'duet_stability_change' - , 'ddg_foldx' - , 'deepddg' - , 'ddg_dynamut2' - , 'mmcsm_lig' - , 'contacts'] - - # Build stability columns ~ gene + # Build column names to mask for affinity chanhes if gene.lower() in geneL_basic: - X_stabilityN = common_cols_stabiltyN + #X_stabilityN = common_cols_stabiltyN + gene_affinity_colnames = []# not needed as its the common ones cols_to_mask = ['ligand_affinity_change'] if gene.lower() in geneL_ppi2: - # X_stabilityN = common_cols_stabiltyN + ['mcsm_ppi2_affinity' , 'interface_dist'] - geneL_ppi2_st_cols = ['mcsm_ppi2_affinity', 'interface_dist'] - X_stabilityN = common_cols_stabiltyN + geneL_ppi2_st_cols + gene_affinity_colnames = ['mcsm_ppi2_affinity', 'interface_dist'] + #X_stabilityN = common_cols_stabiltyN + geneL_ppi2_st_cols cols_to_mask = ['ligand_affinity_change', 'mcsm_ppi2_affinity'] if gene.lower() in geneL_na: - # X_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] - geneL_na_st_cols = ['mcsm_na_affinity'] - X_stabilityN = common_cols_stabiltyN + geneL_na_st_cols + gene_affinity_colnames = ['mcsm_na_affinity'] + #X_stabilityN = common_cols_stabiltyN + geneL_na_st_cols cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity'] if gene.lower() in geneL_na_ppi2: - # X_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] - geneL_na_ppi2_st_cols = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] - X_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols + gene_affinity_colnames = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] + #X_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity', 'mcsm_ppi2_affinity'] - - X_foldX_cols = [ 'electro_rr', 'electro_mm', 'electro_sm', 'electro_ss' - , 'disulfide_rr', 'disulfide_mm', 'disulfide_sm', 'disulfide_ss' - , 'hbonds_rr', 'hbonds_mm', 'hbonds_sm', 'hbonds_ss' - , 'partcov_rr', 'partcov_mm', 'partcov_sm', 'partcov_ss' - , 'vdwclashes_rr', 'vdwclashes_mm', 'vdwclashes_sm', 'vdwclashes_ss' - , 'volumetric_rr', 'volumetric_mm', 'volumetric_ss' - ] - - X_str = ['rsa' - #, 'asa' - , 'kd_values' - , 'rd_values'] - - X_ssFN = X_stabilityN + X_str + X_foldX_cols - - X_evolFN = ['consurf_score' - , 'snap2_score' - , 'provean_score'] - - X_genomic_mafor = ['maf' - , 'logorI' - # , 'or_rawI' - # , 'or_mychisq' - # , 'or_logistic' - # , 'or_fisher' - # , 'pval_fisher' - ] - - X_genomic_linegae = ['lineage_proportion' - , 'dist_lineage_proportion' - #, 'lineage' # could be included as a category but it has L2;L4 formatting - , 'lineage_count_all' - , 'lineage_count_unique' - ] - - X_genomicFN = X_genomic_mafor + X_genomic_linegae - - X_aaindexFN = list(aa_df_cols) - - print('\nTotal no. of features for aaindex:', len(X_aaindexFN)) - - # numerical feature names - numerical_FN = X_ssFN + X_evolFN + X_genomicFN + X_aaindexFN - - # categorical feature names - categorical_FN = ['ss_class' - # , 'wt_prop_water' - # , 'mut_prop_water' - # , 'wt_prop_polarity' - # , 'mut_prop_polarity' - # , 'wt_calcprop' - # , 'mut_calcprop' - , 'aa_prop_change' - , 'electrostatics_change' - , 'polarity_change' - , 'water_change' - , 'drtype_mode_labels' # beware then you can't use it to predict [USED it for uq_v1, not v2] - , 'active_site' #[didn't use it for uq_v1] - #, 'gene_name' # will be required for the combined stuff - ] - - #---------------------------------------------- - # count numerical and categorical features - #---------------------------------------------- - - print('\nNo. of numerical features:', len(numerical_FN) - , '\nNo. of categorical features:', len(categorical_FN)) - #======================= # Masking columns: # (mCSM-lig, mCSM-NA, mCSM-ppi2) values for lig_dist >10 #======================= - # my_df_ml['mutationinformation'][my_df['ligand_distance']>10].value_counts() - # my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts() - - # my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), 'ligand_affinity_change'] = 0 - # (my_df_ml['ligand_affinity_change'] == 0).sum() - my_df_ml['mutationinformation'][my_df_ml['ligand_distance']>10].value_counts() my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts() my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask].value_counts() @@ -544,24 +459,154 @@ def setvars(gene,drug): mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask] + #=================================================== # write file for check mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True) - mask_check.to_csv(outdir + 'ml/' + gene.lower() + '_mask_check.csv') - - ##################################################################### - #================================================================ - # Training and Blind test [COMPLETE data]: scaling law split - # https://towardsdatascience.com/finally-why-we-use-an-80-20-split-for-training-and-test-data-plus-an-alternative-method-oh-yes-edc77e96295d + mask_check.to_csv(outdir + 'ml/' + gene.lower() + '_mask_check.csv') + #=================================================== + ############################################################################### + #%% Feature groups (FG): Build X for Input ML + ############################################################################ + #=========================== + # FG1: Evolutionary features + #=========================== + X_evolFN = ['consurf_score' + , 'snap2_score' + , 'provean_score'] + ############################################################################### + #======================== + # FG2: Stability features + #======================== + #-------- + # common + #-------- + X_common_stability_Fnum = [ + 'duet_stability_change' + , 'ddg_foldx' + , 'deepddg' + , 'ddg_dynamut2' + , 'contacts'] + #-------- + # FoldX + #-------- + X_foldX_Fnum = [ 'electro_rr', 'electro_mm', 'electro_sm', 'electro_ss' + , 'disulfide_rr', 'disulfide_mm', 'disulfide_sm', 'disulfide_ss' + , 'hbonds_rr', 'hbonds_mm', 'hbonds_sm', 'hbonds_ss' + , 'partcov_rr', 'partcov_mm', 'partcov_sm', 'partcov_ss' + , 'vdwclashes_rr', 'vdwclashes_mm', 'vdwclashes_sm', 'vdwclashes_ss' + , 'volumetric_rr', 'volumetric_mm', 'volumetric_ss'] + + X_stability_FN = X_common_stability_Fnum + X_foldX_Fnum + + ############################################################################### + #=================== + # FG3: Affinity features + #=================== + common_affinity_Fnum = ['ligand_distance' + , 'ligand_affinity_change' + , 'mmcsm_lig'] + + # if gene.lower() in geneL_basic: + # X_affinityFN = common_affinity_Fnum + # else: + # X_affinityFN = common_affinity_Fnum + gene_affinity_colnames + + X_affinityFN = common_affinity_Fnum + gene_affinity_colnames + + ############################################################################### + #============================ + # FG4: Residue level features + #============================ + #----------- + # AA index + #----------- + X_aaindex_Fnum = list(aa_df_cols) + print('\nTotal no. of features for aaindex:', len(X_aaindex_Fnum)) + + #----------------- + # surface area + # depth + # hydrophobicity + #----------------- + X_str_Fnum = ['rsa' + #, 'asa' + , 'kd_values' + , 'rd_values'] + + #--------------------------- + # Other aa properties + # active site indication + #--------------------------- + X_aap_Fcat = ['ss_class' + # , 'wt_prop_water' + # , 'mut_prop_water' + # , 'wt_prop_polarity' + # , 'mut_prop_polarity' + # , 'wt_calcprop' + # , 'mut_calcprop' + , 'aa_prop_change' + , 'electrostatics_change' + , 'polarity_change' + , 'water_change' + , 'active_site'] + + X_resprop_FN = X_aaindex_Fnum + X_str_Fnum + X_aap_Fcat + ############################################################################### + #======================== + # FG5: Genomic features + #======================== + X_gn_mafor_Fnum = ['maf' + #, 'logorI' + # , 'or_rawI' + # , 'or_mychisq' + # , 'or_logistic' + # , 'or_fisher' + # , 'pval_fisher' + ] + + X_gn_linegae_Fnum = ['lineage_proportion' + , 'dist_lineage_proportion' + #, 'lineage' # could be included as a category but it has L2;L4 formatting + , 'lineage_count_all' + , 'lineage_count_unique' + ] + + # X_gn_Fcat = ['drtype_mode_labels' # beware then you can't use it to predict [USED it for uq_v1, not v2] + # #, 'gene_name' # will be required for the combined stuff + # ] + X_gn_Fcat = [] + + X_genomicFN = X_gn_mafor_Fnum + X_gn_linegae_Fnum + X_gn_Fcat + ############################################################################### + #======================== + # FG6 collapsed: Structural : Atability + Affinity + ResidueProp + #======================== + X_structural_FN = X_stability_FN + X_affinityFN + X_resprop_FN + + ############################################################################### + #======================== + # BUILDING all features + #======================== + all_featuresN = X_evolFN + X_structural_FN + X_genomicFN + + ############################################################################### + #%% Define training and test data + #================================================================ + # Training and BLIND test set: scaling law split + #https://towardsdatascience.com/finally-why-we-use-an-80-20-split-for-training-and-test-data-plus-an-alternative-method-oh-yes-edc77e96295d + # dst with actual values : training set + # dst with imputed values : THROW AWAY [unrepresentative] # test data size ~ 1/sqrt(features NOT including target variable) #================================================================ my_df_ml[drug].isna().sum() + # blind_test_df = my_df_ml[my_df_ml[drug].isna()] # blind_test_df.shape - #training_df = my_df_ml[my_df_ml[drug].notna()] + #training_df = my_df_ml[my_df_ml[drug].notna()] #training_df.shape - + training_df = my_df_ml.copy() # Target 1: dst_mode @@ -569,80 +614,14 @@ def setvars(gene,drug): training_df['dst_mode'].value_counts() #################################################################### - -############################################################################### -############################################################################### - # #%% extracting dfs based on numerical, categorical column names - # #---------------------------------- - # # WITHOUT the target var included - # #---------------------------------- - # num_df = training_df[numerical_FN] - # num_df.shape - - # cat_df = training_df[categorical_FN] - # cat_df.shape - - # all_df = training_df[numerical_FN + categorical_FN] - # all_df.shape - - # #------------------------------ - # # WITH the target var included: - # #'wtgt': with target - # #------------------------------ - # # drug and dst_mode should be the same thing - # num_df_wtgt = training_df[numerical_FN + ['dst_mode']] - # num_df_wtgt.shape - - # cat_df_wtgt = training_df[categorical_FN + ['dst_mode']] - # cat_df_wtgt.shape - - # all_df_wtgt = training_df[numerical_FN + categorical_FN + ['dst_mode']] - # all_df_wtgt.shape - - #%%######################################################################## - # #============ - # # ML data: OLD - # #============ - # #------ - # # X: Training and Blind test (BTS) - # #------ - # X = all_df_wtgt[numerical_FN + categorical_FN] # training data ALL - # X_bts = blind_test_df[numerical_FN + categorical_FN] # blind test data ALL - # #X = all_df_wtgt[numerical_FN] # training numerical only - # #X_bts = blind_test_df[numerical_FN] # blind test data numerical - - # #------ - # # y - # #------ - # y = all_df_wtgt['dst_mode'] # training data y - # y_bts = blind_test_df['dst_mode'] # blind data test y - - # #X_bts_wt = blind_test_df[numerical_FN + ['dst_mode']] - - # # Quick check - # #(X['ligand_affinity_change']==0).sum() == (X['ligand_distance']>10).sum() - # for i in range(len(cols_to_mask)): - # ind = i+1 - # print('\nindex:', i, '\nind:', ind) - # print('\nMask count check:' - # , (my_df_ml[cols_to_mask[i]]==0).sum() == (my_df_ml['ligand_distance']>10).sum() - # ) - - # print('Original Data\n', Counter(y) - # , 'Data dim:', X.shape) - -############################################################################### -############################################################################### #==================================== - # ML data: Train test split [COMPLETE data]: scaling law + # ML data: Train test split: SL # with stratification # 1-blind test : training_data for CV # 1/sqrt(columns) : blind test - #===================================== - - # features: all_df or - x_features = training_df[numerical_FN + categorical_FN] - y_target = training_df['dst_mode'] + #=========================================== + x_features = training_df[all_featuresN] + y_target = training_df['dst_mode'] # sanity check if not 'dst_mode' in x_features.columns: @@ -650,12 +629,15 @@ def setvars(gene,drug): x_ncols = len(x_features.columns) print('\nNo. of columns for x_features:', x_ncols) # NEED It for scaling law split + #https://towardsdatascience.com/finally-why-we-use-an-80-20-split-for-training-and-test-data-plus-an-alternative-method-oh-yes-edc77e96295d else: sys.exit('\nFAIL: x_features has target variable included. FIX it and rerun!') - + #------------------- + # train-test split + #------------------- sl_test_size = 1/np.sqrt(x_ncols) train = 1 - sl_test_size - + #x_train, x_test, y_train, y_test # traditional var_names # so my downstream code doesn't need to change X, X_bts, y, y_bts = train_test_split(x_features, y_target @@ -667,16 +649,65 @@ def setvars(gene,drug): yc2 = Counter(y_bts) yc2_ratio = yc2[0]/yc2[1] - + + ############################################################################### + #====================================================== + # Determine categorical and numerical features + #====================================================== + numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns + numerical_cols + categorical_cols = X.select_dtypes(include=['object', 'bool']).columns + categorical_cols + + ################################################################################ + # IMPORTANT sanity checks + if len(X.columns) == len(X_evolFN) + len(X_stability_FN) + len(X_affinityFN) + len(X_resprop_FN) + len(X_genomicFN): + print('\nPASS: ML data with input features, training and test generated...' + , '\n\nTotal no. of input features:' , len(X.columns) + , '\n--------No. of numerical features:' , len(numerical_cols) + , '\n--------No. of categorical features:' , len(categorical_cols) + + , '\n\nTotal no. of evolutionary features:' , len(X_evolFN) + + , '\n\nTotal no. of stability features:' , len(X_stability_FN) + , '\n--------Common stabilty cols:' , len(X_common_stability_Fnum) + , '\n--------Foldx cols:' , len(X_foldX_Fnum) + + , '\n\nTotal no. of affinity features:' , len(X_affinityFN) + , '\n--------Common affinity cols:' , len(common_affinity_Fnum) + , '\n--------Gene specific affinity cols:' , len(gene_affinity_colnames) + + , '\n\nTotal no. of residue level features:', len(X_resprop_FN) + , '\n--------AA index cols:' , len(X_aaindex_Fnum) + , '\n--------Residue Prop cols:' , len(X_str_Fnum) + , '\n--------AA change Prop cols:' , len(X_aap_Fcat) + + , '\n\nTotal no. of genomic features:' , len(X_genomicFN) + , '\n--------MAF+OR cols:' , len(X_gn_mafor_Fnum) + , '\n--------Lineage cols:' , len(X_gn_linegae_Fnum) + , '\n--------Other cols:' , len(X_gn_Fcat) + ) + else: + print('\nFAIL: numbers mismatch' + , '\nExpected:',len(X_evolFN) + len(X_stability_FN) + len(X_affinityFN) + len(X_resprop_FN) + len(X_genomicFN) + , '\nGot:', len(X.columns)) + sys.exit() + ############################################################################### print('\n-------------------------------------------------------------' - , '\nSuccessfully split data with stratification according to scaling law [COMPLETE data]: 1/sqrt(x_ncols)' - , '\nInput features data size:', x_features.shape - , '\nTrain data size:', X.shape - , '\nTest data size:', X_bts.shape + , '\nSuccessfully split data: ALL features' + , '\nactual values: training set' + , '\nSplit:', tts_split + #, '\nimputed values: blind test set' + + , '\n\nTotal data size:', len(X) + len(X_bts) + + , '\n\nTrain data size:', X.shape , '\ny_train numbers:', yc1 - , '\ny_train ratio:',yc1_ratio - , '\n' + + , '\n\nTest data size:', X_bts.shape , '\ny_test_numbers:', yc2 + + , '\n\ny_train ratio:',yc1_ratio , '\ny_test ratio:', yc2_ratio , '\n-------------------------------------------------------------' ) @@ -775,3 +806,8 @@ def setvars(gene,drug): ############################################################################### # TODO: Find over and undersampling JUST for categorical data + ########################################################################### + + print('\n#################################################################' + , '\nDim of X for gene:', gene.lower(), '\n', X.shape + , '\n###############################################################') diff --git a/scripts/ml/ml_data_sl.py b/scripts/ml/ml_data_sl.py index e850ab5..ffa866b 100644 --- a/scripts/ml/ml_data_sl.py +++ b/scripts/ml/ml_data_sl.py @@ -34,7 +34,11 @@ def setvars(gene,drug): from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold from sklearn.pipeline import Pipeline, make_pipeline + import argparse + import re #%% GLOBALS + tts_split = "sl" + rs = {'random_state': 42} njobs = {'n_jobs': 10} @@ -56,12 +60,10 @@ def setvars(gene,drug): , **rs) mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)} - jacc_score_fn = {'jcc': make_scorer(jaccard_score)} - + jacc_score_fn = {'jcc': make_scorer(jaccard_score)} #%% FOR LATER: Combine ED logo data ########################################################################### - rs = {'random_state': 42} - njobs = {'n_jobs': 10} + homedir = os.path.expanduser("~") geneL_basic = ['pnca'] @@ -422,118 +424,31 @@ def setvars(gene,drug): #========================== my_df_ml = my_df.copy() - #%% Build X: input for ML - common_cols_stabiltyN = ['ligand_distance' - , 'ligand_affinity_change' - , 'duet_stability_change' - , 'ddg_foldx' - , 'deepddg' - , 'ddg_dynamut2' - , 'mmcsm_lig' - , 'contacts'] - - # Build stability columns ~ gene + # Build column names to mask for affinity chanhes if gene.lower() in geneL_basic: - X_stabilityN = common_cols_stabiltyN + #X_stabilityN = common_cols_stabiltyN + gene_affinity_colnames = []# not needed as its the common ones cols_to_mask = ['ligand_affinity_change'] if gene.lower() in geneL_ppi2: - # X_stabilityN = common_cols_stabiltyN + ['mcsm_ppi2_affinity' , 'interface_dist'] - geneL_ppi2_st_cols = ['mcsm_ppi2_affinity', 'interface_dist'] - X_stabilityN = common_cols_stabiltyN + geneL_ppi2_st_cols + gene_affinity_colnames = ['mcsm_ppi2_affinity', 'interface_dist'] + #X_stabilityN = common_cols_stabiltyN + geneL_ppi2_st_cols cols_to_mask = ['ligand_affinity_change', 'mcsm_ppi2_affinity'] if gene.lower() in geneL_na: - # X_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] - geneL_na_st_cols = ['mcsm_na_affinity'] - X_stabilityN = common_cols_stabiltyN + geneL_na_st_cols + gene_affinity_colnames = ['mcsm_na_affinity'] + #X_stabilityN = common_cols_stabiltyN + geneL_na_st_cols cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity'] if gene.lower() in geneL_na_ppi2: - # X_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] - geneL_na_ppi2_st_cols = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] - X_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols + gene_affinity_colnames = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] + #X_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity', 'mcsm_ppi2_affinity'] - - X_foldX_cols = [ 'electro_rr', 'electro_mm', 'electro_sm', 'electro_ss' - , 'disulfide_rr', 'disulfide_mm', 'disulfide_sm', 'disulfide_ss' - , 'hbonds_rr', 'hbonds_mm', 'hbonds_sm', 'hbonds_ss' - , 'partcov_rr', 'partcov_mm', 'partcov_sm', 'partcov_ss' - , 'vdwclashes_rr', 'vdwclashes_mm', 'vdwclashes_sm', 'vdwclashes_ss' - , 'volumetric_rr', 'volumetric_mm', 'volumetric_ss' - ] - - X_str = ['rsa' - #, 'asa' - , 'kd_values' - , 'rd_values'] - - X_ssFN = X_stabilityN + X_str + X_foldX_cols - - X_evolFN = ['consurf_score' - , 'snap2_score' - , 'provean_score'] - - X_genomic_mafor = ['maf' - , 'logorI' - # , 'or_rawI' - # , 'or_mychisq' - # , 'or_logistic' - # , 'or_fisher' - # , 'pval_fisher' - ] - - X_genomic_linegae = ['lineage_proportion' - , 'dist_lineage_proportion' - #, 'lineage' # could be included as a category but it has L2;L4 formatting - , 'lineage_count_all' - , 'lineage_count_unique' - ] - - X_genomicFN = X_genomic_mafor + X_genomic_linegae - - X_aaindexFN = list(aa_df_cols) - - print('\nTotal no. of features for aaindex:', len(X_aaindexFN)) - - # numerical feature names - numerical_FN = X_ssFN + X_evolFN + X_genomicFN + X_aaindexFN - - # categorical feature names - categorical_FN = ['ss_class' - # , 'wt_prop_water' - # , 'mut_prop_water' - # , 'wt_prop_polarity' - # , 'mut_prop_polarity' - # , 'wt_calcprop' - # , 'mut_calcprop' - , 'aa_prop_change' - , 'electrostatics_change' - , 'polarity_change' - , 'water_change' - , 'drtype_mode_labels' # beware then you can't use it to predict [USED it for uq_v1, not v2] - , 'active_site' #[didn't use it for uq_v1] - #, 'gene_name' # will be required for the combined stuff - ] - - #---------------------------------------------- - # count numerical and categorical features - #---------------------------------------------- - - print('\nNo. of numerical features:', len(numerical_FN) - , '\nNo. of categorical features:', len(categorical_FN)) - #======================= # Masking columns: # (mCSM-lig, mCSM-NA, mCSM-ppi2) values for lig_dist >10 #======================= - # my_df_ml['mutationinformation'][my_df['ligand_distance']>10].value_counts() - # my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts() - - # my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), 'ligand_affinity_change'] = 0 - # (my_df_ml['ligand_affinity_change'] == 0).sum() - my_df_ml['mutationinformation'][my_df_ml['ligand_distance']>10].value_counts() my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts() my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask].value_counts() @@ -544,22 +459,152 @@ def setvars(gene,drug): mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask] + #=================================================== # write file for check mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True) - mask_check.to_csv(outdir + 'ml/' + gene.lower() + '_mask_check.csv') + mask_check.to_csv(outdir + 'ml/' + gene.lower() + '_mask_check.csv') + #=================================================== + ############################################################################### + #%% Feature groups (FG): Build X for Input ML + ############################################################################ + #=========================== + # FG1: Evolutionary features + #=========================== + X_evolFN = ['consurf_score' + , 'snap2_score' + , 'provean_score'] + + ############################################################################### + #======================== + # FG2: Stability features + #======================== + #-------- + # common + #-------- + X_common_stability_Fnum = [ + 'duet_stability_change' + , 'ddg_foldx' + , 'deepddg' + , 'ddg_dynamut2' + , 'contacts'] + #-------- + # FoldX + #-------- + X_foldX_Fnum = [ 'electro_rr', 'electro_mm', 'electro_sm', 'electro_ss' + , 'disulfide_rr', 'disulfide_mm', 'disulfide_sm', 'disulfide_ss' + , 'hbonds_rr', 'hbonds_mm', 'hbonds_sm', 'hbonds_ss' + , 'partcov_rr', 'partcov_mm', 'partcov_sm', 'partcov_ss' + , 'vdwclashes_rr', 'vdwclashes_mm', 'vdwclashes_sm', 'vdwclashes_ss' + , 'volumetric_rr', 'volumetric_mm', 'volumetric_ss'] + + X_stability_FN = X_common_stability_Fnum + X_foldX_Fnum + + ############################################################################### + #=================== + # FG3: Affinity features + #=================== + common_affinity_Fnum = ['ligand_distance' + , 'ligand_affinity_change' + , 'mmcsm_lig'] + + # if gene.lower() in geneL_basic: + # X_affinityFN = common_affinity_Fnum + # else: + # X_affinityFN = common_affinity_Fnum + gene_affinity_colnames + + X_affinityFN = common_affinity_Fnum + gene_affinity_colnames + + ############################################################################### + #============================ + # FG4: Residue level features + #============================ + #----------- + # AA index + #----------- + X_aaindex_Fnum = list(aa_df_cols) + print('\nTotal no. of features for aaindex:', len(X_aaindex_Fnum)) + + #----------------- + # surface area + # depth + # hydrophobicity + #----------------- + X_str_Fnum = ['rsa' + #, 'asa' + , 'kd_values' + , 'rd_values'] + + #--------------------------- + # Other aa properties + # active site indication + #--------------------------- + X_aap_Fcat = ['ss_class' + # , 'wt_prop_water' + # , 'mut_prop_water' + # , 'wt_prop_polarity' + # , 'mut_prop_polarity' + # , 'wt_calcprop' + # , 'mut_calcprop' + , 'aa_prop_change' + , 'electrostatics_change' + , 'polarity_change' + , 'water_change' + , 'active_site'] - ##################################################################### + X_resprop_FN = X_aaindex_Fnum + X_str_Fnum + X_aap_Fcat + ############################################################################### + #======================== + # FG5: Genomic features + #======================== + X_gn_mafor_Fnum = ['maf' + #, 'logorI' + # , 'or_rawI' + # , 'or_mychisq' + # , 'or_logistic' + # , 'or_fisher' + # , 'pval_fisher' + ] + + X_gn_linegae_Fnum = ['lineage_proportion' + , 'dist_lineage_proportion' + #, 'lineage' # could be included as a category but it has L2;L4 formatting + , 'lineage_count_all' + , 'lineage_count_unique' + ] + + # X_gn_Fcat = ['drtype_mode_labels' # beware then you can't use it to predict [USED it for uq_v1, not v2] + # #, 'gene_name' # will be required for the combined stuff + # ] + X_gn_Fcat = [] + + X_genomicFN = X_gn_mafor_Fnum + X_gn_linegae_Fnum + X_gn_Fcat + ############################################################################### + #======================== + # FG6 collapsed: Structural : Atability + Affinity + ResidueProp + #======================== + X_structural_FN = X_stability_FN + X_affinityFN + X_resprop_FN + + ############################################################################### + #======================== + # BUILDING all features + #======================== + all_featuresN = X_evolFN + X_structural_FN + X_genomicFN + + ############################################################################### + #%% Define training and test data #================================================================ # Training and BLIND test set: scaling law split - # https://towardsdatascience.com/finally-why-we-use-an-80-20-split-for-training-and-test-data-plus-an-alternative-method-oh-yes-edc77e96295d - + #https://towardsdatascience.com/finally-why-we-use-an-80-20-split-for-training-and-test-data-plus-an-alternative-method-oh-yes-edc77e96295d + # dst with actual values : training set + # dst with imputed values : THROW AWAY [unrepresentative] # test data size ~ 1/sqrt(features NOT including target variable) #================================================================ my_df_ml[drug].isna().sum() + # blind_test_df = my_df_ml[my_df_ml[drug].isna()] # blind_test_df.shape - training_df = my_df_ml[my_df_ml[drug].notna()] + training_df = my_df_ml[my_df_ml[drug].notna()] training_df.shape # Target 1: dst_mode @@ -567,80 +612,14 @@ def setvars(gene,drug): training_df['dst_mode'].value_counts() #################################################################### - -############################################################################### -############################################################################### - # #%% extracting dfs based on numerical, categorical column names - # #---------------------------------- - # # WITHOUT the target var included - # #---------------------------------- - # num_df = training_df[numerical_FN] - # num_df.shape - - # cat_df = training_df[categorical_FN] - # cat_df.shape - - # all_df = training_df[numerical_FN + categorical_FN] - # all_df.shape - - # #------------------------------ - # # WITH the target var included: - # #'wtgt': with target - # #------------------------------ - # # drug and dst_mode should be the same thing - # num_df_wtgt = training_df[numerical_FN + ['dst_mode']] - # num_df_wtgt.shape - - # cat_df_wtgt = training_df[categorical_FN + ['dst_mode']] - # cat_df_wtgt.shape - - # all_df_wtgt = training_df[numerical_FN + categorical_FN + ['dst_mode']] - # all_df_wtgt.shape - - #%%######################################################################## - # #============ - # # ML data: OLD - # #============ - # #------ - # # X: Training and Blind test (BTS) - # #------ - # X = all_df_wtgt[numerical_FN + categorical_FN] # training data ALL - # X_bts = blind_test_df[numerical_FN + categorical_FN] # blind test data ALL - # #X = all_df_wtgt[numerical_FN] # training numerical only - # #X_bts = blind_test_df[numerical_FN] # blind test data numerical - - # #------ - # # y - # #------ - # y = all_df_wtgt['dst_mode'] # training data y - # y_bts = blind_test_df['dst_mode'] # blind data test y - - # #X_bts_wt = blind_test_df[numerical_FN + ['dst_mode']] - - # # Quick check - # #(X['ligand_affinity_change']==0).sum() == (X['ligand_distance']>10).sum() - # for i in range(len(cols_to_mask)): - # ind = i+1 - # print('\nindex:', i, '\nind:', ind) - # print('\nMask count check:' - # , (my_df_ml[cols_to_mask[i]]==0).sum() == (my_df_ml['ligand_distance']>10).sum() - # ) - - # print('Original Data\n', Counter(y) - # , 'Data dim:', X.shape) - -############################################################################### -############################################################################### - #=========================================== - # ML data: Train test split:scaling law + #==================================== + # ML data: Train test split: SL # with stratification # 1-blind test : training_data for CV # 1/sqrt(columns) : blind test #=========================================== - - # features: all_df or - x_features = training_df[numerical_FN + categorical_FN] - y_target = training_df['dst_mode'] + x_features = training_df[all_featuresN] + y_target = training_df['dst_mode'] # sanity check if not 'dst_mode' in x_features.columns: @@ -648,12 +627,15 @@ def setvars(gene,drug): x_ncols = len(x_features.columns) print('\nNo. of columns for x_features:', x_ncols) # NEED It for scaling law split + #https://towardsdatascience.com/finally-why-we-use-an-80-20-split-for-training-and-test-data-plus-an-alternative-method-oh-yes-edc77e96295d else: sys.exit('\nFAIL: x_features has target variable included. FIX it and rerun!') - + #------------------- + # train-test split + #------------------- sl_test_size = 1/np.sqrt(x_ncols) train = 1 - sl_test_size - + #x_train, x_test, y_train, y_test # traditional var_names # so my downstream code doesn't need to change X, X_bts, y, y_bts = train_test_split(x_features, y_target @@ -665,16 +647,65 @@ def setvars(gene,drug): yc2 = Counter(y_bts) yc2_ratio = yc2[0]/yc2[1] - + + ############################################################################### + #====================================================== + # Determine categorical and numerical features + #====================================================== + numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns + numerical_cols + categorical_cols = X.select_dtypes(include=['object', 'bool']).columns + categorical_cols + + ################################################################################ + # IMPORTANT sanity checks + if len(X.columns) == len(X_evolFN) + len(X_stability_FN) + len(X_affinityFN) + len(X_resprop_FN) + len(X_genomicFN): + print('\nPASS: ML data with input features, training and test generated...' + , '\n\nTotal no. of input features:' , len(X.columns) + , '\n--------No. of numerical features:' , len(numerical_cols) + , '\n--------No. of categorical features:' , len(categorical_cols) + + , '\n\nTotal no. of evolutionary features:' , len(X_evolFN) + + , '\n\nTotal no. of stability features:' , len(X_stability_FN) + , '\n--------Common stabilty cols:' , len(X_common_stability_Fnum) + , '\n--------Foldx cols:' , len(X_foldX_Fnum) + + , '\n\nTotal no. of affinity features:' , len(X_affinityFN) + , '\n--------Common affinity cols:' , len(common_affinity_Fnum) + , '\n--------Gene specific affinity cols:' , len(gene_affinity_colnames) + + , '\n\nTotal no. of residue level features:', len(X_resprop_FN) + , '\n--------AA index cols:' , len(X_aaindex_Fnum) + , '\n--------Residue Prop cols:' , len(X_str_Fnum) + , '\n--------AA change Prop cols:' , len(X_aap_Fcat) + + , '\n\nTotal no. of genomic features:' , len(X_genomicFN) + , '\n--------MAF+OR cols:' , len(X_gn_mafor_Fnum) + , '\n--------Lineage cols:' , len(X_gn_linegae_Fnum) + , '\n--------Other cols:' , len(X_gn_Fcat) + ) + else: + print('\nFAIL: numbers mismatch' + , '\nExpected:',len(X_evolFN) + len(X_stability_FN) + len(X_affinityFN) + len(X_resprop_FN) + len(X_genomicFN) + , '\nGot:', len(X.columns)) + sys.exit() + ############################################################################### print('\n-------------------------------------------------------------' - , '\nSuccessfully split data with stratification according to scaling law: 1/sqrt(x_ncols)' - , '\nInput features data size:', x_features.shape - , '\nTrain data size:', X.shape - , '\nTest data size:', sl_test_size, ' ', X_bts.shape + , '\nSuccessfully split data: ALL features' + , '\nactual values: training set' + , '\nSplit:', tts_split + #, '\nimputed values: blind test set' + + , '\n\nTotal data size:', len(X) + len(X_bts) + + , '\n\nTrain data size:', X.shape , '\ny_train numbers:', yc1 - , '\ny_train ratio:',yc1_ratio - , '\n' + + , '\n\nTest data size:', X_bts.shape , '\ny_test_numbers:', yc2 + + , '\n\ny_train ratio:',yc1_ratio , '\ny_test ratio:', yc2_ratio , '\n-------------------------------------------------------------' ) @@ -773,3 +804,8 @@ def setvars(gene,drug): ############################################################################### # TODO: Find over and undersampling JUST for categorical data + ########################################################################### + + print('\n#################################################################' + , '\nDim of X for gene:', gene.lower(), '\n', X.shape + , '\n###############################################################') diff --git a/scripts/ml/run_7030.py b/scripts/ml/run_7030.py index 151a5aa..c737b53 100755 --- a/scripts/ml/run_7030.py +++ b/scripts/ml/run_7030.py @@ -55,9 +55,8 @@ OutFile_suffix = '7030' outdir_ml = outdir + 'ml/tts_7030/' print('\nOutput directory:', outdir_ml) -outFile_wf = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv' -#outFile_lf = outdir_ml + gene.lower() + '_baselineC_ext_' + OutFile_suffix + '.csv' - +#outFile_wf = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv' +outFile_wf = outdir_ml + gene.lower() + '_baselineC_noOR' + OutFile_suffix + '.csv' #%% Running models ############################################################ print('\n#####################################################################\n' , '\nStarting--> Running ML analysis: Baseline modes (No FS)' @@ -92,10 +91,24 @@ paramD = { , 'resampling_type' : 'rouC'} } -# Initial run to get the dict containing CV, BT and metadata DFs -mmD = {} +##============================================================================== +## Dict with no CV BT formatted df +## mmD = {} +## for k, v in paramD.items(): +## # print(mmD[k]) +## scores_7030D = MultModelsCl(**paramD[k] +## , tts_split_type = tts_split_7030 +## , skf_cv = skf_cv +## , blind_test_df = X_bts +## , blind_test_target = y_bts +## , add_cm = True +## , add_yn = True +## , return_formatted_output = False) +## mmD[k] = scores_7030D +##============================================================================== +## Initial run to get the dict of dicts for each sampling type containing CV, BT and metadata DFs +mmDD = {} for k, v in paramD.items(): -# print(mmD[k]) scores_7030D = MultModelsCl(**paramD[k] , tts_split_type = tts_split_7030 , skf_cv = skf_cv @@ -104,23 +117,25 @@ for k, v in paramD.items(): , add_cm = True , add_yn = True , return_formatted_output = True) - mmD[k] = scores_7030D + mmDD[k] = scores_7030D # Extracting the dfs from within the dict and concatenating to output as one df -for k, v in mmD.items(): - out_wf_7030 = pd.concat(mmD, ignore_index = True) +for k, v in mmDD.items(): + out_wf_7030 = pd.concat(mmDD, ignore_index = True) + +out_wf_7030f = out_wf_7030.sort_values(by = ['resampling', 'source_data', 'MCC'], ascending = [True, True, False], inplace = False) print('\n######################################################################' , '\nEnd--> Successfully generated output DF for Multiple classifiers (baseline models)' , '\nGene:', gene.lower() , '\nDrug:', drug , '\noutput file:', outFile_wf - , '\nDim of output:', out_wf_7030.shape + , '\nDim of output:', out_wf_7030f.shape , '\n######################################################################') ############################################################################### #==================== # Write output file #==================== -#out_wf_7030.to_csv(outFile_wf, index = False) +out_wf_7030f.to_csv(outFile_wf, index = False) print('\nFile successfully written:', outFile_wf) ############################################################################### \ No newline at end of file diff --git a/scripts/ml/run_FS.py b/scripts/ml/run_FS.py index b04a4ca..229098c 100755 --- a/scripts/ml/run_FS.py +++ b/scripts/ml/run_FS.py @@ -92,7 +92,7 @@ gene = args.gene #================== # other vars #================== -tts_split = '70/30' +tts_split = '70_30' OutFile_suffix = '7030_FS' ############################################################################### #================== @@ -116,7 +116,8 @@ from FS import fsgs #================== outdir_ml = outdir + 'ml/tts_7030/fs/' print('\nOutput directory:', outdir_ml) -OutFileFS = outdir_ml + gene.lower() + '_FS_' + OutFile_suffix + '.json' +#OutFileFS = outdir_ml + gene.lower() + '_FS' + OutFile_suffix + '.json' +OutFileFS = outdir_ml + gene.lower() + '_FS_noOR' + OutFile_suffix + '.json' ############################################################################ @@ -153,17 +154,17 @@ models = [('AdaBoost Classifier' , AdaBoostClassifier(**rs) ) , ('Extra Tree' , ExtraTreeClassifier(**rs) ) , ('Extra Trees' , ExtraTreesClassifier(**rs) ) , ('Gradient Boosting' , GradientBoostingClassifier(**rs) ) - #, ('Gaussian NB' , GaussianNB() ) - #, ('Gaussian Process' , GaussianProcessClassifier(**rs) ) - #, ('K-Nearest Neighbors' , KNeighborsClassifier() ) + ##, ('Gaussian NB' , GaussianNB() ) + ##, ('Gaussian Process' , GaussianProcessClassifier(**rs) ) + ##, ('K-Nearest Neighbors' , KNeighborsClassifier() ) , ('LDA' , LinearDiscriminantAnalysis() ) , ('Logistic Regression' , LogisticRegression(**rs) ) , ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs)) - #, ('MLP' , MLPClassifier(max_iter = 500, **rs) ) - #, ('Multinomial' , MultinomialNB() ) - #, ('Naive Bayes' , BernoulliNB() ) + ##, ('MLP' , MLPClassifier(max_iter = 500, **rs) ) + ##, ('Multinomial' , MultinomialNB() ) + ##, ('Naive Bayes' , BernoulliNB() ) , ('Passive Aggresive' , PassiveAggressiveClassifier(**rs, **njobs) ) - #, ('QDA' , QuadraticDiscriminantAnalysis() ) + ##, ('QDA' , QuadraticDiscriminantAnalysis() ) , ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000 ) ) , ('Random Forest2' , RandomForestClassifier(min_samples_leaf = 5 , n_estimators = 1000 @@ -174,10 +175,10 @@ models = [('AdaBoost Classifier' , AdaBoostClassifier(**rs) ) , max_features = 'auto') ) , ('Ridge Classifier' , RidgeClassifier(**rs) ) , ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) ) - #, ('SVC' , SVC(**rs) ) + ##, ('SVC' , SVC(**rs) ) , ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) ) - # , ('XGBoost' , XGBClassifier(**rs, **njobs, verbosity = 3 - # , use_label_encoder = False) ) + ## , ('XGBoost' , XGBClassifier(**rs, **njobs, verbosity = 3 + ## , use_label_encoder = False) ) ] print('\n#####################################################################' diff --git a/scripts/ml/running_ml_scripts.txt b/scripts/ml/running_ml_scripts.txt index deb193e..9661aba 100644 --- a/scripts/ml/running_ml_scripts.txt +++ b/scripts/ml/running_ml_scripts.txt @@ -1,11 +1,13 @@ -================================= -# Split: 70/30 +######################################################################## + + #70/30 + +######################################################################## +=-----------------------------------= # All features including AA index -# Date: 22/06/2022 -# captures error: 2>$1 -# omitted drtype_labels -================================= -time ./run_7030.py -g pncA -d pyrazinamide 2>&1 | tee log_pnca_7030.txt +# [WITH OR] +=-----------------------------------= +time ./run_7030.py -g pncA -d pyrazinamide 2>&1 | tee log_pnca_7030.txt #d time ./run_7030.py -g embB -d ethambutol 2>&1 | tee log_embb_7030.txt time ./run_7030.py -g katG -d isoniazid 2>&1 | tee log_katg_7030.txt time ./run_7030.py -g rpoB -d rifampicin 2>&1 | tee log_rpob_7030.txt @@ -14,71 +16,155 @@ time ./run_7030.py -g alr -d cycloserine 2>&1 | tee log_alr_7030.txt # alr: # ERROR, as expected, too few values! # gid: problems -######################################################################## -================================= -# Split: 80/20 +=-----------------------------------= # All features including AA index -# Date: 17/05/2022, 18:48 -# captures error: 2>$1 -================================= +# [WITHOUT OR] **DONE +#------------------------------------= +time ./run_7030.py -g pncA -d pyrazinamide 2>&1 | tee log_pnca_7030_noOR.txt +time ./run_7030.py -g embB -d ethambutol 2>&1 | tee log_embb_7030_noOR.txt +time ./run_7030.py -g katG -d isoniazid 2>&1 | tee log_katg_7030_noOR.txt +time ./run_7030.py -g rpoB -d rifampicin 2>&1 | tee log_rpob_7030_noOR.txt +time ./run_7030.py -g gid -d streptomycin 2>&1 | tee log_gid_7030_noOR.txt +time ./run_7030.py -g alr -d cycloserine 2>&1 | tee log_alr_7030_noOR.txt +######################################################################## + # 80/20 ######################################################################## -================================= -# Split: scaling law +=-----------------------------------= # All features including AA index -# Date: 17/05/2022, 18:48 -# captures error: 2>$1 -================================= +# [WITH OR] +=-----------------------------------= +time ./run_8020.py -g pncA -d pyrazinamide 2>&1 | tee log_pnca_8020.txt +time ./run_8020.py -g embB -d ethambutol 2>&1 | tee log_embb_8020.txt +time ./run_8020.py -g katG -d isoniazid 2>&1 | tee log_katg_8020.txt +time ./run_8020.py -g rpoB -d rifampicin 2>&1 | tee log_rpob_8020.txt +time ./run_8020.py -g gid -d streptomycin 2>&1 | tee log_gid_8020.txt +time ./run_8020.py -g alr -d cycloserine 2>&1 | tee log_alr_8020.txt -######################################################################## -================================= -# Split: REVERSE training -# imputed values: training set -# actual values: blind set +=-----------------------------------= # All features including AA index -# Date: 18/05/2022 -# captures error: 2>$1 -================================= +# [WITHOUT OR] **DONE +real 0m1.099s +user 0m1.308s +sys 0m1.474s +=-----------------------------------= +time ./run_8020.py -g pncA -d pyrazinamide 2>&1 | tee log_pnca_8020_noOR.txt +time ./run_8020.py -g embB -d ethambutol 2>&1 | tee log_embb_8020_noOR.txt +time ./run_8020.py -g katG -d isoniazid 2>&1 | tee log_katg_8020_noOR.txt +time ./run_8020.py -g rpoB -d rifampicin 2>&1 | tee log_rpob_8020_noOR.txt +time ./run_8020.py -g gid -d streptomycin 2>&1 | tee log_gid_8020_noOR.txt +time ./run_8020.py -g alr -d cycloserine 2>&1 | tee log_alr_8020_noOR.txt ######################################################################## -# COMPLETE Data: actual + na i.e imputed + + # SL + ######################################################################## -================================= -# Split: 70/30 [COMPLETE DATA] +=-----------------------------------= # All features including AA index -# Date: 18/05/2022 -# captures error: 2>$1 -================================= +=-----------------------------------= -######################################################################## -================================= -# Split: 80/20 [COMPLETE DATA] + + + + +=-----------------------------------= # All features including AA index -# Date: 18/05/2022 -# captures error: 2>$1 -================================= +# [WITHOUT OR] +=-----------------------------------= +time ./run_sl.py -g pncA -d pyrazinamide 2>&1 | tee log_pnca_sl_noOR.txt +time ./run_sl.py -g embB -d ethambutol 2>&1 | tee log_embb_sl_noOR.txt +time ./run_sl.py -g katG -d isoniazid 2>&1 | tee log_katg_sl_noOR.txt +time ./run_sl.py -g rpoB -d rifampicin 2>&1 | tee log_rpob_sl_noOR.txt +time ./run_sl.py -g gid -d streptomycin 2>&1 | tee log_gid_sl_noOR.txt +time ./run_sl.py -g alr -d cycloserine 2>&1 | tee log_alr_sl_noOR.txt -================================= -# Split: scaling law [COMPLETE DATA] +######################################################################## + + +######################################################################## +######################################################################## +###################### COMPLETE DATA ############################## +######################################################################## +######################################################################## + + +######################################################################## + + #70/30 + +######################################################################## + + +=-----------------------------------= # All features including AA index -# Date: 18/05/2022 -# captures error: 2>$1 -================================= +# [WITHOUT OR] +#------------------------------------= +time ./run_cd_7030.py -g pncA -d pyrazinamide 2>&1 | tee log_pnca_cd_7030_noOR.txt +time ./run_cd_7030.py -g embB -d ethambutol 2>&1 | tee log_embb_cd_7030_noOR.txt +time ./run_cd_7030.py -g katG -d isoniazid 2>&1 | tee log_katg_cd_7030_noOR.txt +time ./run_cd_7030.py -g rpoB -d rifampicin 2>&1 | tee log_rpob_cd_7030_noOR.txt +time ./run_cd_7030.py -g gid -d streptomycin 2>&1 | tee log_gid_cd_7030_noOR.txt +time ./run_cd_7030.py -g alr -d cycloserine 2>&1 | tee log_alr_cd_7030_noOR.txt + + ######################################################################## + + # 80/20 + +######################################################################## + + + +=-----------------------------------= +# All features including AA index +# [WITHOUT OR] +#------------------------------------= +time ./run_cd_8020.py -g pncA -d pyrazinamide 2>&1 | tee log_pnca_cd_8020_noOR.txt +time ./run_cd_8020.py -g embB -d ethambutol 2>&1 | tee log_embb_cd_8020_noOR.txt +time ./run_cd_8020.py -g katG -d isoniazid 2>&1 | tee log_katg_cd_8020_noOR.txt +time ./run_cd_8020.py -g rpoB -d rifampicin 2>&1 | tee log_rpob_cd_8020_noOR.txt +time ./run_cd_8020.py -g gid -d streptomycin 2>&1 | tee log_gid_cd_8020_noOR.txt +time ./run_cd_8020.py -g alr -d cycloserine 2>&1 | tee log_alr_cd_8020_noOR.txt + +######################################################################## + + # SL + +######################################################################## + + + +=-----------------------------------= +# All features including AA index +# [WITHOUT OR] +#------------------------------------= +time ./run_cd_sl.py -g pncA -d pyrazinamide 2>&1 | tee log_pnca_cd_sl_noOR.txt +time ./run_cd_sl.py -g embB -d ethambutol 2>&1 | tee log_embb_cd_sl_noOR.txt +time ./run_cd_sl.py -g katG -d isoniazid 2>&1 | tee log_katg_cd_sl_noOR.txt +time ./run_cd_sl.py -g rpoB -d rifampicin 2>&1 | tee log_rpob_cd_sl_noOR.txt +time ./run_cd_sl.py -g gid -d streptomycin 2>&1 | tee log_gid_cd_sl_noOR.txt +time ./run_cd_sl.py -g alr -d cycloserine 2>&1 | tee log_alr_cd_sl_noOR.txt + + + + +######################################################################## + +######################################################################## +######################################################################## +###################### Feature Selection ########################## ######################################################################## ######################################################################## -# running feature selection -# Split:70/30 +# 7030 time ./run_FS.py -g pncA -d pyrazinamide 2>&1 | tee log_FS_pnca_7030.txt -real 338m26.705s -user 1946m12.173s -sys 189m40.122s +time ./run_FS_7030.py -g pncA -d pyrazinamide 2>&1 | tee log_FS_pnca_7030_noOR.txt diff --git a/scripts/ml/test_MultClfs.py b/scripts/ml/test_MultClfs.py index 99d7798..f337470 100644 --- a/scripts/ml/test_MultClfs.py +++ b/scripts/ml/test_MultClfs.py @@ -100,3 +100,5 @@ mmDF3 = MultModelsCl(input_df = X_smnc #================= # output from function call ProcessMultModelsCl(mmD) +ProcessMultModelsCl(testD) +