diff --git a/scripts/ml/MultModelsCl_dissected.py b/scripts/ml/MultModelsCl_dissected.py index 6919061..b93ada6 100644 --- a/scripts/ml/MultModelsCl_dissected.py +++ b/scripts/ml/MultModelsCl_dissected.py @@ -305,7 +305,7 @@ def MultModelsCl_dissected(input_df, target, skf_cv mm_skf_scoresD[model_name]['bts_recall'] = round(recall_score(blind_test_target, bts_predict),2) mm_skf_scoresD[model_name]['bts_accuracy'] = round(accuracy_score(blind_test_target, bts_predict),2) mm_skf_scoresD[model_name]['bts_roc_auc'] = round(roc_auc_score(blind_test_target, bts_predict),2) - mm_skf_scoresD[model_name]['bts_jaccard'] = round(jaccard_score(blind_test_target, bts_predict),2) + mm_skf_scoresD[model_name]['bts_jcc'] = round(jaccard_score(blind_test_target, bts_predict),2) #mm_skf_scoresD[model_name]['diff_mcc'] = train_test_diff_MCC return(mm_skf_scoresD) diff --git a/scripts/ml/Mult_dissected_CALL.py b/scripts/ml/Mult_dissected_CALL.py index 229ed92..000e302 100644 --- a/scripts/ml/Mult_dissected_CALL.py +++ b/scripts/ml/Mult_dissected_CALL.py @@ -5,21 +5,47 @@ Created on Mon Jun 20 13:05:23 2022 @author: tanu """ +import re +#all_featuresN = X_evolFN + X_structural_FN + X_genomicFN +# X_structural_FN = X_stability_FN + X_affinityFN + X_resprop_FN +# X_resprop_FN = X_aaindex_Fnum + X_str_Fnum + X_aap_Fcat + + +score_type_ordermapD = { 'mcc' : 1 + , 'fscore' : 2 + , 'jcc' : 3 + , 'precision' : 4 + , 'recall' : 5 + , 'accuracy' : 6 + , 'roc_auc' : 7 + , 'TN' : 8 + , 'FP' : 9 + , 'FN' : 10 + , 'TP' : 11 + , 'trainingY_neg': 12 + , 'trainingY_pos': 13 + , 'blindY_neg' : 14 + , 'blindY_pos' : 15 + , 'fit_time' : 16 + , 'score_time' : 17 + } + + #================== # Baseline models #================== -cm_di2 = MultModelsCl_dissected(input_df = X - , target = y - , var_type = 'mixed' - , skf_cv = skf_cv - , blind_test_input_df = X_bts - , blind_test_target = y_bts - , add_cm = True - , add_yn = True) +# cm_di2 = MultModelsCl_dissected(input_df = X +# , target = y +# , var_type = 
'mixed' +# , skf_cv = skf_cv +# , blind_test_input_df = X_bts +# , blind_test_target = y_bts +# , add_cm = True +# , add_yn = True) -baseline_all2 = pd.DataFrame(cm_di2) -baseline_all2 = baseline_all2.T -baseline_CTBT2 = baseline_all2.filter(regex = 'test_.*|bts_.*|TN|FP|FN|TP|.*_neg|.*_pos' , axis = 1) +# baseline_all2 = pd.DataFrame(cm_di2) +# baseline_all2T = baseline_all2.T +# baseline_CTBT2 = baseline_all2T.filter(regex = 'test_.*|bts_.*|TN|FP|FN|TP|.*_neg|.*_pos' , axis = 1) #================ # Stability cols @@ -52,35 +78,68 @@ scores_mm_gn = MultModelsCl_dissected(input_df = X[X_genomicFN] , add_yn = True) baseline_all_gn = pd.DataFrame(scores_mm_gn) -baseline_CTBT_gn = baseline_all_gn.filter(regex = '.*_time|test_.*|bts_.*|TN|FP|FN|TP|.*_neg|.*_pos' , axis = 0) -baseline_CTBT_gn['feature_group'] = feature_gp_name +baseline_GN = baseline_all_gn.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0) +baseline_GN = baseline_GN.reset_index() +baseline_GN.rename(columns = {'index': 'original_names'}, inplace = True) +# Indicate whether BT or CT +bt_pattern = re.compile(r'bts_.*') +baseline_GN['data_source'] = baseline_GN.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1) -baseline_CT = baseline_CTBT_gn.filter(regex = '.*_time|test_.*|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0) +baseline_GN['score_type'] = baseline_GN['original_names'].str.replace('bts_|test_', '', regex = True) -baseline_CT = baseline_CT.reset_index() -baseline_CT.rename(columns = {'index': 'original_index'}, inplace = True) -baseline_CT['score_type'] = baseline_CT['original_index'] -baseline_CT['score_type'] = baseline_CT['score_type'].str.replace('test_*', '', regex = True) -baseline_CT['data_source'] = 'CT_score' +score_type_uniqueN = set(baseline_GN['score_type']) +cL1 = list(score_type_ordermapD.keys()) +cL2 = list(score_type_uniqueN) - -baseline_BT = baseline_CTBT_gn.filter(regex = 'bts_', axis = 0) +if set(cL1).issubset(cL2): + 
print('\nPASS: sorting df by score that is mapped onto the order I want') + baseline_GN['score_order'] = baseline_GN['score_type'].map(score_type_ordermapD) + baseline_GN.sort_values(by = ['data_source', 'score_order'], ascending = [True, True], inplace = True) +else: + sys.exit('\nFAIL: could not sort df as score mapping for ordering failed') + +baseline_GN['feature_group'] = feature_gp_name + +#------------- +# Blind test +#------------- +baseline_BT = baseline_all_gn.filter(regex = 'bts_', axis = 0) baseline_BT = baseline_BT.reset_index() -baseline_BT.rename(columns = {'index': 'original_index'}, inplace = True) -baseline_BT['score_type'] = baseline_BT['original_index'] +baseline_BT.rename(columns = {'index': 'original_names'}, inplace = True) +baseline_BT['score_type'] = baseline_BT['original_names'] baseline_BT['score_type'] = baseline_BT['score_type'].str.replace('bts_*', '', regex = True) baseline_BT['data_source'] = 'BT_score' -# rpow bind -if all(baseline_CT.columns == baseline_BT.columns): - print('\nPASS:colnames match, proceeding to rowbind') - comb_df = pd.concat([baseline_BT,baseline_CT], axis = 0, ignore_index = True ) - -baseline_CT -baseline_CT +#-------- +# CV +#-------- +baseline_CT = baseline_all_gn.filter(regex = '.*_time|test_.*|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0) +baseline_CT = baseline_CT.reset_index() +baseline_CT.rename(columns = {'index': 'original_names'}, inplace = True) +baseline_CT['score_type'] = baseline_CT['original_names'] +baseline_CT['score_type'] = baseline_CT['score_type'].str.replace('test_*', '', regex = True) +baseline_CT['data_source'] = 'CT_score' +#---------------------- +# row bind: CT and BT +#---------------------- +if all(baseline_BT.columns == baseline_CT.columns): + print('\nPASS: Colnames match, proceeding to row bind for data:', feature_gp_name + , '\nDim of df1 (BT):', baseline_BT.shape + , '\nDim of df2 (CT):', baseline_CT.shape) + comb_df_gn = pd.concat([baseline_BT, baseline_CT], axis = 0, ignore_index = 
True) + comb_df_gn['feature_group'] = feature_gp_name + print('\nDim of combined df:', comb_df_gn.shape) +else: + print('\nFAIL: colnames mismatch, cannot combine') + +# good way but I don't like to have to rearrange the columns later +#frames_tocombine = [baseline_BT, baseline_CT] +#common_cols = list(set.intersection(*(set(df.columns) for df in frames_tocombine))) +#a = pd.concat([df[common_cols] for df in frames_tocombine], ignore_index=True) +############################################################################### #================ # Evolution #================ diff --git a/scripts/ml/ml_data_dissected.py b/scripts/ml/ml_data_dissected.py index 12ea9b1..a589449 100644 --- a/scripts/ml/ml_data_dissected.py +++ b/scripts/ml/ml_data_dissected.py @@ -578,9 +578,15 @@ X_gn_Fcat = ['drtype_mode_labels' # beware then you can't use it to predict [US X_genomicFN = X_gn_mafor_Fnum + X_gn_linegae_Fnum + X_gn_Fcat ############################################################################### -# Feature groups further collaps: +#======================== +# FG6 collapsed: Structural : Stability + Affinity + ResidueProp +#======================== X_structural_FN = X_stability_FN + X_affinityFN + X_resprop_FN +############################################################################### +#======================== +# BUILDING all features +#======================== all_featuresN = X_evolFN + X_structural_FN + X_genomicFN ############################################################################### @@ -662,7 +668,7 @@ if len(X.columns) == len(X_evolFN) + len(X_stability_FN) + len(X_affinityFN) + l , '\n\nTotal no. of affinity features:' , len(X_affinityFN) , '\n--------Common affinity cols:' , len(common_affinity_Fnum) - , '\n--------Gene specific affinity cols:' , len(gene_affinity_colnames) + , '\n--------Gene specific affinity cols:' , len(gene_affinity_colnames) , '\n\nTotal no. 
of residue level features:', len(X_resprop_FN) , '\n--------AA index cols:' , len(X_aaindex_Fnum) diff --git a/scripts/ml/pnca_config_dissected.py b/scripts/ml/pnca_config_dissected.py index 24367d3..dafaff2 100644 --- a/scripts/ml/pnca_config_dissected.py +++ b/scripts/ml/pnca_config_dissected.py @@ -49,32 +49,11 @@ print('\nOutput directory:', outdir_ml) #%%########################################################################### print('\n================================================================\n') - , '\n\nTotal no. of evolutionary features:' , len(X_evolFN) - - , '\n\nTotal no. of stability features:' , len(X_stability_FN) - , '\n--------Common stabilty cols:' , len(X_common_stability_Fnum) - , '\n--------Foldx cols:' , len(X_foldX_Fnum) - - , '\n\nTotal no. of affinity features:' , len(X_affinityFN) - , '\n--------Common affinity cols:' , len(common_affinity_Fnum) - , '\n--------Gene specific affinity cols:' , len(gene_affinity_colnames) - - , '\n\nTotal no. of residue level features:', len(X_resprop_FN) - , '\n--------AA index cols:' , len(X_aaindex_Fnum) - , '\n--------Residue Prop cols:' , len(X_str_Fnum) - , '\n--------AA change Prop cols:' , len(X_aap_Fcat) - - , '\n\nTotal no. of genomic features:' , len(X_genomicFN) - , '\n--------MAF+OR cols:' , len(X_gn_mafor_Fnum) - , '\n--------Lineage cols:' , len(X_gn_linegae_Fnum) - , '\n--------Other cols:' , len(X_gn_Fcat) X_structural_FN = X_stability_FN + X_affinityFN + X_resprop_FN X_aaindex_Fnum + X_str_Fnum + X_aap_Fcat all_featuresN = X_evolFN + X_structural_FN + X_genomicFN -############################################################################### - print('\n================================================================' , '\nTotal Evolutionary features (n):' , len(X_evolFN)