Adding formatting to get all output from ML for feature groups, starting with genomics
This commit is contained in:
parent
cadaed2ba7
commit
7b378ca6f3
4 changed files with 98 additions and 54 deletions
|
@ -305,7 +305,7 @@ def MultModelsCl_dissected(input_df, target, skf_cv
|
||||||
mm_skf_scoresD[model_name]['bts_recall'] = round(recall_score(blind_test_target, bts_predict),2)
|
mm_skf_scoresD[model_name]['bts_recall'] = round(recall_score(blind_test_target, bts_predict),2)
|
||||||
mm_skf_scoresD[model_name]['bts_accuracy'] = round(accuracy_score(blind_test_target, bts_predict),2)
|
mm_skf_scoresD[model_name]['bts_accuracy'] = round(accuracy_score(blind_test_target, bts_predict),2)
|
||||||
mm_skf_scoresD[model_name]['bts_roc_auc'] = round(roc_auc_score(blind_test_target, bts_predict),2)
|
mm_skf_scoresD[model_name]['bts_roc_auc'] = round(roc_auc_score(blind_test_target, bts_predict),2)
|
||||||
mm_skf_scoresD[model_name]['bts_jaccard'] = round(jaccard_score(blind_test_target, bts_predict),2)
|
mm_skf_scoresD[model_name]['bts_jcc'] = round(jaccard_score(blind_test_target, bts_predict),2)
|
||||||
#mm_skf_scoresD[model_name]['diff_mcc'] = train_test_diff_MCC
|
#mm_skf_scoresD[model_name]['diff_mcc'] = train_test_diff_MCC
|
||||||
|
|
||||||
return(mm_skf_scoresD)
|
return(mm_skf_scoresD)
|
||||||
|
|
|
@ -5,21 +5,47 @@ Created on Mon Jun 20 13:05:23 2022
|
||||||
|
|
||||||
@author: tanu
|
@author: tanu
|
||||||
"""
|
"""
|
||||||
|
import re
|
||||||
|
#all_featuresN = X_evolFN + X_structural_FN + X_genomicFN
|
||||||
|
# X_structural_FN = X_stability_FN + X_affinityFN + X_resprop_FN
|
||||||
|
# X_resprop_FN = X_aaindex_Fnum + X_str_Fnum + X_aap_Fcat
|
||||||
|
|
||||||
|
|
||||||
|
score_type_ordermapD = { 'mcc' : 1
|
||||||
|
, 'fscore' : 2
|
||||||
|
, 'jcc' : 3
|
||||||
|
, 'precision' : 4
|
||||||
|
, 'recall' : 5
|
||||||
|
, 'accuracy' : 6
|
||||||
|
, 'roc_auc' : 7
|
||||||
|
, 'TN' : 8
|
||||||
|
, 'FP' : 9
|
||||||
|
, 'FN' : 10
|
||||||
|
, 'TP' : 11
|
||||||
|
, 'trainingY_neg': 12
|
||||||
|
, 'trainingY_pos': 13
|
||||||
|
, 'blindY_neg' : 14
|
||||||
|
, 'blindY_pos' : 15
|
||||||
|
, 'fit_time' : 16
|
||||||
|
, 'score_time' : 17
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
#==================
|
#==================
|
||||||
# Baseline models
|
# Baseline models
|
||||||
#==================
|
#==================
|
||||||
cm_di2 = MultModelsCl_dissected(input_df = X
|
# cm_di2 = MultModelsCl_dissected(input_df = X
|
||||||
, target = y
|
# , target = y
|
||||||
, var_type = 'mixed'
|
# , var_type = 'mixed'
|
||||||
, skf_cv = skf_cv
|
# , skf_cv = skf_cv
|
||||||
, blind_test_input_df = X_bts
|
# , blind_test_input_df = X_bts
|
||||||
, blind_test_target = y_bts
|
# , blind_test_target = y_bts
|
||||||
, add_cm = True
|
# , add_cm = True
|
||||||
, add_yn = True)
|
# , add_yn = True)
|
||||||
|
|
||||||
baseline_all2 = pd.DataFrame(cm_di2)
|
# baseline_all2 = pd.DataFrame(cm_di2)
|
||||||
baseline_all2 = baseline_all2.T
|
# baseline_all2T = baseline_all2.T
|
||||||
baseline_CTBT2 = baseline_all2.filter(regex = 'test_.*|bts_.*|TN|FP|FN|TP|.*_neg|.*_pos' , axis = 1)
|
# baseline_CTBT2 = baseline_all2T.filter(regex = 'test_.*|bts_.*|TN|FP|FN|TP|.*_neg|.*_pos' , axis = 1)
|
||||||
|
|
||||||
#================
|
#================
|
||||||
# Stability cols
|
# Stability cols
|
||||||
|
@ -52,35 +78,68 @@ scores_mm_gn = MultModelsCl_dissected(input_df = X[X_genomicFN]
|
||||||
, add_yn = True)
|
, add_yn = True)
|
||||||
|
|
||||||
baseline_all_gn = pd.DataFrame(scores_mm_gn)
|
baseline_all_gn = pd.DataFrame(scores_mm_gn)
|
||||||
baseline_CTBT_gn = baseline_all_gn.filter(regex = '.*_time|test_.*|bts_.*|TN|FP|FN|TP|.*_neg|.*_pos' , axis = 0)
|
|
||||||
baseline_CTBT_gn['feature_group'] = feature_gp_name
|
|
||||||
|
|
||||||
|
baseline_GN = baseline_all_gn.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
|
||||||
|
baseline_GN = baseline_GN.reset_index()
|
||||||
|
baseline_GN.rename(columns = {'index': 'original_names'}, inplace = True)
|
||||||
|
|
||||||
|
# Indicate whether BT or CT
|
||||||
|
bt_pattern = re.compile(r'bts_.*')
|
||||||
|
baseline_GN['data_source'] = baseline_GN.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
|
||||||
|
|
||||||
baseline_CT = baseline_CTBT_gn.filter(regex = '.*_time|test_.*|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
|
baseline_GN['score_type'] = baseline_GN['original_names'].str.replace('bts_|test_', '', regex = True)
|
||||||
|
|
||||||
baseline_CT = baseline_CT.reset_index()
|
score_type_uniqueN = set(baseline_GN['score_type'])
|
||||||
baseline_CT.rename(columns = {'index': 'original_index'}, inplace = True)
|
cL1 = list(score_type_ordermapD.keys())
|
||||||
baseline_CT['score_type'] = baseline_CT['original_index']
|
cL2 = list(score_type_uniqueN)
|
||||||
baseline_CT['score_type'] = baseline_CT['score_type'].str.replace('test_*', '', regex = True)
|
|
||||||
baseline_CT['data_source'] = 'CT_score'
|
|
||||||
|
|
||||||
|
if set(cL1).issubset(cL2):
|
||||||
|
print('\nPASS: sorting df by score that is mapped onto the order I want')
|
||||||
|
baseline_GN['score_order'] = baseline_GN['score_type'].map(score_type_ordermapD)
|
||||||
|
baseline_GN.sort_values(by = ['data_source', 'score_order'], ascending = [True, True], inplace = True)
|
||||||
|
else:
|
||||||
|
sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
|
||||||
|
|
||||||
baseline_BT = baseline_CTBT_gn.filter(regex = 'bts_', axis = 0)
|
baseline_GN['feature_group'] = feature_gp_name
|
||||||
|
|
||||||
|
#-------------
|
||||||
|
# Blind test
|
||||||
|
#-------------
|
||||||
|
baseline_BT = baseline_all_gn.filter(regex = 'bts_', axis = 0)
|
||||||
baseline_BT = baseline_BT.reset_index()
|
baseline_BT = baseline_BT.reset_index()
|
||||||
baseline_BT.rename(columns = {'index': 'original_index'}, inplace = True)
|
baseline_BT.rename(columns = {'index': 'original_names'}, inplace = True)
|
||||||
baseline_BT['score_type'] = baseline_BT['original_index']
|
baseline_BT['score_type'] = baseline_BT['original_names']
|
||||||
baseline_BT['score_type'] = baseline_BT['score_type'].str.replace('bts_*', '', regex = True)
|
baseline_BT['score_type'] = baseline_BT['score_type'].str.replace('bts_*', '', regex = True)
|
||||||
baseline_BT['data_source'] = 'BT_score'
|
baseline_BT['data_source'] = 'BT_score'
|
||||||
|
|
||||||
# row bind
|
#--------
|
||||||
if all(baseline_CT.columns == baseline_BT.columns):
|
# CV
|
||||||
print('\nPASS:colnames match, proceeding to rowbind')
|
#--------
|
||||||
comb_df = pd.concat([baseline_BT,baseline_CT], axis = 0, ignore_index = True )
|
baseline_CT = baseline_all_gn.filter(regex = '.*_time|test_.*|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
|
||||||
|
baseline_CT = baseline_CT.reset_index()
|
||||||
|
baseline_CT.rename(columns = {'index': 'original_names'}, inplace = True)
|
||||||
|
baseline_CT['score_type'] = baseline_CT['original_names']
|
||||||
|
baseline_CT['score_type'] = baseline_CT['score_type'].str.replace('test_*', '', regex = True)
|
||||||
|
baseline_CT['data_source'] = 'CT_score'
|
||||||
|
|
||||||
baseline_CT
|
#----------------------
|
||||||
baseline_CT
|
# row bind: CT and BT
|
||||||
|
#----------------------
|
||||||
|
if all(baseline_BT.columns == baseline_CT.columns):
|
||||||
|
print('\nPASS: Colnames match, proceeding to row bind for data:', feature_gp_name
|
||||||
|
, '\nDim of df1 (BT):', baseline_BT.shape
|
||||||
|
, '\nDim of df2 (CT):', baseline_CT.shape)
|
||||||
|
comb_df_gn = pd.concat([baseline_BT, baseline_CT], axis = 0, ignore_index = True)
|
||||||
|
comb_df_gn['feature_group'] = feature_gp_name
|
||||||
|
print('\nDim of combined df:', comb_df_gn.shape)
|
||||||
|
else:
|
||||||
|
print('\nFAIL: colnames mismatch, cannot combine')
|
||||||
|
|
||||||
|
# good way but I don't like to have to rearrange the columns later
|
||||||
|
#frames_tocombine = [baseline_BT, baseline_CT]
|
||||||
|
#common_cols = list(set.intersection(*(set(df.columns) for df in frames_tocombine)))
|
||||||
|
#a = pd.concat([df[common_cols] for df in frames_tocombine], ignore_index=True)
|
||||||
|
###############################################################################
|
||||||
#================
|
#================
|
||||||
# Evolution
|
# Evolution
|
||||||
#================
|
#================
|
||||||
|
|
|
@ -578,9 +578,15 @@ X_gn_Fcat = ['drtype_mode_labels' # beware then you can't use it to predict [US
|
||||||
|
|
||||||
X_genomicFN = X_gn_mafor_Fnum + X_gn_linegae_Fnum + X_gn_Fcat
|
X_genomicFN = X_gn_mafor_Fnum + X_gn_linegae_Fnum + X_gn_Fcat
|
||||||
###############################################################################
|
###############################################################################
|
||||||
# Feature groups further collaps:
|
#========================
|
||||||
|
# FG6 collapsed: Structural : Stability + Affinity + ResidueProp
|
||||||
|
#========================
|
||||||
X_structural_FN = X_stability_FN + X_affinityFN + X_resprop_FN
|
X_structural_FN = X_stability_FN + X_affinityFN + X_resprop_FN
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
#========================
|
||||||
|
# BUILDING all features
|
||||||
|
#========================
|
||||||
all_featuresN = X_evolFN + X_structural_FN + X_genomicFN
|
all_featuresN = X_evolFN + X_structural_FN + X_genomicFN
|
||||||
|
|
||||||
###############################################################################
|
###############################################################################
|
||||||
|
|
|
@ -49,32 +49,11 @@ print('\nOutput directory:', outdir_ml)
|
||||||
#%%###########################################################################
|
#%%###########################################################################
|
||||||
print('\n================================================================\n')
|
print('\n================================================================\n')
|
||||||
|
|
||||||
, '\n\nTotal no. of evolutionary features:' , len(X_evolFN)
|
|
||||||
|
|
||||||
, '\n\nTotal no. of stability features:' , len(X_stability_FN)
|
|
||||||
, '\n--------Common stabilty cols:' , len(X_common_stability_Fnum)
|
|
||||||
, '\n--------Foldx cols:' , len(X_foldX_Fnum)
|
|
||||||
|
|
||||||
, '\n\nTotal no. of affinity features:' , len(X_affinityFN)
|
|
||||||
, '\n--------Common affinity cols:' , len(common_affinity_Fnum)
|
|
||||||
, '\n--------Gene specific affinity cols:' , len(gene_affinity_colnames)
|
|
||||||
|
|
||||||
, '\n\nTotal no. of residue level features:', len(X_resprop_FN)
|
|
||||||
, '\n--------AA index cols:' , len(X_aaindex_Fnum)
|
|
||||||
, '\n--------Residue Prop cols:' , len(X_str_Fnum)
|
|
||||||
, '\n--------AA change Prop cols:' , len(X_aap_Fcat)
|
|
||||||
|
|
||||||
, '\n\nTotal no. of genomic features:' , len(X_genomicFN)
|
|
||||||
, '\n--------MAF+OR cols:' , len(X_gn_mafor_Fnum)
|
|
||||||
, '\n--------Lineage cols:' , len(X_gn_linegae_Fnum)
|
|
||||||
, '\n--------Other cols:' , len(X_gn_Fcat)
|
|
||||||
|
|
||||||
X_structural_FN = X_stability_FN + X_affinityFN + X_resprop_FN
|
X_structural_FN = X_stability_FN + X_affinityFN + X_resprop_FN
|
||||||
X_aaindex_Fnum + X_str_Fnum + X_aap_Fcat
|
X_aaindex_Fnum + X_str_Fnum + X_aap_Fcat
|
||||||
all_featuresN = X_evolFN + X_structural_FN + X_genomicFN
|
all_featuresN = X_evolFN + X_structural_FN + X_genomicFN
|
||||||
|
|
||||||
###############################################################################
|
|
||||||
|
|
||||||
print('\n================================================================'
|
print('\n================================================================'
|
||||||
|
|
||||||
, '\nTotal Evolutionary features (n):' , len(X_evolFN)
|
, '\nTotal Evolutionary features (n):' , len(X_evolFN)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue