#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 20 13:05:23 2022

@author: tanu
"""
import re
import sys
import pandas as pd

# Assumes X, y, X_bts, y_bts, skf_cv, the feature-name lists (e.g. X_genomicFN)
# and MultModelsCl_dissected() are already defined/imported by earlier scripts.

#all_featuresN = X_evolFN + X_structural_FN + X_genomicFN
# X_structural_FN = X_stability_FN + X_affinityFN + X_resprop_FN
# X_resprop_FN = X_aaindex_Fnum + X_str_Fnum + X_aap_Fcat

# Fixed display/sort order for the score types in the results
score_type_ordermapD = { 'mcc'          : 1
                       , 'fscore'       : 2
                       , 'jcc'          : 3
                       , 'precision'    : 4
                       , 'recall'       : 5
                       , 'accuracy'     : 6
                       , 'roc_auc'      : 7
                       , 'TN'           : 8
                       , 'FP'           : 9
                       , 'FN'           : 10
                       , 'TP'           : 11
                       , 'trainingY_neg': 12
                       , 'trainingY_pos': 13
                       , 'blindY_neg'   : 14
                       , 'blindY_pos'   : 15
                       , 'fit_time'     : 16
                       , 'score_time'   : 17
                       }

#==================
# Baseline models
#==================
# cm_di2 = MultModelsCl_dissected(input_df = X
#                                 , target = y
#                                 , var_type = 'mixed'
#                                 , skf_cv = skf_cv
#                                 , blind_test_input_df = X_bts
#                                 , blind_test_target = y_bts
#                                 , add_cm = True
#                                 , add_yn = True)
# baseline_all2 = pd.DataFrame(cm_di2)
# baseline_all2T = baseline_all2.T
# baseline_CTBT2 = baseline_all2T.filter(regex = 'test_.*|bts_.*|TN|FP|FN|TP|.*_neg|.*_pos'
#                                        , axis = 1)

#================
# Stability cols
#================

#================
# Affinity cols
#================

#================
# Residue level
#================

#================
# Genomics
# X_genomicFN
#================
feature_gp_name = 'genomics'

scores_mm_gn = MultModelsCl_dissected(input_df = X[X_genomicFN]
                                      , target = y
                                      , var_type = 'mixed'
                                      , skf_cv = skf_cv
                                      , blind_test_input_df = X_bts[X_genomicFN]
                                      , blind_test_target = y_bts
                                      , add_cm = True
                                      , add_yn = True)

baseline_all_gn = pd.DataFrame(scores_mm_gn)

# Keep only the score rows of interest and move the row names into a column
baseline_GN = baseline_all_gn.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos'
                                     , axis = 0)
baseline_GN = baseline_GN.reset_index()
baseline_GN.rename(columns = {'index': 'original_names'}, inplace = True)

# Indicate whether the row comes from the blind test (BT) or cross-validation (CV)
bt_pattern = re.compile(r'bts_.*')
baseline_GN['data_source'] = baseline_GN.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV'
                                               , axis = 1)

# Strip the data-source prefix to get the bare score type
baseline_GN['score_type'] = baseline_GN['original_names'].str.replace('bts_|test_', '', regex = True)

score_type_uniqueN = set(baseline_GN['score_type'])
cL1 = list(score_type_ordermapD.keys())
cL2 = list(score_type_uniqueN)

# Only sort if every score type in the order map is present in the results
if set(cL1).issubset(cL2):
    print('\nPASS: sorting df by score that is mapped onto the order I want')
    baseline_GN['score_order'] = baseline_GN['score_type'].map(score_type_ordermapD)
    baseline_GN.sort_values(by = ['data_source', 'score_order'], ascending = [True, True], inplace = True)
else:
    sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')

baseline_GN['feature_group'] = feature_gp_name

#-------------
# Blind test
#-------------
baseline_BT = baseline_all_gn.filter(regex = 'bts_', axis = 0)
baseline_BT = baseline_BT.reset_index()
baseline_BT.rename(columns = {'index': 'original_names'}, inplace = True)
baseline_BT['score_type'] = baseline_BT['original_names']
baseline_BT['score_type'] = baseline_BT['score_type'].str.replace('bts_*', '', regex = True)
baseline_BT['data_source'] = 'BT_score'

#--------
# CV
#--------
baseline_CT = baseline_all_gn.filter(regex = '.*_time|test_.*|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
baseline_CT = baseline_CT.reset_index()
baseline_CT.rename(columns = {'index': 'original_names'}, inplace = True)
baseline_CT['score_type'] = baseline_CT['original_names']
baseline_CT['score_type'] = baseline_CT['score_type'].str.replace('test_*', '', regex = True)
baseline_CT['data_source'] = 'CT_score'
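
#-----------------------------------------------------------------------------
# Sketch only (not called below): the BT/CV split-and-relabel steps above
# follow the same pattern, so they could be wrapped in a small helper and
# reused for the other feature groups. The name split_scores and its arguments
# are hypothetical, assuming a wide scores df whose row names carry the
# 'bts_'/'test_' prefixes produced by MultModelsCl_dissected above.
#-----------------------------------------------------------------------------
def split_scores(scores_wideDF, row_regex, strip_prefix, source_label):
    # select the relevant rows by index name, then move the index into a column
    outDF = scores_wideDF.filter(regex = row_regex, axis = 0).reset_index()
    outDF.rename(columns = {'index': 'original_names'}, inplace = True)
    # strip the data-source prefix and record where the scores came from
    outDF['score_type']  = outDF['original_names'].str.replace(strip_prefix, '', regex = True)
    outDF['data_source'] = source_label
    return outDF

# Equivalent to the two blocks above (not run here):
# baseline_BT = split_scores(baseline_all_gn, 'bts_', 'bts_', 'BT_score')
# baseline_CT = split_scores(baseline_all_gn
#                            , '.*_time|test_.*|TN|FP|FN|TP|.*_neg|.*_pos'
#                            , 'test_', 'CT_score')
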
#----------------------
# row bind: CT and BT
#----------------------
if all(baseline_BT.columns == baseline_CT.columns):
    print('\nPASS: Colnames match, proceeding to row bind for data:', feature_gp_name
          , '\nDim of df1 (BT):', baseline_BT.shape
          , '\nDim of df2 (CT):', baseline_CT.shape)
    comb_df_gn = pd.concat([baseline_BT, baseline_CT], axis = 0, ignore_index = True)
    comb_df_gn['feature_group'] = feature_gp_name
    print('\nDim of combined df:', comb_df_gn.shape)
else:
    print('\nFAIL: colnames mismatch, cannot combine')

# A good alternative, but I don't like having to rearrange the columns later:
#frames_tocombine = [baseline_BT, baseline_CT]
#common_cols = list(set.intersection(*(set(df.columns) for df in frames_tocombine)))
#a = pd.concat([df[common_cols] for df in frames_tocombine], ignore_index=True)

###############################################################################
#================
# Evolution
#================
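# Sketch only, not run: the evolution feature group could be scored the same
# way as the genomics block above, assuming X_evolFN (listed in the header
# comment) holds the evolutionary feature column names; the variable names
# scores_mm_ev and baseline_all_ev are placeholders.
# feature_gp_name = 'evolution'
# scores_mm_ev = MultModelsCl_dissected(input_df = X[X_evolFN]
#                                       , target = y
#                                       , var_type = 'mixed'
#                                       , skf_cv = skf_cv
#                                       , blind_test_input_df = X_bts[X_evolFN]
#                                       , blind_test_target = y_bts
#                                       , add_cm = True
#                                       , add_yn = True)
# baseline_all_ev = pd.DataFrame(scores_mm_ev)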