saving work

This commit is contained in:
Tanushree Tunstall 2022-06-21 18:12:31 +01:00
parent 7b378ca6f3
commit 137f19a285
5 changed files with 1289 additions and 1102 deletions


@@ -41,6 +41,9 @@ from sklearn.compose import make_column_transformer
from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report
# added
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, LeaveOneOut, KFold, RepeatedKFold, cross_val_predict
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold, RepeatedKFold
@@ -69,18 +72,20 @@ from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator
from sklearn.impute import KNNImputer as KNN
import json
import argparse
import re
#%% GLOBALS
rs = {'random_state': 42}
njobs = {'n_jobs': 10}
scoring_fn = ({ 'mcc' : make_scorer(matthews_corrcoef)
, 'fscore' : make_scorer(f1_score)
, 'precision' : make_scorer(precision_score)
, 'recall' : make_scorer(recall_score)
, 'accuracy' : make_scorer(accuracy_score)
, 'roc_auc' : make_scorer(roc_auc_score)
, 'jcc' : make_scorer(jaccard_score)
})
skf_cv = StratifiedKFold(n_splits = 10
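# ----------------------------------------------------------------------------
# Illustrative sketch (not part of the committed files): how a scorer dict like
# scoring_fn above is consumed by cross_validate(). The toy data and the choice
# of RandomForestClassifier are assumptions made only for this example.
# ----------------------------------------------------------------------------
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, matthews_corrcoef, f1_score
from sklearn.model_selection import StratifiedKFold, cross_validate

toy_scorers = {'mcc': make_scorer(matthews_corrcoef), 'fscore': make_scorer(f1_score)}
X_toy, y_toy = make_classification(n_samples = 150, n_features = 8, random_state = 42)

cv_out = cross_validate(RandomForestClassifier(random_state = 42)
                        , X_toy, y_toy
                        , cv = StratifiedKFold(n_splits = 10)
                        , scoring = toy_scorers
                        , return_train_score = True)
# Keys follow the 'test_<scorer>' / 'train_<scorer>' naming used downstream
print(round(cv_out['test_mcc'].mean(), 2), round(cv_out['test_fscore'].mean(), 2))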
@@ -98,6 +103,8 @@ jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
def MultModelsCl(input_df, target, skf_cv
, blind_test_input_df
, blind_test_target
, add_cm = True # adds confusion matrix based on cross_val_predict
, add_yn = True # adds target var class numbers
, var_type = ['numerical', 'categorical','mixed']):
'''
@@ -117,13 +124,17 @@ def MultModelsCl(input_df, target, skf_cv
Dict containing multiple classification scores for each model and mean of each Stratified Kfold including training
'''
#======================================================
# Determine categorical and numerical features
#======================================================
numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
numerical_ix
categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
categorical_ix
#======================================================
# Determine preprocessing steps ~ var_type
#======================================================
if var_type == 'numerical':
t = [('num', MinMaxScaler(), numerical_ix)]
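# ----------------------------------------------------------------------------
# Illustrative sketch (not part of the committed files): the var_type branches
# above assemble a column transformer from the detected numerical/categorical
# columns. The toy dataframe and the OneHotEncoder for categorical columns are
# assumptions for this example, not necessarily what the full function uses.
# ----------------------------------------------------------------------------
import pandas as pd
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

toy_df = pd.DataFrame({'stability': [0.1, 0.5, 0.9]
                       , 'maf': [0.01, 0.2, 0.05]
                       , 'aa_prop': ['polar', 'nonpolar', 'polar']})

num_ix = toy_df.select_dtypes(include = ['int64', 'float64']).columns
cat_ix = toy_df.select_dtypes(include = ['object', 'bool']).columns

# Scale numeric columns, one-hot encode categoricals, pass anything else through
col_transform = make_column_transformer((MinMaxScaler(), num_ix)
                                        , (OneHotEncoder(), cat_ix)
                                        , remainder = 'passthrough')
print(col_transform.fit_transform(toy_df))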
@@ -138,38 +149,38 @@ def MultModelsCl(input_df, target, skf_cv
, remainder='passthrough')
#======================================================
# Specify multiple Classification Models
#======================================================
models = [('Logistic Regression' , LogisticRegression(**rs) )
, ('Logistic RegressionCV' , LogisticRegressionCV(**rs) )
, ('Gaussian NB' , GaussianNB() )
, ('Naive Bayes' , BernoulliNB() )
# , ('K-Nearest Neighbors' , KNeighborsClassifier() )
# , ('SVC' , SVC(**rs) )
# , ('MLP' , MLPClassifier(max_iter = 500, **rs) )
# , ('Decision Tree' , DecisionTreeClassifier(**rs) )
# , ('Extra Trees' , ExtraTreesClassifier(**rs) )
# , ('Extra Tree' , ExtraTreeClassifier(**rs) )
# , ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000 ) )
# , ('Random Forest2' , RandomForestClassifier(min_samples_leaf = 5
# , n_estimators = 1000
# , bootstrap = True
# , oob_score = True
# , **njobs
# , **rs
# , max_features = 'auto') )
# , ('XGBoost' , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False) )
# , ('LDA' , LinearDiscriminantAnalysis() )
# , ('Multinomial' , MultinomialNB() )
# , ('Passive Aggresive' , PassiveAggressiveClassifier(**rs, **njobs) )
# , ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) )
# , ('AdaBoost Classifier' , AdaBoostClassifier(**rs) )
# , ('Bagging Classifier' , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) )
# , ('Gaussian Process' , GaussianProcessClassifier(**rs) )
# , ('Gradient Boosting' , GradientBoostingClassifier(**rs) )
# , ('QDA' , QuadraticDiscriminantAnalysis() )
# , ('Ridge Classifier' , RidgeClassifier(**rs) )
# , ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 10) )
]
mm_skf_scoresD = {}
@@ -200,6 +211,72 @@ def MultModelsCl(input_df, target, skf_cv
, scoring = scoring_fn
, return_train_score = True)
#######################################################################
#======================================================
# Option: Add confusion matrix from cross_val_predict
# Understand and USE with caution
# Note on cross_val_predict (sklearn docs): "Passing these predictions into an evaluation metric may not be a valid way to measure generalization performance. Results can differ from cross_validate and cross_val_score unless all test sets have equal size and the metric decomposes over samples."
# https://stackoverflow.com/questions/65645125/producing-a-confusion-matrix-with-cross-validate
#======================================================
if add_cm:
#-----------------------------------------------------------
# Initialise dict of Confusion Matrix (cm)
#-----------------------------------------------------------
cmD = {}
# Calculate cm
y_pred = cross_val_predict(model_pipeline, input_df, target, cv = skf_cv, **njobs)
#_tn, _fp, _fn, _tp = confusion_matrix(y_pred, y).ravel() # internally
tn, fp, fn, tp = confusion_matrix(target, y_pred).ravel() # confusion_matrix expects (y_true, y_pred)
# Build dict
cmD = {'TN' : tn
, 'FP': fp
, 'FN': fn
, 'TP': tp}
#---------------------------------
# Update cv dict with cmD
#----------------------------------
skf_cv_modD.update(cmD)
else:
skf_cv_modD = skf_cv_modD
#######################################################################
#=============================================
# Option: Add target class counts (y) for the data
#=============================================
if add_yn:
#-----------------------------------------------------------
# Initialise dict of target numbers: training and blind (tbt)
#-----------------------------------------------------------
tbtD = {}
# training y
tyn = Counter(target)
tyn_neg = tyn[0]
tyn_pos = tyn[1]
# blind test y
btyn = Counter(blind_test_target)
btyn_neg = btyn[0]
btyn_pos = btyn[1]
# Build dict
tbtD = {'trainingY_neg' : tyn_neg
, 'trainingY_pos' : tyn_pos
, 'blindY_neg' : btyn_neg
, 'blindY_pos' : btyn_pos}
#---------------------------------
# Update cv dict with tbtD
#----------------------------------
skf_cv_modD.update(tbtD)
else:
skf_cv_modD = skf_cv_modD
#######################################################################
#==============================
# Extract mean values for CV
#==============================
@@ -207,15 +284,15 @@ def MultModelsCl(input_df, target, skf_cv
for key, value in skf_cv_modD.items():
print('\nkey:', key, '\nvalue:', value)
print('\nmean value:', np.mean(value))
mm_skf_scoresD[model_name][key] = round(np.mean(value),2)
#return(mm_skf_scoresD)
#%%
#=========================
# Blind test: BTS results
#=========================
# Build the final results with all scores for the model
#bts_predict = gscv_fs.predict(blind_test_input_df)
model_pipeline.fit(input_df, target)
bts_predict = model_pipeline.predict(blind_test_input_df)
@@ -225,28 +302,16 @@ def MultModelsCl(input_df, target, skf_cv
print('\nAccuracy on Blind test:', round(accuracy_score(blind_test_target, bts_predict),2))
# Diff b/w train and bts test scores
# train_test_diff_MCC = cvtrain_mcc - bts_mcc_score
# print('\nDiff b/w train and blind test score (MCC):', train_test_diff)
# # create a dict with all scores
# lr_btsD = { 'model_name': model_name
# , 'bts_mcc':None
# , 'bts_fscore':None
# , 'bts_precision':None
# , 'bts_recall':None
# , 'bts_accuracy':None
# , 'bts_roc_auc':None
# , 'bts_jaccard':None}
mm_skf_scoresD[model_name]['bts_mcc'] = bts_mcc_score
mm_skf_scoresD[model_name]['bts_fscore'] = round(f1_score(blind_test_target, bts_predict),2)
mm_skf_scoresD[model_name]['bts_precision'] = round(precision_score(blind_test_target, bts_predict),2)
mm_skf_scoresD[model_name]['bts_recall'] = round(recall_score(blind_test_target, bts_predict),2)
mm_skf_scoresD[model_name]['bts_accuracy'] = round(accuracy_score(blind_test_target, bts_predict),2)
mm_skf_scoresD[model_name]['bts_roc_auc'] = round(roc_auc_score(blind_test_target, bts_predict),2)
mm_skf_scoresD[model_name]['bts_jcc'] = round(jaccard_score(blind_test_target, bts_predict),2)
#mm_skf_scoresD[model_name]['diff_mcc'] = train_test_diff_MCC
return(mm_skf_scoresD)
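# ----------------------------------------------------------------------------
# Illustrative sketch (not part of the committed files): the add_cm option above
# derives TN/FP/FN/TP from pooled cross_val_predict predictions. A minimal,
# self-contained version of that pattern on toy data; note the sklearn caveat
# quoted in the function about evaluating pooled CV predictions.
# ----------------------------------------------------------------------------
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_val_predict

X_toy, y_toy = make_classification(n_samples = 200, n_features = 5, random_state = 42)
cv_toy = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)

# Each sample gets exactly one out-of-fold prediction
y_oof = cross_val_predict(LogisticRegression(max_iter = 1000), X_toy, y_toy, cv = cv_toy)

# confusion_matrix expects (y_true, y_pred); ravel() yields TN, FP, FN, TP for binary labels
tn, fp, fn, tp = confusion_matrix(y_toy, y_oof).ravel()
print({'TN': tn, 'FP': fp, 'FN': fn, 'TP': tp})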


@@ -72,6 +72,8 @@ from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator
from sklearn.impute import KNNImputer as KNN
import json
import argparse
import re
#%% GLOBALS
rs = {'random_state': 42}
@@ -98,7 +100,7 @@ mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
#%%
# Multiple Classification - Model Pipeline
def MultModelsCl(input_df, target, skf_cv
, blind_test_input_df
, blind_test_target
, add_cm = True # adds confusion matrix based on cross_val_predict
@@ -299,6 +301,10 @@ def MultModelsCl_dissected(input_df, target, skf_cv
print('\nMCC on Blind test:' , bts_mcc_score)
print('\nAccuracy on Blind test:', round(accuracy_score(blind_test_target, bts_predict),2))
# Diff b/w train and bts test scores
# train_test_diff_MCC = cvtrain_mcc - bts_mcc_score
# print('\nDiff b/w train and blind test score (MCC):', train_test_diff)
mm_skf_scoresD[model_name]['bts_mcc'] = bts_mcc_score
mm_skf_scoresD[model_name]['bts_fscore'] = round(f1_score(blind_test_target, bts_predict),2)
mm_skf_scoresD[model_name]['bts_precision'] = round(precision_score(blind_test_target, bts_predict),2)
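# ----------------------------------------------------------------------------
# Illustrative sketch (not part of the committed files): the bts_* scores above
# come from refitting the pipeline on the full training data and scoring the
# held-out blind test set once. A toy version of that pattern; the data split
# and the LogisticRegression model are assumptions for this example.
# ----------------------------------------------------------------------------
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, matthews_corrcoef
from sklearn.model_selection import train_test_split

X_all, y_all = make_classification(n_samples = 300, n_features = 6, random_state = 42)
X_train, X_blind, y_train, y_blind = train_test_split(X_all, y_all
                                                      , test_size = 0.33
                                                      , stratify = y_all
                                                      , random_state = 42)

clf = LogisticRegression(max_iter = 1000).fit(X_train, y_train)
blind_pred = clf.predict(X_blind)

print('MCC on blind test:' , round(matthews_corrcoef(y_blind, blind_pred), 2))
print('Accuracy on blind test:', round(accuracy_score(y_blind, blind_pred), 2))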


@@ -29,46 +29,74 @@ score_type_ordermapD = { 'mcc' : 1
, 'fit_time' : 16
, 'score_time' : 17
}
###############################################################################
#==================
# Specify outdir
#==================
outdir_ml = outdir + 'ml/uq_v1/fgs/'
print('\nOutput directory:', outdir_ml)
outFile = outdir_ml + gene.lower() + '_baseline_FG.csv'
#==================
# other vars
#==================
tts_split_name = 'original'
sampling_type_name = 'none'

# cm_di2 = MultModelsCl_dissected(input_df = X
# , target = y
# , var_type = 'mixed'
# , skf_cv = skf_cv
# , blind_test_input_df = X_bts
# , blind_test_target = y_bts
# , add_cm = True
# , add_yn = True)
# baseline_all2 = pd.DataFrame(cm_di2)
# baseline_all2T = baseline_all2.T
# baseline_CTBT2 = baseline_all2T.filter(regex = 'test_.*|bts_.*|TN|FP|FN|TP|.*_neg|.*_pos' , axis = 1)
###############################################################################
#================
# Evolutionary
# X_evolFN
#================
feature_gp_nameEV = 'evolutionary'
n_featuresEV = len(X_evolFN)
scores_mmEV = MultModelsCl_dissected(input_df = X[X_evolFN]
, target = y
, var_type = 'mixed'
, skf_cv = skf_cv
, blind_test_input_df = X_bts[X_evolFN]
, blind_test_target = y_bts
, add_cm = True
, add_yn = True)
baseline_allEV = pd.DataFrame(scores_mmEV)
baseline_EV = baseline_allEV.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
baseline_EV = baseline_EV.reset_index()
baseline_EV.rename(columns = {'index': 'original_names'}, inplace = True)
# Indicate whether BT or CT
bt_pattern = re.compile(r'bts_.*')
baseline_EV['data_source'] = baseline_EV.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
baseline_EV['score_type'] = baseline_EV['original_names'].str.replace('bts_|test_', '', regex = True)
score_type_uniqueN = set(baseline_EV['score_type'])
cL1 = list(score_type_ordermapD.keys())
cL2 = list(score_type_uniqueN)
if set(cL1).issubset(cL2):
print('\nPASS: sorting df by score that is mapped onto the order I want')
baseline_EV['score_order'] = baseline_EV['score_type'].map(score_type_ordermapD)
baseline_EV.sort_values(by = ['data_source', 'score_order'], ascending = [True, True], inplace = True)
else:
sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
baseline_EV['feature_group'] = feature_gp_nameEV
baseline_EV['sampling_type'] = sampling_type_name
baseline_EV['tts_split'] = tts_split_name
baseline_EV['n_features'] = n_featuresEV
###############################################################################
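# ----------------------------------------------------------------------------
# Illustrative sketch (not part of the committed files): the filter/label/sort
# block above is repeated for every feature group below. A hypothetical helper,
# process_baseline_scores(), showing how that post-processing could be written
# once; the function name and signature are assumptions, not the author's API.
# ----------------------------------------------------------------------------
import re
import sys
import pandas as pd

def process_baseline_scores(scores_dict, feature_group, n_features
                            , sampling_type, tts_split, ordermap):
    """Reshape MultModelsCl_dissected() output into a long-format score table."""
    df = pd.DataFrame(scores_dict)
    df = df.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
    df = df.reset_index().rename(columns = {'index': 'original_names'})

    # Tag each row as blind test (BT) or cross-validation (CV)
    bt = re.compile(r'bts_.*')
    df['data_source'] = df['original_names'].apply(lambda s: 'BT' if bt.search(s) else 'CV')
    df['score_type'] = df['original_names'].str.replace('bts_|test_', '', regex = True)

    # Sort scores according to the supplied order mapping
    if not set(ordermap.keys()).issubset(set(df['score_type'])):
        sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
    df['score_order'] = df['score_type'].map(ordermap)
    df.sort_values(by = ['data_source', 'score_order'], ascending = True, inplace = True)

    df['feature_group'] = feature_group
    df['sampling_type'] = sampling_type
    df['tts_split'] = tts_split
    df['n_features'] = n_features
    return df

# Possible usage, mirroring the evolutionary block above:
# baseline_EV = process_baseline_scores(scores_mmEV, 'evolutionary', len(X_evolFN)
#                                       , sampling_type_name, tts_split_name, score_type_ordermapD)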
#================
# Genomics
# X_genomicFN
#================
feature_gp_nameGN = 'genomics'
n_featuresGN = len(X_genomicFN)
scores_mmGN = MultModelsCl_dissected(input_df = X[X_genomicFN]
, target = y
, var_type = 'mixed'
, skf_cv = skf_cv
@@ -77,9 +105,9 @@ scores_mm_gn = MultModelsCl_dissected(input_df = X[X_genomicFN]
, add_cm = True
, add_yn = True)
baseline_allGN = pd.DataFrame(scores_mmGN)
baseline_GN = baseline_allGN.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
baseline_GN = baseline_GN.reset_index()
baseline_GN.rename(columns = {'index': 'original_names'}, inplace = True)
@@ -100,47 +128,340 @@ if set(cL1).issubset(cL2):
else:
sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
baseline_GN['feature_group'] = feature_gp_nameGN
baseline_GN['sampling_type'] = sampling_type_name
baseline_GN['tts_split'] = tts_split_name
baseline_GN['n_features'] = n_featuresGN
###############################################################################
#all_featuresN = X_evolFN + X_structural_FN + X_genomicFN
# X_structural_FN = X_stability_FN + X_affinityFN + X_resprop_FN
# X_resprop_FN = X_aaindex_Fnum + X_str_Fnum + X_aap_Fcat
#================
# Structural cols
# X_structural_FN
#================
feature_gp_nameSTR = 'structural'
n_featuresSTR = len(X_structural_FN)
scores_mmSTR = MultModelsCl_dissected(input_df = X[X_structural_FN]
, target = y
, var_type = 'mixed'
, skf_cv = skf_cv
, blind_test_input_df = X_bts[X_structural_FN]
, blind_test_target = y_bts
, add_cm = True
, add_yn = True)

baseline_allSTR = pd.DataFrame(scores_mmSTR)

baseline_STR = baseline_allSTR.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
baseline_STR = baseline_STR.reset_index()
baseline_STR.rename(columns = {'index': 'original_names'}, inplace = True)

# Indicate whether BT or CT
bt_pattern = re.compile(r'bts_.*')
baseline_STR['data_source'] = baseline_STR.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)

baseline_STR['score_type'] = baseline_STR['original_names'].str.replace('bts_|test_', '', regex = True)
score_type_uniqueN = set(baseline_STR['score_type'])
cL1 = list(score_type_ordermapD.keys())
cL2 = list(score_type_uniqueN)
if set(cL1).issubset(cL2):
print('\nPASS: sorting df by score that is mapped onto the order I want')
baseline_STR['score_order'] = baseline_STR['score_type'].map(score_type_ordermapD)
baseline_STR.sort_values(by = ['data_source', 'score_order'], ascending = [True, True], inplace = True)
else:
sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')

baseline_STR['feature_group'] = feature_gp_nameSTR
baseline_STR['sampling_type'] = sampling_type_name
baseline_STR['tts_split'] = tts_split_name
baseline_STR['n_features'] = n_featuresSTR
##############################################################################
#================
# Stability cols
# X_stability_FN
#================
feature_gp_nameSTB = 'stability'
n_featuresSTB = len(X_stability_FN)
scores_mmSTB = MultModelsCl_dissected(input_df = X[X_stability_FN]
, target = y
, var_type = 'mixed'
, skf_cv = skf_cv
, blind_test_input_df = X_bts[X_stability_FN]
, blind_test_target = y_bts
, add_cm = True
, add_yn = True)
baseline_allSTB = pd.DataFrame(scores_mmSTB)
baseline_STB = baseline_allSTB.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
baseline_STB = baseline_STB.reset_index()
baseline_STB.rename(columns = {'index': 'original_names'}, inplace = True)
# Indicate whether BT or CT
bt_pattern = re.compile(r'bts_.*')
baseline_STB['data_source'] = baseline_STB.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
baseline_STB['score_type'] = baseline_STB['original_names'].str.replace('bts_|test_', '', regex = True)
score_type_uniqueN = set(baseline_STB['score_type'])
cL1 = list(score_type_ordermapD.keys())
cL2 = list(score_type_uniqueN)
if set(cL1).issubset(cL2):
print('\nPASS: sorting df by score that is mapped onto the order I want')
baseline_STB['score_order'] = baseline_STB['score_type'].map(score_type_ordermapD)
baseline_STB.sort_values(by = ['data_source', 'score_order'], ascending = [True, True], inplace = True)
else:
sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
baseline_STB['feature_group'] = feature_gp_nameSTB
baseline_STB['sampling_type'] = sampling_type_name
baseline_STB['tts_split'] = tts_split_name
baseline_STB['n_features'] = n_featuresSTB
###############################################################################
#================
# Affinity cols
# X_affinityFN
#================
feature_gp_nameAFF = 'affinity'
n_featuresAFF = len(X_affinityFN)
scores_mmAFF = MultModelsCl_dissected(input_df = X[X_affinityFN]
, target = y
, var_type = 'mixed'
, skf_cv = skf_cv
, blind_test_input_df = X_bts[X_affinityFN]
, blind_test_target = y_bts
, add_cm = True
, add_yn = True)
baseline_allAFF = pd.DataFrame(scores_mmAFF)
baseline_AFF = baseline_allAFF.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
baseline_AFF = baseline_AFF.reset_index()
baseline_AFF.rename(columns = {'index': 'original_names'}, inplace = True)
# Indicate whether BT or CT
bt_pattern = re.compile(r'bts_.*')
baseline_AFF['data_source'] = baseline_AFF.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
baseline_AFF['score_type'] = baseline_AFF['original_names'].str.replace('bts_|test_', '', regex = True)
score_type_uniqueN = set(baseline_AFF['score_type'])
cL1 = list(score_type_ordermapD.keys())
cL2 = list(score_type_uniqueN)
if set(cL1).issubset(cL2):
print('\nPASS: sorting df by score that is mapped onto the order I want')
baseline_AFF['score_order'] = baseline_AFF['score_type'].map(score_type_ordermapD)
baseline_AFF.sort_values(by = ['data_source', 'score_order'], ascending = [True, True], inplace = True)
else:
sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
baseline_AFF['feature_group'] = feature_gp_nameAFF
baseline_AFF['sampling_type'] = sampling_type_name
baseline_AFF['tts_split'] = tts_split_name
baseline_AFF['n_features'] = n_featuresAFF
###############################################################################
#================
# Residue level
# X_resprop_FN
#================
feature_gp_nameRES = 'residue_prop'
n_featuresRES = len(X_resprop_FN)
scores_mmRES = MultModelsCl_dissected(input_df = X[X_resprop_FN]
, target = y
, var_type = 'mixed'
, skf_cv = skf_cv
, blind_test_input_df = X_bts[X_resprop_FN]
, blind_test_target = y_bts
, add_cm = True
, add_yn = True)
baseline_allRES = pd.DataFrame(scores_mmRES)
baseline_RES = baseline_allRES.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
baseline_RES = baseline_RES.reset_index()
baseline_RES.rename(columns = {'index': 'original_names'}, inplace = True)
# Indicate whether BT or CT
bt_pattern = re.compile(r'bts_.*')
baseline_RES['data_source'] = baseline_RES.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
baseline_RES['score_type'] = baseline_RES['original_names'].str.replace('bts_|test_', '', regex = True)
score_type_uniqueN = set(baseline_RES['score_type'])
cL1 = list(score_type_ordermapD.keys())
cL2 = list(score_type_uniqueN)
if set(cL1).issubset(cL2):
print('\nPASS: sorting df by score that is mapped onto the order I want')
baseline_RES['score_order'] = baseline_RES['score_type'].map(score_type_ordermapD)
baseline_RES.sort_values(by = ['data_source', 'score_order'], ascending = [True, True], inplace = True)
else:
sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
baseline_RES['feature_group'] = feature_gp_nameRES
baseline_RES['sampling_type'] = sampling_type_name
baseline_RES['tts_split'] = tts_split_name
baseline_RES['n_features'] = n_featuresRES
###############################################################################
#================
# Residue level-AAindex
#X_resprop_FN - X_aaindex_Fnum
#================
X_respropNOaaFN = list(set(X_resprop_FN) - set(X_aaindex_Fnum))
feature_gp_nameRNAA = 'ResPropNoAA'
n_featuresRNAA = len(X_respropNOaaFN)
scores_mmRNAA = MultModelsCl_dissected(input_df = X[X_respropNOaaFN]
, target = y
, var_type = 'mixed'
, skf_cv = skf_cv
, blind_test_input_df = X_bts[X_respropNOaaFN]
, blind_test_target = y_bts
, add_cm = True
, add_yn = True)
baseline_allRNAA = pd.DataFrame(scores_mmRNAA)
baseline_RNAA = baseline_allRNAA.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
baseline_RNAA = baseline_RNAA.reset_index()
baseline_RNAA.rename(columns = {'index': 'original_names'}, inplace = True)
# Indicate whether BT or CT
bt_pattern = re.compile(r'bts_.*')
baseline_RNAA['data_source'] = baseline_RNAA.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
baseline_RNAA['score_type'] = baseline_RNAA['original_names'].str.replace('bts_|test_', '', regex = True)
score_type_uniqueN = set(baseline_RNAA['score_type'])
cL1 = list(score_type_ordermapD.keys())
cL2 = list(score_type_uniqueN)
if set(cL1).issubset(cL2):
print('\nPASS: sorting df by score that is mapped onto the order I want')
baseline_RNAA['score_order'] = baseline_RNAA['score_type'].map(score_type_ordermapD)
baseline_RNAA.sort_values(by = ['data_source', 'score_order'], ascending = [True, True], inplace = True)
else:
sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
baseline_RNAA['feature_group'] = feature_gp_nameRNAA
baseline_RNAA['sampling_type'] = sampling_type_name
baseline_RNAA['tts_split'] = tts_split_name
baseline_RNAA['n_features'] = n_featuresRNAA
###############################################################################
#================
# Structural cols-AAindex
#X_structural_FN - X_aaindex_Fnum
#================
X_strNOaaFN = list(set(X_structural_FN) - set(X_aaindex_Fnum))
feature_gp_nameSNAA = 'StrNoAA'
n_featuresSNAA = len(X_strNOaaFN)
scores_mmSNAA = MultModelsCl_dissected(input_df = X[X_strNOaaFN]
, target = y
, var_type = 'mixed'
, skf_cv = skf_cv
, blind_test_input_df = X_bts[X_strNOaaFN]
, blind_test_target = y_bts
, add_cm = True
, add_yn = True)
baseline_allSNAA = pd.DataFrame(scores_mmSNAA)
baseline_SNAA = baseline_allSNAA.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0)
baseline_SNAA = baseline_SNAA.reset_index()
baseline_SNAA.rename(columns = {'index': 'original_names'}, inplace = True)
# Indicate whether BT or CT
bt_pattern = re.compile(r'bts_.*')
baseline_SNAA['data_source'] = baseline_SNAA.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1)
baseline_SNAA['score_type'] = baseline_SNAA['original_names'].str.replace('bts_|test_', '', regex = True)
score_type_uniqueN = set(baseline_SNAA['score_type'])
cL1 = list(score_type_ordermapD.keys())
cL2 = list(score_type_uniqueN)
if set(cL1).issubset(cL2):
print('\nPASS: sorting df by score that is mapped onto the order I want')
baseline_SNAA['score_order'] = baseline_SNAA['score_type'].map(score_type_ordermapD)
baseline_SNAA.sort_values(by = ['data_source', 'score_order'], ascending = [True, True], inplace = True)
else:
sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
baseline_SNAA['feature_group'] = feature_gp_nameSNAA
baseline_SNAA['sampling_type'] = sampling_type_name
baseline_SNAA['tts_split'] = tts_split_name
baseline_SNAA['n_features'] = n_featuresSNAA
###############################################################################
#%% COMBINING all FG dfs
#================
# Combine all
# https://stackoverflow.com/questions/39862654/pandas-concat-of-multiple-data-frames-using-only-common-columns
#================
dfs_combine = [baseline_EV, baseline_GN, baseline_STR, baseline_STB, baseline_AFF, baseline_RES , baseline_RNAA , baseline_SNAA]
dfs_nrows = []
for df in dfs_combine:
dfs_nrows = dfs_nrows + [len(df)]
dfs_nrows = max(dfs_nrows)
dfs_ncols = []
for df in dfs_combine:
dfs_ncols = dfs_ncols + [len(df.columns)]
dfs_ncols = max(dfs_ncols)
# dfs_ncols = []
# dfs_ncols2 = mode(dfs_ncols.append(len(df.columns) for df in dfs_combine)
# dfs_ncols2
expected_nrows = len(dfs_combine) * dfs_nrows
expected_ncols = dfs_ncols
common_cols = list(set.intersection(*(set(df.columns) for df in dfs_combine)))
if len(common_cols) == dfs_ncols :
combined_FG_baseline = pd.concat([df[common_cols] for df in dfs_combine], ignore_index=True)
fgs = combined_FG_baseline[['feature_group', 'n_features']]
fgs = fgs.drop_duplicates()
print('\nConcatenating dfs with feature groups after ML analysis (sampling type):'
, '\nNo. of dfs combining:', len(dfs_combine)
, '\nSampling type:', sampling_type_name
, '\nThe feature groups are:'
, '\n', fgs)
if len(combined_FG_baseline) == expected_nrows and len(combined_FG_baseline.columns) == expected_ncols:
print('\nPASS:', len(dfs_combine), 'dfs successfully combined'
, '\nnrows in combined_df:', len(combined_FG_baseline)
, '\nncols in combined_df:', len(combined_FG_baseline.columns))
else:
print('\nFAIL: concatenating failed'
, '\nExpected nrows:', expected_nrows
, '\nGot:', len(combined_FG_baseline)
, '\nExpected ncols:', expected_ncols
, '\nGot:', len(combined_FG_baseline.columns))
sys.exit()
else:
sys.exit('\nConcatenating dfs not possible, check numbers')
# # rpow bind
# if all(ll((baseline_EV.columns == baseline_GN.columns == baseline_STR.columns)):
# print('\nPASS:colnames match, proceeding to rowbind')
# comb_df = pd.concat()], axis = 0, ignore_index = True )
###############################################################################
#====================
# Write output file
#====================
combined_FG_baseline.to_csv(outFile)
print('\nFile successfully written:', outFile)
###############################################################################
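# ----------------------------------------------------------------------------
# Illustrative sketch (not part of the committed files): the combine step above
# concatenates the per-feature-group dataframes on their shared columns only.
# A toy example of that pattern; the frames and column names are made up.
# ----------------------------------------------------------------------------
import pandas as pd

df_a = pd.DataFrame({'score_type': ['mcc', 'fscore'], 'value': [0.5, 0.7], 'only_in_a': [1, 2]})
df_b = pd.DataFrame({'score_type': ['mcc', 'fscore'], 'value': [0.4, 0.6], 'only_in_b': [3, 4]})

frames = [df_a, df_b]
common_cols = list(set.intersection(*(set(df.columns) for df in frames)))

# Keep only the shared columns before row-binding, as in the script above
combined = pd.concat([df[common_cols] for df in frames], ignore_index = True)
print(combined)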

File diff suppressed because it is too large


@@ -1,207 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022
@author: tanu
"""
import os
gene = 'pncA'
drug = 'pyrazinamide'
#total_mtblineage_uc = 8
homedir = os.path.expanduser("~")
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
#---------------------------
# Version 1: no AAindex
#from UQ_ML_data import *
#setvars(gene,drug)
#from UQ_ML_data import *
#---------------------------
from ml_data_dissected import *
setvars(gene,drug)
from ml_data_dissected import *
# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML
# TT run all ML clfs: baseline mode
from MultModelsCl_dissected import MultModelsCl_dissected
############################################################################
print('\n#####################################################################\n'
, '\nRunning ML analysis: UQ [without AA index but with active site annotations]'
, '\nGene name:', gene
, '\nDrug name:', drug)
#==================
# Specify outdir
#==================
outdir_ml = outdir + 'ml/uq_v1/dissected'
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
print('\n================================================================\n')
X_structural_FN = X_stability_FN + X_affinityFN + X_resprop_FN
X_aaindex_Fnum + X_str_Fnum + X_aap_Fcat
all_featuresN = X_evolFN + X_structural_FN + X_genomicFN
print('\n================================================================'
, '\nTotal Evolutionary features (n):' , len(X_evolFN)
, '\n--------------Evol. feature colnames:', X_evolFN
, '\n================================================================'
, '\n\nTotal structural features (n):', len(X_structural_FN)
, '\n--------Stability ncols:' , len(X_stability_FN)
, '\n--------------Common stability colnames:' , X_common_stability_Fnum
, '\n--------------Foldx colnames:' , X_foldX_Fnum
, '\n--------Affinity ncols:' , len(X_affinityFN)
, '\n--------------Common affinity colnames:' , common_affinity_Fnum
, '\n--------------Gene specific affinity colnames:', gene_affinity_colnames
, '\n--------Residue prop ncols:' , len(X_resprop_FN)
, '\n--------------Residue Prop cols:' , X_str_Fnum
, '\n--------------AA change Prop cols:' , X_aap_Fcat
, '\n--------------AA index cols:' , X_aaindex_Fnum
, '\n================================================================'
, '\n\nTotal Genomic features (n):' , len(X_genomicFN)
, '\n--------MAF+OR cols:' , len(X_gn_mafor_Fnum)
, '\n--------------MAF+OR colnames:' , X_gn_mafor_Fnum
, '\n--------Lineage cols:' , len(X_gn_linegae_Fnum)
, '\n--------------Lineage cols:' , X_gn_linegae_Fnum
, '\n--------Other cols:' , len(X_gn_Fcat)
, '\n--------------Other cols:' , X_gn_Fcat
, '\n================================================================')
# Sanity check
if ( len(X.columns) == len(X_evolFN) + len(X_structural_FN) + len(X_genomicFN)):
print('\nPass: No. of features match')
else:
print('\nFail: Count of feature mismatch'
, '\nExpected:', len(X_evolFN) + len(X_structural_FN) + len(X_genomicFN)
, '\nGot:', len(X.columns))
sys.exit()
print('\n#####################################################################\n')
# ###############################################################################
# #==================
# # Baseline models
# #==================
# mm_skf_scoresD = MultModelsCl_dissected(input_df = X
# , target = y
# , var_type = 'mixed'
# , skf_cv = skf_cv
# , blind_test_input_df = X_bts
# , blind_test_target = y_bts)
# baseline_all = pd.DataFrame(mm_skf_scoresD)
# baseline_all = baseline_all.T
# #baseline_train = baseline_all.filter(like='train_', axis=1)
# baseline_CT = baseline_all.filter(like='test_', axis=1)
# baseline_CT.sort_values(by=['test_mcc'], ascending=False, inplace=True)
# baseline_BT = baseline_all.filter(like='bts_', axis=1)
# baseline_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
# # Write csv
# baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv')
# baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')
# #%% SMOTE NC: Oversampling [Numerical + categorical]
# mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc
# , target = y_smnc
# , var_type = 'mixed'
# , skf_cv = skf_cv
# , blind_test_input_df = X_bts
# , blind_test_target = y_bts)
# smnc_all = pd.DataFrame(mm_skf_scoresD7)
# smnc_all = smnc_all.T
# smnc_CT = smnc_all.filter(like='test_', axis=1)
# smnc_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
# smnc_BT = smnc_all.filter(like='bts_', axis=1)
# smnc_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
# # Write csv
# smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv')
# smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')
# #%% ROS: Numerical + categorical
# mm_skf_scoresD3 = MultModelsCl(input_df = X_ros
# , target = y_ros
# , var_type = 'mixed'
# , skf_cv = skf_cv
# , blind_test_input_df = X_bts
# , blind_test_target = y_bts)
# ros_all = pd.DataFrame(mm_skf_scoresD3)
# ros_all = ros_all.T
# ros_CT = ros_all.filter(like='test_', axis=1)
# ros_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
# ros_BT = ros_all.filter(like='bts_', axis=1)
# ros_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
# # Write csv
# ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv')
# ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')
# #%% RUS: Numerical + categorical
# mm_skf_scoresD4 = MultModelsCl(input_df = X_rus
# , target = y_rus
# , var_type = 'mixed'
# , skf_cv = skf_cv
# , blind_test_input_df = X_bts
# , blind_test_target = y_bts)
# rus_all = pd.DataFrame(mm_skf_scoresD4)
# rus_all = rus_all.T
# rus_CT = rus_all.filter(like='test_', axis=1)
# rus_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
# rus_BT = rus_all.filter(like='bts_' , axis=1)
# rus_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
# # Write csv
# rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv')
# rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')
# #%% ROS + RUS Combined: Numerical + categorical
# mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC
# , target = y_rouC
# , var_type = 'mixed'
# , skf_cv = skf_cv
# , blind_test_input_df = X_bts
# , blind_test_target = y_bts)
# rouC_all = pd.DataFrame(mm_skf_scoresD8)
# rouC_all = rouC_all.T
# rouC_CT = rouC_all.filter(like='test_', axis=1)
# rouC_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
# rouC_BT = rouC_all.filter(like='bts_', axis=1)
# rouC_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
# # Write csv
# rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv')
# rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')