diff --git a/LR_FS.json b/LR_FS.json new file mode 100644 index 0000000..5d2beed --- /dev/null +++ b/LR_FS.json @@ -0,0 +1 @@ +{"model_name": "GradientBoostingClassifier(n_estimators=10, random_state=42, subsample=0.7)", "model_refit_param": "mcc", "Best_model_params": {"clf__learning_rate": 0.1, "clf__max_depth": 3, "clf__n_estimators": 10, "clf__subsample": 0.7}, "n_all_features": 13, "fs_method": "RFECV(cv=RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=42),\n estimator=LogisticRegression(random_state=42),\n scoring='matthews_corrcoef')", "fs_res_array": "[False, False, False, False, True, False, False, True, False, False, False, False, True]", "fs_res_array_rank": [3, 5, 8, 2, 1, 10, 7, 1, 6, 4, 9, 11, 1], "all_feature_names": ["ligand_distance", "ligand_affinity_change", "duet_stability_change", "ddg_foldx", "deepddg", "ddg_dynamut2", "contacts", "rsa", "kd_values", "rd_values", "consurf_score", "snap2_score", "maf"], "n_sel_features": 2, "sel_features_names": ["ddg_foldx", "rd_values"], "bts_fscore": 0.7, "bts_precision": 0.56, "bts_recall": 0.93, "bts_accuracy": 0.61, "bts_roc_auc": 0.61, "bts_jaccard": 0.54, "train_score (MCC)": 0.23, "bts_mcc": 0.28, "train_bts_diff": -0.05} \ No newline at end of file diff --git a/MLfeature_types.py b/MLfeature_types.py old mode 100644 new mode 100755 index b68ea9c..6f0a2f6 --- a/MLfeature_types.py +++ b/MLfeature_types.py @@ -9,11 +9,11 @@ Created on Sun May 29 06:46:19 2022 #%% Build X: input for ML print('Strucutral features (n):' - , len(common_cols_stabiltyN) + len(foldX_cols) + len(X_strFN) + , len(X_ssFN) , '\nThese are:' - , '\nCommon stablity features:', common_cols_stabiltyN - , '\nFoldX columns:', foldX_cols - , '\nOther struc columns:', X_strFN + , '\nCommon stablity features:', X_stabilityN + , '\nFoldX columns:', X_foldX_cols + , '\nOther struc columns:', X_str , '\n================================================================\n') print('Evolutionary features (n):' @@ -36,3 +36,7 @@ print('Categorical features (n):' , categorical_FN , '\n================================================================\n') +if ( len(X.columns) == len(X_ssFN) +len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ): + print('\nPass: No. of features match') +else: + print('\nFail: Count of feature mismatch') \ No newline at end of file diff --git a/MultClassPipe2.py b/MultClassPipe2.py old mode 100644 new mode 100755 diff --git a/MultClassPipe3.py b/MultClassPipe3.py old mode 100644 new mode 100755 diff --git a/MultClassPipe3_CALL.py b/MultClassPipe3_CALL.py old mode 100644 new mode 100755 diff --git a/MultModelsCl_CALL.py b/MultModelsCl_CALL.py old mode 100644 new mode 100755 index 9c363bb..5e14f48 --- a/MultModelsCl_CALL.py +++ b/MultModelsCl_CALL.py @@ -6,36 +6,36 @@ Created on Tue Mar 15 11:09:50 2022 @author: tanu """ -from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score -from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report +# from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score +# from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report -from sklearn.model_selection import train_test_split, cross_validate, cross_val_score -from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold +# from sklearn.model_selection import train_test_split, cross_validate, cross_val_score +# from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold -from sklearn.pipeline import Pipeline, make_pipeline -#%% GLOBALS -rs = {'random_state': 42} -njobs = {'n_jobs': 10} +# from sklearn.pipeline import Pipeline, make_pipeline +# #%% GLOBALS +# rs = {'random_state': 42} +# njobs = {'n_jobs': 10} -scoring_fn = ({'accuracy' : make_scorer(accuracy_score) - , 'fscore' : make_scorer(f1_score) - , 'mcc' : make_scorer(matthews_corrcoef) - , 'precision' : make_scorer(precision_score) - , 'recall' : make_scorer(recall_score) - , 'roc_auc' : make_scorer(roc_auc_score) - , 'jcc' : make_scorer(jaccard_score) - }) +# scoring_fn = ({'accuracy' : make_scorer(accuracy_score) +# , 'fscore' : make_scorer(f1_score) +# , 'mcc' : make_scorer(matthews_corrcoef) +# , 'precision' : make_scorer(precision_score) +# , 'recall' : make_scorer(recall_score) +# , 'roc_auc' : make_scorer(roc_auc_score) +# , 'jcc' : make_scorer(jaccard_score) +# }) -skf_cv = StratifiedKFold(n_splits = 10 - #, shuffle = False, random_state= None) - , shuffle = True,**rs) +# skf_cv = StratifiedKFold(n_splits = 10 +# #, shuffle = False, random_state= None) +# , shuffle = True,**rs) -rskf_cv = RepeatedStratifiedKFold(n_splits = 10 - , n_repeats = 3 - , **rs) +# rskf_cv = RepeatedStratifiedKFold(n_splits = 10 +# , n_repeats = 3 +# , **rs) -mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)} -jacc_score_fn = {'jcc': make_scorer(jaccard_score)} +# mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)} +# jacc_score_fn = {'jcc': make_scorer(jaccard_score)} ############################################################################### #%% MultModelsCl: function call() @@ -55,6 +55,9 @@ baseline_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) baseline_BT = baseline_all.filter(like='bts_', axis=1) baseline_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) +# Write csv +baseline_BT.to_csv(outdir + 'ml/' + gene.lower() + '_baseline_BT_allF.csv') + #%% SMOTE NC: Oversampling [Numerical + categorical] mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc , target = y_smnc @@ -70,6 +73,10 @@ smnc_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) smnc_BT = smnc_all.filter(like='bts_', axis=1) smnc_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +smnc_BT.to_csv(outdir + 'ml/' + gene.lower() + '_smnc_BT_allF.csv') + #%% ROS: Numerical + categorical mm_skf_scoresD3 = MultModelsCl(input_df = X_ros , target = y_ros @@ -85,6 +92,9 @@ ros_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) ros_BT = ros_all.filter(like='bts_', axis=1) ros_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +ros_BT.to_csv(outdir + 'ml/' + gene.lower() + '_ros_BT_allF.csv') #%% RUS: Numerical + categorical mm_skf_scoresD4 = MultModelsCl(input_df = X_rus , target = y_rus @@ -100,6 +110,9 @@ rus_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) rus_BT = rus_all.filter(like='bts_' , axis=1) rus_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +rus_BT.to_csv(outdir + 'ml/' + gene.lower() + '_rus_BT_allF.csv') #%% ROS + RUS Combined: Numerical + categorical mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC , target = y_rouC @@ -116,6 +129,8 @@ rouC_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) rouC_BT = rouC_all.filter(like='bts_', axis=1) rouC_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) +# Write csv +rouC_BT.to_csv(outdir + 'ml/' + gene.lower() + '_rouC_BT_allF.csv') #%% SMOTE OS: Numerical only # mm_skf_scoresD2 = MultModelsCl(input_df = X_sm # , target = y_sm @@ -130,6 +145,8 @@ rouC_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) # sm_BT = sm_all.filter(like='bts_', axis=1) #sm_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) +# Write csv +#sm_BT.to_csv(outdir + 'ml/' + gene.lower() + '_sm_BT_allF.csv') #%% SMOTE ENN: Over + Undersampling combined: Numerical ONLY # mm_skf_scoresD5 = MultModelsCl(input_df = X_enn # , target = y_enn @@ -146,6 +163,9 @@ rouC_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) # enn_BT = enn_all.filter(like='bts_', axis=1) #enn_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) +# Write csv +#enn_BT.to_csv(outdir + 'ml/' + gene.lower() + '_enn_BT_allF.csv') + #%% Repeated ENN # mm_skf_scoresD6 = MultModelsCl(input_df = X_renn # , target = y_renn @@ -161,5 +181,6 @@ rouC_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) # renn_BT = renn_all.filter(like='bts_', axis=1) # renn_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) - - +############################################################################### +# end of script +############################################################################## diff --git a/UQ_FS_eg.py b/UQ_FS_eg.py old mode 100644 new mode 100755 diff --git a/UQ_FS_fn.py b/UQ_FS_fn.py old mode 100644 new mode 100755 diff --git a/UQ_FS_fn_CALL.py b/UQ_FS_fn_CALL.py old mode 100644 new mode 100755 diff --git a/UQ_FS_mixed_eg.py b/UQ_FS_mixed_eg.py old mode 100644 new mode 100755 diff --git a/UQ_Imbalance.py b/UQ_Imbalance.py old mode 100644 new mode 100755 diff --git a/UQ_LR_FS_p1.py b/UQ_LR_FS_p1.py old mode 100644 new mode 100755 diff --git a/UQ_LR_FS_p2.py b/UQ_LR_FS_p2.py old mode 100644 new mode 100755 diff --git a/UQ_LR_p1.py b/UQ_LR_p1.py old mode 100644 new mode 100755 diff --git a/UQ_ML_data.py b/UQ_ML_data.py old mode 100644 new mode 100755 index 40303f8..6edb964 --- a/UQ_ML_data.py +++ b/UQ_ML_data.py @@ -27,6 +27,37 @@ def setvars(gene,drug): from imblearn.under_sampling import EditedNearestNeighbours from imblearn.under_sampling import RepeatedEditedNearestNeighbours + from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score + from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report + + from sklearn.model_selection import train_test_split, cross_validate, cross_val_score + from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold + + from sklearn.pipeline import Pipeline, make_pipeline + #%% GLOBALS + rs = {'random_state': 42} + njobs = {'n_jobs': 10} + + scoring_fn = ({ 'mcc' : make_scorer(matthews_corrcoef) + , 'accuracy' : make_scorer(accuracy_score) + , 'fscore' : make_scorer(f1_score) + , 'precision' : make_scorer(precision_score) + , 'recall' : make_scorer(recall_score) + , 'roc_auc' : make_scorer(roc_auc_score) + , 'jcc' : make_scorer(jaccard_score) + }) + + skf_cv = StratifiedKFold(n_splits = 10 + #, shuffle = False, random_state= None) + , shuffle = True,**rs) + + rskf_cv = RepeatedStratifiedKFold(n_splits = 10 + , n_repeats = 3 + , **rs) + + mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)} + jacc_score_fn = {'jcc': make_scorer(jaccard_score)} + #%% FOR LATER: Combine ED logo data #%% FOR LARER: active aa site annotations ########################################################################### @@ -51,32 +82,42 @@ def setvars(gene,drug): my_df.dtypes my_df_cols = my_df.columns - geneL_basic = ['pnca'] - - # -- CHECK script -- imports.py + geneL_basic = ['pnca'] + geneL_na = ['gid'] + geneL_na_ppi2 = ['rpob'] + geneL_ppi2 = ['alr', 'embb', 'katg'] #%% get cols mycols = my_df.columns - mycols - # change from numberic to - num_type = ['int64', 'float64'] - cat_type = ['object', 'bool'] + # # change from numberic to + # num_type = ['int64', 'float64'] + # cat_type = ['object', 'bool'] - #TODO: - # #Treat active site aa pos as category and not numerical: This needs to be part of merged_df3! # if my_df['active_aa_pos'].dtype in num_type: # my_df['active_aa_pos'] = my_df['active_aa_pos'].astype(object) # my_df['active_aa_pos'].dtype - # -- CHECK script -- imports.py + # FIXME: if this is not structural, remove from source.. + # Drop NA where numerical cols have them + if gene.lower() in geneL_na_ppi2: + #D1148 get rid of + na_index = my_df['mutationinformation'].index[my_df['mcsm_na_affinity'].apply(np.isnan)] + my_df = my_df.drop(index=na_index) + + # FIXME: either impute or remove! + # for embb (L114M, F115L, V123L, V125I, V131M) delete for now + if gene.lower() in ['embb']: + na_index = my_df['mutationinformation'].index[my_df['ligand_distance'].apply(np.isnan)] + my_df = my_df.drop(index=na_index)# RERUN embb with the 5 values now present + ########################################################################### #%% Add lineage calculation columns #FIXME: Check if this can be imported from config? - total_mtblineage_u = 8 + total_mtblineage_uc = 8 lineage_colnames = ['lineage_list_all', 'lineage_count_all', 'lineage_count_unique', 'lineage_list_unique', 'lineage_multimode'] #bar = my_df[lineage_colnames] my_df['lineage_proportion'] = my_df['lineage_count_unique']/my_df['lineage_count_all'] - my_df['dist_lineage_proportion'] = my_df['lineage_count_unique']/total_mtblineage_u + my_df['dist_lineage_proportion'] = my_df['lineage_count_unique']/total_mtblineage_uc ########################################################################### #%% AA property change #-------------------- @@ -219,15 +260,6 @@ def setvars(gene,drug): #========================== my_df_ml = my_df.copy() - #%% Masking columns (mCSM-lig, mCSM-NA, mCSM-ppi2) values for lig_dist >10 - my_df_ml['mutationinformation'][my_df['ligand_distance']>10].value_counts() - my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts() - my_df_ml.groupby(['mutationinformation'])['ligand_distance'].apply(lambda x: (x>10)).value_counts() - - my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), 'ligand_affinity_change'] = 0 - (my_df_ml['ligand_affinity_change'] == 0).sum() - - #%%######################################################################## #========================== # BLIND test set #========================== @@ -253,8 +285,32 @@ def setvars(gene,drug): , 'ddg_dynamut2' , 'mmcsm_lig' , 'contacts'] + + # Build stability columns ~ gene + if gene.lower() in geneL_basic: + X_stabilityN = common_cols_stabiltyN + cols_to_mask = ['ligand_affinity_change'] + + if gene.lower() in geneL_ppi2: + # X_stabilityN = common_cols_stabiltyN + ['mcsm_ppi2_affinity' , 'interface_dist'] + geneL_ppi2_st_cols = ['mcsm_ppi2_affinity', 'interface_dist'] + X_stabilityN = common_cols_stabiltyN + geneL_ppi2_st_cols + cols_to_mask = ['ligand_affinity_change', 'mcsm_ppi2_affinity'] + + if gene.lower() in geneL_na: + # X_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] + geneL_na_st_cols = ['mcsm_na_affinity'] + X_stabilityN = common_cols_stabiltyN + geneL_na_st_cols + cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity'] + + if gene.lower() in geneL_na_ppi2: + # X_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] + geneL_na_ppi2_st_cols = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] + X_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols + cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity', 'mcsm_ppi2_affinity'] + - foldX_cols = [ 'electro_rr', 'electro_mm', 'electro_sm', 'electro_ss' + X_foldX_cols = [ 'electro_rr', 'electro_mm', 'electro_sm', 'electro_ss' , 'disulfide_rr', 'disulfide_mm', 'disulfide_sm', 'disulfide_ss' , 'hbonds_rr', 'hbonds_mm', 'hbonds_sm', 'hbonds_ss' , 'partcov_rr', 'partcov_mm', 'partcov_sm', 'partcov_ss' @@ -262,11 +318,13 @@ def setvars(gene,drug): , 'volumetric_rr', 'volumetric_mm', 'volumetric_ss' ] - X_strFN = ['rsa' + X_str = ['rsa' #, 'asa' , 'kd_values' , 'rd_values'] + X_ssFN = X_stabilityN + X_str + X_foldX_cols + X_evolFN = ['consurf_score' , 'snap2_score' , 'provean_score'] @@ -287,12 +345,14 @@ def setvars(gene,drug): , 'lineage_count_unique' ] - X_genomicFN = X_genomic_mafor+X_genomic_linegae + X_genomicFN = X_genomic_mafor + X_genomic_linegae #%% Construct numerical and categorical column names # numerical feature names - numerical_FN = common_cols_stabiltyN + foldX_cols + X_strFN + X_evolFN + X_genomicFN +# numerical_FN = common_cols_stabiltyN + foldX_cols + X_strFN + X_evolFN + X_genomicFN + numerical_FN = X_ssFN + X_evolFN + X_genomicFN + #categorical feature names categorical_FN = ['ss_class' # , 'wt_prop_water' @@ -306,9 +366,34 @@ def setvars(gene,drug): , 'polarity_change' , 'water_change' , 'drtype_mode_labels' # beware then you can use it to predict -# , 'active_aa_pos' # TODO? + #, 'active_aa_pos' # TODO? ] + ########################################################################### + #======================= + # Masking columns: + # (mCSM-lig, mCSM-NA, mCSM-ppi2) values for lig_dist >10 + #======================= + #%% Masking columns + # my_df_ml['mutationinformation'][my_df['ligand_distance']>10].value_counts() + # my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts() + # my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), 'ligand_affinity_change'] = 0 + # (my_df_ml['ligand_affinity_change'] == 0).sum() + + my_df_ml['mutationinformation'][my_df_ml['ligand_distance']>10].value_counts() + my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts() + my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask].value_counts() + + # mask the column ligand distance > 10 + my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask] = 0 + (my_df_ml['ligand_affinity_change'] == 0).sum() + + mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask] + + # write file for check + mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True) + mask_check.to_csv(outdir + 'ml/' + gene.lower() + '_mask_check.csv') + #%% extracting dfs based on numerical, categorical column names #---------------------------------- # WITHOUT the target var included @@ -335,12 +420,12 @@ def setvars(gene,drug): all_df_wtgt = training_df[numerical_FN + categorical_FN + ['dst_mode']] all_df_wtgt.shape - #%%================================================================ - #%% Apply ML - - #%% Data + #%%######################################################################## + #============ + # ML data + #============ #------ - # X + # X: Training and Blind test (BTS) #------ X = all_df_wtgt[numerical_FN + categorical_FN] # training data ALL X_bts = blind_test_df[numerical_FN + categorical_FN] # blind test data ALL @@ -352,17 +437,18 @@ def setvars(gene,drug): #------ y = all_df_wtgt['dst_mode'] # training data y y_bts = blind_test_df['dst_mode'] # blind data test y - - #Blind test data {same format} - #X_bts = blind_test_df[numerical_FN] - #X_bts = blind_test_df[numerical_FN + categorical_FN] - #y_bts = blind_test_df['dst_mode'] - - X_bts_wt = blind_test_df[numerical_FN + ['dst_mode']] + + #X_bts_wt = blind_test_df[numerical_FN + ['dst_mode']] # Quick check - (X['ligand_affinity_change']==0).sum() == (X['ligand_distance']>10).sum() - ############################################################################## + #(X['ligand_affinity_change']==0).sum() == (X['ligand_distance']>10).sum() + for i in range(len(cols_to_mask)): + ind = i+1 + print('\nindex:', i, '\nind:', ind) + print('\nMask count check:' + , (my_df_ml[cols_to_mask[i]]==0).sum() == (my_df_ml['ligand_distance']>10).sum() + ) + print('Original Data\n', Counter(y) , 'Data dim:', X.shape) diff --git a/UQ_MultClassPipe4.py b/UQ_MultClassPipe4.py old mode 100644 new mode 100755 diff --git a/UQ_MultModelsCl.py b/UQ_MultModelsCl.py old mode 100644 new mode 100755 index 942d980..078d60a --- a/UQ_MultModelsCl.py +++ b/UQ_MultModelsCl.py @@ -74,13 +74,13 @@ import json rs = {'random_state': 42} njobs = {'n_jobs': 10} -scoring_fn = ({'accuracy' : make_scorer(accuracy_score) - , 'fscore' : make_scorer(f1_score) - , 'mcc' : make_scorer(matthews_corrcoef) - , 'precision' : make_scorer(precision_score) - , 'recall' : make_scorer(recall_score) - , 'roc_auc' : make_scorer(roc_auc_score) - , 'jcc' : make_scorer(jaccard_score) +scoring_fn = ({ 'mcc' : make_scorer(matthews_corrcoef) + , 'accuracy' : make_scorer(accuracy_score) + , 'fscore' : make_scorer(f1_score) + , 'precision' : make_scorer(precision_score) + , 'recall' : make_scorer(recall_score) + , 'roc_auc' : make_scorer(roc_auc_score) + , 'jcc' : make_scorer(jaccard_score) }) skf_cv = StratifiedKFold(n_splits = 10 diff --git a/UQ_RF.py b/UQ_RF.py old mode 100644 new mode 100755 diff --git a/UQ_TODO_categorical_classification_columns.py b/UQ_TODO_categorical_classification_columns.py old mode 100644 new mode 100755 diff --git a/UQ_imbalance.py b/UQ_imbalance.py old mode 100644 new mode 100755 diff --git a/UQ_or_impute.py b/UQ_or_impute.py old mode 100644 new mode 100755 diff --git a/UQ_pnca_ML.py b/UQ_pnca_ML.py old mode 100644 new mode 100755 diff --git a/UQ_practice.py b/UQ_practice.py old mode 100644 new mode 100755 diff --git a/UQ_yc_RunAllClfs.py b/UQ_yc_RunAllClfs.py old mode 100644 new mode 100755 diff --git a/UQ_yc_RunAllClfs_CALL.py b/UQ_yc_RunAllClfs_CALL.py new file mode 100755 index 0000000..7b46437 --- /dev/null +++ b/UQ_yc_RunAllClfs_CALL.py @@ -0,0 +1,101 @@ + +from UQ_yc_RunAllClfs import run_all_ML + +#%% CALL function +#run_all_ML(input_pd=X, target_label=y, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed') +# Baseline_data + +YC_resD2 = run_all_ML(input_pd=X, target_label=y, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed') +CVResultsDF_baseline = YC_resD2['CrossValResultsDF'] +CVResultsDF_baseline.sort_values(by=['matthew'], ascending=False, inplace=True) +BTSResultsDF_baseline = YC_resD2['BlindTestResultsDF'] +BTSResultsDF_baseline.sort_values(by=['matthew'], ascending=False, inplace=True) + +# from sklearn.utils import all_estimators +# for name, algorithm in all_estimators(type_filter="classifier"): +# clf = algorithm() +# print('Name:', name, '\nAlgo:', clf) + +# Random Oversampling +YC_resD_ros = run_all_ML(input_pd=X_ros, target_label=y_ros, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed') +CVResultsDF_ros = YC_resD_ros['CrossValResultsDF'] +CVResultsDF_ros.sort_values(by=['matthew'], ascending=False, inplace=True) +BTSResultsDF_ros = YC_resD_ros['BlindTestResultsDF'] +BTSResultsDF_ros.sort_values(by=['matthew'], ascending=False, inplace=True) + +# Random Undersampling +YC_resD_rus = run_all_ML(input_pd=X_rus, target_label=y_rus, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed') +CVResultsDF_rus = YC_resD_rus['CrossValResultsDF'] +CVResultsDF_rus.sort_values(by=['matthew'], ascending=False, inplace=True) +BTSResultsDF_rus = YC_resD_rus['BlindTestResultsDF'] +BTSResultsDF_rus.sort_values(by=['matthew'], ascending=False, inplace=True) + +# Random Oversampling+Undersampling +YC_resD_rouC = run_all_ML(input_pd=X_rouC, target_label=y_rouC, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed') +CVResultsDF_rouC = YC_resD_rouC['CrossValResultsDF'] +CVResultsDF_rouC.sort_values(by=['matthew'], ascending=False, inplace=True) +BTSResultsDF_rouC = YC_resD_rouC['BlindTestResultsDF'] +BTSResultsDF_rouC.sort_values(by=['matthew'], ascending=False, inplace=True) + +# SMOTE NC +YC_resD_smnc = run_all_ML(input_pd=X_smnc, target_label=y_smnc, blind_test_input_df=X_bts, blind_test_target=y_bts, preprocess = True, var_type = 'mixed') +CVResultsDF_smnc = YC_resD_smnc['CrossValResultsDF'] +CVResultsDF_smnc.sort_values(by=['matthew'], ascending=False, inplace=True) +BTSResultsDF_smnc = YC_resD_smnc['BlindTestResultsDF'] +BTSResultsDF_smnc.sort_values(by=['matthew'], ascending=False, inplace=True) +############################################################################## +#============================================ +# BASELINE models with dissected featues +#============================================ +# Genomics +yC_gf = run_all_ML(input_pd=X[X_genomicFN], target_label=y, blind_test_input_df=X_bts[X_genomicFN], blind_test_target=y_bts, preprocess = True, var_type = 'mixed') +yc_gfCT_baseline= yC_gf['CrossValResultsDF'] +yc_gfCT_baseline.sort_values(by=['matthew'], ascending=False, inplace=True) +yc_gfBT_baseline = yC_gf['BlindTestResultsDF'] +yc_gfBT_baseline.sort_values(by=['matthew'], ascending=False, inplace=True) + +# Evolutionary +yC_ev = run_all_ML(input_pd=X[X_evolFN], target_label=y, blind_test_input_df=X_bts[X_evolFN], blind_test_target=y_bts, preprocess = True, var_type = 'mixed') +yc_evCT_baseline= yC_ev['CrossValResultsDF'] +yc_evCT_baseline.sort_values(by=['matthew'], ascending=False, inplace=True) +yc_evBT_baseline = yC_ev['BlindTestResultsDF'] +yc_evBT_baseline.sort_values(by=['matthew'], ascending=False, inplace=True) + +# strucF:All +yC_sfall = run_all_ML(input_pd=X[X_strFN], target_label=y, blind_test_input_df=X_bts[X_strFN], blind_test_target=y_bts, preprocess = True, var_type = 'mixed') +yc_sfallCT_baseline= yC_sfall['CrossValResultsDF'] +yc_sfallCT_baseline.sort_values(by=['matthew'], ascending=False, inplace=True) +yc_sfallBT_baseline = yC_sfall['BlindTestResultsDF'] +yc_sfallBT_baseline.sort_values(by=['matthew'], ascending=False, inplace=True) + +# strucF:Common ONLY +c = [x for x in X_ssFN if x not in X_foldX_cols] +yC_sfco= run_all_ML(input_pd=X[X_stabilityN], target_label=y + , blind_test_input_df=X_bts[x_stabilityN] + , blind_test_target=y_bts, preprocess = True, var_type = 'mixed') +yc_sfcoCT_baseline= yC_sfco['CrossValResultsDF'] +yc_sfcoCT_baseline.sort_values(by=['matthew'], ascending=False, inplace=True) +yc_sfcoBT_baseline = yC_sfco['BlindTestResultsDF'] +yc_sfcoBT_baseline.sort_values(by=['matthew'], ascending=False, inplace=True) + +# strucF:common_stability + foldX_cols i.e interaction +yC_fxss= run_all_ML(input_pd=X[common_cols_stabiltyN+foldX_cols], target_label=y + , blind_test_input_df=X_bts[common_cols_stabiltyN+foldX_cols] + , blind_test_target=y_bts, preprocess = True, var_type = 'mixed') +yc_fxssCT_baseline= yC_fxss['CrossValResultsDF'] +yc_fxssCT_baseline.sort_values(by=['matthew'], ascending=False, inplace=True) +yc_fxssBT_baseline = yC_fxss['BlindTestResultsDF'] +yc_fxssBT_baseline.sort_values(by=['matthew'], ascending=False, inplace=True) + +# categorical +yC_cat= run_all_ML(input_pd=X[categorical_FN], target_label=y + , blind_test_input_df=X_bts[categorical_FN] + , blind_test_target=y_bts, preprocess = True, var_type = 'mixed') +yc_catCT_baseline= yC_cat['CrossValResultsDF'] +yc_catCT_baseline.sort_values(by=['matthew'], ascending=False, inplace=True) +yc_catBT_baseline = yC_cat['BlindTestResultsDF'] +yc_catBT_baseline.sort_values(by=['matthew'], ascending=False, inplace=True) + +#================================================= +# Dissected features with Over and Undersampling +#================================================= diff --git a/alr_config.py b/alr_config.py new file mode 100755 index 0000000..593cb71 --- /dev/null +++ b/alr_config.py @@ -0,0 +1,176 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Sat May 28 05:25:30 2022 + +@author: tanu +""" + +import os + +gene = 'alr' +drug = 'cycloserine' +#total_mtblineage_u = 8 + +homedir = os.path.expanduser("~") +os.chdir( homedir + '/git/ML_AI_training/') + +from UQ_ML_data import * +setvars(gene,drug) +from UQ_ML_data import * + +# from YC run_all_ML: run locally +#from UQ_yc_RunAllClfs import run_all_ML + +# TT run all ML clfs: baseline mode +from UQ_MultModelsCl import MultModelsCl + +#%%########################################################################### + +print('\n#####################################################################\n') + +print('TESTING cmd:' + , '\nGene name:', gene + , '\nDrug name:', drug + , '\nTotal input features:', X.shape + , '\n', Counter(y)) + +print('Strucutral features (n):' + , len(X_ssFN) + , '\nThese are:' + , '\nCommon stablity features:', X_stabilityN + , '\nFoldX columns:', X_foldX_cols + , '\nOther struc columns:', X_str + , '\n================================================================\n') + +print('Evolutionary features (n):' + , len(X_evolFN) + , '\nThese are:\n' + , X_evolFN + , '\n================================================================\n') + +print('Genomic features (n):' + , len(X_genomicFN) + , '\nThese are:\n' + , X_genomic_mafor, '\n' + , X_genomic_linegae + , '\n================================================================\n') + +print('Categorical features (n):' + , len(categorical_FN) + , '\nThese are:\n' + , categorical_FN + , '\n================================================================\n') + +if ( len(X.columns) == len(X_ssFN) +len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ): + print('\nPass: No. of features match') +else: + print('\nFail: Count of feature mismatch') + +print('\n#####################################################################\n') + + +################################################################################ +#================== +# Baseline models +#================== +mm_skf_scoresD = MultModelsCl(input_df = X + , target = y + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) + +baseline_all = pd.DataFrame(mm_skf_scoresD) +baseline_all = baseline_all.T +#baseline_train = baseline_all.filter(like='train_', axis=1) +baseline_CT = baseline_all.filter(like='test_', axis=1) +baseline_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +baseline_BT = baseline_all.filter(like='bts_', axis=1) +baseline_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +baseline_CT.to_csv(outdir + 'ml/' + gene.lower() + '_baseline_CT_allF.csv') +baseline_BT.to_csv(outdir + 'ml/' + gene.lower() + '_baseline_BT_allF.csv') + + +#%% SMOTE NC: Oversampling [Numerical + categorical] +mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc + , target = y_smnc + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) +smnc_all = pd.DataFrame(mm_skf_scoresD7) +smnc_all = smnc_all.T + +smnc_CT = smnc_all.filter(like='test_', axis=1) +smnc_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +smnc_BT = smnc_all.filter(like='bts_', axis=1) +smnc_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +smnc_CT.to_csv(outdir + 'ml/' + gene.lower() + '_smnc_CT_allF.csv') +smnc_BT.to_csv(outdir + 'ml/' + gene.lower() + '_smnc_BT_allF.csv') + +#%% ROS: Numerical + categorical +mm_skf_scoresD3 = MultModelsCl(input_df = X_ros + , target = y_ros + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) +ros_all = pd.DataFrame(mm_skf_scoresD3) +ros_all = ros_all.T + +ros_CT = ros_all.filter(like='test_', axis=1) +ros_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +ros_BT = ros_all.filter(like='bts_', axis=1) +ros_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +ros_CT.to_csv(outdir + 'ml/' + gene.lower() + '_ros_CT_allF.csv') +ros_BT.to_csv(outdir + 'ml/' + gene.lower() + '_ros_BT_allF.csv') + +#%% RUS: Numerical + categorical +mm_skf_scoresD4 = MultModelsCl(input_df = X_rus + , target = y_rus + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) +rus_all = pd.DataFrame(mm_skf_scoresD4) +rus_all = rus_all.T + +rus_CT = rus_all.filter(like='test_', axis=1) +rus_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +rus_BT = rus_all.filter(like='bts_' , axis=1) +rus_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +rus_CT.to_csv(outdir + 'ml/' + gene.lower() + '_rus_CT_allF.csv') +rus_BT.to_csv(outdir + 'ml/' + gene.lower() + '_rus_BT_allF.csv') + +#%% ROS + RUS Combined: Numerical + categorical +mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC + , target = y_rouC + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) +rouC_all = pd.DataFrame(mm_skf_scoresD8) +rouC_all = rouC_all.T + +rouC_CT = rouC_all.filter(like='test_', axis=1) +rouC_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +rouC_BT = rouC_all.filter(like='bts_', axis=1) +rouC_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +rouC_CT.to_csv(outdir + 'ml/' + gene.lower() + '_rouC_CT_allF.csv') +rouC_BT.to_csv(outdir + 'ml/' + gene.lower() + '_rouC_BT_allF.csv') diff --git a/base_estimator.py b/base_estimator.py old mode 100644 new mode 100755 diff --git a/base_estimator2.py b/base_estimator2.py old mode 100644 new mode 100755 diff --git a/base_estimator3.py b/base_estimator3.py old mode 100644 new mode 100755 diff --git a/classification_params_FS.py b/classification_params_FS.py old mode 100644 new mode 100755 diff --git a/cross_validate_vs_loopity_loop.py b/cross_validate_vs_loopity_loop.py old mode 100644 new mode 100755 diff --git a/embb_config.py b/embb_config.py new file mode 100755 index 0000000..e685568 --- /dev/null +++ b/embb_config.py @@ -0,0 +1,176 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Sat May 28 05:25:30 2022 + +@author: tanu +""" + +import os + +gene = 'embB' +drug = 'ethambutol' +#total_mtblineage_u = 8 + +homedir = os.path.expanduser("~") +os.chdir( homedir + '/git/ML_AI_training/') + +from UQ_ML_data import * +setvars(gene,drug) +from UQ_ML_data import * + +# from YC run_all_ML: run locally +#from UQ_yc_RunAllClfs import run_all_ML + +# TT run all ML clfs: baseline mode +from UQ_MultModelsCl import MultModelsCl + +#%%########################################################################### + +print('\n#####################################################################\n') + +print('TESTING cmd:' + , '\nGene name:', gene + , '\nDrug name:', drug + , '\nTotal input features:', X.shape + , '\n', Counter(y)) + +print('Strucutral features (n):' + , len(X_ssFN) + , '\nThese are:' + , '\nCommon stablity features:', X_stabilityN + , '\nFoldX columns:', X_foldX_cols + , '\nOther struc columns:', X_str + , '\n================================================================\n') + +print('Evolutionary features (n):' + , len(X_evolFN) + , '\nThese are:\n' + , X_evolFN + , '\n================================================================\n') + +print('Genomic features (n):' + , len(X_genomicFN) + , '\nThese are:\n' + , X_genomic_mafor, '\n' + , X_genomic_linegae + , '\n================================================================\n') + +print('Categorical features (n):' + , len(categorical_FN) + , '\nThese are:\n' + , categorical_FN + , '\n================================================================\n') + +if ( len(X.columns) == len(X_ssFN) +len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ): + print('\nPass: No. of features match') +else: + print('\nFail: Count of feature mismatch') + +print('\n#####################################################################\n') + + +################################################################################ +#================== +# Baseline models +#================== +mm_skf_scoresD = MultModelsCl(input_df = X + , target = y + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) + +baseline_all = pd.DataFrame(mm_skf_scoresD) +baseline_all = baseline_all.T +#baseline_train = baseline_all.filter(like='train_', axis=1) +baseline_CT = baseline_all.filter(like='test_', axis=1) +baseline_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +baseline_BT = baseline_all.filter(like='bts_', axis=1) +baseline_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +baseline_CT.to_csv(outdir + 'ml/' + gene.lower() + '_baseline_CT_allF.csv') +baseline_BT.to_csv(outdir + 'ml/' + gene.lower() + '_baseline_BT_allF.csv') + + +#%% SMOTE NC: Oversampling [Numerical + categorical] +mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc + , target = y_smnc + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) +smnc_all = pd.DataFrame(mm_skf_scoresD7) +smnc_all = smnc_all.T + +smnc_CT = smnc_all.filter(like='test_', axis=1) +smnc_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +smnc_BT = smnc_all.filter(like='bts_', axis=1) +smnc_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +smnc_CT.to_csv(outdir + 'ml/' + gene.lower() + '_smnc_CT_allF.csv') +smnc_BT.to_csv(outdir + 'ml/' + gene.lower() + '_smnc_BT_allF.csv') + +#%% ROS: Numerical + categorical +mm_skf_scoresD3 = MultModelsCl(input_df = X_ros + , target = y_ros + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) +ros_all = pd.DataFrame(mm_skf_scoresD3) +ros_all = ros_all.T + +ros_CT = ros_all.filter(like='test_', axis=1) +ros_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +ros_BT = ros_all.filter(like='bts_', axis=1) +ros_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +ros_CT.to_csv(outdir + 'ml/' + gene.lower() + '_ros_CT_allF.csv') +ros_BT.to_csv(outdir + 'ml/' + gene.lower() + '_ros_BT_allF.csv') + +#%% RUS: Numerical + categorical +mm_skf_scoresD4 = MultModelsCl(input_df = X_rus + , target = y_rus + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) +rus_all = pd.DataFrame(mm_skf_scoresD4) +rus_all = rus_all.T + +rus_CT = rus_all.filter(like='test_', axis=1) +rus_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +rus_BT = rus_all.filter(like='bts_' , axis=1) +rus_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +rus_CT.to_csv(outdir + 'ml/' + gene.lower() + '_rus_CT_allF.csv') +rus_BT.to_csv(outdir + 'ml/' + gene.lower() + '_rus_BT_allF.csv') + +#%% ROS + RUS Combined: Numerical + categorical +mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC + , target = y_rouC + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) +rouC_all = pd.DataFrame(mm_skf_scoresD8) +rouC_all = rouC_all.T + +rouC_CT = rouC_all.filter(like='test_', axis=1) +rouC_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +rouC_BT = rouC_all.filter(like='bts_', axis=1) +rouC_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +rouC_CT.to_csv(outdir + 'ml/' + gene.lower() + '_rouC_CT_allF.csv') +rouC_BT.to_csv(outdir + 'ml/' + gene.lower() + '_rouC_BT_allF.csv') diff --git a/gid_config.py b/gid_config.py new file mode 100755 index 0000000..11cbc00 --- /dev/null +++ b/gid_config.py @@ -0,0 +1,176 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Sat May 28 05:25:30 2022 + +@author: tanu +""" + +import os + +gene = 'gid' +drug = 'streptomycin' +#total_mtblineage_u = 8 + +homedir = os.path.expanduser("~") +os.chdir( homedir + '/git/ML_AI_training/') + +from UQ_ML_data import * +setvars(gene,drug) +from UQ_ML_data import * + +# from YC run_all_ML: run locally +#from UQ_yc_RunAllClfs import run_all_ML + +# TT run all ML clfs: baseline mode +from UQ_MultModelsCl import MultModelsCl + +#%%########################################################################### + +print('\n#####################################################################\n') + +print('TESTING cmd:' + , '\nGene name:', gene + , '\nDrug name:', drug + , '\nTotal input features:', X.shape + , '\n', Counter(y)) + +print('Strucutral features (n):' + , len(X_ssFN) + , '\nThese are:' + , '\nCommon stablity features:', X_stabilityN + , '\nFoldX columns:', X_foldX_cols + , '\nOther struc columns:', X_str + , '\n================================================================\n') + +print('Evolutionary features (n):' + , len(X_evolFN) + , '\nThese are:\n' + , X_evolFN + , '\n================================================================\n') + +print('Genomic features (n):' + , len(X_genomicFN) + , '\nThese are:\n' + , X_genomic_mafor, '\n' + , X_genomic_linegae + , '\n================================================================\n') + +print('Categorical features (n):' + , len(categorical_FN) + , '\nThese are:\n' + , categorical_FN + , '\n================================================================\n') + +if ( len(X.columns) == len(X_ssFN) +len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ): + print('\nPass: No. of features match') +else: + print('\nFail: Count of feature mismatch') + +print('\n#####################################################################\n') + +################################################################################ + +#================== +# Baseline models +#================== +mm_skf_scoresD = MultModelsCl(input_df = X + , target = y + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) + +baseline_all = pd.DataFrame(mm_skf_scoresD) +baseline_all = baseline_all.T +#baseline_train = baseline_all.filter(like='train_', axis=1) +baseline_CT = baseline_all.filter(like='test_', axis=1) +baseline_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +baseline_BT = baseline_all.filter(like='bts_', axis=1) +baseline_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +baseline_CT.to_csv(outdir + 'ml/' + gene.lower() + '_baseline_CT_allF.csv') +baseline_BT.to_csv(outdir + 'ml/' + gene.lower() + '_baseline_BT_allF.csv') + + +#%% SMOTE NC: Oversampling [Numerical + categorical] +mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc + , target = y_smnc + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) +smnc_all = pd.DataFrame(mm_skf_scoresD7) +smnc_all = smnc_all.T + +smnc_CT = smnc_all.filter(like='test_', axis=1) +smnc_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +smnc_BT = smnc_all.filter(like='bts_', axis=1) +smnc_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +smnc_CT.to_csv(outdir + 'ml/' + gene.lower() + '_smnc_CT_allF.csv') +smnc_BT.to_csv(outdir + 'ml/' + gene.lower() + '_smnc_BT_allF.csv') + +#%% ROS: Numerical + categorical +mm_skf_scoresD3 = MultModelsCl(input_df = X_ros + , target = y_ros + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) +ros_all = pd.DataFrame(mm_skf_scoresD3) +ros_all = ros_all.T + +ros_CT = ros_all.filter(like='test_', axis=1) +ros_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +ros_BT = ros_all.filter(like='bts_', axis=1) +ros_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +ros_CT.to_csv(outdir + 'ml/' + gene.lower() + '_ros_CT_allF.csv') +ros_BT.to_csv(outdir + 'ml/' + gene.lower() + '_ros_BT_allF.csv') + +#%% RUS: Numerical + categorical +mm_skf_scoresD4 = MultModelsCl(input_df = X_rus + , target = y_rus + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) +rus_all = pd.DataFrame(mm_skf_scoresD4) +rus_all = rus_all.T + +rus_CT = rus_all.filter(like='test_', axis=1) +rus_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +rus_BT = rus_all.filter(like='bts_' , axis=1) +rus_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +rus_CT.to_csv(outdir + 'ml/' + gene.lower() + '_rus_CT_allF.csv') +rus_BT.to_csv(outdir + 'ml/' + gene.lower() + '_rus_BT_allF.csv') + +#%% ROS + RUS Combined: Numerical + categorical +mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC + , target = y_rouC + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) +rouC_all = pd.DataFrame(mm_skf_scoresD8) +rouC_all = rouC_all.T + +rouC_CT = rouC_all.filter(like='test_', axis=1) +rouC_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +rouC_BT = rouC_all.filter(like='bts_', axis=1) +rouC_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +rouC_CT.to_csv(outdir + 'ml/' + gene.lower() + '_rouC_CT_allF.csv') +rouC_BT.to_csv(outdir + 'ml/' + gene.lower() + '_rouC_BT_allF.csv') diff --git a/grid_search_vs_base_estimator.py b/grid_search_vs_base_estimator.py old mode 100644 new mode 100755 diff --git a/gscv.py b/gscv.py old mode 100644 new mode 100755 diff --git a/gscv_eg.py b/gscv_eg.py old mode 100644 new mode 100755 diff --git a/imbalance_p1.py b/imbalance_p1.py old mode 100644 new mode 100755 diff --git a/imbalance_p2.py b/imbalance_p2.py old mode 100644 new mode 100755 diff --git a/imports.py b/imports.py old mode 100644 new mode 100755 index f0a0dfe..a28ddc7 --- a/imports.py +++ b/imports.py @@ -99,8 +99,8 @@ from MultClassPipe2 import MultClassPipeline2 from loopity_loop import MultClassPipeSKFLoop from MultClassPipe3 import MultClassPipeSKFCV -gene = 'pncA' -drug = 'pyrazinamide' +#gene = 'pncA' +#drug = 'pyrazinamide' #============== # directories @@ -119,10 +119,10 @@ my_df = pd.read_csv(infile_ml1) my_df.dtypes my_df_cols = my_df.columns -geneL_basic = ['pnca'] +geneL_basic = ['pncA'] geneL_na = ['gid'] -geneL_na_ppi2 = ['rpob'] -geneL_ppi2 = ['alr', 'embb', 'katg'] +geneL_na_ppi2 = ['rpoB'] +geneL_ppi2 = ['alr', 'embB', 'katG'] #%% get cols mycols = my_df.columns diff --git a/imports_unsup.py b/imports_unsup.py old mode 100644 new mode 100755 diff --git a/intra_model_gscv.py b/intra_model_gscv.py old mode 100644 new mode 100755 diff --git a/itertools.py b/itertools.py old mode 100644 new mode 100755 diff --git a/katg_config.py b/katg_config.py new file mode 100755 index 0000000..882e8eb --- /dev/null +++ b/katg_config.py @@ -0,0 +1,174 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Sat May 28 05:25:30 2022 + +@author: tanu +""" + +import os + +gene = 'katG' +drug = 'isoniazid' +#total_mtblineage_u = 8 + +homedir = os.path.expanduser("~") +os.chdir( homedir + '/git/ML_AI_training/') + +from UQ_ML_data import * +setvars(gene,drug) +from UQ_ML_data import * + +# from YC run_all_ML: run locally +#from UQ_yc_RunAllClfs import run_all_ML + +# TT run all ML clfs: baseline mode +from UQ_MultModelsCl import MultModelsCl + +#%%########################################################################### + +print('\n#####################################################################\n') + +print('TESTING cmd:' + , '\nGene name:', gene + , '\nDrug name:', drug + , '\nTotal input features:', X.shape + , '\n', Counter(y)) + +print('Strucutral features (n):' + , len(X_ssFN) + , '\nThese are:' + , '\nCommon stablity features:', X_stabilityN + , '\nFoldX columns:', X_foldX_cols + , '\nOther struc columns:', X_str + , '\n================================================================\n') + +print('Evolutionary features (n):' + , len(X_evolFN) + , '\nThese are:\n' + , X_evolFN + , '\n================================================================\n') + +print('Genomic features (n):' + , len(X_genomicFN) + , '\nThese are:\n' + , X_genomic_mafor, '\n' + , X_genomic_linegae + , '\n================================================================\n') + +print('Categorical features (n):' + , len(categorical_FN) + , '\nThese are:\n' + , categorical_FN + , '\n================================================================\n') + +if ( len(X.columns) == len(X_ssFN) +len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ): + print('\nPass: No. of features match') +else: + print('\nFail: Count of feature mismatch') + +print('\n#####################################################################\n') + +#================== +# Baseline models +#================== +mm_skf_scoresD = MultModelsCl(input_df = X + , target = y + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) + +baseline_all = pd.DataFrame(mm_skf_scoresD) +baseline_all = baseline_all.T +#baseline_train = baseline_all.filter(like='train_', axis=1) +baseline_CT = baseline_all.filter(like='test_', axis=1) +baseline_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +baseline_BT = baseline_all.filter(like='bts_', axis=1) +baseline_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +baseline_CT.to_csv(outdir + 'ml/' + gene.lower() + '_baseline_CT_allF.csv') +baseline_BT.to_csv(outdir + 'ml/' + gene.lower() + '_baseline_BT_allF.csv') + + +#%% SMOTE NC: Oversampling [Numerical + categorical] +mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc + , target = y_smnc + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) +smnc_all = pd.DataFrame(mm_skf_scoresD7) +smnc_all = smnc_all.T + +smnc_CT = smnc_all.filter(like='test_', axis=1) +smnc_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +smnc_BT = smnc_all.filter(like='bts_', axis=1) +smnc_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +smnc_CT.to_csv(outdir + 'ml/' + gene.lower() + '_smnc_CT_allF.csv') +smnc_BT.to_csv(outdir + 'ml/' + gene.lower() + '_smnc_BT_allF.csv') + +#%% ROS: Numerical + categorical +mm_skf_scoresD3 = MultModelsCl(input_df = X_ros + , target = y_ros + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) +ros_all = pd.DataFrame(mm_skf_scoresD3) +ros_all = ros_all.T + +ros_CT = ros_all.filter(like='test_', axis=1) +ros_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +ros_BT = ros_all.filter(like='bts_', axis=1) +ros_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +ros_CT.to_csv(outdir + 'ml/' + gene.lower() + '_ros_CT_allF.csv') +ros_BT.to_csv(outdir + 'ml/' + gene.lower() + '_ros_BT_allF.csv') + +#%% RUS: Numerical + categorical +mm_skf_scoresD4 = MultModelsCl(input_df = X_rus + , target = y_rus + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) +rus_all = pd.DataFrame(mm_skf_scoresD4) +rus_all = rus_all.T + +rus_CT = rus_all.filter(like='test_', axis=1) +rus_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +rus_BT = rus_all.filter(like='bts_' , axis=1) +rus_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +rus_CT.to_csv(outdir + 'ml/' + gene.lower() + '_rus_CT_allF.csv') +rus_BT.to_csv(outdir + 'ml/' + gene.lower() + '_rus_BT_allF.csv') + +#%% ROS + RUS Combined: Numerical + categorical +mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC + , target = y_rouC + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) +rouC_all = pd.DataFrame(mm_skf_scoresD8) +rouC_all = rouC_all.T + +rouC_CT = rouC_all.filter(like='test_', axis=1) +rouC_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +rouC_BT = rouC_all.filter(like='bts_', axis=1) +rouC_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +rouC_CT.to_csv(outdir + 'ml/' + gene.lower() + '_rouC_CT_allF.csv') +rouC_BT.to_csv(outdir + 'ml/' + gene.lower() + '_rouC_BT_allF.csv') diff --git a/loopity_loop.py b/loopity_loop.py old mode 100644 new mode 100755 diff --git a/loopity_loop_CALL.py b/loopity_loop_CALL.py old mode 100644 new mode 100755 diff --git a/pnca_config.py b/pnca_config.py index 794914e..71cfbb0 100755 --- a/pnca_config.py +++ b/pnca_config.py @@ -10,7 +10,7 @@ import os gene = 'pncA' drug = 'pyrazinamide' -#total_mtblineage_u = 8 +#total_mtblineage_uc = 8 homedir = os.path.expanduser("~") os.chdir( homedir + '/git/ML_AI_training/') @@ -20,7 +20,7 @@ setvars(gene,drug) from UQ_ML_data import * # from YC run_all_ML: run locally -from UQ_yc_RunAllClfs import run_all_ML +#from UQ_yc_RunAllClfs import run_all_ML # TT run all ML clfs: baseline mode from UQ_MultModelsCl import MultModelsCl @@ -28,6 +28,7 @@ from UQ_MultModelsCl import MultModelsCl #%%########################################################################### print('\n#####################################################################\n') + print('TESTING cmd:' , '\nGene name:', gene , '\nDrug name:', drug @@ -35,11 +36,11 @@ print('TESTING cmd:' , '\n', Counter(y)) print('Strucutral features (n):' - , len(common_cols_stabiltyN) + len(foldX_cols) + len(X_strFN) + , len(X_ssFN) , '\nThese are:' - , '\nCommon stablity features:', common_cols_stabiltyN - , '\nFoldX columns:', foldX_cols - , '\nOther struc columns:', X_strFN + , '\nCommon stablity features:', X_stabilityN + , '\nFoldX columns:', X_foldX_cols + , '\nOther struc columns:', X_str , '\n================================================================\n') print('Evolutionary features (n):' @@ -60,7 +61,115 @@ print('Categorical features (n):' , '\nThese are:\n' , categorical_FN , '\n================================================================\n') + +if ( len(X.columns) == len(X_ssFN) +len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ): + print('\nPass: No. of features match') +else: + print('\nFail: Count of feature mismatch') + print('\n#####################################################################\n') ################################################################################ +#================== +# Baseline models +#================== +mm_skf_scoresD = MultModelsCl(input_df = X + , target = y + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) +baseline_all = pd.DataFrame(mm_skf_scoresD) +baseline_all = baseline_all.T +#baseline_train = baseline_all.filter(like='train_', axis=1) +baseline_CT = baseline_all.filter(like='test_', axis=1) +baseline_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +baseline_BT = baseline_all.filter(like='bts_', axis=1) +baseline_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +baseline_CT.to_csv(outdir + 'ml/' + gene.lower() + '_baseline_CT_allF.csv') +baseline_BT.to_csv(outdir + 'ml/' + gene.lower() + '_baseline_BT_allF.csv') + + +#%% SMOTE NC: Oversampling [Numerical + categorical] +mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc + , target = y_smnc + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) +smnc_all = pd.DataFrame(mm_skf_scoresD7) +smnc_all = smnc_all.T + +smnc_CT = smnc_all.filter(like='test_', axis=1) +smnc_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +smnc_BT = smnc_all.filter(like='bts_', axis=1) +smnc_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +smnc_CT.to_csv(outdir + 'ml/' + gene.lower() + '_smnc_CT_allF.csv') +smnc_BT.to_csv(outdir + 'ml/' + gene.lower() + '_smnc_BT_allF.csv') + +#%% ROS: Numerical + categorical +mm_skf_scoresD3 = MultModelsCl(input_df = X_ros + , target = y_ros + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) +ros_all = pd.DataFrame(mm_skf_scoresD3) +ros_all = ros_all.T + +ros_CT = ros_all.filter(like='test_', axis=1) +ros_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +ros_BT = ros_all.filter(like='bts_', axis=1) +ros_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +ros_CT.to_csv(outdir + 'ml/' + gene.lower() + '_ros_CT_allF.csv') +ros_BT.to_csv(outdir + 'ml/' + gene.lower() + '_ros_BT_allF.csv') + +#%% RUS: Numerical + categorical +mm_skf_scoresD4 = MultModelsCl(input_df = X_rus + , target = y_rus + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) +rus_all = pd.DataFrame(mm_skf_scoresD4) +rus_all = rus_all.T + +rus_CT = rus_all.filter(like='test_', axis=1) +rus_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +rus_BT = rus_all.filter(like='bts_' , axis=1) +rus_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +rus_CT.to_csv(outdir + 'ml/' + gene.lower() + '_rus_CT_allF.csv') +rus_BT.to_csv(outdir + 'ml/' + gene.lower() + '_rus_BT_allF.csv') + +#%% ROS + RUS Combined: Numerical + categorical +mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC + , target = y_rouC + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) +rouC_all = pd.DataFrame(mm_skf_scoresD8) +rouC_all = rouC_all.T + +rouC_CT = rouC_all.filter(like='test_', axis=1) +rouC_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +rouC_BT = rouC_all.filter(like='bts_', axis=1) +rouC_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +rouC_CT.to_csv(outdir + 'ml/' + gene.lower() + '_rouC_CT_allF.csv') +rouC_BT.to_csv(outdir + 'ml/' + gene.lower() + '_rouC_BT_allF.csv') diff --git a/rfecv_vis.py b/rfecv_vis.py new file mode 100644 index 0000000..9fb87ec --- /dev/null +++ b/rfecv_vis.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Sun May 29 12:21:34 2022 + +@author: tanu +""" +from sklearn.svm import SVC +from sklearn.datasets import make_classification + +from yellowbrick.model_selection import RFECV + +# Instantiate RFECV visualizer with a linear SVM classifier +visualizer = RFECV(SVC(kernel='linear', C=1)) + +visualizer.fit(X[numerical_FN], y) # Fit the data to the visualizer +visualizer.show() + + +numerical_ix = X.select_dtypes(include=['int64', 'float64']).columns +numerical_ix +categorical_ix = X.select_dtypes(include=['object', 'bool']).columns +categorical_ix + +# Determine preprocessing steps ~ var_type +var_type = 'mixed' +var_type = 'numerical' + +if var_type == 'numerical': + t = [('num', MinMaxScaler(), numerical_ix)] + +if var_type == 'categorical': + t = [('cat', OneHotEncoder(), categorical_ix)] + +if var_type == 'mixed': + t = [('cat', OneHotEncoder(), categorical_ix) + , ('num', MinMaxScaler(), numerical_ix)] + + t = [('num', MinMaxScaler(), numerical_ix) + , ('cat', OneHotEncoder(), categorical_ix)] + +col_transform = ColumnTransformer(transformers = t + , remainder='passthrough') +#--------------ALEX help +# col_transform +# col_transform.fit(X) +# test = col_transform.transform(X) +# print(col_transform.get_feature_names_out()) + +# foo = col_transform.fit_transform(X) +Xm = col_transform.fit_transform(X) +# (foo == test).all() +#----------------------- + +visualizer.fit(Xm, y) # Fit the data to the visualizer +visualizer.show() + + +visualizer.fit(X[numerical_FN], y) # Fit the data to the visualizer +visualizer.show() diff --git a/rfecv_with_ohe.py b/rfecv_with_ohe.py old mode 100644 new mode 100755 diff --git a/rpob_config.py b/rpob_config.py new file mode 100755 index 0000000..346a049 --- /dev/null +++ b/rpob_config.py @@ -0,0 +1,176 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Sat May 28 05:25:30 2022 + +@author: tanu +""" + +import os + +gene = 'rpoB' +drug = 'rifampicin' +#total_mtblineage_u = 8 + +homedir = os.path.expanduser("~") +os.chdir( homedir + '/git/ML_AI_training/') + +from UQ_ML_data import * +setvars(gene,drug) +from UQ_ML_data import * + +# from YC run_all_ML: run locally +#from UQ_yc_RunAllClfs import run_all_ML + +# TT run all ML clfs: baseline mode +from UQ_MultModelsCl import MultModelsCl + +#%%########################################################################### + +print('\n#####################################################################\n') + +print('TESTING cmd:' + , '\nGene name:', gene + , '\nDrug name:', drug + , '\nTotal input features:', X.shape + , '\n', Counter(y)) + +print('Strucutral features (n):' + , len(X_ssFN) + , '\nThese are:' + , '\nCommon stablity features:', X_stabilityN + , '\nFoldX columns:', X_foldX_cols + , '\nOther struc columns:', X_str + , '\n================================================================\n') + +print('Evolutionary features (n):' + , len(X_evolFN) + , '\nThese are:\n' + , X_evolFN + , '\n================================================================\n') + +print('Genomic features (n):' + , len(X_genomicFN) + , '\nThese are:\n' + , X_genomic_mafor, '\n' + , X_genomic_linegae + , '\n================================================================\n') + +print('Categorical features (n):' + , len(categorical_FN) + , '\nThese are:\n' + , categorical_FN + , '\n================================================================\n') + +if ( len(X.columns) == len(X_ssFN) +len(X_evolFN) + len(X_genomicFN) + len(categorical_FN) ): + print('\nPass: No. of features match') +else: + print('\nFail: Count of feature mismatch') + +print('\n#####################################################################\n') + + +################################################################################ +#================== +# Baseline models +#================== +mm_skf_scoresD = MultModelsCl(input_df = X + , target = y + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) + +baseline_all = pd.DataFrame(mm_skf_scoresD) +baseline_all = baseline_all.T +#baseline_train = baseline_all.filter(like='train_', axis=1) +baseline_CT = baseline_all.filter(like='test_', axis=1) +baseline_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +baseline_BT = baseline_all.filter(like='bts_', axis=1) +baseline_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +baseline_CT.to_csv(outdir + 'ml/' + gene.lower() + '_baseline_CT_allF.csv') +baseline_BT.to_csv(outdir + 'ml/' + gene.lower() + '_baseline_BT_allF.csv') + + +#%% SMOTE NC: Oversampling [Numerical + categorical] +mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc + , target = y_smnc + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) +smnc_all = pd.DataFrame(mm_skf_scoresD7) +smnc_all = smnc_all.T + +smnc_CT = smnc_all.filter(like='test_', axis=1) +smnc_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +smnc_BT = smnc_all.filter(like='bts_', axis=1) +smnc_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +smnc_CT.to_csv(outdir + 'ml/' + gene.lower() + '_smnc_CT_allF.csv') +smnc_BT.to_csv(outdir + 'ml/' + gene.lower() + '_smnc_BT_allF.csv') + +#%% ROS: Numerical + categorical +mm_skf_scoresD3 = MultModelsCl(input_df = X_ros + , target = y_ros + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) +ros_all = pd.DataFrame(mm_skf_scoresD3) +ros_all = ros_all.T + +ros_CT = ros_all.filter(like='test_', axis=1) +ros_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +ros_BT = ros_all.filter(like='bts_', axis=1) +ros_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +ros_CT.to_csv(outdir + 'ml/' + gene.lower() + '_ros_CT_allF.csv') +ros_BT.to_csv(outdir + 'ml/' + gene.lower() + '_ros_BT_allF.csv') + +#%% RUS: Numerical + categorical +mm_skf_scoresD4 = MultModelsCl(input_df = X_rus + , target = y_rus + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) +rus_all = pd.DataFrame(mm_skf_scoresD4) +rus_all = rus_all.T + +rus_CT = rus_all.filter(like='test_', axis=1) +rus_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +rus_BT = rus_all.filter(like='bts_' , axis=1) +rus_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +rus_CT.to_csv(outdir + 'ml/' + gene.lower() + '_rus_CT_allF.csv') +rus_BT.to_csv(outdir + 'ml/' + gene.lower() + '_rus_BT_allF.csv') + +#%% ROS + RUS Combined: Numerical + categorical +mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC + , target = y_rouC + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts) +rouC_all = pd.DataFrame(mm_skf_scoresD8) +rouC_all = rouC_all.T + +rouC_CT = rouC_all.filter(like='test_', axis=1) +rouC_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +rouC_BT = rouC_all.filter(like='bts_', axis=1) +rouC_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# Write csv +rouC_CT.to_csv(outdir + 'ml/' + gene.lower() + '_rouC_CT_allF.csv') +rouC_BT.to_csv(outdir + 'ml/' + gene.lower() + '_rouC_BT_allF.csv') diff --git a/temp.py b/temp.py new file mode 100755 index 0000000..324b5ca --- /dev/null +++ b/temp.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Sun May 29 09:22:51 2022 + +@author: tanu +""" + +geneL_basic = ['pncA'] +geneL_na = ['gid'] +geneL_na_ppi2 = ['rpoB'] +geneL_ppi2 = ['alr', 'embB', 'katG'] +#%% get cols +mycols = my_df.columns + +# # change from numberic to +# num_type = ['int64', 'float64'] +# cat_type = ['object', 'bool'] + +# if my_df['active_aa_pos'].dtype in num_type: +# my_df['active_aa_pos'] = my_df['active_aa_pos'].astype(object) +# my_df['active_aa_pos'].dtype + +# FIXME: if this is not structural, remove from source.. +# Drop NA where numerical cols have them +if gene.lower() in geneL_na_ppi2: + #D1148 get rid of + na_index = my_df['mutationinformation'].index[my_df['mcsm_na_affinity'].apply(np.isnan)] + my_df = my_df.drop(index=na_index) + +# FIXME: either impute or remove! +# for embb (L114M, F115L, V123L, V125I, V131M) delete for now +if gene.lower() in ['embb']: + na_index = my_df['mutationinformation'].index[my_df['ligand_distance'].apply(np.isnan)] + #my_df = my_df.drop(index=na_index))# RERUN embb with the 5 values now present +#%%=========================================================================== + +#%% +# GET X +common_cols_stabiltyN = ['ligand_distance' + , 'ligand_affinity_change' + , 'duet_stability_change' + , 'ddg_foldx' + , 'deepddg' + , 'ddg_dynamut2' + , 'contacts'] + +# Build stability columns ~ gene +if gene.lower() in geneL_basic: + x_stabilityN = common_cols_stabiltyN + cols_to_mask = ['ligand_affinity_change'] + +if gene.lower() in geneL_ppi2: +# x_stabilityN = common_cols_stabiltyN + ['mcsm_ppi2_affinity' , 'interface_dist'] + geneL_ppi2_st_cols = ['mcsm_ppi2_affinity', 'interface_dist'] + x_stabilityN = common_cols_stabiltyN + geneL_ppi2_st_cols + cols_to_mask = ['ligand_affinity_change', 'mcsm_ppi2_affinity'] + +if gene.lower() in geneL_na: +# x_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] + geneL_na_st_cols = ['mcsm_na_affinity'] + x_stabilityN = common_cols_stabiltyN + geneL_na_st_cols + cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity'] + +if gene.lower() in geneL_na_ppi2: +# x_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] + geneL_na_ppi2_st_cols = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] + x_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols + cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity', 'mcsm_ppi2_affinity'] + + +#%% Masking columns (mCSM-lig, mCSM-NA, mCSM-ppi2) values for lig_dist >10 +my_df_ml['mutationinformation'][my_df_ml['ligand_distance']>10].value_counts() +my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts() +my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask].value_counts() + +# mask the column ligand distance > 10 +my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask] = 0 +(my_df_ml['ligand_affinity_change'] == 0).sum() + +mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask] + + +for i in range(len(cols_to_mask)): + ind = i+1 + print('\nindex:', i, '\nind:', ind) + print('\nMask count check:' + , (my_df_ml[cols_to_mask[i]]==0).sum() == (my_df_ml['ligand_distance']>10).sum() + ) + +(my_df_ml[cols_to_mask[0]]==0).sum() == (my_df_ml['ligand_distance']>10).sum() +(my_df_ml[cols_to_mask[1]]==0).sum() == (my_df_ml['ligand_distance']>10).sum() diff --git a/umap_fs.py b/umap_fs.py old mode 100644 new mode 100755 diff --git a/unsup_v1.py b/unsup_v1.py old mode 100644 new mode 100755