#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 29 20:29:36 2022

@author: tanu

Per-gene Boruta feature selection followed by multi-classifier evaluation.

For every (gene, drug) pair, split type and data type configured below,
the script:

1. loads the gene data with ``getmldata`` and splits it with ``split_tts``;
2. scales / one-hot encodes the predictors with a ``ColumnTransformer``
   fitted on the training split only;
3. fits a baseline ``RandomForestClassifier`` and prints its MCC on the
   blind-test split;
4. runs ``BorutaPy`` on the training split, prints the MCC of the forest
   refitted on the selected features, and writes the Boruta ranking and
   the selected feature names to CSV;
5. runs ``MultModelsCl`` on the selected features and writes the scores
   to CSV.
"""
# Provenance: recovered from the patch "added ml_iterator.py"
# (commit 58c25e23c00626060a37037d3013bb59500071c0, Tanushree Tunstall,
# Sat, 3 Sep 2022 14:42:13 +0100), which created
# scripts/ml/ml_iterator_FS.py (mode 100755).
import sys
import os
import pandas as pd
import numpy as np
import re

###############################################################################
homedir = os.path.expanduser("~")
sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml/ml_functions')
###############################################################################
outdir = homedir + '/git/LSHTM_ML/output/feature_selection/ind_gene/'

# Kind of predictors in the training data: 'numerical', 'categorical' or
# 'mixed'. The script reads `var_type` when building the column transformer,
# so it must exist. It is assigned *before* the star imports so that a value
# exported by one of those modules (if any) still takes precedence.
# NOTE(review): 'mixed' is an assumed default -- confirm against the data.
var_type = 'mixed'

#====================
# Import ML functions
#====================
# NOTE(review): the scikit-learn names used below (StratifiedKFold,
# MinMaxScaler, OneHotEncoder, ColumnTransformer, RandomForestClassifier,
# matthews_corrcoef), BorutaPy and gene_model_paramD are not imported or
# defined explicitly in this script; presumably they are re-exported by
# these star imports -- confirm.
from MultClfs import *
from GetMLData import *
from SplitTTS import *

skf_cv = StratifiedKFold(n_splits = 10
                         #, shuffle = False, random_state= None)
                         , shuffle = True, random_state = 42)

n_jobs = os.cpu_count()
njobs = {'n_jobs': n_jobs}
rs = {'random_state': 42}


#ml_genes = ["pncA", "embB", "katG", "rpoB", "gid"]
ml_gene_drugD = {
    'pncA' : 'pyrazinamide',
    'embB' : 'ethambutol'#,
    #'katG' : 'isoniazid',
    #'rpoB' : 'rifampicin',
    #'gid'  : 'streptomycin'
    }
gene_dataD = {}
split_types = [
    #'70_30',
    '80_20',
    'sl',
    #'rt',
    #'none_bts'
    ]

split_data_types = [
    #'actual',
    'complete'
    ]


def _build_transformers(predictor_type, numerical_ix, categorical_ix):
    """Return the ColumnTransformer step list for *predictor_type*.

    Parameters
    ----------
    predictor_type : str
        One of 'numerical', 'categorical' or 'mixed'.
    numerical_ix, categorical_ix : pandas.Index
        Column labels of the numerical / categorical predictors.

    Raises
    ------
    ValueError
        If *predictor_type* is not one of the three supported values.
    """
    num_step = ('num', MinMaxScaler(), numerical_ix)
    # handle_unknown='ignore': the encoder is fitted on the training split
    # only, so a category seen only in the blind-test split must not raise.
    cat_step = ('cat', OneHotEncoder(handle_unknown = 'ignore'), categorical_ix)

    steps_by_type = {
        'numerical'  : [num_step],
        'categorical': [cat_step],
        'mixed'      : [num_step, cat_step],
        }
    if predictor_type not in steps_by_type:
        raise ValueError("var_type must be 'numerical', 'categorical' or "
                         "'mixed', got: " + repr(predictor_type))
    return steps_by_type[predictor_type]


# to_csv does not create missing directories
os.makedirs(outdir, exist_ok = True)

for gene, drug in ml_gene_drugD.items():
    print ('\nGene:', gene
           , '\nDrug:', drug)
    gene_low = gene.lower()
    gene_dataD[gene_low] = getmldata(gene, drug
                                     , **gene_model_paramD)

    for split_type in split_types:
        for data_type in split_data_types:

            # Common prefix of the three CSV files written per combination
            out_prefix = outdir + gene_low + '_' + split_type + '_' + data_type
            out_filename          = out_prefix + "_FS_" + '.csv'
            features_filename     = out_prefix + "_boruta_ranking_" + '.csv'
            sel_features_filename = out_prefix + "_boruta_selected_" + '.csv'

            tempD = split_tts(gene_dataD[gene_low]
                              , data_type = data_type
                              , split_type = split_type
                              , oversampling = True
                              , dst_colname = 'dst'
                              , target_colname = 'dst_mode'
                              , include_gene_name = True
                              )
            print("Feature Selection goes here")

            # REASSIGN for simplicity
            # X
            X_train = tempD['X'].copy()
            X_test  = tempD['X_bts'].copy()

            # Y
            y_train = tempD['y'].copy()
            y_test  = tempD['y_bts'].copy()

            numerical_ix   = X_train.select_dtypes(include=['int64', 'float64']).columns
            categorical_ix = X_train.select_dtypes(include=['object', 'bool']).columns

            # sparse_threshold=0 forces a dense result, which the
            # np.array(...) / pd.DataFrame(...) calls below rely on.
            col_transform = ColumnTransformer(
                transformers = _build_transformers(var_type
                                                   , numerical_ix
                                                   , categorical_ix)
                , remainder = 'passthrough'
                , sparse_threshold = 0)

            # Fit on the training split ONLY, then apply the fitted
            # transformer to the blind-test split. Refitting on the test
            # split would scale it with its own statistics and could yield
            # a different one-hot column layout than the training names.
            X_train = col_transform.fit_transform(X_train)
            X_test  = col_transform.transform(X_test)

            # Names of the transformed columns, in output order
            var_type_colnames = pd.Index(col_transform.get_feature_names_out())

            rf_all_features = RandomForestClassifier(n_estimators=1000, max_depth=5
                                                     , **rs, **njobs)

            # Baseline: all features
            rf_all_features.fit(np.array(X_train), np.array(y_train))
            print("RF, baseline MCC:", matthews_corrcoef(y_test, rf_all_features.predict(X_test)))

            # BORUTA and fit
            boruta_selector = BorutaPy(rf_all_features, **rs, verbose = 3)
            boruta_selector.fit(np.array(X_train), np.array(y_train))

            # Get chosen features
            print("Ranking: ", boruta_selector.ranking_)
            print("No. of significant features: ", boruta_selector.n_features_)

            X_important_train = boruta_selector.transform(np.array(X_train))
            X_important_test  = boruta_selector.transform(np.array(X_test))

            # just retesting with selected features on RF itself
            rf_all_features.fit(X_important_train, y_train)
            print("RF, Boruta MCC:", matthews_corrcoef(y_test, rf_all_features.predict(X_important_test)))

            # Boruta rank of every transformed feature
            selected_rf_features = pd.DataFrame({'Feature': list(var_type_colnames),
                                                 'Ranking': boruta_selector.ranking_})
            selected_rf_features.to_csv(features_filename, index = True)

            # Names of the features Boruta confirmed
            sel_features = var_type_colnames[boruta_selector.support_]
            pd.DataFrame(sel_features).to_csv(sel_features_filename, index = True)

            # Re-attach column names so the selected subset can be taken by name
            X_train_named = pd.DataFrame(X_train, columns = var_type_colnames)
            X_test_named  = pd.DataFrame(X_test, columns = var_type_colnames)

            X_train_FS = X_train_named[list(sel_features)]
            X_test_FS  = X_test_named[list(sel_features)]

            # use the selected features for MultModelsCl
            scoresD = MultModelsCl(input_df = X_train_FS,
                                   target = y_train,
                                   var_type = 'numerical', # A NOTE OF IT
                                   resampling_type = 'none'
                                   , sel_cv = skf_cv
                                   , tts_split_type = split_type
                                   , add_cm = True
                                   , add_yn = True
                                   , scale_numeric = ['min_max']
                                   , run_blind_test = True
                                   , blind_test_df = X_test_FS
                                   , blind_test_target = y_test
                                   , return_formatted_output = True
                                   , random_state = 42
                                   , n_jobs = n_jobs
                                   )

            #out_wf = pd.concat(scoresD, ignore_index = True)
            #out_wf_f = out_wf.sort_values(by = ['resampling', 'source_data', 'MCC'], ascending = [True, True, False], inplace = False)
            scoresD.to_csv(out_filename, index = False)