#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 29 20:29:36 2022

@author: tanu

Per-gene feature selection script.

For every gene/drug pair, split type and data type configured below, this
script:
  1. loads the gene data (getmldata) and splits it (split_tts),
  2. scales/encodes the features with a ColumnTransformer fitted on the
     training split only,
  3. runs Boruta around a random forest on the training split,
  4. writes the Boruta ranking and the selected features to CSV,
  5. scores the classifiers in MultModelsCl on the selected features and
     writes the scores to CSV.
"""
import sys
import os
import pandas as pd
import numpy as np
import re

###############################################################################
homedir = os.path.expanduser("~")
# The project ML helper modules live outside the package path, so this must
# run before the star imports below.
sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml/ml_functions')
###############################################################################
outdir = homedir + '/git/LSHTM_ML/output/feature_selection/ind_gene/'
# Create the output directory up front so a long run cannot fail at the
# first to_csv() because the directory is missing.
os.makedirs(outdir, exist_ok=True)

#====================
# Import ML functions
#====================
# NOTE(review): StratifiedKFold, ColumnTransformer, MinMaxScaler,
# OneHotEncoder, RandomForestClassifier, BorutaPy, matthews_corrcoef,
# getmldata, split_tts and MultModelsCl are not imported explicitly in this
# file; presumably they are all provided by these star imports -- confirm.
from MultClfs import *
from GetMLData import *
from SplitTTS import *

skf_cv = StratifiedKFold(n_splits=10,
                         # shuffle=False, random_state=None
                         shuffle=True, random_state=42)

n_jobs = os.cpu_count()
njobs = {'n_jobs': n_jobs}
rs = {'random_state': 42}

# ml_genes = ["pncA", "embB", "katG", "rpoB", "gid"]

# Gene -> drug pairs to process.
# NOTE(review): the active/commented entries in the three collections below
# were recovered from a source whose line breaks were lost -- confirm them.
ml_gene_drugD = {
    'pncA': 'pyrazinamide',
    'embB': 'ethambutol',
    # 'katG': 'isoniazid',
    # 'rpoB': 'rifampicin',
    # 'gid': 'streptomycin',
}
gene_dataD = {}

split_types = [
    # '70_30',
    '80_20',
    'sl',
    # 'rt',
    # 'none_bts',
]

split_data_types = [
    # 'actual',
    'complete',
]

for gene, drug in ml_gene_drugD.items():
    print('\nGene:', gene, '\nDrug:', drug)
    gene_low = gene.lower()

    # NOTE(review): gene_model_paramD is not defined in this file; presumably
    # it comes from one of the star imports -- confirm.
    gene_dataD[gene_low] = getmldata(gene, drug, **gene_model_paramD)

    for split_type in split_types:
        for data_type in split_data_types:
            # Common stem of every output file for this combination.
            file_prefix = outdir + gene_low + '_' + split_type + '_' + data_type
            out_filename = file_prefix + "_FS_" + '.csv'

            tempD = split_tts(gene_dataD[gene_low],
                              data_type=data_type,
                              split_type=split_type,
                              oversampling=True,
                              dst_colname='dst',
                              target_colname='dst_mode',
                              include_gene_name=True)

            print("Feature Selection goes here")

            # Work on copies so the dict returned by split_tts is untouched.
            X_train = tempD['X'].copy()
            X_test = tempD['X_bts'].copy()
            y_train = tempD['y'].copy()
            y_test = tempD['y_bts'].copy()

            numerical_ix = X_train.select_dtypes(
                include=['int64', 'float64']).columns
            categorical_ix = X_train.select_dtypes(
                include=['object', 'bool']).columns

            # NOTE(review): var_type is never assigned in this file; unless a
            # star import supplies it this raises NameError -- confirm the
            # intended value ('numerical', 'categorical' or 'mixed').
            #
            # handle_unknown='ignore': the encoder is fitted on the training
            # split only, so a category that appears only in the blind-test
            # split must not abort the run (it is encoded as all zeros).
            if var_type == 'numerical':
                t = [('num', MinMaxScaler(), numerical_ix)]
            elif var_type == 'categorical':
                t = [('cat', OneHotEncoder(handle_unknown='ignore'),
                      categorical_ix)]
            elif var_type == 'mixed':
                t = [('num', MinMaxScaler(), numerical_ix),
                     ('cat', OneHotEncoder(handle_unknown='ignore'),
                      categorical_ix)]
            else:
                # Fail loudly instead of silently reusing the transformer
                # list left over from a previous iteration.
                raise ValueError(
                    "var_type must be 'numerical', 'categorical' or 'mixed',"
                    " got: " + repr(var_type))

            # sparse_threshold=0 forces a dense array: the code below wraps
            # the result in np.array()/pd.DataFrame(), which do not work on a
            # scipy sparse matrix.
            col_transform = ColumnTransformer(transformers=t,
                                              remainder='passthrough',
                                              sparse_threshold=0)

            # Fit on the training split ONLY, then apply the same fitted
            # transformer to both splits. Refitting on the test split would
            # scale it with its own min/max and could produce one-hot columns
            # that no longer line up with var_type_colnames.
            col_transform.fit(X_train)
            var_type_colnames = pd.Index(col_transform.get_feature_names_out())

            X_train = col_transform.transform(X_train)
            X_test = col_transform.transform(X_test)

            fs_clf = "RandomForestClassifier"
            rf_all_features = RandomForestClassifier(n_estimators=1000,
                                                     max_depth=5,
                                                     **rs, **njobs)

            # Baseline: random forest on all (transformed) features.
            rf_all_features.fit(np.array(X_train), np.array(y_train))
            print("RF, baseline MCC:",
                  matthews_corrcoef(y_test, rf_all_features.predict(X_test)))

            # Boruta feature selection, fitted on the training split.
            boruta_selector = BorutaPy(rf_all_features, **rs, verbose=3)
            boruta_selector.fit(np.array(X_train), np.array(y_train))

            print("Ranking: ", boruta_selector.ranking_)
            print("No. of significant features: ", boruta_selector.n_features_)

            X_important_train = boruta_selector.transform(np.array(X_train))
            X_important_test = boruta_selector.transform(np.array(X_test))

            # Re-test the same random forest on the selected features only.
            rf_all_features.fit(X_important_train, y_train)
            print("RF, Boruta MCC:",
                  matthews_corrcoef(
                      y_test, rf_all_features.predict(X_important_test)))

            # Ranking of every transformed feature.
            selected_rf_features = pd.DataFrame(
                {'Feature': list(var_type_colnames),
                 'Ranking': boruta_selector.ranking_})
            features_filename = file_prefix + "_boruta_ranking_" + '.csv'
            selected_rf_features.to_csv(features_filename, index=True)
            sel_rf_features_sorted = selected_rf_features.sort_values(
                by='Ranking')

            # Names of the features Boruta kept.
            sel_features = var_type_colnames[boruta_selector.support_]
            sel_features_filename = file_prefix + "_boruta_selected_" + '.csv'
            pd.DataFrame(sel_features).to_csv(sel_features_filename,
                                              index=True)

            # Re-attach column names so the selected features can be picked
            # by name.
            X_train_named = pd.DataFrame(X_train)
            X_train_named.columns = var_type_colnames
            X_test_named = pd.DataFrame(X_test)
            X_test_named.columns = var_type_colnames

            X_train_FS = X_train_named[list(sel_features)]
            X_test_FS = X_test_named[list(sel_features)]

            # Score the classifiers on the selected features only.
            # var_type is passed as 'numerical' here regardless of the
            # var_type used above, since the features have already been
            # through col_transform.
            scoresD = MultModelsCl(input_df=X_train_FS,
                                   target=y_train,
                                   var_type='numerical',
                                   resampling_type='none',
                                   sel_cv=skf_cv,
                                   tts_split_type=split_type,
                                   add_cm=True,
                                   add_yn=True,
                                   scale_numeric=['min_max'],
                                   run_blind_test=True,
                                   blind_test_df=X_test_FS,
                                   blind_test_target=y_test,
                                   return_formatted_output=True,
                                   random_state=42,
                                   n_jobs=os.cpu_count())

            scoresD.to_csv(out_filename, index=False)