# LSHTM_analysis/scripts/ml/ml_iterator_FS.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 29 20:29:36 2022

@author: tanu
"""
import sys, os
import pandas as pd
import numpy as np
import re

###############################################################################
# All locations below are resolved relative to the current user's home directory.
homedir = os.path.expanduser("~")

# Put the project's ML helper directory on the module search path so the
# helper modules imported further down can be found.
sys.path.append(f"{homedir}/git/LSHTM_analysis/scripts/ml/ml_functions")
###############################################################################
# Directory that receives the per-gene feature-selection output files.
outdir = f"{homedir}/git/LSHTM_ML/output/feature_selection/ind_gene/"

#====================
# Import ML functions
#====================
from MultClfs import *
from GetMLData import *
from SplitTTS import *

# Cross-validation splitter shared by the model runs: 10 folds, shuffled with
# a fixed seed so the folds are reproducible between runs.
# (Unshuffled alternative: shuffle = False, random_state = None.)
skf_cv = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 42)

# Number of CPUs reported by the OS (os.cpu_count() may return None).
n_jobs = os.cpu_count()

# Keyword-argument bundles, meant to be splatted into estimator constructors
# as **njobs / **rs.
njobs = dict(n_jobs = n_jobs)
rs = dict(random_state = 42)


# Gene -> drug pairs to iterate over. Only pncA and embB are switched on;
# the pairs currently switched off are katG (isoniazid), rpoB (rifampicin)
# and gid (streptomycin).
ml_gene_drugD = dict(pncA = 'pyrazinamide', embB = 'ethambutol')

# Populated by the main loop: lower-cased gene name -> result of getmldata().
gene_dataD = dict()

# Split types to run. Currently switched off: '70_30', 'rt', 'none_bts'.
split_types = ['80_20', 'sl']

# Data types to run. Currently switched off: 'actual'.
split_data_types = ['complete']


#==============================================================================
# Boruta feature selection followed by a multi-model comparison, run for every
# configured gene/drug pair x split type x data type.
#
# For each combination this:
#   1. builds the train / blind-test split with split_tts(),
#   2. min-max scales numerical columns and one-hot encodes categorical
#      columns, with the transformer fitted on the training data only,
#   3. fits a baseline random forest and runs Boruta on top of it,
#   4. writes the Boruta ranking and the selected feature names to CSV,
#   5. runs MultModelsCl() on the selected features and writes its scores.
#==============================================================================

# Create the output directory up front: each iteration is expensive, and a
# missing directory would otherwise only surface at the first to_csv() call.
os.makedirs(outdir, exist_ok = True)

for gene, drug in ml_gene_drugD.items():
    print ('\nGene:', gene
           , '\nDrug:', drug)
    gene_low = gene.lower()
    # NOTE(review): gene_model_paramD is not defined in this file; presumably
    # it is supplied by one of the star imports -- confirm.
    gene_dataD[gene_low] = getmldata(gene, drug
                                     , **gene_model_paramD)

    for split_type in split_types:
        for data_type in split_data_types:

            # Common prefix of every file written for this combination.
            out_prefix = outdir + gene_low + '_' + split_type + '_' + data_type
            out_filename = out_prefix + '_FS_.csv'
            features_filename = out_prefix + '_boruta_ranking_.csv'
            sel_features_filename = out_prefix + '_boruta_selected_.csv'

            tempD = split_tts(gene_dataD[gene_low]
                              , data_type = data_type
                              , split_type = split_type
                              , oversampling = True
                              , dst_colname = 'dst'
                              , target_colname = 'dst_mode'
                              , include_gene_name = True
                              )
            print("Feature Selection goes here")

            # Work on copies so the dict returned by split_tts() is untouched.
            X_train = tempD['X'].copy()
            X_test = tempD['X_bts'].copy()
            y_train = tempD['y'].copy()
            y_test = tempD['y_bts'].copy()

            #------------------------------------------------------------------
            # Column transformation
            #------------------------------------------------------------------
            numerical_ix = X_train.select_dtypes(include=['int64', 'float64']).columns
            categorical_ix = X_train.select_dtypes(include=['object', 'bool']).columns

            # Always register both transformers: ColumnTransformer skips a
            # transformer whose column selection is empty, so this covers the
            # numerical-only, categorical-only and mixed cases without relying
            # on an externally defined `var_type`.
            # handle_unknown = 'ignore': the blind-test set is encoded with the
            # encoder fitted on the training set, so a category seen only in
            # the test set must not raise.
            # sparse_threshold = 0: force dense output, which np.array() and
            # pd.DataFrame() below require.
            col_transform = ColumnTransformer(
                transformers = [('num', MinMaxScaler(), numerical_ix)
                                , ('cat', OneHotEncoder(handle_unknown = 'ignore'), categorical_ix)]
                , remainder = 'passthrough'
                , sparse_threshold = 0)

            # Fit on the training data ONLY, then apply the same fitted
            # transformer to the blind-test data. Re-fitting on the test set
            # would leak test statistics and could change the column layout.
            X_train = col_transform.fit_transform(X_train)
            X_test = col_transform.transform(X_test)

            # Names of the transformed columns, in output order.
            var_type_colnames = pd.Index(col_transform.get_feature_names_out())

            #------------------------------------------------------------------
            # Baseline random forest on all features
            #------------------------------------------------------------------
            rf_all_features = RandomForestClassifier(n_estimators=1000, max_depth=5
                                                      , **rs, **njobs)
            rf_all_features.fit(np.array(X_train), np.array(y_train))
            print("RF, baseline MCC:", matthews_corrcoef(y_test, rf_all_features.predict(X_test)))

            #------------------------------------------------------------------
            # Boruta feature selection
            #------------------------------------------------------------------
            boruta_selector = BorutaPy(rf_all_features, **rs, verbose = 3)
            boruta_selector.fit(np.array(X_train), np.array(y_train))

            print("Ranking: ", boruta_selector.ranking_)
            print("No. of significant features: ", boruta_selector.n_features_)

            X_important_train = boruta_selector.transform(np.array(X_train))
            X_important_test = boruta_selector.transform(np.array(X_test))

            # Sanity check: refit the same random forest on the selected
            # features only and score it on the blind-test set.
            rf_all_features.fit(X_important_train, y_train)
            print("RF, Boruta MCC:", matthews_corrcoef(y_test, rf_all_features.predict(X_important_test)))

            #------------------------------------------------------------------
            # Write Boruta results
            #------------------------------------------------------------------
            # Ranking of every transformed feature.
            selected_rf_features = pd.DataFrame({'Feature': list(var_type_colnames),
                                                 'Ranking': boruta_selector.ranking_})
            selected_rf_features.to_csv(features_filename, index = True)

            # Names of the features Boruta kept (support_ is a boolean mask).
            sel_features = var_type_colnames[boruta_selector.support_]
            pd.DataFrame(sel_features).to_csv(sel_features_filename, index = True)

            #------------------------------------------------------------------
            # Model comparison on the selected features
            #------------------------------------------------------------------
            # Re-attach column names so the selected features can be picked
            # by name.
            X_train_named = pd.DataFrame(X_train, columns = var_type_colnames)
            X_test_named = pd.DataFrame(X_test, columns = var_type_colnames)

            X_train_FS = X_train_named[list(sel_features)]
            X_test_FS = X_test_named[list(sel_features)]

            # The data is already scaled/encoded above, hence var_type is
            # passed as 'numerical' here.
            scoresD = MultModelsCl(input_df = X_train_FS,
                                   target = y_train,
                                   var_type = 'numerical',
                                   resampling_type = 'none'
                                , sel_cv = skf_cv
                                , tts_split_type = split_type
                                , add_cm = True
                                , add_yn = True
                                , scale_numeric = ['min_max']
                                , run_blind_test = True
                                , blind_test_df =  X_test_FS
                                , blind_test_target = y_test
                                , return_formatted_output = True
                                , random_state = 42
                                , n_jobs = os.cpu_count()
                                )

            scoresD.to_csv(out_filename, index = False)