LSHTM_analysis/scripts/ml/combined_model/run_cm_logo_FS.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Sep  2 19:17:46 2022

@author: tanu
"""
###############################################################################
# Make the project's helper modules importable before the star-imports below.
# os/sys are imported here because nothing earlier in the script brings them
# into scope.
import os
import sys

# ``homedir`` already expands to the user's home directory (e.g. /home/tanu),
# so only the part of the path below it is appended. Repeating '/home/tanu'
# after it would yield a doubled path such as /home/tanu/home/tanu/git/...
homedir = os.path.expanduser("~")
sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml/combined_model')
sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml/ml_functions')
sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml')

from MultClfs import *
from cm_logo_skf_FS import *

###############################################################################
#%% FS with all genes in training
###############################################################################

# 1. Select Features
# Feature selection on the full combined dataset. The result is indexed by
# gene name, and each entry carries 'sel_features' and 'fs_ranking' (both are
# read further down in this script).
boruta_features = CMLogoSkf_FS(cm_input_df = combined_df,var_type = 'mixed', file_suffix = "complete")

# 2. Find original column names of features
# if it starts with num__, get rid of num__
# if it ends with cat__<name>_<number>, keep only <name>
# The pattern is a raw string so that \d reaches the regex engine as written
# instead of triggering Python's invalid-escape-sequence warning.
encoded_name_pattern = r'^num__|cat__(.*)_\d*$'
for i in boruta_features:
    print(i)
    original_names = [re.sub(encoded_name_pattern, r'\1', current_thing)
                      for current_thing in boruta_features[i]['sel_features']]
    # Several encoded columns can map back to the same original column.
    # De-duplicate while keeping first-seen order so that the feature order
    # (and hence the column order of the model input) is the same on every run.
    boruta_features[i]['sel_features'] = list(dict.fromkeys(original_names))

# write json
OutFile_6Tgenes = "/home/tanu/git/LSHTM_ML/output/feature_selection/boruta_features_6_Tgenes.json"
pd.DataFrame(boruta_features).to_json(path_or_buf=OutFile_6Tgenes)

# 3. Run all classification models using original column names from (2)
# One frame per gene: that gene's selected features plus the three
# bookkeeping columns that CombinedModelML is told about below.
_fs_meta_cols = ['dst', 'dst_mode', 'gene_name']
combined_df_embb = combined_df[boruta_features['embb']['sel_features'] + _fs_meta_cols]
combined_df_katg = combined_df[boruta_features['katg']['sel_features'] + _fs_meta_cols]
combined_df_pnca = combined_df[boruta_features['pnca']['sel_features'] + _fs_meta_cols]
combined_df_gid = combined_df[boruta_features['gid']['sel_features'] + _fs_meta_cols]
combined_df_rpob = combined_df[boruta_features['rpob']['sel_features'] + _fs_meta_cols]

# (bts gene, frame restricted to that gene's selected features), in run order.
_fs_runs = (
    ("embb", combined_df_embb),
    ("katg", combined_df_katg),
    ("pnca", combined_df_pnca),
    ("gid", combined_df_gid),
    ("rpob", combined_df_rpob),
)

# CombinedModelML: per the original note, see
# /home/tanu/git/LSHTM_analysis/scripts/ml/ml_functions/MultClf.py
# Fresh list literals are built for every call so that no list object is
# shared between runs.
for _bts_gene, _gene_frame in _fs_runs:
    CombinedModelML(
        _gene_frame,
        all_genes=["embb", "katg", "rpob", "pnca", "gid", "alr"],
        bts_genes=[_bts_gene],
        cols_to_drop=['dst', 'dst_mode', 'gene_name'],
        target_var='dst_mode',
        gene_group='gene_name',
        std_gene_omit=[],
        output_dir="/home/tanu/git/LSHTM_ML/output/feature_selection/",
        file_suffix="FS",
    )


# Write the feature ranking of every gene (all six genes used in training)
# to its own CSV file.
# NOTE: the 'featues' spelling is kept as-is because it is part of the
# output file name.
_ranking_dir_6T = "/home/tanu/git/LSHTM_ML/output/feature_selection/"
for gene in boruta_features:
    print(gene)
    boruta_features[gene]['fs_ranking'].to_csv(
        _ranking_dir_6T + gene + "_boruta_featues_6Tgenes.csv"
    )


###############################################################################
#%% FS without including ALR in training
###############################################################################
# With training omitting alr
# NOTE(review): unlike the all-genes run, no file_suffix is passed here --
# confirm that CMLogoSkf_FS's default does not clash with other output.
boruta_features_omit_alr = CMLogoSkf_FS(cm_input_df = combined_df
                                        , std_gene_omit = ['alr']
                                        , var_type = 'mixed')

# 2. Find original column names of features
# if it starts with num__, get rid of num__
# if it ends with cat__<name>_<number>, keep only <name>
# The pattern is a raw string so that \d reaches the regex engine as written
# instead of triggering Python's invalid-escape-sequence warning.
encoded_name_pattern_no_alr = r'^num__|cat__(.*)_\d*$'
for i in boruta_features_omit_alr:
    print(i)
    # Read from the no-alr result itself: taking the names from the all-genes
    # result here would silently replace this run's selection with the
    # features chosen when alr was part of training.
    original_names_no_alr = [re.sub(encoded_name_pattern_no_alr, r'\1', current_thing)
                             for current_thing in boruta_features_omit_alr[i]['sel_features']]
    # Several encoded columns can map back to the same original column.
    # De-duplicate while keeping first-seen order so that the feature order
    # is the same on every run.
    boruta_features_omit_alr[i]['sel_features'] = list(dict.fromkeys(original_names_no_alr))

# write json
OutFile_5Tgenes = "/home/tanu/git/LSHTM_ML/output/feature_selection/boruta_features_5_Tgenes.json"
pd.DataFrame(boruta_features_omit_alr).to_json(path_or_buf=OutFile_5Tgenes)

# 3. Run all classification models using original column names from (2)
# Drop the rows of the gene left out of training. The list is spelled out
# here (matching std_gene_omit below) instead of relying on a name that this
# script never defines.
cm_input_df5 = combined_df[~combined_df['gene_name'].isin(['alr'])]

# One frame per gene: that gene's selected features (from the no-alr
# selection) plus the three bookkeeping columns CombinedModelML is told about.
_fs_meta_cols_no_alr = ['dst', 'dst_mode', 'gene_name']
combined_df_embb_no_alr = cm_input_df5[boruta_features_omit_alr['embb']['sel_features'] + _fs_meta_cols_no_alr]
combined_df_katg_no_alr = cm_input_df5[boruta_features_omit_alr['katg']['sel_features'] + _fs_meta_cols_no_alr]
combined_df_pnca_no_alr = cm_input_df5[boruta_features_omit_alr['pnca']['sel_features'] + _fs_meta_cols_no_alr]
combined_df_gid_no_alr = cm_input_df5[boruta_features_omit_alr['gid']['sel_features'] + _fs_meta_cols_no_alr]
combined_df_rpob_no_alr = cm_input_df5[boruta_features_omit_alr['rpob']['sel_features'] + _fs_meta_cols_no_alr]

# (bts gene, frame restricted to that gene's selected features), in run order.
_fs_runs_no_alr = (
    ("embb", combined_df_embb_no_alr),
    ("katg", combined_df_katg_no_alr),
    ("pnca", combined_df_pnca_no_alr),
    ("gid", combined_df_gid_no_alr),
    ("rpob", combined_df_rpob_no_alr),
)

# The five runs differ only in the input frame and the bts gene, so they are
# driven from one loop. Fresh list literals are built for every call so that
# no list object is shared between runs.
for _bts_gene_no_alr, _gene_frame_no_alr in _fs_runs_no_alr:
    CombinedModelML(
        _gene_frame_no_alr,
        all_genes=["embb", "katg", "rpob", "pnca", "gid", "alr"],
        bts_genes=[_bts_gene_no_alr],
        cols_to_drop=['dst', 'dst_mode', 'gene_name'],
        target_var='dst_mode',
        gene_group='gene_name',
        std_gene_omit=["alr"],
        output_dir="/home/tanu/git/LSHTM_ML/output/feature_selection/",
        file_suffix="FS_no_Talr",
    )


# Write the feature ranking of every gene (alr omitted from training) to its
# own CSV file.
# NOTE: the 'featues' spelling is kept as-is because it is part of the
# output file name.
_ranking_dir_5T = "/home/tanu/git/LSHTM_ML/output/feature_selection/"
for gene in boruta_features_omit_alr:
    print(gene)
    boruta_features_omit_alr[gene]['fs_ranking'].to_csv(
        _ranking_dir_5T + gene + "_boruta_featues_5Tgenes.csv"
    )