LSHTM_analysis/scripts/ml/combined_model/run_cm_logo_FS.py

204 lines
8.4 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Sep 2 19:17:46 2022
@author: tanu
"""
###############################################################################
# Standard-library / third-party modules used by this script. They are imported
# explicitly so the script does not rely on names leaking in through the star
# imports further down.
import os
import re
import sys

import pandas as pd

# Make the project-local ML modules importable.
# expanduser("~") already returns the absolute home directory, so only the
# part of the path *below* home is appended here; prefixing it with
# '/home/tanu' again would yield a non-existent '<home>/home/tanu/...' entry.
homedir = os.path.expanduser("~")
_ml_scripts_dir = os.path.join(homedir, 'git', 'LSHTM_analysis', 'scripts', 'ml')
sys.path.append(os.path.join(_ml_scripts_dir, 'combined_model'))
sys.path.append(os.path.join(_ml_scripts_dir, 'ml_functions'))
sys.path.append(_ml_scripts_dir)

# Project modules; the rest of the script uses names pulled in by these star
# imports (e.g. CombinedModelML, CMLogoSkf_FS).
# NOTE(review): combined_df is not assigned anywhere in this script, so it is
# presumably provided by one of these imports as well - confirm.
from MultClfs import *
from cm_logo_skf_FS import *
###############################################################################
#%% FS with all genes in training
###############################################################################
# 1. Select features, with all genes used in training
boruta_features = CMLogoSkf_FS(cm_input_df = combined_df, var_type = 'mixed', file_suffix = "complete")

# 2. Find original column names of the selected features
#    - if a name starts with num__, get rid of num__
#    - if a name contains cat__<name>_<number> at the end, keep only <name>
# The pattern is a raw string: '\d' inside a normal string literal is an
# invalid escape sequence and triggers a warning on recent Python versions.
# NOTE: '^' anchors only the num__ alternative and '$' only the cat__ one.
transformed_name_re = re.compile(r'^num__|cat__(.*)_\d*$')
for gene in boruta_features:
    print(gene)
    original_names = [transformed_name_re.sub(r'\1', feature_name)
                      for feature_name in boruta_features[gene]['sel_features']]
    # Several encoded columns can map back to the same original column, so
    # de-duplicate. sorted() (instead of list(set(...))) keeps the resulting
    # column order identical between runs; plain set order depends on the
    # per-process string hash seed.
    boruta_features[gene]['sel_features'] = sorted(set(original_names))

# write json
OutFile_6Tgenes = "/home/tanu/git/LSHTM_ML/output/feature_selection/boruta_features_6_Tgenes.json"
pd.DataFrame(boruta_features).to_json(path_or_buf=OutFile_6Tgenes)
# 3. Run all classification models using original column names from (2)
# Every per-gene frame holds that gene's selected features plus the three
# bookkeeping columns that CombinedModelML is told about below
# (target_var, gene_group and cols_to_drop).
_fs_extra_cols = ['dst', 'dst_mode', 'gene_name']
combined_df_embb = combined_df[boruta_features['embb']['sel_features'] + _fs_extra_cols]
combined_df_katg = combined_df[boruta_features['katg']['sel_features'] + _fs_extra_cols]
combined_df_pnca = combined_df[boruta_features['pnca']['sel_features'] + _fs_extra_cols]
combined_df_gid = combined_df[boruta_features['gid']['sel_features'] + _fs_extra_cols]
combined_df_rpob = combined_df[boruta_features['rpob']['sel_features'] + _fs_extra_cols]

# CombinedModelML: per the original note, from
# /home/tanu/git/LSHTM_analysis/scripts/ml/ml_functions/MultClf.py
# All frames are built above before the first model run; the runs then happen
# in the order embb, katg, pnca, gid, rpob. The list arguments are written as
# literals inside the call so each run receives its own fresh lists.
for fs_gene, fs_gene_df in (
    ("embb", combined_df_embb),
    ("katg", combined_df_katg),
    ("pnca", combined_df_pnca),
    ("gid", combined_df_gid),
    ("rpob", combined_df_rpob),
):
    CombinedModelML(
        fs_gene_df,
        all_genes=["embb", "katg", "rpob", "pnca", "gid", "alr"],
        bts_genes=[fs_gene],  # presumably the blind-test gene - verify in CombinedModelML
        cols_to_drop=['dst', 'dst_mode', 'gene_name'],
        target_var='dst_mode',
        gene_group='gene_name',
        std_gene_omit=[],
        output_dir="/home/tanu/git/LSHTM_ML/output/feature_selection/",
        file_suffix="FS",
    )
# write all feature rankings: one CSV per key of boruta_features
# NOTE(review): 'featues' in the file name is spelled as in the original
# output; it is left untouched so existing consumers of these files still
# find them.
fs_ranking_dir_6T = "/home/tanu/git/LSHTM_ML/output/feature_selection/"
for gene in boruta_features:
    print(gene)
    ranking_csv = fs_ranking_dir_6T + gene + "_boruta_featues_6Tgenes.csv"
    # 'fs_ranking' must expose .to_csv (presumably a pandas DataFrame).
    boruta_features[gene]['fs_ranking'].to_csv(ranking_csv)
###############################################################################
#%% FS without ALR in training
###############################################################################
# 1. Select features, with alr omitted from training
# NOTE(review): unlike the 6-gene run, no file_suffix is passed here - confirm
# that CMLogoSkf_FS's default does not clash with the "complete" outputs.
boruta_features_omit_alr = CMLogoSkf_FS(cm_input_df = combined_df
                                        , std_gene_omit = ['alr']
                                        , var_type = 'mixed')

# 2. Find original column names of the selected features
#    - if a name starts with num__, get rid of num__
#    - if a name contains cat__<name>_<number> at the end, keep only <name>
# The pattern is a raw string: '\d' inside a normal string literal is an
# invalid escape sequence and triggers a warning on recent Python versions.
transformed_name_no_alr_re = re.compile(r'^num__|cat__(.*)_\d*$')
for gene in boruta_features_omit_alr:
    print(gene)
    # Read from boruta_features_omit_alr itself. Reading from the 6-gene
    # boruta_features result here would silently replace the alr-omitted
    # selection with the all-genes selection.
    original_names = [transformed_name_no_alr_re.sub(r'\1', feature_name)
                      for feature_name in boruta_features_omit_alr[gene]['sel_features']]
    # De-duplicate; sorted() keeps the column order identical between runs
    # (plain set order depends on the per-process string hash seed).
    boruta_features_omit_alr[gene]['sel_features'] = sorted(set(original_names))

# write json
OutFile_5Tgenes = "/home/tanu/git/LSHTM_ML/output/feature_selection/boruta_features_5_Tgenes.json"
pd.DataFrame(boruta_features_omit_alr).to_json(path_or_buf=OutFile_5Tgenes)
# 3. Run all classification models using original column names from (2)
# Genes left out of this run. The name omit_gene_alr was never assigned in
# this script, so it is defined here; the value mirrors the std_gene_omit
# argument used for feature selection and for the model runs below.
# NOTE(review): if a star-imported module also defines omit_gene_alr, confirm
# it holds the same value.
omit_gene_alr = ["alr"]

# Drop the rows of the omitted gene(s) before sub-setting the columns.
cm_input_df5 = combined_df[~combined_df['gene_name'].isin(omit_gene_alr)]

# Every per-gene frame holds that gene's selected features plus the three
# bookkeeping columns that CombinedModelML is told about below
# (target_var, gene_group and cols_to_drop).
_fs_extra_cols_no_alr = ['dst', 'dst_mode', 'gene_name']
combined_df_embb_no_alr = cm_input_df5[boruta_features_omit_alr['embb']['sel_features'] + _fs_extra_cols_no_alr]
combined_df_katg_no_alr = cm_input_df5[boruta_features_omit_alr['katg']['sel_features'] + _fs_extra_cols_no_alr]
combined_df_pnca_no_alr = cm_input_df5[boruta_features_omit_alr['pnca']['sel_features'] + _fs_extra_cols_no_alr]
combined_df_gid_no_alr = cm_input_df5[boruta_features_omit_alr['gid']['sel_features'] + _fs_extra_cols_no_alr]
combined_df_rpob_no_alr = cm_input_df5[boruta_features_omit_alr['rpob']['sel_features'] + _fs_extra_cols_no_alr]

# The five runs differ only in the input frame and in bts_genes, so they are
# driven from one table. All frames are built above before the first run, and
# the runs happen in the order embb, katg, pnca, gid, rpob. The list arguments
# are written as literals inside the call so each run receives fresh lists.
for fs_gene, fs_gene_df in (
    ("embb", combined_df_embb_no_alr),
    ("katg", combined_df_katg_no_alr),
    ("pnca", combined_df_pnca_no_alr),
    ("gid", combined_df_gid_no_alr),
    ("rpob", combined_df_rpob_no_alr),
):
    CombinedModelML(
        fs_gene_df,
        all_genes=["embb", "katg", "rpob", "pnca", "gid", "alr"],
        bts_genes=[fs_gene],  # presumably the blind-test gene - verify in CombinedModelML
        cols_to_drop=['dst', 'dst_mode', 'gene_name'],
        target_var='dst_mode',
        gene_group='gene_name',
        std_gene_omit=["alr"],
        output_dir="/home/tanu/git/LSHTM_ML/output/feature_selection/",
        file_suffix="FS_no_Talr",
    )
# write all feature rankings: one CSV per key of boruta_features_omit_alr
# NOTE(review): 'featues' in the file name is spelled as in the original
# output; it is left untouched so existing consumers of these files still
# find them.
fs_ranking_dir_5T = "/home/tanu/git/LSHTM_ML/output/feature_selection/"
for gene in boruta_features_omit_alr:
    print(gene)
    ranking_csv_no_alr = fs_ranking_dir_5T + gene + "_boruta_featues_5Tgenes.csv"
    # 'fs_ranking' must expose .to_csv (presumably a pandas DataFrame).
    boruta_features_omit_alr[gene]['fs_ranking'].to_csv(ranking_csv_no_alr)