204 lines
8.4 KiB
Python
204 lines
8.4 KiB
Python
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
"""
|
|
Created on Fri Sep 2 19:17:46 2022
|
|
|
|
@author: tanu
|
|
"""
|
|
###############################################################################
|
|
# Make the project's ML helper modules importable.
# os/sys are imported here because they are needed before the project
# star-imports below can provide anything.
import os
import sys

# expanduser("~") already yields the user's home directory (e.g. /home/tanu),
# so the paths appended below must be relative to it. Prefixing them with
# '/home/tanu' again would produce '/home/tanu/home/tanu/git/...'.
homedir = os.path.expanduser("~")
sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml/combined_model')
sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml/ml_functions')
sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml')
|
|
|
|
from MultClfs import *
|
|
from cm_logo_skf_FS import *
|
|
|
|
###############################################################################
|
|
#%% FS with all genes in training
|
|
###############################################################################
|
|
|
|
# 1. Select Features
# The result is indexed by gene; each entry exposes at least
# 'sel_features' (selected feature names) and 'fs_ranking'.
boruta_features = CMLogoSkf_FS(
    cm_input_df=combined_df,
    var_type='mixed',
    file_suffix="complete",
)

# 2. Find original column names of features
#    - if it starts with num__, get rid of num__
#    - if it starts with cat__, get rid of cat__ and the _<number> at the end
# The pattern is a raw string (so \d is a regex escape, not an invalid string
# escape) and is anchored at the start, so only a genuine prefix is stripped.
# When the num__ branch matches, group 1 is unmatched and is replaced by ''.
_feature_prefix_re = re.compile(r'^(?:num__|cat__(.*)_\d*$)')
for i in boruta_features:
    print(i)
    _cleaned = [
        _feature_prefix_re.sub(r'\1', current_thing)
        for current_thing in boruta_features[i]['sel_features']
    ]
    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # column order (and the JSON written below) is the same on every run;
    # list(set(...)) ordering varies with string-hash randomisation.
    boruta_features[i]['sel_features'] = list(dict.fromkeys(_cleaned))

# write json
OutFile_6Tgenes = "/home/tanu/git/LSHTM_ML/output/feature_selection/boruta_features_6_Tgenes.json"
pd.DataFrame(boruta_features).to_json(path_or_buf=OutFile_6Tgenes)
|
|
|
|
# 3. Run all classification models using original column names from (2)
# Build one frame per gene: that gene's selected feature columns plus the
# 'dst', 'dst_mode' and 'gene_name' columns.
(
    combined_df_embb,
    combined_df_katg,
    combined_df_pnca,
    combined_df_gid,
    combined_df_rpob,
) = (
    combined_df[boruta_features[gene]['sel_features'] + ['dst', 'dst_mode', 'gene_name']]
    for gene in ('embb', 'katg', 'pnca', 'gid', 'rpob')
)
|
|
|
|
|
|
# CombinedModelML comes from
# /home/tanu/git/LSHTM_analysis/scripts/ml/ml_functions/MultClf.py
# It is called once per gene, in the order embb, katg, pnca, gid, rpob, each
# time with that gene's feature-selected frame and the gene as bts_genes.
for _bts_gene, _gene_df in (
        ("embb", combined_df_embb),
        ("katg", combined_df_katg),
        ("pnca", combined_df_pnca),
        ("gid", combined_df_gid),
        ("rpob", combined_df_rpob),
):
    CombinedModelML(
        _gene_df,
        all_genes=["embb", "katg", "rpob", "pnca", "gid", "alr"],
        bts_genes=[_bts_gene],
        cols_to_drop=['dst', 'dst_mode', 'gene_name'],
        target_var='dst_mode',
        gene_group='gene_name',
        std_gene_omit=[],
        output_dir="/home/tanu/git/LSHTM_ML/output/feature_selection/",
        file_suffix="FS",
    )
|
|
|
|
|
|
# Write the feature ranking of every gene to its own CSV file
# (<gene>_boruta_featues_6Tgenes.csv in the feature_selection output dir).
for gene in boruta_features:
    print(gene)
    gene_fs_ranking = boruta_features[gene]['fs_ranking']
    gene_fs_ranking.to_csv(
        f"/home/tanu/git/LSHTM_ML/output/feature_selection/{gene}_boruta_featues_6Tgenes.csv"
    )
|
|
|
|
|
|
###############################################################################
|
|
#%% FS with ALR omitted from training
|
|
###############################################################################
|
|
# With training omitting alr
# The result is indexed by gene; each entry exposes at least
# 'sel_features' (selected feature names) and 'fs_ranking'.
boruta_features_omit_alr = CMLogoSkf_FS(
    cm_input_df=combined_df,
    std_gene_omit=['alr'],
    var_type='mixed',
)

# 2. Find original column names of features
#    - if it starts with num__, get rid of num__
#    - if it starts with cat__, get rid of cat__ and the _<number> at the end
# The pattern is a raw string (so \d is a regex escape, not an invalid string
# escape) and is anchored at the start, so only a genuine prefix is stripped.
# When the num__ branch matches, group 1 is unmatched and is replaced by ''.
_feature_prefix_re = re.compile(r'^(?:num__|cat__(.*)_\d*$)')
for i in boruta_features_omit_alr:
    print(i)
    # Read from boruta_features_omit_alr itself: taking the names from the
    # six-gene result would silently discard the alr-omitted selection.
    _cleaned = [
        _feature_prefix_re.sub(r'\1', current_thing)
        for current_thing in boruta_features_omit_alr[i]['sel_features']
    ]
    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # column order (and the JSON written below) is the same on every run;
    # list(set(...)) ordering varies with string-hash randomisation.
    boruta_features_omit_alr[i]['sel_features'] = list(dict.fromkeys(_cleaned))

# write json
OutFile_5Tgenes = "/home/tanu/git/LSHTM_ML/output/feature_selection/boruta_features_5_Tgenes.json"
pd.DataFrame(boruta_features_omit_alr).to_json(path_or_buf=OutFile_5Tgenes)
|
|
|
|
# 3. Run all classification models using original column names from (2)
# Drop the rows of the omitted gene first. The gene list is spelled out here
# (matching the std_gene_omit=['alr'] used for this section) instead of
# relying on an `omit_gene_alr` name that this file never defines.
# NOTE(review): if `omit_gene_alr` is provided elsewhere with a different
# value, confirm that ['alr'] is what is intended here.
cm_input_df5 = combined_df[~combined_df['gene_name'].isin(['alr'])]

# Build one frame per gene: that gene's selected feature columns plus the
# 'dst', 'dst_mode' and 'gene_name' columns.
(
    combined_df_embb_no_alr,
    combined_df_katg_no_alr,
    combined_df_pnca_no_alr,
    combined_df_gid_no_alr,
    combined_df_rpob_no_alr,
) = (
    cm_input_df5[boruta_features_omit_alr[gene]['sel_features'] + ['dst', 'dst_mode', 'gene_name']]
    for gene in ('embb', 'katg', 'pnca', 'gid', 'rpob')
)
|
|
|
|
|
|
# CombinedModelML is called once per gene, in the order embb, katg, pnca,
# gid, rpob, each time with that gene's alr-free, feature-selected frame and
# the gene as bts_genes; 'alr' is passed as std_gene_omit throughout.
for _bts_gene, _gene_df in (
        ("embb", combined_df_embb_no_alr),
        ("katg", combined_df_katg_no_alr),
        ("pnca", combined_df_pnca_no_alr),
        ("gid", combined_df_gid_no_alr),
        ("rpob", combined_df_rpob_no_alr),
):
    CombinedModelML(
        _gene_df,
        all_genes=["embb", "katg", "rpob", "pnca", "gid", "alr"],
        bts_genes=[_bts_gene],
        cols_to_drop=['dst', 'dst_mode', 'gene_name'],
        target_var='dst_mode',
        gene_group='gene_name',
        std_gene_omit=["alr"],
        output_dir="/home/tanu/git/LSHTM_ML/output/feature_selection/",
        file_suffix="FS_no_Talr",
    )
|
|
|
|
|
|
# Write the feature ranking of every gene (alr omitted from training) to its
# own CSV file (<gene>_boruta_featues_5Tgenes.csv in the output dir).
for gene in boruta_features_omit_alr:
    print(gene)
    gene_fs_ranking_no_alr = boruta_features_omit_alr[gene]['fs_ranking']
    gene_fs_ranking_no_alr.to_csv(
        f"/home/tanu/git/LSHTM_ML/output/feature_selection/{gene}_boruta_featues_5Tgenes.csv"
    )
|
|
|
|
|