#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Sep 2 19:17:46 2022

@author: tanu

Feature selection followed by classification on the selected features.

The script runs two scenarios:

1. feature selection with all genes available for training, and
2. feature selection with the 'alr' gene omitted from training.

For each scenario it:

1. selects features with ``CMLogoSkf_FS``,
2. maps the selected (encoded) feature names back to the original column
   names and writes the result as JSON,
3. runs ``CombinedModelML`` once per gene in ``BTS_GENES`` on the selected
   columns, and
4. writes the per-gene feature rankings as CSV.

NOTE(review): ``combined_df`` is not defined in this file; it is presumably
provided by one of the star imports below or by the interactive session --
confirm before running as a stand-alone script.
"""
###############################################################################
import os
import re
import sys

import pandas as pd

# expanduser("~") already yields the user's home directory, so only the path
# *relative to home* is appended (previously the home prefix was doubled).
homedir = os.path.expanduser("~")
sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml/combined_model')
sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml/ml_functions')
sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml')

from MultClfs import *
from cm_logo_skf_FS import *

###############################################################################
# Shared settings
###############################################################################
OUTPUT_DIR = "/home/tanu/git/LSHTM_ML/output/feature_selection/"

# Genes passed as `all_genes` to every CombinedModelML call.
ALL_GENES = ["embb", "katg", "rpob", "pnca", "gid", "alr"]

# Genes passed one at a time as `bts_genes` (in this order).
BTS_GENES = ["embb", "katg", "pnca", "gid", "rpob"]

# Non-feature columns kept alongside the selected features and handed to
# CombinedModelML as `cols_to_drop`.
META_COLS = ['dst', 'dst_mode', 'gene_name']

# Gene left out of training in the second scenario.
OMIT_GENES_ALR = ['alr']

# if it starts with num__, get rid of num__
# if it starts with cat__, get rid of cat__ and the _ at the end
_ENCODED_NAME_RE = re.compile(r'^num__|cat__(.*)_\d*$')


def _restore_original_feature_names(fs_results):
    """Replace encoded names in ``fs_results[gene]['sel_features']`` in place.

    Each entry of ``fs_results`` is updated so that its 'sel_features' list
    holds the de-duplicated names obtained by stripping the ``num__`` prefix,
    or the ``cat__`` prefix together with the trailing ``_<digits>`` part.
    The names are sorted so that the column order is reproducible between
    runs (a plain ``list(set(...))`` order varies with string hashing).
    """
    for gene in fs_results:
        print(gene)
        original_names = {
            _ENCODED_NAME_RE.sub(r'\1', encoded_name)
            for encoded_name in fs_results[gene]['sel_features']
        }
        fs_results[gene]['sel_features'] = sorted(original_names)


def _run_models_on_selected_features(input_df, fs_results, std_gene_omit,
                                     file_suffix):
    """Run CombinedModelML once per gene in ``BTS_GENES``.

    For each gene, ``input_df`` is restricted to that gene's selected
    features plus ``META_COLS`` before being passed to ``CombinedModelML``.
    Fresh list copies are passed on every call so the callee can never
    affect the shared settings above.
    """
    for gene in BTS_GENES:
        gene_df = input_df[fs_results[gene]['sel_features'] + META_COLS]
        # from /home/tanu/git/LSHTM_analysis/scripts/ml/ml_functions/MultClf.py
        CombinedModelML(gene_df
                        , all_genes = list(ALL_GENES)
                        , bts_genes = [gene]
                        , cols_to_drop = list(META_COLS)
                        , target_var = 'dst_mode'
                        , gene_group = 'gene_name'
                        , std_gene_omit = list(std_gene_omit)
                        , output_dir = OUTPUT_DIR
                        , file_suffix = file_suffix
                        )


def _write_feature_rankings(fs_results, filename_suffix):
    """Write ``fs_results[gene]['fs_ranking']`` to one CSV file per gene.

    The file is written to ``OUTPUT_DIR + gene + filename_suffix``.
    """
    for gene in fs_results:
        print(gene)
        fs_results[gene]['fs_ranking'].to_csv(OUTPUT_DIR + gene + filename_suffix)


###############################################################################
#%% FS with all genes in training
###############################################################################
# 1. Select Features
boruta_features = CMLogoSkf_FS(cm_input_df = combined_df
                               , var_type = 'mixed'
                               , file_suffix = "complete")

# 2. Find original column names of features
_restore_original_feature_names(boruta_features)

# write json
OutFile_6Tgenes = OUTPUT_DIR + "boruta_features_6_Tgenes.json"
pd.DataFrame(boruta_features).to_json(path_or_buf=OutFile_6Tgenes)

# 3. Run all classification models using original column names from (2)
_run_models_on_selected_features(combined_df
                                 , boruta_features
                                 , std_gene_omit = []
                                 , file_suffix = "FS")

# write all feature rankings
_write_feature_rankings(boruta_features, "_boruta_featues_6Tgenes.csv")

###############################################################################
#%% FS without training including ALR
###############################################################################
# 1. Select Features, with training omitting alr
boruta_features_omit_alr = CMLogoSkf_FS(cm_input_df = combined_df
                                        , std_gene_omit = list(OMIT_GENES_ALR)
                                        , var_type = 'mixed')

# 2. Find original column names of features
# (operates on the omit-alr results themselves; previously the names were
# read from `boruta_features`, overwriting this scenario's selection)
_restore_original_feature_names(boruta_features_omit_alr)

# write json
OutFile_5Tgenes = OUTPUT_DIR + "boruta_features_5_Tgenes.json"
pd.DataFrame(boruta_features_omit_alr).to_json(path_or_buf=OutFile_5Tgenes)

# 3. Run all classification models using original column names from (2)
# Rows belonging to the omitted gene are removed from the input first.
cm_input_df5 = combined_df[~combined_df['gene_name'].isin(OMIT_GENES_ALR)]

_run_models_on_selected_features(cm_input_df5
                                 , boruta_features_omit_alr
                                 , std_gene_omit = OMIT_GENES_ALR
                                 , file_suffix = "FS_no_Talr")

# write all feature rankings
_write_feature_rankings(boruta_features_omit_alr, "_boruta_featues_5Tgenes.csv")