# LSHTM_analysis/scripts/ml/pnca_config_dissected.py

# 207 lines
# 8.1 KiB
# Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022
@author: tanu
"""
import os
# Target gene and drug for this run; both are passed to setvars() below.
gene = 'pncA'
drug = 'pyrazinamide'
#total_mtblineage_uc = 8
# Switch the working directory to the ML scripts folder under the user's home.
# NOTE(review): presumably this is what lets the project-local modules
# imported below resolve regardless of where the script is launched -- confirm.
homedir = os.path.expanduser("~")
os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
#---------------------------
# Version 1: no AAindex
#from UQ_ML_data import *
#setvars(gene,drug)
#from UQ_ML_data import *
#---------------------------
# The star import appears twice, once before and once after setvars().
# The first makes setvars itself available here.
# NOTE(review): presumably setvars() (re)binds module-level names inside
# ml_data_dissected and the second star import copies them into this
# namespace -- verify against ml_data_dissected before collapsing the two.
from ml_data_dissected import *
setvars(gene,drug)
from ml_data_dissected import *
# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML
# TT run all ML clfs: baseline mode
from MultModelsCl_dissected import MultModelsCl_dissected
############################################################################
# Announce which analysis variant, gene and drug this run covers.
# The pieces are collected first and then handed to print() as separate
# positional arguments, so the default single-space separator still applies.
_run_banner = (
    '\n#####################################################################\n',
    '\nRunning ML analysis: UQ [without AA index but with active site annotations]',
    '\nGene name:', gene,
    '\nDrug name:', drug,
)
print(*_run_banner)

#==================
# Specify outdir
#==================
# Output location for this analysis, built by plain string concatenation
# onto `outdir` (which is not defined in this file).
# NOTE(review): the value has no trailing path separator, while the
# commented-out to_csv() calls further down build file names as
# outdir_ml + gene.lower() + '...' -- confirm whether a '/' is intended.
outdir_ml = outdir + 'ml/uq_v1/dissected'
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
# Feature bookkeeping: combine the per-group feature-name collections,
# print a summary of every group, and verify that the number of columns in
# X equals the total number of feature names.  Exits with a non-zero status
# if the counts disagree.
print('\n================================================================\n')

# Structural features = stability + affinity + residue-property names.
# NOTE(review): the original had the bare expression
#   X_aaindex_Fnum + X_str_Fnum + X_aap_Fcat
# on the next line, which computed a value and discarded it.  Presumably it
# documents the make-up of X_resprop_FN (those three groups are reported
# under "Residue prop" below) -- confirm against ml_data_dissected.
X_structural_FN = X_stability_FN + X_affinityFN + X_resprop_FN

# Full feature-name collection across the three top-level groups.
all_featuresN = X_evolFN + X_structural_FN + X_genomicFN

print('\n================================================================'
      , '\nTotal Evolutionary features (n):', len(X_evolFN)
      , '\n--------------Evol. feature colnames:', X_evolFN
      , '\n================================================================'
      , '\n\nTotal structural features (n):', len(X_structural_FN)
      , '\n--------Stability ncols:', len(X_stability_FN)
      , '\n--------------Common stability colnames:', X_common_stability_Fnum
      , '\n--------------Foldx colnames:', X_foldX_Fnum
      , '\n--------Affinity ncols:', len(X_affinityFN)
      , '\n--------------Common affinity colnames:', common_affinity_Fnum
      , '\n--------------Gene specific affinity colnames:', gene_affinity_colnames
      , '\n--------Residue prop ncols:', len(X_resprop_FN)
      , '\n--------------Residue Prop cols:', X_str_Fnum
      , '\n--------------AA change Prop cols:', X_aap_Fcat
      , '\n--------------AA index cols:', X_aaindex_Fnum
      , '\n================================================================'
      , '\n\nTotal Genomic features (n):', len(X_genomicFN)
      , '\n--------MAF+OR cols:', len(X_gn_mafor_Fnum)
      , '\n--------------MAF+OR colnames:', X_gn_mafor_Fnum
      , '\n--------Lineage cols:', len(X_gn_linegae_Fnum)
      , '\n--------------Lineage cols:', X_gn_linegae_Fnum
      , '\n--------Other cols:', len(X_gn_Fcat)
      , '\n--------------Other cols:', X_gn_Fcat
      , '\n================================================================')

# Sanity check
# Compute both counts once so the comparison and the failure report are
# guaranteed to show the same numbers.
n_features_expected = len(X_evolFN) + len(X_structural_FN) + len(X_genomicFN)
n_features_got = len(X.columns)

if n_features_got == n_features_expected:
    print('\nPass: No. of features match')
else:
    print('\nFail: Count of feature mismatch'
          , '\nExpected:', n_features_expected
          , '\nGot:', n_features_got)
    # Abort with a non-zero status so callers can detect the failure.
    # Raising SystemExit directly avoids depending on `sys`, which this
    # file never imports itself.
    raise SystemExit(1)

print('\n#####################################################################\n')
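# ###############################################################################
# NOTE: everything from here on is disabled (commented-out) model-run code,
# kept for reference.  Points to reconcile before re-enabling it:
#   - the baseline section calls MultModelsCl_dissected, but the resampling
#     sections (SMOTE NC, ROS, RUS, ROS + RUS) call MultModelsCl, which this
#     script does not import;
#   - the CSV paths are built as outdir_ml + gene.lower() + '_..._allF.csv',
#     and outdir_ml ('.../ml/uq_v1/dissected') has no trailing separator.
# #==================
# # Baseline models
# #==================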
# mm_skf_scoresD = MultModelsCl_dissected(input_df = X
# , target = y
# , var_type = 'mixed'
# , skf_cv = skf_cv
# , blind_test_input_df = X_bts
# , blind_test_target = y_bts)
# baseline_all = pd.DataFrame(mm_skf_scoresD)
# baseline_all = baseline_all.T
# #baseline_train = baseline_all.filter(like='train_', axis=1)
# baseline_CT = baseline_all.filter(like='test_', axis=1)
# baseline_CT.sort_values(by=['test_mcc'], ascending=False, inplace=True)
# baseline_BT = baseline_all.filter(like='bts_', axis=1)
# baseline_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
# # Write csv
# baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv')
# baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')
# #%% SMOTE NC: Oversampling [Numerical + categorical]
# mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc
# , target = y_smnc
# , var_type = 'mixed'
# , skf_cv = skf_cv
# , blind_test_input_df = X_bts
# , blind_test_target = y_bts)
# smnc_all = pd.DataFrame(mm_skf_scoresD7)
# smnc_all = smnc_all.T
# smnc_CT = smnc_all.filter(like='test_', axis=1)
# smnc_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
# smnc_BT = smnc_all.filter(like='bts_', axis=1)
# smnc_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
# # Write csv
# smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv')
# smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')
# #%% ROS: Numerical + categorical
# mm_skf_scoresD3 = MultModelsCl(input_df = X_ros
# , target = y_ros
# , var_type = 'mixed'
# , skf_cv = skf_cv
# , blind_test_input_df = X_bts
# , blind_test_target = y_bts)
# ros_all = pd.DataFrame(mm_skf_scoresD3)
# ros_all = ros_all.T
# ros_CT = ros_all.filter(like='test_', axis=1)
# ros_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
# ros_BT = ros_all.filter(like='bts_', axis=1)
# ros_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
# # Write csv
# ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv')
# ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')
# #%% RUS: Numerical + categorical
# mm_skf_scoresD4 = MultModelsCl(input_df = X_rus
# , target = y_rus
# , var_type = 'mixed'
# , skf_cv = skf_cv
# , blind_test_input_df = X_bts
# , blind_test_target = y_bts)
# rus_all = pd.DataFrame(mm_skf_scoresD4)
# rus_all = rus_all.T
# rus_CT = rus_all.filter(like='test_', axis=1)
# rus_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
# rus_BT = rus_all.filter(like='bts_' , axis=1)
# rus_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
# # Write csv
# rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv')
# rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')
# #%% ROS + RUS Combined: Numerical + categorical
# mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC
# , target = y_rouC
# , var_type = 'mixed'
# , skf_cv = skf_cv
# , blind_test_input_df = X_bts
# , blind_test_target = y_bts)
# rouC_all = pd.DataFrame(mm_skf_scoresD8)
# rouC_all = rouC_all.T
# rouC_CT = rouC_all.filter(like='test_', axis=1)
# rouC_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
# rouC_BT = rouC_all.filter(like='bts_', axis=1)
# rouC_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
# # Write csv
# rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv')
# rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')