#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 28 05:25:30 2022

@author: tanu

Feature-count report and sanity check for one gene/drug pair.

The script:
  1. sets the gene and drug of interest,
  2. changes into the ML scripts directory and pulls the prepared data and
     feature-name collections in through a star import of
     ``ml_data_dissected`` (calling ``setvars(gene, drug)`` in between),
  3. prints the size and the column names of each feature group,
  4. checks that the number of columns in ``X`` equals the sum of the
     evolutionary, structural and genomic feature counts, and exits with a
     non-zero status if it does not.

The model runs themselves (baseline and the resampling variants) are kept
below as commented-out code.
"""
import os
import sys

gene = 'pncA'
drug = 'pyrazinamide'
#total_mtblineage_uc = 8

homedir = os.path.expanduser("~")
# NOTE(review): presumably done so that the project modules imported below
# are importable from the working directory -- confirm.
os.chdir(homedir + '/git/LSHTM_analysis/scripts/ml/')

#---------------------------
# Version 1: no AAindex
#from UQ_ML_data import *
#setvars(gene,drug)
#from UQ_ML_data import *
#---------------------------

# setvars() is called between two identical star imports. The second import
# presumably re-exports the names that setvars() creates inside
# ml_data_dissected -- TODO confirm against that module.
from ml_data_dissected import *
setvars(gene, drug)
from ml_data_dissected import *

# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML

# TT run all ML clfs: baseline mode
from MultModelsCl_dissected import MultModelsCl_dissected

############################################################################
print('\n#####################################################################\n'
      , '\nRunning ML analysis: UQ [without AA index but with active site annotations]'
      , '\nGene name:', gene
      , '\nDrug name:', drug)

#==================
# Specify outdir
#==================
# `outdir` is not defined in this file; it is expected to come from the star
# import above.
# NOTE(review): there is no trailing '/' here, while the commented-out
# to_csv() calls at the bottom append the file name directly to outdir_ml.
# Confirm the intended path before re-enabling them.
outdir_ml = outdir + 'ml/uq_v1/dissected'
print('\nOutput directory:', outdir_ml)

#%%###########################################################################
# Per-group feature counts. All X_* / *_Fnum / *_Fcat / *_colnames names are
# expected to come from the star import above.
# The closing parenthesis belongs at the END of the argument list: when the
# call is closed right after the separator string, none of the counts below
# are printed.
print('\n================================================================\n'
      , '\n\nTotal no. of evolutionary features:' , len(X_evolFN)
      , '\n\nTotal no. of stability features:' , len(X_stability_FN)
      , '\n--------Common stabilty cols:' , len(X_common_stability_Fnum)
      , '\n--------Foldx cols:' , len(X_foldX_Fnum)
      , '\n\nTotal no. of affinity features:' , len(X_affinityFN)
      , '\n--------Common affinity cols:' , len(common_affinity_Fnum)
      , '\n--------Gene specific affinity cols:' , len(gene_affinity_colnames)
      , '\n\nTotal no. of residue level features:', len(X_resprop_FN)
      , '\n--------AA index cols:' , len(X_aaindex_Fnum)
      , '\n--------Residue Prop cols:' , len(X_str_Fnum)
      , '\n--------AA change Prop cols:' , len(X_aap_Fcat)
      , '\n\nTotal no. of genomic features:' , len(X_genomicFN)
      , '\n--------MAF+OR cols:' , len(X_gn_mafor_Fnum)
      , '\n--------Lineage cols:' , len(X_gn_linegae_Fnum)
      , '\n--------Other cols:' , len(X_gn_Fcat))

# Structural features = stability + affinity + residue-level features.
# NOTE(review): X_resprop_FN is presumably
# X_aaindex_Fnum + X_str_Fnum + X_aap_Fcat (the report above lists those three
# as its sub-groups), so they must not be added a second time here -- confirm
# in ml_data_dissected.
X_structural_FN = X_stability_FN + X_affinityFN + X_resprop_FN
all_featuresN = X_evolFN + X_structural_FN + X_genomicFN

###############################################################################
# Detailed report: counts plus the actual column names of every group.
print('\n================================================================'
      , '\nTotal Evolutionary features (n):' , len(X_evolFN)
      , '\n--------------Evol. feature colnames:', X_evolFN
      , '\n================================================================'
      , '\n\nTotal structural features (n):', len(X_structural_FN)
      , '\n--------Stability ncols:' , len(X_stability_FN)
      , '\n--------------Common stability colnames:' , X_common_stability_Fnum
      , '\n--------------Foldx colnames:' , X_foldX_Fnum
      , '\n--------Affinity ncols:' , len(X_affinityFN)
      , '\n--------------Common affinity colnames:' , common_affinity_Fnum
      , '\n--------------Gene specific affinity colnames:', gene_affinity_colnames
      , '\n--------Residue prop ncols:' , len(X_resprop_FN)
      , '\n--------------Residue Prop cols:' , X_str_Fnum
      , '\n--------------AA change Prop cols:' , X_aap_Fcat
      , '\n--------------AA index cols:' , X_aaindex_Fnum
      , '\n================================================================'
      , '\n\nTotal Genomic features (n):' , len(X_genomicFN)
      , '\n--------MAF+OR cols:' , len(X_gn_mafor_Fnum)
      , '\n--------------MAF+OR colnames:' , X_gn_mafor_Fnum
      , '\n--------Lineage cols:' , len(X_gn_linegae_Fnum)
      , '\n--------------Lineage cols:' , X_gn_linegae_Fnum
      , '\n--------Other cols:' , len(X_gn_Fcat)
      , '\n--------------Other cols:' , X_gn_Fcat
      , '\n================================================================')

# Sanity check: the number of columns in X must equal the sum of the three
# top-level feature groups.
n_features_expected = len(X_evolFN) + len(X_structural_FN) + len(X_genomicFN)
n_features_got = len(X.columns)

if n_features_got == n_features_expected:
    print('\nPass: No. of features match')
else:
    print('\nFail: Count of feature mismatch'
          , '\nExpected:', n_features_expected
          , '\nGot:', n_features_got)
    # Non-zero status so that a calling shell/pipeline sees the failure.
    sys.exit(1)

print('\n#####################################################################\n')

# NOTE(review): the disabled sections below use `pd` and (from the SMOTE NC
# section onwards) `MultModelsCl`; neither is imported explicitly in this
# file. Check they are in scope before re-enabling.
# ###############################################################################
# #==================
# # Baseline models
# #==================
# mm_skf_scoresD = MultModelsCl_dissected(input_df = X
#                                         , target = y
#                                         , var_type = 'mixed'
#                                         , skf_cv = skf_cv
#                                         , blind_test_input_df = X_bts
#                                         , blind_test_target = y_bts)

# baseline_all = pd.DataFrame(mm_skf_scoresD)
# baseline_all = baseline_all.T
# #baseline_train = baseline_all.filter(like='train_', axis=1)
# baseline_CT = baseline_all.filter(like='test_', axis=1)
# baseline_CT.sort_values(by=['test_mcc'], ascending=False, inplace=True)

# baseline_BT = baseline_all.filter(like='bts_', axis=1)
# baseline_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)

# # Write csv
# baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv')
# baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')

# #%% SMOTE NC: Oversampling [Numerical + categorical]
# mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc
#                                , target = y_smnc
#                                , var_type = 'mixed'
#                                , skf_cv = skf_cv
#                                , blind_test_input_df = X_bts
#                                , blind_test_target = y_bts)
# smnc_all = pd.DataFrame(mm_skf_scoresD7)
# smnc_all = smnc_all.T

# smnc_CT = smnc_all.filter(like='test_', axis=1)
# smnc_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True)

# smnc_BT = smnc_all.filter(like='bts_', axis=1)
# smnc_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)

# # Write csv
# smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv')
# smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv')

# #%% ROS: Numerical + categorical
# mm_skf_scoresD3 = MultModelsCl(input_df = X_ros
#                                , target = y_ros
#                                , var_type = 'mixed'
#                                , skf_cv = skf_cv
#                                , blind_test_input_df = X_bts
#                                , blind_test_target = y_bts)
# ros_all = pd.DataFrame(mm_skf_scoresD3)
# ros_all = ros_all.T

# ros_CT = ros_all.filter(like='test_', axis=1)
# ros_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True)

# ros_BT = ros_all.filter(like='bts_', axis=1)
# ros_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)

# # Write csv
# ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv')
# ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv')

# #%% RUS: Numerical + categorical
# mm_skf_scoresD4 = MultModelsCl(input_df = X_rus
#                                , target = y_rus
#                                , var_type = 'mixed'
#                                , skf_cv = skf_cv
#                                , blind_test_input_df = X_bts
#                                , blind_test_target = y_bts)
# rus_all = pd.DataFrame(mm_skf_scoresD4)
# rus_all = rus_all.T

# rus_CT = rus_all.filter(like='test_', axis=1)
# rus_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True)

# rus_BT = rus_all.filter(like='bts_' , axis=1)
# rus_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)

# # Write csv
# rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv')
# rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv')

# #%% ROS + RUS Combined: Numerical + categorical
# mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC
#                                , target = y_rouC
#                                , var_type = 'mixed'
#                                , skf_cv = skf_cv
#                                , blind_test_input_df = X_bts
#                                , blind_test_target = y_bts)
# rouC_all = pd.DataFrame(mm_skf_scoresD8)
# rouC_all = rouC_all.T

# rouC_CT = rouC_all.filter(like='test_', axis=1)
# rouC_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True)

# rouC_BT = rouC_all.filter(like='bts_', axis=1)
# rouC_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)

# # Write csv
# rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv')
# rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv')