#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 29 20:29:36 2022

@author: tanu

Run feature selection (``fsgs_rfecv``) for every configured gene/drug pair,
train/test split type, data type and resampling set, and write one JSON
results file per gene/split/data-type combination into ``outdir``.

Provenance: reconstructed from the whitespace-collapsed patch
b7e1b51a31b2aa16901defdee1fe2bdf91bec193 ("added ml_iterator_fs.py",
Tanushree Tunstall, Fri, 1 Jul 2022 20:38:08 +0100), which creates
scripts/ml/ml_iterator_fs.py.
"""
import json
import os
import re
import sys

import numpy as np
import pandas as pd
#import prettyprint as pp
###############################################################################
homedir = os.path.expanduser("~")
sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml/ml_functions')
###############################################################################
#====================
# Import ML functions
#====================
# NOTE(review): the names used below that are not defined in this file
# (getmldata, split_tts, fsgs_rfecv, LogisticRegression, RFECV,
# DecisionTreeClassifier, rs, skf_cv) are presumably provided by these star
# imports -- confirm against the ml_functions modules.
from MultClfs import *
from GetMLData import *
from SplitTTS import *
from FS import *

# param dict for getmldata()
combined_model_paramD = {
    # False means 'gene_name' is not included as a feature, as a single
    # gene-target shouldn't have it.
    'data_combined_model': False,
    'use_or': False,
    'omit_all_genomic_features': False,
    'write_maskfile': False,
    'write_outfile': False,
}
###############################################################################
#ml_genes = ["pncA", "embB", "katG", "rpoB", "gid"]
outdir = homedir + '/git/Data/ml_combined/fs/'
# Create the output directory up front so a missing path fails immediately
# rather than after the feature-selection run has completed.
os.makedirs(outdir, exist_ok=True)

ml_gene_drugD = {
    'pncA': 'pyrazinamide',
    # 'embB': 'ethambutol',
    # 'katG': 'isoniazid',
    # 'rpoB': 'rifampicin',
    # 'gid' : 'streptomycin',
}
gene_dataD = {}
#split_types = ['70_30', '80_20', 'sl']
#split_data_types = ['actual', 'complete']
split_types = ['70_30']
split_data_types = ['actual', 'complete']

fs_models = [('Logistic Regression', LogisticRegression(**rs))]
# fs_models = [('AdaBoost Classifier'   , AdaBoostClassifier(**rs) )
#              , ('Decision Tree'         , DecisionTreeClassifier(**rs) )
#              , ('Extra Tree'            , ExtraTreeClassifier(**rs) )
#              , ('Extra Trees'           , ExtraTreesClassifier(**rs) )
#              , ('Gradient Boosting'     , GradientBoostingClassifier(**rs) )
#              , ('LDA'                   , LinearDiscriminantAnalysis() )
#              , ('Logistic Regression'   , LogisticRegression(**rs) )
#              , ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs))
#              , ('Passive Aggresive'     , PassiveAggressiveClassifier(**rs, **njobs) )
#              , ('Random Forest'         , RandomForestClassifier(**rs, n_estimators = 1000 ) )
#              , ('Ridge Classifier'      , RidgeClassifier(**rs) )
#              , ('Ridge ClassifierCV'    , RidgeClassifierCV(cv = 3) )
#              , ('Stochastic GDescent'   , SGDClassifier(**rs, **njobs) )
#              ]

for gene, drug in ml_gene_drugD.items():
    print('\nGene:', gene
          , '\nDrug:', drug)
    gene_low = gene.lower()
    # Same keyword values as before, now taken from the shared param dict
    # instead of being spelled out a second time.
    gene_dataD[gene_low] = getmldata(gene, drug, **combined_model_paramD)

    for split_type in split_types:
        for data_type in split_data_types:
            # Per-split output file: one JSON per gene/split/data-type,
            # written once all models have been run for this split.
            out_filename = outdir + gene_low + '_' + split_type + '_' + data_type + '.json'

            tempD = split_tts(
                gene_dataD[gene_low],
                data_type=data_type,
                split_type=split_type,
                # Original note: "TURN IT ON TO RUN THE OTHERS" -- presumably
                # needed for the resampled X_smnc/y_smnc entries read below;
                # confirm in split_tts.
                oversampling=True,
                dst_colname='dst',
                target_colname='dst_mode',
                include_gene_name=True,
            )

            # One entry per resampling set; each value is passed as keyword
            # arguments to fsgs_rfecv().
            paramD = {
                'baseline_paramD': {
                    'input_df': tempD['X'],
                    'target': tempD['y'],
                    'var_type': 'mixed',
                    'resampling_type': 'none',
                },
                'smnc_paramD': {
                    'input_df': tempD['X_smnc'],
                    'target': tempD['y_smnc'],
                    'var_type': 'mixed',
                    'resampling_type': 'smnc',
                },
                # 'ros_paramD': {
                #     'input_df': tempD['X_ros'],
                #     'target': tempD['y_ros'],
                #     'var_type': 'mixed',
                #     'resampling_type': 'ros',
                # },
                # 'rus_paramD': {
                #     'input_df': tempD['X_rus'],
                #     'target': tempD['y_rus'],
                #     'var_type': 'mixed',
                #     'resampling_type': 'rus',
                # },
                # 'rouC_paramD': {
                #     'input_df': tempD['X_rouC'],
                #     'target': tempD['y_rouC'],
                #     'var_type': 'mixed',
                #     'resampling_type': 'rouC',
                # },
            }
            #for m in fs_models:
            #    print(m)

            # Results keyed by model name, then by resampling-set name.
            out_fsD = {}
            for index, (model_name, model_fn) in enumerate(fs_models, start=1):
                print('\nRunning classifier with FS:', index
                      , '\nModel_name:', model_name
                      , '\nModel func:', model_fn)
                #, '\nList of models:', models)

                out_fsD[model_name] = {}
                # current_model = {}
                for k, fsD_params in paramD.items():
                    # out_filename = (gene.lower() + '_' + split_type + '_' + data_type + '_' + k + '.json')
                    # print("XXXXXX THIS: ", len(fsD_params['input_df']) )
                    # print("XXXXXX THIS: ", out_filename )

                    # current_model[k] = fsgs_rfecv(
                    out_fsD[model_name][k] = fsgs_rfecv(
                        **fsD_params,
                        param_gridLd=[{'fs__min_features_to_select': [1]}],
                        blind_test_df=tempD['X_bts'],
                        blind_test_target=tempD['y_bts'],
                        estimator=model_fn,
                        # False: uses estimator as the RFECV parameter for fs.
                        # Set to True if you want to supply custom_fs as
                        # shown below.
                        use_fs=False,
                        custom_fs=RFECV(DecisionTreeClassifier(**rs),
                                        cv=skf_cv,
                                        scoring='matthews_corrcoef'),
                        cv_method=skf_cv,
                    )
                    # write per-resampler outfile here
                    # with open(out_filename, 'w') as f:
                    #     f.write(json.dumps(current_model
                    #                        , cls = NpEncoder )
                    #             )

            # write per-split outfile here
            # NOTE(review): if fsgs_rfecv returns NumPy scalars/arrays the
            # default encoder will raise TypeError; enabling the NpEncoder
            # line below would presumably handle that -- confirm.
            with open(out_filename, 'w') as f:
                f.write(json.dumps(out_fsD
                                   #, cls = NpEncoder
                                   ))
#%%############################################################################
# # Read output json
# testF = outdir + 'pnca_70_30_actual.json'
# testF = outdir + 'pnca_70_30_complete.json'

# with open(testF, 'r') as f:
#     data = json.load(f)