#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 29 20:29:36 2022

@author: tanu

Iterate over every (split_type, data_type) combination of the combined
5-gene dataset, run the classifiers on the baseline data and on each
resampled variant returned by split_tts(), and write one CSV of scores per
combination.

Provenance: recovered from patch 37f5199c5cdc01259cd9a11bc4b1195690233971
(Tanushree Tunstall, Thu, 28 Jul 2022 15:25:31 +0100, subject "added the
cm_ml_iterator_TODO.py for later"), which creates
scripts/ml/combined_model/cm_ml_iterator_TODO.py (mode 100755).
"""
import os
import sys
import re

import numpy as np
import pandas as pd

###############################################################################
homedir = os.path.expanduser("~")
# The ML helper modules imported below live outside this directory.
sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml/ml_functions')
###############################################################################
outdir = homedir + '/git/LSHTM_ML/output/combined/'

#====================
# Import ML functions
#====================
#from MultClfs import *
#from MultClfs_logo_skf import *
from MultClfs_logo_skf_split import *

from GetMLData import *
from SplitTTS import *

# Input data
from ml_data_combined import *

###############################################################################
print('\nUsing data with 5 genes:', len(cm_input_df5))

###############################################################################

split_types = ['70_30', '80_20', 'sl']
split_data_types = ['actual', 'complete']

# One row per model run:
#   (key in paramD, suffix of the X/y keys in the split_tts() output,
#    value passed as resampling_type)
resampling_specs = [
    ('baseline_paramD', '',      'none'),
    ('smnc_paramD',     '_smnc', 'smnc'),
    ('ros_paramD',      '_ros',  'ros'),
    ('rus_paramD',      '_rus',  'rus'),
    ('rouC_paramD',     '_rouC', 'rouC'),
]

for split_type in split_types:
    for data_type in split_data_types:

        out_filename = outdir + 'cm_' + split_type + '_' + data_type + '.csv'
        print(out_filename)

        tempD = split_tts(cm_input_df5
                          , data_type = data_type
                          , split_type = split_type
                          , oversampling = True
                          , dst_colname = 'dst'
                          , target_colname = 'dst_mode'
                          , include_gene_name = True
                          )

        # Same keyword set for every run; only the X/y pair and the
        # resampling label change between entries.
        paramD = {
            key: {'input_df'        : tempD['X' + suffix]
                  , 'target'        : tempD['y' + suffix]
                  , 'var_type'      : 'mixed'
                  , 'resampling_type': resampling}
            for key, suffix, resampling in resampling_specs
        }

        # NOTE(review): the original call was left unfinished -- a run of 'X'
        # placeholder characters stood where any further arguments and the
        # closing parenthesis belong. It is closed here with only the four
        # keyword arguments held in paramD.
        # TODO: confirm whether MultModelsCl_logo_skf requires additional
        # arguments before running this script.
        mmDD = {k: MultModelsCl_logo_skf(**v) for k, v in paramD.items()}

        # Stack the per-run score tables into a single frame. This assumes
        # each value returned above is a DataFrame -- TODO confirm.
        out_wf = pd.concat(mmDD, ignore_index = True)

        # Assumes the score tables carry 'resampling', 'source_data' and
        # 'MCC' columns; within each group the best MCC comes first.
        out_wf_f = out_wf.sort_values(by = ['resampling', 'source_data', 'MCC']
                                      , ascending = [True, True, False])

        # Write to the path printed above. The original prefixed a second
        # absolute directory onto out_filename (already absolute), which
        # produced a path that cannot exist.
        out_wf_f.to_csv(out_filename, index = False)