#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 29 20:29:36 2022

@author: tanu

Driver script for the combined 5-gene dataset (``cm_input_df5``).

For every combination of split type and data type the script:

1. calls ``split_tts`` on ``cm_input_df5`` with oversampling enabled,
2. passes the baseline and each resampled X/y pair returned by ``split_tts``
   to ``MultModelsCl_logo_skf``,
3. concatenates the returned score tables, sorts them and writes a single
   CSV named ``cm_<split_type>_<data_type>.csv`` into ``outdir``.
"""
import sys, os
import pandas as pd
import numpy as np
import re
###############################################################################
homedir = os.path.expanduser("~")
# Make the project-local ML helper modules importable.
sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml/ml_functions')
###############################################################################
outdir = homedir + '/git/LSHTM_ML/output/combined/'

#====================
# Import ML functions
#====================
from MultClfs_logo_skf_split import *
from GetMLData import *
from SplitTTS import *

# Input data
from ml_data_combined import *

###############################################################################
print('\nUsing data with 5 genes:', len(cm_input_df5))
###############################################################################

split_types = ['70_30', '80_20', 'sl']
split_data_types = ['actual', 'complete']

# One row per model run:
# (key in paramD, suffix of the 'X'/'y' keys in the split_tts result,
#  value passed as resampling_type)
resampling_variants = (
    ('baseline_paramD', '', 'none'),
    ('smnc_paramD', '_smnc', 'smnc'),
    ('ros_paramD', '_ros', 'ros'),
    ('rus_paramD', '_rus', 'rus'),
    ('rouC_paramD', '_rouC', 'rouC'),
)

# Ensure the output directory exists so to_csv does not fail on a fresh setup.
os.makedirs(outdir, exist_ok=True)

for split_type in split_types:
    for data_type in split_data_types:

        out_filename = outdir + 'cm_' + split_type + '_' + data_type + '.csv'
        print(out_filename)

        tempD = split_tts(cm_input_df5
                          , data_type = data_type
                          , split_type = split_type
                          , oversampling = True
                          , dst_colname = 'dst'
                          , target_colname = 'dst_mode'
                          , include_gene_name = True
                          )

        # Keyword arguments for each run: same settings, different X/y pair.
        paramD = {
            param_key: {'input_df': tempD['X' + suffix]
                        , 'target': tempD['y' + suffix]
                        , 'var_type': 'mixed'
                        , 'resampling_type': resampling_label}
            for param_key, suffix, resampling_label in resampling_variants
        }

        # NOTE(review): the original call contained an unreadable placeholder
        # ("XXXX...") after **paramD[k] and was never closed, so the script
        # could not be parsed. If MultModelsCl_logo_skf needs further keyword
        # arguments (presumably the ones that were redacted), add them here.
        mmDD = {}
        for param_key, params in paramD.items():
            mmDD[param_key] = MultModelsCl_logo_skf(**params)

        # Stack the per-run results into one table. This assumes each result
        # is a DataFrame carrying the 'resampling', 'source_data' and 'MCC'
        # columns used for sorting below -- TODO confirm against
        # MultModelsCl_logo_skf.
        out_wf = pd.concat(mmDD, ignore_index = True)

        out_wf_f = out_wf.sort_values(by = ['resampling', 'source_data', 'MCC']
                                      , ascending = [True, True, False]
                                      , inplace = False)

        # out_filename is already an absolute path under outdir. The original
        # prefixed it with a second, hard-coded absolute directory, which
        # produced a nested path that does not exist.
        out_wf_f.to_csv(out_filename, index = False)