Added FS to MultClfs.py and modified the data for the different splits for consistency

This commit is contained in:
Tanushree Tunstall 2022-06-24 20:35:53 +01:00
parent edb7aebd6a
commit e2bc384155
12 changed files with 1585 additions and 994 deletions

View file

@@ -37,7 +37,7 @@ def setvars(gene,drug):
import argparse
import re
#%% GLOBALS
tts_split = "70/30"
tts_split = "70_30"
rs = {'random_state': 42}
njobs = {'n_jobs': 10}
@@ -727,7 +727,7 @@ def setvars(gene,drug):
#------------------------------
oversample = RandomOverSampler(sampling_strategy='minority')
X_ros, y_ros = oversample.fit_resample(X, y)
print('Simple Random OverSampling\n', Counter(y_ros))
print('\nSimple Random OverSampling\n', Counter(y_ros))
print(X_ros.shape)
#------------------------------
@@ -736,7 +736,7 @@ def setvars(gene,drug):
#------------------------------
undersample = RandomUnderSampler(sampling_strategy='majority')
X_rus, y_rus = undersample.fit_resample(X, y)
print('Simple Random UnderSampling\n', Counter(y_rus))
print('\nSimple Random UnderSampling\n', Counter(y_rus))
print(X_rus.shape)
#------------------------------
@@ -747,7 +747,7 @@ def setvars(gene,drug):
X_ros, y_ros = oversample.fit_resample(X, y)
undersample = RandomUnderSampler(sampling_strategy='majority')
X_rouC, y_rouC = undersample.fit_resample(X_ros, y_ros)
print('Simple Combined Over and UnderSampling\n', Counter(y_rouC))
print('\nSimple Combined Over and UnderSampling\n', Counter(y_rouC))
print(X_rouC.shape)
#------------------------------
@@ -767,7 +767,7 @@ def setvars(gene,drug):
categorical_colind = X.columns.get_indexer(list(categorical_ix))
categorical_colind
k_sm = 5 # 5 is deafult
k_sm = 5 # 5 is default
sm_nc = SMOTENC(categorical_features=categorical_colind, k_neighbors = k_sm, **rs, **njobs)
X_smnc, y_smnc = sm_nc.fit_resample(X, y)
print('\nSMOTE_NC OverSampling\n', Counter(y_smnc))
@@ -797,5 +797,10 @@ def setvars(gene,drug):
# print(X_enn.shape)
# print('\nSMOTE Over+Under Sampling combined\n', Counter(y_enn))
###############################################################################
###########################################################################
# TODO: Find over- and undersampling methods that work for purely categorical data
###########################################################################
print('\n#################################################################'
, '\nDim of X for gene:', gene.lower(), '\n', X.shape
, '\n###############################################################')