slight formatting for existing scripts

Tanushree Tunstall 2022-06-18 19:35:49 +01:00
parent a53fce5455
commit 9bc26c1947
4 changed files with 33 additions and 7 deletions

@@ -553,9 +553,7 @@ def setvars(gene,drug):
# Training and BLIND test set: scaling law split
# https://towardsdatascience.com/finally-why-we-use-an-80-20-split-for-training-and-test-data-plus-an-alternative-method-oh-yes-edc77e96295d
# Throw away previous blind_test_df, and call the 30% data as blind_test
# as these were imputed values and initial analysis shows that this
# is not very representative
# test data size ~ 1/sqrt(features NOT including target variable)
#================================================================
my_df_ml[drug].isna().sum()
# blind_test_df = my_df_ml[my_df_ml[drug].isna()]
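The comment block above swaps a fixed 80/20 split for a test fraction derived from the feature count. A minimal standalone sketch of that heuristic, assuming numpy is available; the feature count of 100 is purely illustrative:

import numpy as np

x_ncols = 100                                  # illustrative number of input feature columns (target excluded)
sl_test_size = 1 / np.sqrt(x_ncols)            # scaling law: test fraction ~ 1/sqrt(n_features)
print('Blind test fraction:', sl_test_size)    # prints 0.1 for 100 features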
@@ -650,7 +648,6 @@ def setvars(gene,drug):
x_ncols = len(x_features.columns)
print('\nNo. of columns for x_features:', x_ncols)
# NEED It for scaling law split
#https://towardsdatascience.com/finally-why-we-use-an-80-20-split-for-training-and-test-data-plus-an-alternative-method-oh-yes-edc77e96295d
else:
sys.exit('\nFAIL: x_features has target variable included. FIX it and rerun!')
@@ -670,7 +667,8 @@ def setvars(gene,drug):
yc2_ratio = yc2[0]/yc2[1]
print('\n-------------------------------------------------------------'
, '\nSuccessfully split data according to scaling law: 1/np.sqrt(x_ncols)'
, '\nSuccessfully split data with stratification according to scaling law: 1/sqrt(x_ncols)'
, '\nInput features data size:', x_features.shape
, '\nTrain data size:', X.shape
, '\nTest data size:', sl_test_size, ' ', X_bts.shape
, '\ny_train numbers:', yc1
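For context, a minimal standalone sketch of a stratified split at the scaling-law test fraction; it assumes scikit-learn and numpy, reuses the variable names from the diff (x_ncols, sl_test_size, X, X_bts) for readability, and generates synthetic data in place of x_features and the drug target:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Synthetic stand-in for x_features and the target column (illustrative only)
x_features, target = make_classification(n_samples=500, n_features=25,
                                          weights=[0.7, 0.3], random_state=42)

x_ncols = x_features.shape[1]
sl_test_size = 1 / np.sqrt(x_ncols)            # scaling-law test fraction: 1/sqrt(x_ncols)

# Stratification keeps the class balance comparable in train and blind test sets
X, X_bts, y, y_bts = train_test_split(x_features, target,
                                      test_size=sl_test_size,
                                      stratify=target,
                                      random_state=42)

yc1 = np.bincount(y)                           # class counts in the training target
yc2 = np.bincount(y_bts)                       # class counts in the blind test target
print('Train data size:', X.shape)
print('Test data size:', sl_test_size, X_bts.shape)
print('y_train numbers:', yc1, 'ratio:', yc1[0] / yc1[1])
print('y_test numbers:', yc2, 'ratio:', yc2[0] / yc2[1])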