From 9bc26c19471bf4d1191ef971990c1568f31e643e Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Sat, 18 Jun 2022 19:35:49 +0100 Subject: [PATCH] slight formatting for existing scripts --- scripts/ml/ml_data_7030.py | 1 + scripts/ml/ml_data_8020.py | 1 + scripts/ml/ml_data_sl.py | 8 +++----- scripts/ml/running_ml_scripts.txt | 30 ++++++++++++++++++++++++++++-- 4 files changed, 33 insertions(+), 7 deletions(-) diff --git a/scripts/ml/ml_data_7030.py b/scripts/ml/ml_data_7030.py index 4976439..83a792a 100644 --- a/scripts/ml/ml_data_7030.py +++ b/scripts/ml/ml_data_7030.py @@ -667,6 +667,7 @@ def setvars(gene,drug): print('\n-------------------------------------------------------------' , '\nSuccessfully split data with stratification: 70/30' + , '\nInput features data size:', x_features.shape , '\nTrain data size:', X.shape , '\nTest data size:', X_bts.shape , '\ny_train numbers:', yc1 diff --git a/scripts/ml/ml_data_8020.py b/scripts/ml/ml_data_8020.py index 9049345..4928839 100644 --- a/scripts/ml/ml_data_8020.py +++ b/scripts/ml/ml_data_8020.py @@ -655,6 +655,7 @@ def setvars(gene,drug): print('\n-------------------------------------------------------------' , '\nSuccessfully split data with stratification: 80/20 ' + , '\nInput features data size:', x_features.shape , '\nTrain data size:', X.shape , '\nTest data size:', X_bts.shape , '\ny_train numbers:', yc1 diff --git a/scripts/ml/ml_data_sl.py b/scripts/ml/ml_data_sl.py index b8a9ded..e850ab5 100644 --- a/scripts/ml/ml_data_sl.py +++ b/scripts/ml/ml_data_sl.py @@ -553,9 +553,7 @@ def setvars(gene,drug): # Training and BLIND test set: scaling law split # https://towardsdatascience.com/finally-why-we-use-an-80-20-split-for-training-and-test-data-plus-an-alternative-method-oh-yes-edc77e96295d - # Throw away previous blind_test_df, and call the 30% data as blind_test - # as these were imputed values and initial analysis shows that this - # is not very representative + # test data size ~ 1/sqrt(features NOT 
including target variable) #================================================================ my_df_ml[drug].isna().sum() # blind_test_df = my_df_ml[my_df_ml[drug].isna()] @@ -650,7 +648,6 @@ def setvars(gene,drug): x_ncols = len(x_features.columns) print('\nNo. of columns for x_features:', x_ncols) # NEED It for scaling law split - #https://towardsdatascience.com/finally-why-we-use-an-80-20-split-for-training-and-test-data-plus-an-alternative-method-oh-yes-edc77e96295d else: sys.exit('\nFAIL: x_features has target variable included. FIX it and rerun!') @@ -670,7 +667,8 @@ def setvars(gene,drug): yc2_ratio = yc2[0]/yc2[1] print('\n-------------------------------------------------------------' - , '\nSuccessfully split data according to scaling law: 1/np.sqrt(x_ncols)' + , '\nSuccessfully split data with stratification according to scaling law: 1/sqrt(x_ncols)' + , '\nInput features data size:', x_features.shape , '\nTrain data size:', X.shape , '\nTest data size:', sl_test_size, ' ', X_bts.shape , '\ny_train numbers:', yc1 diff --git a/scripts/ml/running_ml_scripts.txt b/scripts/ml/running_ml_scripts.txt index ae8736f..f9b271a 100644 --- a/scripts/ml/running_ml_scripts.txt +++ b/scripts/ml/running_ml_scripts.txt @@ -90,8 +90,34 @@ ./rpob_rt.py 2>&1 | tee log_rpob_rt.txt ./alr_rt.py 2>&1 | tee log_alr_rt.txt +######################################################################## +# COMPLETE Data: actual + na i.e. imputed +######################################################################## +================================= +# Split: 70/30 [COMPLETE DATA] +# All features including AA index +# Date: 18/05/2022 +# captures error: 2>&1 +================================= +./pnca_cd_7030.py 2>&1 | tee log_pnca_cd_7030.txt +./embb_cd_7030.py 2>&1 | tee log_embb_cd_7030.txt +./gid_cd_7030.py 2>&1 | tee log_gid_cd_7030.txt +./katg_cd_7030.py 2>&1 | tee log_katg_cd_7030.txt +./rpob_cd_7030.py 2>&1 | tee log_rpob_cd_7030.txt +./alr_cd_7030.py 2>&1 | tee 
log_alr_cd_7030.txt - - +######################################################################## +================================= +# Split: 80/20 [COMPLETE DATA] +# All features including AA index +# Date: 18/05/2022 +# captures error: 2>&1 +================================= +./pnca_cd_8020.py 2>&1 | tee log_pnca_cd_8020.txt +./embb_cd_8020.py 2>&1 | tee log_embb_cd_8020.txt +./gid_cd_8020.py 2>&1 | tee log_gid_cd_8020.txt +./katg_cd_8020.py 2>&1 | tee log_katg_cd_8020.txt +./rpob_cd_8020.py 2>&1 | tee log_rpob_cd_8020.txt +./alr_cd_8020.py 2>&1 | tee log_alr_cd_8020.txt