added log files for these ml runs

2022-06-18 14:44:02 +01:00 · 2022-06-18 14:44:02 +01:00 · e176d018cb
commit e176d018cb
parent 5bd8ba33f7
20 changed files with 303568 additions and 0 deletions
--- a/scripts/ml/log_alr_8020.txt
+++ b/scripts/ml/log_alr_8020.txt
@ -0,0 +1,75 @@
+/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_8020.py:549: SettingWithCopyWarning: 
+A value is trying to be set on a copy of a slice from a DataFrame
+
+See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
+  mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
+1.22.4
+1.4.1
+
+aaindex_df contains non-numerical data
+
+Total no. of non-numerial columns: 2
+
+Selecting numerical data only
+
+PASS: successfully selected numerical columns only for aaindex_df
+
+Now checking for NA in the remaining aaindex_cols
+
+Counting aaindex_df cols with NA 
+ncols with NA: 4 columns 
+Dropping these... 
+Original ncols: 127
+
+Revised df ncols: 123
+
+Checking NA in revised df...
+
+PASS: cols with NA successfully dropped from aaindex_df 
+Proceeding with combining aa_df with other features_df
+
+PASS: ncols match 
+Expected ncols: 123 
+Got: 123
+
+Total no. of columns in clean aa_df: 123
+
+Proceeding to merge, expected nrows in merged_df: 271
+
+PASS: my_features_df and aa_df successfully combined 
+nrows: 271 
+ncols: 269
+count of NULL values before imputation
+
+or_mychisq          256
+log10_or_mychisq    256
+dtype: int64
+count of NULL values AFTER imputation
+
+mutationinformation    0
+or_rawI                0
+logorI                 0
+dtype: int64
+
+PASS: OR values imputed, data ready for ML
+
+Total no. of features for aaindex: 123
+
+No. of numerical features: 168 
+No. of categorical features: 7
+
+PASS: x_features has no target variable
+
+No. of columns for x_features: 175
+Traceback (most recent call last):
+  File "/home/tanu/git/LSHTM_analysis/scripts/ml/./alr_8020.py", line 19, in <module>
+    setvars(gene,drug)
+  File "/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_8020.py", line 656, in setvars
+    X, X_bts, y, y_bts = train_test_split(x_features, y_target
+  File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/sklearn/model_selection/_split.py", line 2454, in train_test_split
+    train, test = next(cv.split(X=arrays[0], y=stratify))
+  File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/sklearn/model_selection/_split.py", line 1613, in split
+    for train, test in self._iter_indices(X, y, groups):
+  File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/sklearn/model_selection/_split.py", line 1953, in _iter_indices
+    raise ValueError(
+ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.
--- a/scripts/ml/log_alr_rt.txt
+++ b/scripts/ml/log_alr_rt.txt
@ -0,0 +1,107 @@
+/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_rt.py:550: SettingWithCopyWarning: 
+A value is trying to be set on a copy of a slice from a DataFrame
+
+See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
+  mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
+1.22.4
+1.4.1
+
+aaindex_df contains non-numerical data
+
+Total no. of non-numerial columns: 2
+
+Selecting numerical data only
+
+PASS: successfully selected numerical columns only for aaindex_df
+
+Now checking for NA in the remaining aaindex_cols
+
+Counting aaindex_df cols with NA 
+ncols with NA: 4 columns 
+Dropping these... 
+Original ncols: 127
+
+Revised df ncols: 123
+
+Checking NA in revised df...
+
+PASS: cols with NA successfully dropped from aaindex_df 
+Proceeding with combining aa_df with other features_df
+
+PASS: ncols match 
+Expected ncols: 123 
+Got: 123
+
+Total no. of columns in clean aa_df: 123
+
+Proceeding to merge, expected nrows in merged_df: 271
+
+PASS: my_features_df and aa_df successfully combined 
+nrows: 271 
+ncols: 269
+count of NULL values before imputation
+
+or_mychisq          256
+log10_or_mychisq    256
+dtype: int64
+count of NULL values AFTER imputation
+
+mutationinformation    0
+or_rawI                0
+logorI                 0
+dtype: int64
+
+PASS: OR values imputed, data ready for ML
+
+Total no. of features for aaindex: 123
+
+No. of numerical features: 168 
+No. of categorical features: 7
+
+index: 0 
+ind: 1
+
+Mask count check: True
+
+index: 1 
+ind: 2
+
+Mask count check: True
+Original Data
+ Counter({0: 262, 1: 1}) Data dim: (263, 175)
+
+------------------------------------------------------------- 
+Successfully split data: REVERSE training 
+imputed values: training set 
+actual values: blind test set 
+Train data size: (263, 175) 
+Test data size: (8, 175) 
+y_train numbers: Counter({0: 262, 1: 1}) 
+y_train ratio: 262.0 
+ 
+y_test_numbers: Counter({0: 7, 1: 1}) 
+y_test ratio: 7.0 
+-------------------------------------------------------------
+Simple Random OverSampling
+ Counter({0: 262, 1: 262})
+(524, 175)
+Simple Random UnderSampling
+ Counter({0: 1, 1: 1})
+(2, 175)
+Simple Combined Over and UnderSampling
+ Counter({0: 262, 1: 262})
+(524, 175)
+Traceback (most recent call last):
+  File "/home/tanu/git/LSHTM_analysis/scripts/ml/./alr_rt.py", line 19, in <module>
+    setvars(gene,drug)
+  File "/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_rt.py", line 701, in setvars
+    X_smnc, y_smnc = sm_nc.fit_resample(X, y)
+  File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/imblearn/base.py", line 83, in fit_resample
+    output = self._fit_resample(X, y)
+  File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/imblearn/over_sampling/_smote/base.py", line 533, in _fit_resample
+    X_resampled, y_resampled = super()._fit_resample(X_encoded, y)
+  File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/imblearn/over_sampling/_smote/base.py", line 324, in _fit_resample
+    nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:]
+  File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/sklearn/neighbors/_base.py", line 749, in kneighbors
+    raise ValueError(
+ValueError: Expected n_neighbors <= n_samples,  but n_samples = 1, n_neighbors = 6
--- a/scripts/ml/log_alr_sl.txt
+++ b/scripts/ml/log_alr_sl.txt
@ -0,0 +1,75 @@
+/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_sl.py:549: SettingWithCopyWarning: 
+A value is trying to be set on a copy of a slice from a DataFrame
+
+See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
+  mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
+1.22.4
+1.4.1
+
+aaindex_df contains non-numerical data
+
+Total no. of non-numerial columns: 2
+
+Selecting numerical data only
+
+PASS: successfully selected numerical columns only for aaindex_df
+
+Now checking for NA in the remaining aaindex_cols
+
+Counting aaindex_df cols with NA 
+ncols with NA: 4 columns 
+Dropping these... 
+Original ncols: 127
+
+Revised df ncols: 123
+
+Checking NA in revised df...
+
+PASS: cols with NA successfully dropped from aaindex_df 
+Proceeding with combining aa_df with other features_df
+
+PASS: ncols match 
+Expected ncols: 123 
+Got: 123
+
+Total no. of columns in clean aa_df: 123
+
+Proceeding to merge, expected nrows in merged_df: 271
+
+PASS: my_features_df and aa_df successfully combined 
+nrows: 271 
+ncols: 269
+count of NULL values before imputation
+
+or_mychisq          256
+log10_or_mychisq    256
+dtype: int64
+count of NULL values AFTER imputation
+
+mutationinformation    0
+or_rawI                0
+logorI                 0
+dtype: int64
+
+PASS: OR values imputed, data ready for ML
+
+Total no. of features for aaindex: 123
+
+No. of numerical features: 168 
+No. of categorical features: 7
+
+PASS: x_features has no target variable
+
+No. of columns for x_features: 175
+Traceback (most recent call last):
+  File "/home/tanu/git/LSHTM_analysis/scripts/ml/./alr_sl.py", line 19, in <module>
+    setvars(gene,drug)
+  File "/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_sl.py", line 660, in setvars
+    X, X_bts, y, y_bts = train_test_split(x_features, y_target
+  File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/sklearn/model_selection/_split.py", line 2454, in train_test_split
+    train, test = next(cv.split(X=arrays[0], y=stratify))
+  File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/sklearn/model_selection/_split.py", line 1613, in split
+    for train, test in self._iter_indices(X, y, groups):
+  File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/sklearn/model_selection/_split.py", line 1953, in _iter_indices
+    raise ValueError(
+ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.
--- a/scripts/ml/log_embb_8020.txt
+++ b/scripts/ml/log_embb_8020.txt
--- a/scripts/ml/log_embb_rt.txt
+++ b/scripts/ml/log_embb_rt.txt
--- a/scripts/ml/log_embb_sl.txt
+++ b/scripts/ml/log_embb_sl.txt
--- a/scripts/ml/log_gid_7030.txt
+++ b/scripts/ml/log_gid_7030.txt
--- a/scripts/ml/log_gid_8020.txt
+++ b/scripts/ml/log_gid_8020.txt
--- a/scripts/ml/log_gid_rt.txt
+++ b/scripts/ml/log_gid_rt.txt
--- a/scripts/ml/log_gid_rt_v1.txt
+++ b/scripts/ml/log_gid_rt_v1.txt
@ -0,0 +1,107 @@
+/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_rt.py:550: SettingWithCopyWarning: 
+A value is trying to be set on a copy of a slice from a DataFrame
+
+See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
+  mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
+1.22.4
+1.4.1
+
+aaindex_df contains non-numerical data
+
+Total no. of non-numerial columns: 2
+
+Selecting numerical data only
+
+PASS: successfully selected numerical columns only for aaindex_df
+
+Now checking for NA in the remaining aaindex_cols
+
+Counting aaindex_df cols with NA 
+ncols with NA: 4 columns 
+Dropping these... 
+Original ncols: 127
+
+Revised df ncols: 123
+
+Checking NA in revised df...
+
+PASS: cols with NA successfully dropped from aaindex_df 
+Proceeding with combining aa_df with other features_df
+
+PASS: ncols match 
+Expected ncols: 123 
+Got: 123
+
+Total no. of columns in clean aa_df: 123
+
+Proceeding to merge, expected nrows in merged_df: 531
+
+PASS: my_features_df and aa_df successfully combined 
+nrows: 531 
+ncols: 286
+count of NULL values before imputation
+
+or_mychisq          263
+log10_or_mychisq    263
+dtype: int64
+count of NULL values AFTER imputation
+
+mutationinformation    0
+or_rawI                0
+logorI                 0
+dtype: int64
+
+PASS: OR values imputed, data ready for ML
+
+Total no. of features for aaindex: 123
+
+No. of numerical features: 167 
+No. of categorical features: 7
+
+index: 0 
+ind: 1
+
+Mask count check: True
+
+index: 1 
+ind: 2
+
+Mask count check: True
+Original Data
+ Counter({0: 409, 1: 3}) Data dim: (412, 174)
+
+------------------------------------------------------------- 
+Successfully split data: REVERSE training 
+imputed values: training set 
+actual values: blind test set 
+Train data size: (412, 174) 
+Test data size: (119, 174) 
+y_train numbers: Counter({0: 409, 1: 3}) 
+y_train ratio: 136.33333333333334 
+ 
+y_test_numbers: Counter({0: 76, 1: 43}) 
+y_test ratio: 1.7674418604651163 
+-------------------------------------------------------------
+Simple Random OverSampling
+ Counter({0: 409, 1: 409})
+(818, 174)
+Simple Random UnderSampling
+ Counter({0: 3, 1: 3})
+(6, 174)
+Simple Combined Over and UnderSampling
+ Counter({0: 409, 1: 409})
+(818, 174)
+Traceback (most recent call last):
+  File "/home/tanu/git/LSHTM_analysis/scripts/ml/./gid_rt.py", line 19, in <module>
+    setvars(gene,drug)
+  File "/home/tanu/git/LSHTM_analysis/scripts/ml/ml_data_rt.py", line 701, in setvars
+    X_smnc, y_smnc = sm_nc.fit_resample(X, y)
+  File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/imblearn/base.py", line 83, in fit_resample
+    output = self._fit_resample(X, y)
+  File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/imblearn/over_sampling/_smote/base.py", line 533, in _fit_resample
+    X_resampled, y_resampled = super()._fit_resample(X_encoded, y)
+  File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/imblearn/over_sampling/_smote/base.py", line 324, in _fit_resample
+    nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:]
+  File "/home/tanu/anaconda3/envs/UQ/lib/python3.9/site-packages/sklearn/neighbors/_base.py", line 749, in kneighbors
+    raise ValueError(
+ValueError: Expected n_neighbors <= n_samples,  but n_samples = 3, n_neighbors = 6
--- a/scripts/ml/log_gid_sl.txt
+++ b/scripts/ml/log_gid_sl.txt
--- a/scripts/ml/log_katg_8020.txt
+++ b/scripts/ml/log_katg_8020.txt
--- a/scripts/ml/log_katg_rt.txt
+++ b/scripts/ml/log_katg_rt.txt
--- a/scripts/ml/log_katg_sl.txt
+++ b/scripts/ml/log_katg_sl.txt
--- a/scripts/ml/log_pnca_8020.txt
+++ b/scripts/ml/log_pnca_8020.txt
--- a/scripts/ml/log_pnca_rt.txt
+++ b/scripts/ml/log_pnca_rt.txt
--- a/scripts/ml/log_pnca_sl.txt
+++ b/scripts/ml/log_pnca_sl.txt
--- a/scripts/ml/log_rpob_8020.txt
+++ b/scripts/ml/log_rpob_8020.txt
--- a/scripts/ml/log_rpob_rt.txt
+++ b/scripts/ml/log_rpob_rt.txt
--- a/scripts/ml/log_rpob_sl.txt
+++ b/scripts/ml/log_rpob_sl.txt