Added run_7030.py, which runs from the command line for all gene targets and sampling methods and outputs a single CSV
This commit is contained in:
parent
5b0ccdfec4
commit
bc12dbd7c2
5 changed files with 749 additions and 229 deletions
|
@ -61,7 +61,6 @@ def setvars(gene,drug):
|
|||
jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
|
||||
|
||||
#%% FOR LATER: Combine ED logo data
|
||||
#%% DONE: active aa site annotations **DONE on 15/05/2022 as part of generating merged_dfs
|
||||
###########################################################################
|
||||
rs = {'random_state': 42}
|
||||
njobs = {'n_jobs': 10}
|
||||
|
@ -419,7 +418,7 @@ def setvars(gene,drug):
|
|||
#---------------------------------------
|
||||
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
|
||||
#%% Data for ML
|
||||
#%%########################################################################
|
||||
#==========================
|
||||
# Data for ML
|
||||
#==========================
|
||||
|
@ -551,8 +550,7 @@ def setvars(gene,drug):
|
|||
, 'polarity_change'
|
||||
, 'water_change'
|
||||
, 'active_site']
|
||||
|
||||
|
||||
|
||||
X_resprop_FN = X_aaindex_Fnum + X_str_Fnum + X_aap_Fcat
|
||||
###############################################################################
|
||||
#========================
|
||||
|
@ -594,8 +592,7 @@ def setvars(gene,drug):
|
|||
###############################################################################
|
||||
#%% Define training and test data
|
||||
#======================================================
|
||||
# Training and BLIND test set [UQ]: actual vs imputed
|
||||
# No aa index but active_site included
|
||||
# Training and BLIND test set: actual vs imputed
|
||||
# dst with actual values : training set
|
||||
# dst with imputed values : blind test
|
||||
#======================================================
|
||||
|
@ -612,9 +609,9 @@ def setvars(gene,drug):
|
|||
training_df['dst_mode'].value_counts()
|
||||
|
||||
####################################################################
|
||||
#============
|
||||
# ML data
|
||||
#============
|
||||
#=====================================
|
||||
# ML data: actual vs imputed
|
||||
#=====================================
|
||||
#------
|
||||
# X: Training and Blind test (BTS)
|
||||
#------
|
||||
|
@ -625,20 +622,8 @@ def setvars(gene,drug):
|
|||
# y
|
||||
#------
|
||||
y = training_df['dst_mode']
|
||||
y_bts = blind_test_df['dst_mode']
|
||||
|
||||
# Quick check
|
||||
#(X['ligand_affinity_change']==0).sum() == (X['ligand_distance']>10).sum()
|
||||
for i in range(len(cols_to_mask)):
|
||||
ind = i+1
|
||||
print('\nindex:', i, '\nind:', ind)
|
||||
print('\nMask count check:'
|
||||
, (my_df_ml[cols_to_mask[i]]==0).sum() == (my_df_ml['ligand_distance']>10).sum()
|
||||
)
|
||||
|
||||
print('Original Data\n', Counter(y)
|
||||
, 'Data dim:', X.shape)
|
||||
|
||||
y_bts = blind_test_df['dst_mode']
|
||||
|
||||
yc1 = Counter(y)
|
||||
yc1_ratio = yc1[0]/yc1[1]
|
||||
|
||||
|
@ -705,7 +690,18 @@ def setvars(gene,drug):
|
|||
, '\ny_test ratio:', yc2_ratio
|
||||
, '\n-------------------------------------------------------------'
|
||||
)
|
||||
##########################################################################
|
||||
# Quick check
|
||||
#(X['ligand_affinity_change']==0).sum() == (X['ligand_distance']>10).sum()
|
||||
for i in range(len(cols_to_mask)):
|
||||
ind = i+1
|
||||
print('\nindex:', i, '\nind:', ind)
|
||||
print('\nMask count check:'
|
||||
, (my_df_ml[cols_to_mask[i]]==0).sum() == (my_df_ml['ligand_distance']>10).sum()
|
||||
)
|
||||
|
||||
print('Original Data\n', Counter(y)
|
||||
, 'Data dim:', X.shape)
|
||||
###########################################################################
|
||||
#%%
|
||||
###########################################################################
|
||||
|
@ -760,7 +756,7 @@ def setvars(gene,drug):
|
|||
k_sm = 5 # 5 is default
|
||||
sm_nc = SMOTENC(categorical_features=categorical_colind, k_neighbors = k_sm, **rs, **njobs)
|
||||
X_smnc, y_smnc = sm_nc.fit_resample(X, y)
|
||||
print('SMOTE_NC OverSampling\n', Counter(y_smnc))
|
||||
print('\nSMOTE_NC OverSampling\n', Counter(y_smnc))
|
||||
print(X_smnc.shape)
|
||||
globals().update(locals()) # TROLOLOLOLOLOLS
|
||||
#print("i did a horrible hack :-)")
|
||||
|
@ -774,7 +770,7 @@ def setvars(gene,drug):
|
|||
# sm = SMOTE(sampling_strategy = 'auto', k_neighbors = k_sm, **rs)
|
||||
# X_sm, y_sm = sm.fit_resample(X, y)
|
||||
# print(X_sm.shape)
|
||||
# print('SMOTE OverSampling\n', Counter(y_sm))
|
||||
# print('\nSMOTE OverSampling\n', Counter(y_sm))
|
||||
# y_sm_df = y_sm.to_frame()
|
||||
# y_sm_df.value_counts().plot(kind = 'bar')
|
||||
|
||||
|
@ -785,7 +781,7 @@ def setvars(gene,drug):
|
|||
# sm_enn = SMOTEENN(enn=EditedNearestNeighbours(sampling_strategy='all', **rs, **njobs ))
|
||||
# X_enn, y_enn = sm_enn.fit_resample(X, y)
|
||||
# print(X_enn.shape)
|
||||
# print('SMOTE Over+Under Sampling combined\n', Counter(y_enn))
|
||||
# print('\nSMOTE Over+Under Sampling combined\n', Counter(y_enn))
|
||||
|
||||
###############################################################################
|
||||
# TODO: Find over- and undersampling methods JUST for categorical data
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue