Added run_7030.py, which runs from the command line for all gene targets and sampling methods and writes the results to a single CSV file

This commit is contained in:
Tanushree Tunstall 2022-06-21 20:37:53 +01:00
parent 5b0ccdfec4
commit bc12dbd7c2
5 changed files with 749 additions and 229 deletions

View file

@ -61,7 +61,6 @@ def setvars(gene,drug):
jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
#%% FOR LATER: Combine ED logo data
#%% DONE: active aa site annotations **DONE on 15/05/2022 as part of generating merged_dfs
###########################################################################
rs = {'random_state': 42}
njobs = {'n_jobs': 10}
@ -419,7 +418,7 @@ def setvars(gene,drug):
#---------------------------------------
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
#%% Data for ML
#%%########################################################################
#==========================
# Data for ML
#==========================
@ -551,8 +550,7 @@ def setvars(gene,drug):
, 'polarity_change'
, 'water_change'
, 'active_site']
X_resprop_FN = X_aaindex_Fnum + X_str_Fnum + X_aap_Fcat
###############################################################################
#========================
@ -594,8 +592,7 @@ def setvars(gene,drug):
###############################################################################
#%% Define training and test data
#======================================================
# Training and BLIND test set [UQ]: actual vs imputed
# No aa index but active_site included
# Training and BLIND test set: actual vs imputed
# dst with actual values : training set
# dst with imputed values : blind test
#======================================================
@ -612,9 +609,9 @@ def setvars(gene,drug):
training_df['dst_mode'].value_counts()
####################################################################
#============
# ML data
#============
#=====================================
# ML data: actual vs imputed
#=====================================
#------
# X: Training and Blind test (BTS)
#------
@ -625,20 +622,8 @@ def setvars(gene,drug):
# y
#------
y = training_df['dst_mode']
y_bts = blind_test_df['dst_mode']
# Quick check
#(X['ligand_affinity_change']==0).sum() == (X['ligand_distance']>10).sum()
for i in range(len(cols_to_mask)):
ind = i+1
print('\nindex:', i, '\nind:', ind)
print('\nMask count check:'
, (my_df_ml[cols_to_mask[i]]==0).sum() == (my_df_ml['ligand_distance']>10).sum()
)
print('Original Data\n', Counter(y)
, 'Data dim:', X.shape)
y_bts = blind_test_df['dst_mode']
yc1 = Counter(y)
yc1_ratio = yc1[0]/yc1[1]
@ -705,7 +690,18 @@ def setvars(gene,drug):
, '\ny_test ratio:', yc2_ratio
, '\n-------------------------------------------------------------'
)
##########################################################################
# Quick check
#(X['ligand_affinity_change']==0).sum() == (X['ligand_distance']>10).sum()
for i in range(len(cols_to_mask)):
ind = i+1
print('\nindex:', i, '\nind:', ind)
print('\nMask count check:'
, (my_df_ml[cols_to_mask[i]]==0).sum() == (my_df_ml['ligand_distance']>10).sum()
)
print('Original Data\n', Counter(y)
, 'Data dim:', X.shape)
###########################################################################
#%%
###########################################################################
@ -760,7 +756,7 @@ def setvars(gene,drug):
k_sm = 5 # 5 is default
sm_nc = SMOTENC(categorical_features=categorical_colind, k_neighbors = k_sm, **rs, **njobs)
X_smnc, y_smnc = sm_nc.fit_resample(X, y)
print('SMOTE_NC OverSampling\n', Counter(y_smnc))
print('\nSMOTE_NC OverSampling\n', Counter(y_smnc))
print(X_smnc.shape)
globals().update(locals()) # TROLOLOLOLOLOLS
#print("i did a horrible hack :-)")
@ -774,7 +770,7 @@ def setvars(gene,drug):
# sm = SMOTE(sampling_strategy = 'auto', k_neighbors = k_sm, **rs)
# X_sm, y_sm = sm.fit_resample(X, y)
# print(X_sm.shape)
# print('SMOTE OverSampling\n', Counter(y_sm))
# print('\nSMOTE OverSampling\n', Counter(y_sm))
# y_sm_df = y_sm.to_frame()
# y_sm_df.value_counts().plot(kind = 'bar')
@ -785,7 +781,7 @@ def setvars(gene,drug):
# sm_enn = SMOTEENN(enn=EditedNearestNeighbours(sampling_strategy='all', **rs, **njobs ))
# X_enn, y_enn = sm_enn.fit_resample(X, y)
# print(X_enn.shape)
# print('SMOTE Over+Under Sampling combined\n', Counter(y_enn))
# print('\nSMOTE Over+Under Sampling combined\n', Counter(y_enn))
###############################################################################
# TODO: Find over and undersampling JUST for categorical data