minor formatting consistency for 7030 scripts
This commit is contained in:
parent
e05e4e2e38
commit
2e50a555a0
8 changed files with 192 additions and 129 deletions
|
@ -422,22 +422,6 @@ def setvars(gene,drug):
|
|||
# Data for ML
|
||||
#==========================
|
||||
my_df_ml = my_df.copy()
|
||||
|
||||
#===============================
|
||||
# Training and BLIND test set
|
||||
#===============================
|
||||
# Separate blind test set
|
||||
my_df_ml[drug].isna().sum()
|
||||
|
||||
blind_test_df = my_df_ml[my_df_ml[drug].isna()]
|
||||
blind_test_df.shape
|
||||
|
||||
training_df = my_df_ml[my_df_ml[drug].notna()]
|
||||
training_df.shape
|
||||
|
||||
# Target1: dst_mode
|
||||
training_df[drug].value_counts()
|
||||
training_df['dst_mode'].value_counts()
|
||||
|
||||
#%% Build X: input for ML
|
||||
common_cols_stabiltyN = ['ligand_distance'
|
||||
|
@ -546,7 +530,6 @@ def setvars(gene,drug):
|
|||
# Masking columns:
|
||||
# (mCSM-lig, mCSM-NA, mCSM-ppi2) values for lig_dist >10
|
||||
#=======================
|
||||
#%% Masking columns
|
||||
# my_df_ml['mutationinformation'][my_df['ligand_distance']>10].value_counts()
|
||||
# my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts()
|
||||
|
||||
|
@ -567,6 +550,23 @@ def setvars(gene,drug):
|
|||
mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
|
||||
mask_check.to_csv(outdir + 'ml/' + gene.lower() + '_mask_check.csv')
|
||||
|
||||
#===================================================
|
||||
# Training and BLIND test set: actual vs imputed
|
||||
# dst with actual values : training set
|
||||
# dst with imputed values : blind test
|
||||
#==================================================
|
||||
my_df_ml[drug].isna().sum() #'na' ones are the blind_test set
|
||||
|
||||
blind_test_df = my_df_ml[my_df_ml[drug].isna()]
|
||||
blind_test_df.shape
|
||||
|
||||
training_df = my_df_ml[my_df_ml[drug].notna()]
|
||||
training_df.shape
|
||||
|
||||
# Target 1: dst_mode
|
||||
training_df[drug].value_counts()
|
||||
training_df['dst_mode'].value_counts()
|
||||
####################################################################
|
||||
#%% extracting dfs based on numerical, categorical column names
|
||||
#----------------------------------
|
||||
# WITHOUT the target var included
|
||||
|
@ -625,11 +625,11 @@ def setvars(gene,drug):
|
|||
print('Original Data\n', Counter(y)
|
||||
, 'Data dim:', X.shape)
|
||||
|
||||
###############################################################################
|
||||
###########################################################################
|
||||
#%%
|
||||
############################################################################
|
||||
###########################################################################
|
||||
# RESAMPLING
|
||||
###############################################################################
|
||||
###########################################################################
|
||||
#------------------------------
|
||||
# Simple Random oversampling
|
||||
# [Numerical + categorical]
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue