minor formatting consistency for 7030 scripts

This commit is contained in:
Tanushree Tunstall 2022-06-18 14:41:05 +01:00
parent e05e4e2e38
commit 2e50a555a0
8 changed files with 192 additions and 129 deletions

View file

@ -422,22 +422,6 @@ def setvars(gene,drug):
# Data for ML
#==========================
my_df_ml = my_df.copy()
#===============================
# Training and BLIND test set
#===============================
# Separate blind test set
my_df_ml[drug].isna().sum()
blind_test_df = my_df_ml[my_df_ml[drug].isna()]
blind_test_df.shape
training_df = my_df_ml[my_df_ml[drug].notna()]
training_df.shape
# Target1: dst_mode
training_df[drug].value_counts()
training_df['dst_mode'].value_counts()
#%% Build X: input for ML
common_cols_stabiltyN = ['ligand_distance'
@ -546,7 +530,6 @@ def setvars(gene,drug):
# Masking columns:
# (mCSM-lig, mCSM-NA, mCSM-ppi2) values for lig_dist >10
#=======================
#%% Masking columns
# my_df_ml['mutationinformation'][my_df['ligand_distance']>10].value_counts()
# my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts()
@ -567,6 +550,23 @@ def setvars(gene,drug):
mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
mask_check.to_csv(outdir + 'ml/' + gene.lower() + '_mask_check.csv')
#===================================================
# Training and BLIND test set: actual vs imputed
# dst with actual values : training set
# dst with imputed values : blind test
#==================================================
my_df_ml[drug].isna().sum() #'na' ones are the blind_test set
blind_test_df = my_df_ml[my_df_ml[drug].isna()]
blind_test_df.shape
training_df = my_df_ml[my_df_ml[drug].notna()]
training_df.shape
# Target 1: dst_mode
training_df[drug].value_counts()
training_df['dst_mode'].value_counts()
####################################################################
#%% extracting dfs based on numerical, categorical column names
#----------------------------------
# WITHOUT the target var included
@ -625,11 +625,11 @@ def setvars(gene,drug):
print('Original Data\n', Counter(y)
, 'Data dim:', X.shape)
###############################################################################
###########################################################################
#%%
############################################################################
###########################################################################
# RESAMPLING
###############################################################################
###########################################################################
#------------------------------
# Simple Random oversampling
# [Numerical + categorical]