minor formatting consistency for 7030 scripts
parent e05e4e2e38
commit 2e50a555a0
8 changed files with 192 additions and 129 deletions
@@ -40,11 +40,19 @@ outdir_ml = outdir + 'ml/tts_7030/'
print('\nOutput directory:', outdir_ml)

#%%###########################################################################
print('Sanity checks:'
, '\nML source data size:', x_features.shape
, '\nTotal input features:', X.shape
, '\nTarget feature numbers:', Counter(y)
, '\nTarget features ratio:', yc1_ratio
print('\nSanity checks:'
#, '\nML source data size:', x_features.shape
, '\nTotal input features:', len(X.columns)
, '\n'
, '\nTraining data size:', X.shape
, '\nTest data size:', X_bts.shape
, '\n'
, '\nTarget feature numbers (training data):', Counter(y)
, '\nTarget features ratio (training data):', yc1_ratio
, '\n'
, '\nTarget feature numbers (test data):', Counter(y_bts)
, '\nTarget features ratio (test data):', yc2_ratio

, '\n\n#####################################################################\n')
print('\n================================================================\n')
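The class counts and ratios referenced above (Counter(y), yc1_ratio, yc2_ratio) are computed outside this hunk. As a rough illustration only, assuming y and y_bts are the binary training and test target Series and following the yc2_ratio = yc2[0]/yc2[1] pattern that appears later in this commit, they could be derived like this:

from collections import Counter

# illustrative sketch -- not taken from the scripts themselves
yc1 = Counter(y)             # training target class counts, e.g. Counter({0: 400, 1: 150})
yc1_ratio = yc1[0] / yc1[1]  # class 0 : class 1 ratio for the training data

yc2 = Counter(y_bts)         # test target class counts
yc2_ratio = yc2[0] / yc2[1]  # class 0 : class 1 ratio for the test data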
@@ -59,8 +67,8 @@ print('Strucutral features (n):'

print('AAindex features (n):'
, len(X_aaindexFN)
# , '\nThese are:\n'
# , X_aaindexFN
, '\nThese are:\n'
, X_aaindexFN
, '\n================================================================\n')

print('Evolutionary features (n):'
@@ -40,11 +40,19 @@ outdir_ml = outdir + 'ml/tts_7030/'
print('\nOutput directory:', outdir_ml)

#%%###########################################################################
print('Sanity checks:'
, '\nML source data size:', x_features.shape
, '\nTotal input features:', X.shape
, '\nTarget feature numbers:', Counter(y)
, '\nTarget features ratio:', yc1_ratio
print('\nSanity checks:'
#, '\nML source data size:', x_features.shape
, '\nTotal input features:', len(X.columns)
, '\n'
, '\nTraining data size:', X.shape
, '\nTest data size:', X_bts.shape
, '\n'
, '\nTarget feature numbers (training data):', Counter(y)
, '\nTarget features ratio (training data):', yc1_ratio
, '\n'
, '\nTarget feature numbers (test data):', Counter(y_bts)
, '\nTarget features ratio (test data):', yc2_ratio

, '\n\n#####################################################################\n')

print('\n================================================================\n')

@@ -59,8 +67,8 @@ print('Strucutral features (n):'

print('AAindex features (n):'
, len(X_aaindexFN)
# , '\nThese are:\n'
# , X_aaindexFN
, '\nThese are:\n'
, X_aaindexFN
, '\n================================================================\n')

print('Evolutionary features (n):'
@@ -40,11 +40,19 @@ outdir_ml = outdir + 'ml/tts_7030/'
print('\nOutput directory:', outdir_ml)

#%%###########################################################################
print('Sanity checks:'
, '\nML source data size:', x_features.shape
, '\nTotal input features:', X.shape
, '\nTarget feature numbers:', Counter(y)
, '\nTarget features ratio:', yc1_ratio
print('\nSanity checks:'
#, '\nML source data size:', x_features.shape
, '\nTotal input features:', len(X.columns)
, '\n'
, '\nTraining data size:', X.shape
, '\nTest data size:', X_bts.shape
, '\n'
, '\nTarget feature numbers (training data):', Counter(y)
, '\nTarget features ratio (training data):', yc1_ratio
, '\n'
, '\nTarget feature numbers (test data):', Counter(y_bts)
, '\nTarget features ratio (test data):', yc2_ratio

, '\n\n#####################################################################\n')

print('\n================================================================\n')

@@ -59,8 +67,8 @@ print('Strucutral features (n):'

print('AAindex features (n):'
, len(X_aaindexFN)
# , '\nThese are:\n'
# , X_aaindexFN
, '\nThese are:\n'
, X_aaindexFN
, '\n================================================================\n')

print('Evolutionary features (n):'
@@ -40,11 +40,19 @@ outdir_ml = outdir + 'ml/tts_7030/'
print('\nOutput directory:', outdir_ml)

#%%###########################################################################
print('Sanity checks:'
, '\nML source data size:', x_features.shape
, '\nTotal input features:', X.shape
, '\nTarget feature numbers:', Counter(y)
, '\nTarget features ratio:', yc1_ratio
print('\nSanity checks:'
#, '\nML source data size:', x_features.shape
, '\nTotal input features:', len(X.columns)
, '\n'
, '\nTraining data size:', X.shape
, '\nTest data size:', X_bts.shape
, '\n'
, '\nTarget feature numbers (training data):', Counter(y)
, '\nTarget features ratio (training data):', yc1_ratio
, '\n'
, '\nTarget feature numbers (test data):', Counter(y_bts)
, '\nTarget features ratio (test data):', yc2_ratio

, '\n\n#####################################################################\n')

print('\n================================================================\n')

@@ -59,8 +67,8 @@ print('Strucutral features (n):'

print('AAindex features (n):'
, len(X_aaindexFN)
# , '\nThese are:\n'
# , X_aaindexFN
, '\nThese are:\n'
, X_aaindexFN
, '\n================================================================\n')

print('Evolutionary features (n):'
@@ -423,22 +423,6 @@ def setvars(gene,drug):
#==========================
my_df_ml = my_df.copy()

#===============================
# Training and BLIND test set
#===============================
# Separate blind test set
my_df_ml[drug].isna().sum()

blind_test_df = my_df_ml[my_df_ml[drug].isna()]
blind_test_df.shape

training_df = my_df_ml[my_df_ml[drug].notna()]
training_df.shape

# Target1: dst_mode
training_df[drug].value_counts()
training_df['dst_mode'].value_counts()

#%% Build X: input for ML
common_cols_stabiltyN = ['ligand_distance'
, 'ligand_affinity_change'

@@ -546,7 +530,6 @@ def setvars(gene,drug):
# Masking columns:
# (mCSM-lig, mCSM-NA, mCSM-ppi2) values for lig_dist >10
#=======================
#%% Masking columns
# my_df_ml['mutationinformation'][my_df['ligand_distance']>10].value_counts()
# my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts()
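The masking itself happens outside this hunk; below is a minimal sketch of what it presumably does, consistent with the mask count check further down ((my_df_ml[cols_to_mask[i]]==0).sum() == (my_df_ml['ligand_distance']>10).sum()). The column list here is hypothetical; the real cols_to_mask is defined elsewhere in the script:

# hypothetical column list -- the actual cols_to_mask lives elsewhere in the script
cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity', 'mcsm_ppi2_affinity']

# zero out affinity-type values for mutations sitting >10 A from the ligand
my_df_ml.loc[my_df_ml['ligand_distance'] > 10, cols_to_mask] = 0

# keep the affected rows for inspection (cf. mask_check below)
mask_check = my_df_ml.loc[my_df_ml['ligand_distance'] > 10,
                          ['mutationinformation', 'ligand_distance'] + cols_to_mask]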
@@ -567,6 +550,23 @@ def setvars(gene,drug):
mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
mask_check.to_csv(outdir + 'ml/' + gene.lower() + '_mask_check.csv')

#===================================================
# Training and BLIND test set: actual vs imputed
# dst with actual values : training set
# dst with imputed values : blind test
#==================================================
my_df_ml[drug].isna().sum() #'na' ones are the blind_test set

blind_test_df = my_df_ml[my_df_ml[drug].isna()]
blind_test_df.shape

training_df = my_df_ml[my_df_ml[drug].notna()]
training_df.shape

# Target 1: dst_mode
training_df[drug].value_counts()
training_df['dst_mode'].value_counts()
####################################################################
#%% extracting dfs based on numerical, categorical column names
#----------------------------------
# WITHOUT the target var included

@@ -625,11 +625,11 @@ def setvars(gene,drug):
print('Original Data\n', Counter(y)
, 'Data dim:', X.shape)

###############################################################################
###########################################################################
#%%
############################################################################
###########################################################################
# RESAMPLING
###############################################################################
###########################################################################
#------------------------------
# Simple Random oversampling
# [Numerical + catgeorical]
@@ -528,7 +528,6 @@ def setvars(gene,drug):
# Masking columns:
# (mCSM-lig, mCSM-NA, mCSM-ppi2) values for lig_dist >10
#=======================
#%% Masking columns
# my_df_ml['mutationinformation'][my_df['ligand_distance']>10].value_counts()
# my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts()

@@ -557,7 +556,6 @@ def setvars(gene,drug):
# as these were imputed values and initial analysis shows that this
# is not very representative
#================================================================
# Separate blind test set
my_df_ml[drug].isna().sum()
# blind_test_df = my_df_ml[my_df_ml[drug].isna()]
# blind_test_df.shape

@@ -565,77 +563,81 @@ def setvars(gene,drug):
training_df = my_df_ml[my_df_ml[drug].notna()]
training_df.shape

# Target1: dst_mode
# Target 1: dst_mode
training_df[drug].value_counts()
training_df['dst_mode'].value_counts()

####################################################################

# #%% extracting dfs based on numerical, categorical column names
# #----------------------------------
# # WITHOUT the target var included
# #----------------------------------
# num_df = training_df[numerical_FN]
# num_df.shape
###############################################################################
###############################################################################
# #%% extracting dfs based on numerical, categorical column names
# #----------------------------------
# # WITHOUT the target var included
# #----------------------------------
# num_df = training_df[numerical_FN]
# num_df.shape

# cat_df = training_df[categorical_FN]
# cat_df.shape
# cat_df = training_df[categorical_FN]
# cat_df.shape

# all_df = training_df[numerical_FN + categorical_FN]
# all_df.shape
# all_df = training_df[numerical_FN + categorical_FN]
# all_df.shape

# #------------------------------
# # WITH the target var included:
# #'wtgt': with target
# #------------------------------
# # drug and dst_mode should be the same thing
# num_df_wtgt = training_df[numerical_FN + ['dst_mode']]
# num_df_wtgt.shape
# #------------------------------
# # WITH the target var included:
# #'wtgt': with target
# #------------------------------
# # drug and dst_mode should be the same thing
# num_df_wtgt = training_df[numerical_FN + ['dst_mode']]
# num_df_wtgt.shape

# cat_df_wtgt = training_df[categorical_FN + ['dst_mode']]
# cat_df_wtgt.shape
# cat_df_wtgt = training_df[categorical_FN + ['dst_mode']]
# cat_df_wtgt.shape

# all_df_wtgt = training_df[numerical_FN + categorical_FN + ['dst_mode']]
# all_df_wtgt.shape

# all_df_wtgt = training_df[numerical_FN + categorical_FN + ['dst_mode']]
# all_df_wtgt.shape
#%%########################################################################
# #============
# # ML data: OLD
# #============
# #------
# # X: Training and Blind test (BTS)
# #------
# X = all_df_wtgt[numerical_FN + categorical_FN] # training data ALL
# X_bts = blind_test_df[numerical_FN + categorical_FN] # blind test data ALL
# #X = all_df_wtgt[numerical_FN] # training numerical only
# #X_bts = blind_test_df[numerical_FN] # blind test data numerical
# #============
# # ML data: OLD
# #============
# #------
# # X: Training and Blind test (BTS)
# #------
# X = all_df_wtgt[numerical_FN + categorical_FN] # training data ALL
# X_bts = blind_test_df[numerical_FN + categorical_FN] # blind test data ALL
# #X = all_df_wtgt[numerical_FN] # training numerical only
# #X_bts = blind_test_df[numerical_FN] # blind test data numerical

# #------
# # y
# #------
# y = all_df_wtgt['dst_mode'] # training data y
# y_bts = blind_test_df['dst_mode'] # blind data test y
# #------
# # y
# #------
# y = all_df_wtgt['dst_mode'] # training data y
# y_bts = blind_test_df['dst_mode'] # blind data test y

# #X_bts_wt = blind_test_df[numerical_FN + ['dst_mode']]
# #X_bts_wt = blind_test_df[numerical_FN + ['dst_mode']]

# # Quick check
# #(X['ligand_affinity_change']==0).sum() == (X['ligand_distance']>10).sum()
# for i in range(len(cols_to_mask)):
# ind = i+1
# print('\nindex:', i, '\nind:', ind)
# print('\nMask count check:'
# , (my_df_ml[cols_to_mask[i]]==0).sum() == (my_df_ml['ligand_distance']>10).sum()
# )
# # Quick check
# #(X['ligand_affinity_change']==0).sum() == (X['ligand_distance']>10).sum()
# for i in range(len(cols_to_mask)):
# ind = i+1
# print('\nindex:', i, '\nind:', ind)
# print('\nMask count check:'
# , (my_df_ml[cols_to_mask[i]]==0).sum() == (my_df_ml['ligand_distance']>10).sum()
# )

# print('Original Data\n', Counter(y)
# , 'Data dim:', X.shape)
# print('Original Data\n', Counter(y)
# , 'Data dim:', X.shape)

#============
###############################################################################
###############################################################################
#====================================
# ML data: Train test split: 70/30
# with stratification
# 70% : training_data for CV
# 30% : blind test
#============
#=====================================

# features: all_df or
x_features = training_df[numerical_FN + categorical_FN]
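The split call itself is outside this hunk; below is a minimal sketch of a stratified 70/30 split producing the X/X_bts/y/y_bts names printed further down, assuming the target is taken from training_df['dst_mode'] (the y_target name and random_state are placeholders):

from sklearn.model_selection import train_test_split

y_target = training_df['dst_mode']   # assumed target, cf. 'Target 1: dst_mode' above

# 70% training data (X, y) for CV, 30% held out as the blind test set (X_bts, y_bts),
# stratified so both splits keep the same class balance
X, X_bts, y, y_bts = train_test_split(x_features, y_target,
                                      test_size=0.30,
                                      stratify=y_target,
                                      random_state=42)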
@@ -664,7 +666,9 @@ def setvars(gene,drug):
yc2_ratio = yc2[0]/yc2[1]

print('\n-------------------------------------------------------------'
, '\nSuccessfully split data 70/30 with stratification'
, '\nSuccessfully split data with stratification: 70/30'
, '\nTrain data size:', X.shape
, '\nTest data size:', X_bts.shape
, '\ny_train numbers:', yc1
, '\ny_train ratio:',yc1_ratio
, '\n'
@@ -672,12 +676,23 @@ def setvars(gene,drug):
, '\ny_test ratio:', yc2_ratio
, '\n-------------------------------------------------------------'
)
##########################################################################
# Quick check
#(X['ligand_affinity_change']==0).sum() == (X['ligand_distance']>10).sum()
for i in range(len(cols_to_mask)):
    ind = i+1
    print('\nindex:', i, '\nind:', ind)
    print('\nMask count check:'
          , (my_df_ml[cols_to_mask[i]]==0).sum() == (my_df_ml['ligand_distance']>10).sum()
          )

###############################################################################
print('Original Data\n', Counter(y)
, 'Data dim:', X.shape)
###########################################################################
#%%
############################################################################
###########################################################################
# RESAMPLING
###############################################################################
###########################################################################
#------------------------------
# Simple Random oversampling
# [Numerical + catgeorical]
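The resampling code is not part of this hunk. One common way to do simple random oversampling on mixed numerical + categorical training data is imbalanced-learn's RandomOverSampler; this is a sketch of the approach, not necessarily the call these scripts make (X_ros and y_ros are placeholder names):

from collections import Counter
from imblearn.over_sampling import RandomOverSampler

# randomly duplicate minority-class rows until the classes are balanced
ros = RandomOverSampler(sampling_strategy='minority', random_state=42)
X_ros, y_ros = ros.fit_resample(X, y)

print('Simple random oversampling\n', Counter(y_ros)
      , 'Data dim:', X_ros.shape)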
@@ -40,11 +40,19 @@ outdir_ml = outdir + 'ml/tts_7030/'
print('\nOutput directory:', outdir_ml)

#%%###########################################################################
print('Sanity checks:'
, '\nML source data size:', x_features.shape
, '\nTotal input features:', X.shape
, '\nTarget feature numbers:', Counter(y)
, '\nTarget features ratio:', yc1_ratio
print('\nSanity checks:'
#, '\nML source data size:', x_features.shape
, '\nTotal input features:', len(X.columns)
, '\n'
, '\nTraining data size:', X.shape
, '\nTest data size:', X_bts.shape
, '\n'
, '\nTarget feature numbers (training data):', Counter(y)
, '\nTarget features ratio (training data):', yc1_ratio
, '\n'
, '\nTarget feature numbers (test data):', Counter(y_bts)
, '\nTarget features ratio (test data):', yc2_ratio

, '\n\n#####################################################################\n')

print('\n================================================================\n')

@@ -59,8 +67,8 @@ print('Strucutral features (n):'

print('AAindex features (n):'
, len(X_aaindexFN)
# , '\nThese are:\n'
# , X_aaindexFN
, '\nThese are:\n'
, X_aaindexFN
, '\n================================================================\n')

print('Evolutionary features (n):'
@@ -40,11 +40,19 @@ outdir_ml = outdir + 'ml/tts_7030/'
print('\nOutput directory:', outdir_ml)

#%%###########################################################################
print('Sanity checks:'
, '\nML source data size:', x_features.shape
, '\nTotal input features:', X.shape
, '\nTarget feature numbers:', Counter(y)
, '\nTarget features ratio:', yc1_ratio
print('\nSanity checks:'
#, '\nML source data size:', x_features.shape
, '\nTotal input features:', len(X.columns)
, '\n'
, '\nTraining data size:', X.shape
, '\nTest data size:', X_bts.shape
, '\n'
, '\nTarget feature numbers (training data):', Counter(y)
, '\nTarget features ratio (training data):', yc1_ratio
, '\n'
, '\nTarget feature numbers (test data):', Counter(y_bts)
, '\nTarget features ratio (test data):', yc2_ratio

, '\n\n#####################################################################\n')

print('\n================================================================\n')

@@ -59,8 +67,8 @@ print('Strucutral features (n):'

print('AAindex features (n):'
, len(X_aaindexFN)
# , '\nThese are:\n'
# , X_aaindexFN
, '\nThese are:\n'
, X_aaindexFN
, '\n================================================================\n')

print('Evolutionary features (n):'