minor formatting consistency for 7030 scripts

This commit is contained in:
Tanushree Tunstall 2022-06-18 14:41:05 +01:00
parent e05e4e2e38
commit 2e50a555a0
8 changed files with 192 additions and 129 deletions

View file

@@ -40,11 +40,19 @@ outdir_ml = outdir + 'ml/tts_7030/'
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
print('Sanity checks:'
, '\nML source data size:', x_features.shape
, '\nTotal input features:', X.shape
, '\nTarget feature numbers:', Counter(y)
, '\nTarget features ratio:', yc1_ratio
print('\nSanity checks:'
#, '\nML source data size:', x_features.shape
, '\nTotal input features:', len(X.columns)
, '\n'
, '\nTraining data size:', X.shape
, '\nTest data size:', X_bts.shape
, '\n'
, '\nTarget feature numbers (training data):', Counter(y)
, '\nTarget features ratio (training data):', yc1_ratio
, '\n'
, '\nTarget feature numbers (test data):', Counter(y_bts)
, '\nTarget features ratio (test data):', yc2_ratio
, '\n\n#####################################################################\n')
print('\n================================================================\n')
@@ -59,8 +67,8 @@ print('Structural features (n):'
print('AAindex features (n):'
, len(X_aaindexFN)
# , '\nThese are:\n'
# , X_aaindexFN
, '\nThese are:\n'
, X_aaindexFN
, '\n================================================================\n')
print('Evolutionary features (n):'

View file

@@ -40,11 +40,19 @@ outdir_ml = outdir + 'ml/tts_7030/'
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
print('Sanity checks:'
, '\nML source data size:', x_features.shape
, '\nTotal input features:', X.shape
, '\nTarget feature numbers:', Counter(y)
, '\nTarget features ratio:', yc1_ratio
print('\nSanity checks:'
#, '\nML source data size:', x_features.shape
, '\nTotal input features:', len(X.columns)
, '\n'
, '\nTraining data size:', X.shape
, '\nTest data size:', X_bts.shape
, '\n'
, '\nTarget feature numbers (training data):', Counter(y)
, '\nTarget features ratio (training data):', yc1_ratio
, '\n'
, '\nTarget feature numbers (test data):', Counter(y_bts)
, '\nTarget features ratio (test data):', yc2_ratio
, '\n\n#####################################################################\n')
print('\n================================================================\n')
@@ -59,8 +67,8 @@ print('Structural features (n):'
print('AAindex features (n):'
, len(X_aaindexFN)
# , '\nThese are:\n'
# , X_aaindexFN
, '\nThese are:\n'
, X_aaindexFN
, '\n================================================================\n')
print('Evolutionary features (n):'

View file

@@ -40,11 +40,19 @@ outdir_ml = outdir + 'ml/tts_7030/'
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
print('Sanity checks:'
, '\nML source data size:', x_features.shape
, '\nTotal input features:', X.shape
, '\nTarget feature numbers:', Counter(y)
, '\nTarget features ratio:', yc1_ratio
print('\nSanity checks:'
#, '\nML source data size:', x_features.shape
, '\nTotal input features:', len(X.columns)
, '\n'
, '\nTraining data size:', X.shape
, '\nTest data size:', X_bts.shape
, '\n'
, '\nTarget feature numbers (training data):', Counter(y)
, '\nTarget features ratio (training data):', yc1_ratio
, '\n'
, '\nTarget feature numbers (test data):', Counter(y_bts)
, '\nTarget features ratio (test data):', yc2_ratio
, '\n\n#####################################################################\n')
print('\n================================================================\n')
@@ -59,8 +67,8 @@ print('Structural features (n):'
print('AAindex features (n):'
, len(X_aaindexFN)
# , '\nThese are:\n'
# , X_aaindexFN
, '\nThese are:\n'
, X_aaindexFN
, '\n================================================================\n')
print('Evolutionary features (n):'

View file

@@ -40,11 +40,19 @@ outdir_ml = outdir + 'ml/tts_7030/'
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
print('Sanity checks:'
, '\nML source data size:', x_features.shape
, '\nTotal input features:', X.shape
, '\nTarget feature numbers:', Counter(y)
, '\nTarget features ratio:', yc1_ratio
print('\nSanity checks:'
#, '\nML source data size:', x_features.shape
, '\nTotal input features:', len(X.columns)
, '\n'
, '\nTraining data size:', X.shape
, '\nTest data size:', X_bts.shape
, '\n'
, '\nTarget feature numbers (training data):', Counter(y)
, '\nTarget features ratio (training data):', yc1_ratio
, '\n'
, '\nTarget feature numbers (test data):', Counter(y_bts)
, '\nTarget features ratio (test data):', yc2_ratio
, '\n\n#####################################################################\n')
print('\n================================================================\n')
@@ -59,8 +67,8 @@ print('Structural features (n):'
print('AAindex features (n):'
, len(X_aaindexFN)
# , '\nThese are:\n'
# , X_aaindexFN
, '\nThese are:\n'
, X_aaindexFN
, '\n================================================================\n')
print('Evolutionary features (n):'

View file

@@ -423,22 +423,6 @@ def setvars(gene,drug):
#==========================
my_df_ml = my_df.copy()
#===============================
# Training and BLIND test set
#===============================
# Separate blind test set
my_df_ml[drug].isna().sum()
blind_test_df = my_df_ml[my_df_ml[drug].isna()]
blind_test_df.shape
training_df = my_df_ml[my_df_ml[drug].notna()]
training_df.shape
# Target1: dst_mode
training_df[drug].value_counts()
training_df['dst_mode'].value_counts()
#%% Build X: input for ML
common_cols_stabiltyN = ['ligand_distance'
, 'ligand_affinity_change'
@@ -546,7 +530,6 @@ def setvars(gene,drug):
# Masking columns:
# (mCSM-lig, mCSM-NA, mCSM-ppi2) values for lig_dist >10
#=======================
#%% Masking columns
# my_df_ml['mutationinformation'][my_df['ligand_distance']>10].value_counts()
# my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts()
@@ -567,6 +550,23 @@ def setvars(gene,drug):
mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
mask_check.to_csv(outdir + 'ml/' + gene.lower() + '_mask_check.csv')
#===================================================
# Training and BLIND test set: actual vs imputed
# dst with actual values : training set
# dst with imputed values : blind test
#==================================================
my_df_ml[drug].isna().sum() #'na' ones are the blind_test set
blind_test_df = my_df_ml[my_df_ml[drug].isna()]
blind_test_df.shape
training_df = my_df_ml[my_df_ml[drug].notna()]
training_df.shape
# Target 1: dst_mode
training_df[drug].value_counts()
training_df['dst_mode'].value_counts()
####################################################################
#%% extracting dfs based on numerical, categorical column names
#----------------------------------
# WITHOUT the target var included
@@ -625,11 +625,11 @@ def setvars(gene,drug):
print('Original Data\n', Counter(y)
, 'Data dim:', X.shape)
###############################################################################
###########################################################################
#%%
############################################################################
###########################################################################
# RESAMPLING
###############################################################################
###########################################################################
#------------------------------
# Simple Random oversampling
# [Numerical + categorical]

View file

@@ -528,7 +528,6 @@ def setvars(gene,drug):
# Masking columns:
# (mCSM-lig, mCSM-NA, mCSM-ppi2) values for lig_dist >10
#=======================
#%% Masking columns
# my_df_ml['mutationinformation'][my_df['ligand_distance']>10].value_counts()
# my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts()
@@ -557,7 +556,6 @@ def setvars(gene,drug):
# as these were imputed values and initial analysis shows that this
# is not very representative
#================================================================
# Separate blind test set
my_df_ml[drug].isna().sum()
# blind_test_df = my_df_ml[my_df_ml[drug].isna()]
# blind_test_df.shape
@@ -565,12 +563,14 @@ def setvars(gene,drug):
training_df = my_df_ml[my_df_ml[drug].notna()]
training_df.shape
# Target1: dst_mode
# Target 1: dst_mode
training_df[drug].value_counts()
training_df['dst_mode'].value_counts()
####################################################################
###############################################################################
###############################################################################
# #%% extracting dfs based on numerical, categorical column names
# #----------------------------------
# # WITHOUT the target var included
@@ -597,6 +597,7 @@ def setvars(gene,drug):
# all_df_wtgt = training_df[numerical_FN + categorical_FN + ['dst_mode']]
# all_df_wtgt.shape
#%%########################################################################
# #============
# # ML data: OLD
@@ -629,13 +630,14 @@ def setvars(gene,drug):
# print('Original Data\n', Counter(y)
# , 'Data dim:', X.shape)
#============
###############################################################################
###############################################################################
#====================================
# ML data: Train test split: 70/30
# with stratification
# 70% : training_data for CV
# 30% : blind test
#============
#=====================================
# features: all_df or
x_features = training_df[numerical_FN + categorical_FN]
@@ -664,7 +666,9 @@ def setvars(gene,drug):
yc2_ratio = yc2[0]/yc2[1]
print('\n-------------------------------------------------------------'
, '\nSuccessfully split data 70/30 with stratification'
, '\nSuccessfully split data with stratification: 70/30'
, '\nTrain data size:', X.shape
, '\nTest data size:', X_bts.shape
, '\ny_train numbers:', yc1
, '\ny_train ratio:',yc1_ratio
, '\n'
@@ -672,12 +676,23 @@ def setvars(gene,drug):
, '\ny_test ratio:', yc2_ratio
, '\n-------------------------------------------------------------'
)
##########################################################################
# Quick check
#(X['ligand_affinity_change']==0).sum() == (X['ligand_distance']>10).sum()
for i in range(len(cols_to_mask)):
ind = i+1
print('\nindex:', i, '\nind:', ind)
print('\nMask count check:'
, (my_df_ml[cols_to_mask[i]]==0).sum() == (my_df_ml['ligand_distance']>10).sum()
)
###############################################################################
print('Original Data\n', Counter(y)
, 'Data dim:', X.shape)
###########################################################################
#%%
############################################################################
###########################################################################
# RESAMPLING
###############################################################################
###########################################################################
#------------------------------
# Simple Random oversampling
# [Numerical + categorical]

View file

@@ -40,11 +40,19 @@ outdir_ml = outdir + 'ml/tts_7030/'
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
print('Sanity checks:'
, '\nML source data size:', x_features.shape
, '\nTotal input features:', X.shape
, '\nTarget feature numbers:', Counter(y)
, '\nTarget features ratio:', yc1_ratio
print('\nSanity checks:'
#, '\nML source data size:', x_features.shape
, '\nTotal input features:', len(X.columns)
, '\n'
, '\nTraining data size:', X.shape
, '\nTest data size:', X_bts.shape
, '\n'
, '\nTarget feature numbers (training data):', Counter(y)
, '\nTarget features ratio (training data):', yc1_ratio
, '\n'
, '\nTarget feature numbers (test data):', Counter(y_bts)
, '\nTarget features ratio (test data):', yc2_ratio
, '\n\n#####################################################################\n')
print('\n================================================================\n')
@@ -59,8 +67,8 @@ print('Structural features (n):'
print('AAindex features (n):'
, len(X_aaindexFN)
# , '\nThese are:\n'
# , X_aaindexFN
, '\nThese are:\n'
, X_aaindexFN
, '\n================================================================\n')
print('Evolutionary features (n):'

View file

@@ -40,11 +40,19 @@ outdir_ml = outdir + 'ml/tts_7030/'
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
print('Sanity checks:'
, '\nML source data size:', x_features.shape
, '\nTotal input features:', X.shape
, '\nTarget feature numbers:', Counter(y)
, '\nTarget features ratio:', yc1_ratio
print('\nSanity checks:'
#, '\nML source data size:', x_features.shape
, '\nTotal input features:', len(X.columns)
, '\n'
, '\nTraining data size:', X.shape
, '\nTest data size:', X_bts.shape
, '\n'
, '\nTarget feature numbers (training data):', Counter(y)
, '\nTarget features ratio (training data):', yc1_ratio
, '\n'
, '\nTarget feature numbers (test data):', Counter(y_bts)
, '\nTarget features ratio (test data):', yc2_ratio
, '\n\n#####################################################################\n')
print('\n================================================================\n')
@@ -59,8 +67,8 @@ print('Structural features (n):'
print('AAindex features (n):'
, len(X_aaindexFN)
# , '\nThese are:\n'
# , X_aaindexFN
, '\nThese are:\n'
, X_aaindexFN
, '\n================================================================\n')
print('Evolutionary features (n):'