diff --git a/scripts/ml/alr_7030.py b/scripts/ml/alr_7030.py
index 90637e4..bf1de0f 100755
--- a/scripts/ml/alr_7030.py
+++ b/scripts/ml/alr_7030.py
@@ -40,11 +40,19 @@ outdir_ml = outdir + 'ml/tts_7030/'
 print('\nOutput directory:', outdir_ml)

 #%%###########################################################################
-print('Sanity checks:'
-      , '\nML source data size:', x_features.shape
-      , '\nTotal input features:', X.shape
-      , '\nTarget feature numbers:', Counter(y)
-      , '\nTarget features ratio:', yc1_ratio
+print('\nSanity checks:'
+      #, '\nML source data size:', x_features.shape
+      , '\nTotal input features:', len(X.columns)
+      , '\n'
+      , '\nTraining data size:', X.shape
+      , '\nTest data size:', X_bts.shape
+      , '\n'
+      , '\nTarget feature numbers (training data):', Counter(y)
+      , '\nTarget features ratio (training data):', yc1_ratio
+      , '\n'
+      , '\nTarget feature numbers (test data):', Counter(y_bts)
+      , '\nTarget features ratio (test data):', yc2_ratio
+      , '\n\n#####################################################################\n')

 print('\n================================================================\n')

@@ -59,8 +67,8 @@ print('Strucutral features (n):'

 print('AAindex features (n):'
       , len(X_aaindexFN)
-      # , '\nThese are:\n'
-      # , X_aaindexFN
+      , '\nThese are:\n'
+      , X_aaindexFN
       , '\n================================================================\n')

 print('Evolutionary features (n):'
diff --git a/scripts/ml/embb_7030.py b/scripts/ml/embb_7030.py
index 37ae81b..9c169b2 100755
--- a/scripts/ml/embb_7030.py
+++ b/scripts/ml/embb_7030.py
@@ -40,11 +40,19 @@ outdir_ml = outdir + 'ml/tts_7030/'
 print('\nOutput directory:', outdir_ml)

 #%%###########################################################################
-print('Sanity checks:'
-      , '\nML source data size:', x_features.shape
-      , '\nTotal input features:', X.shape
-      , '\nTarget feature numbers:', Counter(y)
-      , '\nTarget features ratio:', yc1_ratio
+print('\nSanity checks:'
+      #, '\nML source data size:', x_features.shape
+      , '\nTotal input features:', len(X.columns)
+      , '\n'
+      , '\nTraining data size:', X.shape
+      , '\nTest data size:', X_bts.shape
+      , '\n'
+      , '\nTarget feature numbers (training data):', Counter(y)
+      , '\nTarget features ratio (training data):', yc1_ratio
+      , '\n'
+      , '\nTarget feature numbers (test data):', Counter(y_bts)
+      , '\nTarget features ratio (test data):', yc2_ratio
+      , '\n\n#####################################################################\n')

 print('\n================================================================\n')

@@ -59,8 +67,8 @@ print('Strucutral features (n):'

 print('AAindex features (n):'
       , len(X_aaindexFN)
-      # , '\nThese are:\n'
-      # , X_aaindexFN
+      , '\nThese are:\n'
+      , X_aaindexFN
       , '\n================================================================\n')

 print('Evolutionary features (n):'
diff --git a/scripts/ml/gid_7030.py b/scripts/ml/gid_7030.py
index 0769f9c..9beaaee 100755
--- a/scripts/ml/gid_7030.py
+++ b/scripts/ml/gid_7030.py
@@ -40,11 +40,19 @@ outdir_ml = outdir + 'ml/tts_7030/'
 print('\nOutput directory:', outdir_ml)

 #%%###########################################################################
-print('Sanity checks:'
-      , '\nML source data size:', x_features.shape
-      , '\nTotal input features:', X.shape
-      , '\nTarget feature numbers:', Counter(y)
-      , '\nTarget features ratio:', yc1_ratio
+print('\nSanity checks:'
+      #, '\nML source data size:', x_features.shape
+      , '\nTotal input features:', len(X.columns)
+      , '\n'
+      , '\nTraining data size:', X.shape
+      , '\nTest data size:', X_bts.shape
+      , '\n'
+      , '\nTarget feature numbers (training data):', Counter(y)
+      , '\nTarget features ratio (training data):', yc1_ratio
+      , '\n'
+      , '\nTarget feature numbers (test data):', Counter(y_bts)
+      , '\nTarget features ratio (test data):', yc2_ratio
+      , '\n\n#####################################################################\n')

 print('\n================================================================\n')

@@ -59,8 +67,8 @@ print('Strucutral features (n):'

 print('AAindex features (n):'
       , len(X_aaindexFN)
-      # , '\nThese are:\n'
-      # , X_aaindexFN
+      , '\nThese are:\n'
+      , X_aaindexFN
       , '\n================================================================\n')

 print('Evolutionary features (n):'
diff --git a/scripts/ml/katg_7030.py b/scripts/ml/katg_7030.py
index 0192f21..7bb9307 100755
--- a/scripts/ml/katg_7030.py
+++ b/scripts/ml/katg_7030.py
@@ -40,11 +40,19 @@ outdir_ml = outdir + 'ml/tts_7030/'
 print('\nOutput directory:', outdir_ml)

 #%%###########################################################################
-print('Sanity checks:'
-      , '\nML source data size:', x_features.shape
-      , '\nTotal input features:', X.shape
-      , '\nTarget feature numbers:', Counter(y)
-      , '\nTarget features ratio:', yc1_ratio
+print('\nSanity checks:'
+      #, '\nML source data size:', x_features.shape
+      , '\nTotal input features:', len(X.columns)
+      , '\n'
+      , '\nTraining data size:', X.shape
+      , '\nTest data size:', X_bts.shape
+      , '\n'
+      , '\nTarget feature numbers (training data):', Counter(y)
+      , '\nTarget features ratio (training data):', yc1_ratio
+      , '\n'
+      , '\nTarget feature numbers (test data):', Counter(y_bts)
+      , '\nTarget features ratio (test data):', yc2_ratio
+      , '\n\n#####################################################################\n')

 print('\n================================================================\n')

@@ -59,8 +67,8 @@ print('Strucutral features (n):'

 print('AAindex features (n):'
       , len(X_aaindexFN)
-      # , '\nThese are:\n'
-      # , X_aaindexFN
+      , '\nThese are:\n'
+      , X_aaindexFN
       , '\n================================================================\n')

 print('Evolutionary features (n):'
diff --git a/scripts/ml/ml_data.py b/scripts/ml/ml_data.py
index e63da15..488c549 100644
--- a/scripts/ml/ml_data.py
+++ b/scripts/ml/ml_data.py
@@ -422,22 +422,6 @@ def setvars(gene,drug):
     # Data for ML
     #==========================
     my_df_ml = my_df.copy()
-
-    #===============================
-    # Training and BLIND test set
-    #===============================
-    # Separate blind test set
-    my_df_ml[drug].isna().sum()
-
-    blind_test_df = my_df_ml[my_df_ml[drug].isna()]
-    blind_test_df.shape
-
-    training_df = my_df_ml[my_df_ml[drug].notna()]
-    training_df.shape
-
-    # Target1: dst_mode
-    training_df[drug].value_counts()
-    training_df['dst_mode'].value_counts()

     #%% Build X: input for ML
     common_cols_stabiltyN = ['ligand_distance'
@@ -546,7 +530,6 @@ def setvars(gene,drug):
     # Masking columns:
     # (mCSM-lig, mCSM-NA, mCSM-ppi2) values for lig_dist >10
     #=======================
-    #%% Masking columns
     # my_df_ml['mutationinformation'][my_df['ligand_distance']>10].value_counts()

     # my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts()
@@ -567,6 +550,23 @@ def setvars(gene,drug):
     mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
     mask_check.to_csv(outdir + 'ml/' + gene.lower() + '_mask_check.csv')
+    #===================================================
+    # Training and BLIND test set: actual vs imputed
+    # dst with actual values : training set
+    # dst with imputed values : blind test
+    #==================================================
+    my_df_ml[drug].isna().sum() #'na' ones are the blind_test set
+
+    blind_test_df = my_df_ml[my_df_ml[drug].isna()]
+    blind_test_df.shape
+
+    training_df = my_df_ml[my_df_ml[drug].notna()]
+    training_df.shape
+
+    # Target 1: dst_mode
+    training_df[drug].value_counts()
+    training_df['dst_mode'].value_counts()
+    ####################################################################
     #%% extracting dfs based on numerical, categorical column names
     #----------------------------------
     # WITHOUT the target var included
     #----------------------------------
@@ -625,11 +625,11 @@ def setvars(gene,drug):
     print('Original Data\n', Counter(y)
           , 'Data dim:', X.shape)

-    ###############################################################################
+    ###########################################################################
     #%%
-    ############################################################################
+    ###########################################################################
     # RESAMPLING
-    ###############################################################################
+    ###########################################################################
     #------------------------------
     # Simple Random oversampling
     # [Numerical + catgeorical]
diff --git a/scripts/ml/ml_data_7030.py b/scripts/ml/ml_data_7030.py
index 6e90454..4976439 100644
--- a/scripts/ml/ml_data_7030.py
+++ b/scripts/ml/ml_data_7030.py
@@ -528,7 +528,6 @@ def setvars(gene,drug):
     # Masking columns:
     # (mCSM-lig, mCSM-NA, mCSM-ppi2) values for lig_dist >10
     #=======================
-    #%% Masking columns
     # my_df_ml['mutationinformation'][my_df['ligand_distance']>10].value_counts()

     # my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts()
@@ -557,7 +556,6 @@ def setvars(gene,drug):
     # as these were imputed values and initial analysis shows that this
     # is not very representative
     #================================================================
-    # Separate blind test set
     my_df_ml[drug].isna().sum()
     # blind_test_df = my_df_ml[my_df_ml[drug].isna()]
     # blind_test_df.shape
@@ -565,77 +563,81 @@ def setvars(gene,drug):
     training_df = my_df_ml[my_df_ml[drug].notna()]
     training_df.shape

-    # Target1: dst_mode
+    # Target 1: dst_mode
     training_df[drug].value_counts()
     training_df['dst_mode'].value_counts()
     ####################################################################
-    # #%% extracting dfs based on numerical, categorical column names
-    # #----------------------------------
-    # # WITHOUT the target var included
-    # #----------------------------------
-    # num_df = training_df[numerical_FN]
-    # num_df.shape
-
-    # cat_df = training_df[categorical_FN]
-    # cat_df.shape
-
-    # all_df = training_df[numerical_FN + categorical_FN]
-    # all_df.shape
-
-    # #------------------------------
-    # # WITH the target var included:
-    # #'wtgt': with target
-    # #------------------------------
-    # # drug and dst_mode should be the same thing
-    # num_df_wtgt = training_df[numerical_FN + ['dst_mode']]
-    # num_df_wtgt.shape
-
-    # cat_df_wtgt = training_df[categorical_FN + ['dst_mode']]
-    # cat_df_wtgt.shape
-
-    # all_df_wtgt = training_df[numerical_FN + categorical_FN + ['dst_mode']]
-    # all_df_wtgt.shape
-    #%%########################################################################
-    # #============
-    # # ML data: OLD
-    # #============
-    # #------
-    # # X: Training and Blind test (BTS)
-    # #------
-    # X = all_df_wtgt[numerical_FN + categorical_FN] # training data ALL
-    # X_bts = blind_test_df[numerical_FN + categorical_FN] # blind test data ALL
-    # #X = all_df_wtgt[numerical_FN] # training numerical only
-    # #X_bts = blind_test_df[numerical_FN] # blind test data numerical
-
-    # #------
-    # # y
-    # #------
-    # y = all_df_wtgt['dst_mode'] # training data y
-    # y_bts = blind_test_df['dst_mode'] # blind data test y
-
-    # #X_bts_wt = blind_test_df[numerical_FN + ['dst_mode']]
-
-    # # Quick check
-    # #(X['ligand_affinity_change']==0).sum() == (X['ligand_distance']>10).sum()
-    # for i in range(len(cols_to_mask)):
-    #     ind = i+1
-    #     print('\nindex:', i, '\nind:', ind)
-    #     print('\nMask count check:'
-    #           , (my_df_ml[cols_to_mask[i]]==0).sum() == (my_df_ml['ligand_distance']>10).sum()
-    #           )
-
-    # print('Original Data\n', Counter(y)
-    #       , 'Data dim:', X.shape)
-
+###############################################################################
+###############################################################################
+    # #%% extracting dfs based on numerical, categorical column names
+    # #----------------------------------
+    # # WITHOUT the target var included
+    # #----------------------------------
+    # num_df = training_df[numerical_FN]
+    # num_df.shape
+
+    # cat_df = training_df[categorical_FN]
+    # cat_df.shape
+
+    # all_df = training_df[numerical_FN + categorical_FN]
+    # all_df.shape
+
+    # #------------------------------
+    # # WITH the target var included:
+    # #'wtgt': with target
+    # #------------------------------
+    # # drug and dst_mode should be the same thing
+    # num_df_wtgt = training_df[numerical_FN + ['dst_mode']]
+    # num_df_wtgt.shape
+
+    # cat_df_wtgt = training_df[categorical_FN + ['dst_mode']]
+    # cat_df_wtgt.shape
+
+    # all_df_wtgt = training_df[numerical_FN + categorical_FN + ['dst_mode']]
+    # all_df_wtgt.shape
+
+    #%%########################################################################
+    # #============
+    # # ML data: OLD
+    # #============
+    # #------
+    # # X: Training and Blind test (BTS)
+    # #------
+    # X = all_df_wtgt[numerical_FN + categorical_FN] # training data ALL
+    # X_bts = blind_test_df[numerical_FN + categorical_FN] # blind test data ALL
+    # #X = all_df_wtgt[numerical_FN] # training numerical only
+    # #X_bts = blind_test_df[numerical_FN] # blind test data numerical
+
+    # #------
+    # # y
+    # #------
+    # y = all_df_wtgt['dst_mode'] # training data y
+    # y_bts = blind_test_df['dst_mode'] # blind data test y
+
+    # #X_bts_wt = blind_test_df[numerical_FN + ['dst_mode']]
+
+    # # Quick check
+    # #(X['ligand_affinity_change']==0).sum() == (X['ligand_distance']>10).sum()
+    # for i in range(len(cols_to_mask)):
+    #     ind = i+1
+    #     print('\nindex:', i, '\nind:', ind)
+    #     print('\nMask count check:'
+    #           , (my_df_ml[cols_to_mask[i]]==0).sum() == (my_df_ml['ligand_distance']>10).sum()
+    #           )
+
+    # print('Original Data\n', Counter(y)
+    #       , 'Data dim:', X.shape)
+
+###############################################################################
+###############################################################################
+    #====================================
     # ML data: Train test split: 70/30
     # with stratification
     # 70% : training_data for CV
     # 30% : blind test
-    #============
+    #=====================================
     # features: all_df or
     x_features = training_df[numerical_FN + categorical_FN]
@@ -664,7 +666,9 @@ def setvars(gene,drug):
     yc2_ratio = yc2[0]/yc2[1]

     print('\n-------------------------------------------------------------'
-          , '\nSuccessfully split data 70/30 with stratification'
+          , '\nSuccessfully split data with stratification: 70/30'
+          , '\nTrain data size:', X.shape
+          , '\nTest data size:', X_bts.shape
           , '\ny_train numbers:', yc1
           , '\ny_train ratio:',yc1_ratio
           , '\n'
@@ -672,12 +676,23 @@ def setvars(gene,drug):
           , '\ny_test numbers:', yc2
           , '\ny_test ratio:', yc2_ratio
           , '\n-------------------------------------------------------------'
           )
+    ##########################################################################
+    # Quick check
+    #(X['ligand_affinity_change']==0).sum() == (X['ligand_distance']>10).sum()
+    for i in range(len(cols_to_mask)):
+        ind = i+1
+        print('\nindex:', i, '\nind:', ind)
+        print('\nMask count check:'
+              , (my_df_ml[cols_to_mask[i]]==0).sum() == (my_df_ml['ligand_distance']>10).sum()
+              )

-    ###############################################################################
+    print('Original Data\n', Counter(y)
+          , 'Data dim:', X.shape)
+    ###########################################################################
     #%%
-    ############################################################################
+    ###########################################################################
     # RESAMPLING
-    ###############################################################################
+    ###########################################################################
     #------------------------------
     # Simple Random oversampling
     # [Numerical + catgeorical]
diff --git a/scripts/ml/pnca_7030.py b/scripts/ml/pnca_7030.py
index a04c813..ddda78d 100755
--- a/scripts/ml/pnca_7030.py
+++ b/scripts/ml/pnca_7030.py
@@ -40,11 +40,19 @@ outdir_ml = outdir + 'ml/tts_7030/'
 print('\nOutput directory:', outdir_ml)

 #%%###########################################################################
-print('Sanity checks:'
-      , '\nML source data size:', x_features.shape
-      , '\nTotal input features:', X.shape
-      , '\nTarget feature numbers:', Counter(y)
-      , '\nTarget features ratio:', yc1_ratio
+print('\nSanity checks:'
+      #, '\nML source data size:', x_features.shape
+      , '\nTotal input features:', len(X.columns)
+      , '\n'
+      , '\nTraining data size:', X.shape
+      , '\nTest data size:', X_bts.shape
+      , '\n'
+      , '\nTarget feature numbers (training data):', Counter(y)
+      , '\nTarget features ratio (training data):', yc1_ratio
+      , '\n'
+      , '\nTarget feature numbers (test data):', Counter(y_bts)
+      , '\nTarget features ratio (test data):', yc2_ratio
+      , '\n\n#####################################################################\n')

 print('\n================================================================\n')

@@ -59,8 +67,8 @@ print('Strucutral features (n):'

 print('AAindex features (n):'
       , len(X_aaindexFN)
-      # , '\nThese are:\n'
-      # , X_aaindexFN
+      , '\nThese are:\n'
+      , X_aaindexFN
       , '\n================================================================\n')

 print('Evolutionary features (n):'
diff --git a/scripts/ml/rpob_7030.py b/scripts/ml/rpob_7030.py
index a4e6bb8..2a98bf0 100755
--- a/scripts/ml/rpob_7030.py
+++ b/scripts/ml/rpob_7030.py
@@ -40,11 +40,19 @@ outdir_ml = outdir + 'ml/tts_7030/'
 print('\nOutput directory:', outdir_ml)

 #%%###########################################################################
-print('Sanity checks:'
-      , '\nML source data size:', x_features.shape
-      , '\nTotal input features:', X.shape
-      , '\nTarget feature numbers:', Counter(y)
-      , '\nTarget features ratio:', yc1_ratio
+print('\nSanity checks:'
+      #, '\nML source data size:', x_features.shape
+      , '\nTotal input features:', len(X.columns)
+      , '\n'
+      , '\nTraining data size:', X.shape
+      , '\nTest data size:', X_bts.shape
+      , '\n'
+      , '\nTarget feature numbers (training data):', Counter(y)
+      , '\nTarget features ratio (training data):', yc1_ratio
+      , '\n'
+      , '\nTarget feature numbers (test data):', Counter(y_bts)
+      , '\nTarget features ratio (test data):', yc2_ratio
+      , '\n\n#####################################################################\n')

 print('\n================================================================\n')

@@ -59,8 +67,8 @@ print('Strucutral features (n):'

 print('AAindex features (n):'
       , len(X_aaindexFN)
-      # , '\nThese are:\n'
-      # , X_aaindexFN
+      , '\nThese are:\n'
+      , X_aaindexFN
       , '\n================================================================\n')

 print('Evolutionary features (n):'
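
Note on the 70/30 split reported above: the ml_data_7030.py hunks print X, X_bts, y, y_bts and the yc1/yc2 class ratios, but the split call itself lies outside the changed lines. The sketch below shows how such a stratified 70/30 split is typically produced with scikit-learn's train_test_split; the toy frame, the 'dst_mode' stand-in target and random_state are illustrative assumptions, not taken from the diff.

    # Minimal sketch only: stratified 70/30 train/blind-test split (assumed call, not the repo's code)
    from collections import Counter

    import pandas as pd
    from sklearn.model_selection import train_test_split

    # Placeholder stand-ins for training_df[numerical_FN + categorical_FN] and its 'dst_mode' target
    toy_df = pd.DataFrame({'feat1': range(20)
                           , 'feat2': [0.1] * 20
                           , 'dst_mode': [0, 1] * 10})
    x_features = toy_df[['feat1', 'feat2']]
    y_target = toy_df['dst_mode']

    X, X_bts, y, y_bts = train_test_split(x_features, y_target
                                          , test_size = 0.30     # 30% held out as blind test
                                          , random_state = 42    # illustrative seed
                                          , stratify = y_target) # preserve the class ratio

    yc1 = Counter(y)
    yc1_ratio = yc1[0]/yc1[1]
    yc2 = Counter(y_bts)
    yc2_ratio = yc2[0]/yc2[1]
    print('\ny_train numbers:', yc1, '\ny_train ratio:', yc1_ratio
          , '\ny_test numbers:', yc2, '\ny_test ratio:', yc2_ratio)

With stratify set, the class ratio reported by yc1_ratio and yc2_ratio should match between the training and blind-test splits, which is what the sanity-check prints added to the *_7030.py scripts are meant to confirm.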