minor formatting consistency for 7030 scripts

This commit is contained in:
Tanushree Tunstall 2022-06-18 14:41:05 +01:00
parent e05e4e2e38
commit 2e50a555a0
8 changed files with 192 additions and 129 deletions

View file

@@ -40,11 +40,19 @@ outdir_ml = outdir + 'ml/tts_7030/'
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
print('Sanity checks:'
, '\nML source data size:', x_features.shape
, '\nTotal input features:', X.shape
, '\nTarget feature numbers:', Counter(y)
, '\nTarget features ratio:', yc1_ratio
print('\nSanity checks:'
#, '\nML source data size:', x_features.shape
, '\nTotal input features:', len(X.columns)
, '\n'
, '\nTraining data size:', X.shape
, '\nTest data size:', X_bts.shape
, '\n'
, '\nTarget feature numbers (training data):', Counter(y)
, '\nTarget features ratio (training data):', yc1_ratio
, '\n'
, '\nTarget feature numbers (test data):', Counter(y_bts)
, '\nTarget features ratio (test data):', yc2_ratio
, '\n\n#####################################################################\n')
print('\n================================================================\n')
@@ -59,8 +67,8 @@ print('Structural features (n):'
print('AAindex features (n):'
, len(X_aaindexFN)
# , '\nThese are:\n'
# , X_aaindexFN
, '\nThese are:\n'
, X_aaindexFN
, '\n================================================================\n')
print('Evolutionary features (n):'

View file

@@ -40,11 +40,19 @@ outdir_ml = outdir + 'ml/tts_7030/'
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
print('Sanity checks:'
, '\nML source data size:', x_features.shape
, '\nTotal input features:', X.shape
, '\nTarget feature numbers:', Counter(y)
, '\nTarget features ratio:', yc1_ratio
print('\nSanity checks:'
#, '\nML source data size:', x_features.shape
, '\nTotal input features:', len(X.columns)
, '\n'
, '\nTraining data size:', X.shape
, '\nTest data size:', X_bts.shape
, '\n'
, '\nTarget feature numbers (training data):', Counter(y)
, '\nTarget features ratio (training data):', yc1_ratio
, '\n'
, '\nTarget feature numbers (test data):', Counter(y_bts)
, '\nTarget features ratio (test data):', yc2_ratio
, '\n\n#####################################################################\n')
print('\n================================================================\n')
@@ -59,8 +67,8 @@ print('Structural features (n):'
print('AAindex features (n):'
, len(X_aaindexFN)
# , '\nThese are:\n'
# , X_aaindexFN
, '\nThese are:\n'
, X_aaindexFN
, '\n================================================================\n')
print('Evolutionary features (n):'

View file

@@ -40,11 +40,19 @@ outdir_ml = outdir + 'ml/tts_7030/'
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
print('Sanity checks:'
, '\nML source data size:', x_features.shape
, '\nTotal input features:', X.shape
, '\nTarget feature numbers:', Counter(y)
, '\nTarget features ratio:', yc1_ratio
print('\nSanity checks:'
#, '\nML source data size:', x_features.shape
, '\nTotal input features:', len(X.columns)
, '\n'
, '\nTraining data size:', X.shape
, '\nTest data size:', X_bts.shape
, '\n'
, '\nTarget feature numbers (training data):', Counter(y)
, '\nTarget features ratio (training data):', yc1_ratio
, '\n'
, '\nTarget feature numbers (test data):', Counter(y_bts)
, '\nTarget features ratio (test data):', yc2_ratio
, '\n\n#####################################################################\n')
print('\n================================================================\n')
@@ -59,8 +67,8 @@ print('Structural features (n):'
print('AAindex features (n):'
, len(X_aaindexFN)
# , '\nThese are:\n'
# , X_aaindexFN
, '\nThese are:\n'
, X_aaindexFN
, '\n================================================================\n')
print('Evolutionary features (n):'

View file

@@ -40,11 +40,19 @@ outdir_ml = outdir + 'ml/tts_7030/'
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
print('Sanity checks:'
, '\nML source data size:', x_features.shape
, '\nTotal input features:', X.shape
, '\nTarget feature numbers:', Counter(y)
, '\nTarget features ratio:', yc1_ratio
print('\nSanity checks:'
#, '\nML source data size:', x_features.shape
, '\nTotal input features:', len(X.columns)
, '\n'
, '\nTraining data size:', X.shape
, '\nTest data size:', X_bts.shape
, '\n'
, '\nTarget feature numbers (training data):', Counter(y)
, '\nTarget features ratio (training data):', yc1_ratio
, '\n'
, '\nTarget feature numbers (test data):', Counter(y_bts)
, '\nTarget features ratio (test data):', yc2_ratio
, '\n\n#####################################################################\n')
print('\n================================================================\n')
@@ -59,8 +67,8 @@ print('Structural features (n):'
print('AAindex features (n):'
, len(X_aaindexFN)
# , '\nThese are:\n'
# , X_aaindexFN
, '\nThese are:\n'
, X_aaindexFN
, '\n================================================================\n')
print('Evolutionary features (n):'

View file

@@ -423,22 +423,6 @@ def setvars(gene,drug):
#==========================
my_df_ml = my_df.copy()
#===============================
# Training and BLIND test set
#===============================
# Separate blind test set
my_df_ml[drug].isna().sum()
blind_test_df = my_df_ml[my_df_ml[drug].isna()]
blind_test_df.shape
training_df = my_df_ml[my_df_ml[drug].notna()]
training_df.shape
# Target1: dst_mode
training_df[drug].value_counts()
training_df['dst_mode'].value_counts()
#%% Build X: input for ML
common_cols_stabiltyN = ['ligand_distance'
, 'ligand_affinity_change'
@@ -546,7 +530,6 @@ def setvars(gene,drug):
# Masking columns:
# (mCSM-lig, mCSM-NA, mCSM-ppi2) values for lig_dist >10
#=======================
#%% Masking columns
# my_df_ml['mutationinformation'][my_df['ligand_distance']>10].value_counts()
# my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts()
@@ -567,6 +550,23 @@ def setvars(gene,drug):
mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
mask_check.to_csv(outdir + 'ml/' + gene.lower() + '_mask_check.csv')
#===================================================
# Training and BLIND test set: actual vs imputed
# dst with actual values : training set
# dst with imputed values : blind test
#==================================================
my_df_ml[drug].isna().sum() #'na' ones are the blind_test set
blind_test_df = my_df_ml[my_df_ml[drug].isna()]
blind_test_df.shape
training_df = my_df_ml[my_df_ml[drug].notna()]
training_df.shape
# Target 1: dst_mode
training_df[drug].value_counts()
training_df['dst_mode'].value_counts()
####################################################################
#%% extracting dfs based on numerical, categorical column names
#----------------------------------
# WITHOUT the target var included
@@ -625,11 +625,11 @@ def setvars(gene,drug):
print('Original Data\n', Counter(y)
, 'Data dim:', X.shape)
###############################################################################
###########################################################################
#%%
############################################################################
###########################################################################
# RESAMPLING
###############################################################################
###########################################################################
#------------------------------
# Simple Random oversampling
# [Numerical + categorical]

View file

@@ -528,7 +528,6 @@ def setvars(gene,drug):
# Masking columns:
# (mCSM-lig, mCSM-NA, mCSM-ppi2) values for lig_dist >10
#=======================
#%% Masking columns
# my_df_ml['mutationinformation'][my_df['ligand_distance']>10].value_counts()
# my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts()
@@ -557,7 +556,6 @@ def setvars(gene,drug):
# as these were imputed values and initial analysis shows that this
# is not very representative
#================================================================
# Separate blind test set
my_df_ml[drug].isna().sum()
# blind_test_df = my_df_ml[my_df_ml[drug].isna()]
# blind_test_df.shape
@@ -565,12 +563,14 @@ def setvars(gene,drug):
training_df = my_df_ml[my_df_ml[drug].notna()]
training_df.shape
# Target1: dst_mode
# Target 1: dst_mode
training_df[drug].value_counts()
training_df['dst_mode'].value_counts()
####################################################################
###############################################################################
###############################################################################
# #%% extracting dfs based on numerical, categorical column names
# #----------------------------------
# # WITHOUT the target var included
@@ -597,6 +597,7 @@ def setvars(gene,drug):
# all_df_wtgt = training_df[numerical_FN + categorical_FN + ['dst_mode']]
# all_df_wtgt.shape
#%%########################################################################
# #============
# # ML data: OLD
@@ -629,13 +630,14 @@ def setvars(gene,drug):
# print('Original Data\n', Counter(y)
# , 'Data dim:', X.shape)
#============
###############################################################################
###############################################################################
#====================================
# ML data: Train test split: 70/30
# with stratification
# 70% : training_data for CV
# 30% : blind test
#============
#=====================================
# features: all_df or
x_features = training_df[numerical_FN + categorical_FN]
@@ -664,7 +666,9 @@ def setvars(gene,drug):
yc2_ratio = yc2[0]/yc2[1]
print('\n-------------------------------------------------------------'
, '\nSuccessfully split data 70/30 with stratification'
, '\nSuccessfully split data with stratification: 70/30'
, '\nTrain data size:', X.shape
, '\nTest data size:', X_bts.shape
, '\ny_train numbers:', yc1
, '\ny_train ratio:',yc1_ratio
, '\n'
@@ -672,12 +676,23 @@ def setvars(gene,drug):
, '\ny_test ratio:', yc2_ratio
, '\n-------------------------------------------------------------'
)
##########################################################################
# Quick check
#(X['ligand_affinity_change']==0).sum() == (X['ligand_distance']>10).sum()
for i in range(len(cols_to_mask)):
ind = i+1
print('\nindex:', i, '\nind:', ind)
print('\nMask count check:'
, (my_df_ml[cols_to_mask[i]]==0).sum() == (my_df_ml['ligand_distance']>10).sum()
)
###############################################################################
print('Original Data\n', Counter(y)
, 'Data dim:', X.shape)
###########################################################################
#%%
############################################################################
###########################################################################
# RESAMPLING
###############################################################################
###########################################################################
#------------------------------
# Simple Random oversampling
# [Numerical + categorical]

View file

@@ -40,11 +40,19 @@ outdir_ml = outdir + 'ml/tts_7030/'
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
print('Sanity checks:'
, '\nML source data size:', x_features.shape
, '\nTotal input features:', X.shape
, '\nTarget feature numbers:', Counter(y)
, '\nTarget features ratio:', yc1_ratio
print('\nSanity checks:'
#, '\nML source data size:', x_features.shape
, '\nTotal input features:', len(X.columns)
, '\n'
, '\nTraining data size:', X.shape
, '\nTest data size:', X_bts.shape
, '\n'
, '\nTarget feature numbers (training data):', Counter(y)
, '\nTarget features ratio (training data):', yc1_ratio
, '\n'
, '\nTarget feature numbers (test data):', Counter(y_bts)
, '\nTarget features ratio (test data):', yc2_ratio
, '\n\n#####################################################################\n')
print('\n================================================================\n')
@@ -59,8 +67,8 @@ print('Structural features (n):'
print('AAindex features (n):'
, len(X_aaindexFN)
# , '\nThese are:\n'
# , X_aaindexFN
, '\nThese are:\n'
, X_aaindexFN
, '\n================================================================\n')
print('Evolutionary features (n):'

View file

@@ -40,11 +40,19 @@ outdir_ml = outdir + 'ml/tts_7030/'
print('\nOutput directory:', outdir_ml)
#%%###########################################################################
print('Sanity checks:'
, '\nML source data size:', x_features.shape
, '\nTotal input features:', X.shape
, '\nTarget feature numbers:', Counter(y)
, '\nTarget features ratio:', yc1_ratio
print('\nSanity checks:'
#, '\nML source data size:', x_features.shape
, '\nTotal input features:', len(X.columns)
, '\n'
, '\nTraining data size:', X.shape
, '\nTest data size:', X_bts.shape
, '\n'
, '\nTarget feature numbers (training data):', Counter(y)
, '\nTarget features ratio (training data):', yc1_ratio
, '\n'
, '\nTarget feature numbers (test data):', Counter(y_bts)
, '\nTarget features ratio (test data):', yc2_ratio
, '\n\n#####################################################################\n')
print('\n================================================================\n')
@@ -59,8 +67,8 @@ print('Structural features (n):'
print('AAindex features (n):'
, len(X_aaindexFN)
# , '\nThese are:\n'
# , X_aaindexFN
, '\nThese are:\n'
, X_aaindexFN
, '\n================================================================\n')
print('Evolutionary features (n):'