Changed the working directory and the MultModelsCl import used to load ML functions in pnca_config.py

2022-06-17 16:37:07 +01:00 · 2022-06-17 16:37:07 +01:00 · e6d3692445
commit e6d3692445
parent 96d4e61dca
2 changed files with 10 additions and 14 deletions
--- a/scripts/ml/ml_data.py
+++ b/scripts/ml/ml_data.py
@@ -423,9 +423,9 @@ def setvars(gene,drug):
    #==========================
    my_df_ml = my_df.copy()
        
-    #==========================
-    #     BLIND test set
-    #==========================
+    #===============================
+    #   Training and BLIND test set
+    #===============================
    # Separate blind test set
    my_df_ml[drug].isna().sum()
    
@@ -435,7 +435,7 @@ def setvars(gene,drug):
    training_df =  my_df_ml[my_df_ml[drug].notna()]
    training_df.shape
    
-    # Target1: dst
+    # Target1: dst_mode
    training_df[drug].value_counts()
    training_df['dst_mode'].value_counts()
    
@@ -514,15 +514,11 @@ def setvars(gene,drug):
    
    print('\nTotal no. of features for aaindex:', len(X_aaindexFN))
    
-    #%% Construct numerical and categorical column names
    # numerical feature names
-    #    numerical_FN = common_cols_stabiltyN + foldX_cols + X_strFN + X_evolFN + X_genomicFN 
-    
-    #numerical_FN = X_ssFN  + X_evolFN + X_genomicFN
    numerical_FN = X_ssFN  + X_evolFN + X_genomicFN + X_aaindexFN

    
-    #categorical feature names
+    # categorical feature names
    categorical_FN = ['ss_class'
                # , 'wt_prop_water'
                # , 'mut_prop_water'
@@ -534,8 +530,8 @@ def setvars(gene,drug):
                , 'electrostatics_change'
                , 'polarity_change'
                , 'water_change'
-                #, 'drtype_mode_labels' # beware then you can't use it to predict [USED it for uq_v1]
-                , 'active_site'
+                , 'drtype_mode_labels' # beware then you can't use it to predict [USED it for uq_v1, not v2]
+                , 'active_site' #[didn't use it for uq_v1]
                #, 'gene_name' # will be required for the combined stuff
                 ]
    #----------------------------------------------
@@ -561,7 +557,7 @@ def setvars(gene,drug):
    my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts()
    my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask].value_counts()
    
-    # mask the column ligand distance > 10
+    # mask the mcsm affinity related columns where ligand distance > 10
    my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask] = 0
    (my_df_ml['ligand_affinity_change'] == 0).sum()
    
--- a/scripts/ml/pnca_config.py
+++ b/scripts/ml/pnca_config.py
@@ -13,7 +13,7 @@ drug  = 'pyrazinamide'
 #total_mtblineage_uc = 8

 homedir = os.path.expanduser("~")
-os.chdir( homedir + '/git/ML_AI_training/')
+os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')

 #---------------------------
 # Version 1: no AAindex
@@ -30,7 +30,7 @@ from ml_data import *
 #from UQ_yc_RunAllClfs import run_all_ML

 # TT run all ML clfs: baseline mode
-from UQ_MultModelsCl import MultModelsCl
+from MultModelsCl import MultModelsCl

 #%%###########################################################################