Fix aaindex_df: keep only numerical columns so categorical values no longer creep into the numerical features

2022-06-16 17:47:00 +01:00 · 2022-06-16 17:47:00 +01:00 · c666c426c0
commit c666c426c0
parent 89cbeb3610
2 changed files with 708 additions and 659 deletions
--- a/UQ_ML_data2.py
+++ b/UQ_ML_data2.py
@ -5,7 +5,7 @@ Created on Sun Mar  6 13:41:54 2022

@author: tanu
 """
-#def setvars(gene,drug):
+def setvars(gene,drug):
    #https://stackoverflow.com/questions/51695322/compare-multiple-algorithms-with-sklearn-pipeline
    import os, sys
    import pandas as pd
@ -70,7 +70,8 @@ geneL_na        = ['gid']
    geneL_na_ppi2   = ['rpob']
    geneL_ppi2      = ['alr', 'embb', 'katg']
    
-num_type = ['int64', 'float64']
+    #num_type = ['int64', 'float64']
+    num_type = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    cat_type = ['object', 'bool']
    
    #==============
@ -101,9 +102,40 @@ mycols = my_features_df.columns
    # File 2
    #---------
    infile_aaindex = outdir + 'aa_index/' + gene.lower() + '_aa.csv' 
-aaindex_df = pd.read_csv(infile_aaindex) 
+    aaindex_df = pd.read_csv(infile_aaindex, index_col = 0) 
    aaindex_df.dtypes
    
+    #-----------
+    # check for non-numerical columns
+    #-----------
+    if any(aaindex_df.dtypes==object):
+        print('\naaindex_df contains non-numerical data')
+    
+    aaindex_df_object = aaindex_df.select_dtypes(include = cat_type)
+    print('\nTotal no. of non-numerial columns:', len(aaindex_df_object.columns))
+    
+    expected_aa_ncols = len(aaindex_df.columns) - len(aaindex_df_object.columns)
+
+    #-----------
+    # Extract numerical data only
+    #-----------
+    print('\nSelecting numerical data only')
+    aaindex_df = aaindex_df.select_dtypes(include = num_type)
+
+    #---------------------------
+    # aaindex: sanity check 1
+    #---------------------------
+    if len(aaindex_df.columns) == expected_aa_ncols:
+        print('\nPASS: successfully selected numerical columns only for aaindex_df')
+    else:
+        print('\nFAIL: Numbers mismatch'
+              , '\nExpected ncols:', expected_aa_ncols
+              , '\nGot:', len(aaindex_df.columns))    
+        
+    #---------------
+    # check for NA
+    #---------------
+    print('\nNow checking for NA in the remaining aaindex_cols')
    c1 = aaindex_df.isna().sum()
    c2 = c1.sort_values(ascending=False)
    print('\nCounting aaindex_df cols with NA'
@ -126,9 +158,21 @@ else:
        print('\nPASS: cols with NA successfully dropped from aaindex_df'
              , '\nProceeding with combining aa_df with other features_df')
        
+    #---------------------------
+    # aaindex: sanity check 2
+    #---------------------------
+    expected_aa_ncols2 =  len(aaindex_df.columns) - sum(c2>0)  
+    if len(aa_df.columns) == expected_aa_ncols2:
+        print('\nPASS: ncols match'
+              , '\nExpected ncols:', expected_aa_ncols2
+              , '\nGot:', len(aa_df.columns))
+    else:
+        print('\nFAIL: Numbers mismatch'
+              , '\nExpected ncols:', expected_aa_ncols2
+              , '\nGot:', len(aa_df.columns))            
+        
    # Important: need this to identify aaindex cols    
    aa_df_cols = aa_df.columns
-aa_df_cols = aa_df_cols.drop(['mutationinformation'])
    print('\nTotal no. of columns in clean aa_df:', len(aa_df_cols))
    
    ###############################################################################
@ -136,6 +180,7 @@ print('\nTotal no. of columns in clean aa_df:', len(aa_df_cols))
    #===========================
    # Merge my_df + aaindex_df
    #===========================
+    
    if aa_df.columns[aa_df.columns.isin(my_features_df.columns)] == my_features_df.columns[my_features_df.columns.isin(aa_df.columns)]:
        print('\nMerging on column: mutationinformation')   
    
@ -147,12 +192,24 @@ else:
              , '\nnrows my_df:', len(my_features_df)
              , '\nnrows aa_df:', len(aa_df))
               
+    #-----------------
+    # Reset index: mutationinformation
+    # Very important for merging
+    #-----------------
+    aa_df = aa_df.reset_index()
+    
    expected_ncols = len(my_features_df.columns) + len(aa_df.columns) - 1 # for the no. of merging col

+    #-----------------
+    # Merge: my_features_df + aa_df
+    #-----------------
    merged_df = pd.merge(my_features_df
                         , aa_df
                         , on = 'mutationinformation')
    
+    #---------------------------
+    # aaindex: sanity check 3
+    #---------------------------
    if len(merged_df.columns) == expected_ncols:
        print('\nPASS: my_features_df and aa_df successfully combined'
              , '\nnrows:', len(merged_df)
@ -207,8 +264,6 @@ my_df['dist_lineage_proportion'] = my_df['lineage_count_unique']/total_mtblineag
    ###########################################################################
    #%% Active site annotation column
    # change from numberic to categorical
-num_type = ['int64', 'float64']
-cat_type = ['object', 'bool']

    if my_df['active_site'].dtype in num_type:
        my_df['active_site'] = my_df['active_site'].astype(object)
@ -368,21 +423,6 @@ else:
    #==========================
    my_df_ml = my_df.copy()
        
-
-# # get index for the last column for my_features_df
-# my_features_df_lcolname = my_features_df.columns[-1]
-# my_features_df_lcolname_i = my_features_df.columns.get_loc(my_features_df_lcolname)
-
-# # get index for the last column for merged_df i.e my_df i.e my_df_ml
-# aa_df_lcolname = aa_df.columns[-1]
-# aa_df = aa_df.columns.get_loc(aa_df_lcolname)
-
-
-
-# aaindex_col_start = my_features_df_lcolname_i + 1
-
-
-
    #==========================
    #     BLIND test set
    #==========================
@ -478,8 +518,10 @@ print('\nTotal no. of features for aaindex:', len(X_aaindexFN))
    # numerical feature names
    #    numerical_FN = common_cols_stabiltyN + foldX_cols + X_strFN + X_evolFN + X_genomicFN 
    
+    #numerical_FN = X_ssFN  + X_evolFN + X_genomicFN
    numerical_FN = X_ssFN  + X_evolFN + X_genomicFN + X_aaindexFN

+    
    #categorical feature names
    categorical_FN = ['ss_class'
                # , 'wt_prop_water'
--- a/rpob_config.py
+++ b/rpob_config.py
@ -15,9 +15,16 @@ drug  = 'rifampicin'
 homedir = os.path.expanduser("~")
 os.chdir( homedir + '/git/ML_AI_training/')

-from UQ_ML_data import *
+#---------------------------
+# Version 1: no AAindex
+#from UQ_ML_data import *
+#setvars(gene,drug)
+#from UQ_ML_data import *
+#---------------------------
+
+from UQ_ML_data2 import *
 setvars(gene,drug)
-from UQ_ML_data import *
+from UQ_ML_data2 import *

 # from YC run_all_ML: run locally
 #from UQ_yc_RunAllClfs import run_all_ML