fixed aa_index creeping categorical values in numerical cols

2022-06-16 17:47:00 +01:00 · 2022-06-16 17:47:00 +01:00 · c666c426c0
commit c666c426c0
parent 89cbeb3610
2 changed files with 708 additions and 659 deletions
--- a/UQ_ML_data2.py
+++ b/UQ_ML_data2.py
@ -5,7 +5,7 @@ Created on Sun Mar  6 13:41:54 2022
@author: tanu
 """
-#def setvars(gene,drug):
+def setvars(gene,drug):
    #https://stackoverflow.com/questions/51695322/compare-multiple-algorithms-with-sklearn-pipeline
    import os, sys
    import pandas as pd
@ -70,7 +70,8 @@ geneL_na        = ['gid']
    geneL_na_ppi2   = ['rpob']
    geneL_ppi2      = ['alr', 'embb', 'katg']
-num_type = ['int64', 'float64']
+    #num_type = ['int64', 'float64']
    num_type = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    cat_type = ['object', 'bool']
    #==============
@ -101,9 +102,40 @@ mycols = my_features_df.columns
    # File 2
    #---------
    infile_aaindex = outdir + 'aa_index/' + gene.lower() + '_aa.csv' 
-aaindex_df = pd.read_csv(infile_aaindex) 
+    aaindex_df = pd.read_csv(infile_aaindex, index_col = 0) 
    aaindex_df.dtypes
    #-----------
    # check for non-numerical columns
    #-----------
    if any(aaindex_df.dtypes==object):
        print('\naaindex_df contains non-numerical data')
    aaindex_df_object = aaindex_df.select_dtypes(include = cat_type)
    print('\nTotal no. of non-numerial columns:', len(aaindex_df_object.columns))
    expected_aa_ncols = len(aaindex_df.columns) - len(aaindex_df_object.columns)
    #-----------
    # Extract numerical data only
    #-----------
    print('\nSelecting numerical data only')
    aaindex_df = aaindex_df.select_dtypes(include = num_type)
    #---------------------------
    # aaindex: sanity check 1
    #---------------------------
    if len(aaindex_df.columns) == expected_aa_ncols:
        print('\nPASS: successfully selected numerical columns only for aaindex_df')
    else:
        print('\nFAIL: Numbers mismatch'
              , '\nExpected ncols:', expected_aa_ncols
              , '\nGot:', len(aaindex_df.columns))    
    #---------------
    # check for NA
    #---------------
    print('\nNow checking for NA in the remaining aaindex_cols')
    c1 = aaindex_df.isna().sum()
    c2 = c1.sort_values(ascending=False)
    print('\nCounting aaindex_df cols with NA'
@ -126,9 +158,21 @@ else:
        print('\nPASS: cols with NA successfully dropped from aaindex_df'
              , '\nProceeding with combining aa_df with other features_df')
    #---------------------------
    # aaindex: sanity check 2
    #---------------------------
    expected_aa_ncols2 =  len(aaindex_df.columns) - sum(c2>0)  
    if len(aa_df.columns) == expected_aa_ncols2:
        print('\nPASS: ncols match'
              , '\nExpected ncols:', expected_aa_ncols2
              , '\nGot:', len(aa_df.columns))
    else:
        print('\nFAIL: Numbers mismatch'
              , '\nExpected ncols:', expected_aa_ncols2
              , '\nGot:', len(aa_df.columns))            
    # Important: need this to identify aaindex cols    
    aa_df_cols = aa_df.columns
 aa_df_cols = aa_df_cols.drop(['mutationinformation'])
    print('\nTotal no. of columns in clean aa_df:', len(aa_df_cols))
    ###############################################################################
@ -136,6 +180,7 @@ print('\nTotal no. of columns in clean aa_df:', len(aa_df_cols))
    #===========================
    # Merge my_df + aaindex_df
    #===========================
    if aa_df.columns[aa_df.columns.isin(my_features_df.columns)] == my_features_df.columns[my_features_df.columns.isin(aa_df.columns)]:
        print('\nMerging on column: mutationinformation')   
@ -147,12 +192,24 @@ else:
              , '\nnrows my_df:', len(my_features_df)
              , '\nnrows aa_df:', len(aa_df))
    #-----------------
    # Reset index: mutationinformation
    # Very important for merging
    #-----------------
    aa_df = aa_df.reset_index()
    expected_ncols = len(my_features_df.columns) + len(aa_df.columns) - 1 # for the no. of merging col
    #-----------------
    # Merge: my_features_df + aa_df
    #-----------------
    merged_df = pd.merge(my_features_df
                         , aa_df
                         , on = 'mutationinformation')
    #---------------------------
    # aaindex: sanity check 3
    #---------------------------
    if len(merged_df.columns) == expected_ncols:
        print('\nPASS: my_features_df and aa_df successfully combined'
              , '\nnrows:', len(merged_df)
@ -207,8 +264,6 @@ my_df['dist_lineage_proportion'] = my_df['lineage_count_unique']/total_mtblineag
    ###########################################################################
    #%% Active site annotation column
    # change from numeric to categorical
 num_type = ['int64', 'float64']
 cat_type = ['object', 'bool']
    if my_df['active_site'].dtype in num_type:
        my_df['active_site'] = my_df['active_site'].astype(object)
@ -368,21 +423,6 @@ else:
    #==========================
    my_df_ml = my_df.copy()
 # # get index for the last column for my_features_df
 # my_features_df_lcolname = my_features_df.columns[-1]
 # my_features_df_lcolname_i = my_features_df.columns.get_loc(my_features_df_lcolname)
 # # get index for the last column for merged_df i.e my_df i.e my_df_ml
 # aa_df_lcolname = aa_df.columns[-1]
 # aa_df = aa_df.columns.get_loc(aa_df_lcolname)
 # aaindex_col_start = my_features_df_lcolname_i + 1
    #==========================
    #     BLIND test set
    #==========================
@ -478,8 +518,10 @@ print('\nTotal no. of features for aaindex:', len(X_aaindexFN))
    # numerical feature names
    #    numerical_FN = common_cols_stabiltyN + foldX_cols + X_strFN + X_evolFN + X_genomicFN 
    #numerical_FN = X_ssFN  + X_evolFN + X_genomicFN
    numerical_FN = X_ssFN  + X_evolFN + X_genomicFN + X_aaindexFN
    #categorical feature names
    categorical_FN = ['ss_class'
                # , 'wt_prop_water'
--- a/rpob_config.py
+++ b/rpob_config.py
@ -15,9 +15,16 @@ drug  = 'rifampicin'
 homedir = os.path.expanduser("~")
 os.chdir( homedir + '/git/ML_AI_training/')
-from UQ_ML_data import *
+#---------------------------
 # Version 1: no AAindex
 #from UQ_ML_data import *
 #setvars(gene,drug)
 #from UQ_ML_data import *
 #---------------------------
 from UQ_ML_data2 import *
 setvars(gene,drug)
-from UQ_ML_data import *
+from UQ_ML_data2 import *
 # from YC run_all_ML: run locally
 #from UQ_yc_RunAllClfs import run_all_ML