fixed aa_index creeping categorical values in numerical cols
This commit is contained in:
parent
89cbeb3610
commit
c666c426c0
2 changed files with 708 additions and 659 deletions
|
@ -5,7 +5,7 @@ Created on Sun Mar 6 13:41:54 2022
|
|||
|
||||
@author: tanu
|
||||
"""
|
||||
#def setvars(gene,drug):
|
||||
def setvars(gene,drug):
|
||||
#https://stackoverflow.com/questions/51695322/compare-multiple-algorithms-with-sklearn-pipeline
|
||||
import os, sys
|
||||
import pandas as pd
|
||||
|
@ -70,7 +70,8 @@ geneL_na = ['gid']
|
|||
geneL_na_ppi2 = ['rpob']
|
||||
geneL_ppi2 = ['alr', 'embb', 'katg']
|
||||
|
||||
num_type = ['int64', 'float64']
|
||||
#num_type = ['int64', 'float64']
|
||||
num_type = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
|
||||
cat_type = ['object', 'bool']
|
||||
|
||||
#==============
|
||||
|
@ -101,9 +102,40 @@ mycols = my_features_df.columns
|
|||
# File 2
|
||||
#---------
|
||||
infile_aaindex = outdir + 'aa_index/' + gene.lower() + '_aa.csv'
|
||||
aaindex_df = pd.read_csv(infile_aaindex)
|
||||
aaindex_df = pd.read_csv(infile_aaindex, index_col = 0)
|
||||
aaindex_df.dtypes
|
||||
|
||||
#-----------
|
||||
# check for non-numerical columns
|
||||
#-----------
|
||||
if any(aaindex_df.dtypes==object):
|
||||
print('\naaindex_df contains non-numerical data')
|
||||
|
||||
aaindex_df_object = aaindex_df.select_dtypes(include = cat_type)
|
||||
print('\nTotal no. of non-numerial columns:', len(aaindex_df_object.columns))
|
||||
|
||||
expected_aa_ncols = len(aaindex_df.columns) - len(aaindex_df_object.columns)
|
||||
|
||||
#-----------
|
||||
# Extract numerical data only
|
||||
#-----------
|
||||
print('\nSelecting numerical data only')
|
||||
aaindex_df = aaindex_df.select_dtypes(include = num_type)
|
||||
|
||||
#---------------------------
|
||||
# aaindex: sanity check 1
|
||||
#---------------------------
|
||||
if len(aaindex_df.columns) == expected_aa_ncols:
|
||||
print('\nPASS: successfully selected numerical columns only for aaindex_df')
|
||||
else:
|
||||
print('\nFAIL: Numbers mismatch'
|
||||
, '\nExpected ncols:', expected_aa_ncols
|
||||
, '\nGot:', len(aaindex_df.columns))
|
||||
|
||||
#---------------
|
||||
# check for NA
|
||||
#---------------
|
||||
print('\nNow checking for NA in the remaining aaindex_cols')
|
||||
c1 = aaindex_df.isna().sum()
|
||||
c2 = c1.sort_values(ascending=False)
|
||||
print('\nCounting aaindex_df cols with NA'
|
||||
|
@ -126,9 +158,21 @@ else:
|
|||
print('\nPASS: cols with NA successfully dropped from aaindex_df'
|
||||
, '\nProceeding with combining aa_df with other features_df')
|
||||
|
||||
#---------------------------
|
||||
# aaindex: sanity check 2
|
||||
#---------------------------
|
||||
expected_aa_ncols2 = len(aaindex_df.columns) - sum(c2>0)
|
||||
if len(aa_df.columns) == expected_aa_ncols2:
|
||||
print('\nPASS: ncols match'
|
||||
, '\nExpected ncols:', expected_aa_ncols2
|
||||
, '\nGot:', len(aa_df.columns))
|
||||
else:
|
||||
print('\nFAIL: Numbers mismatch'
|
||||
, '\nExpected ncols:', expected_aa_ncols2
|
||||
, '\nGot:', len(aa_df.columns))
|
||||
|
||||
# Important: need this to identify aaindex cols
|
||||
aa_df_cols = aa_df.columns
|
||||
aa_df_cols = aa_df_cols.drop(['mutationinformation'])
|
||||
print('\nTotal no. of columns in clean aa_df:', len(aa_df_cols))
|
||||
|
||||
###############################################################################
|
||||
|
@ -136,6 +180,7 @@ print('\nTotal no. of columns in clean aa_df:', len(aa_df_cols))
|
|||
#===========================
|
||||
# Merge my_df + aaindex_df
|
||||
#===========================
|
||||
|
||||
if aa_df.columns[aa_df.columns.isin(my_features_df.columns)] == my_features_df.columns[my_features_df.columns.isin(aa_df.columns)]:
|
||||
print('\nMerging on column: mutationinformation')
|
||||
|
||||
|
@ -147,12 +192,24 @@ else:
|
|||
, '\nnrows my_df:', len(my_features_df)
|
||||
, '\nnrows aa_df:', len(aa_df))
|
||||
|
||||
#-----------------
|
||||
# Reset index: mutationinformation
|
||||
# Very important for merging
|
||||
#-----------------
|
||||
aa_df = aa_df.reset_index()
|
||||
|
||||
expected_ncols = len(my_features_df.columns) + len(aa_df.columns) - 1 # for the no. of merging col
|
||||
|
||||
#-----------------
|
||||
# Merge: my_features_df + aa_df
|
||||
#-----------------
|
||||
merged_df = pd.merge(my_features_df
|
||||
, aa_df
|
||||
, on = 'mutationinformation')
|
||||
|
||||
#---------------------------
|
||||
# aaindex: sanity check 3
|
||||
#---------------------------
|
||||
if len(merged_df.columns) == expected_ncols:
|
||||
print('\nPASS: my_features_df and aa_df successfully combined'
|
||||
, '\nnrows:', len(merged_df)
|
||||
|
@ -207,8 +264,6 @@ my_df['dist_lineage_proportion'] = my_df['lineage_count_unique']/total_mtblineag
|
|||
###########################################################################
|
||||
#%% Active site annotation column
|
||||
# change from numeric to categorical
|
||||
num_type = ['int64', 'float64']
|
||||
cat_type = ['object', 'bool']
|
||||
|
||||
if my_df['active_site'].dtype in num_type:
|
||||
my_df['active_site'] = my_df['active_site'].astype(object)
|
||||
|
@ -368,21 +423,6 @@ else:
|
|||
#==========================
|
||||
my_df_ml = my_df.copy()
|
||||
|
||||
|
||||
# # get index for the last column for my_features_df
|
||||
# my_features_df_lcolname = my_features_df.columns[-1]
|
||||
# my_features_df_lcolname_i = my_features_df.columns.get_loc(my_features_df_lcolname)
|
||||
|
||||
# # get index for the last column for merged_df i.e my_df i.e my_df_ml
|
||||
# aa_df_lcolname = aa_df.columns[-1]
|
||||
# aa_df = aa_df.columns.get_loc(aa_df_lcolname)
|
||||
|
||||
|
||||
|
||||
# aaindex_col_start = my_features_df_lcolname_i + 1
|
||||
|
||||
|
||||
|
||||
#==========================
|
||||
# BLIND test set
|
||||
#==========================
|
||||
|
@ -478,8 +518,10 @@ print('\nTotal no. of features for aaindex:', len(X_aaindexFN))
|
|||
# numerical feature names
|
||||
# numerical_FN = common_cols_stabiltyN + foldX_cols + X_strFN + X_evolFN + X_genomicFN
|
||||
|
||||
#numerical_FN = X_ssFN + X_evolFN + X_genomicFN
|
||||
numerical_FN = X_ssFN + X_evolFN + X_genomicFN + X_aaindexFN
|
||||
|
||||
|
||||
#categorical feature names
|
||||
categorical_FN = ['ss_class'
|
||||
# , 'wt_prop_water'
|
||||
|
|
|
@ -15,9 +15,16 @@ drug = 'rifampicin'
|
|||
homedir = os.path.expanduser("~")
|
||||
os.chdir( homedir + '/git/ML_AI_training/')
|
||||
|
||||
from UQ_ML_data import *
|
||||
#---------------------------
|
||||
# Version 1: no AAindex
|
||||
#from UQ_ML_data import *
|
||||
#setvars(gene,drug)
|
||||
#from UQ_ML_data import *
|
||||
#---------------------------
|
||||
|
||||
from UQ_ML_data2 import *
|
||||
setvars(gene,drug)
|
||||
from UQ_ML_data import *
|
||||
from UQ_ML_data2 import *
|
||||
|
||||
# from YC run_all_ML: run locally
|
||||
#from UQ_yc_RunAllClfs import run_all_ML
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue