Fixed aa_index: categorical values were creeping into the numerical cols

This commit is contained in:
Tanushree Tunstall 2022-06-16 17:47:00 +01:00
parent 89cbeb3610
commit c666c426c0
2 changed files with 708 additions and 659 deletions

View file

@ -5,7 +5,7 @@ Created on Sun Mar 6 13:41:54 2022
@author: tanu
"""
#def setvars(gene,drug):
def setvars(gene,drug):
#https://stackoverflow.com/questions/51695322/compare-multiple-algorithms-with-sklearn-pipeline
import os, sys
import pandas as pd
@ -70,7 +70,8 @@ geneL_na = ['gid']
geneL_na_ppi2 = ['rpob']
geneL_ppi2 = ['alr', 'embb', 'katg']
num_type = ['int64', 'float64']
#num_type = ['int64', 'float64']
num_type = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
cat_type = ['object', 'bool']
#==============
@ -101,9 +102,40 @@ mycols = my_features_df.columns
# File 2
#---------
infile_aaindex = outdir + 'aa_index/' + gene.lower() + '_aa.csv'
aaindex_df = pd.read_csv(infile_aaindex)
aaindex_df = pd.read_csv(infile_aaindex, index_col = 0)
aaindex_df.dtypes
#-----------
# check for non-numerical columns
#-----------
if any(aaindex_df.dtypes==object):
print('\naaindex_df contains non-numerical data')
aaindex_df_object = aaindex_df.select_dtypes(include = cat_type)
print('\nTotal no. of non-numerial columns:', len(aaindex_df_object.columns))
expected_aa_ncols = len(aaindex_df.columns) - len(aaindex_df_object.columns)
#-----------
# Extract numerical data only
#-----------
print('\nSelecting numerical data only')
aaindex_df = aaindex_df.select_dtypes(include = num_type)
#---------------------------
# aaindex: sanity check 1
#---------------------------
if len(aaindex_df.columns) == expected_aa_ncols:
print('\nPASS: successfully selected numerical columns only for aaindex_df')
else:
print('\nFAIL: Numbers mismatch'
, '\nExpected ncols:', expected_aa_ncols
, '\nGot:', len(aaindex_df.columns))
#---------------
# check for NA
#---------------
print('\nNow checking for NA in the remaining aaindex_cols')
c1 = aaindex_df.isna().sum()
c2 = c1.sort_values(ascending=False)
print('\nCounting aaindex_df cols with NA'
@ -126,9 +158,21 @@ else:
print('\nPASS: cols with NA successfully dropped from aaindex_df'
, '\nProceeding with combining aa_df with other features_df')
#---------------------------
# aaindex: sanity check 2
#---------------------------
expected_aa_ncols2 = len(aaindex_df.columns) - sum(c2>0)
if len(aa_df.columns) == expected_aa_ncols2:
print('\nPASS: ncols match'
, '\nExpected ncols:', expected_aa_ncols2
, '\nGot:', len(aa_df.columns))
else:
print('\nFAIL: Numbers mismatch'
, '\nExpected ncols:', expected_aa_ncols2
, '\nGot:', len(aa_df.columns))
# Important: need this to identify aaindex cols
aa_df_cols = aa_df.columns
aa_df_cols = aa_df_cols.drop(['mutationinformation'])
print('\nTotal no. of columns in clean aa_df:', len(aa_df_cols))
###############################################################################
@ -136,6 +180,7 @@ print('\nTotal no. of columns in clean aa_df:', len(aa_df_cols))
#===========================
# Merge my_df + aaindex_df
#===========================
if aa_df.columns[aa_df.columns.isin(my_features_df.columns)] == my_features_df.columns[my_features_df.columns.isin(aa_df.columns)]:
print('\nMerging on column: mutationinformation')
@ -147,12 +192,24 @@ else:
, '\nnrows my_df:', len(my_features_df)
, '\nnrows aa_df:', len(aa_df))
#-----------------
# Reset index: mutationinformation
# Very important for merging
#-----------------
aa_df = aa_df.reset_index()
expected_ncols = len(my_features_df.columns) + len(aa_df.columns) - 1 # for the no. of merging col
#-----------------
# Merge: my_features_df + aa_df
#-----------------
merged_df = pd.merge(my_features_df
, aa_df
, on = 'mutationinformation')
#---------------------------
# aaindex: sanity check 3
#---------------------------
if len(merged_df.columns) == expected_ncols:
print('\nPASS: my_features_df and aa_df successfully combined'
, '\nnrows:', len(merged_df)
@ -207,8 +264,6 @@ my_df['dist_lineage_proportion'] = my_df['lineage_count_unique']/total_mtblineag
###########################################################################
#%% Active site annotation column
# change from numeric to categorical
num_type = ['int64', 'float64']
cat_type = ['object', 'bool']
if my_df['active_site'].dtype in num_type:
my_df['active_site'] = my_df['active_site'].astype(object)
@ -368,21 +423,6 @@ else:
#==========================
my_df_ml = my_df.copy()
# # get index for the last column for my_features_df
# my_features_df_lcolname = my_features_df.columns[-1]
# my_features_df_lcolname_i = my_features_df.columns.get_loc(my_features_df_lcolname)
# # get index for the last column for merged_df i.e my_df i.e my_df_ml
# aa_df_lcolname = aa_df.columns[-1]
# aa_df = aa_df.columns.get_loc(aa_df_lcolname)
# aaindex_col_start = my_features_df_lcolname_i + 1
#==========================
# BLIND test set
#==========================
@ -478,8 +518,10 @@ print('\nTotal no. of features for aaindex:', len(X_aaindexFN))
# numerical feature names
# numerical_FN = common_cols_stabiltyN + foldX_cols + X_strFN + X_evolFN + X_genomicFN
#numerical_FN = X_ssFN + X_evolFN + X_genomicFN
numerical_FN = X_ssFN + X_evolFN + X_genomicFN + X_aaindexFN
#categorical feature names
categorical_FN = ['ss_class'
# , 'wt_prop_water'

View file

@ -15,9 +15,16 @@ drug = 'rifampicin'
homedir = os.path.expanduser("~")
os.chdir( homedir + '/git/ML_AI_training/')
from UQ_ML_data import *
#---------------------------
# Version 1: no AAindex
#from UQ_ML_data import *
#setvars(gene,drug)
#from UQ_ML_data import *
#---------------------------
from UQ_ML_data2 import *
setvars(gene,drug)
from UQ_ML_data import *
from UQ_ML_data2 import *
# from YC run_all_ML: run locally
#from UQ_yc_RunAllClfs import run_all_ML