fixed aa_index creeping categorical values in numerical cols
This commit is contained in:
parent
89cbeb3610
commit
c666c426c0
2 changed files with 708 additions and 659 deletions
|
@ -5,7 +5,7 @@ Created on Sun Mar 6 13:41:54 2022
|
||||||
|
|
||||||
@author: tanu
|
@author: tanu
|
||||||
"""
|
"""
|
||||||
#def setvars(gene,drug):
|
def setvars(gene,drug):
|
||||||
#https://stackoverflow.com/questions/51695322/compare-multiple-algorithms-with-sklearn-pipeline
|
#https://stackoverflow.com/questions/51695322/compare-multiple-algorithms-with-sklearn-pipeline
|
||||||
import os, sys
|
import os, sys
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
@ -70,7 +70,8 @@ geneL_na = ['gid']
|
||||||
geneL_na_ppi2 = ['rpob']
|
geneL_na_ppi2 = ['rpob']
|
||||||
geneL_ppi2 = ['alr', 'embb', 'katg']
|
geneL_ppi2 = ['alr', 'embb', 'katg']
|
||||||
|
|
||||||
num_type = ['int64', 'float64']
|
#num_type = ['int64', 'float64']
|
||||||
|
num_type = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
|
||||||
cat_type = ['object', 'bool']
|
cat_type = ['object', 'bool']
|
||||||
|
|
||||||
#==============
|
#==============
|
||||||
|
@ -101,9 +102,40 @@ mycols = my_features_df.columns
|
||||||
# File 2
|
# File 2
|
||||||
#---------
|
#---------
|
||||||
infile_aaindex = outdir + 'aa_index/' + gene.lower() + '_aa.csv'
|
infile_aaindex = outdir + 'aa_index/' + gene.lower() + '_aa.csv'
|
||||||
aaindex_df = pd.read_csv(infile_aaindex)
|
aaindex_df = pd.read_csv(infile_aaindex, index_col = 0)
|
||||||
aaindex_df.dtypes
|
aaindex_df.dtypes
|
||||||
|
|
||||||
|
#-----------
|
||||||
|
# check for non-numerical columns
|
||||||
|
#-----------
|
||||||
|
if any(aaindex_df.dtypes==object):
|
||||||
|
print('\naaindex_df contains non-numerical data')
|
||||||
|
|
||||||
|
aaindex_df_object = aaindex_df.select_dtypes(include = cat_type)
|
||||||
|
print('\nTotal no. of non-numerial columns:', len(aaindex_df_object.columns))
|
||||||
|
|
||||||
|
expected_aa_ncols = len(aaindex_df.columns) - len(aaindex_df_object.columns)
|
||||||
|
|
||||||
|
#-----------
|
||||||
|
# Extract numerical data only
|
||||||
|
#-----------
|
||||||
|
print('\nSelecting numerical data only')
|
||||||
|
aaindex_df = aaindex_df.select_dtypes(include = num_type)
|
||||||
|
|
||||||
|
#---------------------------
|
||||||
|
# aaindex: sanity check 1
|
||||||
|
#---------------------------
|
||||||
|
if len(aaindex_df.columns) == expected_aa_ncols:
|
||||||
|
print('\nPASS: successfully selected numerical columns only for aaindex_df')
|
||||||
|
else:
|
||||||
|
print('\nFAIL: Numbers mismatch'
|
||||||
|
, '\nExpected ncols:', expected_aa_ncols
|
||||||
|
, '\nGot:', len(aaindex_df.columns))
|
||||||
|
|
||||||
|
#---------------
|
||||||
|
# check for NA
|
||||||
|
#---------------
|
||||||
|
print('\nNow checking for NA in the remaining aaindex_cols')
|
||||||
c1 = aaindex_df.isna().sum()
|
c1 = aaindex_df.isna().sum()
|
||||||
c2 = c1.sort_values(ascending=False)
|
c2 = c1.sort_values(ascending=False)
|
||||||
print('\nCounting aaindex_df cols with NA'
|
print('\nCounting aaindex_df cols with NA'
|
||||||
|
@ -126,9 +158,21 @@ else:
|
||||||
print('\nPASS: cols with NA successfully dropped from aaindex_df'
|
print('\nPASS: cols with NA successfully dropped from aaindex_df'
|
||||||
, '\nProceeding with combining aa_df with other features_df')
|
, '\nProceeding with combining aa_df with other features_df')
|
||||||
|
|
||||||
|
#---------------------------
|
||||||
|
# aaindex: sanity check 2
|
||||||
|
#---------------------------
|
||||||
|
expected_aa_ncols2 = len(aaindex_df.columns) - sum(c2>0)
|
||||||
|
if len(aa_df.columns) == expected_aa_ncols2:
|
||||||
|
print('\nPASS: ncols match'
|
||||||
|
, '\nExpected ncols:', expected_aa_ncols2
|
||||||
|
, '\nGot:', len(aa_df.columns))
|
||||||
|
else:
|
||||||
|
print('\nFAIL: Numbers mismatch'
|
||||||
|
, '\nExpected ncols:', expected_aa_ncols2
|
||||||
|
, '\nGot:', len(aa_df.columns))
|
||||||
|
|
||||||
# Important: need this to identify aaindex cols
|
# Important: need this to identify aaindex cols
|
||||||
aa_df_cols = aa_df.columns
|
aa_df_cols = aa_df.columns
|
||||||
aa_df_cols = aa_df_cols.drop(['mutationinformation'])
|
|
||||||
print('\nTotal no. of columns in clean aa_df:', len(aa_df_cols))
|
print('\nTotal no. of columns in clean aa_df:', len(aa_df_cols))
|
||||||
|
|
||||||
###############################################################################
|
###############################################################################
|
||||||
|
@ -136,6 +180,7 @@ print('\nTotal no. of columns in clean aa_df:', len(aa_df_cols))
|
||||||
#===========================
|
#===========================
|
||||||
# Merge my_df + aaindex_df
|
# Merge my_df + aaindex_df
|
||||||
#===========================
|
#===========================
|
||||||
|
|
||||||
if aa_df.columns[aa_df.columns.isin(my_features_df.columns)] == my_features_df.columns[my_features_df.columns.isin(aa_df.columns)]:
|
if aa_df.columns[aa_df.columns.isin(my_features_df.columns)] == my_features_df.columns[my_features_df.columns.isin(aa_df.columns)]:
|
||||||
print('\nMerging on column: mutationinformation')
|
print('\nMerging on column: mutationinformation')
|
||||||
|
|
||||||
|
@ -147,12 +192,24 @@ else:
|
||||||
, '\nnrows my_df:', len(my_features_df)
|
, '\nnrows my_df:', len(my_features_df)
|
||||||
, '\nnrows aa_df:', len(aa_df))
|
, '\nnrows aa_df:', len(aa_df))
|
||||||
|
|
||||||
|
#-----------------
|
||||||
|
# Reset index: mutationinformation
|
||||||
|
# Very important for merging
|
||||||
|
#-----------------
|
||||||
|
aa_df = aa_df.reset_index()
|
||||||
|
|
||||||
expected_ncols = len(my_features_df.columns) + len(aa_df.columns) - 1 # for the no. of merging col
|
expected_ncols = len(my_features_df.columns) + len(aa_df.columns) - 1 # for the no. of merging col
|
||||||
|
|
||||||
|
#-----------------
|
||||||
|
# Merge: my_features_df + aa_df
|
||||||
|
#-----------------
|
||||||
merged_df = pd.merge(my_features_df
|
merged_df = pd.merge(my_features_df
|
||||||
, aa_df
|
, aa_df
|
||||||
, on = 'mutationinformation')
|
, on = 'mutationinformation')
|
||||||
|
|
||||||
|
#---------------------------
|
||||||
|
# aaindex: sanity check 3
|
||||||
|
#---------------------------
|
||||||
if len(merged_df.columns) == expected_ncols:
|
if len(merged_df.columns) == expected_ncols:
|
||||||
print('\nPASS: my_features_df and aa_df successfully combined'
|
print('\nPASS: my_features_df and aa_df successfully combined'
|
||||||
, '\nnrows:', len(merged_df)
|
, '\nnrows:', len(merged_df)
|
||||||
|
@ -207,8 +264,6 @@ my_df['dist_lineage_proportion'] = my_df['lineage_count_unique']/total_mtblineag
|
||||||
###########################################################################
|
###########################################################################
|
||||||
#%% Active site annotation column
|
#%% Active site annotation column
|
||||||
# change from numeric to categorical
|
# change from numeric to categorical
|
||||||
num_type = ['int64', 'float64']
|
|
||||||
cat_type = ['object', 'bool']
|
|
||||||
|
|
||||||
if my_df['active_site'].dtype in num_type:
|
if my_df['active_site'].dtype in num_type:
|
||||||
my_df['active_site'] = my_df['active_site'].astype(object)
|
my_df['active_site'] = my_df['active_site'].astype(object)
|
||||||
|
@ -368,21 +423,6 @@ else:
|
||||||
#==========================
|
#==========================
|
||||||
my_df_ml = my_df.copy()
|
my_df_ml = my_df.copy()
|
||||||
|
|
||||||
|
|
||||||
# # get index for the last column for my_features_df
|
|
||||||
# my_features_df_lcolname = my_features_df.columns[-1]
|
|
||||||
# my_features_df_lcolname_i = my_features_df.columns.get_loc(my_features_df_lcolname)
|
|
||||||
|
|
||||||
# # get index for the last column for merged_df i.e my_df i.e my_df_ml
|
|
||||||
# aa_df_lcolname = aa_df.columns[-1]
|
|
||||||
# aa_df = aa_df.columns.get_loc(aa_df_lcolname)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# aaindex_col_start = my_features_df_lcolname_i + 1
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#==========================
|
#==========================
|
||||||
# BLIND test set
|
# BLIND test set
|
||||||
#==========================
|
#==========================
|
||||||
|
@ -478,8 +518,10 @@ print('\nTotal no. of features for aaindex:', len(X_aaindexFN))
|
||||||
# numerical feature names
|
# numerical feature names
|
||||||
# numerical_FN = common_cols_stabiltyN + foldX_cols + X_strFN + X_evolFN + X_genomicFN
|
# numerical_FN = common_cols_stabiltyN + foldX_cols + X_strFN + X_evolFN + X_genomicFN
|
||||||
|
|
||||||
|
#numerical_FN = X_ssFN + X_evolFN + X_genomicFN
|
||||||
numerical_FN = X_ssFN + X_evolFN + X_genomicFN + X_aaindexFN
|
numerical_FN = X_ssFN + X_evolFN + X_genomicFN + X_aaindexFN
|
||||||
|
|
||||||
|
|
||||||
#categorical feature names
|
#categorical feature names
|
||||||
categorical_FN = ['ss_class'
|
categorical_FN = ['ss_class'
|
||||||
# , 'wt_prop_water'
|
# , 'wt_prop_water'
|
||||||
|
|
|
@ -15,9 +15,16 @@ drug = 'rifampicin'
|
||||||
homedir = os.path.expanduser("~")
|
homedir = os.path.expanduser("~")
|
||||||
os.chdir( homedir + '/git/ML_AI_training/')
|
os.chdir( homedir + '/git/ML_AI_training/')
|
||||||
|
|
||||||
from UQ_ML_data import *
|
#---------------------------
|
||||||
|
# Version 1: no AAindex
|
||||||
|
#from UQ_ML_data import *
|
||||||
|
#setvars(gene,drug)
|
||||||
|
#from UQ_ML_data import *
|
||||||
|
#---------------------------
|
||||||
|
|
||||||
|
from UQ_ML_data2 import *
|
||||||
setvars(gene,drug)
|
setvars(gene,drug)
|
||||||
from UQ_ML_data import *
|
from UQ_ML_data2 import *
|
||||||
|
|
||||||
# from YC run_all_ML: run locally
|
# from YC run_all_ML: run locally
|
||||||
#from UQ_yc_RunAllClfs import run_all_ML
|
#from UQ_yc_RunAllClfs import run_all_ML
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue