loopity_loop_CALL
This commit is contained in:
parent
7aead2d4f4
commit
160053d361
5 changed files with 163 additions and 188 deletions
163
imports.py
163
imports.py
|
@ -27,6 +27,8 @@ from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoe
|
|||
from sklearn.metrics import make_scorer
|
||||
from sklearn.metrics import classification_report
|
||||
|
||||
from sklearn.metrics import average_precision_score
|
||||
|
||||
from sklearn.model_selection import cross_validate
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.model_selection import StratifiedKFold
|
||||
|
@ -43,6 +45,16 @@ import numpy as np
|
|||
print(np.__version__)
|
||||
print(pd.__version__)
|
||||
from statistics import mean, stdev, median, mode
|
||||
|
||||
from imblearn.over_sampling import RandomOverSampler
|
||||
from imblearn.over_sampling import SMOTE
|
||||
from imblearn.pipeline import Pipeline
|
||||
#from sklearn.datasets import make_classification
|
||||
from sklearn.model_selection import cross_validate
|
||||
from sklearn.model_selection import RepeatedStratifiedKFold
|
||||
from sklearn.ensemble import AdaBoostClassifier
|
||||
from imblearn.combine import SMOTEENN
|
||||
from imblearn.under_sampling import EditedNearestNeighbours
|
||||
#%%
|
||||
homedir = os.path.expanduser("~")
|
||||
os.chdir(homedir + "/git/ML_AI_training/")
|
||||
|
@ -52,8 +64,8 @@ from MultClassPipe import MultClassPipeline
|
|||
from MultClassPipe2 import MultClassPipeline2
|
||||
from loopity_loop import MultClassPipeSKF
|
||||
|
||||
gene = 'rpoB'
|
||||
drug = 'rifampicin'
|
||||
gene = 'pncA'
|
||||
drug = 'pyrazinamide'
|
||||
|
||||
#==============
|
||||
# directories
|
||||
|
@ -79,64 +91,33 @@ geneL_ppi2 = ['alr', 'embb', 'katg']
|
|||
#%% get cols
|
||||
mycols = my_df.columns
|
||||
|
||||
my_df['active_aa_pos'].dtype
|
||||
my_df['active_aa_pos'] = my_df['active_aa_pos'].astype(object)
|
||||
# change from numeric to object (categorical) dtype
|
||||
num_type = ['int64', 'float64']
|
||||
cat_type = ['object', 'bool']
|
||||
|
||||
if my_df['active_aa_pos'].dtype in num_type:
|
||||
my_df['active_aa_pos'] = my_df['active_aa_pos'].astype(object)
|
||||
my_df['active_aa_pos'].dtype
|
||||
|
||||
# FIXME: if this is not structural, remove from source..
|
||||
# Drop NA where numerical cols have them
|
||||
if gene.lower() in geneL_na_ppi2:
|
||||
x_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist']
|
||||
#D1148 get rid of
|
||||
na_index = my_df['mutationinformation'].index[my_df['mcsm_na_affinity'].apply(np.isnan)]
|
||||
my_df = my_df.drop(index=na_index)
|
||||
|
||||
# FIXME: either impute or remove!
|
||||
# for embb (L114M, F115L, V123L, V125I, V131M) delete for now
|
||||
if gene.lower() in ['embb']:
|
||||
na_index = my_df['mutationinformation'].index[my_df['ligand_distance'].apply(np.isnan)]
|
||||
my_df = my_df.drop(index=na_index)
|
||||
#%%============================================================================
|
||||
# GET Y
|
||||
|
||||
# Target1: mutation_info_labels
|
||||
dm_om_map = {'DM': 1, 'OM': 0}
|
||||
target1 = my_df['mutation_info_labels'].map(dm_om_map)
|
||||
target1.value_counts()
|
||||
|
||||
# Target2: drug
|
||||
drug_labels = drug + '_labels'
|
||||
drug_labels
|
||||
my_df[drug_labels] = my_df[drug].map({1: 'resistant', 0: 'sensitive'})
|
||||
my_df[drug_labels].value_counts()
|
||||
my_df[drug_labels] = my_df[drug_labels].fillna('unknown')
|
||||
my_df[drug_labels].value_counts()
|
||||
target2 = my_df[drug_labels]
|
||||
|
||||
# Target3: drtype [Binary]
|
||||
drtype_labels = 'drtype_labels'
|
||||
my_df[drtype_labels] = my_df['drtype'].map({'Sensitive' : 0
|
||||
, 'Other' : 0
|
||||
, 'Pre-MDR' : 1
|
||||
, 'MDR' : 1
|
||||
, 'Pre-XDR' : 1
|
||||
, 'XDR' : 1})
|
||||
# target3 = 'drtype' [Multinomial]
|
||||
target3 = my_df[drtype_labels]
|
||||
|
||||
# target4
|
||||
drtype_labels2 = 'drtype_labels2'
|
||||
my_df[drtype_labels2] = my_df['drtype'].map({'Sensitive' : 0
|
||||
, 'Other' : 0
|
||||
, 'Pre-MDR' : 1
|
||||
, 'MDR' : 1
|
||||
, 'Pre-XDR' : 2
|
||||
, 'XDR' : 2})
|
||||
target4 = my_df[drtype_labels2]
|
||||
|
||||
# sanity checks
|
||||
target1.value_counts()
|
||||
my_df['mutation_info_labels'].value_counts()
|
||||
|
||||
target2.value_counts()
|
||||
my_df[drug_labels].value_counts()
|
||||
|
||||
target3.value_counts()
|
||||
my_df['drtype'].value_counts()
|
||||
target4.value_counts()
|
||||
my_df['drtype'].value_counts()
|
||||
# Target1: mutation_info_labels, convert to binary class (DM: 1, OM: 0)
|
||||
dm_om_map = {'DM': 1, 'OM': 0} # pnca, OM is minority, other genes: DM is minority
|
||||
my_df['mutation_class'] = my_df['mutation_info_labels'].map(dm_om_map)
|
||||
my_df['mutation_class'].value_counts()
|
||||
my_df['mutation_info_labels']. value_counts()
|
||||
|
||||
#%%
|
||||
# GET X
|
||||
|
@ -159,10 +140,7 @@ if gene.lower() in geneL_na:
|
|||
|
||||
if gene.lower() in geneL_na_ppi2:
|
||||
x_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist']
|
||||
#D1148 get rid of
|
||||
na_index = my_df['mutationinformation'].index[my_df['mcsm_na_affinity'].apply(np.isnan)]
|
||||
my_df = my_df.drop(index=na_index)
|
||||
|
||||
|
||||
X_strFN = ['asa'
|
||||
, 'rsa'
|
||||
, 'kd_values'
|
||||
|
@ -172,53 +150,48 @@ X_evolFN = ['consurf_score'
|
|||
, 'snap2_score'
|
||||
, 'snap2_accuracy_pc']
|
||||
|
||||
# TODO: ADD ED values
|
||||
# Problematic due to NA: filling NA with unknown or string will make it categorical
|
||||
# OPTIONS
|
||||
# 1. Imputing: KNN or MICE or from distribution
|
||||
# 2. Fill na with median or mode
|
||||
# 3. Separate dataset without including genomic features AT ALL for ML, then using this as a 'blind test set'
|
||||
# this means the size of the training data gets reduced!
|
||||
# 4. Remove genomic features from ML COMPLETELY!
|
||||
|
||||
# X_genomicFN = ['af'
|
||||
# , 'or_mychisq'
|
||||
# , 'or_logistic'
|
||||
# , 'or_fisher'
|
||||
# , 'pval_fisher']
|
||||
|
||||
#%% try combinations
|
||||
X_vars1 = my_df[x_stabilityN]
|
||||
X_vars2 = my_df[X_strFN]
|
||||
X_vars3 = my_df[X_evolFN]
|
||||
|
||||
X_vars5 = my_df[x_stabilityN + X_strFN]
|
||||
X_vars6 = my_df[x_stabilityN + X_evolFN]
|
||||
#X_vars7 = my_df[x_stabilityN + X_genomicFN]
|
||||
X_vars8 = my_df[X_strFN + X_evolFN]
|
||||
#X_vars9 = my_df[X_strFN + X_genomicFN]
|
||||
#X_vars10 = my_df[X_evolFN + X_genomicFN]
|
||||
X_vars11 = my_df[x_stabilityN + X_strFN + X_evolFN]
|
||||
#X_vars12 = my_df[x_stabilityN + X_strFN + X_evolFN + X_genomicFN]
|
||||
|
||||
numerical_features_names = x_stabilityN + X_strFN + X_evolFN
|
||||
#%% Construct numerical and categorical column names
|
||||
numerical_FN = x_stabilityN + X_strFN + X_evolFN
|
||||
|
||||
# separate ones for foldx?
|
||||
categorical_features_names = ['ss_class'
|
||||
, 'wt_prop_water'
|
||||
# , 'lineage_labels' # misleading if using merged_df3
|
||||
, 'mut_prop_water'
|
||||
, 'wt_prop_polarity'
|
||||
, 'mut_prop_polarity'
|
||||
, 'wt_calcprop'
|
||||
, 'mut_calcprop'
|
||||
, 'active_aa_pos']
|
||||
categorical_FN = ['ss_class'
|
||||
, 'wt_prop_water'
|
||||
# , 'lineage_labels' # misleading if using merged_df3
|
||||
, 'mut_prop_water'
|
||||
, 'wt_prop_polarity'
|
||||
, 'mut_prop_polarity'
|
||||
, 'wt_calcprop'
|
||||
, 'mut_calcprop'
|
||||
, 'active_aa_pos']
|
||||
|
||||
numerical_features_df = my_df[numerical_features_names]
|
||||
numerical_features_df.shape
|
||||
#%% extracting dfs based on numerical, categorical column names
|
||||
#----------------------------------
|
||||
# WITHOUT the target var included
|
||||
#----------------------------------
|
||||
num_df = my_df[numerical_FN]
|
||||
num_df.shape
|
||||
|
||||
categorical_features_df = my_df[categorical_features_names]
|
||||
categorical_features_df.shape
|
||||
cat_df = my_df[categorical_FN]
|
||||
cat_df.shape
|
||||
|
||||
all_features_df = my_df[numerical_features_names + categorical_features_names]
|
||||
all_features_df.shape
|
||||
all_df = my_df[numerical_FN + categorical_FN]
|
||||
all_df.shape
|
||||
|
||||
#------------------------------
|
||||
# WITH the target var included:
|
||||
#'wtgt': with target
|
||||
#------------------------------
|
||||
num_df_wtgt = my_df[numerical_FN + ['mutation_class']]
|
||||
num_df_wtgt.shape
|
||||
|
||||
cat_df_wtgt = my_df[categorical_FN + ['mutation_class']]
|
||||
cat_df_wtgt.shape
|
||||
|
||||
all_df_wtgt = my_df[numerical_FN + categorical_FN + ['mutation_class']]
|
||||
all_df_wtgt.shape
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue