179 lines
No EOL
6 KiB
Python
179 lines
No EOL
6 KiB
Python
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
"""
|
|
Created on Thu Feb 24 10:48:10 2022
|
|
|
|
@author: tanu
|
|
"""
|
|
###############################################################################
|
|
# questions:
|
|
# which data to use: merged_df3 or merged_df2
|
|
# which is the target? or_mychisq or drtype col
|
|
# scaling: can it be from -1 to 1?
|
|
# how to include the mutation information?
|
|
# 'wild_type', 'mutant', 'position'
|
|
# whether to log transform the af and or cols
|
|
# to allow mean mode values to be imputed for validation set
|
|
# whether to calculate mean, median accounting for NA or removing them?
|
|
|
|
# strategy:
|
|
# available data = X_train
|
|
# available data but NAN = validation_test
|
|
# test data: mut generated not in mcsm
|
|
|
|
###############################################################################
|
|
import os, sys
|
|
import re
|
|
from sklearn.datasets import load_boston
|
|
from sklearn import linear_model
|
|
from sklearn import preprocessing
|
|
import pandas as pd
|
|
import seaborn as sns
|
|
import matplotlib.pyplot as plt
|
|
import numpy as np
|
|
print(np.__version__)
|
|
print(pd.__version__)
|
|
#%% read data
|
|
# Switch into the data directory under the user's home so the relative
# CSV file name below resolves.
homedir = os.path.expanduser("~")
data_dir = homedir + "/git/ML_AI_training/test_data"
os.chdir(data_dir)

# this needs to be merged_df2 or merged_df3?
my_df = pd.read_csv("pnca_all_params.csv")

# Bare expression: only shows output when the cell is run interactively.
my_df.dtypes
my_df_cols = my_df.columns
|
|
|
|
# Columns excluded by exact name.
omit_cols1 = [
    'pdb_file',
    'seq_offset4pdb',
    'mut_3upper',
    'wild_pos',
    'wild_chain_pos',
    'chain',
    'wt_3upper',
    'consurf_colour',
    'consurf_colour_rev',
    'consurf_msa_data',
    'consurf_aa_variety',
    'snap2_accuracy_pc',
    'beta_logistic',
    'se_logistic',
    'zval_logisitc',  # spelling kept as-is; presumably matches the CSV header -- TODO confirm
    'pval_chisq',
    'log10_or_mychisq',
    'neglog_pval_fisher',
    'or_fisher',
    'wild_type',
    'mutant_type',
    'position',
    'ligand_id',
    'mutation',
    'ss',
    'ss_class',  # include it later?
    'contacts',
]

# Columns excluded by name pattern; str.contains treats each pattern as a
# regular expression searched anywhere in the column name.
all_cols = my_df.columns
pattern_hits = (
    all_cols.str.contains(".*ci_.*")
    | all_cols.str.contains(".*_scaled*")
    | all_cols.str.contains(".*_outcome*")
)
omit_cols2 = list(all_cols[pattern_hits])

# [WATCH:] just to test since these have negative values!
negative_hits = (
    all_cols.str.contains("electro_.*")
    | all_cols.str.contains("disulfide_.*")
    | all_cols.str.contains("hbonds_.*")
    | all_cols.str.contains("partcov_.*")
    | all_cols.str.contains("vdwclashes.*")
    | all_cols.str.contains("volumetric.*")
)
omit_cols3 = list(all_cols[negative_hits])

omit_cols = omit_cols1 + omit_cols2 + omit_cols3

# Keep every column whose name is not in the combined omit list.
keep_mask = ~all_cols.isin(omit_cols)
my_df_filt = my_df.loc[:, keep_mask]
my_df_filt_cols = my_df_filt.columns
|
|
|
|
# Fill NaNs column-wise with the column mean (my_df_filt2) or median
# (my_df_filt3). numeric_only=True restricts the statistic to numeric
# columns: my_df_filt still carries non-numeric columns, and on
# pandas >= 2.0 mean()/median() raise TypeError for those unless
# numeric_only is set.
my_df_filt2 = my_df_filt.fillna(my_df_filt.mean(numeric_only=True))
my_df_filt3 = my_df_filt.fillna(my_df_filt.median(numeric_only=True))

# Alternative: replace every NaN with 0 (used only for the summary below).
my_df_filt_noNA = my_df_filt.fillna(0)

summ = my_df_filt.describe()
summ_noNA = my_df_filt_noNA.describe()

# Frequency table of the raw or_mychisq values.
foo = my_df_filt['or_mychisq'].value_counts()
foo = foo.to_frame()

########################
# [WATCH]: Drop na
# Rows that still hold a NaN after median imputation are removed.
# .copy() gives an independent frame, guarding the column assignment
# below against pandas' SettingWithCopyWarning.
my_df2 = my_df_filt3.dropna().copy()

# Binary target: 0 when or_mychisq <= 1, otherwise 1.
# NOTE(review): or_mychisq was itself median-imputed above, so rows with a
# missing value receive a label derived from the median -- confirm this
# is intended.
my_df2['resistance'] = my_df2['or_mychisq'].apply(lambda x: 0 if x <= 1 else 1)
my_df2['resistance'].value_counts()
y = my_df2['resistance']
y.value_counts()
|
|
|
|
|
|
#%%============================================================================
|
|
# Rows whose mutation is absent from the training frame (my_df2) make up
# the validation set.
train_muts = my_df2['mutationinformation']

X_validation_muts = my_df['mutationinformation'][~my_df['mutationinformation'].isin(train_muts)]

not_in_train = ~my_df_filt3['mutationinformation'].isin(train_muts)
X_validation_all = my_df_filt3[not_in_train]

# Remove the target-related columns, then index the rows by mutation.
target_like = X_validation_all.columns.isin(['or_mychisq', 'resistance'])
X_validation_f = X_validation_all.loc[:, ~target_like]
X_validation = X_validation_f.set_index('mutationinformation')

#%% fill na in cols with mean value
X_validation.info()
X_validation.isna().any()

# Columns of the (un-indexed) validation frame that contain any NaN.
col_has_na = X_validation_f.isna().any()
na_colnames = X_validation_f.columns[col_has_na]
na_df = X_validation_f[na_colnames]
na_colsL = list(na_colnames)
|
|
|
|
#==============================================================================
|
|
# Feature frame: everything in my_df2 except the target-related columns.
omit_cols_y = ['or_mychisq', 'resistance']
feature_mask = ~my_df2.columns.isin(omit_cols_y)
my_df_ml = my_df2.loc[:, feature_mask]
#%%############################################################################
# Rows are indexed by mutation; y_train is the same object as y.
X_train = my_df_ml.set_index('mutationinformation')
#X_train = X_train.iloc[:,:4]
y_train = y
#X_train = X_train.dropna()
#y_train = y.dropna()

# check dim
X_train.shape
y_train.shape
|
|
|
|
|
|
###############################################################################
|
|
from sklearn.pipeline import Pipeline
|
|
from sklearn.linear_model import LogisticRegression
|
|
from sklearn.model_selection import cross_validate
|
|
from sklearn.metrics import make_scorer
|
|
from sklearn.metrics import accuracy_score, precision_score, recall_score
|
|
|
|
# Pipeline: min-max scale the features, then fit a logistic regression.
# class_weight=None applies no class re-weighting. The previous value
# 'unbalanced' is not an accepted option (dict, 'balanced' or None) and is
# rejected by parameter validation in recent scikit-learn releases.
model_logisP = Pipeline(steps=[
    ('preprocess', preprocessing.MinMaxScaler()),
    ('logis', LogisticRegression(class_weight=None)),
])

model_logisP.fit(X_train, y_train)
fitted_vals = model_logisP.predict(X_train)
fitted_vals

# gives the array of predictions
y_pred = model_logisP.predict(X_train)
y_pred2 = model_logisP.predict(X_validation)

# Training-set metrics. y_train must be paired with y_pred (predictions for
# X_train); y_pred2 holds predictions for X_validation, a different set of
# rows for which no labels are available in this script.
accuracy_score(y_train, y_pred)
precision_score(y_train, y_pred, pos_label=1)  # tp/(tp + fp)
recall_score(y_train, y_pred, pos_label=1)  # tp/(tp + fn)
|
|
|
|
|
|
################
|
|
acc = make_scorer(accuracy_score)


def precision(y_true, y_pred):
    """Return precision, tp / (tp + fp), with label 1 as the positive class."""
    score = precision_score(y_true, y_pred, pos_label=1)  #0
    return score


def recall(y_true, y_pred):
    """Return recall, tp / (tp + fn), with label 1 as the positive class."""
    score = recall_score(y_true, y_pred, pos_label=1)  #0
    return score


# Wrap the metric functions so cross_validate can use them as scorers.
prec = make_scorer(precision)
rec = make_scorer(recall)
|
|
# 10-fold cross-validation of the pipeline, scored on accuracy, precision
# and recall; only test-fold scores are returned.
cv_metrics = {'acc': acc, 'prec': prec, 'rec': rec}
output = cross_validate(
    model_logisP,
    X_train,
    y,
    scoring=cv_metrics,
    cv=10,
    return_train_score=False,
)

# Mean of every column returned by cross_validate, across the folds.
pd.DataFrame(output).mean()