renamed hyperparams to gscv
This commit is contained in:
parent
a82358dbb4
commit
ad5ebad7f8
31 changed files with 4433 additions and 0 deletions
212
earlier_versions/my_datap1.py
Normal file
212
earlier_versions/my_datap1.py
Normal file
|
@ -0,0 +1,212 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Thu Feb 24 10:48:10 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
###############################################################################
|
||||
# questions:
|
||||
# which data to use: merged_df3 or merged_df2
|
||||
# which is the target? or_mychisq or drtype col
|
||||
# scaling: can it be from -1 to 1?
|
||||
|
||||
# strategy:
|
||||
# available data = X_train
|
||||
# available data but NAN = validation_test
|
||||
# test data: mut generated not in mcsm
|
||||
|
||||
###############################################################################
|
||||
import os, sys
|
||||
import re
|
||||
from sklearn.datasets import load_boston
|
||||
from sklearn import linear_model
|
||||
from sklearn import preprocessing
|
||||
import pandas as pd
|
||||
import seaborn as sns
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
print(np.__version__)
|
||||
print(pd.__version__)
|
||||
#%% read data
|
||||
homedir = os.path.expanduser("~")
|
||||
os.chdir(homedir + "/git/ML_AI_training/test_data")
|
||||
|
||||
# this needs to be merged_df2 or merged_df3?
|
||||
my_df = pd.read_csv("pnca_all_params.csv")
|
||||
|
||||
my_df.dtypes
|
||||
my_df_cols = my_df.columns
|
||||
|
||||
omit_cols1 = ['pdb_file'
|
||||
, 'seq_offset4pdb'
|
||||
, 'mut_3upper'
|
||||
, 'wild_pos'
|
||||
, 'wild_chain_pos'
|
||||
, 'chain'
|
||||
, 'wt_3upper'
|
||||
, 'consurf_colour'
|
||||
, 'consurf_colour_rev'
|
||||
, 'consurf_msa_data'
|
||||
, 'consurf_aa_variety'
|
||||
, 'snap2_accuracy_pc'
|
||||
, 'beta_logistic'
|
||||
, 'se_logistic'
|
||||
, 'zval_logisitc'
|
||||
, 'pval_chisq'
|
||||
, 'log10_or_mychisq'
|
||||
, 'neglog_pval_fisher'
|
||||
, 'wild_type'
|
||||
, 'mutant_type'
|
||||
, 'position'
|
||||
, 'ligand_id'
|
||||
, 'mutation'
|
||||
, 'ss'
|
||||
, 'ss_class' # include it later?
|
||||
]
|
||||
|
||||
omit_cols2 = list(my_df.columns[my_df.columns.str.contains(".*ci_.*") | my_df.columns.str.contains(".*_scaled*") | my_df.columns.str.contains(".*_outcome*")])
|
||||
|
||||
# [WATCH:] just to test since these have negative values!
|
||||
omit_cols3 = list(my_df.columns[my_df.columns.str.contains("electro_.*") | my_df.columns.str.contains("disulfide_.*") | my_df.columns.str.contains("hbonds_.*") | my_df.columns.str.contains("partcov_.*") | my_df.columns.str.contains("vdwclashes.*") | my_df.columns.str.contains("volumetric.*")])
|
||||
|
||||
omit_cols = omit_cols1 + omit_cols2 + omit_cols3
|
||||
|
||||
my_df_filt = my_df.loc[:, ~my_df.columns.isin(omit_cols)]
|
||||
my_df_filt_cols = my_df_filt.columns
|
||||
|
||||
foo = my_df_filt['or_mychisq'].value_counts()
|
||||
foo = foo.to_frame()
|
||||
########################
|
||||
# [WATCH]: Drop na
|
||||
my_df2 = my_df_filt.dropna()
|
||||
my_df2['resistance'] = my_df2['or_mychisq'].apply(lambda x: 0 if x <=1 else 1)
|
||||
my_df2['resistance'].value_counts()
|
||||
y = my_df2['resistance']
|
||||
|
||||
#==============================================================================
|
||||
omit_cols_y = ['or_mychisq', 'resistance']
|
||||
my_df_ml = my_df2.loc[:, ~my_df2.columns.isin(omit_cols_y)]
|
||||
#%%############################################################################
|
||||
X_train = my_df_ml.set_index('mutationinformation')
|
||||
X_train = X_train.iloc[:,:4]
|
||||
y_train = y
|
||||
#X_train = X_train.dropna()
|
||||
#y_train = y.dropna()
|
||||
|
||||
# check dim
|
||||
X_train.shape
|
||||
y_train.shape
|
||||
|
||||
#%%=====================================================
|
||||
grid = sns.PairGrid(data = pd.concat([X_train
|
||||
, pd.Series(y_train , name = "resistance")]
|
||||
, axis = 1))
|
||||
|
||||
grid.map_offdiag(sns.scatterplot)
|
||||
grid.map_diag(sns.distplot)
|
||||
plt.show()
|
||||
|
||||
model = linear_model.LinearRegression()
|
||||
model.fit(X_train, y_train)
|
||||
|
||||
###################
|
||||
# test set
|
||||
X_test = my_df[my_df['or_mychisq'].isnull()]
|
||||
#X_test =[ X_test.iloc[:,:4]]
|
||||
# HARD part?
|
||||
# what should be the test set?
|
||||
X_test = [23.9, 0.69, -0.16, 0.59
|
||||
, 5, 0.5, 0.4, -1
|
||||
, 0.1, 1, 1, 1]
|
||||
X_test_re = np.array(X_test).reshape(3, -1)
|
||||
|
||||
|
||||
####################
|
||||
fitted = model.predict(X_train)
|
||||
model.coef_
|
||||
model.predict(X_test_re)
|
||||
resid = y_train - fitted
|
||||
resid
|
||||
|
||||
#####################
|
||||
from sklearn import preprocessing
|
||||
scaler = preprocessing.MinMaxScaler()
|
||||
scaler.fit(X_train)
|
||||
#We can then create a scaled training set
|
||||
X_train_scaled = scaler.transform(X_train)
|
||||
new_scaled = scaler.transform(X_test_re)
|
||||
model.predict(new_scaled)
|
||||
#########
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
# model_pipe = Pipeline(steps = [('preprocess', preprocessing.MinMaxScaler())
|
||||
# ,('regression', linear_model.LinearRegression())
|
||||
# ])
|
||||
model_pipe = Pipeline(steps = [('preprocess', preprocessing.MinMaxScaler())
|
||||
,('logis', LogisticRegression(class_weight = 'balanced'))
|
||||
])
|
||||
|
||||
model_pipe.fit(X_train,y_train)
|
||||
fitted_vals = model_pipe.predict(X_train)
|
||||
# gives the array of predictions
|
||||
model_pipe.predict(X_test_re)
|
||||
|
||||
# for Linear Reg only
|
||||
# resid = y_train - fitted_vals
|
||||
# resid
|
||||
|
||||
#=====
|
||||
# Logistic 1 test
|
||||
# FAILS since: the test set dim and input dim should be the same
|
||||
# i.e if you give the model 10 features to train on, you will need
|
||||
# 10 features to predict something?
|
||||
# THINK!!!!
|
||||
#=====
|
||||
mod_logis = linear_model.LogisticRegression(class_weight = 'balanced')
|
||||
mod_logis.fit(X_train,y_train)
|
||||
X_test = [23.9]
|
||||
X_test_re = np.array(X_test).reshape(1, -1)
|
||||
mod_logis.predict(X_test_re)
|
||||
#################
|
||||
|
||||
from sklearn.metrics import accuracy_score, precision_score, recall_score
|
||||
y_pred = model_pipe.predict(X_train)
|
||||
accuracy_score(y_train,y_pred)
|
||||
precision_score(y_train,y_pred,pos_label=1)# tp/(tp + fp)
|
||||
recall_score(y_train,y_pred,pos_label=1) # tp/(tp + fn)
|
||||
########
|
||||
|
||||
# WORKS!
|
||||
from sklearn.model_selection import cross_validate
|
||||
from sklearn.metrics import make_scorer
|
||||
import pandas as pd
|
||||
acc = make_scorer(accuracy_score)
|
||||
def precision(y_true,y_pred):
|
||||
return precision_score(y_true,y_pred,pos_label = 1) #0
|
||||
|
||||
def recall(y_true,y_pred):
|
||||
return recall_score(y_true, y_pred, pos_label = 1) #0
|
||||
|
||||
prec = make_scorer(precision)
|
||||
rec = make_scorer(recall)
|
||||
output = cross_validate(model_pipe
|
||||
, X_train
|
||||
, y_train
|
||||
, scoring = {'acc' : acc
|
||||
,'prec' : prec
|
||||
,'rec' : rec}
|
||||
, cv = 10, return_train_score = False)
|
||||
|
||||
pd.DataFrame(output).mean()
|
||||
# fit_time 0.005486
|
||||
# score_time 0.002673
|
||||
# test_acc 0.601799
|
||||
# test_prec 0.976936
|
||||
# test_rec 0.603226
|
||||
# dtype: float64
|
||||
|
||||
# the three scores
|
||||
# 0.65527950310559
|
||||
# 0.9853658536585366
|
||||
# 0.6516129032258065
|
Loading…
Add table
Add a link
Reference in a new issue