#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Created on Thu Feb 24 10:48:10 2022 @author: tanu """ ############################################################################### # questions: # which data to use: merged_df3 or merged_df2 # which is the target? or_mychisq or drtype col # scaling: can it be from -1 to 1? # strategy: # available data = X_train # available data but NAN = validation_test # test data: mut generated not in mcsm ############################################################################### import os, sys import re from sklearn.datasets import load_boston from sklearn import linear_model from sklearn import preprocessing import pandas as pd import seaborn as sns import matplotlib.pyplot as plt import numpy as np print(np.__version__) print(pd.__version__) #%% read data homedir = os.path.expanduser("~") os.chdir(homedir + "/git/ML_AI_training/test_data") # this needs to be merged_df2 or merged_df3? my_df = pd.read_csv("pnca_all_params.csv") my_df.dtypes my_df_cols = my_df.columns omit_cols1 = ['pdb_file' , 'seq_offset4pdb' , 'mut_3upper' , 'wild_pos' , 'wild_chain_pos' , 'chain' , 'wt_3upper' , 'consurf_colour' , 'consurf_colour_rev' , 'consurf_msa_data' , 'consurf_aa_variety' , 'snap2_accuracy_pc' , 'beta_logistic' , 'se_logistic' , 'zval_logisitc' , 'pval_chisq' , 'log10_or_mychisq' , 'neglog_pval_fisher' , 'wild_type' , 'mutant_type' , 'position' , 'ligand_id' , 'mutation' , 'ss' , 'ss_class' # include it later? ] omit_cols2 = list(my_df.columns[my_df.columns.str.contains(".*ci_.*") | my_df.columns.str.contains(".*_scaled*") | my_df.columns.str.contains(".*_outcome*")]) # [WATCH:] just to test since these have negative values! omit_cols3 = list(my_df.columns[my_df.columns.str.contains("electro_.*") | my_df.columns.str.contains("disulfide_.*") | my_df.columns.str.contains("hbonds_.*") | my_df.columns.str.contains("partcov_.*") | my_df.columns.str.contains("vdwclashes.*") | my_df.columns.str.contains("volumetric.*")]) omit_cols = omit_cols1 + omit_cols2 + omit_cols3 my_df_filt = my_df.loc[:, ~my_df.columns.isin(omit_cols)] my_df_filt_cols = my_df_filt.columns foo = my_df_filt['or_mychisq'].value_counts() foo = foo.to_frame() ######################## # [WATCH]: Drop na my_df2 = my_df_filt.dropna() my_df2['resistance'] = my_df2['or_mychisq'].apply(lambda x: 0 if x <=1 else 1) my_df2['resistance'].value_counts() y = my_df2['resistance'] #============================================================================== omit_cols_y = ['or_mychisq', 'resistance'] my_df_ml = my_df2.loc[:, ~my_df2.columns.isin(omit_cols_y)] #%%############################################################################ X_train = my_df_ml.set_index('mutationinformation') X_train = X_train.iloc[:,:4] y_train = y #X_train = X_train.dropna() #y_train = y.dropna() # check dim X_train.shape y_train.shape #%%===================================================== grid = sns.PairGrid(data = pd.concat([X_train , pd.Series(y_train , name = "resistance")] , axis = 1)) grid.map_offdiag(sns.scatterplot) grid.map_diag(sns.distplot) plt.show() model = linear_model.LinearRegression() model.fit(X_train, y_train) ################### # test set X_test = my_df[my_df['or_mychisq'].isnull()] #X_test =[ X_test.iloc[:,:4]] # HARD part? # what should be the test set? X_test = [23.9, 0.69, -0.16, 0.59 , 5, 0.5, 0.4, -1 , 0.1, 1, 1, 1] X_test_re = np.array(X_test).reshape(3, -1) #################### fitted = model.predict(X_train) model.coef_ model.predict(X_test_re) resid = y_train - fitted resid ##################### from sklearn import preprocessing scaler = preprocessing.MinMaxScaler() scaler.fit(X_train) #We can then create a scaled training set X_train_scaled = scaler.transform(X_train) new_scaled = scaler.transform(X_test_re) model.predict(new_scaled) ######### from sklearn.pipeline import Pipeline from sklearn.linear_model import LogisticRegression # model_pipe = Pipeline(steps = [('preprocess', preprocessing.MinMaxScaler()) # ,('regression', linear_model.LinearRegression()) # ]) model_pipe = Pipeline(steps = [('preprocess', preprocessing.MinMaxScaler()) ,('logis', LogisticRegression(class_weight = 'balanced')) ]) model_pipe.fit(X_train,y_train) fitted_vals = model_pipe.predict(X_train) # gives the array of predictions model_pipe.predict(X_test_re) # for Linear Reg only # resid = y_train - fitted_vals # resid #===== # Logistic 1 test # FAILS since: the test set dim and input dim should be the same # i.e if you give the model 10 features to train on, you will need # 10 features to predict something? # THINK!!!! #===== mod_logis = linear_model.LogisticRegression(class_weight = 'balanced') mod_logis.fit(X_train,y_train) X_test = [23.9] X_test_re = np.array(X_test).reshape(1, -1) mod_logis.predict(X_test_re) ################# from sklearn.metrics import accuracy_score, precision_score, recall_score y_pred = model_pipe.predict(X_train) accuracy_score(y_train,y_pred) precision_score(y_train,y_pred,pos_label=1)# tp/(tp + fp) recall_score(y_train,y_pred,pos_label=1) # tp/(tp + fn) ######## # WORKS! from sklearn.model_selection import cross_validate from sklearn.metrics import make_scorer import pandas as pd acc = make_scorer(accuracy_score) def precision(y_true,y_pred): return precision_score(y_true,y_pred,pos_label = 1) #0 def recall(y_true,y_pred): return recall_score(y_true, y_pred, pos_label = 1) #0 prec = make_scorer(precision) rec = make_scorer(recall) output = cross_validate(model_pipe , X_train , y_train , scoring = {'acc' : acc ,'prec' : prec ,'rec' : rec} , cv = 10, return_train_score = False) pd.DataFrame(output).mean() # fit_time 0.005486 # score_time 0.002673 # test_acc 0.601799 # test_prec 0.976936 # test_rec 0.603226 # dtype: float64 # the three scores # 0.65527950310559 # 0.9853658536585366 # 0.6516129032258065