#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 24 10:48:10 2022

@author: tanu

Load the pncA per-mutation parameter table and build the lists of columns
to exclude before modelling (identifiers, redundant encodings, and columns
that would leak the outcome).
"""
###############################################################################
# questions:
# which data to use: merged_df3 or merged_df2
# which is the target? or_mychisq or drtype col
# scaling: can it be from -1 to 1?
# how to include the mutation information?
#   'wild_type', 'mutant', 'postion'
# whether to log transform the af and or cols
# to allow mean mode values to be imputed for validation set
# whether to calculate mean, median accounting for NA or removing them?

# strategy:
# available data = X_train
# available data but NAN = validation_test
# test data: mut generated not in mcsm
###############################################################################
import os, sys
import re

# NOTE(review): `from sklearn.datasets import load_boston` removed — it was
# never used below, and the function itself was removed in scikit-learn 1.2,
# so the import crashed the whole script on modern installations.
from sklearn import linear_model
from sklearn import preprocessing
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

print(np.__version__)
print(pd.__version__)

#%% read data
homedir = os.path.expanduser("~")
os.chdir(homedir + "/git/ML_AI_training/test_data")

# this needs to be merged_df2 or merged_df3?
my_df = pd.read_csv("pnca_all_params.csv")
my_df.dtypes
my_df_cols = my_df.columns

# Columns dropped by explicit name: file/structure identifiers, redundant
# encodings of the mutation, and statistics derived from the outcome
# (beta/se/zval/pvals/ORs) that would leak the target into the features.
omit_cols1 = ['pdb_file'
              , 'seq_offset4pdb'
              , 'mut_3upper'
              , 'wild_pos'
              , 'wild_chain_pos'
              , 'chain'
              , 'wt_3upper'
              , 'consurf_colour'
              , 'consurf_colour_rev'
              , 'consurf_msa_data'
              , 'consurf_aa_variety'
              , 'snap2_accuracy_pc'
              , 'beta_logistic'
              , 'se_logistic'
              , 'zval_logisitc'
              , 'pval_chisq'
              , 'log10_or_mychisq'
              , 'neglog_pval_fisher'
              , 'or_fisher'
              , 'wild_type'
              , 'mutant_type'
              , 'position'
              , 'ligand_id'
              , 'mutation'
              , 'ss'
              , 'ss_class'  # include it later?
              , 'contacts'
              ]

# Columns dropped by pattern: confidence intervals, pre-scaled variants and
# categorical outcome encodings.
omit_cols2 = list(my_df.columns[my_df.columns.str.contains(".*ci_.*")
                                | my_df.columns.str.contains(".*_scaled*")
                                | my_df.columns.str.contains(".*_outcome*")])

# [WATCH:] just to test since these have negative values!
# More pattern-dropped columns (energy/contact terms with negative values,
# excluded while testing scaling).
omit_cols3 = list(my_df.columns[my_df.columns.str.contains("electro_.*")
                                | my_df.columns.str.contains("disulfide_.*")
                                | my_df.columns.str.contains("hbonds_.*")
                                | my_df.columns.str.contains("partcov_.*")
                                | my_df.columns.str.contains("vdwclashes.*")
                                | my_df.columns.str.contains("volumetric.*")])

omit_cols = omit_cols1 + omit_cols2 + omit_cols3
my_df_filt = my_df.loc[:, ~my_df.columns.isin(omit_cols)]
my_df_filt_cols = my_df_filt.columns

# Fill NaNs with column means / medians in each numeric column.
# NOTE(review): numeric_only=True added — on pandas >= 2.0, .mean()/.median()
# on a mixed-dtype frame raises TypeError; non-numeric columns are simply
# left un-imputed, which matches the previous pandas < 2.0 behaviour.
my_df_filt2 = my_df_filt.fillna(my_df_filt.mean(numeric_only = True))
my_df_filt3 = my_df_filt.fillna(my_df_filt.median(numeric_only = True))
my_df_filt_noNA = my_df_filt.fillna(0)

summ = my_df_filt.describe()
summ_noNA = my_df_filt_noNA.describe()

foo = my_df_filt['or_mychisq'].value_counts()
foo = foo.to_frame()

########################
# [WATCH]: Drop na
# NOTE(review): .copy() added so the 'resistance' assignment below writes to
# an independent frame instead of raising SettingWithCopyWarning on a view.
my_df2 = my_df_filt3.dropna().copy()

# Binary target: OR <= 1 -> susceptible (0), OR > 1 -> resistant (1).
my_df2['resistance'] = my_df2['or_mychisq'].apply(lambda x: 0 if x <= 1 else 1)
my_df2['resistance'].value_counts()

y = my_df2['resistance']
y.value_counts()

#%%============================================================================
# Validation set = mutations that were dropped from my_df2 (i.e. rows with
# missing values in the original data); they have no usable label here.
X_validation_muts = my_df['mutationinformation'][~my_df['mutationinformation'].isin(my_df2['mutationinformation'])]
X_validation_all = my_df_filt3[~my_df_filt3['mutationinformation'].isin(my_df2['mutationinformation'])]
X_validation_f = X_validation_all.loc[:, ~X_validation_all.columns.isin(['or_mychisq', 'resistance'])]
X_validation = X_validation_f.set_index('mutationinformation')

#%% fill na in cols with mean value
X_validation.info()
X_validation.isna().any()
na_df = X_validation_f[X_validation_f.columns[X_validation_f.isna().any()]]
na_colnames = X_validation_f.columns[X_validation_f.isna().any()]
na_colsL = list(na_colnames)

#==============================================================================
# Drop the raw OR and the derived target from the feature matrix.
omit_cols_y = ['or_mychisq', 'resistance']
my_df_ml = my_df2.loc[:, ~my_df2.columns.isin(omit_cols_y)]

#%%############################################################################
X_train = my_df_ml.set_index('mutationinformation')
#X_train = X_train.iloc[:,:4]
y_train = y
#X_train = X_train.dropna()
#y_train = y.dropna()

# check dim
X_train.shape
y_train.shape

###############################################################################
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Min-max scale the features, then logistic regression.
# NOTE(review): class_weight fixed from 'unbalanced' (not a valid value —
# sklearn raises ValueError at fit time) to 'balanced', which reweights
# classes inversely to their frequency.
model_logisP = Pipeline(steps = [('preprocess', preprocessing.MinMaxScaler())
                                 , ('logis', LogisticRegression(class_weight = 'balanced'))
                                 ])
model_logisP.fit(X_train, y_train)

fitted_vals = model_logisP.predict(X_train)
fitted_vals  # gives the array of predictions
model_logisP.predict(X_train)
model_logisP.predict(X_validation)

y_pred = model_logisP.predict(X_train)
y_pred2 = model_logisP.predict(X_validation)

# NOTE(review): these previously scored y_train against y_pred2 (the
# *validation-set* predictions) — a length mismatch that raises ValueError,
# and conceptually wrong since the validation rows have no labels. Scored
# against the training predictions instead (training-set performance only).
accuracy_score(y_train, y_pred)
precision_score(y_train, y_pred, pos_label = 1)  # tp/(tp + fp)
recall_score(y_train, y_pred, pos_label = 1)     # tp/(tp + fn)

################
acc = make_scorer(accuracy_score)

def precision(y_true, y_pred):
    """Precision for the positive (resistant, label 1) class."""
    return precision_score(y_true, y_pred, pos_label = 1)  # 0

def recall(y_true, y_pred):
    """Recall for the positive (resistant, label 1) class."""
    return recall_score(y_true, y_pred, pos_label = 1)  # 0

prec = make_scorer(precision)
rec = make_scorer(recall)

# 10-fold CV on the training data with accuracy/precision/recall.
output = cross_validate(model_logisP
                        , X_train
                        , y
                        , scoring = {'acc': acc
                                     , 'prec': prec
                                     , 'rec': rec}
                        , cv = 10, return_train_score = False)
pd.DataFrame(output).mean()