LSHTM_analysis/scripts/ml/untitled5.py

52 lines
1.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Sep 2 11:11:49 2022
@author: tanu
"""
# https://towardsdatascience.com/explain-feature-variation-employing-pca-in-scikit-learn-6711e0a5c0b7
import numpy as np
import pandas as pd
import seaborn as sns
#import tensorflow as tf
#from tensorflow import keras
from sklearn.decomposition import PCA
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
# pca = PCA().fit(X)
# plt.plot(np.cumsum(pca.explained_variance_ratio_))
# plt.xlabel('number of components')
# plt.ylabel('cumulative explained variance')
# from old scripts
# Build the modelling bundle with the project helper; only its 'X' entry is
# read in this section (presumably a pandas DataFrame of features -- confirm
# against combined_DF_OS).
fooD = combined_DF_OS(combined_df)
feature_df = fooD['X']

# Numerical features: the column labels, the same labels as a plain list,
# their integer positions within the feature table, and the numeric-only
# sub-table.
numerical_ix = feature_df.select_dtypes(include=['int64', 'float64']).columns
num_featuresL = list(numerical_ix)
numerical_colind = feature_df.columns.get_indexer(num_featuresL)
numF = feature_df[numerical_ix]

# Categorical features (object/bool dtypes): column labels and positions.
categorical_ix = feature_df.select_dtypes(include=['object', 'bool']).columns
categorical_colind = feature_df.columns.get_indexer(list(categorical_ix))
##############
# Hold out 20% of the numerical features / targets for evaluation.
# NOTE(review): no random_state is passed, so the split (and every number
# printed below) differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    numF, fooD['y'], test_size=0.2
)

# PCA rejects an n_components larger than min(n_samples, n_features), so cap
# the requested 50 components by the shape of the training split.
n_pca_components = min(50, *X_train.shape)
pca = PCA(n_components=n_pca_components)

# Fit the projection on the training split only, then apply that same
# projection to the held-out split.
X_train_new = pca.fit_transform(X_train)
X_test_new = pca.transform(X_test)
print(X_train.shape)
print(X_train_new.shape)

# A bare expression is discarded when the file runs as a script, so print
# the per-component explained-variance ratios explicitly.
print(pca.explained_variance_ratio_)

# 5-nearest-neighbour classifier trained on the PCA-projected features.
clf = KNeighborsClassifier(n_neighbors=5)
clf.fit(X_train_new, y_train)
y_pred_new = clf.predict(X_test_new)

# Matthews correlation coefficient on the held-out split; kept in a name and
# printed so the score is not silently dropped.
mcc = matthews_corrcoef(y_test, y_pred_new)
print(mcc)