added count for targets for all genes and ran multiple classification models for all of the genes and target as a start
This commit is contained in:
parent
89158bc669
commit
877862acb7
8 changed files with 948 additions and 0 deletions
95
MultClassPipe.py
Normal file
95
MultClassPipe.py
Normal file
|
@ -0,0 +1,95 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Fri Mar 4 15:25:33 2022
|
||||||
|
|
||||||
|
@author: tanu
|
||||||
|
"""
|
||||||
|
#%%
|
||||||
|
import os, sys
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
from sklearn.naive_bayes import BernoulliNB
|
||||||
|
from sklearn.neighbors import KNeighborsClassifier
|
||||||
|
from sklearn.svm import SVC
|
||||||
|
from sklearn.tree import DecisionTreeClassifier
|
||||||
|
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
|
||||||
|
from sklearn.neural_network import MLPClassifier
|
||||||
|
from sklearn.pipeline import Pipeline
|
||||||
|
from xgboost import XGBClassifier
|
||||||
|
from sklearn.preprocessing import StandardScaler
|
||||||
|
from sklearn.preprocessing import MinMaxScaler
|
||||||
|
from sklearn.model_selection import train_test_split
|
||||||
|
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, f1_score
|
||||||
|
#%%
|
||||||
|
rs = {'random_state': 42}
|
||||||
|
|
||||||
|
# Multiple Classification - Model Pipeline
|
||||||
|
def MultClassPipeline(X_train, X_test, y_train, y_test):
    """Fit a fixed panel of classifiers and score each one on the test split.

    Each classifier is wrapped in a two-step ``Pipeline`` (``MinMaxScaler``
    followed by the classifier), fitted on ``(X_train, y_train)`` and then
    used to predict ``X_test``. Five metrics are computed from those
    predictions against ``y_test``.

    Parameters
    ----------
    X_train, X_test
        Feature matrices handed to ``Pipeline.fit`` / ``Pipeline.predict``.
    y_train, y_test
        Target labels for the corresponding feature matrices.
        NOTE(review): every metric is called with its default arguments,
        which presumably expects binary 0/1 labels -- confirm for callers
        that pass string or multi-class targets.

    Returns
    -------
    pipelines : list
        The fitted ``Pipeline`` objects, in the same order as the models
        appear in the score table.
    scores_df : pandas.DataFrame
        One row per model with columns ``Model``, ``F1_Score``,
        ``Precision``, ``Recall``, ``Accuracy`` and ``ROC_AUC``.
    """
    # Classifiers that accept a seed get the module-level `rs` settings.
    log_reg = LogisticRegression(**rs)
    nb = BernoulliNB()
    knn = KNeighborsClassifier()
    svm = SVC(**rs)
    mlp = MLPClassifier(max_iter=500, **rs)
    dt = DecisionTreeClassifier(**rs)
    et = ExtraTreesClassifier(**rs)
    rf = RandomForestClassifier(**rs)
    xgb = XGBClassifier(**rs, verbosity=0)

    clfs = [
        ('Logistic Regression', log_reg),
        ('Naive Bayes', nb),
        ('K-Nearest Neighbors', knn),
        ('SVM', svm),
        ('MLP', mlp),
        ('Decision Tree', dt),
        ('Extra Trees', et),
        ('Random Forest', rf),
        ('XGBoost', xgb),
    ]

    score_cols = ['Model', 'F1_Score', 'Precision', 'Recall', 'Accuracy', 'ROC_AUC']

    pipelines = []
    # Rows are collected in a plain list and turned into a DataFrame once at
    # the end: DataFrame.append was deprecated in pandas 1.4 and removed in
    # 2.0, and it copied the whole frame on every call.
    score_rows = []

    for clf_name, clf in clfs:
        pipeline = Pipeline(steps=[
            ('scaler', MinMaxScaler()),
            # StandardScaler() was tried as an alternative scaler here.
            ('classifier', clf),
        ])
        pipeline.fit(X_train, y_train)

        # Model predictions
        y_pred = pipeline.predict(X_test)

        # NOTE(review): ROC_AUC is computed from the hard class predictions,
        # not from probabilities or decision scores; kept as in the original.
        score_rows.append({
            'Model': clf_name,
            'F1_Score': f1_score(y_test, y_pred),
            'Precision': precision_score(y_test, y_pred),
            'Recall': recall_score(y_test, y_pred),
            'Accuracy': accuracy_score(y_test, y_pred),
            'ROC_AUC': roc_auc_score(y_test, y_pred),
        })

        pipelines.append(pipeline)

    # `columns=` fixes the column order and still yields the expected empty
    # frame if no model was run.
    scores_df = pd.DataFrame(score_rows, columns=score_cols)

    return pipelines, scores_df
|
||||||
|
|
45
X_categories
Normal file
45
X_categories
Normal file
|
@ -0,0 +1,45 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Fri Mar 4 15:09:37 2022
|
||||||
|
|
||||||
|
@author: tanu
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Names of the categorical feature columns, grouped by the label used in
# the variable name.
X_categ_str = [
    'ss_class',
    'wt_prop_water',
    'mut_prop_water',
    'wt_prop_polarity',
    'mut_prop_polarity',
    'wt_calcprop',
    'mut_calcprop',
    'active_aa_pos',
]

# only valid if we use merged_df2
X_categ_str_lin = X_categ_str + ['lineage_labels']

# Each name needs its own trailing comma: without the commas Python joins
# adjacent string literals, which turned this list into a single element
# ('contactselectro_rrelectro_mm...').
X_categ_foldx = [
    'contacts',
    'electro_rr',
    'electro_mm',
    'electro_sm',
    'electro_ss',
    'disulfide_rr',
    'disulfide_mm',
    'disulfide_sm',
    'disulfide_ss',
    'hbonds_rr',
    'hbonds_mm',
    'hbonds_sm',
    'hbonds_ss',
    'partcov_rr',
    'partcov_mm',
    'partcov_sm',
    'partcov_ss',
    'vdwclashes_rr',
    'vdwclashes_mm',
    'vdwclashes_sm',
    'vdwclashes_ss',
    'volumetric_rr',
    'volumetric_mm',
    'volumetric_sm',
    'volumetric_ss',
]
|
BIN
__pycache__/MultClassPipe.cpython-37.pyc
Normal file
BIN
__pycache__/MultClassPipe.cpython-37.pyc
Normal file
Binary file not shown.
335
ml_data/.Rhistory
Normal file
335
ml_data/.Rhistory
Normal file
|
@ -0,0 +1,335 @@
|
||||||
|
source("~/git/LSHTM_analysis/config/alr.R")
|
||||||
|
# source("~/git/LSHTM_analysis/config/embb.R")
|
||||||
|
# source("~/git/LSHTM_analysis/config/gid.R")
|
||||||
|
# source("~/git/LSHTM_analysis/config/katg.R")
|
||||||
|
#source("~/git/LSHTM_analysis/config/pnca.R")
|
||||||
|
# source("~/git/LSHTM_analysis/config/rpob.R")
|
||||||
|
##################################################
|
||||||
|
source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
|
||||||
|
######################################################
|
||||||
|
mdf3_outName = paste0(outdir, "/", tolower(gene), "_merged_df3.csv")
|
||||||
|
mdf3_outName
|
||||||
|
if( (length(colnames_order) == ncol(merged_df3)) && (all(colnames_order %in%colnames(merged_df3))) ){
|
||||||
|
cat("\nProceeding with rearranging columns in merged_df3")
|
||||||
|
merged_df3_o = merged_df3[ , colnames_order]
|
||||||
|
cat("\nWriting output file:", mdf3_outName)
|
||||||
|
write.csv(merged_df3_o, mdf3_outName, row.names = F)
|
||||||
|
cat("\nnrows:" , nrow(merged_df3_o)
|
||||||
|
, "\nncols:" , ncol(merged_df3_o))
|
||||||
|
}else
|
||||||
|
cat("length mismatch:"
|
||||||
|
, colnames(merged_df3)[!colnames(merged_df3)%in%(colnames_order )]
|
||||||
|
)
|
||||||
|
mdf3_outName = paste0(outdir, "/", tolower(gene), "_merged_df3.csv")
|
||||||
|
mdf3_outName
|
||||||
|
cat("\nWriting output file:", mdf3_outName)
|
||||||
|
write.csv(merged_df3, mdf3_outName, row.names = F)
|
||||||
|
cat("\nnrows:" , nrow(merged_df3)
|
||||||
|
, "\nncols:" , ncol(merged_df3))
|
||||||
|
#=========================================================
|
||||||
|
mdf2_outName = paste0(outdir, "/", tolower(gene), "_merged_df2.csv")
|
||||||
|
mdf2_outName
|
||||||
|
cat("\nWriting output file:", mdf2_outName)
|
||||||
|
write.csv(merged_df2, mdf2_outName, row.names = F)
|
||||||
|
cat("\nnrows:" , nrow(merged_df2)
|
||||||
|
, "\nncols:" , ncol(merged_df2))
|
||||||
|
###################################################
|
||||||
|
#config_gene = c("alr", "embb", "gid", "katg", "pnca", "rpob")
|
||||||
|
#config_gene = c("alr", "embb")
|
||||||
|
#sapply(config_gene, function(x) source(paste0("~/git/LSHTM_analysis/config/", x, ".R")), USE.NAMES = F)
|
||||||
|
#----------------------------------------------------
|
||||||
|
# source("~/git/LSHTM_analysis/config/alr.R")
|
||||||
|
source("~/git/LSHTM_analysis/config/embb.R")
|
||||||
|
# source("~/git/LSHTM_analysis/config/gid.R")
|
||||||
|
# source("~/git/LSHTM_analysis/config/katg.R")
|
||||||
|
# source("~/git/LSHTM_analysis/config/pnca.R")
|
||||||
|
# source("~/git/LSHTM_analysis/config/rpob.R")
|
||||||
|
#----------------------------------------------------
|
||||||
|
source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
|
||||||
|
active_aa_pos
|
||||||
|
merged_df3['position']%in%active_aa_pos
|
||||||
|
merged_df3$position%in%active_aa_pos
|
||||||
|
merged_df3['active_aa_pos'] <- merged_df3['position']
|
||||||
|
merged_df3['active_aa_pos']
|
||||||
|
identical(merged_df3['active_aa_pos'] , merged_df3['position'])
|
||||||
|
(merged_df3['active_aa_pos'] == merged_df3['position'])
|
||||||
|
all(merged_df3['active_aa_pos'] == merged_df3['position'])
|
||||||
|
merged_df3['active_aa_pos'] <- merged_df3['position']
|
||||||
|
if (merged_df3$position%in%active_aa_pos){
|
||||||
|
merged_df3['active_aa_pos'] = 1
|
||||||
|
}else{
|
||||||
|
merged_df3['active_aa_pos'] = 0
|
||||||
|
}
|
||||||
|
merged_df3['active_aa_pos']
|
||||||
|
table(merged_df3$active_aa_pos)
|
||||||
|
merged_df3['active_aa_pos'] <- merged_df3['position']
|
||||||
|
merged_df3$active_aa_pos <- merged_df3$osition
|
||||||
|
merged_df3$active_aa_pos
|
||||||
|
merged_df3$active_aa_pos <- merged_df3$position
|
||||||
|
merged_df3$active_aa_pos
|
||||||
|
merged_df3$postion%in%active_aa_pos
|
||||||
|
merged_df3$postion%in%active_aa_pos
|
||||||
|
merged_df3$postion
|
||||||
|
erged_df3$position%in%active_aa_pos
|
||||||
|
merged_df3$position
|
||||||
|
active_aa_pos
|
||||||
|
which(merged_df3$position%in%active_aa_pos)
|
||||||
|
c =which(merged_df3$position%in%active_aa_pos)
|
||||||
|
merged_df3$position[c]
|
||||||
|
active_aa_pos
|
||||||
|
merged_df3$position%in%active_aa_pos
|
||||||
|
merged_df3$active_aa_pos <- merged_df3$position
|
||||||
|
merged_df3$active_aa_pos %in% active_aa_pos
|
||||||
|
ifelse(merged_df3$active_aa_pos %in% active_aa_pos , "1", "0")
|
||||||
|
table(merged_df3$active_aa_po)
|
||||||
|
str(merged_df3$active_aa_po)
|
||||||
|
str(merged_df3$active_aa_pos)
|
||||||
|
#TODO later!
|
||||||
|
merged_df3$active_aa_pos <- merged_df3$position
|
||||||
|
merged_df3$active_aa_pos
|
||||||
|
ifelse(merged_df3$active_aa_pos %in% active_aa_pos , 1, 0)
|
||||||
|
str(merged_df3$active_aa_pos)
|
||||||
|
#str(merged_df3$active_aa_pos)
|
||||||
|
table(merged_df3$active_aa_pos)
|
||||||
|
#str(merged_df3$active_aa_pos)
|
||||||
|
foo = merged_df3$active_aa_pos
|
||||||
|
merged_df3$active_aa_pos
|
||||||
|
ifelse(merged_df3$active_aa_pos %in% active_aa_pos , 1, 0)
|
||||||
|
merged_df3$active_aa_pos = ifelse(merged_df3$position %in% active_aa_pos , 1, 0)
|
||||||
|
#str(merged_df3$active_aa_pos)
|
||||||
|
foo = merged_df3$active_aa_pos
|
||||||
|
#str(merged_df3$active_aa_pos)
|
||||||
|
table(merged_df3$active_aa_pos)
|
||||||
|
length(active_aa_pos)
|
||||||
|
which(merged_df3$position%in%active_aa_pos)
|
||||||
|
which(merged_df3$position%in%active_aa_pos)
|
||||||
|
which(!merged_df3$position%in%active_aa_pos)
|
||||||
|
which(merged_df3$position%in%active_aa_pos)
|
||||||
|
active_aa_pos)
|
||||||
|
active_aa_pos
|
||||||
|
merged_df3$position[209,]
|
||||||
|
merged_df3[209,]
|
||||||
|
merged_df3$position[209]
|
||||||
|
merged_df3[209]
|
||||||
|
merged_df3[209,]
|
||||||
|
active_aa_pos
|
||||||
|
merged_df3$position[!merged_df3$position%in%active_aa_pos]
|
||||||
|
merged_df3$position[!active_aa_pos%in%merged_df3$position]
|
||||||
|
active_aa_pos
|
||||||
|
active_aa_pos[!active_aa_pos%in%merged_df3$position]
|
||||||
|
#str(merged_df3$active_aa_pos)
|
||||||
|
table(merged_df3$active_aa_pos)
|
||||||
|
###################################################
|
||||||
|
#config_gene = c("alr", "embb", "gid", "katg", "pnca", "rpob")
|
||||||
|
#config_gene = c("alr", "embb")
|
||||||
|
#sapply(config_gene, function(x) source(paste0("~/git/LSHTM_analysis/config/", x, ".R")), USE.NAMES = F)
|
||||||
|
#----------------------------------------------------
|
||||||
|
# source("~/git/LSHTM_analysis/config/alr.R")
|
||||||
|
source("~/git/LSHTM_analysis/config/embb.R")
|
||||||
|
# source("~/git/LSHTM_analysis/config/gid.R")
|
||||||
|
# source("~/git/LSHTM_analysis/config/katg.R")
|
||||||
|
# source("~/git/LSHTM_analysis/config/pnca.R")
|
||||||
|
# source("~/git/LSHTM_analysis/config/rpob.R")
|
||||||
|
#----------------------------------------------------
|
||||||
|
source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
|
||||||
|
merged_df3$active_aa_pos = ifelse(merged_df3$position %in% active_aa_pos , 1, 0)
|
||||||
|
#str(merged_df3$active_aa_pos)
|
||||||
|
table(merged_df3$active_aa_pos)
|
||||||
|
cat("\nWriting output file:", mdf3_outName)
|
||||||
|
write.csv(merged_df3, mdf3_outName, row.names = F)
|
||||||
|
cat("\nnrows:" , nrow(merged_df3)
|
||||||
|
, "\nncols:" , ncol(merged_df3))
|
||||||
|
merged_df2$active_aa_pos = ifelse(merged_df2$position %in% active_aa_pos , 1, 0)
|
||||||
|
table(merged_df3$active_aa_pos)
|
||||||
|
table(merged_df2$active_aa_pos)
|
||||||
|
###################################################
|
||||||
|
#config_gene = c("alr", "embb", "gid", "katg", "pnca", "rpob")
|
||||||
|
#config_gene = c("alr", "embb")
|
||||||
|
#sapply(config_gene, function(x) source(paste0("~/git/LSHTM_analysis/config/", x, ".R")), USE.NAMES = F)
|
||||||
|
#----------------------------------------------------
|
||||||
|
# source("~/git/LSHTM_analysis/config/alr.R")
|
||||||
|
source("~/git/LSHTM_analysis/config/embb.R")
|
||||||
|
# source("~/git/LSHTM_analysis/config/gid.R")
|
||||||
|
# source("~/git/LSHTM_analysis/config/katg.R")
|
||||||
|
# source("~/git/LSHTM_analysis/config/pnca.R")
|
||||||
|
# source("~/git/LSHTM_analysis/config/rpob.R")
|
||||||
|
#----------------------------------------------------
|
||||||
|
source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
|
||||||
|
######################################################
|
||||||
|
merged_df3$active_aa_pos = ifelse(merged_df3$position %in% active_aa_pos , 1, 0)
|
||||||
|
table(merged_df3$active_aa_pos)
|
||||||
|
mdf3_outName = paste0(outdir, "/", tolower(gene), "_merged_df3.csv")
|
||||||
|
mdf3_outName
|
||||||
|
cat("\nWriting output file:", mdf3_outName)
|
||||||
|
write.csv(merged_df3, mdf3_outName, row.names = F)
|
||||||
|
cat("\nnrows:" , nrow(merged_df3)
|
||||||
|
, "\nncols:" , ncol(merged_df3))
|
||||||
|
#=========================================================
|
||||||
|
merged_df2$active_aa_pos = ifelse(merged_df2$position %in% active_aa_pos , 1, 0)
|
||||||
|
table(merged_df2$active_aa_pos)
|
||||||
|
mdf2_outName = paste0(outdir, "/", tolower(gene), "_merged_df2.csv")
|
||||||
|
mdf2_outName
|
||||||
|
cat("\nWriting output file:", mdf2_outName)
|
||||||
|
write.csv(merged_df2, mdf2_outName, row.names = F)
|
||||||
|
cat("\nnrows:" , nrow(merged_df2)
|
||||||
|
, "\nncols:" , ncol(merged_df2))
|
||||||
|
###################################################
|
||||||
|
#config_gene = c("alr", "embb", "gid", "katg", "pnca", "rpob")
|
||||||
|
#config_gene = c("alr", "embb")
|
||||||
|
#sapply(config_gene, function(x) source(paste0("~/git/LSHTM_analysis/config/", x, ".R")), USE.NAMES = F)
|
||||||
|
#----------------------------------------------------
|
||||||
|
source("~/git/LSHTM_analysis/config/alr.R")
|
||||||
|
# source("~/git/LSHTM_analysis/config/embb.R")
|
||||||
|
# source("~/git/LSHTM_analysis/config/gid.R")
|
||||||
|
# source("~/git/LSHTM_analysis/config/katg.R")
|
||||||
|
# source("~/git/LSHTM_analysis/config/pnca.R")
|
||||||
|
# source("~/git/LSHTM_analysis/config/rpob.R")
|
||||||
|
#----------------------------------------------------
|
||||||
|
source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
|
||||||
|
######################################################
|
||||||
|
merged_df3$active_aa_pos = ifelse(merged_df3$position %in% active_aa_pos , 1, 0)
|
||||||
|
table(merged_df3$active_aa_pos)
|
||||||
|
mdf3_outName = paste0(outdir, "/", tolower(gene), "_merged_df3.csv")
|
||||||
|
mdf3_outName
|
||||||
|
cat("\nWriting output file:", mdf3_outName)
|
||||||
|
write.csv(merged_df3, mdf3_outName, row.names = F)
|
||||||
|
cat("\nnrows:" , nrow(merged_df3)
|
||||||
|
, "\nncols:" , ncol(merged_df3))
|
||||||
|
#=========================================================
|
||||||
|
merged_df2$active_aa_pos = ifelse(merged_df2$position %in% active_aa_pos , 1, 0)
|
||||||
|
table(merged_df2$active_aa_pos)
|
||||||
|
mdf2_outName = paste0(outdir, "/", tolower(gene), "_merged_df2.csv")
|
||||||
|
mdf2_outName
|
||||||
|
cat("\nWriting output file:", mdf2_outName)
|
||||||
|
write.csv(merged_df2, mdf2_outName, row.names = F)
|
||||||
|
cat("\nnrows:" , nrow(merged_df2)
|
||||||
|
, "\nncols:" , ncol(merged_df2))
|
||||||
|
###################################################
|
||||||
|
#config_gene = c("alr", "embb", "gid", "katg", "pnca", "rpob")
|
||||||
|
#config_gene = c("alr", "embb")
|
||||||
|
#sapply(config_gene, function(x) source(paste0("~/git/LSHTM_analysis/config/", x, ".R")), USE.NAMES = F)
|
||||||
|
#----------------------------------------------------
|
||||||
|
#source("~/git/LSHTM_analysis/config/alr.R")
|
||||||
|
#source("~/git/LSHTM_analysis/config/embb.R")
|
||||||
|
source("~/git/LSHTM_analysis/config/gid.R")
|
||||||
|
#source("~/git/LSHTM_analysis/config/katg.R")
|
||||||
|
#source("~/git/LSHTM_analysis/config/pnca.R")
|
||||||
|
#source("~/git/LSHTM_analysis/config/rpob.R")
|
||||||
|
#----------------------------------------------------
|
||||||
|
source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
|
||||||
|
gene
|
||||||
|
drug
|
||||||
|
######################################################
|
||||||
|
merged_df3$active_aa_pos = ifelse(merged_df3$position %in% active_aa_pos , 1, 0)
|
||||||
|
table(merged_df3$active_aa_pos)
|
||||||
|
mdf3_outName = paste0(outdir, "/", tolower(gene), "_merged_df3.csv")
|
||||||
|
mdf3_outName
|
||||||
|
cat("\nWriting output file:", mdf3_outName)
|
||||||
|
write.csv(merged_df3, mdf3_outName, row.names = F)
|
||||||
|
cat("\nnrows:" , nrow(merged_df3)
|
||||||
|
, "\nncols:" , ncol(merged_df3))
|
||||||
|
#=========================================================
|
||||||
|
merged_df2$active_aa_pos = ifelse(merged_df2$position %in% active_aa_pos , 1, 0)
|
||||||
|
table(merged_df2$active_aa_pos)
|
||||||
|
mdf2_outName = paste0(outdir, "/", tolower(gene), "_merged_df2.csv")
|
||||||
|
mdf2_outName
|
||||||
|
cat("\nWriting output file:", mdf2_outName)
|
||||||
|
write.csv(merged_df2, mdf2_outName, row.names = F)
|
||||||
|
cat("\nnrows:" , nrow(merged_df2)
|
||||||
|
, "\nncols:" , ncol(merged_df2))
|
||||||
|
###################################################
|
||||||
|
#config_gene = c("alr", "embb", "gid", "katg", "pnca", "rpob")
|
||||||
|
#config_gene = c("alr", "embb")
|
||||||
|
#sapply(config_gene, function(x) source(paste0("~/git/LSHTM_analysis/config/", x, ".R")), USE.NAMES = F)
|
||||||
|
#----------------------------------------------------
|
||||||
|
#source("~/git/LSHTM_analysis/config/alr.R")
|
||||||
|
#source("~/git/LSHTM_analysis/config/embb.R")
|
||||||
|
#source("~/git/LSHTM_analysis/config/gid.R")
|
||||||
|
source("~/git/LSHTM_analysis/config/katg.R")
|
||||||
|
#source("~/git/LSHTM_analysis/config/pnca.R")
|
||||||
|
#source("~/git/LSHTM_analysis/config/rpob.R")
|
||||||
|
#----------------------------------------------------
|
||||||
|
source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
|
||||||
|
######################################################
|
||||||
|
gene; drug
|
||||||
|
merged_df3$active_aa_pos = ifelse(merged_df3$position %in% active_aa_pos , 1, 0)
|
||||||
|
table(merged_df3$active_aa_pos)
|
||||||
|
mdf3_outName = paste0(outdir, "/", tolower(gene), "_merged_df3.csv")
|
||||||
|
mdf3_outName
|
||||||
|
cat("\nWriting output file:", mdf3_outName)
|
||||||
|
write.csv(merged_df3, mdf3_outName, row.names = F)
|
||||||
|
cat("\nnrows:" , nrow(merged_df3)
|
||||||
|
, "\nncols:" , ncol(merged_df3))
|
||||||
|
#=========================================================
|
||||||
|
merged_df2$active_aa_pos = ifelse(merged_df2$position %in% active_aa_pos , 1, 0)
|
||||||
|
table(merged_df2$active_aa_pos)
|
||||||
|
mdf2_outName = paste0(outdir, "/", tolower(gene), "_merged_df2.csv")
|
||||||
|
mdf2_outName
|
||||||
|
cat("\nWriting output file:", mdf2_outName)
|
||||||
|
write.csv(merged_df2, mdf2_outName, row.names = F)
|
||||||
|
cat("\nnrows:" , nrow(merged_df2)
|
||||||
|
, "\nncols:" , ncol(merged_df2))
|
||||||
|
###################################################
|
||||||
|
#config_gene = c("alr", "embb", "gid", "katg", "pnca", "rpob")
|
||||||
|
#config_gene = c("alr", "embb")
|
||||||
|
#sapply(config_gene, function(x) source(paste0("~/git/LSHTM_analysis/config/", x, ".R")), USE.NAMES = F)
|
||||||
|
#----------------------------------------------------
|
||||||
|
#source("~/git/LSHTM_analysis/config/alr.R")
|
||||||
|
#source("~/git/LSHTM_analysis/config/embb.R")
|
||||||
|
#source("~/git/LSHTM_analysis/config/gid.R")
|
||||||
|
#source("~/git/LSHTM_analysis/config/katg.R")
|
||||||
|
source("~/git/LSHTM_analysis/config/pnca.R")
|
||||||
|
#source("~/git/LSHTM_analysis/config/rpob.R")
|
||||||
|
#----------------------------------------------------
|
||||||
|
source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
|
||||||
|
######################################################
|
||||||
|
gene; drug
|
||||||
|
merged_df3$active_aa_pos = ifelse(merged_df3$position %in% active_aa_pos , 1, 0)
|
||||||
|
table(merged_df3$active_aa_pos)
|
||||||
|
mdf3_outName = paste0(outdir, "/", tolower(gene), "_merged_df3.csv")
|
||||||
|
mdf3_outName
|
||||||
|
cat("\nWriting output file:", mdf3_outName)
|
||||||
|
write.csv(merged_df3, mdf3_outName, row.names = F)
|
||||||
|
cat("\nnrows:" , nrow(merged_df3)
|
||||||
|
, "\nncols:" , ncol(merged_df3))
|
||||||
|
#=========================================================
|
||||||
|
merged_df2$active_aa_pos = ifelse(merged_df2$position %in% active_aa_pos , 1, 0)
|
||||||
|
table(merged_df2$active_aa_pos)
|
||||||
|
mdf2_outName = paste0(outdir, "/", tolower(gene), "_merged_df2.csv")
|
||||||
|
mdf2_outName
|
||||||
|
cat("\nWriting output file:", mdf2_outName)
|
||||||
|
write.csv(merged_df2, mdf2_outName, row.names = F)
|
||||||
|
cat("\nnrows:" , nrow(merged_df2)
|
||||||
|
, "\nncols:" , ncol(merged_df2))
|
||||||
|
###################################################
|
||||||
|
#config_gene = c("alr", "embb", "gid", "katg", "pnca", "rpob")
|
||||||
|
#config_gene = c("alr", "embb")
|
||||||
|
#sapply(config_gene, function(x) source(paste0("~/git/LSHTM_analysis/config/", x, ".R")), USE.NAMES = F)
|
||||||
|
#----------------------------------------------------
|
||||||
|
#source("~/git/LSHTM_analysis/config/alr.R")
|
||||||
|
#source("~/git/LSHTM_analysis/config/embb.R")
|
||||||
|
#source("~/git/LSHTM_analysis/config/gid.R")
|
||||||
|
#source("~/git/LSHTM_analysis/config/katg.R")
|
||||||
|
#source("~/git/LSHTM_analysis/config/pnca.R")
|
||||||
|
source("~/git/LSHTM_analysis/config/rpob.R")
|
||||||
|
#----------------------------------------------------
|
||||||
|
source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
|
||||||
|
######################################################
|
||||||
|
gene; drug
|
||||||
|
merged_df3$active_aa_pos = ifelse(merged_df3$position %in% active_aa_pos , 1, 0)
|
||||||
|
table(merged_df3$active_aa_pos)
|
||||||
|
mdf3_outName = paste0(outdir, "/", tolower(gene), "_merged_df3.csv")
|
||||||
|
mdf3_outName
|
||||||
|
cat("\nWriting output file:", mdf3_outName)
|
||||||
|
write.csv(merged_df3, mdf3_outName, row.names = F)
|
||||||
|
cat("\nnrows:" , nrow(merged_df3)
|
||||||
|
, "\nncols:" , ncol(merged_df3))
|
||||||
|
#=========================================================
|
||||||
|
merged_df2$active_aa_pos = ifelse(merged_df2$position %in% active_aa_pos , 1, 0)
|
||||||
|
table(merged_df2$active_aa_pos)
|
||||||
|
mdf2_outName = paste0(outdir, "/", tolower(gene), "_merged_df2.csv")
|
||||||
|
mdf2_outName
|
||||||
|
cat("\nWriting output file:", mdf2_outName)
|
||||||
|
write.csv(merged_df2, mdf2_outName, row.names = F)
|
||||||
|
cat("\nnrows:" , nrow(merged_df2)
|
||||||
|
, "\nncols:" , ncol(merged_df2))
|
65
ml_data/del/ml_data_v1.R
Normal file
65
ml_data/del/ml_data_v1.R
Normal file
|
@ -0,0 +1,65 @@
|
||||||
|
#!/usr/bin/env Rscript
|
||||||
|
|
||||||
|
# target var options:
|
||||||
|
# drtype: MDR, etc, full data
|
||||||
|
# pyrazinamide: 0 and 1, loss of data
|
||||||
|
# mutation_info_labels: DM and OM, full data
|
||||||
|
##################################################
|
||||||
|
# ONLY ONCE
|
||||||
|
#source("~/git/LSHTM_analysis/config/pnca.R")
|
||||||
|
#source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
|
||||||
|
#write.csv(colnames(merged_df3), "data_colnames.csv")
|
||||||
|
#---------------------------------------------------
|
||||||
|
colnames_order_pnca = read.csv("~/git/ML_AI_training/ml_data/colnames_order.csv"
|
||||||
|
, header = F)
|
||||||
|
# reorder columns by name
|
||||||
|
colnames_order_pnca <- colnames_order_pnca$V1
|
||||||
|
###################################################
|
||||||
|
#config_gene = c("alr", "embb", "gid", "katg", "pnca", "rpob")
|
||||||
|
#config_gene = c("alr", "embb")
|
||||||
|
#sapply(config_gene, function(x) source(paste0("~/git/LSHTM_analysis/config/", x, ".R")), USE.NAMES = F)
|
||||||
|
|
||||||
|
#source("~/git/LSHTM_analysis/config/alr.R")
|
||||||
|
# FIXME: "cycloserine" "mcsm_ppi2_affinity" "mcsm_ppi2_scaled" "mcsm_ppi2_outcome" "interface_dist"
|
||||||
|
# source("~/git/LSHTM_analysis/config/embb.R")
|
||||||
|
# source("~/git/LSHTM_analysis/config/gid.R")
|
||||||
|
# source("~/git/LSHTM_analysis/config/katg.R")
|
||||||
|
source("~/git/LSHTM_analysis/config/pnca.R")
|
||||||
|
# source("~/git/LSHTM_analysis/config/rpob.R")
|
||||||
|
##################################################
|
||||||
|
source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
|
||||||
|
|
||||||
|
######################################################
|
||||||
|
# Write merged_df3 and merged_df2 to CSV with their columns rearranged to
# follow `colnames_order`. A frame is only written when `colnames_order`
# has exactly as many entries as the frame has columns and every entry is a
# column of that frame; otherwise the unmatched column names are printed.
# NOTE(review): this script reads the column order into
# `colnames_order_pnca`, but the checks below use `colnames_order` --
# confirm that `colnames_order` is defined by one of the sourced scripts.

#------------
# merged_df3
#------------
mdf3_outName = paste0(outdir, "/", tolower(gene), "_merged_df3.csv")
mdf3_outName

if( (length(colnames_order) == ncol(merged_df3)) && (all(colnames_order %in%colnames(merged_df3))) ){
  cat("\nProceeding with rearranging columns in merged_df3")
  merged_df3_o = merged_df3[ , colnames_order]
  cat("\nWriting output file:", mdf3_outName)
  write.csv(merged_df3_o, mdf3_outName, row.names = F)
  cat("\nnrows:" , nrow(merged_df3_o)
      , "\nncols:" , ncol(merged_df3_o))
}else{
  cat("length mismatch:"
      , colnames(merged_df3)[!colnames(merged_df3)%in%(colnames_order )]
  )
}

#------------
# merged_df2
#------------
mdf2_outName = paste0(outdir, "/", tolower(gene), "_merged_df2.csv")
mdf2_outName

if( (length(colnames_order) == ncol(merged_df2)) && (all(colnames_order %in%colnames(merged_df2))) ){
  # The message previously said merged_df3 here (copy-paste slip).
  cat("\nProceeding with rearranging columns in merged_df2")
  merged_df2_o = merged_df2[ , colnames_order]
  cat("\nWriting output file:", mdf2_outName)
  write.csv(merged_df2_o, mdf2_outName, row.names = F)
  cat("\nnrows:" , nrow(merged_df2_o)
      , "\nncols:" , ncol(merged_df2_o))
}else{
  # Report the mismatch the same way as for merged_df3 instead of skipping
  # the file silently.
  cat("length mismatch:"
      , colnames(merged_df2)[!colnames(merged_df2)%in%(colnames_order )]
  )
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
171
my_data6.py
Normal file
171
my_data6.py
Normal file
|
@ -0,0 +1,171 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Fri Mar 4 14:54:30 2022
|
||||||
|
|
||||||
|
@author: tanu
|
||||||
|
"""
|
||||||
|
import os, sys
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
from sklearn.naive_bayes import BernoulliNB
|
||||||
|
from sklearn.neighbors import KNeighborsClassifier
|
||||||
|
from sklearn.svm import SVC
|
||||||
|
from sklearn.tree import DecisionTreeClassifier
|
||||||
|
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
|
||||||
|
from sklearn.neural_network import MLPClassifier
|
||||||
|
from sklearn.pipeline import Pipeline
|
||||||
|
from xgboost import XGBClassifier
|
||||||
|
from sklearn.preprocessing import StandardScaler
|
||||||
|
from sklearn.preprocessing import MinMaxScaler
|
||||||
|
from sklearn.model_selection import train_test_split
|
||||||
|
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, f1_score
|
||||||
|
#%%
|
||||||
|
homedir = os.path.expanduser("~")
|
||||||
|
os.chdir(homedir + "/git/ML_AI_training/")
|
||||||
|
|
||||||
|
# my function
|
||||||
|
from MultClassPipe import MultClassPipeline
|
||||||
|
|
||||||
|
#gene = 'pncA'
|
||||||
|
#drug = 'pyrazinamide'
|
||||||
|
|
||||||
|
#==============
|
||||||
|
# directories
|
||||||
|
#==============
|
||||||
|
datadir = homedir + '/git/Data/'
|
||||||
|
indir = datadir + drug + '/input/'
|
||||||
|
outdir = datadir + drug + '/output/'
|
||||||
|
|
||||||
|
#=======
|
||||||
|
# input
|
||||||
|
#=======
|
||||||
|
infile_ml1 = outdir + gene.lower() + '_merged_df3.csv'
|
||||||
|
#infile_ml2 = outdir + gene.lower() + '_merged_df2.csv'
|
||||||
|
|
||||||
|
my_df = pd.read_csv(infile_ml1)
|
||||||
|
my_df.dtypes
|
||||||
|
my_df_cols = my_df.columns
|
||||||
|
|
||||||
|
geneL_basic = ['pnca']
|
||||||
|
geneL_na = ['gid']
|
||||||
|
geneL_na_ppi2 = ['rpob']
|
||||||
|
geneL_ppi2 = ['alr', 'embb', 'katg']
|
||||||
|
#%% get cols
|
||||||
|
mycols = my_df.columns
|
||||||
|
|
||||||
|
#%%============================================================================
|
||||||
|
# GET Y
|
||||||
|
|
||||||
|
# Target1: mutation_info_labels
|
||||||
|
dm_om_map = {'DM': 1, 'OM': 0}
|
||||||
|
target1 = my_df['mutation_info_labels'].map(dm_om_map)
|
||||||
|
|
||||||
|
# Target2: drug
|
||||||
|
drug_labels = drug + '_labels'
|
||||||
|
drug_labels
|
||||||
|
my_df[drug_labels] = my_df[drug].map({1: 'resistant', 0: 'sensitive'})
|
||||||
|
my_df[drug_labels].value_counts()
|
||||||
|
my_df[drug_labels] = my_df[drug_labels].fillna('unknown')
|
||||||
|
my_df[drug_labels].value_counts()
|
||||||
|
target2 = my_df[drug_labels]
|
||||||
|
|
||||||
|
# Target3: drtype
|
||||||
|
drtype_labels = 'drtype_labels'
|
||||||
|
my_df[drtype_labels] = my_df['drtype'].map({'Sensitive' : 0
|
||||||
|
, 'Other' : 0
|
||||||
|
, 'Pre-MDR' : 1
|
||||||
|
, 'MDR' : 1
|
||||||
|
, 'Pre-XDR' : 1
|
||||||
|
, 'XDR' : 1})
|
||||||
|
# target3 = my_df['drtype']
|
||||||
|
target3 = my_df[drtype_labels]
|
||||||
|
|
||||||
|
# sanity checks
|
||||||
|
target1.value_counts()
|
||||||
|
my_df['mutation_info_labels'].value_counts()
|
||||||
|
|
||||||
|
target2.value_counts()
|
||||||
|
my_df[drug_labels].value_counts()
|
||||||
|
|
||||||
|
target3.value_counts()
|
||||||
|
my_df['drtype'].value_counts()
|
||||||
|
|
||||||
|
#%%
|
||||||
|
# GET X
|
||||||
|
common_cols_stabilty = ['ligand_distance'
|
||||||
|
, 'ligand_affinity_change'
|
||||||
|
, 'duet_stability_change'
|
||||||
|
, 'ddg_foldx'
|
||||||
|
, 'deepddg'
|
||||||
|
, 'ddg_dynamut2']
|
||||||
|
|
||||||
|
# Build stability columns ~ gene
# The four gene lists do not overlap, so an elif chain selects the same
# columns as the original independent `if`s; the final `else` turns an
# unlisted gene into an explicit error instead of leaving
# `x_stability_cols` undefined.
gene_lc = gene.lower()

if gene_lc in geneL_basic:
    x_stability_cols = common_cols_stabilty

elif gene_lc in geneL_ppi2:
    x_stability_cols = common_cols_stabilty + ['mcsm_ppi2_affinity'
                                               , 'interface_dist']

elif gene_lc in geneL_na:
    x_stability_cols = common_cols_stabilty + ['mcsm_na_affinity']

elif gene_lc in geneL_na_ppi2:
    x_stability_cols = common_cols_stabilty + ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist']
    #D1148 get rid of
    # NOTE(review): the source indentation was lost; the row drop is assumed
    # to belong to this branch because it reads 'mcsm_na_affinity' -- confirm.
    na_index = my_df['mutationinformation'].index[my_df['mcsm_na_affinity'].apply(np.isnan)]
    my_df = my_df.drop(index=na_index)
    # The targets were derived from my_df before these rows were removed;
    # drop the same rows from them so that X and y keep the same length in
    # train_test_split.
    target1 = target1.drop(index=na_index)
    target2 = target2.drop(index=na_index)
    target3 = target3.drop(index=na_index)

else:
    raise ValueError("No stability columns defined for gene: " + str(gene))
|
||||||
|
|
||||||
|
# Feature group "str" (presumably structural properties - confirm).
X_strF = [
    'asa',
    'rsa',
    'kd_values',
    'rd_values',
]

# Feature group "evol" (presumably conservation / effect scores - confirm).
X_evolF = [
    'consurf_score',
    'snap2_score',
    'snap2_accuracy_pc',
]

# TODO: ADD ED values
# Problematic due to NA, so the genomic group is not defined yet:
# X_genomicF = ['af', 'or_mychisq', 'or_logistic', 'or_fisher', 'pval_fisher']
|
||||||
|
|
||||||
|
#%% try combinations
# One frame per feature group ...
X_vars1 = my_df.loc[:, x_stability_cols]
X_vars2 = my_df.loc[:, X_strF]
X_vars3 = my_df.loc[:, X_evolF]

# ... and one per combination of groups.
X_vars5 = my_df.loc[:, [*x_stability_cols, *X_strF]]
X_vars6 = my_df.loc[:, [*x_stability_cols, *X_evolF]]
X_vars8 = my_df.loc[:, [*X_strF, *X_evolF]]
X_vars11 = my_df.loc[:, [*x_stability_cols, *X_strF, *X_evolF]]

# The numbering gaps (4, 7, 9, 10, 12) are the sets that would include the
# genomic group, which is disabled; X_vars4 would also need one-hot
# encoding after fillna('unknown'):
#X_vars4 = my_df[X_genomicF]
#X_vars4 = X_vars4.fillna('unknown') # need one hot encoder!
#X_vars7 = my_df[x_stability_cols + X_genomicF]
#X_vars9 = my_df[X_strF + X_genomicF]
#X_vars10 = my_df[X_evolF + X_genomicF]
#X_vars12 = my_df[x_stability_cols + X_strF + X_evolF + X_genomicF]
|
||||||
|
|
||||||
|
#%%
X_vars1.shape[1]

# TODO: stratified cross validate
# Train-test Split: 67% train / 33% test, fixed seed for repeatability.

# TARGET1: stability features against the DM/OM target
X_train, X_test, y_train, y_test = train_test_split(
    X_vars1, target1, test_size=0.33, random_state=42)
MultClassPipeline(X_train, X_test, y_train, y_test)

# TARGET3: stability + "str" features against the binary drtype target
X_train3, X_test3, y_train3, y_test3 = train_test_split(
    X_vars5, target3, test_size=0.33, random_state=42)
MultClassPipeline(X_train3, X_test3, y_train3, y_test3)
|
156
my_data_gid.py
Normal file
156
my_data_gid.py
Normal file
|
@ -0,0 +1,156 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Thu Mar 3 17:08:18 2022
|
||||||
|
|
||||||
|
@author: tanu
|
||||||
|
"""
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
from sklearn.naive_bayes import BernoulliNB
|
||||||
|
from sklearn.neighbors import KNeighborsClassifier
|
||||||
|
from sklearn.svm import SVC
|
||||||
|
from sklearn.tree import DecisionTreeClassifier
|
||||||
|
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
|
||||||
|
from sklearn.neural_network import MLPClassifier
|
||||||
|
from sklearn.pipeline import Pipeline
|
||||||
|
from xgboost import XGBClassifier
|
||||||
|
from sklearn.preprocessing import StandardScaler
|
||||||
|
from sklearn.model_selection import train_test_split
|
||||||
|
import os
|
||||||
|
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, f1_score
|
||||||
|
import pandas as pd
|
||||||
|
#%%
# Work from the test-data directory so the CSV can be read by bare name.
homedir = os.path.expanduser("~")
os.chdir(homedir + "/git/ML_AI_training/test_data")

# this needs to be merged_df2 or merged_df3?
#gene 'pncA'
drug = 'pyrazinamide'

my_df = pd.read_csv("pnca_merged_df3.csv")
my_df.dtypes
my_df_cols = my_df.columns  # column index as read, before 'resistance' is added

#%%============================================================================
# GET Y
# Y = my_df.loc[:,drug] #has NA
# Use the mutation class instead: DM -> 1, OM -> 0.
dm_om_map = {'DM': 1, 'OM': 0}
my_df = my_df.assign(resistance=my_df['mutation_info_labels'].map(dm_om_map))

# sanity check: encoded counts next to the raw label counts
my_df['resistance'].value_counts()
my_df['mutation_info_labels'].value_counts()

Y = my_df['resistance']
|
||||||
|
|
||||||
|
# GET X
cols = my_df.columns

X_stability = my_df[[
    'ligand_distance',
    'ligand_affinity_change',
    'duet_stability_change',
    'ddg_foldx',
    'deepddg',
    'ddg_dynamut2',
]]

X_evol = my_df[[
    'consurf_score',
    'snap2_score',
    'snap2_accuracy_pc',
]]

X_str = my_df[[
    'asa',
    'rsa',
    'kd_values',
    'rd_values',
]]

#%% try combinations
# All candidate feature sets, addressable by name. Previously X_vars was
# reassigned seven times in a row and only the last assignment had any
# effect; keeping the candidates in a dict makes the active choice explicit.
feature_sets = {
    'stability': X_stability,
    'evol': X_evol,
    'str': X_str,
    'stability+evol+str': pd.concat([X_stability, X_evol, X_str], axis=1),
    'stability+evol': pd.concat([X_stability, X_evol], axis=1),
    'stability+str': pd.concat([X_stability, X_str], axis=1),
    'evol+str': pd.concat([X_evol, X_str], axis=1),
}

# 'evol+str' is the set the former chain of reassignments left in effect;
# change the key here to try another combination.
X_vars = feature_sets['evol+str']
|
||||||
|
|
||||||
|
#%%
X_vars.shape[1]

# TODO: stratified cross validate
# Train-test Split: 67% train / 33% test.
# rs carries the seed; it is also read by modelPipeline for its estimators.
rs = {'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(
    X_vars, Y, test_size=0.33, **rs)
|
||||||
|
|
||||||
|
# Classification - Model Pipeline
|
||||||
|
def modelPipeline(X_train, X_test, y_train, y_test):
    """Fit nine classifiers on the training split and score them on the test split.

    Each classifier is wrapped in a Pipeline that applies StandardScaler
    first, fitted on (X_train, y_train) and used to predict X_test.

    Parameters
    ----------
    X_train, X_test
        Feature matrices for fitting and for evaluation.
    y_train, y_test
        Target labels matching X_train and X_test. The metric functions are
        called with their default arguments.

    Returns
    -------
    pipelines : list
        The fitted Pipeline objects, in the order the models are listed.
    scores_df : pandas.DataFrame
        One row per model with the columns 'Model', 'F1_Score', 'Precision',
        'Recall', 'Accuracy' and 'ROC_AUC'.

    Notes
    -----
    Estimators that accept a seed are built with the module-level ``rs``
    keyword dict. ROC_AUC is computed from the predicted class labels
    returned by ``predict``, not from scores or probabilities.
    """
    clfs = [
        ('Logistic Regression', LogisticRegression(**rs)),
        ('Naive Bayes', BernoulliNB()),
        ('K-Nearest Neighbors', KNeighborsClassifier()),
        ('SVM', SVC(**rs)),
        ('MLP', MLPClassifier(max_iter=500, **rs)),
        ('Decision Tree', DecisionTreeClassifier(**rs)),
        ('Extra Trees', ExtraTreesClassifier(**rs)),
        ('Random Forest', RandomForestClassifier(**rs)),
        ('XGBoost', XGBClassifier(**rs, verbosity=0)),
    ]

    score_columns = ['Model', 'F1_Score', 'Precision', 'Recall',
                     'Accuracy', 'ROC_AUC']
    pipelines = []
    score_rows = []

    for clf_name, clf in clfs:
        pipeline = Pipeline(steps=[
            ('scaler', StandardScaler()),
            ('classifier', clf),
        ])
        pipeline.fit(X_train, y_train)

        # Model predictions
        y_pred = pipeline.predict(X_test)

        pipelines.append(pipeline)
        score_rows.append({
            'Model': clf_name,
            'F1_Score': f1_score(y_test, y_pred),
            'Precision': precision_score(y_test, y_pred),
            'Recall': recall_score(y_test, y_pred),
            'Accuracy': accuracy_score(y_test, y_pred),
            'ROC_AUC': roc_auc_score(y_test, y_pred),
        })

    # Build the table once from the collected rows. DataFrame.append, used
    # here before, was removed in pandas 2.0 and copied the whole frame on
    # every iteration.
    scores_df = pd.DataFrame(score_rows, columns=score_columns)

    return pipelines, scores_df
|
||||||
|
|
||||||
|
|
||||||
|
# Run the model comparison on the train/test split; the returned
# (pipelines, scores_df) tuple is not bound to a name here.
modelPipeline(X_train, X_test, y_train, y_test)
|
81
my_data_target_counts.py
Normal file
81
my_data_target_counts.py
Normal file
|
@ -0,0 +1,81 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Thu Mar 3 17:08:18 2022
|
||||||
|
|
||||||
|
@author: tanu
|
||||||
|
"""
|
||||||
|
#%% load packages
|
||||||
|
import sys, os
|
||||||
|
import pandas as pd
|
||||||
|
from pandas import DataFrame
|
||||||
|
import numpy as np
|
||||||
|
import argparse
|
||||||
|
from functools import reduce
|
||||||
|
#%%
homedir = os.path.expanduser("~")
os.chdir(homedir + "/git/ML_AI_training/test_data")

# NOTE(review): `gene` and `drug` are read below but only appear here as
# commented-out placeholders; presumably they are set in the session or by
# the caller before this script runs - confirm.
#gene = ''
#drug = ''

#==============
# directories
#==============
datadir = f'{homedir}/git/Data/'
indir = f'{datadir}{drug}/input/'
outdir = f'{datadir}{drug}/output/'

# gene_baiscL = ['pnca']
# geneL_naL = ['gid', 'rpob']
# geneL_ppi2L = ['alr', 'embb', 'katg', 'rpob']

#=======
# input
#=======
# Merged per-gene table, read from the drug's output directory.
infile_ml1 = f'{outdir}{gene.lower()}_merged_df3.csv'
#infile_ml2 = outdir + gene.lower() + '_merged_df2.csv'

my_df = pd.read_csv(infile_ml1)

my_df.dtypes
my_df_cols = my_df.columns
|
||||||
|
|
||||||
|
#%%============================================================================
# GET Y
drug_labels = drug + '_labels'
drug_labels

# Raw copy of the drug column first (inspection only; overwritten just below).
my_df[drug_labels] = my_df[drug]
my_df[drug_labels].value_counts()

# Readable labels; values that are neither 1 nor 0 come out of map() as
# missing and are then tagged 'unknown'.
my_df[drug_labels] = my_df[drug].map({0: 'sensitive', 1: 'resistant'})
my_df[drug_labels].value_counts()
my_df.loc[my_df[drug_labels].isna(), drug_labels] = 'unknown'
my_df[drug_labels].value_counts()

# Counts: non-null mutation entries, then the value counts of each target.
mutC = my_df[['mutationinformation']].count()
target1C = my_df['mutation_info_labels'].value_counts()
target2C = my_df[drug_labels].value_counts()
#target2C.index = target2C.index.to_series().map({1: 'resistant', 0: 'sensitive'})
target3C = my_df['drtype'].value_counts()

# Stack all counts into a single Series.
count_parts = [mutC, target1C, target2C, target3C]
targetsC = pd.concat(count_parts)
targetsC

# targetsC2 = pd.concat([mutC, target1C, target2C
#                        #, target3C
#                        ], axis = 1)
# targetsC2
|
||||||
|
|
||||||
|
#%% try combinations
|
||||||
|
# X_vars = X_stability
|
||||||
|
# X_vars = X_evol
|
||||||
|
# X_vars = X_str
|
||||||
|
|
||||||
|
# X_vars = pd.concat([X_stability, X_evol, X_str], axis = 1)
|
||||||
|
# X_vars = pd.concat([X_stability, X_evol], axis = 1)
|
||||||
|
# X_vars = pd.concat([X_stability, X_str], axis = 1)
|
||||||
|
# X_vars = pd.concat([X_evol, X_str], axis = 1)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue