# ---------------------------------------------------------------------------
# MultClassPipe.py
# ---------------------------------------------------------------------------
# Shared keyword arguments so every seedable estimator uses the same seed.
rs = {'random_state': 42}


# Multiple Classification - Model Pipeline
def MultClassPipeline(X_train, X_test, y_train, y_test):
    """Fit a fixed panel of classifiers and score each one on the test split.

    Every classifier is wrapped in a ``Pipeline`` whose first step is a
    ``MinMaxScaler``; the pipeline is fitted on ``(X_train, y_train)`` and
    used to predict labels for ``X_test``.

    Parameters
    ----------
    X_train, X_test
        Feature tables for fitting and for evaluation.
    y_train, y_test
        Target labels matching ``X_train`` / ``X_test``.

    Returns
    -------
    tuple
        ``(pipelines, scores_df)``: the fitted pipelines in the order they
        were trained, and a DataFrame with one row per classifier and the
        columns ``Model``, ``F1_Score``, ``Precision``, ``Recall``,
        ``Accuracy`` and ``ROC_AUC``.

    Notes
    -----
    * The metric functions are called with their scikit-learn defaults, which
      are the binary variants of F1/precision/recall.
      NOTE(review): despite the name, targets with more than two classes
      would presumably need an explicit ``average=`` -- confirm intent.
    * ``ROC_AUC`` is computed from the predicted labels, not from
      probabilities or decision scores.
    """
    clfs = [
        ('Logistic Regression', LogisticRegression(**rs)),
        ('Naive Bayes', BernoulliNB()),
        ('K-Nearest Neighbors', KNeighborsClassifier()),
        ('SVM', SVC(**rs)),
        ('MLP', MLPClassifier(max_iter=500, **rs)),
        ('Decision Tree', DecisionTreeClassifier(**rs)),
        ('Extra Trees', ExtraTreesClassifier(**rs)),
        ('Random Forest', RandomForestClassifier(**rs)),
        ('XGBoost', XGBClassifier(**rs, verbosity=0)),
    ]
    score_cols = ['Model', 'F1_Score', 'Precision', 'Recall', 'Accuracy', 'ROC_AUC']

    pipelines = []
    score_rows = []

    for clf_name, clf in clfs:
        pipeline = Pipeline(steps=[
            ('scaler', MinMaxScaler()),
            ('classifier', clf),
        ])
        pipeline.fit(X_train, y_train)

        # Model predictions
        y_pred = pipeline.predict(X_test)

        pipelines.append(pipeline)
        score_rows.append({
            'Model': clf_name,
            'F1_Score': f1_score(y_test, y_pred),
            'Precision': precision_score(y_test, y_pred),
            'Recall': recall_score(y_test, y_pred),
            'Accuracy': accuracy_score(y_test, y_pred),
            'ROC_AUC': roc_auc_score(y_test, y_pred),
        })

    # Build the table once from the collected rows: DataFrame.append was
    # removed in pandas 2.0, and growing a frame row by row re-copies it on
    # every iteration.
    scores_df = pd.DataFrame(score_rows, columns=score_cols)

    return pipelines, scores_df


# ---------------------------------------------------------------------------
# X_categories (a separate file in the patch): names of categorical columns
# ---------------------------------------------------------------------------
X_categ_str = [
    'ss_class',
    'wt_prop_water',
    'mut_prop_water',
    'wt_prop_polarity',
    'mut_prop_polarity',
    'wt_calcprop',
    'mut_calcprop',
    'active_aa_pos',
]

# only valid if we use merged_df2
X_categ_str_lin = X_categ_str + ['lineage_labels']

# Each name needs its own comma: adjacent string literals are concatenated by
# Python, which would collapse this list into a single long string.
X_categ_foldx = [
    'contacts',
    'electro_rr',
    'electro_mm',
    'electro_sm',
    'electro_ss',
    'disulfide_rr',
    'disulfide_mm',
    'disulfide_sm',
    'disulfide_ss',
    'hbonds_rr',
    'hbonds_mm',
    'hbonds_sm',
    'hbonds_ss',
    'partcov_rr',
    'partcov_mm',
    'partcov_sm',
    'partcov_ss',
    'vdwclashes_rr',
    'vdwclashes_mm',
    'vdwclashes_sm',
    'vdwclashes_ss',
    'volumetric_rr',
    'volumetric_mm',
    'volumetric_sm',
    'volumetric_ss',
]
source("~/git/LSHTM_analysis/config/katg.R") +#source("~/git/LSHTM_analysis/config/pnca.R") +# source("~/git/LSHTM_analysis/config/rpob.R") +################################################## +source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") +###################################################### +mdf3_outName = paste0(outdir, "/", tolower(gene), "_merged_df3.csv") +mdf3_outName +if( (length(colnames_order) == ncol(merged_df3)) && (all(colnames_order %in%colnames(merged_df3))) ){ +cat("\nProceeding with rearranging columns in merged_df3") +merged_df3_o = merged_df3[ , colnames_order] +cat("\nWriting output file:", mdf3_outName) +write.csv(merged_df3_o, mdf3_outName, row.names = F) +cat("\nnrows:" , nrow(merged_df3_o) +, "\nncols:" , ncol(merged_df3_o)) +}else +cat("length mismatch:" +, colnames(merged_df3)[!colnames(merged_df3)%in%(colnames_order )] +) +mdf3_outName = paste0(outdir, "/", tolower(gene), "_merged_df3.csv") +mdf3_outName +cat("\nWriting output file:", mdf3_outName) +write.csv(merged_df3, mdf3_outName, row.names = F) +cat("\nnrows:" , nrow(merged_df3) +, "\nncols:" , ncol(merged_df3)) +#========================================================= +mdf2_outName = paste0(outdir, "/", tolower(gene), "_merged_df2.csv") +mdf2_outName +cat("\nWriting output file:", mdf2_outName) +write.csv(merged_df2, mdf2_outName, row.names = F) +cat("\nnrows:" , nrow(merged_df2) +, "\nncols:" , ncol(merged_df2)) +################################################### +#config_gene = c("alr", "embb", "gid", "katg", "pnca", "rpob") +#config_gene = c("alr", "embb") +#sapply(config_gene, function(x) source(paste0("~/git/LSHTM_analysis/config/", x, ".R")), USE.NAMES = F) +#---------------------------------------------------- +# source("~/git/LSHTM_analysis/config/alr.R") +source("~/git/LSHTM_analysis/config/embb.R") +# source("~/git/LSHTM_analysis/config/gid.R") +# source("~/git/LSHTM_analysis/config/katg.R") +# source("~/git/LSHTM_analysis/config/pnca.R") +# 
source("~/git/LSHTM_analysis/config/rpob.R") +#---------------------------------------------------- +source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") +active_aa_pos +merged_df3['position']%in%active_aa_pos +merged_df3$position%in%active_aa_pos +merged_df3['active_aa_pos'] <- merged_df3['position'] +merged_df3['active_aa_pos'] +identical(merged_df3['active_aa_pos'] , merged_df3['position']) +(merged_df3['active_aa_pos'] == merged_df3['position']) +all(merged_df3['active_aa_pos'] == merged_df3['position']) +merged_df3['active_aa_pos'] <- merged_df3['position'] +if (merged_df3$position%in%active_aa_pos){ +merged_df3['active_aa_pos'] = 1 +}else{ +merged_df3['active_aa_pos'] = 0 +} +merged_df3['active_aa_pos'] +table(merged_df3$active_aa_pos) +merged_df3['active_aa_pos'] <- merged_df3['position'] +merged_df3$active_aa_pos <- merged_df3$osition +merged_df3$active_aa_pos +merged_df3$active_aa_pos <- merged_df3$position +merged_df3$active_aa_pos +merged_df3$postion%in%active_aa_pos +merged_df3$postion%in%active_aa_pos +merged_df3$postion +erged_df3$position%in%active_aa_pos +merged_df3$position +active_aa_pos +which(merged_df3$position%in%active_aa_pos) +c =which(merged_df3$position%in%active_aa_pos) +merged_df3$position[c] +active_aa_pos +merged_df3$position%in%active_aa_pos +merged_df3$active_aa_pos <- merged_df3$position +merged_df3$active_aa_pos %in% active_aa_pos +ifelse(merged_df3$active_aa_pos %in% active_aa_pos , "1", "0") +table(merged_df3$active_aa_po) +str(merged_df3$active_aa_po) +str(merged_df3$active_aa_pos) +#TODO later! 
+merged_df3$active_aa_pos <- merged_df3$position +merged_df3$active_aa_pos +ifelse(merged_df3$active_aa_pos %in% active_aa_pos , 1, 0) +str(merged_df3$active_aa_pos) +#str(merged_df3$active_aa_pos) +table(merged_df3$active_aa_pos) +#str(merged_df3$active_aa_pos) +foo = merged_df3$active_aa_pos +merged_df3$active_aa_pos +ifelse(merged_df3$active_aa_pos %in% active_aa_pos , 1, 0) +merged_df3$active_aa_pos = ifelse(merged_df3$position %in% active_aa_pos , 1, 0) +#str(merged_df3$active_aa_pos) +foo = merged_df3$active_aa_pos +#str(merged_df3$active_aa_pos) +table(merged_df3$active_aa_pos) +length(active_aa_pos) +which(merged_df3$position%in%active_aa_pos) +which(merged_df3$position%in%active_aa_pos) +which(!merged_df3$position%in%active_aa_pos) +which(merged_df3$position%in%active_aa_pos) +active_aa_pos) +active_aa_pos +merged_df3$position[209,] +merged_df3[209,] +merged_df3$position[209] +merged_df3[209] +merged_df3[209,] +active_aa_pos +merged_df3$position[!merged_df3$position%in%active_aa_pos] +merged_df3$position[!active_aa_pos%in%merged_df3$position] +active_aa_pos +active_aa_pos[!active_aa_pos%in%merged_df3$position] +#str(merged_df3$active_aa_pos) +table(merged_df3$active_aa_pos) +################################################### +#config_gene = c("alr", "embb", "gid", "katg", "pnca", "rpob") +#config_gene = c("alr", "embb") +#sapply(config_gene, function(x) source(paste0("~/git/LSHTM_analysis/config/", x, ".R")), USE.NAMES = F) +#---------------------------------------------------- +# source("~/git/LSHTM_analysis/config/alr.R") +source("~/git/LSHTM_analysis/config/embb.R") +# source("~/git/LSHTM_analysis/config/gid.R") +# source("~/git/LSHTM_analysis/config/katg.R") +# source("~/git/LSHTM_analysis/config/pnca.R") +# source("~/git/LSHTM_analysis/config/rpob.R") +#---------------------------------------------------- +source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") +merged_df3$active_aa_pos = ifelse(merged_df3$position %in% active_aa_pos , 1, 
0) +#str(merged_df3$active_aa_pos) +table(merged_df3$active_aa_pos) +cat("\nWriting output file:", mdf3_outName) +write.csv(merged_df3, mdf3_outName, row.names = F) +cat("\nnrows:" , nrow(merged_df3) +, "\nncols:" , ncol(merged_df3)) +merged_df2$active_aa_pos = ifelse(merged_df2$position %in% active_aa_pos , 1, 0) +table(merged_df3$active_aa_pos) +table(merged_df2$active_aa_pos) +################################################### +#config_gene = c("alr", "embb", "gid", "katg", "pnca", "rpob") +#config_gene = c("alr", "embb") +#sapply(config_gene, function(x) source(paste0("~/git/LSHTM_analysis/config/", x, ".R")), USE.NAMES = F) +#---------------------------------------------------- +# source("~/git/LSHTM_analysis/config/alr.R") +source("~/git/LSHTM_analysis/config/embb.R") +# source("~/git/LSHTM_analysis/config/gid.R") +# source("~/git/LSHTM_analysis/config/katg.R") +# source("~/git/LSHTM_analysis/config/pnca.R") +# source("~/git/LSHTM_analysis/config/rpob.R") +#---------------------------------------------------- +source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") +###################################################### +merged_df3$active_aa_pos = ifelse(merged_df3$position %in% active_aa_pos , 1, 0) +table(merged_df3$active_aa_pos) +mdf3_outName = paste0(outdir, "/", tolower(gene), "_merged_df3.csv") +mdf3_outName +cat("\nWriting output file:", mdf3_outName) +write.csv(merged_df3, mdf3_outName, row.names = F) +cat("\nnrows:" , nrow(merged_df3) +, "\nncols:" , ncol(merged_df3)) +#========================================================= +merged_df2$active_aa_pos = ifelse(merged_df2$position %in% active_aa_pos , 1, 0) +table(merged_df2$active_aa_pos) +mdf2_outName = paste0(outdir, "/", tolower(gene), "_merged_df2.csv") +mdf2_outName +cat("\nWriting output file:", mdf2_outName) +write.csv(merged_df2, mdf2_outName, row.names = F) +cat("\nnrows:" , nrow(merged_df2) +, "\nncols:" , ncol(merged_df2)) +################################################### 
+#config_gene = c("alr", "embb", "gid", "katg", "pnca", "rpob") +#config_gene = c("alr", "embb") +#sapply(config_gene, function(x) source(paste0("~/git/LSHTM_analysis/config/", x, ".R")), USE.NAMES = F) +#---------------------------------------------------- +source("~/git/LSHTM_analysis/config/alr.R") +# source("~/git/LSHTM_analysis/config/embb.R") +# source("~/git/LSHTM_analysis/config/gid.R") +# source("~/git/LSHTM_analysis/config/katg.R") +# source("~/git/LSHTM_analysis/config/pnca.R") +# source("~/git/LSHTM_analysis/config/rpob.R") +#---------------------------------------------------- +source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") +###################################################### +merged_df3$active_aa_pos = ifelse(merged_df3$position %in% active_aa_pos , 1, 0) +table(merged_df3$active_aa_pos) +mdf3_outName = paste0(outdir, "/", tolower(gene), "_merged_df3.csv") +mdf3_outName +cat("\nWriting output file:", mdf3_outName) +write.csv(merged_df3, mdf3_outName, row.names = F) +cat("\nnrows:" , nrow(merged_df3) +, "\nncols:" , ncol(merged_df3)) +#========================================================= +merged_df2$active_aa_pos = ifelse(merged_df2$position %in% active_aa_pos , 1, 0) +table(merged_df2$active_aa_pos) +mdf2_outName = paste0(outdir, "/", tolower(gene), "_merged_df2.csv") +mdf2_outName +cat("\nWriting output file:", mdf2_outName) +write.csv(merged_df2, mdf2_outName, row.names = F) +cat("\nnrows:" , nrow(merged_df2) +, "\nncols:" , ncol(merged_df2)) +################################################### +#config_gene = c("alr", "embb", "gid", "katg", "pnca", "rpob") +#config_gene = c("alr", "embb") +#sapply(config_gene, function(x) source(paste0("~/git/LSHTM_analysis/config/", x, ".R")), USE.NAMES = F) +#---------------------------------------------------- +#source("~/git/LSHTM_analysis/config/alr.R") +#source("~/git/LSHTM_analysis/config/embb.R") +source("~/git/LSHTM_analysis/config/gid.R") 
+#source("~/git/LSHTM_analysis/config/katg.R") +#source("~/git/LSHTM_analysis/config/pnca.R") +#source("~/git/LSHTM_analysis/config/rpob.R") +#---------------------------------------------------- +source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") +gene +drug +###################################################### +merged_df3$active_aa_pos = ifelse(merged_df3$position %in% active_aa_pos , 1, 0) +table(merged_df3$active_aa_pos) +mdf3_outName = paste0(outdir, "/", tolower(gene), "_merged_df3.csv") +mdf3_outName +cat("\nWriting output file:", mdf3_outName) +write.csv(merged_df3, mdf3_outName, row.names = F) +cat("\nnrows:" , nrow(merged_df3) +, "\nncols:" , ncol(merged_df3)) +#========================================================= +merged_df2$active_aa_pos = ifelse(merged_df2$position %in% active_aa_pos , 1, 0) +table(merged_df2$active_aa_pos) +mdf2_outName = paste0(outdir, "/", tolower(gene), "_merged_df2.csv") +mdf2_outName +cat("\nWriting output file:", mdf2_outName) +write.csv(merged_df2, mdf2_outName, row.names = F) +cat("\nnrows:" , nrow(merged_df2) +, "\nncols:" , ncol(merged_df2)) +################################################### +#config_gene = c("alr", "embb", "gid", "katg", "pnca", "rpob") +#config_gene = c("alr", "embb") +#sapply(config_gene, function(x) source(paste0("~/git/LSHTM_analysis/config/", x, ".R")), USE.NAMES = F) +#---------------------------------------------------- +#source("~/git/LSHTM_analysis/config/alr.R") +#source("~/git/LSHTM_analysis/config/embb.R") +#source("~/git/LSHTM_analysis/config/gid.R") +source("~/git/LSHTM_analysis/config/katg.R") +#source("~/git/LSHTM_analysis/config/pnca.R") +#source("~/git/LSHTM_analysis/config/rpob.R") +#---------------------------------------------------- +source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") +###################################################### +gene; drug +merged_df3$active_aa_pos = ifelse(merged_df3$position %in% active_aa_pos , 1, 0) 
+table(merged_df3$active_aa_pos) +mdf3_outName = paste0(outdir, "/", tolower(gene), "_merged_df3.csv") +mdf3_outName +cat("\nWriting output file:", mdf3_outName) +write.csv(merged_df3, mdf3_outName, row.names = F) +cat("\nnrows:" , nrow(merged_df3) +, "\nncols:" , ncol(merged_df3)) +#========================================================= +merged_df2$active_aa_pos = ifelse(merged_df2$position %in% active_aa_pos , 1, 0) +table(merged_df2$active_aa_pos) +mdf2_outName = paste0(outdir, "/", tolower(gene), "_merged_df2.csv") +mdf2_outName +cat("\nWriting output file:", mdf2_outName) +write.csv(merged_df2, mdf2_outName, row.names = F) +cat("\nnrows:" , nrow(merged_df2) +, "\nncols:" , ncol(merged_df2)) +################################################### +#config_gene = c("alr", "embb", "gid", "katg", "pnca", "rpob") +#config_gene = c("alr", "embb") +#sapply(config_gene, function(x) source(paste0("~/git/LSHTM_analysis/config/", x, ".R")), USE.NAMES = F) +#---------------------------------------------------- +#source("~/git/LSHTM_analysis/config/alr.R") +#source("~/git/LSHTM_analysis/config/embb.R") +#source("~/git/LSHTM_analysis/config/gid.R") +#source("~/git/LSHTM_analysis/config/katg.R") +source("~/git/LSHTM_analysis/config/pnca.R") +#source("~/git/LSHTM_analysis/config/rpob.R") +#---------------------------------------------------- +source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") +###################################################### +gene; drug +merged_df3$active_aa_pos = ifelse(merged_df3$position %in% active_aa_pos , 1, 0) +table(merged_df3$active_aa_pos) +mdf3_outName = paste0(outdir, "/", tolower(gene), "_merged_df3.csv") +mdf3_outName +cat("\nWriting output file:", mdf3_outName) +write.csv(merged_df3, mdf3_outName, row.names = F) +cat("\nnrows:" , nrow(merged_df3) +, "\nncols:" , ncol(merged_df3)) +#========================================================= +merged_df2$active_aa_pos = ifelse(merged_df2$position %in% active_aa_pos , 1, 0) 
+table(merged_df2$active_aa_pos) +mdf2_outName = paste0(outdir, "/", tolower(gene), "_merged_df2.csv") +mdf2_outName +cat("\nWriting output file:", mdf2_outName) +write.csv(merged_df2, mdf2_outName, row.names = F) +cat("\nnrows:" , nrow(merged_df2) +, "\nncols:" , ncol(merged_df2)) +################################################### +#config_gene = c("alr", "embb", "gid", "katg", "pnca", "rpob") +#config_gene = c("alr", "embb") +#sapply(config_gene, function(x) source(paste0("~/git/LSHTM_analysis/config/", x, ".R")), USE.NAMES = F) +#---------------------------------------------------- +#source("~/git/LSHTM_analysis/config/alr.R") +#source("~/git/LSHTM_analysis/config/embb.R") +#source("~/git/LSHTM_analysis/config/gid.R") +#source("~/git/LSHTM_analysis/config/katg.R") +#source("~/git/LSHTM_analysis/config/pnca.R") +source("~/git/LSHTM_analysis/config/rpob.R") +#---------------------------------------------------- +source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") +###################################################### +gene; drug +merged_df3$active_aa_pos = ifelse(merged_df3$position %in% active_aa_pos , 1, 0) +table(merged_df3$active_aa_pos) +mdf3_outName = paste0(outdir, "/", tolower(gene), "_merged_df3.csv") +mdf3_outName +cat("\nWriting output file:", mdf3_outName) +write.csv(merged_df3, mdf3_outName, row.names = F) +cat("\nnrows:" , nrow(merged_df3) +, "\nncols:" , ncol(merged_df3)) +#========================================================= +merged_df2$active_aa_pos = ifelse(merged_df2$position %in% active_aa_pos , 1, 0) +table(merged_df2$active_aa_pos) +mdf2_outName = paste0(outdir, "/", tolower(gene), "_merged_df2.csv") +mdf2_outName +cat("\nWriting output file:", mdf2_outName) +write.csv(merged_df2, mdf2_outName, row.names = F) +cat("\nnrows:" , nrow(merged_df2) +, "\nncols:" , ncol(merged_df2)) diff --git a/ml_data/del/ml_data_v1.R b/ml_data/del/ml_data_v1.R new file mode 100644 index 0000000..7e2c883 --- /dev/null +++ 
b/ml_data/del/ml_data_v1.R @@ -0,0 +1,65 @@ +#!/usr/bin/env Rscript + +# target var options: +# drtype: MDR, etc, full data +# pyrazinamide: 0 and 1, loss of data +# mutation_info_labels: DM and OM, full data +################################################## +# ONLY ONCE +#source("~/git/LSHTM_analysis/config/pnca.R") +#source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") +#write.csv(colnames(merged_df3), "data_colnames.csv") +#--------------------------------------------------- +colnames_order_pnca = read.csv("~/git/ML_AI_training/ml_data/colnames_order.csv" + , header = F) +# reorder columns by name +colnames_order_pnca <- colnames_order_pnca$V1 +################################################### +#config_gene = c("alr", "embb", "gid", "katg", "pnca", "rpob") +#config_gene = c("alr", "embb") +#sapply(config_gene, function(x) source(paste0("~/git/LSHTM_analysis/config/", x, ".R")), USE.NAMES = F) + +#source("~/git/LSHTM_analysis/config/alr.R") +# FIXME: "cycloserine" "mcsm_ppi2_affinity" "mcsm_ppi2_scaled" "mcsm_ppi2_outcome" "interface_dist" +# source("~/git/LSHTM_analysis/config/embb.R") +# source("~/git/LSHTM_analysis/config/gid.R") +# source("~/git/LSHTM_analysis/config/katg.R") +source("~/git/LSHTM_analysis/config/pnca.R") +# source("~/git/LSHTM_analysis/config/rpob.R") +################################################## +source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") + +###################################################### +mdf3_outName = paste0(outdir, "/", tolower(gene), "_merged_df3.csv") +mdf3_outName + +if( (length(colnames_order) == ncol(merged_df3)) && (all(colnames_order %in%colnames(merged_df3))) ){ + cat("\nProceeding with rearranging columns in merged_df3") + merged_df3_o = merged_df3[ , colnames_order] + cat("\nWriting output file:", mdf3_outName) + write.csv(merged_df3_o, mdf3_outName, row.names = F) + cat("\nnrows:" , nrow(merged_df3_o) + , "\nncols:" , ncol(merged_df3_o)) + + }else + cat("length mismatch:" 
#%% my_data6.py: build targets and feature sets for one gene, then run the
# classifier panel on two target/feature combinations.
homedir = os.path.expanduser("~")
os.chdir(homedir + "/git/ML_AI_training/")

# my function
from MultClassPipe import MultClassPipeline

# NOTE(review): `gene` and `drug` are read below but are not assigned in this
# script (the two assignments are commented out); presumably they are set by
# the interactive session or a wrapper before this runs -- confirm.
#gene = 'pncA'
#drug = 'pyrazinamide'

#==============
# directories
#==============
datadir = homedir + '/git/Data/'
indir = datadir + drug + '/input/'
outdir = datadir + drug + '/output/'

#=======
# input
#=======
infile_ml1 = outdir + gene.lower() + '_merged_df3.csv'
#infile_ml2 = outdir + gene.lower() + '_merged_df2.csv'

my_df = pd.read_csv(infile_ml1)
my_df.dtypes
my_df_cols = my_df.columns

# Genes grouped by which extra stability columns are selected for them.
geneL_basic = ['pnca']
geneL_na = ['gid']
geneL_na_ppi2 = ['rpob']
geneL_ppi2 = ['alr', 'embb', 'katg']
#%% get cols
mycols = my_df.columns

#%%============================================================================
# GET Y: label columns (added to my_df before any row filtering)

dm_om_map = {'DM': 1, 'OM': 0}

# Target2 labels: drug
drug_labels = drug + '_labels'
drug_labels
my_df[drug_labels] = my_df[drug].map({1: 'resistant', 0: 'sensitive'})
my_df[drug_labels].value_counts()
my_df[drug_labels] = my_df[drug_labels].fillna('unknown')
my_df[drug_labels].value_counts()

# Target3 labels: drtype collapsed to 0/1
drtype_labels = 'drtype_labels'
my_df[drtype_labels] = my_df['drtype'].map({'Sensitive' : 0
                                            , 'Other'   : 0
                                            , 'Pre-MDR' : 1
                                            , 'MDR'     : 1
                                            , 'Pre-XDR' : 1
                                            , 'XDR'     : 1})

#%%
# GET X
common_cols_stabilty = ['ligand_distance'
                        , 'ligand_affinity_change'
                        , 'duet_stability_change'
                        , 'ddg_foldx'
                        , 'deepddg'
                        , 'ddg_dynamut2']

# Build stability columns ~ gene.  The gene lists are disjoint, so exactly one
# branch applies; an unlisted gene is reported here instead of surfacing later
# as an undefined `x_stability_cols`.
gene_key = gene.lower()
if gene_key in geneL_basic:
    x_stability_cols = common_cols_stabilty
elif gene_key in geneL_ppi2:
    x_stability_cols = common_cols_stabilty + ['mcsm_ppi2_affinity'
                                               , 'interface_dist']
elif gene_key in geneL_na:
    x_stability_cols = common_cols_stabilty + ['mcsm_na_affinity']
elif gene_key in geneL_na_ppi2:
    x_stability_cols = common_cols_stabilty + ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist']
    #D1148 get rid of: drop the rows that have no mcsm_na_affinity value
    my_df = my_df.dropna(subset=['mcsm_na_affinity'])
else:
    raise ValueError("No stability column set defined for gene: " + str(gene))

X_strF = ['asa'
          , 'rsa'
          , 'kd_values'
          , 'rd_values']

X_evolF = ['consurf_score'
           , 'snap2_score'
           , 'snap2_accuracy_pc']

# TODO: ADD ED values
# X_genomicF (af, or_mychisq, or_logistic, or_fisher, pval_fisher) is left
# out for now: problematic due to NA (would need filling / one hot encoding).

#%% Targets are extracted only now, after any row filtering above, so that
# every target has exactly the same rows as the feature tables below.
# Target1: mutation_info_labels
target1 = my_df['mutation_info_labels'].map(dm_om_map)
# Target2: drug
target2 = my_df[drug_labels]
# Target3: drtype
# target3 = my_df['drtype']
target3 = my_df[drtype_labels]

# sanity checks (only produce output in an interactive session)
target1.value_counts()
my_df['mutation_info_labels'].value_counts()

target2.value_counts()
my_df[drug_labels].value_counts()

target3.value_counts()
my_df['drtype'].value_counts()

#%% try combinations
# The numbering has gaps: 4, 7, 9, 10 and 12 are reserved for combinations
# involving the genomic features that are not used yet.
X_vars1 = my_df[x_stability_cols]
X_vars2 = my_df[X_strF]
X_vars3 = my_df[X_evolF]

X_vars5 = my_df[x_stability_cols + X_strF]
X_vars6 = my_df[x_stability_cols + X_evolF]
X_vars8 = my_df[X_strF + X_evolF]
X_vars11 = my_df[x_stability_cols + X_strF + X_evolF]

#%%
X_vars1.shape[1]

# TODO: stratified cross validate
# Train-test Split

# TARGET1
X_train, X_test, y_train, y_test = train_test_split(X_vars1,
                                                    target1,
                                                    test_size = 0.33,
                                                    random_state = 42)
MultClassPipeline(X_train, X_test, y_train, y_test)

# TARGET3
X_train3, X_test3, y_train3, y_test3 = train_test_split(X_vars5,
                                                        target3,
                                                        test_size = 0.33,
                                                        random_state = 42)
MultClassPipeline(X_train3, X_test3, y_train3, y_test3)
# Shared keyword arguments so every seedable estimator uses the same seed.
rs = {'random_state': 42}


# Classification - Model Pipeline
def modelPipeline(X_train, X_test, y_train, y_test):
    """Fit a fixed panel of classifiers and score each one on the test split.

    Every classifier is wrapped in a ``Pipeline`` whose first step is a
    ``StandardScaler``; the pipeline is fitted on ``(X_train, y_train)`` and
    used to predict labels for ``X_test``.

    Parameters
    ----------
    X_train, X_test
        Feature tables for fitting and for evaluation.
    y_train, y_test
        Target labels matching ``X_train`` / ``X_test``.

    Returns
    -------
    tuple
        ``(pipelines, scores_df)``: the fitted pipelines in the order they
        were trained, and a DataFrame with one row per classifier and the
        columns ``Model``, ``F1_Score``, ``Precision``, ``Recall``,
        ``Accuracy`` and ``ROC_AUC``.

    Notes
    -----
    The metric functions are called with their scikit-learn defaults, and
    ``ROC_AUC`` is computed from the predicted labels, not from
    probabilities or decision scores.
    """
    clfs = [
        ('Logistic Regression', LogisticRegression(**rs)),
        ('Naive Bayes', BernoulliNB()),
        ('K-Nearest Neighbors', KNeighborsClassifier()),
        ('SVM', SVC(**rs)),
        ('MLP', MLPClassifier(max_iter=500, **rs)),
        ('Decision Tree', DecisionTreeClassifier(**rs)),
        ('Extra Trees', ExtraTreesClassifier(**rs)),
        ('Random Forest', RandomForestClassifier(**rs)),
        ('XGBoost', XGBClassifier(**rs, verbosity=0)),
    ]
    score_cols = ['Model', 'F1_Score', 'Precision', 'Recall', 'Accuracy', 'ROC_AUC']

    pipelines = []
    score_rows = []

    for clf_name, clf in clfs:
        pipeline = Pipeline(steps=[
            ('scaler', StandardScaler()),
            ('classifier', clf),
        ])
        pipeline.fit(X_train, y_train)

        # Model predictions
        y_pred = pipeline.predict(X_test)

        pipelines.append(pipeline)
        score_rows.append({
            'Model': clf_name,
            'F1_Score': f1_score(y_test, y_pred),
            'Precision': precision_score(y_test, y_pred),
            'Recall': recall_score(y_test, y_pred),
            'Accuracy': accuracy_score(y_test, y_pred),
            'ROC_AUC': roc_auc_score(y_test, y_pred),
        })

    # Build the table once from the collected rows: DataFrame.append was
    # removed in pandas 2.0, and growing a frame row by row re-copies it on
    # every iteration.
    scores_df = pd.DataFrame(score_rows, columns=score_cols)

    return pipelines, scores_df
directories +#============== +datadir = homedir + '/git/Data/' +indir = datadir + drug + '/input/' +outdir = datadir + drug + '/output/' + +# gene_baiscL = ['pnca'] +# geneL_naL = ['gid', 'rpob'] +# geneL_ppi2L = ['alr', 'embb', 'katg', 'rpob'] + +#======= +# input +#======= +infile_ml1 = outdir + gene.lower() + '_merged_df3.csv' +#infile_ml2 = outdir + gene.lower() + '_merged_df2.csv' + +my_df = pd.read_csv(infile_ml1) + +my_df.dtypes +my_df_cols = my_df.columns + +#%%============================================================================ +# GET Y +drug_labels = drug + '_labels' +drug_labels +my_df[drug_labels] = my_df[drug] +my_df[drug_labels].value_counts() +my_df[drug_labels] = my_df[drug].map({1: 'resistant', 0: 'sensitive'}) +my_df[drug_labels].value_counts() +my_df[drug_labels] = my_df[drug_labels].fillna('unknown') +my_df[drug_labels].value_counts() + +mutC = my_df[[ 'mutationinformation']].count() + +target1C = my_df['mutation_info_labels'].value_counts() + +target2C = my_df[drug_labels].value_counts() +#target2C.index = target2C.index.to_series().map({1: 'resistant', 0: 'sensitive'}) + +target3C = my_df['drtype'].value_counts() + +targetsC = pd.concat([mutC, target1C, target2C, target3C]) +targetsC + +# targetsC2 = pd.concat([mutC, target1C, target2C +# #, target3C +# ], axis = 1) +# targetsC2 + +#%% try combinations +# X_vars = X_stability +# X_vars = X_evol +# X_vars = X_str + +# X_vars = pd.concat([X_stability, X_evol, X_str], axis = 1) +# X_vars = pd.concat([X_stability, X_evol], axis = 1) +# X_vars = pd.concat([X_stability, X_str], axis = 1) +# X_vars = pd.concat([X_evol, X_str], axis = 1) +