diff --git a/scripts/ml/combined_model/cm_logo_skf.py b/scripts/ml/combined_model/cm_logo_skf.py
new file mode 100644
index 0000000..551d9a1
--- /dev/null
+++ b/scripts/ml/combined_model/cm_logo_skf.py
@@ -0,0 +1,131 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Jun 29 19:44:06 2022
+
+@author: tanu
+"""
+import sys, os
+import pandas as pd
+import numpy as np
+import re
+###############################################################################
+homedir = os.path.expanduser("~")
+sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml/ml_functions')
+sys.path
+###############################################################################
+#====================
+# Import ML functions
+#====================
+from ml_data_combined import *
+from MultClfs_logo_skf import *
+#from GetMLData import *
+#from SplitTTS import *
+
+skf_cv = StratifiedKFold(n_splits = 10 , shuffle = True,**rs)
+
+#logo = LeaveOneGroupOut()
+
+#%%
+def CMLogoSkf(combined_df
+              , all_genes = ["embb", "katg", "rpob", "pnca", "gid", "alr"]
+              , bts_genes = ["embb", "katg", "rpob", "pnca", "gid"]
+              , cols_to_drop = ['dst', 'dst_mode', 'gene_name']
+              , target_var = 'dst_mode'
+              , gene_group = 'gene_name'
+              , std_gene_omit = []
+              ):
+    '''
+    Leave-one-gene-out blind test on the combined data: each gene in bts_genes
+    is held out in turn as the blind test set; the rows of the remaining genes
+    (minus std_gene_omit) and the held out rows are passed to
+    MultModelsCl_logo_skf() as training and blind test data, and the result
+    is written to one csv per blind test gene.
+
+    @ param combined_df: df holding the features plus the cols_to_drop columns
+    @ param all_genes: genes in combined_df (assumed complete; used for reporting only)
+    @ param bts_genes: genes to hold out, one at a time, as the blind test set
+    @ param cols_to_drop: columns dropped to build the feature dfs
+    @ param target_var: name of the target column
+    @ param gene_group: name of the column holding the gene of each row
+    @ param std_gene_omit: genes left out of the training data in every iteration
+
+    returns
+        None: the output of each iteration is written to csv
+    '''
+    for bts_gene in bts_genes:
+        print('\n BTS gene:', bts_gene)
+
+        tr_gene_omit = std_gene_omit + [bts_gene]
+        # n_tr_genes + 1 goes into the output file name, so this count is kept as is
+        n_tr_genes = (len(bts_genes) - (len(std_gene_omit)))
+        n_total_genes = len(all_genes)
+
+        # report the genes that really stay in the training data: all but the omitted ones
+        training_genesL = [gene for gene in all_genes if gene not in tr_gene_omit]
+
+        print('\nTotal genes: ', n_total_genes
+              ,'\nTraining on:', n_tr_genes
+              ,'\nTraining on genes:', training_genesL
+              , '\nOmitted genes:', tr_gene_omit
+              , '\nBlind test gene:', bts_gene)
+
+        tts_split_type = "logoBT_" + bts_gene
+
+        outFile = "/home/tanu/git/Data/ml_combined/" + str(n_tr_genes+1) + "genes_" + tts_split_type + ".csv"
+        print(outFile)
+
+        #-------
+        # training
+        #------
+        cm_training_df = combined_df[~combined_df[gene_group].isin(tr_gene_omit)]
+
+        cm_X = cm_training_df.drop(cols_to_drop, axis=1, inplace=False)
+        cm_y = cm_training_df.loc[:, target_var]
+
+        print('\nTraining data dim:', cm_X.shape
+              , '\nTraining Target dim:', cm_y.shape)
+
+        if not cm_X.columns.isin(cols_to_drop).any():
+            print('\nChecked training df does NOT have Target var')
+        else:
+            sys.exit('\nFAIL: training data contains Target var')
+
+        #---------------
+        # BTS: genes
+        #---------------
+        cm_test_df = combined_df[combined_df[gene_group].isin([bts_gene])]
+
+        cm_bts_X = cm_test_df.drop(cols_to_drop, axis = 1, inplace = False)
+        cm_bts_y = cm_test_df.loc[:, target_var]
+
+        print('\nBlind test data dim:', cm_bts_X.shape
+              , '\nBlind test Target dim:', cm_bts_y.shape)
+
+        #%%:Running Multiple models on LOGO with SKF
+        cD3_v2 = MultModelsCl_logo_skf(input_df = cm_X
+                                       , target = cm_y
+                                       , group = 'none'
+                                       , sel_cv = skf_cv
+
+                                       , blind_test_df = cm_bts_X
+                                       , blind_test_target = cm_bts_y
+
+                                       , tts_split_type = tts_split_type
+
+                                       , resampling_type = 'none' # default
+                                       , add_cm = True
+                                       , add_yn = True
+                                       , var_type = 'mixed'
+
+                                       , run_blind_test = True
+                                       , return_formatted_output = True
+                                       , random_state = 42
+                                       , n_jobs = 10
+                                       )
+
+        cD3_v2.to_csv(outFile)
+
+#%%
+CMLogoSkf(combined_df)
+CMLogoSkf(combined_df, std_gene_omit=['alr'])
diff --git a/scripts/ml/combined_model/cm_ml_iterator.py b/scripts/ml/combined_model/cm_ml_iterator.py
new file mode 100755
index 0000000..e6ea9d2
--- /dev/null
+++ b/scripts/ml/combined_model/cm_ml_iterator.py
@@ -0,0 +1,107 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Jun 29 20:29:36 2022
+
+@author: tanu
+"""
+import sys, os
+import pandas as pd
+import numpy as np
+import re
+
+###############################################################################
+homedir = os.path.expanduser("~")
+sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml/ml_functions')
+sys.path
+###############################################################################
+#====================
+# Import ML functions
+#====================
+from MultClfs import *
+from GetMLData import *
+from SplitTTS import *
+
+# param dict for getmldata()
+combined_model_paramD = {'data_combined_model' : False
+                         , 'use_or' : False
+                         , 'omit_all_genomic_features': False
+                         , 'write_maskfile' : False
+                         , 'write_outfile' : False }
+###############################################################################
+# gene -> drug pairs to iterate over; one csv is written per gene, split type and data type
+
+ml_gene_drugD = {'pncA' : 'pyrazinamide'
+                 , 'embB' : 'ethambutol'
+                 , 'katG' : 'isoniazid'
+                 , 'rpoB' : 'rifampicin'
+                 , 'gid' : 'streptomycin'
+                 }
+gene_dataD={}
+split_types = ['70_30', '80_20', 'sl']
+split_data_types = ['actual', 'complete']
+
+for gene, drug in ml_gene_drugD.items():
+    print ('\nGene:', gene
+           , '\nDrug:', drug)
+    gene_low = gene.lower()
+    gene_dataD[gene_low] = getmldata(gene, drug
+                                     , data_combined_model = False # this means it doesn't include 'gene_name' as a feature as a single gene-target shouldn't have it.
+                                     , use_or = False
+                                     , omit_all_genomic_features = False
+                                     , write_maskfile = False
+                                     , write_outfile = False)
+
+    for split_type in split_types:
+        for data_type in split_data_types:
+            out_filename = (gene_low+'_'+split_type+'_'+data_type+'.csv')
+            tempD=split_tts(gene_dataD[gene_low]
+                            , data_type = data_type
+                            , split_type = split_type
+                            , oversampling = True
+                            , dst_colname = 'dst'
+                            , target_colname = 'dst_mode'
+                            , include_gene_name = True
+                            )
+            paramD = {
+                'baseline_paramD': { 'input_df' : tempD['X']
+                                    , 'target' : tempD['y']
+                                    , 'var_type' : 'mixed'
+                                    , 'resampling_type': 'none'}
+                , 'smnc_paramD': { 'input_df' : tempD['X_smnc']
+                                  , 'target' : tempD['y_smnc']
+                                  , 'var_type' : 'mixed'
+                                  , 'resampling_type' : 'smnc'}
+                , 'ros_paramD': { 'input_df' : tempD['X_ros']
+                                 , 'target' : tempD['y_ros']
+                                 , 'var_type' : 'mixed'
+                                 , 'resampling_type' : 'ros'}
+                , 'rus_paramD' : { 'input_df' : tempD['X_rus']
+                                  , 'target' : tempD['y_rus']
+                                  , 'var_type' : 'mixed'
+                                  , 'resampling_type' : 'rus'}
+                , 'rouC_paramD' : { 'input_df' : tempD['X_rouC']
+                                   , 'target' : tempD['y_rouC']
+                                   , 'var_type' : 'mixed'
+                                   , 'resampling_type': 'rouC'}
+                }
+
+            mmDD = {}
+            for k, v in paramD.items():
+                scoresD = MultModelsCl(**v
+                                       , tts_split_type = split_type
+                                       , skf_cv = skf_cv
+                                       , blind_test_df = tempD['X_bts']
+                                       , blind_test_target = tempD['y_bts']
+                                       , add_cm = True
+                                       , add_yn = True
+                                       , return_formatted_output = True)
+                mmDD[k] = scoresD
+
+            # Extracting the dfs from within the dict and concatenating to output as one df
+            # (one concat over the whole dict is enough; no loop over its items needed)
+            out_wf = pd.concat(mmDD, ignore_index = True)
+
+            out_wf_f = out_wf.sort_values(by = ['resampling', 'source_data', 'MCC'], ascending = [True, True, False], inplace = False)
+            out_wf_f.to_csv(('/home/tanu/git/Data/ml_combined/genes/'+out_filename), index = False)
+
diff --git a/scripts/ml/ml_functions/MultClfs_logo_skf.py b/scripts/ml/ml_functions/MultClfs_logo_skf.py
index 14a4dda..8e2cbf7 100755
--- 
a/scripts/ml/ml_functions/MultClfs_logo_skf.py
+++ b/scripts/ml/ml_functions/MultClfs_logo_skf.py
@@ -89,14 +89,7 @@ scoring_fn = ({
     'mcc' : make_scorer(matthews_corrcoef)
     , 'jcc' : make_scorer(jaccard_score)
     })
-skf_cv = StratifiedKFold(n_splits = 10
-                         #, shuffle = False, random_state= None)
-                         , shuffle = True,**rs)
 
-rskf_cv = RepeatedStratifiedKFold(n_splits = 10
-                                  , n_repeats = 3
-                                  , **rs)
-logo = LeaveOneGroupOut()
 
 mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
 jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
@@ -160,7 +153,10 @@ def MultModelsCl_logo_skf(input_df
                           , add_yn = True # adds target var class numbers
                           , var_type = ['numerical', 'categorical','mixed']
                           , run_blind_test = True
-                          , return_formatted_output = True):
+                          , return_formatted_output = True
+                          , random_state = 42
+                          , n_jobs = 10
+                          , ):
     '''
 
     @ param input_df: input features
@@ -179,10 +175,24 @@ def MultModelsCl_logo_skf(input_df
         Dict containing multiple classification scores for each model and mean of each Stratified Kfold including training
 
     '''
-    # if group == 'none':
-    #     sel_cv = skf_cv
-    # else:
-    #     group = 'none'
+#%% Func globals
+    rs = {'random_state': random_state}
+    njobs = {'n_jobs': n_jobs}
+
+    skf_cv = StratifiedKFold(n_splits = 10
+                             #, shuffle = False, random_state= None)
+                             , shuffle = True,**rs)
+
+    rskf_cv = RepeatedStratifiedKFold(n_splits = 10
+                                      , n_repeats = 3
+                                      , **rs)
+    logo = LeaveOneGroupOut()
+
+    # select CV type:
+    if group == 'none':
+        sel_cv = skf_cv
+    else:
+        sel_cv = logo
     #======================================================
     # Determine categorical and numerical features
     #======================================================
@@ -210,7 +220,7 @@ def MultModelsCl_logo_skf(input_df
     #======================================================
     # Specify multiple Classification Models
     #======================================================
-    models = [('AdaBoost Classifier' , AdaBoostClassifier(**rs) )
+    models = [('AdaBoost Classifier' , AdaBoostClassifier(**rs) )
               , ('Bagging Classifier' , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) )
               , ('Decision Tree' , DecisionTreeClassifier(**rs) )
               , ('Extra Tree' , ExtraTreeClassifier(**rs) )
diff --git a/scripts/ml/ml_functions/ml_data_combined.py b/scripts/ml/ml_functions/ml_data_combined.py
index b4d18bb..5033359 100644
--- a/scripts/ml/ml_functions/ml_data_combined.py
+++ b/scripts/ml/ml_functions/ml_data_combined.py
@@ -63,40 +63,8 @@ else:
           , '\nGot:', len(common_cols))
 
 colnames_combined_df = combined_df.columns
+if 'gene_name' in colnames_combined_df:
+    print("\nGene name included")
+else:
+    print('\nGene name NOT included')
 ##############################################################################
-
-#%% split_tts(): func params
-# (ml_input_data
-# , data_type = ['actual', 'complete']
-# , split_type = ['70_30', '80_20', 'sl']
-# , oversampling = True
-# , dst_colname = 'dst'# determine how to subset the actual vs reverse data
-# , target_colname = 'dst_mode'
-# , include_gene_name = True
-# , k_smote = 5)
-#%% split data into different data types
-# #===================
-# # 70/30
-# #===================
-# # actual
-# tts_7030_paramD = {'data_type' : 'actual'
-# , 'split_type' : '70_30'}
-
-# # complete
-# tts_cd_7030_paramD = {'data_type' : 'complete'
-# , 'split_type' : '70_30'}
-
-# # call split_tts()
-# data_CM_7030D = split_tts(ml_input_data = combined_df
-# , **tts_7030_paramD
-# , oversampling = True
-# , dst_colname = 'dst'
-# , target_colname = 'dst_mode'
-# , include_gene_name = False) # when not doing leave one group out
-
-# data_cd_CM_7030D = split_tts(ml_input_data = combined_df
-# , **tts_cd_7030_paramD
-# , oversampling = True
-# , dst_colname = 'dst'
-# , target_colname = 'dst_mode'
-# , include_gene_name = False)