From 11af00f1db0e7c07d0a0fe107b311ef608fe3166 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Fri, 1 Jul 2022 21:40:14 +0100 Subject: [PATCH] changed ml output dirs and ready to run fs --- scripts/ml/combined_model/cm_logo_skf.py | 8 +- scripts/ml/combined_model/cm_ml_iterator.py | 26 +++--- scripts/ml/ml_iterator.py | 8 +- scripts/ml/ml_iterator_fs.py | 82 +++++++++--------- scripts/ml/running_ml_scripts.txt | 95 --------------------- 5 files changed, 67 insertions(+), 152 deletions(-) mode change 100644 => 100755 scripts/ml/ml_iterator_fs.py delete mode 100644 scripts/ml/running_ml_scripts.txt diff --git a/scripts/ml/combined_model/cm_logo_skf.py b/scripts/ml/combined_model/cm_logo_skf.py index f4cf311..24d6af9 100755 --- a/scripts/ml/combined_model/cm_logo_skf.py +++ b/scripts/ml/combined_model/cm_logo_skf.py @@ -80,6 +80,8 @@ homedir = os.path.expanduser("~") sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml/ml_functions') sys.path ############################################################################### +outdir = homedir + '/git/LSHTM_ML/output/combined/ + #==================== # Import ML functions #==================== @@ -92,6 +94,9 @@ skf_cv = StratifiedKFold(n_splits = 10 , shuffle = True, random_state = 42) #logo = LeaveOneGroupOut() +######################################################################## +# COMPLETE data: No tts_split +######################################################################## #%% def CMLogoSkf(combined_df , all_genes = ["embb", "katg", "rpob", "pnca", "gid", "alr"] @@ -125,7 +130,8 @@ def CMLogoSkf(combined_df tts_split_type = "logo_skf_BT_" + bts_gene - outFile = "/home/tanu/git/Data/ml_combined/" + str(n_tr_genes+1) + "genes_" + tts_split_type + ".csv" + outFile = outdir + str(n_tr_genes+1) + "genes_" + tts_split_type + ".csv" + print(outFile) #------- diff --git a/scripts/ml/combined_model/cm_ml_iterator.py b/scripts/ml/combined_model/cm_ml_iterator.py index e6ea9d2..20e8b0a 100755 --- a/scripts/ml/combined_model/cm_ml_iterator.py +++ b/scripts/ml/combined_model/cm_ml_iterator.py @@ -15,19 +15,19 @@ homedir = os.path.expanduser("~") sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml/ml_functions') sys.path ############################################################################### +outdir = homedir + '/git/LSHTM_ML/output/combined/ + #==================== # Import ML functions #==================== -from MultClfs import * +#from MultClfs import * +from MultClfs_logo_skf import * from GetMLData import * from SplitTTS import * -# param dict for getmldata() -combined_model_paramD = {'data_combined_model' : False - , 'use_or' : False - , 'omit_all_genomic_features': False - , 'write_maskfile' : False - , 'write_outfile' : False } +# Input data +from ml_data_combined import * + ############################################################################### #ml_genes = ["pncA", "embB", "katG", "rpoB", "gid"] @@ -54,7 +54,7 @@ for gene, drug in ml_gene_drugD.items(): for split_type in split_types: for data_type in split_data_types: - out_filename = (gene.lower()+'_'+split_type+'_'+data_type+'.csv') + out_filename = outdir + gene.lower()+ '_' + split_type + '_' + data_type + '.csv' tempD=split_tts(gene_dataD[gene_low] , data_type = data_type , split_type = split_type @@ -88,14 +88,8 @@ for gene, drug in ml_gene_drugD.items(): mmDD = {} for k, v in paramD.items(): - scoresD = MultModelsCl(**paramD[k] - , tts_split_type = split_type - , skf_cv = skf_cv - , blind_test_df = tempD['X_bts'] - , blind_test_target = tempD['y_bts'] - , add_cm = True - , add_yn = True - , return_formatted_output = True) + scoresD = MultModelsCl_logo_skf(**paramD[k] + XXXXXXXXXXXXXXXXXXXXXXX mmDD[k] = scoresD # Extracting the dfs from within the dict and concatenating to output as one df diff --git a/scripts/ml/ml_iterator.py b/scripts/ml/ml_iterator.py index e6ea9d2..a7dc7c6 100755 --- a/scripts/ml/ml_iterator.py +++ b/scripts/ml/ml_iterator.py @@ -15,6 +15,8 @@ homedir = os.path.expanduser("~") sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml/ml_functions') sys.path ############################################################################### +outdir = homedir + '/git/LSHTM_ML/output/genes/' + #==================== # Import ML functions #==================== @@ -54,7 +56,9 @@ for gene, drug in ml_gene_drugD.items(): for split_type in split_types: for data_type in split_data_types: - out_filename = (gene.lower()+'_'+split_type+'_'+data_type+'.csv') + + out_filename = outdir + gene.lower() + '_' + split_type + '_' + data_type + '.csv' + tempD=split_tts(gene_dataD[gene_low] , data_type = data_type , split_type = split_type @@ -103,5 +107,5 @@ for gene, drug in ml_gene_drugD.items(): out_wf= pd.concat(mmDD, ignore_index = True) out_wf_f = out_wf.sort_values(by = ['resampling', 'source_data', 'MCC'], ascending = [True, True, False], inplace = False) - out_wf_f.to_csv(('/home/tanu/git/Data/ml_combined/genes/'+out_filename), index = False) + out_wf_f.to_csv(('/home/tanu/git/Data/ml_combined/genes/'+ out_filename), index = False) diff --git a/scripts/ml/ml_iterator_fs.py b/scripts/ml/ml_iterator_fs.py old mode 100644 new mode 100755 index 60ec2db..57d93e2 --- a/scripts/ml/ml_iterator_fs.py +++ b/scripts/ml/ml_iterator_fs.py @@ -15,6 +15,8 @@ homedir = os.path.expanduser("~") sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml/ml_functions') sys.path ############################################################################### +outdir = homedir + '/git/LSHTM_ML/output/fs/' + #==================== # Import ML functions #==================== @@ -31,7 +33,8 @@ combined_model_paramD = {'data_combined_model' : False , 'write_outfile' : False } ############################################################################### #ml_genes = ["pncA", "embB", "katG", "rpoB", "gid"] -outdir = homedir + '/git/Data/ml_combined/fs/' +# outdir = homedir + '/git/Data/ml_combined/fs/' + ml_gene_drugD = {'pncA' : 'pyrazinamide' # , 'embB' : 'ethambutol' # , 'katG' : 'isoniazid' @@ -39,26 +42,27 @@ ml_gene_drugD = {'pncA' : 'pyrazinamide' # , 'gid' : 'streptomycin' } gene_dataD={} -#split_types = ['70_30', '80_20', 'sl'] -#split_data_types = ['actual', 'complete'] -split_types = ['70_30'] +split_types = ['70_30', '80_20', 'sl'] split_data_types = ['actual', 'complete'] +#split_types = ['70_30'] +#split_data_types = ['actual', 'complete'] -fs_models = [('Logistic Regression' , LogisticRegression(**rs) )] -# fs_models = [('AdaBoost Classifier' , AdaBoostClassifier(**rs) ) -# , ('Decision Tree' , DecisionTreeClassifier(**rs) ) -# , ('Extra Tree' , ExtraTreeClassifier(**rs) ) -# , ('Extra Trees' , ExtraTreesClassifier(**rs) ) -# , ('Gradient Boosting' , GradientBoostingClassifier(**rs) ) -# , ('LDA' , LinearDiscriminantAnalysis() ) -# , ('Logistic Regression' , LogisticRegression(**rs) ) -# , ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs)) -# , ('Passive Aggresive' , PassiveAggressiveClassifier(**rs, **njobs) ) -# , ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000 ) ) -# , ('Ridge Classifier' , RidgeClassifier(**rs) ) -# , ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) ) -# , ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) ) -# ] +#fs_models = [('Logistic Regression' , LogisticRegression(**rs) )] + +fs_models = [('AdaBoost Classifier' , AdaBoostClassifier(**rs) ) + , ('Decision Tree' , DecisionTreeClassifier(**rs) ) + , ('Extra Tree' , ExtraTreeClassifier(**rs) ) + , ('Extra Trees' , ExtraTreesClassifier(**rs) ) + , ('Gradient Boosting' , GradientBoostingClassifier(**rs) ) + , ('LDA' , LinearDiscriminantAnalysis() ) + , ('Logistic Regression' , LogisticRegression(**rs) ) + , ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs)) + , ('Passive Aggresive' , PassiveAggressiveClassifier(**rs, **njobs) ) + , ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000 ) ) + , ('Ridge Classifier' , RidgeClassifier(**rs) ) + , ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) ) + , ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) ) + ] for gene, drug in ml_gene_drugD.items(): print ('\nGene:', gene @@ -88,26 +92,28 @@ for gene, drug in ml_gene_drugD.items(): , 'target' : tempD['y'] , 'var_type' : 'mixed' , 'resampling_type': 'none'} - ,'smnc_paramD': { 'input_df' : tempD['X_smnc'] - , 'target' : tempD['y_smnc'] - , 'var_type' : 'mixed' - , 'resampling_type' : 'smnc'} - # , 'ros_paramD': { 'input_df' : tempD['X_ros'] - # , 'target' : tempD['y_ros'] - # , 'var_type' : 'mixed' - # , 'resampling_type' : 'ros'} - # , 'rus_paramD' : { 'input_df' : tempD['X_rus'] - # , 'target' : tempD['y_rus'] - # , 'var_type' : 'mixed' - # , 'resampling_type' : 'rus'} - # , 'rouC_paramD' : { 'input_df' : tempD['X_rouC'] - # , 'target' : tempD['y_rouC'] - # , 'var_type' : 'mixed' - # , 'resampling_type': 'rouC'} + + , 'smnc_paramD' : { 'input_df' : tempD['X_smnc'] + , 'target' : tempD['y_smnc'] + , 'var_type' : 'mixed' + , 'resampling_type' : 'smnc'} + + , 'ros_paramD' : { 'input_df' : tempD['X_ros'] + , 'target' : tempD['y_ros'] + , 'var_type' : 'mixed' + , 'resampling_type' : 'ros'} + + , 'rus_paramD' : { 'input_df' : tempD['X_rus'] + , 'target' : tempD['y_rus'] + , 'var_type' : 'mixed' + , 'resampling_type' : 'rus'} + + , 'rouC_paramD' : { 'input_df' : tempD['X_rouC'] + , 'target' : tempD['y_rouC'] + , 'var_type' : 'mixed' + , 'resampling_type': 'rouC'} } - #for m in fs_models: - # print(m) - + out_fsD = {} index = 1 for model_name, model_fn in fs_models: diff --git a/scripts/ml/running_ml_scripts.txt b/scripts/ml/running_ml_scripts.txt deleted file mode 100644 index 279cb9d..0000000 --- a/scripts/ml/running_ml_scripts.txt +++ /dev/null @@ -1,95 +0,0 @@ -######################################################################## - -# 70/30 [WITHOUT OR] - -######################################################################## - -=-----------------------------------= -# actual data -#------------------------------------= - -time ./run_7030.py -g pncA -d pyrazinamide 2>&1 | tee log_pnca_7030_.txt -time ./run_7030.py -g embB -d ethambutol 2>&1 | tee log_embb_7030_.txt -time ./run_7030.py -g katG -d isoniazid 2>&1 | tee log_katg_7030_.txt -time ./run_7030.py -g rpoB -d rifampicin 2>&1 | tee log_rpob_7030_.txt -time ./run_7030.py -g gid -d streptomycin 2>&1 | tee log_gid_7030_.txt -time ./run_7030.py -g alr -d cycloserine 2>&1 | tee log_alr_7030_.txt - -=-----------------------------------= -# COMPLETE data -#------------------------------------= - -time ./run_cd_7030.py -g pncA -d pyrazinamide 2>&1 | tee log_pnca_cd_7030_.txt -time ./run_cd_7030.py -g embB -d ethambutol 2>&1 | tee log_embb_cd_7030_.txt -time ./run_cd_7030.py -g katG -d isoniazid 2>&1 | tee log_katg_cd_7030_.txt -time ./run_cd_7030.py -g rpoB -d rifampicin 2>&1 | tee log_rpob_cd_7030_.txt -time ./run_cd_7030.py -g gid -d streptomycin 2>&1 | tee log_gid_cd_7030_.txt -time ./run_cd_7030.py -g alr -d cycloserine 2>&1 | tee log_alr_cd_7030_.txt - - -######################################################################## - -# 80/20 [WITHOUT OR] - -######################################################################## -=-----------------------------------= -# actual data -#------------------------------------= - -time ./run_8020.py -g pncA -d pyrazinamide 2>&1 | tee log_pnca_8020_.txt -time ./run_8020.py -g embB -d ethambutol 2>&1 | tee log_embb_8020_.txt -time ./run_8020.py -g katG -d isoniazid 2>&1 | tee log_katg_8020_.txt -time ./run_8020.py -g rpoB -d rifampicin 2>&1 | tee log_rpob_8020_.txt -time ./run_8020.py -g gid -d streptomycin 2>&1 | tee log_gid_8020_.txt -time ./run_8020.py -g alr -d cycloserine 2>&1 | tee log_alr_8020_.txt - -=-----------------------------------= -# COMPLETE data -#------------------------------------= - -time ./run_cd_8020.py -g pncA -d pyrazinamide 2>&1 | tee log_pnca_cd_8020_.txt -time ./run_cd_8020.py -g embB -d ethambutol 2>&1 | tee log_embb_cd_8020_.txt -time ./run_cd_8020.py -g katG -d isoniazid 2>&1 | tee log_katg_cd_8020_.txt -time ./run_cd_8020.py -g rpoB -d rifampicin 2>&1 | tee log_rpob_cd_8020_.txt -time ./run_cd_8020.py -g gid -d streptomycin 2>&1 | tee log_gid_cd_8020_.txt -time ./run_cd_8020.py -g alr -d cycloserine 2>&1 | tee log_alr_cd_8020_.txt -######################################################################## - -# SL [WITHOUT OR] - -######################################################################## - -=-----------------------------------= -# actual data -#------------------------------------= -time ./run_sl.py -g pncA -d pyrazinamide 2>&1 | tee log_pnca_sl_.txt -time ./run_sl.py -g embB -d ethambutol 2>&1 | tee log_embb_sl_.txt -time ./run_sl.py -g katG -d isoniazid 2>&1 | tee log_katg_sl_.txt -time ./run_sl.py -g rpoB -d rifampicin 2>&1 | tee log_rpob_sl_.txt -time ./run_sl.py -g gid -d streptomycin 2>&1 | tee log_gid_sl_.txt -time ./run_sl.py -g alr -d cycloserine 2>&1 | tee log_alr_sl_.txt - -=-----------------------------------= -# COMPLETE data -#------------------------------------= -time ./run_cd_sl.py -g pncA -d pyrazinamide 2>&1 | tee log_pnca_cd_sl_.txt -time ./run_cd_sl.py -g embB -d ethambutol 2>&1 | tee log_embb_cd_sl_.txt -time ./run_cd_sl.py -g katG -d isoniazid 2>&1 | tee log_katg_cd_sl_.txt -time ./run_cd_sl.py -g rpoB -d rifampicin 2>&1 | tee log_rpob_cd_sl_.txt -time ./run_cd_sl.py -g gid -d streptomycin 2>&1 | tee log_gid_cd_sl_.txt -time ./run_cd_sl.py -g alr -d cycloserine 2>&1 | tee log_alr_cd_sl_.txt - - -######################################################################## - -######################################################################## -######################################################################## -###################### Feature Selection ########################## -######################################################################## -######################################################################## - -# 7030 -time ./run_FS.py -g pncA -d pyrazinamide 2>&1 | tee log_FS_pnca_7030.txt - - -time ./run_FS_7030.py -g pncA -d pyrazinamide 2>&1 | tee log_FS_pnca_7030_.txt