From dccd3c8eb28efa14bfc38b5af46ec9466537a465 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Sat, 2 Jul 2022 10:25:42 +0100 Subject: [PATCH] multiple changes --- scripts/ml/combined_model/cm_logo_skf.py | 2 +- scripts/ml/combined_model/cm_ml_iterator.py | 118 +++++++++---------- scripts/ml/ml_functions/MultClfs.py | 7 +- scripts/ml/ml_functions/MultClfs_logo.py | 6 - scripts/ml/ml_functions/MultClfs_logo_skf.py | 7 +- scripts/ml/ml_functions/ml_data_combined.py | 4 + scripts/ml/ml_iterator_fs.py | 60 ++++------ 7 files changed, 82 insertions(+), 122 deletions(-) diff --git a/scripts/ml/combined_model/cm_logo_skf.py b/scripts/ml/combined_model/cm_logo_skf.py index 28d8730..a54b8f2 100755 --- a/scripts/ml/combined_model/cm_logo_skf.py +++ b/scripts/ml/combined_model/cm_logo_skf.py @@ -80,7 +80,7 @@ homedir = os.path.expanduser("~") sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml/ml_functions') sys.path ############################################################################### -outdir = homedir + '/git/LSHTM_ML/output/combined/ +outdir = homedir + '/git/LSHTM_ML/output/combined/' #==================== # Import ML functions diff --git a/scripts/ml/combined_model/cm_ml_iterator.py b/scripts/ml/combined_model/cm_ml_iterator.py index 20e8b0a..f899b68 100755 --- a/scripts/ml/combined_model/cm_ml_iterator.py +++ b/scripts/ml/combined_model/cm_ml_iterator.py @@ -15,13 +15,15 @@ homedir = os.path.expanduser("~") sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml/ml_functions') sys.path ############################################################################### -outdir = homedir + '/git/LSHTM_ML/output/combined/ +outdir = homedir + '/git/LSHTM_ML/output/combined/' #==================== # Import ML functions #==================== #from MultClfs import * -from MultClfs_logo_skf import * +#from MultClfs_logo_skf import * +from MultClfs_logo_skf_split import * + from GetMLData import * from SplitTTS import * @@ -29,73 +31,59 @@ from SplitTTS import * from ml_data_combined import * ############################################################################### -#ml_genes = ["pncA", "embB", "katG", "rpoB", "gid"] +print('\nUsing data with 5 genes:', len(cm_input_df5)) + +############################################################################### -ml_gene_drugD = {'pncA' : 'pyrazinamide' - , 'embB' : 'ethambutol' - , 'katG' : 'isoniazid' - , 'rpoB' : 'rifampicin' - , 'gid' : 'streptomycin' - } -gene_dataD={} split_types = ['70_30', '80_20', 'sl'] split_data_types = ['actual', 'complete'] -for gene, drug in ml_gene_drugD.items(): - print ('\nGene:', gene - , '\nDrug:', drug) - gene_low = gene.lower() - gene_dataD[gene_low] = getmldata(gene, drug - , data_combined_model = False # this means it doesn't include 'gene_name' as a feauture as a single gene-target shouldn't have it. - , use_or = False - , omit_all_genomic_features = False - , write_maskfile = False - , write_outfile = False) +for split_type in split_types: + for data_type in split_data_types: - for split_type in split_types: - for data_type in split_data_types: - out_filename = outdir + gene.lower()+ '_' + split_type + '_' + data_type + '.csv' - tempD=split_tts(gene_dataD[gene_low] - , data_type = data_type - , split_type = split_type - , oversampling = True - , dst_colname = 'dst' - , target_colname = 'dst_mode' - , include_gene_name = True - ) - paramD = { - 'baseline_paramD': { 'input_df' : tempD['X'] - , 'target' : tempD['y'] - , 'var_type' : 'mixed' - , 'resampling_type': 'none'} - , 'smnc_paramD': { 'input_df' : tempD['X_smnc'] - , 'target' : tempD['y_smnc'] - , 'var_type' : 'mixed' - , 'resampling_type' : 'smnc'} - , 'ros_paramD': { 'input_df' : tempD['X_ros'] - , 'target' : tempD['y_ros'] - , 'var_type' : 'mixed' - , 'resampling_type' : 'ros'} - , 'rus_paramD' : { 'input_df' : tempD['X_rus'] - , 'target' : tempD['y_rus'] - , 'var_type' : 'mixed' - , 'resampling_type' : 'rus'} - , 'rouC_paramD' : { 'input_df' : tempD['X_rouC'] - , 'target' : tempD['y_rouC'] - , 'var_type' : 'mixed' - , 'resampling_type': 'rouC'} - } - - mmDD = {} - for k, v in paramD.items(): - scoresD = MultModelsCl_logo_skf(**paramD[k] - XXXXXXXXXXXXXXXXXXXXXXX - mmDD[k] = scoresD + out_filename = outdir + 'cm_' + split_type + '_' + data_type + '.csv' + print(out_filename) + tempD = split_tts(cm_input_df5 + , data_type = data_type + , split_type = split_type + , oversampling = True + , dst_colname = 'dst' + , target_colname = 'dst_mode' + , include_gene_name = True + ) + paramD = { + 'baseline_paramD': { 'input_df' : tempD['X'] + , 'target' : tempD['y'] + , 'var_type' : 'mixed' + , 'resampling_type' : 'none'} + , 'smnc_paramD' : { 'input_df' : tempD['X_smnc'] + , 'target' : tempD['y_smnc'] + , 'var_type' : 'mixed' + , 'resampling_type' : 'smnc'} + , 'ros_paramD' : { 'input_df' : tempD['X_ros'] + , 'target' : tempD['y_ros'] + , 'var_type' : 'mixed' + , 'resampling_type' : 'ros'} + , 'rus_paramD' : { 'input_df' : tempD['X_rus'] + , 'target' : tempD['y_rus'] + , 'var_type' : 'mixed' + , 'resampling_type' : 'rus'} + , 'rouC_paramD' : { 'input_df' : tempD['X_rouC'] + , 'target' : tempD['y_rouC'] + , 'var_type' : 'mixed' + , 'resampling_type' : 'rouC'} + } + + mmDD = {} + for k, v in paramD.items(): + scoresD = MultModelsCl_logo_skf(**paramD[k] + XXXXXXXXXXXXXXXXXXXXXXX + mmDD[k] = scoresD - # Extracting the dfs from within the dict and concatenating to output as one df - for k, v in mmDD.items(): - out_wf= pd.concat(mmDD, ignore_index = True) - - out_wf_f = out_wf.sort_values(by = ['resampling', 'source_data', 'MCC'], ascending = [True, True, False], inplace = False) - out_wf_f.to_csv(('/home/tanu/git/Data/ml_combined/genes/'+out_filename), index = False) + # Extracting the dfs from within the dict and concatenating to output as one df + for k, v in mmDD.items(): + out_wf= pd.concat(mmDD, ignore_index = True) + + out_wf_f = out_wf.sort_values(by = ['resampling', 'source_data', 'MCC'], ascending = [True, True, False], inplace = False) + out_wf_f.to_csv(('/home/tanu/git/Data/ml_combined/genes/'+out_filename), index = False) diff --git a/scripts/ml/ml_functions/MultClfs.py b/scripts/ml/ml_functions/MultClfs.py index b16ce2e..522ef20 100755 --- a/scripts/ml/ml_functions/MultClfs.py +++ b/scripts/ml/ml_functions/MultClfs.py @@ -344,12 +344,7 @@ def MultModelsCl(input_df, target, skf_cv mm_skf_scoresD[model_name]['bts_roc_auc'] = round(roc_auc_score(blind_test_target, bts_predict),2) mm_skf_scoresD[model_name]['bts_jcc'] = round(jaccard_score(blind_test_target, bts_predict),2) #mm_skf_scoresD[model_name]['diff_mcc'] = train_test_diff_MCC - - #ADD: target numbers for bts - yc2 = Counter(blind_test_target) - yc2_ratio = yc2[0]/yc2[1] - mm_skf_scoresD[model_name]['n_test_size'] = len(blind_test_df) - mm_skf_scoresD[model_name]['n_testY_ratio']= round(yc2_ratio,2) + #return(mm_skf_scoresD) #============================ diff --git a/scripts/ml/ml_functions/MultClfs_logo.py b/scripts/ml/ml_functions/MultClfs_logo.py index aa9bb4e..bf03382 100755 --- a/scripts/ml/ml_functions/MultClfs_logo.py +++ b/scripts/ml/ml_functions/MultClfs_logo.py @@ -357,12 +357,6 @@ def MultModelsCl_logo(input_df mm_skf_scoresD[model_name]['bts_roc_auc'] = round(roc_auc_score(blind_test_target, bts_predict),2) mm_skf_scoresD[model_name]['bts_jcc'] = round(jaccard_score(blind_test_target, bts_predict),2) #mm_skf_scoresD[model_name]['diff_mcc'] = train_test_diff_MCC - - #ADD: target numbers for bts - yc2 = Counter(blind_test_target) - yc2_ratio = yc2[0]/yc2[1] - mm_skf_scoresD[model_name]['n_test_size'] = len(blind_test_df) - mm_skf_scoresD[model_name]['n_testY_ratio']= round(yc2_ratio,2) #return(mm_skf_scoresD) #============================ diff --git a/scripts/ml/ml_functions/MultClfs_logo_skf.py b/scripts/ml/ml_functions/MultClfs_logo_skf.py index afebe94..68eb906 100755 --- a/scripts/ml/ml_functions/MultClfs_logo_skf.py +++ b/scripts/ml/ml_functions/MultClfs_logo_skf.py @@ -369,12 +369,7 @@ def MultModelsCl_logo_skf(input_df mm_skf_scoresD[model_name]['bts_roc_auc'] = round(roc_auc_score(blind_test_target, bts_predict),2) mm_skf_scoresD[model_name]['bts_jcc'] = round(jaccard_score(blind_test_target, bts_predict),2) #mm_skf_scoresD[model_name]['diff_mcc'] = train_test_diff_MCC - - #ADD: target numbers for bts - yc2 = Counter(blind_test_target) - yc2_ratio = yc2[0]/yc2[1] - mm_skf_scoresD[model_name]['n_test_size'] = len(blind_test_df) - mm_skf_scoresD[model_name]['n_testY_ratio']= round(yc2_ratio,2) + #return(mm_skf_scoresD) #============================ diff --git a/scripts/ml/ml_functions/ml_data_combined.py b/scripts/ml/ml_functions/ml_data_combined.py index 5033359..c4fc494 100644 --- a/scripts/ml/ml_functions/ml_data_combined.py +++ b/scripts/ml/ml_functions/ml_data_combined.py @@ -67,4 +67,8 @@ if 'gene_name' in colnames_combined_df: print("\nGene name included") else: ('\nGene name NOT included') + + +omit_gene_alr = ['alr'] +cm_input_df5 = combined_df[~combined_df['gene_name'].isin(omit_gene_alr)] ############################################################################## diff --git a/scripts/ml/ml_iterator_fs.py b/scripts/ml/ml_iterator_fs.py index 57d93e2..b8ea10b 100755 --- a/scripts/ml/ml_iterator_fs.py +++ b/scripts/ml/ml_iterator_fs.py @@ -51,17 +51,17 @@ split_data_types = ['actual', 'complete'] fs_models = [('AdaBoost Classifier' , AdaBoostClassifier(**rs) ) , ('Decision Tree' , DecisionTreeClassifier(**rs) ) - , ('Extra Tree' , ExtraTreeClassifier(**rs) ) - , ('Extra Trees' , ExtraTreesClassifier(**rs) ) - , ('Gradient Boosting' , GradientBoostingClassifier(**rs) ) - , ('LDA' , LinearDiscriminantAnalysis() ) - , ('Logistic Regression' , LogisticRegression(**rs) ) - , ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs)) - , ('Passive Aggresive' , PassiveAggressiveClassifier(**rs, **njobs) ) - , ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000 ) ) - , ('Ridge Classifier' , RidgeClassifier(**rs) ) - , ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) ) - , ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) ) + #, ('Extra Tree' , ExtraTreeClassifier(**rs) ) + #, ('Extra Trees' , ExtraTreesClassifier(**rs) ) + #, ('Gradient Boosting' , GradientBoostingClassifier(**rs) ) + #, ('LDA' , LinearDiscriminantAnalysis() ) + #, ('Logistic Regression' , LogisticRegression(**rs) ) + #, ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs)) + #, ('Passive Aggresive' , PassiveAggressiveClassifier(**rs, **njobs) ) + #, ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000 ) ) + #, ('Ridge Classifier' , RidgeClassifier(**rs) ) + #, ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) ) + #, ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) ) ] for gene, drug in ml_gene_drugD.items(): @@ -78,7 +78,7 @@ for gene, drug in ml_gene_drugD.items(): for split_type in split_types: for data_type in split_data_types: # unused per-split outfile - out_filename = outdir + gene.lower() + '_'+split_type+'_' + data_type + '.json' + #out_filename = outdir + gene.lower() + '_'+split_type+'_' + data_type + '.json' tempD=split_tts(gene_dataD[gene_low] , data_type = data_type , split_type = split_type @@ -122,41 +122,25 @@ for gene, drug in ml_gene_drugD.items(): , '\nModel func:' , model_fn) #, '\nList of models:', models) index = index+1 - - out_fsD[model_name] = {} - # current_model = {} + #out_fsD[model_name] = {} + current_model = {} for k, v in paramD.items(): - # out_filename = (gene.lower() + '_' + split_type + '_' + data_type + '_' + k + '.json') + out_filename = (gene.lower() + '_' + split_type + '_' + data_type + '_' + model_name + '_' + k + '.json') fsD_params=paramD[k] - # print("XXXXXX THIS: ", len(fsD_params['input_df']) ) - # print("XXXXXX THIS: ", out_filename ) - # current_model[k] = fsgs_rfecv( - out_fsD[model_name][k] = fsgs_rfecv( + #out_fsD[model_name][k] = fsgs_rfecv( + thingg = foo( + ) + current_model[k] = fsgs_rfecv( **fsD_params , param_gridLd = [{'fs__min_features_to_select': [1]}] , blind_test_df = tempD['X_bts'] , blind_test_target = tempD['y_bts'] , estimator = model_fn , use_fs = False # uses estimator as the RFECV parameter for fs. Set to TRUE if you want to supply custom_fs as shown below + # NOTE: IS THIS CORRECT?!? , custom_fs = RFECV(DecisionTreeClassifier(**rs), cv = skf_cv, scoring = 'matthews_corrcoef') , cv_method = skf_cv ) - # write per-resampler outfile here - # with open(out_filename, 'w') as f: - # f.write(json.dumps(current_model - # , cls = NpEncoder ) - # ) - - # write per-split outfile here - with open(out_filename, 'w') as f: - f.write(json.dumps(out_fsD - #, cls = NpEncoder - )) -#%%############################################################################ -# # Read output json -# testF = outdir + 'pnca_70_30_actual.json' -# testF = outdir + 'pnca_70_30_complete.json' - -# with open(testF, 'r') as f: -# data = json.load(f) + with open(out_filename, 'w') as f: + f.write(json.dumps(current_model)