From 23799275a0d1585f469e461a6cec9ca6aad573a6 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Fri, 8 Jul 2022 13:53:17 +0100 Subject: [PATCH] saving work from thinkpad --- scripts/count_vars_ML.R | 22 +++++++- scripts/ml/ml_functions/MultClfs.py | 53 ++++++++++--------- .../ml/ml_functions/test_func_singlegene.py | 5 +- 3 files changed, 49 insertions(+), 31 deletions(-) diff --git a/scripts/count_vars_ML.R b/scripts/count_vars_ML.R index 228d462..ec13f53 100644 --- a/scripts/count_vars_ML.R +++ b/scripts/count_vars_ML.R @@ -4,7 +4,7 @@ #source("~/git/LSHTM_analysis/config/embb.R") #source("~/git/LSHTM_analysis/config/gid.R") #source("~/git/LSHTM_analysis/config/katg.R") -#source("~/git/LSHTM_analysis/config/pnca.R") +source("~/git/LSHTM_analysis/config/pnca.R") source("~/git/LSHTM_analysis/config/rpob.R") ############################# @@ -55,7 +55,7 @@ if (check12) { cat('\nPASS: dst mode labels verified. merged_df3 CAN be trusted! ') }else{ stop('FAIL: Something is wrong with the dst_mode column. Quitting!') -``} +} #========================== # CHECK: active site labels @@ -189,6 +189,24 @@ if ( all( check12 && aa_check1 && aa_check2 && a1 && b1 && a2 && b2 && l1 && l2 #quit() } +#%%################################################################### +# check merged_df3 +check_mdf3 = merged_df3[, cols_sel] + +check_mdf3T = table(check_mdf3$mutationinformation, check_mdf3$dst_mode) +ft_mdf3 = as.data.frame.matrix(check_mdf3T) + +#================== +# CHECK: dst mode +#=================== +dst_check_mdf3 = all((ft_mdf3[,1]==0)==(ft_mdf3[,2]!=0)); dst_check_mdf3 + +sel = c("mutationinformation", "dst", "dst_mode") + +a = merged_df3[, sel] +str(a) + + # write file # outfile_merged_df3 = paste0(outdir, '/', tolower(gene), '_merged_df3.csv') # outfile_merged_df3 diff --git a/scripts/ml/ml_functions/MultClfs.py b/scripts/ml/ml_functions/MultClfs.py index c703a6a..46be467 100755 --- a/scripts/ml/ml_functions/MultClfs.py +++ b/scripts/ml/ml_functions/MultClfs.py @@ -146,8 +146,8 @@ scoreBT_mapD = {'bts_mcc' : 'MCC' def MultModelsCl(input_df, target #, skf_cv , sel_cv - , blind_test_df - , blind_test_target + #, blind_test_df + #, blind_test_target , tts_split_type , resampling_type = 'none' # default @@ -231,35 +231,36 @@ def MultModelsCl(input_df, target # Specify multiple Classification Models #====================================================== models = [('AdaBoost Classifier' , AdaBoostClassifier(**rs) ) - , ('Bagging Classifier' , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True, verbose = 3, n_estimators = 100) ) - , ('Decision Tree' , DecisionTreeClassifier(**rs) ) - , ('Extra Tree' , ExtraTreeClassifier(**rs) ) - , ('Extra Trees' , ExtraTreesClassifier(**rs) ) - , ('Gradient Boosting' , GradientBoostingClassifier(**rs) ) - , ('Gaussian NB' , GaussianNB() ) - , ('Gaussian Process' , GaussianProcessClassifier(**rs) ) - , ('K-Nearest Neighbors' , KNeighborsClassifier() ) - , ('LDA' , LinearDiscriminantAnalysis() ) - , ('Logistic Regression' , LogisticRegression(**rs) ) - , ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs)) - , ('MLP' , MLPClassifier(max_iter = 500, **rs) ) + # , ('Bagging Classifier' , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True, verbose = 3, n_estimators = 100) ) + # , ('Decision Tree' , DecisionTreeClassifier(**rs) ) + # , ('Extra Tree' , ExtraTreeClassifier(**rs) ) + # , ('Extra Trees' , ExtraTreesClassifier(**rs) ) + # , ('Gradient Boosting' , GradientBoostingClassifier(**rs) ) + # , ('Gaussian NB' , GaussianNB() ) + # , ('Gaussian Process' , GaussianProcessClassifier(**rs) ) + # , ('K-Nearest Neighbors' , KNeighborsClassifier() ) + , ('LDA' , LinearDiscriminantAnalysis() ) + # , ('Logistic Regression' , LogisticRegression(**rs) ) + # , ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs)) + # , ('MLP' , MLPClassifier(max_iter = 500, **rs) ) #, ('Multinomial' , MultinomialNB() ) - , ('Naive Bayes' , BernoulliNB() ) - , ('Passive Aggresive' , PassiveAggressiveClassifier(**rs, **njobs) ) - , ('QDA' , QuadraticDiscriminantAnalysis() ) - , ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) ) - # , ('Random Forest2' , RandomForestClassifier(min_samples_leaf = 5 + # , ('Naive Bayes' , BernoulliNB() ) + # , ('Passive Aggresive' , PassiveAggressiveClassifier(**rs, **njobs) ) + # , ('QDA' , QuadraticDiscriminantAnalysis() ) + # , ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) ) + # # , ('Random Forest2' , RandomForestClassifier(min_samples_leaf = 5 # , n_estimators = 1000 # , bootstrap = True # , oob_score = True # , **njobs # , **rs # , max_features = 'auto') ) - , ('Ridge Classifier' , RidgeClassifier(**rs) ) - , ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) ) - , ('SVC' , SVC(**rs) ) - , ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) ) - , ('XGBoost' , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False, **njobs) ) + # , ('Ridge Classifier' , RidgeClassifier(**rs) ) + # , ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) ) + # , ('SVC' , SVC(**rs) ) + # , ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) ) + # , ('XGBoost' , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False, **njobs) ) + # ] mm_skf_scoresD = {} @@ -308,7 +309,7 @@ def MultModelsCl(input_df, target # ADD more info: meta data related to input df mm_skf_scoresD[model_name]['resampling'] = resampling_type mm_skf_scoresD[model_name]['n_training_size'] = len(input_df) - mm_skf_scoresD[model_name]['n_trainingY_ratio'] = round(Counter(target)[0]/Counter(target)[1], 2) + #mm_skf_scoresD[model_name]['n_trainingY_ratio'] = round(Counter(target)[0]/Counter(target)[1], 2) mm_skf_scoresD[model_name]['n_features'] = len(input_df.columns) mm_skf_scoresD[model_name]['tts_split'] = tts_split_type @@ -357,7 +358,7 @@ def MultModelsCl(input_df, target # Build bts numbers dict btD = {'n_blindY_neg' : Counter(blind_test_target)[0] , 'n_blindY_pos' : Counter(blind_test_target)[1] - , 'n_testY_ratio' : round(Counter(blind_test_target)[0]/Counter(blind_test_target)[1], 2) + #, 'n_testY_ratio' : round(Counter(blind_test_target)[0]/Counter(blind_test_target)[1], 2) , 'n_test_size' : len(blind_test_df) } # Update cmD+tnD dicts with btD diff --git a/scripts/ml/ml_functions/test_func_singlegene.py b/scripts/ml/ml_functions/test_func_singlegene.py index f340ef8..ea80074 100644 --- a/scripts/ml/ml_functions/test_func_singlegene.py +++ b/scripts/ml/ml_functions/test_func_singlegene.py @@ -58,8 +58,8 @@ all(df.columns.isin(['gene_name'])) # should be False spl_type = '70_30' -spl_type = '80_20' -spl_type = 'sl' +#spl_type = '80_20' +#spl_type = 'sl' df2 = split_tts(df , data_type = 'actual' @@ -84,7 +84,6 @@ fooD = MultModelsCl(input_df = df2['X'] , var_type = ['mixed'] , scale_numeric = ['min_max'] , return_formatted_output = False - ) for k, v in fooD.items():