saving work from thinkpad

This commit is contained in:
Tanushree Tunstall 2022-07-08 13:53:17 +01:00
parent 5577f5b195
commit 23799275a0
3 changed files with 49 additions and 31 deletions

View file

@ -4,7 +4,7 @@
#source("~/git/LSHTM_analysis/config/embb.R") #source("~/git/LSHTM_analysis/config/embb.R")
#source("~/git/LSHTM_analysis/config/gid.R") #source("~/git/LSHTM_analysis/config/gid.R")
#source("~/git/LSHTM_analysis/config/katg.R") #source("~/git/LSHTM_analysis/config/katg.R")
#source("~/git/LSHTM_analysis/config/pnca.R") source("~/git/LSHTM_analysis/config/pnca.R")
source("~/git/LSHTM_analysis/config/rpob.R") source("~/git/LSHTM_analysis/config/rpob.R")
############################# #############################
@ -55,7 +55,7 @@ if (check12) {
cat('\nPASS: dst mode labels verified. merged_df3 CAN be trusted! ') cat('\nPASS: dst mode labels verified. merged_df3 CAN be trusted! ')
}else{ }else{
stop('FAIL: Something is wrong with the dst_mode column. Quitting!') stop('FAIL: Something is wrong with the dst_mode column. Quitting!')
``} }
#========================== #==========================
# CHECK: active site labels # CHECK: active site labels
@ -189,6 +189,24 @@ if ( all( check12 && aa_check1 && aa_check2 && a1 && b1 && a2 && b2 && l1 && l2
#quit() #quit()
} }
#%%###################################################################
# check merged_df3
check_mdf3 = merged_df3[, cols_sel]
check_mdf3T = table(check_mdf3$mutationinformation, check_mdf3$dst_mode)
ft_mdf3 = as.data.frame.matrix(check_mdf3T)
#==================
# CHECK: dst mode
#===================
dst_check_mdf3 = all((ft_mdf3[,1]==0)==(ft_mdf3[,2]!=0)); dst_check_mdf3
sel = c("mutationinformation", "dst", "dst_mode")
a = merged_df3[, sel]
str(a)
# write file # write file
# outfile_merged_df3 = paste0(outdir, '/', tolower(gene), '_merged_df3.csv') # outfile_merged_df3 = paste0(outdir, '/', tolower(gene), '_merged_df3.csv')
# outfile_merged_df3 # outfile_merged_df3

View file

@ -146,8 +146,8 @@ scoreBT_mapD = {'bts_mcc' : 'MCC'
def MultModelsCl(input_df, target def MultModelsCl(input_df, target
#, skf_cv #, skf_cv
, sel_cv , sel_cv
, blind_test_df #, blind_test_df
, blind_test_target #, blind_test_target
, tts_split_type , tts_split_type
, resampling_type = 'none' # default , resampling_type = 'none' # default
@ -231,35 +231,36 @@ def MultModelsCl(input_df, target
# Specify multiple Classification Models # Specify multiple Classification Models
#====================================================== #======================================================
models = [('AdaBoost Classifier' , AdaBoostClassifier(**rs) ) models = [('AdaBoost Classifier' , AdaBoostClassifier(**rs) )
, ('Bagging Classifier' , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True, verbose = 3, n_estimators = 100) ) # , ('Bagging Classifier' , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True, verbose = 3, n_estimators = 100) )
, ('Decision Tree' , DecisionTreeClassifier(**rs) ) # , ('Decision Tree' , DecisionTreeClassifier(**rs) )
, ('Extra Tree' , ExtraTreeClassifier(**rs) ) # , ('Extra Tree' , ExtraTreeClassifier(**rs) )
, ('Extra Trees' , ExtraTreesClassifier(**rs) ) # , ('Extra Trees' , ExtraTreesClassifier(**rs) )
, ('Gradient Boosting' , GradientBoostingClassifier(**rs) ) # , ('Gradient Boosting' , GradientBoostingClassifier(**rs) )
, ('Gaussian NB' , GaussianNB() ) # , ('Gaussian NB' , GaussianNB() )
, ('Gaussian Process' , GaussianProcessClassifier(**rs) ) # , ('Gaussian Process' , GaussianProcessClassifier(**rs) )
, ('K-Nearest Neighbors' , KNeighborsClassifier() ) # , ('K-Nearest Neighbors' , KNeighborsClassifier() )
, ('LDA' , LinearDiscriminantAnalysis() ) , ('LDA' , LinearDiscriminantAnalysis() )
, ('Logistic Regression' , LogisticRegression(**rs) ) # , ('Logistic Regression' , LogisticRegression(**rs) )
, ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs)) # , ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs))
, ('MLP' , MLPClassifier(max_iter = 500, **rs) ) # , ('MLP' , MLPClassifier(max_iter = 500, **rs) )
#, ('Multinomial' , MultinomialNB() ) #, ('Multinomial' , MultinomialNB() )
, ('Naive Bayes' , BernoulliNB() ) # , ('Naive Bayes' , BernoulliNB() )
, ('Passive Aggresive' , PassiveAggressiveClassifier(**rs, **njobs) ) # , ('Passive Aggresive' , PassiveAggressiveClassifier(**rs, **njobs) )
, ('QDA' , QuadraticDiscriminantAnalysis() ) # , ('QDA' , QuadraticDiscriminantAnalysis() )
, ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) ) # , ('Random Forest' , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) )
# , ('Random Forest2' , RandomForestClassifier(min_samples_leaf = 5 # # , ('Random Forest2' , RandomForestClassifier(min_samples_leaf = 5
# , n_estimators = 1000 # , n_estimators = 1000
# , bootstrap = True # , bootstrap = True
# , oob_score = True # , oob_score = True
# , **njobs # , **njobs
# , **rs # , **rs
# , max_features = 'auto') ) # , max_features = 'auto') )
, ('Ridge Classifier' , RidgeClassifier(**rs) ) # , ('Ridge Classifier' , RidgeClassifier(**rs) )
, ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) ) # , ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) )
, ('SVC' , SVC(**rs) ) # , ('SVC' , SVC(**rs) )
, ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) ) # , ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) )
, ('XGBoost' , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False, **njobs) ) # , ('XGBoost' , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False, **njobs) )
#
] ]
mm_skf_scoresD = {} mm_skf_scoresD = {}
@ -308,7 +309,7 @@ def MultModelsCl(input_df, target
# ADD more info: meta data related to input df # ADD more info: meta data related to input df
mm_skf_scoresD[model_name]['resampling'] = resampling_type mm_skf_scoresD[model_name]['resampling'] = resampling_type
mm_skf_scoresD[model_name]['n_training_size'] = len(input_df) mm_skf_scoresD[model_name]['n_training_size'] = len(input_df)
mm_skf_scoresD[model_name]['n_trainingY_ratio'] = round(Counter(target)[0]/Counter(target)[1], 2) #mm_skf_scoresD[model_name]['n_trainingY_ratio'] = round(Counter(target)[0]/Counter(target)[1], 2)
mm_skf_scoresD[model_name]['n_features'] = len(input_df.columns) mm_skf_scoresD[model_name]['n_features'] = len(input_df.columns)
mm_skf_scoresD[model_name]['tts_split'] = tts_split_type mm_skf_scoresD[model_name]['tts_split'] = tts_split_type
@ -357,7 +358,7 @@ def MultModelsCl(input_df, target
# Build bts numbers dict # Build bts numbers dict
btD = {'n_blindY_neg' : Counter(blind_test_target)[0] btD = {'n_blindY_neg' : Counter(blind_test_target)[0]
, 'n_blindY_pos' : Counter(blind_test_target)[1] , 'n_blindY_pos' : Counter(blind_test_target)[1]
, 'n_testY_ratio' : round(Counter(blind_test_target)[0]/Counter(blind_test_target)[1], 2) #, 'n_testY_ratio' : round(Counter(blind_test_target)[0]/Counter(blind_test_target)[1], 2)
, 'n_test_size' : len(blind_test_df) } , 'n_test_size' : len(blind_test_df) }
# Update cmD+tnD dicts with btD # Update cmD+tnD dicts with btD

View file

@ -58,8 +58,8 @@ all(df.columns.isin(['gene_name'])) # should be False
spl_type = '70_30' spl_type = '70_30'
spl_type = '80_20' #spl_type = '80_20'
spl_type = 'sl' #spl_type = 'sl'
df2 = split_tts(df df2 = split_tts(df
, data_type = 'actual' , data_type = 'actual'
@ -84,7 +84,6 @@ fooD = MultModelsCl(input_df = df2['X']
, var_type = ['mixed'] , var_type = ['mixed']
, scale_numeric = ['min_max'] , scale_numeric = ['min_max']
, return_formatted_output = False , return_formatted_output = False
) )
for k, v in fooD.items(): for k, v in fooD.items():