reran to generate merged_df3 with correct dst for dst muts. modified combining_dfs_plotting.R

This commit is contained in:
Tanushree Tunstall 2022-07-08 21:33:57 +01:00
parent 289c8913d0
commit 8079dd7b6c
6 changed files with 148 additions and 211 deletions

View file

@ -168,7 +168,7 @@ def MultModelsCl(input_df, target
@param skv_cv: stratified K-fold, given as an int or as a StratifiedKFold object (so that shuffle and random_state can be passed)
@type: int or StratifiedKFold()
@var_type: numerical, categorical and mixed to determine what col_transform to apply (MinMaxScalar and/or one-ho t encoder)
@var_type: numerical, categorical and mixed to determine what col_transform to apply (MinMaxScaler and/or one-hot encoder)
@type: list
returns

View file

@ -168,7 +168,7 @@ def MultModelsCl(input_df, target
@param skv_cv: stratified K-fold, given as an int or as a StratifiedKFold object (so that shuffle and random_state can be passed)
@type: int or StratifiedKFold()
@var_type: numerical, categorical and mixed to determine what col_transform to apply (MinMaxScalar and/or one-ho t encoder)
@var_type: numerical, categorical and mixed to determine what col_transform to apply (MinMaxScaler and/or one-hot encoder)
@type: list
returns
@ -239,8 +239,8 @@ def MultModelsCl(input_df, target
# , ('Gaussian NB' , GaussianNB() )
# , ('Gaussian Process' , GaussianProcessClassifier(**rs) )
# , ('K-Nearest Neighbors' , KNeighborsClassifier() )
# , ('LDA' , LinearDiscriminantAnalysis() )
# , ('Logistic Regression' , LogisticRegression(**rs) )
, ('LDA' , LinearDiscriminantAnalysis() )
, ('Logistic Regression' , LogisticRegression(**rs) )
# , ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs))
# , ('MLP' , MLPClassifier(max_iter = 500, **rs) )
#, ('Multinomial' , MultinomialNB() )
@ -259,7 +259,7 @@ def MultModelsCl(input_df, target
# , ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) )
# , ('SVC' , SVC(**rs) )
# , ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) )
# , ('XGBoost' , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False, **njobs) )
, ('XGBoost' , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False, **njobs) )
#
]

View file

@ -14,23 +14,8 @@ sys.path
# import
from GetMLData import *
from SplitTTS import *
#from MultClfs_fi import *
from MultClfs import *
#%%
# X,y = load_boston(return_X_y=True)
# features = load_boston()['feature_names']
# X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.33, random_state=42)
# rf = RandomForestRegressor(random_state=0)
# rf.fit(X_train,y_train)
# f_i = list(zip(features,rf.feature_importances_))
# f_i.sort(key = lambda x : x[1])
# plt.barh([x[0] for x in f_i],[x[1] for x in f_i])
# plt.show()
#from MultClfs import *
from MultClfs_SIMPLE import *
#%%
@ -54,9 +39,8 @@ df = getmldata('katG', 'isoniazid' , **gene_model_paramD)
df = getmldata('rpoB', 'rifampicin' , **gene_model_paramD)
df = getmldata('gid' , 'streptomycin' , **gene_model_paramD)
#df = getmldata('alr' , 'cycloserine' , **combined_model_paramD)
all(df.columns.isin(['gene_name'])) # should be False
spl_type = '70_30'
#spl_type = '80_20'
#spl_type = 'sl'
@ -73,6 +57,16 @@ df2 = split_tts(df
all(df2['X'].columns.isin(['gene_name'])) # should be False
df['dst'].value_counts()
df['dst'].isna().sum()
df['dst_mode'].value_counts()
len(df)
Counter(df2['y'])
Counter(df2['y_bts'])
fooD = MultModelsCl(input_df = df2['X']
, target = df2['y']
, sel_cv = skf_cv