Added a test script that runs the ML functions on a single gene (pncA)

2022-07-05 16:06:24 +01:00 · 2022-07-05 16:06:24 +01:00 · ebef0c7967
commit ebef0c7967
parent 79cb89a019
1 changed files with 79 additions and 0 deletions
--- a/scripts/ml/ml_functions/test_func_singlegene.py
+++ b/scripts/ml/ml_functions/test_func_singlegene.py
@ -0,0 +1,79 @@
+import pandas as pd
+import os, sys
+import numpy as np
+from sklearn.datasets import load_boston 
+from sklearn.ensemble import RandomForestRegressor 
+from sklearn.model_selection import train_test_split
+from sklearn.feature_selection import RFECV
+import matplotlib.pyplot as plt
+###############################################################################
+# Put the directory holding the project's ML helper modules on the module
+# search path so the imports that follow can be resolved.
+homedir = os.path.expanduser("~")
+ml_functions_dir = homedir + '/git/LSHTM_analysis/scripts/ml/ml_functions'
+# Only append when absent: re-running the script in the same interpreter
+# session would otherwise add a duplicate entry each time.
+if ml_functions_dir not in sys.path:
+    sys.path.append(ml_functions_dir)
+
+# import
+from GetMLData import *
+from SplitTTS import *
+from MultClfs_fi import *
+
+#%%
+# X,y = load_boston(return_X_y=True) 
+# features = load_boston()['feature_names']
+
+# X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.33, random_state=42)
+
+# rf = RandomForestRegressor(random_state=0) 
+# rf.fit(X_train,y_train)
+
+
+# f_i = list(zip(features,rf.feature_importances_)) 
+# f_i.sort(key = lambda x : x[1])
+# plt.barh([x[0] for x in f_i],[x[1] for x in f_i]) 
+# plt.show()
+
+#%%
+
+# Cross-validation splitter, later passed to MultClfs_fi() as `sel_cv`.
+# NOTE(review): neither StratifiedKFold nor `rs` is imported explicitly in
+# this script; presumably both arrive through the star imports -- confirm.
+sel_cv = StratifiedKFold(n_splits=10, shuffle=True, **rs)
+# Alternatives that were tried:
+#   sel_cv = logo
+#   sel_cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, **rs)
+
+# Keyword arguments forwarded to getmldata().
+gene_model_paramD = dict(
+    data_combined_model=False,
+    use_or=False,
+    omit_all_genomic_features=False,
+    write_maskfile=False,
+    write_outfile=False,
+)
+
+# General form: df = getmldata(gene, drug, **gene_model_paramD)
+df = getmldata('pncA', 'pyrazinamide', **gene_model_paramD)
+
+# Split the gene dataframe with the '70_30' scheme. The result is indexed
+# further down by 'X', 'y', 'X_bts' and 'y_bts' (the *_bts entries are the
+# ones passed on as the blind-test data).
+df2 = split_tts(
+    df,
+    data_type='actual',
+    split_type='70_30',
+    oversampling=False,
+    dst_colname='dst',
+    target_colname='dst_mode',
+    include_gene_name=True,
+    random_state=42,  # default
+)
+
+# NOTE(review): the value of this expression is discarded when the file is
+# run as a script; it is only visible when evaluated interactively. As
+# written, all(...) is False unless every column label equals 'gene_name',
+# so it does not test whether 'gene_name' is present -- confirm the intent.
+all(df2['X'].columns.isin(['gene_name']))  # should be False
+
+# Run MultClfs_fi() on the 'X'/'y' split with blind-test evaluation enabled
+# on the 'X_bts'/'y_bts' split. The reporting loop that follows treats the
+# return value as a dict keyed by model, each entry holding 'test_mcc' and
+# 'bts_mcc'.
+fooD = MultClfs_fi(
+    input_df=df2['X'],
+    target=df2['y'],
+    sel_cv=sel_cv,
+    run_blind_test=True,
+    blind_test_df=df2['X_bts'],
+    blind_test_target=df2['y_bts'],
+    tts_split_type='70_30',
+    var_type='mixed',
+    resampling_type='none',  # default
+)
+
+# Per-model report: the 'test_mcc' score, the 'bts_mcc' score and their
+# difference (bts minus test). The per-model dict is unpacked once from
+# .items() rather than looked up again through fooD[...] for every value.
+# NOTE(review): the first score is labelled 'TRAIN MCC' but is read from the
+# 'test_mcc' key -- confirm which score that key actually holds.
+for model_name, scores in fooD.items():
+    test_mcc = scores['test_mcc']
+    bts_mcc = scores['bts_mcc']
+    print('\nModel:', model_name,
+          '\nTRAIN MCC:', test_mcc,
+          '\nBTS MCC:', bts_mcc,
+          '\nDIFF:', bts_mcc - test_mcc)