added cm_logo_skf.py and placeholder for splits

2022-07-01 13:55:12 +01:00 · 2022-07-01 13:55:12 +01:00 · d812835713
commit d812835713
parent 952cfeb4c0
4 changed files with 254 additions and 49 deletions
--- a/scripts/ml/combined_model/cm_logo_skf.py
+++ b/scripts/ml/combined_model/cm_logo_skf.py
@ -0,0 +1,120 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Jun 29 19:44:06 2022
+
+@author: tanu
+"""
+import sys, os
+import pandas as pd
+import numpy as np
+import re
+###############################################################################
+# Make the project's ml_functions directory importable from this script.
+homedir = os.path.expanduser("~")
+sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml/ml_functions')
+# NOTE(review): bare expression; it has no effect when the file is run as a
+# script (it only echoes the path list in an interactive session).
+sys.path
+###############################################################################
+#====================
+# Import ML functions
+#====================
+# Star imports: every public name of both modules lands in this namespace.
+# combined_df, StratifiedKFold and rs used below are not defined in this file;
+# presumably they come from these two modules -- TODO confirm.
+from ml_data_combined import *
+from MultClfs_logo_skf import *
+#from GetMLData import *
+#from SplitTTS import *
+
+# Module-level 10-fold stratified, shuffled CV splitter; CMLogoSkf() passes it
+# to MultModelsCl_logo_skf() as sel_cv. rs is unpacked as keyword arguments
+# (presumably {'random_state': ...} -- verify in MultClfs_logo_skf).
+skf_cv = StratifiedKFold(n_splits = 10 , shuffle = True,**rs)
+
+#logo = LeaveOneGroupOut()
+
+#%%
+def CMLogoSkf(combined_df
+        , all_genes =  ["embb", "katg", "rpob", "pnca", "gid", "alr"]
+        , bts_genes = ["embb", "katg", "rpob", "pnca", "gid"]
+        , cols_to_drop = ['dst', 'dst_mode', 'gene_name']
+        , target_var = 'dst_mode'
+        , gene_group = 'gene_name'
+        , std_gene_omit = []
+        ):
+    '''
+    Leave-one-gene-out evaluation of the combined model.
+
+    For every gene in bts_genes, the rows of that gene are held out as the
+    blind test set, the remaining rows (minus the genes in std_gene_omit)
+    form the training set, MultModelsCl_logo_skf() is run with the
+    module-level skf_cv splitter, and the returned scores are written to one
+    csv file per blind test gene.
+
+    @ param combined_df: data frame holding the features, the target column
+        and the gene column for all genes
+    @ type: pandas DataFrame
+
+    @ param all_genes: names of all genes taking part in the analysis; only
+        used to report the training genes and to build the output file name.
+        Row selection is done on combined_df itself.
+    @ type: list of str
+
+    @ param bts_genes: genes used, one at a time, as the blind test set
+    @ type: list of str
+
+    @ param cols_to_drop: columns removed from the data to obtain the
+        feature matrices (training and blind test)
+    @ type: list of str
+
+    @ param target_var: name of the target column
+    @ type: str
+
+    @ param gene_group: name of the column holding the gene of each row
+    @ type: str
+
+    @ param std_gene_omit: genes always left out of the training data, in
+        addition to the current blind test gene
+    @ type: list of str
+
+    returns
+    None. One csv per blind test gene is written to
+    /home/tanu/git/Data/ml_combined/. The process exits via sys.exit() if
+    any of cols_to_drop is still present in the training features.
+    '''
+    # NOTE: the list defaults are never mutated below (only read, or
+    # concatenated into new lists), so sharing them across calls is safe.
+    n_total_genes = len(all_genes)
+
+    for bts_gene in bts_genes:
+        print('\n BTS gene:', bts_gene)
+
+        # genes whose rows must not end up in the training data
+        tr_gene_omit = std_gene_omit + [bts_gene]
+
+        # Training genes = every gene that is not omitted. Derived from
+        # all_genes (order preserved) so that the printed list and the count
+        # always agree with the row filter applied below.
+        training_genesL = [gene for gene in all_genes if gene not in tr_gene_omit]
+        n_tr_genes = len(training_genesL)
+
+        print('\nTotal genes: ', n_total_genes
+              ,'\nTraining on:', n_tr_genes
+              ,'\nTraining on genes:', training_genesL
+              , '\nOmitted genes:', tr_gene_omit
+              , '\nBlind test gene:', bts_gene)
+
+        tts_split_type = "logoBT_" + bts_gene
+
+        # n_tr_genes + 1: the training genes plus the blind test gene
+        outFile = "/home/tanu/git/Data/ml_combined/" + str(n_tr_genes+1) + "genes_" + tts_split_type + ".csv"
+        print(outFile)
+
+        #-------
+        # training
+        #------
+        cm_training_df = combined_df[~combined_df[gene_group].isin(tr_gene_omit)]
+
+        cm_X = cm_training_df.drop(cols_to_drop, axis=1, inplace=False)
+        cm_y = cm_training_df.loc[:, target_var]
+
+        print('\nTraining data dim:', cm_X.shape
+              , '\nTraining Target dim:', cm_y.shape)
+
+        # sanity check: none of the dropped columns may remain as a feature
+        if cm_X.columns.isin(cols_to_drop).any():
+            sys.exit('\nFAIL: training data contains Target var')
+        print('\nChecked training df does NOT have Target var')
+
+        #---------------
+        # BTS: genes
+        #---------------
+        cm_test_df = combined_df[combined_df[gene_group].isin([bts_gene])]
+
+        cm_bts_X = cm_test_df.drop(cols_to_drop, axis = 1, inplace = False)
+        cm_bts_y = cm_test_df.loc[:, target_var]
+
+        print('\nBlind test data dim:', cm_bts_X.shape
+              , '\nBlind test Target dim:', cm_bts_y.shape)
+
+        #%%:Running Multiple models on LOGO with SKF
+        cD3_v2 = MultModelsCl_logo_skf(input_df = cm_X
+                        , target = cm_y
+                        , group = 'none'
+                        , sel_cv = skf_cv
+
+                        , blind_test_df = cm_bts_X
+                        , blind_test_target = cm_bts_y
+
+                        , tts_split_type = tts_split_type
+
+                        , resampling_type = 'none' # default
+                        , add_cm = True
+                        , add_yn = True
+                        , var_type = 'mixed'
+
+                        , run_blind_test = True
+                        , return_formatted_output = True
+                        , random_state = 42
+                        , n_jobs = 10
+                        )
+
+        cD3_v2.to_csv(outFile)
+
+#%%
+# Run 1: default arguments -- std_gene_omit is empty, so only the current
+# blind test gene is removed from the training data.
+CMLogoSkf(combined_df)
+# Run 2: additionally leave 'alr' out of the training data for every blind
+# test gene.
+CMLogoSkf(combined_df, std_gene_omit=['alr'])
--- a/scripts/ml/combined_model/cm_ml_iterator.py
+++ b/scripts/ml/combined_model/cm_ml_iterator.py
@ -0,0 +1,107 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Jun 29 20:29:36 2022
+
+@author: tanu
+"""
+import sys, os
+import pandas as pd
+import numpy as np
+import re
+    
+###############################################################################
+# Make the project's ml_functions directory importable from this script.
+homedir = os.path.expanduser("~")
+sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml/ml_functions')
+# NOTE(review): bare expression; it has no effect when the file is run as a
+# script.
+sys.path
+###############################################################################
+#====================
+# Import ML functions
+#====================
+# Star imports: getmldata(), split_tts(), MultModelsCl() and skf_cv used
+# further down are not defined in this file; presumably they come from these
+# modules -- TODO confirm.
+from MultClfs import *
+from GetMLData import *
+from SplitTTS import *
+
+# param dict for getmldata()
+# NOTE(review): not referenced in the visible part of this script; the
+# getmldata() call in the loop below passes the same values explicitly.
+combined_model_paramD = {'data_combined_model'   : False
+                    , 'use_or'                   : False
+                    , 'omit_all_genomic_features': False
+                    , 'write_maskfile'           : False
+                    , 'write_outfile'            : False }
+###############################################################################
+#ml_genes = ["pncA", "embB", "katG", "rpoB", "gid"]
+
+# gene -> drug pairs iterated over by the main loop
+ml_gene_drugD = {'pncA'   : 'pyrazinamide'
+                 , 'embB' : 'ethambutol'
+                 , 'katG' : 'isoniazid'
+                 , 'rpoB' : 'rifampicin'
+                 , 'gid'  : 'streptomycin'
+                 }
+# filled by the main loop: lower-cased gene name -> value returned by getmldata()
+gene_dataD={}
+# every split_type / data_type combination is run for every gene
+split_types = ['70_30', '80_20', 'sl']
+split_data_types = ['actual', 'complete']
+
+# For every gene/drug pair: fetch the ML data once, then for every split type
+# and data type build the train/test splits, run MultModelsCl() on the
+# baseline data and on each resampled variant, and write all scores of that
+# combination to one csv file.
+for gene, drug in ml_gene_drugD.items():
+    print ('\nGene:', gene
+           , '\nDrug:', drug)
+    gene_low = gene.lower()
+    gene_dataD[gene_low] = getmldata(gene, drug
+              , data_combined_model = False # this means it doesn't include 'gene_name' as a feature, as a single gene-target shouldn't have it.
+              , use_or = False
+              , omit_all_genomic_features = False
+              , write_maskfile = False
+              , write_outfile = False)
+
+    for split_type in split_types:
+        for data_type in split_data_types:
+            out_filename = (gene_low+'_'+split_type+'_'+data_type+'.csv')
+            tempD=split_tts(gene_dataD[gene_low]
+                      , data_type = data_type
+                      , split_type = split_type
+                      , oversampling = True
+                      , dst_colname = 'dst'
+                      , target_colname = 'dst_mode'
+                      , include_gene_name = True
+                  )
+            # one keyword-argument set per resampling variant; each reads the
+            # matching X_*/y_* entries of the dict returned by split_tts()
+            paramD = {
+                    'baseline_paramD': { 'input_df'        : tempD['X']
+                                        , 'target'         : tempD['y']
+                                        , 'var_type'       : 'mixed'
+                                        , 'resampling_type': 'none'}
+                    , 'smnc_paramD': { 'input_df'          : tempD['X_smnc']
+                                      , 'target'           : tempD['y_smnc']
+                                      , 'var_type'         : 'mixed'
+                                      , 'resampling_type'  : 'smnc'}
+                    , 'ros_paramD': { 'input_df'           : tempD['X_ros']
+                                    , 'target'             : tempD['y_ros']
+                                    , 'var_type'           : 'mixed'
+                                    , 'resampling_type'    : 'ros'}
+                    , 'rus_paramD' : { 'input_df'          : tempD['X_rus']
+                                      , 'target'           : tempD['y_rus']
+                                      , 'var_type'         : 'mixed'
+                                      , 'resampling_type'  : 'rus'}
+                    , 'rouC_paramD' : { 'input_df'         : tempD['X_rouC']
+                                        , 'target'         : tempD['y_rouC']
+                                        , 'var_type'       : 'mixed'
+                                        , 'resampling_type': 'rouC'}
+                    }
+
+            # Scores per resampling variant. The blind test data is the same
+            # for all variants.
+            # NOTE(review): skf_cv is not defined in this script; presumably
+            # it is provided by one of the star imports -- TODO confirm.
+            mmDD = {}
+            for k, v in paramD.items():
+                mmDD[k] = MultModelsCl(**v
+                                    , tts_split_type = split_type
+                                    , skf_cv = skf_cv
+                                    , blind_test_df =  tempD['X_bts']
+                                    , blind_test_target = tempD['y_bts']
+                                    , add_cm = True
+                                    , add_yn = True
+                                    , return_formatted_output = True)
+
+            # Stack the per-variant results into one data frame. A single
+            # concat over the whole dict is enough; no loop is needed.
+            out_wf = pd.concat(mmDD, ignore_index = True)
+
+            out_wf_f = out_wf.sort_values(by = ['resampling', 'source_data', 'MCC'], ascending = [True, True, False], inplace = False)
+            out_wf_f.to_csv(('/home/tanu/git/Data/ml_combined/genes/'+out_filename), index = False)
+
--- a/scripts/ml/ml_functions/MultClfs_logo_skf.py
+++ b/scripts/ml/ml_functions/MultClfs_logo_skf.py
@ -89,14 +89,7 @@ scoring_fn =  ({ 'mcc'        : make_scorer(matthews_corrcoef)
                , 'jcc'       : make_scorer(jaccard_score)
            }) 
  
-skf_cv = StratifiedKFold(n_splits = 10
-                          #, shuffle = False, random_state= None)
-                          , shuffle = True,**rs)

-rskf_cv = RepeatedStratifiedKFold(n_splits = 10
-                                  , n_repeats = 3
-                                  , **rs)
-logo = LeaveOneGroupOut()

 mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
 jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
@ -160,7 +153,10 @@ def MultModelsCl_logo_skf(input_df
                       , add_yn = True  # adds target var class numbers
                       , var_type = ['numerical', 'categorical','mixed']
                       , run_blind_test = True
-                       , return_formatted_output = True):
+                       , return_formatted_output = True
+                       , random_state = 42
+                       , n_jobs = 10
+                       , ):

    '''
    @ param input_df: input features 
@ -179,10 +175,24 @@ def MultModelsCl_logo_skf(input_df
    Dict containing multiple classification scores for each model and mean of each Stratified Kfold including training
    '''
    
-    # if group == 'none':
-    #     sel_cv = skf_cv
-    # else: 
-    #     group = 'none'
+#%% Func globals        
+    rs = {'random_state': random_state}
+    njobs = {'n_jobs': n_jobs}
+    
+    skf_cv = StratifiedKFold(n_splits = 10
+                              #, shuffle = False, random_state= None)
+                              , shuffle = True,**rs)
+
+    rskf_cv = RepeatedStratifiedKFold(n_splits = 10
+                                      , n_repeats = 3
+                                      , **rs)
+    logo = LeaveOneGroupOut()
+
+    # select CV type:           
+    if group == 'none':
+        sel_cv = skf_cv
+    else: 
+        sel_cv = logo
    #======================================================
    # Determine categorical and numerical features
    #======================================================
@ -210,7 +220,7 @@ def MultModelsCl_logo_skf(input_df
    #======================================================
    # Specify multiple Classification Models  
    #======================================================
-    models = [('AdaBoost Classifier'     , AdaBoostClassifier(**rs) )
+    models = [('AdaBoost Classifier'          , AdaBoostClassifier(**rs) )
               , ('Bagging Classifier'        , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) )
               , ('Decision Tree'             , DecisionTreeClassifier(**rs) ) 
               , ('Extra Tree'                , ExtraTreeClassifier(**rs) )
--- a/scripts/ml/ml_functions/ml_data_combined.py
+++ b/scripts/ml/ml_functions/ml_data_combined.py
@ -63,40 +63,8 @@ else:
          , '\nGot:',  len(common_cols))

 colnames_combined_df = combined_df.columns
+# Report whether the combined data frame carries the gene_name column.
+if 'gene_name' in colnames_combined_df:
+    print("\nGene name included")
+else:
+    print('\nGene name NOT included')
 ##############################################################################
-
-#%% split_tts(): func params
-# (ml_input_data
-#   , data_type      = ['actual', 'complete']
-#   , split_type     = ['70_30', '80_20', 'sl']
-#   , oversampling   = True
-#   , dst_colname    = 'dst'# determine how to subset the actual vs reverse data
-#   , target_colname = 'dst_mode'
-#   , include_gene_name = True
-#   , k_smote = 5)
-#%% split data into different data types
-# #===================
-# #     70/30
-# #=================== 
-# # actual
-# tts_7030_paramD = {'data_type'    : 'actual'
-#               , 'split_type'      : '70_30'}
-
-# # complete
-# tts_cd_7030_paramD = {'data_type'  : 'complete'
-#               , 'split_type'      : '70_30'}
-
-# # call split_tts()                   
-# data_CM_7030D = split_tts(ml_input_data = combined_df
-#           , **tts_7030_paramD
-#           , oversampling = True
-#           , dst_colname = 'dst'
-#           , target_colname = 'dst_mode'
-#           , include_gene_name = False) # when not doing leave one group out  
-
-# data_cd_CM_7030D = split_tts(ml_input_data = combined_df
-#           , **tts_cd_7030_paramD
-#           , oversampling = True
-#           , dst_colname = 'dst'
-#           , target_colname = 'dst_mode'
-#           , include_gene_name = False)