undo wrong rename

2022-07-02 10:31:55 +01:00 · 2022-07-02 10:31:55 +01:00 · a166a37c0e
commit a166a37c0e
parent 5a81511163
1 changed files with 0 additions and 0 deletions
--- a/scripts/ml/ml_iterator.py
+++ b/scripts/ml/ml_iterator.py
@ -0,0 +1,111 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Jun 29 20:29:36 2022
+
+@author: tanu
+"""
+import sys, os
+import pandas as pd
+import numpy as np
+import re
+    
+###############################################################################
+# Make the project's ML helper modules importable from the checkout that
+# lives under the current user's home directory.
+homedir = os.path.expanduser("~")
+sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml/ml_functions')
+###############################################################################
+# Directory for the per-gene output files. The trailing slash matters: file
+# names are appended to this string by plain concatenation further down.
+outdir = homedir + '/git/LSHTM_ML/output/genes/'
+
+#====================
+# Import ML functions 
+#====================
+from MultClfs import *
+from GetMLData import *
+from SplitTTS import *
+
+# Keyword arguments for getmldata(), collected in a single dict.
+combined_model_paramD = dict(
+    data_combined_model=False,
+    use_or=False,
+    omit_all_genomic_features=False,
+    write_maskfile=False,
+    write_outfile=False,
+)
+###############################################################################
+#ml_genes = ["pncA", "embB", "katG", "rpoB", "gid"]
+
+# Gene -> drug pairs; the main loop below processes one entry at a time.
+ml_gene_drugD = dict(
+    pncA='pyrazinamide',
+    embB='ethambutol',
+    katG='isoniazid',
+    rpoB='rifampicin',
+    gid='streptomycin',
+)
+
+# Per-gene data returned by getmldata(), keyed by the lower-cased gene name.
+gene_dataD = dict()
+
+# Values passed to split_tts() as split_type and data_type respectively.
+split_types = ['70_30', '80_20', 'sl']
+split_data_types = ['actual', 'complete']
+
+# For every gene/drug pair: fetch the ML data once, then for each combination
+# of split type and data type run MultModelsCl() on the baseline data and on
+# each resampled variant, and write all scores to one CSV per combination.
+for gene, drug in ml_gene_drugD.items():
+    print ('\nGene:', gene
+           , '\nDrug:', drug)
+    gene_low = gene.lower()
+
+    # combined_model_paramD sets data_combined_model = False, which means
+    # 'gene_name' is not included as a feature, as a single gene-target
+    # shouldn't have it.
+    gene_dataD[gene_low] = getmldata(gene, drug, **combined_model_paramD)
+
+    for split_type in split_types:
+        for data_type in split_data_types:
+
+            # Full output path: outdir already ends with '/'.
+            out_filename = outdir + gene_low + '_' + split_type + '_' + data_type + '.csv'
+
+            tempD = split_tts(gene_dataD[gene_low],
+                              data_type = data_type,
+                              split_type = split_type,
+                              oversampling = True,
+                              dst_colname = 'dst',
+                              target_colname = 'dst_mode',
+                              include_gene_name = True)
+
+            # One MultModelsCl() parameter set per resampling strategy.
+            # The baseline uses the plain 'X'/'y' entries of tempD; every
+            # resampled variant uses its 'X_<name>'/'y_<name>' entries.
+            paramD = {
+                'baseline_paramD': {'input_df'       : tempD['X'],
+                                    'target'         : tempD['y'],
+                                    'var_type'       : 'mixed',
+                                    'resampling_type': 'none'},
+            }
+            for resampling in ('smnc', 'ros', 'rus', 'rouC'):
+                paramD[resampling + '_paramD'] = {
+                    'input_df'       : tempD['X_' + resampling],
+                    'target'         : tempD['y_' + resampling],
+                    'var_type'       : 'mixed',
+                    'resampling_type': resampling,
+                }
+
+            # NOTE(review): skf_cv is not defined anywhere in this script;
+            # presumably it is exported by one of the star-imported
+            # ml_functions modules -- confirm, otherwise this raises NameError.
+            mmDD = {}
+            for k, v in paramD.items():
+                mmDD[k] = MultModelsCl(**v,
+                                       tts_split_type = split_type,
+                                       skf_cv = skf_cv,
+                                       blind_test_df = tempD['X_bts'],
+                                       blind_test_target = tempD['y_bts'],
+                                       add_cm = True,
+                                       add_yn = True,
+                                       return_formatted_output = True)
+
+            # Stack the per-resampling score tables into a single table
+            # (one concat is enough; it already consumes every dict value).
+            out_wf = pd.concat(mmDD, ignore_index = True)
+
+            out_wf_f = out_wf.sort_values(by = ['resampling', 'source_data', 'MCC'],
+                                          ascending = [True, True, False])
+
+            # out_filename is already a full path under outdir, so write to it
+            # directly instead of prefixing a second absolute directory.
+            # NOTE(review): the previous code prefixed
+            # '/home/tanu/git/Data/ml_combined/genes/' onto this path; confirm
+            # that outdir is the intended destination.
+            out_wf_f.to_csv(out_filename, index = False)
+