added .py

2022-06-29 12:08:35 +01:00 · 2022-06-29 12:08:35 +01:00 · 087170a798
commit 087170a798
parent 9aadb0329f
1 changed files with 102 additions and 0 deletions
--- a/scripts/ml/ml_functions/ml_data_combined.py
+++ b/scripts/ml/ml_functions/ml_data_combined.py
@ -0,0 +1,102 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
 Created on Sat Jun 25 11:07:30 2022
@author: tanu
 """
 import sys, os
 import pandas as pd
 import numpy as np
 import re
 ###############################################################################
 # Make the project's ML helper modules importable: they are expected to live
 # in a checkout under the current user's home directory.
 homedir = os.path.expanduser("~")
 ml_functions_dir = homedir + '/git/LSHTM_analysis/scripts/ml/ml_functions'
 sys.path.append(ml_functions_dir)
 ###############################################################################
 #====================
 # Import ML functions 
 #====================
 # from MultClfs import *
 # from GetMLData import *
 from SplitTTS import *
 #%% Load all gene files #######################################################
 # param dict
 # Keyword arguments passed unchanged to every getmldata() call below.
 # NOTE(review): getmldata is not imported by name in this script; presumably
 # it arrives via `from SplitTTS import *` -- confirm.
 combined_model_paramD = {
     'data_combined_model': True,
     'use_or': False,
     'omit_all_genomic_features': False,
     'write_maskfile': False,
     'write_outfile': False,
 }

 # (gene, drug) pairs, listed in the order in which they are loaded.
 gene_drug_pairs = [
     ('pncA', 'pyrazinamide'),
     ('embB', 'ethambutol'),
     ('katG', 'isoniazid'),
     ('rpoB', 'rifampicin'),
     ('gid', 'streptomycin'),
     ('alr', 'cycloserine'),
 ]

 # One getmldata() result per gene, unpacked into the per-gene names used by
 # the rest of the script.
 pnca_df, embb_df, katg_df, rpob_df, gid_df, alr_df = [
     getmldata(gene, drug, **combined_model_paramD)
     for gene, drug in gene_drug_pairs
 ]
 # Quick check: stack two of the gene dataframes and keep only the columns
 # whose names match the affinity / gene_name / ligand_distance pattern, to
 # see how pd.concat handles columns that are not present in both.
 foo = pd.concat((alr_df, pnca_df))
 check1 = foo.filter(
     regex='.*_affinity|gene_name|ligand_distance',
     axis='columns',
 )
 # So, pd.concat will join correctly but introduce NAs.
 # TODO: discuss whether to set these to 0 and use them, or just omit them.
 # For now I am omitting these, i.e. combining only on common columns.
 # Combine the per-gene dataframes row-wise, keeping only the columns that are
 # present in every one of them.
 dfs_combine = [pnca_df, embb_df, katg_df, rpob_df, gid_df, alr_df]

 # The combined frame must contain every row of every input frame.
 expected_nrows = sum(len(df) for df in dfs_combine)

 # Finding common columns. A set has no defined iteration order, so the
 # intersection is only used for membership; the column order itself is taken
 # from pnca_df so that combined_df has the same column order on every run.
 shared_colnames = set.intersection(*(set(df.columns) for df in dfs_combine))
 common_cols = [col for col in dict.fromkeys(pnca_df.columns)
                if col in shared_colnames]

 # The check below passes only when the number of common columns equals the
 # column count of the narrowest dataframe.
 expected_ncols = min(len(df.columns) for df in dfs_combine)

 if len(common_cols) != expected_ncols:
     print('\nFAIL: could not combine dfs, length mismatch'
           , '\nExpected ncols:', expected_ncols
           , '\nGot:',  len(common_cols))
     # Stop here with an explicit error: combined_df was never built, so
     # carrying on would only fail later with an unrelated NameError.
     raise ValueError('could not combine dfs, length mismatch: expected '
                      + str(expected_ncols) + ' common cols, got '
                      + str(len(common_cols)))

 print('\nProceeding to combine based on common cols (n):', len(common_cols))
 # ignore_index = False keeps each dataframe's own index labels.
 combined_df = pd.concat([df[common_cols] for df in dfs_combine], ignore_index = False)

 if len(combined_df) != expected_nrows:
     raise ValueError('combined df row count mismatch: expected '
                      + str(expected_nrows) + ', got ' + str(len(combined_df)))

 print('\nSuccessfully combined dfs:'
       , '\nNo. of dfs combined:', len(dfs_combine)
       , '\nDim of combined df:', combined_df.shape)

 colnames_combined_df = combined_df.columns
 ##############################################################################
 #%% split_tts(): func params
 # (ml_input_data
 #   , data_type      = ['actual', 'complete']
 #   , split_type     = ['70_30', '80_20', 'sl']
 #   , oversampling   = True
 #   , dst_colname    = 'dst'# determine how to subset the actual vs reverse data
 #   , target_colname = 'dst_mode'
 #   , include_gene_name = True
 #   , k_smote = 5)
 #%% split data into different data types
 # #===================
 # #     70/30
 # #=================== 
 # # actual
 # tts_7030_paramD = {'data_type'    : 'actual'
 #               , 'split_type'      : '70_30'}
 # # complete
 # tts_cd_7030_paramD = {'data_type'  : 'complete'
 #               , 'split_type'      : '70_30'}
 # # call split_tts()                   
 # data_CM_7030D = split_tts(ml_input_data = combined_df
 #           , **tts_7030_paramD
 #           , oversampling = True
 #           , dst_colname = 'dst'
 #           , target_colname = 'dst_mode'
 #           , include_gene_name = False) # when not doing leave one group out  
 # data_cd_CM_7030D = split_tts(ml_input_data = combined_df
 #           , **tts_cd_7030_paramD
 #           , oversampling = True
 #           , dst_colname = 'dst'
 #           , target_colname = 'dst_mode'
 #           , include_gene_name = False)