added .py
This commit is contained in:
parent
9aadb0329f
commit
087170a798
1 changed files with 102 additions and 0 deletions
102
scripts/ml/ml_functions/ml_data_combined.py
Normal file
102
scripts/ml/ml_functions/ml_data_combined.py
Normal file
|
@ -0,0 +1,102 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Sat Jun 25 11:07:30 2022
|
||||||
|
|
||||||
|
@author: tanu
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys, os
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import re
|
||||||
|
###############################################################################
|
||||||
|
homedir = os.path.expanduser("~")
|
||||||
|
sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml/ml_functions')
|
||||||
|
sys.path
|
||||||
|
###############################################################################
|
||||||
|
#====================
|
||||||
|
# Import ML functions
|
||||||
|
#====================
|
||||||
|
# from MultClfs import *
|
||||||
|
# from GetMLData import *
|
||||||
|
from SplitTTS import *
|
||||||
|
#%% Load all gene files #######################################################
|
||||||
|
# param dict
|
||||||
|
# Keyword arguments passed unchanged to every getmldata() call below.
combined_model_paramD = {
    'data_combined_model': True,
    'use_or': False,
    'omit_all_genomic_features': False,
    'write_maskfile': False,
    'write_outfile': False,
}

# (gene, drug) pairs, listed in the order the frames are loaded.
_gene_drug_pairs = [
    ('pncA', 'pyrazinamide'),
    ('embB', 'ethambutol'),
    ('katG', 'isoniazid'),
    ('rpoB', 'rifampicin'),
    ('gid', 'streptomycin'),
    ('alr', 'cycloserine'),
]

# NOTE(review): getmldata is not imported explicitly in this file (the
# `from GetMLData import *` line is commented out); presumably it is made
# available by `from SplitTTS import *` -- confirm.
pnca_df, embb_df, katg_df, rpob_df, gid_df, alr_df = [
    getmldata(gene, drug, **combined_model_paramD)
    for gene, drug in _gene_drug_pairs
]
|
||||||
|
|
||||||
|
# Quick check: stack two of the gene frames and keep only the affinity,
# gene name and ligand distance columns for inspection.
foo = pd.concat([alr_df, pnca_df])
check1 = foo.filter(
    regex='.*_affinity|gene_name|ligand_distance',
    axis='columns',
)
# Observation: pd.concat joins correctly but introduces NAs.
# TODO: discuss whether to set these to 0 and keep them, or to omit them.
# For now they are omitted, i.e. the frames are combined on common columns only.
|
||||||
|
|
||||||
|
# All gene-level frames that feed the combined model, in load order.
dfs_combine = [pnca_df, embb_df, katg_df, rpob_df, gid_df, alr_df]

# Row-wise stacking must keep every row of every frame.
expected_nrows = sum(len(df) for df in dfs_combine)

# Columns present in every frame. The set gives the membership test; the
# list is rebuilt in the column order of the first frame so that the layout
# of combined_df is identical from run to run (iterating a set of strings is
# not stable across interpreter sessions).
shared_cols = set.intersection(*(set(df.columns) for df in dfs_combine))
common_cols = [col for col in dfs_combine[0].columns if col in shared_cols]

# The check below requires the number of shared columns to equal the column
# count of the narrowest frame.
expected_ncols = min(len(df.columns) for df in dfs_combine)

if len(common_cols) != expected_ncols:
    # Stop here with the diagnostic instead of printing it and then failing
    # later with a NameError on the never-created combined_df.
    raise ValueError(
        'FAIL: could not combine dfs, length mismatch'
        '\nExpected ncols: {}'
        '\nGot: {}'.format(expected_ncols, len(common_cols))
    )

print('\nProceeding to combine based on common cols (n):', len(common_cols))

# ignore_index=False keeps each frame's own index labels in the result.
combined_df = pd.concat(
    [df[common_cols] for df in dfs_combine],
    ignore_index=False,
)

if len(combined_df) != expected_nrows:
    raise ValueError(
        'FAIL: combined df has unexpected number of rows'
        '\nExpected nrows: {}'
        '\nGot: {}'.format(expected_nrows, len(combined_df))
    )

print('\nSuccessfully combined dfs:'
      , '\nNo. of dfs combined:', len(dfs_combine)
      , '\nDim of combined df:', combined_df.shape)

colnames_combined_df = combined_df.columns
|
||||||
|
##############################################################################
|
||||||
|
|
||||||
|
#%% split_tts(): func params
|
||||||
|
# (ml_input_data
|
||||||
|
# , data_type = ['actual', 'complete']
|
||||||
|
# , split_type = ['70_30', '80_20', 'sl']
|
||||||
|
# , oversampling = True
|
||||||
|
# , dst_colname = 'dst'# determine how to subset the actual vs reverse data
|
||||||
|
# , target_colname = 'dst_mode'
|
||||||
|
# , include_gene_name = True
|
||||||
|
# , k_smote = 5)
|
||||||
|
#%% split data into different data types
|
||||||
|
# #===================
|
||||||
|
# # 70/30
|
||||||
|
# #===================
|
||||||
|
# # actual
|
||||||
|
# tts_7030_paramD = {'data_type' : 'actual'
|
||||||
|
# , 'split_type' : '70_30'}
|
||||||
|
|
||||||
|
# # complete
|
||||||
|
# tts_cd_7030_paramD = {'data_type' : 'complete'
|
||||||
|
# , 'split_type' : '70_30'}
|
||||||
|
|
||||||
|
# # call split_tts()
|
||||||
|
# data_CM_7030D = split_tts(ml_input_data = combined_df
|
||||||
|
# , **tts_7030_paramD
|
||||||
|
# , oversampling = True
|
||||||
|
# , dst_colname = 'dst'
|
||||||
|
# , target_colname = 'dst_mode'
|
||||||
|
# , include_gene_name = False) # when not doing leave one group out
|
||||||
|
|
||||||
|
# data_cd_CM_7030D = split_tts(ml_input_data = combined_df
|
||||||
|
# , **tts_cd_7030_paramD
|
||||||
|
# , oversampling = True
|
||||||
|
# , dst_colname = 'dst'
|
||||||
|
# , target_colname = 'dst_mode'
|
||||||
|
# , include_gene_name = False)
|
Loading…
Add table
Add a link
Reference in a new issue