#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Combine the per-gene ML input dataframes into one dataframe.

Loads the ML data for each gene/drug pair with getmldata(), keeps only the
columns that are present in every gene's dataframe, and row-binds the
result into `combined_df`.

File: scripts/ml/ml_functions/ml_data_combined.py

Created on Sat Jun 25 11:07:30 2022

@author: tanu
"""

import sys
import os
import pandas as pd
import numpy as np
import re
###############################################################################
homedir = os.path.expanduser("~")
sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml/ml_functions')
###############################################################################
#====================
# Import ML functions
#====================
# from MultClfs import *
# from GetMLData import *
# NOTE(review): getmldata() is not imported explicitly in this file; presumably
# it is re-exported by the star import below -- confirm.
from SplitTTS import *
#%% Load all gene files #######################################################
# param dict
combined_model_paramD = {'data_combined_model'     : True
                         , 'use_or'                : False
                         , 'omit_all_genomic_features': False
                         , 'write_maskfile'        : False
                         , 'write_outfile'         : False }

pnca_df = getmldata('pncA', 'pyrazinamide' , **combined_model_paramD)
embb_df = getmldata('embB', 'ethambutol'   , **combined_model_paramD)
katg_df = getmldata('katG', 'isoniazid'    , **combined_model_paramD)
rpob_df = getmldata('rpoB', 'rifampicin'   , **combined_model_paramD)
gid_df  = getmldata('gid' , 'streptomycin' , **combined_model_paramD)
alr_df  = getmldata('alr' , 'cycloserine'  , **combined_model_paramD)

# quick check
foo = pd.concat([alr_df, pnca_df])
check1 = foo.filter(regex= '.*_affinity|gene_name|ligand_distance', axis = 1)
# So, pd.concat will join correctly but introduce NAs.
# TODO: discuss whether to make these 0 and use it or just omit
# For now I am omitting these i.e combining only on common columns

# finding common columns
dfs_combine = [pnca_df, embb_df, katg_df, rpob_df, gid_df, alr_df]

expected_nrows = sum(len(df) for df in dfs_combine)

# Columns shared by every df. The set gives membership only: set iteration
# order is not stable for strings across interpreter runs, so the column
# order is taken from the first df to keep combined_df reproducible.
shared_colnames = set.intersection(*(set(df.columns) for df in dfs_combine))
common_cols = [col for col in dict.fromkeys(dfs_combine[0].columns)
               if col in shared_colnames]

# The narrowest df bounds how many columns can possibly be shared
expected_ncols = np.min([len(df.columns) for df in dfs_combine])

if len(common_cols) == expected_ncols:
    print('\nProceeding to combine based on common cols (n):', len(common_cols))
    combined_df = pd.concat([df[common_cols] for df in dfs_combine], ignore_index = False)
    print('\nSuccessfully combined dfs:'
          , '\nNo. of dfs combined:', len(dfs_combine)
          , '\nDim of combined df:', combined_df.shape)
else:
    print('\nFAIL: could not combine dfs, length mismatch'
          , '\nExpected ncols:', expected_ncols
          , '\nGot:', len(common_cols))
    # Stop here: combined_df was never created, so carrying on would only
    # fail later with an unrelated NameError.
    raise ValueError('could not combine dfs: expected '
                     + str(expected_ncols) + ' common columns, got '
                     + str(len(common_cols)))

colnames_combined_df = combined_df.columns
##############################################################################

#%% split_tts(): func params
# (ml_input_data
# , data_type = ['actual', 'complete']
# , split_type = ['70_30', '80_20', 'sl']
# , oversampling = True
# , dst_colname = 'dst'# determine how to subset the actual vs reverse data
# , target_colname = 'dst_mode'
# , include_gene_name = True
# , k_smote = 5)
#%% split data into different data types
# #===================
# # 70/30
# #===================
# # actual
# tts_7030_paramD = {'data_type' : 'actual'
#                    , 'split_type' : '70_30'}

# # complete
# tts_cd_7030_paramD = {'data_type' : 'complete'
#                       , 'split_type' : '70_30'}

# # call split_tts()
# data_CM_7030D = split_tts(ml_input_data = combined_df
#                           , **tts_7030_paramD
#                           , oversampling = True
#                           , dst_colname = 'dst'
#                           , target_colname = 'dst_mode'
#                           , include_gene_name = False) # when not doing leave one group out

# data_cd_CM_7030D = split_tts(ml_input_data = combined_df
#                           , **tts_cd_7030_paramD
#                           , oversampling = True
#                           , dst_colname = 'dst'
#                           , target_colname = 'dst_mode'
#                           , include_gene_name = False)