added .py

This commit is contained in:
Tanushree Tunstall 2022-06-29 12:08:35 +01:00
parent 9aadb0329f
commit 087170a798

View file

@ -0,0 +1,102 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Jun 25 11:07:30 2022
@author: tanu
"""
import sys, os
import pandas as pd
import numpy as np
import re
###############################################################################
# Add the project's ML helper directory to the module search path so that the
# project-local imports further down (e.g. SplitTTS) can be resolved.
homedir = os.path.expanduser("~")
ml_functions_dir = homedir + '/git/LSHTM_analysis/scripts/ml/ml_functions'
# Only append when the entry is missing: re-running this cell in an
# interactive session would otherwise keep adding duplicate entries.
if ml_functions_dir not in sys.path:
    sys.path.append(ml_functions_dir)
###############################################################################
#====================
# Import ML functions
#====================
# from MultClfs import *
# from GetMLData import *
from SplitTTS import *
#%% Load all gene files #######################################################
# Keyword arguments shared by every getmldata() call below.
# NOTE(review): getmldata is not imported explicitly in this file (the
# GetMLData star-import above is commented out); presumably it is made
# available through `from SplitTTS import *` -- confirm.
combined_model_paramD = dict(
    data_combined_model=True,
    use_or=False,
    omit_all_genomic_features=False,
    write_maskfile=False,
    write_outfile=False,
)

# (gene, drug) pairs, in the order in which their data is loaded.
gene_drug_pairs = [
    ('pncA', 'pyrazinamide'),
    ('embB', 'ethambutol'),
    ('katG', 'isoniazid'),
    ('rpoB', 'rifampicin'),
    ('gid', 'streptomycin'),
    ('alr', 'cycloserine'),
]

# One getmldata() result per pair; these are used as pandas data frames
# further down (pd.concat, .columns).
pnca_df, embb_df, katg_df, rpob_df, gid_df, alr_df = [
    getmldata(gene, drug, **combined_model_paramD)
    for gene, drug in gene_drug_pairs
]
# Quick check: row-bind two of the gene data frames and pull out the affinity,
# gene name and ligand distance columns to see how pd.concat treats columns
# that are not shared by both frames.
foo = pd.concat(objs=[alr_df, pnca_df])
check1 = foo.filter(regex='.*_affinity|gene_name|ligand_distance', axis='columns')
# Finding: pd.concat joins the frames correctly, but introduces NAs for
# columns that are missing from one of the frames.
# TODO: discuss whether to set these NAs to 0 and use them, or to omit them.
# For now they are omitted, i.e. the frames are combined on common columns only.
# Combine the per-gene data frames row-wise, keeping only the columns that
# every frame has (concatenating on all columns would introduce NAs).
dfs_combine = [pnca_df, embb_df, katg_df, rpob_df, gid_df, alr_df]

# Number of rows the combined frame must have: every row of every input frame.
expected_nrows = sum(len(df) for df in dfs_combine)

# Columns present in all frames. A set has no stable iteration order, so the
# shared columns are listed in the order in which they appear in the first
# frame; this keeps the column order of combined_df reproducible between runs.
shared_cols = set.intersection(*(set(df.columns) for df in dfs_combine))
common_cols = [col for col in dfs_combine[0].columns if col in shared_cols]

# The number of common columns can only equal the smallest column count when
# every column of the narrowest frame is also present in all other frames.
expected_ncols = min(len(df.columns) for df in dfs_combine)

if len(common_cols) != expected_ncols:
    # Stop with an explicit error: carrying on would either fail on the
    # undefined combined_df below or, in an interactive session, silently
    # reuse a stale combined_df left over from an earlier run.
    raise ValueError('\nFAIL: could not combine dfs, length mismatch'
                     '\nExpected ncols: {}'
                     '\nGot: {}'.format(expected_ncols, len(common_cols)))

print('\nProceeding to combine based on common cols (n):', len(common_cols))
# ignore_index = False keeps each frame's own index labels in the result.
combined_df = pd.concat([df[common_cols] for df in dfs_combine], ignore_index = False)

# Sanity check on the row count: no rows may be lost or duplicated.
if len(combined_df) != expected_nrows:
    raise ValueError('\nFAIL: combined df has unexpected number of rows'
                     '\nExpected nrows: {}'
                     '\nGot: {}'.format(expected_nrows, len(combined_df)))

print('\nSuccessfully combined dfs:'
      , '\nNo. of dfs combined:', len(dfs_combine)
      , '\nDim of combined df:', combined_df.shape)

colnames_combined_df = combined_df.columns
##############################################################################
#%% split_tts(): func params
# (ml_input_data
# , data_type = ['actual', 'complete']
# , split_type = ['70_30', '80_20', 'sl']
# , oversampling = True
# , dst_colname = 'dst'# determine how to subset the actual vs reverse data
# , target_colname = 'dst_mode'
# , include_gene_name = True
# , k_smote = 5)
#%% split data into different data types
# #===================
# # 70/30
# #===================
# # actual
# tts_7030_paramD = {'data_type' : 'actual'
# , 'split_type' : '70_30'}
# # complete
# tts_cd_7030_paramD = {'data_type' : 'complete'
# , 'split_type' : '70_30'}
# # call split_tts()
# data_CM_7030D = split_tts(ml_input_data = combined_df
# , **tts_7030_paramD
# , oversampling = True
# , dst_colname = 'dst'
# , target_colname = 'dst_mode'
# , include_gene_name = False) # when not doing leave one group out
# data_cd_CM_7030D = split_tts(ml_input_data = combined_df
# , **tts_cd_7030_paramD
# , oversampling = True
# , dst_colname = 'dst'
# , target_colname = 'dst_mode'
# , include_gene_name = False)