added cm_logo_skf.py and placeholder for splits

This commit is contained in:
Tanushree Tunstall 2022-07-01 13:55:12 +01:00
parent 952cfeb4c0
commit d812835713
4 changed files with 254 additions and 49 deletions

View file

@ -0,0 +1,120 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 29 19:44:06 2022
@author: tanu
"""
import sys, os
import pandas as pd
import numpy as np
import re
###############################################################################
# Make the project's ML helper modules importable: they live in a fixed
# location under the user's home directory.
homedir = os.path.expanduser("~")
sys.path += [homedir + '/git/LSHTM_analysis/scripts/ml/ml_functions']
# Bare expression kept from the original; it has no effect when run as a script.
sys.path
###############################################################################
#====================
# Import ML functions
#====================
from ml_data_combined import *
from MultClfs_logo_skf import *
#from GetMLData import *
#from SplitTTS import *
# Shuffled, stratified 10-fold cross-validation splitter; CMLogoSkf() below
# passes it to MultModelsCl_logo_skf() as `sel_cv`.
# NOTE(review): `StratifiedKFold` and `rs` are not defined in this file --
# presumably both come from the star imports above; confirm that `rs` is still
# exported at module level.
skf_cv = StratifiedKFold(n_splits = 10 , shuffle = True,**rs)
#logo = LeaveOneGroupOut()
#%%
def CMLogoSkf(combined_df
              , all_genes = ("embb", "katg", "rpob", "pnca", "gid", "alr")
              , bts_genes = ("embb", "katg", "rpob", "pnca", "gid")
              , cols_to_drop = ('dst', 'dst_mode', 'gene_name')
              , target_var = 'dst_mode'
              , gene_group = 'gene_name'
              , std_gene_omit = ()
              , out_dir = "/home/tanu/git/Data/ml_combined/"
              ):
    """
    Leave-one-gene-out blind testing on the combined gene data.

    For every gene in `bts_genes`: the rows of that gene are held out as the
    blind test set, the rows of every gene in `std_gene_omit` are discarded as
    well, and the remaining rows are used as training data for
    MultModelsCl_logo_skf() (called with group = 'none' and sel_cv = skf_cv).
    The object returned by each run is written with .to_csv() into `out_dir`.

    @ param combined_df: data for all genes; must contain the `gene_group`
        column, the `target_var` column and every column in `cols_to_drop`
    @ type: pandas df

    @ param all_genes: names of all genes present in the data (used for the
        printed summary only)
    @ type: list/tuple of str

    @ param bts_genes: genes that are used, one at a time, as the blind test set
    @ type: list/tuple of str

    @ param cols_to_drop: columns removed from the feature data
    @ type: list/tuple of str

    @ param target_var: name of the target column
    @ type: str

    @ param gene_group: name of the column holding the gene of each row
    @ type: str

    @ param std_gene_omit: genes that are always left out of the training data
    @ type: list/tuple of str

    @ param out_dir: directory the result files are written to
    @ type: str

    returns
        None. One file "<n>genes_logoBT_<gene>.csv" is written per blind test
        gene, where n = len(bts_genes) - len(std_gene_omit) + 1.
        Calls sys.exit() if the training features still contain any of
        `cols_to_drop`.
    """
    # Work on private lists so tuples/lists are both accepted and the caller's
    # objects (and the defaults) are never shared or modified.
    all_genes = list(all_genes)
    bts_genes = list(bts_genes)
    cols_to_drop = list(cols_to_drop)
    std_gene_omit = list(std_gene_omit)

    n_total_genes = len(all_genes)
    # Formula kept unchanged: it feeds the printed summary and the file name.
    n_tr_genes = len(bts_genes) - len(std_gene_omit)

    for bts_gene in bts_genes:
        print('\n BTS gene:', bts_gene)
        tr_gene_omit = std_gene_omit + [bts_gene]

        # Genes that really end up in the training data: everything known
        # that is neither the blind test gene nor explicitly omitted.
        training_genesL = [gene for gene in all_genes if gene not in tr_gene_omit]

        print('\nTotal genes: ', n_total_genes
              ,'\nTraining on:', n_tr_genes
              ,'\nTraining on genes:', training_genesL
              , '\nOmitted genes:', tr_gene_omit
              , '\nBlind test gene:', bts_gene)

        tts_split_type = "logoBT_" + bts_gene
        outFile = os.path.join(out_dir
                               , str(n_tr_genes+1) + "genes_" + tts_split_type + ".csv")
        print(outFile)

        #-------
        # training
        #------
        cm_training_df = combined_df[~combined_df[gene_group].isin(tr_gene_omit)]
        cm_X = cm_training_df.drop(cols_to_drop, axis=1, inplace=False)
        cm_y = cm_training_df.loc[:, target_var]

        print('\nTraining data dim:', cm_X.shape
              , '\nTraining Target dim:', cm_y.shape)

        # Sanity check: none of the dropped columns may remain as a feature
        if not cm_X.columns.isin(cols_to_drop).any():
            print('\nChecked training df does NOT have Target var')
        else:
            sys.exit('\nFAIL: training data contains Target var')

        #---------------
        # BTS: genes
        #---------------
        cm_test_df = combined_df[combined_df[gene_group].isin([bts_gene])]
        cm_bts_X = cm_test_df.drop(cols_to_drop, axis = 1, inplace = False)
        cm_bts_y = cm_test_df.loc[:, target_var]

        print('\nBlind test data dim:', cm_bts_X.shape
              , '\nBlind test Target dim:', cm_bts_y.shape)

        #%%:Running Multiple models on LOGO with SKF
        cD3_v2 = MultModelsCl_logo_skf(input_df = cm_X
                                       , target = cm_y
                                       , group = 'none'
                                       , sel_cv = skf_cv
                                       , blind_test_df = cm_bts_X
                                       , blind_test_target = cm_bts_y
                                       , tts_split_type = tts_split_type
                                       , resampling_type = 'none' # default
                                       , add_cm = True
                                       , add_yn = True
                                       , var_type = 'mixed'
                                       , run_blind_test = True
                                       , return_formatted_output = True
                                       , random_state = 42
                                       , n_jobs = 10
                                       )
        cD3_v2.to_csv(outFile)
#%%
# Two runs over the combined data: first with no extra gene left out of
# training, then with 'alr' always left out.
for omit_genes in ([], ['alr']):
    CMLogoSkf(combined_df, std_gene_omit=omit_genes)

View file

@ -0,0 +1,107 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 29 20:29:36 2022
@author: tanu
"""
import sys, os
import pandas as pd
import numpy as np
import re
###############################################################################
# Make the project's ML helper modules importable: they live in a fixed
# location under the user's home directory.
homedir = os.path.expanduser("~")
sys.path += [homedir + '/git/LSHTM_analysis/scripts/ml/ml_functions']
# Bare expression kept from the original; it has no effect when run as a script.
sys.path
###############################################################################
#====================
# Import ML functions
#====================
from MultClfs import *
from GetMLData import *
from SplitTTS import *
# Options for getmldata(): every switch is turned off.
combined_model_paramD = dict.fromkeys(
    ['data_combined_model'
     , 'use_or'
     , 'omit_all_genomic_features'
     , 'write_maskfile'
     , 'write_outfile']
    , False)
###############################################################################
#ml_genes = ["pncA", "embB", "katG", "rpoB", "gid"]
# Gene name -> drug name, in the order the genes are processed.
ml_gene_drugD = dict(pncA = 'pyrazinamide'
                     , embB = 'ethambutol'
                     , katG = 'isoniazid'
                     , rpoB = 'rifampicin'
                     , gid = 'streptomycin')

# Per-gene data, keyed by the lower-cased gene name (filled by the loop below).
gene_dataD = dict()

# Every split type is combined with every data type.
split_types = ['70_30'
               , '80_20'
               , 'sl']
split_data_types = ['actual'
                    , 'complete']
# For every gene/drug pair: fetch the ML data once; then, for each combination
# of split type and data type, build the splits with split_tts(), run
# MultModelsCl() on the baseline and on each resampled training set, and write
# all scores for that combination to a single csv file.
# NOTE(review): `skf_cv` is not defined in this file -- presumably it is
# provided by one of the star imports above; confirm.
for gene, drug in ml_gene_drugD.items():
    print ('\nGene:', gene
           , '\nDrug:', drug)
    gene_low = gene.lower()

    # combined_model_paramD has data_combined_model = False: this means it
    # doesn't include 'gene_name' as a feature, as a single gene-target
    # shouldn't have it.
    gene_dataD[gene_low] = getmldata(gene, drug, **combined_model_paramD)

    for split_type in split_types:
        for data_type in split_data_types:
            out_filename = (gene_low+'_'+split_type+'_'+data_type+'.csv')

            tempD=split_tts(gene_dataD[gene_low]
                            , data_type = data_type
                            , split_type = split_type
                            , oversampling = True
                            , dst_colname = 'dst'
                            , target_colname = 'dst_mode'
                            , include_gene_name = True
                            )

            # One parameter set per training-data variant (no resampling plus
            # the four resampled versions returned by split_tts()).
            paramD = {
                'baseline_paramD': { 'input_df' : tempD['X']
                                    , 'target' : tempD['y']
                                    , 'var_type' : 'mixed'
                                    , 'resampling_type': 'none'}
                , 'smnc_paramD': { 'input_df' : tempD['X_smnc']
                                  , 'target' : tempD['y_smnc']
                                  , 'var_type' : 'mixed'
                                  , 'resampling_type' : 'smnc'}
                , 'ros_paramD': { 'input_df' : tempD['X_ros']
                                 , 'target' : tempD['y_ros']
                                 , 'var_type' : 'mixed'
                                 , 'resampling_type' : 'ros'}
                , 'rus_paramD' : { 'input_df' : tempD['X_rus']
                                  , 'target' : tempD['y_rus']
                                  , 'var_type' : 'mixed'
                                  , 'resampling_type' : 'rus'}
                , 'rouC_paramD' : { 'input_df' : tempD['X_rouC']
                                   , 'target' : tempD['y_rouC']
                                   , 'var_type' : 'mixed'
                                   , 'resampling_type': 'rouC'}
                }

            # Every variant is evaluated against the same blind test set.
            mmDD = {}
            for k, v in paramD.items():
                mmDD[k] = MultModelsCl(**v
                                       , tts_split_type = split_type
                                       , skf_cv = skf_cv
                                       , blind_test_df = tempD['X_bts']
                                       , blind_test_target = tempD['y_bts']
                                       , add_cm = True
                                       , add_yn = True
                                       , return_formatted_output = True)

            # Extracting the dfs from within the dict and concatenating to
            # output as one df (a single concat covers all entries).
            out_wf = pd.concat(mmDD, ignore_index = True)
            out_wf_f = out_wf.sort_values(by = ['resampling', 'source_data', 'MCC']
                                          , ascending = [True, True, False]
                                          , inplace = False)
            out_wf_f.to_csv(('/home/tanu/git/Data/ml_combined/genes/'+out_filename), index = False)

View file

@ -89,14 +89,7 @@ scoring_fn = ({ 'mcc' : make_scorer(matthews_corrcoef)
, 'jcc' : make_scorer(jaccard_score) , 'jcc' : make_scorer(jaccard_score)
}) })
skf_cv = StratifiedKFold(n_splits = 10
#, shuffle = False, random_state= None)
, shuffle = True,**rs)
rskf_cv = RepeatedStratifiedKFold(n_splits = 10
, n_repeats = 3
, **rs)
logo = LeaveOneGroupOut()
mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)} mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
jacc_score_fn = {'jcc': make_scorer(jaccard_score)} jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
@ -160,7 +153,10 @@ def MultModelsCl_logo_skf(input_df
, add_yn = True # adds target var class numbers , add_yn = True # adds target var class numbers
, var_type = ['numerical', 'categorical','mixed'] , var_type = ['numerical', 'categorical','mixed']
, run_blind_test = True , run_blind_test = True
, return_formatted_output = True): , return_formatted_output = True
, random_state = 42
, n_jobs = 10
, ):
''' '''
@ param input_df: input features @ param input_df: input features
@ -179,10 +175,24 @@ def MultModelsCl_logo_skf(input_df
Dict containing multiple classification scores for each model and mean of each Stratified Kfold including training Dict containing multiple classification scores for each model and mean of each Stratified Kfold including training
''' '''
# if group == 'none': #%% Func globals
# sel_cv = skf_cv rs = {'random_state': random_state}
# else: njobs = {'n_jobs': n_jobs}
# group = 'none'
skf_cv = StratifiedKFold(n_splits = 10
#, shuffle = False, random_state= None)
, shuffle = True,**rs)
rskf_cv = RepeatedStratifiedKFold(n_splits = 10
, n_repeats = 3
, **rs)
logo = LeaveOneGroupOut()
# select CV type:
if group == 'none':
sel_cv = skf_cv
else:
sel_cv = logo
#====================================================== #======================================================
# Determine categorical and numerical features # Determine categorical and numerical features
#====================================================== #======================================================

View file

@ -63,40 +63,8 @@ else:
, '\nGot:', len(common_cols)) , '\nGot:', len(common_cols))
colnames_combined_df = combined_df.columns colnames_combined_df = combined_df.columns
if 'gene_name' in colnames_combined_df:
print("\nGene name included")
else:
('\nGene name NOT included')
############################################################################## ##############################################################################
#%% split_tts(): func params
# (ml_input_data
# , data_type = ['actual', 'complete']
# , split_type = ['70_30', '80_20', 'sl']
# , oversampling = True
# , dst_colname = 'dst'# determine how to subset the actual vs reverse data
# , target_colname = 'dst_mode'
# , include_gene_name = True
# , k_smote = 5)
#%% split data into different data types
# #===================
# # 70/30
# #===================
# # actual
# tts_7030_paramD = {'data_type' : 'actual'
# , 'split_type' : '70_30'}
# # complete
# tts_cd_7030_paramD = {'data_type' : 'complete'
# , 'split_type' : '70_30'}
# # call split_tts()
# data_CM_7030D = split_tts(ml_input_data = combined_df
# , **tts_7030_paramD
# , oversampling = True
# , dst_colname = 'dst'
# , target_colname = 'dst_mode'
# , include_gene_name = False) # when not doing leave one group out
# data_cd_CM_7030D = split_tts(ml_input_data = combined_df
# , **tts_cd_7030_paramD
# , oversampling = True
# , dst_colname = 'dst'
# , target_colname = 'dst_mode'
# , include_gene_name = False)