From b2d0b827ad1cf4515dcb70985059533a59a86eb2 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Sat, 2 Jul 2022 16:57:11 +0100 Subject: [PATCH] added cm run for logo_skf for actual data --- scripts/ml/combined_model/cm_logo_skf.py | 77 +++++++++++------- scripts/ml/combined_model/cm_ml_iterator.py | 89 --------------------- scripts/ml/ml_functions/ml_data_combined.py | 8 ++ scripts/ml/ml_iterator.py | 6 +- 4 files changed, 56 insertions(+), 124 deletions(-) delete mode 100755 scripts/ml/combined_model/cm_ml_iterator.py diff --git a/scripts/ml/combined_model/cm_logo_skf.py b/scripts/ml/combined_model/cm_logo_skf.py index a54b8f2..0748466 100755 --- a/scripts/ml/combined_model/cm_logo_skf.py +++ b/scripts/ml/combined_model/cm_logo_skf.py @@ -98,13 +98,14 @@ skf_cv = StratifiedKFold(n_splits = 10 , shuffle = True, random_state = 42) # COMPLETE data: No tts_split ######################################################################## #%% -def CMLogoSkf(combined_df +def CMLogoSkf(cm_input_df , all_genes = ["embb", "katg", "rpob", "pnca", "gid", "alr"] , bts_genes = ["embb", "katg", "rpob", "pnca", "gid"] , cols_to_drop = ['dst', 'dst_mode', 'gene_name'] , target_var = 'dst_mode' , gene_group = 'gene_name' , std_gene_omit = [] + , file_suffix = "" ): for bts_gene in bts_genes: @@ -127,17 +128,24 @@ def CMLogoSkf(combined_df ,'\nTraining on genes:', training_genesL , '\nOmitted genes:', tr_gene_omit , '\nBlind test gene:', bts_gene) + + print('\nDim of data:', cm_input_df.shape) tts_split_type = "logo_skf_BT_" + bts_gene - outFile = outdir + str(n_tr_genes+1) + "genes_" + tts_split_type + ".csv" - + # if len(file_suffix) > 0: + # file_suffix = '_' + file_suffix + # else: + # file_suffix = file_suffix + + outFile = outdir + str(n_tr_genes+1) + "genes_" + tts_split_type + '_' + file_suffix + ".csv" + print(outFile) #------- # training #------ - cm_training_df = combined_df[~combined_df['gene_name'].isin(tr_gene_omit)] + cm_training_df = cm_input_df[~cm_input_df['gene_name'].isin(tr_gene_omit)] cm_X = cm_training_df.drop(cols_to_drop, axis=1, inplace=False) #cm_y = cm_training_df.loc[:,'dst_mode'] @@ -156,7 +164,7 @@ def CMLogoSkf(combined_df #--------------- # BTS: genes #--------------- - cm_test_df = combined_df[combined_df['gene_name'].isin([bts_gene])] + cm_test_df = cm_input_df[cm_input_df['gene_name'].isin([bts_gene])] cm_bts_X = cm_test_df.drop(cols_to_drop, axis = 1, inplace = False) #cm_bts_y = cm_test_df.loc[:, 'dst_mode'] @@ -165,31 +173,40 @@ def CMLogoSkf(combined_df print('\nTEST data dim:', cm_bts_X.shape , '\nTEST Target dim:', cm_bts_y.shape) + print("Running Multiple models on LOGO with SKF") + # #%%:Running Multiple models on LOGO with SKF + # cD3_v2 = MultModelsCl_logo_skf(input_df = cm_X + # , target = cm_y + # #, group = 'none' + # , sel_cv = skf_cv - #%%:Running Multiple models on LOGO with SKF - cD3_v2 = MultModelsCl_logo_skf(input_df = cm_X - , target = cm_y - #, group = 'none' - , sel_cv = skf_cv + # , blind_test_df = cm_bts_X + # , blind_test_target = cm_bts_y + + # , tts_split_type = tts_split_type + + # , resampling_type = 'none' # default + # , add_cm = True + # , add_yn = True + # , var_type = 'mixed' + + # , run_blind_test = True + # , return_formatted_output = True + # , random_state = 42 + # , n_jobs = os.cpu_count() # the number of jobs should equal the number of CPU cores + # ) - , blind_test_df = cm_bts_X - , blind_test_target = cm_bts_y - - , tts_split_type = tts_split_type - - , resampling_type = 'none' # default - , add_cm = True - , add_yn = True - , var_type = 'mixed' - - , run_blind_test = True - , return_formatted_output = True - , random_state = 42 - , n_jobs = os.cpu_count() # the number of jobs should equal the number of CPU cores - ) - - cD3_v2.to_csv(outFile) + # cD3_v2.to_csv(outFile) -#%% -#CMLogoSkf(combined_df) -CMLogoSkf(combined_df, std_gene_omit=['alr']) +#%% RUN +#=============== +# Complete Data +#=============== +#CMLogoSkf(cm_input_df = combined_df,file_suffix = "complete") +#CMLogoSkf(cm_input_df = combined_df, std_gene_omit=['alr'], file_suffix = "complete") + +#=============== +# Actual Data +#=============== +CMLogoSkf(cm_input_df = combined_df_actual, file_suffix = "actual") +CMLogoSkf(cm_input_df = combined_df_actual, std_gene_omit=['alr'], file_suffix = "actual") diff --git a/scripts/ml/combined_model/cm_ml_iterator.py b/scripts/ml/combined_model/cm_ml_iterator.py deleted file mode 100755 index f899b68..0000000 --- a/scripts/ml/combined_model/cm_ml_iterator.py +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Wed Jun 29 20:29:36 2022 - -@author: tanu -""" -import sys, os -import pandas as pd -import numpy as np -import re - -############################################################################### -homedir = os.path.expanduser("~") -sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml/ml_functions') -sys.path -############################################################################### -outdir = homedir + '/git/LSHTM_ML/output/combined/' - -#==================== -# Import ML functions -#==================== -#from MultClfs import * -#from MultClfs_logo_skf import * -from MultClfs_logo_skf_split import * - -from GetMLData import * -from SplitTTS import * - -# Input data -from ml_data_combined import * - -############################################################################### -print('\nUsing data with 5 genes:', len(cm_input_df5)) - -############################################################################### - -split_types = ['70_30', '80_20', 'sl'] -split_data_types = ['actual', 'complete'] - -for split_type in split_types: - for data_type in split_data_types: - - out_filename = outdir + 'cm_' + split_type + '_' + data_type + '.csv' - print(out_filename) - tempD = split_tts(cm_input_df5 - , data_type = data_type - , split_type = split_type - , oversampling = True - , dst_colname = 'dst' - , target_colname = 'dst_mode' - , include_gene_name = True - ) - paramD = { - 'baseline_paramD': { 'input_df' : tempD['X'] - , 'target' : tempD['y'] - , 'var_type' : 'mixed' - , 'resampling_type' : 'none'} - , 'smnc_paramD' : { 'input_df' : tempD['X_smnc'] - , 'target' : tempD['y_smnc'] - , 'var_type' : 'mixed' - , 'resampling_type' : 'smnc'} - , 'ros_paramD' : { 'input_df' : tempD['X_ros'] - , 'target' : tempD['y_ros'] - , 'var_type' : 'mixed' - , 'resampling_type' : 'ros'} - , 'rus_paramD' : { 'input_df' : tempD['X_rus'] - , 'target' : tempD['y_rus'] - , 'var_type' : 'mixed' - , 'resampling_type' : 'rus'} - , 'rouC_paramD' : { 'input_df' : tempD['X_rouC'] - , 'target' : tempD['y_rouC'] - , 'var_type' : 'mixed' - , 'resampling_type' : 'rouC'} - } - - mmDD = {} - for k, v in paramD.items(): - scoresD = MultModelsCl_logo_skf(**paramD[k] - XXXXXXXXXXXXXXXXXXXXXXX - mmDD[k] = scoresD - - # Extracting the dfs from within the dict and concatenating to output as one df - for k, v in mmDD.items(): - out_wf= pd.concat(mmDD, ignore_index = True) - - out_wf_f = out_wf.sort_values(by = ['resampling', 'source_data', 'MCC'], ascending = [True, True, False], inplace = False) - out_wf_f.to_csv(('/home/tanu/git/Data/ml_combined/genes/'+out_filename), index = False) - diff --git a/scripts/ml/ml_functions/ml_data_combined.py b/scripts/ml/ml_functions/ml_data_combined.py index c4fc494..57e2295 100644 --- a/scripts/ml/ml_functions/ml_data_combined.py +++ b/scripts/ml/ml_functions/ml_data_combined.py @@ -71,4 +71,12 @@ else: omit_gene_alr = ['alr'] cm_input_df5 = combined_df[~combined_df['gene_name'].isin(omit_gene_alr)] + +#%% COMPLETE data +combined_df['dst'].isna().sum() +combined_df['dst'].value_counts().sum() + + +combined_df_actual = combined_df[~combined_df['dst'].isna()] + ############################################################################## diff --git a/scripts/ml/ml_iterator.py b/scripts/ml/ml_iterator.py index a7dc7c6..de8e62a 100755 --- a/scripts/ml/ml_iterator.py +++ b/scripts/ml/ml_iterator.py @@ -48,11 +48,7 @@ for gene, drug in ml_gene_drugD.items(): , '\nDrug:', drug) gene_low = gene.lower() gene_dataD[gene_low] = getmldata(gene, drug - , data_combined_model = False # this means it doesn't include 'gene_name' as a feauture as a single gene-target shouldn't have it. - , use_or = False - , omit_all_genomic_features = False - , write_maskfile = False - , write_outfile = False) + , **combined_model_paramD) for split_type in split_types: for data_type in split_data_types: