diff --git a/scripts/ml/Mult_dissected_CALL.py b/scripts/ml/Mult_dissected_CALL.py index bc4bb07..2fa44fe 100644 --- a/scripts/ml/Mult_dissected_CALL.py +++ b/scripts/ml/Mult_dissected_CALL.py @@ -52,7 +52,7 @@ sampling_type_name = 'none' feature_gp_nameEV = 'evolutionary' n_featuresEV = len(X_evolFN) -scores_mmEV = MultModelsCl_dissected(input_df = X[X_evolFN] +scores_mmEV = MultModelsCl(input_df = X[X_evolFN] , target = y , var_type = 'mixed' , skf_cv = skf_cv @@ -96,7 +96,7 @@ baseline_EV['n_features'] = n_featuresEV feature_gp_nameGN = 'genomics' n_featuresGN = len(X_genomicFN) -scores_mmGN = MultModelsCl_dissected(input_df = X[X_genomicFN] +scores_mmGN = MultModelsCl(input_df = X[X_genomicFN] , target = y , var_type = 'mixed' , skf_cv = skf_cv @@ -143,7 +143,7 @@ baseline_GN['n_features'] = n_featuresGN feature_gp_nameSTR = 'structural' n_featuresSTR = len(X_structural_FN) -scores_mmSTR = MultModelsCl_dissected(input_df = X[X_structural_FN] +scores_mmSTR = MultModelsCl(input_df = X[X_structural_FN] , target = y , var_type = 'mixed' , skf_cv = skf_cv @@ -187,7 +187,7 @@ baseline_STR['n_features'] = n_featuresSTR feature_gp_nameSTB = 'stability' n_featuresSTB = len(X_stability_FN) -scores_mmSTB = MultModelsCl_dissected(input_df = X[X_stability_FN] +scores_mmSTB = MultModelsCl(input_df = X[X_stability_FN] , target = y , var_type = 'mixed' , skf_cv = skf_cv @@ -231,7 +231,7 @@ baseline_STB['n_features'] = n_featuresSTB feature_gp_nameAFF = 'affinity' n_featuresAFF = len(X_affinityFN) -scores_mmAFF = MultModelsCl_dissected(input_df = X[X_affinityFN] +scores_mmAFF = MultModelsCl(input_df = X[X_affinityFN] , target = y , var_type = 'mixed' , skf_cv = skf_cv @@ -275,7 +275,7 @@ baseline_AFF['n_features'] = n_featuresAFF feature_gp_nameRES = 'residue_prop' n_featuresRES = len(X_resprop_FN) -scores_mmRES = MultModelsCl_dissected(input_df = X[X_resprop_FN] +scores_mmRES = MultModelsCl(input_df = X[X_resprop_FN] , target = y , var_type = 'mixed' , skf_cv = skf_cv @@ -321,7 +321,7 @@ X_respropNOaaFN = list(set(X_resprop_FN) - set(X_aaindex_Fnum)) feature_gp_nameRNAA = 'ResPropNoAA' n_featuresRNAA = len(X_respropNOaaFN) -scores_mmRNAA = MultModelsCl_dissected(input_df = X[X_respropNOaaFN] +scores_mmRNAA = MultModelsCl(input_df = X[X_respropNOaaFN] , target = y , var_type = 'mixed' , skf_cv = skf_cv @@ -367,7 +367,7 @@ X_strNOaaFN = list(set(X_structural_FN) - set(X_aaindex_Fnum)) feature_gp_nameSNAA = 'StrNoAA' n_featuresSNAA = len(X_strNOaaFN) -scores_mmSNAA = MultModelsCl_dissected(input_df = X[X_strNOaaFN] +scores_mmSNAA = MultModelsCl(input_df = X[X_strNOaaFN] , target = y , var_type = 'mixed' , skf_cv = skf_cv diff --git a/scripts/ml/ml_data_7030.py b/scripts/ml/ml_data_7030.py index 83a792a..5e0bb62 100644 --- a/scripts/ml/ml_data_7030.py +++ b/scripts/ml/ml_data_7030.py @@ -34,6 +34,8 @@ def setvars(gene,drug): from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold from sklearn.pipeline import Pipeline, make_pipeline + import argparse + import re #%% GLOBALS rs = {'random_state': 42} njobs = {'n_jobs': 10} @@ -422,118 +424,31 @@ def setvars(gene,drug): #========================== my_df_ml = my_df.copy() - #%% Build X: input for ML - common_cols_stabiltyN = ['ligand_distance' - , 'ligand_affinity_change' - , 'duet_stability_change' - , 'ddg_foldx' - , 'deepddg' - , 'ddg_dynamut2' - , 'mmcsm_lig' - , 'contacts'] - - # Build stability columns ~ gene + # Build column names to mask for affinity chanhes if gene.lower() in geneL_basic: - X_stabilityN = common_cols_stabiltyN + #X_stabilityN = common_cols_stabiltyN + gene_affinity_colnames = []# not needed as its the common ones cols_to_mask = ['ligand_affinity_change'] if gene.lower() in geneL_ppi2: - # X_stabilityN = common_cols_stabiltyN + ['mcsm_ppi2_affinity' , 'interface_dist'] - geneL_ppi2_st_cols = ['mcsm_ppi2_affinity', 'interface_dist'] - X_stabilityN = common_cols_stabiltyN + geneL_ppi2_st_cols + gene_affinity_colnames = ['mcsm_ppi2_affinity', 'interface_dist'] + #X_stabilityN = common_cols_stabiltyN + geneL_ppi2_st_cols cols_to_mask = ['ligand_affinity_change', 'mcsm_ppi2_affinity'] if gene.lower() in geneL_na: - # X_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] - geneL_na_st_cols = ['mcsm_na_affinity'] - X_stabilityN = common_cols_stabiltyN + geneL_na_st_cols + gene_affinity_colnames = ['mcsm_na_affinity'] + #X_stabilityN = common_cols_stabiltyN + geneL_na_st_cols cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity'] if gene.lower() in geneL_na_ppi2: - # X_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] - geneL_na_ppi2_st_cols = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] - X_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols + gene_affinity_colnames = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] + #X_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity', 'mcsm_ppi2_affinity'] - - X_foldX_cols = [ 'electro_rr', 'electro_mm', 'electro_sm', 'electro_ss' - , 'disulfide_rr', 'disulfide_mm', 'disulfide_sm', 'disulfide_ss' - , 'hbonds_rr', 'hbonds_mm', 'hbonds_sm', 'hbonds_ss' - , 'partcov_rr', 'partcov_mm', 'partcov_sm', 'partcov_ss' - , 'vdwclashes_rr', 'vdwclashes_mm', 'vdwclashes_sm', 'vdwclashes_ss' - , 'volumetric_rr', 'volumetric_mm', 'volumetric_ss' - ] - - X_str = ['rsa' - #, 'asa' - , 'kd_values' - , 'rd_values'] - - X_ssFN = X_stabilityN + X_str + X_foldX_cols - - X_evolFN = ['consurf_score' - , 'snap2_score' - , 'provean_score'] - - X_genomic_mafor = ['maf' - , 'logorI' - # , 'or_rawI' - # , 'or_mychisq' - # , 'or_logistic' - # , 'or_fisher' - # , 'pval_fisher' - ] - - X_genomic_linegae = ['lineage_proportion' - , 'dist_lineage_proportion' - #, 'lineage' # could be included as a category but it has L2;L4 formatting - , 'lineage_count_all' - , 'lineage_count_unique' - ] - - X_genomicFN = X_genomic_mafor + X_genomic_linegae - - X_aaindexFN = list(aa_df_cols) - - print('\nTotal no. of features for aaindex:', len(X_aaindexFN)) - - # numerical feature names - numerical_FN = X_ssFN + X_evolFN + X_genomicFN + X_aaindexFN - - # categorical feature names - categorical_FN = ['ss_class' - # , 'wt_prop_water' - # , 'mut_prop_water' - # , 'wt_prop_polarity' - # , 'mut_prop_polarity' - # , 'wt_calcprop' - # , 'mut_calcprop' - , 'aa_prop_change' - , 'electrostatics_change' - , 'polarity_change' - , 'water_change' - , 'drtype_mode_labels' # beware then you can't use it to predict [USED it for uq_v1, not v2] - , 'active_site' #[didn't use it for uq_v1] - #, 'gene_name' # will be required for the combined stuff - ] - - #---------------------------------------------- - # count numerical and categorical features - #---------------------------------------------- - - print('\nNo. of numerical features:', len(numerical_FN) - , '\nNo. of categorical features:', len(categorical_FN)) - #======================= # Masking columns: # (mCSM-lig, mCSM-NA, mCSM-ppi2) values for lig_dist >10 #======================= - # my_df_ml['mutationinformation'][my_df['ligand_distance']>10].value_counts() - # my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts() - - # my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), 'ligand_affinity_change'] = 0 - # (my_df_ml['ligand_affinity_change'] == 0).sum() - my_df_ml['mutationinformation'][my_df_ml['ligand_distance']>10].value_counts() my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts() my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask].value_counts() @@ -544,23 +459,149 @@ def setvars(gene,drug): mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask] + #=================================================== # write file for check mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True) - mask_check.to_csv(outdir + 'ml/' + gene.lower() + '_mask_check.csv') + mask_check.to_csv(outdir + 'ml/' + gene.lower() + '_mask_check.csv') + #=================================================== + ############################################################################### + #%% Feature groups (FG): Build X for Input ML + ############################################################################ + #=========================== + # FG1: Evolutionary features + #=========================== + X_evolFN = ['consurf_score' + , 'snap2_score' + , 'provean_score'] + + ############################################################################### + #======================== + # FG2: Stability features + #======================== + #-------- + # common + #-------- + X_common_stability_Fnum = [ + 'duet_stability_change' + , 'ddg_foldx' + , 'deepddg' + , 'ddg_dynamut2' + , 'contacts'] + #-------- + # FoldX + #-------- + X_foldX_Fnum = [ 'electro_rr', 'electro_mm', 'electro_sm', 'electro_ss' + , 'disulfide_rr', 'disulfide_mm', 'disulfide_sm', 'disulfide_ss' + , 'hbonds_rr', 'hbonds_mm', 'hbonds_sm', 'hbonds_ss' + , 'partcov_rr', 'partcov_mm', 'partcov_sm', 'partcov_ss' + , 'vdwclashes_rr', 'vdwclashes_mm', 'vdwclashes_sm', 'vdwclashes_ss' + , 'volumetric_rr', 'volumetric_mm', 'volumetric_ss'] + + X_stability_FN = X_common_stability_Fnum + X_foldX_Fnum + + ############################################################################### + #=================== + # FG3: Affinity features + #=================== + common_affinity_Fnum = ['ligand_distance' + , 'ligand_affinity_change' + , 'mmcsm_lig'] + + # if gene.lower() in geneL_basic: + # X_affinityFN = common_affinity_Fnum + # else: + # X_affinityFN = common_affinity_Fnum + gene_affinity_colnames + + X_affinityFN = common_affinity_Fnum + gene_affinity_colnames + + ############################################################################### + #============================ + # FG4: Residue level features + #============================ + #----------- + # AA index + #----------- + X_aaindex_Fnum = list(aa_df_cols) + print('\nTotal no. of features for aaindex:', len(X_aaindex_Fnum)) + + #----------------- + # surface area + # depth + # hydrophobicity + #----------------- + X_str_Fnum = ['rsa' + #, 'asa' + , 'kd_values' + , 'rd_values'] + + #--------------------------- + # Other aa properties + # active site indication + #--------------------------- + X_aap_Fcat = ['ss_class' + # , 'wt_prop_water' + # , 'mut_prop_water' + # , 'wt_prop_polarity' + # , 'mut_prop_polarity' + # , 'wt_calcprop' + # , 'mut_calcprop' + , 'aa_prop_change' + , 'electrostatics_change' + , 'polarity_change' + , 'water_change' + , 'active_site'] - ##################################################################### + X_resprop_FN = X_aaindex_Fnum + X_str_Fnum + X_aap_Fcat + ############################################################################### + #======================== + # FG5: Genomic features + #======================== + X_gn_mafor_Fnum = ['maf' + , 'logorI' + # , 'or_rawI' + # , 'or_mychisq' + # , 'or_logistic' + # , 'or_fisher' + # , 'pval_fisher' + ] + + X_gn_linegae_Fnum = ['lineage_proportion' + , 'dist_lineage_proportion' + #, 'lineage' # could be included as a category but it has L2;L4 formatting + , 'lineage_count_all' + , 'lineage_count_unique' + ] + + X_gn_Fcat = ['drtype_mode_labels' # beware then you can't use it to predict [USED it for uq_v1, not v2] + #, 'gene_name' # will be required for the combined stuff + ] + + X_genomicFN = X_gn_mafor_Fnum + X_gn_linegae_Fnum + X_gn_Fcat + ############################################################################### + #======================== + # FG6 collapsed: Structural : Atability + Affinity + ResidueProp + #======================== + X_structural_FN = X_stability_FN + X_affinityFN + X_resprop_FN + + ############################################################################### + #======================== + # BUILDING all features + #======================== + all_featuresN = X_evolFN + X_structural_FN + X_genomicFN + + ############################################################################### + #%% Define training and test data #================================================================ # Training and BLIND test set: 70/30 - - # Throw away previous blind_test_df, and call the 30% data as blind_test - # as these were imputed values and initial analysis shows that this - # is not very representative + # dst with actual values : training set + # dst with imputed values : THROW AWAY [unrepresentative] #================================================================ my_df_ml[drug].isna().sum() + # blind_test_df = my_df_ml[my_df_ml[drug].isna()] # blind_test_df.shape - training_df = my_df_ml[my_df_ml[drug].notna()] + training_df = my_df_ml[my_df_ml[drug].notna()] training_df.shape # Target 1: dst_mode @@ -568,80 +609,14 @@ def setvars(gene,drug): training_df['dst_mode'].value_counts() #################################################################### - -############################################################################### -############################################################################### - # #%% extracting dfs based on numerical, categorical column names - # #---------------------------------- - # # WITHOUT the target var included - # #---------------------------------- - # num_df = training_df[numerical_FN] - # num_df.shape - - # cat_df = training_df[categorical_FN] - # cat_df.shape - - # all_df = training_df[numerical_FN + categorical_FN] - # all_df.shape - - # #------------------------------ - # # WITH the target var included: - # #'wtgt': with target - # #------------------------------ - # # drug and dst_mode should be the same thing - # num_df_wtgt = training_df[numerical_FN + ['dst_mode']] - # num_df_wtgt.shape - - # cat_df_wtgt = training_df[categorical_FN + ['dst_mode']] - # cat_df_wtgt.shape - - # all_df_wtgt = training_df[numerical_FN + categorical_FN + ['dst_mode']] - # all_df_wtgt.shape - - #%%######################################################################## - # #============ - # # ML data: OLD - # #============ - # #------ - # # X: Training and Blind test (BTS) - # #------ - # X = all_df_wtgt[numerical_FN + categorical_FN] # training data ALL - # X_bts = blind_test_df[numerical_FN + categorical_FN] # blind test data ALL - # #X = all_df_wtgt[numerical_FN] # training numerical only - # #X_bts = blind_test_df[numerical_FN] # blind test data numerical - - # #------ - # # y - # #------ - # y = all_df_wtgt['dst_mode'] # training data y - # y_bts = blind_test_df['dst_mode'] # blind data test y - - # #X_bts_wt = blind_test_df[numerical_FN + ['dst_mode']] - - # # Quick check - # #(X['ligand_affinity_change']==0).sum() == (X['ligand_distance']>10).sum() - # for i in range(len(cols_to_mask)): - # ind = i+1 - # print('\nindex:', i, '\nind:', ind) - # print('\nMask count check:' - # , (my_df_ml[cols_to_mask[i]]==0).sum() == (my_df_ml['ligand_distance']>10).sum() - # ) - - # print('Original Data\n', Counter(y) - # , 'Data dim:', X.shape) - -############################################################################### -############################################################################### #==================================== # ML data: Train test split: 70/30 # with stratification # 70% : training_data for CV # 30% : blind test #===================================== - - # features: all_df or - x_features = training_df[numerical_FN + categorical_FN] - y_target = training_df['dst_mode'] + x_features = training_df[all_featuresN] + y_target = training_df['dst_mode'] # sanity check if not 'dst_mode' in x_features.columns: @@ -652,7 +627,9 @@ def setvars(gene,drug): #https://towardsdatascience.com/finally-why-we-use-an-80-20-split-for-training-and-test-data-plus-an-alternative-method-oh-yes-edc77e96295d else: sys.exit('\nFAIL: x_features has target variable included. FIX it and rerun!') - + #------------------- + # train-test split + #------------------- #x_train, x_test, y_train, y_test # traditional var_names # so my downstream code doesn't need to change X, X_bts, y, y_bts = train_test_split(x_features, y_target @@ -664,16 +641,64 @@ def setvars(gene,drug): yc2 = Counter(y_bts) yc2_ratio = yc2[0]/yc2[1] - + + ############################################################################### + #====================================================== + # Determine categorical and numerical features + #====================================================== + numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns + numerical_cols + categorical_cols = X.select_dtypes(include=['object', 'bool']).columns + categorical_cols + + ################################################################################ + # IMPORTANT sanity checks + if len(X.columns) == len(X_evolFN) + len(X_stability_FN) + len(X_affinityFN) + len(X_resprop_FN) + len(X_genomicFN): + print('\nPASS: ML data with input features, training and test generated...' + , '\n\nTotal no. of input features:' , len(X.columns) + , '\n--------No. of numerical features:' , len(numerical_cols) + , '\n--------No. of categorical features:' , len(categorical_cols) + + , '\n\nTotal no. of evolutionary features:' , len(X_evolFN) + + , '\n\nTotal no. of stability features:' , len(X_stability_FN) + , '\n--------Common stabilty cols:' , len(X_common_stability_Fnum) + , '\n--------Foldx cols:' , len(X_foldX_Fnum) + + , '\n\nTotal no. of affinity features:' , len(X_affinityFN) + , '\n--------Common affinity cols:' , len(common_affinity_Fnum) + , '\n--------Gene specific affinity cols:' , len(gene_affinity_colnames) + + , '\n\nTotal no. of residue level features:', len(X_resprop_FN) + , '\n--------AA index cols:' , len(X_aaindex_Fnum) + , '\n--------Residue Prop cols:' , len(X_str_Fnum) + , '\n--------AA change Prop cols:' , len(X_aap_Fcat) + + , '\n\nTotal no. of genomic features:' , len(X_genomicFN) + , '\n--------MAF+OR cols:' , len(X_gn_mafor_Fnum) + , '\n--------Lineage cols:' , len(X_gn_linegae_Fnum) + , '\n--------Other cols:' , len(X_gn_Fcat) + ) + else: + print('\nFAIL: numbers mismatch' + , '\nExpected:',len(X_evolFN) + len(X_stability_FN) + len(X_affinityFN) + len(X_resprop_FN) + len(X_genomicFN) + , '\nGot:', len(X.columns)) + sys.exit() + ############################################################################### print('\n-------------------------------------------------------------' - , '\nSuccessfully split data with stratification: 70/30' - , '\nInput features data size:', x_features.shape - , '\nTrain data size:', X.shape - , '\nTest data size:', X_bts.shape + , '\nSuccessfully split data: ALL features' + , '\nactual values: training set' + , '\nimputed values: blind test set' + + , '\n\nTotal data size:', len(X) + len(X_bts) + + , '\n\nTrain data size:', X.shape , '\ny_train numbers:', yc1 - , '\ny_train ratio:',yc1_ratio - , '\n' + + , '\n\nTest data size:', X_bts.shape , '\ny_test_numbers:', yc2 + + , '\n\ny_train ratio:',yc1_ratio , '\ny_test ratio:', yc2_ratio , '\n-------------------------------------------------------------' ) @@ -700,7 +725,7 @@ def setvars(gene,drug): #------------------------------ oversample = RandomOverSampler(sampling_strategy='minority') X_ros, y_ros = oversample.fit_resample(X, y) - print('\nSimple Random OverSampling\n', Counter(y_ros)) + print('Simple Random OverSampling\n', Counter(y_ros)) print(X_ros.shape) #------------------------------ @@ -709,7 +734,7 @@ def setvars(gene,drug): #------------------------------ undersample = RandomUnderSampler(sampling_strategy='majority') X_rus, y_rus = undersample.fit_resample(X, y) - print('\nSimple Random UnderSampling\n', Counter(y_rus)) + print('Simple Random UnderSampling\n', Counter(y_rus)) print(X_rus.shape) #------------------------------ @@ -720,7 +745,7 @@ def setvars(gene,drug): X_ros, y_ros = oversample.fit_resample(X, y) undersample = RandomUnderSampler(sampling_strategy='majority') X_rouC, y_rouC = undersample.fit_resample(X_ros, y_ros) - print('\nSimple Combined Over and UnderSampling\n', Counter(y_rouC)) + print('Simple Combined Over and UnderSampling\n', Counter(y_rouC)) print(X_rouC.shape) #------------------------------ @@ -740,7 +765,7 @@ def setvars(gene,drug): categorical_colind = X.columns.get_indexer(list(categorical_ix)) categorical_colind - k_sm = 5 # 5 is default + k_sm = 5 # 5 is deafult sm_nc = SMOTENC(categorical_features=categorical_colind, k_neighbors = k_sm, **rs, **njobs) X_smnc, y_smnc = sm_nc.fit_resample(X, y) print('\nSMOTE_NC OverSampling\n', Counter(y_smnc)) diff --git a/scripts/ml/ml_data_fg.py b/scripts/ml/ml_data_fg.py index d1daa2c..460d133 100644 --- a/scripts/ml/ml_data_fg.py +++ b/scripts/ml/ml_data_fg.py @@ -61,7 +61,6 @@ def setvars(gene,drug): jacc_score_fn = {'jcc': make_scorer(jaccard_score)} #%% FOR LATER: Combine ED logo data - #%% DONE: active aa site annotations **DONE on 15/05/2022 as part of generating merged_dfs ########################################################################### rs = {'random_state': 42} njobs = {'n_jobs': 10} @@ -419,7 +418,7 @@ def setvars(gene,drug): #--------------------------------------- #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - #%% Data for ML + #%%######################################################################## #========================== # Data for ML #========================== @@ -551,8 +550,7 @@ def setvars(gene,drug): , 'polarity_change' , 'water_change' , 'active_site'] - - + X_resprop_FN = X_aaindex_Fnum + X_str_Fnum + X_aap_Fcat ############################################################################### #======================== @@ -594,8 +592,7 @@ def setvars(gene,drug): ############################################################################### #%% Define training and test data #====================================================== - # Training and BLIND test set [UQ]: actual vs imputed - # No aa index but active_site included + # Training and BLIND test set: actual vs imputed # dst with actual values : training set # dst with imputed values : blind test #====================================================== @@ -612,9 +609,9 @@ def setvars(gene,drug): training_df['dst_mode'].value_counts() #################################################################### - #============ - # ML data - #============ + #===================================== + # ML data: actual vs imputed + #===================================== #------ # X: Training and Blind test (BTS) #------ @@ -625,20 +622,8 @@ def setvars(gene,drug): # y #------ y = training_df['dst_mode'] - y_bts = blind_test_df['dst_mode'] - - # Quick check - #(X['ligand_affinity_change']==0).sum() == (X['ligand_distance']>10).sum() - for i in range(len(cols_to_mask)): - ind = i+1 - print('\nindex:', i, '\nind:', ind) - print('\nMask count check:' - , (my_df_ml[cols_to_mask[i]]==0).sum() == (my_df_ml['ligand_distance']>10).sum() - ) - - print('Original Data\n', Counter(y) - , 'Data dim:', X.shape) - + y_bts = blind_test_df['dst_mode'] + yc1 = Counter(y) yc1_ratio = yc1[0]/yc1[1] @@ -705,7 +690,18 @@ def setvars(gene,drug): , '\ny_test ratio:', yc2_ratio , '\n-------------------------------------------------------------' ) + ########################################################################## + # Quick check + #(X['ligand_affinity_change']==0).sum() == (X['ligand_distance']>10).sum() + for i in range(len(cols_to_mask)): + ind = i+1 + print('\nindex:', i, '\nind:', ind) + print('\nMask count check:' + , (my_df_ml[cols_to_mask[i]]==0).sum() == (my_df_ml['ligand_distance']>10).sum() + ) + print('Original Data\n', Counter(y) + , 'Data dim:', X.shape) ########################################################################### #%% ########################################################################### @@ -760,7 +756,7 @@ def setvars(gene,drug): k_sm = 5 # 5 is deafult sm_nc = SMOTENC(categorical_features=categorical_colind, k_neighbors = k_sm, **rs, **njobs) X_smnc, y_smnc = sm_nc.fit_resample(X, y) - print('SMOTE_NC OverSampling\n', Counter(y_smnc)) + print('\nSMOTE_NC OverSampling\n', Counter(y_smnc)) print(X_smnc.shape) globals().update(locals()) # TROLOLOLOLOLOLS #print("i did a horrible hack :-)") @@ -774,7 +770,7 @@ def setvars(gene,drug): # sm = SMOTE(sampling_strategy = 'auto', k_neighbors = k_sm, **rs) # X_sm, y_sm = sm.fit_resample(X, y) # print(X_sm.shape) - # print('SMOTE OverSampling\n', Counter(y_sm)) + # print('\nSMOTE OverSampling\n', Counter(y_sm)) # y_sm_df = y_sm.to_frame() # y_sm_df.value_counts().plot(kind = 'bar') @@ -785,7 +781,7 @@ def setvars(gene,drug): # sm_enn = SMOTEENN(enn=EditedNearestNeighbours(sampling_strategy='all', **rs, **njobs )) # X_enn, y_enn = sm_enn.fit_resample(X, y) # print(X_enn.shape) - # print('SMOTE Over+Under Sampling combined\n', Counter(y_enn)) + # print('\nSMOTE Over+Under Sampling combined\n', Counter(y_enn)) ############################################################################### # TODO: Find over and undersampling JUST for categorical data diff --git a/scripts/ml/run_7030.py b/scripts/ml/run_7030.py new file mode 100644 index 0000000..6ee5521 --- /dev/null +++ b/scripts/ml/run_7030.py @@ -0,0 +1,499 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Mon Jun 20 13:05:23 2022 + +@author: tanu +""" +import re +import argparse +############################################################################### +# gene = 'pncA' +# drug = 'pyrazinamide' +#total_mtblineage_uc = 8 + +#%% command line args: case sensitive +arg_parser = argparse.ArgumentParser() +arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '') +arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '') +args = arg_parser.parse_args() + +drug = args.drug +gene = args.gene + +############################################################################### +#================== +# other vars +#================== +tts_split = '70/30' +OutFile_suffix = '7030' +############################################################################### +#================== +# Import data +#================== +from ml_data_7030 import * +setvars(gene,drug) +from ml_data_7030 import * + +# from YC run_all_ML: run locally +#from UQ_yc_RunAllClfs import run_all_ML + +#==================== +# Import ML function +#==================== +# TT run all ML clfs: baseline model +from MultModelsCl import MultModelsCl + +############################################################################ +print('\n#####################################################################\n' + , '\nRunning ML analysis: feature groups ' + , '\nGene name:', gene + , '\nDrug name:', drug) + +#================== +# Specify outdir +#================== +outdir_ml = outdir + 'ml/tts_7030/' +print('\nOutput directory:', outdir_ml) +outFile = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv' + +############################################################################### +score_type_ordermapD = { 'mcc' : 1 + , 'fscore' : 2 + , 'jcc' : 3 + , 'precision' : 4 + , 'recall' : 5 + , 'accuracy' : 6 + , 'roc_auc' : 7 + , 'TN' : 8 + , 'FP' : 9 + , 'FN' : 10 + , 'TP' : 11 + , 'trainingY_neg': 12 + , 'trainingY_pos': 13 + , 'blindY_neg' : 14 + , 'blindY_pos' : 15 + , 'fit_time' : 16 + , 'score_time' : 17 + } + +# data dependent variable +bts_size = len(X_bts) +############################################################################### +#%% TTS: 7030 split +# mm_skf_scoresD = MultModelsCl(input_df = X +# , target = y +# , var_type = 'mixed' +# , skf_cv = skf_cv +# , blind_test_input_df = X_bts +# , blind_test_target = y_bts) + +# baseline_all = pd.DataFrame(mm_skf_scoresD) +# baseline_all = baseline_all.T +# #baseline_train = baseline_all.filter(like='train_', axis=1) +# baseline_CT = baseline_all.filter(like='test_', axis=1) +# baseline_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +# baseline_BT = baseline_all.filter(like='bts_', axis=1) +# baseline_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# # Write csv +# baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv') +# baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv') + +#================ +# Baseline +# No resampling +#================ +# other data dependent variables +training_size_ns = len(X) +n_features = len(X.columns) + +scores_mmD = MultModelsCl(input_df = X + , target = y + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts + , add_cm = True + , add_yn = True) + +baseline_all_scores = pd.DataFrame(scores_mmD) + +baseline_all = baseline_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0) +baseline_all = baseline_all.reset_index() +baseline_all.rename(columns = {'index': 'original_names'}, inplace = True) + +# Indicate whether BT or CT +bt_pattern = re.compile(r'bts_.*') +baseline_all['data_source'] = baseline_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1) + +baseline_all['score_type'] = baseline_all['original_names'].str.replace('bts_|test_', '', regex = True) + +score_type_uniqueN = set(baseline_all['score_type']) +cL1 = list(score_type_ordermapD.keys()) +cL2 = list(score_type_uniqueN) + +if set(cL1).issubset(cL2): + print('\nPASS: sorting df by score that is mapped onto the order I want') + baseline_all['score_order'] = baseline_all['score_type'].map(score_type_ordermapD) + baseline_all.sort_values(by = ['data_source', 'score_order'], ascending = [True, True], inplace = True) +else: + sys.exit('\nFAIL: could not sort df as score mapping for ordering failed') + +# add cols: specific +baseline_all['resampling'] = 'none' +baseline_all['training_size'] = training_size_ns + +# add cols: common +baseline_all['n_features'] = n_features +#baseline_all['test_size'] = bts_size +#baseline_all['tts_split'] = tts_split + +############################################################################### +#%% SMOTE NC: Oversampling [Numerical + categorical] +# mm_skf_scoresD7 = MultModelsCl(input_df = X_smnc +# , target = y_smnc +# , var_type = 'mixed' +# , skf_cv = skf_cv +# , blind_test_input_df = X_bts +# , blind_test_target = y_bts) +# smnc_all = pd.DataFrame(mm_skf_scoresD7) +# smnc_all = smnc_all.T + +# smnc_CT = smnc_all.filter(like='test_', axis=1) +# smnc_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +# smnc_BT = smnc_all.filter(like='bts_', axis=1) +# smnc_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# # Write csv +# smnc_CT.to_csv(outdir_ml + gene.lower() + '_smnc_CT_allF.csv') +# smnc_BT.to_csv(outdir_ml + gene.lower() + '_smnc_BT_allF.csv') + +#================ +# Baselone +# SMOTE NC +#================ +# other data dependent variables +training_size_smnc = len(X_smnc) +n_features = len(X_smnc.columns) + +smnc_scores_mmD = MultModelsCl(input_df = X_smnc + , target = y_smnc + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts + , add_cm = True + , add_yn = True) + +smnc_all_scores = pd.DataFrame(smnc_scores_mmD) + +smnc_all = smnc_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0) +smnc_all = smnc_all.reset_index() +smnc_all.rename(columns = {'index': 'original_names'}, inplace = True) + +# Indicate whether BT or CT +bt_pattern = re.compile(r'bts_.*') +smnc_all['data_source'] = smnc_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1) + +smnc_all['score_type'] = smnc_all['original_names'].str.replace('bts_|test_', '', regex = True) + +score_type_uniqueN = set(smnc_all['score_type']) +cL1 = list(score_type_ordermapD.keys()) +cL2 = list(score_type_uniqueN) + +if set(cL1).issubset(cL2): + print('\nPASS: sorting df by score that is mapped onto the order I want') + smnc_all['score_order'] = smnc_all['score_type'].map(score_type_ordermapD) + smnc_all.sort_values(by = ['data_source', 'score_order'], ascending = [True, True], inplace = True) +else: + sys.exit('\nFAIL: could not sort df as score mapping for ordering failed') + +# add cols: specific +smnc_all['resampling'] = 'smnc' +smnc_all['training_size'] = training_size_smnc + +# add cols: common +smnc_all['n_features'] = n_features +#smnc_all['test_size'] = bts_size +#smnc_all['tts_split'] = tts_split +############################################################################### +#%% ROS: Numerical + categorical +# mm_skf_scoresD3 = MultModelsCl(input_df = X_ros +# , target = y_ros +# , var_type = 'mixed' +# , skf_cv = skf_cv +# , blind_test_input_df = X_bts +# , blind_test_target = y_bts) +# ros_all = pd.DataFrame(mm_skf_scoresD3) +# ros_all = ros_all.T + +# ros_CT = ros_all.filter(like='test_', axis=1) +# ros_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +# ros_BT = ros_all.filter(like='bts_', axis=1) +# ros_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# # Write csv +# ros_CT.to_csv(outdir_ml + gene.lower() + '_ros_CT_allF.csv') +# ros_BT.to_csv(outdir_ml + gene.lower() + '_ros_BT_allF.csv') +#================ +# Baseline +# ROS +#================ +# other data dependent variables +training_size_ros = len(X_ros) +n_features = len(X_ros.columns) + +ros_scores_mmD = MultModelsCl(input_df = X_ros + , target = y_ros + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts + , add_cm = True + , add_yn = True) + +ros_all_scores = pd.DataFrame(ros_scores_mmD) + +ros_all = ros_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0) +ros_all = ros_all.reset_index() +ros_all.rename(columns = {'index': 'original_names'}, inplace = True) + +# Indicate whether BT or CT +bt_pattern = re.compile(r'bts_.*') +ros_all['data_source'] = ros_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1) + +ros_all['score_type'] = ros_all['original_names'].str.replace('bts_|test_', '', regex = True) + +score_type_uniqueN = set(ros_all['score_type']) +cL1 = list(score_type_ordermapD.keys()) +cL2 = list(score_type_uniqueN) + +if set(cL1).issubset(cL2): + print('\nPASS: sorting df by score that is mapped onto the order I want') + ros_all['score_order'] = ros_all['score_type'].map(score_type_ordermapD) + ros_all.sort_values(by = ['data_source', 'score_order'], ascending = [True, True], inplace = True) +else: + sys.exit('\nFAIL: could not sort df as score mapping for ordering failed') + +# add cols: specific +ros_all['resampling'] = 'ros' +ros_all['training_size'] = training_size_ros + +# add cols: common +ros_all['n_features'] = n_features +#ros_all['test_size'] = bts_size +#ros_all['tts_split'] = tts_split +############################################################################### +#%% RUS: Numerical + categorical +# mm_skf_scoresD4 = MultModelsCl(input_df = X_rus +# , target = y_rus +# , var_type = 'mixed' +# , skf_cv = skf_cv +# , blind_test_input_df = X_bts +# , blind_test_target = y_bts) +# rus_all = pd.DataFrame(mm_skf_scoresD4) +# rus_all = rus_all.T + +# rus_CT = rus_all.filter(like='test_', axis=1) +# rus_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +# rus_BT = rus_all.filter(like='bts_' , axis=1) +# rus_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# # Write csv +# rus_CT.to_csv(outdir_ml + gene.lower() + '_rus_CT_allF.csv') +# rus_BT.to_csv(outdir_ml + gene.lower() + '_rus_BT_allF.csv') + +#================ +# Baseline +# RUS +#================ +# other data dependent variables +training_size_rus = len(X_rus) +n_features = len(X_rus.columns) + +rus_scores_mmD = MultModelsCl(input_df = X_rus + , target = y_rus + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts + , add_cm = True + , add_yn = True) + +rus_all_scores = pd.DataFrame(rus_scores_mmD) + +rus_all = rus_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0) +rus_all = rus_all.reset_index() +rus_all.rename(columns = {'index': 'original_names'}, inplace = True) + +# Indicate whether BT or CT +bt_pattern = re.compile(r'bts_.*') +rus_all['data_source'] = rus_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1) + +rus_all['score_type'] = rus_all['original_names'].str.replace('bts_|test_', '', regex = True) + +score_type_uniqueN = set(rus_all['score_type']) +cL1 = list(score_type_ordermapD.keys()) +cL2 = list(score_type_uniqueN) + +if set(cL1).issubset(cL2): + print('\nPASS: sorting df by score that is mapped onto the order I want') + rus_all['score_order'] = rus_all['score_type'].map(score_type_ordermapD) + rus_all.sort_values(by = ['data_source', 'score_order'], ascending = [True, True], inplace = True) +else: + sys.exit('\nFAIL: could not sort df as score mapping for ordering failed') + +# add cols: specific +rus_all['resampling'] = 'rus' +rus_all['training_size'] = training_size_rus + +# add cols: common +rus_all['n_features'] = n_features +#rus_all['test_size'] = bts_size +#rus_all['tts_split'] = tts_split +############################################################################### +#%% ROS + RUS Combined: Numerical + categorical +# mm_skf_scoresD8 = MultModelsCl(input_df = X_rouC +# , target = y_rouC +# , var_type = 'mixed' +# , skf_cv = skf_cv +# , blind_test_input_df = X_bts +# , blind_test_target = y_bts) +# rouC_all = pd.DataFrame(mm_skf_scoresD8) +# rouC_all = rouC_all.T + +# rouC_CT = rouC_all.filter(like='test_', axis=1) +# rouC_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True) + +# rouC_BT = rouC_all.filter(like='bts_', axis=1) +# rouC_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True) + +# # Write csv +# rouC_CT.to_csv(outdir_ml + gene.lower() + '_rouC_CT_allF.csv') +# rouC_BT.to_csv(outdir_ml + gene.lower() + '_rouC_BT_allF.csv') + +#================ +# Baseline +# ROUC +#================ +# other data dependent variables +training_size_rouC = len(X_rouC) +n_features = len(X_rouC.columns) + +rouC_scores_mmD = MultModelsCl(input_df = X_rouC + , target = y_rouC + , var_type = 'mixed' + , skf_cv = skf_cv + , blind_test_input_df = X_bts + , blind_test_target = y_bts + , add_cm = True + , add_yn = True) + +rouC_all_scores = pd.DataFrame(rouC_scores_mmD) + +rouC_all = rouC_all_scores.filter(regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos', axis = 0) +rouC_all = rouC_all.reset_index() +rouC_all.rename(columns = {'index': 'original_names'}, inplace = True) + +# Indicate whether BT or CT +bt_pattern = re.compile(r'bts_.*') +rouC_all['data_source'] = rouC_all.apply(lambda row: 'BT' if bt_pattern.search(row.original_names) else 'CV' , axis = 1) + +rouC_all['score_type'] = rouC_all['original_names'].str.replace('bts_|test_', '', regex = True) + +score_type_uniqueN = set(rouC_all['score_type']) +cL1 = list(score_type_ordermapD.keys()) +cL2 = list(score_type_uniqueN) + +if set(cL1).issubset(cL2): + print('\nPASS: sorting df by score that is mapped onto the order I want') + rouC_all['score_order'] = rouC_all['score_type'].map(score_type_ordermapD) + rouC_all.sort_values(by = ['data_source', 'score_order'], ascending = [True, True], inplace = True) +else: + sys.exit('\nFAIL: could not sort df as score mapping for ordering failed') + +# add cols: specific +rouC_all['resampling'] = 'rouC' +rouC_all['training_size'] = training_size_rouC + +# add cols: common +rouC_all['n_features'] = n_features +#rouC_all['test_size'] = bts_size +#rouC_all['tts_split'] = tts_split + + + + + +############################################################################### +#%% COMBINING all FG dfs +#================ +# Combine all +# https://stackoverflow.com/questions/39862654/pandas-concat-of-multiple-data-frames-using-only-common-columns +#================ +dfs_combine = [baseline_all, smnc_all, ros_all, rus_all, rouC_all ] + +dfs_nrows = [] +for df in dfs_combine: + dfs_nrows = dfs_nrows + [len(df)] +dfs_nrows = max(dfs_nrows) + +dfs_ncols = [] +for df in dfs_combine: + dfs_ncols = dfs_ncols + [len(df.columns)] +dfs_ncols = max(dfs_ncols) + +# dfs_ncols = [] +# dfs_ncols2 = mode(dfs_ncols.append(len(df.columns) for df in dfs_combine) +# dfs_ncols2 + +expected_nrows = len(dfs_combine) * dfs_nrows +expected_ncols = dfs_ncols + +common_cols = list(set.intersection(*(set(df.columns) for df in dfs_combine))) + +if len(common_cols) == dfs_ncols : + combined_baseline = pd.concat([df[common_cols] for df in dfs_combine], ignore_index=True) + resampling_methods = combined_baseline[['resampling', 'training_size']] + resampling_methods = resampling_methods.drop_duplicates() + print('\nConcatenating dfs with different resampling methods:', tts_split + , '\nNo. of dfs combining:', len(dfs_combine) + , '\nThe sampling methods are:' + , '\n', resampling_methods) + if len(combined_baseline) == expected_nrows and len(combined_baseline.columns) == expected_ncols: + print('\nPASS:', len(dfs_combine), 'dfs successfully combined' + , '\nnrows in combined_df:', len(combined_baseline) + , '\nncols in combined_df:', len(combined_baseline.columns)) + else: + print('\nFAIL: concatenating failed' + , '\nExpected nrows:', expected_nrows + , '\nGot:', len(combined_baseline) + , '\nExpected ncols:', expected_ncols + , '\nGot:', len(combined_baseline.columns)) + sys.exit() +else: + sys.exit('\nConcatenting dfs not possible,check numbers ') + +# Add further column indications +combined_baseline['test_size'] = bts_size +combined_baseline['tts_split'] = tts_split + +# TODO: +# ADD y target ration for all + +# # rpow bind +# if all(ll((baseline_all.columns == baseline_GN.columns == baseline_STR.columns)): +# print('\nPASS:colnames match, proceeding to rowbind') +# comb_df = pd.concat()], axis = 0, ignore_index = True ) combined_baseline +############################################################################### +#==================== +# Write output file +#==================== +combined_baseline.to_csv(outFile, index = False) +print('\nFile successfully written:', outFile) +############################################################################### \ No newline at end of file diff --git a/scripts/ml/run_fg.py b/scripts/ml/run_fg.py index d9a504a..144438a 100755 --- a/scripts/ml/run_fg.py +++ b/scripts/ml/run_fg.py @@ -30,9 +30,9 @@ os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/') #================== # Import data #================== -from ml_data_dissected import * +from ml_data_fg import * setvars(gene,drug) -from ml_data_dissected import * +from ml_data_fg import * # from YC run_all_ML: run locally #from UQ_yc_RunAllClfs import run_all_ML @@ -60,7 +60,7 @@ outFile = outdir_ml + gene.lower() + '_baseline_FG.csv' #================== # other vars #================== -tts_split_name = 'original' +tts_split = 'original' resampling = 'none' ############################################################################### @@ -177,7 +177,7 @@ else: baseline_EV['feature_group'] = feature_gp_nameEV baseline_EV['resampling'] = resampling -baseline_EV['tts_split'] = tts_split_name +baseline_EV['tts_split'] = tts_split baseline_EV['n_features'] = n_featuresEV ############################################################################### #================ @@ -221,7 +221,7 @@ else: baseline_GN['feature_group'] = feature_gp_nameGN baseline_GN['resampling'] = resampling -baseline_GN['tts_split'] = tts_split_name +baseline_GN['tts_split'] = tts_split baseline_GN['n_features'] = n_featuresGN ############################################################################### #all_featuresN = X_evolFN + X_structural_FN + X_genomicFN @@ -268,7 +268,7 @@ else: baseline_STR['feature_group'] = feature_gp_nameSTR baseline_STR['resampling'] = resampling -baseline_STR['tts_split'] = tts_split_name +baseline_STR['tts_split'] = tts_split baseline_STR['n_features'] = n_featuresSTR ############################################################################## #================ @@ -312,7 +312,7 @@ else: baseline_STB['feature_group'] = feature_gp_nameSTB baseline_STB['resampling'] = resampling -baseline_STB['tts_split'] = tts_split_name +baseline_STB['tts_split'] = tts_split baseline_STB['n_features'] = n_featuresSTB ############################################################################### #================ @@ -356,7 +356,7 @@ else: baseline_AFF['feature_group'] = feature_gp_nameAFF baseline_AFF['resampling'] = resampling -baseline_AFF['tts_split'] = tts_split_name +baseline_AFF['tts_split'] = tts_split baseline_AFF['n_features'] = n_featuresAFF ############################################################################### #================ @@ -400,7 +400,7 @@ else: baseline_RES['feature_group'] = feature_gp_nameRES baseline_RES['resampling'] = resampling -baseline_RES['tts_split'] = tts_split_name +baseline_RES['tts_split'] = tts_split baseline_RES['n_features'] = n_featuresRES ############################################################################### #================ @@ -446,7 +446,7 @@ else: baseline_RNAA['feature_group'] = feature_gp_nameRNAA baseline_RNAA['resampling'] = resampling -baseline_RNAA['tts_split'] = tts_split_name +baseline_RNAA['tts_split'] = tts_split baseline_RNAA['n_features'] = n_featuresRNAA ############################################################################### #================ @@ -492,7 +492,7 @@ else: baseline_SNAA['feature_group'] = feature_gp_nameSNAA baseline_SNAA['resampling'] = resampling -baseline_SNAA['tts_split'] = tts_split_name +baseline_SNAA['tts_split'] = tts_split baseline_SNAA['n_features'] = n_featuresSNAA ############################################################################### #%% COMBINING all FG dfs @@ -525,7 +525,7 @@ if len(common_cols) == dfs_ncols : combined_FG_baseline = pd.concat([df[common_cols] for df in dfs_combine], ignore_index=True) fgs = combined_FG_baseline[['feature_group', 'n_features']] fgs = fgs.drop_duplicates() - print('\nConcatenating dfs with feature groups after ML analysis (sampling type):' + print('\nConcatenating dfs with feature groups after ML analysis:' , '\nNo. of dfs combining:', len(dfs_combine) , '\nSampling type:', resampling , '\nThe feature groups are:'