Added run_7030.py, which runs from the command line for all gene targets and sampling methods and outputs a single CSV

2022-06-21 20:37:53 +01:00 · 2022-06-21 20:37:53 +01:00 · bc12dbd7c2
commit bc12dbd7c2
parent 5b0ccdfec4
5 changed files with 749 additions and 229 deletions
--- a/scripts/ml/Mult_dissected_CALL.py
+++ b/scripts/ml/Mult_dissected_CALL.py
@ -52,7 +52,7 @@ sampling_type_name   = 'none'
 feature_gp_nameEV      = 'evolutionary'
 n_featuresEV           = len(X_evolFN)

-scores_mmEV = MultModelsCl_dissected(input_df = X[X_evolFN]
+scores_mmEV = MultModelsCl(input_df = X[X_evolFN]
                    , target = y
                    , var_type = 'mixed'
                    , skf_cv = skf_cv
@ -96,7 +96,7 @@ baseline_EV['n_features']    = n_featuresEV
 feature_gp_nameGN      = 'genomics'
 n_featuresGN           = len(X_genomicFN)

-scores_mmGN = MultModelsCl_dissected(input_df = X[X_genomicFN]
+scores_mmGN = MultModelsCl(input_df = X[X_genomicFN]
                    , target = y
                    , var_type = 'mixed'
                    , skf_cv = skf_cv
@ -143,7 +143,7 @@ baseline_GN['n_features']    = n_featuresGN
 feature_gp_nameSTR      = 'structural'
 n_featuresSTR           = len(X_structural_FN)

-scores_mmSTR = MultModelsCl_dissected(input_df = X[X_structural_FN]
+scores_mmSTR = MultModelsCl(input_df = X[X_structural_FN]
                    , target = y
                    , var_type = 'mixed'
                    , skf_cv = skf_cv
@ -187,7 +187,7 @@ baseline_STR['n_features']    = n_featuresSTR
 feature_gp_nameSTB      = 'stability'
 n_featuresSTB           = len(X_stability_FN)

-scores_mmSTB = MultModelsCl_dissected(input_df = X[X_stability_FN]
+scores_mmSTB = MultModelsCl(input_df = X[X_stability_FN]
                    , target = y
                    , var_type = 'mixed'
                    , skf_cv = skf_cv
@ -231,7 +231,7 @@ baseline_STB['n_features']    = n_featuresSTB
 feature_gp_nameAFF      = 'affinity'
 n_featuresAFF           = len(X_affinityFN)

-scores_mmAFF = MultModelsCl_dissected(input_df = X[X_affinityFN]
+scores_mmAFF = MultModelsCl(input_df = X[X_affinityFN]
                    , target = y
                    , var_type = 'mixed'
                    , skf_cv = skf_cv
@ -275,7 +275,7 @@ baseline_AFF['n_features']    = n_featuresAFF
 feature_gp_nameRES      = 'residue_prop'
 n_featuresRES           = len(X_resprop_FN)

-scores_mmRES = MultModelsCl_dissected(input_df = X[X_resprop_FN]
+scores_mmRES = MultModelsCl(input_df = X[X_resprop_FN]
                    , target = y
                    , var_type = 'mixed'
                    , skf_cv = skf_cv
@ -321,7 +321,7 @@ X_respropNOaaFN = list(set(X_resprop_FN) - set(X_aaindex_Fnum))
 feature_gp_nameRNAA      = 'ResPropNoAA'
 n_featuresRNAA           = len(X_respropNOaaFN)

-scores_mmRNAA = MultModelsCl_dissected(input_df = X[X_respropNOaaFN]
+scores_mmRNAA = MultModelsCl(input_df = X[X_respropNOaaFN]
                    , target = y
                    , var_type = 'mixed'
                    , skf_cv = skf_cv
@ -367,7 +367,7 @@ X_strNOaaFN = list(set(X_structural_FN) - set(X_aaindex_Fnum))
 feature_gp_nameSNAA      = 'StrNoAA'
 n_featuresSNAA           = len(X_strNOaaFN)

-scores_mmSNAA = MultModelsCl_dissected(input_df = X[X_strNOaaFN]
+scores_mmSNAA = MultModelsCl(input_df = X[X_strNOaaFN]
                    , target = y
                    , var_type = 'mixed'
                    , skf_cv = skf_cv
--- a/scripts/ml/ml_data_7030.py
+++ b/scripts/ml/ml_data_7030.py
@ -34,6 +34,8 @@ def setvars(gene,drug):
    from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold
    
    from sklearn.pipeline import Pipeline, make_pipeline
+    import argparse
+    import re
    #%% GLOBALS
    rs = {'random_state': 42}
    njobs = {'n_jobs': 10}
@ -422,118 +424,31 @@ def setvars(gene,drug):
    #==========================
    my_df_ml = my_df.copy()
    
-    #%% Build X: input for ML
-    common_cols_stabiltyN = ['ligand_distance'
-               , 'ligand_affinity_change'
-               , 'duet_stability_change'
-               , 'ddg_foldx'
-               , 'deepddg'
-               , 'ddg_dynamut2'
-               , 'mmcsm_lig'
-               , 'contacts']
-    
-    # Build stability columns ~ gene
+    # Build column names to mask for affinity changes
    if gene.lower() in geneL_basic:
-        X_stabilityN = common_cols_stabiltyN
+        #X_stabilityN = common_cols_stabiltyN
+        gene_affinity_colnames = []# not needed as its the common ones 
        cols_to_mask = ['ligand_affinity_change']
        
    if gene.lower() in geneL_ppi2:
-    #    X_stabilityN = common_cols_stabiltyN + ['mcsm_ppi2_affinity' , 'interface_dist'] 
-        geneL_ppi2_st_cols = ['mcsm_ppi2_affinity', 'interface_dist'] 
-        X_stabilityN = common_cols_stabiltyN + geneL_ppi2_st_cols
+        gene_affinity_colnames = ['mcsm_ppi2_affinity', 'interface_dist'] 
+        #X_stabilityN = common_cols_stabiltyN + geneL_ppi2_st_cols
        cols_to_mask = ['ligand_affinity_change', 'mcsm_ppi2_affinity']
    
    if gene.lower() in geneL_na:
-    #    X_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] 
-        geneL_na_st_cols =  ['mcsm_na_affinity'] 
-        X_stabilityN = common_cols_stabiltyN + geneL_na_st_cols
+        gene_affinity_colnames =  ['mcsm_na_affinity'] 
+        #X_stabilityN = common_cols_stabiltyN + geneL_na_st_cols
        cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity']
    
    if gene.lower() in geneL_na_ppi2:
-    #    X_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] 
-        geneL_na_ppi2_st_cols = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] 
-        X_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols
+        gene_affinity_colnames = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] 
+        #X_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols
        cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity', 'mcsm_ppi2_affinity']
    
-    
-    X_foldX_cols = [ 'electro_rr', 'electro_mm', 'electro_sm', 'electro_ss'
-    , 'disulfide_rr', 'disulfide_mm', 'disulfide_sm', 'disulfide_ss'
-    , 'hbonds_rr', 'hbonds_mm', 'hbonds_sm', 'hbonds_ss'
-    , 'partcov_rr', 'partcov_mm', 'partcov_sm', 'partcov_ss'
-    , 'vdwclashes_rr', 'vdwclashes_mm', 'vdwclashes_sm', 'vdwclashes_ss'
-    , 'volumetric_rr', 'volumetric_mm', 'volumetric_ss'
-    ]
-    
-    X_str =  ['rsa'
-               #, 'asa'
-               , 'kd_values'
-               , 'rd_values']    
-    
-    X_ssFN = X_stabilityN + X_str + X_foldX_cols
-    
-    X_evolFN =  ['consurf_score'
-               , 'snap2_score'
-               , 'provean_score']
-        
-    X_genomic_mafor =  ['maf'
-                    , 'logorI'
-                    # , 'or_rawI'
-                    # , 'or_mychisq'
-                    # , 'or_logistic'
-                    # , 'or_fisher'
-                    # , 'pval_fisher'
-                    ]
-    
-    X_genomic_linegae  = ['lineage_proportion'
-                          , 'dist_lineage_proportion'
-                          #, 'lineage' # could be included as a category but it has L2;L4  formatting
-                          , 'lineage_count_all'
-                          , 'lineage_count_unique'
-                          ]
-    
-    X_genomicFN = X_genomic_mafor + X_genomic_linegae
-    
-    X_aaindexFN = list(aa_df_cols)
-    
-    print('\nTotal no. of features for aaindex:', len(X_aaindexFN))
-    
-    # numerical feature names
-    numerical_FN = X_ssFN  + X_evolFN + X_genomicFN + X_aaindexFN
-      
-    # categorical feature names
-    categorical_FN = ['ss_class'
-                # , 'wt_prop_water'
-                # , 'mut_prop_water'
-                # , 'wt_prop_polarity'
-                # , 'mut_prop_polarity'
-                # , 'wt_calcprop'
-                # , 'mut_calcprop'
-                , 'aa_prop_change'
-                , 'electrostatics_change'
-                , 'polarity_change'
-                , 'water_change'
-                , 'drtype_mode_labels' # beware then you can't use it to predict [USED it for uq_v1, not v2]
-                , 'active_site' #[didn't use it for uq_v1]
-                #, 'gene_name' # will be required for the combined stuff
-                 ]
-                 
-    #----------------------------------------------
-    # count numerical and categorical features
-    #----------------------------------------------
-    
-    print('\nNo. of numerical features:', len(numerical_FN)
-          , '\nNo. of categorical features:', len(categorical_FN))
-    
    #=======================
    # Masking columns:
    # (mCSM-lig, mCSM-NA, mCSM-ppi2) values for lig_dist >10
    #=======================
-    # my_df_ml['mutationinformation'][my_df['ligand_distance']>10].value_counts()
-    # my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts()
-    
-    # my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), 'ligand_affinity_change'] = 0
-    # (my_df_ml['ligand_affinity_change'] == 0).sum()
-    
    my_df_ml['mutationinformation'][my_df_ml['ligand_distance']>10].value_counts()
    my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts()
    my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask].value_counts()
@ -544,23 +459,149 @@ def setvars(gene,drug):
    
    mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask]  
    
+    #===================================================
    # write file for check
    mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True)
-    mask_check.to_csv(outdir + 'ml/' + gene.lower() + '_mask_check.csv') 
+    mask_check.to_csv(outdir + 'ml/' + gene.lower() + '_mask_check.csv')
+    #===================================================
+    ###############################################################################
+    #%% Feature groups (FG): Build X for Input ML 
+    ############################################################################
+    #===========================
+    # FG1: Evolutionary features
+    #===========================
+    X_evolFN =  ['consurf_score'
+               , 'snap2_score'
+               , 'provean_score']
+    
+    ###############################################################################
+    #========================
+    # FG2: Stability features
+    #========================
+    #--------
+    # common
+    #--------
+    X_common_stability_Fnum = [
+               'duet_stability_change'
+               , 'ddg_foldx'
+               , 'deepddg'
+               , 'ddg_dynamut2'
+               , 'contacts']
+    #--------
+    # FoldX
+    #--------
+    X_foldX_Fnum = [ 'electro_rr', 'electro_mm', 'electro_sm', 'electro_ss'
+    , 'disulfide_rr', 'disulfide_mm', 'disulfide_sm', 'disulfide_ss'
+    , 'hbonds_rr', 'hbonds_mm', 'hbonds_sm', 'hbonds_ss'
+    , 'partcov_rr', 'partcov_mm', 'partcov_sm', 'partcov_ss'
+    , 'vdwclashes_rr', 'vdwclashes_mm', 'vdwclashes_sm', 'vdwclashes_ss'
+    , 'volumetric_rr', 'volumetric_mm', 'volumetric_ss']
+    
+    X_stability_FN = X_common_stability_Fnum + X_foldX_Fnum
+    
+    ###############################################################################
+    #===================
+    # FG3: Affinity features
+    #===================
+    common_affinity_Fnum =  ['ligand_distance'
+                    , 'ligand_affinity_change'
+                    , 'mmcsm_lig']
+    
+    # if gene.lower() in geneL_basic:
+    #     X_affinityFN = common_affinity_Fnum 
+    # else:
+    #     X_affinityFN = common_affinity_Fnum + gene_affinity_colnames
+        
+    X_affinityFN = common_affinity_Fnum + gene_affinity_colnames
+    
+    ###############################################################################
+    #============================
+    # FG4: Residue level features
+    #============================
+    #-----------
+    # AA index
+    #-----------
+    X_aaindex_Fnum = list(aa_df_cols)
+    print('\nTotal no. of features for aaindex:', len(X_aaindex_Fnum))
+    
+    #-----------------
+    # surface area
+    # depth
+    # hydrophobicity
+    #-----------------
+    X_str_Fnum =  ['rsa'
+               #, 'asa'
+               , 'kd_values'
+               , 'rd_values']   
+    
+    #---------------------------
+    # Other aa properties
+    # active site indication
+    #---------------------------
+    X_aap_Fcat = ['ss_class'
+                # , 'wt_prop_water'
+                # , 'mut_prop_water'
+                # , 'wt_prop_polarity'
+                # , 'mut_prop_polarity'
+                # , 'wt_calcprop'
+                # , 'mut_calcprop'
+                , 'aa_prop_change'
+                , 'electrostatics_change'
+                , 'polarity_change'
+                , 'water_change'
+                , 'active_site']
       
-    #####################################################################
+    X_resprop_FN = X_aaindex_Fnum + X_str_Fnum + X_aap_Fcat
+    ###############################################################################
+    #========================
+    # FG5: Genomic features
+    #========================
+    X_gn_mafor_Fnum =  ['maf'
+                    , 'logorI'
+                    # , 'or_rawI'
+                    # , 'or_mychisq'
+                    # , 'or_logistic'
+                    # , 'or_fisher'
+                    # , 'pval_fisher'
+                    ]
+    
+    X_gn_linegae_Fnum  = ['lineage_proportion'
+                          , 'dist_lineage_proportion'
+                          #, 'lineage' # could be included as a category but it has L2;L4  formatting
+                          , 'lineage_count_all'
+                          , 'lineage_count_unique'
+                          ]
+    
+    X_gn_Fcat = ['drtype_mode_labels'  # beware then you can't use it to predict [USED it for uq_v1, not v2]
+                   #, 'gene_name' # will be required for the combined stuff
+                 ]
+    
+    X_genomicFN = X_gn_mafor_Fnum + X_gn_linegae_Fnum + X_gn_Fcat
+    ###############################################################################
+    #========================
+    # FG6 collapsed: Structural : Stability + Affinity + ResidueProp
+    #========================
+    X_structural_FN =  X_stability_FN + X_affinityFN + X_resprop_FN
+    
+    ###############################################################################
+    #========================
+    # BUILDING all features
+    #========================
+    all_featuresN = X_evolFN + X_structural_FN + X_genomicFN
+    
+    ###############################################################################
+    #%% Define training and test data
    #================================================================
    # Training and BLIND test set: 70/30
-    
-    # Throw away previous blind_test_df, and call the 30% data as blind_test
-    # as these were imputed values and initial analysis shows that this
-    # is not very representative
+    # dst with actual values  : training set
+    # dst with imputed values : THROW AWAY [unrepresentative]
    #================================================================
    my_df_ml[drug].isna().sum()
+    
    #    blind_test_df = my_df_ml[my_df_ml[drug].isna()]
    #    blind_test_df.shape
    
-    training_df =  my_df_ml[my_df_ml[drug].notna()]
+    training_df = my_df_ml[my_df_ml[drug].notna()]
    training_df.shape
    
    # Target 1: dst_mode
@ -568,80 +609,14 @@ def setvars(gene,drug):
    training_df['dst_mode'].value_counts()
    
    ####################################################################
-    
-###############################################################################
-###############################################################################
-    # #%% extracting dfs based on numerical, categorical column names
-    # #----------------------------------
-    # # WITHOUT the target var included
-    # #----------------------------------
-    # num_df = training_df[numerical_FN]
-    # num_df.shape
-    
-    # cat_df = training_df[categorical_FN]
-    # cat_df.shape
-    
-    # all_df = training_df[numerical_FN + categorical_FN]
-    # all_df.shape
-    
-    # #------------------------------
-    # # WITH the target var included:
-    #     #'wtgt': with target
-    # #------------------------------
-    # # drug and dst_mode should be the same thing
-    # num_df_wtgt = training_df[numerical_FN + ['dst_mode']]
-    # num_df_wtgt.shape
-    
-    # cat_df_wtgt = training_df[categorical_FN + ['dst_mode']]
-    # cat_df_wtgt.shape
-    
-    # all_df_wtgt = training_df[numerical_FN + categorical_FN + ['dst_mode']]
-    # all_df_wtgt.shape
-    
-    #%%########################################################################
-    # #============
-    # # ML data: OLD
-    # #============
-    # #------
-    # # X: Training and Blind test (BTS)
-    # #------
-    # X     = all_df_wtgt[numerical_FN + categorical_FN] # training data ALL
-    # X_bts = blind_test_df[numerical_FN + categorical_FN] # blind test data ALL
-    # #X = all_df_wtgt[numerical_FN] # training numerical only
-    # #X_bts = blind_test_df[numerical_FN] # blind test data numerical
-    
-    # #------
-    # # y
-    # #------
-    # y = all_df_wtgt['dst_mode'] # training data y
-    # y_bts = blind_test_df['dst_mode'] # blind data test y
-    
-    # #X_bts_wt = blind_test_df[numerical_FN + ['dst_mode']] 
-    
-    # # Quick check
-    # #(X['ligand_affinity_change']==0).sum() == (X['ligand_distance']>10).sum()
-    # for i in range(len(cols_to_mask)):
-    #     ind = i+1
-    #     print('\nindex:', i, '\nind:', ind)
-    #     print('\nMask count check:'
-    #           , (my_df_ml[cols_to_mask[i]]==0).sum() == (my_df_ml['ligand_distance']>10).sum()
-    #           )
-    
-    # print('Original Data\n', Counter(y)
-    #       , 'Data dim:', X.shape)
-    
-###############################################################################
-###############################################################################    
    #====================================
    # ML data: Train test split: 70/30
    # with stratification
    # 70% : training_data for CV
    # 30% : blind test 
    #=====================================
-      
-    # features: all_df or
-    x_features = training_df[numerical_FN + categorical_FN]
-    y_target = training_df['dst_mode']
+    x_features = training_df[all_featuresN]
+    y_target   = training_df['dst_mode']
    
    # sanity check
    if not 'dst_mode' in x_features.columns:
@ -652,7 +627,9 @@ def setvars(gene,drug):
        #https://towardsdatascience.com/finally-why-we-use-an-80-20-split-for-training-and-test-data-plus-an-alternative-method-oh-yes-edc77e96295d
    else:
        sys.exit('\nFAIL: x_features has target variable included. FIX it and rerun!')
-    
+    #-------------------
+    # train-test split
+    #-------------------
    #x_train, x_test, y_train, y_test # traditional var_names
    # so my downstream code doesn't need to change    
    X, X_bts, y, y_bts = train_test_split(x_features, y_target
@ -664,16 +641,64 @@ def setvars(gene,drug):
    
    yc2 = Counter(y_bts)
    yc2_ratio = yc2[0]/yc2[1]
-
+    
+    ###############################################################################
+    #======================================================
+    # Determine categorical and numerical features
+    #======================================================
+    numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
+    numerical_cols 
+    categorical_cols = X.select_dtypes(include=['object', 'bool']).columns
+    categorical_cols 
+    
+    ################################################################################
+    # IMPORTANT sanity checks
+    if len(X.columns) == len(X_evolFN) + len(X_stability_FN) + len(X_affinityFN) + len(X_resprop_FN) + len(X_genomicFN):
+        print('\nPASS: ML data with input features, training and test generated...'
+              , '\n\nTotal no. of input features:'        , len(X.columns)
+              , '\n--------No. of numerical features:'    , len(numerical_cols)
+              , '\n--------No. of categorical features:'  , len(categorical_cols)
+              
+              , '\n\nTotal no. of evolutionary features:' , len(X_evolFN)
+              
+              , '\n\nTotal no. of stability features:'    , len(X_stability_FN)
+              , '\n--------Common stabilty cols:'         , len(X_common_stability_Fnum)
+              , '\n--------Foldx cols:'                   , len(X_foldX_Fnum)
+              
+              , '\n\nTotal no. of affinity features:'     , len(X_affinityFN)
+              , '\n--------Common affinity cols:'         , len(common_affinity_Fnum)
+              , '\n--------Gene specific affinity cols:'  , len(gene_affinity_colnames)
+              
+              , '\n\nTotal no. of residue level features:', len(X_resprop_FN)
+              , '\n--------AA index cols:'                , len(X_aaindex_Fnum)
+              , '\n--------Residue Prop cols:'            , len(X_str_Fnum)
+              , '\n--------AA change Prop cols:'          , len(X_aap_Fcat)
+              
+              , '\n\nTotal no. of genomic features:'      , len(X_genomicFN)
+              , '\n--------MAF+OR cols:'                  , len(X_gn_mafor_Fnum)
+              , '\n--------Lineage cols:'                 , len(X_gn_linegae_Fnum)
+              , '\n--------Other cols:'                   , len(X_gn_Fcat)
+              )
+    else:
+        print('\nFAIL: numbers mismatch'
+              , '\nExpected:',len(X_evolFN) + len(X_stability_FN) + len(X_affinityFN) + len(X_resprop_FN) + len(X_genomicFN)
+              , '\nGot:', len(X.columns))
+        sys.exit()
+    ###############################################################################
    print('\n-------------------------------------------------------------'
-          , '\nSuccessfully split data with stratification: 70/30'
-          , '\nInput features data size:', x_features.shape
-          , '\nTrain data size:', X.shape
-          , '\nTest data size:', X_bts.shape
+          , '\nSuccessfully split data: ALL features'
+          , '\nactual values: training set'
+          , '\nimputed values: blind test set'
+          
+          , '\n\nTotal data size:', len(X) + len(X_bts)
+    
+          , '\n\nTrain data size:', X.shape
          , '\ny_train numbers:', yc1
-          , '\ny_train ratio:',yc1_ratio
-          , '\n'
+    
+          , '\n\nTest data size:', X_bts.shape
          , '\ny_test_numbers:', yc2
+    
+          , '\n\ny_train ratio:',yc1_ratio
          , '\ny_test ratio:', yc2_ratio
          , '\n-------------------------------------------------------------'
          )
@ -700,7 +725,7 @@ def setvars(gene,drug):
    #------------------------------
    oversample = RandomOverSampler(sampling_strategy='minority')
    X_ros, y_ros = oversample.fit_resample(X, y)
-    print('\nSimple Random OverSampling\n', Counter(y_ros))
+    print('Simple Random OverSampling\n', Counter(y_ros))
    print(X_ros.shape)
    
    #------------------------------
@ -709,7 +734,7 @@ def setvars(gene,drug):
    #------------------------------
    undersample = RandomUnderSampler(sampling_strategy='majority')
    X_rus, y_rus = undersample.fit_resample(X, y)
-    print('\nSimple Random UnderSampling\n', Counter(y_rus))
+    print('Simple Random UnderSampling\n', Counter(y_rus))
    print(X_rus.shape)
    
    #------------------------------
@ -720,7 +745,7 @@ def setvars(gene,drug):
    X_ros, y_ros = oversample.fit_resample(X, y)
    undersample = RandomUnderSampler(sampling_strategy='majority')
    X_rouC, y_rouC = undersample.fit_resample(X_ros, y_ros)
-    print('\nSimple Combined Over and UnderSampling\n',  Counter(y_rouC))
+    print('Simple Combined Over and UnderSampling\n',  Counter(y_rouC))
    print(X_rouC.shape)
    
    #------------------------------
@ -740,7 +765,7 @@ def setvars(gene,drug):
    categorical_colind = X.columns.get_indexer(list(categorical_ix))
    categorical_colind
    
-    k_sm = 5 # 5 is default
+    k_sm = 5 # 5 is deafult
    sm_nc = SMOTENC(categorical_features=categorical_colind, k_neighbors = k_sm, **rs, **njobs)
    X_smnc, y_smnc = sm_nc.fit_resample(X, y)
    print('\nSMOTE_NC OverSampling\n', Counter(y_smnc))
--- a/scripts/ml/ml_data_fg.py
+++ b/scripts/ml/ml_data_fg.py
@ -61,7 +61,6 @@ def setvars(gene,drug):
    jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
    
    #%% FOR LATER: Combine ED logo data
-    #%% DONE: active aa site annotations **DONE on 15/05/2022 as part of generating merged_dfs
    ###########################################################################
    rs = {'random_state': 42}
    njobs = {'n_jobs': 10}
@ -419,7 +418,7 @@ def setvars(gene,drug):
    #---------------------------------------
    #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    
-    #%% Data for ML
+    #%%########################################################################
    #==========================
    #     Data for ML
    #==========================
@ -551,8 +550,7 @@ def setvars(gene,drug):
                , 'polarity_change'
                , 'water_change'
                , 'active_site']
-    
-    
+       
    X_resprop_FN = X_aaindex_Fnum + X_str_Fnum + X_aap_Fcat
    ###############################################################################
    #========================
@ -594,8 +592,7 @@ def setvars(gene,drug):
    ###############################################################################
    #%% Define training and test data
    #======================================================
-    # Training and BLIND test set [UQ]: actual vs imputed
-    # No aa index but active_site included
+    # Training and BLIND test set: actual vs imputed
    # dst with actual values  : training set
    # dst with imputed values : blind test
    #======================================================
@ -612,9 +609,9 @@ def setvars(gene,drug):
    training_df['dst_mode'].value_counts()
    
    ####################################################################
-    #============
-    # ML data
-    #============
+    #=====================================
+    # ML data: actual vs imputed 
+    #=====================================
    #------
    # X: Training and Blind test (BTS)
    #------
@ -625,20 +622,8 @@ def setvars(gene,drug):
    # y
    #------
    y     = training_df['dst_mode']
-    y_bts = blind_test_df['dst_mode']
-    
-    # Quick check
-    #(X['ligand_affinity_change']==0).sum() == (X['ligand_distance']>10).sum()
-    for i in range(len(cols_to_mask)):
-        ind = i+1
-        print('\nindex:', i, '\nind:', ind)
-        print('\nMask count check:'
-              , (my_df_ml[cols_to_mask[i]]==0).sum() == (my_df_ml['ligand_distance']>10).sum()
-              )
-    
-    print('Original Data\n', Counter(y)
-          , 'Data dim:', X.shape)
-    
+    y_bts = blind_test_df['dst_mode']  
+   
    yc1 = Counter(y)
    yc1_ratio = yc1[0]/yc1[1]
    
@ -705,7 +690,18 @@ def setvars(gene,drug):
          , '\ny_test ratio:', yc2_ratio
          , '\n-------------------------------------------------------------'
          )
+    ##########################################################################    
+    # Quick check
+    #(X['ligand_affinity_change']==0).sum() == (X['ligand_distance']>10).sum()
+    for i in range(len(cols_to_mask)):
+        ind = i+1
+        print('\nindex:', i, '\nind:', ind)
+        print('\nMask count check:'
+              , (my_df_ml[cols_to_mask[i]]==0).sum() == (my_df_ml['ligand_distance']>10).sum()
+              )
    
+    print('Original Data\n', Counter(y)
+          , 'Data dim:', X.shape)
    ###########################################################################
    #%% 
    ###########################################################################
@ -760,7 +756,7 @@ def setvars(gene,drug):
    k_sm = 5 # 5 is deafult
    sm_nc = SMOTENC(categorical_features=categorical_colind, k_neighbors = k_sm, **rs, **njobs)
    X_smnc, y_smnc = sm_nc.fit_resample(X, y)
-    print('SMOTE_NC OverSampling\n', Counter(y_smnc))
+    print('\nSMOTE_NC OverSampling\n', Counter(y_smnc))
    print(X_smnc.shape)
    globals().update(locals()) # TROLOLOLOLOLOLS
    #print("i did a horrible hack :-)")
@ -774,7 +770,7 @@ def setvars(gene,drug):
    # sm = SMOTE(sampling_strategy = 'auto', k_neighbors = k_sm, **rs)
    # X_sm, y_sm = sm.fit_resample(X, y)
    # print(X_sm.shape)
-    # print('SMOTE OverSampling\n', Counter(y_sm))
+    # print('\nSMOTE OverSampling\n', Counter(y_sm))
    # y_sm_df = y_sm.to_frame()
    # y_sm_df.value_counts().plot(kind = 'bar')
    
@ -785,7 +781,7 @@ def setvars(gene,drug):
    # sm_enn = SMOTEENN(enn=EditedNearestNeighbours(sampling_strategy='all', **rs, **njobs ))
    # X_enn, y_enn = sm_enn.fit_resample(X, y)
    # print(X_enn.shape)
-    # print('SMOTE Over+Under Sampling combined\n', Counter(y_enn))
+    # print('\nSMOTE Over+Under Sampling combined\n', Counter(y_enn))
    
    ###############################################################################
    # TODO: Find over and undersampling JUST for categorical data
--- a/scripts/ml/run_7030.py
+++ b/scripts/ml/run_7030.py
@ -0,0 +1,499 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Jun 20 13:05:23 2022
+
+@author: tanu
+"""
+import re
+import argparse
+###############################################################################
+# gene  = 'pncA'
+# drug  = 'pyrazinamide'
+#total_mtblineage_uc = 8
+
+#%% command line args: case sensitive; both flags are optional and default to ''
+arg_parser = argparse.ArgumentParser()
+arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
+arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '') 
+args = arg_parser.parse_args()
+
+drug    = args.drug # passed to setvars() and echoed in the run banner below
+gene    = args.gene # passed to setvars(); its lower-cased form prefixes the output file name
+
+###############################################################################
+#==================
+# other vars: labels describing the train/test split used by this script
+#==================
+tts_split    = '70/30' # written to the 'tts_split' column of the combined table
+OutFile_suffix  = '7030' # appended to the output file name
+###############################################################################
+#==================
+# Import data: setvars(gene, drug) runs between two identical star-imports
+#==================
+from ml_data_7030 import * # setvars() is not defined in this script; presumably it comes from this import
+setvars(gene,drug)
+from ml_data_7030 import * # NOTE(review): presumably re-reads names that setvars() publishes as module globals -- confirm in ml_data_7030
+
+# from YC run_all_ML: run locally
+#from UQ_yc_RunAllClfs import run_all_ML
+
+#====================
+# Import ML function 
+#====================
+# TT run all ML clfs: baseline model
+from MultModelsCl import MultModelsCl
+
+############################################################################
+print('\n#####################################################################\n'
+      , '\nRunning ML analysis: feature groups '
+      , '\nGene name:', gene
+      , '\nDrug name:', drug)
+
+#==================
+# Specify outdir: results go to the 'ml/tts_7030/' subfolder of outdir
+#==================
+outdir_ml = outdir + 'ml/tts_7030/' # outdir is not defined in this script; presumably supplied by the star-import above
+print('\nOutput directory:', outdir_ml)
+outFile = outdir_ml + gene.lower() + '_baselineC_' + OutFile_suffix + '.csv'
+
+###############################################################################
+# Score types listed in the order they should appear in the output;
+# the mapping below gives each one its 1-based rank.
+_score_types_ranked = ( 'mcc'
+                      , 'fscore'
+                      , 'jcc'
+                      , 'precision'
+                      , 'recall'
+                      , 'accuracy'
+                      , 'roc_auc'
+                      , 'TN'
+                      , 'FP'
+                      , 'FN'
+                      , 'TP'
+                      , 'trainingY_neg'
+                      , 'trainingY_pos'
+                      , 'blindY_neg'
+                      , 'blindY_pos'
+                      , 'fit_time'
+                      , 'score_time'
+                      )
+score_type_ordermapD = {score_name: rank for rank, score_name in enumerate(_score_types_ranked, start = 1)}
+
+# data dependent variable: length of the blind-test input X_bts; becomes the 'test_size' column of the combined table
+bts_size     = len(X_bts)
+###############################################################################
+#%% TTS: 7030 split
+# mm_skf_scoresD = MultModelsCl(input_df = X
+#                                         , target = y
+#                                         , var_type = 'mixed'
+#                                         , skf_cv = skf_cv
+#                                         , blind_test_input_df = X_bts
+#                                         , blind_test_target = y_bts)
+
+# baseline_all = pd.DataFrame(mm_skf_scoresD)
+# baseline_all = baseline_all.T
+# #baseline_train = baseline_all.filter(like='train_', axis=1)
+# baseline_CT = baseline_all.filter(like='test_', axis=1)
+# baseline_CT.sort_values(by = ['test_mcc'], ascending = False, inplace = True)
+
+# baseline_BT = baseline_all.filter(like='bts_', axis=1)
+# baseline_BT.sort_values(by = ['bts_mcc'], ascending = False, inplace = True)
+
+# # Write csv
+# baseline_CT.to_csv(outdir_ml + gene.lower() + '_baseline_CT_allF.csv')
+# baseline_BT.to_csv(outdir_ml + gene.lower() + '_baseline_BT_allF.csv')
+
+#================
+# Shared helpers
+#================
+# Rows kept from a raw score table: blind-test (bts_*) and CV (test_*) scores,
+# timings (*_time), the TN/FP/FN/TP counts and the target class counts
+# (*_neg / *_pos).
+score_rows_regex = 'bts_.*|test_.*|.*_time|TN|FP|FN|TP|.*_neg|.*_pos'
+
+def tidy_scores(all_scores, resampling, training_size, n_features):
+    """
+    Turn a raw score table into the layout used for the combined csv.
+
+    Keeps the rows whose index label matches score_rows_regex, moves the
+    index into an 'original_names' column, tags every row as blind test
+    ('BT') or cross-validation ('CV'), strips the 'bts_'/'test_' prefix into
+    'score_type', sorts by data source and by the rank in
+    score_type_ordermapD, and appends the descriptive columns.
+
+    all_scores    : pd.DataFrame indexed by score name; its columns are
+                    carried over unchanged (presumably one per classifier)
+    resampling    : label written to the 'resampling' column
+    training_size : value written to the 'training_size' column
+    n_features    : value written to the 'n_features' column
+
+    Returns the tidy pd.DataFrame.
+    Exits the script (via sys.exit with a message) when a score type listed
+    in score_type_ordermapD is missing from the table.
+    """
+    tidy = all_scores.filter(regex = score_rows_regex, axis = 0)
+    tidy = tidy.reset_index()
+    tidy.rename(columns = {'index': 'original_names'}, inplace = True)
+
+    # Indicate whether BT or CV: a name containing 'bts_' is a blind-test score
+    is_bt = tidy['original_names'].str.contains('bts_', regex = False)
+    tidy['data_source'] = is_bt.map({True: 'BT', False: 'CV'})
+
+    tidy['score_type'] = tidy['original_names'].str.replace('bts_|test_', '', regex = True)
+
+    # Every score type that has a rank must be present, otherwise the
+    # ordering (and the layout of the csv) cannot be trusted
+    if set(score_type_ordermapD).issubset(set(tidy['score_type'])):
+        print('\nPASS: sorting df by score that is mapped onto the order I want')
+        tidy['score_order'] = tidy['score_type'].map(score_type_ordermapD)
+        tidy.sort_values(by = ['data_source', 'score_order'], ascending = [True, True], inplace = True)
+    else:
+        sys.exit('\nFAIL: could not sort df as score mapping for ordering failed')
+
+    # add cols: specific
+    tidy['resampling']     = resampling
+    tidy['training_size']  = training_size
+
+    # add cols: common
+    tidy['n_features'] = n_features
+    return tidy
+
+def run_resampling(input_df, target, resampling):
+    """
+    Run MultModelsCl on one training set and tidy its scores.
+
+    The blind-test data (X_bts, y_bts) and the CV splitter (skf_cv) are the
+    same for every call; only the training data changes.
+
+    input_df   : training features
+    target     : training target
+    resampling : label of the resampling method, e.g. 'none', 'smnc'
+
+    Returns (raw score table, tidy score table), both pd.DataFrame.
+    """
+    scores_mmD = MultModelsCl(input_df = input_df
+                    , target = target
+                    , var_type = 'mixed'
+                    , skf_cv = skf_cv
+                    , blind_test_input_df = X_bts
+                    , blind_test_target = y_bts
+                    , add_cm = True 
+                    , add_yn = True)
+
+    all_scores = pd.DataFrame(scores_mmD)
+    tidy = tidy_scores(all_scores
+                    , resampling = resampling
+                    , training_size = len(input_df)
+                    , n_features = len(input_df.columns))
+    return all_scores, tidy
+
+#================
+# Baseline
+# No resampling
+#================
+baseline_all_scores, baseline_all = run_resampling(X, y, 'none')
+
+###############################################################################
+#%% SMOTE NC: Oversampling [Numerical + categorical]
+#================
+# Baseline
+# SMOTE NC
+#================
+smnc_all_scores, smnc_all = run_resampling(X_smnc, y_smnc, 'smnc')
+
+###############################################################################
+#%% ROS: Numerical + categorical
+#================
+# Baseline
+# ROS
+#================
+ros_all_scores, ros_all = run_resampling(X_ros, y_ros, 'ros')
+
+###############################################################################
+#%% RUS: Numerical + categorical
+#================
+# Baseline
+# RUS
+#================
+rus_all_scores, rus_all = run_resampling(X_rus, y_rus, 'rus')
+
+###############################################################################
+#%% ROS + RUS Combined: Numerical + categorical
+#================
+# Baseline
+# ROUC
+#================
+rouC_all_scores, rouC_all = run_resampling(X_rouC, y_rouC, 'rouC')
+
+
+
+
+
+
+###############################################################################
+#%% COMBINING all resampling dfs
+#================
+# Combine all
+# https://stackoverflow.com/questions/39862654/pandas-concat-of-multiple-data-frames-using-only-common-columns
+#================
+dfs_combine = [baseline_all, smnc_all, ros_all, rus_all, rouC_all ]
+
+# Largest row and column counts among the dfs. The checks further down can
+# only pass when every df has exactly these dimensions.
+dfs_nrows = max(len(df) for df in dfs_combine)
+dfs_ncols = max(len(df.columns) for df in dfs_combine)
+
+expected_nrows = len(dfs_combine) * dfs_nrows
+expected_ncols = dfs_ncols
+
+# Columns present in every df, listed in the column order of the first df.
+# Taking the list straight from a set would let the column order of the csv
+# depend on string hashing, which can differ from one run to the next.
+shared_cols = set.intersection(*(set(df.columns) for df in dfs_combine))
+common_cols = [col for col in dfs_combine[0].columns if col in shared_cols]
+
+if len(common_cols) == dfs_ncols :
+    combined_baseline = pd.concat([df[common_cols] for df in dfs_combine], ignore_index=True)
+    resampling_methods = combined_baseline[['resampling', 'training_size']]
+    resampling_methods = resampling_methods.drop_duplicates()
+    print('\nConcatenating dfs with different resampling methods:', tts_split
+          , '\nNo. of dfs combining:', len(dfs_combine)
+          , '\nThe sampling methods are:'
+          , '\n', resampling_methods)
+    if len(combined_baseline) == expected_nrows  and len(combined_baseline.columns) == expected_ncols:
+        print('\nPASS:', len(dfs_combine), 'dfs successfully combined'
+              , '\nnrows in combined_df:', len(combined_baseline)
+              , '\nncols in combined_df:', len(combined_baseline.columns))
+    else:
+        print('\nFAIL: concatenating failed'
+              , '\nExpected nrows:', expected_nrows
+              , '\nGot:', len(combined_baseline)
+              , '\nExpected ncols:', expected_ncols
+              , '\nGot:', len(combined_baseline.columns))
+        # A bare sys.exit() reports status 0 (success); exit non-zero so a
+        # calling shell or batch loop can see that this run failed
+        sys.exit(1)
+else:
+    sys.exit('\nConcatenting dfs not possible,check numbers ')
+    
+# Add further column indications
+combined_baseline['test_size'] = bts_size
+combined_baseline['tts_split'] = tts_split
+
+# TODO:
+# ADD y target ratio for all 
+
+# # row bind 
+# if all(ll((baseline_all.columns == baseline_GN.columns == baseline_STR.columns)):
+#     print('\nPASS:colnames match, proceeding to rowbind')
+#     comb_df = pd.concat()], axis = 0, ignore_index = True ) combined_baseline
+###############################################################################
+#====================
+# Write output file: the combined table goes to outFile as csv, without the row index
+#====================
+combined_baseline.to_csv(outFile, index = False)
+print('\nFile successfully written:', outFile)
+###############################################################################
--- a/scripts/ml/run_fg.py
+++ b/scripts/ml/run_fg.py
@ -30,9 +30,9 @@ os.chdir( homedir + '/git/LSHTM_analysis/scripts/ml/')
 #==================
 # Import data
 #==================
-from ml_data_dissected import *
+from ml_data_fg import *
 setvars(gene,drug)
-from ml_data_dissected import *
+from ml_data_fg import *

 # from YC run_all_ML: run locally
 #from UQ_yc_RunAllClfs import run_all_ML
@ -60,7 +60,7 @@ outFile = outdir_ml + gene.lower() + '_baseline_FG.csv'
 #==================
 # other vars
 #==================
-tts_split_name  = 'original'
+tts_split  = 'original'
 resampling      = 'none'

 ###############################################################################
@ -177,7 +177,7 @@ else:
    
 baseline_EV['feature_group'] = feature_gp_nameEV
 baseline_EV['resampling']    = resampling
-baseline_EV['tts_split']     = tts_split_name
+baseline_EV['tts_split']     = tts_split
 baseline_EV['n_features']    = n_featuresEV
 ###############################################################################
 #================
@ -221,7 +221,7 @@ else:
    
 baseline_GN['feature_group'] = feature_gp_nameGN
 baseline_GN['resampling'] = resampling
-baseline_GN['tts_split']     = tts_split_name
+baseline_GN['tts_split']     = tts_split
 baseline_GN['n_features']    = n_featuresGN
 ###############################################################################
 #all_featuresN   = X_evolFN + X_structural_FN + X_genomicFN
@ -268,7 +268,7 @@ else:
    
 baseline_STR['feature_group'] = feature_gp_nameSTR
 baseline_STR['resampling'] = resampling
-baseline_STR['tts_split']     = tts_split_name
+baseline_STR['tts_split']     = tts_split
 baseline_STR['n_features']    = n_featuresSTR
 ##############################################################################
 #================
@ -312,7 +312,7 @@ else:
    
 baseline_STB['feature_group'] = feature_gp_nameSTB
 baseline_STB['resampling'] = resampling
-baseline_STB['tts_split']     = tts_split_name
+baseline_STB['tts_split']     = tts_split
 baseline_STB['n_features']    = n_featuresSTB
 ###############################################################################
 #================
@ -356,7 +356,7 @@ else:
    
 baseline_AFF['feature_group'] = feature_gp_nameAFF
 baseline_AFF['resampling'] = resampling
-baseline_AFF['tts_split']     = tts_split_name
+baseline_AFF['tts_split']     = tts_split
 baseline_AFF['n_features']    = n_featuresAFF
 ###############################################################################
 #================
@ -400,7 +400,7 @@ else:
    
 baseline_RES['feature_group'] = feature_gp_nameRES
 baseline_RES['resampling'] = resampling
-baseline_RES['tts_split']     = tts_split_name
+baseline_RES['tts_split']     = tts_split
 baseline_RES['n_features']    = n_featuresRES
 ###############################################################################
 #================
@ -446,7 +446,7 @@ else:
    
 baseline_RNAA['feature_group'] = feature_gp_nameRNAA
 baseline_RNAA['resampling'] = resampling
-baseline_RNAA['tts_split']     = tts_split_name
+baseline_RNAA['tts_split']     = tts_split
 baseline_RNAA['n_features']    = n_featuresRNAA
 ###############################################################################
 #================
@ -492,7 +492,7 @@ else:
    
 baseline_SNAA['feature_group'] = feature_gp_nameSNAA
 baseline_SNAA['resampling']    = resampling
-baseline_SNAA['tts_split']     = tts_split_name
+baseline_SNAA['tts_split']     = tts_split
 baseline_SNAA['n_features']    = n_featuresSNAA
 ###############################################################################
 #%% COMBINING all FG dfs
@ -525,7 +525,7 @@ if len(common_cols) == dfs_ncols :
    combined_FG_baseline = pd.concat([df[common_cols] for df in dfs_combine], ignore_index=True)
    fgs = combined_FG_baseline[['feature_group', 'n_features']]
    fgs = fgs.drop_duplicates()
-    print('\nConcatenating dfs with feature groups after ML analysis (sampling type):' 
+    print('\nConcatenating dfs with feature groups after ML analysis:' 
          , '\nNo. of dfs combining:', len(dfs_combine)
          , '\nSampling type:', resampling
          , '\nThe feature groups are:'