From e55906d2c78168294070c2db0819dbad2165f050 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Fri, 29 Jul 2022 00:12:43 +0100 Subject: [PATCH] saving work --- scripts/ml/combined_model/cm_logo_skf.py | 13 +++++++------ scripts/ml/ml_functions/SplitTTS.py | 4 +++- scripts/ml/ml_functions/ml_data_combined.py | 2 +- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/scripts/ml/combined_model/cm_logo_skf.py b/scripts/ml/combined_model/cm_logo_skf.py index f57ae43..fb1f5d7 100755 --- a/scripts/ml/combined_model/cm_logo_skf.py +++ b/scripts/ml/combined_model/cm_logo_skf.py @@ -139,9 +139,9 @@ def CMLogoSkf(cm_input_df # else: # file_suffix = file_suffix - outFile = output_dir + str(n_tr_genes+1) + "genes_" + tts_split_type + '_' + file_suffix + ".csv" + #outFile = output_dir + str(n_tr_genes+1) + "genes_" + tts_split_type + '_' + file_suffix + ".csv" - print(outFile) + #print(outFile) #------- # training @@ -175,6 +175,7 @@ def CMLogoSkf(cm_input_df , '\nTEST Target dim:' , cm_bts_y.shape) print("Running Multiple models on LOGO with SKF") + #%%:Running Multiple models on LOGO with SKF # cD3_v2 = MultModelsCl_logo_skf(input_df = cm_X # two func were identical excpet for name cD3_v2 = MultModelsCl(input_df = cm_X @@ -203,11 +204,11 @@ def CMLogoSkf(cm_input_df #=============== # Complete Data #=============== -CMLogoSkf(cm_input_df = combined_df,file_suffix = "complete") -CMLogoSkf(cm_input_df = combined_df, std_gene_omit=['alr'], file_suffix = "complete") +#CMLogoSkf(cm_input_df = combined_df,file_suffix = "complete") +#CMLogoSkf(cm_input_df = combined_df, std_gene_omit=['alr'], file_suffix = "complete") #=============== # Actual Data #=============== -CMLogoSkf(cm_input_df = combined_df_actual, file_suffix = "actual") -CMLogoSkf(cm_input_df = combined_df_actual, std_gene_omit=['alr'], file_suffix = "actual") +#CMLogoSkf(cm_input_df = combined_df_actual, file_suffix = "actual") +#CMLogoSkf(cm_input_df = combined_df_actual, std_gene_omit=['alr'], file_suffix = "actual") diff --git a/scripts/ml/ml_functions/SplitTTS.py b/scripts/ml/ml_functions/SplitTTS.py index ad417e4..8e1ed56 100644 --- a/scripts/ml/ml_functions/SplitTTS.py +++ b/scripts/ml/ml_functions/SplitTTS.py @@ -269,7 +269,9 @@ def split_tts(ml_input_data #k_sm = 5 # default k_sm = k_smote - sm_nc = SMOTENC(categorical_features=categorical_colind, k_neighbors = k_sm, **rs, **njobs) + sm_nc = SMOTENC(categorical_features=categorical_colind + , k_neighbors = k_sm + , **rs, **njobs) X_smnc, y_smnc = sm_nc.fit_resample(X, y) print('\nSMOTE_NC OverSampling\n', Counter(y_smnc)) print(X_smnc.shape) diff --git a/scripts/ml/ml_functions/ml_data_combined.py b/scripts/ml/ml_functions/ml_data_combined.py index 7dca351..f57f557 100644 --- a/scripts/ml/ml_functions/ml_data_combined.py +++ b/scripts/ml/ml_functions/ml_data_combined.py @@ -54,6 +54,7 @@ expected_ncols if len(common_cols) == expected_ncols: print('\nProceeding to combine based on common cols (n):', len(common_cols)) combined_df = pd.concat([df[common_cols] for df in dfs_combine], ignore_index = False) + print('\nSuccessfully combined dfs:' , '\nNo. of dfs combined:', len(dfs_combine) , '\nDim of combined df:', combined_df.shape) @@ -76,7 +77,6 @@ cm_input_df5 = combined_df[~combined_df['gene_name'].isin(omit_gene_alr)] combined_df['dst'].isna().sum() combined_df['dst'].value_counts().sum() - combined_df_actual = combined_df[~combined_df['dst'].isna()] ##############################################################################