diff --git a/UQ_ML_data.py b/UQ_ML_data.py index 27a6bd2..80c327b 100644 --- a/UQ_ML_data.py +++ b/UQ_ML_data.py @@ -27,12 +27,11 @@ def setvars(gene,drug): from imblearn.under_sampling import EditedNearestNeighbours from imblearn.under_sampling import RepeatedEditedNearestNeighbours - #%% REMOVE once config is set up - #from UQ_MultModelsCl import MultModelsCl + #%% FOR LATER: Combine ED logo data + #%% FOR LATER: active aa site annotations + ########################################################################### rs = {'random_state': 42} njobs = {'n_jobs': 10} - - #%% homedir = os.path.expanduser("~") #============== @@ -70,45 +69,22 @@ def setvars(gene,drug): # my_df['active_aa_pos'].dtype # -- CHECK script -- imports.py - #%%============================================================================ - #%% IMPUTE values for OR [check script for exploration: UQ_or_imputer] - #or_cols = ['or_mychisq', 'log10_or_mychisq', 'or_fisher'] - sel_cols = ['mutationinformation', 'or_mychisq', 'log10_or_mychisq'] - or_cols = ['or_mychisq', 'log10_or_mychisq'] - - print("count of NULL values before imputation\n") - my_df[or_cols].isnull().sum() - - my_dfI = pd.DataFrame(index = my_df['mutationinformation'] ) - - - my_dfI = pd.DataFrame(KNN(n_neighbors= 5, weights="uniform").fit_transform(my_df[or_cols]) - , index = my_df['mutationinformation'] - , columns = or_cols ) - my_dfI.columns = ['or_rawI', 'logorI'] - my_dfI.columns - my_dfI = my_dfI.reset_index(drop = False) # prevents old index from being added as a column - my_dfI.head() - - # merge with original based on index - my_df['index_bm'] = my_df.index - mydf_imputed = pd.merge(my_df - , my_dfI - , on = 'mutationinformation') - mydf_imputed = mydf_imputed.set_index(['index_bm']) - - my_df['log10_or_mychisq'].isna().sum() - mydf_imputed['log10_or_mychisq'].isna().sum() - mydf_imputed['logorI'].isna().sum() - - len(my_df.columns) - len(mydf_imputed.columns) - + 
########################################################################### + #%% Add lineage calculation columns + #FIXME: Check if this can be imported from config? + total_mtblineage_u = 8 + lineage_colnames = ['lineage_list_all', 'lineage_count_all', 'lineage_count_unique', 'lineage_list_unique', 'lineage_multimode'] + #bar = my_df[lineage_colnames] + my_df['lineage_proportion'] = my_df['lineage_count_unique']/my_df['lineage_count_all'] + my_df['dist_lineage_proportion'] = my_df['lineage_count_unique']/total_mtblineage_u + ########################################################################### #%% AA property change - + #-------------------- # Water prop change + #-------------------- my_df['water_change'] = my_df['wt_prop_water'] + str('_to_') + my_df['mut_prop_water'] my_df['water_change'].value_counts() + water_prop_changeD = { 'hydrophobic_to_neutral' : 'change' , 'hydrophobic_to_hydrophobic' : 'no_change' @@ -123,8 +99,10 @@ def setvars(gene,drug): my_df['water_change'] = my_df['water_change'].map(water_prop_changeD) my_df['water_change'].value_counts() - + + #-------------------- # Polarity change + #-------------------- my_df['polarity_change'] = my_df['wt_prop_polarity'] + str('_to_') + my_df['mut_prop_polarity'] my_df['polarity_change'].value_counts() @@ -148,8 +126,10 @@ def setvars(gene,drug): my_df['polarity_change'] = my_df['polarity_change'].map(polarity_prop_changeD) my_df['polarity_change'].value_counts() - + + #-------------------- # Electrostatics change + #-------------------- my_df['electrostatics_change'] = my_df['wt_calcprop'] + str('_to_') + my_df['mut_calcprop'] my_df['electrostatics_change'].value_counts() @@ -174,8 +154,10 @@ def setvars(gene,drug): my_df['electrostatics_change'] = my_df['electrostatics_change'].map(calc_prop_changeD) my_df['electrostatics_change'].value_counts() - - # Create a combined column summarising these three cols + + #-------------------- + # Summary change: Create a combined column summarising these three 
cols + #-------------------- detect_change = 'change' check_prop_cols = ['water_change', 'polarity_change', 'electrostatics_change'] #my_df['aa_prop_change'] = (my_df.values == detect_change).any(1).astype(int) @@ -188,24 +170,56 @@ def setvars(gene,drug): my_df['aa_prop_change'].value_counts() my_df['aa_prop_change'].dtype - #%% Add lineage calc - total_mtblineage_u = 8 - - lineage_colnames = ['lineage_list_all', 'lineage_count_all', 'lineage_count_unique', 'lineage_list_unique', 'lineage_multimode'] - #bar = my_df[lineage_colnames] - my_df['lineage_proportion'] = my_df['lineage_count_unique']/my_df['lineage_count_all'] - my_df['dist_lineage_proportion'] = my_df['lineage_count_unique']/total_mtblineage_u - - #%% Combine mmCSM_lig Data: DONE - #%% Combine PROVEAN data: DONE - #%% Combine ED logo data + + #%% IMPUTE values for OR [check script for exploration: UQ_or_imputer] + #or_cols = ['or_mychisq', 'log10_or_mychisq', 'or_fisher'] + sel_cols = ['mutationinformation', 'or_mychisq', 'log10_or_mychisq'] + or_cols = ['or_mychisq', 'log10_or_mychisq'] + + print("count of NULL values before imputation\n") + print(my_df[or_cols].isnull().sum()) + + my_dfI = pd.DataFrame(index = my_df['mutationinformation'] ) + + + my_dfI = pd.DataFrame(KNN(n_neighbors= 5, weights="uniform").fit_transform(my_df[or_cols]) + , index = my_df['mutationinformation'] + , columns = or_cols ) + my_dfI.columns = ['or_rawI', 'logorI'] + my_dfI.columns + my_dfI = my_dfI.reset_index(drop = False) # prevents old index from being added as a column + my_dfI.head() + print("count of NULL values AFTER imputation\n") + print(my_dfI.isnull().sum()) + + #------------------------------------------- + # OR df Merge: with original based on index + #------------------------------------------- + my_df['index_bm'] = my_df.index + mydf_imputed = pd.merge(my_df + , my_dfI + , on = 'mutationinformation') + mydf_imputed = mydf_imputed.set_index(['index_bm']) + + my_df['log10_or_mychisq'].isna().sum() + 
mydf_imputed['log10_or_mychisq'].isna().sum() + mydf_imputed['logorI'].isna().sum() + + len(my_df.columns) + len(mydf_imputed.columns) + + #----------------------------------------- + # REASSIGN my_df after imputing OR values + #----------------------------------------- + my_df = mydf_imputed.copy() + #%%######################################################################## + #========================== + # Data for ML + #========================== + my_df_ml = my_df.copy() + #%% Masking columns (mCSM-lig, mCSM-NA, mCSM-ppi2) values for lig_dist >10 - - # get logic from upstream! - #my_df_ml = my_df.copy() - my_df_ml = mydf_imputed.copy() - my_df_ml['mutationinformation'][my_df['ligand_distance']>10].value_counts() my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts() my_df_ml.groupby(['mutationinformation'])['ligand_distance'].apply(lambda x: (x>10)).value_counts() @@ -213,7 +227,10 @@ def setvars(gene,drug): my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), 'ligand_affinity_change'] = 0 (my_df_ml['ligand_affinity_change'] == 0).sum() - #%%============================================================================ + #%%######################################################################## + #========================== + # BLIND test set + #========================== # Separate blind test set my_df_ml[drug].isna().sum() @@ -227,13 +244,14 @@ def setvars(gene,drug): training_df[drug].value_counts() training_df['dst_mode'].value_counts() - #%% Build X + #%% Build X: input for ML common_cols_stabiltyN = ['ligand_distance' , 'ligand_affinity_change' , 'duet_stability_change' , 'ddg_foldx' , 'deepddg' - , 'ddg_dynamut2'] + , 'ddg_dynamut2' + , 'mmcsm_lig'] foldX_cols = ['contacts' , 'electro_rr', 'electro_mm', 'electro_sm', 'electro_ss' @@ -250,36 +268,45 @@ def setvars(gene,drug): , 'rd_values'] X_evolFN = ['consurf_score' - , 'snap2_score'] - - # quick inspection which lineage to use: - #foo = my_df_ml[['lineage', 
'lineage_count_all', 'lineage_count_unique']] + , 'snap2_score' + , 'provean_score'] - X_genomicFN = ['maf' - # , 'or_mychisq' - # , 'or_logistic' - # , 'or_fisher' - # , 'pval_fisher' - #, 'lineage' - #, 'lineage_count_all' - #, 'lineage_count_unique' - ] + X_genomic_mafor = ['maf' + , 'logorI' + # , 'or_rawI' + # , 'or_mychisq' + # , 'or_logistic' + # , 'or_fisher' + # , 'pval_fisher' + ] + + X_genomic_linegae = ['lineage_proportion' + , 'dist_lineage_proportion' + #, 'lineage' # could be included as a category but it has L2;L4 formatting + , 'lineage_count_all' + , 'lineage_count_unique' + ] + + X_genomicFN = X_genomic_mafor+X_genomic_linegae #%% Construct numerical and categorical column names - # numerical feature names numerical_FN = common_cols_stabiltyN + foldX_cols + X_strFN + X_evolFN + X_genomicFN #categorical feature names categorical_FN = ['ss_class' - # , 'wt_prop_water' - # , 'lineage_labels' # misleading if using merged_df3 - # , 'mut_prop_water' - # , 'wt_prop_polarity' - # , 'mut_prop_polarity' - # , 'wt_calcprop' - # , 'mut_calcprop' - #, 'active_aa_pos' + # , 'wt_prop_water' + # , 'mut_prop_water' + # , 'wt_prop_polarity' + # , 'mut_prop_polarity' + # , 'wt_calcprop' + # , 'mut_calcprop' + , 'aa_prop_change' + , 'electrostatics_change' + , 'polarity_change' + , 'water_change' + , 'drtype_mode_labels' # beware then you can use it to predict +# , 'active_aa_pos' # TODO? 
] #%% extracting dfs based on numerical, categorical column names diff --git a/UQ_TODO_categorical_classification_columns.py b/UQ_TODO_categorical_classification_columns.py index d8c9ebf..3bcaf02 100644 --- a/UQ_TODO_categorical_classification_columns.py +++ b/UQ_TODO_categorical_classification_columns.py @@ -94,16 +94,17 @@ check = ['mutationinformation', 'wild_type', 'water_change', 'polarity_change', check_prop_cols = ['water_change', 'polarity_change', 'electrostatics_change'] foo = my_df[check] -foo['new'] = (foo.values == detect_change).any(1).astype(int) -#foo['new2'] = foo[check_prop_cols].applymap(lambda x: detect_change in x).any(1).astype(int) # lose match so alwasys 1 -foo['new3'] = (foo[check_prop_cols].values == detect_change).any(1).astype(int) +foo['aa_prop_change'] = (foo.values == detect_change).any(1).astype(int) +#foo['aa_prop_change3'] = foo[check_prop_cols].applymap(lambda x: detect_change in x).any(1).astype(int) # loose match so always 1 +foo['aa_prop_change2'] = (foo[check_prop_cols].values == detect_change).any(1).astype(int) -all(foo['new'] == foo['new3']) +all(foo['aa_prop_change'] == foo['aa_prop_change2']) #%%lineage -lineage_colnames = ['lineage', 'lineage_list_all', 'lineage_count_all', 'lineage_count_unique', 'lineage_list_unique', 'lineage_multimode'] +# snp freq and lineage_count_all differ because same mut can be in more than 1 lineage +lineage_colnames = ['snp_frequency', 'lineage', 'lineage_list_all', 'lineage_count_all', 'lineage_count_unique', 'lineage_list_unique', 'lineage_multimode'] bar = my_df[lineage_colnames] - tot_lineage_u = 8 +bar['lineage'].value_counts() bar['lineage_proportion'] = bar['lineage_count_unique']/bar['lineage_count_all'] bar['dist_lineage_proportion'] = bar['lineage_count_unique']/tot_lineage_u \ No newline at end of file