diff --git a/scripts/count_vars_ML.R b/scripts/count_vars_ML.R index 461845f..059cbc6 100644 --- a/scripts/count_vars_ML.R +++ b/scripts/count_vars_ML.R @@ -10,7 +10,8 @@ source("~/git/LSHTM_analysis/config/rpob.R") ############################# # GET the actual merged dfs ############################# -source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") +#source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") +source("~/git/LSHTM_analysis/scripts/plotting/get_ml_dfs.R") ############################# # Output files: merged data diff --git a/scripts/functions/combining_dfs_plotting.R b/scripts/functions/combining_dfs_plotting.R index d82a7c0..d901e9a 100644 --- a/scripts/functions/combining_dfs_plotting.R +++ b/scripts/functions/combining_dfs_plotting.R @@ -40,11 +40,12 @@ geneL_ppi2 = c("alr", "embb", "katg", "rpob") combining_dfs_plotting <- function( my_df_u - , gene_metadata - #, gene # ADDED - , lig_dist_colname = '' - , lig_dist_cutoff = ''){ - + , gene_metadata + #, gene # ADDED + , lig_dist_colname = '' + , lig_dist_cutoff = '' + , plotting = TRUE){ + # counting NAs in AF, OR cols # or_mychisq if (identical(sum(is.na(my_df_u$or_mychisq)) @@ -117,7 +118,7 @@ combining_dfs_plotting <- function( my_df_u , y = my_df_u , by = merging_cols , all.y = T) - #, all.x = T) + #, all.x = T) cat("\nDim of merged_df2: ", dim(merged_df2)) @@ -182,13 +183,13 @@ combining_dfs_plotting <- function( my_df_u ,"\nExpected no. of rows: ", expected_nrows_df2 ,"\nGot no. of rows: ", nrow(merged_df2)) }else{ cat("\nFAIL: nrow(merged_df2) is NOT as expected even after accounting for discrepancy" - , "\nExpected no. of rows after merge: ", expected_nrows_df2 - , "\nGot no. of rows: ", nrow(merged_df2) - , "\nQuitting!") + , "\nExpected no. of rows after merge: ", expected_nrows_df2 + , "\nGot no. of rows: ", nrow(merged_df2) + , "\nQuitting!") quit() - } - + } + } # Quick formatting: ordering df and pretty labels @@ -332,7 +333,7 @@ combining_dfs_plotting <- function( my_df_u }else{ stop("Cannot generate merged_df3") } -################################################################## + ################################################################## head(merged_df3$position); tail(merged_df3$position) # should be sorted # sanity check @@ -357,7 +358,7 @@ combining_dfs_plotting <- function( my_df_u merged_df3[[consurf_colNew]] = as.factor(merged_df3[[consurf_colNew]]) merged_df3[[consurf_colNew]] #levels(merged_df3$consurf_outcome) = c("nsd", 1, 2, 3, 4, 5, 6, 7, 8, 9) - + merged_df2[[consurf_colNew]] = merged_df2[[consurf_colOld]] merged_df2[[consurf_colNew]] = as.factor(merged_df2[[consurf_colNew]]) merged_df2[[consurf_colNew]] @@ -378,7 +379,7 @@ combining_dfs_plotting <- function( my_df_u #---------------------------------------------- merged_df3$sensitivity = ifelse(merged_df3$dst_mode == 1, "R", "S") merged_df3$mutation_info_labels = ifelse(merged_df3$mutation_info_labels == "DM", "R", "S") - + merged_df2$sensitivity = ifelse(merged_df2$dst_mode == 1, "R", "S") merged_df2$mutation_info_labels = ifelse(merged_df2$mutation_info_labels == "DM", "R", "S") @@ -387,7 +388,7 @@ combining_dfs_plotting <- function( my_df_u check1 = all(merged_df3$mutation_info_labels == merged_df3$sensitivity) check2 = all(merged_df2$mutation_info_labels == merged_df2$sensitivity) - + if(check1 && check2){ cat("PASS: merged_df3 and merged_df2 have mutation info labels as R and S" , "\nIt also has sensitivity column" @@ -420,9 +421,9 @@ combining_dfs_plotting <- function( my_df_u # find which stability cols to average: should contain revised foldx scaled_cols_stab = c("duet_scaled" - , "deepddg_scaled" - , "ddg_dynamut2_scaled" - , "foldx_scaled_signC" # needed to get avg stability + , "deepddg_scaled" + , "ddg_dynamut2_scaled" + , "foldx_scaled_signC" # needed to get avg stability ) #----------------------------------------------- @@ -701,7 +702,7 @@ combining_dfs_plotting <- function( my_df_u # merged_df3$pos_count <-NULL merged_df3 = merged_df3[, !colnames(merged_df3)%in%c("pos_count")] head(merged_df3$pos_count) - + merged_df3 = merged_df3 %>% dplyr::add_count(position) class(merged_df3) @@ -710,7 +711,7 @@ combining_dfs_plotting <- function( my_df_u nc_change = which(colnames(merged_df3) == "n") colnames(merged_df3)[nc_change] <- "pos_count" class(merged_df3) - + #################################################################### #------------------------------------------------- # merged_df2: Rename existing pos_count @@ -726,14 +727,14 @@ combining_dfs_plotting <- function( my_df_u # already done in plotting_data #################################################################### # Choose few columns to return as plot_df - - merged_df3 = merged_df3[, colnames(merged_df3)%in%c(plotting_cols, "pos_count", "df2_pos_count_all")] - merged_df2 = merged_df2[, colnames(merged_df2)%in%c(plotting_cols, "df2_pos_count_all")] - + if (plotting){ + merged_df3 = merged_df3[, colnames(merged_df3)%in%c(plotting_cols, "pos_count", "df2_pos_count_all")] + merged_df2 = merged_df2[, colnames(merged_df2)%in%c(plotting_cols, "df2_pos_count_all")] + } #################################################################### return(list( merged_df2 , merged_df3 )) -cat("\nEnd of combining_dfs_plotting.R script") + cat("\nEnd of combining_dfs_plotting.R script") } \ No newline at end of file diff --git a/scripts/ml/ml_functions/GetMLData_v2.py b/scripts/ml/ml_functions/GetMLData_v2.py new file mode 100755 index 0000000..180d586 --- /dev/null +++ b/scripts/ml/ml_functions/GetMLData_v2.py @@ -0,0 +1,684 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Sun Mar 6 13:41:54 2022 + +@author: tanu +""" + +#https://stackoverflow.com/questions/51695322/compare-multiple-algorithms-with-sklearn-pipeline +import os, sys +import pandas as pd +import numpy as np +print(np.__version__) +print(pd.__version__) +import pprint as pp +from copy import deepcopy +from collections import Counter +from sklearn.impute import KNNImputer as KNN +from imblearn.over_sampling import RandomOverSampler +from imblearn.under_sampling import RandomUnderSampler +from imblearn.over_sampling import SMOTE +from sklearn.datasets import make_classification +from imblearn.combine import SMOTEENN +from imblearn.combine import SMOTETomek + +from imblearn.over_sampling import SMOTENC +from imblearn.under_sampling import EditedNearestNeighbours +from imblearn.under_sampling import RepeatedEditedNearestNeighbours + +from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, average_precision_score, recall_score +from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef, jaccard_score, classification_report + +from sklearn.model_selection import train_test_split, cross_validate, cross_val_score +from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RepeatedKFold + +from sklearn.pipeline import Pipeline, make_pipeline +import argparse +import re + + +def getmldata(gene, drug + , data_combined_model = False + , use_or = False + , omit_all_genomic_features = False + , write_maskfile = False + , write_outfile = False): + + #%% FOR LATER: Combine ED logo data + #%% constructuing genomic feature group + #======================== + # FG: Genomic features + #======================== + X_gn_maf_Fnum = ['maf'] + #X_gn_or_Fnum = ['logorI', 'or_rawI', 'or_mychisq', 'or_logistic', 'or_fisher', 'pval_fisher'] + + X_gn_linegae_Fnum = ['lineage_proportion' + , 'dist_lineage_proportion' + #, 'lineage' # could be included as a category but it has L2;L4 formatting + , 'lineage_count_all' + , 'lineage_count_unique'] + + # X_gn_Fcat = ['drtype_mode_labels' # beware then you can't use it to predict [USED it for uq_v1, not v2] + # #, 'gene_name'] # will be required for the combined stuff + #X_gn_Fcat = [] + + if data_combined_model: + X_geneF = ['gene_name'] + else: + X_geneF = [] + + if use_or: + X_gn_or_Fnum = ['logorI'] + else: + X_gn_or_Fnum = [] + + if omit_all_genomic_features: + print('\nOmitting all genomic features (n):', len(X_gn_maf_Fnum) + len(X_gn_or_Fnum) + len(X_gn_linegae_Fnum) + len(X_geneF)) + X_genomicFN = [] + if use_or: + sys.exit('\nError: omitting genomic feature and using odds ratio are mutually exclusive') + else: + X_genomicFN = X_gn_maf_Fnum + X_gn_or_Fnum + X_gn_linegae_Fnum + X_geneF + + #%% + ########################################################################### + + homedir = os.path.expanduser("~") + + geneL_basic = ['pnca'] + geneL_na = ['gid'] + geneL_na_ppi2 = ['rpob'] + geneL_ppi2 = ['alr', 'embb', 'katg'] + + #num_type = ['int64', 'float64'] + num_type = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64'] + cat_type = ['object', 'bool'] + + #============== + # directories + #============== + datadir = homedir + '/git/Data/' + indir = datadir + drug + '/input/' + outdir = datadir + drug + '/output/' + #outdir_ml = outdir + 'ml/' + outdir_ml = homedir + '/git/LSHTM_ML/output/' + + #========================== + # outfile for ML training: + #========================== + outFile_ml = outdir_ml + gene.lower() + '_training_data.csv' + + outFile_mask_ml = outdir_ml + 'genes/mask_check/' + gene.lower() + '_mask_check.csv' + + #======= + # input + #======= + + #--------- + # File 1 + #--------- + infile_ml1 = outdir + gene.lower() + '_merged_df3.csv' + #infile_ml2 = outdir + gene.lower() + '_merged_df2.csv' + + my_features_df = pd.read_csv(infile_ml1, index_col = 0) + my_features_df = my_features_df .reset_index(drop = True) + my_features_df.index + + my_features_df.dtypes + mycols = my_features_df.columns + + #--------- + # File 2 + #--------- + infile_aaindex = outdir + 'aa_index/' + gene.lower() + '_aa.csv' + aaindex_df = pd.read_csv(infile_aaindex, index_col = 0) + aaindex_df.dtypes + + #----------- + # check for non-numerical columns + #----------- + if any(aaindex_df.dtypes==object): + print('\naaindex_df contains non-numerical data') + + aaindex_df_object = aaindex_df.select_dtypes(include = cat_type) + print('\nTotal no. of non-numerial columns:', len(aaindex_df_object.columns)) + + expected_aa_ncols = len(aaindex_df.columns) - len(aaindex_df_object.columns) + + #----------- + # Extract numerical data only + #----------- + print('\nSelecting numerical data only') + aaindex_df = aaindex_df.select_dtypes(include = num_type) + + #--------------------------- + # aaindex: sanity check 1 + #--------------------------- + if len(aaindex_df.columns) == expected_aa_ncols: + print('\nPASS: successfully selected numerical columns only for aaindex_df') + else: + print('\nFAIL: Numbers mismatch' + , '\nExpected ncols:', expected_aa_ncols + , '\nGot:', len(aaindex_df.columns)) + + #--------------- + # check for NA + #--------------- + print('\nNow checking for NA in the remaining aaindex_cols') + c1 = aaindex_df.isna().sum() + c2 = c1.sort_values(ascending=False) + print('\nCounting aaindex_df cols with NA' + , '\nncols with NA:', sum(c2>0), 'columns' + , '\nDropping these...' + , '\nOriginal ncols:', len(aaindex_df.columns) + ) + aa_df = aaindex_df.dropna(axis=1) + + print('\nRevised df ncols:', len(aa_df.columns)) + + c3 = aa_df.isna().sum() + c4 = c3.sort_values(ascending=False) + + print('\nChecking NA in revised df...') + + if sum(c4>0): + sys.exit('\nFAIL: aaindex_df still contains cols with NA, please check and drop these before proceeding...') + else: + print('\nPASS: cols with NA successfully dropped from aaindex_df' + , '\nProceeding with combining aa_df with other features_df') + + #--------------------------- + # aaindex: sanity check 2 + #--------------------------- + expected_aa_ncols2 = len(aaindex_df.columns) - sum(c2>0) + if len(aa_df.columns) == expected_aa_ncols2: + print('\nPASS: ncols match' + , '\nExpected ncols:', expected_aa_ncols2 + , '\nGot:', len(aa_df.columns)) + else: + print('\nFAIL: Numbers mismatch' + , '\nExpected ncols:', expected_aa_ncols2 + , '\nGot:', len(aa_df.columns)) + + # Important: need this to identify aaindex cols + aa_df_cols = aa_df.columns + print('\nTotal no. of columns in clean aa_df:', len(aa_df_cols)) + + ############################################################################### + #%% Combining my_features_df and aaindex_df + #=========================== + # Merge my_df + aaindex_df + #=========================== + + if aa_df.columns[aa_df.columns.isin(my_features_df.columns)] == my_features_df.columns[my_features_df.columns.isin(aa_df.columns)]: + print('\nMerging on column: mutationinformation') + + if len(my_features_df) == len(aa_df): + expected_nrows = len(my_features_df) + print('\nProceeding to merge, expected nrows in merged_df:', expected_nrows) + else: + sys.exit('\nNrows mismatch, cannot merge. Please check' + , '\nnrows my_df:', len(my_features_df) + , '\nnrows aa_df:', len(aa_df)) + + #----------------- + # Reset index: mutationinformation + # Very important for merging + #----------------- + aa_df = aa_df.reset_index() + + expected_ncols = len(my_features_df.columns) + len(aa_df.columns) - 1 # for the no. of merging col + + #----------------- + # Merge: my_features_df + aa_df + #----------------- + merged_df = pd.merge(my_features_df + , aa_df + , on = 'mutationinformation') + + #--------------------------- + # aaindex: sanity check 3 + #--------------------------- + if len(merged_df.columns) == expected_ncols: + print('\nPASS: my_features_df and aa_df successfully combined' + , '\nnrows:', len(merged_df) + , '\nncols:', len(merged_df.columns)) + else: + sys.exit('\nFAIL: could not combine my_features_df and aa_df' + , '\nCheck dims and merging cols!') + + #-------- + # Reassign so downstream code doesn't need to change + #-------- + my_df = merged_df.copy() + + #%% Data: my_df + # Check if non structural pos have crept in + # IDEALLY remove from source! But for rpoB do it here + # Drop NA where numerical cols have them + if gene.lower() in geneL_na_ppi2: + #D1148 get rid of + na_index = my_df['mutationinformation'].index[my_df['mcsm_na_affinity'].apply(np.isnan)] + my_df = my_df.drop(index=na_index) + + ########################################################################### + #%% Add lineage calculation columns + #FIXME: Check if this can be imported from config? + total_mtblineage_uc = 8 + lineage_colnames = ['lineage_list_all', 'lineage_count_all', 'lineage_count_unique', 'lineage_list_unique', 'lineage_multimode'] + #bar = my_df[lineage_colnames] + my_df['lineage_proportion'] = my_df['lineage_count_unique']/my_df['lineage_count_all'] + my_df['dist_lineage_proportion'] = my_df['lineage_count_unique']/total_mtblineage_uc + ########################################################################### + #%% Active site annotation column + # change from numberic to categorical + + if my_df['active_site'].dtype in num_type: + my_df['active_site'] = my_df['active_site'].astype(object) + my_df['active_site'].dtype + #%% AA property change + #-------------------- + # Water prop change + #-------------------- + my_df['water_change'] = my_df['wt_prop_water'] + str('_to_') + my_df['mut_prop_water'] + my_df['water_change'].value_counts() + + water_prop_changeD = { + 'hydrophobic_to_neutral' : 'change' + , 'hydrophobic_to_hydrophobic' : 'no_change' + , 'neutral_to_neutral' : 'no_change' + , 'neutral_to_hydrophobic' : 'change' + , 'hydrophobic_to_hydrophilic' : 'change' + , 'neutral_to_hydrophilic' : 'change' + , 'hydrophilic_to_neutral' : 'change' + , 'hydrophilic_to_hydrophobic' : 'change' + , 'hydrophilic_to_hydrophilic' : 'no_change' + } + + my_df['water_change'] = my_df['water_change'].map(water_prop_changeD) + my_df['water_change'].value_counts() + + #-------------------- + # Polarity change + #-------------------- + my_df['polarity_change'] = my_df['wt_prop_polarity'] + str('_to_') + my_df['mut_prop_polarity'] + my_df['polarity_change'].value_counts() + + polarity_prop_changeD = { + 'non-polar_to_non-polar' : 'no_change' + , 'non-polar_to_neutral' : 'change' + , 'neutral_to_non-polar' : 'change' + , 'neutral_to_neutral' : 'no_change' + , 'non-polar_to_basic' : 'change' + , 'acidic_to_neutral' : 'change' + , 'basic_to_neutral' : 'change' + , 'non-polar_to_acidic' : 'change' + , 'neutral_to_basic' : 'change' + , 'acidic_to_non-polar' : 'change' + , 'basic_to_non-polar' : 'change' + , 'neutral_to_acidic' : 'change' + , 'acidic_to_acidic' : 'no_change' + , 'basic_to_acidic' : 'change' + , 'basic_to_basic' : 'no_change' + , 'acidic_to_basic' : 'change'} + + my_df['polarity_change'] = my_df['polarity_change'].map(polarity_prop_changeD) + my_df['polarity_change'].value_counts() + + #-------------------- + # Electrostatics change + #-------------------- + my_df['electrostatics_change'] = my_df['wt_calcprop'] + str('_to_') + my_df['mut_calcprop'] + my_df['electrostatics_change'].value_counts() + + calc_prop_changeD = { + 'non-polar_to_non-polar' : 'no_change' + , 'non-polar_to_polar' : 'change' + , 'polar_to_non-polar' : 'change' + , 'non-polar_to_pos' : 'change' + , 'neg_to_non-polar' : 'change' + , 'non-polar_to_neg' : 'change' + , 'pos_to_polar' : 'change' + , 'pos_to_non-polar' : 'change' + , 'polar_to_polar' : 'no_change' + , 'neg_to_neg' : 'no_change' + , 'polar_to_neg' : 'change' + , 'pos_to_neg' : 'change' + , 'pos_to_pos' : 'no_change' + , 'polar_to_pos' : 'change' + , 'neg_to_polar' : 'change' + , 'neg_to_pos' : 'change' + } + + my_df['electrostatics_change'] = my_df['electrostatics_change'].map(calc_prop_changeD) + my_df['electrostatics_change'].value_counts() + + #-------------------- + # Summary change: Create a combined column summarising these three cols + #-------------------- + detect_change = 'change' + check_prop_cols = ['water_change', 'polarity_change', 'electrostatics_change'] + #my_df['aa_prop_change'] = (my_df.values == detect_change).any(1).astype(int) + my_df['aa_prop_change'] = (my_df[check_prop_cols].values == detect_change).any(1).astype(int) + my_df['aa_prop_change'].value_counts() + my_df['aa_prop_change'].dtype + + my_df['aa_prop_change'] = my_df['aa_prop_change'].map({1:'change' + , 0: 'no_change'}) + + my_df['aa_prop_change'].value_counts() + my_df['aa_prop_change'].dtype + + #%% IMPUTE values for OR [check script for exploration: UQ_or_imputer] + #-------------------- + # Impute OR values + #-------------------- + #or_cols = ['or_mychisq', 'log10_or_mychisq', 'or_fisher'] + sel_cols = ['mutationinformation', 'or_mychisq', 'log10_or_mychisq'] + or_cols = ['or_mychisq', 'log10_or_mychisq'] + + print("count of NULL values before imputation\n") + print(my_df[or_cols].isnull().sum()) + + my_dfI = pd.DataFrame(index = my_df['mutationinformation'] ) + + + my_dfI = pd.DataFrame(KNN(n_neighbors=3, weights="uniform").fit_transform(my_df[or_cols]) + , index = my_df['mutationinformation'] + , columns = or_cols ) + my_dfI.columns = ['or_rawI', 'logorI'] + my_dfI.columns + my_dfI = my_dfI.reset_index(drop = False) # prevents old index from being added as a column + my_dfI.head() + print("count of NULL values AFTER imputation\n") + print(my_dfI.isnull().sum()) + + #------------------------------------------- + # OR df Merge: with original based on index + #------------------------------------------- + #my_df['index_bm'] = my_df.index + mydf_imputed = pd.merge(my_df + , my_dfI + , on = 'mutationinformation') + #mydf_imputed = mydf_imputed.set_index(['index_bm']) + + my_df['log10_or_mychisq'].isna().sum() + mydf_imputed['log10_or_mychisq'].isna().sum() + mydf_imputed['logorI'].isna().sum() # should be 0 + + len(my_df.columns) + len(mydf_imputed.columns) + + #----------------------------------------- + # REASSIGN my_df after imputing OR values + #----------------------------------------- + my_df = mydf_imputed.copy() + + if my_df['logorI'].isna().sum() == 0: + print('\nPASS: OR values imputed, data ready for ML') + else: + sys.exit('\nFAIL: something went wrong, Data not ready for ML. Please check upstream!') + + #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + #--------------------------------------- + # TODO: try other imputation like MICE + #--------------------------------------- + #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + + #%%######################################################################## + #========================== + # Data for ML + #========================== + my_df_ml = my_df.copy() + + # Build column names to mask for affinity chanhes + if gene.lower() in geneL_basic: + #X_stabilityN = common_cols_stabiltyN + gene_affinity_colnames = []# not needed as its the common ones + cols_to_mask = ['ligand_affinity_change'] + cols_to_mask_ppi2 = [] + cols_to_mask_na = [] + + if gene.lower() in geneL_ppi2: + gene_affinity_colnames = ['mcsm_ppi2_affinity', 'interface_dist'] + #X_stabilityN = common_cols_stabiltyN + geneL_ppi2_st_cols + #cols_to_mask = ['ligand_affinity_change', 'mcsm_ppi2_affinity'] + cols_to_mask = ['ligand_affinity_change'] + cols_to_mask_ppi2 = ['mcsm_ppi2_affinity'] + cols_to_mask_na = [] + + + if gene.lower() in geneL_na: + gene_affinity_colnames = ['mcsm_na_affinity'] + #X_stabilityN = common_cols_stabiltyN + geneL_na_st_cols + cols_to_mask = ['ligand_affinity_change']#, 'mcsm_na_affinity'] + cols_to_mask_ppi2 = [] + cols_to_mask_na = ['mcsm_na_affinity'] + + + if gene.lower() in geneL_na_ppi2: + gene_affinity_colnames = ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] + #X_stabilityN = common_cols_stabiltyN + geneL_na_ppi2_st_cols + #cols_to_mask = ['ligand_affinity_change', 'mcsm_na_affinity', 'mcsm_ppi2_affinity'] + cols_to_mask = ['ligand_affinity_change']#, 'mcsm_na_affinity'] + cols_to_mask_ppi2 = ['mcsm_ppi2_affinity'] + cols_to_mask_na = ['mcsm_na_affinity'] + + + #======================= + # Masking columns: + # lig_dist >10 ==> mCSM-lig AND mCSM-NA col values == 0 + # interface_dist >10 ==> mCSM-ppi2 col values == 0 + #======================= + my_df_ml['mutationinformation'][my_df_ml['ligand_distance']>10].value_counts() + my_df_ml.groupby('mutationinformation')['ligand_distance'].apply(lambda x: (x>10)).value_counts() + my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask].value_counts() + + #--------------------------- + # mask ligand affinity column where ligand distance > 10 + #--------------------------- + # mask the mcsm ligand affinity AND mcsm_na affinity columns where ligand distance > 10 + my_df_ml.loc[(my_df_ml['ligand_distance'] > 10), cols_to_mask] = 0 + + #--------------------------- + # mask the mcsm_ppi2_affinity column where interface_dist > 10 + #--------------------------- + if len(cols_to_mask_ppi2) > 0: + my_df_ml.loc[(my_df_ml['interface_dist'] > 10), cols_to_mask_ppi2] = 0 + add_cols_mask = ['interface_dist'] + cols_to_mask_ppi2 + mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask + add_cols_mask] + else: + mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask ] + + #--------------------------- + # mask the na_affinity column where nca_distance > 10 + #--------------------------- + if len(cols_to_mask_na) > 0: + my_df_ml.loc[(my_df_ml['nca_distance'] > 10), cols_to_mask_na] = 0 + add_cols_mask = ['nca_distance'] + cols_to_mask_na + mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask + add_cols_mask] + else: + mask_check = my_df_ml[['mutationinformation', 'ligand_distance'] + cols_to_mask ] + + + # sanity check: check script SANITY_CHECK_mask.py + if write_maskfile: + # write mask file for sanity check + #mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True) + + mask_check.to_csv(outdir_ml + gene.lower() + '_mask_check.csv') + + ############################################################################### + #%% Feature groups (FG): Build X for Input ML + ############################################################################ + #=========================== + # FG1: Evolutionary features + #=========================== + X_evolFN = ['consurf_score' + , 'snap2_score' + , 'provean_score'] + + ############################################################################### + #======================== + # FG2: Stability features + #======================== + #-------- + # common + #-------- + X_common_stability_Fnum = [ + 'duet_stability_change' + , 'ddg_foldx' + , 'deepddg' + , 'ddg_dynamut2' + , 'contacts'] + #-------- + # FoldX + #-------- + X_foldX_Fnum = [ 'electro_rr', 'electro_mm', 'electro_sm', 'electro_ss' + , 'disulfide_rr', 'disulfide_mm', 'disulfide_sm', 'disulfide_ss' + , 'hbonds_rr', 'hbonds_mm', 'hbonds_sm', 'hbonds_ss' + , 'partcov_rr', 'partcov_mm', 'partcov_sm', 'partcov_ss' + , 'vdwclashes_rr', 'vdwclashes_mm', 'vdwclashes_sm', 'vdwclashes_ss' + , 'volumetric_rr', 'volumetric_mm', 'volumetric_ss'] + + X_stability_FN = X_common_stability_Fnum + X_foldX_Fnum + + ############################################################################### + #=================== + # FG3: Affinity features + #=================== + common_affinity_Fnum = ['ligand_distance' + , 'ligand_affinity_change' + , 'mmcsm_lig'] + + # if gene.lower() in geneL_basic: + # X_affinityFN = common_affinity_Fnum + # else: + # X_affinityFN = common_affinity_Fnum + gene_affinity_colnames + + X_affinityFN = common_affinity_Fnum + gene_affinity_colnames + + ############################################################################### + #============================ + # FG4: Residue level features + #============================ + #----------- + # AA index + #----------- + X_aaindex_Fnum = list(aa_df_cols) + print('\nTotal no. of features for aaindex:', len(X_aaindex_Fnum)) + + #----------------- + # surface area + # depth + # hydrophobicity + #----------------- + X_str_Fnum = ['rsa' + #, 'asa' + , 'kd_values' + , 'rd_values'] + + #--------------------------- + # Other aa properties + # active site indication + #--------------------------- + X_aap_Fcat = ['ss_class' + # , 'wt_prop_water' + # , 'mut_prop_water' + # , 'wt_prop_polarity' + # , 'mut_prop_polarity' + # , 'wt_calcprop' + # , 'mut_calcprop' + , 'aa_prop_change' + , 'electrostatics_change' + , 'polarity_change' + , 'water_change' + , 'active_site'] + + X_resprop_FN = X_aaindex_Fnum + X_str_Fnum + X_aap_Fcat + ############################################################################### + #======================== + # FG5: Genomic features + #======================== + # See the beginnning section + if use_or: + print('\nALL Genomic features being used (n):', len(X_genomicFN) + , '\nThese are:', X_genomicFN) + else: + print('\nGenomic features being used EXCLUDING odds ratio (n):', len(X_genomicFN) + , '\nThese are:', X_genomicFN) + + ############################################################################### + #======================== + # FG6 collapsed: Structural : Atability + Affinity + ResidueProp + #======================== + X_structural_FN = X_stability_FN + X_affinityFN + X_resprop_FN + + ############################################################################### + #======================== + # BUILDING all features + #======================== + all_featuresN = X_evolFN + X_structural_FN + X_genomicFN + + ############################################################################### + #%% Define training and test data + #================================================================ + # Training and BLIND test set: 70/30 + # dst with actual values : training set + # dst with imputed values : THROW AWAY [unrepresentative] + #================================================================ + my_df_ml[drug].isna().sum() + + # blind_test_df = my_df_ml[my_df_ml[drug].isna()] + # blind_test_df.shape + + # training_df = my_df_ml[my_df_ml[drug].notna()] + # training_df.shape + + # training_df = my_df_ml.copy() + + # # Target 1: dst_mode + # training_df[drug].value_counts() + # training_df['dst_mode'].value_counts() + + #all_training_df = my_df_ml[all_featuresN] + + # Getting the dst column as this will be required for tts_split() + if 'dst' in my_df_ml: + print('\ndst column exists') + if my_df_ml['dst'].equals(my_df_ml[drug]): + print('\nand this is identical to drug column:', drug) + + all_featuresN2 = all_featuresN + ['dst', 'dst_mode'] + all_training_df = my_df_ml[all_featuresN2] + + print('\nAll feature names:', all_featuresN2) + #################################################################### + + #========================================================================== + if write_maskfile: + print('\nPASS: and now writing file to check masked columns and values:', outFile_mask_ml ) + mask_check.sort_values(by = ['ligand_distance'], ascending = True, inplace = True) + mask_check.to_csv(outFile_mask_ml, index = False) + else: + print('\nPASS: but NOT writing mask file') + #========================================================================== + if write_outfile: + print('\nPASS: and now writing processed file for ml:', outFile_ml) + #all_training_df.to_csv(outFile_ml, index = False) + else: + print('\nPASS: But NOT writing processed file') + #========================================================================== + + print('\n#################################################################' + , '\nSUCCESS: Extacted training data for gene:', gene.lower() + , '\nDim of training_df:', all_training_df.shape) + if use_or: + print('\nThis includes Odds Ratio' + , '\n###########################################################') + else: + print('\nThis EXCLUDES Odds Ratio' + , '\n############################################################') + + return(all_training_df) \ No newline at end of file diff --git a/scripts/plotting/get_ml_dfs.R b/scripts/plotting/get_ml_dfs.R new file mode 100644 index 0000000..0532e7e --- /dev/null +++ b/scripts/plotting/get_ml_dfs.R @@ -0,0 +1,103 @@ +#!/usr/bin/env Rscript + +######################################################### +# TASK: Get formatted data for plots +######################################################### +# working dir and loading libraries +getwd() +source("~/git/LSHTM_analysis/scripts/Header_TT.R") +source("~/git/LSHTM_analysis/scripts/plotting/plotting_colnames.R") +# cmd args passed +# in from other scripts +# to call this +# set drug and gene name + +#========================================== +# variables for affinity: +# comes from functions/plotting_globals.R +#========================================== + +cat("\nGlobal variables for Ligand:" + , "\nligand distance colname:", LigDist_colname + , "\nligand distance cut off:", LigDist_cutoff) + +cat("\nGlobal variables for mCSM-PPI2 affinity:" + , "\nPPI2 distance colname:", ppi2Dist_colname + , "\nPPI2 cut off:", DistCutOff) + +cat("\nGlobal variables for mCSM-NA affinity:" + , "\nligand distance colname:", naDist_colname + , "\nligand distance cut off:", DistCutOff) + + +#=========== +# input +#=========== +#-------------------------------------------- +# call: import_dirs() +# comes from functions/plotting_globals.R +#-------------------------------------------- +import_dirs(drug, gene) + +#--------------------------- +# call: plotting_data() +#--------------------------- +if (!exists("infile_params") && exists("gene")){ + #if (!is.character(infile_params) && exists("gene")){ # when running as cmd + in_filename_params = paste0(tolower(gene), "_all_params.csv") + infile_params = paste0(outdir, "/", in_filename_params) + cat("\nInput file for mcsm comb data not specified, assuming filename: ", infile_params, "\n") +} + +# Input 1: +cat("\nReading mcsm combined data file: ", infile_params) +mcsm_df = read.csv(infile_params, header = T) +if (tolower(gene)%in%c("rpob")){ + mcsm_df = mcsm_df[mcsm_df$position!=1148,] +} + +pd_df = plotting_data(mcsm_df + , gene = gene # ADDED + , lig_dist_colname = LigDist_colname + , lig_dist_cutoff = LigDist_cutoff) + +my_df = pd_df[[1]] +my_df_u = pd_df[[2]] # this forms one of the input for combining_dfs_plotting() + +max_ang <- round(max(my_df_u[[LigDist_colname]])) +min_ang <- round(min(my_df_u[[LigDist_colname]])) + +cat("\nLigand distance colname:", LigDist_colname + , "\nThe max distance", gene, "structure df" , ":", max_ang, "\u212b" + , "\nThe min distance", gene, "structure df" , ":", min_ang, "\u212b") + +#-------------------------------- +# call: combining_dfs_plotting() +#-------------------------------- +if (!exists("infile_metadata") && exists("gene")){ + #if (!is.character(infile_metadata) && exists("gene")){ # when running as cmd + in_filename_metadata = paste0(tolower(gene), "_metadata.csv") # part combined for gid + infile_metadata = paste0(outdir, "/", in_filename_metadata) + cat("\nInput file for gene metadata not specified, assuming filename: ", infile_metadata, "\n") +} + +# Input 2: read _meta data.csv +cat("\nReading meta data file: ", infile_metadata) + +gene_metadata <- read.csv(infile_metadata + , stringsAsFactors = F + , header = T) + +cat("\nDim of meta data file: ", dim(gene_metadata)) + +all_plot_dfs = combining_dfs_plotting(my_df_u + , gene_metadata + #, gene = gene # ADDED + , lig_dist_colname = '' + , lig_dist_cutoff = '' + , plotting = FALSE) + +merged_df2 = all_plot_dfs[[1]] +merged_df3 = all_plot_dfs[[2]] +#merged_df2_comp = all_plot_dfs[[3]] +#merged_df3_comp = all_plot_dfs[[4]]