diff --git a/scripts/plotting/mcsm_mean_affinity_ensemble.R b/scripts/plotting/mcsm_mean_affinity_ensemble.R index d21b263..068325b 100644 --- a/scripts/plotting/mcsm_mean_affinity_ensemble.R +++ b/scripts/plotting/mcsm_mean_affinity_ensemble.R @@ -1,9 +1,9 @@ #source("~/git/LSHTM_analysis/config/pnca.R") #source("~/git/LSHTM_analysis/config/alr.R") #source("~/git/LSHTM_analysis/config/gid.R") -#source("~/git/LSHTM_analysis/config/embb.R") +source("~/git/LSHTM_analysis/config/embb.R") #source("~/git/LSHTM_analysis/config/katg.R") -source("~/git/LSHTM_analysis/config/rpob.R") +#source("~/git/LSHTM_analysis/config/rpob.R") source("/home/tanu/git/LSHTM_analysis/my_header.R") ######################################################### @@ -11,65 +11,27 @@ source("/home/tanu/git/LSHTM_analysis/my_header.R") # across all affinity tools for a given structure # as applicable... ######################################################### - -#======= -# output -#======= -outdir_images = paste0("~/git/Writing/thesis/images/results/", tolower(gene)) - -#OutFile1 -outfile_mean_aff = paste0(outdir_images, "/", tolower(gene) - , "_mean_affinity_all.csv") -print(paste0("Output file:", outfile_mean_aff)) - -#OutFile2 -outfile_mean_aff_priorty = paste0(outdir_images, "/", tolower(gene) - , "_mean_affinity_priority.csv") -print(paste0("Output file:", outfile_mean_aff_priorty)) - -#%%=============================================================== - #============= # Input #============= df3_filename = paste0("/home/tanu/git/Data/", drug, "/output/", tolower(gene), "_merged_df3.csv") df3 = read.csv(df3_filename) length(df3$mutationinformation) - -# mut_info checks -table(df3$mutation_info) -table(df3$mutation_info_orig) -table(df3$mutation_info_labels_orig) - -# used in plots and analyses -table(df3$mutation_info_labels) # different, and matches dst_mode -table(df3$dst_mode) - -# create column based on dst mode with different colname -table(is.na(df3$dst)) -table(is.na(df3$dst_mode)) - -#=============== -# Create column: sensitivity mapped to dst_mode -#=============== -df3$sensitivity = ifelse(df3$dst_mode == 1, "R", "S") -table(df3$sensitivity) - -length(unique((df3$mutationinformation))) -all_colnames = as.data.frame(colnames(df3)) +all_colnames= colnames(df3) +#%%=============================================================== +# FIXME: ADD distance to NA when SP replies +dist_columns = c("ligand_distance", "interface_dist") +DistCutOff = 10 common_cols = c("mutationinformation" - , "position" , "X5uhc_position" , "X5uhc_offset" + , "position" , "dst_mode" , "mutation_info_labels" - , "sensitivity" + , "sensitivity", dist_columns ) - , "ligand_distance" - , "interface_dist") - -all_colnames$`colnames(df3)`[grep("scaled", all_colnames$`colnames(df3)`)] -all_colnames$`colnames(df3)`[grep("outcome", all_colnames$`colnames(df3)`)] +all_colnames[grep("scaled" , all_colnames)] +all_colnames[grep("outcome" , all_colnames)] #=================== # stability cols @@ -106,7 +68,6 @@ outcome_cols_affinity = c( "ligand_outcome" , "mmcsm_lig_outcome" , "mcsm_ppi2_outcome" , "mcsm_na_outcome") - #=================== # conservation cols #=================== @@ -123,223 +84,233 @@ outcome_cols_affinity = c( "ligand_outcome" # , "snap2_outcome" # #consurf outcome doesn't exist # ) +all_cols= c(common_cols + ,raw_cols_stability, scaled_cols_stability, outcome_cols_stability + , raw_cols_affinity, scaled_cols_affinity, outcome_cols_affinity) +#======= +# output +#======= +outdir_images = paste0("~/git/Writing/thesis/images/results/", tolower(gene)) -###################################################################### -cols_to_consider = colnames(df3)[colnames(df3)%in%c(common_cols - , raw_cols_affinity - , scaled_cols_affinity - , outcome_cols_affinity - , raw_cols_stability - , scaled_cols_stability - , outcome_cols_stability - )] +#OutFile1 +outfile_mean_aff = paste0(outdir_images, "/", tolower(gene) + , "_mean_ligand.csv") +print(paste0("Output file:", outfile_mean_aff)) -cols_to_extract = cols_to_consider[cols_to_consider%in%c(common_cols - , scaled_cols_affinity)] +#OutFile2 +outfile_ppi2 = paste0(outdir_images, "/", tolower(gene) + , "_mean_ppi2.csv") +print(paste0("Output file:", outfile_ppi2)) + + + +#OutFile4 +#outfile_mean_aff_priorty = paste0(outdir_images, "/", tolower(gene) +# , "_mean_affinity_priority.csv") +#print(paste0("Output file:", outfile_mean_aff_priorty)) + +################################################################# +################################################################# +# mut positions +length(unique(df3$position)) + +# mut_info checks +table(df3$mutation_info) +table(df3$mutation_info_orig) +table(df3$mutation_info_labels_orig) + +# used in plots and analyses +table(df3$mutation_info_labels) # different, and matches dst_mode +table(df3$dst_mode) + +# create column based on dst mode with different colname +table(is.na(df3$dst)) +table(is.na(df3$dst_mode)) + +#=============== +# Create column: sensitivity mapped to dst_mode +#=============== +df3$sensitivity = ifelse(df3$dst_mode == 1, "R", "S") +table(df3$sensitivity) + +length(unique((df3$mutationinformation))) +all_colnames = as.data.frame(colnames(df3)) + +#=============== +# select columns specific to gene +#=============== +gene_aff_cols = colnames(df3)[colnames(df3)%in%c(outcome_cols_affinity + , scaled_cols_affinity)] + +gene_common_cols = colnames(df3)[colnames(df3)%in%common_cols] + +cols_to_extract = c(gene_common_cols + , gene_aff_cols) + +cat("\nExtracting", length(cols_to_extract), "columns") df3_plot = df3[, cols_to_extract] -############################################################## -# FIXME: ADD distance to NA when SP replies +table(df3_plot$mmcsm_lig_outcome) +table(df3_plot$ligand_outcome) -##################### -# Ensemble affinity: affinity_cols -# mcsm_lig, mmcsm_lig and mcsm_na -##################### +############################################################## +# mCSM-lig, mCSM-NA, mCSM-ppi2, mmCSM-lig +######################################### +cols_to_numeric = c("ligand_outcome" + , "mcsm_na_outcome" + , "mcsm_ppi2_outcome" + , "mmcsm_lig_outcome") + +#===================================== +# mCSM-lig: Filter ligand distance <10 +#DistCutOff = 10 +#LigDist_colname = "ligand_distance" # extract outcome cols and map numeric values to the categories # Destabilising == 0, and stabilising == 1 so rescaling can let -1 be destabilising -######################################### #===================================== -# Affintiy (2 cols): average the scores -# across predictors ==> average by -# position ==> scale b/w -1 and 1 +df3_lig = df3[, c("mutationinformation" + , "position" + , "ligand_distance" + , "ligand_affinity_change" + , "affinity_scaled" + , "ligand_outcome")] -# column to average: ens_affinity -#===================================== -cols_mcsm_lig = c("mutationinformation" - , "position" - , "sensitivity" - , "X5uhc_position" - , "X5uhc_offset" - , "ligand_distance" - , "ligand_outcome" - , "mmcsm_lig_outcome") -cols_mcsm_lig -df3_lig_ens = df3[, cols_mcsm_lig] +df3_lig = df3_lig[df3_lig["ligand_distance"]% - dplyr::group_by(position) %>% - dplyr::summarize(avg_ens_lig = mean(ens_lig)) -class(mean_ens_lig_by_position) +# group by position +mean_lig_by_position <- df3_lig %>% + dplyr::group_by(position) %>% + #dplyr::summarize(avg_lig = max(df3_lig_num)) + #dplyr::summarize(avg_lig = mean(ligand_outcome)) + #dplyr::summarize(avg_lig = mean(affinity_scaled, na.rm = T)) + dplyr::summarize(avg_lig = mean(ligand_affinity_change, na.rm = T)) + +class(mean_lig_by_position) # convert to a df -mean_ens_lig_by_position = as.data.frame(mean_ens_lig_by_position) - -table(mean_ens_lig_by_position$avg_ens_lig) +mean_lig_by_position = as.data.frame(mean_lig_by_position) +table(mean_lig_by_position$avg_lig) # REscale b/w -1 and 1 -#en_aff_min = min(mean_ens_affinity_by_position['ens_affinity']) -#en_aff_max = max(mean_ens_affinity_by_position['ens_affinity']) +lig_min = min(mean_lig_by_position['avg_lig']) +lig_max = max(mean_lig_by_position['avg_lig']) -# scale the average affintiy value between -1 and 1 -# mean_ens_affinity_by_position['avg_ens_affinity_scaled'] = lapply(mean_ens_affinity_by_position['avg_ens_affinity'] -# , function(x) ifelse(x < 0, x/abs(en_aff_min), x/en_aff_max)) - -mean_ens_lig_by_position['avg_ens_lig_scaled'] = lapply(mean_ens_lig_by_position['avg_ens_lig'] +mean_lig_by_position['avg_lig_scaled'] = lapply(mean_lig_by_position['avg_lig'] , function(x) { - scales::rescale(x, to = c(-1,1) - #, from = c(en_aff_min,en_aff_max)) - , from = c(0,1)) + scales::rescale_mid(x + , to = c(-1,1) + , from = c(lig_min,lig_max) + , mid = 0) + #, from = c(0,1)) }) cat(paste0('Average (mcsm-lig+mmcsm-lig) scores:\n' - , head(mean_ens_lig_by_position['avg_ens_lig']) + , head(mean_lig_by_position['avg_lig']) , '\n---------------------------------------------------------------' , '\nAverage (mcsm-lig+mmcsm-lig) scaled scores:\n' - , head(mean_ens_lig_by_position['avg_ens_lig_scaled']))) + , head(mean_lig_by_position['avg_lig_scaled']))) -if ( nrow(mean_ens_lig_by_position) == expected_npos ){ - cat("\nPASS: Generated ensemble average values for ligand affinity" ) +if ( nrow(mean_lig_by_position) == length(unique(df3_lig$position)) ){ + cat("\nPASS: Generated average values for ligand affinity" ) }else{ stop(paste0("\nAbort: length mismatch for ligand affinity data")) } +max(mean_lig_by_position$avg_lig); min(mean_lig_by_position$avg_lig) +max(mean_lig_by_position$avg_lig_scaled); min(mean_lig_by_position$avg_lig_scaled) + ################################################################# - -#===================================== -# Affintiy (mCSM-ppi2): -#D1148G for rpob DOES NOT EXIST for 5UHC -#===================================== -cols_mcsm_ppi2 = c("mutationinformation" - , "position" - , "X5uhc_position" - , "X5uhc_offset" - , "sensitivity" - , "interface_dist" - #, "mcsm_ppi2_affinity" - #, "mcsm_ppi2_scaled" - , "mcsm_ppi2_outcome" - ) -cols_mcsm_ppi2 -df3_ppi2_raw = df3[, c(cols_mcsm_ppi2, "mcsm_ppi2_affinity", "mcsm_ppi2_scaled") ] - -table(df3_ppi2_raw$mcsm_ppi2_outcome) - -df3_ppi2 = df3[, cols_mcsm_ppi2] - -cols_to_numeric_ppi2 = c("mcsm_ppi2_outcome") -df3_ppi2[, cols_to_numeric_ppi2] <- sapply(df3_ppi2[, cols_to_numeric_ppi2] - , function(x){ifelse(x == "Descreasing", 0, 1)}) - -cols_to_average_ppi2 = which(colnames(df3_ppi2)%in%cols_to_numeric_ppi2) -cols_to_average_ppi2 - -#=============================== -# Filter interface <10 -Dist_cutoff = 10 -ppi2Dist_colname = "interface_dist" -#=============================== -table(df3_ppi2[ppi2Dist_colname]% + dplyr::group_by(position) %>% + #dplyr::summarize(avg_ppi2 = max(df3_ppi2_num)) + #dplyr::summarize(avg_ppi2 = mean(mcsm_ppi2_outcome)) + #dplyr::summarize(avg_ppi2 = mean(mcsm_ppi2_scaled, na.rm = T)) + dplyr::summarize(avg_ppi2 = mean(mcsm_ppi2_affinity, na.rm = T)) + +class(mean_ppi2_by_position) + +# convert to a df +mean_ppi2_by_position = as.data.frame(mean_ppi2_by_position) +table(mean_ppi2_by_position$avg_ppi2) + +# REscale b/w -1 and 1 +lig_min = min(mean_ppi2_by_position['avg_ppi2']) +lig_max = max(mean_ppi2_by_position['avg_ppi2']) + +mean_ppi2_by_position['avg_ppi2_scaled'] = lapply(mean_ppi2_by_position['avg_ppi2'] + , function(x) { + scales::rescale_mid(x + , to = c(-1,1) + , from = c(lig_min,lig_max) + , mid = 0) + #, from = c(0,1)) + }) + +cat(paste0('Average ppi2 scores:\n' + , head(mean_ppi2_by_position['avg_ppi2']) + , '\n---------------------------------------------------------------' + , '\nAverage ppi2 scaled scores:\n' + , head(mean_ppi2_by_position['avg_ppi2_scaled']))) + +if ( nrow(mean_ppi2_by_position) == length(unique(df3_ppi2$position)) ){ + cat("\nPASS: Generated average values for ppi2" ) +}else{ + stop(paste0("\nAbort: length mismatch for ppi2 data")) +} + +max(mean_ppi2_by_position$avg_ppi2); min(mean_ppi2_by_position$avg_ppi2) +max(mean_ppi2_by_position$avg_ppi2_scaled); min(mean_ppi2_by_position$avg_ppi2_scaled) + + +write.csv(mean_ppi2_by_position, outfile_ppi2 + , row.names = F) +cat("Finished writing file:\n" + , outfile_ppi2 + , "\nNo. of rows:", nrow(mean_ppi2_by_position) + , "\nNo. of cols:", ncol(mean_ppi2_by_position)) + # end of script #===============================================================