LSHTM_analysis/scripts/plotting/plotting_thesis/prominent_effects.R

#!/usr/bin/env Rscript
#source("~/git/LSHTM_analysis/config/alr.R")
source("~/git/LSHTM_analysis/config/embb.R")
#source("~/git/LSHTM_analysis/config/katg.R")
#source("~/git/LSHTM_analysis/config/gid.R")
#source("~/git/LSHTM_analysis/config/pnca.R")
#source("~/git/LSHTM_analysis/config/rpob.R")

# get plotting dfs
source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")

source("~/git/LSHTM_analysis/scripts/plotting/plotting_colnames.R")

length(all_stability_cols); length(raw_stability_cols)
length(scaled_stability_cols); length(outcome_stability_cols)
length(affinity_dist_colnames)

static_cols =  c("mutationinformation", "position", "sensitivity")
other_cols_all = c(scaled_stability_cols, scaled_affinity_cols, affinity_dist_colnames)

#omit avg cols and foldx_scaled_signC cols
other_cols =  other_cols_all[grep("avg", other_cols_all, invert = T)]
other_cols = other_cols[grep("foldx_scaled_signC",other_cols, invert = T )]
other_cols

cols_to_extract =  c(static_cols, other_cols)
expected_ncols = length(static_cols) + length(other_cols)

str_df = merged_df3[, cols_to_extract]

if (ncol(str_df) == expected_ncols){
  cat("\nPASS: successfully extracted cols for calculating prominent effects")
}else{
  stop("\nAbort: Could not extract cols for calculating prominent effects")
}

#=========================
# Masking affinity columns
#=========================
# First make values for affinity cols 0 when their corresponding dist >10
head(str_df)

# replace in place affinity values >10
str_df[str_df["ligand_distance"]>10,"affinity_scaled"]=0
str_df[str_df["ligand_distance"]>10,"mmcsm_lig_scaled"]=0

#ppi2 gene: replace in place ppi2 affinity values where ppi2 dist >10

if (tolower(gene)%in%geneL_ppi2){
  str_df[str_df["interface_dist"]>10,"mcsm_ppi2_scaled"]=0
}

# na gene: replace in place na affinity values where na dist >10
if (tolower(gene)%in%geneL_na){
  str_df[str_df["XXXX"]>10,"mcsm_na_scaled"]=0
}

colnames(str_df)

head(str_df)

# get names of cols to calculate the prominent effects from
scaled_cols_tc = c("duet_scaled", "deepddg_scaled"
       , "ddg_dynamut2_scaled", "foldx_scaled","affinity_scaled"
       ,  "mmcsm_lig_scaled"  ,  "mcsm_ppi2_scaled")

#--------------------------------
#get rowmax for absolute values
#--------------------------------
#str_df$row_maximum = apply(str_df[,-1], 1, function(x){max(abs(x))})
#str_df$row_maximum = apply(str_df[,scaled_cols_tc], 1, function(x){max(abs(x))})

#correct
#BOO= abs(str_df[,scaled_cols_tc]) == str_df[,'row_maximum']; head(BOO)
#also corr but weird
#POO = which(abs(str_df[,scaled_cols_tc]) == str_df[,'row_maximum'], arr.ind =T); head(POO)

################################################
# #-------------
# # short df: try
# #-------------
# df2_short = str_df[str_df$position%in%c(167, 423, 427),]
# df2_short = str_df[str_df$position%in%c(170, 167, 493, 453, 435, 433, 480, 456, 445),]
# df2_short = str_df[str_df$position%in%c(435, 480),]
#
#
# give_col=function(x,y,df=df2_short){
#   df[df$position==x,y]
# }
#
# for (i in unique(df2_short$position) ){
#   print(i)
#   #print(paste0("\nNo. of unique positions:", length(unique(df2$position))) )
#   #cat(length(unique(df2$position)))
#   #df2_short[df2_short$position==i,scaled_cols_tc]
#
#   biggest     = max(abs(give_col(i,scaled_cols_tc)))
#
#   df2_short[df2_short$position==i,'abs_max_effect'] = biggest
#   df2_short[df2_short$position==i,'effect_type']= names(
#     give_col(i,scaled_cols_tc)[which(
#       abs(
#         give_col(i,scaled_cols_tc)
#       ) == biggest, arr.ind=T
#     )[, "col"]])
#
#   effect_name = df2_short[df2_short$position==i,'effect_type'][1] # pick first one in case we have multiple exact values
#
#   # get index/rowname for value of max effect, and then use it to get the original sign
#   # here
#   #df2_short[df2_short$position==i,c(effect_name)]
#   #which(abs(df2_short[df2_short$position==i,c('position',effect_name)][effect_name])==biggest, arr.ind=T)
#   ind = rownames(which(abs(df2_short[df2_short$position==i,c('position',effect_name)][effect_name])== biggest, arr.ind=T))
#   df2_short[df2_short$position==i,'effect_sign'] = sign(df2_short[effect_name][ind,])
# }
#
# # ends with suffix 2 if dups
# df2_short$effect_type = sub("\\.[0-9]+", "", df2_short$effect_type) # cull duplicate effect types that happen when there are exact duplicate values
#
# View(df2_short)


#===============
# whole df
#===============
give_col=function(x,y,df=str_df){
  df[df$position==x,y]
}

for (i in unique(str_df$position) ){
  print(i)
  #cat(length(unique(str_df$position)))

  biggest     = max(abs(give_col(i,scaled_cols_tc)))

  str_df[str_df$position==i,'abs_max_effect'] = biggest
  str_df[str_df$position==i,'effect_type']= names(
    give_col(i,scaled_cols_tc)[which(
      abs(
        give_col(i,scaled_cols_tc)
      ) == biggest, arr.ind=T
    )[, "col"]])[1]

  effect_name = unique(str_df[str_df$position==i,'effect_type'])#[1] # pick first one in case we have multiple exact values

  # get index/rowname for value of max effect, and then use it to get the original sign
  # here
  ind = rownames(which(abs(str_df[str_df$position==i,c('position',effect_name)][effect_name])== biggest, arr.ind=T))
  str_df[str_df$position==i,'effect_sign'] = sign(str_df[effect_name][ind,])[1]
}

# ends with suffix 2 if dups
str_df$effect_type = sub("\\.[0-9]+", "", str_df$effect_type) # cull duplicate effect types that happen when there are exact duplicate values

colnames(str_df)

# check
str_df_check = str_df[str_df$position%in%c(24, 32,160, 303, 334),]
table(str_df$effect_type)

#================
# for Plots
#================
str_df_short = str_df[, c("mutationinformation","position","sensitivity"
                          , "effect_type"
                          , "effect_sign")]

table(str_df_short$effect_type)
table(str_df_short$effect_sign)
str(str_df_short)

# assign pe outcome
str_df_short$pe_outcome = ifelse(str_df_short$effect_sign<0, "DD", "SS")
table(str_df_short$pe_outcome )
table(str_df_short$effect_sign)

#==============
# group effect type:
# lig, ppi2, nuc. acid, stability
#==============

affcols = c("affinity_scaled",  "mmcsm_lig_scaled")
ppi2_cols = c("mcsm_ppi2_scaled")
#nuc_na_cols = c("mcsm_a_scaled")


#lig
table(str_df_short$effect_type)
str_df_short$effect_grouped = ifelse(str_df_short$effect_type%in%affcols
                                     , "lig"
                                     , str_df_short$effect_type)
table(str_df_short$effect_grouped)

#ppi2
str_df_short$effect_grouped = ifelse(str_df_short$effect_grouped%in%ppi2_cols
                                     , "ppi2"
                                     , str_df_short$effect_grouped)
table(str_df_short$effect_grouped)

#stability
str_df_short$effect_grouped = ifelse(!str_df_short$effect_grouped%in%c("lig", "ppi2")
                                     , "stability"
                                     , str_df_short$effect_grouped)

table(str_df_short$effect_grouped)

# create a sign as well
str_df_short$pe_effect_outcome = paste0(str_df_short$pe_outcome, "_"
                                     , str_df_short$effect_grouped)

table(str_df_short$pe_effect_outcome)


#####################################################################
# Chimera: for colouring
####################################################################

#-------------------------------------
# get df with unique position
#--------------------------------------
#data[!duplicated(data$x), ]
str_df_plot = str_df[!duplicated(str_df$position),]

if (nrow(str_df_plot) == length(unique(str_df$position))){
  cat("\nPASS: successfully extracted df with unique positions")
}else{
  stop("\nAbort: Could not extract df with unique positions")
}

#-------------------------------------
# generate colours for effect types
#--------------------------------------
str_df_plot_cols = str_df_plot[, c("position", "sensitivity"
                              , affinity_dist_colnames
                              , "abs_max_effect"
                              , "effect_type"
                              , "effect_sign")]
head(str_df_plot_cols)

# colour intensity based on sign
str_df_plot_cols$colour_hue = ifelse(str_df_plot_cols$effect_sign<0, "bright", "light")
table(str_df_plot_cols$colour_hue)
head(str_df_plot_cols)
# colour based on effect
table(str_df_plot_cols$effect_type)

pe_colour_map = c("affinity_scaled"    = "salmon"
                , "mmcsm_lig_scaled"   = "salmon"
                , "mcsm_ppi2_scaled"   = "pink"
                , "mcsm_na_scaled"     = "orange"
                , "duet_scaled"        = "dimgray"
                , "deepddg_scaled"     = "dimgray"
                , "ddg_dynamut2_scaled"= "dimgray"
                , "foldx_scaled"       = "dimgray")

#unlist(d[c('a', 'a', 'c', 'b')], use.names=FALSE)

#map the colours
str_df_plot_cols$colour_map= unlist(map(str_df_plot_cols$effect_type
                                 ,function(x){pe_colour_map[[x]]}
                                 ))

str_df_plot_cols$colours = paste0(str_df_plot_cols$colour_hue
                                  , "_"
                                  , str_df_plot_cols$colour_map)
head(str_df_plot_cols$colours)
table(str_df_plot_cols$colours)


class(str_df_plot_cols$colour_map)
str(str_df_plot_cols)

# sort by colour
head(str_df_plot_cols)
str_df_plot_cols = str_df_plot_cols[order(str_df_plot_cols$colour_map), ]
head(str_df_plot_cols)

#======================================
# write file with prominent effects
#======================================
outdir_images = paste0("~/git/Writing/thesis/images/results/", tolower(gene), "/")
write.csv(str_df_plot_cols, paste0(outdir_images, tolower(gene), "_prominent_effects.csv"))

################################
# printing for chimera
###############################
str_df_plot_cols$pos_chain = paste0(str_df_plot_cols$position, ".B,")
table(str_df_plot_cols$colour_map)

#===================================================
#-------------------
# Ligand Affinity
#-------------------
foo = str_df_plot_cols[str_df_plot_cols$colours=="yellow",]
all(foo2$effect_sign == 1)

foo1 = str_df_plot_cols[str_df_plot_cols$colours=="bright_salmon",]
all(foo3$effect_sign == -1)

#light salmon: stabilising affinity
table(str_df_plot_cols$colours)

affinity_pos_l = str_df_plot_cols$pos_chain[str_df_plot_cols$colours=="light_salmon"]
affinity_pos_lc = paste(affinity_pos_l, collapse = "")
affinity_pos_lc
table(str_df_plot_cols$colours)[["light_salmon"]]

#bright salmon: DEstabilsing affinity
affinity_pos_b = str_df_plot_cols$pos_chain[str_df_plot_cols$colours=="bright_salmon"]
affinity_pos_bc = paste(affinity_pos_b, collapse = "")
affinity_pos_bc
table(str_df_plot_cols$colours)[["bright_salmon"]]

c1 = length(affinity_pos_l) + length(affinity_pos_b) == table(str_df_plot_cols$colour_map)[["salmon"]]

if (c1){
  cat("PASS: affinity colour numbers checked")
}else{
  stop("Abort: affinity colour numbers mismtatch")
}

#put in chimera cmd
affinity_pos_lc
affinity_pos_bc

#===================================================
#-------------------
# ppi2 Affinity
#-------------------
foo2 = str_df_plot_cols[str_df_plot_cols$colours=="light_pink",]
all(foo2$effect_sign == 1)

foo3 = str_df_plot_cols[str_df_plot_cols$colours=="bright_pink",]
all(foo3$effect_sign == -1)

#light_pink: stabilising affinity
table(str_df_plot_cols$colours)

ppi2_pos_l = str_df_plot_cols$pos_chain[str_df_plot_cols$colours=="light_pink"]
ppi2_pos_lc = paste(ppi2_pos_l, collapse = "")
ppi2_pos_lc
table(str_df_plot_cols$colours)[["light_pink"]]

#bright pink: DEstabilsing affinity
ppi2_pos_b = str_df_plot_cols$pos_chain[str_df_plot_cols$colours=="bright_pink"]
ppi2_pos_bc = paste(ppi2_pos_b, collapse = "")
ppi2_pos_bc
table(str_df_plot_cols$colours)[["bright_pink"]]

c2 = length(ppi2_pos_l) + length(ppi2_pos_b) == table(str_df_plot_cols$colour_map)[["pink"]]

if (c2){
  cat("PASS: ppi2 colour numbers checked")
}else{
  stop("Abort: ppi2 colour numbers mismtatch")
}

#put in chimera cmd
ppi2_pos_lc
ppi2_pos_bc

#=========================================================
#-------------------
# Stability
#-------------------
foo4 = str_df_plot_cols[str_df_plot_cols$colours=="light_dimgray",]
all(foo2$effect_sign == 1)

foo5 = str_df_plot_cols[str_df_plot_cols$colours=="bright_dimgray",]
all(foo3$effect_sign == -1)

#light_dimgray: stabilising Stability
table(str_df_plot_cols$colours)

stab_pos_l = str_df_plot_cols$pos_chain[str_df_plot_cols$colours=="light_dimgray"]
stab_pos_lc = paste(stab_pos_l, collapse = "")
stab_pos_lc
table(str_df_plot_cols$colours)[["light_dimgray"]]

#bright_dimgray pink: DEstabilsing Stability
stab_pos_b = str_df_plot_cols$pos_chain[str_df_plot_cols$colours=="bright_dimgray"]
stab_pos_bc = paste(stab_pos_b, collapse = "")
stab_pos_bc
table(str_df_plot_cols$colours)[["bright_dimgray"]]

c3 = length(stab_pos_l) + length(stab_pos_b) == table(str_df_plot_cols$colour_map)[["dimgray"]]

if (c3){
  cat("PASS: stability colour numbers checked")
}else{
  stop("Abort: stability colour numbers mismtatch")
}

#put in chimera cmd
stab_pos_lc
stab_pos_bc


# stab tool count
table(str_df_plot_cols$effect_type)

table(str_df_plot_cols$effect_type, str_df_plot_cols$effect_sign)