still trying affinity phew
This commit is contained in:
parent
d94aa10c9b
commit
214e9232c6
1 changed files with 112 additions and 114 deletions
|
@ -3,7 +3,7 @@
|
||||||
#source("~/git/LSHTM_analysis/config/gid.R")
|
#source("~/git/LSHTM_analysis/config/gid.R")
|
||||||
#source("~/git/LSHTM_analysis/config/embb.R")
|
#source("~/git/LSHTM_analysis/config/embb.R")
|
||||||
#source("~/git/LSHTM_analysis/config/katg.R")
|
#source("~/git/LSHTM_analysis/config/katg.R")
|
||||||
source("~/git/LSHTM_analysis/config/rpob.R")
|
#source("~/git/LSHTM_analysis/config/rpob.R")
|
||||||
|
|
||||||
source("/home/tanu/git/LSHTM_analysis/my_header.R")
|
source("/home/tanu/git/LSHTM_analysis/my_header.R")
|
||||||
#########################################################
|
#########################################################
|
||||||
|
@ -57,15 +57,17 @@ table(df3$sensitivity)
|
||||||
|
|
||||||
length(unique((df3$mutationinformation)))
|
length(unique((df3$mutationinformation)))
|
||||||
all_colnames = as.data.frame(colnames(df3))
|
all_colnames = as.data.frame(colnames(df3))
|
||||||
|
|
||||||
|
# FIXME: ADD distance to NA when SP replies
|
||||||
|
dist_columns = c("ligand_distance", "interface_dist")
|
||||||
|
DistCutOff = 10
|
||||||
common_cols = c("mutationinformation"
|
common_cols = c("mutationinformation"
|
||||||
, "X5uhc_position"
|
, "X5uhc_position"
|
||||||
, "X5uhc_offset"
|
, "X5uhc_offset"
|
||||||
, "position"
|
, "position"
|
||||||
, "dst_mode"
|
, "dst_mode"
|
||||||
, "mutation_info_labels"
|
, "mutation_info_labels"
|
||||||
, "sensitivity"
|
, "sensitivity", dist_columns )
|
||||||
, "ligand_distance"
|
|
||||||
, "interface_dist")
|
|
||||||
|
|
||||||
all_colnames$`colnames(df3)`[grep("scaled", all_colnames$`colnames(df3)`)]
|
all_colnames$`colnames(df3)`[grep("scaled", all_colnames$`colnames(df3)`)]
|
||||||
all_colnames$`colnames(df3)`[grep("outcome", all_colnames$`colnames(df3)`)]
|
all_colnames$`colnames(df3)`[grep("outcome", all_colnames$`colnames(df3)`)]
|
||||||
|
@ -73,20 +75,20 @@ all_colnames$`colnames(df3)`[grep("outcome", all_colnames$`colnames(df3)`)]
|
||||||
#===================
|
#===================
|
||||||
# stability cols
|
# stability cols
|
||||||
#===================
|
#===================
|
||||||
# raw_cols_stability = c("duet_stability_change"
|
raw_cols_stability = c("duet_stability_change"
|
||||||
# , "deepddg"
|
, "deepddg"
|
||||||
# , "ddg_dynamut2"
|
, "ddg_dynamut2"
|
||||||
# , "ddg_foldx")
|
, "ddg_foldx")
|
||||||
#
|
|
||||||
# scaled_cols_stability = c("duet_scaled"
|
scaled_cols_stability = c("duet_scaled"
|
||||||
# , "deepddg_scaled"
|
, "deepddg_scaled"
|
||||||
# , "ddg_dynamut2_scaled"
|
, "ddg_dynamut2_scaled"
|
||||||
# , "foldx_scaled")
|
, "foldx_scaled")
|
||||||
#
|
|
||||||
# outcome_cols_stability = c("duet_outcome"
|
outcome_cols_stability = c("duet_outcome"
|
||||||
# , "deepddg_outcome"
|
, "deepddg_outcome"
|
||||||
# , "ddg_dynamut2_outcome"
|
, "ddg_dynamut2_outcome"
|
||||||
# , "foldx_outcome")
|
, "foldx_outcome")
|
||||||
|
|
||||||
#===================
|
#===================
|
||||||
# affinity cols
|
# affinity cols
|
||||||
|
@ -123,113 +125,109 @@ outcome_cols_affinity = c( "ligand_outcome"
|
||||||
# #consurf outcome doesn't exist
|
# #consurf outcome doesn't exist
|
||||||
# )
|
# )
|
||||||
|
|
||||||
######################################################################
|
gene_aff_cols = colnames(df3)[colnames(df3)%in%scaled_cols_affinity]
|
||||||
cols_to_consider = colnames(df3)[colnames(df3)%in%c(common_cols
|
gene_stab_cols = colnames(df3)[colnames(df3)%in%scaled_cols_stability]
|
||||||
, raw_cols_affinity
|
gene_common_cols = colnames(df3)[colnames(df3)%in%common_cols]
|
||||||
, scaled_cols_affinity
|
|
||||||
, outcome_cols_affinity
|
|
||||||
# , raw_cols_stability
|
|
||||||
# , scaled_cols_stability
|
|
||||||
# , outcome_cols_stability
|
|
||||||
)]
|
|
||||||
|
|
||||||
cols_to_extract = cols_to_consider[cols_to_consider%in%c(common_cols
|
sel_cols = c(gene_common_cols
|
||||||
, raw_cols_affinity
|
, gene_stab_cols
|
||||||
, scaled_cols_affinity)]
|
, gene_aff_cols)
|
||||||
|
|
||||||
df3_plot = df3[, cols_to_extract]
|
#########################################
|
||||||
|
#df3_plot = df3[, cols_to_extract]
|
||||||
DistCutOff_colnames = c("ligand_distance", "interface_dist")
|
df3_plot = df3[, sel_cols]
|
||||||
DistCutOff = 10
|
|
||||||
|
|
||||||
|
######################
|
||||||
|
#FILTERING HMMMM!
|
||||||
|
#all dist <10
|
||||||
|
#for embb this results in 2 muts
|
||||||
|
######################
|
||||||
df3_affinity_filtered = df3_plot[ (df3_plot$ligand_distance<10 | df3_plot$interface_dist <10),]
|
df3_affinity_filtered = df3_plot[ (df3_plot$ligand_distance<10 | df3_plot$interface_dist <10),]
|
||||||
|
df3_affinity_filtered = df3_plot[ (df3_plot$ligand_distance<10 & df3_plot$interface_dist <10),]
|
||||||
|
|
||||||
c0u = unique(df3_affinity_filtered$position)
|
c0u = unique(df3_affinity_filtered$position)
|
||||||
length(c0u)
|
length(c0u)
|
||||||
|
|
||||||
foo = df3_affinity_filtered[df3_affinity_filtered$ligand_distance<10,]
|
#df = df3_affinity_filtered
|
||||||
bar = df3_affinity_filtered[df3_affinity_filtered$interface_dist<10,]
|
##########################################
|
||||||
|
#NO FILTERING: annotate the whole df
|
||||||
wilcox.test(foo$mmcsm_lig_scaled~foo$sensitivity)
|
df = df3_plot
|
||||||
wilcox.test(foo$mmcsm_lig~foo$sensitivity)
|
|
||||||
|
|
||||||
wilcox.test(foo$affinity_scaled~foo$sensitivity)
|
|
||||||
wilcox.test(foo$ligand_affinity_change~foo$sensitivity)
|
|
||||||
|
|
||||||
wilcox.test(bar$mcsm_na_scaled~bar$sensitivity)
|
|
||||||
wilcox.test(bar$mcsm_na_affinity~bar$sensitivity)
|
|
||||||
|
|
||||||
wilcox.test(bar$mcsm_ppi2_scaled~bar$sensitivity)
|
|
||||||
wilcox.test(bar$mcsm_ppi2_affinity~bar$sensitivity)
|
|
||||||
|
|
||||||
##############################################################
|
|
||||||
df = df3_affinity_filtered
|
|
||||||
sum(is.na(df))
|
sum(is.na(df))
|
||||||
df2 = na.omit(df) # Apply na.omit function
|
df2 = na.omit(df)
|
||||||
|
c0u = unique(df2$position)
|
||||||
|
length(c0u)
|
||||||
|
|
||||||
a = df2[df2$position==37,]
|
# reassign orig
|
||||||
sel_cols = c("mutationinformation", "position", scaled_cols_affinity)
|
my_df_raw = df3
|
||||||
a = a[, sel_cols]
|
|
||||||
|
|
||||||
##############################################################
|
# now subset
|
||||||
# FIXME: ADD distance to NA when SP replies
|
df3 = df2
|
||||||
|
#######################################################
|
||||||
#####################
|
#=================
|
||||||
# Ensemble affinity: affinity_cols
|
# affinity effect
|
||||||
# mcsm_lig, mmcsm_lig and mcsm_na
|
#=================
|
||||||
#####################
|
give_col=function(x,y,df=df3){
|
||||||
# extract outcome cols and map numeric values to the categories
|
df[df$position==x,y]
|
||||||
# Destabilising == 0, and stabilising == 1 so rescaling can let -1 be destabilising
|
|
||||||
#########################################
|
|
||||||
#=====================================
|
|
||||||
# Affintiy (2 cols): average the scores
|
|
||||||
# across predictors ==> average by
|
|
||||||
# position ==> scale b/w -1 and 1
|
|
||||||
|
|
||||||
# column to average: ens_affinity
|
|
||||||
#=====================================
|
|
||||||
cols_mcsm_lig = c("mutationinformation"
|
|
||||||
, "position"
|
|
||||||
, "sensitivity"
|
|
||||||
, "X5uhc_position"
|
|
||||||
, "X5uhc_offset"
|
|
||||||
, "ligand_distance"
|
|
||||||
, "ligand_outcome"
|
|
||||||
, "mmcsm_lig_outcome")
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
######################################################################
|
|
||||||
##################
|
|
||||||
# merge: mean ensemble stability and affinity by_position
|
|
||||||
####################
|
|
||||||
# if ( class(mean_ens_stability_by_position) && class(mean_ens_affinity_by_position) != "data.frame"){
|
|
||||||
# cat("Y")
|
|
||||||
# }
|
|
||||||
|
|
||||||
common_cols = intersect(colnames(mean_ens_stability_by_position), colnames(mean_ens_affinity_by_position))
|
|
||||||
|
|
||||||
if (dim(mean_ens_stability_by_position) && dim(mean_ens_affinity_by_position)){
|
|
||||||
print(paste0("PASS: dim's match, mering dfs by column :", common_cols))
|
|
||||||
#combined = as.data.frame(cbind(mean_duet_by_position, mean_affinity_by_position ))
|
|
||||||
combined_df = as.data.frame(merge(mean_ens_stability_by_position
|
|
||||||
, mean_ens_affinity_by_position
|
|
||||||
, by = common_cols
|
|
||||||
, all = T))
|
|
||||||
|
|
||||||
cat(paste0("\nnrows combined_df:", nrow(combined_df)
|
|
||||||
, "\nnrows combined_df:", ncol(combined_df)))
|
|
||||||
}else{
|
|
||||||
cat(paste0("FAIL: dim's mismatch, aborting cbind!"
|
|
||||||
, "\nnrows df1:", nrow(mean_duet_by_position)
|
|
||||||
, "\nnrows df2:", nrow(mean_affinity_by_position)))
|
|
||||||
quit()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for (i in unique(df3$position) ){
|
||||||
|
#print(i)
|
||||||
|
biggest = max(abs(give_col(i,gene_aff_cols)))
|
||||||
|
|
||||||
|
df3[df3$position==i,'abs_max_effect'] = biggest
|
||||||
|
df3[df3$position==i,'effect_type']= names(
|
||||||
|
give_col(i,gene_aff_cols)[which(
|
||||||
|
abs(
|
||||||
|
give_col(i,gene_aff_cols)
|
||||||
|
) == biggest, arr.ind=T
|
||||||
|
)[, "col"]])
|
||||||
|
|
||||||
|
# effect_name = unique(df3[df3$position==i,'effect_type'])
|
||||||
|
effect_name = df3[df3$position==i,'effect_type'][1] # pick first one in case we have multiple exact values
|
||||||
|
|
||||||
|
ind = rownames(which(abs(df3[df3$position==i,c('position',effect_name)][effect_name])== biggest, arr.ind=T))
|
||||||
|
df3[df3$position==i,'effect_sign'] = sign(df3[effect_name][ind,])
|
||||||
|
}
|
||||||
|
|
||||||
|
df3$effect_type = sub("\\.[0-9]+", "", df3$effect_type) # cull duplicate effect types that happen when there are exact duplicate values
|
||||||
|
df3U = df3[!duplicated(df3[c('position')]), ]
|
||||||
|
table(df3U$effect_type)
|
||||||
|
#########################################################
|
||||||
|
#%% consider stability as well
|
||||||
|
df4 = df2
|
||||||
|
|
||||||
|
#=================
|
||||||
|
# stability + affinity effect
|
||||||
|
#=================
|
||||||
|
effect_cols = c(gene_aff_cols, gene_stab_cols)
|
||||||
|
|
||||||
|
give_col=function(x,y,df=df4){
|
||||||
|
df[df$position==x,y]
|
||||||
|
}
|
||||||
|
|
||||||
|
for (i in unique(df4$position) ){
|
||||||
|
#print(i)
|
||||||
|
biggest = max(abs(give_col(i,effect_cols)))
|
||||||
|
|
||||||
|
df4[df4$position==i,'abs_max_effect'] = biggest
|
||||||
|
df4[df4$position==i,'effect_type']= names(
|
||||||
|
give_col(i,effect_cols)[which(
|
||||||
|
abs(
|
||||||
|
give_col(i,effect_cols)
|
||||||
|
) == biggest, arr.ind=T
|
||||||
|
)[, "col"]])
|
||||||
|
|
||||||
|
# effect_name = unique(df4[df4$position==i,'effect_type'])
|
||||||
|
effect_name = df4[df4$position==i,'effect_type'][1] # pick first one in case we have multiple exact values
|
||||||
|
|
||||||
|
ind = rownames(which(abs(df4[df4$position==i,c('position',effect_name)][effect_name])== biggest, arr.ind=T))
|
||||||
|
df4[df4$position==i,'effect_sign'] = sign(df4[effect_name][ind,])
|
||||||
|
}
|
||||||
|
|
||||||
|
df4$effect_type = sub("\\.[0-9]+", "", df4$effect_type) # cull duplicate effect types that happen when there are exact duplicate values
|
||||||
|
df4U = df4[!duplicated(df4[c('position')]), ]
|
||||||
|
table(df4U$effect_type)
|
||||||
|
|
||||||
#%%============================================================
|
#%%============================================================
|
||||||
# output
|
# output
|
||||||
write.csv(combined_df, outfile_mean_ens_st_aff
|
write.csv(combined_df, outfile_mean_ens_st_aff
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue