sorted combining_dfs.py with all other data files and tidied up get_plotting_dfs.R

This commit is contained in:
Tanushree Tunstall 2021-09-10 18:16:41 +01:00
parent 4ba4ff602e
commit 5c8a9e8f00
3 changed files with 111 additions and 76 deletions

View file

@ -169,25 +169,31 @@ i_join = 'inner'
#===================== #=====================
# some preprocessing # some preprocessing
#===================== #=====================
#-------------
#===========
# FoldX # FoldX
#------------- #===========
foldx_df.shape foldx_df.shape
#=======================
#----------------------
# scale foldx values # scale foldx values
#======================= #----------------------
# rename ddg column to ddg_foldx
foldx_df['ddg']
foldx_df = foldx_df.rename(columns = {'ddg':'ddg_foldx'})
foldx_df['ddg_foldx']
# Rescale values in Foldx_change col b/w -1 and 1 so negative numbers # Rescale values in Foldx_change col b/w -1 and 1 so negative numbers
# stay neg and pos numbers stay positive # stay neg and pos numbers stay positive
foldx_min = foldx_df['ddg'].min() foldx_min = foldx_df['ddg_foldx'].min()
foldx_max = foldx_df['ddg'].max() foldx_max = foldx_df['ddg_foldx'].max()
foldx_min foldx_min
foldx_max foldx_max
foldx_scale = lambda x : x/abs(foldx_min) if x < 0 else (x/foldx_max if x >= 0 else 'failed') foldx_scale = lambda x : x/abs(foldx_min) if x < 0 else (x/foldx_max if x >= 0 else 'failed')
foldx_df['foldx_scaled'] = foldx_df['ddg'].apply(foldx_scale) foldx_df['foldx_scaled'] = foldx_df['ddg_foldx'].apply(foldx_scale)
print('Raw foldx scores:\n', foldx_df['ddg'] print('Raw foldx scores:\n', foldx_df['ddg_foldx']
, '\n---------------------------------------------------------------' , '\n---------------------------------------------------------------'
, '\nScaled foldx scores:\n', foldx_df['foldx_scaled']) , '\nScaled foldx scores:\n', foldx_df['foldx_scaled'])
@ -195,8 +201,8 @@ print('Raw foldx scores:\n', foldx_df['ddg']
fsmi = foldx_df['foldx_scaled'].min() fsmi = foldx_df['foldx_scaled'].min()
fsma = foldx_df['foldx_scaled'].max() fsma = foldx_df['foldx_scaled'].max()
c = foldx_df[foldx_df['ddg']>=0].count() c = foldx_df[foldx_df['ddg_foldx']>=0].count()
foldx_pos = c.get(key = 'ddg') foldx_pos = c.get(key = 'ddg_foldx')
c2 = foldx_df[foldx_df['foldx_scaled']>=0].count() c2 = foldx_df[foldx_df['foldx_scaled']>=0].count()
foldx_pos2 = c2.get(key = 'foldx_scaled') foldx_pos2 = c2.get(key = 'foldx_scaled')
@ -209,20 +215,30 @@ else:
, '\nGot:', foldx_pos2 , '\nGot:', foldx_pos2
, '\n======================================================') , '\n======================================================')
# rename ddg column to ddg_foldx #-------------------------
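foldx_df['ddg'] # foldx outcome category -- NOTE(review): the mapping below labels ddg_foldx >= 0 as 'Stabilising', whereas the R code this replaces used ddg < 0 = "Stabilising"; confirm the intended FoldX sign convention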
foldx_df = foldx_df.rename(columns = {'ddg':'ddg_foldx'}) #--------------------------
foldx_df['ddg_foldx'] foldx_df['foldx_outcome'] = foldx_df['ddg_foldx'].apply(lambda x: 'Stabilising' if x >= 0 else 'Destabilising')
foldx_df[foldx_df['ddg_foldx']>=0].count()
foc = foldx_df['foldx_outcome'].value_counts()
#------------- if foc['Stabilising'] == foldx_pos and foc['Stabilising'] == foldx_pos2:
print('\nPASS: Foldx outcome category created')
else:
print('\nFAIL: Foldx outcome category could NOT be created'
, '\nExpected number:', foldx_pos
, '\nGot:', foc[0]
, '\n======================================================')
sys.exit()
#=======================
# Deepddg # Deepddg
#------------- #=======================
deepddg_df.shape deepddg_df.shape
#======================= #-------------------------
# scale Deepddg values # scale Deepddg values
#======================= #-------------------------
# Rescale values in deepddg_change col b/w -1 and 1 so negative numbers # Rescale values in deepddg_change col b/w -1 and 1 so negative numbers
# stay neg and pos numbers stay positive # stay neg and pos numbers stay positive
deepddg_min = deepddg_df['deepddg'].min() deepddg_min = deepddg_df['deepddg'].min()
@ -252,6 +268,23 @@ else:
, '\nExpected number:', deepddg_pos , '\nExpected number:', deepddg_pos
, '\nGot:', deepddg_pos2 , '\nGot:', deepddg_pos2
, '\n======================================================') , '\n======================================================')
#--------------------------
# Deepddg outcome category
#--------------------------
deepddg_df['deepddg_outcome'] = deepddg_df['deepddg'].apply(lambda x: 'Stabilising' if x >= 0 else 'Destabilising')
deepddg_df[deepddg_df['deepddg']>=0].count()
doc = deepddg_df['deepddg_outcome'].value_counts()
if doc['Stabilising'] == deepddg_pos and doc['Stabilising'] == deepddg_pos2:
print('\nPASS: Deepddg outcome category created')
else:
print('\nFAIL: Deepddg outcome category could NOT be created'
, '\nExpected number:', deepddg_pos
, '\nGot:', doc[0]
, '\n======================================================')
sys.exit()
#%%============================================================================= #%%=============================================================================
# Now merges begin # Now merges begin
#%%============================================================================= #%%=============================================================================

View file

@ -16,7 +16,9 @@ library(dplyr)
## my_df_u_lig ## my_df_u_lig
## dup_muts ## dup_muts
#======================================================== #========================================================
plotting_data <- function(df, lig_dist_colname = 'ligand_distance', lig_dist_cutoff = 10) { plotting_data <- function(df
, lig_dist_colname = 'ligand_distance'
, lig_dist_cutoff = 10) {
my_df = data.frame() my_df = data.frame()
my_df_u = data.frame() my_df_u = data.frame()
my_df_u_lig = data.frame() my_df_u_lig = data.frame()
@ -38,51 +40,51 @@ cat("\nInput dimensions:", dim(df))
#================================== #==================================
#------------------------------ #------------------------------
# adding foldx scaled values # # adding foldx scaled values
# scale data b/w -1 and 1 # # scale data b/w -1 and 1
#------------------------------ # #------------------------------
n = which(colnames(df) == "ddg"); n # n = which(colnames(df) == "ddg"); n
#
my_min = min(df[,n]); my_min # my_min = min(df[,n]); my_min
my_max = max(df[,n]); my_max # my_max = max(df[,n]); my_max
#
df$foldx_scaled = ifelse(df[,n] < 0 # df$foldx_scaled = ifelse(df[,n] < 0
, df[,n]/abs(my_min) # , df[,n]/abs(my_min)
, df[,n]/my_max) # , df[,n]/my_max)
# sanity check # # sanity check
my_min = min(df$foldx_scaled); my_min # my_min = min(df$foldx_scaled); my_min
my_max = max(df$foldx_scaled); my_max # my_max = max(df$foldx_scaled); my_max
#
if (my_min == -1 && my_max == 1){ # if (my_min == -1 && my_max == 1){
cat("\nPASS: foldx ddg successfully scaled b/w -1 and 1" # cat("\nPASS: foldx ddg successfully scaled b/w -1 and 1"
, "\nProceeding with assigning foldx outcome category") # , "\nProceeding with assigning foldx outcome category")
}else{ # }else{
cat("\nFAIL: could not scale foldx ddg values" # cat("\nFAIL: could not scale foldx ddg values"
, "Aborting!\n") # , "Aborting!\n")
} # }
#------------------------------ #------------------------------
# adding foldx outcome category # adding foldx outcome category
# ddg<0 = "Stabilising" (-ve) # ddg<0 = "Stabilising" (-ve)
#------------------------------ #------------------------------
c1 = table(df$ddg < 0) # c1 = table(df$ddg < 0)
df$foldx_outcome = ifelse(df$ddg < 0, "Stabilising", "Destabilising") # df$foldx_outcome = ifelse(df$ddg < 0, "Stabilising", "Destabilising")
c2 = table(df$ddg < 0) # c2 = table(df$ddg < 0)
#
if ( all(c1 == c2) ){ # if ( all(c1 == c2) ){
cat("\nPASS: foldx outcome successfully created") # cat("\nPASS: foldx outcome successfully created")
}else{ # }else{
cat("\nFAIL: foldx outcome could not be created. Aborting!\n") # cat("\nFAIL: foldx outcome could not be created. Aborting!\n")
exit() # exit()
} # }
#------------------------------ #------------------------------
# renaming foldx column from # renaming foldx column from
# "ddg" --> "ddg_foldx" # "ddg" --> "ddg_foldx"
#------------------------------ #------------------------------
# change name to foldx # # change name to foldx
colnames(df)[n] <- "ddg_foldx" # colnames(df)[n] <- "ddg_foldx"
#================================== #==================================
# extract unique mutation entries # extract unique mutation entries

View file

@ -97,33 +97,33 @@ merged_df3_comp = all_plot_dfs[[4]]
# adding deepddg scaled values # adding deepddg scaled values
# scale data b/w -1 and 1 # scale data b/w -1 and 1
#============================ #============================
n = which(colnames(merged_df3) == "deepddg"); n # n = which(colnames(merged_df3) == "deepddg"); n
#
my_min = min(merged_df3[,n]); my_min # my_min = min(merged_df3[,n]); my_min
my_max = max(merged_df3[,n]); my_max # my_max = max(merged_df3[,n]); my_max
#
merged_df3$deepddg_scaled = ifelse(merged_df3[,n] < 0 # merged_df3$deepddg_scaled = ifelse(merged_df3[,n] < 0
, merged_df3[,n]/abs(my_min) # , merged_df3[,n]/abs(my_min)
, merged_df3[,n]/my_max) # , merged_df3[,n]/my_max)
# sanity check # # sanity check
my_min = min(merged_df3$deepddg_scaled); my_min # my_min = min(merged_df3$deepddg_scaled); my_min
my_max = max(merged_df3$deepddg_scaled); my_max # my_max = max(merged_df3$deepddg_scaled); my_max
#
if (my_min == -1 && my_max == 1){ # if (my_min == -1 && my_max == 1){
cat("\nPASS: DeepDDG successfully scaled b/w -1 and 1" # cat("\nPASS: DeepDDG successfully scaled b/w -1 and 1"
#, "\nProceeding with assigning deep outcome category") # #, "\nProceeding with assigning deep outcome category")
, "\n") # , "\n")
}else{ # }else{
cat("\nFAIL: could not scale DeepDDG ddg values" # cat("\nFAIL: could not scale DeepDDG ddg values"
, "Aborting!") # , "Aborting!")
} # }
#
#################################################################### ####################################################################
# Data for combining other dfs # Data for combining other dfs
#################################################################### ####################################################################
source("other_dfs_data.R") #source("other_dfs_data.R")
#################################################################### ####################################################################
# Data for subcols barplot (~heatmap) # Data for subcols barplot (~heatmap)