LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/mcsm/step4_normalise.R
2020-01-08 16:15:33 +00:00

252 lines
6.1 KiB
R

# Show the directory the script was started from.
getwd()
# Move into the project's Results directory: the relative paths used further
# down ("../Scripts/..." and "../Data/...") are resolved against it.
# Keep exactly one setwd() line active -- the one matching the current machine.
#setwd("~/Documents/git/LSHTM_analysis/mcsm_complex1/Results") # work
setwd("~/git/LSHTM_analysis/mcsm_complex1/Results") # thinkpad
#setwd("/Users/tanu/git/LSHTM_analysis/mcsm_complex1/Results") # mac
# Confirm the working directory after the change.
getwd()
#=======================================================
#TASK:read cleaned data and perform rescaling
# of DUET stability scores
# of Pred affinity
#compare scaling methods with plots
#output normalised file
#=======================================================
####################
#### read file #####: this will be the output of my R script that cleans the data columns
####################
# Run the data-cleaning step; everything below works on the objects it creates.
source("../Scripts/step3c_data_cleaning.R")
##This will output two dataframes:
##data: unclean data: 335, 10
##df : cleaned df 335, 13
## `data` is not needed in the rest of this script, so drop it
rm(data)
# list the available columns (the rescaling below looks columns up by name)
colnames(df)
#===================
#3a: PredAffLog
#===================
# Rescales the PredAffLog scores by sign (negatives divided by |min|,
# positives divided by max), stores the result in df$ratioPredAff and plots
# the raw vs rescaled distributions.
# Side effects: df is re-ordered by PredAffLog and gains one column; the
# temporary variables created here are removed at the end of the section.

# column positions of the score and of its outcome label
n = which(colnames(df) == "PredAffLog"); n
group = which(colnames(df) == "Lig_outcome"); group

# Fail here with a readable message instead of part-way through the plots:
# the rescaling needs a numeric score and density() below rejects NA values.
if (length(n) != 1 || length(group) != 1) {
  stop("df must have exactly one 'PredAffLog' and one 'Lig_outcome' column")
}
if (!is.numeric(df[,n]) || anyNA(df[,n])) {
  stop("'PredAffLog' must be numeric and must not contain missing values")
}

#===================================================
# order according to PredAffLog values
#===================================================
# Ordering makes it easier to see the results of rescaling when debugging
head(df$PredAffLog)
#ORDER BY PredAff scores: negative values at the top and positive at the bottom
df = df[order(df$PredAffLog),]
head(df$PredAffLog)

#sanity checks
head(df[,n]) #all negatives
tail(df[,n]) #all positives

#sanity checks (the values noted in the comments were recorded on the original data)
mean(df[,n])
#-0.9526746
tapply(df[,n], df[,group], mean)
#Destabilizing Stabilizing
#-1.2112100 0.3926667

#===========================
# find the range of the data
#===========================
my_min = min(df[,n]); my_min #-3.948
my_max = max(df[,n]); my_max #2.23

#===============================================
# WITHIN GROUP rescaling 2: method "ratio"
# create column to store the rescaled values
# Rescaling separately (Less dangerous)
# =====> chosen one:as Nick prefers
#===============================================
# negatives -> value/|min| (lowest value becomes -1)
# positives -> value/max   (highest value becomes 1)
# zeros     -> 0, set explicitly so that a column whose maximum is 0 does not
#              produce 0/0 = NaN
df$ratioPredAff = ifelse(df[,n] < 0
                         , df[,n]/abs(my_min)
                         , ifelse(df[,n] > 0
                                  , df[,n]/my_max
                                  , 0)
                         )#335 14

#sanity checks
head(df$ratioPredAff)
tail(df$ratioPredAff)
min(df$ratioPredAff); max(df$ratioPredAff)

tapply(df$ratioPredAff, df$Lig_outcome, min)
#Destabilizing Stabilizing
#-1.000000000 0.005381166
tapply(df$ratioPredAff, df$Lig_outcome, max)
#Destabilizing Stabilizing
#-0.001266464 1.000000000

#counts by sign should be the same as the outcome table below (281 and 54)
sum(df$ratioPredAff < 0); sum(df$ratioPredAff > 0)
table(df$Lig_outcome)
#Destabilizing Stabilizing
#281 54

#===============================================
# Hist and density plots to compare the rescaling
# methods: Base R
#===============================================
#uncomment as necessary
my_title = "Ligand_stability"
#my_title = colnames(df[n])

# Set the margin on all sides and a 2x2 layout; keep the previous settings so
# they can be put back once the figure is complete
old_par = par(oma = c(3,2,3,0)
              , mar = c(1,3,5,2)
              , mfrow = c(2,2))

# top row: histograms
hist(df[,n]
     , xlab = ""
     , main = "Raw values"
     )
hist(df$ratioPredAff
     , xlab = ""
     , main = "ratio rescaling"
     )

# Plot density plots underneath
plot(density( df[,n] )
     , main = "Raw values"
     )
plot(density( df$ratioPredAff )
     , main = "ratio rescaling"
     )

# titles (written in the outer margin, shared by all four panels)
mtext(text = "Frequency"
      , side = 2
      , line = 0
      , outer = TRUE)
mtext(text = my_title
      , side = 3
      , line = 0
      , outer = TRUE)

# put the graphics settings back so this layout does not leak into later plots
par(old_par)

#clear variables
rm(my_min, my_max, my_title, n, group, old_par)
#===================
# 3b: DUET stability
#===================
# Rescales the DUET stability scores by sign (negatives divided by |min|,
# positives divided by max), stores the result in df$ratioDUET and plots the
# raw vs rescaled distributions.
# Side effects: df is re-ordered by DUETStability_Kcalpermol and gains one column.
dim(df) #335, 14

# column positions of the score and of its outcome label
n = which(colnames(df) == "DUETStability_Kcalpermol"); n # 10
group = which(colnames(df) == "DUET_outcome"); group #12

# Fail here with a readable message instead of part-way through the plots:
# the rescaling needs a numeric score and density() below rejects NA values.
if (length(n) != 1 || length(group) != 1) {
  stop("df must have exactly one 'DUETStability_Kcalpermol' and one 'DUET_outcome' column")
}
if (!is.numeric(df[,n]) || anyNA(df[,n])) {
  stop("'DUETStability_Kcalpermol' must be numeric and must not contain missing values")
}

#===================================================
# order according to DUET scores
#===================================================
# Ordering makes it easier to see the results of rescaling when debugging
head(df$DUETStability_Kcalpermol)
#ORDER BY DUET scores: negative values at the top and positive at the bottom
df = df[order(df$DUETStability_Kcalpermol),]

#sanity checks
head(df[,n]) #negatives
tail(df[,n]) #positives

#sanity checks (the values noted in the comments were recorded on the original data)
mean(df[,n])
#[1] -1.173316
tapply(df[,n], df[,group], mean)
#Destabilizing Stabilizing
#-1.4297257 0.3978723

#===============================================
# WITHIN GROUP rescaling 2: method "ratio"
# create column to store the rescaled values
# Rescaling separately (Less dangerous)
# =====> chosen one:as Nick prefers
#===============================================
#find range of your data
my_min = min(df[,n]); my_min #-3.87
my_max = max(df[,n]); my_max #1.689

# negatives -> value/|min| (lowest value becomes -1)
# positives -> value/max   (highest value becomes 1)
# zeros     -> 0, set explicitly so that a column whose maximum is 0 does not
#              produce 0/0 = NaN
df$ratioDUET = ifelse(df[,n] < 0
                      , df[,n]/abs(my_min)
                      , ifelse(df[,n] > 0
                               , df[,n]/my_max
                               , 0)
                      ) #335, 15

#sanity check
head(df$ratioDUET)
tail(df$ratioDUET)
min(df$ratioDUET); max(df$ratioDUET)

#sanity checks
tapply(df$ratioDUET, df$DUET_outcome, min)
#Destabilizing Stabilizing
#-1.00000000 0.01065719
tapply(df$ratioDUET, df$DUET_outcome, max)
#Destabilizing Stabilizing
#-0.003875969 1.000000000

#counts by sign should be the same as the outcome table below
#(table recorded on the original data: 288 and 47)
sum(df$ratioDUET < 0); sum(df$ratioDUET > 0)
table(df$DUET_outcome)
#Destabilizing Stabilizing
#288 47

#===============================================
# Hist and density plots to compare the rescaling
# methods: Base R
#===============================================
#uncomment as necessary
my_title = "DUET_stability"
#my_title = colnames(df[n])

# Set the margin on all sides and a 2x2 layout; keep the previous settings so
# they can be put back once the figure is complete
old_par = par(oma = c(3,2,3,0)
              , mar = c(1,3,5,2)
              , mfrow = c(2,2))

# top row: histograms
hist(df[,n]
     , xlab = ""
     , main = "Raw values"
     )
hist(df$ratioDUET
     , xlab = ""
     , main = "ratio rescaling"
     )

# Plot density plots underneath
plot(density( df[,n] )
     , main = "Raw values"
     )
plot(density( df$ratioDUET )
     , main = "ratio rescaling"
     )

# graph titles (written in the outer margin, shared by all four panels)
mtext(text = "Frequency"
      , side = 2
      , line = 0
      , outer = TRUE)
mtext(text = my_title
      , side = 3
      , line = 0
      , outer = TRUE)

# put the graphics settings back so this layout does not leak into later plots
par(old_par)
rm(old_par)
#===================
# write output as csv file
#===================
# Save df as it stands at this point; the path is relative to the working
# directory and row names are left out of the file.
write.csv(x = df
          , file = "../Data/mcsm_complex1_normalised.csv"
          , row.names = FALSE
          ) #335, 15