import commit
This commit is contained in:
commit
bccfe68192
39 changed files with 6837 additions and 0 deletions
25
mcsm_analysis/pyrazinamide/scripts/mcsm/step0_check_duplicate_SNPs.sh
Executable file
25
mcsm_analysis/pyrazinamide/scripts/mcsm/step0_check_duplicate_SNPs.sh
Executable file
|
@ -0,0 +1,25 @@
|
|||
#!/bin/bash
#*************************************
# need to be in the correct directory
#*************************************
##: comments for code
#: commented out code

#**********************************************************************
# TASK: Text file containing a list of SNPs; SNP in the format (C2E)
# per line. Sort by unique, which automatically removes duplicates.
# Save the file in the current directory.
#**********************************************************************
infile="${HOME}/git/Data/input/processed/pyrazinamide/pnca_mis_SNPs_v2.csv"
outfile="${HOME}/git/Data/input/processed/pyrazinamide/pnca_mis_SNPs_v2_unique.csv"

# sort unique entries and output to current directory
# NOTE: expansions are quoted so the script survives spaces in ${HOME}
sort -u "${infile}" > "${outfile}"

# count no. of unique snps mCSM will run on
count=$(wc -l < "${outfile}")

# print to console no. of unique snps mCSM will run on
echo "${count} unique mutations for mCSM to run on"
|
@ -0,0 +1,72 @@
|
|||
#!/bin/bash
#*************************************
# need to be in the correct directory
#*************************************
##: comments for code
#: commented out code

#**********************************************************************
# TASK: submit requests using curl: HANDLE redirects and refresh url.
# Iterate over mutation file and write/append result urls to a file
# result url file: stored in the /Results directory
# mutation file: one mutation per line, no chain ID
# output: in a file, should be n urls (n = no. of mutations in file)
# NOTE: these are just result urls, not actual values for results
#**********************************************************************
## iterate over mutation file; line by line and submit query using curl
filename="../Data/pnca_mis_SNPs_v2_unique.csv"

# total number of mutations: computed once and reused in the progress
# messages and the output file name (avoids re-running wc per iteration)
total=$(wc -l < "${filename}")

## some useful messages
echo -n -e "Processing ${total} entries from ${filename}\n"
COUNT=0
while read -r line; do
    ((COUNT++))
    mutation="${line}"
    # echo "${mutation}"
    pdb='../Data/complex1_no_water.pdb'
    chain="A"
    lig_id="PZA"
    affin_wt="0.99"
    host="http://biosig.unimelb.edu.au"
    call_url="/mcsm_lig/prediction"

    ##=========================================
    ##html field_names required for curl
    ##complex_field:wild=@
    ##mutation_field:mutation=@
    ##chain_field:chain=@
    ##ligand_field:lig_id@
    ##energy_field:affin_wt
    #=========================================
    # submit the form; -L follows redirects, grep keeps only the
    # meta-refresh line that carries the result url
    refresh_url=$(curl -L \
        -sS \
        -F "wild=@${pdb}" \
        -F "mutation=${mutation}" \
        -F "chain=${chain}" \
        -F "lig_id=${lig_id}" \
        -F "affin_wt=${affin_wt}" \
        "${host}${call_url}" | grep "http-equiv")

    #echo $refresh_url
    #echo ${host}${refresh_url}

    # use regex to extract the relevant bit from the refresh url
    # regex: sed -r 's/.*(\/mcsm.*)".*$/\1/g'

    # Now build: result url using host and refresh url and write the
    # urls to a file in the Results dir
    result_url=$(echo "${refresh_url}" | sed -r 's/.*(\/mcsm.*)".*$/\1/g')
    # be polite to the server between submissions
    sleep 10

    echo -e "${mutation} : processing entry ${COUNT}/${total}..."

    echo -e "${host}${result_url}" >> "../Results/${total}_mCSM_lig_complex1_result_url.txt"
    #echo -n '.'
done < "${filename}"

echo
echo "Processing Complete"

##end of submitting query, receiving result url and storing results url in a file
|
@ -0,0 +1,59 @@
|
|||
#!/bin/bash
#*************************************
# need to be in the correct directory
#*************************************
##: comments for code
#: commented out code

#********************************************************************
# TASK: submit result urls and fetch actual results using curl
# iterate over each result url from the output of step1 stored
# in a file in /Results.
# Use curl to fetch results and extract relevant sections using hxtools
# and store these in another file in /Results
# This script takes two arguments:
# input file: file containing results url
#   In this case: 336_mCSM_lig_complex1_result_url.txt
# output file: name of the file where extracted results will be stored
#   In this case: 336_mCSM_lig_complex1_output_MASTER.txt
#*********************************************************************

#if [ "$#" -ne 2 ]; then
#if [ -Z $1 ]; then
#    echo "
#    Please provide both Input and Output files.

#    Usage: batch_read_urls.sh INFILE OUTFILE
#    "
#    exit 1
#fi

# First argument: Input File
# Second argument: Output File
#infile=$1
#outfile=$2

infile="${HOME}/git/LSHTM_analysis/mcsm_complex1/Results/336_mCSM_lig_complex1_result_url.txt"
outfile="${HOME}/git/LSHTM_analysis/mcsm_complex1/Results/336_mCSM_lig_complex1_output_MASTER.txt"

# number of urls to process: computed once for the progress messages
total=$(wc -l < "${infile}")

echo -n "Processing ${total} entries from ${infile}"
echo
COUNT=0
while read -r line; do
    #COUNT=$(($COUNT+1))
    ((COUNT++))
    # fetch the result page, normalise the html, keep only the text of
    # the relevant <div>s, strip remaining tags and whitespace runs
    curl --silent "${line}" \
        | hxnormalize -x \
        | hxselect -c div.span4 \
        | hxselect -c div.well \
        | sed -r -e 's/<[^>]*>//g' \
        | sed -re 's/ +//g' \
        >> "${outfile}"
    #| tee -a ${outfile}
    # echo -n '.'
    echo -e "Processing entry ${COUNT}/${total}..."

done < "${infile}"

echo
echo "Processing Complete"
|
@ -0,0 +1,52 @@
|
|||
#!/bin/bash
#*************************************
# need to be in the correct directory
#*************************************
##: comments for code
#: commented out code

#********************************************************************
# TASK: Intermediate results processing
# output file has a convenient delimiter of ":" that can be used to
# format the file into two columns (col1: field_desc and col2: values)
# However the sections "PredictedAffinityChange:...." and
# "DUETstabilitychange:.." are split over multiple lines and
# prevent this from happening. Additionally there are other empty lines
# that need to be omitted. This script joins those sections back onto a
# single line and drops the blank lines.
#*********************************************************************

# Optional first argument overrides the default input file;
# with no argument the original hard-coded path is used unchanged.
infile="${1:-../Results/336_mCSM_lig_complex1_output_processed.txt}"

#sed -i '/PredictedAffinityChange:/ { N; N; N; N; s/\n//g;}' ${infile} \
#    | sed -i '/DUETstabilitychange:/ {x; N; N; s/\n//g; p;d;}' ${infile}

# Outputs records separated by a newline, that look something like this:
# PredictedAffinityChange:-2.2log(affinityfoldchange)-Destabilizing
# Mutationinformation:
# Wild-type:L
# Position:4
# Mutant-type:W
# Chain:A
# LigandID:PZA
# Distancetoligand:15.911Å
# DUETstabilitychange:-2.169Kcal/mol
#
# PredictedAffinityChange:-1.538log(affinityfoldchange)-Destabilizing
# (...etc)

# This script brings everything into a convenient format for further
# processing in python.
# NB: this replaces the file in place (sed -i), so make sure you retain
# a copy for your records.
# Each N appends the next input line; s/\n//g then fuses them into one.
sed -i '/PredictedAffinityChange/ {
N
N
N
N
s/\n//g
}
/DUETstabilitychange:/ {
N
N
s/\n//g
}
/^$/d' "${infile}"
|
29
mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_format_results.py
Executable file
29
mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_format_results.py
Executable file
|
@ -0,0 +1,29 @@
|
|||
#!/usr/bin/python
"""Reshape the colon-delimited mCSM-lig records (output of step3a) into a
CSV with one column per field, one row per mutation."""
import pandas as pd
from collections import defaultdict

#file = r'../Results/322_mCSM_lig_complex1_output_processed.txt'

# column order for the output csv
outCols = [
    'PredictedAffinityChange',
    'Mutationinformation',
    'Wild-type',
    'Position',
    'Mutant-type',
    'Chain',
    'LigandID',
    'Distancetoligand',
    'DUETstabilitychange'
]


def parse_records(lines):
    """Split each 'field:value' line on the FIRST colon only and group the
    values by field name.

    Splitting on the first colon (rather than every colon) keeps values
    that themselves contain ':' intact instead of raising ValueError.

    Args:
        lines: iterable of 'field:value' strings.
    Returns:
        defaultdict(list) mapping field name -> list of values, in input order.
    """
    outputs = defaultdict(list)
    for item in lines:
        col, _, val = item.partition(':')
        outputs[col].append(val)
    return outputs


def main():
    # 'with' guarantees the input file is closed even on error
    with open('../Results/336_mCSM_lig_complex1_output_processed.txt') as fh:
        lines = [line.rstrip('\n') for line in fh]

    dfOut = pd.DataFrame(parse_records(lines))

    dfOut.to_csv('../Results/336_complex1_formatted_results.csv', columns=outCols)


if __name__ == '__main__':
    main()
|
207
mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_data_cleaning.R
Normal file
207
mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_data_cleaning.R
Normal file
|
@ -0,0 +1,207 @@
|
|||
getwd()
#setwd("~/Documents/git/LSHTM_analysis/mcsm_complex1/Results") # work
setwd("~/git/LSHTM_analysis/mcsm_complex1/Results") # thinkpad
#setwd("/Users/tanu/git/LSHTM_analysis/mcsm_complex1/Results") # mac
getwd()

#=======================================================
#TASK: To tidy the columns so you can generate figures
#=======================================================
####################
#### read file #####: this will be the output from python script (csv file)
####################
data = read.csv("336_complex1_formatted_results.csv"
                , header = T
                , stringsAsFactors = FALSE)
dim(data)
#335, 10
str(data)

###########################
##### Data processing #####
###########################

# populate mutation information columns as currently it is empty
head(data$Mutationinformation)
tail(data$Mutationinformation)

# should not be blank: create mutation information
data$Mutationinformation = paste0(data$Wild.type, data$Position, data$Mutant.type)

head(data$Mutationinformation)
tail(data$Mutationinformation)
#write.csv(data, 'test.csv')
##########################################
# Remove duplicate SNPs as a sanity check
##########################################
#very important
table(duplicated(data$Mutationinformation))
#FALSE
#335

#extract duplicated entries
dups = data[duplicated(data$Mutationinformation),] #0

#No of dups should match with the no. of TRUE in the above table
#u_dups = unique(dups$Mutationinformation) #10
sum( table(dups$Mutationinformation) ) #13

rm(dups)

#***************************************************************
#select non-duplicated SNPs and create a new df
df = data[!duplicated(data$Mutationinformation),] #309, 10
#***************************************************************
#sanity check
u = unique(df$Mutationinformation)
u2 = unique(data$Mutationinformation)
table(u%in%u2)
#TRUE
#309
#should all be 1, hence 309 1's
sum(table(df$Mutationinformation) == 1)

#sort df by Position
#MANUAL CHECKPOINT:
#foo <- df[order(df$Position),]
#df <- df[order(df$Position),]

# NOTE: dups was already removed above, so it is not listed here again
rm(u, u2)

####################
#### give meaningful colnames to reflect units to enable correct data type
####################

#=======
#STEP 1
#========
#make a copy of the PredictedAffinityColumn and call it Lig_outcome
df$Lig_outcome = df$PredictedAffinityChange #335, 11

#make Predicted...column numeric and outcome column categorical
head(df$PredictedAffinityChange)
df$PredictedAffinityChange = gsub("log.*"
                                  , ""
                                  , df$PredictedAffinityChange)

#sanity checks
head(df$PredictedAffinityChange)

#should be numeric, check and if not make it numeric
is.numeric( df$PredictedAffinityChange )
#change to numeric
df$PredictedAffinityChange = as.numeric(df$PredictedAffinityChange)
#should be TRUE
is.numeric( df$PredictedAffinityChange )

#change the column name to indicate units
n = which(colnames(df) == "PredictedAffinityChange"); n
colnames(df)[n] = "PredAffLog"
colnames(df)[n]

#========
#STEP 2
#========
#make Lig_outcome column categorical showing effect of mutation
head(df$Lig_outcome)
df$Lig_outcome = gsub("^.*-"
                      , "",
                      df$Lig_outcome)
#sanity checks
head(df$Lig_outcome)
#should be factor, check and if not change it to factor
is.factor(df$Lig_outcome)
#change to factor
df$Lig_outcome = as.factor(df$Lig_outcome)
#should be TRUE
is.factor(df$Lig_outcome)

#========
#STEP 3
#========
#strip the Angstrom unit so the distance column can be numeric
head(df$Distancetoligand)
df$Distancetoligand = gsub("Å"
                           , ""
                           , df$Distancetoligand)
#sanity checks
head(df$Distancetoligand)
#should be numeric, check if not change it to numeric
is.numeric(df$Distancetoligand)
#change to numeric
df$Distancetoligand = as.numeric(df$Distancetoligand)
#should be TRUE
is.numeric(df$Distancetoligand)

#change the column name to indicate units
n = which(colnames(df) == "Distancetoligand")
colnames(df)[n] <- "Dis_lig_Ang"
colnames(df)[n]

#========
#STEP 4
#========
#strip the Kcal/mol unit so the stability column can be numeric
head(df$DUETstabilitychange)
df$DUETstabilitychange = gsub("Kcal/mol"
                              , ""
                              , df$DUETstabilitychange)
#sanity checks
head(df$DUETstabilitychange)
#should be numeric, check if not change it to numeric
is.numeric(df$DUETstabilitychange)
#change to numeric
df$DUETstabilitychange = as.numeric(df$DUETstabilitychange)
#should be TRUE
is.numeric(df$DUETstabilitychange)

#change the column name to indicate units
n = which(colnames(df) == "DUETstabilitychange"); n
colnames(df)[n] = "DUETStability_Kcalpermol"
colnames(df)[n]

#========
#STEP 5
#========
#create yet another extra column: classification of DUET stability only
df$DUET_outcome = ifelse(df$DUETStability_Kcalpermol >=0
                         , "Stabilizing"
                         , "Destabilizing") #335, 12

table(df$Lig_outcome)
#Destabilizing Stabilizing
#281 54

table(df$DUET_outcome)
#Destabilizing Stabilizing
#288 47
#==============================
#FIXME
#Insert a venn diagram
#================================


#========
#STEP 6
#========
# assign wild and mutant colnames correctly

wt = which(colnames(df) == "Wild.type"); wt
colnames(df)[wt] <- "Wild_type"
colnames(df[wt])

mut = which(colnames(df) == "Mutant.type"); mut
colnames(df)[mut] <- "Mutant_type"
colnames(df[mut])

#========
#STEP 7
#========
#create an extra column: maybe useful for some plots
df$WildPos = paste0(df$Wild_type, df$Position) #335, 13

#clear variables
rm(n, wt, mut)

################ end of data cleaning
|
252
mcsm_analysis/pyrazinamide/scripts/mcsm/step4_normalise.R
Normal file
252
mcsm_analysis/pyrazinamide/scripts/mcsm/step4_normalise.R
Normal file
|
@ -0,0 +1,252 @@
|
|||
getwd()
#setwd("~/Documents/git/LSHTM_analysis/mcsm_complex1/Results") # work
setwd("~/git/LSHTM_analysis/mcsm_complex1/Results") # thinkpad
#setwd("/Users/tanu/git/LSHTM_analysis/mcsm_complex1/Results") # mac
getwd()

#=======================================================
#TASK: read cleaned data and perform rescaling
# of DUET stability scores
# of Pred affinity
#compare scaling methods with plots
#output normalised file
#=======================================================

####################
#### read file #####: this will be the output of my R script that cleans the data columns
####################
source("../Scripts/step3c_data_cleaning.R")
##This will output two dataframes:
##data: unclean data: 335, 10
##df : cleaned df 335, 13
## you can remove data if you want as you will not need it
rm(data)

colnames(df)

#===============================================
# WITHIN GROUP rescaling: method "ratio"
# Rescaling separately (Less dangerous)
# =====> chosen one: as Nick prefers
# negatives are divided by |min|, non-negatives by max,
# mapping the column onto [-1, 1].
# Defined once and reused for both PredAffLog and DUET below.
#===============================================
ratio_rescale = function(x) {
  ifelse(x < 0
         , x/abs(min(x))
         , x/max(x)
  )
}

#===================
#3a: PredAffLog
#===================
n = which(colnames(df) == "PredAffLog"); n
group = which(colnames(df) == "Lig_outcome"); group

#===================================================
# order according to PredAffLog values
#===================================================
# This is because this makes it easier to see the results of rescaling for debugging
head(df$PredAffLog)

#ORDER BY PredAff scores: negative values at the top and positive at the bottom
df = df[order(df$PredAffLog),]
head(df$PredAffLog)

#sanity checks
head(df[,n]) #all negatives
tail(df[,n]) #all positives

#sanity checks
mean(df[,n])
#-0.9526746

tapply(df[,n], df[,group], mean)
#Destabilizing Stabilizing
#-1.2112100 0.3926667
#===========================
#Same as above: in 2 steps
#===========================

#find range of your data (for display; ratio_rescale recomputes these)
my_min = min(df[,n]); my_min #-3.948
my_max = max(df[,n]); my_max #2.23

# create column to store the rescaled values
df$ratioPredAff = ratio_rescale(df[,n]) #335 14
#sanity checks
head(df$ratioPredAff)
tail(df$ratioPredAff)

min(df$ratioPredAff); max(df$ratioPredAff)

tapply(df$ratioPredAff, df$Lig_outcome, min)
#Destabilizing Stabilizing
#-1.000000000 0.005381166

tapply(df$ratioPredAff, df$Lig_outcome, max)
#Destabilizing Stabilizing
#-0.001266464 1.000000000

#should be the same as below (281 and 54)
sum(df$ratioPredAff < 0); sum(df$ratioPredAff > 0)

table(df$Lig_outcome)
#Destabilizing Stabilizing
#281 54

#===============================================
# Hist and density plots to compare the rescaling
# methods: Base R
#===============================================
#uncomment as necessary
my_title = "Ligand_stability"
#my_title = colnames(df[n])

# Set the margin on all sides
par(oma = c(3,2,3,0)
    , mar = c(1,3,5,2)
    , mfrow = c(2,2))

hist(df[,n]
     , xlab = ""
     , main = "Raw values"
)

hist(df$ratioPredAff
     , xlab = ""
     , main = "ratio rescaling"
)

# Plot density plots underneath
plot(density( df[,n] )
     , main = "Raw values"
)

plot(density( df$ratioPredAff )
     , main = "ratio rescaling"
)

# titles
mtext(text = "Frequency"
      , side = 2
      , line = 0
      , outer = TRUE)

mtext(text = my_title
      , side = 3
      , line = 0
      , outer = TRUE)


#clear variables
rm(my_min, my_max, my_title, n, group)

#===================
# 3b: DUET stability
#===================
dim(df) #335, 14

n = which(colnames(df) == "DUETStability_Kcalpermol"); n # 10
group = which(colnames(df) == "DUET_outcome"); group #12

#===================================================
# order according to DUET scores
#===================================================
# This is because this makes it easier to see the results of rescaling for debugging
head(df$DUETStability_Kcalpermol)

#ORDER BY DUET scores: negative values at the top and positive at the bottom
df = df[order(df$DUETStability_Kcalpermol),]

#sanity checks
head(df[,n]) #negatives
tail(df[,n]) #positives

#sanity checks
mean(df[,n])
#[1] -1.173316

tapply(df[,n], df[,group], mean)
#Destabilizing Stabilizing
#-1.4297257 0.3978723

#find range of your data (for display; ratio_rescale recomputes these)
my_min = min(df[,n]); my_min #-3.87
my_max = max(df[,n]); my_max #1.689

# same within-group "ratio" rescaling as for PredAffLog above
df$ratioDUET = ratio_rescale(df[,n]) #335, 15
#sanity check
head(df$ratioDUET)
tail(df$ratioDUET)

min(df$ratioDUET); max(df$ratioDUET)

#sanity checks
tapply(df$ratioDUET, df$DUET_outcome, min)
#Destabilizing Stabilizing
#-1.00000000 0.01065719

tapply(df$ratioDUET, df$DUET_outcome, max)
#Destabilizing Stabilizing
#-0.003875969 1.000000000

#should be the same as below (288 and 47)
sum(df$ratioDUET < 0); sum(df$ratioDUET > 0)

table(df$DUET_outcome)
#Destabilizing Stabilizing
#288 47

#===============================================
# Hist and density plots to compare the rescaling
# methods: Base R
#===============================================
#uncomment as necessary

my_title = "DUET_stability"
#my_title = colnames(df[n])

# Set the margin on all sides
par(oma = c(3,2,3,0)
    , mar = c(1,3,5,2)
    , mfrow = c(2,2))

hist(df[,n]
     , xlab = ""
     , main = "Raw values"
)

hist(df$ratioDUET
     , xlab = ""
     , main = "ratio rescaling"
)

# Plot density plots underneath
plot(density( df[,n] )
     , main = "Raw values"
)

plot(density( df$ratioDUET )
     , main = "ratio rescaling"
)

# graph titles
mtext(text = "Frequency"
      , side = 2
      , line = 0
      , outer = TRUE)

mtext(text = my_title
      , side = 3
      , line = 0
      , outer = TRUE)

#===================
# write output as csv file
#===================
write.csv(df, "../Data/mcsm_complex1_normalised.csv", row.names = FALSE) #335, 15
|
Loading…
Add table
Add a link
Reference in a new issue