renamed files with shorter names and corrected paths for input and output files
This commit is contained in:
parent
4be0de97d7
commit
61fcd14b17
8 changed files with 244 additions and 164 deletions
|
@ -1,6 +1,9 @@
|
|||
# run step0-step3a for mcsm pipeline
|
||||
#!/bin/bash
|
||||
|
||||
# run all bash scripts for mcsm
|
||||
|
||||
#./step0_check_duplicate_SNPs.sh
|
||||
#./step1_mCSMLig_curl_submit_store_outputurl.sh
|
||||
./step2_mCSM_LIG_batch_outputurls_results.sh
|
||||
./step3a_mCSM_LIG_regex_output_formatting.sh
|
||||
#./step1_lig_output_urls.sh
|
||||
./step2_lig_results.sh
|
||||
./step3a_results_format_interim.sh
|
||||
|
||||
|
|
|
@ -24,7 +24,7 @@ infile_mut="/pnca_mis_SNPs_v2_unique.csv"
|
|||
infile_struc="/complex1_no_water.pdb"
|
||||
|
||||
outpath="${inpath}${processed_path}"
|
||||
outfile="/mCSM_lig_complex1_result_url.txt"
|
||||
outfile="/complex1_result_url.txt"
|
||||
|
||||
# create valid input and output filenames
|
||||
#filename="${HOME}/git/Data/pyrazinamide/input/processed/pnca_mis_SNPs_v2_unique.csv"
|
||||
|
@ -82,7 +82,7 @@ echo -e "${mutation} : processing entry ${COUNT}/$(wc -l < ${filename})..."
|
|||
|
||||
# create output file with the added number of muts from file
|
||||
# after much thought, bad idea as less generic!
|
||||
#echo -e "${host}${result_url}" >> ../Results/$(wc -l < ${filename})_mCSM_lig_complex1_result_url.txt
|
||||
#echo -e "${host}${result_url}" >> ../Results/$(wc -l < ${filename})_complex1_result_url.txt
|
||||
echo -e "${host}${result_url}" >> ${outfilename}
|
||||
#echo -n '.'
|
||||
done < "${filename}"
|
|
@ -35,10 +35,10 @@
|
|||
# specify variables for input and output paths and filenames
|
||||
inpath="${HOME}/git/Data/pyrazinamide/input"
|
||||
processed_path="/processed"
|
||||
infile="/mCSM_lig_complex1_result_url.txt"
|
||||
infile="/complex1_result_url.txt"
|
||||
|
||||
outpath="${inpath}${processed_path}"
|
||||
outfile="/mCSM_lig_complex1_output_MASTER.txt"
|
||||
outfile="/complex1_output_MASTER.txt"
|
||||
|
||||
# create valid input and output filenames
|
||||
filename="${inpath}${processed_path}${infile}"
|
|
@ -27,12 +27,11 @@ inpath="${HOME}/git/Data/pyrazinamide/input"
|
|||
processed_path="/processed"
|
||||
|
||||
# Create input file: copy and rename output file of step2
|
||||
oldfile="${inpath}${processed_path}/mCSM_lig_complex1_output_MASTER.txt"
|
||||
newfile="${inpath}${processed_path}/mCSM_lig_complex1_output_processed.txt"
|
||||
oldfile="${inpath}${processed_path}/complex1_output_MASTER.txt"
|
||||
newfile="${inpath}${processed_path}/complex1_output_processed.txt"
|
||||
cp $oldfile $newfile
|
||||
|
||||
#infile="../Results/336_mCSM_lig_complex1_output_processed.txt"
|
||||
infile="/mCSM_lig_complex1_output_processed.txt"
|
||||
infile="/complex1_output_processed.txt"
|
||||
filename="${inpath}${processed_path}${infile}"
|
||||
|
||||
echo Input filename is : ${filename}
|
|
@ -1,29 +0,0 @@
|
|||
#!/usr/bin/python
|
||||
import pandas as pd
|
||||
from collections import defaultdict
|
||||
|
||||
#file = r'../Results/322_mCSM_lig_complex1_output_processed.txt'
|
||||
|
||||
outCols=[
|
||||
'PredictedAffinityChange',
|
||||
'Mutationinformation',
|
||||
'Wild-type',
|
||||
'Position',
|
||||
'Mutant-type',
|
||||
'Chain',
|
||||
'LigandID',
|
||||
'Distancetoligand',
|
||||
'DUETstabilitychange'
|
||||
]
|
||||
|
||||
lines = [line.rstrip('\n') for line in open('../Results/336_mCSM_lig_complex1_output_processed.txt')]
|
||||
|
||||
outputs = defaultdict(list)
|
||||
|
||||
for item in lines:
|
||||
col, val = item.split(':')
|
||||
outputs[col].append(val)
|
||||
|
||||
dfOut=pd.DataFrame(outputs)
|
||||
|
||||
pd.DataFrame.to_csv(dfOut,'../Results/336_complex1_formatted_results.csv', columns=outCols)
|
61
mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_results_format_df.py
Executable file
61
mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_results_format_df.py
Executable file
|
@ -0,0 +1,61 @@
|
|||
#!/usr/bin/python
|
||||
|
||||
###################
|
||||
# load libraries
|
||||
import os, sys
|
||||
import pandas as pd
|
||||
from collections import defaultdict
|
||||
####################
|
||||
|
||||
#********************************************************************
|
||||
# TASK: Formatting results with nice colnames
|
||||
# step3a processed the mcsm results to remove all newlines and
|
||||
# brought data in a format where the delimiter ":" splits
|
||||
# data into a convenient format of "colname": "value".
|
||||
# this script formats the data and outputs a df with each row
|
||||
# as a mutation and its corresponding mcsm_values
|
||||
|
||||
# Requirements:
|
||||
# input: output of step3a, file containing "..._output_processed.txt"
|
||||
# path: "Data/<drug>/input/processed/<filename>"
|
||||
# output: formatted .csv file
|
||||
# path: "Data/<drug>/input/processed/<filename>"
|
||||
#***********************************************************************
|
||||
# specify variables for input and output paths and filenames
|
||||
homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
|
||||
|
||||
basedir = "/git/Data/pyrazinamide/input"
|
||||
inpath = "/processed"
|
||||
in_filename = "/complex1_output_processed.txt"
|
||||
infile = homedir + basedir + inpath + in_filename
|
||||
print("Input file is:", infile)
|
||||
|
||||
outpath = "/processed"
|
||||
out_filename = "/complex1_formatted_results.csv"
|
||||
outfile = homedir + basedir + outpath + out_filename
|
||||
print("Output file is:", outfile)
|
||||
# end of variable assignment for input and output files
|
||||
|
||||
outCols=[
|
||||
'PredictedAffinityChange',
|
||||
'Mutationinformation',
|
||||
'Wild-type',
|
||||
'Position',
|
||||
'Mutant-type',
|
||||
'Chain',
|
||||
'LigandID',
|
||||
'Distancetoligand',
|
||||
'DUETstabilitychange'
|
||||
]
|
||||
|
||||
lines = [line.rstrip('\n') for line in open(infile)]
|
||||
|
||||
outputs = defaultdict(list)
|
||||
|
||||
for item in lines:
|
||||
col, val = item.split(':')
|
||||
outputs[col].append(val)
|
||||
|
||||
dfOut=pd.DataFrame(outputs)
|
||||
|
||||
pd.DataFrame.to_csv(dfOut, outfile, columns=outCols)
|
|
@ -1,22 +1,42 @@
|
|||
getwd()
|
||||
#setwd("~/Documents/git/LSHTM_analysis/mcsm_complex1/Results") # work
|
||||
setwd("~/git/LSHTM_analysis/mcsm_complex1/Results") # thinkpad
|
||||
#setwd("/Users/tanu/git/LSHTM_analysis/mcsm_complex1/Results") # mac
|
||||
#setwd("~/git/LSHTM_analysis/mcsm_complex1/Results")
|
||||
getwd()
|
||||
|
||||
#=======================================================
|
||||
# TASK: read formatted_results_df.csv to complete
|
||||
# missing info, adding DUET categories, assigning
|
||||
# meaningful colnames, etc.
|
||||
|
||||
# Requirements:
|
||||
# input: output of step3b, python processing,
|
||||
# path: Data/<drug>/input/processed/<filename>"
|
||||
# output: NO output as the next scripts refers to this
|
||||
# for yet more processing
|
||||
#=======================================================
|
||||
|
||||
# specify variables for input and output paths and filenames
|
||||
homedir = "~"
|
||||
basedir = "/git/Data/pyrazinamide/input"
|
||||
inpath = "/processed"
|
||||
in_filename = "/complex1_formatted_results.csv"
|
||||
infile = paste0(homedir, basedir, inpath, in_filename)
|
||||
print(paste0("Input file is:", infile))
|
||||
|
||||
#======================================================
|
||||
#TASK: To tidy the columns so you can generate figures
|
||||
#=======================================================
|
||||
####################
|
||||
#### read file #####: this will be the output from python script (csv file)
|
||||
####################
|
||||
data = read.csv("336_complex1_formatted_results.csv"
|
||||
data = read.csv(infile
|
||||
, header = T
|
||||
, stringsAsFactors = FALSE)
|
||||
dim(data)
|
||||
#335, 10
|
||||
str(data)
|
||||
|
||||
# clear variables
|
||||
rm(homedir, basedir, inpath, in_filename, infile)
|
||||
|
||||
###########################
|
||||
##### Data processing #####
|
||||
###########################
|
||||
|
@ -31,41 +51,38 @@ data$Mutationinformation = paste0(data$Wild.type, data$Position, data$Mutant.typ
|
|||
head(data$Mutationinformation)
|
||||
tail(data$Mutationinformation)
|
||||
#write.csv(data, 'test.csv')
|
||||
|
||||
##########################################
|
||||
# Remove duplicate SNPs as a sanity check
|
||||
##########################################
|
||||
#very important
|
||||
# very important
|
||||
table(duplicated(data$Mutationinformation))
|
||||
#FALSE
|
||||
#335
|
||||
|
||||
#extract duplicated entries
|
||||
# extract duplicated entries
|
||||
dups = data[duplicated(data$Mutationinformation),] #0
|
||||
|
||||
#No of dups should match with the no. of TRUE in the above table
|
||||
# No of dups should match with the no. of TRUE in the above table
|
||||
#u_dups = unique(dups$Mutationinformation) #10
|
||||
sum( table(dups$Mutationinformation) ) #13
|
||||
|
||||
rm(dups)
|
||||
sum( table(dups$Mutationinformation) )
|
||||
|
||||
#***************************************************************
|
||||
#select non-duplicated SNPs and create a new df
|
||||
df = data[!duplicated(data$Mutationinformation),] #309, 10
|
||||
# select non-duplicated SNPs and create a new df
|
||||
df = data[!duplicated(data$Mutationinformation),]
|
||||
#***************************************************************
|
||||
#sanity check
|
||||
# sanity check
|
||||
u = unique(df$Mutationinformation)
|
||||
u2 = unique(data$Mutationinformation)
|
||||
table(u%in%u2)
|
||||
#TRUE
|
||||
#309
|
||||
#should all be 1, hence 309 1's
|
||||
|
||||
# should all be 1
|
||||
sum(table(df$Mutationinformation) == 1)
|
||||
|
||||
#sort df by Position
|
||||
#MANUAL CHECKPOINT:
|
||||
# sort df by Position
|
||||
# MANUAL CHECKPOINT:
|
||||
#foo <- df[order(df$Position),]
|
||||
#df <- df[order(df$Position),]
|
||||
|
||||
# clear variables
|
||||
rm(u, u2, dups)
|
||||
|
||||
####################
|
||||
|
@ -75,26 +92,28 @@ rm(u, u2, dups)
|
|||
#=======
|
||||
#STEP 1
|
||||
#========
|
||||
#make a copy of the PredictedAffinityColumn and call it Lig_outcome
|
||||
df$Lig_outcome = df$PredictedAffinityChange #335, 11
|
||||
# make a copy of the PredictedAffinityColumn and call it Lig_outcome
|
||||
df$Lig_outcome = df$PredictedAffinityChange
|
||||
|
||||
#make Predicted...column numeric and outcome column categorical
|
||||
#make Predicted...column numeric and outcome column categorical
|
||||
head(df$PredictedAffinityChange)
|
||||
df$PredictedAffinityChange = gsub("log.*"
|
||||
, ""
|
||||
, df$PredictedAffinityChange)
|
||||
|
||||
#sanity checks
|
||||
# sanity checks
|
||||
head(df$PredictedAffinityChange)
|
||||
|
||||
#should be numeric, check and if not make it numeric
|
||||
is.numeric( df$PredictedAffinityChange )
|
||||
#change to numeric
|
||||
df$PredictedAffinityChange = as.numeric(df$PredictedAffinityChange)
|
||||
#should be TRUE
|
||||
# should be numeric, check and if not make it numeric
|
||||
is.numeric( df$PredictedAffinityChange )
|
||||
|
||||
#change the column name to indicate units
|
||||
# change to numeric
|
||||
df$PredictedAffinityChange = as.numeric(df$PredictedAffinityChange)
|
||||
|
||||
# should be TRUE
|
||||
is.numeric( df$PredictedAffinityChange )
|
||||
|
||||
# change the column name to indicate units
|
||||
n = which(colnames(df) == "PredictedAffinityChange"); n
|
||||
colnames(df)[n] = "PredAffLog"
|
||||
colnames(df)[n]
|
||||
|
@ -102,38 +121,44 @@ colnames(df)[n]
|
|||
#========
|
||||
#STEP 2
|
||||
#========
|
||||
#make Lig_outcome column categorical showing effect of mutation
|
||||
# make Lig_outcome column categorical showing effect of mutation
|
||||
head(df$Lig_outcome)
|
||||
df$Lig_outcome = gsub("^.*-"
|
||||
, "",
|
||||
df$Lig_outcome)
|
||||
#sanity checks
|
||||
# sanity checks
|
||||
head(df$Lig_outcome)
|
||||
#should be factor, check and if not change it to factor
|
||||
|
||||
# should be factor, check and if not change it to factor
|
||||
is.factor(df$Lig_outcome)
|
||||
#change to factor
|
||||
|
||||
# change to factor
|
||||
df$Lig_outcome = as.factor(df$Lig_outcome)
|
||||
#should be TRUE
|
||||
|
||||
# should be TRUE
|
||||
is.factor(df$Lig_outcome)
|
||||
|
||||
#========
|
||||
#STEP 3
|
||||
#========
|
||||
#gsub
|
||||
# gsub
|
||||
head(df$Distancetoligand)
|
||||
df$Distancetoligand = gsub("Å"
|
||||
, ""
|
||||
, df$Distancetoligand)
|
||||
#sanity checks
|
||||
# sanity checks
|
||||
head(df$Distancetoligand)
|
||||
#should be numeric, check if not change it to numeric
|
||||
is.numeric(df$Distancetoligand)
|
||||
#change to numeric
|
||||
df$Distancetoligand = as.numeric(df$Distancetoligand)
|
||||
#should be TRUE
|
||||
|
||||
# should be numeric, check if not change it to numeric
|
||||
is.numeric(df$Distancetoligand)
|
||||
|
||||
#change the column name to indicate units
|
||||
# change to numeric
|
||||
df$Distancetoligand = as.numeric(df$Distancetoligand)
|
||||
|
||||
# should be TRUE
|
||||
is.numeric(df$Distancetoligand)
|
||||
|
||||
# change the column name to indicate units
|
||||
n = which(colnames(df) == "Distancetoligand")
|
||||
colnames(df)[n] <- "Dis_lig_Ang"
|
||||
colnames(df)[n]
|
||||
|
@ -146,16 +171,19 @@ head(df$DUETstabilitychange)
|
|||
df$DUETstabilitychange = gsub("Kcal/mol"
|
||||
, ""
|
||||
, df$DUETstabilitychange)
|
||||
#sanity checks
|
||||
# sanity checks
|
||||
head(df$DUETstabilitychange)
|
||||
#should be numeric, check if not change it to numeric
|
||||
is.numeric(df$DUETstabilitychange)
|
||||
#change to numeric
|
||||
df$DUETstabilitychange = as.numeric(df$DUETstabilitychange)
|
||||
#should be TRUE
|
||||
|
||||
# should be numeric, check if not change it to numeric
|
||||
is.numeric(df$DUETstabilitychange)
|
||||
|
||||
#change the column name to indicate units
|
||||
# change to numeric
|
||||
df$DUETstabilitychange = as.numeric(df$DUETstabilitychange)
|
||||
|
||||
# should be TRUE
|
||||
is.numeric(df$DUETstabilitychange)
|
||||
|
||||
# change the column name to indicate units
|
||||
n = which(colnames(df) == "DUETstabilitychange"); n
|
||||
colnames(df)[n] = "DUETStability_Kcalpermol"
|
||||
colnames(df)[n]
|
||||
|
@ -163,25 +191,20 @@ colnames(df)[n]
|
|||
#========
|
||||
#STEP 5
|
||||
#========
|
||||
#create yet another extra column: classification of DUET stability only
|
||||
# create yet another extra column: classification of DUET stability only
|
||||
df$DUET_outcome = ifelse(df$DUETStability_Kcalpermol >=0
|
||||
, "Stabilizing"
|
||||
, "Destabilizing") #335, 12
|
||||
, "Destabilizing") # spelling to be consistent with mcsm
|
||||
|
||||
table(df$Lig_outcome)
|
||||
#Destabilizing Stabilizing
|
||||
#281 54
|
||||
|
||||
table(df$DUET_outcome)
|
||||
#Destabilizing Stabilizing
|
||||
#288 47
|
||||
|
||||
#==============================
|
||||
#FIXME
|
||||
#Insert a venn diagram
|
||||
|
||||
#================================
|
||||
|
||||
|
||||
#========
|
||||
#STEP 6
|
||||
#========
|
||||
|
@ -198,10 +221,10 @@ colnames(df[mut])
|
|||
#========
|
||||
#STEP 7
|
||||
#========
|
||||
#create an extra column: maybe useful for some plots
|
||||
df$WildPos = paste0(df$Wild_type, df$Position) #335, 13
|
||||
# create an extra column: maybe useful for some plots
|
||||
df$WildPos = paste0(df$Wild_type, df$Position)
|
||||
|
||||
#clear variables
|
||||
# clear variables
|
||||
rm(n, wt, mut)
|
||||
|
||||
################ end of data cleaning
|
|
@ -1,25 +1,47 @@
|
|||
getwd()
|
||||
#setwd("~/Documents/git/LSHTM_analysis/mcsm_complex1/Results") # work
|
||||
setwd("~/git/LSHTM_analysis/mcsm_complex1/Results") # thinkpad
|
||||
#setwd("/Users/tanu/git/LSHTM_analysis/mcsm_complex1/Results") # mac
|
||||
##################
|
||||
# load libraries
|
||||
library(compare)
|
||||
##################
|
||||
|
||||
getwd()
|
||||
|
||||
#=======================================================
|
||||
#TASK:read cleaned data and perform rescaling
|
||||
# TASK:read cleaned data and perform rescaling
|
||||
# of DUET stability scores
|
||||
# of Pred affinity
|
||||
#compare scaling methods with plots
|
||||
#output normalised file
|
||||
# compare scaling methods with plots
|
||||
|
||||
# Requirements:
|
||||
# input: R script, step3c_results_cleaning.R
|
||||
# path: Data/<drug>/input/processed/<filename>"
|
||||
# output: NO output as the next scripts refers to this
|
||||
# for yet more processing
|
||||
# output normalised file
|
||||
#=======================================================
|
||||
|
||||
# specify variables for input and output paths and filenames
|
||||
homedir = "~"
|
||||
currdir = "/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/mcsm"
|
||||
in_filename = "/step3c_results_cleaning.R"
|
||||
infile = paste0(homedir, currdir, in_filename)
|
||||
print(paste0("Input file is:", infile))
|
||||
|
||||
# output file
|
||||
basedir = "/git/Data/pyrazinamide/input"
|
||||
outpath = "/processed"
|
||||
out_filename = "/mcsm_complex1_normalised.csv"
|
||||
outfile = paste0(homedir, basedir, outpath, out_filename)
|
||||
print(paste0("Output file is:", outfile))
|
||||
|
||||
####################
|
||||
#### read file #####: this will be the output of my R script that cleans the data columns
|
||||
####################
|
||||
source("../Scripts/step3c_data_cleaning.R")
|
||||
##This will outut two dataframes:
|
||||
##data: unclean data: 335, 10
|
||||
##df : cleaned df 335, 13
|
||||
## you can remove data if you want as you will not need it
|
||||
source(infile)
|
||||
|
||||
#This will outut two dataframes:
|
||||
# data: unclean data: 10 cols
|
||||
# df : cleaned df: 13 cols
|
||||
# you can remove data if you want as you will not need it
|
||||
rm(data)
|
||||
|
||||
colnames(df)
|
||||
|
@ -36,67 +58,60 @@ group = which(colnames(df) == "Lig_outcome"); group
|
|||
# This is because this makes it easier to see the results of rescaling for debugging
|
||||
head(df$PredAffLog)
|
||||
|
||||
#ORDER BY PredAff scrores: negative values at the top and positive at the bottoom
|
||||
# ORDER BY PredAff scrores: negative values at the top and positive at the bottoom
|
||||
df = df[order(df$PredAffLog),]
|
||||
head(df$PredAffLog)
|
||||
|
||||
#sanity checks
|
||||
head(df[,n]) #all negatives
|
||||
tail(df[,n]) #all positives
|
||||
# sanity checks
|
||||
head(df[,n]) # all negatives
|
||||
tail(df[,n]) # all positives
|
||||
|
||||
#sanity checks
|
||||
# sanity checks
|
||||
mean(df[,n])
|
||||
#-0.9526746
|
||||
|
||||
tapply(df[,n], df[,group], mean)
|
||||
#Destabilizing Stabilizing
|
||||
#-1.2112100 0.3926667
|
||||
|
||||
#===========================
|
||||
#Same as above: in 2 steps
|
||||
# Same as above: in 2 steps
|
||||
#===========================
|
||||
|
||||
#find range of your data
|
||||
my_min = min(df[,n]); my_min #-3.948
|
||||
my_max = max(df[,n]); my_max #2.23
|
||||
# find range of your data
|
||||
my_min = min(df[,n]); my_min #
|
||||
my_max = max(df[,n]); my_max #
|
||||
|
||||
#===============================================
|
||||
# WITHIN GROUP rescaling 2: method "ratio"
|
||||
# create column to store the rescaled values
|
||||
# Rescaling separately (Less dangerous)
|
||||
# =====> chosen one:as Nick prefers
|
||||
# =====> chosen one: preserves sign
|
||||
#===============================================
|
||||
df$ratioPredAff = ifelse(df[,n] < 0
|
||||
, df[,n]/abs(my_min)
|
||||
, df[,n]/my_max
|
||||
)#335 14
|
||||
#sanity checks
|
||||
)# 14 cols
|
||||
# sanity checks
|
||||
head(df$ratioPredAff)
|
||||
tail(df$ratioPredAff)
|
||||
|
||||
min(df$ratioPredAff); max(df$ratioPredAff)
|
||||
|
||||
tapply(df$ratioPredAff, df$Lig_outcome, min)
|
||||
#Destabilizing Stabilizing
|
||||
#-1.000000000 0.005381166
|
||||
|
||||
tapply(df$ratioPredAff, df$Lig_outcome, max)
|
||||
#Destabilizing Stabilizing
|
||||
#-0.001266464 1.000000000
|
||||
|
||||
#should be the same as below (281 and 54)
|
||||
# should be the same as below
|
||||
sum(df$ratioPredAff < 0); sum(df$ratioPredAff > 0)
|
||||
|
||||
table(df$Lig_outcome)
|
||||
#Destabilizing Stabilizing
|
||||
#281 54
|
||||
|
||||
#===============================================
|
||||
# Hist and density plots to compare the rescaling
|
||||
# methods: Base R
|
||||
#===============================================
|
||||
#uncomment as necessary
|
||||
# uncomment as necessary
|
||||
my_title = "Ligand_stability"
|
||||
#my_title = colnames(df[n])
|
||||
# my_title = colnames(df[n])
|
||||
|
||||
# Set the margin on all sides
|
||||
par(oma = c(3,2,3,0)
|
||||
|
@ -140,7 +155,7 @@ rm(my_min, my_max, my_title, n, group)
|
|||
#===================
|
||||
# 3b: DUET stability
|
||||
#===================
|
||||
dim(df) #335, 14
|
||||
dim(df) # 14 cols
|
||||
|
||||
n = which(colnames(df) == "DUETStability_Kcalpermol"); n # 10
|
||||
group = which(colnames(df) == "DUET_outcome"); group #12
|
||||
|
@ -151,63 +166,53 @@ group = which(colnames(df) == "DUET_outcome"); group #12
|
|||
# This is because this makes it easier to see the results of rescaling for debugging
|
||||
head(df$DUETStability_Kcalpermol)
|
||||
|
||||
#ORDER BY DUET scores: negative values at the top and positive at the bottom
|
||||
# ORDER BY DUET scores: negative values at the top and positive at the bottom
|
||||
df = df[order(df$DUETStability_Kcalpermol),]
|
||||
|
||||
#sanity checks
|
||||
head(df[,n]) #negatives
|
||||
tail(df[,n]) #positives
|
||||
# sanity checks
|
||||
head(df[,n]) # negatives
|
||||
tail(df[,n]) # positives
|
||||
|
||||
#sanity checks
|
||||
# sanity checks
|
||||
mean(df[,n])
|
||||
#[1] -1.173316
|
||||
|
||||
tapply(df[,n], df[,group], mean)
|
||||
#Destabilizing Stabilizing
|
||||
#-1.4297257 0.3978723
|
||||
|
||||
#===============================================
|
||||
# WITHIN GROUP rescaling 2: method "ratio"
|
||||
# create column to store the rescaled values
|
||||
# Rescaling separately (Less dangerous)
|
||||
# =====> chosen one:as Nick prefers
|
||||
# =====> chosen one: preserves sign
|
||||
#===============================================
|
||||
#find range of your data
|
||||
my_min = min(df[,n]); my_min #-3.87
|
||||
my_max = max(df[,n]); my_max #1.689
|
||||
# find range of your data
|
||||
my_min = min(df[,n]); my_min
|
||||
my_max = max(df[,n]); my_max
|
||||
|
||||
df$ratioDUET = ifelse(df[,n] < 0
|
||||
, df[,n]/abs(my_min)
|
||||
, df[,n]/my_max
|
||||
) #335, 15
|
||||
#sanity check
|
||||
) # 15 cols
|
||||
# sanity check
|
||||
head(df$ratioDUET)
|
||||
tail(df$ratioDUET)
|
||||
|
||||
min(df$ratioDUET); max(df$ratioDUET)
|
||||
|
||||
#sanity checks
|
||||
# sanity checks
|
||||
tapply(df$ratioDUET, df$DUET_outcome, min)
|
||||
#Destabilizing Stabilizing
|
||||
#-1.00000000 0.01065719
|
||||
|
||||
tapply(df$ratioDUET, df$DUET_outcome, max)
|
||||
#Destabilizing Stabilizing
|
||||
#-0.003875969 1.000000000
|
||||
|
||||
#should be the same as below (267 and 42)
|
||||
# should be the same as below (267 and 42)
|
||||
sum(df$ratioDUET < 0); sum(df$ratioDUET > 0)
|
||||
|
||||
table(df$DUET_outcome)
|
||||
#Destabilizing Stabilizing
|
||||
#288 47
|
||||
|
||||
#===============================================
|
||||
# Hist and density plots to compare the rescaling
|
||||
# methods: Base R
|
||||
#===============================================
|
||||
#uncomment as necessary
|
||||
|
||||
# uncomment as necessary
|
||||
my_title = "DUET_stability"
|
||||
#my_title = colnames(df[n])
|
||||
|
||||
|
@ -246,7 +251,25 @@ mtext(text = my_title
|
|||
, line = 0
|
||||
, outer = TRUE)
|
||||
|
||||
# reorder by column name
|
||||
#data <- data[c("A", "B", "C")]
|
||||
colnames(df)
|
||||
df2 = df[c("X", "Mutationinformation", "WildPos", "Position"
|
||||
, "Wild_type", "Mutant_type"
|
||||
, "DUETStability_Kcalpermol", "DUET_outcome"
|
||||
, "Dis_lig_Ang", "PredAffLog", "Lig_outcome"
|
||||
, "ratioDUET", "ratioPredAff"
|
||||
, "LigandID","Chain")]
|
||||
|
||||
# sanity check
|
||||
# should be True
|
||||
#compare(df, df2, allowAll = T)
|
||||
compare(df, df2, ignoreColOrder = T)
|
||||
#TRUE
|
||||
#reordered columns
|
||||
|
||||
#===================
|
||||
# write output as csv file
|
||||
#===================
|
||||
write.csv(df, "../Data/mcsm_complex1_normalised.csv", row.names = FALSE) #335, 15
|
||||
#write.csv(df, "../Data/mcsm_complex1_normalised.csv", row.names = FALSE)
|
||||
write.csv(df2, outfile, row.names = FALSE)
|
Loading…
Add table
Add a link
Reference in a new issue