renamed files with shorter names and corrected paths for input and output files
This commit is contained in:
parent
4be0de97d7
commit
61fcd14b17
8 changed files with 244 additions and 164 deletions
|
@ -1,6 +1,9 @@
|
||||||
# run step0-step3a for mcsm pipeline
|
#!/bin/bash
|
||||||
|
|
||||||
|
# run all bash scripts for mcsm
|
||||||
|
|
||||||
#./step0_check_duplicate_SNPs.sh
|
#./step0_check_duplicate_SNPs.sh
|
||||||
#./step1_mCSMLig_curl_submit_store_outputurl.sh
|
#./step1_lig_output_urls.sh
|
||||||
./step2_mCSM_LIG_batch_outputurls_results.sh
|
./step2_lig_results.sh
|
||||||
./step3a_mCSM_LIG_regex_output_formatting.sh
|
./step3a_results_format_interim.sh
|
||||||
|
|
||||||
|
|
|
@ -24,7 +24,7 @@ infile_mut="/pnca_mis_SNPs_v2_unique.csv"
|
||||||
infile_struc="/complex1_no_water.pdb"
|
infile_struc="/complex1_no_water.pdb"
|
||||||
|
|
||||||
outpath="${inpath}${processed_path}"
|
outpath="${inpath}${processed_path}"
|
||||||
outfile="/mCSM_lig_complex1_result_url.txt"
|
outfile="/complex1_result_url.txt"
|
||||||
|
|
||||||
# create valid input and output filenames
|
# create valid input and output filenames
|
||||||
#filename="${HOME}/git/Data/pyrazinamide/input/processed/pnca_mis_SNPs_v2_unique.csv"
|
#filename="${HOME}/git/Data/pyrazinamide/input/processed/pnca_mis_SNPs_v2_unique.csv"
|
||||||
|
@ -82,7 +82,7 @@ echo -e "${mutation} : processing entry ${COUNT}/$(wc -l < ${filename})..."
|
||||||
|
|
||||||
# create output file with the added number of muts from file
|
# create output file with the added number of muts from file
|
||||||
# after much thought, bad idea as less generic!
|
# after much thought, bad idea as less generic!
|
||||||
#echo -e "${host}${result_url}" >> ../Results/$(wc -l < ${filename})_mCSM_lig_complex1_result_url.txt
|
#echo -e "${host}${result_url}" >> ../Results/$(wc -l < ${filename})_complex1_result_url.txt
|
||||||
echo -e "${host}${result_url}" >> ${outfilename}
|
echo -e "${host}${result_url}" >> ${outfilename}
|
||||||
#echo -n '.'
|
#echo -n '.'
|
||||||
done < "${filename}"
|
done < "${filename}"
|
|
@ -35,10 +35,10 @@
|
||||||
# specify variables for input and output paths and filenames
|
# specify variables for input and output paths and filenames
|
||||||
inpath="${HOME}/git/Data/pyrazinamide/input"
|
inpath="${HOME}/git/Data/pyrazinamide/input"
|
||||||
processed_path="/processed"
|
processed_path="/processed"
|
||||||
infile="/mCSM_lig_complex1_result_url.txt"
|
infile="/complex1_result_url.txt"
|
||||||
|
|
||||||
outpath="${inpath}${processed_path}"
|
outpath="${inpath}${processed_path}"
|
||||||
outfile="/mCSM_lig_complex1_output_MASTER.txt"
|
outfile="/complex1_output_MASTER.txt"
|
||||||
|
|
||||||
# create valid input and output filenames
|
# create valid input and output filenames
|
||||||
filename="${inpath}${processed_path}${infile}"
|
filename="${inpath}${processed_path}${infile}"
|
|
@ -27,12 +27,11 @@ inpath="${HOME}/git/Data/pyrazinamide/input"
|
||||||
processed_path="/processed"
|
processed_path="/processed"
|
||||||
|
|
||||||
# Create input file: copy and rename output file of step2
|
# Create input file: copy and rename output file of step2
|
||||||
oldfile="${inpath}${processed_path}/mCSM_lig_complex1_output_MASTER.txt"
|
oldfile="${inpath}${processed_path}/complex1_output_MASTER.txt"
|
||||||
newfile="${inpath}${processed_path}/mCSM_lig_complex1_output_processed.txt"
|
newfile="${inpath}${processed_path}/complex1_output_processed.txt"
|
||||||
cp $oldfile $newfile
|
cp $oldfile $newfile
|
||||||
|
|
||||||
#infile="../Results/336_mCSM_lig_complex1_output_processed.txt"
|
infile="/complex1_output_processed.txt"
|
||||||
infile="/mCSM_lig_complex1_output_processed.txt"
|
|
||||||
filename="${inpath}${processed_path}${infile}"
|
filename="${inpath}${processed_path}${infile}"
|
||||||
|
|
||||||
echo Input filename is : ${filename}
|
echo Input filename is : ${filename}
|
|
@ -1,29 +0,0 @@
|
||||||
#!/usr/bin/python
|
|
||||||
import pandas as pd
|
|
||||||
from collections import defaultdict
|
|
||||||
|
|
||||||
#file = r'../Results/322_mCSM_lig_complex1_output_processed.txt'
|
|
||||||
|
|
||||||
outCols=[
|
|
||||||
'PredictedAffinityChange',
|
|
||||||
'Mutationinformation',
|
|
||||||
'Wild-type',
|
|
||||||
'Position',
|
|
||||||
'Mutant-type',
|
|
||||||
'Chain',
|
|
||||||
'LigandID',
|
|
||||||
'Distancetoligand',
|
|
||||||
'DUETstabilitychange'
|
|
||||||
]
|
|
||||||
|
|
||||||
lines = [line.rstrip('\n') for line in open('../Results/336_mCSM_lig_complex1_output_processed.txt')]
|
|
||||||
|
|
||||||
outputs = defaultdict(list)
|
|
||||||
|
|
||||||
for item in lines:
|
|
||||||
col, val = item.split(':')
|
|
||||||
outputs[col].append(val)
|
|
||||||
|
|
||||||
dfOut=pd.DataFrame(outputs)
|
|
||||||
|
|
||||||
pd.DataFrame.to_csv(dfOut,'../Results/336_complex1_formatted_results.csv', columns=outCols)
|
|
61
mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_results_format_df.py
Executable file
61
mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_results_format_df.py
Executable file
|
@ -0,0 +1,61 @@
|
||||||
|
#!/usr/bin/python
|
||||||
|
|
||||||
|
###################
|
||||||
|
# load libraries
|
||||||
|
import os, sys
|
||||||
|
import pandas as pd
|
||||||
|
from collections import defaultdict
|
||||||
|
####################
|
||||||
|
|
||||||
|
#********************************************************************
|
||||||
|
# TASK: Formatting results with nice colnames
|
||||||
|
# step3a processed the mcsm results to remove all newlines and
|
||||||
|
# brought data in a format where the delimiter ":" splits
|
||||||
|
# data into a convenient format of "colname": "value".
|
||||||
|
# this script formats the data and outputs a df with each row
|
||||||
|
# as a mutation and its corresponding mcsm_values
|
||||||
|
|
||||||
|
# Requirements:
|
||||||
|
# input: output of step3a, file containing "..._output_processed.txt"
|
||||||
|
# path: "Data/<drug>/input/processed/<filename>"
|
||||||
|
# output: formatted .csv file
|
||||||
|
# path: "Data/<drug>/input/processed/<filename>"
|
||||||
|
#***********************************************************************
|
||||||
|
# specify variables for input and output paths and filenames
|
||||||
|
homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
|
||||||
|
|
||||||
|
basedir = "/git/Data/pyrazinamide/input"
|
||||||
|
inpath = "/processed"
|
||||||
|
in_filename = "/complex1_output_processed.txt"
|
||||||
|
infile = homedir + basedir + inpath + in_filename
|
||||||
|
print("Input file is:", infile)
|
||||||
|
|
||||||
|
outpath = "/processed"
|
||||||
|
out_filename = "/complex1_formatted_results.csv"
|
||||||
|
outfile = homedir + basedir + outpath + out_filename
|
||||||
|
print("Output file is:", outfile)
|
||||||
|
# end of variable assignment for input and output files
|
||||||
|
|
||||||
|
outCols=[
|
||||||
|
'PredictedAffinityChange',
|
||||||
|
'Mutationinformation',
|
||||||
|
'Wild-type',
|
||||||
|
'Position',
|
||||||
|
'Mutant-type',
|
||||||
|
'Chain',
|
||||||
|
'LigandID',
|
||||||
|
'Distancetoligand',
|
||||||
|
'DUETstabilitychange'
|
||||||
|
]
|
||||||
|
|
||||||
|
lines = [line.rstrip('\n') for line in open(infile)]
|
||||||
|
|
||||||
|
outputs = defaultdict(list)
|
||||||
|
|
||||||
|
for item in lines:
|
||||||
|
col, val = item.split(':')
|
||||||
|
outputs[col].append(val)
|
||||||
|
|
||||||
|
dfOut=pd.DataFrame(outputs)
|
||||||
|
|
||||||
|
pd.DataFrame.to_csv(dfOut, outfile, columns=outCols)
|
|
@ -1,22 +1,42 @@
|
||||||
getwd()
|
getwd()
|
||||||
#setwd("~/Documents/git/LSHTM_analysis/mcsm_complex1/Results") # work
|
#setwd("~/git/LSHTM_analysis/mcsm_complex1/Results")
|
||||||
setwd("~/git/LSHTM_analysis/mcsm_complex1/Results") # thinkpad
|
|
||||||
#setwd("/Users/tanu/git/LSHTM_analysis/mcsm_complex1/Results") # mac
|
|
||||||
getwd()
|
getwd()
|
||||||
|
|
||||||
#=======================================================
|
#=======================================================
|
||||||
|
# TASK: read formatted_results_df.csv to complete
|
||||||
|
# missing info, adding DUET categories, assigning
|
||||||
|
# meaningful colnames, etc.
|
||||||
|
|
||||||
|
# Requirements:
|
||||||
|
# input: output of step3b, python processing,
|
||||||
|
# path: Data/<drug>/input/processed/<filename>"
|
||||||
|
# output: NO output as the next scripts refers to this
|
||||||
|
# for yet more processing
|
||||||
|
#=======================================================
|
||||||
|
|
||||||
|
# specify variables for input and output paths and filenames
|
||||||
|
homedir = "~"
|
||||||
|
basedir = "/git/Data/pyrazinamide/input"
|
||||||
|
inpath = "/processed"
|
||||||
|
in_filename = "/complex1_formatted_results.csv"
|
||||||
|
infile = paste0(homedir, basedir, inpath, in_filename)
|
||||||
|
print(paste0("Input file is:", infile))
|
||||||
|
|
||||||
|
#======================================================
|
||||||
#TASK: To tidy the columns so you can generate figures
|
#TASK: To tidy the columns so you can generate figures
|
||||||
#=======================================================
|
#=======================================================
|
||||||
####################
|
####################
|
||||||
#### read file #####: this will be the output from python script (csv file)
|
#### read file #####: this will be the output from python script (csv file)
|
||||||
####################
|
####################
|
||||||
data = read.csv("336_complex1_formatted_results.csv"
|
data = read.csv(infile
|
||||||
, header = T
|
, header = T
|
||||||
, stringsAsFactors = FALSE)
|
, stringsAsFactors = FALSE)
|
||||||
dim(data)
|
dim(data)
|
||||||
#335, 10
|
|
||||||
str(data)
|
str(data)
|
||||||
|
|
||||||
|
# clear variables
|
||||||
|
rm(homedir, basedir, inpath, in_filename, infile)
|
||||||
|
|
||||||
###########################
|
###########################
|
||||||
##### Data processing #####
|
##### Data processing #####
|
||||||
###########################
|
###########################
|
||||||
|
@ -31,41 +51,38 @@ data$Mutationinformation = paste0(data$Wild.type, data$Position, data$Mutant.typ
|
||||||
head(data$Mutationinformation)
|
head(data$Mutationinformation)
|
||||||
tail(data$Mutationinformation)
|
tail(data$Mutationinformation)
|
||||||
#write.csv(data, 'test.csv')
|
#write.csv(data, 'test.csv')
|
||||||
|
|
||||||
##########################################
|
##########################################
|
||||||
# Remove duplicate SNPs as a sanity check
|
# Remove duplicate SNPs as a sanity check
|
||||||
##########################################
|
##########################################
|
||||||
#very important
|
# very important
|
||||||
table(duplicated(data$Mutationinformation))
|
table(duplicated(data$Mutationinformation))
|
||||||
#FALSE
|
|
||||||
#335
|
|
||||||
|
|
||||||
#extract duplicated entries
|
# extract duplicated entries
|
||||||
dups = data[duplicated(data$Mutationinformation),] #0
|
dups = data[duplicated(data$Mutationinformation),] #0
|
||||||
|
|
||||||
#No of dups should match with the no. of TRUE in the above table
|
# No of dups should match with the no. of TRUE in the above table
|
||||||
#u_dups = unique(dups$Mutationinformation) #10
|
#u_dups = unique(dups$Mutationinformation) #10
|
||||||
sum( table(dups$Mutationinformation) ) #13
|
sum( table(dups$Mutationinformation) )
|
||||||
|
|
||||||
rm(dups)
|
|
||||||
|
|
||||||
#***************************************************************
|
#***************************************************************
|
||||||
#select non-duplicated SNPs and create a new df
|
# select non-duplicated SNPs and create a new df
|
||||||
df = data[!duplicated(data$Mutationinformation),] #309, 10
|
df = data[!duplicated(data$Mutationinformation),]
|
||||||
#***************************************************************
|
#***************************************************************
|
||||||
#sanity check
|
# sanity check
|
||||||
u = unique(df$Mutationinformation)
|
u = unique(df$Mutationinformation)
|
||||||
u2 = unique(data$Mutationinformation)
|
u2 = unique(data$Mutationinformation)
|
||||||
table(u%in%u2)
|
table(u%in%u2)
|
||||||
#TRUE
|
|
||||||
#309
|
# should all be 1
|
||||||
#should all be 1, hence 309 1's
|
|
||||||
sum(table(df$Mutationinformation) == 1)
|
sum(table(df$Mutationinformation) == 1)
|
||||||
|
|
||||||
#sort df by Position
|
# sort df by Position
|
||||||
#MANUAL CHECKPOINT:
|
# MANUAL CHECKPOINT:
|
||||||
#foo <- df[order(df$Position),]
|
#foo <- df[order(df$Position),]
|
||||||
#df <- df[order(df$Position),]
|
#df <- df[order(df$Position),]
|
||||||
|
|
||||||
|
# clear variables
|
||||||
rm(u, u2, dups)
|
rm(u, u2, dups)
|
||||||
|
|
||||||
####################
|
####################
|
||||||
|
@ -75,26 +92,28 @@ rm(u, u2, dups)
|
||||||
#=======
|
#=======
|
||||||
#STEP 1
|
#STEP 1
|
||||||
#========
|
#========
|
||||||
#make a copy of the PredictedAffinityColumn and call it Lig_outcome
|
# make a copy of the PredictedAffinityColumn and call it Lig_outcome
|
||||||
df$Lig_outcome = df$PredictedAffinityChange #335, 11
|
df$Lig_outcome = df$PredictedAffinityChange
|
||||||
|
|
||||||
#make Predicted...column numeric and outcome column categorical
|
#make Predicted...column numeric and outcome column categorical
|
||||||
head(df$PredictedAffinityChange)
|
head(df$PredictedAffinityChange)
|
||||||
df$PredictedAffinityChange = gsub("log.*"
|
df$PredictedAffinityChange = gsub("log.*"
|
||||||
, ""
|
, ""
|
||||||
, df$PredictedAffinityChange)
|
, df$PredictedAffinityChange)
|
||||||
|
|
||||||
#sanity checks
|
# sanity checks
|
||||||
head(df$PredictedAffinityChange)
|
head(df$PredictedAffinityChange)
|
||||||
|
|
||||||
#should be numeric, check and if not make it numeric
|
# should be numeric, check and if not make it numeric
|
||||||
is.numeric( df$PredictedAffinityChange )
|
|
||||||
#change to numeric
|
|
||||||
df$PredictedAffinityChange = as.numeric(df$PredictedAffinityChange)
|
|
||||||
#should be TRUE
|
|
||||||
is.numeric( df$PredictedAffinityChange )
|
is.numeric( df$PredictedAffinityChange )
|
||||||
|
|
||||||
#change the column name to indicate units
|
# change to numeric
|
||||||
|
df$PredictedAffinityChange = as.numeric(df$PredictedAffinityChange)
|
||||||
|
|
||||||
|
# should be TRUE
|
||||||
|
is.numeric( df$PredictedAffinityChange )
|
||||||
|
|
||||||
|
# change the column name to indicate units
|
||||||
n = which(colnames(df) == "PredictedAffinityChange"); n
|
n = which(colnames(df) == "PredictedAffinityChange"); n
|
||||||
colnames(df)[n] = "PredAffLog"
|
colnames(df)[n] = "PredAffLog"
|
||||||
colnames(df)[n]
|
colnames(df)[n]
|
||||||
|
@ -102,38 +121,44 @@ colnames(df)[n]
|
||||||
#========
|
#========
|
||||||
#STEP 2
|
#STEP 2
|
||||||
#========
|
#========
|
||||||
#make Lig_outcome column categorical showing effect of mutation
|
# make Lig_outcome column categorical showing effect of mutation
|
||||||
head(df$Lig_outcome)
|
head(df$Lig_outcome)
|
||||||
df$Lig_outcome = gsub("^.*-"
|
df$Lig_outcome = gsub("^.*-"
|
||||||
, "",
|
, "",
|
||||||
df$Lig_outcome)
|
df$Lig_outcome)
|
||||||
#sanity checks
|
# sanity checks
|
||||||
head(df$Lig_outcome)
|
head(df$Lig_outcome)
|
||||||
#should be factor, check and if not change it to factor
|
|
||||||
|
# should be factor, check and if not change it to factor
|
||||||
is.factor(df$Lig_outcome)
|
is.factor(df$Lig_outcome)
|
||||||
#change to factor
|
|
||||||
|
# change to factor
|
||||||
df$Lig_outcome = as.factor(df$Lig_outcome)
|
df$Lig_outcome = as.factor(df$Lig_outcome)
|
||||||
#should be TRUE
|
|
||||||
|
# should be TRUE
|
||||||
is.factor(df$Lig_outcome)
|
is.factor(df$Lig_outcome)
|
||||||
|
|
||||||
#========
|
#========
|
||||||
#STEP 3
|
#STEP 3
|
||||||
#========
|
#========
|
||||||
#gsub
|
# gsub
|
||||||
head(df$Distancetoligand)
|
head(df$Distancetoligand)
|
||||||
df$Distancetoligand = gsub("Å"
|
df$Distancetoligand = gsub("Å"
|
||||||
, ""
|
, ""
|
||||||
, df$Distancetoligand)
|
, df$Distancetoligand)
|
||||||
#sanity checks
|
# sanity checks
|
||||||
head(df$Distancetoligand)
|
head(df$Distancetoligand)
|
||||||
#should be numeric, check if not change it to numeric
|
|
||||||
is.numeric(df$Distancetoligand)
|
# should be numeric, check if not change it to numeric
|
||||||
#change to numeric
|
|
||||||
df$Distancetoligand = as.numeric(df$Distancetoligand)
|
|
||||||
#should be TRUE
|
|
||||||
is.numeric(df$Distancetoligand)
|
is.numeric(df$Distancetoligand)
|
||||||
|
|
||||||
#change the column name to indicate units
|
# change to numeric
|
||||||
|
df$Distancetoligand = as.numeric(df$Distancetoligand)
|
||||||
|
|
||||||
|
# should be TRUE
|
||||||
|
is.numeric(df$Distancetoligand)
|
||||||
|
|
||||||
|
# change the column name to indicate units
|
||||||
n = which(colnames(df) == "Distancetoligand")
|
n = which(colnames(df) == "Distancetoligand")
|
||||||
colnames(df)[n] <- "Dis_lig_Ang"
|
colnames(df)[n] <- "Dis_lig_Ang"
|
||||||
colnames(df)[n]
|
colnames(df)[n]
|
||||||
|
@ -146,16 +171,19 @@ head(df$DUETstabilitychange)
|
||||||
df$DUETstabilitychange = gsub("Kcal/mol"
|
df$DUETstabilitychange = gsub("Kcal/mol"
|
||||||
, ""
|
, ""
|
||||||
, df$DUETstabilitychange)
|
, df$DUETstabilitychange)
|
||||||
#sanity checks
|
# sanity checks
|
||||||
head(df$DUETstabilitychange)
|
head(df$DUETstabilitychange)
|
||||||
#should be numeric, check if not change it to numeric
|
|
||||||
is.numeric(df$DUETstabilitychange)
|
# should be numeric, check if not change it to numeric
|
||||||
#change to numeric
|
|
||||||
df$DUETstabilitychange = as.numeric(df$DUETstabilitychange)
|
|
||||||
#should be TRUE
|
|
||||||
is.numeric(df$DUETstabilitychange)
|
is.numeric(df$DUETstabilitychange)
|
||||||
|
|
||||||
#change the column name to indicate units
|
# change to numeric
|
||||||
|
df$DUETstabilitychange = as.numeric(df$DUETstabilitychange)
|
||||||
|
|
||||||
|
# should be TRUE
|
||||||
|
is.numeric(df$DUETstabilitychange)
|
||||||
|
|
||||||
|
# change the column name to indicate units
|
||||||
n = which(colnames(df) == "DUETstabilitychange"); n
|
n = which(colnames(df) == "DUETstabilitychange"); n
|
||||||
colnames(df)[n] = "DUETStability_Kcalpermol"
|
colnames(df)[n] = "DUETStability_Kcalpermol"
|
||||||
colnames(df)[n]
|
colnames(df)[n]
|
||||||
|
@ -163,25 +191,20 @@ colnames(df)[n]
|
||||||
#========
|
#========
|
||||||
#STEP 5
|
#STEP 5
|
||||||
#========
|
#========
|
||||||
#create yet another extra column: classification of DUET stability only
|
# create yet another extra column: classification of DUET stability only
|
||||||
df$DUET_outcome = ifelse(df$DUETStability_Kcalpermol >=0
|
df$DUET_outcome = ifelse(df$DUETStability_Kcalpermol >=0
|
||||||
, "Stabilizing"
|
, "Stabilizing"
|
||||||
, "Destabilizing") #335, 12
|
, "Destabilizing") # spelling to be consistent with mcsm
|
||||||
|
|
||||||
table(df$Lig_outcome)
|
table(df$Lig_outcome)
|
||||||
#Destabilizing Stabilizing
|
|
||||||
#281 54
|
|
||||||
|
|
||||||
table(df$DUET_outcome)
|
table(df$DUET_outcome)
|
||||||
#Destabilizing Stabilizing
|
|
||||||
#288 47
|
|
||||||
#==============================
|
#==============================
|
||||||
#FIXME
|
#FIXME
|
||||||
#Insert a venn diagram
|
#Insert a venn diagram
|
||||||
|
|
||||||
#================================
|
#================================
|
||||||
|
|
||||||
|
|
||||||
#========
|
#========
|
||||||
#STEP 6
|
#STEP 6
|
||||||
#========
|
#========
|
||||||
|
@ -198,10 +221,10 @@ colnames(df[mut])
|
||||||
#========
|
#========
|
||||||
#STEP 7
|
#STEP 7
|
||||||
#========
|
#========
|
||||||
#create an extra column: maybe useful for some plots
|
# create an extra column: maybe useful for some plots
|
||||||
df$WildPos = paste0(df$Wild_type, df$Position) #335, 13
|
df$WildPos = paste0(df$Wild_type, df$Position)
|
||||||
|
|
||||||
#clear variables
|
# clear variables
|
||||||
rm(n, wt, mut)
|
rm(n, wt, mut)
|
||||||
|
|
||||||
################ end of data cleaning
|
################ end of data cleaning
|
|
@ -1,25 +1,47 @@
|
||||||
getwd()
|
##################
|
||||||
#setwd("~/Documents/git/LSHTM_analysis/mcsm_complex1/Results") # work
|
# load libraries
|
||||||
setwd("~/git/LSHTM_analysis/mcsm_complex1/Results") # thinkpad
|
library(compare)
|
||||||
#setwd("/Users/tanu/git/LSHTM_analysis/mcsm_complex1/Results") # mac
|
##################
|
||||||
|
|
||||||
getwd()
|
getwd()
|
||||||
|
|
||||||
#=======================================================
|
#=======================================================
|
||||||
#TASK:read cleaned data and perform rescaling
|
# TASK:read cleaned data and perform rescaling
|
||||||
# of DUET stability scores
|
# of DUET stability scores
|
||||||
# of Pred affinity
|
# of Pred affinity
|
||||||
#compare scaling methods with plots
|
# compare scaling methods with plots
|
||||||
#output normalised file
|
|
||||||
|
# Requirements:
|
||||||
|
# input: R script, step3c_results_cleaning.R
|
||||||
|
# path: Data/<drug>/input/processed/<filename>"
|
||||||
|
# output: NO output as the next scripts refers to this
|
||||||
|
# for yet more processing
|
||||||
|
# output normalised file
|
||||||
#=======================================================
|
#=======================================================
|
||||||
|
|
||||||
|
# specify variables for input and output paths and filenames
|
||||||
|
homedir = "~"
|
||||||
|
currdir = "/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/mcsm"
|
||||||
|
in_filename = "/step3c_results_cleaning.R"
|
||||||
|
infile = paste0(homedir, currdir, in_filename)
|
||||||
|
print(paste0("Input file is:", infile))
|
||||||
|
|
||||||
|
# output file
|
||||||
|
basedir = "/git/Data/pyrazinamide/input"
|
||||||
|
outpath = "/processed"
|
||||||
|
out_filename = "/mcsm_complex1_normalised.csv"
|
||||||
|
outfile = paste0(homedir, basedir, outpath, out_filename)
|
||||||
|
print(paste0("Output file is:", outfile))
|
||||||
|
|
||||||
####################
|
####################
|
||||||
#### read file #####: this will be the output of my R script that cleans the data columns
|
#### read file #####: this will be the output of my R script that cleans the data columns
|
||||||
####################
|
####################
|
||||||
source("../Scripts/step3c_data_cleaning.R")
|
source(infile)
|
||||||
##This will outut two dataframes:
|
|
||||||
##data: unclean data: 335, 10
|
#This will outut two dataframes:
|
||||||
##df : cleaned df 335, 13
|
# data: unclean data: 10 cols
|
||||||
## you can remove data if you want as you will not need it
|
# df : cleaned df: 13 cols
|
||||||
|
# you can remove data if you want as you will not need it
|
||||||
rm(data)
|
rm(data)
|
||||||
|
|
||||||
colnames(df)
|
colnames(df)
|
||||||
|
@ -36,67 +58,60 @@ group = which(colnames(df) == "Lig_outcome"); group
|
||||||
# This is because this makes it easier to see the results of rescaling for debugging
|
# This is because this makes it easier to see the results of rescaling for debugging
|
||||||
head(df$PredAffLog)
|
head(df$PredAffLog)
|
||||||
|
|
||||||
#ORDER BY PredAff scrores: negative values at the top and positive at the bottoom
|
# ORDER BY PredAff scrores: negative values at the top and positive at the bottoom
|
||||||
df = df[order(df$PredAffLog),]
|
df = df[order(df$PredAffLog),]
|
||||||
head(df$PredAffLog)
|
head(df$PredAffLog)
|
||||||
|
|
||||||
#sanity checks
|
# sanity checks
|
||||||
head(df[,n]) #all negatives
|
head(df[,n]) # all negatives
|
||||||
tail(df[,n]) #all positives
|
tail(df[,n]) # all positives
|
||||||
|
|
||||||
#sanity checks
|
# sanity checks
|
||||||
mean(df[,n])
|
mean(df[,n])
|
||||||
#-0.9526746
|
#-0.9526746
|
||||||
|
|
||||||
tapply(df[,n], df[,group], mean)
|
tapply(df[,n], df[,group], mean)
|
||||||
#Destabilizing Stabilizing
|
|
||||||
#-1.2112100 0.3926667
|
|
||||||
#===========================
|
#===========================
|
||||||
#Same as above: in 2 steps
|
# Same as above: in 2 steps
|
||||||
#===========================
|
#===========================
|
||||||
|
|
||||||
#find range of your data
|
# find range of your data
|
||||||
my_min = min(df[,n]); my_min #-3.948
|
my_min = min(df[,n]); my_min #
|
||||||
my_max = max(df[,n]); my_max #2.23
|
my_max = max(df[,n]); my_max #
|
||||||
|
|
||||||
#===============================================
|
#===============================================
|
||||||
# WITHIN GROUP rescaling 2: method "ratio"
|
# WITHIN GROUP rescaling 2: method "ratio"
|
||||||
# create column to store the rescaled values
|
# create column to store the rescaled values
|
||||||
# Rescaling separately (Less dangerous)
|
# Rescaling separately (Less dangerous)
|
||||||
# =====> chosen one:as Nick prefers
|
# =====> chosen one: preserves sign
|
||||||
#===============================================
|
#===============================================
|
||||||
df$ratioPredAff = ifelse(df[,n] < 0
|
df$ratioPredAff = ifelse(df[,n] < 0
|
||||||
, df[,n]/abs(my_min)
|
, df[,n]/abs(my_min)
|
||||||
, df[,n]/my_max
|
, df[,n]/my_max
|
||||||
)#335 14
|
)# 14 cols
|
||||||
#sanity checks
|
# sanity checks
|
||||||
head(df$ratioPredAff)
|
head(df$ratioPredAff)
|
||||||
tail(df$ratioPredAff)
|
tail(df$ratioPredAff)
|
||||||
|
|
||||||
min(df$ratioPredAff); max(df$ratioPredAff)
|
min(df$ratioPredAff); max(df$ratioPredAff)
|
||||||
|
|
||||||
tapply(df$ratioPredAff, df$Lig_outcome, min)
|
tapply(df$ratioPredAff, df$Lig_outcome, min)
|
||||||
#Destabilizing Stabilizing
|
|
||||||
#-1.000000000 0.005381166
|
|
||||||
|
|
||||||
tapply(df$ratioPredAff, df$Lig_outcome, max)
|
tapply(df$ratioPredAff, df$Lig_outcome, max)
|
||||||
#Destabilizing Stabilizing
|
|
||||||
#-0.001266464 1.000000000
|
|
||||||
|
|
||||||
#should be the same as below (281 and 54)
|
# should be the same as below
|
||||||
sum(df$ratioPredAff < 0); sum(df$ratioPredAff > 0)
|
sum(df$ratioPredAff < 0); sum(df$ratioPredAff > 0)
|
||||||
|
|
||||||
table(df$Lig_outcome)
|
table(df$Lig_outcome)
|
||||||
#Destabilizing Stabilizing
|
|
||||||
#281 54
|
|
||||||
|
|
||||||
#===============================================
|
#===============================================
|
||||||
# Hist and density plots to compare the rescaling
|
# Hist and density plots to compare the rescaling
|
||||||
# methods: Base R
|
# methods: Base R
|
||||||
#===============================================
|
#===============================================
|
||||||
#uncomment as necessary
|
# uncomment as necessary
|
||||||
my_title = "Ligand_stability"
|
my_title = "Ligand_stability"
|
||||||
#my_title = colnames(df[n])
|
# my_title = colnames(df[n])
|
||||||
|
|
||||||
# Set the margin on all sides
|
# Set the margin on all sides
|
||||||
par(oma = c(3,2,3,0)
|
par(oma = c(3,2,3,0)
|
||||||
|
@ -140,7 +155,7 @@ rm(my_min, my_max, my_title, n, group)
|
||||||
#===================
|
#===================
|
||||||
# 3b: DUET stability
|
# 3b: DUET stability
|
||||||
#===================
|
#===================
|
||||||
dim(df) #335, 14
|
dim(df) # 14 cols
|
||||||
|
|
||||||
n = which(colnames(df) == "DUETStability_Kcalpermol"); n # 10
|
n = which(colnames(df) == "DUETStability_Kcalpermol"); n # 10
|
||||||
group = which(colnames(df) == "DUET_outcome"); group #12
|
group = which(colnames(df) == "DUET_outcome"); group #12
|
||||||
|
@ -151,63 +166,53 @@ group = which(colnames(df) == "DUET_outcome"); group #12
|
||||||
# This is because this makes it easier to see the results of rescaling for debugging
|
# This is because this makes it easier to see the results of rescaling for debugging
|
||||||
head(df$DUETStability_Kcalpermol)
|
head(df$DUETStability_Kcalpermol)
|
||||||
|
|
||||||
#ORDER BY DUET scores: negative values at the top and positive at the bottom
|
# ORDER BY DUET scores: negative values at the top and positive at the bottom
|
||||||
df = df[order(df$DUETStability_Kcalpermol),]
|
df = df[order(df$DUETStability_Kcalpermol),]
|
||||||
|
|
||||||
#sanity checks
|
# sanity checks
|
||||||
head(df[,n]) #negatives
|
head(df[,n]) # negatives
|
||||||
tail(df[,n]) #positives
|
tail(df[,n]) # positives
|
||||||
|
|
||||||
#sanity checks
|
# sanity checks
|
||||||
mean(df[,n])
|
mean(df[,n])
|
||||||
#[1] -1.173316
|
|
||||||
|
|
||||||
tapply(df[,n], df[,group], mean)
|
tapply(df[,n], df[,group], mean)
|
||||||
#Destabilizing Stabilizing
|
|
||||||
#-1.4297257 0.3978723
|
|
||||||
|
|
||||||
#===============================================
|
#===============================================
|
||||||
# WITHIN GROUP rescaling 2: method "ratio"
|
# WITHIN GROUP rescaling 2: method "ratio"
|
||||||
# create column to store the rescaled values
|
# create column to store the rescaled values
|
||||||
# Rescaling separately (Less dangerous)
|
# Rescaling separately (Less dangerous)
|
||||||
# =====> chosen one:as Nick prefers
|
# =====> chosen one: preserves sign
|
||||||
#===============================================
|
#===============================================
|
||||||
#find range of your data
|
# find range of your data
|
||||||
my_min = min(df[,n]); my_min #-3.87
|
my_min = min(df[,n]); my_min
|
||||||
my_max = max(df[,n]); my_max #1.689
|
my_max = max(df[,n]); my_max
|
||||||
|
|
||||||
df$ratioDUET = ifelse(df[,n] < 0
|
df$ratioDUET = ifelse(df[,n] < 0
|
||||||
, df[,n]/abs(my_min)
|
, df[,n]/abs(my_min)
|
||||||
, df[,n]/my_max
|
, df[,n]/my_max
|
||||||
) #335, 15
|
) # 15 cols
|
||||||
#sanity check
|
# sanity check
|
||||||
head(df$ratioDUET)
|
head(df$ratioDUET)
|
||||||
tail(df$ratioDUET)
|
tail(df$ratioDUET)
|
||||||
|
|
||||||
min(df$ratioDUET); max(df$ratioDUET)
|
min(df$ratioDUET); max(df$ratioDUET)
|
||||||
|
|
||||||
#sanity checks
|
# sanity checks
|
||||||
tapply(df$ratioDUET, df$DUET_outcome, min)
|
tapply(df$ratioDUET, df$DUET_outcome, min)
|
||||||
#Destabilizing Stabilizing
|
|
||||||
#-1.00000000 0.01065719
|
|
||||||
|
|
||||||
tapply(df$ratioDUET, df$DUET_outcome, max)
|
tapply(df$ratioDUET, df$DUET_outcome, max)
|
||||||
#Destabilizing Stabilizing
|
|
||||||
#-0.003875969 1.000000000
|
|
||||||
|
|
||||||
#should be the same as below (267 and 42)
|
# should be the same as below (267 and 42)
|
||||||
sum(df$ratioDUET < 0); sum(df$ratioDUET > 0)
|
sum(df$ratioDUET < 0); sum(df$ratioDUET > 0)
|
||||||
|
|
||||||
table(df$DUET_outcome)
|
table(df$DUET_outcome)
|
||||||
#Destabilizing Stabilizing
|
|
||||||
#288 47
|
|
||||||
|
|
||||||
#===============================================
|
#===============================================
|
||||||
# Hist and density plots to compare the rescaling
|
# Hist and density plots to compare the rescaling
|
||||||
# methods: Base R
|
# methods: Base R
|
||||||
#===============================================
|
#===============================================
|
||||||
#uncomment as necessary
|
# uncomment as necessary
|
||||||
|
|
||||||
my_title = "DUET_stability"
|
my_title = "DUET_stability"
|
||||||
#my_title = colnames(df[n])
|
#my_title = colnames(df[n])
|
||||||
|
|
||||||
|
@ -246,7 +251,25 @@ mtext(text = my_title
|
||||||
, line = 0
|
, line = 0
|
||||||
, outer = TRUE)
|
, outer = TRUE)
|
||||||
|
|
||||||
|
# reorder by column name
|
||||||
|
#data <- data[c("A", "B", "C")]
|
||||||
|
colnames(df)
|
||||||
|
df2 = df[c("X", "Mutationinformation", "WildPos", "Position"
|
||||||
|
, "Wild_type", "Mutant_type"
|
||||||
|
, "DUETStability_Kcalpermol", "DUET_outcome"
|
||||||
|
, "Dis_lig_Ang", "PredAffLog", "Lig_outcome"
|
||||||
|
, "ratioDUET", "ratioPredAff"
|
||||||
|
, "LigandID","Chain")]
|
||||||
|
|
||||||
|
# sanity check
|
||||||
|
# should be True
|
||||||
|
#compare(df, df2, allowAll = T)
|
||||||
|
compare(df, df2, ignoreColOrder = T)
|
||||||
|
#TRUE
|
||||||
|
#reordered columns
|
||||||
|
|
||||||
#===================
|
#===================
|
||||||
# write output as csv file
|
# write output as csv file
|
||||||
#===================
|
#===================
|
||||||
write.csv(df, "../Data/mcsm_complex1_normalised.csv", row.names = FALSE) #335, 15
|
#write.csv(df, "../Data/mcsm_complex1_normalised.csv", row.names = FALSE)
|
||||||
|
write.csv(df2, outfile, row.names = FALSE)
|
Loading…
Add table
Add a link
Reference in a new issue