renamed files with shorter names and corrected paths for input and output files

This commit is contained in:
Tanushree Tunstall 2020-01-13 10:39:49 +00:00
parent 4be0de97d7
commit 61fcd14b17
8 changed files with 244 additions and 164 deletions

View file

@ -1,6 +1,9 @@
# run step0-step3a for mcsm pipeline #!/bin/bash
# run all bash scripts for mcsm
#./step0_check_duplicate_SNPs.sh #./step0_check_duplicate_SNPs.sh
#./step1_mCSMLig_curl_submit_store_outputurl.sh #./step1_lig_output_urls.sh
./step2_mCSM_LIG_batch_outputurls_results.sh ./step2_lig_results.sh
./step3a_mCSM_LIG_regex_output_formatting.sh ./step3a_results_format_interim.sh

View file

@ -24,7 +24,7 @@ infile_mut="/pnca_mis_SNPs_v2_unique.csv"
infile_struc="/complex1_no_water.pdb" infile_struc="/complex1_no_water.pdb"
outpath="${inpath}${processed_path}" outpath="${inpath}${processed_path}"
outfile="/mCSM_lig_complex1_result_url.txt" outfile="/complex1_result_url.txt"
# create valid input and output filenames # create valid input and output filenames
#filename="${HOME}/git/Data/pyrazinamide/input/processed/pnca_mis_SNPs_v2_unique.csv" #filename="${HOME}/git/Data/pyrazinamide/input/processed/pnca_mis_SNPs_v2_unique.csv"
@ -82,7 +82,7 @@ echo -e "${mutation} : processing entry ${COUNT}/$(wc -l < ${filename})..."
# create output file with the added number of muts from file # create output file with the added number of muts from file
# after much thought, bad idea as less generic! # after much thought, bad idea as less generic!
#echo -e "${host}${result_url}" >> ../Results/$(wc -l < ${filename})_mCSM_lig_complex1_result_url.txt #echo -e "${host}${result_url}" >> ../Results/$(wc -l < ${filename})_complex1_result_url.txt
echo -e "${host}${result_url}" >> ${outfilename} echo -e "${host}${result_url}" >> ${outfilename}
#echo -n '.' #echo -n '.'
done < "${filename}" done < "${filename}"

View file

@ -35,10 +35,10 @@
# specify variables for input and output paths and filenames # specify variables for input and output paths and filenames
inpath="${HOME}/git/Data/pyrazinamide/input" inpath="${HOME}/git/Data/pyrazinamide/input"
processed_path="/processed" processed_path="/processed"
infile="/mCSM_lig_complex1_result_url.txt" infile="/complex1_result_url.txt"
outpath="${inpath}${processed_path}" outpath="${inpath}${processed_path}"
outfile="/mCSM_lig_complex1_output_MASTER.txt" outfile="/complex1_output_MASTER.txt"
# create valid input and output filenames # create valid input and output filenames
filename="${inpath}${processed_path}${infile}" filename="${inpath}${processed_path}${infile}"

View file

@ -27,12 +27,11 @@ inpath="${HOME}/git/Data/pyrazinamide/input"
processed_path="/processed" processed_path="/processed"
# Create input file: copy and rename output file of step2 # Create input file: copy and rename output file of step2
oldfile="${inpath}${processed_path}/mCSM_lig_complex1_output_MASTER.txt" oldfile="${inpath}${processed_path}/complex1_output_MASTER.txt"
newfile="${inpath}${processed_path}/mCSM_lig_complex1_output_processed.txt" newfile="${inpath}${processed_path}/complex1_output_processed.txt"
cp $oldfile $newfile cp $oldfile $newfile
#infile="../Results/336_mCSM_lig_complex1_output_processed.txt" infile="/complex1_output_processed.txt"
infile="/mCSM_lig_complex1_output_processed.txt"
filename="${inpath}${processed_path}${infile}" filename="${inpath}${processed_path}${infile}"
echo Input filename is : ${filename} echo Input filename is : ${filename}

View file

@ -1,29 +0,0 @@
#!/usr/bin/python
import pandas as pd
from collections import defaultdict
#file = r'../Results/322_mCSM_lig_complex1_output_processed.txt'
outCols=[
'PredictedAffinityChange',
'Mutationinformation',
'Wild-type',
'Position',
'Mutant-type',
'Chain',
'LigandID',
'Distancetoligand',
'DUETstabilitychange'
]
lines = [line.rstrip('\n') for line in open('../Results/336_mCSM_lig_complex1_output_processed.txt')]
outputs = defaultdict(list)
for item in lines:
col, val = item.split(':')
outputs[col].append(val)
dfOut=pd.DataFrame(outputs)
pd.DataFrame.to_csv(dfOut,'../Results/336_complex1_formatted_results.csv', columns=outCols)

View file

@ -0,0 +1,61 @@
#!/usr/bin/python
###################
# load libraries
import os, sys
import pandas as pd
from collections import defaultdict
####################
#********************************************************************
# TASK: Formatting results with nice colnames
# step3a processed the mcsm results to remove all newlines and
# brought data in a format where the delimiter ":" splits
# data into a convenient format of "colname": "value".
# this script formats the data and outputs a df with each row
# as a mutation and its corresponding mcsm_values
# Requirements:
# input: output of step3a, file containing "..._output_processed.txt"
# path: "Data/<drug>/input/processed/<filename>"
# output: formatted .csv file
# path: "Data/<drug>/input/processed/<filename>"
#***********************************************************************
# specify variables for input and output paths and filenames
homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
basedir = "/git/Data/pyrazinamide/input"
inpath = "/processed"
in_filename = "/complex1_output_processed.txt"
infile = homedir + basedir + inpath + in_filename
print("Input file is:", infile)
outpath = "/processed"
out_filename = "/complex1_formatted_results.csv"
outfile = homedir + basedir + outpath + out_filename
print("Output file is:", outfile)
# end of variable assignment for input and output files
outCols=[
'PredictedAffinityChange',
'Mutationinformation',
'Wild-type',
'Position',
'Mutant-type',
'Chain',
'LigandID',
'Distancetoligand',
'DUETstabilitychange'
]
lines = [line.rstrip('\n') for line in open(infile)]
outputs = defaultdict(list)
for item in lines:
col, val = item.split(':')
outputs[col].append(val)
dfOut=pd.DataFrame(outputs)
pd.DataFrame.to_csv(dfOut, outfile, columns=outCols)

View file

@ -1,22 +1,42 @@
getwd() getwd()
#setwd("~/Documents/git/LSHTM_analysis/mcsm_complex1/Results") # work #setwd("~/git/LSHTM_analysis/mcsm_complex1/Results")
setwd("~/git/LSHTM_analysis/mcsm_complex1/Results") # thinkpad
#setwd("/Users/tanu/git/LSHTM_analysis/mcsm_complex1/Results") # mac
getwd() getwd()
#======================================================= #=======================================================
# TASK: read formatted_results_df.csv to complete
# missing info, adding DUET categories, assigning
# meaningful colnames, etc.
# Requirements:
# input: output of step3b, python processing,
# path: Data/<drug>/input/processed/<filename>"
# output: NO output as the next scripts refers to this
# for yet more processing
#=======================================================
# specify variables for input and output paths and filenames
homedir = "~"
basedir = "/git/Data/pyrazinamide/input"
inpath = "/processed"
in_filename = "/complex1_formatted_results.csv"
infile = paste0(homedir, basedir, inpath, in_filename)
print(paste0("Input file is:", infile))
#======================================================
#TASK: To tidy the columns so you can generate figures #TASK: To tidy the columns so you can generate figures
#======================================================= #=======================================================
#################### ####################
#### read file #####: this will be the output from python script (csv file) #### read file #####: this will be the output from python script (csv file)
#################### ####################
data = read.csv("336_complex1_formatted_results.csv" data = read.csv(infile
, header = T , header = T
, stringsAsFactors = FALSE) , stringsAsFactors = FALSE)
dim(data) dim(data)
#335, 10
str(data) str(data)
# clear variables
rm(homedir, basedir, inpath, in_filename, infile)
########################### ###########################
##### Data processing ##### ##### Data processing #####
########################### ###########################
@ -31,41 +51,38 @@ data$Mutationinformation = paste0(data$Wild.type, data$Position, data$Mutant.typ
head(data$Mutationinformation) head(data$Mutationinformation)
tail(data$Mutationinformation) tail(data$Mutationinformation)
#write.csv(data, 'test.csv') #write.csv(data, 'test.csv')
########################################## ##########################################
# Remove duplicate SNPs as a sanity check # Remove duplicate SNPs as a sanity check
########################################## ##########################################
#very important # very important
table(duplicated(data$Mutationinformation)) table(duplicated(data$Mutationinformation))
#FALSE
#335
#extract duplicated entries # extract duplicated entries
dups = data[duplicated(data$Mutationinformation),] #0 dups = data[duplicated(data$Mutationinformation),] #0
#No of dups should match with the no. of TRUE in the above table # No of dups should match with the no. of TRUE in the above table
#u_dups = unique(dups$Mutationinformation) #10 #u_dups = unique(dups$Mutationinformation) #10
sum( table(dups$Mutationinformation) ) #13 sum( table(dups$Mutationinformation) )
rm(dups)
#*************************************************************** #***************************************************************
#select non-duplicated SNPs and create a new df # select non-duplicated SNPs and create a new df
df = data[!duplicated(data$Mutationinformation),] #309, 10 df = data[!duplicated(data$Mutationinformation),]
#*************************************************************** #***************************************************************
#sanity check # sanity check
u = unique(df$Mutationinformation) u = unique(df$Mutationinformation)
u2 = unique(data$Mutationinformation) u2 = unique(data$Mutationinformation)
table(u%in%u2) table(u%in%u2)
#TRUE
#309 # should all be 1
#should all be 1, hence 309 1's
sum(table(df$Mutationinformation) == 1) sum(table(df$Mutationinformation) == 1)
#sort df by Position # sort df by Position
#MANUAL CHECKPOINT: # MANUAL CHECKPOINT:
#foo <- df[order(df$Position),] #foo <- df[order(df$Position),]
#df <- df[order(df$Position),] #df <- df[order(df$Position),]
# clear variables
rm(u, u2, dups) rm(u, u2, dups)
#################### ####################
@ -75,26 +92,28 @@ rm(u, u2, dups)
#======= #=======
#STEP 1 #STEP 1
#======== #========
#make a copy of the PredictedAffinityColumn and call it Lig_outcome # make a copy of the PredictedAffinityColumn and call it Lig_outcome
df$Lig_outcome = df$PredictedAffinityChange #335, 11 df$Lig_outcome = df$PredictedAffinityChange
#make Predicted...column numeric and outcome column categorical #make Predicted...column numeric and outcome column categorical
head(df$PredictedAffinityChange) head(df$PredictedAffinityChange)
df$PredictedAffinityChange = gsub("log.*" df$PredictedAffinityChange = gsub("log.*"
, "" , ""
, df$PredictedAffinityChange) , df$PredictedAffinityChange)
#sanity checks # sanity checks
head(df$PredictedAffinityChange) head(df$PredictedAffinityChange)
#should be numeric, check and if not make it numeric # should be numeric, check and if not make it numeric
is.numeric( df$PredictedAffinityChange )
#change to numeric
df$PredictedAffinityChange = as.numeric(df$PredictedAffinityChange)
#should be TRUE
is.numeric( df$PredictedAffinityChange ) is.numeric( df$PredictedAffinityChange )
#change the column name to indicate units # change to numeric
df$PredictedAffinityChange = as.numeric(df$PredictedAffinityChange)
# should be TRUE
is.numeric( df$PredictedAffinityChange )
# change the column name to indicate units
n = which(colnames(df) == "PredictedAffinityChange"); n n = which(colnames(df) == "PredictedAffinityChange"); n
colnames(df)[n] = "PredAffLog" colnames(df)[n] = "PredAffLog"
colnames(df)[n] colnames(df)[n]
@ -102,38 +121,44 @@ colnames(df)[n]
#======== #========
#STEP 2 #STEP 2
#======== #========
#make Lig_outcome column categorical showing effect of mutation # make Lig_outcome column categorical showing effect of mutation
head(df$Lig_outcome) head(df$Lig_outcome)
df$Lig_outcome = gsub("^.*-" df$Lig_outcome = gsub("^.*-"
, "", , "",
df$Lig_outcome) df$Lig_outcome)
#sanity checks # sanity checks
head(df$Lig_outcome) head(df$Lig_outcome)
#should be factor, check and if not change it to factor
# should be factor, check and if not change it to factor
is.factor(df$Lig_outcome) is.factor(df$Lig_outcome)
#change to factor
# change to factor
df$Lig_outcome = as.factor(df$Lig_outcome) df$Lig_outcome = as.factor(df$Lig_outcome)
#should be TRUE
# should be TRUE
is.factor(df$Lig_outcome) is.factor(df$Lig_outcome)
#======== #========
#STEP 3 #STEP 3
#======== #========
#gsub # gsub
head(df$Distancetoligand) head(df$Distancetoligand)
df$Distancetoligand = gsub("&Aring;" df$Distancetoligand = gsub("&Aring;"
, "" , ""
, df$Distancetoligand) , df$Distancetoligand)
#sanity checks # sanity checks
head(df$Distancetoligand) head(df$Distancetoligand)
#should be numeric, check if not change it to numeric
is.numeric(df$Distancetoligand) # should be numeric, check if not change it to numeric
#change to numeric
df$Distancetoligand = as.numeric(df$Distancetoligand)
#should be TRUE
is.numeric(df$Distancetoligand) is.numeric(df$Distancetoligand)
#change the column name to indicate units # change to numeric
df$Distancetoligand = as.numeric(df$Distancetoligand)
# should be TRUE
is.numeric(df$Distancetoligand)
# change the column name to indicate units
n = which(colnames(df) == "Distancetoligand") n = which(colnames(df) == "Distancetoligand")
colnames(df)[n] <- "Dis_lig_Ang" colnames(df)[n] <- "Dis_lig_Ang"
colnames(df)[n] colnames(df)[n]
@ -146,16 +171,19 @@ head(df$DUETstabilitychange)
df$DUETstabilitychange = gsub("Kcal/mol" df$DUETstabilitychange = gsub("Kcal/mol"
, "" , ""
, df$DUETstabilitychange) , df$DUETstabilitychange)
#sanity checks # sanity checks
head(df$DUETstabilitychange) head(df$DUETstabilitychange)
#should be numeric, check if not change it to numeric
is.numeric(df$DUETstabilitychange) # should be numeric, check if not change it to numeric
#change to numeric
df$DUETstabilitychange = as.numeric(df$DUETstabilitychange)
#should be TRUE
is.numeric(df$DUETstabilitychange) is.numeric(df$DUETstabilitychange)
#change the column name to indicate units # change to numeric
df$DUETstabilitychange = as.numeric(df$DUETstabilitychange)
# should be TRUE
is.numeric(df$DUETstabilitychange)
# change the column name to indicate units
n = which(colnames(df) == "DUETstabilitychange"); n n = which(colnames(df) == "DUETstabilitychange"); n
colnames(df)[n] = "DUETStability_Kcalpermol" colnames(df)[n] = "DUETStability_Kcalpermol"
colnames(df)[n] colnames(df)[n]
@ -163,25 +191,20 @@ colnames(df)[n]
#======== #========
#STEP 5 #STEP 5
#======== #========
#create yet another extra column: classification of DUET stability only # create yet another extra column: classification of DUET stability only
df$DUET_outcome = ifelse(df$DUETStability_Kcalpermol >=0 df$DUET_outcome = ifelse(df$DUETStability_Kcalpermol >=0
, "Stabilizing" , "Stabilizing"
, "Destabilizing") #335, 12 , "Destabilizing") # spelling to be consistent with mcsm
table(df$Lig_outcome) table(df$Lig_outcome)
#Destabilizing Stabilizing
#281 54
table(df$DUET_outcome) table(df$DUET_outcome)
#Destabilizing Stabilizing
#288 47
#============================== #==============================
#FIXME #FIXME
#Insert a venn diagram #Insert a venn diagram
#================================ #================================
#======== #========
#STEP 6 #STEP 6
#======== #========
@ -198,10 +221,10 @@ colnames(df[mut])
#======== #========
#STEP 7 #STEP 7
#======== #========
#create an extra column: maybe useful for some plots # create an extra column: maybe useful for some plots
df$WildPos = paste0(df$Wild_type, df$Position) #335, 13 df$WildPos = paste0(df$Wild_type, df$Position)
#clear variables # clear variables
rm(n, wt, mut) rm(n, wt, mut)
################ end of data cleaning ################ end of data cleaning

View file

@ -1,25 +1,47 @@
getwd() ##################
#setwd("~/Documents/git/LSHTM_analysis/mcsm_complex1/Results") # work # load libraries
setwd("~/git/LSHTM_analysis/mcsm_complex1/Results") # thinkpad library(compare)
#setwd("/Users/tanu/git/LSHTM_analysis/mcsm_complex1/Results") # mac ##################
getwd() getwd()
#======================================================= #=======================================================
#TASK:read cleaned data and perform rescaling # TASK:read cleaned data and perform rescaling
# of DUET stability scores # of DUET stability scores
# of Pred affinity # of Pred affinity
#compare scaling methods with plots # compare scaling methods with plots
#output normalised file
# Requirements:
# input: R script, step3c_results_cleaning.R
# path: Data/<drug>/input/processed/<filename>"
# output: NO output as the next scripts refers to this
# for yet more processing
# output normalised file
#======================================================= #=======================================================
# specify variables for input and output paths and filenames
homedir = "~"
currdir = "/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/mcsm"
in_filename = "/step3c_results_cleaning.R"
infile = paste0(homedir, currdir, in_filename)
print(paste0("Input file is:", infile))
# output file
basedir = "/git/Data/pyrazinamide/input"
outpath = "/processed"
out_filename = "/mcsm_complex1_normalised.csv"
outfile = paste0(homedir, basedir, outpath, out_filename)
print(paste0("Output file is:", outfile))
#################### ####################
#### read file #####: this will be the output of my R script that cleans the data columns #### read file #####: this will be the output of my R script that cleans the data columns
#################### ####################
source("../Scripts/step3c_data_cleaning.R") source(infile)
##This will outut two dataframes:
##data: unclean data: 335, 10 #This will outut two dataframes:
##df : cleaned df 335, 13 # data: unclean data: 10 cols
## you can remove data if you want as you will not need it # df : cleaned df: 13 cols
# you can remove data if you want as you will not need it
rm(data) rm(data)
colnames(df) colnames(df)
@ -36,67 +58,60 @@ group = which(colnames(df) == "Lig_outcome"); group
# This is because this makes it easier to see the results of rescaling for debugging # This is because this makes it easier to see the results of rescaling for debugging
head(df$PredAffLog) head(df$PredAffLog)
#ORDER BY PredAff scrores: negative values at the top and positive at the bottoom # ORDER BY PredAff scrores: negative values at the top and positive at the bottoom
df = df[order(df$PredAffLog),] df = df[order(df$PredAffLog),]
head(df$PredAffLog) head(df$PredAffLog)
#sanity checks # sanity checks
head(df[,n]) #all negatives head(df[,n]) # all negatives
tail(df[,n]) #all positives tail(df[,n]) # all positives
#sanity checks # sanity checks
mean(df[,n]) mean(df[,n])
#-0.9526746 #-0.9526746
tapply(df[,n], df[,group], mean) tapply(df[,n], df[,group], mean)
#Destabilizing Stabilizing
#-1.2112100 0.3926667
#=========================== #===========================
#Same as above: in 2 steps # Same as above: in 2 steps
#=========================== #===========================
#find range of your data # find range of your data
my_min = min(df[,n]); my_min #-3.948 my_min = min(df[,n]); my_min #
my_max = max(df[,n]); my_max #2.23 my_max = max(df[,n]); my_max #
#=============================================== #===============================================
# WITHIN GROUP rescaling 2: method "ratio" # WITHIN GROUP rescaling 2: method "ratio"
# create column to store the rescaled values # create column to store the rescaled values
# Rescaling separately (Less dangerous) # Rescaling separately (Less dangerous)
# =====> chosen one:as Nick prefers # =====> chosen one: preserves sign
#=============================================== #===============================================
df$ratioPredAff = ifelse(df[,n] < 0 df$ratioPredAff = ifelse(df[,n] < 0
, df[,n]/abs(my_min) , df[,n]/abs(my_min)
, df[,n]/my_max , df[,n]/my_max
)#335 14 )# 14 cols
#sanity checks # sanity checks
head(df$ratioPredAff) head(df$ratioPredAff)
tail(df$ratioPredAff) tail(df$ratioPredAff)
min(df$ratioPredAff); max(df$ratioPredAff) min(df$ratioPredAff); max(df$ratioPredAff)
tapply(df$ratioPredAff, df$Lig_outcome, min) tapply(df$ratioPredAff, df$Lig_outcome, min)
#Destabilizing Stabilizing
#-1.000000000 0.005381166
tapply(df$ratioPredAff, df$Lig_outcome, max) tapply(df$ratioPredAff, df$Lig_outcome, max)
#Destabilizing Stabilizing
#-0.001266464 1.000000000
#should be the same as below (281 and 54) # should be the same as below
sum(df$ratioPredAff < 0); sum(df$ratioPredAff > 0) sum(df$ratioPredAff < 0); sum(df$ratioPredAff > 0)
table(df$Lig_outcome) table(df$Lig_outcome)
#Destabilizing Stabilizing
#281 54
#=============================================== #===============================================
# Hist and density plots to compare the rescaling # Hist and density plots to compare the rescaling
# methods: Base R # methods: Base R
#=============================================== #===============================================
#uncomment as necessary # uncomment as necessary
my_title = "Ligand_stability" my_title = "Ligand_stability"
#my_title = colnames(df[n]) # my_title = colnames(df[n])
# Set the margin on all sides # Set the margin on all sides
par(oma = c(3,2,3,0) par(oma = c(3,2,3,0)
@ -140,7 +155,7 @@ rm(my_min, my_max, my_title, n, group)
#=================== #===================
# 3b: DUET stability # 3b: DUET stability
#=================== #===================
dim(df) #335, 14 dim(df) # 14 cols
n = which(colnames(df) == "DUETStability_Kcalpermol"); n # 10 n = which(colnames(df) == "DUETStability_Kcalpermol"); n # 10
group = which(colnames(df) == "DUET_outcome"); group #12 group = which(colnames(df) == "DUET_outcome"); group #12
@ -151,63 +166,53 @@ group = which(colnames(df) == "DUET_outcome"); group #12
# This is because this makes it easier to see the results of rescaling for debugging # This is because this makes it easier to see the results of rescaling for debugging
head(df$DUETStability_Kcalpermol) head(df$DUETStability_Kcalpermol)
#ORDER BY DUET scores: negative values at the top and positive at the bottom # ORDER BY DUET scores: negative values at the top and positive at the bottom
df = df[order(df$DUETStability_Kcalpermol),] df = df[order(df$DUETStability_Kcalpermol),]
#sanity checks # sanity checks
head(df[,n]) #negatives head(df[,n]) # negatives
tail(df[,n]) #positives tail(df[,n]) # positives
#sanity checks # sanity checks
mean(df[,n]) mean(df[,n])
#[1] -1.173316
tapply(df[,n], df[,group], mean) tapply(df[,n], df[,group], mean)
#Destabilizing Stabilizing
#-1.4297257 0.3978723
#=============================================== #===============================================
# WITHIN GROUP rescaling 2: method "ratio" # WITHIN GROUP rescaling 2: method "ratio"
# create column to store the rescaled values # create column to store the rescaled values
# Rescaling separately (Less dangerous) # Rescaling separately (Less dangerous)
# =====> chosen one:as Nick prefers # =====> chosen one: preserves sign
#=============================================== #===============================================
#find range of your data # find range of your data
my_min = min(df[,n]); my_min #-3.87 my_min = min(df[,n]); my_min
my_max = max(df[,n]); my_max #1.689 my_max = max(df[,n]); my_max
df$ratioDUET = ifelse(df[,n] < 0 df$ratioDUET = ifelse(df[,n] < 0
, df[,n]/abs(my_min) , df[,n]/abs(my_min)
, df[,n]/my_max , df[,n]/my_max
) #335, 15 ) # 15 cols
#sanity check # sanity check
head(df$ratioDUET) head(df$ratioDUET)
tail(df$ratioDUET) tail(df$ratioDUET)
min(df$ratioDUET); max(df$ratioDUET) min(df$ratioDUET); max(df$ratioDUET)
#sanity checks # sanity checks
tapply(df$ratioDUET, df$DUET_outcome, min) tapply(df$ratioDUET, df$DUET_outcome, min)
#Destabilizing Stabilizing
#-1.00000000 0.01065719
tapply(df$ratioDUET, df$DUET_outcome, max) tapply(df$ratioDUET, df$DUET_outcome, max)
#Destabilizing Stabilizing
#-0.003875969 1.000000000
#should be the same as below (267 and 42) # should be the same as below (267 and 42)
sum(df$ratioDUET < 0); sum(df$ratioDUET > 0) sum(df$ratioDUET < 0); sum(df$ratioDUET > 0)
table(df$DUET_outcome) table(df$DUET_outcome)
#Destabilizing Stabilizing
#288 47
#=============================================== #===============================================
# Hist and density plots to compare the rescaling # Hist and density plots to compare the rescaling
# methods: Base R # methods: Base R
#=============================================== #===============================================
#uncomment as necessary # uncomment as necessary
my_title = "DUET_stability" my_title = "DUET_stability"
#my_title = colnames(df[n]) #my_title = colnames(df[n])
@ -246,7 +251,25 @@ mtext(text = my_title
, line = 0 , line = 0
, outer = TRUE) , outer = TRUE)
# reorder by column name
#data <- data[c("A", "B", "C")]
colnames(df)
df2 = df[c("X", "Mutationinformation", "WildPos", "Position"
, "Wild_type", "Mutant_type"
, "DUETStability_Kcalpermol", "DUET_outcome"
, "Dis_lig_Ang", "PredAffLog", "Lig_outcome"
, "ratioDUET", "ratioPredAff"
, "LigandID","Chain")]
# sanity check
# should be True
#compare(df, df2, allowAll = T)
compare(df, df2, ignoreColOrder = T)
#TRUE
#reordered columns
#=================== #===================
# write output as csv file # write output as csv file
#=================== #===================
write.csv(df, "../Data/mcsm_complex1_normalised.csv", row.names = FALSE) #335, 15 #write.csv(df, "../Data/mcsm_complex1_normalised.csv", row.names = FALSE)
write.csv(df2, outfile, row.names = FALSE)