From 61fcd14b17ffb6bcb17ced1805c5e063f436bfd5 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Mon, 13 Jan 2020 10:39:49 +0000 Subject: [PATCH] renamed files with shorter names and corrected paths for input and output files --- .../pyrazinamide/scripts/mcsm/run.sh | 11 +- ..._outputurl.sh => step1_lig_output_urls.sh} | 4 +- ...uturls_results.sh => step2_lig_results.sh} | 4 +- ...ng.sh => step3a_results_format_interim.sh} | 7 +- .../scripts/mcsm/step3b_format_results.py | 29 ---- .../scripts/mcsm/step3b_results_format_df.py | 61 ++++++++ ...a_cleaning.R => step3c_results_cleaning.R} | 147 ++++++++++-------- ..._normalise.R => step4_results_normalise.R} | 145 +++++++++-------- 8 files changed, 244 insertions(+), 164 deletions(-) rename mcsm_analysis/pyrazinamide/scripts/mcsm/{step1_mCSMLig_curl_submit_store_outputurl.sh => step1_lig_output_urls.sh} (97%) rename mcsm_analysis/pyrazinamide/scripts/mcsm/{step2_mCSM_LIG_batch_outputurls_results.sh => step2_lig_results.sh} (95%) rename mcsm_analysis/pyrazinamide/scripts/mcsm/{step3a_mCSM_LIG_regex_output_formatting.sh => step3a_results_format_interim.sh} (89%) delete mode 100755 mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_format_results.py create mode 100755 mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_results_format_df.py rename mcsm_analysis/pyrazinamide/scripts/mcsm/{step3c_data_cleaning.R => step3c_results_cleaning.R} (62%) rename mcsm_analysis/pyrazinamide/scripts/mcsm/{step4_normalise.R => step4_results_normalise.R} (62%) diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/run.sh b/mcsm_analysis/pyrazinamide/scripts/mcsm/run.sh index 2674915..7e00fb1 100755 --- a/mcsm_analysis/pyrazinamide/scripts/mcsm/run.sh +++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/run.sh @@ -1,6 +1,9 @@ -# run step0-step3a for mcsm pipeline +#!/bin/bash + +# run all bash scripts for mcsm #./step0_check_duplicate_SNPs.sh -#./step1_mCSMLig_curl_submit_store_outputurl.sh -./step2_mCSM_LIG_batch_outputurls_results.sh -./step3a_mCSM_LIG_regex_output_formatting.sh +#./step1_lig_output_urls.sh +./step2_lig_results.sh +./step3a_results_format_interim.sh + diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step1_mCSMLig_curl_submit_store_outputurl.sh b/mcsm_analysis/pyrazinamide/scripts/mcsm/step1_lig_output_urls.sh similarity index 97% rename from mcsm_analysis/pyrazinamide/scripts/mcsm/step1_mCSMLig_curl_submit_store_outputurl.sh rename to mcsm_analysis/pyrazinamide/scripts/mcsm/step1_lig_output_urls.sh index faf0b7d..83dab94 100755 --- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step1_mCSMLig_curl_submit_store_outputurl.sh +++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step1_lig_output_urls.sh @@ -24,7 +24,7 @@ infile_mut="/pnca_mis_SNPs_v2_unique.csv" infile_struc="/complex1_no_water.pdb" outpath="${inpath}${processed_path}" -outfile="/mCSM_lig_complex1_result_url.txt" +outfile="/complex1_result_url.txt" # create valid input and output filenames #filename="${HOME}/git/Data/pyrazinamide/input/processed/pnca_mis_SNPs_v2_unique.csv" @@ -82,7 +82,7 @@ echo -e "${mutation} : processing entry ${COUNT}/$(wc -l < ${filename})..." # create output file with the added number of muts from file # after much thought, bad idea as less generic! -#echo -e "${host}${result_url}" >> ../Results/$(wc -l < ${filename})_mCSM_lig_complex1_result_url.txt +#echo -e "${host}${result_url}" >> ../Results/$(wc -l < ${filename})_complex1_result_url.txt echo -e "${host}${result_url}" >> ${outfilename} #echo -n '.' done < "${filename}" diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_mCSM_LIG_batch_outputurls_results.sh b/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_lig_results.sh similarity index 95% rename from mcsm_analysis/pyrazinamide/scripts/mcsm/step2_mCSM_LIG_batch_outputurls_results.sh rename to mcsm_analysis/pyrazinamide/scripts/mcsm/step2_lig_results.sh index 717c1aa..10c9291 100755 --- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_mCSM_LIG_batch_outputurls_results.sh +++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_lig_results.sh @@ -35,10 +35,10 @@ # specify variables for input and output paths and filenames inpath="${HOME}/git/Data/pyrazinamide/input" processed_path="/processed" -infile="/mCSM_lig_complex1_result_url.txt" +infile="/complex1_result_url.txt" outpath="${inpath}${processed_path}" -outfile="/mCSM_lig_complex1_output_MASTER.txt" +outfile="/complex1_output_MASTER.txt" # create valid input and output filenames filename="${inpath}${processed_path}${infile}" diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_mCSM_LIG_regex_output_formatting.sh b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_results_format_interim.sh similarity index 89% rename from mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_mCSM_LIG_regex_output_formatting.sh rename to mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_results_format_interim.sh index 0b743fe..f9c2c09 100755 --- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_mCSM_LIG_regex_output_formatting.sh +++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_results_format_interim.sh @@ -27,12 +27,11 @@ inpath="${HOME}/git/Data/pyrazinamide/input" processed_path="/processed" # Create input file: copy and rename output file of step2 -oldfile="${inpath}${processed_path}/mCSM_lig_complex1_output_MASTER.txt" -newfile="${inpath}${processed_path}/mCSM_lig_complex1_output_processed.txt" +oldfile="${inpath}${processed_path}/complex1_output_MASTER.txt" +newfile="${inpath}${processed_path}/complex1_output_processed.txt" cp $oldfile $newfile -#infile="../Results/336_mCSM_lig_complex1_output_processed.txt" -infile="/mCSM_lig_complex1_output_processed.txt" +infile="/complex1_output_processed.txt" filename="${inpath}${processed_path}${infile}" echo Input filename is : ${filename} diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_format_results.py b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_format_results.py deleted file mode 100755 index a780576..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_format_results.py +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/python -import pandas as pd -from collections import defaultdict - -#file = r'../Results/322_mCSM_lig_complex1_output_processed.txt' - -outCols=[ - 'PredictedAffinityChange', - 'Mutationinformation', - 'Wild-type', - 'Position', - 'Mutant-type', - 'Chain', - 'LigandID', - 'Distancetoligand', - 'DUETstabilitychange' - ] - -lines = [line.rstrip('\n') for line in open('../Results/336_mCSM_lig_complex1_output_processed.txt')] - -outputs = defaultdict(list) - -for item in lines: - col, val = item.split(':') - outputs[col].append(val) - -dfOut=pd.DataFrame(outputs) - -pd.DataFrame.to_csv(dfOut,'../Results/336_complex1_formatted_results.csv', columns=outCols) diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_results_format_df.py b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_results_format_df.py new file mode 100755 index 0000000..bc14609 --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_results_format_df.py @@ -0,0 +1,61 @@ +#!/usr/bin/python + +################### +# load libraries +import os, sys +import pandas as pd +from collections import defaultdict +#################### + +#******************************************************************** +# TASK: Formatting results with nice colnames +# step3a processed the mcsm results to remove all newlines and +# brought data in a format where the delimiter ":" splits +# data into a convenient format of "colname": "value". +# this script formats the data and outputs a df with each row +# as a mutation and its corresponding mcsm_values + +# Requirements: +# input: output of step3a, file containing "..._output_processed.txt" + # path: "Data//input/processed/" +# output: formatted .csv file + # path: "Data//input/processed/" +#*********************************************************************** +# specify variables for input and output paths and filenames +homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde + +basedir = "/git/Data/pyrazinamide/input" +inpath = "/processed" +in_filename = "/complex1_output_processed.txt" +infile = homedir + basedir + inpath + in_filename +print("Input file is:", infile) + +outpath = "/processed" +out_filename = "/complex1_formatted_results.csv" +outfile = homedir + basedir + outpath + out_filename +print("Output file is:", outfile) +# end of variable assignment for input and output files + +outCols=[ + 'PredictedAffinityChange', + 'Mutationinformation', + 'Wild-type', + 'Position', + 'Mutant-type', + 'Chain', + 'LigandID', + 'Distancetoligand', + 'DUETstabilitychange' + ] + +lines = [line.rstrip('\n') for line in open(infile)] + +outputs = defaultdict(list) + +for item in lines: + col, val = item.split(':') + outputs[col].append(val) + +dfOut=pd.DataFrame(outputs) + +pd.DataFrame.to_csv(dfOut, outfile, columns=outCols) diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_data_cleaning.R b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_results_cleaning.R similarity index 62% rename from mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_data_cleaning.R rename to mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_results_cleaning.R index 4876b5e..c58dc8b 100644 --- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_data_cleaning.R +++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_results_cleaning.R @@ -1,22 +1,42 @@ getwd() -#setwd("~/Documents/git/LSHTM_analysis/mcsm_complex1/Results") # work -setwd("~/git/LSHTM_analysis/mcsm_complex1/Results") # thinkpad -#setwd("/Users/tanu/git/LSHTM_analysis/mcsm_complex1/Results") # mac +#setwd("~/git/LSHTM_analysis/mcsm_complex1/Results") getwd() #======================================================= +# TASK: read formatted_results_df.csv to complete +# missing info, adding DUET categories, assigning +# meaningful colnames, etc. + +# Requirements: +# input: output of step3b, python processing, + # path: Data//input/processed/" +# output: NO output as the next scripts refers to this +# for yet more processing +#======================================================= + +# specify variables for input and output paths and filenames +homedir = "~" +basedir = "/git/Data/pyrazinamide/input" +inpath = "/processed" +in_filename = "/complex1_formatted_results.csv" +infile = paste0(homedir, basedir, inpath, in_filename) +print(paste0("Input file is:", infile)) + +#====================================================== #TASK: To tidy the columns so you can generate figures #======================================================= #################### #### read file #####: this will be the output from python script (csv file) #################### -data = read.csv("336_complex1_formatted_results.csv" +data = read.csv(infile , header = T , stringsAsFactors = FALSE) dim(data) -#335, 10 str(data) +# clear variables +rm(homedir, basedir, inpath, in_filename, infile) + ########################### ##### Data processing ##### ########################### @@ -31,41 +51,38 @@ data$Mutationinformation = paste0(data$Wild.type, data$Position, data$Mutant.typ head(data$Mutationinformation) tail(data$Mutationinformation) #write.csv(data, 'test.csv') + ########################################## # Remove duplicate SNPs as a sanity check ########################################## -#very important +# very important table(duplicated(data$Mutationinformation)) -#FALSE -#335 -#extract duplicated entries +# extract duplicated entries dups = data[duplicated(data$Mutationinformation),] #0 -#No of dups should match with the no. of TRUE in the above table +# No of dups should match with the no. of TRUE in the above table #u_dups = unique(dups$Mutationinformation) #10 -sum( table(dups$Mutationinformation) ) #13 - -rm(dups) +sum( table(dups$Mutationinformation) ) #*************************************************************** -#select non-duplicated SNPs and create a new df -df = data[!duplicated(data$Mutationinformation),] #309, 10 +# select non-duplicated SNPs and create a new df +df = data[!duplicated(data$Mutationinformation),] #*************************************************************** -#sanity check +# sanity check u = unique(df$Mutationinformation) u2 = unique(data$Mutationinformation) table(u%in%u2) -#TRUE -#309 -#should all be 1, hence 309 1's + +# should all be 1 sum(table(df$Mutationinformation) == 1) -#sort df by Position -#MANUAL CHECKPOINT: +# sort df by Position +# MANUAL CHECKPOINT: #foo <- df[order(df$Position),] #df <- df[order(df$Position),] +# clear variables rm(u, u2, dups) #################### @@ -75,26 +92,28 @@ rm(u, u2, dups) #======= #STEP 1 #======== -#make a copy of the PredictedAffinityColumn and call it Lig_outcome -df$Lig_outcome = df$PredictedAffinityChange #335, 11 +# make a copy of the PredictedAffinityColumn and call it Lig_outcome +df$Lig_outcome = df$PredictedAffinityChange -#make Predicted...column numeric and outcome column categorical + #make Predicted...column numeric and outcome column categorical head(df$PredictedAffinityChange) df$PredictedAffinityChange = gsub("log.*" , "" , df$PredictedAffinityChange) -#sanity checks +# sanity checks head(df$PredictedAffinityChange) -#should be numeric, check and if not make it numeric -is.numeric( df$PredictedAffinityChange ) -#change to numeric -df$PredictedAffinityChange = as.numeric(df$PredictedAffinityChange) -#should be TRUE +# should be numeric, check and if not make it numeric is.numeric( df$PredictedAffinityChange ) -#change the column name to indicate units +# change to numeric +df$PredictedAffinityChange = as.numeric(df$PredictedAffinityChange) + +# should be TRUE +is.numeric( df$PredictedAffinityChange ) + +# change the column name to indicate units n = which(colnames(df) == "PredictedAffinityChange"); n colnames(df)[n] = "PredAffLog" colnames(df)[n] @@ -102,38 +121,44 @@ colnames(df)[n] #======== #STEP 2 #======== -#make Lig_outcome column categorical showing effect of mutation +# make Lig_outcome column categorical showing effect of mutation head(df$Lig_outcome) df$Lig_outcome = gsub("^.*-" , "", df$Lig_outcome) -#sanity checks +# sanity checks head(df$Lig_outcome) -#should be factor, check and if not change it to factor + +# should be factor, check and if not change it to factor is.factor(df$Lig_outcome) -#change to factor + +# change to factor df$Lig_outcome = as.factor(df$Lig_outcome) -#should be TRUE + +# should be TRUE is.factor(df$Lig_outcome) #======== #STEP 3 #======== -#gsub +# gsub head(df$Distancetoligand) df$Distancetoligand = gsub("Å" , "" , df$Distancetoligand) -#sanity checks +# sanity checks head(df$Distancetoligand) -#should be numeric, check if not change it to numeric -is.numeric(df$Distancetoligand) -#change to numeric -df$Distancetoligand = as.numeric(df$Distancetoligand) -#should be TRUE + +# should be numeric, check if not change it to numeric is.numeric(df$Distancetoligand) -#change the column name to indicate units +# change to numeric +df$Distancetoligand = as.numeric(df$Distancetoligand) + +# should be TRUE +is.numeric(df$Distancetoligand) + +# change the column name to indicate units n = which(colnames(df) == "Distancetoligand") colnames(df)[n] <- "Dis_lig_Ang" colnames(df)[n] @@ -146,16 +171,19 @@ head(df$DUETstabilitychange) df$DUETstabilitychange = gsub("Kcal/mol" , "" , df$DUETstabilitychange) -#sanity checks +# sanity checks head(df$DUETstabilitychange) -#should be numeric, check if not change it to numeric -is.numeric(df$DUETstabilitychange) -#change to numeric -df$DUETstabilitychange = as.numeric(df$DUETstabilitychange) -#should be TRUE + +# should be numeric, check if not change it to numeric is.numeric(df$DUETstabilitychange) -#change the column name to indicate units +# change to numeric +df$DUETstabilitychange = as.numeric(df$DUETstabilitychange) + +# should be TRUE +is.numeric(df$DUETstabilitychange) + +# change the column name to indicate units n = which(colnames(df) == "DUETstabilitychange"); n colnames(df)[n] = "DUETStability_Kcalpermol" colnames(df)[n] @@ -163,25 +191,20 @@ colnames(df)[n] #======== #STEP 5 #======== -#create yet another extra column: classification of DUET stability only +# create yet another extra column: classification of DUET stability only df$DUET_outcome = ifelse(df$DUETStability_Kcalpermol >=0 , "Stabilizing" - , "Destabilizing") #335, 12 + , "Destabilizing") # spelling to be consistent with mcsm table(df$Lig_outcome) -#Destabilizing Stabilizing -#281 54 table(df$DUET_outcome) -#Destabilizing Stabilizing -#288 47 + #============================== #FIXME #Insert a venn diagram - #================================ - #======== #STEP 6 #======== @@ -198,10 +221,10 @@ colnames(df[mut]) #======== #STEP 7 #======== -#create an extra column: maybe useful for some plots -df$WildPos = paste0(df$Wild_type, df$Position) #335, 13 +# create an extra column: maybe useful for some plots +df$WildPos = paste0(df$Wild_type, df$Position) -#clear variables +# clear variables rm(n, wt, mut) ################ end of data cleaning diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step4_normalise.R b/mcsm_analysis/pyrazinamide/scripts/mcsm/step4_results_normalise.R similarity index 62% rename from mcsm_analysis/pyrazinamide/scripts/mcsm/step4_normalise.R rename to mcsm_analysis/pyrazinamide/scripts/mcsm/step4_results_normalise.R index 4721e29..eb24cab 100644 --- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step4_normalise.R +++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step4_results_normalise.R @@ -1,25 +1,47 @@ -getwd() -#setwd("~/Documents/git/LSHTM_analysis/mcsm_complex1/Results") # work -setwd("~/git/LSHTM_analysis/mcsm_complex1/Results") # thinkpad -#setwd("/Users/tanu/git/LSHTM_analysis/mcsm_complex1/Results") # mac +################## +# load libraries + library(compare) +################## + getwd() #======================================================= -#TASK:read cleaned data and perform rescaling +# TASK:read cleaned data and perform rescaling # of DUET stability scores # of Pred affinity -#compare scaling methods with plots -#output normalised file +# compare scaling methods with plots + +# Requirements: +# input: R script, step3c_results_cleaning.R + # path: Data//input/processed/" +# output: NO output as the next scripts refers to this +# for yet more processing +# output normalised file #======================================================= +# specify variables for input and output paths and filenames +homedir = "~" +currdir = "/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/mcsm" +in_filename = "/step3c_results_cleaning.R" +infile = paste0(homedir, currdir, in_filename) +print(paste0("Input file is:", infile)) + +# output file +basedir = "/git/Data/pyrazinamide/input" +outpath = "/processed" +out_filename = "/mcsm_complex1_normalised.csv" +outfile = paste0(homedir, basedir, outpath, out_filename) +print(paste0("Output file is:", outfile)) + #################### #### read file #####: this will be the output of my R script that cleans the data columns #################### -source("../Scripts/step3c_data_cleaning.R") -##This will outut two dataframes: -##data: unclean data: 335, 10 -##df : cleaned df 335, 13 -## you can remove data if you want as you will not need it +source(infile) + +#This will outut two dataframes: +# data: unclean data: 10 cols +# df : cleaned df: 13 cols +# you can remove data if you want as you will not need it rm(data) colnames(df) @@ -36,67 +58,60 @@ group = which(colnames(df) == "Lig_outcome"); group # This is because this makes it easier to see the results of rescaling for debugging head(df$PredAffLog) -#ORDER BY PredAff scrores: negative values at the top and positive at the bottoom +# ORDER BY PredAff scrores: negative values at the top and positive at the bottoom df = df[order(df$PredAffLog),] head(df$PredAffLog) -#sanity checks -head(df[,n]) #all negatives -tail(df[,n]) #all positives +# sanity checks +head(df[,n]) # all negatives +tail(df[,n]) # all positives -#sanity checks +# sanity checks mean(df[,n]) #-0.9526746 tapply(df[,n], df[,group], mean) -#Destabilizing Stabilizing -#-1.2112100 0.3926667 + #=========================== -#Same as above: in 2 steps +# Same as above: in 2 steps #=========================== -#find range of your data -my_min = min(df[,n]); my_min #-3.948 -my_max = max(df[,n]); my_max #2.23 +# find range of your data +my_min = min(df[,n]); my_min # +my_max = max(df[,n]); my_max # #=============================================== # WITHIN GROUP rescaling 2: method "ratio" # create column to store the rescaled values # Rescaling separately (Less dangerous) -# =====> chosen one:as Nick prefers +# =====> chosen one: preserves sign #=============================================== df$ratioPredAff = ifelse(df[,n] < 0 , df[,n]/abs(my_min) , df[,n]/my_max - )#335 14 -#sanity checks + )# 14 cols +# sanity checks head(df$ratioPredAff) tail(df$ratioPredAff) min(df$ratioPredAff); max(df$ratioPredAff) tapply(df$ratioPredAff, df$Lig_outcome, min) -#Destabilizing Stabilizing -#-1.000000000 0.005381166 tapply(df$ratioPredAff, df$Lig_outcome, max) -#Destabilizing Stabilizing -#-0.001266464 1.000000000 -#should be the same as below (281 and 54) +# should be the same as below sum(df$ratioPredAff < 0); sum(df$ratioPredAff > 0) table(df$Lig_outcome) -#Destabilizing Stabilizing -#281 54 #=============================================== # Hist and density plots to compare the rescaling # methods: Base R #=============================================== -#uncomment as necessary +# uncomment as necessary my_title = "Ligand_stability" -#my_title = colnames(df[n]) +# my_title = colnames(df[n]) # Set the margin on all sides par(oma = c(3,2,3,0) @@ -140,7 +155,7 @@ rm(my_min, my_max, my_title, n, group) #=================== # 3b: DUET stability #=================== -dim(df) #335, 14 +dim(df) # 14 cols n = which(colnames(df) == "DUETStability_Kcalpermol"); n # 10 group = which(colnames(df) == "DUET_outcome"); group #12 @@ -151,63 +166,53 @@ group = which(colnames(df) == "DUET_outcome"); group #12 # This is because this makes it easier to see the results of rescaling for debugging head(df$DUETStability_Kcalpermol) -#ORDER BY DUET scores: negative values at the top and positive at the bottom +# ORDER BY DUET scores: negative values at the top and positive at the bottom df = df[order(df$DUETStability_Kcalpermol),] -#sanity checks -head(df[,n]) #negatives -tail(df[,n]) #positives +# sanity checks +head(df[,n]) # negatives +tail(df[,n]) # positives -#sanity checks +# sanity checks mean(df[,n]) -#[1] -1.173316 tapply(df[,n], df[,group], mean) -#Destabilizing Stabilizing -#-1.4297257 0.3978723 #=============================================== # WITHIN GROUP rescaling 2: method "ratio" # create column to store the rescaled values # Rescaling separately (Less dangerous) -# =====> chosen one:as Nick prefers +# =====> chosen one: preserves sign #=============================================== -#find range of your data -my_min = min(df[,n]); my_min #-3.87 -my_max = max(df[,n]); my_max #1.689 +# find range of your data +my_min = min(df[,n]); my_min +my_max = max(df[,n]); my_max df$ratioDUET = ifelse(df[,n] < 0 , df[,n]/abs(my_min) , df[,n]/my_max - ) #335, 15 -#sanity check + ) # 15 cols +# sanity check head(df$ratioDUET) tail(df$ratioDUET) min(df$ratioDUET); max(df$ratioDUET) -#sanity checks +# sanity checks tapply(df$ratioDUET, df$DUET_outcome, min) -#Destabilizing Stabilizing -#-1.00000000 0.01065719 tapply(df$ratioDUET, df$DUET_outcome, max) -#Destabilizing Stabilizing -#-0.003875969 1.000000000 -#should be the same as below (267 and 42) +# should be the same as below (267 and 42) sum(df$ratioDUET < 0); sum(df$ratioDUET > 0) table(df$DUET_outcome) -#Destabilizing Stabilizing -#288 47 #=============================================== # Hist and density plots to compare the rescaling # methods: Base R #=============================================== -#uncomment as necessary - +# uncomment as necessary my_title = "DUET_stability" #my_title = colnames(df[n]) @@ -246,7 +251,25 @@ mtext(text = my_title , line = 0 , outer = TRUE) +# reorder by column name +#data <- data[c("A", "B", "C")] +colnames(df) +df2 = df[c("X", "Mutationinformation", "WildPos", "Position" + , "Wild_type", "Mutant_type" + , "DUETStability_Kcalpermol", "DUET_outcome" + , "Dis_lig_Ang", "PredAffLog", "Lig_outcome" + , "ratioDUET", "ratioPredAff" + , "LigandID","Chain")] + +# sanity check +# should be True +#compare(df, df2, allowAll = T) +compare(df, df2, ignoreColOrder = T) +#TRUE +#reordered columns + #=================== # write output as csv file #=================== -write.csv(df, "../Data/mcsm_complex1_normalised.csv", row.names = FALSE) #335, 15 +#write.csv(df, "../Data/mcsm_complex1_normalised.csv", row.names = FALSE) +write.csv(df2, outfile, row.names = FALSE)