renamed files with shorter names and corrected paths for input and output files

This commit is contained in:
Tanushree Tunstall 2020-01-13 10:39:49 +00:00
parent 4be0de97d7
commit 61fcd14b17
8 changed files with 244 additions and 164 deletions

View file

@ -1,6 +1,9 @@
# run step0-step3a for mcsm pipeline
#!/bin/bash
# run all bash scripts for mcsm
#./step0_check_duplicate_SNPs.sh
#./step1_mCSMLig_curl_submit_store_outputurl.sh
./step2_mCSM_LIG_batch_outputurls_results.sh
./step3a_mCSM_LIG_regex_output_formatting.sh
#./step1_lig_output_urls.sh
./step2_lig_results.sh
./step3a_results_format_interim.sh

View file

@ -24,7 +24,7 @@ infile_mut="/pnca_mis_SNPs_v2_unique.csv"
infile_struc="/complex1_no_water.pdb"
outpath="${inpath}${processed_path}"
outfile="/mCSM_lig_complex1_result_url.txt"
outfile="/complex1_result_url.txt"
# create valid input and output filenames
#filename="${HOME}/git/Data/pyrazinamide/input/processed/pnca_mis_SNPs_v2_unique.csv"
@ -82,7 +82,7 @@ echo -e "${mutation} : processing entry ${COUNT}/$(wc -l < ${filename})..."
# create output file with the added number of muts from file
# after much thought, bad idea as less generic!
#echo -e "${host}${result_url}" >> ../Results/$(wc -l < ${filename})_mCSM_lig_complex1_result_url.txt
#echo -e "${host}${result_url}" >> ../Results/$(wc -l < ${filename})_complex1_result_url.txt
echo -e "${host}${result_url}" >> ${outfilename}
#echo -n '.'
done < "${filename}"

View file

@ -35,10 +35,10 @@
# specify variables for input and output paths and filenames
inpath="${HOME}/git/Data/pyrazinamide/input"
processed_path="/processed"
infile="/mCSM_lig_complex1_result_url.txt"
infile="/complex1_result_url.txt"
outpath="${inpath}${processed_path}"
outfile="/mCSM_lig_complex1_output_MASTER.txt"
outfile="/complex1_output_MASTER.txt"
# create valid input and output filenames
filename="${inpath}${processed_path}${infile}"

View file

@ -27,12 +27,11 @@ inpath="${HOME}/git/Data/pyrazinamide/input"
processed_path="/processed"
# Create input file: copy and rename output file of step2
oldfile="${inpath}${processed_path}/mCSM_lig_complex1_output_MASTER.txt"
newfile="${inpath}${processed_path}/mCSM_lig_complex1_output_processed.txt"
oldfile="${inpath}${processed_path}/complex1_output_MASTER.txt"
newfile="${inpath}${processed_path}/complex1_output_processed.txt"
cp $oldfile $newfile
#infile="../Results/336_mCSM_lig_complex1_output_processed.txt"
infile="/mCSM_lig_complex1_output_processed.txt"
infile="/complex1_output_processed.txt"
filename="${inpath}${processed_path}${infile}"
echo Input filename is : ${filename}

View file

@ -1,29 +0,0 @@
#!/usr/bin/python
import pandas as pd
from collections import defaultdict
#file = r'../Results/322_mCSM_lig_complex1_output_processed.txt'
outCols=[
'PredictedAffinityChange',
'Mutationinformation',
'Wild-type',
'Position',
'Mutant-type',
'Chain',
'LigandID',
'Distancetoligand',
'DUETstabilitychange'
]
lines = [line.rstrip('\n') for line in open('../Results/336_mCSM_lig_complex1_output_processed.txt')]
outputs = defaultdict(list)
for item in lines:
col, val = item.split(':')
outputs[col].append(val)
dfOut=pd.DataFrame(outputs)
pd.DataFrame.to_csv(dfOut,'../Results/336_complex1_formatted_results.csv', columns=outCols)

View file

@ -0,0 +1,61 @@
#!/usr/bin/python
###################
# load libraries
import os, sys
import pandas as pd
from collections import defaultdict
####################
#********************************************************************
# TASK: Formatting results with nice colnames
# step3a processed the mcsm results to remove all newlines and
# brought data in a format where the delimiter ":" splits
# data into a convenient format of "colname": "value".
# this script formats the data and outputs a df with each row
# as a mutation and its corresponding mcsm_values
# Requirements:
# input: output of step3a, file containing "..._output_processed.txt"
# path: "Data/<drug>/input/processed/<filename>"
# output: formatted .csv file
# path: "Data/<drug>/input/processed/<filename>"
#***********************************************************************
# specify variables for input and output paths and filenames
homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
basedir = "/git/Data/pyrazinamide/input"
inpath = "/processed"
in_filename = "/complex1_output_processed.txt"
infile = homedir + basedir + inpath + in_filename
print("Input file is:", infile)
outpath = "/processed"
out_filename = "/complex1_formatted_results.csv"
outfile = homedir + basedir + outpath + out_filename
print("Output file is:", outfile)
# end of variable assignment for input and output files
outCols=[
'PredictedAffinityChange',
'Mutationinformation',
'Wild-type',
'Position',
'Mutant-type',
'Chain',
'LigandID',
'Distancetoligand',
'DUETstabilitychange'
]
lines = [line.rstrip('\n') for line in open(infile)]
outputs = defaultdict(list)
for item in lines:
col, val = item.split(':')
outputs[col].append(val)
dfOut=pd.DataFrame(outputs)
pd.DataFrame.to_csv(dfOut, outfile, columns=outCols)

View file

@ -1,22 +1,42 @@
getwd()
#setwd("~/Documents/git/LSHTM_analysis/mcsm_complex1/Results") # work
setwd("~/git/LSHTM_analysis/mcsm_complex1/Results") # thinkpad
#setwd("/Users/tanu/git/LSHTM_analysis/mcsm_complex1/Results") # mac
#setwd("~/git/LSHTM_analysis/mcsm_complex1/Results")
getwd()
#=======================================================
# TASK: read formatted_results_df.csv to complete
# missing info, adding DUET categories, assigning
# meaningful colnames, etc.
# Requirements:
# input: output of step3b, python processing,
# path: Data/<drug>/input/processed/<filename>"
# output: NO output as the next scripts refers to this
# for yet more processing
#=======================================================
# specify variables for input and output paths and filenames
homedir = "~"
basedir = "/git/Data/pyrazinamide/input"
inpath = "/processed"
in_filename = "/complex1_formatted_results.csv"
infile = paste0(homedir, basedir, inpath, in_filename)
print(paste0("Input file is:", infile))
#======================================================
#TASK: To tidy the columns so you can generate figures
#=======================================================
####################
#### read file #####: this will be the output from python script (csv file)
####################
data = read.csv("336_complex1_formatted_results.csv"
data = read.csv(infile
, header = T
, stringsAsFactors = FALSE)
dim(data)
#335, 10
str(data)
# clear variables
rm(homedir, basedir, inpath, in_filename, infile)
###########################
##### Data processing #####
###########################
@ -31,41 +51,38 @@ data$Mutationinformation = paste0(data$Wild.type, data$Position, data$Mutant.typ
head(data$Mutationinformation)
tail(data$Mutationinformation)
#write.csv(data, 'test.csv')
##########################################
# Remove duplicate SNPs as a sanity check
##########################################
#very important
# very important
table(duplicated(data$Mutationinformation))
#FALSE
#335
#extract duplicated entries
# extract duplicated entries
dups = data[duplicated(data$Mutationinformation),] #0
#No of dups should match with the no. of TRUE in the above table
# No of dups should match with the no. of TRUE in the above table
#u_dups = unique(dups$Mutationinformation) #10
sum( table(dups$Mutationinformation) ) #13
rm(dups)
sum( table(dups$Mutationinformation) )
#***************************************************************
#select non-duplicated SNPs and create a new df
df = data[!duplicated(data$Mutationinformation),] #309, 10
# select non-duplicated SNPs and create a new df
df = data[!duplicated(data$Mutationinformation),]
#***************************************************************
#sanity check
# sanity check
u = unique(df$Mutationinformation)
u2 = unique(data$Mutationinformation)
table(u%in%u2)
#TRUE
#309
#should all be 1, hence 309 1's
# should all be 1
sum(table(df$Mutationinformation) == 1)
#sort df by Position
#MANUAL CHECKPOINT:
# sort df by Position
# MANUAL CHECKPOINT:
#foo <- df[order(df$Position),]
#df <- df[order(df$Position),]
# clear variables
rm(u, u2, dups)
####################
@ -75,26 +92,28 @@ rm(u, u2, dups)
#=======
#STEP 1
#========
#make a copy of the PredictedAffinityColumn and call it Lig_outcome
df$Lig_outcome = df$PredictedAffinityChange #335, 11
# make a copy of the PredictedAffinityColumn and call it Lig_outcome
df$Lig_outcome = df$PredictedAffinityChange
#make Predicted...column numeric and outcome column categorical
#make Predicted...column numeric and outcome column categorical
head(df$PredictedAffinityChange)
df$PredictedAffinityChange = gsub("log.*"
, ""
, df$PredictedAffinityChange)
#sanity checks
# sanity checks
head(df$PredictedAffinityChange)
#should be numeric, check and if not make it numeric
is.numeric( df$PredictedAffinityChange )
#change to numeric
df$PredictedAffinityChange = as.numeric(df$PredictedAffinityChange)
#should be TRUE
# should be numeric, check and if not make it numeric
is.numeric( df$PredictedAffinityChange )
#change the column name to indicate units
# change to numeric
df$PredictedAffinityChange = as.numeric(df$PredictedAffinityChange)
# should be TRUE
is.numeric( df$PredictedAffinityChange )
# change the column name to indicate units
n = which(colnames(df) == "PredictedAffinityChange"); n
colnames(df)[n] = "PredAffLog"
colnames(df)[n]
@ -102,38 +121,44 @@ colnames(df)[n]
#========
#STEP 2
#========
#make Lig_outcome column categorical showing effect of mutation
# make Lig_outcome column categorical showing effect of mutation
head(df$Lig_outcome)
df$Lig_outcome = gsub("^.*-"
, "",
df$Lig_outcome)
#sanity checks
# sanity checks
head(df$Lig_outcome)
#should be factor, check and if not change it to factor
# should be factor, check and if not change it to factor
is.factor(df$Lig_outcome)
#change to factor
# change to factor
df$Lig_outcome = as.factor(df$Lig_outcome)
#should be TRUE
# should be TRUE
is.factor(df$Lig_outcome)
#========
#STEP 3
#========
#gsub
# gsub
head(df$Distancetoligand)
df$Distancetoligand = gsub("&Aring;"
, ""
, df$Distancetoligand)
#sanity checks
# sanity checks
head(df$Distancetoligand)
#should be numeric, check if not change it to numeric
is.numeric(df$Distancetoligand)
#change to numeric
df$Distancetoligand = as.numeric(df$Distancetoligand)
#should be TRUE
# should be numeric, check if not change it to numeric
is.numeric(df$Distancetoligand)
#change the column name to indicate units
# change to numeric
df$Distancetoligand = as.numeric(df$Distancetoligand)
# should be TRUE
is.numeric(df$Distancetoligand)
# change the column name to indicate units
n = which(colnames(df) == "Distancetoligand")
colnames(df)[n] <- "Dis_lig_Ang"
colnames(df)[n]
@ -146,16 +171,19 @@ head(df$DUETstabilitychange)
df$DUETstabilitychange = gsub("Kcal/mol"
, ""
, df$DUETstabilitychange)
#sanity checks
# sanity checks
head(df$DUETstabilitychange)
#should be numeric, check if not change it to numeric
is.numeric(df$DUETstabilitychange)
#change to numeric
df$DUETstabilitychange = as.numeric(df$DUETstabilitychange)
#should be TRUE
# should be numeric, check if not change it to numeric
is.numeric(df$DUETstabilitychange)
#change the column name to indicate units
# change to numeric
df$DUETstabilitychange = as.numeric(df$DUETstabilitychange)
# should be TRUE
is.numeric(df$DUETstabilitychange)
# change the column name to indicate units
n = which(colnames(df) == "DUETstabilitychange"); n
colnames(df)[n] = "DUETStability_Kcalpermol"
colnames(df)[n]
@ -163,25 +191,20 @@ colnames(df)[n]
#========
#STEP 5
#========
#create yet another extra column: classification of DUET stability only
# create yet another extra column: classification of DUET stability only
df$DUET_outcome = ifelse(df$DUETStability_Kcalpermol >=0
, "Stabilizing"
, "Destabilizing") #335, 12
, "Destabilizing") # spelling to be consistent with mcsm
table(df$Lig_outcome)
#Destabilizing Stabilizing
#281 54
table(df$DUET_outcome)
#Destabilizing Stabilizing
#288 47
#==============================
#FIXME
#Insert a venn diagram
#================================
#========
#STEP 6
#========
@ -198,10 +221,10 @@ colnames(df[mut])
#========
#STEP 7
#========
#create an extra column: maybe useful for some plots
df$WildPos = paste0(df$Wild_type, df$Position) #335, 13
# create an extra column: maybe useful for some plots
df$WildPos = paste0(df$Wild_type, df$Position)
#clear variables
# clear variables
rm(n, wt, mut)
################ end of data cleaning

View file

@ -1,25 +1,47 @@
getwd()
#setwd("~/Documents/git/LSHTM_analysis/mcsm_complex1/Results") # work
setwd("~/git/LSHTM_analysis/mcsm_complex1/Results") # thinkpad
#setwd("/Users/tanu/git/LSHTM_analysis/mcsm_complex1/Results") # mac
##################
# load libraries
library(compare)
##################
getwd()
#=======================================================
#TASK:read cleaned data and perform rescaling
# TASK:read cleaned data and perform rescaling
# of DUET stability scores
# of Pred affinity
#compare scaling methods with plots
#output normalised file
# compare scaling methods with plots
# Requirements:
# input: R script, step3c_results_cleaning.R
# path: Data/<drug>/input/processed/<filename>"
# output: NO output as the next scripts refers to this
# for yet more processing
# output normalised file
#=======================================================
# specify variables for input and output paths and filenames
homedir = "~"
currdir = "/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/mcsm"
in_filename = "/step3c_results_cleaning.R"
infile = paste0(homedir, currdir, in_filename)
print(paste0("Input file is:", infile))
# output file
basedir = "/git/Data/pyrazinamide/input"
outpath = "/processed"
out_filename = "/mcsm_complex1_normalised.csv"
outfile = paste0(homedir, basedir, outpath, out_filename)
print(paste0("Output file is:", outfile))
####################
#### read file #####: this will be the output of my R script that cleans the data columns
####################
source("../Scripts/step3c_data_cleaning.R")
##This will outut two dataframes:
##data: unclean data: 335, 10
##df : cleaned df 335, 13
## you can remove data if you want as you will not need it
source(infile)
#This will outut two dataframes:
# data: unclean data: 10 cols
# df : cleaned df: 13 cols
# you can remove data if you want as you will not need it
rm(data)
colnames(df)
@ -36,67 +58,60 @@ group = which(colnames(df) == "Lig_outcome"); group
# This is because this makes it easier to see the results of rescaling for debugging
head(df$PredAffLog)
#ORDER BY PredAff scrores: negative values at the top and positive at the bottoom
# ORDER BY PredAff scrores: negative values at the top and positive at the bottoom
df = df[order(df$PredAffLog),]
head(df$PredAffLog)
#sanity checks
head(df[,n]) #all negatives
tail(df[,n]) #all positives
# sanity checks
head(df[,n]) # all negatives
tail(df[,n]) # all positives
#sanity checks
# sanity checks
mean(df[,n])
#-0.9526746
tapply(df[,n], df[,group], mean)
#Destabilizing Stabilizing
#-1.2112100 0.3926667
#===========================
#Same as above: in 2 steps
# Same as above: in 2 steps
#===========================
#find range of your data
my_min = min(df[,n]); my_min #-3.948
my_max = max(df[,n]); my_max #2.23
# find range of your data
my_min = min(df[,n]); my_min #
my_max = max(df[,n]); my_max #
#===============================================
# WITHIN GROUP rescaling 2: method "ratio"
# create column to store the rescaled values
# Rescaling separately (Less dangerous)
# =====> chosen one:as Nick prefers
# =====> chosen one: preserves sign
#===============================================
df$ratioPredAff = ifelse(df[,n] < 0
, df[,n]/abs(my_min)
, df[,n]/my_max
)#335 14
#sanity checks
)# 14 cols
# sanity checks
head(df$ratioPredAff)
tail(df$ratioPredAff)
min(df$ratioPredAff); max(df$ratioPredAff)
tapply(df$ratioPredAff, df$Lig_outcome, min)
#Destabilizing Stabilizing
#-1.000000000 0.005381166
tapply(df$ratioPredAff, df$Lig_outcome, max)
#Destabilizing Stabilizing
#-0.001266464 1.000000000
#should be the same as below (281 and 54)
# should be the same as below
sum(df$ratioPredAff < 0); sum(df$ratioPredAff > 0)
table(df$Lig_outcome)
#Destabilizing Stabilizing
#281 54
#===============================================
# Hist and density plots to compare the rescaling
# methods: Base R
#===============================================
#uncomment as necessary
# uncomment as necessary
my_title = "Ligand_stability"
#my_title = colnames(df[n])
# my_title = colnames(df[n])
# Set the margin on all sides
par(oma = c(3,2,3,0)
@ -140,7 +155,7 @@ rm(my_min, my_max, my_title, n, group)
#===================
# 3b: DUET stability
#===================
dim(df) #335, 14
dim(df) # 14 cols
n = which(colnames(df) == "DUETStability_Kcalpermol"); n # 10
group = which(colnames(df) == "DUET_outcome"); group #12
@ -151,63 +166,53 @@ group = which(colnames(df) == "DUET_outcome"); group #12
# This is because this makes it easier to see the results of rescaling for debugging
head(df$DUETStability_Kcalpermol)
#ORDER BY DUET scores: negative values at the top and positive at the bottom
# ORDER BY DUET scores: negative values at the top and positive at the bottom
df = df[order(df$DUETStability_Kcalpermol),]
#sanity checks
head(df[,n]) #negatives
tail(df[,n]) #positives
# sanity checks
head(df[,n]) # negatives
tail(df[,n]) # positives
#sanity checks
# sanity checks
mean(df[,n])
#[1] -1.173316
tapply(df[,n], df[,group], mean)
#Destabilizing Stabilizing
#-1.4297257 0.3978723
#===============================================
# WITHIN GROUP rescaling 2: method "ratio"
# create column to store the rescaled values
# Rescaling separately (Less dangerous)
# =====> chosen one:as Nick prefers
# =====> chosen one: preserves sign
#===============================================
#find range of your data
my_min = min(df[,n]); my_min #-3.87
my_max = max(df[,n]); my_max #1.689
# find range of your data
my_min = min(df[,n]); my_min
my_max = max(df[,n]); my_max
df$ratioDUET = ifelse(df[,n] < 0
, df[,n]/abs(my_min)
, df[,n]/my_max
) #335, 15
#sanity check
) # 15 cols
# sanity check
head(df$ratioDUET)
tail(df$ratioDUET)
min(df$ratioDUET); max(df$ratioDUET)
#sanity checks
# sanity checks
tapply(df$ratioDUET, df$DUET_outcome, min)
#Destabilizing Stabilizing
#-1.00000000 0.01065719
tapply(df$ratioDUET, df$DUET_outcome, max)
#Destabilizing Stabilizing
#-0.003875969 1.000000000
#should be the same as below (267 and 42)
# should be the same as below (267 and 42)
sum(df$ratioDUET < 0); sum(df$ratioDUET > 0)
table(df$DUET_outcome)
#Destabilizing Stabilizing
#288 47
#===============================================
# Hist and density plots to compare the rescaling
# methods: Base R
#===============================================
#uncomment as necessary
# uncomment as necessary
my_title = "DUET_stability"
#my_title = colnames(df[n])
@ -246,7 +251,25 @@ mtext(text = my_title
, line = 0
, outer = TRUE)
# reorder by column name
#data <- data[c("A", "B", "C")]
colnames(df)
df2 = df[c("X", "Mutationinformation", "WildPos", "Position"
, "Wild_type", "Mutant_type"
, "DUETStability_Kcalpermol", "DUET_outcome"
, "Dis_lig_Ang", "PredAffLog", "Lig_outcome"
, "ratioDUET", "ratioPredAff"
, "LigandID","Chain")]
# sanity check
# should be True
#compare(df, df2, allowAll = T)
compare(df, df2, ignoreColOrder = T)
#TRUE
#reordered columns
#===================
# write output as csv file
#===================
write.csv(df, "../Data/mcsm_complex1_normalised.csv", row.names = FALSE) #335, 15
#write.csv(df, "../Data/mcsm_complex1_normalised.csv", row.names = FALSE)
write.csv(df2, outfile, row.names = FALSE)