import commit

This commit is contained in:
Tanushree Tunstall 2020-01-08 16:15:33 +00:00
commit bccfe68192
39 changed files with 6837 additions and 0 deletions

View file

@ -0,0 +1,25 @@
#!/bin/bash
# Fail fast: abort on errors, unset variables, and pipeline failures.
set -euo pipefail
#*************************************
# need to be in the correct directory
#*************************************
##: comments for code
#: commented out code
#**********************************************************************
# TASK: Text file containing a list of SNPs; SNP in the format(C2E)
# per line. Sort by unique, which automatically removes duplicates.
# save file in current directory
#**********************************************************************
infile="${HOME}/git/Data/input/processed/pyrazinamide/pnca_mis_SNPs_v2.csv"
outfile="${HOME}/git/Data/input/processed/pyrazinamide/pnca_mis_SNPs_v2_unique.csv"
# sort unique entries and output to current directory
# (expansions quoted so paths containing spaces do not word-split)
sort -u "${infile}" > "${outfile}"
# count no. of unique snps mCSM will run on
count=$(wc -l < "${outfile}")
# print to console no. of unique snps mCSM will run on
echo "${count} unique mutations for mCSM to run on"

View file

@ -0,0 +1,72 @@
#!/bin/bash
#*************************************
#need to be in the correct directory
#*************************************
##: comments for code
#: commented out code
#**********************************************************************
# TASK: submit requests using curl: HANDLE redirects and refresh url.
# Iterate over mutation file and write/append result urls to a file
# result url file: stored in the /Results directory
# mutation file: one mutation per line, no chain ID
# output: in a file, should be n urls (n=no. of mutations in file)
# NOTE: these are just result urls, not actual values for results
#**********************************************************************
## iterate over mutation file; line by line and submit query using curl
filename="../Data/pnca_mis_SNPs_v2_unique.csv"

# Submission parameters are identical for every mutation, so set them
# once here instead of re-assigning them on every loop iteration.
pdb='../Data/complex1_no_water.pdb'
chain="A"
lig_id="PZA"
affin_wt="0.99"
host="http://biosig.unimelb.edu.au"
call_url="/mcsm_lig/prediction"

# Total no. of mutations: computed once and reused in the progress
# messages and in the output filename (avoids one wc call per entry).
total=$(wc -l < "${filename}")

## some useful messages
echo -n -e "Processing ${total} entries from ${filename}\n"
COUNT=0
while read -r line; do
    ((COUNT++))
    mutation="${line}"
#   echo "${mutation}"
##=========================================
##html field_names names required for curl
##complex_field:wild=@
##mutation_field:mutation=@
##chain_field:chain=@
##ligand_field:lig_id@
##energy_field:affin_wt
#=========================================
    # Submit the form; -L follows redirects, -sS is silent but still
    # reports errors. Keep only the meta-refresh line, which carries
    # the (relative) result url.
    refresh_url=$(curl -L \
        -sS \
        -F "wild=@${pdb}" \
        -F "mutation=${mutation}" \
        -F "chain=${chain}" \
        -F "lig_id=${lig_id}" \
        -F "affin_wt=${affin_wt}" \
        "${host}${call_url}" | grep "http-equiv")
    #echo $refresh_url
    #echo ${host}${refresh_url}
    #use regex to extract the relevant bit from the refresh url
    #regex:sed -r 's/.*(\/mcsm.*)".*$/\1/g'
    #Now build: result url using host and refresh url and write the urls to a file in the Results dir
    result_url=$(echo "${refresh_url}" | sed -r 's/.*(\/mcsm.*)".*$/\1/g')
    # be polite to the server: pause between submissions
    sleep 10
    echo -e "${mutation} : processing entry ${COUNT}/${total}..."
    echo -e "${host}${result_url}" >> "../Results/${total}_mCSM_lig_complex1_result_url.txt"
    #echo -n '.'
done < "${filename}"
echo
echo "Processing Complete"
##end of submitting query, receiving result url and storing results url in a file

View file

@ -0,0 +1,59 @@
#!/bin/bash
#*************************************
#need to be in the correct directory
#*************************************
##: comments for code
#: commented out code
#********************************************************************
# TASK: submit result urls and fetch actual results using curl
# iterate over each result url from the output of step1 in the stored
# in file in /Results.
# Use curl to fetch results and extract relevant sections using hxtools
# and store these in another file in /Results
# This script takes two arguments:
# input file: file containing results url
# In this case: 336_mCSM_lig_complex1_result_url.txt
# output file: name of the file where extracted results will be stored
# In this case : it is 336_mCSM_lig_complex1_output_MASTER.txt
#*********************************************************************
#if [ "$#" -ne 2 ]; then
#if [ -Z $1 ]; then
#    echo "
#    Please provide both Input and Output files.
#    Usage: batch_read_urls.sh INFILE OUTFILE
#    "
#    exit 1
#fi
# First argument: Input File
# Second argument: Output File
#infile=$1
#outfile=$2
infile="${HOME}/git/LSHTM_analysis/mcsm_complex1/Results/336_mCSM_lig_complex1_result_url.txt"
outfile="${HOME}/git/LSHTM_analysis/mcsm_complex1/Results/336_mCSM_lig_complex1_output_MASTER.txt"

# Line count is constant: compute once instead of once per url
# in the progress message below.
total=$(wc -l < "${infile}")
echo -n "Processing ${total} entries from ${infile}"
echo
COUNT=0
while read -r line; do
    #COUNT=$(($COUNT+1))
    ((COUNT++))
    # Fetch the result page, normalise the html, pull out the two divs
    # holding the prediction values, then strip tags and whitespace.
    curl --silent "${line}" \
        | hxnormalize -x \
        | hxselect -c div.span4 \
        | hxselect -c div.well \
        | sed -r -e 's/<[^>]*>//g' \
        | sed -re 's/ +//g' \
        >> "${outfile}"
    #| tee -a ${outfile}
    # echo -n '.'
    echo -e "Processing entry ${COUNT}/${total}..."
done < "${infile}"
echo
echo "Processing Complete"

View file

@ -0,0 +1,52 @@
#!/bin/bash
#*************************************
#need to be in the correct directory
#*************************************
##: comments for code
#: commented out code
#********************************************************************
# TASK: Intermediate results processing
# output file has a convenient delimiter of ":" that can be used to
# format the file into two columns (col1: field_desc and col2: values)
# However the section "PredictedAffinityChange:...." and
# "DUETstabilitychange:.." are split over multiple lines and
# prevent this from happening. Additionally there are other empty lines
# that need to be omitted. In order to ensure these sections are not
# split over multiple lines, this script is written.
#*********************************************************************
infile="../Results/336_mCSM_lig_complex1_output_processed.txt"
#sed -i '/PredictedAffinityChange:/ { N; N; N; N; s/\n//g;}' ${infile} \
#    | sed -i '/DUETstabilitychange:/ {x; N; N; s/\n//g; p;d;}' ${infile}
# Outputs records separated by a newline, that look something like this:
# PredictedAffinityChange:-2.2log(affinityfoldchange)-Destabilizing
# Mutationinformation:
# Wild-type:L
# Position:4
# Mutant-type:W
# Chain:A
# LigandID:PZA
# Distancetoligand:15.911&Aring;
# DUETstabilitychange:-2.169Kcal/mol
#
# PredictedAffinityChange:-1.538log(affinityfoldchange)-Destabilizing
# (...etc)
# This script brings everything in a convenient format for further processing in python.
# bear in mind, this replaces the file in place, so make sure you retain a copy for your records
#
# sed programme:
#   - on a PredictedAffinityChange line, pull the next 4 lines into the
#     pattern space (N) and delete the embedded newlines, joining the
#     wrapped record back onto one line
#   - likewise join DUETstabilitychange with its 2 continuation lines
#   - finally drop all empty lines
sed -i '/PredictedAffinityChange/ {
N
N
N
N
s/\n//g
}
/DUETstabilitychange:/ {
N
N
s/\n//g
}
/^$/d' "${infile}"

View file

@ -0,0 +1,29 @@
#!/usr/bin/python
"""Pivot mCSM-lig scraped output into a CSV.

Reads the processed results file (one ``Field:value`` pair per line,
records concatenated back-to-back) and writes a CSV with one column
per field, in the fixed order given by ``outCols``.
"""
import pandas as pd
from collections import defaultdict

#file = r'../Results/322_mCSM_lig_complex1_output_processed.txt'

# Fixed column order for the output csv (matches one mCSM-lig record).
outCols = [
    'PredictedAffinityChange',
    'Mutationinformation',
    'Wild-type',
    'Position',
    'Mutant-type',
    'Chain',
    'LigandID',
    'Distancetoligand',
    'DUETstabilitychange'
]


def parse_records(lines):
    """Group 'Field:value' lines into a dict of field -> list of values.

    Splits on the FIRST ':' only, so values that themselves contain a
    colon are preserved intact (plain split(':') would raise ValueError).
    Raises ValueError on a line with no ':' at all, as before.
    """
    outputs = defaultdict(list)
    for item in lines:
        col, val = item.split(':', 1)
        outputs[col].append(val)
    return outputs


def main():
    # Context manager ensures the input file is closed even on error;
    # the original left the handle dangling.
    infile = '../Results/336_mCSM_lig_complex1_output_processed.txt'
    with open(infile) as fh:
        lines = [line.rstrip('\n') for line in fh]
    dfOut = pd.DataFrame(parse_records(lines))
    dfOut.to_csv('../Results/336_complex1_formatted_results.csv', columns=outCols)


if __name__ == '__main__':
    main()

View file

@ -0,0 +1,207 @@
getwd()
#setwd("~/Documents/git/LSHTM_analysis/mcsm_complex1/Results") # work
setwd("~/git/LSHTM_analysis/mcsm_complex1/Results") # thinkpad
#setwd("/Users/tanu/git/LSHTM_analysis/mcsm_complex1/Results") # mac
getwd()
#=======================================================
#TASK: To tidy the columns so you can generate figures
#=======================================================
####################
#### read file #####: this will be the output from python script (csv file)
####################
data = read.csv("336_complex1_formatted_results.csv"
                , header = T
                , stringsAsFactors = FALSE)
dim(data)
#335, 10
str(data)
###########################
##### Data processing #####
###########################
# populate mutation information columns as currently it is empty
head(data$Mutationinformation)
tail(data$Mutationinformation)
# should not be blank: create mutation information by concatenating
# wild-type residue, position and mutant residue (e.g. "L4W")
data$Mutationinformation = paste0(data$Wild.type, data$Position, data$Mutant.type)
head(data$Mutationinformation)
tail(data$Mutationinformation)
#write.csv(data, 'test.csv')
##########################################
# Remove duplicate SNPs as a sanity check
##########################################
#very important
table(duplicated(data$Mutationinformation))
#FALSE
#335
#extract duplicated entries
dups = data[duplicated(data$Mutationinformation),] #0 rows: no duplicates in this dataset
#No of dups should match with the no. of TRUE in the above table
#u_dups = unique(dups$Mutationinformation)
sum( table(dups$Mutationinformation) ) #0, consistent with the table above
rm(dups)
#***************************************************************
#select non-duplicated SNPs and create a new df
df = data[!duplicated(data$Mutationinformation),] #335, 10 (no dups, so all rows kept)
#***************************************************************
#sanity check
u = unique(df$Mutationinformation)
u2 = unique(data$Mutationinformation)
table(u%in%u2)
#TRUE
#335
#should all be 1, hence 335 1's
sum(table(df$Mutationinformation) == 1)
#sort df by Position
#MANUAL CHECKPOINT:
#foo <- df[order(df$Position),]
#df <- df[order(df$Position),]
# NOTE: dups was already removed above, so only u and u2 remain to clear;
# including dups here again raised "object 'dups' not found"
rm(u, u2)
####################
#### give meaningful colnames to reflect units to enable correct data type
####################
#=======
#STEP 1
#========
#make a copy of the PredictedAffinityColumn and call it Lig_outcome
df$Lig_outcome = df$PredictedAffinityChange #335, 11
#make Predicted...column numeric and outcome column categorical
head(df$PredictedAffinityChange)
# strip the trailing "log(affinityfoldchange)-..." text, leaving only
# the leading numeric value (e.g. "-2.2log(...)-Destabilizing" -> "-2.2")
df$PredictedAffinityChange = gsub("log.*"
                                  , ""
                                  , df$PredictedAffinityChange)
#sanity checks
head(df$PredictedAffinityChange)
#should be numeric, check and if not make it numeric
is.numeric( df$PredictedAffinityChange )
#change to numeric
df$PredictedAffinityChange = as.numeric(df$PredictedAffinityChange)
#should be TRUE
is.numeric( df$PredictedAffinityChange )
#change the column name to indicate units
n = which(colnames(df) == "PredictedAffinityChange"); n
colnames(df)[n] = "PredAffLog"
colnames(df)[n]
#========
#STEP 2
#========
#make Lig_outcome column categorical showing effect of mutation
head(df$Lig_outcome)
# "^.*-" is greedy: strips everything up to the LAST '-', keeping only
# the final word ("Stabilizing"/"Destabilizing")
df$Lig_outcome = gsub("^.*-"
                      , "",
                      df$Lig_outcome)
#sanity checks
head(df$Lig_outcome)
#should be factor, check and if not change it to factor
is.factor(df$Lig_outcome)
#change to factor
df$Lig_outcome = as.factor(df$Lig_outcome)
#should be TRUE
is.factor(df$Lig_outcome)
#========
#STEP 3
#========
#gsub: remove the "&Aring;" html entity (Angstrom symbol) so the
#distance becomes a plain number
head(df$Distancetoligand)
df$Distancetoligand = gsub("&Aring;"
                           , ""
                           , df$Distancetoligand)
#sanity checks
head(df$Distancetoligand)
#should be numeric, check if not change it to numeric
is.numeric(df$Distancetoligand)
#change to numeric
df$Distancetoligand = as.numeric(df$Distancetoligand)
#should be TRUE
is.numeric(df$Distancetoligand)
#change the column name to indicate units
n = which(colnames(df) == "Distancetoligand")
colnames(df)[n] <- "Dis_lig_Ang"
colnames(df)[n]
#========
#STEP 4
#========
#gsub: strip the "Kcal/mol" unit suffix so the value can be numeric
head(df$DUETstabilitychange)
df$DUETstabilitychange = gsub("Kcal/mol"
                              , ""
                              , df$DUETstabilitychange)
#sanity checks
head(df$DUETstabilitychange)
#should be numeric, check if not change it to numeric
is.numeric(df$DUETstabilitychange)
#change to numeric
df$DUETstabilitychange = as.numeric(df$DUETstabilitychange)
#should be TRUE
is.numeric(df$DUETstabilitychange)
#change the column name to indicate units
n = which(colnames(df) == "DUETstabilitychange"); n
colnames(df)[n] = "DUETStability_Kcalpermol"
colnames(df)[n]
#========
#STEP 5
#========
#create yet another extra column: classification of DUET stability only
# NOTE: >= 0 means a score of exactly 0 is classed as "Stabilizing"
df$DUET_outcome = ifelse(df$DUETStability_Kcalpermol >=0
                         , "Stabilizing"
                         , "Destabilizing") #335, 12
table(df$Lig_outcome)
#Destabilizing Stabilizing
#281 54
table(df$DUET_outcome)
#Destabilizing Stabilizing
#288 47
#==============================
#FIXME
#Insert a venn diagram
#================================
#========
#STEP 6
#========
# assign wild and mutant colnames correctly
# (read.csv turned "Wild-type"/"Mutant-type" into "Wild.type"/"Mutant.type")
wt = which(colnames(df) == "Wild.type"); wt
colnames(df)[wt] <- "Wild_type"
colnames(df[wt])
mut = which(colnames(df) == "Mutant.type"); mut
colnames(df)[mut] <- "Mutant_type"
colnames(df[mut])
#========
#STEP 7
#========
#create an extra column: maybe useful for some plots
df$WildPos = paste0(df$Wild_type, df$Position) #335, 13
#clear variables
rm(n, wt, mut)
################ end of data cleaning

View file

@ -0,0 +1,252 @@
getwd()
#setwd("~/Documents/git/LSHTM_analysis/mcsm_complex1/Results") # work
setwd("~/git/LSHTM_analysis/mcsm_complex1/Results") # thinkpad
#setwd("/Users/tanu/git/LSHTM_analysis/mcsm_complex1/Results") # mac
getwd()
#=======================================================
#TASK:read cleaned data and perform rescaling
# of DUET stability scores
# of Pred affinity
#compare scaling methods with plots
#output normalised file
#=======================================================
####################
#### read file #####: this will be the output of my R script that cleans the data columns
####################
source("../Scripts/step3c_data_cleaning.R")
##This will output two dataframes:
##data: unclean data: 335, 10
##df  : cleaned df 335, 13
## you can remove data if you want as you will not need it
rm(data)
colnames(df)
#===================
#3a: PredAffLog
#===================
# column indices used below: n = score column, group = outcome column
n = which(colnames(df) == "PredAffLog"); n
group = which(colnames(df) == "Lig_outcome"); group
#===================================================
# order according to PredAffLog values
#===================================================
# This is because this makes it easier to see the results of rescaling for debugging
head(df$PredAffLog)
#ORDER BY PredAff scores: negative values at the top and positive at the bottom
df = df[order(df$PredAffLog),]
head(df$PredAffLog)
#sanity checks
head(df[,n]) #all negatives
tail(df[,n]) #all positives
#sanity checks
mean(df[,n])
#-0.9526746
tapply(df[,n], df[,group], mean)
#Destabilizing Stabilizing
#-1.2112100 0.3926667
#===========================
#Same as above: in 2 steps
#===========================
#find range of your data
my_min = min(df[,n]); my_min #-3.948
my_max = max(df[,n]); my_max #2.23
#===============================================
# WITHIN GROUP rescaling 2: method "ratio"
# create column to store the rescaled values
# Rescaling separately (Less dangerous)
# =====> chosen one:as Nick prefers
#===============================================
# negatives are divided by |min|, positives by max, so the rescaled
# values lie in [-1, 1] and each sign group is scaled independently
df$ratioPredAff = ifelse(df[,n] < 0
                         , df[,n]/abs(my_min)
                         , df[,n]/my_max
)#335 14
#sanity checks
head(df$ratioPredAff)
tail(df$ratioPredAff)
min(df$ratioPredAff); max(df$ratioPredAff)
tapply(df$ratioPredAff, df$Lig_outcome, min)
#Destabilizing Stabilizing
#-1.000000000 0.005381166
tapply(df$ratioPredAff, df$Lig_outcome, max)
#Destabilizing Stabilizing
#-0.001266464 1.000000000
#should be the same as below (281 and 54)
sum(df$ratioPredAff < 0); sum(df$ratioPredAff > 0)
table(df$Lig_outcome)
#Destabilizing Stabilizing
#281 54
#===============================================
# Hist and density plots to compare the rescaling
# methods: Base R
#===============================================
#uncomment as necessary
my_title = "Ligand_stability"
#my_title = colnames(df[n])
# Set the margin on all sides
par(oma = c(3,2,3,0)
    , mar = c(1,3,5,2)
    , mfrow = c(2,2))
hist(df[,n]
     , xlab = ""
     , main = "Raw values"
)
hist(df$ratioPredAff
     , xlab = ""
     , main = "ratio rescaling"
)
# Plot density plots underneath
plot(density( df[,n] )
     , main = "Raw values"
)
plot(density( df$ratioPredAff )
     , main = "ratio rescaling"
)
# titles
mtext(text = "Frequency"
      , side = 2
      , line = 0
      , outer = TRUE)
mtext(text = my_title
      , side = 3
      , line = 0
      , outer = TRUE)
#clear variables
rm(my_min, my_max, my_title, n, group)
#===================
# 3b: DUET stability
#===================
dim(df) #335, 14
# column indices used below: n = score column, group = outcome column
n = which(colnames(df) == "DUETStability_Kcalpermol"); n # 10
group = which(colnames(df) == "DUET_outcome"); group #12
#===================================================
# order according to DUET scores
#===================================================
# This is because this makes it easier to see the results of rescaling for debugging
head(df$DUETStability_Kcalpermol)
#ORDER BY DUET scores: negative values at the top and positive at the bottom
df = df[order(df$DUETStability_Kcalpermol),]
#sanity checks
head(df[,n]) #negatives
tail(df[,n]) #positives
#sanity checks
mean(df[,n])
#[1] -1.173316
tapply(df[,n], df[,group], mean)
#Destabilizing Stabilizing
#-1.4297257 0.3978723
#===============================================
# WITHIN GROUP rescaling 2: method "ratio"
# create column to store the rescaled values
# Rescaling separately (Less dangerous)
# =====> chosen one:as Nick prefers
#===============================================
#find range of your data
my_min = min(df[,n]); my_min #-3.87
my_max = max(df[,n]); my_max #1.689
# negatives divided by |min|, positives by max: rescaled range is [-1, 1]
df$ratioDUET = ifelse(df[,n] < 0
                      , df[,n]/abs(my_min)
                      , df[,n]/my_max
) #335, 15
#sanity check
head(df$ratioDUET)
tail(df$ratioDUET)
min(df$ratioDUET); max(df$ratioDUET)
#sanity checks
tapply(df$ratioDUET, df$DUET_outcome, min)
#Destabilizing Stabilizing
#-1.00000000 0.01065719
tapply(df$ratioDUET, df$DUET_outcome, max)
#Destabilizing Stabilizing
#-0.003875969 1.000000000
#should be the same as below (288 and 47)
sum(df$ratioDUET < 0); sum(df$ratioDUET > 0)
table(df$DUET_outcome)
#Destabilizing Stabilizing
#288 47
#===============================================
# Hist and density plots to compare the rescaling
# methods: Base R
#===============================================
#uncomment as necessary
my_title = "DUET_stability"
#my_title = colnames(df[n])
# Set the margin on all sides
par(oma = c(3,2,3,0)
    , mar = c(1,3,5,2)
    , mfrow = c(2,2))
hist(df[,n]
     , xlab = ""
     , main = "Raw values"
)
hist(df$ratioDUET
     , xlab = ""
     , main = "ratio rescaling"
)
# Plot density plots underneath
plot(density( df[,n] )
     , main = "Raw values"
)
plot(density( df$ratioDUET )
     , main = "ratio rescaling"
)
# graph titles
mtext(text = "Frequency"
      , side = 2
      , line = 0
      , outer = TRUE)
mtext(text = my_title
      , side = 3
      , line = 0
      , outer = TRUE)
#===================
# write output as csv file
#===================
# final normalised dataframe: includes ratioPredAff and ratioDUET columns
write.csv(df, "../Data/mcsm_complex1_normalised.csv", row.names = FALSE) #335, 15