import commit

This commit is contained in:
Tanushree Tunstall 2020-01-08 16:15:33 +00:00
commit bccfe68192
39 changed files with 6837 additions and 0 deletions

View file

@ -0,0 +1,25 @@
#!/bin/bash
# Fail fast: abort on errors, unset variables, and pipeline failures.
set -euo pipefail
#*************************************
# need to be in the correct directory
#*************************************
##: comments for code
#: commented out code
#**********************************************************************
# TASK: Text file containing a list of SNPs; SNP in the format(C2E)
# per line. Sort by unique, which automatically removes duplicates.
# save file in current directory
#**********************************************************************
infile="${HOME}/git/Data/input/processed/pyrazinamide/pnca_mis_SNPs_v2.csv"
outfile="${HOME}/git/Data/input/processed/pyrazinamide/pnca_mis_SNPs_v2_unique.csv"
# sort unique entries and output to current directory
# (expansions quoted so paths containing spaces do not word-split)
sort -u "${infile}" > "${outfile}"
# count no. of unique snps mCSM will run on
count=$(wc -l < "${outfile}")
# print to console no. of unique snps mCSM will run on
echo "${count} unique mutations for mCSM to run on"

View file

@ -0,0 +1,72 @@
#!/bin/bash
#*************************************
#need to be in the correct directory
#*************************************
##: comments for code
#: commented out code
#**********************************************************************
# TASK: submit requests using curl: HANDLE redirects and refresh url.
# Iterate over mutation file and write/append result urls to a file
# result url file: stored in the /Results directory
# mutation file: one mutation per line, no chain ID
# output: in a file, should be n urls (n=no. of mutations in file)
# NOTE: these are just result urls, not actual values for results
#**********************************************************************
## iterate over mutation file; line by line and submit query using curl
filename="../Data/pnca_mis_SNPs_v2_unique.csv"

# Submission parameters are identical for every mutation, so set them
# once here instead of re-assigning them on every loop iteration.
pdb='../Data/complex1_no_water.pdb'
chain="A"
lig_id="PZA"
affin_wt="0.99"
host="http://biosig.unimelb.edu.au"
call_url="/mcsm_lig/prediction"

# Total no. of mutations: computed once and reused in the progress
# messages and in the output filename (avoids one wc call per entry).
total=$(wc -l < "${filename}")

## some useful messages
echo -n -e "Processing ${total} entries from ${filename}\n"
COUNT=0
while read -r line; do
    ((COUNT++))
    mutation="${line}"
#   echo "${mutation}"
##=========================================
##html field_names names required for curl
##complex_field:wild=@
##mutation_field:mutation=@
##chain_field:chain=@
##ligand_field:lig_id@
##energy_field:affin_wt
#=========================================
    # Submit the form; -L follows redirects, -sS is silent but still
    # reports errors. Keep only the meta-refresh line, which carries
    # the (relative) result url.
    refresh_url=$(curl -L \
        -sS \
        -F "wild=@${pdb}" \
        -F "mutation=${mutation}" \
        -F "chain=${chain}" \
        -F "lig_id=${lig_id}" \
        -F "affin_wt=${affin_wt}" \
        "${host}${call_url}" | grep "http-equiv")
    #echo $refresh_url
    #echo ${host}${refresh_url}
    #use regex to extract the relevant bit from the refresh url
    #regex:sed -r 's/.*(\/mcsm.*)".*$/\1/g'
    #Now build: result url using host and refresh url and write the urls to a file in the Results dir
    result_url=$(echo "${refresh_url}" | sed -r 's/.*(\/mcsm.*)".*$/\1/g')
    # be polite to the server: pause between submissions
    sleep 10
    echo -e "${mutation} : processing entry ${COUNT}/${total}..."
    echo -e "${host}${result_url}" >> "../Results/${total}_mCSM_lig_complex1_result_url.txt"
    #echo -n '.'
done < "${filename}"
echo
echo "Processing Complete"
##end of submitting query, receiving result url and storing results url in a file

View file

@ -0,0 +1,59 @@
#!/bin/bash
#*************************************
#need to be in the correct directory
#*************************************
##: comments for code
#: commented out code
#********************************************************************
# TASK: submit result urls and fetch actual results using curl
# iterate over each result url from the output of step1 in the stored
# in file in /Results.
# Use curl to fetch results and extract relevant sections using hxtools
# and store these in another file in /Results
# This script takes two arguments:
# input file: file containing results url
# In this case: 336_mCSM_lig_complex1_result_url.txt
# output file: name of the file where extracted results will be stored
# In this case : it is 336_mCSM_lig_complex1_output_MASTER.txt
#*********************************************************************
#if [ "$#" -ne 2 ]; then
#if [ -Z $1 ]; then
#    echo "
#    Please provide both Input and Output files.
#    Usage: batch_read_urls.sh INFILE OUTFILE
#    "
#    exit 1
#fi
# First argument: Input File
# Second argument: Output File
#infile=$1
#outfile=$2
infile="${HOME}/git/LSHTM_analysis/mcsm_complex1/Results/336_mCSM_lig_complex1_result_url.txt"
outfile="${HOME}/git/LSHTM_analysis/mcsm_complex1/Results/336_mCSM_lig_complex1_output_MASTER.txt"

# Line count is constant: compute once instead of once per url
# in the progress message below.
total=$(wc -l < "${infile}")
echo -n "Processing ${total} entries from ${infile}"
echo
COUNT=0
while read -r line; do
    #COUNT=$(($COUNT+1))
    ((COUNT++))
    # Fetch the result page, normalise the html, pull out the two divs
    # holding the prediction values, then strip tags and whitespace.
    curl --silent "${line}" \
        | hxnormalize -x \
        | hxselect -c div.span4 \
        | hxselect -c div.well \
        | sed -r -e 's/<[^>]*>//g' \
        | sed -re 's/ +//g' \
        >> "${outfile}"
    #| tee -a ${outfile}
    # echo -n '.'
    echo -e "Processing entry ${COUNT}/${total}..."
done < "${infile}"
echo
echo "Processing Complete"

View file

@ -0,0 +1,52 @@
#!/bin/bash
#*************************************
#need to be in the correct directory
#*************************************
##: comments for code
#: commented out code
#********************************************************************
# TASK: Intermediate results processing
# output file has a convenient delimiter of ":" that can be used to
# format the file into two columns (col1: field_desc and col2: values)
# However the section "PredictedAffinityChange:...." and
# "DUETstabilitychange:.." are split over multiple lines and
# prevent this from happening. Additionally there are other empty lines
# that need to be omitted. In order to ensure these sections are not
# split over multiple lines, this script is written.
#*********************************************************************
infile="../Results/336_mCSM_lig_complex1_output_processed.txt"
#sed -i '/PredictedAffinityChange:/ { N; N; N; N; s/\n//g;}' ${infile} \
#    | sed -i '/DUETstabilitychange:/ {x; N; N; s/\n//g; p;d;}' ${infile}
# Outputs records separated by a newline, that look something like this:
# PredictedAffinityChange:-2.2log(affinityfoldchange)-Destabilizing
# Mutationinformation:
# Wild-type:L
# Position:4
# Mutant-type:W
# Chain:A
# LigandID:PZA
# Distancetoligand:15.911&Aring;
# DUETstabilitychange:-2.169Kcal/mol
#
# PredictedAffinityChange:-1.538log(affinityfoldchange)-Destabilizing
# (...etc)
# This script brings everything in a convenient format for further processing in python.
# bear in mind, this replaces the file in place, so make sure you retain a copy for your records
#
# sed programme:
#   - on a PredictedAffinityChange line, pull the next 4 lines into the
#     pattern space (N) and delete the embedded newlines, joining the
#     wrapped record back onto one line
#   - likewise join DUETstabilitychange with its 2 continuation lines
#   - finally drop all empty lines
sed -i '/PredictedAffinityChange/ {
N
N
N
N
s/\n//g
}
/DUETstabilitychange:/ {
N
N
s/\n//g
}
/^$/d' "${infile}"

View file

@ -0,0 +1,29 @@
#!/usr/bin/python
"""Pivot mCSM-lig scraped output into a CSV.

Reads the processed results file (one ``Field:value`` pair per line,
records concatenated back-to-back) and writes a CSV with one column
per field, in the fixed order given by ``outCols``.
"""
import pandas as pd
from collections import defaultdict

#file = r'../Results/322_mCSM_lig_complex1_output_processed.txt'

# Fixed column order for the output csv (matches one mCSM-lig record).
outCols = [
    'PredictedAffinityChange',
    'Mutationinformation',
    'Wild-type',
    'Position',
    'Mutant-type',
    'Chain',
    'LigandID',
    'Distancetoligand',
    'DUETstabilitychange'
]


def parse_records(lines):
    """Group 'Field:value' lines into a dict of field -> list of values.

    Splits on the FIRST ':' only, so values that themselves contain a
    colon are preserved intact (plain split(':') would raise ValueError).
    Raises ValueError on a line with no ':' at all, as before.
    """
    outputs = defaultdict(list)
    for item in lines:
        col, val = item.split(':', 1)
        outputs[col].append(val)
    return outputs


def main():
    # Context manager ensures the input file is closed even on error;
    # the original left the handle dangling.
    infile = '../Results/336_mCSM_lig_complex1_output_processed.txt'
    with open(infile) as fh:
        lines = [line.rstrip('\n') for line in fh]
    dfOut = pd.DataFrame(parse_records(lines))
    dfOut.to_csv('../Results/336_complex1_formatted_results.csv', columns=outCols)


if __name__ == '__main__':
    main()

View file

@ -0,0 +1,207 @@
getwd()
#setwd("~/Documents/git/LSHTM_analysis/mcsm_complex1/Results") # work
setwd("~/git/LSHTM_analysis/mcsm_complex1/Results") # thinkpad
#setwd("/Users/tanu/git/LSHTM_analysis/mcsm_complex1/Results") # mac
getwd()
#=======================================================
#TASK: To tidy the columns so you can generate figures
#=======================================================
####################
#### read file #####: this will be the output from python script (csv file)
####################
data = read.csv("336_complex1_formatted_results.csv"
                , header = T
                , stringsAsFactors = FALSE)
dim(data)
#335, 10
str(data)
###########################
##### Data processing #####
###########################
# populate mutation information columns as currently it is empty
head(data$Mutationinformation)
tail(data$Mutationinformation)
# should not be blank: create mutation information by concatenating
# wild-type residue, position and mutant residue (e.g. "L4W")
data$Mutationinformation = paste0(data$Wild.type, data$Position, data$Mutant.type)
head(data$Mutationinformation)
tail(data$Mutationinformation)
#write.csv(data, 'test.csv')
##########################################
# Remove duplicate SNPs as a sanity check
##########################################
#very important
table(duplicated(data$Mutationinformation))
#FALSE
#335
#extract duplicated entries
dups = data[duplicated(data$Mutationinformation),] #0 rows: no duplicates in this dataset
#No of dups should match with the no. of TRUE in the above table
#u_dups = unique(dups$Mutationinformation)
sum( table(dups$Mutationinformation) ) #0, consistent with the table above
rm(dups)
#***************************************************************
#select non-duplicated SNPs and create a new df
df = data[!duplicated(data$Mutationinformation),] #335, 10 (no dups, so all rows kept)
#***************************************************************
#sanity check
u = unique(df$Mutationinformation)
u2 = unique(data$Mutationinformation)
table(u%in%u2)
#TRUE
#335
#should all be 1, hence 335 1's
sum(table(df$Mutationinformation) == 1)
#sort df by Position
#MANUAL CHECKPOINT:
#foo <- df[order(df$Position),]
#df <- df[order(df$Position),]
# NOTE: dups was already removed above, so only u and u2 remain to clear;
# including dups here again raised "object 'dups' not found"
rm(u, u2)
####################
#### give meaningful colnames to reflect units to enable correct data type
####################
#=======
#STEP 1
#========
#make a copy of the PredictedAffinityColumn and call it Lig_outcome
df$Lig_outcome = df$PredictedAffinityChange #335, 11
#make Predicted...column numeric and outcome column categorical
head(df$PredictedAffinityChange)
# strip the trailing "log(affinityfoldchange)-..." text, leaving only
# the leading numeric value (e.g. "-2.2log(...)-Destabilizing" -> "-2.2")
df$PredictedAffinityChange = gsub("log.*"
                                  , ""
                                  , df$PredictedAffinityChange)
#sanity checks
head(df$PredictedAffinityChange)
#should be numeric, check and if not make it numeric
is.numeric( df$PredictedAffinityChange )
#change to numeric
df$PredictedAffinityChange = as.numeric(df$PredictedAffinityChange)
#should be TRUE
is.numeric( df$PredictedAffinityChange )
#change the column name to indicate units
n = which(colnames(df) == "PredictedAffinityChange"); n
colnames(df)[n] = "PredAffLog"
colnames(df)[n]
#========
#STEP 2
#========
#make Lig_outcome column categorical showing effect of mutation
head(df$Lig_outcome)
# "^.*-" is greedy: strips everything up to the LAST '-', keeping only
# the final word ("Stabilizing"/"Destabilizing")
df$Lig_outcome = gsub("^.*-"
                      , "",
                      df$Lig_outcome)
#sanity checks
head(df$Lig_outcome)
#should be factor, check and if not change it to factor
is.factor(df$Lig_outcome)
#change to factor
df$Lig_outcome = as.factor(df$Lig_outcome)
#should be TRUE
is.factor(df$Lig_outcome)
#========
#STEP 3
#========
#gsub: remove the "&Aring;" html entity (Angstrom symbol) so the
#distance becomes a plain number
head(df$Distancetoligand)
df$Distancetoligand = gsub("&Aring;"
                           , ""
                           , df$Distancetoligand)
#sanity checks
head(df$Distancetoligand)
#should be numeric, check if not change it to numeric
is.numeric(df$Distancetoligand)
#change to numeric
df$Distancetoligand = as.numeric(df$Distancetoligand)
#should be TRUE
is.numeric(df$Distancetoligand)
#change the column name to indicate units
n = which(colnames(df) == "Distancetoligand")
colnames(df)[n] <- "Dis_lig_Ang"
colnames(df)[n]
#========
#STEP 4
#========
#gsub: strip the "Kcal/mol" unit suffix so the value can be numeric
head(df$DUETstabilitychange)
df$DUETstabilitychange = gsub("Kcal/mol"
                              , ""
                              , df$DUETstabilitychange)
#sanity checks
head(df$DUETstabilitychange)
#should be numeric, check if not change it to numeric
is.numeric(df$DUETstabilitychange)
#change to numeric
df$DUETstabilitychange = as.numeric(df$DUETstabilitychange)
#should be TRUE
is.numeric(df$DUETstabilitychange)
#change the column name to indicate units
n = which(colnames(df) == "DUETstabilitychange"); n
colnames(df)[n] = "DUETStability_Kcalpermol"
colnames(df)[n]
#========
#STEP 5
#========
#create yet another extra column: classification of DUET stability only
# NOTE: >= 0 means a score of exactly 0 is classed as "Stabilizing"
df$DUET_outcome = ifelse(df$DUETStability_Kcalpermol >=0
                         , "Stabilizing"
                         , "Destabilizing") #335, 12
table(df$Lig_outcome)
#Destabilizing Stabilizing
#281 54
table(df$DUET_outcome)
#Destabilizing Stabilizing
#288 47
#==============================
#FIXME
#Insert a venn diagram
#================================
#========
#STEP 6
#========
# assign wild and mutant colnames correctly
# (read.csv turned "Wild-type"/"Mutant-type" into "Wild.type"/"Mutant.type")
wt = which(colnames(df) == "Wild.type"); wt
colnames(df)[wt] <- "Wild_type"
colnames(df[wt])
mut = which(colnames(df) == "Mutant.type"); mut
colnames(df)[mut] <- "Mutant_type"
colnames(df[mut])
#========
#STEP 7
#========
#create an extra column: maybe useful for some plots
df$WildPos = paste0(df$Wild_type, df$Position) #335, 13
#clear variables
rm(n, wt, mut)
################ end of data cleaning

View file

@ -0,0 +1,252 @@
getwd()
#setwd("~/Documents/git/LSHTM_analysis/mcsm_complex1/Results") # work
setwd("~/git/LSHTM_analysis/mcsm_complex1/Results") # thinkpad
#setwd("/Users/tanu/git/LSHTM_analysis/mcsm_complex1/Results") # mac
getwd()
#=======================================================
#TASK:read cleaned data and perform rescaling
# of DUET stability scores
# of Pred affinity
#compare scaling methods with plots
#output normalised file
#=======================================================
####################
#### read file #####: this will be the output of my R script that cleans the data columns
####################
source("../Scripts/step3c_data_cleaning.R")
##This will output two dataframes:
##data: unclean data: 335, 10
##df  : cleaned df 335, 13
## you can remove data if you want as you will not need it
rm(data)
colnames(df)
#===================
#3a: PredAffLog
#===================
# column indices used below: n = score column, group = outcome column
n = which(colnames(df) == "PredAffLog"); n
group = which(colnames(df) == "Lig_outcome"); group
#===================================================
# order according to PredAffLog values
#===================================================
# This is because this makes it easier to see the results of rescaling for debugging
head(df$PredAffLog)
#ORDER BY PredAff scores: negative values at the top and positive at the bottom
df = df[order(df$PredAffLog),]
head(df$PredAffLog)
#sanity checks
head(df[,n]) #all negatives
tail(df[,n]) #all positives
#sanity checks
mean(df[,n])
#-0.9526746
tapply(df[,n], df[,group], mean)
#Destabilizing Stabilizing
#-1.2112100 0.3926667
#===========================
#Same as above: in 2 steps
#===========================
#find range of your data
my_min = min(df[,n]); my_min #-3.948
my_max = max(df[,n]); my_max #2.23
#===============================================
# WITHIN GROUP rescaling 2: method "ratio"
# create column to store the rescaled values
# Rescaling separately (Less dangerous)
# =====> chosen one:as Nick prefers
#===============================================
# negatives are divided by |min|, positives by max, so the rescaled
# values lie in [-1, 1] and each sign group is scaled independently
df$ratioPredAff = ifelse(df[,n] < 0
                         , df[,n]/abs(my_min)
                         , df[,n]/my_max
)#335 14
#sanity checks
head(df$ratioPredAff)
tail(df$ratioPredAff)
min(df$ratioPredAff); max(df$ratioPredAff)
tapply(df$ratioPredAff, df$Lig_outcome, min)
#Destabilizing Stabilizing
#-1.000000000 0.005381166
tapply(df$ratioPredAff, df$Lig_outcome, max)
#Destabilizing Stabilizing
#-0.001266464 1.000000000
#should be the same as below (281 and 54)
sum(df$ratioPredAff < 0); sum(df$ratioPredAff > 0)
table(df$Lig_outcome)
#Destabilizing Stabilizing
#281 54
#===============================================
# Hist and density plots to compare the rescaling
# methods: Base R
#===============================================
#uncomment as necessary
my_title = "Ligand_stability"
#my_title = colnames(df[n])
# Set the margin on all sides
par(oma = c(3,2,3,0)
    , mar = c(1,3,5,2)
    , mfrow = c(2,2))
hist(df[,n]
     , xlab = ""
     , main = "Raw values"
)
hist(df$ratioPredAff
     , xlab = ""
     , main = "ratio rescaling"
)
# Plot density plots underneath
plot(density( df[,n] )
     , main = "Raw values"
)
plot(density( df$ratioPredAff )
     , main = "ratio rescaling"
)
# titles
mtext(text = "Frequency"
      , side = 2
      , line = 0
      , outer = TRUE)
mtext(text = my_title
      , side = 3
      , line = 0
      , outer = TRUE)
#clear variables
rm(my_min, my_max, my_title, n, group)
#===================
# 3b: DUET stability
#===================
dim(df) #335, 14
# column indices used below: n = score column, group = outcome column
n = which(colnames(df) == "DUETStability_Kcalpermol"); n # 10
group = which(colnames(df) == "DUET_outcome"); group #12
#===================================================
# order according to DUET scores
#===================================================
# This is because this makes it easier to see the results of rescaling for debugging
head(df$DUETStability_Kcalpermol)
#ORDER BY DUET scores: negative values at the top and positive at the bottom
df = df[order(df$DUETStability_Kcalpermol),]
#sanity checks
head(df[,n]) #negatives
tail(df[,n]) #positives
#sanity checks
mean(df[,n])
#[1] -1.173316
tapply(df[,n], df[,group], mean)
#Destabilizing Stabilizing
#-1.4297257 0.3978723
#===============================================
# WITHIN GROUP rescaling 2: method "ratio"
# create column to store the rescaled values
# Rescaling separately (Less dangerous)
# =====> chosen one:as Nick prefers
#===============================================
#find range of your data
my_min = min(df[,n]); my_min #-3.87
my_max = max(df[,n]); my_max #1.689
# negatives divided by |min|, positives by max: rescaled range is [-1, 1]
df$ratioDUET = ifelse(df[,n] < 0
                      , df[,n]/abs(my_min)
                      , df[,n]/my_max
) #335, 15
#sanity check
head(df$ratioDUET)
tail(df$ratioDUET)
min(df$ratioDUET); max(df$ratioDUET)
#sanity checks
tapply(df$ratioDUET, df$DUET_outcome, min)
#Destabilizing Stabilizing
#-1.00000000 0.01065719
tapply(df$ratioDUET, df$DUET_outcome, max)
#Destabilizing Stabilizing
#-0.003875969 1.000000000
#should be the same as below (288 and 47)
sum(df$ratioDUET < 0); sum(df$ratioDUET > 0)
table(df$DUET_outcome)
#Destabilizing Stabilizing
#288 47
#===============================================
# Hist and density plots to compare the rescaling
# methods: Base R
#===============================================
#uncomment as necessary
my_title = "DUET_stability"
#my_title = colnames(df[n])
# Set the margin on all sides
par(oma = c(3,2,3,0)
    , mar = c(1,3,5,2)
    , mfrow = c(2,2))
hist(df[,n]
     , xlab = ""
     , main = "Raw values"
)
hist(df$ratioDUET
     , xlab = ""
     , main = "ratio rescaling"
)
# Plot density plots underneath
plot(density( df[,n] )
     , main = "Raw values"
)
plot(density( df$ratioDUET )
     , main = "ratio rescaling"
)
# graph titles
mtext(text = "Frequency"
      , side = 2
      , line = 0
      , outer = TRUE)
mtext(text = my_title
      , side = 3
      , line = 0
      , outer = TRUE)
#===================
# write output as csv file
#===================
# final normalised dataframe: includes ratioPredAff and ratioDUET columns
write.csv(df, "../Data/mcsm_complex1_normalised.csv", row.names = FALSE) #335, 15