renamed file paths and names to run mcsm

This commit is contained in:
Tanushree Tunstall 2020-01-10 12:18:18 +00:00
parent cf7d6f9f03
commit f026efb4db
4 changed files with 115 additions and 57 deletions

View file

@ -11,8 +11,8 @@
# per line. Sort by unique, which automatically removes duplicates.
# save file in current directory
#**********************************************************************
infile="${HOME}/git/Data/input/processed/pyrazinamide/pnca_mis_SNPs_v2.csv"
outfile="${HOME}/git/Data/input/processed/pyrazinamide/pnca_mis_SNPs_v2_unique.csv"
infile="${HOME}/git/Data/pyrazinamide/input/processed/pnca_mis_SNPs_v2.csv"
outfile="${HOME}/git/Data/pyrazinamide/input/processed/pnca_mis_SNPs_v2_unique.csv"
# sort unique entries and output to current directory
sort -u ${infile} > ${outfile}

View file

@ -1,30 +1,49 @@
#!/bin/bash
#*************************************
#need to be in the correct directory
#*************************************
##: comments for code
#: commented out code
#**********************************************************************
# TASK: submit requests using curl: HANDLE redirects and refresh url.
# Iterate over mutation file and write/append result urls to a file
# result url file: stored in the /Results directory
# mutation file: one mutation per line, no chain ID
# output: in a file, should be n urls (n=no. of mutations in file)
# Mutation file must have one mutation (format A1B) per line
# Requirements
# input: mutation list (format: A1B), complex struc: (pdb format)
# mutation: outFile from step0, one unique mutation/line, no chain ID
# path: "Data/<drug>/input/processed/<filename>"
# structure: pdb file of drug-target complex
# path: "Data/<drug>/input/structure/<filename>"
# output: should be n urls (n=no. of unique mutations in file)
# path: "Data/<drug>/input/processed/<filename>"
# NOTE: these are just result urls, not actual values for results
#**********************************************************************
## iterate over mutation file; line by line and submit query using curl
filename="../Data/pnca_mis_SNPs_v2_unique.csv"
# specify variables for input and output paths and filenames
## some useful messages
echo -n -e "Processing $(wc -l < ${filename}) entries from ${filename}\n"
inpath="${HOME}/git/Data/pyrazinamide/input"
processed_path="/processed"
struc_path="/structure"
infile_mut="/pnca_mis_SNPs_v2_unique.csv"
infile_struc="/complex1_no_water.pdb"
outpath="${inpath}${processed_path}"
outfile="/mCSM_lig_complex1_result_url.txt"
# create valid input and output filenames
#filename="${HOME}/git/Data/pyrazinamide/input/processed/pnca_mis_SNPs_v2_unique.csv"
filename="${inpath}${processed_path}${infile_mut}"
echo Input File is: ${filename}
outfilename="${outpath}${outfile}"
echo Output File will be: ${outfilename}
# iterate over mutation file; line by line and submit query using curl
# some useful messages
echo -n -e "Processing $(wc -l < ${filename}) entries from ${infile_mut}\n"
COUNT=0
while read -r line; do
((COUNT++))
mutation="${line}"
mutation="${line}"
# echo "${mutation}"
pdb='../Data/complex1_no_water.pdb'
#pdb='../Data/complex1_no_water.pdb'
pdb="${inpath}${struc_path}${infile_struc}"
mutation="${mutation}"
chain="A"
lig_id="PZA"
@ -49,24 +68,31 @@ refresh_url=$(curl -L \
-F "affin_wt=${affin_wt}" \
${host}${call_url} | grep "http-equiv")
#echo $refresh_url
#echo ${host}${refresh_url}
#echo Refresh URL: $refresh_url
#echo Host+Refresh: ${host}${refresh_url}
#use regex to extract the relevant bit from the refresh url
#regex:sed -r 's/.*(\/mcsm.*)".*$/\1/g'
# use regex to extract the relevant bit from the refresh url
# regex:sed -r 's/.*(\/mcsm.*)".*$/\1/g'
#Now build: result url using host and refresh url and write the urls to a file in the Results dir
# Now build: result url using host and refresh url and write the urls to a file
result_url=$(echo $refresh_url | sed -r 's/.*(\/mcsm.*)".*$/\1/g')
sleep 10
echo -e "${mutation} : processing entry ${COUNT}/$(wc -l < ${filename})..."
echo -e "${host}${result_url}" >> ../Results/$(wc -l < ${filename})_mCSM_lig_complex1_result_url.txt
# create output file with the added number of muts from file
# after much thought, bad idea as less generic!
#echo -e "${host}${result_url}" >> ../Results/$(wc -l < ${filename})_mCSM_lig_complex1_result_url.txt
echo -e "${host}${result_url}" >> ${outfilename}
#echo -n '.'
done < "${filename}"
echo
echo Output filename: ${outfilename}
echo
echo Number of urls saved: $(wc -l < ${filename})
echo
echo "Processing Complete"
##end of submitting query, receiving result url and storing results url in a file
# end of submitting query, receiving result url and storing results url in a file

View file

@ -1,23 +1,21 @@
#!/bin/bash
#*************************************
#need to be in the correct directory
#*************************************
##: comments for code
#: commented out code
#********************************************************************
# TASK: submit result urls and fetch actual results using curl
# iterate over each result url from the output of step1 in the stored
# in file in /Results.
# Iterate over each result url from the output of step1 stored in processed/
# Use curl to fetch results and extract relevant sections using hxtools
# and store these in another file in /Results
# This script takes two arguments:
# input file: file containing results url
# In this case: 336_mCSM_lig_complex1_result_url.txt
# output file: name of the file where extracted results will be stored
# In this case : it is 336_mCSM_lig_complex1_output_MASTER.txt
#*********************************************************************
# and store these in another file in processed/
# Requirements:
# input: output of step1, file containing result urls
# path: "Data/<drug>/input/processed/<filename>"
# output: name of the file where extracted results will be stored
# path: "Data/<drug>/input/processed/<filename>"
# Optional: can make these command line args you pass when calling script
# by uncommenting code as indicated
#*********************************************************************
############################# uncomment: to make it command line args
#if [ "$#" -ne 2 ]; then
#if [ -Z $1 ]; then
# echo "
@ -32,11 +30,26 @@
# Second argument: Output File
#infile=$1
#outfile=$2
############################ end of code block to make command line args
infile="${HOME}/git/LSHTM_analysis/mcsm_complex1/Results/336_mCSM_lig_complex1_result_url.txt"
outfile="${HOME}/git/LSHTM_analysis/mcsm_complex1/Results/336_mCSM_lig_complex1_output_MASTER.txt"
# specify variables for input and output paths and filenames
inpath="${HOME}/git/Data/pyrazinamide/input"
processed_path="/processed"
infile="/mCSM_lig_complex1_result_url.txt"
echo -n "Processing $(wc -l < ${infile}) entries from ${infile}"
outpath="${inpath}${processed_path}"
outfile="/mCSM_lig_complex1_output_MASTER.txt"
# create valid input and output filenames
filename="${inpath}${processed_path}${infile}"
echo Input File is: ${filename}
outfilename="${outpath}${outfile}"
echo Output File will be: ${outfilename}
# Iterate over each result url, and extract results using hxtools
# which nicely cleans and formats html
echo -n "Processing $(wc -l < ${filename}) entries from ${infile}"
echo
COUNT=0
while read -r line; do
@ -48,12 +61,13 @@ while read -r line; do
| hxselect -c div.well \
| sed -r -e 's/<[^>]*>//g' \
| sed -re 's/ +//g' \
>> ${outfile}
#| tee -a ${outfile}
>> ${outfilename}
#| tee -a ${outfilename}
# echo -n '.'
echo -e "Processing entry ${COUNT}/$(wc -l < ${infile})..."
echo -e "Processing entry ${COUNT}/$(wc -l < ${filename})..."
done < "${infile}"
done < "${filename}"
echo
echo "Processing Complete"

View file

@ -1,9 +1,4 @@
#!/bin/bash
#*************************************
#need to be in the correct directory
#*************************************
##: comments for code
#: commented out code
#********************************************************************
# TASK: Intermediate results processing
@ -11,15 +6,39 @@
# format the file into two columns (col1: field_desc and col2: values)
# However the section "PredictedAffinityChange:...." and
# "DUETstabilitychange:.." are split over multiple lines and
# prevent this from happening.Additionally there are other empty lines
# prevent this from happening. Additionally there are other empty lines
# that need to be omitted. In order to ensure these sections are not split
# over multiple lines, this script is written.
#*********************************************************************
infile="../Results/336_mCSM_lig_complex1_output_processed.txt"
# Requirements:
# input: output of step2, file containing mcsm results as described above
# path: "Data/<drug>/input/processed/<filename>"
# output: replaces file in place.
# Therefore first create a copy of the input file
# but rename it to remove the word "MASTER" and add the word "processed"
# file format: .txt
#sed -i '/PredictedAffinityChange:/ { N; N; N; N; s/\n//g;}' ${infile} \
# | sed -i '/DUETstabilitychange:/ {x; N; N; s/\n//g; p;d;}' ${infile}
# NOTE: This replaces the file in place!
# the output is a txt file with no newlines and formatting
# to have the following format "<colname><:><value>"
#***********************************************************************
# specify variables for input and output paths and filenames
inpath="${HOME}/git/Data/pyrazinamide/input"
processed_path="/processed"
# Create input file: copy and rename output file of step2
oldfile="${inpath}${processed_path}/mCSM_lig_complex1_output_MASTER.txt"
newfile="${inpath}${processed_path}/mCSM_lig_complex1_output_processed.txt"
cp $oldfile $newfile
#infile="../Results/336_mCSM_lig_complex1_output_processed.txt"
infile="/mCSM_lig_complex1_output_processed.txt"
filename="${inpath}${processed_path}${infile}"
echo Input filename is : ${filename}
#sed -i '/PredictedAffinityChange:/ { N; N; N; N; s/\n//g;}' ${filename} \
# | sed -i '/DUETstabilitychange:/ {x; N; N; s/\n//g; p;d;}' ${filename}
# Outputs records separated by a newline, that look something like this:
# PredictedAffinityChange:-2.2log(affinityfoldchange)-Destabilizing
@ -36,7 +55,6 @@ infile="../Results/336_mCSM_lig_complex1_output_processed.txt"
# (...etc)
# This script brings everything in a convenient format for further processing in python.
# bear in mind, this replaces the file in place, so make sure you retain a copy for your records
sed -i '/PredictedAffinityChange/ {
N
N
@ -49,4 +67,4 @@ N
N
s/\n//g
}
/^$/d' ${infile}
/^$/d' ${filename}