LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_lig_results.sh

76 lines
2.2 KiB
Bash
Executable file

#!/bin/bash
#********************************************************************
# TASK: fetch the results behind each result url using curl.
# Iterate over each result url from the output of step1 stored in processed/,
# fetch the page with curl, extract the relevant sections using the
# HTML-XML-utils tools (hxnormalize, hxselect), and append the extracted
# text to another file in processed/.
# Requirements:
# input: the output of step1, a file containing one result url per line
# path: "Data/<drug>/input/processed/<filename>"
# output: the file in which the extracted results will be stored
# path: "Data/<drug>/input/processed/<filename>"
# Optional: the input and output files can instead be passed as command
# line args when calling the script, by uncommenting the code as indicated
#*********************************************************************
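############################# uncomment: to make it command line args
# NOTE: when uncommenting this block, also comment out the hard-coded
# infile/outfile assignments further down; otherwise they override $1/$2.
#if [ "$#" -ne 2 ]; then
## alternative check (use INSTEAD of the line above, not in addition to it):
##if [ -z "$1" ]; then
# echo "
# Please provide both Input and Output files.
# Usage: $(basename "$0") INFILE OUTFILE
# "
# exit 1
#fi
# First argument: Input File
# Second argument: Output File
#infile=$1
#outfile=$2
############################ end of code block to make command line args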
############# specify variables for input and output paths and filenames
# Both files live under the same directory, built from the pieces below:
#   ${HOME}/git/Data/pyrazinamide/input/processed/<filename>
homedir="${HOME}"
#echo Home directory is ${homedir}
basedir="/git/Data/pyrazinamide/input"

# input: file of result urls (the output of step1)
inpath="/processed"
in_filename="/complex1_result_url.txt"
infile="${homedir}${basedir}${inpath}${in_filename}"
# The path is quoted so it is reported verbatim even when it contains
# spaces or glob characters.
printf '%s\n' "Input Mut filename: ${infile}"

# output: file the extracted results are written to
outpath="/processed"
out_filename="/complex1_output_MASTER.txt"
outfile="${homedir}${basedir}${outpath}${out_filename}"
printf '%s\n' "Output filename: ${outfile}"
################## end of variable assignment for input and output files
# Iterate over each result url, fetch the page with curl and extract the
# results using hxnormalize/hxselect, which nicely clean and format html.
# The extracted text is APPENDED to ${outfile}: re-running the script adds
# to any existing content instead of replacing it.

# Stop early when the url list cannot be read, rather than reporting
# "Processing Complete" without having processed anything.
if [[ ! -r "${infile}" ]]; then
  printf 'ERROR: cannot read input file: %s\n' "${infile}" >&2
  exit 1
fi

# Count the entries once, up front; the value does not change while looping.
# awk's NR also counts a final line that lacks a trailing newline, which
# keeps the total in step with the loop below.
total=$(awk 'END { print NR }' "${infile}")
printf 'Processing %s entries from %s\n' "${total}" "${infile}"

COUNT=0
# `read` (default IFS) trims surrounding whitespace from each url; the
# `|| [[ -n ... ]]` part keeps a last line that has no trailing newline.
while read -r line || [[ -n "${line}" ]]; do
  # Plain assignment instead of ((COUNT++)), which returns a non-zero
  # status when COUNT is 0.
  COUNT=$((COUNT + 1))

  # A blank line would make curl run without a url.
  if [[ -z "${line}" ]]; then
    printf 'WARNING: entry %s/%s is empty, skipping\n' "${COUNT}" "${total}" >&2
    continue
  fi

  # --fail/--show-error: keep curl quiet on success but make HTTP and
  # connection errors visible instead of silently yielding no output.
  # The url and output path are quoted so characters such as ? * or spaces
  # are passed through untouched.
  # sed: 1st expression strips any remaining tags.
  # NOTE(review): the 2nd expression deletes every run of spaces (not only
  # indentation) - presumably later steps expect this format; confirm
  # before changing it.
  # To also print the extracted text, swap the final redirection for:
  #   | tee -a "${outfile}"
  curl --silent --show-error --fail "${line}" \
    | hxnormalize -x \
    | hxselect -c div.span4 \
    | hxselect -c div.well \
    | sed -r -e 's/<[^>]*>//g' -e 's/ +//g' \
    >> "${outfile}"
  # Must be read immediately after the pipeline: index 0 is curl's status.
  fetch_status=${PIPESTATUS[0]}

  if ((fetch_status != 0)); then
    # Best effort: report the failure and carry on with the next url.
    printf 'WARNING: curl failed (exit %s) for entry %s/%s: %s\n' \
      "${fetch_status}" "${COUNT}" "${total}" "${line}" >&2
  fi

  printf 'Processing entry %s/%s...\n' "${COUNT}" "${total}"
done < "${infile}"
echo
echo "Processing Complete"