#!/bin/bash

#********************************************************************
# TASK: submit result urls and fetch the actual results using curl.
# Iterate over each result url from the output of step1 stored in processed/.
# Use curl to fetch the results, extract the relevant sections using the
# hx* tools (hxnormalize, hxselect), and store these in another file
# in processed/.

# Requirements:
# input: output of step1, a file containing result urls (one per line)
#	path: "Data/<drug>/input/processed/<filename>"
# output: name of the file where the extracted results will be stored
#	path: "Data/<drug>/input/processed/<filename>"

# Optional: can make these command line args you pass when calling the script
# by uncommenting code as indicated
#*********************************************************************

############################# uncomment: to make it command line args
# NOTE: the hard-coded infile/outfile assignments further down must also be
# removed (or commented out), otherwise they override the values set here.
#if [ "$#" -ne 2 ]; then
#   (alternative check, use instead of the line above, not together with it:)
#   if [ -z "$1" ]; then
#	echo "
#	Please provide both Input and Output files.
#
#	Usage: batch_read_urls.sh INFILE OUTFILE
#	"
#	exit 1
#fi

# First argument: Input File
# Second argument: Output File
#infile=$1
#outfile=$2
############################ end of code block to make command line args

# Specify variables for the input and output paths and filenames.
# NOTE: processed_path, infile and outfile each begin with "/" because the
# full paths below are built by plain string concatenation.
inpath="${HOME}/git/Data/pyrazinamide/input"
processed_path="/processed"
infile="/complex1_result_url.txt"

outpath="${inpath}${processed_path}"
outfile="/complex1_output_MASTER.txt"

# Create valid input and output filenames.
# Expansions are quoted so a path containing spaces or glob characters is
# reported (and later used) exactly as written.
filename="${inpath}${processed_path}${infile}"
printf 'Input File is: %s\n' "${filename}"

outfilename="${outpath}${outfile}"
printf 'Output File will be: %s\n' "${outfilename}"

# Iterate over each result url and extract the results using the hx* tools
# (hxnormalize / hxselect), which clean and format the html.
#
# Reads:   ${filename}    - input file with one result url per line
#          ${infile}      - input file name, used only in the progress message
# Writes:  ${outfilename} - extracted text is APPENDED, so results from
#                           earlier runs are kept
# Exits:   1 if the input file cannot be read

# Fail early with a clear message instead of looping over a missing file and
# still reporting "Processing Complete".
if [[ ! -r "${filename}" ]]; then
  printf 'Error: cannot read input file: %s\n' "${filename}" >&2
  exit 1
fi

# Count the entries once, up front, rather than re-running wc on every
# iteration. Only non-blank lines are counted because blank lines are
# skipped in the loop below.
total=$(grep -c '[^[:space:]]' "${filename}")
printf 'Processing %s entries from %s\n' "${total}" "${infile}"

COUNT=0
# "|| [[ -n ... ]]" keeps a final line that has no trailing newline.
while read -r line || [[ -n "${line}" ]]; do
  # Tolerate CRLF line endings and skip blank lines (no url to fetch).
  line="${line%$'\r'}"
  [[ -n "${line}" ]] || continue

  COUNT=$((COUNT + 1))

  # Fetch the page, normalise it to well-formed XML, keep the contents of
  # div.span4 -> div.well, then strip the remaining tags and all spaces.
  # The url is quoted so characters such as "?" or "*" are not globbed.
  # curl's stdin is detached so it can never consume the url list.
  curl --silent "${line}" < /dev/null \
    | hxnormalize -x \
    | hxselect -c div.span4 \
    | hxselect -c div.well \
    | sed -r -e 's/<[^>]*>//g' -e 's/ +//g' \
    >> "${outfilename}"
  stage_status=("${PIPESTATUS[@]}")

  # Report a failed download instead of silently appending nothing.
  if ((stage_status[0] != 0)); then
    printf 'Warning: curl failed (exit %s) for url: %s\n' \
      "${stage_status[0]}" "${line}" >&2
  fi

  printf 'Processing entry %s/%s...\n' "${COUNT}" "${total}"
done < "${filename}"

echo
echo "Processing Complete"