#!/bin/bash
#********************************************************************
# TASK: submit result urls and fetch actual results using curl
#
# Iterate over each result url from the output of step1 stored in
# processed/. Use curl to fetch each result page, extract the relevant
# sections using the hx* tools (hxnormalize, hxselect), and append the
# extracted text to another file in processed/.
#
# Requirements:
#   input:  output of step1, a file containing one result url per line
#           path: "${HOME}/git/Data/pyrazinamide/input/processed/"
#   output: name of the file where extracted results will be stored
#           path: "${HOME}/git/Data/pyrazinamide/input/processed/"
#   tools:  curl, hxnormalize, hxselect, sed
#
# Optional: the input and output file names can be made command line
# args by uncommenting the block below. The hard-coded infile/outfile
# assignments further down must then be removed as well, otherwise they
# override the values taken from the command line.
#
# Exit status: 1 if a required tool or the input file is missing.
# A failed fetch of an individual url is reported on stderr but does
# not stop the batch.
#*********************************************************************

############################# uncomment: to make it command line args
# if [[ "$#" -ne 2 ]]; then
#   echo "
#   Please provide both Input and Output files.
#   Usage: batch_read_urls.sh INFILE OUTFILE
#   "
#   exit 1
# fi
# First argument: Input File
# Second argument: Output File
# infile=$1
# outfile=$2
############################ end of code block to make command line args

# specify variables for input and output paths and filenames
inpath="${HOME}/git/Data/pyrazinamide/input"
processed_path="/processed"
infile="/complex1_result_url.txt"
outpath="${inpath}${processed_path}"
outfile="/complex1_output_MASTER.txt"

# create valid input and output filenames
filename="${inpath}${processed_path}${infile}"
echo "Input File is: ${filename}"
outfilename="${outpath}${outfile}"
echo "Output File will be: ${outfilename}"

# Fail early with a clear message instead of silently producing an
# empty output file when a tool in the pipeline is not installed.
for tool in curl hxnormalize hxselect sed; do
  if ! command -v "${tool}" > /dev/null 2>&1; then
    echo "Error: required tool '${tool}' not found in PATH" >&2
    exit 1
  fi
done

if [[ ! -r "${filename}" ]]; then
  echo "Error: cannot read input file: ${filename}" >&2
  exit 1
fi

# Count the entries once, up front. grep -c '' also counts a final line
# that lacks a trailing newline, which 'wc -l' would miss.
total=$(grep -c '' "${filename}")

# Iterate over each result url, and extract results using the hx* tools,
# which clean and format the html
echo "Processing ${total} entries from ${infile}"

count=0
# 'read' without IFS= deliberately trims leading/trailing whitespace from
# each url. The '|| [[ -n ... ]]' clause processes a last line that has
# no trailing newline.
while read -r line || [[ -n "${line}" ]]; do
  count=$((count + 1))

  # A blank line would make curl run without a url; skip it.
  if [[ -z "${line}" ]]; then
    echo "Warning: skipping blank line at entry ${count}/${total}" >&2
    continue
  fi

  # Fetch the page, normalize it to well-formed XML, keep the content of
  # div.span4 and then of div.well inside it, then strip the remaining
  # tags and remove runs of spaces. Results are appended to the output.
  # curl's stdin is pointed at /dev/null so it can never consume the url
  # list that feeds this loop.
  curl --silent "${line}" < /dev/null \
    | hxnormalize -x \
    | hxselect -c div.span4 \
    | hxselect -c div.well \
    | sed -r -e 's/<[^>]*>//g' -e 's/ +//g' \
    >> "${outfilename}"
  # Must be read immediately after the pipeline: PIPESTATUS is reset by
  # the next command. Index 0 is curl's own exit status.
  curl_status=${PIPESTATUS[0]}

  if ((curl_status != 0)); then
    echo "Warning: curl exited with status ${curl_status} for entry ${count}: ${line}" >&2
  fi

  echo "Processing entry ${count}/${total}..."
done < "${filename}"

echo
echo "Processing Complete"