#!/bin/bash
#********************************************************************
# TASK: submit result urls and fetch actual results using curl
#
# Iterate over each result url from the output of step1 (stored in
# processed/), fetch it with curl, extract the relevant sections with
# hxnormalize/hxselect, strip the remaining markup with sed and append
# the text to another file in processed/.
#
# Usage: batch_read_urls.sh [INFILE [OUTFILE]]
#   INFILE   file containing the result urls, one per line
#            default: ${HOME}/git/Data/pyrazinamide/input/processed/complex1_result_url.txt
#   OUTFILE  file the extracted results are appended to
#            default: ${HOME}/git/Data/pyrazinamide/input/processed/complex1_output_MASTER.txt
#
# Both arguments are optional; without them the script uses the default
# paths above.
#
# Exit status: 0 when the run completes (individual fetch failures are
#              reported on stderr but do not stop the run),
#              1 when a required tool or the input file is missing.
#*********************************************************************

# Treat a reference to an unset variable as an error instead of silently
# expanding it to an empty string (which would yield wrong paths).
set -u

# Print an error message on stderr and abort the script.
die() {
  printf 'Error: %s\n' "$*" >&2
  exit 1
}

############# specify variables for input and output paths and filenames
homedir="${HOME}"
basedir="/git/Data/pyrazinamide/input"

# input
inpath="/processed"
in_filename="/complex1_result_url.txt"
infile="${1:-${homedir}${basedir}${inpath}${in_filename}}"
printf '%s\n' "Input Mut filename: ${infile}"

# output
outpath="/processed"
out_filename="/complex1_output_MASTER.txt"
outfile="${2:-${homedir}${basedir}${outpath}${out_filename}}"
printf '%s\n' "Output filename: ${outfile}"
################## end of variable assignment for input and output files

# Fail early: without these checks a missing tool or input file would
# still end with "Processing Complete" and an exit status of 0.
for tool in curl hxnormalize hxselect sed grep; do
  command -v "${tool}" > /dev/null || die "required tool not found: ${tool}"
done
[[ -r "${infile}" ]] || die "cannot read input file: ${infile}"

# Count the entries once, up front, instead of on every iteration.
# grep -c '' also counts a final line that lacks a trailing newline,
# which matches what the loop below processes.
total=$(grep -c '' "${infile}")

# Iterate over each result url, and extract results using hxtools
# which nicely cleans and formats html
printf '%s\n' "Processing ${total} entries from ${infile}"

count=0
# The url list is read from fd 3 so that no command inside the loop can
# consume it through stdin. read keeps the default IFS on purpose: it
# trims blanks surrounding the url. The "|| [[ -n ... ]]" part keeps a
# last line that has no trailing newline.
while read -r url <&3 || [[ -n "${url}" ]]; do
  count=$((count + 1))

  if [[ -z "${url}" ]]; then
    printf 'Warning: entry %s/%s is blank, skipped\n' \
      "${count}" "${total}" >&2
    continue
  fi

  # --fail makes curl return non-zero on HTTP errors instead of passing
  # the server's error page down the pipeline.
  # NOTE(review): the second sed expression deletes every run of spaces;
  # it is reproduced as found - confirm that this is intended.
  curl --silent --fail "${url}" \
    | hxnormalize -x \
    | hxselect -c div.span4 \
    | hxselect -c div.well \
    | sed -E -e 's/<[^>]*>//g' -e 's/ +//g' \
    >> "${outfile}"
  curl_status=${PIPESTATUS[0]}

  if ((curl_status != 0)); then
    printf 'Warning: curl failed (exit %s) for %s\n' \
      "${curl_status}" "${url}" >&2
  fi

  printf '%s\n' "Processing entry ${count}/${total}..."
done 3< "${infile}"

printf '\n%s\n' "Processing Complete"