LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_lig_results.sh

#!/bin/bash

#********************************************************************
# TASK: submit result urls and fetch actual results using curl
# Iterate over each result url from the output of step1 stored in processed/
# Use curl to fetch results and extract relevant sections using hxtools
# and store these in another file in processed/

# Requirements:
# input: output of step1, file containing result urls
	# path: "Data/<drug>/input/processed/<filename>"
# output: name of the file where extracted results will be stored
	# path: "Data/<drug>/input/processed/<filename>"

# Optional: can make these command line args you pass when calling script
# by uncommenting code as indicated
#*********************************************************************
############################# uncomment: to make it command line args
#if [ "$#" -ne 2 ]; then
  #if [ -z "$1" ]; then
#  echo "
#  Please provide both Input and Output files.

#  Usage: step2_lig_results.sh INFILE OUTFILE
#  "
#  exit 1
#fi

# First argument: Input File
# Second argument: Output File
#infile=$1
#outfile=$2
############################ end of code block to make command line args

# Specify variables for input and output paths and filenames.
# Every fragment below carries its own leading "/" so the pieces can be
# concatenated directly into full paths without adding separators.
inpath="${HOME}/git/Data/pyrazinamide/input"
processed_path="/processed"
infile="/complex1_result_url.txt"

outpath="${inpath}${processed_path}"
outfile="/complex1_output_MASTER.txt"

# Create valid input and output filenames.
filename="${inpath}${processed_path}${infile}"
# Quoted so that a path containing spaces or glob characters is printed
# verbatim instead of being word-split / expanded by the shell.
echo "Input File is: ${filename}"

outfilename="${outpath}${outfile}"
echo "Output File will be: ${outfilename}"

# Iterate over each result url, and extract results using hxtools
# which nicely cleans and formats html.
#
# Reads:   ${filename}    - file with one result url per line
#          ${infile}      - only used in the progress banner
# Appends: ${outfilename} - content of div.well inside div.span4 of each
#                           fetched page, with html tags and all spaces
#                           removed
#
# Stop early when the url list is unreadable, instead of letting every
# redirection below fail and still reporting "Processing Complete".
if [[ ! -r "${filename}" ]]; then
  echo "Error: cannot read input file: ${filename}" >&2
  exit 1
fi

# Count the entries once up front; blank lines are skipped by the loop, so
# they are not counted here either. grep -c also counts a final line that
# lacks a trailing newline (wc -l would not).
total=$(grep -c . "${filename}")
echo -n "Processing ${total} entries from ${infile}"
echo
COUNT=0
# Default IFS is kept on purpose so read trims stray leading/trailing
# whitespace around the url. The "|| [[ -n ... ]]" clause makes sure a
# last line without a trailing newline is still processed.
while read -r line || [[ -n "${line}" ]]; do
  # Nothing to fetch on an empty line.
  if [[ -z "${line}" ]]; then
    continue
  fi
  COUNT=$((COUNT + 1))
  # The url is quoted: result urls commonly contain "?" or "&", and they
  # must reach curl as one literal argument (no globbing, no splitting).
  curl --silent "${line}" \
    | hxnormalize -x \
    | hxselect -c div.span4 \
    | hxselect -c div.well \
    | sed -r -e 's/<[^>]*>//g' -e 's/ +//g' \
    >> "${outfilename}"
  # Must be read immediately: PIPESTATUS is overwritten by the next command.
  fetch_status=${PIPESTATUS[0]}
  if ((fetch_status != 0)); then
    # --silent hides curl's own diagnostics, so report the failure here.
    echo "Warning: curl exited with status ${fetch_status} for ${line}" >&2
  fi
  echo "Processing entry ${COUNT}/${total}..."
done < "${filename}"

echo
echo "Processing Complete"