76 lines
2.2 KiB
Bash
Executable file
76 lines
2.2 KiB
Bash
Executable file
#!/bin/bash
#********************************************************************
# TASK: submit result urls and fetch actual results using curl
# Iterate over each result url from the output of step1 stored in processed/
# Use curl to fetch results and extract relevant sections using hxtools
# and store these in another file in processed/
#
# Requirements:
# input:  output of step1, file containing result urls
#         path: "Data/<drug>/input/processed/<filename>"
# output: name of the file where extracted results will be stored
#         path: "Data/<drug>/input/processed/<filename>"
#
# Optional: both files can be given as command line args when calling
# the script; any argument left out falls back to the default path
# built below.
#
# Usage: batch_read_urls.sh [INFILE [OUTFILE]]
#   First argument:  Input File
#   Second argument: Output File
#*********************************************************************

############# specify variables for input and output paths and filenames
homedir="${HOME}"
#echo Home directory is ${homedir}
basedir="/git/Data/pyrazinamide/input"

# input
inpath="/processed"
in_filename="/complex1_result_url.txt"
# ${1:-...}: use the first argument when given, else the default path
infile="${1:-${homedir}${basedir}${inpath}${in_filename}}"
echo "Input Mut filename: ${infile}"

# output
outpath="/processed"
out_filename="/complex1_output_MASTER.txt"
# ${2:-...}: use the second argument when given, else the default path
outfile="${2:-${homedir}${basedir}${outpath}${out_filename}}"
echo "Output filename: ${outfile}"
################## end of variable assignment for input and output files

# Iterate over each result url, and extract results using hxtools
# which nicely cleans and formats html
#
# Reads:  ${infile}  - one result url per line
# Writes: ${outfile} - extracted text is APPENDED (>>), so running the
#                      script again adds to results already in the file

# Stop early with one clear message rather than a cascade of errors from
# wc/read/curl further down.
if [[ ! -r "${infile}" ]]; then
  echo "Error: cannot read input file: ${infile}" >&2
  exit 1
fi

for tool in curl hxnormalize hxselect sed; do
  if ! command -v "${tool}" > /dev/null 2>&1; then
    echo "Error: required tool not found: ${tool}" >&2
    exit 1
  fi
done

# Count the entries once, up front, instead of on every iteration.
# Only lines with some non-blank content count as entries; this also
# counts a final line that has no trailing newline.
total=$(grep -c '[^[:space:]]' "${infile}")
echo "Processing ${total} entries from ${infile}"

COUNT=0
failed=0
# No IFS= here on purpose: read then trims blanks around the url.
# The "|| [[ -n ... ]]" part keeps a last line without trailing newline.
while read -r line || [[ -n "${line}" ]]; do
  line=${line%$'\r'}                # tolerate Windows (CRLF) line endings
  [[ -z "${line}" ]] && continue    # skip blank lines
  COUNT=$((COUNT + 1))

  # Fetch the page, normalise it to well-formed XML, keep the contents of
  # div.span4 -> div.well, then strip the remaining tags and all spaces.
  # curl gets /dev/null as stdin so it can never eat lines meant for read.
  curl --silent --show-error --url "${line}" < /dev/null \
    | hxnormalize -x \
    | hxselect -c div.span4 \
    | hxselect -c div.well \
    | sed -E -e 's/<[^>]*>//g' -e 's/ +//g' \
    >> "${outfile}"
  curl_status=${PIPESTATUS[0]}      # must be read right after the pipeline

  if [[ "${curl_status}" -ne 0 ]]; then
    failed=$((failed + 1))
    echo "Warning: curl failed (exit ${curl_status}) for: ${line}" >&2
  fi

  echo "Processing entry ${COUNT}/${total}..."
done < "${infile}"

echo
if [[ "${failed}" -gt 0 ]]; then
  echo "Warning: ${failed} of ${COUNT} urls could not be fetched" >&2
fi
echo "Processing Complete"