LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_lig_results.sh

73 lines
2.1 KiB
Bash
Executable file

#!/bin/bash
#********************************************************************
# TASK: submit result urls and fetch actual results using curl
# Iterate over each result url from the output of step1 stored in processed/
# Use curl to fetch each result page and extract the relevant sections
# with hxnormalize and hxselect
# and store these in another file in processed/
# Requirements:
# input: output of step1, file containing result urls
# path: "Data/<drug>/input/processed/<filename>"
# output: name of the file where extracted results will be stored
# path: "Data/<drug>/input/processed/<filename>"
# Optional: can make these command line args you pass when calling script
# by uncommenting code as indicated
#*********************************************************************
############################# uncomment: to make it command line args
#if [ "$#" -ne 2 ]; then
# (alternative check; use INSTEAD of the line above, not as well as it)
#if [ -z "$1" ]; then
# echo "
# Please provide both Input and Output files.
# Usage: batch_read_urls.sh INFILE OUTFILE
# "
# exit 1
#fi
# First argument: Input File
# Second argument: Output File
#infile=$1
#outfile=$2
############################ end of code block to make command line args
# Input and output locations.
# Every fragment keeps its leading "/" so the pieces can be joined by plain
# concatenation. Both the url list (input) and the extracted results (output)
# live in the same "processed" directory.
inpath="${HOME}/git/Data/pyrazinamide/input"
processed_path="/processed"
infile="/complex1_result_url.txt"
outpath="${inpath}${processed_path}"
outfile="/complex1_output_MASTER.txt"

# Full paths used by the rest of the script.
filename="${inpath}${processed_path}${infile}"
outfilename="${outpath}${outfile}"

# Quoted so that a path containing spaces or glob characters is reported
# exactly as it will be used, rather than being word-split or expanded.
echo "Input File is: ${filename}"
echo "Output File will be: ${outfilename}"
# Fetch every result url listed in ${filename} (one url per line) and append
# the extracted text of each page to ${outfilename}.
# hxnormalize/hxselect clean the html and select the div.span4 > div.well
# section; sed then strips any remaining tags and removes all spaces.

# Fail early with a clear message rather than erroring inside the loop and
# still reporting "Processing Complete".
if [[ ! -r "${filename}" ]]; then
  echo "Error: cannot read input file: ${filename}" >&2
  exit 1
fi

# Count the entries once (the input does not change while the loop runs).
# Only lines with non-blank content are counted, matching the lines the loop
# actually processes; unlike 'wc -l' this also counts a final line that has
# no trailing newline.
total=$(grep -c '[^[:blank:]]' "${filename}")

echo "Processing ${total} entries from ${infile}"

COUNT=0
# 'read' is deliberately used with the default IFS so that leading/trailing
# blanks around a url are trimmed. The '|| [[ -n ... ]]' clause keeps a last
# line that is not newline-terminated.
while read -r line || [[ -n "${line}" ]]; do
  # Nothing to fetch for blank lines.
  [[ -n "${line}" ]] || continue

  COUNT=$((COUNT + 1))

  # The url is quoted: characters such as '?', '*' and '[' must reach curl
  # verbatim instead of being glob-expanded by the shell.
  # --show-error keeps the progress meter hidden but reports failed requests.
  # To also see the extracted text on the terminal, replace the final
  # redirection with:  | tee -a "${outfilename}"
  curl --silent --show-error "${line}" \
    | hxnormalize -x \
    | hxselect -c div.span4 \
    | hxselect -c div.well \
    | sed -r -e 's/<[^>]*>//g' -e 's/ +//g' \
    >> "${outfilename}"

  echo "Processing entry ${COUNT}/${total}..."
done < "${filename}"

echo
echo "Processing Complete"