consistent input & output variables for filenames to run mcsm

This commit is contained in:
Tanushree Tunstall 2020-01-13 12:16:13 +00:00
parent ef99167679
commit e3f4c630a1
4 changed files with 71 additions and 55 deletions

View file

@ -15,35 +15,40 @@
# NOTE: these are just result urls, not actual values for results # NOTE: these are just result urls, not actual values for results
#********************************************************************** #**********************************************************************
# specify variables for input and output paths and filenames ############# specify variables for input and output paths and filenames
homedir="${HOME}"
#echo Home directory is ${homedir}
basedir="/git/Data/pyrazinamide/input"
inpath="${HOME}/git/Data/pyrazinamide/input" # input
processed_path="/processed" inpath_mut="/processed"
struc_path="/structure" in_filename_mut="/pnca_mis_SNPs_v2_unique.csv"
infile_mut="/pnca_mis_SNPs_v2_unique.csv" infile_mut="${homedir}${basedir}${inpath_mut}${in_filename_mut}"
infile_struc="/complex1_no_water.pdb" echo Input Mut filename: ${infile_mut}
outpath="${inpath}${processed_path}" inpath_struc="/structure"
outfile="/complex1_result_url.txt" in_filename_struc="/complex1_no_water.pdb"
infile_struc="${homedir}${basedir}${inpath_struc}${in_filename_struc}"
echo Input Struc filename: ${infile_struc}
# create valid input and output filenames # output
#filename="${HOME}/git/Data/pyrazinamide/input/processed/pnca_mis_SNPs_v2_unique.csv" outpath="/processed"
filename="${inpath}${processed_path}${infile_mut}" out_filename="/complex1_result_url.txt"
echo Input File is: ${filename} outfile="${homedir}${basedir}${outpath}${out_filename}"
#echo Output filename: ${outfile}
################## end of variable assignment for input and output files
outfilename="${outpath}${outfile}" # iterate over mutation file (infile_mut); line by line and
echo Output File will be: ${outfilename} # submit query using curl
# iterate over mutation file; line by line and submit query using curl
# some useful messages # some useful messages
echo -n -e "Processing $(wc -l < ${filename}) entries from ${infile_mut}\n" echo -n -e "Processing $(wc -l < ${infile_mut}) entries from ${infile_mut}\n"
COUNT=0 COUNT=0
while read -r line; do while read -r line; do
((COUNT++)) ((COUNT++))
mutation="${line}" mutation="${line}"
# echo "${mutation}" # echo "${mutation}"
#pdb='../Data/complex1_no_water.pdb' #pdb='../Data/complex1_no_water.pdb'
pdb="${inpath}${struc_path}${infile_struc}" pdb="${infile_struc}"
mutation="${mutation}" mutation="${mutation}"
chain="A" chain="A"
lig_id="PZA" lig_id="PZA"
@ -51,7 +56,7 @@ affin_wt="0.99"
host="http://biosig.unimelb.edu.au" host="http://biosig.unimelb.edu.au"
call_url="/mcsm_lig/prediction" call_url="/mcsm_lig/prediction"
##========================================= #=========================================
##html field_names names required for curl ##html field_names names required for curl
##complex_field:wild=@ ##complex_field:wild=@
##mutation_field:mutation=@ ##mutation_field:mutation=@
@ -78,19 +83,20 @@ refresh_url=$(curl -L \
result_url=$(echo $refresh_url | sed -r 's/.*(\/mcsm.*)".*$/\1/g') result_url=$(echo $refresh_url | sed -r 's/.*(\/mcsm.*)".*$/\1/g')
sleep 10 sleep 10
echo -e "${mutation} : processing entry ${COUNT}/$(wc -l < ${filename})..." echo -e "${mutation} : processing entry ${COUNT}/$(wc -l < ${infile_mut})..."
# create output file with the added number of muts from file # create output file with the added number of muts from file
# after much thought, bad idea as less generic! # after much thought, bad idea as less generic!
#echo -e "${host}${result_url}" >> ../Results/$(wc -l < ${filename})_complex1_result_url.txt #echo -e "${host}${result_url}" >> ../Results/$(wc -l < ${filename})_complex1_result_url.txt
echo -e "${host}${result_url}" >> ${outfilename} echo -e "${host}${result_url}" >> ${outfile}
#echo -n '.' #echo -n '.'
done < "${filename}" done < "${infile_mut}"
#FIXME: stop executing on error; otherwise these echo statements are misleading!
echo echo
echo Output filename: ${outfilename} echo Output filename: ${outfile}
echo echo
echo Number of urls saved: $(wc -l < ${filename}) echo Number of urls saved: $(wc -l < ${infile_mut})
echo echo
echo "Processing Complete" echo "Processing Complete"

View file

@ -32,24 +32,27 @@
#outfile=$2 #outfile=$2
############################ end of code block to make command line args ############################ end of code block to make command line args
# specify variables for input and output paths and filenames ############# specify variables for input and output paths and filenames
inpath="${HOME}/git/Data/pyrazinamide/input" homedir="${HOME}"
processed_path="/processed" #echo Home directory is ${homedir}
infile="/complex1_result_url.txt" basedir="/git/Data/pyrazinamide/input"
outpath="${inpath}${processed_path}" # input
outfile="/complex1_output_MASTER.txt" inpath="/processed"
in_filename="/complex1_result_url.txt"
infile="${homedir}${basedir}${inpath}${in_filename}"
echo Input Mut filename: ${infile}
# create valid input and output filenames # output
filename="${inpath}${processed_path}${infile}" outpath="/processed"
echo Input File is: ${filename} out_filename="/complex1_output_MASTER.txt"
outfile="${homedir}${basedir}${outpath}${out_filename}"
outfilename="${outpath}${outfile}" echo Output filename: ${outfile}
echo Output File will be: ${outfilename} ################## end of variable assignment for input and output files
# Iterate over each result url, and extract results using hxtools # Iterate over each result url, and extract results using hxtools
# which nicely cleans and formats html # which nicely cleans and formats html
echo -n "Processing $(wc -l < ${filename}) entries from ${infile}" echo -n "Processing $(wc -l < ${infile}) entries from ${infile}"
echo echo
COUNT=0 COUNT=0
while read -r line; do while read -r line; do
@ -61,12 +64,12 @@ while read -r line; do
| hxselect -c div.well \ | hxselect -c div.well \
| sed -r -e 's/<[^>]*>//g' \ | sed -r -e 's/<[^>]*>//g' \
| sed -re 's/ +//g' \ | sed -re 's/ +//g' \
>> ${outfilename} >> ${outfile}
#| tee -a ${outfilename} #| tee -a ${outfile}
# echo -n '.' # echo -n '.'
echo -e "Processing entry ${COUNT}/$(wc -l < ${filename})..." echo -e "Processing entry ${COUNT}/$(wc -l < ${infile})..."
done < "${filename}" done < "${infile}"
echo echo
echo "Processing Complete" echo "Processing Complete"

View file

@ -22,22 +22,27 @@
# the output is a txt file with no newlines and formatting # the output is a txt file with no newlines and formatting
# to have the following format "<colname><:><value>" # to have the following format "<colname><:><value>"
#*********************************************************************** #***********************************************************************
# specify variables for input and output paths and filenames ############# specify variables for input and output paths and filenames
inpath="${HOME}/git/Data/pyrazinamide/input" homedir="${HOME}"
processed_path="/processed" basedir="/git/Data/pyrazinamide/input"
inpath="/processed"
# Create input file: copy and rename output file of step2 # Create input file: copy and rename output file of step2
oldfile="${inpath}${processed_path}/complex1_output_MASTER.txt" oldfile="${homedir}${basedir}${inpath}/complex1_output_MASTER.txt"
newfile="${inpath}${processed_path}/complex1_output_processed.txt" newfile="${homedir}${basedir}${inpath}/complex1_output_processed.txt"
cp $oldfile $newfile cp $oldfile $newfile
infile="/complex1_output_processed.txt" echo Input filename is ${oldfile}
filename="${inpath}${processed_path}${infile}" echo
echo Output i.e copied filename is ${newfile}
echo Input filename is : ${filename} # output: No output per se
# Replacement in place inside the copied file
################## end of variable assignment for input and output files
#sed -i '/PredictedAffinityChange:/ { N; N; N; N; s/\n//g;}' ${filename} \ #sed -i '/PredictedAffinityChange:/ { N; N; N; N; s/\n//g;}' ${newfile} \
# | sed -i '/DUETstabilitychange:/ {x; N; N; s/\n//g; p;d;}' ${filename} # | sed -i '/DUETstabilitychange:/ {x; N; N; s/\n//g; p;d;}' ${newfile}
# Outputs records separated by a newline, that look something like this: # Outputs records separated by a newline, that look something like this:
# PredictedAffinityChange:-2.2log(affinityfoldchange)-Destabilizing # PredictedAffinityChange:-2.2log(affinityfoldchange)-Destabilizing
@ -66,4 +71,4 @@ N
N N
s/\n//g s/\n//g
} }
/^$/d' ${filename} /^$/d' ${newfile}

View file

@ -21,20 +21,22 @@ from collections import defaultdict
# output: formatted .csv file # output: formatted .csv file
# path: "Data/<drug>/input/processed/<filename>" # path: "Data/<drug>/input/processed/<filename>"
#*********************************************************************** #***********************************************************************
# specify variables for input and output paths and filenames ############# specify variables for input and output paths and filenames
homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
basedir = "/git/Data/pyrazinamide/input" basedir = "/git/Data/pyrazinamide/input"
# input
inpath = "/processed" inpath = "/processed"
in_filename = "/complex1_output_processed.txt" in_filename = "/complex1_output_processed.txt"
infile = homedir + basedir + inpath + in_filename infile = homedir + basedir + inpath + in_filename
print("Input file is:", infile) print("Input file is:", infile)
# output
outpath = "/processed" outpath = "/processed"
out_filename = "/complex1_formatted_results.csv" out_filename = "/complex1_formatted_results.csv"
outfile = homedir + basedir + outpath + out_filename outfile = homedir + basedir + outpath + out_filename
print("Output file is:", outfile) print("Output file is:", outfile)
# end of variable assignment for input and output files ################## end of variable assignment for input and output files
outCols=[ outCols=[
'PredictedAffinityChange', 'PredictedAffinityChange',