consistent input & output variables for filenames to run mcsm

This commit is contained in:
Tanushree Tunstall 2020-01-13 12:16:13 +00:00
parent ef99167679
commit e3f4c630a1
4 changed files with 71 additions and 55 deletions

View file

@ -15,35 +15,40 @@
# NOTE: these are just result urls, not actual values for results
#**********************************************************************
# specify variables for input and output paths and filenames
############# specify variables for input and output paths and filenames
homedir="${HOME}"
#echo Home directory is ${homedir}
basedir="/git/Data/pyrazinamide/input"
inpath="${HOME}/git/Data/pyrazinamide/input"
processed_path="/processed"
struc_path="/structure"
infile_mut="/pnca_mis_SNPs_v2_unique.csv"
infile_struc="/complex1_no_water.pdb"
# input
inpath_mut="/processed"
in_filename_mut="/pnca_mis_SNPs_v2_unique.csv"
infile_mut="${homedir}${basedir}${inpath_mut}${in_filename_mut}"
echo Input Mut filename: ${infile_mut}
outpath="${inpath}${processed_path}"
outfile="/complex1_result_url.txt"
inpath_struc="/structure"
in_filename_struc="/complex1_no_water.pdb"
infile_struc="${homedir}${basedir}${inpath_struc}${in_filename_struc}"
echo Input Struc filename: ${infile_struc}
# create valid input and output filenames
#filename="${HOME}/git/Data/pyrazinamide/input/processed/pnca_mis_SNPs_v2_unique.csv"
filename="${inpath}${processed_path}${infile_mut}"
echo Input File is: ${filename}
# output
outpath="/processed"
out_filename="/complex1_result_url.txt"
outfile="${homedir}${basedir}${outpath}${out_filename}"
#echo Output filename: ${outfile}
################## end of variable assignment for input and output files
outfilename="${outpath}${outfile}"
echo Output File will be: ${outfilename}
# iterate over mutation file; line by line and submit query using curl
# iterate over mutation file (infile_mut); line by line and
# submit query using curl
# some useful messages
echo -n -e "Processing $(wc -l < ${filename}) entries from ${infile_mut}\n"
echo -n -e "Processing $(wc -l < ${infile_mut}) entries from ${infile_mut}\n"
COUNT=0
while read -r line; do
((COUNT++))
mutation="${line}"
# echo "${mutation}"
#pdb='../Data/complex1_no_water.pdb'
pdb="${inpath}${struc_path}${infile_struc}"
pdb="${infile_struc}"
mutation="${mutation}"
chain="A"
lig_id="PZA"
@ -51,7 +56,7 @@ affin_wt="0.99"
host="http://biosig.unimelb.edu.au"
call_url="/mcsm_lig/prediction"
##=========================================
#=========================================
##html field_names names required for curl
##complex_field:wild=@
##mutation_field:mutation=@
@ -78,19 +83,20 @@ refresh_url=$(curl -L \
result_url=$(echo $refresh_url | sed -r 's/.*(\/mcsm.*)".*$/\1/g')
sleep 10
echo -e "${mutation} : processing entry ${COUNT}/$(wc -l < ${filename})..."
echo -e "${mutation} : processing entry ${COUNT}/$(wc -l < ${infile_mut})..."
# create output file with the added number of muts from file
# after much thought, this is a bad idea as it makes the script less generic!
#echo -e "${host}${result_url}" >> ../Results/$(wc -l < ${filename})_complex1_result_url.txt
echo -e "${host}${result_url}" >> ${outfilename}
echo -e "${host}${result_url}" >> ${outfile}
#echo -n '.'
done < "${filename}"
done < "${infile_mut}"
#FIXME: stop executing on error; otherwise these echo statements are misleading!
echo
echo Output filename: ${outfilename}
echo Output filename: ${outfile}
echo
echo Number of urls saved: $(wc -l < ${filename})
echo Number of urls saved: $(wc -l < ${infile_mut})
echo
echo "Processing Complete"

View file

@ -32,24 +32,27 @@
#outfile=$2
############################ end of code block to make command line args
# specify variables for input and output paths and filenames
inpath="${HOME}/git/Data/pyrazinamide/input"
processed_path="/processed"
infile="/complex1_result_url.txt"
############# specify variables for input and output paths and filenames
homedir="${HOME}"
#echo Home directory is ${homedir}
basedir="/git/Data/pyrazinamide/input"
outpath="${inpath}${processed_path}"
outfile="/complex1_output_MASTER.txt"
# input
inpath="/processed"
in_filename="/complex1_result_url.txt"
infile="${homedir}${basedir}${inpath}${in_filename}"
echo Input Mut filename: ${infile}
# create valid input and output filenames
filename="${inpath}${processed_path}${infile}"
echo Input File is: ${filename}
outfilename="${outpath}${outfile}"
echo Output File will be: ${outfilename}
# output
outpath="/processed"
out_filename="/complex1_output_MASTER.txt"
outfile="${homedir}${basedir}${outpath}${out_filename}"
echo Output filename: ${outfile}
################## end of variable assignment for input and output files
# Iterate over each result url, and extract results using hxtools
# which nicely cleans and formats html
echo -n "Processing $(wc -l < ${filename}) entries from ${infile}"
echo -n "Processing $(wc -l < ${infile}) entries from ${infile}"
echo
COUNT=0
while read -r line; do
@ -61,12 +64,12 @@ while read -r line; do
| hxselect -c div.well \
| sed -r -e 's/<[^>]*>//g' \
| sed -re 's/ +//g' \
>> ${outfilename}
#| tee -a ${outfilename}
>> ${outfile}
#| tee -a ${outfile}
# echo -n '.'
echo -e "Processing entry ${COUNT}/$(wc -l < ${filename})..."
echo -e "Processing entry ${COUNT}/$(wc -l < ${infile})..."
done < "${filename}"
done < "${infile}"
echo
echo "Processing Complete"

View file

@ -22,22 +22,27 @@
# the output is a txt file with no newlines and formatting
# to have the following format "<colname><:><value>"
#***********************************************************************
# specify variables for input and output paths and filenames
inpath="${HOME}/git/Data/pyrazinamide/input"
processed_path="/processed"
############# specify variables for input and output paths and filenames
homedir="${HOME}"
basedir="/git/Data/pyrazinamide/input"
inpath="/processed"
# Create input file: copy and rename output file of step2
oldfile="${inpath}${processed_path}/complex1_output_MASTER.txt"
newfile="${inpath}${processed_path}/complex1_output_processed.txt"
oldfile="${homedir}${basedir}${inpath}/complex1_output_MASTER.txt"
newfile="${homedir}${basedir}${inpath}/complex1_output_processed.txt"
cp $oldfile $newfile
infile="/complex1_output_processed.txt"
filename="${inpath}${processed_path}${infile}"
echo Input filename is ${oldfile}
echo
echo Output i.e copied filename is ${newfile}
echo Input filename is : ${filename}
# output: No output per se
# Replacement in place inside the copied file
################## end of variable assignment for input and output files
#sed -i '/PredictedAffinityChange:/ { N; N; N; N; s/\n//g;}' ${filename} \
# | sed -i '/DUETstabilitychange:/ {x; N; N; s/\n//g; p;d;}' ${filename}
#sed -i '/PredictedAffinityChange:/ { N; N; N; N; s/\n//g;}' ${newfile} \
# | sed -i '/DUETstabilitychange:/ {x; N; N; s/\n//g; p;d;}' ${newfile}
# Outputs records separated by a newline, that look something like this:
# PredictedAffinityChange:-2.2log(affinityfoldchange)-Destabilizing
@ -66,4 +71,4 @@ N
N
s/\n//g
}
/^$/d' ${filename}
/^$/d' ${newfile}

View file

@ -21,20 +21,22 @@ from collections import defaultdict
# output: formatted .csv file
# path: "Data/<drug>/input/processed/<filename>"
#***********************************************************************
# specify variables for input and output paths and filenames
############# specify variables for input and output paths and filenames
homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
basedir = "/git/Data/pyrazinamide/input"
# input
inpath = "/processed"
in_filename = "/complex1_output_processed.txt"
infile = homedir + basedir + inpath + in_filename
print("Input file is:", infile)
# output
outpath = "/processed"
out_filename = "/complex1_formatted_results.csv"
outfile = homedir + basedir + outpath + out_filename
print("Output file is:", outfile)
# end of variable assignment for input and output files
################## end of variable assignment for input and output files
outCols=[
'PredictedAffinityChange',