consistent input & output variables for filenames to run mcsm

2020-01-13 12:16:13 +00:00 · 2020-01-13 12:16:13 +00:00 · e3f4c630a1
commit e3f4c630a1
parent ef99167679
4 changed files with 71 additions and 55 deletions
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step1_lig_output_urls.sh
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step1_lig_output_urls.sh
@ -15,35 +15,40 @@

 # NOTE: these are just result urls, not actual values for results
 #**********************************************************************
-# specify variables for input and output paths and filenames
+############# specify variables for input and output paths and filenames
+homedir="${HOME}"
+#echo Home directory is ${homedir}
+basedir="/git/Data/pyrazinamide/input"

-inpath="${HOME}/git/Data/pyrazinamide/input"
-processed_path="/processed"
-struc_path="/structure"
-infile_mut="/pnca_mis_SNPs_v2_unique.csv"
-infile_struc="/complex1_no_water.pdb"
+# input
+inpath_mut="/processed"
+in_filename_mut="/pnca_mis_SNPs_v2_unique.csv"
+infile_mut="${homedir}${basedir}${inpath_mut}${in_filename_mut}"
+echo Input Mut filename: ${infile_mut}

-outpath="${inpath}${processed_path}"
-outfile="/complex1_result_url.txt"
+inpath_struc="/structure"
+in_filename_struc="/complex1_no_water.pdb"
+infile_struc="${homedir}${basedir}${inpath_struc}${in_filename_struc}"
+echo Input Struc filename: ${infile_struc}

-# create valid input and output filenames
-#filename="${HOME}/git/Data/pyrazinamide/input/processed/pnca_mis_SNPs_v2_unique.csv"
-filename="${inpath}${processed_path}${infile_mut}"
-echo Input File is: ${filename}
+# output
+outpath="/processed"
+out_filename="/complex1_result_url.txt"
+outfile="${homedir}${basedir}${outpath}${out_filename}"
+#echo Output filename: ${outfile}
+################## end of variable assignment for input and output files

-outfilename="${outpath}${outfile}"
-echo Output File will be: ${outfilename}
-
-# iterate over mutation file; line by line and submit query using curl
+# iterate over the mutation file (infile_mut) line by line and
+# submit a query for each entry using curl
 # some useful messages
-echo -n -e "Processing $(wc -l < ${filename}) entries from ${infile_mut}\n"
+echo -n -e "Processing $(wc -l < ${infile_mut}) entries from ${infile_mut}\n"
 COUNT=0
 while read -r line; do
 ((COUNT++))
 mutation="${line}"
 #    echo "${mutation}"
 #pdb='../Data/complex1_no_water.pdb'
-pdb="${inpath}${struc_path}${infile_struc}"
+pdb="${infile_struc}"
 mutation="${mutation}"
 chain="A"
 lig_id="PZA"
@ -51,7 +56,7 @@ affin_wt="0.99"
 host="http://biosig.unimelb.edu.au"
 call_url="/mcsm_lig/prediction"

-##=========================================
+#=========================================
 ##html field_names names required for curl
 ##complex_field:wild=@
 ##mutation_field:mutation=@
@ -78,19 +83,20 @@ refresh_url=$(curl -L \
 result_url=$(echo $refresh_url | sed -r 's/.*(\/mcsm.*)".*$/\1/g')
 sleep 10

-echo -e "${mutation} : processing entry ${COUNT}/$(wc -l < ${filename})..."
+echo -e "${mutation} : processing entry ${COUNT}/$(wc -l < ${infile_mut})..."

 # create output file with the added number of muts from file
 # after much thought, bad idea as less generic!
 #echo -e "${host}${result_url}" >> ../Results/$(wc -l < ${filename})_complex1_result_url.txt
-echo -e "${host}${result_url}" >> ${outfilename}
+echo -e "${host}${result_url}" >> ${outfile}
 #echo -n '.'
-done < "${filename}"
+done < "${infile_mut}"

+#FIXME: stop executing on error; otherwise these echo statements are misleading!
 echo
-echo Output filename: ${outfilename}
+echo Output filename: ${outfile}
 echo
-echo Number of urls saved: $(wc -l < ${filename})
+echo Number of urls saved: $(wc -l < ${infile_mut})
 echo
 echo "Processing Complete"

--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_lig_results.sh
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_lig_results.sh
@ -32,24 +32,27 @@
 #outfile=$2
 ############################ end of code block to make command line args

-# specify variables for input and output paths and filenames
-inpath="${HOME}/git/Data/pyrazinamide/input"
-processed_path="/processed"
-infile="/complex1_result_url.txt"
+############# specify variables for input and output paths and filenames
+homedir="${HOME}"
+#echo Home directory is ${homedir}
+basedir="/git/Data/pyrazinamide/input"

-outpath="${inpath}${processed_path}"
-outfile="/complex1_output_MASTER.txt"
+# input
+inpath="/processed"
+in_filename="/complex1_result_url.txt"
+infile="${homedir}${basedir}${inpath}${in_filename}"
+echo Input Mut filename: ${infile}

-# create valid input and output filenames
-filename="${inpath}${processed_path}${infile}"
-echo Input File is: ${filename}
-
-outfilename="${outpath}${outfile}"
-echo Output File will be: ${outfilename}
+# output
+outpath="/processed"
+out_filename="/complex1_output_MASTER.txt"
+outfile="${homedir}${basedir}${outpath}${out_filename}"
+echo Output filename: ${outfile}
+################## end of variable assignment for input and output files

 # Iterate over each result url, and extract results using hxtools 
 # which nicely cleans and formats html
-echo -n "Processing $(wc -l < ${filename}) entries from ${infile}"
+echo -n "Processing $(wc -l < ${infile}) entries from ${infile}"
 echo
 COUNT=0
 while read -r line; do
@ -61,12 +64,12 @@ while read -r line; do
    | hxselect -c div.well \
    | sed -r -e 's/<[^>]*>//g' \
    | sed -re 's/ +//g' \
-    >> ${outfilename}
-  #| tee -a ${outfilename}
+    >> ${outfile}
+  #| tee -a ${outfile}
 #  echo -n '.'
-echo -e "Processing entry ${COUNT}/$(wc -l < ${filename})..."  
+echo -e "Processing entry ${COUNT}/$(wc -l < ${infile})..."  
  
-done < "${filename}"
+done < "${infile}"

 echo
 echo "Processing Complete"
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_results_format_interim.sh
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_results_format_interim.sh
@ -22,22 +22,27 @@
 # the output is a txt file with no newlines and formatting 
 # to have the following format "<colname><:><value>
 #***********************************************************************
-# specify variables for input and output paths and filenames
-inpath="${HOME}/git/Data/pyrazinamide/input"
-processed_path="/processed"
+############# specify variables for input and output paths and filenames
+homedir="${HOME}"
+basedir="/git/Data/pyrazinamide/input"
+
+inpath="/processed"

 # Create input file: copy and rename output file of step2
-oldfile="${inpath}${processed_path}/complex1_output_MASTER.txt"
-newfile="${inpath}${processed_path}/complex1_output_processed.txt"
+oldfile="${homedir}${basedir}${inpath}/complex1_output_MASTER.txt"
+newfile="${homedir}${basedir}${inpath}/complex1_output_processed.txt"
 cp $oldfile $newfile

-infile="/complex1_output_processed.txt"
-filename="${inpath}${processed_path}${infile}"
+echo Input filename is ${oldfile}
+echo
+echo Output i.e copied filename is ${newfile}

-echo Input filename is : ${filename}
+# output: No output per se
+# Replacement in place inside the copied file
+################## end of variable assignment for input and output files

-#sed -i '/PredictedAffinityChange:/ { N; N; N; N; s/\n//g;}' ${filename} \
-# | sed -i '/DUETstabilitychange:/ {x; N; N; s/\n//g; p;d;}' ${filename}
+#sed -i '/PredictedAffinityChange:/ { N; N; N; N; s/\n//g;}' ${newfile} \
+# | sed -i '/DUETstabilitychange:/ {x; N; N; s/\n//g; p;d;}' ${newfile}

 # Outputs records separated by a newline, that look something like this:
 # PredictedAffinityChange:-2.2log(affinityfoldchange)-Destabilizing
@ -66,4 +71,4 @@ N
 N
 s/\n//g
 }
-/^$/d' ${filename}
+/^$/d' ${newfile}
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_results_format_df.py
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_results_format_df.py
@ -21,20 +21,22 @@ from collections import defaultdict
 # output: formatted .csv file
 	# path: "Data/<drug>/input/processed/<filename>"
 #***********************************************************************
-# specify variables for input and output paths and filenames
+############# specify variables for input and output paths and filenames
 homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
-
 basedir = "/git/Data/pyrazinamide/input"
+
+# input
 inpath = "/processed"
 in_filename = "/complex1_output_processed.txt"
 infile = homedir + basedir + inpath + in_filename
 print("Input file is:", infile)

+# output
 outpath = "/processed"
 out_filename = "/complex1_formatted_results.csv"
 outfile = homedir + basedir + outpath + out_filename
 print("Output file is:", outfile)
-# end of variable assignment for input and output files
+################## end of variable assignment for input and output files

 outCols=[
        'PredictedAffinityChange',