renamed file paths and names to run mcsm

2020-01-10 12:18:18 +00:00 · 2020-01-10 12:18:18 +00:00 · f026efb4db
commit f026efb4db
parent cf7d6f9f03
4 changed files with 115 additions and 57 deletions
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step0_check_duplicate_SNPs.sh
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step0_check_duplicate_SNPs.sh
@ -11,8 +11,8 @@
 # per line. Sort by unique, which automatically removes duplicates.
 # save file in current directory
 #**********************************************************************
-infile="${HOME}/git/Data/input/processed/pyrazinamide/pnca_mis_SNPs_v2.csv"
+infile="${HOME}/git/Data/pyrazinamide/input/processed/pnca_mis_SNPs_v2.csv"
-outfile="${HOME}/git/Data/input/processed/pyrazinamide/pnca_mis_SNPs_v2_unique.csv"
+outfile="${HOME}/git/Data/pyrazinamide/input/processed/pnca_mis_SNPs_v2_unique.csv"
 # sort unique entries and output to current directory
 sort -u ${infile} > ${outfile}
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step1_mCSMLig_curl_submit_store_outputurl.sh
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step1_mCSMLig_curl_submit_store_outputurl.sh
@ -1,30 +1,49 @@
 #!/bin/bash
 #*************************************
 #need to be in the correct directory
 #*************************************
 ##: comments for code
 #: commented out code
 #**********************************************************************
 # TASK: submit requests using curl: HANDLE redirects and refresh url. 
 # Iterate over mutation file and write/append result urls to a file
-# result url file: stored in the /Results directory
+# Mutation file must have one mutation (format A1B) per line
-# mutation file: one mutation per line, no chain ID
+# Requirements
-# output: in a file, should be n urls (n=no. of mutations in file)
+# input: mutation list (format: A1B), complex struc: (pdb format)
    # mutation: outFile from step0, one unique mutation/line, no chain ID
    	# path: "Data/<drug>/input/processed/<filename>"
    # structure: pdb file of drug-target complex
    	# path: "Data/<drug>/input/structure/<filename>"
 # output: should be n urls (n=no. of unique mutations in file)
 	# path: "Data/<drug>/input/processed/<filename>"
 # NOTE: these are just result urls, not actual values for results
 #**********************************************************************
-## iterate over mutation file; line by line and submit query using curl
+# specify variables for input and output paths and filenames
 filename="../Data/pnca_mis_SNPs_v2_unique.csv"
-## some useful messages
+inpath="${HOME}/git/Data/pyrazinamide/input"
-echo -n -e "Processing $(wc -l < ${filename}) entries from ${filename}\n"
+processed_path="/processed"
 struc_path="/structure"
 infile_mut="/pnca_mis_SNPs_v2_unique.csv"
 infile_struc="/complex1_no_water.pdb"
 outpath="${inpath}${processed_path}"
 outfile="/mCSM_lig_complex1_result_url.txt"
 # create valid input and output filenames
 #filename="${HOME}/git/Data/pyrazinamide/input/processed/pnca_mis_SNPs_v2_unique.csv"
 filename="${inpath}${processed_path}${infile_mut}"
 echo Input File is: ${filename}
 outfilename="${outpath}${outfile}"
 echo Output File will be: ${outfilename}
 # iterate over mutation file; line by line and submit query using curl
 # some useful messages
 echo -n -e "Processing $(wc -l < ${filename}) entries from ${infile_mut}\n"
 COUNT=0
 while read -r line; do
 ((COUNT++))
 mutation="${line}"
 #    echo "${mutation}"
-pdb='../Data/complex1_no_water.pdb'
+#pdb='../Data/complex1_no_water.pdb'
 pdb="${inpath}${struc_path}${infile_struc}"
 mutation="${mutation}"
 chain="A"
 lig_id="PZA"
@ -49,24 +68,31 @@ refresh_url=$(curl -L \
     -F "affin_wt=${affin_wt}" \
     ${host}${call_url} | grep "http-equiv")
-#echo $refresh_url
+#echo Refresh URL: $refresh_url
-#echo ${host}${refresh_url}
+#echo Host+Refresh: ${host}${refresh_url}
 # use regex to extract the relevant bit from the refresh url
 # regex:sed -r 's/.*(\/mcsm.*)".*$/\1/g'
-#Now build: result url using host and refresh url and write the urls to a file in the Results dir
+# Now build: result url using host and refresh url and write the urls to a file 
 result_url=$(echo $refresh_url | sed -r 's/.*(\/mcsm.*)".*$/\1/g')
 sleep 10
 echo -e "${mutation} : processing entry ${COUNT}/$(wc -l < ${filename})..."
-echo -e "${host}${result_url}" >> ../Results/$(wc -l < ${filename})_mCSM_lig_complex1_result_url.txt
+# create output file with the added number of muts from file
 # after much thought, bad idea as less generic!
 #echo -e "${host}${result_url}" >> ../Results/$(wc -l < ${filename})_mCSM_lig_complex1_result_url.txt
 echo -e "${host}${result_url}" >> ${outfilename}
 #echo -n '.'
 done < "${filename}"
 echo
 echo Output filename: ${outfilename}
 echo
 echo Number of urls saved: $(wc -l < ${filename})
 echo
 echo "Processing Complete"
-##end of submitting query, receiving result url and storing results url in a file
+# end of submitting query, receiving result url and storing results url in a file
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_mCSM_LIG_batch_outputurls_results.sh
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_mCSM_LIG_batch_outputurls_results.sh
@ -1,23 +1,21 @@
 #!/bin/bash
 #*************************************
 #need to be in the correct directory
 #*************************************
 ##: comments for code
 #: commented out code
 #********************************************************************
 # TASK: submit result urls and fetch actual results using curl
-# iterate over each result url from the output of step1 in the stored
+# Iterate over each result url from the output of step1 stored in processed/
 # in file in /Results.
 # Use curl to fetch results and extract relevant sections using hxtools
-# and store these in another file in /Results 
+# and store these in another file in processed/
 # This script takes two arguments:
 # 	input file: file containing results url
 #				In this case: 336_mCSM_lig_complex1_result_url.txt
 # 	output file: name of the file where extracted results will be stored
 #				In this case : it is 336_mCSM_lig_complex1_output_MASTER.txt
 #*********************************************************************
 # Requirements:
 # input: output of step1, file containing result urls
 	# path: "Data/<drug>/input/processed/<filename>"
 # output: name of the file where extracted results will be stored
 	# path: "Data/<drug>/input/processed/<filename>"
 # Optional: can make these command line args you pass when calling script
 # by uncommenting code as indicated
 #*********************************************************************
 ############################# uncomment: to make it command line args
 #if [ "$#" -ne 2 ]; then
  #if [ -Z $1 ]; then
 #  echo "
@ -32,11 +30,26 @@
 # Second argument: Output File
 #infile=$1
 #outfile=$2
 ############################ end of code block to make command line args
-infile="${HOME}/git/LSHTM_analysis/mcsm_complex1/Results/336_mCSM_lig_complex1_result_url.txt"
+# specify variables for input and output paths and filenames
-outfile="${HOME}/git/LSHTM_analysis/mcsm_complex1/Results/336_mCSM_lig_complex1_output_MASTER.txt"
+inpath="${HOME}/git/Data/pyrazinamide/input"
 processed_path="/processed"
 infile="/mCSM_lig_complex1_result_url.txt"
-echo -n "Processing $(wc -l < ${infile}) entries from ${infile}"
+outpath="${inpath}${processed_path}"
 outfile="/mCSM_lig_complex1_output_MASTER.txt"
 # create valid input and output filenames
 filename="${inpath}${processed_path}${infile}"
 echo Input File is: ${filename}
 outfilename="${outpath}${outfile}"
 echo Output File will be: ${outfilename}
 # Iterate over each result url, and extract results using hxtools 
 # which nicely cleans and formats html
 echo -n "Processing $(wc -l < ${filename}) entries from ${infile}"
 echo
 COUNT=0
 while read -r line; do
@ -48,12 +61,13 @@ while read -r line; do
    | hxselect -c div.well \
    | sed -r -e 's/<[^>]*>//g' \
    | sed -re 's/ +//g' \
-    >> ${outfile}
+    >> ${outfilename}
-  #| tee -a ${outfile}
+  #| tee -a ${outfilename}
 #  echo -n '.'
-echo -e "Processing entry ${COUNT}/$(wc -l < ${infile})..."  
+echo -e "Processing entry ${COUNT}/$(wc -l < ${filename})..."  
-done < "${infile}"
+done < "${filename}"
 echo
 echo "Processing Complete"
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_mCSM_LIG_regex_output_formatting.sh
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_mCSM_LIG_regex_output_formatting.sh
@ -1,9 +1,4 @@
 #!/bin/bash
 #*************************************
 #need to be in the correct directory
 #*************************************
 ##: comments for code
 #: commented out code
 #********************************************************************
 # TASK: Intermediate results processing
@ -14,12 +9,36 @@
 # prevent this from happening. Additionally there are other empty lines
 # that need to be omitted. In order to ensure these sections are not split
 # over multiple lines, this script is written.
 #*********************************************************************
-infile="../Results/336_mCSM_lig_complex1_output_processed.txt"
+# Requirements:
 # input: output of step2, file containing mcsm results as described above
 	# path: "Data/<drug>/input/processed/<filename>"
 # output: replaces file in place.
 # Therefore first create a copy of the input file
 # but rename it to remove the word "MASTER" and add the word "processed"
 # file format: .txt
-#sed -i '/PredictedAffinityChange:/ { N; N; N; N; s/\n//g;}' ${infile} \
+# NOTE: This replaces the file in place!
-# | sed -i '/DUETstabilitychange:/ {x; N; N; s/\n//g; p;d;}' ${infile}
+# the output is a txt file with no newlines and formatting 
 # to have the following format "<colname><:><value>"
 #***********************************************************************
 # specify variables for input and output paths and filenames
 inpath="${HOME}/git/Data/pyrazinamide/input"
 processed_path="/processed"
 # Create input file: copy and rename output file of step2
 oldfile="${inpath}${processed_path}/mCSM_lig_complex1_output_MASTER.txt"
 newfile="${inpath}${processed_path}/mCSM_lig_complex1_output_processed.txt"
 cp $oldfile $newfile
 #infile="../Results/336_mCSM_lig_complex1_output_processed.txt"
 infile="/mCSM_lig_complex1_output_processed.txt"
 filename="${inpath}${processed_path}${infile}"
 echo Input filename is : ${filename}
 #sed -i '/PredictedAffinityChange:/ { N; N; N; N; s/\n//g;}' ${filename} \
 # | sed -i '/DUETstabilitychange:/ {x; N; N; s/\n//g; p;d;}' ${filename}
 # Outputs records separated by a newline, that look something like this:
 # PredictedAffinityChange:-2.2log(affinityfoldchange)-Destabilizing
@ -36,7 +55,6 @@ infile="../Results/336_mCSM_lig_complex1_output_processed.txt"
 # (...etc)
 # This script brings everything in a convenient format for further processing in python.
 # bear in mind, this replaces the file in place, so make sure you retain a copy for your records
 sed -i '/PredictedAffinityChange/ {
 N
 N
@ -49,4 +67,4 @@ N
 N
 s/\n//g
 }
-/^$/d' ${infile}
+/^$/d' ${filename}