renamed file paths and names to run mcsm

2020-01-10 12:18:18 +00:00 · 2020-01-10 12:18:18 +00:00 · f026efb4db
commit f026efb4db
parent cf7d6f9f03
4 changed files with 115 additions and 57 deletions
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step0_check_duplicate_SNPs.sh
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step0_check_duplicate_SNPs.sh
@ -11,8 +11,8 @@
 # per line. Sort by unique, which automatically removes duplicates.
 # save file in current directory
 #**********************************************************************
-infile="${HOME}/git/Data/input/processed/pyrazinamide/pnca_mis_SNPs_v2.csv"
-outfile="${HOME}/git/Data/input/processed/pyrazinamide/pnca_mis_SNPs_v2_unique.csv"
+infile="${HOME}/git/Data/pyrazinamide/input/processed/pnca_mis_SNPs_v2.csv"
+outfile="${HOME}/git/Data/pyrazinamide/input/processed/pnca_mis_SNPs_v2_unique.csv"

 # sort unique entries and output to current directory
 sort -u ${infile} > ${outfile}
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step1_mCSMLig_curl_submit_store_outputurl.sh
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step1_mCSMLig_curl_submit_store_outputurl.sh
@ -1,30 +1,49 @@
 #!/bin/bash

-#*************************************
-#need to be in the correct directory
-#*************************************
-##: comments for code
-#: commented out code
-
 #**********************************************************************
 # TASK: submit requests using curl: HANDLE redirects and refresh url. 
 # Iterate over mutation file and write/append result urls to a file
-# result url file: stored in the /Results directory
-# mutation file: one mutation per line, no chain ID
-# output: in a file, should be n urls (n=no. of mutations in file)
+# Mutation file must have one mutation (format A1B) per line
+# Requirements
+# input: mutation list (format: A1B), complex struc: (pdb format)
+    # mutation: outFile from step0, one unique mutation/line, no chain ID
+    	# path: "Data/<drug>/input/processed/<filename>"
+    # structure: pdb file of drug-target complex
+    	# path: "Data/<drug>/input/structure/<filename>"
+# output: should be n urls (n=no. of unique mutations in file)
+	# path: "Data/<drug>/input/processed/<filename>"
+
 # NOTE: these are just result urls, not actual values for results
 #**********************************************************************
-## iterate over mutation file; line by line and submit query using curl
-filename="../Data/pnca_mis_SNPs_v2_unique.csv"
+# specify variables for input and output paths and filenames

-## some useful messages
-echo -n -e "Processing $(wc -l < ${filename}) entries from ${filename}\n"
+inpath="${HOME}/git/Data/pyrazinamide/input"
+processed_path="/processed"
+struc_path="/structure"
+infile_mut="/pnca_mis_SNPs_v2_unique.csv"
+infile_struc="/complex1_no_water.pdb"
+
+outpath="${inpath}${processed_path}"
+outfile="/mCSM_lig_complex1_result_url.txt"
+
+# create valid input and output filenames
+#filename="${HOME}/git/Data/pyrazinamide/input/processed/pnca_mis_SNPs_v2_unique.csv"
+filename="${inpath}${processed_path}${infile_mut}"
+echo Input File is: ${filename}
+
+outfilename="${outpath}${outfile}"
+echo Output File will be: ${outfilename}
+
+# iterate over mutation file; line by line and submit query using curl
+# some useful messages
+echo -n -e "Processing $(wc -l < ${filename}) entries from ${infile_mut}\n"
 COUNT=0
 while read -r line; do
 ((COUNT++))
-    mutation="${line}"
+mutation="${line}"
 #    echo "${mutation}"
-pdb='../Data/complex1_no_water.pdb'
+#pdb='../Data/complex1_no_water.pdb'
+pdb="${inpath}${struc_path}${infile_struc}"
 mutation="${mutation}"
 chain="A"
 lig_id="PZA"
@ -49,24 +68,31 @@ refresh_url=$(curl -L \
     -F "affin_wt=${affin_wt}" \
     ${host}${call_url} | grep "http-equiv")

-#echo $refresh_url
-#echo ${host}${refresh_url}
+#echo Refresh URL: $refresh_url
+#echo Host+Refresh: ${host}${refresh_url}

-#use regex to extract the relevant bit from the refresh url
-#regex:sed -r 's/.*(\/mcsm.*)".*$/\1/g'
+# use regex to extract the relevant bit from the refresh url
+# regex:sed -r 's/.*(\/mcsm.*)".*$/\1/g'

-#Now build: result url using host and refresh url and write the urls to a file in the Results dir
+# Now build: result url using host and refresh url and write the urls to a file 
 result_url=$(echo $refresh_url | sed -r 's/.*(\/mcsm.*)".*$/\1/g')
 sleep 10

 echo -e "${mutation} : processing entry ${COUNT}/$(wc -l < ${filename})..."

-echo -e "${host}${result_url}" >> ../Results/$(wc -l < ${filename})_mCSM_lig_complex1_result_url.txt
+# create output file with the added number of muts from file
+# after much thought, bad idea as less generic!
+#echo -e "${host}${result_url}" >> ../Results/$(wc -l < ${filename})_mCSM_lig_complex1_result_url.txt
+echo -e "${host}${result_url}" >> ${outfilename}
 #echo -n '.'
 done < "${filename}"

+echo
+echo Output filename: ${outfilename}
+echo
+echo Number of urls saved: $(wc -l < ${filename})
 echo
 echo "Processing Complete"

-##end of submitting query, receiving result url and storing results url in a file
+# end of submitting query, receiving result url and storing results url in a file

--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_mCSM_LIG_batch_outputurls_results.sh
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_mCSM_LIG_batch_outputurls_results.sh
@ -1,23 +1,21 @@
 #!/bin/bash
-#*************************************
-#need to be in the correct directory
-#*************************************
-##: comments for code
-#: commented out code

 #********************************************************************
 # TASK: submit result urls and fetch actual results using curl
-# iterate over each result url from the output of step1 in the stored
-# in file in /Results.
+# Iterate over each result url from the output of step1 stored in processed/
 # Use curl to fetch results and extract relevant sections using hxtools
-# and store these in another file in /Results 
-# This script takes two arguments:
-# 	input file: file containing results url
-#				In this case: 336_mCSM_lig_complex1_result_url.txt
-# 	output file: name of the file where extracted results will be stored
-#				In this case : it is 336_mCSM_lig_complex1_output_MASTER.txt
-#*********************************************************************
+# and store these in another file in processed/

+# Requirements:
+# input: output of step1, file containing result urls
+	# path: "Data/<drug>/input/processed/<filename>"
+# output: name of the file where extracted results will be stored
+	# path: "Data/<drug>/input/processed/<filename>"
+
+# Optional: can make these command line args you pass when calling script
+# by uncommenting code as indicated
+#*********************************************************************
+############################# uncomment: to make it command line args
 #if [ "$#" -ne 2 ]; then
  #if [ -Z $1 ]; then
 #  echo "
@ -32,11 +30,26 @@
 # Second argument: Output File
 #infile=$1
 #outfile=$2
+############################ end of code block to make command line args

-infile="${HOME}/git/LSHTM_analysis/mcsm_complex1/Results/336_mCSM_lig_complex1_result_url.txt"
-outfile="${HOME}/git/LSHTM_analysis/mcsm_complex1/Results/336_mCSM_lig_complex1_output_MASTER.txt"
+# specify variables for input and output paths and filenames
+inpath="${HOME}/git/Data/pyrazinamide/input"
+processed_path="/processed"
+infile="/mCSM_lig_complex1_result_url.txt"

-echo -n "Processing $(wc -l < ${infile}) entries from ${infile}"
+outpath="${inpath}${processed_path}"
+outfile="/mCSM_lig_complex1_output_MASTER.txt"
+
+# create valid input and output filenames
+filename="${inpath}${processed_path}${infile}"
+echo Input File is: ${filename}
+
+outfilename="${outpath}${outfile}"
+echo Output File will be: ${outfilename}
+
+# Iterate over each result url, and extract results using hxtools 
+# which nicely cleans and formats html
+echo -n "Processing $(wc -l < ${filename}) entries from ${infile}"
 echo
 COUNT=0
 while read -r line; do
@ -48,12 +61,13 @@ while read -r line; do
    | hxselect -c div.well \
    | sed -r -e 's/<[^>]*>//g' \
    | sed -re 's/ +//g' \
-    >> ${outfile}
-  #| tee -a ${outfile}
+    >> ${outfilename}
+  #| tee -a ${outfilename}
 #  echo -n '.'
-echo -e "Processing entry ${COUNT}/$(wc -l < ${infile})..."  
+echo -e "Processing entry ${COUNT}/$(wc -l < ${filename})..."  
  
-done < "${infile}"
+done < "${filename}"

 echo
 echo "Processing Complete"
+
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_mCSM_LIG_regex_output_formatting.sh
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_mCSM_LIG_regex_output_formatting.sh
@ -1,9 +1,4 @@
 #!/bin/bash
-#*************************************
-#need to be in the correct directory
-#*************************************
-##: comments for code
-#: commented out code

 #********************************************************************
 # TASK: Intermediate results processing
@ -11,15 +6,39 @@
 # format the file into two columns (col1: field_desc and col2: values)
 # However the section "PredictedAffinityChange:...." and 
 # "DUETstabilitychange:.." are split over multiple lines and 
-# prevent this from happening.Additionally there are other empty lines
+# prevent this from happening. Additionally there are other empty lines
 # that need to be omitted. In order to ensure these sections are not split
 # over multiple lines, this script is written.
-#*********************************************************************

-infile="../Results/336_mCSM_lig_complex1_output_processed.txt"
+# Requirements:
+# input: output of step2, file containing mcsm results as described above
+	# path: "Data/<drug>/input/processed/<filename>"
+# output: replaces file in place.
+# Therefore first create a copy of the input file
+# but rename it to remove the word "MASTER" and add the word "processed"
+# file format: .txt

-#sed -i '/PredictedAffinityChange:/ { N; N; N; N; s/\n//g;}' ${infile} \
-# | sed -i '/DUETstabilitychange:/ {x; N; N; s/\n//g; p;d;}' ${infile}
+# NOTE: This replaces the file in place!
+# the output is a txt file with no newlines and formatting 
+# to have the following format "<colname><:><value>"
+#***********************************************************************
+# specify variables for input and output paths and filenames
+inpath="${HOME}/git/Data/pyrazinamide/input"
+processed_path="/processed"
+
+# Create input file: copy and rename output file of step2
+oldfile="${inpath}${processed_path}/mCSM_lig_complex1_output_MASTER.txt"
+newfile="${inpath}${processed_path}/mCSM_lig_complex1_output_processed.txt"
+cp $oldfile $newfile
+
+#infile="../Results/336_mCSM_lig_complex1_output_processed.txt"
+infile="/mCSM_lig_complex1_output_processed.txt"
+filename="${inpath}${processed_path}${infile}"
+
+echo Input filename is : ${filename}
+
+#sed -i '/PredictedAffinityChange:/ { N; N; N; N; s/\n//g;}' ${filename} \
+# | sed -i '/DUETstabilitychange:/ {x; N; N; s/\n//g; p;d;}' ${filename}

 # Outputs records separated by a newline, that look something like this:
 # PredictedAffinityChange:-2.2log(affinityfoldchange)-Destabilizing
@ -36,7 +55,6 @@ infile="../Results/336_mCSM_lig_complex1_output_processed.txt"
 # (...etc)

 # This script brings everything in a convenient format for further processing in python.
-# bear in mind, this replaces the file in place, so make sure you retain a copy for your records
 sed -i '/PredictedAffinityChange/ {
 N
 N
@ -49,4 +67,4 @@ N
 N
 s/\n//g
 }
-/^$/d' ${infile}
+/^$/d' ${filename}