Relocate pyrazinamide mCSM pipeline inputs and outputs to Data/pyrazinamide/input and build file names from path variables in step0, step1, step2 and step3a.
diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step0_check_duplicate_SNPs.sh b/mcsm_analysis/pyrazinamide/scripts/mcsm/step0_check_duplicate_SNPs.sh index 22fada7..4c24392 100755 --- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step0_check_duplicate_SNPs.sh +++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step0_check_duplicate_SNPs.sh @@ -11,8 +11,8 @@ # per line. Sort by unique, which automatically removes duplicates. # sace file in current directory #********************************************************************** -infile="${HOME}/git/Data/input/processed/pyrazinamide/pnca_mis_SNPs_v2.csv" -outfile="${HOME}/git/Data/input/processed/pyrazinamide/pnca_mis_SNPs_v2_unique.csv" +infile="${HOME}/git/Data/pyrazinamide/input/processed/pnca_mis_SNPs_v2.csv" +outfile="${HOME}/git/Data/pyrazinamide/input/processed/pnca_mis_SNPs_v2_unique.csv" # sort unique entries and output to current directory sort -u ${infile} > ${outfile} diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step1_mCSMLig_curl_submit_store_outputurl.sh b/mcsm_analysis/pyrazinamide/scripts/mcsm/step1_mCSMLig_curl_submit_store_outputurl.sh index 87f4265..faf0b7d 100755 --- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step1_mCSMLig_curl_submit_store_outputurl.sh +++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step1_mCSMLig_curl_submit_store_outputurl.sh @@ -1,30 +1,49 @@ #!/bin/bash -#************************************* -#need to be in the correct directory -#************************************* -##: comments for code -#: commented out code - #********************************************************************** # TASK: submit requests using curl: HANDLE redirects and refresh url. # Iterate over mutation file and write/append result urls to a file -# result url file: stored in the /Results directory -# mutation file: one mutation per line, no chain ID -# output: in a file, should be n urls (n=no. 
of mutations in file) +# Mutation file must have one mutation (format A1B) per line +# Requirements +# input: mutation list (format: A1B), complex struc: (pdb format) + # mutation: outFile from step0, one unique mutation/line, no chain ID + # path: "Data/<drug>/input/processed/" + # structure: pdb file of drug-target complex + # path: "Data/<drug>/input/structure/" +# output: should be n urls (n=no. of unique mutations in file) + # path: "Data/<drug>/input/processed/" + # NOTE: these are just result urls, not actual values for results #********************************************************************** -## iterate over mutation file; line by line and submit query using curl -filename="../Data/pnca_mis_SNPs_v2_unique.csv" +# specify variables for input and output paths and filenames -## some useful messages -echo -n -e "Processing $(wc -l < ${filename}) entries from ${filename}\n" +inpath="${HOME}/git/Data/pyrazinamide/input" +processed_path="/processed" +struc_path="/structure" +infile_mut="/pnca_mis_SNPs_v2_unique.csv" +infile_struc="/complex1_no_water.pdb" + +outpath="${inpath}${processed_path}" +outfile="/mCSM_lig_complex1_result_url.txt" + +# create valid input and output filenames +#filename="${HOME}/git/Data/pyrazinamide/input/processed/pnca_mis_SNPs_v2_unique.csv" +filename="${inpath}${processed_path}${infile_mut}" +echo "Input File is: ${filename}" + +outfilename="${outpath}${outfile}" +echo "Output File will be: ${outfilename}" + +# iterate over mutation file; line by line and submit query using curl +# some useful messages +echo -n -e "Processing $(wc -l < "${filename}") entries from ${infile_mut}\n" COUNT=0 while read -r line; do ((COUNT++)) - mutation="${line}" +mutation="${line}" # echo "${mutation}" -pdb='../Data/complex1_no_water.pdb' +#pdb='../Data/complex1_no_water.pdb' +pdb="${inpath}${struc_path}${infile_struc}" mutation="${mutation}" chain="A" lig_id="PZA" @@ -49,24 +68,31 @@ refresh_url=$(curl -L \ -F "affin_wt=${affin_wt}" \ ${host}${call_url} | grep "http-equiv") 
-#echo $refresh_url -#echo ${host}${refresh_url} +#echo Refresh URL: $refresh_url +#echo Host+Refresh: ${host}${refresh_url} -#use regex to extract the relevant bit from the refresh url -#regex:sed -r 's/.*(\/mcsm.*)".*$/\1/g' +# use regex to extract the relevant bit from the refresh url +# regex:sed -r 's/.*(\/mcsm.*)".*$/\1/g' -#Now build: result url using host and refresh url and write the urls to a file in the Results dir +# Now build: result url using host and refresh url and write the urls to a file result_url=$(echo $refresh_url | sed -r 's/.*(\/mcsm.*)".*$/\1/g') sleep 10 echo -e "${mutation} : processing entry ${COUNT}/$(wc -l < ${filename})..." -echo -e "${host}${result_url}" >> ../Results/$(wc -l < ${filename})_mCSM_lig_complex1_result_url.txt +# create output file with the added number of muts from file +# after much thought, bad idea as less generic! +#echo -e "${host}${result_url}" >> ../Results/$(wc -l < ${filename})_mCSM_lig_complex1_result_url.txt +echo -e "${host}${result_url}" >> "${outfilename}" #echo -n '.' 
done < "${filename}" +echo +echo "Output filename: ${outfilename}" +echo +echo "Number of urls saved: $(wc -l < "${outfilename}")" echo echo "Processing Complete" -##end of submitting query, receiving result url and storing results url in a file +# end of submitting query, receiving result url and storing results url in a file diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_mCSM_LIG_batch_outputurls_results.sh b/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_mCSM_LIG_batch_outputurls_results.sh index e250fe8..717c1aa 100755 --- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_mCSM_LIG_batch_outputurls_results.sh +++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_mCSM_LIG_batch_outputurls_results.sh @@ -1,23 +1,21 @@ #!/bin/bash -#************************************* -#need to be in the correct directory -#************************************* -##: comments for code -#: commented out code #******************************************************************** # TASK: submit result urls and fetch actual results using curl -# iterate over each result url from the output of step1 in the stored -# in file in /Results. 
+# Iterate over each result url from the output of step1 stored in processed/ # Use curl to fetch results and extract relevant sections using hxtools -# and store these in another file in /Results -# This script takes two arguments: -# input file: file containing results url -# In this case: 336_mCSM_lig_complex1_result_url.txt -# output file: name of the file where extracted results will be stored -# In this case : it is 336_mCSM_lig_complex1_output_MASTER.txt -#********************************************************************* +# and store these in another file in processed/ +# Requirements: +# input: output of step1, file containing result urls + # path: "Data/<drug>/input/processed/" +# output: name of the file where extracted results will be stored + # path: "Data/<drug>/input/processed/" + +# Optional: can make these command line args you pass when calling script +# by uncommenting code as indicated +#********************************************************************* +############################# uncomment: to make it command line args #if [ "$#" -ne 2 ]; then #if [ -Z $1 ]; then # echo " @@ -32,11 +30,26 @@ # Second argument: Output File #infile=$1 #outfile=$2 +############################ end of code block to make command line args -infile="${HOME}/git/LSHTM_analysis/mcsm_complex1/Results/336_mCSM_lig_complex1_result_url.txt" -outfile="${HOME}/git/LSHTM_analysis/mcsm_complex1/Results/336_mCSM_lig_complex1_output_MASTER.txt" +# specify variables for input and output paths and filenames +inpath="${HOME}/git/Data/pyrazinamide/input" +processed_path="/processed" +infile="/mCSM_lig_complex1_result_url.txt" -echo -n "Processing $(wc -l < ${infile}) entries from ${infile}" +outpath="${inpath}${processed_path}" +outfile="/mCSM_lig_complex1_output_MASTER.txt" + +# create valid input and output filenames +filename="${inpath}${processed_path}${infile}" +echo "Input File is: ${filename}" + +outfilename="${outpath}${outfile}" +echo "Output File will be: ${outfilename}" + +# 
Iterate over each result url, and extract results using hxtools +# which nicely cleans and formats html +echo -n "Processing $(wc -l < "${filename}") entries from ${infile}" echo COUNT=0 while read -r line; do @@ -48,12 +61,13 @@ while read -r line; do | hxselect -c div.well \ | sed -r -e 's/<[^>]*>//g' \ | sed -re 's/ +//g' \ - >> ${outfile} - #| tee -a ${outfile} + >> "${outfilename}" + #| tee -a ${outfilename} # echo -n '.' -echo -e "Processing entry ${COUNT}/$(wc -l < ${infile})..." +echo -e "Processing entry ${COUNT}/$(wc -l < "${filename}")..." -done < "${infile}" +done < "${filename}" echo echo "Processing Complete" + diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_mCSM_LIG_regex_output_formatting.sh b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_mCSM_LIG_regex_output_formatting.sh index 78dbdf5..0b743fe 100755 --- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_mCSM_LIG_regex_output_formatting.sh +++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_mCSM_LIG_regex_output_formatting.sh @@ -1,9 +1,4 @@ #!/bin/bash -#************************************* -#need to be in the correct directory -#************************************* -##: comments for code -#: commented out code #******************************************************************** # TASK: Intermediate results processing @@ -11,15 +6,39 @@ # format the file into two columns (col1: field_desc and col2: values) # However the section "PredictedAffinityChange:...." and # "DUETstabilitychange:.." are split over multiple lines and -# prevent this from happening.Additionally there are other empty lines +# prevent this from happening. Additionally there are other empty lines # that need to be omiited. In order ensure these sections are not split # over multiple lines, this script is written. 
-#********************************************************************* -infile="../Results/336_mCSM_lig_complex1_output_processed.txt" +# Requirements: +# input: output of step2, file containing mcsm results as described above + # path: "Data/<drug>/input/processed/" +# output: replaces file in place. +# Therefore first create a copy of the input file +# but rename it to remove the word "MASTER" and add the word "processed" +# file format: .txt -#sed -i '/PredictedAffinityChange:/ { N; N; N; N; s/\n//g;}' ${infile} \ -# | sed -i '/DUETstabilitychange:/ {x; N; N; s/\n//g; p;d;}' ${infile} +# NOTE: This replaces the file in place! +# the output is a txt file with no newlines and formatting +# to have the following format "<field>:<value>" +#*********************************************************************** +# specify variables for input and output paths and filenames +inpath="${HOME}/git/Data/pyrazinamide/input" +processed_path="/processed" + +# Create input file: copy and rename output file of step2 +oldfile="${inpath}${processed_path}/mCSM_lig_complex1_output_MASTER.txt" +newfile="${inpath}${processed_path}/mCSM_lig_complex1_output_processed.txt" +cp -- "${oldfile}" "${newfile}" || exit 1 + +#infile="../Results/336_mCSM_lig_complex1_output_processed.txt" +infile="/mCSM_lig_complex1_output_processed.txt" +filename="${inpath}${processed_path}${infile}" + +echo "Input filename is : ${filename}" + +#sed -i '/PredictedAffinityChange:/ { N; N; N; N; s/\n//g;}' ${filename} \ +# | sed -i '/DUETstabilitychange:/ {x; N; N; s/\n//g; p;d;}' ${filename} # Outputs records separated by a newline, that look something like this: # PredictedAffinityChange:-2.2log(affinityfoldchange)-Destabilizing @@ -36,7 +55,6 @@ infile="../Results/336_mCSM_lig_complex1_output_processed.txt" # (...etc) # This script brings everything in a convenient format for further processing in python. 
-# bear in mind, this replaces the file in place, so make sure you retain a copy for your records sed -i '/PredictedAffinityChange/ { N N @@ -49,4 +67,4 @@ N N s/\n//g } -/^$/d' ${infile} +/^$/d' "${filename}"