#!/bin/bash
#
# run_mutate2.sh - build per-gene mutation maps and run mutate.py on them.
#
# Usage: run_mutate2.sh [TARGETS_FILE]
#
#   TARGETS_FILE  whitespace-separated "<gene> <drug>" pairs, one per line
#                 (default: gene_targets_names.txt in the current directory).
#                 Blank lines and lines starting with '#' are ignored.
#   DATA_DIR      environment variable; root of the data tree
#                 (default: /home/tanu/git/Data).
#
# For every <gene> <drug> pair the script reads
#   ${DATA_DIR}/<drug>/output/<gene>_metadata.csv
#   ${DATA_DIR}/<drug>/input/<gene>_f2.fasta
# and writes
#   ${DATA_DIR}/<drug>/output/<gene>_metadata_mut_count.csv
#   ${DATA_DIR}/<drug>/output/<gene>_msa_map.csv
#   ${DATA_DIR}/<drug>/output/<gene>_msa_interim.csv   (via mutate.py -o)
# mutate.py itself is looked up in the current directory.
#
# Provenance: recovered from a patch whose line breaks had been lost.
#   Commit:  af04c69d661a201b8811438bc6681a24b57fadb7
#   Author:  Tanushree Tunstall
#   Date:    Sun, 16 Jan 2022 18:34:49 +0000
#   Subject: A MAAAADDD MAAADDD DAYYYYY,messy embb numbering agrrrhhhhh
#   Files:   scripts/gene_targets_names.txt (new, 7 lines)
#            scripts/run_mutate2.sh         (new, 65 lines, mode 100755)
#
# scripts/gene_targets_names.txt as added by that commit (the seventh line
# is blank, which is why blank lines must be skipped below):
#   embb ethambutol
#   rpob rifampicin
#   alr cycloserine
#   katg isoniazid
#   pnca pyrazinamide
#   gid streptomycin
#
# Metadata files edited:
#   rpob: remove positions above 1151
#   embb: remove positions above 1054
#=======================================================================
#https://www.biostars.org/p/336891/
#python Mutate.py -v -o /path/to/output.fasta mutation_map_file.csv input.fasta
#=======================================================================

# -e: stop on the first failing command; -u: unset variables are errors;
# pipefail: a failure in any pipeline stage fails the whole pipeline.
set -euo pipefail

readonly DATA_DIR="${DATA_DIR:-/home/tanu/git/Data}"
readonly DEFAULT_TARGETS_FILE="gene_targets_names.txt"

#######################################
# Print an error message to stderr and abort the script.
# Arguments: $* - message text
# Returns:   never; exits with status 1
#######################################
die() {
  printf 'run_mutate2: %s\n' "$*" >&2
  exit 1
}

#######################################
# Emit the last comma-separated field of every row after the header row,
# sorted with the current locale's collation.
# The split is a plain split on ',', so a quoted field that itself
# contains a comma is cut at that comma.
# Arguments: $1 - path to a metadata CSV
# Outputs:   one value per line on stdout
#######################################
sorted_mutations() {
  local metadata_csv=$1
  # Input is redirected rather than passed as an operand so that a path
  # containing '=' cannot be mistaken by awk for a variable assignment.
  awk -F, 'NR > 1 { print $NF }' < "$metadata_csv" | sort
}

#######################################
# Build the mutation count and mutation map for one gene, then run
# mutate.py on the map.
# Globals:   DATA_DIR (read)
# Arguments: $1 - gene name, $2 - drug name (directory under DATA_DIR)
# Outputs:   progress messages on stdout; files listed in the file header
# Returns:   non-zero (aborting the script under set -e) if any step fails
#######################################
process_gene() {
  local gene=$1
  local drug=$2
  local out_dir="${DATA_DIR}/${drug}/output"
  local metadata="${out_dir}/${gene}_metadata.csv"
  local mut_count="${out_dir}/${gene}_metadata_mut_count.csv"
  local msa_map="${out_dir}/${gene}_msa_map.csv"
  local msa_interim="${out_dir}/${gene}_msa_interim.csv"
  local fasta="${DATA_DIR}/${drug}/input/${gene}_f2.fasta"

  echo "vvvvvvvvvvvvvvvvvvvvv"
  echo "gene: $gene drug: $drug"
  echo "Source File: $metadata"
  # Doubles as the existence check: a missing file stops the run here.
  wc -l -- "$metadata"

  # Occurrences per distinct mutation (double quotes left as found).
  sorted_mutations "$metadata" | uniq -c > "$mut_count"

  # Mutation map: strip double quotes and prefix every row with "<gene>,".
  # Per the original author's note, this prefix must match the FASTA
  # file header.
  sorted_mutations "$metadata" \
    | awk -v prefix="${gene}," '{ gsub(/"/, ""); print prefix $0 }' \
    > "$msa_map"

  echo "Output to: $msa_map"
  wc -l -- "$msa_map"
  echo "^^^^^^^^^^^^^^^^^^^^^"
  echo
  echo "Running mutate.py on data file $msa_map"
  python3 mutate.py -v -o "$msa_interim" "$msa_map" "$fasta"
  echo "mutate.py completed"
  echo
}

#######################################
# Entry point: process every gene/drug pair listed in the targets file.
# Arguments: $1 - optional targets file (default: DEFAULT_TARGETS_FILE)
# Returns:   0 when every pair was processed; otherwise the script exits
#            non-zero at the first failure
#######################################
main() {
  local targets_file=${1:-$DEFAULT_TARGETS_FILE}
  local gene drug

  echo "Processing Mutation Files"
  echo "-------------------------"
  echo

  [[ -r "$targets_file" ]] || die "cannot read targets file: $targets_file"

  # The list is read on fd 3 so that no command in the loop body can
  # consume it through stdin. The "|| [[ -n ... ]]" clause keeps a final
  # line that has no trailing newline.
  while read -r -u 3 gene drug || [[ -n "$gene" ]]; do
    # Skip blank lines (the shipped targets file ends with one) and
    # comment lines.
    if [[ -z "$gene" || "$gene" == '#'* ]]; then
      continue
    fi
    [[ -n "$drug" ]] || die "no drug listed for gene '$gene' in $targets_file"
    process_gene "$gene" "$drug"
  done 3< "$targets_file"
}

main "$@"

#=======================================================================
# Reference notes - never executed.
# In the original script these commands followed a bare `exit` ("Stop here
# so we don't run the examples below :)"). They record the manual
# single-gene (gid) workflow and are kept as comments so that they cannot
# run by accident.
#
# make sure there is no new line at the end of the mutation file (snps.csv)
# check
#   cat output/gid_metadata.csv | rev| cut -d, -f1 |rev | tail -n +2 |sort | head
#
#   cat output/gid_metadata.csv | rev| cut -d, -f1 |rev | tail -n +2 |sort | uniq -c > output/gid_metadata_mut_count.csv
#
#   cat output/gid_metadata.csv | rev| cut -d, -f1 |rev | tail -n +2 |sort > gid_msa_snp.csv
#   sed -i 's/^/gid,/' gid_msa_snp.csv
#
#cp gid_msa_snp.csv gid_mut_map.csv
#%%
# Date: 16/01/22
# pre processing
# NOTE(review): the next command reads gene_msa_snp.csv, while the step
# above writes gid_msa_snp.csv - looks like a typo; confirm.
#   sed 's/"//g' gene_msa_snp.csv > gid_mut_map.csv
#
# mut prefix for mutation map file MUST match fasta file header
#   python3 mutate.py -v -o /home/tanu/git/Data/streptomycin/output/TEST2.csv /home/tanu/git/Data/streptomycin/output/gid_mut_map.csv /home/tanu/git/Data/streptomycin/input/gid2.fasta
#
#   wc -l TEST2.csv
#
# post processing
#   sed -E 's/>.*//g' TEST2.csv | sed '/^$/d' > TEST3.csv
#   wc -l TEST3.csv
#
#==============================================