From f72e81664d3434d34c49be05162f7ef63ea259af Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Mon, 14 Sep 2020 15:17:49 +0100
Subject: [PATCH] added mutate.py and run_mutate.sh to create MSA aligbments
 for mutant sequences required to generate logoplot from sequence in R

---
 scripts/mutate.py     | 179 ++++++++++++++++++++++++++++++++++++++++++
 scripts/run_mutate.sh |  17 ++++
 2 files changed, 196 insertions(+)
 create mode 100644 scripts/mutate.py
 create mode 100644 scripts/run_mutate.sh

diff --git a/scripts/mutate.py b/scripts/mutate.py
new file mode 100644
index 0000000..fcabc70
--- /dev/null
+++ b/scripts/mutate.py
@@ -0,0 +1,179 @@
+#!/usr/bin/python
+from __future__ import print_function
+from Bio import SeqIO
+from Bio.Seq import Seq
+from collections import OrderedDict
+import sys
+import argparse
+# https://github.com/jrjhealey/bioinfo-tools/blob/master/Mutate.py
+# https://www.biostars.org/p/336891/
+
+# TODO:
+#  - create some logic to 'group' mutations that will be applied to the same sequence, to
+#    make all switches at once
+#    - This will also probably break the verbose transversion output so the maths will need replacing
+#  - Create the ability to support INDELS (will also require pairwise alignment so that
+#    hamming distances remain meaningful.
+
+
+def get_args():
+    """Parse command line arguments"""
+    desc = "Mutate fasta sequences based on a file of sequence mappings."
+    epi = (
+        "This script takes a mapfile of the form:\n"
+        " SequenceID,A123B\n"
+        " SequenceID,X456Y\n"
+        "And performs substitutions/mutations. At preset it only does one SNP per sequence.\n"
+    )
+
+    try:
+        parser = argparse.ArgumentParser(
+            description=desc, epilog=epi, formatter_class=argparse.RawTextHelpFormatter
+        )
+        parser.add_argument(
+            "mutation_file",
+            action="store",
+            help='File of mutation mappings like so: "SeqID,X123Y"',
+        )
+        parser.add_argument(
+            "sequences",
+            action="store",
+            help="File of sequences to be mutated (fasta only).",
+        )
+        parser.add_argument(
+            "-v",
+            "--verbose",
+            action="store_true",
+            help="Verbose behaviour, printing parameters of the script.",
+        )
+        parser.add_argument(
+            "-o",
+            "--outfile",
+            action="store",
+            help="Output file for mutated sequences (default STDOUT).",
+        )
+        if len(sys.argv) == 1:
+            parser.print_help(sys.stderr)
+            exit(1)
+    except:
+        sys.stderr.write(
+            "An exception occurred with argument parsing. Check your provided options.\n"
+        )
+
+    return parser.parse_args()
+
+
+class Mutation(object):
+    """A class wrapper for sequence IDs so that duplicate IDs can be used in a dictionary"""
+
+    def __init__(self, name):
+        self.name = name
+
+    def __repr__(self):
+        return "'" + self.name + "'"
+
+    def __str__(self):
+        return self.name
+
+
+def parse_mapfile(mapfile):
+    """Return a dict of mapped mutations.
+    File should resemble:
+     SequenceID,A123B
+     SequenceID2,X234Y
+    Sequence IDs should exactly match the fasta headers, as parsed by BioPython.
+    (">" symbols are optional)
+    """
+
+    with open(mapfile, "r") as handle:
+        mut_dict = OrderedDict()
+        for line in handle:
+            id, change = line.lstrip(">").rstrip("\n").split(",")
+            mut_dict[Mutation(id)] = change
+
+    for k, v in mut_dict.items():
+        assert v[0].isalpha(), (
+            "First character of mutation map is not a valid letter. Got: %s" % v[0]
+        )
+        assert v[-1].isalpha(), (
+            "Last character of mutation map is not a valid letter. Got: %s" % v[-1]
+        )
+        assert v[1:-1].isdigit(), (
+            "Location string of mutation map is not a valid number. Got: %s" % v[1:-1]
+        )
+
+    return mut_dict
+
+
+def morph(orig, loc, new, mutableseq, verbose):
+    """Perform actual sequence change (polymorphism only at present)"""
+    # Shift location to offset 0-based index
+    loc = loc - 1
+    assert mutableseq[loc] == orig, (
+        "Sequence does not match the mutation file for pre-exising residue. Expected %s , got %s "
+        % (orig, mutableseq[loc])
+    )
+    if verbose is True:
+        print(
+            "Performing change: {} -> {}, at location: {} (0 based)".format(
+                orig, new, loc
+            )
+        )
+    mutableseq[loc] = new
+    return mutableseq
+
+
+def hamming_distance(s1, s2):
+    """Return the Hamming distance between equal-length sequences"""
+    if len(s1) != len(s2):
+        raise ValueError("Undefined for sequences of unequal length")
+    return sum(ch1 != ch2 for ch1, ch2 in zip(s1.upper(), s2.upper()))
+
+
+def main():
+    args = get_args()
+    if args.outfile is not None:
+        ofh = open(args.outfile, "w")
+
+    # Parse the mutation file (get mutations by sequence)
+    mutations = parse_mapfile(args.mutation_file)
+    if args.verbose is True:
+        print("Got mutations:")
+        print(mutations)
+    # Iterate all sequences and make any substitutions necessary
+    for record in SeqIO.parse(args.sequences, "fasta"):
+        for k, v in mutations.items():
+            mutable = record.seq.upper().tomutable()
+#            mutable = record.seq.tomutable()
+#            print("MO:", mutable)
+            if k.name == record.id[0:4]: # BEWARE HARDCODING
+#                print("k.name:", k.name, "record.id:", record.id)
+                orig = v[0]
+                print("orig:", orig)
+                new = v[-1]
+                loc = int(v[1:-1])
+                if args.verbose:
+                    print(record.id)
+                newseq = morph(orig, loc, new, mutable, args.verbose)
+#                print("NS is:", newseq)
+                if args.verbose is True:
+                    print("Original: " + record.seq.upper())
+                    print(
+                        str((" " * int(loc - 2 + 11))) + "V"
+                    )  # Padded string to show where switch happened (not sure how it'll deal with line wrapping
+                    print("New:      " + newseq)
+                    print(
+                        "Distance: "
+                        + str(hamming_distance(str(record.seq), str(newseq)))
+                    )
+                if args.outfile is not None:
+                    ofh.write(">%s_%s\n%s\n" % (record.id, v, newseq))
+                if args.verbose is False:
+                    print(">%s_%s\n%s\n" % (record.id, v, newseq))
+
+    if args.outfile is not None:
+        ofh.close()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/run_mutate.sh b/scripts/run_mutate.sh
new file mode 100644
index 0000000..3de2210
--- /dev/null
+++ b/scripts/run_mutate.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+#=======================================================================
+#https://www.biostars.org/p/336891/
+#python Mutate.py -v -o /path/to/output.fasta mutation_map_file.csv input.fasta
+#=======================================================================
+
+
+# make sure there is no new line at the end of the mutation file (snps.csv)
+#python3 Mutate.py -v -o /home/tanu/git/Data/pyrazinamide/input/output.fasta mut_map.csv 3pl1.fasta.txt
+python3 mutate.py -v -o /home/tanu/git/Data/pyrazinamide/output/pnca_msa.txt /home/tanu/git/Data/pyrazinamide/output/pnca_all_muts_msa.csv /home/tanu/git/Data/pyrazinamide/input/3pl1.fasta.txt
+
+# remove fasta style header lines in the output i.e
+# lines beginning with '>' so the file is just the mutated seqs
+sed -i '/^>.*$/d' /home/tanu/git/Data/pyrazinamide/output/pnca_msa.txt
+printf 'No. of lines after cleaning: '
+cat /home/tanu/git/Data/pyrazinamide/output/pnca_msa.txt | wc -l