Source code for pydna.dseqrecord

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright 2013-2020 by Björn Johansson.  All rights reserved.
# This code is part of the Python-dna distribution and governed by its
# license.  Please see the LICENSE.txt file that should have been included
# as part of this package.
"""This module provides the :class:`Dseqrecord` class, for handling double stranded
DNA sequences. The Dseqrecord holds sequence information in the form of a :class:`pydna.dseq.Dseq`
object. The Dseq and Dseqrecord classes are subclasses of Biopythons
Seq and SeqRecord classes, respectively.

The Dseq and Dseqrecord classes support the notion of circular and linear DNA topology.
"""
from Bio.Restriction import RestrictionBatch as _RestrictionBatch
from Bio.Restriction import CommOnly
from pydna.dseq import Dseq as _Dseq
from pydna._pretty import pretty_str as _pretty_str
from pydna.utils import flatten as _flatten
from pydna.utils import memorize as _memorize
from pydna.utils import rc as _rc
from pydna.common_sub_strings import common_sub_strings as _common_sub_strings
from pydna.seqfeature import SeqFeature as _SeqFeature
from Bio import SeqIO
from Bio.SeqFeature import CompoundLocation as _CompoundLocation
from Bio.SeqFeature import FeatureLocation as _FeatureLocation
from pydna.seqrecord import SeqRecord as _SeqRecord
from Bio.Seq import translate as _translate
from pydna.utils import identifier_from_string as _identifier_from_string
import copy as _copy
import operator as _operator
import os as _os
import re as _re
import time as _time
import datetime as _datetime

import logging as _logging

_module_logger = _logging.getLogger("pydna." + __name__)


try:
    from IPython.display import display as _display
except ImportError:
    def _display_html(item, raw=None):
        return item
else:
    from IPython.display import display_html as _display_html


[docs]class Dseqrecord(_SeqRecord):
    """Dseqrecord is a double stranded version of the Biopython SeqRecord [#]_ class.
    The Dseqrecord object holds a Dseq object describing the sequence.
    Additionally, Dseqrecord hold meta information about the sequence in the
    from of a list of SeqFeatures, in the same way as the SeqRecord does.

    The Dseqrecord can be initialized with a string, Seq, Dseq, SeqRecord
    or another Dseqrecord. The sequence information will be stored in a
    Dseq object in all cases.

    Dseqrecord objects can be read or parsed from sequences in FASTA, EMBL or Genbank formats.
    See the :mod:`pydna.readers` and :mod:`pydna.parsers` modules for further information.

    There is a short representation associated with the Dseqrecord.
    ``Dseqrecord(-3)`` represents a linear sequence of length 2
    while ``Dseqrecord(o7)``
    represents a circular sequence of length 7.

    Dseqrecord and Dseq share the same concept of length. This length can be larger
    than each strand alone if they are staggered as in the example below.

    ::

        <-- length -->
        GATCCTTT
             AAAGCCTAG




    Parameters
    ----------
    record  : string, Seq, SeqRecord, Dseq or other Dseqrecord object
        This data will be used to form the seq property

    circular : bool, optional
        True or False reflecting the shape of the DNA molecule

    linear : bool, optional
        True or False reflecting the shape of the DNA molecule


    Examples
    --------

    >>> from pydna.dseqrecord import Dseqrecord
    >>> a=Dseqrecord("aaa")
    >>> a
    Dseqrecord(-3)
    >>> a.seq
    Dseq(-3)
    aaa
    ttt
    >>> from pydna.seq import Seq
    >>> b=Dseqrecord(Seq("aaa"))
    >>> b
    Dseqrecord(-3)
    >>> b.seq
    Dseq(-3)
    aaa
    ttt
    >>> from Bio.SeqRecord import SeqRecord
    >>> c=Dseqrecord(SeqRecord(Seq("aaa")))
    >>> c
    Dseqrecord(-3)
    >>> c.seq
    Dseq(-3)
    aaa
    ttt

    References
    ----------

    .. [#] http://biopython.org/wiki/SeqRecord

    """

    def __init__(self,
                 record,
                 *args,
                 linear=None,
                 circular=None,
                 n=5e-14,  # mol ( = 0.05 pmol)
                 **kwargs):

        _module_logger.info("### Dseqrecord initialized ###")
        _module_logger.info("argument linear = %s", linear)
        _module_logger.info("argument circular = %s", circular)

        if not (linear is None and circular is None):
            circular = (
                bool(circular)
                and bool(linear) ^ bool(circular)
                or linear is False
                and circular is None
            )
            linear = not circular

        _module_logger.info("linear = %s", linear)
        _module_logger.info("circular = %s", circular)

        if isinstance(record, str):
            _module_logger.info("record is a string")
            super().__init__(
                _Dseq(record, linear=linear, circular=circular),
                *args, **kwargs)

        # record is a Dseq object ?
        elif hasattr(record, "watson"):
            if record.circular and linear:
                record = record[:]
            elif record.linear and circular:
                record = record.looped()
            _module_logger.info("record is a Dseq object")
            super().__init__(record, *args, **kwargs)

        # record is a Bio.Seq object ?
        elif hasattr(record, "transcribe"):
            _module_logger.info("record is a Seq object")
            super().__init__(
                _Dseq(str(record), linear=linear, circular=circular),
                *args, **kwargs)

        # record is a Bio.SeqRecord or Dseqrecord object ?
        elif hasattr(record, "features"):
            _module_logger.info("record is a Bio.SeqRecord or Dseqrecord object")
            for key, value in list(record.__dict__.items()):
                setattr(self, key, value)
            record.letter_annotations = {}
            # record.seq is a Dseq object ?
            if hasattr(record.seq, "watson"):
                new_seq = _copy.copy(record.seq)
                if new_seq.circular and linear:
                    new_seq = new_seq[:]
                elif new_seq.linear and circular:
                    new_seq = new_seq.looped()
                self.seq = new_seq
            # record.seq is Bio.SeqRecord object ?
            else:
                self.seq = _Dseq(str(self.seq), linear=linear,
                                 circular=circular)
        else:
            raise ValueError("don't know what to do with {}".format(record))

        self.map_target = None
        self.n = n  # amount, set to 5E-14 which is 5 pmols
        self.annotations.update({"molecule_type": "DNA"})

[docs]    @classmethod
    def from_string(cls,
                    record: str = "",
                    *args,
                    linear=True,
                    circular=False,
                    n=5e-14,
                    **kwargs):
        """docstring."""
        # def from_string(cls, record:str="", *args,
        # linear=True, circular=False, n = 5E-14, **kwargs):
        obj = cls.__new__(cls)  # Does not call __init__
        obj._seq = _Dseq.quick(record,
                               _rc(record),
                               ovhg=0,
                               linear=linear,
                               circular=circular)
        obj.id = _pretty_str("id")
        obj.name = _pretty_str("name")
        obj.description = _pretty_str("description")
        obj.dbxrefs = []
        obj.annotations = {"molecule_type": "DNA"}
        obj._per_letter_annotations = {}
        obj.features = []
        obj.map_target = None
        obj.n = n
        obj.__dict__.update(kwargs)
        return obj

[docs]    @classmethod
    def from_SeqRecord(
        cls, record: _SeqRecord, *args, linear=True, circular=False, n=5e-14, **kwargs
    ):
        obj = cls.__new__(cls)  # Does not call __init__
        obj._seq = _Dseq.quick(
            str(record.seq),
            _rc(str(record.seq)),
            ovhg=0,
            linear=linear,
            circular=circular,
        )
        obj.id = record.id
        obj.name = record.name
        obj.description = record.description
        obj.dbxrefs = record.dbxrefs
        obj.annotations = {"molecule_type": "DNA"}
        obj.annotations.update(record.annotations)
        obj._per_letter_annotations = record._per_letter_annotations
        obj.features = record.features
        obj.map_target = None
        obj.n = n
        return obj

    @property
    def linear(self):
        """The linear property can not be set directly.
        Use :meth:`looped` or :meth:`tolinear`"""
        return self.seq.linear

    @property
    def circular(self):
        """The circular property can not be set directly.
        Use :meth:`looped` or :meth:`tolinear`"""
        return self.seq.circular

[docs]    def m(self):
        """This method returns the mass of the DNA molecule in grams. This is
        calculated as the product between the molecular weight of the Dseq object
        and the"""
        return self.seq.mw() * self.n  # Da(g/mol) * mol = g

[docs]    def extract_feature(self, n):
        """Extracts a feature and creates a new Dseqrecord object.

        Parameters
        ----------
        n  : int
            Indicates the feature to extract

        Examples
        --------
        >>> from pydna.dseqrecord import Dseqrecord
        >>> a=Dseqrecord("atgtaa")
        >>> a.add_feature(2,4)
        >>> b=a.extract_feature(0)
        >>> b
        Dseqrecord(-2)
        >>> b.seq
        Dseq(-2)
        gt
        ca

        """
        return super().extract_feature(n)

[docs]    def useguid(self):
        """Url safe SEGUID for the sequence.

        This checksum is the same as seguid but with base64.urlsafe
        encoding instead of the normal base64. This means that
        the characters + and / are replaced with - and _ so that
        the checksum can be part of a URL.

        Examples
        --------
        >>> from pydna.dseqrecord import Dseqrecord
        >>> a = Dseqrecord("aa")
        >>> a.useguid() # useguid is gBw0Jp907Tg_yX3jNgS4qQWttjU
        'gBw0Jp907Tg_yX3jNgS4qQWttjU'

        """
        return self.seq.useguid()

[docs]    def cseguid(self):
        """Url safe cSEGUID for a circular sequence.

        Only defined for circular sequences.
        The cSEGUID checksum uniquely identifies a circular
        sequence regardless of where the origin is set.
        The two Dseqrecord objects below are circular
        permutations.

        Examples
        --------
        >>> from pydna.dseqrecord import Dseqrecord
        >>> a=Dseqrecord("agtatcgtacatg", circular=True)
        >>> a.cseguid() # cseguid is CTJbs6Fat8kLQxHj+/SC0kGEiYs
        'CTJbs6Fat8kLQxHj-_SC0kGEiYs'

        >>> a=Dseqrecord("gagtatcgtacat", circular=True)
        >>> a.cseguid()
        'CTJbs6Fat8kLQxHj-_SC0kGEiYs'

        """
        if self.linear:
            raise TypeError("cseguid is only defined for circular sequences.")
        return self.seq.cseguid()

[docs]    def lseguid(self):
        """Url safe lSEGUID for a linear sequence.

        Only defined for linear double stranded sequences.

        The lSEGUID checksum uniquely identifies a linear
        sequence independent of its representation.
        The two Dseqrecord objects below are each others
        reverse complements, so they do in fact refer to
        the same molecule.

        Examples
        --------
        >>> from pydna.dseqrecord import Dseqrecord
        >>> a=Dseqrecord("agtatcgtacatg")
        >>> a.lseguid()
        'DPshMN4KeAjMovEjGEV4Kzj18lU'

        >>> b=Dseqrecord("catgtacgatact")
        >>> a.lseguid()
        'DPshMN4KeAjMovEjGEV4Kzj18lU'

        """
        if self.circular:
            raise TypeError("lseguid is only defined for linear sequences.")
        return self.seq.lseguid()

[docs]    def looped(self):
        """
        Circular version of the Dseqrecord object.

        The underlying linear Dseq object has to have compatible ends.

        Examples
        --------
        >>> from pydna.dseqrecord import Dseqrecord
        >>> a=Dseqrecord("aaa")
        >>> a
        Dseqrecord(-3)
        >>> b=a.looped()
        >>> b
        Dseqrecord(o3)
        >>>

        See Also
        --------
        pydna.dseq.Dseq.looped
        """
        new = _copy.copy(self)
        # for key, value in list(self.__dict__.items()):
        #     setattr(new, key, value)
        new._seq = self.seq.looped()
        five_prime = self.seq.five_prime_end()
        for fn, fo in zip(new.features, self.features):
            if five_prime[0] == "5'":
                pass
                # fn.location = fn.location + self.seq.ovhg
            elif five_prime[0] == "3'":
                fn.location = fn.location + (-self.seq.ovhg)
            if fn.location.start < 0:
                loc1 = _FeatureLocation(len(new) + fn.location.start,
                                        len(new), strand=fn.strand)
                loc2 = _FeatureLocation(0, fn.location.end, strand=fn.strand)
                fn.location = _CompoundLocation([loc1, loc2])

            if fn.location.end > len(new):
                loc1 = _FeatureLocation(fn.location.start,
                                        len(new), strand=fn.strand)
                loc2 = _FeatureLocation(0, fn.location.end - len(new),
                                        strand=fn.strand)
                fn.location = _CompoundLocation([loc1, loc2])

            fn.qualifiers = fo.qualifiers
        # breakpoint()
        return new

[docs]    def tolinear(self):  # pragma: no cover
        """
        Returns a linear, blunt copy of a circular Dseqrecord object. The
        underlying Dseq object has to be circular.

        This method is deprecated, use slicing instead. See example below.

        Examples
        --------
        >>> from pydna.dseqrecord import Dseqrecord
        >>> a=Dseqrecord("aaa", circular = True)
        >>> a
        Dseqrecord(o3)
        >>> b=a[:]
        >>> b
        Dseqrecord(-3)
        >>>

        """
        import warnings as _warnings
        from pydna import _PydnaDeprecationWarning

        _warnings.warn(
            "tolinear method is obsolete; "
            "please use obj[:] "
            "instead of obj.tolinear().",
            _PydnaDeprecationWarning,
        )
        new = _copy.copy(self)
        for key, value in list(self.__dict__.items()):
            setattr(new, key, value)
        # new._seq = self.seq.tolinear()
        for fn, fo in zip(new.features, self.features):
            fn.qualifiers = fo.qualifiers

        return new

[docs]    def format(self, f="gb"):
        """Returns the sequence as a string using a format supported by Biopython
        SeqIO [#]_. Default is "gb" which is short for Genbank.

        Examples
        --------
        >>> from pydna.dseqrecord import Dseqrecord
        >>> x=Dseqrecord("aaa")
        >>> x.annotations['date'] = '02-FEB-2013'
        >>> x
        Dseqrecord(-3)
        >>> print(x.format("gb"))
        LOCUS       name                       3 bp    DNA     linear   UNK 02-FEB-2013
        DEFINITION  description.
        ACCESSION   id
        VERSION     id
        KEYWORDS    .
        SOURCE      .
          ORGANISM  .
                    .
        FEATURES             Location/Qualifiers
        ORIGIN
                1 aaa
        //


        References
        ----------

        .. [#] http://biopython.org/wiki/SeqIO


        """

        s = super().format(f).strip()

        if f in ("genbank", "gb"):
            if self.circular:
                return _pretty_str(s[:55] + "circular" + s[63:])
            else:
                return _pretty_str(s[:55] + "linear  " + s[63:])
        else:
            return _pretty_str(s).strip()


[docs]    def write(self, filename=None, f="gb"):
        """Writes the Dseqrecord to a file using the format f, which must
        be a format supported by Biopython SeqIO for writing [#]_. Default
        is "gb" which is short for Genbank. Note that Biopython SeqIO reads
        more formats than it writes.

        Filename is the path to the file where the sequece is to be
        written. The filename is optional, if it is not given, the
        description property (string) is used together with the format.

        If obj is the Dseqrecord object, the default file name will be:

        ``<obj.locus>.<f>``

        Where <f> is "gb" by default. If the filename already exists and
        AND the sequence it contains is different, a new file name will be
        used so that the old file is not lost:

        ``<obj.locus>_NEW.<f>``

        References
        ----------
        .. [#] http://biopython.org/wiki/SeqIO

        """
        msg = ""
        if not filename:
            filename = "{name}.{type}".format(name=self.locus, type=f)
            # generate a name if no name was given
        if not isinstance(filename, str):  # is filename a string???
            raise ValueError("filename has to be a string, got", type(filename))
        name, ext = _os.path.splitext(filename)
        msg = f"<font face=monospace><a href='{filename}' target='_blank'>{filename}</a></font><br>"
        if not _os.path.isfile(filename):
            with open(filename, "w", encoding="utf8") as fp:
                fp.write(self.format(f))
        else:
            from pydna.readers import read
            old_file = read(filename)

            if self.seq != old_file.seq:
                # If new sequence is different, the old file is
                # renamed with "_OLD_" suffix:
                oldmtime = _datetime.datetime.fromtimestamp(_os.path.getmtime(filename)).isoformat()
                tstmp = int(_time.time() * 1_000_000)
                old_filename = f"{name}_OLD_{tstmp}{ext}"
                _os.rename(filename, old_filename)
                newcseguid = self.cseguid() if self.circular else "na"
                oldcseguid = old_file.cseguid() if old_file.circular else "na"
                with open(filename, "w", encoding="utf8") as fp:
                    fp.write(self.format(f))
                newmtime = _datetime.datetime.fromtimestamp(_os.path.getmtime(filename)).isoformat()
                msg = f"""
                <table style="padding:10px 10px;
                word-break:normal;
                border-color:#fe0000;
                border-collapse:collapse;
                border-spacing:1;
                font-family:monospace;
                font-size:large;
                font-weight:bold;
                text-align:left;
                border: 5px solid red;">
                <thead>
                  <tr style="color:#0000FF;border: 1px solid;text-align:left;">
                    <th style="color:#fe0000;border: 1px solid;text-align:center;font-size:xxx-large;text-align:left;">&#9888</th>
                    <th style="color:#f56b00;border: 1px solid;text-align:left;" colspan="2">Sequence change</th>
                  </tr>
                </thead>
                <tbody>
                  <tr style="color:#0000FF;border: 1px solid;text-align:left;">
                    <td>Filename</td>
                    <td style="color:#fe0000;border: 1px solid;text-align:left;"><a href='{filename}' target='_blank'>{filename}</a></td>
                    <td style="color:#32cb00;border: 1px solid;text-align:left;"><a href='{old_filename}' target='_blank'>{old_filename}</a></td>
                  </tr>
                  <tr style="color:#0000FF;border: 1px solid;text-align:left;">
                    <td >Saved</td>
                    <td style="color:#fe0000;border: 1px solid;text-align:left;">{newmtime}</td>
                    <td style="color:#32cb00;border: 1px solid;text-align:left;">{oldmtime}</td>
                  </tr>
                  <tr style="color:#0000FF;border: 1px solid;text-align:left;">
                    <td>Length</td>
                    <td style="color:#fe0000;border: 1px solid;text-align:left;">{len(self)}</td>
                    <td style="color:#32cb00;border: 1px solid;text-align:left;">{len(old_file)}</td>
                  </tr>
                  <tr style="color:#0000FF;border: 1px solid;text-align:left;">
                    <td>uSEGUID</td>
                    <td style="color:#fe0000;border: 1px solid;text-align:left;">{self.useguid()}</td>
                    <td style="color:#32cb00;border: 1px solid;text-align:left;">{old_file.useguid()}</td>
                  </tr>
                  <tr style="color:#0000FF;border: 1px solid;text-align:left;">
                    <td>cSEGUID</td>
                    <td style="color:#fe0000;border: 1px solid;text-align:left;">{newcseguid}</td>
                    <td style="color:#32cb00;border: 1px solid;text-align:left;">{oldcseguid}</td>
                  </tr>
                </tbody>
                </table>
                """
            elif "SEGUID" in old_file.annotations.get("comment", ""):
                pattern = r"(lSEGUID|cSEGUID|uSEGUID)_(\S{27})(_[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}\.[0-9]{6}){0,1}"
                # cSEGUID_NNNNNNNNNNNNNNNNNNNNNNNNNNN_2020-10-10T11:11:11.111111
                oldstamp = _re.search(pattern, old_file.description)
                newstamp = _re.search(pattern, self.description)
                newdescription = self.description
                if oldstamp and newstamp:
                    if oldstamp.group(0)[:35] == newstamp.group(0)[:35]:
                        newdescription = newdescription.replace(
                                         newstamp.group(0),
                                         oldstamp.group(0))
                elif oldstamp:
                    newdescription += " " + oldstamp.group(0)
                newobj = _copy.copy(self)
                newobj.description = newdescription

                with open(filename, "w", encoding="utf8") as fp:
                    fp.write(newobj.format(f))
            else:
                with open(filename, "w", encoding="utf8") as fp:
                    fp.write(self.format(f))
        return _display_html(msg, raw=True)

[docs]    def find(self, other):
        # TODO allow strings, seqs, seqrecords or Dseqrecords
        # TODO check for linearity of other, raise exception if not
        # TODO add tests and docstring for this method
        o = str(other.seq).upper()

        if self.linear:
            s = str(self.seq).upper()
        else:
            s = (
                str(self.seq).upper() + str(self.seq).upper()[: len(other) - 1]
            )  # allow wrapping around origin
        return s.find(o)

    def __str__(self):
        return ("Dseqrecord\n" "circular: {}\n" "size: {}\n").format(
            self.circular, len(self)
        ) + _SeqRecord.__str__(self)

    def __contains__(self, other):
        if other.lower() in str(self.seq).lower():
            return True
        else:
            s = self.seq.watson.replace(" ", "")
            ln = len(s)
            spc = 3 - ln % 3 if ln % 3 else 0
            s = "n" * spc + s + "nnn"
            for frame in range(3):
                if other.lower() in _translate(s[frame : frame + spc + ln]).lower():
                    return True
        return False


[docs]    def find_aminoacids(self, other):
        """
        >>> from pydna.dseqrecord import Dseqrecord
        >>> s=Dseqrecord("atgtacgatcgtatgctggttatattttag")
        >>> s.seq.translate()
        Seq('MYDRMLVIF*')
        >>> "RML" in s
        True
        >>> "MMM" in s
        False
        >>> s.seq.rc().translate()
        Seq('LKYNQHTIVH')
        >>> "QHT" in s.rc()
        True
        >>> "QHT" in s
        False
        >>> slc = s.find_aa("RML")
        >>> slc
        slice(9, 18, None)
        >>> s[slc]
        Dseqrecord(-9)
        >>> code = s[slc].seq
        >>> code
        Dseq(-9)
        cgtatgctg
        gcatacgac
        >>> code.translate()
        Seq('RML')
        """
        other = str(other).lower()
        assert self.seq.watson == "".join(self.seq.watson.split())
        s = self.seq.watson
        ln = len(s)
        spc = 3 - ln % 3 if ln % 3 else 0
        s = s + "n" * spc + "nnn"
        start = None
        for frame in range(3):
            try:
                start = _translate(s[frame : frame + ln + spc]).lower().index(other)
                break
            except ValueError:
                pass
        oh = self.seq.ovhg if self.seq.ovhg > 0 else 0
        if start == None:
            return None  # TODO return an emoty slice or False...?
        else:
            return slice(frame + start * 3 + oh, frame + (start + len(other)) * 3 + oh)

    find_aa = find_aminoacids

[docs]    def map_trace_files(self, pth, limit=25):  # TODO allow path-like objects
        import glob

        traces = []
        for name in glob.glob(pth):
            trace = SeqIO.read(name, "abi").lower()
            trace.annotations["filename"] = trace.fname = name
            traces.append(trace)
        if not traces:
            raise ValueError("No trace files found in {}".format(pth))
        if hasattr(self.map_target, "step"):
            area = self.map_target
        elif hasattr(self.map_target, "extract"):
            area = slice(self.map_target.location.start, self.map_target.location.end)
        else:
            area = None  # TODO allow other objects as well and do some checks on map target

        if area:
            self.matching_reads = []
            self.not_matching_reads = []
            target = str(self[area].seq).lower()
            target_rc = str(self[area].seq.rc()).lower()
            for trace in traces:
                if target in str(trace.seq) or target_rc in str(trace.seq):
                    self.matching_reads.append(trace)
                else:
                    self.not_matching_reads.append(trace)
            reads = self.matching_reads
        else:
            self.matching_reads = None
            self.not_matching_reads = None
            reads = traces

        matching_reads = []

        for read_ in reads:

            matches = _common_sub_strings(str(self.seq).lower(), str(read_.seq), limit)

            if not matches:
                continue

            if len(matches) > 1:
                newmatches = [
                    matches[0],
                ]
                for i, x in enumerate(matches[1:]):
                    g, f, h = matches[i]
                    if g + h < x[0] and f + h < x[1]:
                        newmatches.append(x)
            else:  # len(matches)==1
                newmatches = matches

            matching_reads.append(read_)

            if len(newmatches) > 1:
                ms = []
                for m in newmatches:
                    ms.append(_FeatureLocation(m[0], m[0] + m[2]))
                loc = _CompoundLocation(ms)
            else:
                a, b, c = newmatches[0]
                loc = _FeatureLocation(a, a + c)

            self.features.append(
                _SeqFeature(
                    loc,
                    qualifiers={"label": [read_.annotations["filename"]]},
                    type="trace",
                )
            )

        return [x.annotations["filename"] for x in matching_reads]

    def __repr__(self):
        return "Dseqrecord({}{})".format(
            {True: "-", False: "o"}[self.linear], len(self)
        )

    def _repr_pretty_(self, p, cycle):
        p.text(
            "Dseqrecord({}{})".format({True: "-", False: "o"}[self.linear], len(self))
        )

    def __add__(self, other):
        if hasattr(other, "seq") and hasattr(other.seq, "watson"):
            other = _copy.deepcopy(other)
            other_five_prime = other.seq.five_prime_end()
            if other_five_prime[0] == "5'":
                # add other.seq.ovhg
                for f in other.features:
                    f.location = f.location + other.seq.ovhg
            elif other_five_prime[0] == "3'":
                # subtract other.seq.ovhg (sign change)
                for f in other.features:
                    f.location = f.location + (-other.seq.ovhg)

            answer = Dseqrecord(_SeqRecord.__add__(self, other))
            answer.n = min(self.n, other.n)
        else:
            answer = Dseqrecord(_SeqRecord.__add__(self, Dseqrecord(other)))
            answer.n = self.n
        return answer

    def __mul__(self, number):
        if not isinstance(number, int):
            raise TypeError(
                "TypeError: can't multiply Dseqrecord by non-int of type {}".format(
                    type(number)
                )
            )
        if self.circular:
            raise TypeError("TypeError: can't multiply circular Dseqrecord.")
        if number > 0:
            new = _copy.copy(self)
            for i in range(1, number):
                new += self
            return new
        else:
            return self.__class__("")

    def __getitem__(self, sl):
        """docstring."""
        answer = Dseqrecord(_copy.copy(self))
        answer.seq = self.seq.__getitem__(sl)
        # answer.seq.alphabet = self.seq.alphabet

        sl_start = sl.start or 0  # 6
        sl_stop = sl.stop or len(answer.seq)  # 1

        if self.linear or sl_start < sl_stop:
            answer.features = super().__getitem__(sl).features
        elif self.circular and sl_start > sl_stop:
            answer.features = self.shifted(sl_start).features
            answer.features = [
                f
                for f in answer.features
                if f.location.parts[-1].end.position <= answer.seq.length
            ]
        else:
            answer = Dseqrecord("")
        identifier = "part_{id}".format(id=self.id)
        if answer.features:
            sf = max(answer.features, key=len)  # default
            if "label" in sf.qualifiers:
                identifier = " ".join(sf.qualifiers["label"])
            elif "note" in sf.qualifiers:
                identifier = " ".join(sf.qualifiers["note"])
        answer.id = _identifier_from_string(identifier)[:16]
        answer.name = _identifier_from_string("part_{name}".format(name=self.name))[:16]
        return answer

    def __eq__(self, other):
        try:
            if self.seq == other.seq and str(self.__dict__) == str(other.__dict__):
                return True
        except AttributeError:
            pass
        return False

    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        """__hash__ must be based on __eq__"""
        return hash((str(self.seq).lower(), str(tuple(sorted(self.__dict__.items())))))

[docs]    def linearize(self, *enzymes):
        """This method is similar to :func:`cut` but throws an exception if there
        is not excactly one cut i.e. none or more than one digestion products.

        """

        if self.seq._linear:
            raise TypeError("Can only linearize circular molecules!")
        fragments = self.cut(*enzymes)
        if len(fragments) > 1:
            raise TypeError("More than one fragment is formed!")
        elif not fragments:
            raise TypeError("The enzyme(s) do not cut!")
        answer = fragments[0]
        answer.id = "{name}_lin".format(name=self.name)
        answer.name = answer.id[:16]
        return fragments[0]

[docs]    def no_cutters(self, batch: _RestrictionBatch = None):
        """docstring."""
        return self.seq.no_cutters(batch=batch or CommOnly)

[docs]    def unique_cutters(self, batch: _RestrictionBatch = None):
        """docstring."""
        return self.seq.unique_cutters(batch=batch or CommOnly)

[docs]    def once_cutters(self, batch: _RestrictionBatch = None):
        """docstring."""
        return self.seq.once_cutters(batch=batch or CommOnly)

[docs]    def twice_cutters(self, batch: _RestrictionBatch = None):
        """docstring."""
        return self.seq.twice_cutters(batch=batch or CommOnly)

[docs]    def n_cutters(self, n=3, batch: _RestrictionBatch = None):
        """docstring."""
        return self.seq.n_cutters(n=n, batch=batch or CommOnly)

[docs]    def cutters(self, batch: _RestrictionBatch = None):
        """docstring."""
        return self.seq.cutters(batch=batch or CommOnly)

[docs]    def cut(self, *enzymes):
        """Digest the Dseqrecord object with one or more restriction enzymes.
        returns a list of linear Dseqrecords. If there are no cuts, an empty
        list is returned.

        See also :func:`Dseq.cut`

        Parameters
        ----------

        enzymes : enzyme object or iterable of such objects
            A Bio.Restriction.XXX restriction object or iterable of such.

        Returns
        -------
        Dseqrecord_frags : list
            list of Dseqrecord objects formed by the digestion

        Examples
        --------
        >>> from pydna.dseqrecord import Dseqrecord
        >>> a=Dseqrecord("ggatcc")
        >>> from Bio.Restriction import BamHI
        >>> a.cut(BamHI)
        (Dseqrecord(-5), Dseqrecord(-5))
        >>> frag1, frag2 = a.cut(BamHI)
        >>> frag1.seq
        Dseq(-5)
        g
        cctag
        >>> frag2.seq
        Dseq(-5)
        gatcc
            g


        """

        frags = self.seq.cut(enzymes)

        if not frags:
            return ()

        if self.linear:
            shift = frags[0].pos # 0
            features = self.features
        else:
            # breakpoint()
            shift = frags[0].pos
            features = self.shifted(shift).features
            for fr in frags:
                fr.pos -= shift + max(fr.ovhg, 0)
                # breakpoint()
        dsfs = []
        for fr in frags:
            dsf = Dseqrecord(fr, linear=True, n=self.n)
            start = fr.pos  # - shift
            end = fr.pos + fr.length  # - shift
            dsf.features = [_copy.copy(fe)
                            for fe in features
                            if start <= fe.location.start
                            and end >= fe.location.end]
            for fe in dsf.features:
                fe.location += -fr.pos
            dsfs.append(dsf)
        return tuple(dsfs)

[docs]    def number_of_cuts(self, *enzymes):
        """The number of cuts by digestion with the Restriction enzymes
        contained in the iterable."""
        return sum([len(enzyme.search(self.seq)) for enzyme
                    in _flatten(enzymes)])

[docs]    def cas9(self, RNA: str):
        """docstring."""
        fragments = []
        result = []
        for target in (self.seq, self.seq.rc()):
            fragments = [self[sl.start:sl.stop] for sl in target.cas9(RNA)]
            result.append(fragments)
        return result

[docs]    def reverse_complement(self):
        """Reverse complement.

        Examples
        --------
        >>> from pydna.dseqrecord import Dseqrecord
        >>> a=Dseqrecord("ggaatt")
        >>> a
        Dseqrecord(-6)
        >>> a.seq
        Dseq(-6)
        ggaatt
        ccttaa
        >>> a.reverse_complement().seq
        Dseq(-6)
        aattcc
        ttaagg
        >>>

        See Also
        --------
        pydna.dseq.Dseq.reverse_complement

        """
        answer = type(self)(super().reverse_complement())
        answer.name = "{}_rc".format(self.name[:13])
        answer.description = self.description + "_rc"
        answer.id = self.id + "_rc"
        answer.seq._circular = self.seq.circular
        answer.seq._linear = self.seq.linear
        return answer

    rc = reverse_complement

[docs]    def shifted(self, shift):
        """Returns a circular Dseqrecord with a new origin <shift>.
        This only works on circular Dseqrecords. If we consider the following
        circular sequence:


        | ``GAAAT   <-- watson strand``
        | ``CTTTA   <-- crick strand``

        The T and the G on the watson strand are linked together as well
        as the A and the C of the of the crick strand.

        if ``shift`` is 1, this indicates a new origin at position 1:

        |    new origin at the | symbol:
        |
        | ``G|AAAT``
        | ``C|TTTA``

        new sequence:

        | ``AAATG``
        | ``TTTAC``

        Examples
        --------

        >>> from pydna.dseqrecord import Dseqrecord
        >>> a=Dseqrecord("aaat",circular=True)
        >>> a
        Dseqrecord(o4)
        >>> a.seq
        Dseq(o4)
        aaat
        ttta
        >>> b=a.shifted(1)
        >>> b
        Dseqrecord(o4)
        >>> b.seq
        Dseq(o4)
        aata
        ttat

        """
        if self.linear:
            raise TypeError(
                "Sequence is linear, origin can only be shifted for circular sequences.\n"
            )

        ln = len(self)

        if not shift % ln:
            return self  # shift is a multiple of ln or 0
        else:
            shift %= ln  # 0<=shift<=ln

        newseq = (self.seq[shift:] + self.seq[:shift]).looped()
        shift = ln - shift
        newfeatures = []
        for feature in self.features:
            shiftedparts = [
                featurelocation + shift for featurelocation in feature.location.parts
            ]
            zero_length_parts = [
                featurelocation
                for featurelocation in shiftedparts
                if featurelocation.start == featurelocation.end
            ]
            newparts = []
            for location in shiftedparts:
                newstart = location.start % ln
                newend = location.end % ln
                if newstart < newend:
                    newparts.append(
                        _FeatureLocation(
                            newstart,
                            newend,
                            location.strand,
                            location.ref,
                            location.ref_db,
                        )
                    )
                elif newstart > newend:
                    if location.strand == 1:
                        newparts.extend(
                            [
                                _FeatureLocation(
                                    newstart,
                                    ln,
                                    location.strand,
                                    location.ref,
                                    location.ref_db,
                                ),
                                _FeatureLocation(
                                    0,
                                    newend,
                                    location.strand,
                                    location.ref,
                                    location.ref_db,
                                ),
                            ]
                        )
                    else:
                        newparts.extend(
                            [
                                _FeatureLocation(
                                    0,
                                    newend,
                                    location.strand,
                                    location.ref,
                                    location.ref_db,
                                ),
                                _FeatureLocation(
                                    newstart,
                                    ln,
                                    location.strand,
                                    location.ref,
                                    location.ref_db,
                                ),
                            ]
                        )
            p = next((p for p in newparts if p.end == shift), None)
            s = next((p for p in newparts if p.start == shift), None)
            if p and s:
                newparts.remove(p)
                newparts[newparts.index(s)] = _FeatureLocation(
                    p.start, s.end, p.strand, p.ref, p.ref_db
                )
            newparts = [p for p in newparts if p]
            newparts.extend(zero_length_parts)
            if newparts:
                newfeatures.append(
                    _SeqFeature(
                        location=sum(newparts),
                        type=feature.type,
                        id=feature.id,
                        qualifiers=feature.qualifiers,
                    )
                )
        newfeatures.sort(key=_operator.attrgetter("location.start"))
        answer = _copy.copy(self)
        answer.features = newfeatures
        answer.seq = newseq
        return answer

    @_memorize("pydna.dseqrecord.Dseqrecord.synced")
    def synced(self, ref, limit=25):
        """This method returns a new circular sequence (Dseqrecord object), which has been rotated
        in such a way that there is maximum overlap between the sequence and
        ref, which may be a string, Biopython Seq, SeqRecord object or
        another Dseqrecord object.

        The reason for using this could be to rotate a new recombinant plasmid so
        that it starts at the same position after cloning. See the example below:


        Examples
        --------

        >>> from pydna.dseqrecord import Dseqrecord
        >>> a=Dseqrecord("gaat",circular=True)
        >>> a.seq
        Dseq(o4)
        gaat
        ctta
        >>> d = a[2:] + a[:2]
        >>> d.seq
        Dseq(-4)
        atga
        tact
        >>> insert=Dseqrecord("CCC")
        >>> recombinant = (d+insert).looped()
        >>> recombinant.seq
        Dseq(o7)
        atgaCCC
        tactGGG
        >>> recombinant.synced(a).seq
        Dseq(o7)
        gaCCCat
        ctGGGta

        """

        if self.linear:
            raise TypeError("Only circular DNA can be synced!")

        newseq = _copy.copy(self)

        s = str(self.seq.watson).lower()
        s_rc = str(self.seq.crick).lower()

        if hasattr(ref, "seq"):
            r = ref.seq
            if hasattr(r, "watson"):
                r = str(r.watson).lower()
            else:
                r = str(r).lower()
        else:
            r = str(ref.lower())

        lim = min(limit, limit * (len(s) // limit) + 1)

        c = _common_sub_strings(s + s, r, limit=lim)
        d = _common_sub_strings(s_rc + s_rc, r, limit=lim)

        c = [(x[0], x[2]) for x in c if x[1] == 0]
        d = [(x[0], x[2]) for x in d if x[1] == 0]

        if not c and not d:
            raise TypeError("There is no overlap between sequences!")

        if c:
            start, length = c.pop(0)
        else:
            start, length = 0, 0

        if d:
            start_rc, length_rc = d.pop(0)
        else:
            start_rc, length_rc = 0, 0

        if length_rc > length:
            start = start_rc
            newseq = newseq.rc()

        if start == 0:
            result = newseq
        else:
            result = newseq.shifted(start)
        _module_logger.info("synced")
        return Dseqrecord(result)

[docs]    def upper(self):
        """Returns an uppercase copy.
        >>> from pydna.dseqrecord import Dseqrecord
        >>> my_seq = Dseqrecord("aAa")
        >>> my_seq.seq
        Dseq(-3)
        aAa
        tTt
        >>> upper = my_seq.upper()
        >>> upper.seq
        Dseq(-3)
        AAA
        TTT
        >>>


        Returns
        -------
        Dseqrecord
            Dseqrecord object in uppercase


        See also
        --------
        pydna.dseqrecord.Dseqrecord.lower"""

        upper = _copy.deepcopy(self)
        upper.seq = upper.seq.upper()
        return upper

[docs]    def lower(self):
        """>>> from pydna.dseqrecord import Dseqrecord
        >>> my_seq = Dseqrecord("aAa")
        >>> my_seq.seq
        Dseq(-3)
        aAa
        tTt
        >>> upper = my_seq.upper()
        >>> upper.seq
        Dseq(-3)
        AAA
        TTT
        >>> lower = my_seq.lower()
        >>> lower
        Dseqrecord(-3)
        >>>

        Returns
        -------
        Dseqrecord
            Dseqrecord object in lowercase

        See also
        --------
        pydna.dseqrecord.Dseqrecord.upper

        """
        lower = _copy.deepcopy(self)
        lower.seq = lower.seq.lower()
        return lower

[docs]    def orfs(self, minsize=30):
        """docstring."""
        return tuple(Dseqrecord(s) for s in self.seq.orfs(minsize=minsize))


if __name__ == "__main__":
    cache = _os.getenv("pydna_cache")
    _os.environ["pydna_cache"] = "nocache"
    import doctest

    doctest.testmod(verbose=True, optionflags=doctest.ELLIPSIS)
    _os.environ["pydna_cache"] = cache