Source code for pydna.crispr
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Utilities for CRISPR/Cas target searching and protospacer extraction.
"""
import re
from abc import ABC
from abc import abstractmethod
from typing import Type
from typing import TYPE_CHECKING
from typing import List
from typing import TypeVar
if TYPE_CHECKING: # pragma: no cover
from pydna.dseqrecord import Dseqrecord
DseqrecordType = TypeVar("DseqrecordType", bound="Dseqrecord")
class _cas(ABC):
"""
Abstract base class for CRISPR-associated nucleases.
pam, scaffold and cut location is set by a subclass
such as Cas9 below.
The meaning of size, fst5 and fst3 are the same as for the restriciton
enzymes in the Biopython restriction module (Bio.Restriction).
"""
scaffold: str = "ND"
pam: str = "ND"
size: int = 0
fst5: int = 0
fst3: int = 0
def __init__(self, protospacer: str) -> None:
"""
Initialize the nuclease with a protospacer sequence.
The sequence is a string. Use the protospacer function
to extract a sequence from a Dseqrecord.
Args:
protospacer: Protospacer sequence used to build the search pattern.
"""
from pydna.sequence_regex import compute_regex_site
self.protospacer: str = protospacer.upper()
self.compsite = compute_regex_site(f"{self.protospacer}{self.pam}")
@abstractmethod
def search(self, dna, linear: bool = True) -> List[int]:
"""Return a list of cutting sites of the enzyme in the sequence.
dna must be an instance of:
- pydna.dseq.Dseq
- Bio.Seq.Seq
- Bio.Seq.MutableSeq
pydna.dseqrecord.Dseqrecord or Bio.SeqRecord.SeqRecord will not work.
This limitation is by design t omirror enzymes in the
Biopython Bio.Restriction class
The linear argument is laso there for compatibility with the
Biopython Bio.Restriction class.
An important caveat is that search ignores the circular property of
pydna.dseq.Dseq.
If linear is False, the restriction sites that span over the boundaries
will be included.
The positions are the first base of the 3' fragment,
i.e. the first base after the position the enzyme will cut.
"""
raise NotImplementedError # pragma: no cover
def __repr__(self) -> str:
"""
Return a compact representation of the Cas9+gRNA nuclease instance.
Returns:
String representation with abbreviated protospacer.
"""
return f"{type(self).__name__}({self.protospacer[:3]}..{self.protospacer[-3:]})"
def __str__(self) -> str:
"""
Return the guide RNA protospacer and scaffold as FASTA-like string.
"""
return f">{type(self).__name__} protospacer scaffold\n{self.protospacer} {self.scaffold}"
[docs]
class cas9(_cas):
"""docstring.
.. code-block::
fst5 --|fst3
|----------------
PAM
5'-NNGGAAGAGTAATACACTA-AAANGGNN-3'
||||||||||||||||||| ||||||||
3'-NNCCTTCTCATTATGTGAT-TTTNCCNN-5'
||||||||||||||||| |||
5'-GGAAGAGTAATACACTA AAA-g-u-a-a-g-g-3' Scaffold (lower case)
---gRNA spacer------- u-a
u-a
u-a
u-a
a-u
g-u-g
a a
g-c-a
c-g
u-a
a-u
g a
a-a
"""
scaffold: str = "GTTTTAGAGCTAGAAATAGCAAGTTAAAATAAGG"
pam: str = ".GG"
size: int = 20
fst5: int = 17
fst3: int = -3
ovhg: int = fst5 - (size + fst3)
[docs]
def search(self, dna, linear: bool = True) -> List[int]:
"""
Search for Cas9 target sites in a DNA sequence.
Args:
dna: string, Bio.Seq.Seq or pydna.dseq.Dseq
linear: Whether the DNA is linear or circular.
Returns:
A list of cut site positions.
"""
from pydna.dseqrecord import Dseqrecord
from pydna.sequence_regex import dseqrecord_finditer
if not hasattr(dna, "_data"):
raise TypeError
results: List[int] = []
query = Dseqrecord(dna, circular=(not linear))
matches_fwd = dseqrecord_finditer(self.compsite, query)
matches_rev = dseqrecord_finditer(self.compsite, query.reverse_complement())
for mobj in matches_fwd:
results.append((mobj.start() + self.fst5 + 1) % len(dna))
for mobj in matches_rev:
results.append((len(dna) - (mobj.start() + self.fst5) + 1) % len(dna))
return results
[docs]
def protospacer(guide_construct: DseqrecordType, cas: Type[_cas] = cas9) -> List[str]:
"""
Extract protospacer sequences from a guide construct. This can for example
be a plasmid containing the guide construct. This function returns a
list since a several protospacers can be present.
Args:
guide_construct: Sequence construct containing protospacer and scaffold.
cas: CRISPR nuclease class defining spacer size and scaffold.
Returns:
A list of protospacer sequences found in Watson and Crick orientations.
"""
if guide_construct.circular:
total_length = cas.size + len(cas.scaffold)
guide_construct = guide_construct[:] + guide_construct[: total_length - 1]
result = []
for s in guide_construct.seq.watson.upper(), guide_construct.seq.crick.upper():
result.extend(
mobj.group("ps")
for mobj in re.finditer(
f"(?P<ps>.{{{cas.size}}})(?:{cas.scaffold})",
s,
)
)
return result