from pathlib import Path
from typing import (
    Any,
    Callable,
    Dict,
    Iterable,
    Iterator,
    List,
    Optional,
    Protocol,
    Sequence,
    Tuple,
    Union,
    overload,
)

import numpy as np
from cymem.cymem import Pool
from thinc.types import Floats1d, Floats2d, Ints2d

from ..lexeme import Lexeme
from ..vocab import Vocab
from ._dict_proxies import SpanGroups
from ._retokenize import Retokenizer
from .span import Span
from .token import Token
from .underscore import Underscore

DOCBIN_ALL_ATTRS: Tuple[str, ...]

class Doc:
    """Typing declarations for ``Doc``.

    Every body is elided with ``...``; this file only describes attribute
    types and call signatures and carries no runtime behaviour.
    """

    # --- Instance attributes -------------------------------------------
    vocab: Vocab
    mem: Pool
    spans: SpanGroups
    max_length: int
    length: int
    sentiment: float
    cats: Dict[str, float]
    user_hooks: Dict[str, Callable[..., Any]]
    user_token_hooks: Dict[str, Callable[..., Any]]
    user_span_hooks: Dict[str, Callable[..., Any]]
    tensor: np.ndarray[Any, np.dtype[np.float64]]
    user_data: Dict[str, Any]
    has_unknown_spaces: bool
    _context: Any

    # --- Class-level extension registry: set / get / has / remove ------
    # `method` is typed `Callable[..., Any]`: `typing.Callable` requires both
    # a parameter list and a return type, so the one-argument spelling
    # `Callable[[Doc, Any]]` is rejected by type checkers.
    # NOTE(review): presumably the callable receives the Doc first, followed
    # by caller-supplied arguments -- tighten the type if that contract is
    # confirmed.
    @classmethod
    def set_extension(
        cls,
        name: str,
        default: Optional[Any] = ...,
        getter: Optional[Callable[[Doc], Any]] = ...,
        setter: Optional[Callable[[Doc, Any], None]] = ...,
        method: Optional[Callable[..., Any]] = ...,
        force: bool = ...,
    ) -> None: ...
    # Returns a 4-tuple typed, in order, like set_extension's
    # (default, method, getter, setter) parameters.
    @classmethod
    def get_extension(
        cls, name: str
    ) -> Tuple[
        Optional[Any],
        Optional[Callable[..., Any]],
        Optional[Callable[[Doc], Any]],
        Optional[Callable[[Doc, Any], None]],
    ]: ...
    @classmethod
    def has_extension(cls, name: str) -> bool: ...
    # Same 4-tuple return shape as get_extension.
    @classmethod
    def remove_extension(
        cls, name: str
    ) -> Tuple[
        Optional[Any],
        Optional[Callable[..., Any]],
        Optional[Callable[[Doc], Any]],
        Optional[Callable[[Doc, Any], None]],
    ]: ...

    # --- Construction ---------------------------------------------------
    # Only `vocab` is required; every other argument has a default.
    def __init__(
        self,
        vocab: Vocab,
        words: Optional[List[str]] = ...,
        spaces: Optional[List[bool]] = ...,
        user_data: Optional[Dict[Any, Any]] = ...,
        tags: Optional[List[str]] = ...,
        pos: Optional[List[str]] = ...,
        morphs: Optional[List[str]] = ...,
        lemmas: Optional[List[str]] = ...,
        heads: Optional[List[int]] = ...,
        deps: Optional[List[str]] = ...,
        sent_starts: Optional[List[Union[bool, int, None]]] = ...,
        ents: Optional[List[str]] = ...,
    ) -> None: ...

    # Accessor typed as the `Underscore` helper imported above.
    @property
    def _(self) -> Underscore: ...

    # --- Annotation status flags -----------------------------------------
    @property
    def is_tagged(self) -> bool: ...
    @property
    def is_parsed(self) -> bool: ...
    @property
    def is_nered(self) -> bool: ...
    @property
    def is_sentenced(self) -> bool: ...
    def has_annotation(
        self, attr: Union[int, str], *, require_complete: bool = ...
    ) -> bool: ...

    # --- Container protocol ----------------------------------------------
    # Integer index -> Token, slice -> Span.
    @overload
    def __getitem__(self, i: int) -> Token: ...
    @overload
    def __getitem__(self, i: slice) -> Span: ...
    def __iter__(self) -> Iterator[Token]: ...
    def __len__(self) -> int: ...
    def __unicode__(self) -> str: ...
    def __bytes__(self) -> bytes: ...
    def __str__(self) -> str: ...
    def __repr__(self) -> str: ...
    @property
    def doc(self) -> Doc: ...

    # Returns Optional[Span]: callers must handle a `None` result.
    def char_span(
        self,
        start_idx: int,
        end_idx: int,
        label: Union[int, str] = ...,
        kb_id: Union[int, str] = ...,
        vector: Optional[Floats1d] = ...,
        alignment_mode: str = ...,
        span_id: Union[int, str] = ...,
    ) -> Optional[Span]: ...
    def similarity(self, other: Union[Doc, Span, Token, Lexeme]) -> float: ...

    # --- Vectors and text --------------------------------------------------
    @property
    def has_vector(self) -> bool: ...
    vector: Floats1d
    vector_norm: float
    @property
    def text(self) -> str: ...
    @property
    def text_with_ws(self) -> str: ...

    # --- Entities ----------------------------------------------------------
    # Ideally the getter would output Tuple[Span]
    # see https://github.com/python/mypy/issues/3004
    @property
    def ents(self) -> Sequence[Span]: ...
    @ents.setter
    def ents(self, value: Sequence[Span]) -> None: ...
    # Everything after `entities` is keyword-only.
    def set_ents(
        self,
        entities: List[Span],
        *,
        blocked: Optional[List[Span]] = ...,
        missing: Optional[List[Span]] = ...,
        outside: Optional[List[Span]] = ...,
        default: str = ...,
    ) -> None: ...

    # --- Derived span iterators and language -------------------------------
    @property
    def noun_chunks(self) -> Iterator[Span]: ...
    @property
    def sents(self) -> Iterator[Span]: ...
    # `lang` is typed int, `lang_` is typed str.
    @property
    def lang(self) -> int: ...
    @property
    def lang_(self) -> str: ...

    # --- Attribute arrays --------------------------------------------------
    def count_by(
        self, attr_id: int, exclude: Optional[Any] = ..., counts: Optional[Any] = ...
    ) -> Dict[Any, int]: ...
    def from_array(
        self, attrs: Union[int, str, List[Union[int, str]]], array: Ints2d
    ) -> Doc: ...
    def to_array(
        self, py_attr_ids: Union[int, str, List[Union[int, str]]]
    ) -> np.ndarray[Any, np.dtype[np.float64]]: ...
    # `attrs` accepts a tuple of any length: `Tuple[X]` would mean a tuple of
    # exactly one element, unlike the variable-length List alternative.
    @staticmethod
    def from_docs(
        docs: List[Doc],
        ensure_whitespace: bool = ...,
        attrs: Optional[
            Union[Tuple[Union[str, int], ...], List[Union[int, str]]]
        ] = ...,
    ) -> Doc: ...
    def get_lca_matrix(self) -> Ints2d: ...
    def copy(self) -> Doc: ...

    # --- Serialization -----------------------------------------------------
    # `to_*` methods return the encoded form (or None for disk writes);
    # `from_*` methods return a Doc. `exclude` is keyword-only throughout.
    def to_disk(
        self, path: Union[str, Path], *, exclude: Iterable[str] = ...
    ) -> None: ...
    def from_disk(
        self, path: Union[str, Path], *, exclude: Iterable[str] = ...
    ) -> Doc: ...
    def to_bytes(self, *, exclude: Iterable[str] = ...) -> bytes: ...
    def from_bytes(self, bytes_data: bytes, *, exclude: Iterable[str] = ...) -> Doc: ...
    def to_dict(self, *, exclude: Iterable[str] = ...) -> Dict[str, Any]: ...
    def from_dict(
        self, msg: Dict[str, Any], *, exclude: Iterable[str] = ...
    ) -> Doc: ...
    def extend_tensor(self, tensor: Floats2d) -> None: ...
    def retokenize(self) -> Retokenizer: ...
    def to_json(self, underscore: Optional[List[str]] = ...) -> Dict[str, Any]: ...
    def from_json(
        self, doc_json: Dict[str, Any] = ..., validate: bool = False
    ) -> Doc: ...
    def to_utf8_array(self, nr_char: int = ...) -> Ints2d: ...
    # NOTE(review): `Tuple[Any]` declares a tuple of exactly one element; if
    # this returns several attribute ids it should be `Tuple[Any, ...]` --
    # confirm against the implementation before widening the return type.
    @staticmethod
    def _get_array_attrs() -> Tuple[Any]: ...