from typing import Union, Iterable, Dict, Any
from pathlib import Path
import sys
# set library-specific custom warning handling before doing anything else
from .errors import setup_default_warnings
setup_default_warnings() # noqa: E402
# These are imported as part of the API
from thinc.api import prefer_gpu, require_gpu, require_cpu # noqa: F401
from thinc.api import Config
from . import pipeline # noqa: F401
from .cli.info import info # noqa: F401
from .glossary import explain # noqa: F401
from .about import __version__ # noqa: F401
from .util import registry, logger # noqa: F401
from .errors import Errors
from .language import Language
from .vocab import Vocab
from . import util
if sys.maxunicode == 65535:
raise SystemError(Errors.E130)
def load(
name: Union[str, Path],
*,
vocab: Union[Vocab, bool] = True,
disable: Iterable[str] = util.SimpleFrozenList(),
exclude: Iterable[str] = util.SimpleFrozenList(),
config: Union[Dict[str, Any], Config] = util.SimpleFrozenDict(),
) -> Language:
"""Load a spaCy model from an installed package or a local path.
name (str): Package name or model path.
vocab (Vocab): A Vocab object. If True, a vocab is created.
disable (Iterable[str]): Names of pipeline components to disable. Disabled
pipes will be loaded but they won't be run unless you explicitly
enable them by calling nlp.enable_pipe.
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
components won't be loaded.
config (Dict[str, Any] / Config): Config overrides as nested dict or dict
keyed by section values in dot notation.
RETURNS (Language): The loaded nlp object.
"""
return util.load_model(
name, vocab=vocab, disable=disable, exclude=exclude, config=config
)
def blank(
name: str,
*,
vocab: Union[Vocab, bool] = True,
config: Union[Dict[str, Any], Config] = util.SimpleFrozenDict(),
meta: Dict[str, Any] = util.SimpleFrozenDict(),
) -> Language:
"""Create a blank nlp object for a given language code.
name (str): The language code, e.g. "en".
vocab (Vocab): A Vocab object. If True, a vocab is created.
config (Dict[str, Any] / Config): Optional config overrides.
meta (Dict[str, Any]): Overrides for nlp.meta.
RETURNS (Language): The nlp object.
"""
LangClass = util.get_lang_class(name)
# We should accept both dot notation and nested dict here for consistency
config = util.dot_to_dict(config)
return LangClass.from_config(config, vocab=vocab, meta=meta)
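# Example (sketch): spacy.blank only needs a language code; config overrides may
# use dot notation, which is expanded by util.dot_to_dict above.
#
#     import spacy
#     nlp = spacy.blank("en", config={"nlp.batch_size": 256})
#     nlp.add_pipe("sentencizer")
#     doc = nlp("Hello world. This is spaCy.")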
if __name__ == "__main__":
from spacy.cli import setup_cli
setup_cli()
# fmt: off
__title__ = "spacy"
__version__ = "3.2.0"
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
__projects__ = "https://github.com/explosion/projects"
__projects_branch__ = "v3"
# Reserve 64 values for flag features
from . cimport symbols
cdef enum attr_id_t:
NULL_ATTR
IS_ALPHA
IS_ASCII
IS_DIGIT
IS_LOWER
IS_PUNCT
IS_SPACE
IS_TITLE
IS_UPPER
LIKE_URL
LIKE_NUM
LIKE_EMAIL
IS_STOP
IS_OOV_DEPRECATED
IS_BRACKET
IS_QUOTE
IS_LEFT_PUNCT
IS_RIGHT_PUNCT
IS_CURRENCY
FLAG19 = 19
FLAG20
FLAG21
FLAG22
FLAG23
FLAG24
FLAG25
FLAG26
FLAG27
FLAG28
FLAG29
FLAG30
FLAG31
FLAG32
FLAG33
FLAG34
FLAG35
FLAG36
FLAG37
FLAG38
FLAG39
FLAG40
FLAG41
FLAG42
FLAG43
FLAG44
FLAG45
FLAG46
FLAG47
FLAG48
FLAG49
FLAG50
FLAG51
FLAG52
FLAG53
FLAG54
FLAG55
FLAG56
FLAG57
FLAG58
FLAG59
FLAG60
FLAG61
FLAG62
FLAG63
ID
ORTH
LOWER
NORM
SHAPE
PREFIX
SUFFIX
LENGTH
CLUSTER
LEMMA
POS
TAG
DEP
ENT_IOB
ENT_TYPE
HEAD
SENT_START
SPACY
PROB
LANG
ENT_KB_ID = symbols.ENT_KB_ID
MORPH
ENT_ID = symbols.ENT_ID
IDX
SENT_END
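# Example (sketch, assuming `doc` is an existing Doc): these integer IDs are what
# the Python API accepts wherever attribute IDs are expected, e.g. when exporting
# a Doc to a NumPy array:
#
#     from spacy.attrs import ORTH, IS_ALPHA
#     array = doc.to_array([ORTH, IS_ALPHA])   # one column per attribute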
IDS = {
"": NULL_ATTR,
"IS_ALPHA": IS_ALPHA,
"IS_ASCII": IS_ASCII,
"IS_DIGIT": IS_DIGIT,
"IS_LOWER": IS_LOWER,
"IS_PUNCT": IS_PUNCT,
"IS_SPACE": IS_SPACE,
"IS_TITLE": IS_TITLE,
"IS_UPPER": IS_UPPER,
"LIKE_URL": LIKE_URL,
"LIKE_NUM": LIKE_NUM,
"LIKE_EMAIL": LIKE_EMAIL,
"IS_STOP": IS_STOP,
"IS_OOV_DEPRECATED": IS_OOV_DEPRECATED,
"IS_BRACKET": IS_BRACKET,
"IS_QUOTE": IS_QUOTE,
"IS_LEFT_PUNCT": IS_LEFT_PUNCT,
"IS_RIGHT_PUNCT": IS_RIGHT_PUNCT,
"IS_CURRENCY": IS_CURRENCY,
"FLAG19": FLAG19,
"FLAG20": FLAG20,
"FLAG21": FLAG21,
"FLAG22": FLAG22,
"FLAG23": FLAG23,
"FLAG24": FLAG24,
"FLAG25": FLAG25,
"FLAG26": FLAG26,
"FLAG27": FLAG27,
"FLAG28": FLAG28,
"FLAG29": FLAG29,
"FLAG30": FLAG30,
"FLAG31": FLAG31,
"FLAG32": FLAG32,
"FLAG33": FLAG33,
"FLAG34": FLAG34,
"FLAG35": FLAG35,
"FLAG36": FLAG36,
"FLAG37": FLAG37,
"FLAG38": FLAG38,
"FLAG39": FLAG39,
"FLAG40": FLAG40,
"FLAG41": FLAG41,
"FLAG42": FLAG42,
"FLAG43": FLAG43,
"FLAG44": FLAG44,
"FLAG45": FLAG45,
"FLAG46": FLAG46,
"FLAG47": FLAG47,
"FLAG48": FLAG48,
"FLAG49": FLAG49,
"FLAG50": FLAG50,
"FLAG51": FLAG51,
"FLAG52": FLAG52,
"FLAG53": FLAG53,
"FLAG54": FLAG54,
"FLAG55": FLAG55,
"FLAG56": FLAG56,
"FLAG57": FLAG57,
"FLAG58": FLAG58,
"FLAG59": FLAG59,
"FLAG60": FLAG60,
"FLAG61": FLAG61,
"FLAG62": FLAG62,
"FLAG63": FLAG63,
"ID": ID,
"ORTH": ORTH,
"LOWER": LOWER,
"NORM": NORM,
"SHAPE": SHAPE,
"PREFIX": PREFIX,
"SUFFIX": SUFFIX,
"LENGTH": LENGTH,
"LEMMA": LEMMA,
"POS": POS,
"TAG": TAG,
"DEP": DEP,
"ENT_IOB": ENT_IOB,
"ENT_TYPE": ENT_TYPE,
"ENT_ID": ENT_ID,
"ENT_KB_ID": ENT_KB_ID,
"HEAD": HEAD,
"SENT_START": SENT_START,
"SPACY": SPACY,
"LANG": LANG,
"MORPH": MORPH,
"IDX": IDX
}
# ATTR IDs, in order of the symbol
NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
locals().update(IDS)
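# Example (sketch): IDS maps attribute names to integer IDs and NAMES lists the
# names sorted by ID, so lookups work in both directions:
#
#     IDS["ORTH"]    # same integer as the module-level ORTH constant
#     NAMES[:3]      # ['', 'IS_ALPHA', 'IS_ASCII'] -- names in ID order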
def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
"""
Normalize a dictionary of attributes, converting them to ints.
stringy_attrs (dict): Dictionary keyed by attribute string names. Values
can be ints or strings.
strings_map (StringStore): Defaults to None. If provided, encodes string
values into ints.
RETURNS (dict): Attributes dictionary with keys and optionally values
converted to ints.
"""
inty_attrs = {}
if _do_deprecated:
if 'F' in stringy_attrs:
stringy_attrs["ORTH"] = stringy_attrs.pop("F")
if 'L' in stringy_attrs:
stringy_attrs["LEMMA"] = stringy_attrs.pop("L")
if 'pos' in stringy_attrs:
stringy_attrs["TAG"] = stringy_attrs.pop("pos")
if 'morph' in stringy_attrs:
morphs = stringy_attrs.pop('morph')
if 'number' in stringy_attrs:
stringy_attrs.pop('number')
if 'tenspect' in stringy_attrs:
stringy_attrs.pop('tenspect')
morph_keys = [
'PunctType', 'PunctSide', 'Other', 'Degree', 'AdvType', 'Number',
'VerbForm', 'PronType', 'Aspect', 'Tense', 'PartType', 'Poss',
'Hyph', 'ConjType', 'NumType', 'Foreign', 'VerbType', 'NounType',
'Gender', 'Mood', 'Negative', 'Tense', 'Voice', 'Abbr',
'Derivation', 'Echo', 'Foreign', 'NameType', 'NounType', 'NumForm',
'NumValue', 'PartType', 'Polite', 'StyleVariant',
'PronType', 'AdjType', 'Person', 'Variant', 'AdpType',
'Reflex', 'Negative', 'Mood', 'Aspect', 'Case',
'Polarity', 'PrepCase', 'Animacy' # U20
]
for key in morph_keys:
if key in stringy_attrs:
stringy_attrs.pop(key)
elif key.lower() in stringy_attrs:
stringy_attrs.pop(key.lower())
elif key.upper() in stringy_attrs:
stringy_attrs.pop(key.upper())
for name, value in stringy_attrs.items():
int_key = intify_attr(name)
if int_key is not None:
if strings_map is not None and isinstance(value, str):
if hasattr(strings_map, 'add'):
value = strings_map.add(value)
else:
value = strings_map[value]
inty_attrs[int_key] = value
return inty_attrs
def intify_attr(name):
"""
Normalize an attribute name, converting it to int.
name (str / int): Attribute string name. Can also be an int (will then be left unchanged).
RETURNS (int): int representation of the attribute, or None if it couldn't be converted.
"""
if isinstance(name, int):
return name
elif name in IDS:
return IDS[name]
elif name.upper() in IDS:
return IDS[name.upper()]
return None
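# Example (sketch): intify_attr/intify_attrs normalize attribute names (and
# optionally string values) to integer IDs. `strings_map` can be a StringStore,
# e.g. nlp.vocab.strings for some existing `nlp` object:
#
#     intify_attr("lemma")                # -> LEMMA (lookup is case-insensitive)
#     intify_attrs({"ORTH": "hello"})     # -> {ORTH: "hello"}
#     intify_attrs({"orth": "hello"}, strings_map=nlp.vocab.strings)
#     # -> {ORTH: <hash of "hello">}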
"""Helpers for Python and platform compatibility."""
import sys
from thinc.util import copy_array
try:
import cPickle as pickle
except ImportError:
import pickle # type: ignore[no-redef]
try:
import copy_reg
except ImportError:
import copyreg as copy_reg # type: ignore[no-redef]
try:
from cupy.cuda.stream import Stream as CudaStream
except ImportError:
CudaStream = None
try:
import cupy
except ImportError:
cupy = None
if sys.version_info[:2] >= (3, 8): # Python 3.8+
from typing import Literal, Protocol, runtime_checkable
else:
from typing_extensions import Literal, Protocol, runtime_checkable # noqa: F401
# Important note: The importlib_metadata "backport" includes functionality
# that's not part of the built-in importlib.metadata. We should treat this
# import like the built-in and only use what's available there.
try: # Python 3.8+
import importlib.metadata as importlib_metadata
except ImportError:
from catalogue import _importlib_metadata as importlib_metadata # type: ignore[no-redef] # noqa: F401
from thinc.api import Optimizer # noqa: F401
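# Bind the chosen implementations at module level so downstream code can simply
# import them from spacy.compat, regardless of which branch was taken above.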
pickle = pickle
copy_reg = copy_reg
CudaStream = CudaStream
cupy = cupy
copy_array = copy_array
is_windows = sys.platform.startswith("win")
is_linux = sys.platform.startswith("linux")
is_osx = sys.platform == "darwin"
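# Example (sketch): downstream code can rely on these names without caring which
# implementation or backport was picked above:
#
#     from spacy.compat import pickle, importlib_metadata, is_windows
#     data = pickle.dumps({"lang": "en"})
#     version = importlib_metadata.version("spacy")   # same API as importlib.metadata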
[paths]
train = null
dev = null
vectors = null
init_tok2vec = null
[system]
seed = 0
gpu_allocator = null
[nlp]
lang = null
# List of pipeline component names, in order. The names should correspond to
# components defined in the [components] block.
pipeline = []
# Components that are loaded but disabled by default
disabled = []
# Optional callbacks to modify the nlp object before it's initialized, after
# it's created and after the pipeline has been set up
before_creation = null
after_creation = null
after_pipeline_creation = null
# Default batch size to use with nlp.pipe and nlp.evaluate
batch_size = 1000
[nlp.tokenizer]
@tokenizers = "spacy.Tokenizer.v1"
# The pipeline components and their models
[components]
# Readers for corpora like dev and train.
[corpora]
[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
# Whether to train on sequences with 'gold standard' sentence boundaries
# and tokens. If you set this to true, take care to ensure your run-time
# data is passed in sentence-by-sentence via some prior preprocessing.
gold_preproc = false
# Limit on training document length (0 = no limit)
max_length = 0
# Limit on the number of training examples (0 = no limit)
limit = 0
# Apply some simple data augmentation, where we replace tokens with variations.
# This is especially useful for punctuation and case replacement, to help
# generalize beyond corpora that contain only one style of quotes, casing etc.
augmenter = null
[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
# Whether to train on sequences with 'gold standard' sentence boundaries
# and tokens. If you set this to true, take care to ensure your run-time
# data is passed in sentence-by-sentence via some prior preprocessing.
gold_preproc = false
# Limit on training document length (0 = no limit)
max_length = 0
# Limit on the number of training examples (0 = no limit)
limit = 0
# Optional callback for data augmentation
augmenter = null
# Training hyper-parameters and additional features.
[training]
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.1
accumulate_gradient = 1
# Controls early-stopping. 0 disables early stopping.
patience = 1600
# Number of epochs. 0 means unlimited. If >= 0, the train corpus is loaded once
# into memory and shuffled within the training loop. -1 means the train corpus is
# streamed rather than loaded into memory, with no shuffling within the training loop.
max_epochs = 0
max_steps = 20000
eval_frequency = 200
# Control how scores are printed and checkpoints are evaluated.
score_weights = {}
# Names of pipeline components that shouldn't be updated during training
frozen_components = []
# Names of pipeline components that should set annotations during training
annotating_components = []
# Location in the config where the dev corpus is defined
dev_corpus = "corpora.dev"
# Location in the config where the train corpus is defined
train_corpus = "corpora.train"
# Optional callback before nlp object is saved to disk after training
before_to_disk = null
[training.logger]
@loggers = "spacy.ConsoleLogger.v1"
[training.batcher]
@batchers = "spacy.batch_by_words.v1"
discard_oversize = false
tolerance = 0.2
[training.batcher.size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001
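# Note (sketch): "compounding.v1" grows the batch size from `start` toward `stop`,
# multiplying by `compound` at each step, roughly equivalent to:
#
#     def compounding(start, stop, compound):
#         size = start
#         while True:
#             yield min(size, stop)
#             size *= compound
#
# With the values above the sizes run 100, 100.1, 100.2001, ... capped at 1000.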
[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = false
eps = 1e-8
learn_rate = 0.001
# These settings are used when nlp.initialize() is called (typically before
# training or pretraining). Components and the tokenizer can each define their
# own arguments via their initialize methods that are populated by the config.
# This lets them gather data resources, build label sets etc.
[initialize]
vectors = ${paths.vectors}
# Extra resources for transfer-learning or pseudo-rehearsal
init_tok2vec = ${paths.init_tok2vec}
# Data and lookups for vocabulary
vocab_data = null
lookups = null
# Arguments passed to the tokenizer's initialize method
tokenizer = {}
# Arguments for initialize methods of the components (keyed by component)
components = {}
before_init = null
after_init = null
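# Note (sketch): values in a config like this can be overridden without editing
# the file, using dot notation that mirrors the section names. For training, pass
# overrides on the command line; at runtime, a loaded pipeline's config can be
# overridden via spacy.load(..., config=...):
#
#     python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy
#     nlp = spacy.load("en_core_web_sm", config={"nlp.batch_size": 500})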
[paths]
raw_text = null
[pretraining]
max_epochs = 1000
dropout = 0.2
n_save_every = null
n_save_epoch = null
component = "tok2vec"
layer = ""
corpus = "corpora.pretrain"
[pretraining.batcher]
@batchers = "spacy.batch_by_words.v1"
size = 3000
discard_oversize = false
tolerance = 0.2
get_length = null
[pretraining.objective]
@architectures = "spacy.PretrainCharacters.v1"
maxout_pieces = 3
hidden_size = 300
n_characters = 4
[pretraining.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = true
eps = 1e-8
learn_rate = 0.001
[corpora]
[corpora.pretrain]
@readers = "spacy.JsonlCorpus.v1"
path = ${paths.raw_text}
min_length = 5
max_length = 500
limit = 0
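# Note (sketch): a pretraining config like this is typically run via the CLI, with
# paths.raw_text pointing at a JSONL file of raw texts (one {"text": "..."} object
# per line), e.g.:
#
#     python -m spacy pretrain config.cfg ./pretrain_output --paths.raw_text ./texts.jsonl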
import warnings
class ErrorsWithCodes(type):
def __getattribute__(self, code):
msg = super().__getattribute__(code)
if code.startswith("__"): # python system attributes like __class__
return msg
else:
return "[{code}] {msg}".format(code=code, msg=msg)
def setup_default_warnings():
# ignore certain numpy warnings
filter_warning("ignore", error_msg="numpy.dtype size changed") # noqa
filter_warning("ignore", error_msg="numpy.ufunc size changed") # noqa
# warn about entity_ruler & matcher having no patterns only once
for pipe in ["matcher", "entity_ruler"]:
filter_warning("once", error_msg=Warnings.W036.format(name=pipe))
# warn once about lemmatizer without required POS
filter_warning("once", error_msg=Warnings.W108)
# floret vector table cannot be modified
filter_warning("once", error_msg="[W114]")
def filter_warning(action: str, error_msg: str):
"""Customize how spaCy should handle a certain warning.
error_msg (str): e.g. "W006", or a full error message
action (str): "default", "error", "ignore", "always", "module" or "once"
"""
warnings.filterwarnings(action, message=_escape_warning_msg(error_msg))
def _escape_warning_msg(msg):
"""To filter with warnings.filterwarnings, the [] brackets need to be escaped"""
return msg.replace("[", "\\[").replace("]", "\\]")
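# Example (sketch): escaping is needed because warnings.filterwarnings treats
# `message` as a regex matched against the start of the warning text, and spaCy
# codes are wrapped in literal brackets:
#
#     filter_warning("ignore", error_msg="[W108]")
#     warnings.warn("[W108] example message")   # now silently ignored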
# fmt: off
class Warnings(metaclass=ErrorsWithCodes):
W005 = ("Doc object not parsed. This means displaCy won't be able to "
"generate a dependency visualization for it. Make sure the Doc "
"was processed with a model that supports dependency parsing, and "
"not just a language class like `English()`. For more info, see "
"the docs:\nhttps://spacy.io/usage/models")
W006 = ("No entities to visualize found in Doc object. If this is "
"surprising to you, make sure the Doc was processed using a model "
"that supports named entity recognition, and check the `doc.ents` "
"property manually if necessary.")
W007 = ("The model you're using has no word vectors loaded, so the result "
"of the {obj}.similarity method will be based on the tagger, "
"parser and NER, which may not give useful similarity judgements. "
"This may happen if you're using one of the small models, e.g. "
"`en_core_web_sm`, which don't ship with word vectors and only "
"use context-sensitive tensors. You can always add your own word "
"vectors, or use one of the larger models instead if available.")
W008 = ("Evaluating {obj}.similarity based on empty vectors.")
W011 = ("It looks like you're calling displacy.serve from within a "
"Jupyter notebook or a similar environment. This likely means "
"you're already running a local web server, so there's no need to "
"make displaCy start another one. Instead, you should be able to "
"replace displacy.serve with displacy.render to show the "
"visualization.")
W012 = ("A Doc object you're adding to the PhraseMatcher for pattern "
"'{key}' is parsed and/or tagged, but to match on '{attr}', you "
"don't actually need this information. This means that creating "
"the patterns is potentially much slower, because all pipeline "
"components are applied. To only create tokenized Doc objects, "
"try using `nlp.make_doc(text)` or process all texts as a stream "
"using `list(nlp.tokenizer.pipe(all_texts))`.")
W017 = ("Alias '{alias}' already exists in the Knowledge Base.")
W018 = ("Entity '{entity}' already exists in the Knowledge Base - "
"ignoring the duplicate entry.")
W021 = ("Unexpected hash collision in PhraseMatcher. Matches may be "
"incorrect. Modify PhraseMatcher._terminal_hash to fix.")
W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
"the Knowledge Base.")
W026 = ("Unable to set all sentence boundaries from dependency parses. If "
"you are constructing a parse tree incrementally by setting "
"token.head values, you can probably ignore this warning. Consider "
"using Doc(words, ..., heads=heads, deps=deps) instead.")
W027 = ("Found a large training file of {size} bytes. Note that it may "
"be more efficient to split your training data into multiple "
"smaller JSON files instead.")
W028 = ("Doc.from_array was called with a vector of type '{type}', "
"but is expecting one of type uint64 instead. This may result "
"in problems with the vocab further on in the pipeline.")
W030 = ("Some entities could not be aligned in the text \"{text}\" with "
"entities \"{entities}\". Use "
"`spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)`"
" to check the alignment. Misaligned entities ('-') will be "
"ignored during training.")
W033 = ("Training a new {model} using a model with no lexeme normalization "
"table. This may degrade the performance of the model to some "
"degree. If this is intentional or the language you're using "
"doesn't have a normalization table, please ignore this warning. "
"If this is surprising, make sure you have the spacy-lookups-data "
"package installed and load the table in your config. The "
"languages with lexeme normalization tables are currently: "
"{langs}\n\nLoad the table in your config with:\n\n"
"[initialize.lookups]\n"
"@misc = \"spacy.LookupsDataLoader.v1\"\n"
"lang = ${{nlp.lang}}\n"
"tables = [\"lexeme_norm\"]\n")
W035 = ("Discarding subpattern '{pattern}' due to an unrecognized "
"attribute or operator.")
W036 = ("The component '{name}' does not have any patterns defined.")
# New warnings added in v3.x
W086 = ("Component '{listener}' will be (re)trained, but it needs the component "
"'{name}' which is frozen. If you want to prevent retraining '{name}' "
"but want to train '{listener}' on top of it, you should add '{name}' to the "
"list of 'annotating_components' in the 'training' block in the config. "
"See the documentation for details: "
"https://spacy.io/usage/training#annotating-components")
W087 = ("Component '{name}' will be (re)trained, but the component '{listener}' "
"depends on it via a listener and is frozen. This means that the "
"performance of '{listener}' will be degraded. You can either freeze "
"both, or neither of the two. If you're sourcing the component from "
"an existing pipeline, you can use the `replace_listeners` setting in "
"the config block to replace its token-to-vector listener with a copy "
"and make it independent. For example, `replace_listeners = "
"[\"model.tok2vec\"]` See the documentation for details: "
"https://spacy.io/usage/training#config-components-listeners")
W088 = ("The pipeline component {name} implements a `begin_training` "
"method, which won't be called by spaCy. As of v3.0, `begin_training` "
"has been renamed to `initialize`, so you likely want to rename the "
"component method. See the documentation for details: "
"https://spacy.io/api/language#initialize")
W089 = ("As of spaCy v3.0, the `nlp.begin_training` method has been renamed "
"to `nlp.initialize`.")
W090 = ("Could not locate any {format} files in path '{path}'.")
W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.")
W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.")
W093 = ("Could not find any data to train the {name} on. Is your "
"input data correctly formatted?")
W094 = ("Model '{model}' ({model_version}) specifies an under-constrained "
"spaCy version requirement: {version}. This can lead to compatibility "
"problems with older versions, or as new spaCy versions are "
"released, because the model may say it's compatible when it's "
'not. Consider changing the "spacy_version" in your meta.json to a '
"version range, with a lower and upper pin. For example: {example}")
W095 = ("Model '{model}' ({model_version}) was trained with spaCy "
"{version} and may not be 100% compatible with the current version "
"({current}). If you see errors or degraded performance, download "
"a newer compatible model or retrain your custom model with the "
"current spaCy version. For more details and available updates, "
"run: python -m spacy validate")
W096 = ("The method `nlp.disable_pipes` is now deprecated - use "
"`nlp.select_pipes` instead.")
W100 = ("Skipping unsupported morphological feature(s): '{feature}'. "
"Provide features as a dict {{\"Field1\": \"Value1,Value2\"}} or "
"string \"Field1=Value1,Value2|Field2=Value3\".")
W101 = ("Skipping Doc custom extension '{name}' while merging docs.")
W102 = ("Skipping unsupported user data '{key}: {value}' while merging docs.")
W103 = ("Unknown {lang} word segmenter '{segmenter}'. Supported "
"word segmenters: {supported}. Defaulting to {default}.")
W104 = ("Skipping modifications for '{target}' segmenter. The current "
"segmenter is '{current}'.")
W105 = ("As of spaCy v3.0, the `{matcher}.pipe` method is deprecated. If you "
"need to match on a stream of documents, you can use `nlp.pipe` and "
"call the {matcher} on each Doc object.")
W107 = ("The property `Doc.{prop}` is deprecated. Use "
"`Doc.has_annotation(\"{attr}\")` instead.")
W108 = ("The rule-based lemmatizer did not find POS annotation for one or "
"more tokens. Check that your pipeline includes components that "
"assign token.pos, typically 'tagger'+'attribute_ruler' or "
"'morphologizer'.")
W109 = ("Unable to save user hooks while serializing the doc. Re-add any "
"required user hooks to the doc after processing.")
W110 = ("The DependencyMatcher token pattern {pattern} matched a span "
"{tokens} that is 2+ tokens long. Only the first token in the span "
"will be included in the results. For better results, token "
"patterns should return matches that are each exactly one token "
"long.")
W111 = ("Jupyter notebook detected: if using `prefer_gpu()` or "
"`require_gpu()`, include it in the same cell right before "
"`spacy.load()` to ensure that the model is loaded on the correct "
"device. More information: "
"http://spacy.io/usage/v3#jupyter-notebook-gpu")
W112 = ("The model specified to use for initial vectors ({name}) has no "
"vectors. This is almost certainly a mistake.")
W113 = ("Sourced component '{name}' may not work as expected: source "
"vectors are not identical to current pipeline vectors.")
W114 = ("Using multiprocessing with GPU models is not recommended and may "
"lead to errors.")
W115 = ("Skipping {method}: the floret vector table cannot be modified. "
"Vectors are calculated from character ngrams.")
class Errors(metaclass=ErrorsWithCodes):
E001 = ("No component '{name}' found in pipeline. Available names: {opts}")
E002 = ("Can't find factory for '{name}' for language {lang} ({lang_code}). "
"This usually happens when spaCy calls `nlp.{method}` with a custom "
"component name that's not registered on the current language class. "
"If you're using a Transformer, make sure to install 'spacy-transformers'. "
"If you're using a custom component, make sure you've added the "
"decorator `@Language.component` (for function components) or "
"`@Language.factory` (for class components).\n\nAvailable "
"factories: {opts}")
E003 = ("Not a valid pipeline component. Expected callable, but "
"got {component} (name: '{name}'). If you're using a custom "
"component factory, double-check that it correctly returns your "
"initialized component.")
E004 = ("Can't set up pipeline component: a factory for '{name}' already "
"exists. Existing factory: {func}. New factory: {new_func}")
E005 = ("Pipeline component '{name}' returned None. If you're using a "
"custom component, maybe you forgot to return the processed Doc?")
E006 = ("Invalid constraints for adding pipeline component. You can only "
"set one of the following: before (component name or index), "
"after (component name or index), first (True) or last (True). "
"Invalid configuration: {args}. Existing components: {opts}")
E007 = ("'{name}' already exists in pipeline. Existing names: {opts}")
E008 = ("Can't restore disabled pipeline component '{name}' because it "
"doesn't exist in the pipeline anymore. If you want to remove "
"components from the pipeline, you should do it before calling "
"`nlp.select_pipes` or after restoring the disabled components.")
E010 = ("Word vectors set to length 0. This may be because you don't have "
"a model installed or loaded, or because your model doesn't "
"include word vectors. For more info, see the docs:\n"
"https://spacy.io/usage/models")
E011 = ("Unknown operator: '{op}'. Options: {opts}")
E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}")
E016 = ("MultitaskObjective target should be function or one of: dep, "
"tag, ent, dep_tag_offset, ent_tag.")
E017 = ("Can only add unicode or bytes. Got type: {value_type}")
E018 = ("Can't retrieve string for hash '{hash_value}'. This usually "
"refers to an issue with the `Vocab` or `StringStore`.")
E019 = ("Can't create transition with unknown action ID: {action}. Action "
"IDs are enumerated in spacy/syntax/{src}.pyx.")
E022 = ("Could not find a transition with the name '{name}' in the NER "
"model.")
E024 = ("Could not find an optimal move to supervise the parser. Usually, "
"this means that the model can't be updated in a way that's valid "
"and satisfies the correct annotations specified in the GoldParse. "
"For example, are all labels added to the model? If you're "
"training a named entity recognizer, also make sure that none of "
"your annotated entity spans have leading or trailing whitespace "
"or punctuation. You can also use the `debug data` command to "
"validate your JSON-formatted training data. For details, run:\n"
"python -m spacy debug data --help")
E025 = ("String is too long: {length} characters. Max is 2**30.")
E026 = ("Error accessing token at position {i}: out of bounds in Doc of "
"length {length}.")
E027 = ("Arguments `words` and `spaces` should be sequences of the same "
"length, or `spaces` should be left default at None. `spaces` "
"should be a sequence of booleans, with True meaning that the "
"word owns a ' ' character following it.")
E028 = ("`words` expects a list of unicode strings, but got bytes instance: {value}")
E029 = ("`noun_chunks` requires the dependency parse, which requires a "
"statistical model to be installed and loaded. For more info, see "
"the documentation:\nhttps://spacy.io/usage/models")
E030 = ("Sentence boundaries unset. You can add the 'sentencizer' "
"component to the pipeline with: `nlp.add_pipe('sentencizer')`. "
"Alternatively, add the dependency parser or sentence recognizer, "
"or set sentence boundaries by setting `doc[i].is_sent_start`.")
E031 = ("Invalid token: empty string ('') at position {i}.")
E033 = ("Cannot load into non-empty Doc of length {length}.")
E035 = ("Error creating span with start {start} and end {end} for Doc of "
"length {length}.")
E036 = ("Error calculating span: Can't find a token starting at character "
"offset {start}.")
E037 = ("Error calculating span: Can't find a token ending at character "
"offset {end}.")
E039 = ("Array bounds exceeded while searching for root word. This likely "
"means the parse tree is in an invalid state. Please report this "
"issue here: http://github.com/explosion/spaCy/issues")
E040 = ("Attempt to access token at {i}, max length {max_length}.")
E041 = ("Invalid comparison operator: {op}. Likely a Cython bug?")
E042 = ("Error accessing `doc[{i}].nbor({j})`, for doc of length {length}.")
E043 = ("Refusing to write to token.sent_start if its document is parsed, "
"because this may cause inconsistent state.")
E044 = ("Invalid value for token.sent_start: {value}. Must be one of: "
"None, True, False")
E045 = ("Possibly infinite loop encountered while looking for {attr}.")
E046 = ("Can't retrieve unregistered extension attribute '{name}'. Did "
"you forget to call the `set_extension` method?")
E047 = ("Can't assign a value to unregistered extension attribute "
"'{name}'. Did you forget to call the `set_extension` method?")
E048 = ("Can't import language {lang} or any matching language from spacy.lang: {err}")
E050 = ("Can't find model '{name}'. It doesn't seem to be a Python "
"package or a valid path to a data directory.")
E052 = ("Can't find model directory: {path}")
E053 = ("Could not read {name} from {path}")
E054 = ("No valid '{setting}' setting found in model meta.json.")
E055 = ("Invalid ORTH value in exception:\nKey: {key}\nOrths: {orths}")
E056 = ("Invalid tokenizer exception: ORTH values combined don't match "
"original string.\nKey: {key}\nOrths: {orths}")
E057 = ("Stepped slices not supported in Span objects. Try: "
"`list(tokens)[start:stop:step]` instead.")
E058 = ("Could not retrieve vector for key {key}.")
E059 = ("One (and only one) keyword arg must be set. Got: {kwargs}")
E060 = ("Cannot add new key to vectors: the table is full. Current shape: "
"({rows}, {cols}).")
E062 = ("Cannot find empty bit for new lexical flag. All bits between 0 "
"and 63 are occupied. You can replace one by specifying the "
"`flag_id` explicitly, e.g. "
"`nlp.vocab.add_flag(your_func, flag_id=IS_ALPHA`.")
E063 = ("Invalid value for `flag_id`: {value}. Flag IDs must be between 1 "
"and 63 (inclusive).")
E064 = ("Error fetching a Lexeme from the Vocab. When looking up a "
"string, the lexeme returned had an orth ID that did not match "
"the query string. This means that the cached lexeme structs are "
"mismatched to the string encoding table. The mismatched:\n"
"Query string: {string}\nOrth cached: {orth}\nOrth ID: {orth_id}")
E065 = ("Only one of the vector table's width and shape can be specified. "
"Got width {width} and shape {shape}.")
E067 = ("Invalid BILUO tag sequence: Got a tag starting with {start} "
"without a preceding 'B' (beginning of an entity). "
"Tag sequence:\n{tags}")
E068 = ("Invalid BILUO tag: '{tag}'.")
E071 = ("Error creating lexeme: specified orth ID ({orth}) does not "
"match the one in the vocab ({vocab_orth}).")
E073 = ("Cannot assign vector of length {new_length}. Existing vectors "
"are of length {length}. You can use `vocab.reset_vectors` to "
"clear the existing vectors and resize the table.")
E074 = ("Error interpreting compiled match pattern: patterns are expected "
"to end with the attribute {attr}. Got: {bad_attr}.")
E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), "
"projective heads ({n_proj_heads}) and labels ({n_labels}) do not "
"match.")
E083 = ("Error setting extension: only one of `default`, `method`, or "
"`getter` (plus optional `setter`) is allowed. Got: {nr_defined}")
E084 = ("Error assigning label ID {label} to span: not in StringStore.")
E085 = ("Can't create lexeme for string '{string}'.")
E087 = ("Unknown displaCy style: {style}.")
E088 = ("Text of length {length} exceeds maximum of {max_length}. The "
"parser and NER models require roughly 1GB of temporary "
"memory per 100,000 characters in the input. This means long "
"texts may cause memory allocation errors. If you're not using "
"the parser or NER, it's probably safe to increase the "
"`nlp.max_length` limit. The limit is in number of characters, so "
"you can check whether your inputs are too long by checking "
"`len(text)`.")
E089 = ("Extensions can't have a setter argument without a getter "
"argument. Check the keyword arguments on `set_extension`.")
E090 = ("Extension '{name}' already exists on {obj}. To overwrite the "
"existing extension, set `force=True` on `{obj}.set_extension`.")
E091 = ("Invalid extension attribute {name}: expected callable or None, "
"but got: {value}")
E093 = ("token.ent_iob values make invalid sequence: I without B\n{seq}")
E094 = ("Error reading line {line_num} in vectors file {loc}.")
E095 = ("Can't write to frozen dictionary. This is likely an internal "
"error. Are you writing to a default function argument?")
E096 = ("Invalid object passed to displaCy: Can only visualize `Doc` or "
"Span objects, or dicts if set to `manual=True`.")
E097 = ("Invalid pattern: expected token pattern (list of dicts) or "
"phrase pattern (string) but got:\n{pattern}")
E098 = ("Invalid pattern: expected both RIGHT_ID and RIGHT_ATTRS.")
E099 = ("Invalid pattern: the first node of pattern should be an anchor "
"node. The node should only contain RIGHT_ID and RIGHT_ATTRS.")
E100 = ("Nodes other than the anchor node should all contain {required}, "
"but these are missing: {missing}")
E101 = ("RIGHT_ID should be a new node and LEFT_ID should already have "
"have been declared in previous edges.")
E102 = ("Can't merge non-disjoint spans. '{token}' is already part of "
"tokens to merge. If you want to find the longest non-overlapping "
"spans, you can use the util.filter_spans helper:\n"
"https://spacy.io/api/top-level#util.filter_spans")
E103 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A "
"token can only be part of one entity, so make sure the entities "
"you're setting don't overlap. To work with overlapping entities, "
"consider using doc.spans instead.")
E106 = ("Can't find `doc._.{attr}` attribute specified in the underscore "
"settings: {opts}")
E107 = ("Value of `doc._.{attr}` is not JSON-serializable: {value}")
E109 = ("Component '{name}' could not be run. Did you forget to "
"call `initialize()`?")
E110 = ("Invalid displaCy render wrapper. Expected callable, got: {obj}")
E111 = ("Pickling a token is not supported, because tokens are only views "
"of the parent Doc and can't exist on their own. A pickled token "
"would always have to include its Doc and Vocab, which has "
"practically no advantage over pickling the parent Doc directly. "
"So instead of pickling the token, pickle the Doc it belongs to.")
E112 = ("Pickling a span is not supported, because spans are only views "
"of the parent Doc and can't exist on their own. A pickled span "
"would always have to include its Doc and Vocab, which has "
"practically no advantage over pickling the parent Doc directly. "
"So instead of pickling the span, pickle the Doc it belongs to or "
"use Span.as_doc to convert the span to a standalone Doc object.")
E115 = ("All subtokens must have associated heads.")
E117 = ("The newly split tokens must match the text of the original token. "
"New orths: {new}. Old text: {old}.")
E118 = ("The custom extension attribute '{attr}' is not registered on the "
"`Token` object so it can't be set during retokenization. To "
"register an attribute, use the `Token.set_extension` classmethod.")
E119 = ("Can't set custom extension attribute '{attr}' during "
"retokenization because it's not writable. This usually means it "
"was registered with a getter function (and no setter) or as a "
"method extension, so the value is computed dynamically. To "
"overwrite a custom attribute manually, it should be registered "
"with a default value or with a getter AND setter.")
E120 = ("Can't set custom extension attributes during retokenization. "
"Expected dict mapping attribute names to values, but got: {value}")
E121 = ("Can't bulk merge spans. Attribute length {attr_len} should be "
"equal to span length ({span_len}).")
E122 = ("Cannot find token to be split. Did it get merged?")
E123 = ("Cannot find head of token to be split. Did it get merged?")
E125 = ("Unexpected value: {value}")
E126 = ("Unexpected matcher predicate: '{bad}'. Expected one of: {good}. "
"This is likely a bug in spaCy, so feel free to open an issue.")
E130 = ("You are running a narrow unicode build, which is incompatible "
"with spacy >= 2.1.0. To fix this, reinstall Python and use a wide "
"unicode build instead. You can also rebuild Python and set the "
"`--enable-unicode=ucs4 flag`.")
E132 = ("The vectors for entities and probabilities for alias '{alias}' "
"should have equal length, but found {entities_length} and "
"{probabilities_length} respectively.")
E133 = ("The sum of prior probabilities for alias '{alias}' should not "
"exceed 1, but found {sum}.")
E134 = ("Entity '{entity}' is not defined in the Knowledge Base.")
E139 = ("Knowledge base for component '{name}' is empty. Use the methods "
"`kb.add_entity` and `kb.add_alias` to add entries.")
E140 = ("The list of entities, prior probabilities and entity vectors "
"should be of equal length.")
E141 = ("Entity vectors should be of length {required} instead of the "
"provided {found}.")
E143 = ("Labels for component '{name}' not initialized. This can be fixed "
"by calling add_label, or by providing a representative batch of "
"examples to the component's `initialize` method.")
E145 = ("Error reading `{param}` from input file.")
E146 = ("Could not access {path}.")
E147 = ("Unexpected error in the {method} functionality of the "
"EntityLinker: {msg}. This is likely a bug in spaCy, so feel free "
"to open an issue: https://github.com/explosion/spaCy/issues")
E148 = ("Expected {ents} KB identifiers but got {ids}. Make sure that "
"each entity in `doc.ents` is assigned to a KB identifier.")
E149 = ("Error deserializing model. Check that the config used to create "
"the component matches the model being loaded.")
E150 = ("The language of the `nlp` object and the `vocab` should be the "
"same, but found '{nlp}' and '{vocab}' respectively.")
E152 = ("The attribute {attr} is not supported for token patterns. "
"Please use the option `validate=True` with the Matcher, PhraseMatcher, "
"or EntityRuler for more details.")
E153 = ("The value type {vtype} is not supported for token patterns. "
"Please use the option validate=True with Matcher, PhraseMatcher, "
"or EntityRuler for more details.")
E154 = ("One of the attributes or values is not supported for token "
"patterns. Please use the option `validate=True` with the Matcher, "
"PhraseMatcher, or EntityRuler for more details.")
E155 = ("The pipeline needs to include a {pipe} in order to use "
"Matcher or PhraseMatcher with the attribute {attr}. "
"Try using `nlp()` instead of `nlp.make_doc()` or `list(nlp.pipe())` "
"instead of `list(nlp.tokenizer.pipe())`.")
E157 = ("Can't render negative values for dependency arc start or end. "
"Make sure that you're passing in absolute token indices, not "
"relative token offsets.\nstart: {start}, end: {end}, label: "
"{label}, direction: {dir}")
E158 = ("Can't add table '{name}' to lookups because it already exists.")
E159 = ("Can't find table '{name}' in lookups. Available tables: {tables}")
E160 = ("Can't find language data file: {path}")
E161 = ("Found an internal inconsistency when predicting entity links. "
"This is likely a bug in spaCy, so feel free to open an issue: "
"https://github.com/explosion/spaCy/issues")
E163 = ("cumsum was found to be unstable: its last element does not "
"correspond to sum")
E164 = ("x is neither increasing nor decreasing: {x}.")
E165 = ("Only one class present in the gold labels: {label}. "
"ROC AUC score is not defined in that case.")
E166 = ("Can only merge DocBins with the same value for '{param}'.\n"
"Current DocBin: {current}\nOther DocBin: {other}")
E169 = ("Can't find module: {module}")
E170 = ("Cannot apply transition {name}: invalid for the current state.")
E171 = ("Matcher.add received invalid 'on_match' callback argument: expected "
"callable or None, but got: {arg_type}")
E175 = ("Can't remove rule for unknown match pattern ID: {key}")
E176 = ("Alias '{alias}' is not defined in the Knowledge Base.")
E177 = ("Ill-formed IOB input detected: {tag}")
E178 = ("Each pattern should be a list of dicts, but got: {pat}. Maybe you "
"accidentally passed a single pattern to Matcher.add instead of a "
"list of patterns? If you only want to add one pattern, make sure "
"to wrap it in a list. For example: `matcher.add('{key}', [pattern])`")
E179 = ("Invalid pattern. Expected a list of Doc objects but got a single "
"Doc. If you only want to add one pattern, make sure to wrap it "
"in a list. For example: `matcher.add('{key}', [doc])`")
E180 = ("Span attributes can't be declared as required or assigned by "
"components, since spans are only views of the Doc. Use Doc and "
"Token attributes (or custom extension attributes) only and remove "
"the following: {attrs}")
E181 = ("Received invalid attributes for unkown object {obj}: {attrs}. "
"Only Doc and Token attributes are supported.")
E182 = ("Received invalid attribute declaration: {attr}\nDid you forget "
"to define the attribute? For example: `{attr}.???`")
E183 = ("Received invalid attribute declaration: {attr}\nOnly top-level "
"attributes are supported, for example: {solution}")
E184 = ("Only attributes without underscores are supported in component "
"attribute declarations (because underscore and non-underscore "
"attributes are connected anyways): {attr} -> {solution}")
E185 = ("Received invalid attribute in component attribute declaration: "
"`{obj}.{attr}`\nAttribute '{attr}' does not exist on {obj}.")
E187 = ("Only unicode strings are supported as labels.")
E189 = ("Each argument to `Doc.__init__` should be of equal length.")
E190 = ("Token head out of range in `Doc.from_array()` for token index "
"'{index}' with value '{value}' (equivalent to relative head "
"index: '{rel_head_index}'). The head indices should be relative "
"to the current token index rather than absolute indices in the "
"array.")
E191 = ("Invalid head: the head token must be from the same doc as the "
"token itself.")
E192 = ("Unable to resize vectors in place with cupy.")
E193 = ("Unable to resize vectors in place if the resized vector dimension "
"({new_dim}) is not the same as the current vector dimension "
"({curr_dim}).")
E194 = ("Unable to aligned mismatched text '{text}' and words '{words}'.")
E195 = ("Matcher can be called on {good} only, got {got}.")
E196 = ("Refusing to write to `token.is_sent_end`. Sentence boundaries can "
"only be fixed with `token.is_sent_start`.")
E197 = ("Row out of bounds, unable to add row {row} for key {key}.")
E198 = ("Unable to return {n} most similar vectors for the current vectors "
"table, which contains {n_rows} vectors.")
E199 = ("Unable to merge 0-length span at `doc[{start}:{end}]`.")
E200 = ("Can't yet set {attr} from Span. Vote for this feature on the "
"issue tracker: http://github.com/explosion/spaCy/issues")
E202 = ("Unsupported {name} mode '{mode}'. Supported modes: {modes}.")
# New errors added in v3.x
E858 = ("The {mode} vector table does not support this operation. "
"{alternative}")
E859 = ("The floret vector table cannot be modified.")
E860 = ("Can't truncate fasttext-bloom vectors.")
E861 = ("No 'keys' should be provided when initializing floret vectors "
"with 'minn' and 'maxn'.")
E862 = ("'hash_count' must be between 1-4 for floret vectors.")
E863 = ("'maxn' must be greater than or equal to 'minn'.")
E864 = ("The complete vector table 'data' is required to initialize floret "
"vectors.")
E865 = ("A SpanGroup is not functional after the corresponding Doc has "
"been garbage collected. To keep using the spans, make sure that "
"the corresponding Doc object is still available in the scope of "
"your function.")
E866 = ("Expected a string or 'Doc' as input, but got: {type}.")
E867 = ("The 'textcat' component requires at least two labels because it "
"uses mutually exclusive classes where exactly one label is True "
"for each doc. For binary classification tasks, you can use two "
"labels with 'textcat' (LABEL / NOT_LABEL) or alternatively, you "
"can use the 'textcat_multilabel' component with one label.")
E868 = ("Found a conflicting gold annotation in a reference document, "
"with the following char-based span occurring both in the gold ents "
"as well as in the negative spans: {span}.")
E869 = ("The notation '{label}' is not supported anymore. To annotate "
"negative NER samples, use `doc.spans[key]` instead, and "
"specify the key as 'incorrect_spans_key' when constructing "
"the NER component.")
E870 = ("Could not serialize the DocBin because it is too large. Consider "
"splitting up your documents into several doc bins and serializing "
"each separately. spacy.Corpus.v1 will search recursively for all "
"*.spacy files if you provide a directory instead of a filename as "
"the 'path'.")
E871 = ("Error encountered in nlp.pipe with multiprocessing:\n\n{error}")
E872 = ("Unable to copy tokenizer from base model due to different "
'tokenizer settings: current tokenizer config "{curr_config}" '
'vs. base model "{base_config}"')
E873 = ("Unable to merge a span from doc.spans with key '{key}' and text "
"'{text}'. This is likely a bug in spaCy, so feel free to open an "
"issue: https://github.com/explosion/spaCy/issues")
E874 = ("Could not initialize the tok2vec model from component "
"'{component}' and layer '{layer}'.")
E875 = ("To use the PretrainVectors objective, make sure that static vectors are loaded. "
"In the config, these are defined by the initialize.vectors setting.")
E879 = ("Unexpected type for 'spans' data. Provide a dictionary mapping keys to "
"a list of spans, with each span represented by a tuple (start_char, end_char). "
"The tuple can be optionally extended with a label and a KB ID.")
E880 = ("The 'wandb' library could not be found - did you install it? "
"Alternatively, specify the 'ConsoleLogger' in the 'training.logger' "
"config section, instead of the 'WandbLogger'.")
E884 = ("The pipeline could not be initialized because the vectors "
"could not be found at '{vectors}'. If your pipeline was already "
"initialized/trained before, call 'resume_training' instead of 'initialize', "
"or initialize only the components that are new.")
E885 = ("entity_linker.set_kb received an invalid 'kb_loader' argument: expected "
"a callable function, but got: {arg_type}")
E886 = ("Can't replace {name} -> {tok2vec} listeners: path '{path}' not "
"found in config for component '{name}'.")
E887 = ("Can't replace {name} -> {tok2vec} listeners: the paths to replace "
"({paths}) don't match the available listeners in the model ({n_listeners}).")
E888 = ("Can't replace listeners for '{name}' ({pipe}): invalid upstream "
"component that doesn't seem to support listeners. Expected Tok2Vec "
"or Transformer component. If you didn't call nlp.replace_listeners "
"manually, this is likely a bug in spaCy.")
E889 = ("Can't replace '{tok2vec}' listeners of component '{name}' because "
"'{unknown}' is not in the pipeline. Available components: {opts}. "
"If you didn't call nlp.replace_listeners manually, this is likely "
"a bug in spaCy.")
E890 = ("Cannot add the alias '{alias}' to the Knowledge base. "
"Each alias should be a meaningful string.")
E891 = ("Alias '{alias}' could not be added to the Knowledge base. "
"This is likely a bug in spaCy.")
E892 = ("Unknown function registry: '{name}'.\n\nAvailable names: {available}")
E893 = ("Could not find function '{name}' in function registry '{reg_name}'. "
"If you're using a custom function, make sure the code is available. "
"If the function is provided by a third-party package, e.g. "
"spacy-transformers, make sure the package is installed in your "
"environment.\n\nAvailable names: {available}")
E894 = ("The 'noun_chunks' syntax iterator is not implemented for language "
"'{lang}'.")
E895 = ("The 'textcat' component received gold-standard annotations with "
"multiple labels per document. In spaCy 3 you should use the "
"'textcat_multilabel' component for this instead. "
"Example of an offending annotation: {value}")
E896 = ("There was an error using the static vectors. Ensure that the vectors "
"of the vocab are properly initialized, or set 'include_static_vectors' "
"to False.")
E897 = ("Field '{field}' should be a dot-notation string referring to the "
"relevant section in the config, but found type {type} instead.")
E898 = ("Can't serialize trainable pipe '{name}': the `model` attribute "
"is not set or None. If you've implemented a custom component, make "
"sure to store the component model as `self.model` in your "
"component's __init__ method.")
E899 = ("Can't serialize trainable pipe '{name}': the `vocab` attribute "
"is not set or None. If you've implemented a custom component, make "
"sure to store the current `nlp` object's vocab as `self.vocab` in "
"your component's __init__ method.")
E900 = ("Could not run the full pipeline for evaluation. If you specified "
"frozen components, make sure they were already initialized and "
"trained. Full pipeline: {pipeline}")
E901 = ("Failed to remove existing output directory: {path}. If your "
"config and the components you train change between runs, a "
"non-empty output directory can lead to stale pipeline data. To "
"solve this, remove the existing directories in the output directory.")
E902 = ("The sentence-per-line IOB/IOB2 file is not formatted correctly. "
"Try checking whitespace and delimiters. See "
"https://spacy.io/api/cli#convert")
E903 = ("The token-per-line NER file is not formatted correctly. Try checking "
"whitespace and delimiters. See https://spacy.io/api/cli#convert")
E904 = ("Cannot initialize StaticVectors layer: nO dimension unset. This "
"dimension refers to the output width, after the linear projection "
"has been applied.")
E905 = ("Cannot initialize StaticVectors layer: nM dimension unset. This "
"dimension refers to the width of the vectors table.")
E906 = ("Unexpected `loss` value in pretraining objective: '{found}'. Supported values "
"are: {supported}")
E908 = ("Can't set `spaces` without `words` in `Doc.__init__`.")
E909 = ("Expected {name} in parser internals. This is likely a bug in spaCy.")
E910 = ("Encountered NaN value when computing loss for component '{name}'.")
E911 = ("Invalid feature: {feat}. Must be a token attribute.")
E912 = ("Failed to initialize lemmatizer. Missing lemmatizer table(s) found "
"for mode '{mode}'. Required tables: {tables}. Found: {found}.")
E913 = ("Corpus path can't be None. Maybe you forgot to define it in your "
"config.cfg or override it on the CLI?")
E914 = ("Executing {name} callback failed. Expected the function to "
"return the nlp object but got: {value}. Maybe you forgot to return "
"the modified object in your function?")
E915 = ("Can't use score '{name}' to calculate final weighted score. Expected "
"float or int but got: {score_type}. To exclude the score from the "
"final score, set its weight to null in the [training.score_weights] "
"section of your training config.")
E916 = ("Can't log score for '{name}' in table: not a valid score ({score_type})")
E917 = ("Received invalid value {value} for `state_type` in "
"TransitionBasedParser: only 'parser' or 'ner' are valid options.")
E918 = ("Received invalid value for vocab: {vocab} ({vocab_type}). Valid "
"values are an instance of `spacy.vocab.Vocab` or True to create one"
" (default).")
E919 = ("A textcat `positive_label` '{pos_label}' was provided for training "
"data that does not appear to be a binary classification problem "
"with two labels. Labels found: {labels}")
E920 = ("The textcat's `positive_label` setting '{pos_label}' "
"does not match any label in the training data or provided during "
"initialization. Available labels: {labels}")
E921 = ("The method `set_output` can only be called on components that have "
"a Model with a `resize_output` attribute. Otherwise, the output "
"layer can not be dynamically changed.")
E922 = ("Component '{name}' has been initialized with an output dimension of "
"{nO} - cannot add any more labels.")
E923 = ("It looks like there is no proper sample data to initialize the "
"Model of component '{name}'. To check your input data paths and "
"annotation, run: python -m spacy debug data config.cfg "
"and include the same config override values you would specify "
"for the 'spacy train' command.")
E924 = ("The '{name}' component does not seem to be initialized properly. "
"This is likely a bug in spaCy, so feel free to open an issue: "
"https://github.com/explosion/spaCy/issues")
E925 = ("Invalid color values for displaCy visualizer: expected dictionary "
"mapping label names to colors but got: {obj}")
E926 = ("It looks like you're trying to modify `nlp.{attr}` directly. This "
"doesn't work because it's an immutable computed property. If you "
"need to modify the pipeline, use the built-in methods like "
"`nlp.add_pipe`, `nlp.remove_pipe`, `nlp.disable_pipe` or "
"`nlp.enable_pipe` instead.")
E927 = ("Can't write to frozen list Maybe you're trying to modify a computed "
"property or default function argument?")
E928 = ("A KnowledgeBase can only be serialized to/from from a directory, "
"but the provided argument {loc} points to a file.")
E929 = ("Couldn't read KnowledgeBase from {loc}. The path does not seem to exist.")
E930 = ("Received invalid get_examples callback in `{method}`. "
"Expected function that returns an iterable of Example objects but "
"got: {obj}")
E931 = ("Encountered {parent} subclass without `{parent}.{method}` "
"method in component '{name}'. If you want to use this "
"method, make sure it's overwritten on the subclass.")
E940 = ("Found NaN values in scores.")
E941 = ("Can't find model '{name}'. It looks like you're trying to load a "
"model from a shortcut, which is obsolete as of spaCy v3.0. To "
"load the model, use its full name instead:\n\n"
"nlp = spacy.load(\"{full}\")\n\nFor more details on the available "
"models, see the models directory: https://spacy.io/models. If you "
"want to create a blank model, use spacy.blank: "
"nlp = spacy.blank(\"{name}\")")
E942 = ("Executing `after_{name}` callback failed. Expected the function to "
"return an initialized nlp object but got: {value}. Maybe "
"you forgot to return the modified object in your function?")
E943 = ("Executing `before_creation` callback failed. Expected the function to "
"return an uninitialized Language subclass but got: {value}. Maybe "
"you forgot to return the modified object in your function or "
"returned the initialized nlp object instead?")
E944 = ("Can't copy pipeline component '{name}' from source '{model}': "
"not found in pipeline. Available components: {opts}")
E945 = ("Can't copy pipeline component '{name}' from source. Expected "
"loaded nlp object, but got: {source}")
E947 = ("`Matcher.add` received invalid `greedy` argument: expected "
"a string value from {expected} but got: '{arg}'")
E948 = ("`Matcher.add` received invalid 'patterns' argument: expected "
"a list, but got: {arg_type}")
E949 = ("Unable to align tokens for the predicted and reference docs. It "
"is only possible to align the docs when both texts are the same "
"except for whitespace and capitalization. The predicted tokens "
"start with: {x}. The reference tokens start with: {y}.")
E952 = ("The section '{name}' is not a valid section in the provided config.")
E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}")
E954 = ("The Tok2Vec listener did not receive any valid input from an upstream "
"component.")
E955 = ("Can't find table(s) {table} for language '{lang}' in "
"spacy-lookups-data. Make sure you have the package installed or "
"provide your own lookup tables if no default lookups are available "
"for your language.")
E956 = ("Can't find component '{name}' in [components] block in the config. "
"Available components: {opts}")
E957 = ("Writing directly to `Language.factories` isn't needed anymore in "
"spaCy v3. Instead, you can use the `@Language.factory` decorator "
"to register your custom component factory or `@Language.component` "
"to register a simple stateless function component that just takes "
"a Doc and returns it.")
E958 = ("Language code defined in config ({bad_lang_code}) does not match "
"language code of current Language subclass {lang} ({lang_code}). "
"If you want to create an nlp object from a config, make sure to "
"use the matching subclass with the language-specific settings and "
"data.")
E959 = ("Can't insert component {dir} index {idx}. Existing components: {opts}")
E960 = ("No config data found for component '{name}'. This is likely a bug "
"in spaCy.")
E961 = ("Found non-serializable Python object in config. Configs should "
"only include values that can be serialized to JSON. If you need "
"to pass models or other objects to your component, use a reference "
"to a registered function or initialize the object in your "
"component.\n\n{config}")
E962 = ("Received incorrect {style} for pipe '{name}'. Expected dict, "
"got: {cfg_type}.")
E963 = ("Can't read component info from `@Language.{decorator}` decorator. "
"Maybe you forgot to call it? Make sure you're using "
"`@Language.{decorator}()` instead of `@Language.{decorator}`.")
E964 = ("The pipeline component factory for '{name}' needs to have the "
"following named arguments, which are passed in by spaCy:\n- nlp: "
"receives the current nlp object and lets you access the vocab\n- "
"name: the name of the component instance, can be used to identify "
"the component, output losses etc.")
E965 = ("It looks like you're using the `@Language.component` decorator to "
"register '{name}' on a class instead of a function component. If "
"you need to register a class or function that *returns* a component "
"function, use the `@Language.factory` decorator instead.")
E966 = ("`nlp.add_pipe` now takes the string name of the registered component "
"factory, not a callable component. Expected string, but got "
"{component} (name: '{name}').\n\n- If you created your component "
"with `nlp.create_pipe('name')`: remove nlp.create_pipe and call "
"`nlp.add_pipe('name')` instead.\n\n- If you passed in a component "
"like `TextCategorizer()`: call `nlp.add_pipe` with the string name "
"instead, e.g. `nlp.add_pipe('textcat')`.\n\n- If you're using a custom "
"component: Add the decorator `@Language.component` (for function "
"components) or `@Language.factory` (for class components / factories) "
"to your custom component and assign it a name, e.g. "
"`@Language.component('your_name')`. You can then run "
"`nlp.add_pipe('your_name')` to add it to the pipeline.")
E967 = ("No {meta} meta information found for '{name}'. This is likely a bug in spaCy.")
E968 = ("`nlp.replace_pipe` now takes the string name of the registered component "
"factory, not a callable component. Expected string, but got "
"{component}.\n\n- If you created your component with"
"with `nlp.create_pipe('name')`: remove `nlp.create_pipe` and call "
"`nlp.replace_pipe('{name}', 'name')` instead.\n\n- If you passed in a "
"component like `TextCategorizer()`: call `nlp.replace_pipe` with the "
"string name instead, e.g. `nlp.replace_pipe('{name}', 'textcat')`.\n\n"
"- If you're using a custom component: Add the decorator "
"`@Language.component` (for function components) or `@Language.factory` "
"(for class components / factories) to your custom component and "
"assign it a name, e.g. `@Language.component('your_name')`. You can "
"then run `nlp.replace_pipe('{name}', 'your_name')`.")
E969 = ("Expected string values for field '{field}', but received {types} instead. ")
E970 = ("Can not execute command '{str_command}'. Do you have '{tool}' installed?")
E971 = ("Found incompatible lengths in `Doc.from_array`: {array_length} for the "
"array and {doc_length} for the Doc itself.")
E972 = ("`Example.__init__` got None for '{arg}'. Requires Doc.")
E973 = ("Unexpected type for NER data")
E974 = ("Unknown {obj} attribute: {key}")
E976 = ("The method `Example.from_dict` expects a {type} as {n} argument, "
"but received None.")
E977 = ("Can not compare a MorphAnalysis with a string object. "
"This is likely a bug in spaCy, so feel free to open an issue: "
"https://github.com/explosion/spaCy/issues")
E978 = ("The {name} method takes a list of Example objects, but got: {types}")
E980 = ("Each link annotation should refer to a dictionary with at most one "
"identifier mapping to 1.0, and all others to 0.0.")
E981 = ("The offsets of the annotations for `links` could not be aligned "
"to token boundaries.")
E982 = ("The `Token.ent_iob` attribute should be an integer indexing "
"into {values}, but found {value}.")
E983 = ("Invalid key(s) for '{dict}': {key}. Available keys: "
"{keys}")
E984 = ("Invalid component config for '{name}': component block needs either "
"a key `factory` specifying the registered function used to "
"initialize the component, or a key `source` key specifying a "
"spaCy model to copy the component from. For example, `factory = "
"\"ner\"` will use the 'ner' factory and all other settings in the "
"block will be passed to it as arguments. Alternatively, `source = "
"\"en_core_web_sm\"` will copy the component from that model.\n\n{config}")
E985 = ("Can't load model from config file: no [nlp] section found.\n\n{config}")
E986 = ("Could not create any training batches: check your input. "
"Are the train and dev paths defined? Is `discard_oversize` set appropriately? ")
E989 = ("`nlp.update()` was called with two positional arguments. This "
"may be due to a backwards-incompatible change to the format "
"of the training data in spaCy 3.0 onwards. The 'update' "
"function should now be called with a batch of Example "
"objects, instead of `(text, annotation)` tuples. ")
E991 = ("The function `nlp.select_pipes` should be called with either a "
"`disable` argument to list the names of the pipe components "
"that should be disabled, or with an 'enable' argument that "
"specifies which pipes should not be disabled.")
E992 = ("The function `select_pipes` was called with `enable`={enable} "
"and `disable`={disable} but that information is conflicting "
"for the `nlp` pipeline with components {names}.")
E993 = ("The config for the nlp object needs to include a key `lang` specifying "
"the code of the language to initialize it with (for example "
"'en' for English) - this can't be None.\n\n{config}")
E997 = ("Tokenizer special cases are not allowed to modify the text. "
"This would map '{chunk}' to '{orth}' given token attributes "
"'{token_attrs}'.")
E999 = ("Unable to merge the Doc objects because they do not all share "
"the same `Vocab`.")
E1000 = ("The Chinese word segmenter is pkuseg but no pkuseg model was "
"loaded. Provide the name of a pretrained model or the path to "
"a model and initialize the pipeline:\n\n"
'nlp.tokenizer.initialize(pkuseg_model="default")')
E1001 = ("Target token outside of matched span for match with tokens "
"'{span}' and offset '{index}' matched by patterns '{patterns}'.")
E1002 = ("Span index out of range.")
E1003 = ("Unsupported lemmatizer mode '{mode}'.")
E1004 = ("Missing lemmatizer table(s) found for lemmatizer mode '{mode}'. "
"Required tables: {tables}. Found: {found}. Maybe you forgot to "
"call `nlp.initialize()` to load in the data?")
E1005 = ("Unable to set attribute '{attr}' in tokenizer exception for "
"'{chunk}'. Tokenizer exceptions are only allowed to specify "
"ORTH and NORM.")
E1007 = ("Unsupported DependencyMatcher operator '{op}'.")
E1008 = ("Invalid pattern: each pattern should be a list of dicts. Check "
"that you are providing a list of patterns as `List[List[dict]]`.")
E1010 = ("Unable to set entity information for token {i} which is included "
"in more than one span in entities, blocked, missing or outside.")
E1011 = ("Unsupported default '{default}' in `doc.set_ents`. Available "
"options: {modes}")
E1012 = ("Entity spans and blocked/missing/outside spans should be "
"provided to `doc.set_ents` as lists of Span objects.")
E1013 = ("Invalid morph: the MorphAnalysis must have the same vocab as the "
"token itself. To set the morph from this MorphAnalysis, set from "
"the string value with: `token.set_morph(str(other_morph))`.")
E1014 = ("Error loading DocBin data. It doesn't look like the data is in "
"DocBin (.spacy) format. If your data is in spaCy v2's JSON "
"training format, convert it using `python -m spacy convert "
"file.json .`.")
E1015 = ("Can't initialize model from config: no {value} found. For more "
"information, run: python -m spacy debug config config.cfg")
E1016 = ("The operators 'OP': '?', '*', and '+' are not supported in "
"DependencyMatcher token patterns. The token pattern in "
"RIGHT_ATTR should return matches that are each exactly one token "
"long. Invalid pattern:\n{node}")
E1017 = ("A Doc object requires both 'deps' and 'heads' for dependency "
"parses. If no dependency labels are available, provide "
"placeholder deps such as `deps=[\"dep\"]*len(heads)`.")
E1018 = ("Knowledge base for component '{name}' is not set. "
"Make sure either `nel.initialize` or `nel.set_kb` "
"is called with a `kb_loader` function.")
E1019 = ("`noun_chunks` requires the pos tagging, which requires a "
"statistical model to be installed and loaded. For more info, see "
"the documentation:\nhttps://spacy.io/usage/models")
E1020 = ("No `epoch_resume` value specified and could not infer one from "
"filename. Specify an epoch to resume from.")
E1021 = ("`pos` value \"{pp}\" is not a valid Universal Dependencies tag. "
"Non-UD tags should use the `tag` property.")
E1022 = ("Words must be of type str or int, but input is of type '{wtype}'")
# Deprecated model shortcuts, only used in errors and warnings
OLD_MODEL_SHORTCUTS = {
"en": "en_core_web_sm", "de": "de_core_news_sm", "es": "es_core_news_sm",
"pt": "pt_core_news_sm", "fr": "fr_core_news_sm", "it": "it_core_news_sm",
"nl": "nl_core_news_sm", "el": "el_core_news_sm", "nb": "nb_core_news_sm",
"lt": "lt_core_news_sm", "xx": "xx_ent_wiki_sm"
}
# fmt: on
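# A compact sketch of the registration pattern that E957, E964-E966 and E968
# above describe. The component name "your_name" is a placeholder; shown as a
# comment because importing Language in this module would be circular.
#
#     from spacy.language import Language
#
#     @Language.component("your_name")
#     def your_component(doc):
#         # stateless function component: receives a Doc and returns it
#         return doc
#
#     nlp.add_pipe("your_name")  # add by registered string name, not a callable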
class MatchPatternError(ValueError):
def __init__(self, key, errors):
"""Custom error for validating match patterns.
key (str): The name of the matcher rule.
errors (dict): Validation errors (sequence of strings) mapped to pattern
ID, i.e. the index of the added pattern.
"""
msg = f"Invalid token patterns for matcher rule '{key}'\n"
for pattern_idx, error_msgs in errors.items():
pattern_errors = "\n".join([f"- {e}" for e in error_msgs])
msg += f"\nPattern {pattern_idx}:\n{pattern_errors}\n"
ValueError.__init__(self, msg)
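# Usage sketch for MatchPatternError; the `errors` mapping is a made-up example
# of what pattern validation might report for pattern index 0:
#
#     errors = {0: ["Additional properties are not allowed ('FOO' was unexpected)"]}
#     raise MatchPatternError("BAD_RULE", errors)
#
# which produces a message like:
#
#     Invalid token patterns for matcher rule 'BAD_RULE'
#
#     Pattern 0:
#     - Additional properties are not allowed ('FOO' was unexpected)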
# THIS FILE IS GENERATED FROM SPACY SETUP.PY
#
GIT_VERSION = "0fc3dee77"
def explain(term):
"""Get a description for a given POS tag, dependency label or entity type.
term (str): The term to explain.
RETURNS (str): The explanation, or `None` if not found in the glossary.
EXAMPLE:
>>> spacy.explain(u'NORP')
>>> doc = nlp(u'Hello world')
>>> print([(w.text, w.tag_, spacy.explain(w.tag_)) for w in doc])
"""
if term in GLOSSARY:
return GLOSSARY[term]
GLOSSARY = {
# POS tags
# Universal POS Tags
# http://universaldependencies.org/u/pos/
"ADJ": "adjective",
"ADP": "adposition",
"ADV": "adverb",
"AUX": "auxiliary",
"CONJ": "conjunction",
"CCONJ": "coordinating conjunction",
"DET": "determiner",
"INTJ": "interjection",
"NOUN": "noun",
"NUM": "numeral",
"PART": "particle",
"PRON": "pronoun",
"PROPN": "proper noun",
"PUNCT": "punctuation",
"SCONJ": "subordinating conjunction",
"SYM": "symbol",
"VERB": "verb",
"X": "other",
"EOL": "end of line",
"SPACE": "space",
# POS tags (English)
# OntoNotes 5 / Penn Treebank
# https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
".": "punctuation mark, sentence closer",
",": "punctuation mark, comma",
"-LRB-": "left round bracket",
"-RRB-": "right round bracket",
"``": "opening quotation mark",
'""': "closing quotation mark",
"''": "closing quotation mark",
":": "punctuation mark, colon or ellipsis",
"$": "symbol, currency",
"#": "symbol, number sign",
"AFX": "affix",
"CC": "conjunction, coordinating",
"CD": "cardinal number",
"DT": "determiner",
"EX": "existential there",
"FW": "foreign word",
"HYPH": "punctuation mark, hyphen",
"IN": "conjunction, subordinating or preposition",
"JJ": "adjective (English), other noun-modifier (Chinese)",
"JJR": "adjective, comparative",
"JJS": "adjective, superlative",
"LS": "list item marker",
"MD": "verb, modal auxiliary",
"NIL": "missing tag",
"NN": "noun, singular or mass",
"NNP": "noun, proper singular",
"NNPS": "noun, proper plural",
"NNS": "noun, plural",
"PDT": "predeterminer",
"POS": "possessive ending",
"PRP": "pronoun, personal",
"PRP$": "pronoun, possessive",
"RB": "adverb",
"RBR": "adverb, comparative",
"RBS": "adverb, superlative",
"RP": "adverb, particle",
"TO": 'infinitival "to"',
"UH": "interjection",
"VB": "verb, base form",
"VBD": "verb, past tense",
"VBG": "verb, gerund or present participle",
"VBN": "verb, past participle",
"VBP": "verb, non-3rd person singular present",
"VBZ": "verb, 3rd person singular present",
"WDT": "wh-determiner",
"WP": "wh-pronoun, personal",
"WP$": "wh-pronoun, possessive",
"WRB": "wh-adverb",
"SP": "space (English), sentence-final particle (Chinese)",
"ADD": "email",
"NFP": "superfluous punctuation",
"GW": "additional word in multi-word expression",
"XX": "unknown",
"BES": 'auxiliary "be"',
"HVS": 'forms of "have"',
"_SP": "whitespace",
# POS Tags (German)
# TIGER Treebank
# http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/tiger_introduction.pdf
"$(": "other sentence-internal punctuation mark",
"$,": "comma",
"$.": "sentence-final punctuation mark",
"ADJA": "adjective, attributive",
"ADJD": "adjective, adverbial or predicative",
"APPO": "postposition",
"APPR": "preposition; circumposition left",
"APPRART": "preposition with article",
"APZR": "circumposition right",
"ART": "definite or indefinite article",
"CARD": "cardinal number",
"FM": "foreign language material",
"ITJ": "interjection",
"KOKOM": "comparative conjunction",
"KON": "coordinate conjunction",
"KOUI": 'subordinate conjunction with "zu" and infinitive',
"KOUS": "subordinate conjunction with sentence",
"NE": "proper noun",
"NNE": "proper noun",
"PAV": "pronominal adverb",
"PROAV": "pronominal adverb",
"PDAT": "attributive demonstrative pronoun",
"PDS": "substituting demonstrative pronoun",
"PIAT": "attributive indefinite pronoun without determiner",
"PIDAT": "attributive indefinite pronoun with determiner",
"PIS": "substituting indefinite pronoun",
"PPER": "non-reflexive personal pronoun",
"PPOSAT": "attributive possessive pronoun",
"PPOSS": "substituting possessive pronoun",
"PRELAT": "attributive relative pronoun",
"PRELS": "substituting relative pronoun",
"PRF": "reflexive personal pronoun",
"PTKA": "particle with adjective or adverb",
"PTKANT": "answer particle",
"PTKNEG": "negative particle",
"PTKVZ": "separable verbal particle",
"PTKZU": '"zu" before infinitive',
"PWAT": "attributive interrogative pronoun",
"PWAV": "adverbial interrogative or relative pronoun",
"PWS": "substituting interrogative pronoun",
"TRUNC": "word remnant",
"VAFIN": "finite verb, auxiliary",
"VAIMP": "imperative, auxiliary",
"VAINF": "infinitive, auxiliary",
"VAPP": "perfect participle, auxiliary",
"VMFIN": "finite verb, modal",
"VMINF": "infinitive, modal",
"VMPP": "perfect participle, modal",
"VVFIN": "finite verb, full",
"VVIMP": "imperative, full",
"VVINF": "infinitive, full",
"VVIZU": 'infinitive with "zu", full',
"VVPP": "perfect participle, full",
"XY": "non-word containing non-letter",
# POS Tags (Chinese)
# OntoNotes / Chinese Penn Treebank
# https://repository.upenn.edu/cgi/viewcontent.cgi?article=1039&context=ircs_reports
"AD": "adverb",
"AS": "aspect marker",
"BA": "把 in ba-construction",
# "CD": "cardinal number",
"CS": "subordinating conjunction",
"DEC": "的 in a relative clause",
"DEG": "associative 的",
"DER": "得 in V-de const. and V-de-R",
"DEV": "地 before VP",
"ETC": "for words 等, 等等",
# "FW": "foreign words"
"IJ": "interjection",
# "JJ": "other noun-modifier",
"LB": "被 in long bei-const",
"LC": "localizer",
"M": "measure word",
"MSP": "other particle",
# "NN": "common noun",
"NR": "proper noun",
"NT": "temporal noun",
"OD": "ordinal number",
"ON": "onomatopoeia",
"P": "preposition excluding 把 and 被",
"PN": "pronoun",
"PU": "punctuation",
"SB": "被 in short bei-const",
# "SP": "sentence-final particle",
"VA": "predicative adjective",
"VC": "是 (copula)",
"VE": "有 as the main verb",
"VV": "other verb",
# Noun chunks
"NP": "noun phrase",
"PP": "prepositional phrase",
"VP": "verb phrase",
"ADVP": "adverb phrase",
"ADJP": "adjective phrase",
"SBAR": "subordinating conjunction",
"PRT": "particle",
"PNP": "prepositional noun phrase",
# Dependency Labels (English)
# ClearNLP / Universal Dependencies
# https://github.com/clir/clearnlp-guidelines/blob/master/md/specifications/dependency_labels.md
"acl": "clausal modifier of noun (adjectival clause)",
"acomp": "adjectival complement",
"advcl": "adverbial clause modifier",
"advmod": "adverbial modifier",
"agent": "agent",
"amod": "adjectival modifier",
"appos": "appositional modifier",
"attr": "attribute",
"aux": "auxiliary",
"auxpass": "auxiliary (passive)",
"case": "case marking",
"cc": "coordinating conjunction",
"ccomp": "clausal complement",
"clf": "classifier",
"complm": "complementizer",
"compound": "compound",
"conj": "conjunct",
"cop": "copula",
"csubj": "clausal subject",
"csubjpass": "clausal subject (passive)",
"dative": "dative",
"dep": "unclassified dependent",
"det": "determiner",
"discourse": "discourse element",
"dislocated": "dislocated elements",
"dobj": "direct object",
"expl": "expletive",
"fixed": "fixed multiword expression",
"flat": "flat multiword expression",
"goeswith": "goes with",
"hmod": "modifier in hyphenation",
"hyph": "hyphen",
"infmod": "infinitival modifier",
"intj": "interjection",
"iobj": "indirect object",
"list": "list",
"mark": "marker",
"meta": "meta modifier",
"neg": "negation modifier",
"nmod": "modifier of nominal",
"nn": "noun compound modifier",
"npadvmod": "noun phrase as adverbial modifier",
"nsubj": "nominal subject",
"nsubjpass": "nominal subject (passive)",
"nounmod": "modifier of nominal",
"npmod": "noun phrase as adverbial modifier",
"num": "number modifier",
"number": "number compound modifier",
"nummod": "numeric modifier",
"oprd": "object predicate",
"obj": "object",
"obl": "oblique nominal",
"orphan": "orphan",
"parataxis": "parataxis",
"partmod": "participal modifier",
"pcomp": "complement of preposition",
"pobj": "object of preposition",
"poss": "possession modifier",
"possessive": "possessive modifier",
"preconj": "pre-correlative conjunction",
"prep": "prepositional modifier",
"prt": "particle",
"punct": "punctuation",
"quantmod": "modifier of quantifier",
"rcmod": "relative clause modifier",
"relcl": "relative clause modifier",
"reparandum": "overridden disfluency",
"root": "root",
"vocative": "vocative",
"xcomp": "open clausal complement",
# Dependency labels (German)
# TIGER Treebank
# http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/tiger_introduction.pdf
# currently missing: 'cc' (comparative complement) because of conflict
# with English labels
"ac": "adpositional case marker",
"adc": "adjective component",
"ag": "genitive attribute",
"ams": "measure argument of adjective",
"app": "apposition",
"avc": "adverbial phrase component",
"cd": "coordinating conjunction",
"cj": "conjunct",
"cm": "comparative conjunction",
"cp": "complementizer",
"cvc": "collocational verb construction",
"da": "dative",
"dh": "discourse-level head",
"dm": "discourse marker",
"ep": "expletive es",
"hd": "head",
"ju": "junctor",
"mnr": "postnominal modifier",
"mo": "modifier",
"ng": "negation",
"nk": "noun kernel element",
"nmc": "numerical component",
"oa": "accusative object",
"oc": "clausal object",
"og": "genitive object",
"op": "prepositional object",
"par": "parenthetical element",
"pd": "predicate",
"pg": "phrasal genitive",
"ph": "placeholder",
"pm": "morphological particle",
"pnc": "proper noun component",
"rc": "relative clause",
"re": "repeated element",
"rs": "reported speech",
"sb": "subject",
"sb": "subject",
"sbp": "passivized subject (PP)",
"sp": "subject or predicate",
"svp": "separable verb prefix",
"uc": "unit component",
"vo": "vocative",
# Named Entity Recognition
# OntoNotes 5
# https://catalog.ldc.upenn.edu/docs/LDC2013T19/OntoNotes-Release-5.0.pdf
"PERSON": "People, including fictional",
"NORP": "Nationalities or religious or political groups",
"FACILITY": "Buildings, airports, highways, bridges, etc.",
"FAC": "Buildings, airports, highways, bridges, etc.",
"ORG": "Companies, agencies, institutions, etc.",
"GPE": "Countries, cities, states",
"LOC": "Non-GPE locations, mountain ranges, bodies of water",
"PRODUCT": "Objects, vehicles, foods, etc. (not services)",
"EVENT": "Named hurricanes, battles, wars, sports events, etc.",
"WORK_OF_ART": "Titles of books, songs, etc.",
"LAW": "Named documents made into laws.",
"LANGUAGE": "Any named language",
"DATE": "Absolute or relative dates or periods",
"TIME": "Times smaller than a day",
"PERCENT": 'Percentage, including "%"',
"MONEY": "Monetary values, including unit",
"QUANTITY": "Measurements, as of weight or distance",
"ORDINAL": '"first", "second", etc.',
"CARDINAL": "Numerals that do not fall under another type",
# Named Entity Recognition
# Wikipedia
# http://www.sciencedirect.com/science/article/pii/S0004370212000276
# https://pdfs.semanticscholar.org/5744/578cc243d92287f47448870bb426c66cc941.pdf
"PER": "Named person or family.",
"MISC": "Miscellaneous entities, e.g. events, nationalities, products or works of art",
# https://github.com/ltgoslo/norne
"EVT": "Festivals, cultural events, sports events, weather phenomena, wars, etc.",
"PROD": "Product, i.e. artificially produced entities including speeches, radio shows, programming languages, contracts, laws and ideas",
"DRV": "Words (and phrases?) that are dervied from a name, but not a name in themselves, e.g. 'Oslo-mannen' ('the man from Oslo')",
"GPE_LOC": "Geo-political entity, with a locative sense, e.g. 'John lives in Spain'",
"GPE_ORG": "Geo-political entity, with an organisation sense, e.g. 'Spain declined to meet with Belgium'",
}
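# Usage sketch for explain() with the glossary above (return values are taken
# directly from GLOSSARY; unknown terms fall through and return None):
#
#     >>> explain("NORP")
#     'Nationalities or religious or political groups'
#     >>> explain("amod")
#     'adjectival modifier'
#     >>> explain("not-a-term") is None
#     True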
"""Knowledge-base for entity or concept linking."""
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
from libcpp.vector cimport vector
from libc.stdint cimport int32_t, int64_t
from libc.stdio cimport FILE
from .vocab cimport Vocab
from .typedefs cimport hash_t
from .structs cimport KBEntryC, AliasC
ctypedef vector[KBEntryC] entry_vec
ctypedef vector[AliasC] alias_vec
ctypedef vector[float] float_vec
ctypedef vector[float_vec] float_matrix
# Object used by the Entity Linker that summarizes one entity-alias candidate combination.
cdef class Candidate:
cdef readonly KnowledgeBase kb
cdef hash_t entity_hash
cdef float entity_freq
cdef vector[float] entity_vector
cdef hash_t alias_hash
cdef float prior_prob
cdef class KnowledgeBase:
cdef Pool mem
cdef readonly Vocab vocab
cdef int64_t entity_vector_length
# This maps 64bit keys (hash of unique entity string)
# to 64bit values (position of the _KBEntryC struct in the _entries vector).
# The PreshMap is pretty space efficient, as it uses open addressing. So
# the only overhead is the vacancy rate, which is approximately 30%.
cdef PreshMap _entry_index
# Each entry takes 128 bits, and again we'll have a 30% or so overhead for
# over allocation.
# In total we end up with (N*128*1.3)+(N*128*1.3) bits for N entries.
# Storing 1m entries would take 41.6mb under this scheme.
cdef entry_vec _entries
# This maps 64bit keys (hash of unique alias string)
# to 64bit values (position of the _AliasC struct in the _aliases_table vector).
cdef PreshMap _alias_index
# This should map mention hashes to (entry_id, prob) tuples. The probability
# should be P(entity | mention), which is pretty important to know.
# We can pack both pieces of information into a 64-bit value, to keep things
# efficient.
cdef alias_vec _aliases_table
# This is the part which might take more space: storing various
# categorical features for the entries, and storing vectors for disambiguation
# and possibly usage.
# If each entry gets a 300-dimensional vector, for 1m entries we would need
# 1.2gb. That gets expensive fast. What might be better is to avoid learning
# a unique vector for every entity. We could instead have a compositional
# model, that embeds different features of the entities into vectors. We'll
# still want some per-entity features, like the Wikipedia text or entity
# co-occurrence. Hopefully those vectors can be narrow, e.g. 64 dimensions.
cdef float_matrix _vectors_table
# It's very useful to track categorical features, at least for output, even
# if they're not useful in the model itself. For instance, we should be
# able to track stuff like a person's date of birth or whatever. This can
# easily make the KB bigger, but if this isn't needed by the model, and it's
# optional data, we can let users configure a DB as the backend for this.
cdef object _features_table
cdef inline int64_t c_add_vector(self, vector[float] entity_vector) nogil:
"""Add an entity vector to the vectors table."""
cdef int64_t new_index = self._vectors_table.size()
self._vectors_table.push_back(entity_vector)
return new_index
cdef inline int64_t c_add_entity(self, hash_t entity_hash, float freq,
int32_t vector_index, int feats_row) nogil:
"""Add an entry to the vector of entries.
After calling this method, make sure to also update the _entry_index using the return value."""
# This is what we'll map the entity hash key to. It's where the entry will sit
# in the vector of entries, so we can get it later.
cdef int64_t new_index = self._entries.size()
# Avoid struct initializer to enable nogil, cf https://github.com/cython/cython/issues/1642
cdef KBEntryC entry
entry.entity_hash = entity_hash
entry.vector_index = vector_index
entry.feats_row = feats_row
entry.freq = freq
self._entries.push_back(entry)
return new_index
cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs) nogil:
"""Connect a mention to a list of potential entities with their prior probabilities .
After calling this method, make sure to update also the _alias_index using the return value"""
# This is what we'll map the alias hash key to. It's where the alias will be defined
# in the vector of aliases.
cdef int64_t new_index = self._aliases_table.size()
# Avoid struct initializer to enable nogil
cdef AliasC alias
alias.entry_indices = entry_indices
alias.probs = probs
self._aliases_table.push_back(alias)
return new_index
cdef inline void _create_empty_vectors(self, hash_t dummy_hash) nogil:
"""
Initialize the vectors and make sure the first element of each vector is a dummy,
because the PreshMaps pointing to indices in these vectors cannot contain 0 as a value,
cf. https://github.com/explosion/preshed/issues/17
"""
cdef int32_t dummy_value = 0
# Avoid struct initializer to enable nogil
cdef KBEntryC entry
entry.entity_hash = dummy_hash
entry.vector_index = dummy_value
entry.feats_row = dummy_value
entry.freq = dummy_value
# Avoid struct initializer to enable nogil
cdef vector[int64_t] dummy_entry_indices
dummy_entry_indices.push_back(0)
cdef vector[float] dummy_probs
dummy_probs.push_back(0)
cdef AliasC alias
alias.entry_indices = dummy_entry_indices
alias.probs = dummy_probs
self._entries.push_back(entry)
self._aliases_table.push_back(alias)
cpdef set_entities(self, entity_list, freq_list, vector_list)
cdef class Writer:
cdef FILE* _fp
cdef int write_header(self, int64_t nr_entries, int64_t entity_vector_length) except -1
cdef int write_vector_element(self, float element) except -1
cdef int write_entry(self, hash_t entry_hash, float entry_freq, int32_t vector_index) except -1
cdef int write_alias_length(self, int64_t alias_length) except -1
cdef int write_alias_header(self, hash_t alias_hash, int64_t candidate_length) except -1
cdef int write_alias(self, int64_t entry_index, float prob) except -1
cdef int _write(self, void* value, size_t size) except -1
cdef class Reader:
cdef FILE* _fp
cdef int read_header(self, int64_t* nr_entries, int64_t* entity_vector_length) except -1
cdef int read_vector_element(self, float* element) except -1
cdef int read_entry(self, hash_t* entity_hash, float* freq, int32_t* vector_index) except -1
cdef int read_alias_length(self, int64_t* alias_length) except -1
cdef int read_alias_header(self, hash_t* alias_hash, int64_t* candidate_length) except -1
cdef int read_alias(self, int64_t* entry_index, float* prob) except -1
cdef int _read(self, void* value, size_t size) except -1
# cython: infer_types=True, profile=True
from typing import Iterator, Iterable, Callable, Dict, Any
import srsly
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
from cpython.exc cimport PyErr_SetFromErrno
from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek
from libc.stdint cimport int32_t, int64_t
from libcpp.vector cimport vector
from pathlib import Path
import warnings
from .typedefs cimport hash_t
from .errors import Errors, Warnings
from . import util
from .util import SimpleFrozenList, ensure_path
cdef class Candidate:
"""A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking
algorithm which will disambiguate the various candidates to the correct one.
Each candidate (alias, entity) pair is assigned a certain prior probability.
DOCS: https://spacy.io/api/kb/#candidate_init
"""
def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob):
self.kb = kb
self.entity_hash = entity_hash
self.entity_freq = entity_freq
self.entity_vector = entity_vector
self.alias_hash = alias_hash
self.prior_prob = prior_prob
@property
def entity(self):
"""RETURNS (uint64): hash of the entity's KB ID/name"""
return self.entity_hash
@property
def entity_(self):
"""RETURNS (str): ID/name of this entity in the KB"""
return self.kb.vocab.strings[self.entity_hash]
@property
def alias(self):
"""RETURNS (uint64): hash of the alias"""
return self.alias_hash
@property
def alias_(self):
"""RETURNS (str): ID of the original alias"""
return self.kb.vocab.strings[self.alias_hash]
@property
def entity_freq(self):
return self.entity_freq
@property
def entity_vector(self):
return self.entity_vector
@property
def prior_prob(self):
return self.prior_prob
def get_candidates(KnowledgeBase kb, span) -> Iterator[Candidate]:
"""
Return candidate entities for a given span by using the text of the span as the alias
and fetching appropriate entries from the index.
This particular function is optimized to work with the built-in KB functionality,
but any other custom candidate generation method can be used in combination with the KB as well.
"""
return kb.get_alias_candidates(span.text)
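# Minimal end-to-end sketch of this candidate API (the entity ID, frequency,
# vector values and prior probability below are made up for illustration):
#
#     from spacy.kb import KnowledgeBase
#     from spacy.vocab import Vocab
#
#     vocab = Vocab()
#     kb = KnowledgeBase(vocab, entity_vector_length=3)
#     kb.add_entity("Q42", freq=12, entity_vector=[1.0, 0.0, 0.0])
#     kb.add_alias("Douglas", entities=["Q42"], probabilities=[0.8])
#     for candidate in kb.get_alias_candidates("Douglas"):
#         print(candidate.entity_, candidate.alias_, candidate.prior_prob)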
cdef class KnowledgeBase:
"""A `KnowledgeBase` instance stores unique identifiers for entities and their textual aliases,
to support entity linking of named entities to real-world concepts.
DOCS: https://spacy.io/api/kb
"""
def __init__(self, Vocab vocab, entity_vector_length):
"""Create a KnowledgeBase."""
self.mem = Pool()
self.entity_vector_length = entity_vector_length
self._entry_index = PreshMap()
self._alias_index = PreshMap()
self.vocab = vocab
self._create_empty_vectors(dummy_hash=self.vocab.strings[""])
def initialize_entities(self, int64_t nr_entities):
self._entry_index = PreshMap(nr_entities + 1)
self._entries = entry_vec(nr_entities + 1)
def initialize_vectors(self, int64_t nr_entities):
self._vectors_table = float_matrix(nr_entities + 1)
def initialize_aliases(self, int64_t nr_aliases):
self._alias_index = PreshMap(nr_aliases + 1)
self._aliases_table = alias_vec(nr_aliases + 1)
@property
def entity_vector_length(self):
"""RETURNS (uint64): length of the entity vectors"""
return self.entity_vector_length
def __len__(self):
return self.get_size_entities()
def get_size_entities(self):
return len(self._entry_index)
def get_entity_strings(self):
return [self.vocab.strings[x] for x in self._entry_index]
def get_size_aliases(self):
return len(self._alias_index)
def get_alias_strings(self):
return [self.vocab.strings[x] for x in self._alias_index]
def add_entity(self, str entity, float freq, vector[float] entity_vector):
"""
Add an entity to the KB, optionally specifying its log probability based on corpus frequency.
Return the hash of the entity ID/name at the end.
"""
cdef hash_t entity_hash = self.vocab.strings.add(entity)
# Return if this entity was added before
if entity_hash in self._entry_index:
warnings.warn(Warnings.W018.format(entity=entity))
return
# Raise an error if the provided entity vector is not of the correct length
if len(entity_vector) != self.entity_vector_length:
raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length))
vector_index = self.c_add_vector(entity_vector=entity_vector)
new_index = self.c_add_entity(entity_hash=entity_hash,
freq=freq,
vector_index=vector_index,
feats_row=-1) # Features table currently not implemented
self._entry_index[entity_hash] = new_index
return entity_hash
cpdef set_entities(self, entity_list, freq_list, vector_list):
if len(entity_list) != len(freq_list) or len(entity_list) != len(vector_list):
raise ValueError(Errors.E140)
nr_entities = len(set(entity_list))
self.initialize_entities(nr_entities)
self.initialize_vectors(nr_entities)
i = 0
cdef KBEntryC entry
cdef hash_t entity_hash
while i < len(entity_list):
# only process this entity if its unique ID hadn't been added before
entity_hash = self.vocab.strings.add(entity_list[i])
if entity_hash in self._entry_index:
warnings.warn(Warnings.W018.format(entity=entity_list[i]))
else:
entity_vector = vector_list[i]
if len(entity_vector) != self.entity_vector_length:
raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length))
entry.entity_hash = entity_hash
entry.freq = freq_list[i]
self._vectors_table[i] = entity_vector
entry.vector_index = i
entry.feats_row = -1 # Features table currently not implemented
self._entries[i+1] = entry
self._entry_index[entity_hash] = i+1
i += 1
def contains_entity(self, str entity):
cdef hash_t entity_hash = self.vocab.strings.add(entity)
return entity_hash in self._entry_index
def contains_alias(self, str alias):
cdef hash_t alias_hash = self.vocab.strings.add(alias)
return alias_hash in self._alias_index
def add_alias(self, str alias, entities, probabilities):
"""
For a given alias, add its potential entities and prior probabilities to the KB.
Return the alias_hash at the end.
"""
if alias is None or len(alias) == 0:
raise ValueError(Errors.E890.format(alias=alias))
previous_alias_nr = self.get_size_aliases()
# Throw an error if the lengths of entities and probabilities are not the same
if not len(entities) == len(probabilities):
raise ValueError(Errors.E132.format(alias=alias,
entities_length=len(entities),
probabilities_length=len(probabilities)))
# Throw an error if the probabilities sum up to more than 1 (allow for some rounding errors)
prob_sum = sum(probabilities)
if prob_sum > 1.00001:
raise ValueError(Errors.E133.format(alias=alias, sum=prob_sum))
cdef hash_t alias_hash = self.vocab.strings.add(alias)
# Check whether this alias was added before
if alias_hash in self._alias_index:
warnings.warn(Warnings.W017.format(alias=alias))
return
cdef vector[int64_t] entry_indices
cdef vector[float] probs
for entity, prob in zip(entities, probabilities):
entity_hash = self.vocab.strings[entity]
if not entity_hash in self._entry_index:
raise ValueError(Errors.E134.format(entity=entity))
entry_index = <int64_t>self._entry_index.get(entity_hash)
entry_indices.push_back(int(entry_index))
probs.push_back(float(prob))
new_index = self.c_add_aliases(alias_hash=alias_hash, entry_indices=entry_indices, probs=probs)
self._alias_index[alias_hash] = new_index
if previous_alias_nr + 1 != self.get_size_aliases():
raise RuntimeError(Errors.E891.format(alias=alias))
return alias_hash
def append_alias(self, str alias, str entity, float prior_prob, ignore_warnings=False):
"""
For an alias already existing in the KB, extend its potential entities with one more.
Throw a warning if either the alias or the entity is unknown,
or when the combination has already been recorded.
Throw an error if adding this entity with its prior probability would make the total exceed 1.
For efficiency, it's best to use the method `add_alias` as much as possible instead of this one.
"""
# Check if the alias exists in the KB
cdef hash_t alias_hash = self.vocab.strings[alias]
if not alias_hash in self._alias_index:
raise ValueError(Errors.E176.format(alias=alias))
# Check if the entity exists in the KB
cdef hash_t entity_hash = self.vocab.strings[entity]
if not entity_hash in self._entry_index:
raise ValueError(Errors.E134.format(entity=entity))
entry_index = <int64_t>self._entry_index.get(entity_hash)
# Throw an error if the prior probabilities (including the new one) sum up to more than 1
alias_index = <int64_t>self._alias_index.get(alias_hash)
alias_entry = self._aliases_table[alias_index]
current_sum = sum([p for p in alias_entry.probs])
new_sum = current_sum + prior_prob
if new_sum > 1.00001:
raise ValueError(Errors.E133.format(alias=alias, sum=new_sum))
entry_indices = alias_entry.entry_indices
is_present = False
for i in range(entry_indices.size()):
if entry_indices[i] == int(entry_index):
is_present = True
if is_present:
if not ignore_warnings:
warnings.warn(Warnings.W024.format(entity=entity, alias=alias))
else:
entry_indices.push_back(int(entry_index))
alias_entry.entry_indices = entry_indices
probs = alias_entry.probs
probs.push_back(float(prior_prob))
alias_entry.probs = probs
self._aliases_table[alias_index] = alias_entry
def get_alias_candidates(self, str alias) -> Iterator[Candidate]:
"""
Return candidate entities for an alias. Each candidate defines the entity, the original alias,
and the prior probability of that alias resolving to that entity.
If the alias is not known in the KB, an empty list is returned.
"""
cdef hash_t alias_hash = self.vocab.strings[alias]
if not alias_hash in self._alias_index:
return []
alias_index = <int64_t>self._alias_index.get(alias_hash)
alias_entry = self._aliases_table[alias_index]
return [Candidate(kb=self,
entity_hash=self._entries[entry_index].entity_hash,
entity_freq=self._entries[entry_index].freq,
entity_vector=self._vectors_table[self._entries[entry_index].vector_index],
alias_hash=alias_hash,
prior_prob=prior_prob)
for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs)
if entry_index != 0]
def get_vector(self, str entity):
cdef hash_t entity_hash = self.vocab.strings[entity]
# Return a vector of zeros if this entity is unknown in this KB
if entity_hash not in self._entry_index:
return [0] * self.entity_vector_length
entry_index = self._entry_index[entity_hash]
return self._vectors_table[self._entries[entry_index].vector_index]
def get_prior_prob(self, str entity, str alias):
""" Return the prior probability of a given alias being linked to a given entity,
or return 0.0 when this combination is not known in the knowledge base"""
cdef hash_t alias_hash = self.vocab.strings[alias]
cdef hash_t entity_hash = self.vocab.strings[entity]
if entity_hash not in self._entry_index or alias_hash not in self._alias_index:
return 0.0
alias_index = <int64_t>self._alias_index.get(alias_hash)
entry_index = self._entry_index[entity_hash]
alias_entry = self._aliases_table[alias_index]
for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs):
if self._entries[entry_index].entity_hash == entity_hash:
return prior_prob
return 0.0
def to_bytes(self, **kwargs):
"""Serialize the current state to a binary string.
"""
def serialize_header():
header = (self.get_size_entities(), self.get_size_aliases(), self.entity_vector_length)
return srsly.json_dumps(header)
def serialize_entries():
i = 1
tuples = []
for entry_hash, entry_index in sorted(self._entry_index.items(), key=lambda x: x[1]):
entry = self._entries[entry_index]
assert entry.entity_hash == entry_hash
assert entry_index == i
tuples.append((entry.entity_hash, entry.freq, entry.vector_index))
i = i + 1
return srsly.json_dumps(tuples)
def serialize_aliases():
i = 1
headers = []
indices_lists = []
probs_lists = []
for alias_hash, alias_index in sorted(self._alias_index.items(), key=lambda x: x[1]):
alias = self._aliases_table[alias_index]
assert alias_index == i
candidate_length = len(alias.entry_indices)
headers.append((alias_hash, candidate_length))
indices_lists.append(alias.entry_indices)
probs_lists.append(alias.probs)
i = i + 1
headers_dump = srsly.json_dumps(headers)
indices_dump = srsly.json_dumps(indices_lists)
probs_dump = srsly.json_dumps(probs_lists)
return srsly.json_dumps((headers_dump, indices_dump, probs_dump))
serializers = {
"header": serialize_header,
"entity_vectors": lambda: srsly.json_dumps(self._vectors_table),
"entries": serialize_entries,
"aliases": serialize_aliases,
}
return util.to_bytes(serializers, [])
def from_bytes(self, bytes_data, *, exclude=tuple()):
"""Load state from a binary string.
"""
def deserialize_header(b):
header = srsly.json_loads(b)
nr_entities = header[0]
nr_aliases = header[1]
entity_vector_length = header[2]
self.initialize_entities(nr_entities)
self.initialize_vectors(nr_entities)
self.initialize_aliases(nr_aliases)
self.entity_vector_length = entity_vector_length
def deserialize_vectors(b):
self._vectors_table = srsly.json_loads(b)
def deserialize_entries(b):
cdef KBEntryC entry
tuples = srsly.json_loads(b)
i = 1
for (entity_hash, freq, vector_index) in tuples:
entry.entity_hash = entity_hash
entry.freq = freq
entry.vector_index = vector_index
entry.feats_row = -1 # Features table currently not implemented
self._entries[i] = entry
self._entry_index[entity_hash] = i
i += 1
def deserialize_aliases(b):
cdef AliasC alias
i = 1
all_data = srsly.json_loads(b)
headers = srsly.json_loads(all_data[0])
indices = srsly.json_loads(all_data[1])
probs = srsly.json_loads(all_data[2])
for header, entry_indices, prior_probs in zip(headers, indices, probs):
alias_hash, candidate_length = header
alias.entry_indices = entry_indices
alias.probs = prior_probs
self._aliases_table[i] = alias
self._alias_index[alias_hash] = i
i += 1
setters = {
"header": deserialize_header,
"entity_vectors": deserialize_vectors,
"entries": deserialize_entries,
"aliases": deserialize_aliases,
}
util.from_bytes(bytes_data, setters, exclude)
return self
def to_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()):
path = ensure_path(path)
if not path.exists():
path.mkdir(parents=True)
if not path.is_dir():
raise ValueError(Errors.E928.format(loc=path))
serialize = {}
serialize["contents"] = lambda p: self.write_contents(p)
serialize["strings.json"] = lambda p: self.vocab.strings.to_disk(p)
util.to_disk(path, serialize, exclude)
def from_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()):
path = ensure_path(path)
if not path.exists():
raise ValueError(Errors.E929.format(loc=path))
if not path.is_dir():
raise ValueError(Errors.E928.format(loc=path))
deserialize: Dict[str, Callable[[Any], Any]] = {}
deserialize["contents"] = lambda p: self.read_contents(p)
deserialize["strings.json"] = lambda p: self.vocab.strings.from_disk(p)
util.from_disk(path, deserialize, exclude)
def write_contents(self, file_path):
cdef Writer writer = Writer(file_path)
writer.write_header(self.get_size_entities(), self.entity_vector_length)
# dumping the entity vectors in their original order
i = 0
for entity_vector in self._vectors_table:
for element in entity_vector:
writer.write_vector_element(element)
i = i+1
# dumping the entry records in the order in which they are in the _entries vector.
# index 0 is a dummy object not stored in the _entry_index and can be ignored.
i = 1
for entry_hash, entry_index in sorted(self._entry_index.items(), key=lambda x: x[1]):
entry = self._entries[entry_index]
assert entry.entity_hash == entry_hash
assert entry_index == i
writer.write_entry(entry.entity_hash, entry.freq, entry.vector_index)
i = i+1
writer.write_alias_length(self.get_size_aliases())
# dumping the aliases in the order in which they are in the _aliases_table vector.
# index 0 is a dummy object not stored in the _alias_index and can be ignored.
i = 1
for alias_hash, alias_index in sorted(self._alias_index.items(), key=lambda x: x[1]):
alias = self._aliases_table[alias_index]
assert alias_index == i
candidate_length = len(alias.entry_indices)
writer.write_alias_header(alias_hash, candidate_length)
for j in range(0, candidate_length):
writer.write_alias(alias.entry_indices[j], alias.probs[j])
i = i+1
writer.close()
def read_contents(self, file_path):
cdef hash_t entity_hash
cdef hash_t alias_hash
cdef int64_t entry_index
cdef float freq, prob
cdef int32_t vector_index
cdef KBEntryC entry
cdef AliasC alias
cdef float vector_element
cdef Reader reader = Reader(file_path)
# STEP 0: load header and initialize KB
cdef int64_t nr_entities
cdef int64_t entity_vector_length
reader.read_header(&nr_entities, &entity_vector_length)
self.initialize_entities(nr_entities)
self.initialize_vectors(nr_entities)
self.entity_vector_length = entity_vector_length
# STEP 1: load entity vectors
cdef int i = 0
cdef int j = 0
while i < nr_entities:
entity_vector = float_vec(entity_vector_length)
j = 0
while j < entity_vector_length:
reader.read_vector_element(&vector_element)
entity_vector[j] = vector_element
j = j+1
self._vectors_table[i] = entity_vector
i = i+1
# STEP 2: load entities
# we assume that the entity data was written in sequence
# index 0 is a dummy object not stored in the _entry_index and can be ignored.
i = 1
while i <= nr_entities:
reader.read_entry(&entity_hash, &freq, &vector_index)
entry.entity_hash = entity_hash
entry.freq = freq
entry.vector_index = vector_index
entry.feats_row = -1 # Features table currently not implemented
self._entries[i] = entry
self._entry_index[entity_hash] = i
i += 1
# check that all entities were read in properly
assert nr_entities == self.get_size_entities()
# STEP 3: load aliases
cdef int64_t nr_aliases
reader.read_alias_length(&nr_aliases)
self.initialize_aliases(nr_aliases)
cdef int64_t nr_candidates
cdef vector[int64_t] entry_indices
cdef vector[float] probs
i = 1
# we assume the alias data was written in sequence
# index 0 is a dummy object not stored in the _alias_index and can be ignored.
while i <= nr_aliases:
reader.read_alias_header(&alias_hash, &nr_candidates)
entry_indices = vector[int64_t](nr_candidates)
probs = vector[float](nr_candidates)
for j in range(0, nr_candidates):
reader.read_alias(&entry_index, &prob)
entry_indices[j] = entry_index
probs[j] = prob
alias.entry_indices = entry_indices
alias.probs = probs
self._aliases_table[i] = alias
self._alias_index[alias_hash] = i
i += 1
# check that all aliases were read in properly
assert nr_aliases == self.get_size_aliases()
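# Round-trip sketch for the to_disk/from_disk serialization above (the path is
# hypothetical; a fresh KB instance reloads the stored strings, entries and
# aliases, and entity_vector_length is overwritten from the stored header):
#
#     kb.to_disk("/tmp/my_kb")
#     kb2 = KnowledgeBase(Vocab(), entity_vector_length=1)
#     kb2.from_disk("/tmp/my_kb")
#     assert kb2.get_size_entities() == kb.get_size_entities()
#     assert kb2.get_size_aliases() == kb.get_size_aliases()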
cdef class Writer:
def __init__(self, path):
assert isinstance(path, Path)
content = bytes(path)
cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content
self._fp = fopen(<char*>bytes_loc, 'wb')
if not self._fp:
raise IOError(Errors.E146.format(path=path))
fseek(self._fp, 0, 0)
def close(self):
cdef size_t status = fclose(self._fp)
assert status == 0
cdef int write_header(self, int64_t nr_entries, int64_t entity_vector_length) except -1:
self._write(&nr_entries, sizeof(nr_entries))
self._write(&entity_vector_length, sizeof(entity_vector_length))
cdef int write_vector_element(self, float element) except -1:
self._write(&element, sizeof(element))
cdef int write_entry(self, hash_t entry_hash, float entry_freq, int32_t vector_index) except -1:
self._write(&entry_hash, sizeof(entry_hash))
self._write(&entry_freq, sizeof(entry_freq))
self._write(&vector_index, sizeof(vector_index))
# Features table currently not implemented and not written to file
cdef int write_alias_length(self, int64_t alias_length) except -1:
self._write(&alias_length, sizeof(alias_length))
cdef int write_alias_header(self, hash_t alias_hash, int64_t candidate_length) except -1:
self._write(&alias_hash, sizeof(alias_hash))
self._write(&candidate_length, sizeof(candidate_length))
cdef int write_alias(self, int64_t entry_index, float prob) except -1:
self._write(&entry_index, sizeof(entry_index))
self._write(&prob, sizeof(prob))
cdef int _write(self, void* value, size_t size) except -1:
status = fwrite(value, size, 1, self._fp)
assert status == 1, status
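# On-disk layout produced by Writer (via write_contents above) and consumed by
# Reader below: a header with the number of entries and the entity vector
# length, then all vector elements in order, then one (entity_hash, freq,
# vector_index) record per entry, then the number of aliases, and per alias a
# (alias_hash, candidate_length) header followed by candidate_length
# (entry_index, prior_prob) pairs. Values are written as raw C types in native
# byte order via fwrite, so the format is tied to the writing platform.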
cdef class Reader:
def __init__(self, path):
content = bytes(path)
cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content
self._fp = fopen(<char*>bytes_loc, 'rb')
if not self._fp:
PyErr_SetFromErrno(IOError)
status = fseek(self._fp, 0, 0) # this can be 0 if there is no header
def __dealloc__(self):
fclose(self._fp)
cdef int read_header(self, int64_t* nr_entries, int64_t* entity_vector_length) except -1:
status = self._read(nr_entries, sizeof(int64_t))
if status < 1:
if feof(self._fp):
return 0 # end of file
raise IOError(Errors.E145.format(param="header"))
status = self._read(entity_vector_length, sizeof(int64_t))
if status < 1:
if feof(self._fp):
return 0 # end of file
raise IOError(Errors.E145.format(param="vector length"))
cdef int read_vector_element(self, float* element) except -1:
status = self._read(element, sizeof(float))
if status < 1:
if feof(self._fp):
return 0 # end of file
raise IOError(Errors.E145.format(param="vector element"))
cdef int read_entry(self, hash_t* entity_hash, float* freq, int32_t* vector_index) except -1:
status = self._read(entity_hash, sizeof(hash_t))
if status < 1:
if feof(self._fp):
return 0 # end of file
raise IOError(Errors.E145.format(param="entity hash"))
status = self._read(freq, sizeof(float))
if status < 1:
if feof(self._fp):
return 0 # end of file
raise IOError(Errors.E145.format(param="entity freq"))
status = self._read(vector_index, sizeof(int32_t))
if status < 1:
if feof(self._fp):
return 0 # end of file
raise IOError(Errors.E145.format(param="vector index"))
if feof(self._fp):
return 0
else:
return 1
cdef int read_alias_length(self, int64_t* alias_length) except -1:
status = self._read(alias_length, sizeof(int64_t))
if status < 1:
if feof(self._fp):
return 0 # end of file
raise IOError(Errors.E145.format(param="alias length"))
cdef int read_alias_header(self, hash_t* alias_hash, int64_t* candidate_length) except -1:
status = self._read(alias_hash, sizeof(hash_t))
if status < 1:
if feof(self._fp):
return 0 # end of file
raise IOError(Errors.E145.format(param="alias hash"))
status = self._read(candidate_length, sizeof(int64_t))
if status < 1:
if feof(self._fp):
return 0 # end of file
raise IOError(Errors.E145.format(param="candidate length"))
cdef int read_alias(self, int64_t* entry_index, float* prob) except -1:
status = self._read(entry_index, sizeof(int64_t))
if status < 1:
if feof(self._fp):
return 0 # end of file
raise IOError(Errors.E145.format(param="entry index"))
status = self._read(prob, sizeof(float))
if status < 1:
if feof(self._fp):
return 0 # end of file
raise IOError(Errors.E145.format(param="prior probability"))
cdef int _read(self, void* value, size_t size) except -1:
status = fread(value, size, 1, self._fp)
return status
from typing import Iterator, Optional, Any, Dict, Callable, Iterable
from typing import Union, Tuple, List, Set, Pattern, Sequence
from typing import NoReturn, TYPE_CHECKING, TypeVar, cast, overload
from dataclasses import dataclass
import random
import itertools
import functools
from contextlib import contextmanager
from copy import deepcopy
from pathlib import Path
import warnings
from thinc.api import get_current_ops, Config, CupyOps, Optimizer
import srsly
import multiprocessing as mp
from itertools import chain, cycle
from timeit import default_timer as timer
import traceback
from . import ty
from .tokens.underscore import Underscore
from .vocab import Vocab, create_vocab
from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
from .training import Example, validate_examples
from .training.initialize import init_vocab, init_tok2vec
from .scorer import Scorer
from .util import registry, SimpleFrozenList, _pipe, raise_error
from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER
from .util import warn_if_jupyter_cupy
from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .lang.punctuation import TOKENIZER_INFIXES
from .tokens import Doc
from .tokenizer import Tokenizer
from .errors import Errors, Warnings
from .schemas import ConfigSchema, ConfigSchemaNlp, ConfigSchemaInit
from .schemas import ConfigSchemaPretrain, validate_init_settings
from .git_info import GIT_VERSION
from . import util
from . import about
from .lookups import load_lookups
from .compat import Literal
if TYPE_CHECKING:
from .pipeline import Pipe # noqa: F401
# This is the base config with all settings (training etc.)
DEFAULT_CONFIG_PATH = Path(__file__).parent / "default_config.cfg"
DEFAULT_CONFIG = util.load_config(DEFAULT_CONFIG_PATH)
# This is the base config for the [pretraining] block. It's currently not included
# in the main config and is only added via the 'init fill-config' command
DEFAULT_CONFIG_PRETRAIN_PATH = Path(__file__).parent / "default_config_pretraining.cfg"
# Type variable for contexts piped with documents
_AnyContext = TypeVar("_AnyContext")
class BaseDefaults:
"""Language data defaults, available via Language.Defaults. Can be
overwritten by language subclasses by defining their own subclasses of
Language.Defaults.
"""
config: Config = Config(section_order=CONFIG_SECTION_ORDER)
tokenizer_exceptions: Dict[str, List[dict]] = BASE_EXCEPTIONS
prefixes: Optional[Sequence[Union[str, Pattern]]] = TOKENIZER_PREFIXES
suffixes: Optional[Sequence[Union[str, Pattern]]] = TOKENIZER_SUFFIXES
infixes: Optional[Sequence[Union[str, Pattern]]] = TOKENIZER_INFIXES
token_match: Optional[Callable] = None
url_match: Optional[Callable] = URL_MATCH
syntax_iterators: Dict[str, Callable] = {}
lex_attr_getters: Dict[int, Callable[[str], Any]] = {}
stop_words: Set[str] = set()
writing_system = {"direction": "ltr", "has_case": True, "has_letters": True}
@registry.tokenizers("spacy.Tokenizer.v1")
def create_tokenizer() -> Callable[["Language"], Tokenizer]:
"""Registered function to create a tokenizer. Returns a factory that takes
the nlp object and returns a Tokenizer instance using the language defaults.
"""
def tokenizer_factory(nlp: "Language") -> Tokenizer:
prefixes = nlp.Defaults.prefixes
suffixes = nlp.Defaults.suffixes
infixes = nlp.Defaults.infixes
prefix_search = util.compile_prefix_regex(prefixes).search if prefixes else None
suffix_search = util.compile_suffix_regex(suffixes).search if suffixes else None
infix_finditer = util.compile_infix_regex(infixes).finditer if infixes else None
return Tokenizer(
nlp.vocab,
rules=nlp.Defaults.tokenizer_exceptions,
prefix_search=prefix_search,
suffix_search=suffix_search,
infix_finditer=infix_finditer,
token_match=nlp.Defaults.token_match,
url_match=nlp.Defaults.url_match,
)
return tokenizer_factory
@registry.misc("spacy.LookupsDataLoader.v1")
def load_lookups_data(lang, tables):
util.logger.debug(f"Loading lookups from spacy-lookups-data: {tables}")
lookups = load_lookups(lang=lang, tables=tables)
return lookups
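# A registered loader like the one above is typically referenced from the
# config by its registry name; a sketch (the section path and table name are
# illustrative):
#
#     [initialize.lookups]
#     @misc = "spacy.LookupsDataLoader.v1"
#     lang = ${nlp.lang}
#     tables = ["lexeme_norm"]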
class Language:
"""A text-processing pipeline. Usually you'll load this once per process,
and pass the instance around your application.
Defaults (class): Settings, data and factory methods for creating the `nlp`
object and processing pipeline.
lang (str): IETF language code, such as 'en'.
DOCS: https://spacy.io/api/language
"""
Defaults = BaseDefaults
lang: Optional[str] = None
default_config = DEFAULT_CONFIG
factories = SimpleFrozenDict(error=Errors.E957)
_factory_meta: Dict[str, "FactoryMeta"] = {} # meta by factory
def __init__(
self,
vocab: Union[Vocab, bool] = True,
*,
max_length: int = 10 ** 6,
meta: Dict[str, Any] = {},
create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None,
batch_size: int = 1000,
**kwargs,
) -> None:
"""Initialise a Language object.
vocab (Vocab): A `Vocab` object. If `True`, a vocab is created.
meta (dict): Custom meta data for the Language class. Is written to by
models to add model meta data.
max_length (int): Maximum number of characters in a single text. The
current models may run out of memory on extremely long texts, due to
large internal allocations. You should segment these texts into
meaningful units, e.g. paragraphs, subsections etc., before passing
them to spaCy. The default maximum length is 1,000,000 characters (1 MB). As
a rule of thumb, if all pipeline components are enabled, spaCy's
default models currently require roughly 1 GB of temporary memory per
100,000 characters in one text.
create_tokenizer (Callable): Function that takes the nlp object and
returns a tokenizer.
batch_size (int): Default batch size for pipe and evaluate.
DOCS: https://spacy.io/api/language#init
"""
# We're only calling this to import all factories provided via entry
# points. The factory decorator applied to these functions takes care
# of the rest.
util.registry._entry_point_factories.get_all()
self._config = DEFAULT_CONFIG.merge(self.default_config)
self._meta = dict(meta)
self._path = None
self._optimizer: Optional[Optimizer] = None
# Component meta and configs are only needed on the instance
self._pipe_meta: Dict[str, "FactoryMeta"] = {} # meta by component
self._pipe_configs: Dict[str, Config] = {} # config by component
if not isinstance(vocab, Vocab) and vocab is not True:
raise ValueError(Errors.E918.format(vocab=vocab, vocab_type=type(vocab)))
if vocab is True:
vectors_name = meta.get("vectors", {}).get("name")
vocab = create_vocab(self.lang, self.Defaults, vectors_name=vectors_name)
else:
if (self.lang and vocab.lang) and (self.lang != vocab.lang):
raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang))
self.vocab: Vocab = vocab
if self.lang is None:
self.lang = self.vocab.lang
self._components: List[Tuple[str, "Pipe"]] = []
self._disabled: Set[str] = set()
self.max_length = max_length
# Create the default tokenizer from the default config
if not create_tokenizer:
tokenizer_cfg = {"tokenizer": self._config["nlp"]["tokenizer"]}
create_tokenizer = registry.resolve(tokenizer_cfg)["tokenizer"]
self.tokenizer = create_tokenizer(self)
self.batch_size = batch_size
self.default_error_handler = raise_error
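# EXAMPLE (illustrative sketch): constructing an nlp object from a language
# subclass with the keyword arguments accepted above. English is used here
# purely as an example subclass.
#
#     >>> from spacy.lang.en import English
#     >>> nlp = English(max_length=2 * 10 ** 6, batch_size=500)
#     >>> doc = nlp("This is a sentence.")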
def __init_subclass__(cls, **kwargs):
super().__init_subclass__(**kwargs)
cls.default_config = DEFAULT_CONFIG.merge(cls.Defaults.config)
cls.default_config["nlp"]["lang"] = cls.lang
@property
def path(self):
return self._path
@property
def meta(self) -> Dict[str, Any]:
"""Custom meta data of the language class. If a model is loaded, this
includes details from the model's meta.json.
RETURNS (Dict[str, Any]): The meta.
DOCS: https://spacy.io/api/language#meta
"""
spacy_version = util.get_minor_version_range(about.__version__)
if self.vocab.lang:
self._meta.setdefault("lang", self.vocab.lang)
else:
self._meta.setdefault("lang", self.lang)
self._meta.setdefault("name", "pipeline")
self._meta.setdefault("version", "0.0.0")
self._meta.setdefault("spacy_version", spacy_version)
self._meta.setdefault("description", "")
self._meta.setdefault("author", "")
self._meta.setdefault("email", "")
self._meta.setdefault("url", "")
self._meta.setdefault("license", "")
self._meta.setdefault("spacy_git_version", GIT_VERSION)
self._meta["vectors"] = {
"width": self.vocab.vectors_length,
"vectors": len(self.vocab.vectors),
"keys": self.vocab.vectors.n_keys,
"name": self.vocab.vectors.name,
"mode": self.vocab.vectors.mode,
}
self._meta["labels"] = dict(self.pipe_labels)
# TODO: Adding this back to prevent breaking people's code etc., but
# we should consider removing it
self._meta["pipeline"] = list(self.pipe_names)
self._meta["components"] = list(self.component_names)
self._meta["disabled"] = list(self.disabled)
return self._meta
@meta.setter
def meta(self, value: Dict[str, Any]) -> None:
self._meta = value
@property
def config(self) -> Config:
"""Trainable config for the current language instance. Includes the
current pipeline components, as well as default training config.
RETURNS (thinc.api.Config): The config.
DOCS: https://spacy.io/api/language#config
"""
self._config.setdefault("nlp", {})
self._config.setdefault("training", {})
self._config["nlp"]["lang"] = self.lang
# We're storing the filled config for each pipeline component so that
# we can populate the config again later
pipeline = {}
score_weights = []
for pipe_name in self.component_names:
pipe_meta = self.get_pipe_meta(pipe_name)
pipe_config = self.get_pipe_config(pipe_name)
pipeline[pipe_name] = {"factory": pipe_meta.factory, **pipe_config}
if pipe_meta.default_score_weights:
score_weights.append(pipe_meta.default_score_weights)
self._config["nlp"]["pipeline"] = list(self.component_names)
self._config["nlp"]["disabled"] = list(self.disabled)
self._config["components"] = pipeline
# We're merging the existing score weights back into the combined
# weights to make sure we're preserving custom settings in the config
# but also reflect updates (e.g. new components added)
prev_weights = self._config["training"].get("score_weights", {})
combined_score_weights = combine_score_weights(score_weights, prev_weights)
self._config["training"]["score_weights"] = combined_score_weights
if not srsly.is_json_serializable(self._config):
raise ValueError(Errors.E961.format(config=self._config))
return self._config
@config.setter
def config(self, value: Config) -> None:
self._config = value
@property
def disabled(self) -> List[str]:
"""Get the names of all disabled components.
RETURNS (List[str]): The disabled components.
"""
# Make sure the disabled components are returned in the order they
# appear in the pipeline (which isn't guaranteed by the set)
names = [name for name, _ in self._components if name in self._disabled]
return SimpleFrozenList(names, error=Errors.E926.format(attr="disabled"))
@property
def factory_names(self) -> List[str]:
"""Get names of all available factories.
RETURNS (List[str]): The factory names.
"""
names = list(self.factories.keys())
return SimpleFrozenList(names)
@property
def components(self) -> List[Tuple[str, "Pipe"]]:
"""Get all (name, component) tuples in the pipeline, including the
currently disabled components.
"""
return SimpleFrozenList(
self._components, error=Errors.E926.format(attr="components")
)
@property
def component_names(self) -> List[str]:
"""Get the names of the available pipeline components. Includes all
active and inactive pipeline components.
RETURNS (List[str]): List of component name strings, in order.
"""
names = [pipe_name for pipe_name, _ in self._components]
return SimpleFrozenList(names, error=Errors.E926.format(attr="component_names"))
@property
def pipeline(self) -> List[Tuple[str, "Pipe"]]:
"""The processing pipeline consisting of (name, component) tuples. The
components are called on the Doc in order as it passes through the
pipeline.
RETURNS (List[Tuple[str, Pipe]]): The pipeline.
"""
pipes = [(n, p) for n, p in self._components if n not in self._disabled]
return SimpleFrozenList(pipes, error=Errors.E926.format(attr="pipeline"))
@property
def pipe_names(self) -> List[str]:
"""Get names of available active pipeline components.
RETURNS (List[str]): List of component name strings, in order.
"""
names = [pipe_name for pipe_name, _ in self.pipeline]
return SimpleFrozenList(names, error=Errors.E926.format(attr="pipe_names"))
@property
def pipe_factories(self) -> Dict[str, str]:
"""Get the component factories for the available pipeline components.
RETURNS (Dict[str, str]): Factory names, keyed by component names.
"""
factories = {}
for pipe_name, pipe in self._components:
factories[pipe_name] = self.get_pipe_meta(pipe_name).factory
return SimpleFrozenDict(factories)
@property
def pipe_labels(self) -> Dict[str, List[str]]:
"""Get the labels set by the pipeline components, if available (if
the component exposes a labels property).
RETURNS (Dict[str, List[str]]): Labels keyed by component name.
"""
labels = {}
for name, pipe in self._components:
if hasattr(pipe, "labels"):
labels[name] = list(pipe.labels)
return SimpleFrozenDict(labels)
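# EXAMPLE (illustrative sketch): the introspection properties above can be
# used to inspect a pipeline without touching its internal state.
#
#     >>> nlp.pipe_names        # active components, in order
#     >>> nlp.component_names   # active and disabled components
#     >>> nlp.disabled          # currently disabled components
#     >>> nlp.pipe_labels       # labels per component, if the component exposes them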
@classmethod
def has_factory(cls, name: str) -> bool:
"""RETURNS (bool): Whether a factory of that name is registered."""
internal_name = cls.get_factory_name(name)
return name in registry.factories or internal_name in registry.factories
@classmethod
def get_factory_name(cls, name: str) -> str:
"""Get the internal factory name based on the language subclass.
name (str): The factory name.
RETURNS (str): The internal factory name.
"""
if cls.lang is None:
return name
return f"{cls.lang}.{name}"
@classmethod
def get_factory_meta(cls, name: str) -> "FactoryMeta":
"""Get the meta information for a given factory name.
name (str): The component factory name.
RETURNS (FactoryMeta): The meta for the given factory name.
"""
internal_name = cls.get_factory_name(name)
if internal_name in cls._factory_meta:
return cls._factory_meta[internal_name]
if name in cls._factory_meta:
return cls._factory_meta[name]
raise ValueError(Errors.E967.format(meta="factory", name=name))
@classmethod
def set_factory_meta(cls, name: str, value: "FactoryMeta") -> None:
"""Set the meta information for a given factory name.
name (str): The component factory name.
value (FactoryMeta): The meta to set.
"""
cls._factory_meta[cls.get_factory_name(name)] = value
def get_pipe_meta(self, name: str) -> "FactoryMeta":
"""Get the meta information for a given component name.
name (str): The component name.
RETURNS (FactoryMeta): The meta for the given component name.
"""
if name not in self._pipe_meta:
raise ValueError(Errors.E967.format(meta="component", name=name))
return self._pipe_meta[name]
def get_pipe_config(self, name: str) -> Config:
"""Get the config used to create a pipeline component.
name (str): The component name.
RETURNS (Config): The config used to create the pipeline component.
"""
if name not in self._pipe_configs:
raise ValueError(Errors.E960.format(name=name))
pipe_config = self._pipe_configs[name]
return pipe_config
@classmethod
def factory(
cls,
name: str,
*,
default_config: Dict[str, Any] = SimpleFrozenDict(),
assigns: Iterable[str] = SimpleFrozenList(),
requires: Iterable[str] = SimpleFrozenList(),
retokenizes: bool = False,
default_score_weights: Dict[str, Optional[float]] = SimpleFrozenDict(),
func: Optional[Callable] = None,
) -> Callable:
"""Register a new pipeline component factory. Can be used as a decorator
on a function or classmethod, or called as a function with the factory
provided as the func keyword argument. To create a component and add
it to the pipeline, you can use nlp.add_pipe(name).
name (str): The name of the component factory.
default_config (Dict[str, Any]): Default configuration, describing the
default values of the factory arguments.
assigns (Iterable[str]): Doc/Token attributes assigned by this component,
e.g. "token.ent_id". Used for pipeline analysis.
requires (Iterable[str]): Doc/Token attributes required by this component,
e.g. "token.ent_id". Used for pipeline analysis.
retokenizes (bool): Whether the component changes the tokenization.
Used for pipeline analysis.
default_score_weights (Dict[str, Optional[float]]): The scores to report during
training, and their default weight towards the final score used to
select the best model. Weights should sum to 1.0 per component and
will be combined and normalized for the whole pipeline. If None,
the score won't be shown in the logs or be weighted.
func (Optional[Callable]): Factory function if not used as a decorator.
DOCS: https://spacy.io/api/language#factory
"""
if not isinstance(name, str):
raise ValueError(Errors.E963.format(decorator="factory"))
if not isinstance(default_config, dict):
err = Errors.E962.format(
style="default config", name=name, cfg_type=type(default_config)
)
raise ValueError(err)
def add_factory(factory_func: Callable) -> Callable:
internal_name = cls.get_factory_name(name)
if internal_name in registry.factories:
# We only check for the internal name here – it's okay if it's a
# subclass and the base class has a factory of the same name. We
# also only raise if the function is different to prevent raising
# if the module is reloaded.
existing_func = registry.factories.get(internal_name)
if not util.is_same_func(factory_func, existing_func):
err = Errors.E004.format(
name=name, func=existing_func, new_func=factory_func
)
raise ValueError(err)
arg_names = util.get_arg_names(factory_func)
if "nlp" not in arg_names or "name" not in arg_names:
raise ValueError(Errors.E964.format(name=name))
# Officially register the factory so we can later call
# registry.resolve and refer to it in the config as
# @factories = "spacy.Language.xyz". We use the class name here so
# different classes can have different factories.
registry.factories.register(internal_name, func=factory_func)
factory_meta = FactoryMeta(
factory=name,
default_config=default_config,
assigns=validate_attrs(assigns),
requires=validate_attrs(requires),
scores=list(default_score_weights.keys()),
default_score_weights=default_score_weights,
retokenizes=retokenizes,
)
cls.set_factory_meta(name, factory_meta)
# We're overwriting the class attr with a frozen dict to handle
# backwards-compat (writing to Language.factories directly). This
# wouldn't work with an instance property and would just produce a
# confusing error – here we can show a custom error
cls.factories = SimpleFrozenDict(
registry.factories.get_all(), error=Errors.E957
)
return factory_func
if func is not None: # Support non-decorator use cases
return add_factory(func)
return add_factory
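# EXAMPLE (illustrative sketch): registering a stateful component factory. The
# factory function must accept `nlp` and `name`; "my_component" and
# CustomComponent are placeholders.
#
#     >>> @Language.factory("my_component", default_config={"case_sensitive": False})
#     ... def create_my_component(nlp: Language, name: str, case_sensitive: bool):
#     ...     return CustomComponent(nlp.vocab, case_sensitive=case_sensitive)
#     >>> nlp.add_pipe("my_component", config={"case_sensitive": True})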
@classmethod
def component(
cls,
name: str,
*,
assigns: Iterable[str] = SimpleFrozenList(),
requires: Iterable[str] = SimpleFrozenList(),
retokenizes: bool = False,
func: Optional["Pipe"] = None,
) -> Callable:
"""Register a new pipeline component. Can be used for stateless function
components that don't require a separate factory. Can be used as a
decorator on a function or classmethod, or called as a function with the
factory provided as the func keyword argument. To create a component and
add it to the pipeline, you can use nlp.add_pipe(name).
name (str): The name of the component factory.
assigns (Iterable[str]): Doc/Token attributes assigned by this component,
e.g. "token.ent_id". Used for pipeline analysis.
requires (Iterable[str]): Doc/Token attributes required by this component,
e.g. "token.ent_id". Used for pipeline analysis.
retokenizes (bool): Whether the component changes the tokenization.
Used for pipeline analysis.
func (Optional[Callable]): Factory function if not used as a decorator.
DOCS: https://spacy.io/api/language#component
"""
if name is not None and not isinstance(name, str):
raise ValueError(Errors.E963.format(decorator="component"))
component_name = name if name is not None else util.get_object_name(func)
def add_component(component_func: "Pipe") -> Callable:
if isinstance(func, type): # function is a class
raise ValueError(Errors.E965.format(name=component_name))
def factory_func(nlp, name: str) -> "Pipe":
return component_func
internal_name = cls.get_factory_name(name)
if internal_name in registry.factories:
# We only check for the internal name here – it's okay if it's a
# subclass and the base class has a factory of the same name. We
# also only raise if the function is different to prevent raising
# if the module is reloaded. It's hacky, but we need to check the
# existing function for a closure and whether that's identical
# to the component function (because factory_func created above
# will always be different, even for the same function)
existing_func = registry.factories.get(internal_name)
closure = existing_func.__closure__
wrapped = [c.cell_contents for c in closure][0] if closure else None
if util.is_same_func(wrapped, component_func):
factory_func = existing_func # noqa: F811
cls.factory(
component_name,
assigns=assigns,
requires=requires,
retokenizes=retokenizes,
func=factory_func,
)
return component_func
if func is not None: # Support non-decorator use cases
return add_component(func)
return add_component
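# EXAMPLE (illustrative sketch): registering a stateless function component.
# The name "strip_metadata" is a placeholder; the function just needs to take
# a Doc and return it.
#
#     >>> @Language.component("strip_metadata")
#     ... def strip_metadata(doc):
#     ...     return doc
#     >>> nlp.add_pipe("strip_metadata", last=True)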
def analyze_pipes(
self,
*,
keys: List[str] = ["assigns", "requires", "scores", "retokenizes"],
pretty: bool = False,
) -> Optional[Dict[str, Any]]:
"""Analyze the current pipeline components, print a summary of what
they assign or require and check that all requirements are met.
keys (List[str]): The meta values to display in the table. Corresponds
to values in FactoryMeta, defined by @Language.factory decorator.
pretty (bool): Pretty-print the results.
RETURNS (dict): The data.
"""
analysis = analyze_pipes(self, keys=keys)
if pretty:
print_pipe_analysis(analysis, keys=keys)
return analysis
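# EXAMPLE (illustrative sketch): analyzing an assembled pipeline. With
# pretty=True a summary table is printed; the returned dict summarizes the
# per-component meta and any detected problems.
#
#     >>> analysis = nlp.analyze_pipes(pretty=True)
#     >>> analysis = nlp.analyze_pipes(keys=["assigns", "requires"])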
def get_pipe(self, name: str) -> "Pipe":
"""Get a pipeline component for a given component name.
name (str): Name of pipeline component to get.
RETURNS (callable): The pipeline component.
DOCS: https://spacy.io/api/language#get_pipe
"""
for pipe_name, component in self._components:
if pipe_name == name:
return component
raise KeyError(Errors.E001.format(name=name, opts=self.component_names))
def create_pipe(
self,
factory_name: str,
name: Optional[str] = None,
*,
config: Dict[str, Any] = SimpleFrozenDict(),
raw_config: Optional[Config] = None,
validate: bool = True,
) -> "Pipe":
"""Create a pipeline component. Mostly used internally. To create and
add a component to the pipeline, you can use nlp.add_pipe.
factory_name (str): Name of component factory.
name (Optional[str]): Optional name to assign to component instance.
Defaults to factory name if not set.
config (Dict[str, Any]): Config parameters to use for this component.
Will be merged with default config, if available.
raw_config (Optional[Config]): Internals: the non-interpolated config.
validate (bool): Whether to validate the component config against the
arguments and types expected by the factory.
RETURNS (Pipe): The pipeline component.
DOCS: https://spacy.io/api/language#create_pipe
"""
name = name if name is not None else factory_name
if not isinstance(config, dict):
err = Errors.E962.format(style="config", name=name, cfg_type=type(config))
raise ValueError(err)
if not srsly.is_json_serializable(config):
raise ValueError(Errors.E961.format(config=config))
if not self.has_factory(factory_name):
err = Errors.E002.format(
name=factory_name,
opts=", ".join(self.factory_names),
method="create_pipe",
lang=util.get_object_name(self),
lang_code=self.lang,
)
raise ValueError(err)
pipe_meta = self.get_factory_meta(factory_name)
# This is not ideal, but the alternative would mean you always need to
# specify the full config settings, which is not really viable.
if pipe_meta.default_config:
config = Config(pipe_meta.default_config).merge(config)
internal_name = self.get_factory_name(factory_name)
# If the language-specific factory doesn't exist, try again with the
# non-language-specific name
if internal_name not in registry.factories:
internal_name = factory_name
# The name allows components to know their pipe name and use it in the
# losses etc. (even if multiple instances of the same factory are used)
config = {"nlp": self, "name": name, **config, "@factories": internal_name}
# We need to create a top-level key because Thinc doesn't allow resolving
# top-level references to registered functions. Also gives nicer errors.
cfg = {factory_name: config}
# We're calling the internal _fill here to avoid constructing the
# registered functions twice
resolved = registry.resolve(cfg, validate=validate)
filled = registry.fill({"cfg": cfg[factory_name]}, validate=validate)["cfg"]
filled = Config(filled)
filled["factory"] = factory_name
filled.pop("@factories", None)
# Remove the extra values we added because we don't want to keep passing
# them around, copying them etc.
filled.pop("nlp", None)
filled.pop("name", None)
# Merge the final filled config with the raw config (including non-
# interpolated variables)
if raw_config:
filled = filled.merge(raw_config)
self._pipe_configs[name] = filled
return resolved[factory_name]
def create_pipe_from_source(
self, source_name: str, source: "Language", *, name: str
) -> Tuple["Pipe", str]:
"""Create a pipeline component by copying it from an existing model.
source_name (str): Name of the component in the source pipeline.
source (Language): The source nlp object to copy from.
name (str): Optional alternative name to use in current pipeline.
RETURNS (Tuple[Callable, str]): The component and its factory name.
"""
# Check source type
if not isinstance(source, Language):
raise ValueError(Errors.E945.format(name=source_name, source=type(source)))
# Check vectors, with faster checks first
if (
self.vocab.vectors.shape != source.vocab.vectors.shape
or self.vocab.vectors.key2row != source.vocab.vectors.key2row
or self.vocab.vectors.to_bytes() != source.vocab.vectors.to_bytes()
):
warnings.warn(Warnings.W113.format(name=source_name))
if source_name not in source.component_names:
raise KeyError(
Errors.E944.format(
name=source_name,
model=f"{source.meta['lang']}_{source.meta['name']}",
opts=", ".join(source.component_names),
)
)
pipe = source.get_pipe(source_name)
# Make sure the source config is interpolated so we don't end up with
# orphaned variables in our final config
source_config = source.config.interpolate()
pipe_config = util.copy_config(source_config["components"][source_name])
self._pipe_configs[name] = pipe_config
if self.vocab.strings != source.vocab.strings:
for s in source.vocab.strings:
self.vocab.strings.add(s)
return pipe, pipe_config["factory"]
def add_pipe(
self,
factory_name: str,
name: Optional[str] = None,
*,
before: Optional[Union[str, int]] = None,
after: Optional[Union[str, int]] = None,
first: Optional[bool] = None,
last: Optional[bool] = None,
source: Optional["Language"] = None,
config: Dict[str, Any] = SimpleFrozenDict(),
raw_config: Optional[Config] = None,
validate: bool = True,
) -> "Pipe":
"""Add a component to the processing pipeline. Valid components are
callables that take a `Doc` object, modify it and return it. Only one
of before/after/first/last can be set. Default behaviour is "last".
factory_name (str): Name of the component factory.
name (str): Name of pipeline component. Overwrites existing
component.name attribute if available. If no name is set and
the component exposes no name attribute, component.__name__ is
used. An error is raised if a name already exists in the pipeline.
before (Union[str, int]): Name or index of the component to insert new
component directly before.
after (Union[str, int]): Name or index of the component to insert new
component directly after.
first (bool): If True, insert component first in the pipeline.
last (bool): If True, insert component last in the pipeline.
source (Language): Optional loaded nlp object to copy the pipeline
component from.
config (Dict[str, Any]): Config parameters to use for this component.
Will be merged with default config, if available.
raw_config (Optional[Config]): Internals: the non-interpolated config.
validate (bool): Whether to validate the component config against the
arguments and types expected by the factory.
RETURNS (Pipe): The pipeline component.
DOCS: https://spacy.io/api/language#add_pipe
"""
if not isinstance(factory_name, str):
bad_val = repr(factory_name)
err = Errors.E966.format(component=bad_val, name=name)
raise ValueError(err)
name = name if name is not None else factory_name
if name in self.component_names:
raise ValueError(Errors.E007.format(name=name, opts=self.component_names))
if source is not None:
# We're loading the component from a model. After loading the
# component, we know its real factory name
pipe_component, factory_name = self.create_pipe_from_source(
factory_name, source, name=name
)
else:
if not self.has_factory(factory_name):
err = Errors.E002.format(
name=factory_name,
opts=", ".join(self.factory_names),
method="add_pipe",
lang=util.get_object_name(self),
lang_code=self.lang,
)
raise ValueError(err)
pipe_component = self.create_pipe(
factory_name,
name=name,
config=config,
raw_config=raw_config,
validate=validate,
)
pipe_index = self._get_pipe_index(before, after, first, last)
self._pipe_meta[name] = self.get_factory_meta(factory_name)
self._components.insert(pipe_index, (name, pipe_component))
return pipe_component
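# EXAMPLE (illustrative sketch): adding components by factory name, assuming
# the built-in factories are registered and `other_nlp` is a loaded pipeline.
#
#     >>> nlp.add_pipe("sentencizer", first=True)
#     >>> nlp.add_pipe("ner", name="my_ner", before="parser")
#     >>> nlp.add_pipe("textcat", source=other_nlp)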
def _get_pipe_index(
self,
before: Optional[Union[str, int]] = None,
after: Optional[Union[str, int]] = None,
first: Optional[bool] = None,
last: Optional[bool] = None,
) -> int:
"""Determine where to insert a pipeline component based on the before/
after/first/last values.
before (str): Name or index of the component to insert directly before.
after (str): Name or index of component to insert directly after.
first (bool): If True, insert component first in the pipeline.
last (bool): If True, insert component last in the pipeline.
RETURNS (int): The index of the new pipeline component.
"""
all_args = {"before": before, "after": after, "first": first, "last": last}
if sum(arg is not None for arg in [before, after, first, last]) >= 2:
raise ValueError(
Errors.E006.format(args=all_args, opts=self.component_names)
)
if last or not any(value is not None for value in [first, before, after]):
return len(self._components)
elif first:
return 0
elif isinstance(before, str):
if before not in self.component_names:
raise ValueError(
Errors.E001.format(name=before, opts=self.component_names)
)
return self.component_names.index(before)
elif isinstance(after, str):
if after not in self.component_names:
raise ValueError(
Errors.E001.format(name=after, opts=self.component_names)
)
return self.component_names.index(after) + 1
# We're only accepting indices referring to components that exist
# (can't just do isinstance here because bools are instance of int, too)
elif type(before) == int:
if before >= len(self._components) or before < 0:
err = Errors.E959.format(
dir="before", idx=before, opts=self.component_names
)
raise ValueError(err)
return before
elif type(after) == int:
if after >= len(self._components) or after < 0:
err = Errors.E959.format(
dir="after", idx=after, opts=self.component_names
)
raise ValueError(err)
return after + 1
raise ValueError(Errors.E006.format(args=all_args, opts=self.component_names))
def has_pipe(self, name: str) -> bool:
"""Check if a component name is present in the pipeline. Equivalent to
`name in nlp.pipe_names`.
name (str): Name of the component.
RETURNS (bool): Whether a component of the name exists in the pipeline.
DOCS: https://spacy.io/api/language#has_pipe
"""
return name in self.pipe_names
def replace_pipe(
self,
name: str,
factory_name: str,
*,
config: Dict[str, Any] = SimpleFrozenDict(),
validate: bool = True,
) -> "Pipe":
"""Replace a component in the pipeline.
name (str): Name of the component to replace.
factory_name (str): Factory name of replacement component.
config (Optional[Dict[str, Any]]): Config parameters to use for this
component. Will be merged with default config, if available.
validate (bool): Whether to validate the component config against the
arguments and types expected by the factory.
RETURNS (Pipe): The new pipeline component.
DOCS: https://spacy.io/api/language#replace_pipe
"""
if name not in self.component_names:
raise ValueError(Errors.E001.format(name=name, opts=self.pipe_names))
if hasattr(factory_name, "__call__"):
err = Errors.E968.format(component=repr(factory_name), name=name)
raise ValueError(err)
# We need to delegate to Language.add_pipe here instead of just writing
# to Language.pipeline to make sure the configs are handled correctly
pipe_index = self.component_names.index(name)
self.remove_pipe(name)
if not len(self._components) or pipe_index == len(self._components):
# we have no components to insert before/after, or we're replacing the last component
return self.add_pipe(
factory_name, name=name, config=config, validate=validate
)
else:
return self.add_pipe(
factory_name,
name=name,
before=pipe_index,
config=config,
validate=validate,
)
def rename_pipe(self, old_name: str, new_name: str) -> None:
"""Rename a pipeline component.
old_name (str): Name of the component to rename.
new_name (str): New name of the component.
DOCS: https://spacy.io/api/language#rename_pipe
"""
if old_name not in self.component_names:
raise ValueError(
Errors.E001.format(name=old_name, opts=self.component_names)
)
if new_name in self.component_names:
raise ValueError(
Errors.E007.format(name=new_name, opts=self.component_names)
)
i = self.component_names.index(old_name)
self._components[i] = (new_name, self._components[i][1])
self._pipe_meta[new_name] = self._pipe_meta.pop(old_name)
self._pipe_configs[new_name] = self._pipe_configs.pop(old_name)
# Make sure [initialize] config is adjusted
if old_name in self._config["initialize"]["components"]:
init_cfg = self._config["initialize"]["components"].pop(old_name)
self._config["initialize"]["components"][new_name] = init_cfg
def remove_pipe(self, name: str) -> Tuple[str, "Pipe"]:
"""Remove a component from the pipeline.
name (str): Name of the component to remove.
RETURNS (tuple): A `(name, component)` tuple of the removed component.
DOCS: https://spacy.io/api/language#remove_pipe
"""
if name not in self.component_names:
raise ValueError(Errors.E001.format(name=name, opts=self.component_names))
removed = self._components.pop(self.component_names.index(name))
# We're only removing the component itself from the metas/configs here
# because factory may be used for something else
self._pipe_meta.pop(name)
self._pipe_configs.pop(name)
self.meta.get("_sourced_vectors_hashes", {}).pop(name, None)
# Make sure name is removed from the [initialize] config
if name in self._config["initialize"]["components"]:
self._config["initialize"]["components"].pop(name)
# Make sure the name is also removed from the set of disabled components
if name in self.disabled:
self._disabled.remove(name)
return removed
def disable_pipe(self, name: str) -> None:
"""Disable a pipeline component. The component will still exist on
the nlp object, but it won't be run as part of the pipeline. Does
nothing if the component is already disabled.
name (str): The name of the component to disable.
"""
if name not in self.component_names:
raise ValueError(Errors.E001.format(name=name, opts=self.component_names))
self._disabled.add(name)
def enable_pipe(self, name: str) -> None:
"""Enable a previously disabled pipeline component so it's run as part
of the pipeline. Does nothing if the component is already enabled.
name (str): The name of the component to enable.
"""
if name not in self.component_names:
raise ValueError(Errors.E001.format(name=name, opts=self.component_names))
if name in self.disabled:
self._disabled.remove(name)
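# EXAMPLE (illustrative sketch): toggling a component without removing it,
# assuming an "ner" component exists in the pipeline.
#
#     >>> nlp.disable_pipe("ner")
#     >>> assert "ner" in nlp.component_names and "ner" not in nlp.pipe_names
#     >>> nlp.enable_pipe("ner")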
def __call__(
self,
text: Union[str, Doc],
*,
disable: Iterable[str] = SimpleFrozenList(),
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
) -> Doc:
"""Apply the pipeline to some text. The text can span multiple sentences,
and can contain arbitrary whitespace. Alignment into the original string
is preserved.
text (Union[str, Doc]): If `str`, the text to be processed. If `Doc`,
the doc will be passed directly to the pipeline, skipping
`Language.make_doc`.
disable (List[str]): Names of the pipeline components to disable.
component_cfg (Dict[str, dict]): An optional dictionary with extra
keyword arguments for specific components.
RETURNS (Doc): A container for accessing the annotations.
DOCS: https://spacy.io/api/language#call
"""
doc = self._ensure_doc(text)
if component_cfg is None:
component_cfg = {}
for name, proc in self.pipeline:
if name in disable:
continue
if not hasattr(proc, "__call__"):
raise ValueError(Errors.E003.format(component=type(proc), name=name))
error_handler = self.default_error_handler
if hasattr(proc, "get_error_handler"):
error_handler = proc.get_error_handler()
try:
doc = proc(doc, **component_cfg.get(name, {})) # type: ignore[call-arg]
except KeyError as e:
# This typically happens if a component is not initialized
raise ValueError(Errors.E109.format(name=name)) from e
except Exception as e:
error_handler(name, proc, [doc], e)
if doc is None:
raise ValueError(Errors.E005.format(name=name))
return doc
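# EXAMPLE (illustrative sketch): applying the pipeline to a single text, or to
# a pre-made Doc to skip tokenization.
#
#     >>> doc = nlp("Apple is looking at buying a U.K. startup.")
#     >>> doc = nlp(nlp.make_doc("Pre-tokenized input."), disable=["ner"])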
def disable_pipes(self, *names) -> "DisabledPipes":
"""Disable one or more pipeline components. If used as a context
manager, the pipeline will be restored to the initial state at the end
of the block. Otherwise, a DisabledPipes object is returned, that has
a `.restore()` method you can use to undo your changes.
This method has been deprecated since 3.0
"""
warnings.warn(Warnings.W096, DeprecationWarning)
if len(names) == 1 and isinstance(names[0], (list, tuple)):
names = names[0] # type: ignore[assignment] # support list of names instead of spread
return self.select_pipes(disable=names)
def select_pipes(
self,
*,
disable: Optional[Union[str, Iterable[str]]] = None,
enable: Optional[Union[str, Iterable[str]]] = None,
) -> "DisabledPipes":
"""Disable one or more pipeline components. If used as a context
manager, the pipeline will be restored to the initial state at the end
of the block. Otherwise, a DisabledPipes object is returned, that has
a `.restore()` method you can use to undo your changes.
disable (str or iterable): The name(s) of the pipes to disable
enable (str or iterable): The name(s) of the pipes to enable - all others will be disabled
DOCS: https://spacy.io/api/language#select_pipes
"""
if enable is None and disable is None:
raise ValueError(Errors.E991)
if disable is not None and isinstance(disable, str):
disable = [disable]
if enable is not None:
if isinstance(enable, str):
enable = [enable]
to_disable = [pipe for pipe in self.pipe_names if pipe not in enable]
# raise an error if the enable and disable keywords are not consistent
if disable is not None and disable != to_disable:
raise ValueError(
Errors.E992.format(
enable=enable, disable=disable, names=self.pipe_names
)
)
disable = to_disable
assert disable is not None
# DisabledPipes will restore the pipes in 'disable' when it's done, so we need to exclude
# those pipes that were already disabled.
disable = [d for d in disable if d not in self._disabled]
return DisabledPipes(self, disable)
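# EXAMPLE (illustrative sketch): temporarily disabling components, either as a
# context manager or via the returned DisabledPipes object.
#
#     >>> with nlp.select_pipes(disable=["tagger", "parser"]):
#     ...     doc = nlp("Only the remaining components run here.")
#     >>> disabled = nlp.select_pipes(enable=["ner"])
#     >>> disabled.restore()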
def make_doc(self, text: str) -> Doc:
"""Turn a text into a Doc object.
text (str): The text to process.
RETURNS (Doc): The processed doc.
"""
if len(text) > self.max_length:
raise ValueError(
Errors.E088.format(length=len(text), max_length=self.max_length)
)
return self.tokenizer(text)
def _ensure_doc(self, doc_like: Union[str, Doc]) -> Doc:
"""Create a Doc if need be, or raise an error if the input is not a Doc or a string."""
if isinstance(doc_like, Doc):
return doc_like
if isinstance(doc_like, str):
return self.make_doc(doc_like)
raise ValueError(Errors.E866.format(type=type(doc_like)))
def _ensure_doc_with_context(self, doc_like: Union[str, Doc], context: Any) -> Doc:
"""Create a Doc if need be and add as_tuples context, or raise an error if the input is not a Doc or a string."""
doc = self._ensure_doc(doc_like)
doc._context = context
return doc
def update(
self,
examples: Iterable[Example],
_: Optional[Any] = None,
*,
drop: float = 0.0,
sgd: Optional[Optimizer] = None,
losses: Optional[Dict[str, float]] = None,
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
exclude: Iterable[str] = SimpleFrozenList(),
annotates: Iterable[str] = SimpleFrozenList(),
):
"""Update the models in the pipeline.
examples (Iterable[Example]): A batch of examples
_: Should not be set - serves to catch backwards-incompatible scripts.
drop (float): The dropout rate.
sgd (Optimizer): An optimizer.
losses (Dict[str, float]): Dictionary to update with the loss, keyed by
component.
component_cfg (Dict[str, Dict]): Config parameters for specific pipeline
components, keyed by component name.
exclude (Iterable[str]): Names of components that shouldn't be updated.
annotates (Iterable[str]): Names of components that should set
annotations on the predicted examples after updating.
RETURNS (Dict[str, float]): The updated losses dictionary
DOCS: https://spacy.io/api/language#update
"""
if _ is not None:
raise ValueError(Errors.E989)
if losses is None:
losses = {}
if isinstance(examples, list) and len(examples) == 0:
return losses
validate_examples(examples, "Language.update")
examples = _copy_examples(examples)
if sgd is None:
if self._optimizer is None:
self._optimizer = self.create_optimizer()
sgd = self._optimizer
if component_cfg is None:
component_cfg = {}
pipe_kwargs = {}
for i, (name, proc) in enumerate(self.pipeline):
component_cfg.setdefault(name, {})
pipe_kwargs[name] = deepcopy(component_cfg[name])
component_cfg[name].setdefault("drop", drop)
pipe_kwargs[name].setdefault("batch_size", self.batch_size)
for name, proc in self.pipeline:
# ignore statements are used here because mypy ignores hasattr
if name not in exclude and hasattr(proc, "update"):
proc.update(examples, sgd=None, losses=losses, **component_cfg[name]) # type: ignore
if sgd not in (None, False):
if (
name not in exclude
and isinstance(proc, ty.TrainableComponent)
and proc.is_trainable
and proc.model not in (True, False, None)
):
proc.finish_update(sgd)
if name in annotates:
for doc, eg in zip(
_pipe(
(eg.predicted for eg in examples),
proc=proc,
name=name,
default_error_handler=self.default_error_handler,
kwargs=pipe_kwargs[name],
),
examples,
):
eg.predicted = doc
return losses
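# EXAMPLE (illustrative sketch): a minimal training loop, assuming
# `train_examples` is a list of Example objects and util.minibatch is used
# for batching.
#
#     >>> optimizer = nlp.initialize(lambda: train_examples)
#     >>> losses = {}
#     >>> for batch in util.minibatch(train_examples, size=8):
#     ...     nlp.update(batch, sgd=optimizer, losses=losses, drop=0.2)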
def rehearse(
self,
examples: Iterable[Example],
*,
sgd: Optional[Optimizer] = None,
losses: Optional[Dict[str, float]] = None,
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
exclude: Iterable[str] = SimpleFrozenList(),
) -> Dict[str, float]:
"""Make a "rehearsal" update to the models in the pipeline, to prevent
forgetting. Rehearsal updates run an initial copy of the model over some
data, and update the model so its current predictions are more like the
initial ones. This is useful for keeping a pretrained model on-track,
even if you're updating it with a smaller set of examples.
examples (Iterable[Example]): A batch of `Example` objects.
sgd (Optional[Optimizer]): An optimizer.
component_cfg (Dict[str, Dict]): Config parameters for specific pipeline
components, keyed by component name.
exclude (Iterable[str]): Names of components that shouldn't be updated.
RETURNS (dict): Results from the update.
EXAMPLE:
>>> raw_text_batches = minibatch(raw_texts)
>>> for labelled_batch in minibatch(examples):
>>> nlp.update(labelled_batch)
>>> raw_batch = [Example.from_dict(nlp.make_doc(text), {}) for text in next(raw_text_batches)]
>>> nlp.rehearse(raw_batch)
DOCS: https://spacy.io/api/language#rehearse
"""
if losses is None:
losses = {}
if isinstance(examples, list) and len(examples) == 0:
return losses
validate_examples(examples, "Language.rehearse")
if sgd is None:
if self._optimizer is None:
self._optimizer = self.create_optimizer()
sgd = self._optimizer
pipes = list(self.pipeline)
random.shuffle(pipes)
if component_cfg is None:
component_cfg = {}
grads = {}
def get_grads(W, dW, key=None):
grads[key] = (W, dW)
get_grads.learn_rate = sgd.learn_rate # type: ignore[attr-defined, union-attr]
get_grads.b1 = sgd.b1 # type: ignore[attr-defined, union-attr]
get_grads.b2 = sgd.b2 # type: ignore[attr-defined, union-attr]
for name, proc in pipes:
if name in exclude or not hasattr(proc, "rehearse"):
continue
grads = {}
proc.rehearse( # type: ignore[attr-defined]
examples, sgd=get_grads, losses=losses, **component_cfg.get(name, {})
)
for key, (W, dW) in grads.items():
sgd(W, dW, key=key) # type: ignore[call-arg, misc]
return losses
def begin_training(
self,
get_examples: Optional[Callable[[], Iterable[Example]]] = None,
*,
sgd: Optional[Optimizer] = None,
) -> Optimizer:
warnings.warn(Warnings.W089, DeprecationWarning)
return self.initialize(get_examples, sgd=sgd)
def initialize(
self,
get_examples: Optional[Callable[[], Iterable[Example]]] = None,
*,
sgd: Optional[Optimizer] = None,
) -> Optimizer:
"""Initialize the pipe for training, using data examples if available.
get_examples (Callable[[], Iterable[Example]]): Optional function that
returns gold-standard Example objects.
sgd (Optional[Optimizer]): An optimizer to use for updates. If not
provided, will be created using the .create_optimizer() method.
RETURNS (thinc.api.Optimizer): The optimizer.
DOCS: https://spacy.io/api/language#initialize
"""
if get_examples is None:
util.logger.debug(
"No 'get_examples' callback provided to 'Language.initialize', creating dummy examples"
)
doc = Doc(self.vocab, words=["x", "y", "z"])
get_examples = lambda: [Example.from_dict(doc, {})]
if not hasattr(get_examples, "__call__"):
err = Errors.E930.format(
method="Language.initialize", obj=type(get_examples)
)
raise TypeError(err)
# Make sure the config is interpolated so we can resolve subsections
config = self.config.interpolate()
# These are the settings provided in the [initialize] block in the config
I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
before_init = I["before_init"]
if before_init is not None:
before_init(self)
try:
init_vocab(
self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"]
)
except IOError:
raise IOError(Errors.E884.format(vectors=I["vectors"]))
if self.vocab.vectors.data.shape[1] >= 1:
ops = get_current_ops()
self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
if hasattr(self.tokenizer, "initialize"):
tok_settings = validate_init_settings(
self.tokenizer.initialize, # type: ignore[union-attr]
I["tokenizer"],
section="tokenizer",
name="tokenizer",
)
self.tokenizer.initialize(get_examples, nlp=self, **tok_settings) # type: ignore[union-attr]
for name, proc in self.pipeline:
if isinstance(proc, ty.InitializableComponent):
p_settings = I["components"].get(name, {})
p_settings = validate_init_settings(
proc.initialize, p_settings, section="components", name=name
)
proc.initialize(get_examples, nlp=self, **p_settings)
pretrain_cfg = config.get("pretraining")
if pretrain_cfg:
P = registry.resolve(pretrain_cfg, schema=ConfigSchemaPretrain)
init_tok2vec(self, P, I)
self._link_components()
self._optimizer = sgd
if sgd is not None:
self._optimizer = sgd
elif self._optimizer is None:
self._optimizer = self.create_optimizer()
after_init = I["after_init"]
if after_init is not None:
after_init(self)
return self._optimizer
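# EXAMPLE (illustrative sketch): initializing with and without gold-standard
# examples; `train_examples` is assumed to be an iterable of Example objects.
#
#     >>> optimizer = nlp.initialize()                        # dummy examples
#     >>> optimizer = nlp.initialize(lambda: train_examples)  # real examples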
def resume_training(self, *, sgd: Optional[Optimizer] = None) -> Optimizer:
"""Continue training a pretrained model.
Create and return an optimizer, and initialize "rehearsal" for any pipeline
component that has a .rehearse() method. Rehearsal is used to prevent
models from "forgetting" their initialized "knowledge". To perform
rehearsal, collect samples of text you want the models to retain performance
on, and call nlp.rehearse() with a batch of Example objects.
RETURNS (Optimizer): The optimizer.
DOCS: https://spacy.io/api/language#resume_training
"""
ops = get_current_ops()
if self.vocab.vectors.data.shape[1] >= 1:
self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
for name, proc in self.pipeline:
if hasattr(proc, "_rehearsal_model"):
proc._rehearsal_model = deepcopy(proc.model) # type: ignore[attr-defined]
if sgd is not None:
self._optimizer = sgd
elif self._optimizer is None:
self._optimizer = self.create_optimizer()
return self._optimizer
def set_error_handler(
self,
error_handler: Callable[[str, "Pipe", List[Doc], Exception], NoReturn],
):
"""Set an error handler object for all the components in the pipeline that implement
a set_error_handler function.
error_handler (Callable[[str, Pipe, List[Doc], Exception], NoReturn]):
Function that deals with a failing batch of documents. This callable function should take in
the component's name, the component itself, the offending batch of documents, and the exception
that was thrown.
DOCS: https://spacy.io/api/language#set_error_handler
"""
self.default_error_handler = error_handler
for name, pipe in self.pipeline:
if hasattr(pipe, "set_error_handler"):
pipe.set_error_handler(error_handler)
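# EXAMPLE (illustrative sketch): installing a custom error handler that
# reports the failing component before re-raising.
#
#     >>> def warn_and_raise(name, proc, docs, e):
#     ...     print(f"component {name!r} failed on a batch of {len(docs)} doc(s)")
#     ...     raise e
#     >>> nlp.set_error_handler(warn_and_raise)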
def evaluate(
self,
examples: Iterable[Example],
*,
batch_size: Optional[int] = None,
scorer: Optional[Scorer] = None,
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
scorer_cfg: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
"""Evaluate a model's pipeline components.
examples (Iterable[Example]): `Example` objects.
batch_size (Optional[int]): Batch size to use.
scorer (Optional[Scorer]): Scorer to use. If not passed in, a new one
will be created.
component_cfg (dict): An optional dictionary with extra keyword
arguments for specific components.
scorer_cfg (dict): An optional dictionary with extra keyword arguments
for the scorer.
RETURNS (Dict[str, Any]): A dictionary of evaluation scores.
DOCS: https://spacy.io/api/language#evaluate
"""
examples = list(examples)
validate_examples(examples, "Language.evaluate")
examples = _copy_examples(examples)
if batch_size is None:
batch_size = self.batch_size
if component_cfg is None:
component_cfg = {}
if scorer_cfg is None:
scorer_cfg = {}
if scorer is None:
kwargs = dict(scorer_cfg)
kwargs.setdefault("nlp", self)
scorer = Scorer(**kwargs)
# reset annotation in predicted docs and time tokenization
start_time = timer()
# this is purely for timing
for eg in examples:
self.make_doc(eg.reference.text)
# apply all pipeline components
for name, pipe in self.pipeline:
kwargs = component_cfg.get(name, {})
kwargs.setdefault("batch_size", batch_size)
for doc, eg in zip(
_pipe(
(eg.predicted for eg in examples),
proc=pipe,
name=name,
default_error_handler=self.default_error_handler,
kwargs=kwargs,
),
examples,
):
eg.predicted = doc
end_time = timer()
results = scorer.score(examples)
n_words = sum(len(eg.predicted) for eg in examples)
results["speed"] = n_words / (end_time - start_time)
return results
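# EXAMPLE (illustrative sketch): evaluating on a development set, assuming
# `dev_examples` is a list of Example objects.
#
#     >>> scores = nlp.evaluate(dev_examples, batch_size=64)
#     >>> scores["speed"]   # words per second over the evaluation run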
def create_optimizer(self):
"""Create an optimizer, usually using the [training.optimizer] config."""
subconfig = {"optimizer": self.config["training"]["optimizer"]}
return registry.resolve(subconfig)["optimizer"]
@contextmanager
def use_params(self, params: Optional[dict]):
"""Replace weights of models in the pipeline with those provided in the
params dictionary. Can be used as a contextmanager, in which case,
models go back to their original weights after the block.
params (dict): A dictionary of parameters keyed by model ID.
EXAMPLE:
>>> with nlp.use_params(optimizer.averages):
>>> nlp.to_disk("/tmp/checkpoint")
DOCS: https://spacy.io/api/language#use_params
"""
if not params:
yield
else:
contexts = [
pipe.use_params(params) # type: ignore[attr-defined]
for name, pipe in self.pipeline
if hasattr(pipe, "use_params") and hasattr(pipe, "model")
]
# TODO: Having trouble with contextlib
# Workaround: these aren't actually context managers atm.
for context in contexts:
try:
next(context)
except StopIteration:
pass
yield
for context in contexts:
try:
next(context)
except StopIteration:
pass
@overload
def pipe(
self,
texts: Iterable[Union[str, Doc]],
*,
as_tuples: Literal[False] = ...,
batch_size: Optional[int] = ...,
disable: Iterable[str] = ...,
component_cfg: Optional[Dict[str, Dict[str, Any]]] = ...,
n_process: int = ...,
) -> Iterator[Doc]:
...
@overload
def pipe( # noqa: F811
self,
texts: Iterable[Tuple[Union[str, Doc], _AnyContext]],
*,
as_tuples: Literal[True] = ...,
batch_size: Optional[int] = ...,
disable: Iterable[str] = ...,
component_cfg: Optional[Dict[str, Dict[str, Any]]] = ...,
n_process: int = ...,
) -> Iterator[Tuple[Doc, _AnyContext]]:
...
def pipe( # noqa: F811
self,
texts: Union[
Iterable[Union[str, Doc]], Iterable[Tuple[Union[str, Doc], _AnyContext]]
],
*,
as_tuples: bool = False,
batch_size: Optional[int] = None,
disable: Iterable[str] = SimpleFrozenList(),
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
n_process: int = 1,
) -> Union[Iterator[Doc], Iterator[Tuple[Doc, _AnyContext]]]:
"""Process texts as a stream, and yield `Doc` objects in order.
texts (Iterable[Union[str, Doc]]): A sequence of texts or docs to
process.
as_tuples (bool): If set to True, inputs should be a sequence of
(text, context) tuples. Output will then be a sequence of
(doc, context) tuples. Defaults to False.
batch_size (Optional[int]): The number of texts to buffer.
disable (List[str]): Names of the pipeline components to disable.
component_cfg (Dict[str, Dict]): An optional dictionary with extra keyword
arguments for specific components.
n_process (int): Number of processes to use when processing texts. If -1, `multiprocessing.cpu_count()` is used.
YIELDS (Doc): Documents in the order of the original text.
DOCS: https://spacy.io/api/language#pipe
"""
# Handle texts with context as tuples
if as_tuples:
texts = cast(Iterable[Tuple[Union[str, Doc], _AnyContext]], texts)
docs_with_contexts = (
self._ensure_doc_with_context(text, context) for text, context in texts
)
docs = self.pipe(
docs_with_contexts,
batch_size=batch_size,
disable=disable,
n_process=n_process,
component_cfg=component_cfg,
)
for doc in docs:
context = doc._context
doc._context = None
yield (doc, context)
return
texts = cast(Iterable[Union[str, Doc]], texts)
# Set argument defaults
if n_process == -1:
n_process = mp.cpu_count()
if component_cfg is None:
component_cfg = {}
if batch_size is None:
batch_size = self.batch_size
pipes = (
[]
)  # contains functools.partial objects used to create multiprocess workers.
for name, proc in self.pipeline:
if name in disable:
continue
kwargs = component_cfg.get(name, {})
# Allow component_cfg to overwrite the top-level kwargs.
kwargs.setdefault("batch_size", batch_size)
f = functools.partial(
_pipe,
proc=proc,
name=name,
kwargs=kwargs,
default_error_handler=self.default_error_handler,
)
pipes.append(f)
if n_process != 1:
if self._has_gpu_model(disable):
warnings.warn(Warnings.W114)
docs = self._multiprocessing_pipe(texts, pipes, n_process, batch_size)
else:
# if n_process == 1, no processes are forked.
docs = (self._ensure_doc(text) for text in texts)
for pipe in pipes:
docs = pipe(docs)
for doc in docs:
yield doc
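# EXAMPLE (illustrative sketch): streaming texts through the pipeline, with
# and without per-text context.
#
#     >>> for doc in nlp.pipe(["First text.", "Second text."], batch_size=50):
#     ...     pass
#     >>> data = [("A text.", {"id": 1}), ("Another text.", {"id": 2})]
#     >>> for doc, context in nlp.pipe(data, as_tuples=True):
#     ...     pass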
def _has_gpu_model(self, disable: Iterable[str]):
for name, proc in self.pipeline:
is_trainable = hasattr(proc, "is_trainable") and proc.is_trainable # type: ignore
if name in disable or not is_trainable:
continue
if hasattr(proc, "model") and hasattr(proc.model, "ops") and isinstance(proc.model.ops, CupyOps): # type: ignore
return True
return False
def _multiprocessing_pipe(
self,
texts: Iterable[Union[str, Doc]],
pipes: Iterable[Callable[..., Iterator[Doc]]],
n_process: int,
batch_size: int,
) -> Iterator[Doc]:
# raw_texts is used later to stop iteration.
texts, raw_texts = itertools.tee(texts)
# for sending texts to worker
texts_q: List[mp.Queue] = [mp.Queue() for _ in range(n_process)]
# for receiving byte-encoded docs from worker
bytedocs_recv_ch, bytedocs_send_ch = zip(
*[mp.Pipe(False) for _ in range(n_process)]
)
batch_texts = util.minibatch(texts, batch_size)
# The sender sends texts to the workers.
# This is necessary to properly handle streams of texts of unbounded length,
# in which case all of the data cannot be sent to the workers at once.
sender = _Sender(batch_texts, texts_q, chunk_size=n_process)
# send twice to make process busy
sender.send()
sender.send()
procs = [
mp.Process(
target=_apply_pipes,
args=(self._ensure_doc, pipes, rch, sch, Underscore.get_state()),
)
for rch, sch in zip(texts_q, bytedocs_send_ch)
]
for proc in procs:
proc.start()
# Cycle through the channels so the order of the docs isn't broken.
# Each received object is a batch of byte-encoded docs, so flatten them with chain.from_iterable.
byte_tuples = chain.from_iterable(
recv.recv() for recv in cycle(bytedocs_recv_ch)
)
try:
for i, (_, (byte_doc, byte_context, byte_error)) in enumerate(
zip(raw_texts, byte_tuples), 1
):
if byte_doc is not None:
doc = Doc(self.vocab).from_bytes(byte_doc)
doc._context = byte_context
yield doc
elif byte_error is not None:
error = srsly.msgpack_loads(byte_error)
self.default_error_handler(
None, None, None, ValueError(Errors.E871.format(error=error))
)
if i % batch_size == 0:
# tell `sender` that one batch was consumed.
sender.step()
finally:
for proc in procs:
proc.terminate()
def _link_components(self) -> None:
"""Register 'listeners' within pipeline components, to allow them to
effectively share weights.
"""
# I had thought, "Why do we do this inside the Language object? Shouldn't
# it be the tok2vec/transformer/etc.'s job?"
# The problem is we need to do it during deserialization... And the
# components don't receive the pipeline then. So this does have to be
# here :(
for i, (name1, proc1) in enumerate(self.pipeline):
if isinstance(proc1, ty.ListenedToComponent):
for name2, proc2 in self.pipeline[i + 1 :]:
proc1.find_listeners(proc2)
@classmethod
def from_config(
cls,
config: Union[Dict[str, Any], Config] = {},
*,
vocab: Union[Vocab, bool] = True,
disable: Iterable[str] = SimpleFrozenList(),
exclude: Iterable[str] = SimpleFrozenList(),
meta: Dict[str, Any] = SimpleFrozenDict(),
auto_fill: bool = True,
validate: bool = True,
) -> "Language":
"""Create the nlp object from a loaded config. Will set up the tokenizer
and language data, add pipeline components etc. If no config is provided,
the default config of the given language is used.
config (Dict[str, Any] / Config): The loaded config.
vocab (Vocab): A Vocab object. If True, a vocab is created.
disable (Iterable[str]): Names of pipeline components to disable.
Disabled pipes will be loaded but they won't be run unless you
explicitly enable them by calling nlp.enable_pipe.
exclude (Iterable[str]): Names of pipeline components to exclude.
Excluded components won't be loaded.
meta (Dict[str, Any]): Meta overrides for nlp.meta.
auto_fill (bool): Automatically fill in missing values in config based
on defaults and function argument annotations.
validate (bool): Validate the component config and arguments against
the types expected by the factory.
RETURNS (Language): The initialized Language class.
DOCS: https://spacy.io/api/language#from_config
"""
if auto_fill:
config = Config(
cls.default_config, section_order=CONFIG_SECTION_ORDER
).merge(config)
if "nlp" not in config:
raise ValueError(Errors.E985.format(config=config))
config_lang = config["nlp"].get("lang")
if config_lang is not None and config_lang != cls.lang:
raise ValueError(
Errors.E958.format(
bad_lang_code=config["nlp"]["lang"],
lang_code=cls.lang,
lang=util.get_object_name(cls),
)
)
config["nlp"]["lang"] = cls.lang
# This isn't very elegant, but we remove the [components] block here to prevent
# it from getting resolved (causes problems because we expect to pass in
# the nlp and name args for each component). If we're auto-filling, we're
# using the nlp.config with all defaults.
config = util.copy_config(config)
orig_pipeline = config.pop("components", {})
orig_pretraining = config.pop("pretraining", None)
config["components"] = {}
if auto_fill:
filled = registry.fill(config, validate=validate, schema=ConfigSchema)
else:
filled = config
filled["components"] = orig_pipeline
config["components"] = orig_pipeline
if orig_pretraining is not None:
filled["pretraining"] = orig_pretraining
config["pretraining"] = orig_pretraining
resolved_nlp = registry.resolve(
filled["nlp"], validate=validate, schema=ConfigSchemaNlp
)
create_tokenizer = resolved_nlp["tokenizer"]
before_creation = resolved_nlp["before_creation"]
after_creation = resolved_nlp["after_creation"]
after_pipeline_creation = resolved_nlp["after_pipeline_creation"]
lang_cls = cls
if before_creation is not None:
lang_cls = before_creation(cls)
if (
not isinstance(lang_cls, type)
or not issubclass(lang_cls, cls)
or lang_cls is not cls
):
raise ValueError(Errors.E943.format(value=type(lang_cls)))
# Warn about require_gpu usage in jupyter notebook
warn_if_jupyter_cupy()
# Note that we don't load vectors here, instead they get loaded explicitly
# inside stuff like the spacy train function. If we loaded them here,
# then we would load them twice at runtime: once when we make from config,
# and then again when we load from disk.
nlp = lang_cls(vocab=vocab, create_tokenizer=create_tokenizer, meta=meta)
if after_creation is not None:
nlp = after_creation(nlp)
if not isinstance(nlp, cls):
raise ValueError(Errors.E942.format(name="creation", value=type(nlp)))
# To create the components we need to use the final interpolated config
# so all values are available (if component configs use variables).
# Later we replace the component config with the raw config again.
interpolated = filled.interpolate() if not filled.is_interpolated else filled
pipeline = interpolated.get("components", {})
sourced = util.get_sourced_components(interpolated)
# If components are loaded from a source (existing models), we cache
# them here so they're only loaded once
source_nlps = {}
source_nlp_vectors_hashes = {}
vocab_b = None
for pipe_name in config["nlp"]["pipeline"]:
if pipe_name not in pipeline:
opts = ", ".join(pipeline.keys())
raise ValueError(Errors.E956.format(name=pipe_name, opts=opts))
pipe_cfg = util.copy_config(pipeline[pipe_name])
raw_config = Config(filled["components"][pipe_name])
if pipe_name not in exclude:
if "factory" not in pipe_cfg and "source" not in pipe_cfg:
err = Errors.E984.format(name=pipe_name, config=pipe_cfg)
raise ValueError(err)
if "factory" in pipe_cfg:
factory = pipe_cfg.pop("factory")
# The pipe name (key in the config) here is the unique name
# of the component, not necessarily the factory
nlp.add_pipe(
factory,
name=pipe_name,
config=pipe_cfg,
validate=validate,
raw_config=raw_config,
)
else:
# We need the sourced components to reference the same
# vocab without modifying the current vocab state **AND**
# we still want to load the source model vectors to perform
# the vectors check. Since the source vectors clobber the
# current ones, we save the original vocab state and
# restore after this loop. Existing strings are preserved
# during deserialization, so they do not need any
# additional handling.
if vocab_b is None:
vocab_b = nlp.vocab.to_bytes(exclude=["lookups", "strings"])
model = pipe_cfg["source"]
if model not in source_nlps:
# Load with the same vocab, adding any strings
source_nlps[model] = util.load_model(
model, vocab=nlp.vocab, exclude=["lookups"]
)
source_name = pipe_cfg.get("component", pipe_name)
listeners_replaced = False
if "replace_listeners" in pipe_cfg:
for name, proc in source_nlps[model].pipeline:
if source_name in getattr(proc, "listening_components", []):
source_nlps[model].replace_listeners(
name, source_name, pipe_cfg["replace_listeners"]
)
listeners_replaced = True
with warnings.catch_warnings():
warnings.filterwarnings("ignore", message="\\[W113\\]")
nlp.add_pipe(
source_name, source=source_nlps[model], name=pipe_name
)
if model not in source_nlp_vectors_hashes:
source_nlp_vectors_hashes[model] = hash(
source_nlps[model].vocab.vectors.to_bytes()
)
if "_sourced_vectors_hashes" not in nlp.meta:
nlp.meta["_sourced_vectors_hashes"] = {}
nlp.meta["_sourced_vectors_hashes"][
pipe_name
] = source_nlp_vectors_hashes[model]
# Delete from cache if listeners were replaced
if listeners_replaced:
del source_nlps[model]
# Restore the original vocab after sourcing if necessary
if vocab_b is not None:
nlp.vocab.from_bytes(vocab_b)
disabled_pipes = [*config["nlp"]["disabled"], *disable]
nlp._disabled = set(p for p in disabled_pipes if p not in exclude)
nlp.batch_size = config["nlp"]["batch_size"]
nlp.config = filled if auto_fill else config
if after_pipeline_creation is not None:
nlp = after_pipeline_creation(nlp)
if not isinstance(nlp, cls):
raise ValueError(
Errors.E942.format(name="pipeline_creation", value=type(nlp))
)
# Detect components with listeners that are not frozen consistently
for name, proc in nlp.pipeline:
if isinstance(proc, ty.ListenedToComponent):
# Remove listeners not in the pipeline
listener_names = proc.listening_components
unused_listener_names = [
ll for ll in listener_names if ll not in nlp.pipe_names
]
for listener_name in unused_listener_names:
for listener in proc.listener_map.get(listener_name, []):
proc.remove_listener(listener, listener_name)
for listener_name in proc.listening_components:
# e.g. tok2vec/transformer
# If it's a component sourced from another pipeline, we check if
# the tok2vec listeners should be replaced with standalone tok2vec
# models (e.g. so the component can be frozen without its performance
# degrading when other components/tok2vec are updated)
paths = sourced.get(listener_name, {}).get("replace_listeners", [])
if paths:
nlp.replace_listeners(name, listener_name, paths)
return nlp
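# A minimal usage sketch for from_config, assuming a complete spaCy v3 config
# at a hypothetical path "./config.cfg":
#
#     from thinc.api import Config
#     from spacy.lang.en import English
#
#     config = Config().from_disk("./config.cfg")
#     nlp = English.from_config(config, auto_fill=True, validate=True)
#     print(nlp.pipe_names)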
def replace_listeners(
self,
tok2vec_name: str,
pipe_name: str,
listeners: Iterable[str],
) -> None:
"""Find listener layers (connecting to a token-to-vector embedding
component) of a given pipeline component model and replace
them with a standalone copy of the token-to-vector layer. This can be
useful when training a pipeline with components sourced from an existing
pipeline: if multiple components (e.g. tagger, parser, NER) listen to
the same tok2vec component, but some of them are frozen and not updated,
their performance may degrade significantly as the tok2vec component is
updated with new data. To prevent this, listeners can be replaced with
a standalone tok2vec layer that is owned by the component and doesn't
change if the component isn't updated.
tok2vec_name (str): Name of the token-to-vector component, typically
"tok2vec" or "transformer".
pipe_name (str): Name of pipeline component to replace listeners for.
listeners (Iterable[str]): The paths to the listeners, relative to the
component config, e.g. ["model.tok2vec"]. Typically, implementations
will only connect to one tok2vec component, [model.tok2vec], but in
theory, custom models can use multiple listeners. The value here can
either be an empty list to not replace any listeners, or a complete
(!) list of the paths to all listener layers used by the model.
DOCS: https://spacy.io/api/language#replace_listeners
"""
if tok2vec_name not in self.pipe_names:
err = Errors.E889.format(
tok2vec=tok2vec_name,
name=pipe_name,
unknown=tok2vec_name,
opts=", ".join(self.pipe_names),
)
raise ValueError(err)
if pipe_name not in self.pipe_names:
err = Errors.E889.format(
tok2vec=tok2vec_name,
name=pipe_name,
unknown=pipe_name,
opts=", ".join(self.pipe_names),
)
raise ValueError(err)
tok2vec = self.get_pipe(tok2vec_name)
tok2vec_cfg = self.get_pipe_config(tok2vec_name)
if not isinstance(tok2vec, ty.ListenedToComponent):
raise ValueError(Errors.E888.format(name=tok2vec_name, pipe=type(tok2vec)))
tok2vec_model = tok2vec.model
pipe_listeners = tok2vec.listener_map.get(pipe_name, [])
pipe = self.get_pipe(pipe_name)
pipe_cfg = self._pipe_configs[pipe_name]
if listeners:
util.logger.debug(f"Replacing listeners of component '{pipe_name}'")
if len(list(listeners)) != len(pipe_listeners):
# The number of listeners defined in the component model doesn't
# match the listeners to replace, so we won't be able to update
# the nodes and generate a matching config
err = Errors.E887.format(
name=pipe_name,
tok2vec=tok2vec_name,
paths=listeners,
n_listeners=len(pipe_listeners),
)
raise ValueError(err)
# Update the config accordingly by copying the tok2vec model to all
# sections defined in the listener paths
for listener_path in listeners:
# Check if the path actually exists in the config
try:
util.dot_to_object(pipe_cfg, listener_path)
except KeyError:
err = Errors.E886.format(
name=pipe_name, tok2vec=tok2vec_name, path=listener_path
)
raise ValueError(err)
new_config = tok2vec_cfg["model"]
if "replace_listener_cfg" in tok2vec_model.attrs:
replace_func = tok2vec_model.attrs["replace_listener_cfg"]
new_config = replace_func(
tok2vec_cfg["model"], pipe_cfg["model"]["tok2vec"]
)
util.set_dot_to_object(pipe_cfg, listener_path, new_config)
# Go over the listener layers and replace them
for listener in pipe_listeners:
new_model = tok2vec_model.copy()
if "replace_listener" in tok2vec_model.attrs:
new_model = tok2vec_model.attrs["replace_listener"](new_model)
util.replace_model_node(pipe.model, listener, new_model) # type: ignore[attr-defined]
tok2vec.remove_listener(listener, pipe_name)
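# A hedged usage sketch for replace_listeners, assuming a pipeline in which
# "ner" listens to a "tok2vec" component and exposes the listener at the
# config path "model.tok2vec":
#
#     nlp.replace_listeners("tok2vec", "ner", ["model.tok2vec"])
#
# A sourced component can request the same replacement declaratively via
# replace_listeners = ["model.tok2vec"] in its training config block.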
def to_disk(
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
) -> None:
"""Save the current state to a directory. If a model is loaded, this
will include the model.
path (str / Path): Path to a directory, which will be created if
it doesn't exist.
exclude (Iterable[str]): Names of components or serialization fields to exclude.
DOCS: https://spacy.io/api/language#to_disk
"""
path = util.ensure_path(path)
serializers = {}
serializers["tokenizer"] = lambda p: self.tokenizer.to_disk( # type: ignore[union-attr]
p, exclude=["vocab"]
)
serializers["meta.json"] = lambda p: srsly.write_json(p, self.meta)
serializers["config.cfg"] = lambda p: self.config.to_disk(p)
for name, proc in self._components:
if name in exclude:
continue
if not hasattr(proc, "to_disk"):
continue
serializers[name] = lambda p, proc=proc: proc.to_disk(p, exclude=["vocab"]) # type: ignore[misc]
serializers["vocab"] = lambda p: self.vocab.to_disk(p, exclude=exclude)
util.to_disk(path, serializers, exclude)
def from_disk(
self,
path: Union[str, Path],
*,
exclude: Iterable[str] = SimpleFrozenList(),
overrides: Dict[str, Any] = SimpleFrozenDict(),
) -> "Language":
"""Loads state from a directory. Modifies the object in place and
returns it. If the saved `Language` object contains a model, the
model will be loaded.
path (str / Path): A path to a directory.
exclude (Iterable[str]): Names of components or serialization fields to exclude.
RETURNS (Language): The modified `Language` object.
DOCS: https://spacy.io/api/language#from_disk
"""
def deserialize_meta(path: Path) -> None:
if path.exists():
data = srsly.read_json(path)
self.meta.update(data)
# self.meta always overrides meta["vectors"] with the metadata
# from self.vocab.vectors, so set the name directly
self.vocab.vectors.name = data.get("vectors", {}).get("name")
def deserialize_vocab(path: Path) -> None:
if path.exists():
self.vocab.from_disk(path, exclude=exclude)
path = util.ensure_path(path)
deserializers = {}
if Path(path / "config.cfg").exists(): # type: ignore[operator]
deserializers["config.cfg"] = lambda p: self.config.from_disk(
p, interpolate=False, overrides=overrides
)
deserializers["meta.json"] = deserialize_meta # type: ignore[assignment]
deserializers["vocab"] = deserialize_vocab # type: ignore[assignment]
deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk( # type: ignore[union-attr]
p, exclude=["vocab"]
)
for name, proc in self._components:
if name in exclude:
continue
if not hasattr(proc, "from_disk"):
continue
deserializers[name] = lambda p, proc=proc: proc.from_disk( # type: ignore[misc]
p, exclude=["vocab"]
)
if not (path / "vocab").exists() and "vocab" not in exclude: # type: ignore[operator]
# Convert to list here in case exclude is (default) tuple
exclude = list(exclude) + ["vocab"]
util.from_disk(path, deserializers, exclude) # type: ignore[arg-type]
self._path = path # type: ignore[assignment]
self._link_components()
return self
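# A small round-trip sketch for to_disk/from_disk; "./my_pipeline" is a
# hypothetical directory:
#
#     nlp.to_disk("./my_pipeline")
#     nlp2 = spacy.load("./my_pipeline")  # re-creates the pipeline from config.cfg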
def to_bytes(self, *, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
"""Serialize the current state to a binary string.
exclude (Iterable[str]): Names of components or serialization fields to exclude.
RETURNS (bytes): The serialized form of the `Language` object.
DOCS: https://spacy.io/api/language#to_bytes
"""
serializers: Dict[str, Callable[[], bytes]] = {}
serializers["vocab"] = lambda: self.vocab.to_bytes(exclude=exclude)
serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"]) # type: ignore[union-attr]
serializers["meta.json"] = lambda: srsly.json_dumps(self.meta)
serializers["config.cfg"] = lambda: self.config.to_bytes()
for name, proc in self._components:
if name in exclude:
continue
if not hasattr(proc, "to_bytes"):
continue
serializers[name] = lambda proc=proc: proc.to_bytes(exclude=["vocab"]) # type: ignore[misc]
return util.to_bytes(serializers, exclude)
def from_bytes(
self, bytes_data: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
) -> "Language":
"""Load state from a binary string.
bytes_data (bytes): The data to load from.
exclude (Iterable[str]): Names of components or serialization fields to exclude.
RETURNS (Language): The `Language` object.
DOCS: https://spacy.io/api/language#from_bytes
"""
def deserialize_meta(b):
data = srsly.json_loads(b)
self.meta.update(data)
# self.meta always overrides meta["vectors"] with the metadata
# from self.vocab.vectors, so set the name directly
self.vocab.vectors.name = data.get("vectors", {}).get("name")
deserializers: Dict[str, Callable[[bytes], Any]] = {}
deserializers["config.cfg"] = lambda b: self.config.from_bytes(
b, interpolate=False
)
deserializers["meta.json"] = deserialize_meta
deserializers["vocab"] = lambda b: self.vocab.from_bytes(b, exclude=exclude)
deserializers["tokenizer"] = lambda b: self.tokenizer.from_bytes( # type: ignore[union-attr]
b, exclude=["vocab"]
)
for name, proc in self._components:
if name in exclude:
continue
if not hasattr(proc, "from_bytes"):
continue
deserializers[name] = lambda b, proc=proc: proc.from_bytes( # type: ignore[misc]
b, exclude=["vocab"]
)
util.from_bytes(bytes_data, deserializers, exclude)
self._link_components()
return self
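# A hedged byte-serialization sketch. from_bytes restores state into an
# existing object, and component data is only applied to components that
# already exist on the receiving nlp:
#
#     nlp_bytes = nlp.to_bytes()
#     nlp2 = spacy.blank(nlp.lang)   # or an nlp built from the same config
#     nlp2.from_bytes(nlp_bytes)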
@dataclass
class FactoryMeta:
"""Dataclass containing information about a component and its defaults
provided by the @Language.component or @Language.factory decorator. It's
created whenever a component is defined and stored on the Language class for
each component instance and factory instance.
"""
factory: str
default_config: Optional[Dict[str, Any]] = None # noqa: E704
assigns: Iterable[str] = tuple()
requires: Iterable[str] = tuple()
retokenizes: bool = False
scores: Iterable[str] = tuple()
default_score_weights: Optional[Dict[str, Optional[float]]] = None # noqa: E704
class DisabledPipes(list):
"""Manager for temporary pipeline disabling."""
def __init__(self, nlp: Language, names: List[str]) -> None:
self.nlp = nlp
self.names = names
for name in self.names:
self.nlp.disable_pipe(name)
list.__init__(self)
self.extend(self.names)
def __enter__(self):
return self
def __exit__(self, *args):
self.restore()
def restore(self) -> None:
"""Restore the pipeline to its state when DisabledPipes was created."""
for name in self.names:
if name not in self.nlp.component_names:
raise ValueError(Errors.E008.format(name=name))
self.nlp.enable_pipe(name)
self[:] = []
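# A usage sketch: DisabledPipes is what nlp.select_pipes() returns, so it is
# typically used as a context manager:
#
#     with nlp.select_pipes(disable=["tagger", "parser"]):
#         doc = nlp("Only the remaining components run here.")
#     # on exit, restore() re-enables the disabled pipes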
def _copy_examples(examples: Iterable[Example]) -> List[Example]:
"""Make a copy of a batch of examples, copying the predicted Doc as well.
This is used in contexts where we need to take ownership of the examples
so that they can be mutated, for instance during Language.evaluate and
Language.update.
"""
return [Example(eg.x.copy(), eg.y) for eg in examples]
def _apply_pipes(
ensure_doc: Callable[[Union[str, Doc]], Doc],
pipes: Iterable[Callable[..., Iterator[Doc]]],
receiver,
sender,
underscore_state: Tuple[dict, dict, dict],
) -> None:
"""Worker for Language.pipe
ensure_doc (Callable[[Union[str, Doc]], Doc]): Function to create Doc from text
or raise an error if the input is neither a Doc nor a string.
pipes (Iterable[Pipe]): The components to apply.
receiver (multiprocessing.Connection): Pipe to receive text. Usually
created by `multiprocessing.Pipe()`
sender (multiprocessing.Connection): Pipe to send doc. Usually created by
`multiprocessing.Pipe()`
underscore_state (Tuple[dict, dict, dict]): The data in the Underscore class
of the parent.
"""
Underscore.load_state(underscore_state)
while True:
try:
texts = receiver.get()
docs = (ensure_doc(text) for text in texts)
for pipe in pipes:
docs = pipe(docs) # type: ignore[arg-type, assignment]
# Connection does not accept unpicklable objects, so send a list instead.
byte_docs = [(doc.to_bytes(), doc._context, None) for doc in docs]
padding = [(None, None, None)] * (len(texts) - len(byte_docs))
sender.send(byte_docs + padding) # type: ignore[operator]
except Exception:
error_msg = [(None, None, srsly.msgpack_dumps(traceback.format_exc()))]
padding = [(None, None, None)] * (len(texts) - 1)
sender.send(error_msg + padding)
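# _apply_pipes is the worker target used by Language.pipe when n_process > 1.
# A sketch of the corresponding user-facing call:
#
#     texts = ["First text.", "Second text.", "Third text."]
#     for doc in nlp.pipe(texts, n_process=2, batch_size=50):
#         print(len(doc))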
class _Sender:
"""Util for sending data to multiprocessing workers in Language.pipe"""
def __init__(
self, data: Iterable[Any], queues: List[mp.Queue], chunk_size: int
) -> None:
self.data = iter(data)
self.queues = iter(cycle(queues))
self.chunk_size = chunk_size
self.count = 0
def send(self) -> None:
"""Send chunk_size items from self.data to channels."""
for item, q in itertools.islice(
zip(self.data, cycle(self.queues)), self.chunk_size
):
# cycle the channels so that the texts are distributed evenly
q.put(item)
def step(self) -> None:
"""Tell sender that comsumed one item. Data is sent to the workers after
every chunk_size calls.
"""
self.count += 1
if self.count >= self.chunk_size:
self.count = 0
self.send()
from numpy cimport ndarray
from .typedefs cimport attr_t, hash_t, flags_t, len_t, tag_t
from .attrs cimport attr_id_t
from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, LANG
from .structs cimport LexemeC
from .strings cimport StringStore
from .vocab cimport Vocab
cdef LexemeC EMPTY_LEXEME
cdef attr_t OOV_RANK
cdef class Lexeme:
cdef LexemeC* c
cdef readonly Vocab vocab
cdef readonly attr_t orth
@staticmethod
cdef inline Lexeme from_ptr(LexemeC* lex, Vocab vocab):
cdef Lexeme self = Lexeme.__new__(Lexeme, vocab, lex.orth)
self.c = lex
self.vocab = vocab
self.orth = lex.orth
return self
@staticmethod
cdef inline void set_struct_attr(LexemeC* lex, attr_id_t name, attr_t value) nogil:
if name < (sizeof(flags_t) * 8):
Lexeme.c_set_flag(lex, name, value)
elif name == ID:
lex.id = value
elif name == LOWER:
lex.lower = value
elif name == NORM:
lex.norm = value
elif name == SHAPE:
lex.shape = value
elif name == PREFIX:
lex.prefix = value
elif name == SUFFIX:
lex.suffix = value
elif name == LANG:
lex.lang = value
@staticmethod
cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
if feat_name < (sizeof(flags_t) * 8):
if Lexeme.c_check_flag(lex, feat_name):
return 1
else:
return 0
elif feat_name == ID:
return lex.id
elif feat_name == ORTH:
return lex.orth
elif feat_name == LOWER:
return lex.lower
elif feat_name == NORM:
return lex.norm
elif feat_name == SHAPE:
return lex.shape
elif feat_name == PREFIX:
return lex.prefix
elif feat_name == SUFFIX:
return lex.suffix
elif feat_name == LENGTH:
return lex.length
elif feat_name == LANG:
return lex.lang
else:
return 0
@staticmethod
cdef inline bint c_check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
cdef flags_t one = 1
if lexeme.flags & (one << flag_id):
return True
else:
return False
@staticmethod
cdef inline bint c_set_flag(LexemeC* lex, attr_id_t flag_id, bint value) nogil:
cdef flags_t one = 1
if value:
lex.flags |= one << flag_id
else:
lex.flags &= ~(one << flag_id)
from typing import (
Union,
Any,
)
from thinc.types import Floats1d
from .tokens import Doc, Span, Token
from .vocab import Vocab
class Lexeme:
def __init__(self, vocab: Vocab, orth: int) -> None: ...
def __richcmp__(self, other: Lexeme, op: int) -> bool: ...
def __hash__(self) -> int: ...
def set_attrs(self, **attrs: Any) -> None: ...
def set_flag(self, flag_id: int, value: bool) -> None: ...
def check_flag(self, flag_id: int) -> bool: ...
def similarity(self, other: Union[Doc, Span, Token, Lexeme]) -> float: ...
@property
def has_vector(self) -> bool: ...
@property
def vector_norm(self) -> float: ...
vector: Floats1d
rank: int
sentiment: float
@property
def orth_(self) -> str: ...
@property
def text(self) -> str: ...
lower: int
norm: int
shape: int
prefix: int
suffix: int
cluster: int
lang: int
prob: float
lower_: str
norm_: str
shape_: str
prefix_: str
suffix_: str
lang_: str
flags: int
@property
def is_oov(self) -> bool: ...
is_stop: bool
is_alpha: bool
is_ascii: bool
is_digit: bool
is_lower: bool
is_upper: bool
is_title: bool
is_punct: bool
is_space: bool
is_bracket: bool
is_quote: bool
is_left_punct: bool
is_right_punct: bool
is_currency: bool
like_url: bool
like_num: bool
like_email: bool
# cython: embedsignature=True
# Compiler crashes on memory view coercion without this. Should report bug.
from cython.view cimport array as cvarray
from libc.string cimport memset
cimport numpy as np
np.import_array()
import numpy
from thinc.api import get_array_module
import warnings
from .typedefs cimport attr_t, flags_t
from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT
from .attrs cimport IS_CURRENCY
from .attrs import intify_attrs
from .errors import Errors, Warnings
OOV_RANK = 0xffffffffffffffff # UINT64_MAX
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
EMPTY_LEXEME.id = OOV_RANK
cdef class Lexeme:
"""An entry in the vocabulary. A `Lexeme` has no string context – it's a
word-type, as opposed to a word token. It therefore has no part-of-speech
tag, dependency parse, or lemma (lemmatization depends on the
part-of-speech tag).
DOCS: https://spacy.io/api/lexeme
"""
def __init__(self, Vocab vocab, attr_t orth):
"""Create a Lexeme object.
vocab (Vocab): The parent vocabulary.
orth (uint64): The orth id of the lexeme.
RETURNS (Lexeme): The newly constructed object.
"""
self.vocab = vocab
self.orth = orth
self.c = <LexemeC*><void*>vocab.get_by_orth(vocab.mem, orth)
if self.c.orth != orth:
raise ValueError(Errors.E071.format(orth=orth, vocab_orth=self.c.orth))
def __richcmp__(self, other, int op):
if other is None:
if op == 0 or op == 1 or op == 2:
return False
else:
return True
if isinstance(other, Lexeme):
a = self.orth
b = other.orth
elif isinstance(other, long):
a = self.orth
b = other
elif isinstance(other, str):
a = self.orth_
b = other
else:
a = 0
b = 1
if op == 2: # ==
return a == b
elif op == 3: # !=
return a != b
elif op == 0: # <
return a < b
elif op == 1: # <=
return a <= b
elif op == 4: # >
return a > b
elif op == 5: # >=
return a >= b
else:
raise NotImplementedError(op)
def __hash__(self):
return self.c.orth
def set_attrs(self, **attrs):
cdef attr_id_t attr
attrs = intify_attrs(attrs)
for attr, value in attrs.items():
# skip PROB, e.g. from lexemes.jsonl
if isinstance(value, float):
continue
elif isinstance(value, (int, long)):
Lexeme.set_struct_attr(self.c, attr, value)
else:
Lexeme.set_struct_attr(self.c, attr, self.vocab.strings.add(value))
def set_flag(self, attr_id_t flag_id, bint value):
"""Change the value of a boolean flag.
flag_id (int): The attribute ID of the flag to set.
value (bool): The new value of the flag.
"""
Lexeme.c_set_flag(self.c, flag_id, value)
def check_flag(self, attr_id_t flag_id):
"""Check the value of a boolean flag.
flag_id (int): The attribute ID of the flag to query.
RETURNS (bool): The value of the flag.
"""
return True if Lexeme.c_check_flag(self.c, flag_id) else False
def similarity(self, other):
"""Compute a semantic similarity estimate. Defaults to cosine over
vectors.
other (object): The object to compare with. By default, accepts `Doc`,
`Span`, `Token` and `Lexeme` objects.
RETURNS (float): A scalar similarity score. Higher is more similar.
"""
# Return 1.0 similarity for matches
if hasattr(other, "orth"):
if self.c.orth == other.orth:
return 1.0
elif hasattr(other, "__len__") and len(other) == 1 \
and hasattr(other[0], "orth"):
if self.c.orth == other[0].orth:
return 1.0
if self.vector_norm == 0 or other.vector_norm == 0:
warnings.warn(Warnings.W008.format(obj="Lexeme"))
return 0.0
vector = self.vector
xp = get_array_module(vector)
return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm))
@property
def has_vector(self):
"""RETURNS (bool): Whether a word vector is associated with the object.
"""
return self.vocab.has_vector(self.c.orth)
@property
def vector_norm(self):
"""RETURNS (float): The L2 norm of the vector representation."""
vector = self.vector
return numpy.sqrt((vector**2).sum())
property vector:
"""A real-valued meaning representation.
RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
representing the lexeme's semantics.
"""
def __get__(self):
cdef int length = self.vocab.vectors_length
if length == 0:
raise ValueError(Errors.E010)
return self.vocab.get_vector(self.c.orth)
def __set__(self, vector):
if len(vector) != self.vocab.vectors_length:
raise ValueError(Errors.E073.format(new_length=len(vector),
length=self.vocab.vectors_length))
self.vocab.set_vector(self.c.orth, vector)
property rank:
"""RETURNS (str): Sequential ID of the lexeme's lexical type, used
to index into tables, e.g. for word vectors."""
def __get__(self):
return self.c.id
def __set__(self, value):
self.c.id = value
property sentiment:
"""RETURNS (float): A scalar value indicating the positivity or
negativity of the lexeme."""
def __get__(self):
sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment", {})
return sentiment_table.get(self.c.orth, 0.0)
def __set__(self, float x):
if "lexeme_sentiment" not in self.vocab.lookups:
self.vocab.lookups.add_table("lexeme_sentiment")
sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment")
sentiment_table[self.c.orth] = x
@property
def orth_(self):
"""RETURNS (str): The original verbatim text of the lexeme
(identical to `Lexeme.text`). Exists mostly for consistency with
the other attributes."""
return self.vocab.strings[self.c.orth]
@property
def text(self):
"""RETURNS (str): The original verbatim text of the lexeme."""
return self.orth_
property lower:
"""RETURNS (str): Lowercase form of the lexeme."""
def __get__(self):
return self.c.lower
def __set__(self, attr_t x):
self.c.lower = x
property norm:
"""RETURNS (uint64): The lexeme's norm, i.e. a normalised form of the
lexeme text.
"""
def __get__(self):
return self.c.norm
def __set__(self, attr_t x):
if "lexeme_norm" not in self.vocab.lookups:
self.vocab.lookups.add_table("lexeme_norm")
norm_table = self.vocab.lookups.get_table("lexeme_norm")
norm_table[self.c.orth] = self.vocab.strings[x]
self.c.norm = x
property shape:
"""RETURNS (uint64): Transform of the word's string, to show
orthographic features.
"""
def __get__(self):
return self.c.shape
def __set__(self, attr_t x):
self.c.shape = x
property prefix:
"""RETURNS (uint64): Length-N substring from the start of the word.
Defaults to `N=1`.
"""
def __get__(self):
return self.c.prefix
def __set__(self, attr_t x):
self.c.prefix = x
property suffix:
"""RETURNS (uint64): Length-N substring from the end of the word.
Defaults to `N=3`.
"""
def __get__(self):
return self.c.suffix
def __set__(self, attr_t x):
self.c.suffix = x
property cluster:
"""RETURNS (int): Brown cluster ID."""
def __get__(self):
cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {})
return cluster_table.get(self.c.orth, 0)
def __set__(self, int x):
cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {})
cluster_table[self.c.orth] = x
property lang:
"""RETURNS (uint64): Language of the parent vocabulary."""
def __get__(self):
return self.c.lang
def __set__(self, attr_t x):
self.c.lang = x
property prob:
"""RETURNS (float): Smoothed log probability estimate of the lexeme's
type."""
def __get__(self):
prob_table = self.vocab.lookups.get_table("lexeme_prob", {})
settings_table = self.vocab.lookups.get_table("lexeme_settings", {})
default_oov_prob = settings_table.get("oov_prob", -20.0)
return prob_table.get(self.c.orth, default_oov_prob)
def __set__(self, float x):
prob_table = self.vocab.lookups.get_table("lexeme_prob", {})
prob_table[self.c.orth] = x
property lower_:
"""RETURNS (str): Lowercase form of the word."""
def __get__(self):
return self.vocab.strings[self.c.lower]
def __set__(self, str x):
self.c.lower = self.vocab.strings.add(x)
property norm_:
"""RETURNS (str): The lexeme's norm, i.e. a normalised form of the
lexeme text.
"""
def __get__(self):
return self.vocab.strings[self.c.norm]
def __set__(self, str x):
self.norm = self.vocab.strings.add(x)
property shape_:
"""RETURNS (str): Transform of the word's string, to show
orthographic features.
"""
def __get__(self):
return self.vocab.strings[self.c.shape]
def __set__(self, str x):
self.c.shape = self.vocab.strings.add(x)
property prefix_:
"""RETURNS (str): Length-N substring from the start of the word.
Defaults to `N=1`.
"""
def __get__(self):
return self.vocab.strings[self.c.prefix]
def __set__(self, str x):
self.c.prefix = self.vocab.strings.add(x)
property suffix_:
"""RETURNS (str): Length-N substring from the end of the word.
Defaults to `N=3`.
"""
def __get__(self):
return self.vocab.strings[self.c.suffix]
def __set__(self, str x):
self.c.suffix = self.vocab.strings.add(x)
property lang_:
"""RETURNS (str): Language of the parent vocabulary."""
def __get__(self):
return self.vocab.strings[self.c.lang]
def __set__(self, str x):
self.c.lang = self.vocab.strings.add(x)
property flags:
"""RETURNS (uint64): Container of the lexeme's binary flags."""
def __get__(self):
return self.c.flags
def __set__(self, flags_t x):
self.c.flags = x
@property
def is_oov(self):
"""RETURNS (bool): Whether the lexeme is out-of-vocabulary."""
return self.orth not in self.vocab.vectors
property is_stop:
"""RETURNS (bool): Whether the lexeme is a stop word."""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_STOP)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_STOP, x)
property is_alpha:
"""RETURNS (bool): Whether the lexeme consists of alphabetic
characters. Equivalent to `lexeme.text.isalpha()`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_ALPHA)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_ALPHA, x)
property is_ascii:
"""RETURNS (bool): Whether the lexeme consists of ASCII characters.
Equivalent to `all(ord(c) < 128 for c in lexeme.text)`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_ASCII)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_ASCII, x)
property is_digit:
"""RETURNS (bool): Whether the lexeme consists of digits. Equivalent
to `lexeme.text.isdigit()`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_DIGIT)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_DIGIT, x)
property is_lower:
"""RETURNS (bool): Whether the lexeme is in lowercase. Equivalent to
`lexeme.text.islower()`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_LOWER)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_LOWER, x)
property is_upper:
"""RETURNS (bool): Whether the lexeme is in uppercase. Equivalent to
`lexeme.text.isupper()`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_UPPER)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_UPPER, x)
property is_title:
"""RETURNS (bool): Whether the lexeme is in titlecase. Equivalent to
`lexeme.text.istitle()`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_TITLE)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_TITLE, x)
property is_punct:
"""RETURNS (bool): Whether the lexeme is punctuation."""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_PUNCT)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_PUNCT, x)
property is_space:
"""RETURNS (bool): Whether the lexeme consist of whitespace characters.
Equivalent to `lexeme.text.isspace()`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_SPACE)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_SPACE, x)
property is_bracket:
"""RETURNS (bool): Whether the lexeme is a bracket."""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_BRACKET)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_BRACKET, x)
property is_quote:
"""RETURNS (bool): Whether the lexeme is a quotation mark."""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_QUOTE)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_QUOTE, x)
property is_left_punct:
"""RETURNS (bool): Whether the lexeme is left punctuation, e.g. (."""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x)
property is_right_punct:
"""RETURNS (bool): Whether the lexeme is right punctuation, e.g. )."""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)
property is_currency:
"""RETURNS (bool): Whether the lexeme is a currency symbol, e.g. $, €."""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_CURRENCY)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_CURRENCY, x)
property like_url:
"""RETURNS (bool): Whether the lexeme resembles a URL."""
def __get__(self):
return Lexeme.c_check_flag(self.c, LIKE_URL)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, LIKE_URL, x)
property like_num:
"""RETURNS (bool): Whether the lexeme represents a number, e.g. "10.9",
"10", "ten", etc.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, LIKE_NUM)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, LIKE_NUM, x)
property like_email:
"""RETURNS (bool): Whether the lexeme resembles an email address."""
def __get__(self):
return Lexeme.c_check_flag(self.c, LIKE_EMAIL)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, LIKE_EMAIL, x)
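# A hedged sketch of typical Lexeme access: lexemes come from the vocab, and
# the boolean attributes above map onto the stored bit flags:
#
#     import spacy
#     nlp = spacy.blank("en")
#     lex = nlp.vocab["apple"]
#     lex.is_alpha, lex.is_digit, lex.lower_   # (True, False, "apple")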
from typing import Any, List, Union, Optional, Dict
from pathlib import Path
import srsly
from preshed.bloom import BloomFilter
from collections import OrderedDict
from .errors import Errors
from .util import SimpleFrozenDict, ensure_path, registry, load_language_data
from .strings import get_string_id
UNSET = object()
def load_lookups(lang: str, tables: List[str], strict: bool = True) -> "Lookups":
"""Load the data from the spacy-lookups-data package for a given language,
if available. Returns an empty `Lookups` container if there's no data or if the package
is not installed.
lang (str): The language code (corresponds to entry point exposed by
the spacy-lookups-data package).
tables (List[str]): Name of tables to load, e.g. ["lemma_lookup", "lemma_exc"]
strict (bool): Whether to raise an error if a table doesn't exist.
RETURNS (Lookups): The lookups container containing the loaded tables.
"""
# TODO: import spacy_lookups_data instead of going via entry points here?
lookups = Lookups()
if lang not in registry.lookups:
if strict and len(tables) > 0:
raise ValueError(Errors.E955.format(table=", ".join(tables), lang=lang))
return lookups
data = registry.lookups.get(lang)
for table in tables:
if table not in data:
if strict:
raise ValueError(Errors.E955.format(table=table, lang=lang))
language_data = {} # type: ignore[var-annotated]
else:
language_data = load_language_data(data[table]) # type: ignore[assignment]
lookups.add_table(table, language_data)
return lookups
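# A hedged example of load_lookups; it assumes the optional
# spacy-lookups-data package is installed for the requested language:
#
#     from spacy.lookups import load_lookups
#     lookups = load_lookups("en", ["lexeme_norm"], strict=False)
#     "lexeme_norm" in lookups   # True if the table was available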
class Table(OrderedDict):
"""A table in the lookups. Subclass of builtin dict that implements a
slightly more consistent and unified API.
Includes a Bloom filter to speed up missed lookups.
"""
@classmethod
def from_dict(cls, data: dict, name: Optional[str] = None) -> "Table":
"""Initialize a new table from a dict.
data (dict): The dictionary.
name (str): Optional table name for reference.
DOCS: https://spacy.io/api/lookups#table.from_dict
"""
self = cls(name=name)
self.update(data)
return self
def __init__(self, name: Optional[str] = None, data: Optional[dict] = None) -> None:
"""Initialize a new table.
name (str): Optional table name for reference.
data (dict): Initial data, used to hint Bloom Filter.
DOCS: https://spacy.io/api/lookups#table.init
"""
OrderedDict.__init__(self)
self.name = name
# Assume a default size of 1M items
self.default_size = 1e6
size = max(len(data), 1) if data is not None else self.default_size
self.bloom = BloomFilter.from_error_rate(size)
if data:
self.update(data)
def __setitem__(self, key: Union[str, int], value: Any) -> None:
"""Set new key/value pair. String keys will be hashed.
key (str / int): The key to set.
value: The value to set.
"""
key = get_string_id(key)
OrderedDict.__setitem__(self, key, value)
self.bloom.add(key)
def set(self, key: Union[str, int], value: Any) -> None:
"""Set new key/value pair. String keys will be hashed.
Same as table[key] = value.
key (str / int): The key to set.
value: The value to set.
"""
self[key] = value
def __getitem__(self, key: Union[str, int]) -> Any:
"""Get the value for a given key. String keys will be hashed.
key (str / int): The key to get.
RETURNS: The value.
"""
key = get_string_id(key)
return OrderedDict.__getitem__(self, key)
def get(self, key: Union[str, int], default: Optional[Any] = None) -> Any:
"""Get the value for a given key. String keys will be hashed.
key (str / int): The key to get.
default: The default value to return.
RETURNS: The value.
"""
key = get_string_id(key)
return OrderedDict.get(self, key, default)
def __contains__(self, key: Union[str, int]) -> bool: # type: ignore[override]
"""Check whether a key is in the table. String keys will be hashed.
key (str / int): The key to check.
RETURNS (bool): Whether the key is in the table.
"""
key = get_string_id(key)
# The Bloom filter can return false positives (but never false negatives),
# so a positive hit still needs to be confirmed against the dict below
if key not in self.bloom:
return False
return OrderedDict.__contains__(self, key)
def to_bytes(self) -> bytes:
"""Serialize table to a bytestring.
RETURNS (bytes): The serialized table.
DOCS: https://spacy.io/api/lookups#table.to_bytes
"""
data = {
"name": self.name,
"dict": dict(self.items()),
"bloom": self.bloom.to_bytes(),
}
return srsly.msgpack_dumps(data)
def from_bytes(self, bytes_data: bytes) -> "Table":
"""Load a table from a bytestring.
bytes_data (bytes): The data to load.
RETURNS (Table): The loaded table.
DOCS: https://spacy.io/api/lookups#table.from_bytes
"""
loaded = srsly.msgpack_loads(bytes_data)
data = loaded.get("dict", {})
self.name = loaded["name"]
self.bloom = BloomFilter().from_bytes(loaded["bloom"])
self.clear()
self.update(data)
return self
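# A small usage sketch for Table: string keys are hashed on the way in and
# out, and the Bloom filter only short-circuits definite misses:
#
#     from spacy.lookups import Table
#     table = Table(name="plurals")
#     table["cat"] = "cats"
#     "cat" in table    # True
#     table.get("dog")  # None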
class Lookups:
"""Container for large lookup tables and dictionaries, e.g. lemmatization
data or tokenizer exception lists. Lookups are available via vocab.lookups,
so they can be accessed before the pipeline components are applied (e.g.
in the tokenizer and lemmatizer), as well as within the pipeline components
via doc.vocab.lookups.
"""
def __init__(self) -> None:
"""Initialize the Lookups object.
DOCS: https://spacy.io/api/lookups#init
"""
self._tables: Dict[str, Table] = {}
def __contains__(self, name: str) -> bool:
"""Check if the lookups contain a table of a given name. Delegates to
Lookups.has_table.
name (str): Name of the table.
RETURNS (bool): Whether a table of that name is in the lookups.
"""
return self.has_table(name)
def __len__(self) -> int:
"""RETURNS (int): The number of tables in the lookups."""
return len(self._tables)
@property
def tables(self) -> List[str]:
"""RETURNS (List[str]): Names of all tables in the lookups."""
return list(self._tables.keys())
def add_table(self, name: str, data: dict = SimpleFrozenDict()) -> Table:
"""Add a new table to the lookups. Raises an error if the table exists.
name (str): Unique name of table.
data (dict): Optional data to add to the table.
RETURNS (Table): The newly added table.
DOCS: https://spacy.io/api/lookups#add_table
"""
if name in self.tables:
raise ValueError(Errors.E158.format(name=name))
table = Table(name=name, data=data)
self._tables[name] = table
return table
def set_table(self, name: str, table: Table) -> None:
"""Set a table.
name (str): Name of the table to set.
table (Table): The Table to set.
DOCS: https://spacy.io/api/lookups#set_table
"""
self._tables[name] = table
def get_table(self, name: str, default: Any = UNSET) -> Table:
"""Get a table. Raises an error if the table doesn't exist and no
default value is provided.
name (str): Name of the table.
default (Any): Optional default value to return if table doesn't exist.
RETURNS (Table): The table.
DOCS: https://spacy.io/api/lookups#get_table
"""
if name not in self._tables:
if default == UNSET:
raise KeyError(Errors.E159.format(name=name, tables=self.tables))
return default
return self._tables[name]
def remove_table(self, name: str) -> Table:
"""Remove a table. Raises an error if the table doesn't exist.
name (str): Name of the table to remove.
RETURNS (Table): The removed table.
DOCS: https://spacy.io/api/lookups#remove_table
"""
if name not in self._tables:
raise KeyError(Errors.E159.format(name=name, tables=self.tables))
return self._tables.pop(name)
def has_table(self, name: str) -> bool:
"""Check if the lookups contain a table of a given name.
name (str): Name of the table.
RETURNS (bool): Whether a table of that name exists.
DOCS: https://spacy.io/api/lookups#has_table
"""
return name in self._tables
def to_bytes(self, **kwargs) -> bytes:
"""Serialize the lookups to a bytestring.
RETURNS (bytes): The serialized Lookups.
DOCS: https://spacy.io/api/lookups#to_bytes
"""
return srsly.msgpack_dumps(self._tables)
def from_bytes(self, bytes_data: bytes, **kwargs) -> "Lookups":
"""Load the lookups from a bytestring.
bytes_data (bytes): The data to load.
RETURNS (Lookups): The loaded Lookups.
DOCS: https://spacy.io/api/lookups#from_bytes
"""
self._tables = {}
for key, value in srsly.msgpack_loads(bytes_data).items():
self._tables[key] = Table(key, value)
return self
def to_disk(
self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs
) -> None:
"""Save the lookups to a directory as lookups.bin. Expects a path to a
directory, which will be created if it doesn't exist.
path (str / Path): The directory path.
DOCS: https://spacy.io/api/lookups#to_disk
"""
path = ensure_path(path)
if not path.exists():
path.mkdir()
filepath = path / filename
with filepath.open("wb") as file_:
file_.write(self.to_bytes())
def from_disk(
self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs
) -> "Lookups":
"""Load lookups from a directory containing a lookups.bin. Will skip
loading if the file doesn't exist.
path (str / Path): The directory path.
RETURNS (Lookups): The loaded lookups.
DOCS: https://spacy.io/api/lookups#from_disk
"""
path = ensure_path(path)
filepath = path / filename
if filepath.exists():
with filepath.open("rb") as file_:
data = file_.read()
return self.from_bytes(data)
return self
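# A hedged round-trip sketch for Lookups; "./output" is a hypothetical
# directory:
#
#     from spacy.lookups import Lookups
#     lookups = Lookups()
#     lookups.add_table("my_table", {"foo": "bar"})
#     lookups.to_disk("./output")                # writes ./output/lookups.bin
#     lookups2 = Lookups().from_disk("./output")
#     lookups2.get_table("my_table")["foo"]      # "bar"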
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
cimport numpy as np
from libc.stdint cimport uint64_t
from .structs cimport MorphAnalysisC
from .strings cimport StringStore
from .typedefs cimport attr_t, hash_t
cdef class Morphology:
cdef readonly Pool mem
cdef readonly StringStore strings
cdef PreshMap tags # Keyed by hash, value is pointer to tag
cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *
cdef int insert(self, MorphAnalysisC tag) except -1
cdef int check_feature(const MorphAnalysisC* morph, attr_t feature) nogil
cdef list list_features(const MorphAnalysisC* morph)
cdef np.ndarray get_by_field(const MorphAnalysisC* morph, attr_t field)
cdef int get_n_by_field(attr_t* results, const MorphAnalysisC* morph, attr_t field) nogil
# cython: infer_types
import numpy
import warnings
from .attrs cimport POS
from .parts_of_speech import IDS as POS_IDS
from .errors import Warnings
from . import symbols
cdef class Morphology:
"""Store the possible morphological analyses for a language, and index them
by hash.
To save space on each token, tokens only know the hash of their
morphological analysis, so queries of morphological attributes are delegated
to this class.
"""
FEATURE_SEP = "|"
FIELD_SEP = "="
VALUE_SEP = ","
# not an empty string so we can distinguish unset morph from empty morph
EMPTY_MORPH = symbols.NAMES[symbols._]
def __init__(self, StringStore strings):
self.mem = Pool()
self.strings = strings
self.tags = PreshMap()
def __reduce__(self):
tags = set([self.get(self.strings[s]) for s in self.strings])
tags -= set([""])
return (unpickle_morphology, (self.strings, sorted(tags)), None, None)
def add(self, features):
"""Insert a morphological analysis in the morphology table, if not
already present. The morphological analysis may be provided in the UD
FEATS format as a string or in the tag map dict format.
Returns the hash of the new analysis.
"""
cdef MorphAnalysisC* tag_ptr
if isinstance(features, str):
if features == "":
features = self.EMPTY_MORPH
tag_ptr = <MorphAnalysisC*>self.tags.get(<hash_t>self.strings[features])
if tag_ptr != NULL:
return tag_ptr.key
features = self.feats_to_dict(features)
if not isinstance(features, dict):
warnings.warn(Warnings.W100.format(feature=features))
features = {}
string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()}
# intified ("Field", "Field=Value") pairs
field_feature_pairs = []
for field in sorted(string_features):
values = string_features[field]
for value in values.split(self.VALUE_SEP):
field_feature_pairs.append((
self.strings.add(field),
self.strings.add(field + self.FIELD_SEP + value),
))
cdef MorphAnalysisC tag = self.create_morph_tag(field_feature_pairs)
# the hash key for the tag is either the hash of the normalized UFEATS
# string or the hash of an empty placeholder
norm_feats_string = self.normalize_features(features)
tag.key = self.strings.add(norm_feats_string)
self.insert(tag)
return tag.key
def normalize_features(self, features):
"""Create a normalized FEATS string from a features string or dict.
features (Union[dict, str]): Features as dict or UFEATS string.
RETURNS (str): Features as normalized UFEATS string.
"""
if isinstance(features, str):
features = self.feats_to_dict(features)
if not isinstance(features, dict):
warnings.warn(Warnings.W100.format(feature=features))
features = {}
features = self.normalize_attrs(features)
string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()}
# normalized UFEATS string with sorted fields and values
norm_feats_string = self.FEATURE_SEP.join(sorted([
self.FIELD_SEP.join([field, values])
for field, values in string_features.items()
]))
return norm_feats_string or self.EMPTY_MORPH
def normalize_attrs(self, attrs):
"""Convert attrs dict so that POS is always by ID, other features are
by string. Values separated by VALUE_SEP are sorted.
"""
out = {}
attrs = dict(attrs)
for key, value in attrs.items():
# convert POS value to ID
if key == POS or (isinstance(key, str) and key.upper() == "POS"):
if isinstance(value, str) and value.upper() in POS_IDS:
value = POS_IDS[value.upper()]
elif isinstance(value, int) and value not in POS_IDS.values():
warnings.warn(Warnings.W100.format(feature={key: value}))
continue
out[POS] = value
# accept any string or ID fields and values and convert to strings
elif isinstance(key, (int, str)) and isinstance(value, (int, str)):
key = self.strings.as_string(key)
value = self.strings.as_string(value)
# sort values
if self.VALUE_SEP in value:
value = self.VALUE_SEP.join(sorted(value.split(self.VALUE_SEP)))
out[key] = value
else:
warnings.warn(Warnings.W100.format(feature={key: value}))
return out
cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *:
"""Creates a MorphAnalysisC from a list of intified
("Field", "Field=Value") tuples where fields with multiple values have
been split into individual tuples, e.g.:
[("Field1", "Field1=Value1"), ("Field1", "Field1=Value2"),
("Field2", "Field2=Value3")]
"""
cdef MorphAnalysisC tag
tag.length = len(field_feature_pairs)
if tag.length > 0:
tag.fields = <attr_t*>self.mem.alloc(tag.length, sizeof(attr_t))
tag.features = <attr_t*>self.mem.alloc(tag.length, sizeof(attr_t))
for i, (field, feature) in enumerate(field_feature_pairs):
tag.fields[i] = field
tag.features[i] = feature
return tag
cdef int insert(self, MorphAnalysisC tag) except -1:
cdef hash_t key = tag.key
if self.tags.get(key) == NULL:
tag_ptr = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
tag_ptr[0] = tag
self.tags.set(key, <void*>tag_ptr)
def get(self, hash_t morph):
tag = <MorphAnalysisC*>self.tags.get(morph)
if tag == NULL:
return ""
else:
return self.strings[tag.key]
@staticmethod
def feats_to_dict(feats):
if not feats or feats == Morphology.EMPTY_MORPH:
return {}
return {field: Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP))) for field, values in
[feat.split(Morphology.FIELD_SEP) for feat in feats.split(Morphology.FEATURE_SEP)]}
@staticmethod
def dict_to_feats(feats_dict):
if len(feats_dict) == 0:
return ""
return Morphology.FEATURE_SEP.join(sorted([Morphology.FIELD_SEP.join([field, Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP)))]) for field, values in feats_dict.items()]))
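# A short example of the FEATS helpers above:
#
#     from spacy.morphology import Morphology
#     Morphology.feats_to_dict("Case=Nom|Number=Sing")
#     # {'Case': 'Nom', 'Number': 'Sing'}
#     Morphology.dict_to_feats({"Number": "Sing", "Case": "Nom"})
#     # 'Case=Nom|Number=Sing'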
cdef int check_feature(const MorphAnalysisC* morph, attr_t feature) nogil:
cdef int i
for i in range(morph.length):
if morph.features[i] == feature:
return True
return False
cdef list list_features(const MorphAnalysisC* morph):
cdef int i
features = []
for i in range(morph.length):
features.append(morph.features[i])
return features
cdef np.ndarray get_by_field(const MorphAnalysisC* morph, attr_t field):
cdef np.ndarray results = numpy.zeros((morph.length,), dtype="uint64")
n = get_n_by_field(<uint64_t*>results.data, morph, field)
return results[:n]
cdef int get_n_by_field(attr_t* results, const MorphAnalysisC* morph, attr_t field) nogil:
cdef int n_results = 0
cdef int i
for i in range(morph.length):
if morph.fields[i] == field:
results[n_results] = morph.features[i]
n_results += 1
return n_results
def unpickle_morphology(strings, tags):
cdef Morphology morphology = Morphology(strings)
for tag in tags:
morphology.add(tag)
return morphology
from . cimport symbols
cpdef enum univ_pos_t:
NO_TAG = 0
ADJ = symbols.ADJ
ADP
ADV
AUX
CONJ
CCONJ # U20
DET
INTJ
NOUN
NUM
PART
PRON
PROPN
PUNCT
SCONJ
SYM
VERB
X
EOL
SPACE
IDS = {
"": NO_TAG,
"ADJ": ADJ,
"ADP": ADP,
"ADV": ADV,
"AUX": AUX,
"CONJ": CONJ, # U20
"CCONJ": CCONJ,
"DET": DET,
"INTJ": INTJ,
"NOUN": NOUN,
"NUM": NUM,
"PART": PART,
"PRON": PRON,
"PROPN": PROPN,
"PUNCT": PUNCT,
"SCONJ": SCONJ,
"SYM": SYM,
"VERB": VERB,
"X": X,
"EOL": EOL,
"SPACE": SPACE
}
NAMES = {value: key for key, value in IDS.items()}
from typing import List, Set, Dict, Iterable, ItemsView, Union, TYPE_CHECKING
from wasabi import msg
from .tokens import Doc, Token, Span
from .errors import Errors
from .util import dot_to_dict
if TYPE_CHECKING:
# This lets us add type hints for mypy etc. without causing circular imports
from .language import Language # noqa: F401
DEFAULT_KEYS = ["requires", "assigns", "scores", "retokenizes"]
def validate_attrs(values: Iterable[str]) -> Iterable[str]:
"""Validate component attributes provided to "assigns", "requires" etc.
Raises error for invalid attributes and formatting. Doesn't check if
custom extension attributes are registered, since this is something the
user might want to do themselves later in the component.
values (Iterable[str]): The string attributes to check, e.g. `["token.pos"]`.
RETURNS (Iterable[str]): The checked attributes.
"""
data = dot_to_dict({value: True for value in values})
objs = {"doc": Doc, "token": Token, "span": Span}
for obj_key, attrs in data.items():
if obj_key == "span":
# Support Span only for custom extension attributes
span_attrs = [attr for attr in values if attr.startswith("span.")]
span_attrs = [attr for attr in span_attrs if not attr.startswith("span._.")]
if span_attrs:
raise ValueError(Errors.E180.format(attrs=", ".join(span_attrs)))
if obj_key not in objs: # first element is not doc/token/span
invalid_attrs = ", ".join(a for a in values if a.startswith(obj_key))
raise ValueError(Errors.E181.format(obj=obj_key, attrs=invalid_attrs))
if not isinstance(attrs, dict): # attr is something like "doc"
raise ValueError(Errors.E182.format(attr=obj_key))
for attr, value in attrs.items():
if attr == "_":
if value is True: # attr is something like "doc._"
raise ValueError(Errors.E182.format(attr="{}._".format(obj_key)))
for ext_attr, ext_value in value.items():
# We don't check whether the attribute actually exists
if ext_value is not True: # attr is something like doc._.x.y
good = f"{obj_key}._.{ext_attr}"
bad = f"{good}.{'.'.join(ext_value)}"
raise ValueError(Errors.E183.format(attr=bad, solution=good))
continue # we can't validate those further
if attr.endswith("_"): # attr is something like "token.pos_"
raise ValueError(Errors.E184.format(attr=attr, solution=attr[:-1]))
if value is not True: # attr is something like doc.x.y
good = f"{obj_key}.{attr}"
bad = f"{good}.{'.'.join(value)}"
raise ValueError(Errors.E183.format(attr=bad, solution=good))
obj = objs[obj_key]
if not hasattr(obj, attr):
raise ValueError(Errors.E185.format(obj=obj_key, attr=attr))
return values
def get_attr_info(nlp: "Language", attr: str) -> Dict[str, List[str]]:
"""Check which components in the pipeline assign or require an attribute.
nlp (Language): The current nlp object.
attr (str): The attribute, e.g. "doc.tensor".
RETURNS (Dict[str, List[str]]): A dict keyed by "assigns" and "requires",
mapped to a list of component names.
"""
result: Dict[str, List[str]] = {"assigns": [], "requires": []}
for pipe_name in nlp.pipe_names:
meta = nlp.get_pipe_meta(pipe_name)
if attr in meta.assigns:
result["assigns"].append(pipe_name)
if attr in meta.requires:
result["requires"].append(pipe_name)
return result
def analyze_pipes(
nlp: "Language", *, keys: List[str] = DEFAULT_KEYS
) -> Dict[str, Dict[str, Union[List[str], Dict]]]:
"""Print a formatted summary for the current nlp object's pipeline. Shows
a table with the pipeline components and why they assign and require, as
well as any problems if available.
nlp (Language): The nlp object.
keys (List[str]): The meta keys to show in the table.
RETURNS (dict): A dict with "summary" and "problems".
"""
result: Dict[str, Dict[str, Union[List[str], Dict]]] = {
"summary": {},
"problems": {},
}
all_attrs: Set[str] = set()
for i, name in enumerate(nlp.pipe_names):
meta = nlp.get_pipe_meta(name)
all_attrs.update(meta.assigns)
all_attrs.update(meta.requires)
result["summary"][name] = {key: getattr(meta, key, None) for key in keys}
prev_pipes = nlp.pipeline[:i]
requires = {annot: False for annot in meta.requires}
if requires:
for prev_name, prev_pipe in prev_pipes:
prev_meta = nlp.get_pipe_meta(prev_name)
for annot in prev_meta.assigns:
requires[annot] = True
result["problems"][name] = [
annot for annot, fulfilled in requires.items() if not fulfilled
]
result["attrs"] = {attr: get_attr_info(nlp, attr) for attr in all_attrs}
return result
def print_pipe_analysis(
analysis: Dict[str, Dict[str, Union[List[str], Dict]]],
*,
keys: List[str] = DEFAULT_KEYS,
) -> None:
"""Print a formatted version of the pipe analysis produced by analyze_pipes.
analysis (Dict[str, Union[List[str], Dict[str, List[str]]]]): The analysis.
keys (List[str]): The meta keys to show in the table.
"""
msg.divider("Pipeline Overview")
header = ["#", "Component", *[key.capitalize() for key in keys]]
summary: ItemsView = analysis["summary"].items()
body = [[i, n, *[v for v in m.values()]] for i, (n, m) in enumerate(summary)]
msg.table(body, header=header, divider=True, multiline=True)
n_problems = sum(len(p) for p in analysis["problems"].values())
if any(p for p in analysis["problems"].values()):
msg.divider(f"Problems ({n_problems})")
for name, problem in analysis["problems"].items():
if problem:
msg.warn(f"'{name}' requirements not met: {', '.join(problem)}")
else:
msg.good("No problems found.")
from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple
from typing import Iterable, TypeVar, TYPE_CHECKING
from enum import Enum
from pydantic import BaseModel, Field, ValidationError, validator, create_model
from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
from pydantic.main import ModelMetaclass
from thinc.api import Optimizer, ConfigValidationError, Model
from thinc.config import Promise
from collections import defaultdict
import inspect
from .attrs import NAMES
from .lookups import Lookups
from .util import is_cython_func
if TYPE_CHECKING:
# This lets us add type hints for mypy etc. without causing circular imports
from .language import Language # noqa: F401
from .training import Example # noqa: F401
from .vocab import Vocab # noqa: F401
# fmt: off
ItemT = TypeVar("ItemT")
Batcher = Union[Callable[[Iterable[ItemT]], Iterable[List[ItemT]]], Promise]
Reader = Union[Callable[["Language", str], Iterable["Example"]], Promise]
Logger = Union[Callable[["Language"], Tuple[Callable[[Dict[str, Any]], None], Callable]], Promise]
# fmt: on
def validate(schema: Type[BaseModel], obj: Dict[str, Any]) -> List[str]:
"""Validate data against a given pydantic schema.
obj (Dict[str, Any]): JSON-serializable data to validate.
schema (pydantic.BaseModel): The schema to validate against.
RETURNS (List[str]): A list of error messages, if available.
"""
try:
schema(**obj)
return []
except ValidationError as e:
errors = e.errors()
data = defaultdict(list)
for error in errors:
err_loc = " -> ".join([str(p) for p in error.get("loc", [])])
data[err_loc].append(error.get("msg"))
return [f"[{loc}] {', '.join(msg)}" for loc, msg in data.items()] # type: ignore[arg-type]
# Initialization
class ArgSchemaConfig:
extra = "forbid"
arbitrary_types_allowed = True
class ArgSchemaConfigExtra:
extra = "forbid"
arbitrary_types_allowed = True
def get_arg_model(
func: Callable,
*,
exclude: Iterable[str] = tuple(),
name: str = "ArgModel",
strict: bool = True,
) -> ModelMetaclass:
"""Generate a pydantic model for function arguments.
func (Callable): The function to generate the schema for.
exclude (Iterable[str]): Parameter names to ignore.
name (str): Name of created model class.
strict (bool): Don't allow extra arguments if no variable keyword arguments
are allowed on the function.
RETURNS (ModelMetaclass): A pydantic model.
"""
sig_args = {}
try:
sig = inspect.signature(func)
except ValueError:
# Typically happens if the method is part of a Cython module without
# binding=True. Here we just use an empty model that allows everything.
return create_model(name, __config__=ArgSchemaConfigExtra) # type: ignore[arg-type, return-value]
has_variable = False
for param in sig.parameters.values():
if param.name in exclude:
continue
if param.kind == param.VAR_KEYWORD:
# The function allows variable keyword arguments, so we shouldn't
# include **kwargs etc. in the schema; instead we switch to non-strict
# mode and pass through all other values
has_variable = True
continue
# If no annotation is specified assume it's anything
annotation = param.annotation if param.annotation != param.empty else Any
# If no default value is specified assume that it's required. Cython
# functions/methods will have param.empty for default value None so we
# need to treat them differently
default_empty = None if is_cython_func(func) else ...
default = param.default if param.default != param.empty else default_empty
sig_args[param.name] = (annotation, default)
is_strict = strict and not has_variable
sig_args["__config__"] = ArgSchemaConfig if is_strict else ArgSchemaConfigExtra # type: ignore[assignment]
return create_model(name, **sig_args) # type: ignore[arg-type, return-value]
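# Editor's note: illustrative sketch, not part of the original module. It shows
# how get_arg_model() turns a plain signature into a pydantic model; _demo_init
# and its arguments are hypothetical.
def _demo_init(x: int, y: str = "hello"):
    ...

_DemoArgs = get_arg_model(_demo_init, name="DemoArgModel")
assert _DemoArgs(x=1).dict() == {"x": 1, "y": "hello"}
# In strict mode (no **kwargs on the function), unexpected keywords or values of
# the wrong type raise pydantic.ValidationError.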
def validate_init_settings(
func: Callable,
settings: Dict[str, Any],
*,
section: Optional[str] = None,
name: str = "",
exclude: Iterable[str] = ("get_examples", "nlp"),
) -> Dict[str, Any]:
"""Validate initialization settings against the expected arguments in
the method signature. Will parse values if possible (e.g. int to string)
and return the updated settings dict. Will raise a ConfigValidationError
if types don't match or required values are missing.
func (Callable): The initialize method of a given component etc.
settings (Dict[str, Any]): The settings from the respective [initialize] block.
section (str): Initialize section, for error message.
name (str): Name of the block in the section.
exclude (Iterable[str]): Parameter names to exclude from schema.
RETURNS (Dict[str, Any]): The validated settings.
"""
schema = get_arg_model(func, exclude=exclude, name="InitArgModel")
try:
return schema(**settings).dict()
except ValidationError as e:
block = "initialize" if not section else f"initialize.{section}"
title = f"Error validating initialization settings in [{block}]"
raise ConfigValidationError(
title=title, errors=e.errors(), config=settings, parent=name
) from None
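# Editor's note: illustrative sketch, not part of the original module.
# _demo_initialize stands in for a component's initialize method; "get_examples"
# and "nlp" are excluded from the generated schema by default.
def _demo_initialize(get_examples, *, nlp=None, labels: List[str] = []):
    ...

assert validate_init_settings(_demo_initialize, {"labels": ["A", "B"]}) == {
    "labels": ["A", "B"]
}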
# Matcher token patterns
def validate_token_pattern(obj: list) -> List[str]:
# Try to convert non-string keys (e.g. {ORTH: "foo"} -> {"ORTH": "foo"})
get_key = lambda k: NAMES[k] if isinstance(k, int) and k < len(NAMES) else k
if isinstance(obj, list):
converted = []
for pattern in obj:
if isinstance(pattern, dict):
pattern = {get_key(k): v for k, v in pattern.items()}
converted.append(pattern)
obj = converted
return validate(TokenPatternSchema, {"pattern": obj})
class TokenPatternString(BaseModel):
REGEX: Optional[StrictStr] = Field(None, alias="regex")
IN: Optional[List[StrictStr]] = Field(None, alias="in")
NOT_IN: Optional[List[StrictStr]] = Field(None, alias="not_in")
IS_SUBSET: Optional[List[StrictStr]] = Field(None, alias="is_subset")
IS_SUPERSET: Optional[List[StrictStr]] = Field(None, alias="is_superset")
INTERSECTS: Optional[List[StrictStr]] = Field(None, alias="intersects")
class Config:
extra = "forbid"
allow_population_by_field_name = True # allow alias and field name
@validator("*", pre=True, each_item=True, allow_reuse=True)
def raise_for_none(cls, v):
if v is None:
raise ValueError("None / null is not allowed")
return v
class TokenPatternNumber(BaseModel):
REGEX: Optional[StrictStr] = Field(None, alias="regex")
IN: Optional[List[StrictInt]] = Field(None, alias="in")
NOT_IN: Optional[List[StrictInt]] = Field(None, alias="not_in")
IS_SUBSET: Optional[List[StrictInt]] = Field(None, alias="is_subset")
IS_SUPERSET: Optional[List[StrictInt]] = Field(None, alias="is_superset")
INTERSECTS: Optional[List[StrictInt]] = Field(None, alias="intersects")
EQ: Union[StrictInt, StrictFloat] = Field(None, alias="==")
NEQ: Union[StrictInt, StrictFloat] = Field(None, alias="!=")
GEQ: Union[StrictInt, StrictFloat] = Field(None, alias=">=")
LEQ: Union[StrictInt, StrictFloat] = Field(None, alias="<=")
GT: Union[StrictInt, StrictFloat] = Field(None, alias=">")
LT: Union[StrictInt, StrictFloat] = Field(None, alias="<")
class Config:
extra = "forbid"
allow_population_by_field_name = True # allow alias and field name
@validator("*", pre=True, each_item=True, allow_reuse=True)
def raise_for_none(cls, v):
if v is None:
raise ValueError("None / null is not allowed")
return v
class TokenPatternOperator(str, Enum):
plus: StrictStr = StrictStr("+")
start: StrictStr = StrictStr("*")
question: StrictStr = StrictStr("?")
exclamation: StrictStr = StrictStr("!")
StringValue = Union[TokenPatternString, StrictStr]
NumberValue = Union[TokenPatternNumber, StrictInt, StrictFloat]
UnderscoreValue = Union[
TokenPatternString, TokenPatternNumber, str, int, float, list, bool
]
class TokenPattern(BaseModel):
orth: Optional[StringValue] = None
text: Optional[StringValue] = None
lower: Optional[StringValue] = None
pos: Optional[StringValue] = None
tag: Optional[StringValue] = None
morph: Optional[StringValue] = None
dep: Optional[StringValue] = None
lemma: Optional[StringValue] = None
shape: Optional[StringValue] = None
ent_type: Optional[StringValue] = None
norm: Optional[StringValue] = None
length: Optional[NumberValue] = None
spacy: Optional[StrictBool] = None
is_alpha: Optional[StrictBool] = None
is_ascii: Optional[StrictBool] = None
is_digit: Optional[StrictBool] = None
is_lower: Optional[StrictBool] = None
is_upper: Optional[StrictBool] = None
is_title: Optional[StrictBool] = None
is_punct: Optional[StrictBool] = None
is_space: Optional[StrictBool] = None
is_bracket: Optional[StrictBool] = None
is_quote: Optional[StrictBool] = None
is_left_punct: Optional[StrictBool] = None
is_right_punct: Optional[StrictBool] = None
is_currency: Optional[StrictBool] = None
is_stop: Optional[StrictBool] = None
is_sent_start: Optional[StrictBool] = None
sent_start: Optional[StrictBool] = None
like_num: Optional[StrictBool] = None
like_url: Optional[StrictBool] = None
like_email: Optional[StrictBool] = None
op: Optional[TokenPatternOperator] = None
underscore: Optional[Dict[StrictStr, UnderscoreValue]] = Field(None, alias="_")
class Config:
extra = "forbid"
allow_population_by_field_name = True
alias_generator = lambda value: value.upper()
@validator("*", pre=True, allow_reuse=True)
def raise_for_none(cls, v):
if v is None:
raise ValueError("None / null is not allowed")
return v
class TokenPatternSchema(BaseModel):
pattern: List[TokenPattern] = Field(..., min_items=1)
class Config:
extra = "forbid"
# Model meta
class ModelMetaSchema(BaseModel):
# fmt: off
lang: StrictStr = Field(..., title="Two-letter language code, e.g. 'en'")
name: StrictStr = Field(..., title="Model name")
version: StrictStr = Field(..., title="Model version")
spacy_version: StrictStr = Field("", title="Compatible spaCy version identifier")
parent_package: StrictStr = Field("spacy", title="Name of parent spaCy package, e.g. spacy or spacy-nightly")
requirements: List[StrictStr] = Field([], title="Additional Python package dependencies, used for the Python package setup")
pipeline: List[StrictStr] = Field([], title="Names of pipeline components")
description: StrictStr = Field("", title="Model description")
license: StrictStr = Field("", title="Model license")
author: StrictStr = Field("", title="Model author name")
email: StrictStr = Field("", title="Model author email")
url: StrictStr = Field("", title="Model author URL")
sources: Optional[Union[List[StrictStr], List[Dict[str, str]]]] = Field(None, title="Training data sources")
vectors: Dict[str, Any] = Field({}, title="Included word vectors")
labels: Dict[str, List[str]] = Field({}, title="Component labels, keyed by component name")
performance: Dict[str, Any] = Field({}, title="Accuracy and speed numbers")
spacy_git_version: StrictStr = Field("", title="Commit of spaCy version used")
# fmt: on
# Config schema
# We're not setting any defaults here (which is too messy) and are making all
# fields required, so we can raise validation errors for missing values. To
# provide a default, we include a separate .cfg file with all values and
# check that against this schema in the test suite to make sure it's always
# up to date.
class ConfigSchemaTraining(BaseModel):
# fmt: off
dev_corpus: StrictStr = Field(..., title="Path in the config to the dev data")
train_corpus: StrictStr = Field(..., title="Path in the config to the training data")
batcher: Batcher = Field(..., title="Batcher for the training data")
dropout: StrictFloat = Field(..., title="Dropout rate")
patience: StrictInt = Field(..., title="How many steps to continue without improvement in evaluation score")
max_epochs: StrictInt = Field(..., title="Maximum number of epochs to train for")
max_steps: StrictInt = Field(..., title="Maximum number of update steps to train for")
eval_frequency: StrictInt = Field(..., title="How often to evaluate during training (steps)")
seed: Optional[StrictInt] = Field(..., title="Random seed")
gpu_allocator: Optional[StrictStr] = Field(..., title="Memory allocator when running on GPU")
accumulate_gradient: StrictInt = Field(..., title="Number of substeps to divide each batch into for gradient accumulation")
score_weights: Dict[StrictStr, Optional[Union[StrictFloat, StrictInt]]] = Field(..., title="Scores to report and their weights for selecting final model")
optimizer: Optimizer = Field(..., title="The optimizer to use")
logger: Logger = Field(..., title="The logger to track training progress")
frozen_components: List[str] = Field(..., title="Pipeline components that shouldn't be updated during training")
annotating_components: List[str] = Field(..., title="Pipeline components that should set annotations during training")
before_to_disk: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after training, before it's saved to disk")
# fmt: on
class Config:
extra = "forbid"
arbitrary_types_allowed = True
class ConfigSchemaNlp(BaseModel):
# fmt: off
lang: StrictStr = Field(..., title="The base language to use")
pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order")
disabled: List[StrictStr] = Field(..., title="Pipeline components to disable by default")
tokenizer: Callable = Field(..., title="The tokenizer to use")
before_creation: Optional[Callable[[Type["Language"]], Type["Language"]]] = Field(..., title="Optional callback to modify Language class before initialization")
after_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after creation and before the pipeline is constructed")
after_pipeline_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after the pipeline is constructed")
batch_size: Optional[int] = Field(..., title="Default batch size")
# fmt: on
class Config:
extra = "forbid"
arbitrary_types_allowed = True
class ConfigSchemaPretrainEmpty(BaseModel):
class Config:
extra = "forbid"
class ConfigSchemaPretrain(BaseModel):
# fmt: off
max_epochs: StrictInt = Field(..., title="Maximum number of epochs to train for")
dropout: StrictFloat = Field(..., title="Dropout rate")
n_save_every: Optional[StrictInt] = Field(..., title="Save an additional temporary model after every n batches within an epoch")
n_save_epoch: Optional[StrictInt] = Field(..., title="Save the model after every n epochs")
optimizer: Optimizer = Field(..., title="The optimizer to use")
corpus: StrictStr = Field(..., title="Path in the config to the training data")
batcher: Batcher = Field(..., title="Batcher for the training data")
component: str = Field(..., title="Component to find the layer to pretrain")
layer: str = Field(..., title="Layer to pretrain. Whole model if empty.")
objective: Callable[["Vocab", Model], Model] = Field(..., title="A function that creates the pretraining objective.")
# fmt: on
class Config:
extra = "forbid"
arbitrary_types_allowed = True
class ConfigSchemaInit(BaseModel):
# fmt: off
vocab_data: Optional[StrictStr] = Field(..., title="Path to JSON-formatted vocabulary file")
lookups: Optional[Lookups] = Field(..., title="Vocabulary lookups, e.g. lexeme normalization")
vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
tokenizer: Dict[StrictStr, Any] = Field(..., help="Arguments to be passed into Tokenizer.initialize")
components: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., help="Arguments for TrainablePipe.initialize methods of pipeline components, keyed by component")
before_init: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object before initialization")
after_init: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after initialization")
# fmt: on
class Config:
extra = "forbid"
arbitrary_types_allowed = True
class ConfigSchema(BaseModel):
training: ConfigSchemaTraining
nlp: ConfigSchemaNlp
pretraining: Union[ConfigSchemaPretrain, ConfigSchemaPretrainEmpty] = {} # type: ignore[assignment]
components: Dict[str, Dict[str, Any]]
corpora: Dict[str, Reader]
initialize: ConfigSchemaInit
class Config:
extra = "allow"
arbitrary_types_allowed = True
CONFIG_SCHEMAS = {
"nlp": ConfigSchemaNlp,
"training": ConfigSchemaTraining,
"pretraining": ConfigSchemaPretrain,
"initialize": ConfigSchemaInit,
}
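# Editor's note: illustrative sketch, not part of the original module. Because
# every field is required, validating a deliberately incomplete [nlp] block
# (the hypothetical overrides below) reports the missing keys instead of
# returning an empty error list.
_missing = validate(CONFIG_SCHEMAS["nlp"], {"lang": "en", "pipeline": ["tok2vec", "ner"]})
assert any(msg.endswith("field required") for msg in _missing)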
# Project config Schema
class ProjectConfigAssetGitItem(BaseModel):
# fmt: off
repo: StrictStr = Field(..., title="URL of Git repo to download from")
path: StrictStr = Field(..., title="File path or sub-directory to download (used for sparse checkout)")
branch: StrictStr = Field("master", title="Branch to clone from")
# fmt: on
class ProjectConfigAssetURL(BaseModel):
# fmt: off
dest: StrictStr = Field(..., title="Destination of downloaded asset")
url: Optional[StrictStr] = Field(None, title="URL of asset")
checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
description: StrictStr = Field("", title="Description of asset")
# fmt: on
class ProjectConfigAssetGit(BaseModel):
# fmt: off
git: ProjectConfigAssetGitItem = Field(..., title="Git repo information")
checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
description: Optional[StrictStr] = Field(None, title="Description of asset")
# fmt: on
class ProjectConfigCommand(BaseModel):
# fmt: off
name: StrictStr = Field(..., title="Name of command")
help: Optional[StrictStr] = Field(None, title="Command description")
script: List[StrictStr] = Field([], title="List of CLI commands to run, in order")
deps: List[StrictStr] = Field([], title="File dependencies required by this command")
outputs: List[StrictStr] = Field([], title="Outputs produced by this command")
outputs_no_cache: List[StrictStr] = Field([], title="Outputs not tracked by DVC (DVC only)")
no_skip: bool = Field(False, title="Never skip this command, even if nothing changed")
# fmt: on
class Config:
title = "A single named command specified in a project config"
extra = "forbid"
class ProjectConfigSchema(BaseModel):
# fmt: off
vars: Dict[StrictStr, Any] = Field({}, title="Optional variables to substitute in commands")
env: Dict[StrictStr, Any] = Field({}, title="Optional variable names to substitute in commands, mapped to environment variable names")
assets: List[Union[ProjectConfigAssetURL, ProjectConfigAssetGit]] = Field([], title="Data assets")
workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order")
commands: List[ProjectConfigCommand] = Field([], title="Project command shortcuts")
title: Optional[str] = Field(None, title="Project title")
spacy_version: Optional[StrictStr] = Field(None, title="spaCy version range that the project is compatible with")
# fmt: on
class Config:
title = "Schema for project configuration file"
# Recommendations for init config workflows
class RecommendationTrfItem(BaseModel):
name: str
size_factor: int
class RecommendationTrf(BaseModel):
efficiency: RecommendationTrfItem
accuracy: RecommendationTrfItem
class RecommendationSchema(BaseModel):
word_vectors: Optional[str] = None
transformer: Optional[RecommendationTrf] = None
has_letters: bool = True
from typing import Optional, Iterable, Dict, Set, List, Any, Callable, Tuple
from typing import TYPE_CHECKING
import numpy as np
from collections import defaultdict
from .training import Example
from .tokens import Token, Doc, Span
from .errors import Errors
from .util import get_lang_class, SimpleFrozenList
from .morphology import Morphology
if TYPE_CHECKING:
# This lets us add type hints for mypy etc. without causing circular imports
from .language import Language # noqa: F401
DEFAULT_PIPELINE = ("senter", "tagger", "morphologizer", "parser", "ner", "textcat")
MISSING_VALUES = frozenset([None, 0, ""])
class PRFScore:
"""A precision / recall / F score."""
def __init__(
self,
*,
tp: int = 0,
fp: int = 0,
fn: int = 0,
) -> None:
self.tp = tp
self.fp = fp
self.fn = fn
def __len__(self) -> int:
return self.tp + self.fp + self.fn
def __iadd__(self, other):
self.tp += other.tp
self.fp += other.fp
self.fn += other.fn
return self
def __add__(self, other):
return PRFScore(
tp=self.tp + other.tp, fp=self.fp + other.fp, fn=self.fn + other.fn
)
def score_set(self, cand: set, gold: set) -> None:
self.tp += len(cand.intersection(gold))
self.fp += len(cand - gold)
self.fn += len(gold - cand)
@property
def precision(self) -> float:
return self.tp / (self.tp + self.fp + 1e-100)
@property
def recall(self) -> float:
return self.tp / (self.tp + self.fn + 1e-100)
@property
def fscore(self) -> float:
p = self.precision
r = self.recall
return 2 * ((p * r) / (p + r + 1e-100))
def to_dict(self) -> Dict[str, float]:
return {"p": self.precision, "r": self.recall, "f": self.fscore}
class ROCAUCScore:
"""An AUC ROC score. This is only defined for binary classification.
Use the method is_binary before calculating the score, otherwise it
may throw an error."""
def __init__(self) -> None:
self.golds: List[Any] = []
self.cands: List[Any] = []
self.saved_score = 0.0
self.saved_score_at_len = 0
def score_set(self, cand, gold) -> None:
self.cands.append(cand)
self.golds.append(gold)
def is_binary(self):
return len(np.unique(self.golds)) == 2
@property
def score(self):
if not self.is_binary():
raise ValueError(Errors.E165.format(label=set(self.golds)))
if len(self.golds) == self.saved_score_at_len:
return self.saved_score
self.saved_score = _roc_auc_score(self.golds, self.cands)
self.saved_score_at_len = len(self.golds)
return self.saved_score
class Scorer:
"""Compute evaluation scores."""
def __init__(
self,
nlp: Optional["Language"] = None,
default_lang: str = "xx",
default_pipeline: Iterable[str] = DEFAULT_PIPELINE,
**cfg,
) -> None:
"""Initialize the Scorer.
DOCS: https://spacy.io/api/scorer#init
"""
self.cfg = cfg
if nlp:
self.nlp = nlp
else:
nlp = get_lang_class(default_lang)()
for pipe in default_pipeline:
nlp.add_pipe(pipe)
self.nlp = nlp
def score(self, examples: Iterable[Example]) -> Dict[str, Any]:
"""Evaluate a list of Examples.
examples (Iterable[Example]): The predicted annotations + correct annotations.
RETURNS (Dict): A dictionary of scores.
DOCS: https://spacy.io/api/scorer#score
"""
scores = {}
if hasattr(self.nlp.tokenizer, "score"):
scores.update(self.nlp.tokenizer.score(examples, **self.cfg)) # type: ignore
for name, component in self.nlp.pipeline:
if hasattr(component, "score"):
scores.update(component.score(examples, **self.cfg))
return scores
@staticmethod
def score_tokenization(examples: Iterable[Example], **cfg) -> Dict[str, Any]:
"""Returns accuracy and PRF scores for tokenization.
* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for token character spans
examples (Iterable[Example]): Examples to score
RETURNS (Dict[str, Any]): A dictionary containing the scores
token_acc/p/r/f.
DOCS: https://spacy.io/api/scorer#score_tokenization
"""
acc_score = PRFScore()
prf_score = PRFScore()
for example in examples:
gold_doc = example.reference
pred_doc = example.predicted
if gold_doc.has_unknown_spaces:
continue
align = example.alignment
gold_spans = set()
pred_spans = set()
for token in gold_doc:
if token.orth_.isspace():
continue
gold_spans.add((token.idx, token.idx + len(token)))
for token in pred_doc:
if token.orth_.isspace():
continue
pred_spans.add((token.idx, token.idx + len(token)))
if align.x2y.lengths[token.i] != 1:
acc_score.fp += 1
else:
acc_score.tp += 1
prf_score.score_set(pred_spans, gold_spans)
if len(acc_score) > 0:
return {
"token_acc": acc_score.fscore,
"token_p": prf_score.precision,
"token_r": prf_score.recall,
"token_f": prf_score.fscore,
}
else:
return {
"token_acc": None,
"token_p": None,
"token_r": None,
"token_f": None,
}
@staticmethod
def score_token_attr(
examples: Iterable[Example],
attr: str,
*,
getter: Callable[[Token, str], Any] = getattr,
missing_values: Set[Any] = MISSING_VALUES, # type: ignore[assignment]
**cfg,
) -> Dict[str, Any]:
"""Returns an accuracy score for a token-level attribute.
examples (Iterable[Example]): Examples to score
attr (str): The attribute to score.
getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
getter(token, attr) should return the value of the attribute for an
individual token.
missing_values (Set[Any]): Attribute values to treat as missing annotation
in the reference annotation.
RETURNS (Dict[str, Any]): A dictionary containing the accuracy score
under the key attr_acc.
DOCS: https://spacy.io/api/scorer#score_token_attr
"""
tag_score = PRFScore()
for example in examples:
gold_doc = example.reference
pred_doc = example.predicted
align = example.alignment
gold_tags = set()
missing_indices = set()
for gold_i, token in enumerate(gold_doc):
value = getter(token, attr)
if value not in missing_values:
gold_tags.add((gold_i, getter(token, attr)))
else:
missing_indices.add(gold_i)
pred_tags = set()
for token in pred_doc:
if token.orth_.isspace():
continue
if align.x2y.lengths[token.i] == 1:
gold_i = align.x2y[token.i].dataXd[0, 0]
if gold_i not in missing_indices:
pred_tags.add((gold_i, getter(token, attr)))
tag_score.score_set(pred_tags, gold_tags)
score_key = f"{attr}_acc"
if len(tag_score) == 0:
return {score_key: None}
else:
return {score_key: tag_score.fscore}
@staticmethod
def score_token_attr_per_feat(
examples: Iterable[Example],
attr: str,
*,
getter: Callable[[Token, str], Any] = getattr,
missing_values: Set[Any] = MISSING_VALUES, # type: ignore[assignment]
**cfg,
) -> Dict[str, Any]:
"""Return micro PRF and PRF scores per feat for a token attribute in
UFEATS format.
examples (Iterable[Example]): Examples to score
attr (str): The attribute to score.
getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
getter(token, attr) should return the value of the attribute for an
individual token.
missing_values (Set[Any]): Attribute values to treat as missing
annotation in the reference annotation.
RETURNS (dict): A dictionary containing the micro PRF scores under the
key attr_micro_p/r/f and the per-feat PRF scores under
attr_per_feat.
"""
micro_score = PRFScore()
per_feat = {}
for example in examples:
pred_doc = example.predicted
gold_doc = example.reference
align = example.alignment
gold_per_feat: Dict[str, Set] = {}
missing_indices = set()
for gold_i, token in enumerate(gold_doc):
value = getter(token, attr)
morph = gold_doc.vocab.strings[value]
if value not in missing_values and morph != Morphology.EMPTY_MORPH:
for feat in morph.split(Morphology.FEATURE_SEP):
field, values = feat.split(Morphology.FIELD_SEP)
if field not in per_feat:
per_feat[field] = PRFScore()
if field not in gold_per_feat:
gold_per_feat[field] = set()
gold_per_feat[field].add((gold_i, feat))
else:
missing_indices.add(gold_i)
pred_per_feat: Dict[str, Set] = {}
for token in pred_doc:
if token.orth_.isspace():
continue
if align.x2y.lengths[token.i] == 1:
gold_i = align.x2y[token.i].dataXd[0, 0]
if gold_i not in missing_indices:
value = getter(token, attr)
morph = gold_doc.vocab.strings[value]
if (
value not in missing_values
and morph != Morphology.EMPTY_MORPH
):
for feat in morph.split(Morphology.FEATURE_SEP):
field, values = feat.split(Morphology.FIELD_SEP)
if field not in per_feat:
per_feat[field] = PRFScore()
if field not in pred_per_feat:
pred_per_feat[field] = set()
pred_per_feat[field].add((gold_i, feat))
for field in per_feat:
micro_score.score_set(
pred_per_feat.get(field, set()), gold_per_feat.get(field, set())
)
per_feat[field].score_set(
pred_per_feat.get(field, set()), gold_per_feat.get(field, set())
)
result: Dict[str, Any] = {}
if len(micro_score) > 0:
result[f"{attr}_micro_p"] = micro_score.precision
result[f"{attr}_micro_r"] = micro_score.recall
result[f"{attr}_micro_f"] = micro_score.fscore
result[f"{attr}_per_feat"] = {k: v.to_dict() for k, v in per_feat.items()}
else:
result[f"{attr}_micro_p"] = None
result[f"{attr}_micro_r"] = None
result[f"{attr}_micro_f"] = None
result[f"{attr}_per_feat"] = None
return result
@staticmethod
def score_spans(
examples: Iterable[Example],
attr: str,
*,
getter: Callable[[Doc, str], Iterable[Span]] = getattr,
has_annotation: Optional[Callable[[Doc], bool]] = None,
labeled: bool = True,
allow_overlap: bool = False,
**cfg,
) -> Dict[str, Any]:
"""Returns PRF scores for labeled spans.
examples (Iterable[Example]): Examples to score
attr (str): The attribute to score.
getter (Callable[[Doc, str], Iterable[Span]]): Defaults to getattr. If
provided, getter(doc, attr) should return the spans for the
individual doc.
has_annotation (Optional[Callable[[Doc], bool]]) should return whether a `Doc`
has annotation for this `attr`. Docs without annotation are skipped for
scoring purposes.
labeled (bool): Whether or not to include label information in
the evaluation. If set to 'False', two spans will be considered
equal if their start and end match, irrespective of their label.
allow_overlap (bool): Whether or not to allow overlapping spans.
If set to 'False', the alignment will automatically resolve conflicts.
RETURNS (Dict[str, Any]): A dictionary containing the PRF scores under
the keys attr_p/r/f and the per-type PRF scores under attr_per_type.
DOCS: https://spacy.io/api/scorer#score_spans
"""
score = PRFScore()
score_per_type = dict()
for example in examples:
pred_doc = example.predicted
gold_doc = example.reference
# Option to handle docs without annotation for this attribute
if has_annotation is not None:
if not has_annotation(gold_doc):
continue
# Find all labels in gold and doc
labels = set(
[k.label_ for k in getter(gold_doc, attr)]
+ [k.label_ for k in getter(pred_doc, attr)]
)
# Set up all labels for per type scoring and prepare gold per type
gold_per_type: Dict[str, Set] = {label: set() for label in labels}
for label in labels:
if label not in score_per_type:
score_per_type[label] = PRFScore()
# Collect gold and predicted spans, for all labels and per type
gold_spans = set()
pred_spans = set()
for span in getter(gold_doc, attr):
gold_span: Tuple
if labeled:
gold_span = (span.label_, span.start, span.end - 1)
else:
gold_span = (span.start, span.end - 1)
gold_spans.add(gold_span)
gold_per_type[span.label_].add(gold_span)
pred_per_type: Dict[str, Set] = {label: set() for label in labels}
for span in example.get_aligned_spans_x2y(
getter(pred_doc, attr), allow_overlap
):
pred_span: Tuple
if labeled:
pred_span = (span.label_, span.start, span.end - 1)
else:
pred_span = (span.start, span.end - 1)
pred_spans.add(pred_span)
pred_per_type[span.label_].add(pred_span)
# Scores per label
if labeled:
for k, v in score_per_type.items():
if k in pred_per_type:
v.score_set(pred_per_type[k], gold_per_type[k])
# Score for all labels
score.score_set(pred_spans, gold_spans)
# Assemble final result
final_scores: Dict[str, Any] = {
f"{attr}_p": None,
f"{attr}_r": None,
f"{attr}_f": None,
}
if labeled:
final_scores[f"{attr}_per_type"] = None
if len(score) > 0:
final_scores[f"{attr}_p"] = score.precision
final_scores[f"{attr}_r"] = score.recall
final_scores[f"{attr}_f"] = score.fscore
if labeled:
final_scores[f"{attr}_per_type"] = {
k: v.to_dict() for k, v in score_per_type.items()
}
return final_scores
@staticmethod
def score_cats(
examples: Iterable[Example],
attr: str,
*,
getter: Callable[[Doc, str], Any] = getattr,
labels: Iterable[str] = SimpleFrozenList(),
multi_label: bool = True,
positive_label: Optional[str] = None,
threshold: Optional[float] = None,
**cfg,
) -> Dict[str, Any]:
"""Returns PRF and ROC AUC scores for a doc-level attribute with a
dict with scores for each label like Doc.cats. The reported overall
score depends on the scorer settings.
examples (Iterable[Example]): Examples to score
attr (str): The attribute to score.
getter (Callable[[Doc, str], Any]): Defaults to getattr. If provided,
getter(doc, attr) should return the values for the individual doc.
labels (Iterable[str]): The set of possible labels. Defaults to [].
multi_label (bool): Whether the attribute allows multiple labels.
Defaults to True.
positive_label (str): The positive label for a binary task with
exclusive classes. Defaults to None.
threshold (float): Cutoff to consider a prediction "positive". Defaults
to 0.5 for multi-label, and 0.0 (i.e. whatever's highest scoring)
otherwise.
RETURNS (Dict[str, Any]): A dictionary containing the scores, with
inapplicable scores as None:
for all:
attr_score (one of attr_micro_f / attr_macro_f / attr_macro_auc),
attr_score_desc (text description of the overall score),
attr_micro_p,
attr_micro_r,
attr_micro_f,
attr_macro_p,
attr_macro_r,
attr_macro_f,
attr_macro_auc,
attr_f_per_type,
attr_auc_per_type
DOCS: https://spacy.io/api/scorer#score_cats
"""
if threshold is None:
threshold = 0.5 if multi_label else 0.0
f_per_type = {label: PRFScore() for label in labels}
auc_per_type = {label: ROCAUCScore() for label in labels}
labels = set(labels)
if labels:
for eg in examples:
labels.update(eg.predicted.cats.keys())
labels.update(eg.reference.cats.keys())
for example in examples:
# Throughout this loop, None in gold_cats indicates a missing label.
pred_cats = getter(example.predicted, attr)
gold_cats = getter(example.reference, attr)
for label in labels:
pred_score = pred_cats.get(label, 0.0)
gold_score = gold_cats.get(label, 0.0)
if gold_score is not None:
auc_per_type[label].score_set(pred_score, gold_score)
if multi_label:
for label in labels:
pred_score = pred_cats.get(label, 0.0)
gold_score = gold_cats.get(label, 0.0)
if gold_score is not None:
if pred_score >= threshold and gold_score > 0:
f_per_type[label].tp += 1
elif pred_score >= threshold and gold_score == 0:
f_per_type[label].fp += 1
elif pred_score < threshold and gold_score > 0:
f_per_type[label].fn += 1
elif pred_cats and gold_cats:
# Get the highest-scoring for each.
pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1])
gold_label, gold_score = max(gold_cats.items(), key=lambda it: it[1])
if gold_score is not None:
if pred_label == gold_label and pred_score >= threshold:
f_per_type[pred_label].tp += 1
else:
f_per_type[gold_label].fn += 1
if pred_score >= threshold:
f_per_type[pred_label].fp += 1
elif gold_cats:
gold_label, gold_score = max(gold_cats.items(), key=lambda it: it[1])
if gold_score is not None and gold_score > 0:
f_per_type[gold_label].fn += 1
elif pred_cats:
pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1])
if pred_score >= threshold:
f_per_type[pred_label].fp += 1
micro_prf = PRFScore()
for label_prf in f_per_type.values():
micro_prf.tp += label_prf.tp
micro_prf.fn += label_prf.fn
micro_prf.fp += label_prf.fp
n_cats = len(f_per_type) + 1e-100
macro_p = sum(prf.precision for prf in f_per_type.values()) / n_cats
macro_r = sum(prf.recall for prf in f_per_type.values()) / n_cats
macro_f = sum(prf.fscore for prf in f_per_type.values()) / n_cats
# Limit macro_auc to those labels with gold annotations,
# but still divide by all cats to avoid artificial boosting of datasets with missing labels
macro_auc = (
sum(auc.score if auc.is_binary() else 0.0 for auc in auc_per_type.values())
/ n_cats
)
results: Dict[str, Any] = {
f"{attr}_score": None,
f"{attr}_score_desc": None,
f"{attr}_micro_p": micro_prf.precision,
f"{attr}_micro_r": micro_prf.recall,
f"{attr}_micro_f": micro_prf.fscore,
f"{attr}_macro_p": macro_p,
f"{attr}_macro_r": macro_r,
f"{attr}_macro_f": macro_f,
f"{attr}_macro_auc": macro_auc,
f"{attr}_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()},
f"{attr}_auc_per_type": {
k: v.score if v.is_binary() else None for k, v in auc_per_type.items()
},
}
if len(labels) == 2 and not multi_label and positive_label:
positive_label_f = results[f"{attr}_f_per_type"][positive_label]["f"]
results[f"{attr}_score"] = positive_label_f
results[f"{attr}_score_desc"] = f"F ({positive_label})"
elif not multi_label:
results[f"{attr}_score"] = results[f"{attr}_macro_f"]
results[f"{attr}_score_desc"] = "macro F"
else:
results[f"{attr}_score"] = results[f"{attr}_macro_auc"]
results[f"{attr}_score_desc"] = "macro AUC"
return results
@staticmethod
def score_links(
examples: Iterable[Example], *, negative_labels: Iterable[str], **cfg
) -> Dict[str, Any]:
"""Returns PRF for predicted links on the entity level.
To disentangle the performance of the NEL from the NER,
this method only evaluates NEL links for entities that overlap
between the gold reference and the predictions.
examples (Iterable[Example]): Examples to score
negative_labels (Iterable[str]): The string values that refer to no annotation (e.g. "NIL")
RETURNS (Dict[str, Any]): A dictionary containing the scores.
DOCS: https://spacy.io/api/scorer#score_links
"""
f_per_type = {}
for example in examples:
gold_ent_by_offset = {}
for gold_ent in example.reference.ents:
gold_ent_by_offset[(gold_ent.start_char, gold_ent.end_char)] = gold_ent
for pred_ent in example.predicted.ents:
gold_span = gold_ent_by_offset.get(
(pred_ent.start_char, pred_ent.end_char), None
)
if gold_span is not None:
label = gold_span.label_
if label not in f_per_type:
f_per_type[label] = PRFScore()
gold = gold_span.kb_id_
# only evaluating entities that overlap between gold and pred,
# to disentangle the performance of the NEL from the NER
if gold is not None:
pred = pred_ent.kb_id_
if gold in negative_labels and pred in negative_labels:
# ignore true negatives
pass
elif gold == pred:
f_per_type[label].tp += 1
elif gold in negative_labels:
f_per_type[label].fp += 1
elif pred in negative_labels:
f_per_type[label].fn += 1
else:
# a wrong prediction (e.g. Q42 != Q3) counts as both a FP as well as a FN
f_per_type[label].fp += 1
f_per_type[label].fn += 1
micro_prf = PRFScore()
for label_prf in f_per_type.values():
micro_prf.tp += label_prf.tp
micro_prf.fn += label_prf.fn
micro_prf.fp += label_prf.fp
n_labels = len(f_per_type) + 1e-100
macro_p = sum(prf.precision for prf in f_per_type.values()) / n_labels
macro_r = sum(prf.recall for prf in f_per_type.values()) / n_labels
macro_f = sum(prf.fscore for prf in f_per_type.values()) / n_labels
results = {
f"nel_score": micro_prf.fscore,
f"nel_score_desc": "micro F",
f"nel_micro_p": micro_prf.precision,
f"nel_micro_r": micro_prf.recall,
f"nel_micro_f": micro_prf.fscore,
f"nel_macro_p": macro_p,
f"nel_macro_r": macro_r,
f"nel_macro_f": macro_f,
f"nel_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()},
}
return results
@staticmethod
def score_deps(
examples: Iterable[Example],
attr: str,
*,
getter: Callable[[Token, str], Any] = getattr,
head_attr: str = "head",
head_getter: Callable[[Token, str], Token] = getattr,
ignore_labels: Iterable[str] = SimpleFrozenList(),
missing_values: Set[Any] = MISSING_VALUES, # type: ignore[assignment]
**cfg,
) -> Dict[str, Any]:
"""Returns the UAS, LAS, and LAS per type scores for dependency
parses.
examples (Iterable[Example]): Examples to score
attr (str): The attribute containing the dependency label.
getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
getter(token, attr) should return the value of the attribute for an
individual token.
head_attr (str): The attribute containing the head token. Defaults to
'head'.
head_getter (Callable[[Token, str], Token]): Defaults to getattr. If provided,
head_getter(token, attr) should return the value of the head for an
individual token.
ignore_labels (Tuple): Labels to ignore while scoring (e.g., punct).
missing_values (Set[Any]): Attribute values to treat as missing annotation
in the reference annotation.
RETURNS (Dict[str, Any]): A dictionary containing the scores:
attr_uas, attr_las, and attr_las_per_type.
DOCS: https://spacy.io/api/scorer#score_deps
"""
unlabelled = PRFScore()
labelled = PRFScore()
labelled_per_dep = dict()
missing_indices = set()
for example in examples:
gold_doc = example.reference
pred_doc = example.predicted
align = example.alignment
gold_deps = set()
gold_deps_per_dep: Dict[str, Set] = {}
for gold_i, token in enumerate(gold_doc):
dep = getter(token, attr)
head = head_getter(token, head_attr)
if dep not in missing_values:
if dep not in ignore_labels:
gold_deps.add((gold_i, head.i, dep))
if dep not in labelled_per_dep:
labelled_per_dep[dep] = PRFScore()
if dep not in gold_deps_per_dep:
gold_deps_per_dep[dep] = set()
gold_deps_per_dep[dep].add((gold_i, head.i, dep))
else:
missing_indices.add(gold_i)
pred_deps = set()
pred_deps_per_dep: Dict[str, Set] = {}
for token in pred_doc:
if token.orth_.isspace():
continue
if align.x2y.lengths[token.i] != 1:
gold_i = None # type: ignore
else:
gold_i = align.x2y[token.i].dataXd[0, 0]
if gold_i not in missing_indices:
dep = getter(token, attr)
head = head_getter(token, head_attr)
if dep not in ignore_labels and token.orth_.strip():
if align.x2y.lengths[head.i] == 1:
gold_head = align.x2y[head.i].dataXd[0, 0]
else:
gold_head = None
# None is indistinct, so we can't just add it to the set
# Multiple (None, None) deps are possible
if gold_i is None or gold_head is None:
unlabelled.fp += 1
labelled.fp += 1
else:
pred_deps.add((gold_i, gold_head, dep))
if dep not in labelled_per_dep:
labelled_per_dep[dep] = PRFScore()
if dep not in pred_deps_per_dep:
pred_deps_per_dep[dep] = set()
pred_deps_per_dep[dep].add((gold_i, gold_head, dep))
labelled.score_set(pred_deps, gold_deps)
for dep in labelled_per_dep:
labelled_per_dep[dep].score_set(
pred_deps_per_dep.get(dep, set()), gold_deps_per_dep.get(dep, set())
)
unlabelled.score_set(
set(item[:2] for item in pred_deps), set(item[:2] for item in gold_deps)
)
if len(unlabelled) > 0:
return {
f"{attr}_uas": unlabelled.fscore,
f"{attr}_las": labelled.fscore,
f"{attr}_las_per_type": {
k: v.to_dict() for k, v in labelled_per_dep.items()
},
}
else:
return {
f"{attr}_uas": None,
f"{attr}_las": None,
f"{attr}_las_per_type": None,
}
def get_ner_prf(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
"""Compute micro-PRF and per-entity PRF scores for a sequence of examples."""
score_per_type = defaultdict(PRFScore)
for eg in examples:
if not eg.y.has_annotation("ENT_IOB"):
continue
golds = {(e.label_, e.start, e.end) for e in eg.y.ents}
align_x2y = eg.alignment.x2y
for pred_ent in eg.x.ents:
if pred_ent.label_ not in score_per_type:
score_per_type[pred_ent.label_] = PRFScore()
indices = align_x2y[pred_ent.start : pred_ent.end].dataXd.ravel()
if len(indices):
g_span = eg.y[indices[0] : indices[-1] + 1]
# Check we aren't missing annotation on this span. If so,
# our prediction is neither right nor wrong, we just
# ignore it.
if all(token.ent_iob != 0 for token in g_span):
key = (pred_ent.label_, indices[0], indices[-1] + 1)
if key in golds:
score_per_type[pred_ent.label_].tp += 1
golds.remove(key)
else:
score_per_type[pred_ent.label_].fp += 1
for label, start, end in golds:
score_per_type[label].fn += 1
totals = PRFScore()
for prf in score_per_type.values():
totals += prf
if len(totals) > 0:
return {
"ents_p": totals.precision,
"ents_r": totals.recall,
"ents_f": totals.fscore,
"ents_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
}
else:
return {
"ents_p": None,
"ents_r": None,
"ents_f": None,
"ents_per_type": None,
}
# The following implementation of roc_auc_score() is adapted from
# scikit-learn, which is distributed under the New BSD License.
# Copyright (c) 2007–2019 The scikit-learn developers.
# See licenses/3rd_party_licenses.txt
def _roc_auc_score(y_true, y_score):
"""Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC)
from prediction scores.
Note: this implementation is restricted to the binary classification task
Parameters
----------
y_true : array, shape = [n_samples] or [n_samples, n_classes]
True binary labels or binary label indicators.
The multiclass case expects shape = [n_samples] and labels
with values in ``range(n_classes)``.
y_score : array, shape = [n_samples] or [n_samples, n_classes]
Target scores, can either be probability estimates of the positive
class, confidence values, or non-thresholded measure of decisions
(as returned by "decision_function" on some classifiers). For binary
y_true, y_score is supposed to be the score of the class with greater
label. The multiclass case expects shape = [n_samples, n_classes]
where the scores correspond to probability estimates.
Returns
-------
auc : float
References
----------
.. [1] `Wikipedia entry for the Receiver operating characteristic
<https://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_
.. [2] Fawcett T. An introduction to ROC analysis[J]. Pattern Recognition
Letters, 2006, 27(8):861-874.
.. [3] `Analyzing a portion of the ROC curve. McClish, 1989
<https://www.ncbi.nlm.nih.gov/pubmed/2668680>`_
"""
if len(np.unique(y_true)) != 2:
raise ValueError(Errors.E165.format(label=np.unique(y_true)))
fpr, tpr, _ = _roc_curve(y_true, y_score)
return _auc(fpr, tpr)
def _roc_curve(y_true, y_score):
"""Compute Receiver operating characteristic (ROC)
Note: this implementation is restricted to the binary classification task.
Parameters
----------
y_true : array, shape = [n_samples]
True binary labels. If labels are not either {-1, 1} or {0, 1}, then
pos_label should be explicitly given.
y_score : array, shape = [n_samples]
Target scores, can either be probability estimates of the positive
class, confidence values, or non-thresholded measure of decisions
(as returned by "decision_function" on some classifiers).
Returns
-------
fpr : array, shape = [>2]
Increasing false positive rates such that element i is the false
positive rate of predictions with score >= thresholds[i].
tpr : array, shape = [>2]
Increasing true positive rates such that element i is the true
positive rate of predictions with score >= thresholds[i].
thresholds : array, shape = [n_thresholds]
Decreasing thresholds on the decision function used to compute
fpr and tpr. `thresholds[0]` represents no instances being predicted
and is arbitrarily set to `max(y_score) + 1`.
Notes
-----
Since the thresholds are sorted from low to high values, they
are reversed upon returning them to ensure they correspond to both ``fpr``
and ``tpr``, which are sorted in reversed order during their calculation.
References
----------
.. [1] `Wikipedia entry for the Receiver operating characteristic
<https://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_
.. [2] Fawcett T. An introduction to ROC analysis[J]. Pattern Recognition
Letters, 2006, 27(8):861-874.
"""
fps, tps, thresholds = _binary_clf_curve(y_true, y_score)
# Add an extra threshold position
# to make sure that the curve starts at (0, 0)
tps = np.r_[0, tps]
fps = np.r_[0, fps]
thresholds = np.r_[thresholds[0] + 1, thresholds]
if fps[-1] <= 0:
fpr = np.repeat(np.nan, fps.shape)
else:
fpr = fps / fps[-1]
if tps[-1] <= 0:
tpr = np.repeat(np.nan, tps.shape)
else:
tpr = tps / tps[-1]
return fpr, tpr, thresholds
def _binary_clf_curve(y_true, y_score):
"""Calculate true and false positives per binary classification threshold.
Parameters
----------
y_true : array, shape = [n_samples]
True targets of binary classification
y_score : array, shape = [n_samples]
Estimated probabilities or decision function
Returns
-------
fps : array, shape = [n_thresholds]
A count of false positives, at index i being the number of negative
samples assigned a score >= thresholds[i]. The total number of
negative samples is equal to fps[-1] (thus true negatives are given by
fps[-1] - fps).
tps : array, shape = [n_thresholds <= len(np.unique(y_score))]
An increasing count of true positives, at index i being the number
of positive samples assigned a score >= thresholds[i]. The total
number of positive samples is equal to tps[-1] (thus false negatives
are given by tps[-1] - tps).
thresholds : array, shape = [n_thresholds]
Decreasing score values.
"""
pos_label = 1.0
y_true = np.ravel(y_true)
y_score = np.ravel(y_score)
# make y_true a boolean vector
y_true = y_true == pos_label
# sort scores and corresponding truth values
desc_score_indices = np.argsort(y_score, kind="mergesort")[::-1]
y_score = y_score[desc_score_indices]
y_true = y_true[desc_score_indices]
weight = 1.0
# y_score typically has many tied values. Here we extract
# the indices associated with the distinct values. We also
# concatenate a value for the end of the curve.
distinct_value_indices = np.where(np.diff(y_score))[0]
threshold_idxs = np.r_[distinct_value_indices, y_true.size - 1]
# accumulate the true positives with decreasing threshold
tps = _stable_cumsum(y_true * weight)[threshold_idxs]
fps = 1 + threshold_idxs - tps
return fps, tps, y_score[threshold_idxs]
def _stable_cumsum(arr, axis=None, rtol=1e-05, atol=1e-08):
"""Use high precision for cumsum and check that final value matches sum
Parameters
----------
arr : array-like
To be cumulatively summed as flat
axis : int, optional
Axis along which the cumulative sum is computed.
The default (None) is to compute the cumsum over the flattened array.
rtol : float
Relative tolerance, see ``np.allclose``
atol : float
Absolute tolerance, see ``np.allclose``
"""
out = np.cumsum(arr, axis=axis, dtype=np.float64)
expected = np.sum(arr, axis=axis, dtype=np.float64)
if not np.all(
np.isclose(
out.take(-1, axis=axis), expected, rtol=rtol, atol=atol, equal_nan=True
)
):
raise ValueError(Errors.E163)
return out
def _auc(x, y):
"""Compute Area Under the Curve (AUC) using the trapezoidal rule
This is a general function, given points on a curve. For computing the
area under the ROC-curve, see :func:`roc_auc_score`.
Parameters
----------
x : array, shape = [n]
x coordinates. These must be either monotonic increasing or monotonic
decreasing.
y : array, shape = [n]
y coordinates.
Returns
-------
auc : float
"""
x = np.ravel(x)
y = np.ravel(y)
direction = 1
dx = np.diff(x)
if np.any(dx < 0):
if np.all(dx <= 0):
direction = -1
else:
raise ValueError(Errors.E164.format(x=x))
area = direction * np.trapz(y, x)
if isinstance(area, np.memmap):
# Reductions such as .sum used internally in np.trapz do not return a
# scalar by default for numpy.memmap instances contrary to
# regular numpy.ndarray instances.
area = area.dtype.type(area)
return area
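# Editor's note: illustrative checks, not part of the original module. The first
# is the classic four-point ROC AUC example (the two positives outrank one of the
# two negatives, so 3 of 4 positive/negative pairs are ordered correctly, i.e.
# AUC = 0.75); the second checks the trapezoidal rule directly.
assert abs(_roc_auc_score([0, 0, 1, 1], [0.1, 0.4, 0.35, 0.8]) - 0.75) < 1e-8
assert abs(_auc([0.0, 0.5, 1.0], [0.0, 0.75, 1.0]) - 0.625) < 1e-8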
from libc.stdint cimport int64_t
from libcpp.vector cimport vector
from libcpp.set cimport set
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
from murmurhash.mrmr cimport hash64
from .typedefs cimport attr_t, hash_t
cpdef hash_t hash_string(str string) except 0
cdef hash_t hash_utf8(char* utf8_string, int length) nogil
cdef str decode_Utf8Str(const Utf8Str* string)
ctypedef union Utf8Str:
unsigned char[8] s
unsigned char* p
cdef class StringStore:
cdef Pool mem
cdef vector[hash_t] keys
cdef public PreshMap _map
cdef const Utf8Str* intern_unicode(self, str py_string)
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length)
from typing import Optional, Iterable, Iterator, Union, Any
from pathlib import Path
def get_string_id(key: Union[str, int]) -> int: ...
class StringStore:
def __init__(
self, strings: Optional[Iterable[str]] = ..., freeze: bool = ...
) -> None: ...
def __getitem__(self, string_or_id: Union[bytes, str, int]) -> Union[str, int]: ...
def as_int(self, key: Union[bytes, str, int]) -> int: ...
def as_string(self, key: Union[bytes, str, int]) -> str: ...
def add(self, string: str) -> int: ...
def __len__(self) -> int: ...
def __contains__(self, string: str) -> bool: ...
def __iter__(self) -> Iterator[str]: ...
def __reduce__(self) -> Any: ...
def to_disk(self, path: Union[str, Path]) -> None: ...
def from_disk(self, path: Union[str, Path]) -> StringStore: ...
def to_bytes(self, **kwargs: Any) -> bytes: ...
def from_bytes(self, bytes_data: bytes, **kwargs: Any) -> StringStore: ...
def _reset_and_load(self, strings: Iterable[str]) -> None: ...
# cython: infer_types=True
cimport cython
from libc.string cimport memcpy
from libcpp.set cimport set
from libc.stdint cimport uint32_t
from murmurhash.mrmr cimport hash64, hash32
import srsly
from .typedefs cimport hash_t
from .symbols import IDS as SYMBOLS_BY_STR
from .symbols import NAMES as SYMBOLS_BY_INT
from .errors import Errors
from . import util
def get_string_id(key):
"""Get a string ID, handling the reserved symbols correctly. If the key is
already an ID, return it.
This function optimises for convenience over performance, so shouldn't be
used in tight loops.
"""
if not isinstance(key, str):
return key
elif key in SYMBOLS_BY_STR:
return SYMBOLS_BY_STR[key]
elif not key:
return 0
else:
chars = key.encode("utf8")
return hash_utf8(chars, len(chars))
cpdef hash_t hash_string(str string) except 0:
chars = string.encode("utf8")
return hash_utf8(chars, len(chars))
cdef hash_t hash_utf8(char* utf8_string, int length) nogil:
return hash64(utf8_string, length, 1)
cdef uint32_t hash32_utf8(char* utf8_string, int length) nogil:
return hash32(utf8_string, length, 1)
cdef str decode_Utf8Str(const Utf8Str* string):
cdef int i, length
if string.s[0] < sizeof(string.s) and string.s[0] != 0:
return string.s[1:string.s[0]+1].decode("utf8")
elif string.p[0] < 255:
return string.p[1:string.p[0]+1].decode("utf8")
else:
i = 0
length = 0
while string.p[i] == 255:
i += 1
length += 255
length += string.p[i]
i += 1
return string.p[i:length + i].decode("utf8")
cdef Utf8Str* _allocate(Pool mem, const unsigned char* chars, uint32_t length) except *:
cdef int n_length_bytes
cdef int i
cdef Utf8Str* string = <Utf8Str*>mem.alloc(1, sizeof(Utf8Str))
cdef uint32_t ulength = length
if length < sizeof(string.s):
string.s[0] = <unsigned char>length
memcpy(&string.s[1], chars, length)
return string
elif length < 255:
string.p = <unsigned char*>mem.alloc(length + 1, sizeof(unsigned char))
string.p[0] = length
memcpy(&string.p[1], chars, length)
return string
else:
i = 0
n_length_bytes = (length // 255) + 1
string.p = <unsigned char*>mem.alloc(length + n_length_bytes, sizeof(unsigned char))
for i in range(n_length_bytes-1):
string.p[i] = 255
string.p[n_length_bytes-1] = length % 255
memcpy(&string.p[n_length_bytes], chars, length)
return string
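# Editor's note: pure-Python sketch, not part of the original module. It mirrors
# the heap-allocated length-prefix format used by _allocate()/decode_Utf8Str above
# (the small-string case stored inline in Utf8Str.s is not modelled): lengths
# under 255 get a single length byte, longer strings get a run of 255-valued
# bytes followed by the remainder.
def _encode_length_prefixed(chars: bytes) -> bytes:
    length = len(chars)
    if length < 255:
        return bytes([length]) + chars
    n_full = length // 255
    return bytes([255] * n_full + [length % 255]) + chars

def _decode_length_prefixed(buf: bytes) -> bytes:
    i = 0
    length = 0
    while buf[i] == 255:
        length += 255
        i += 1
    length += buf[i]
    i += 1
    return buf[i:length + i]

assert _decode_length_prefixed(_encode_length_prefixed(b"x" * 1000)) == b"x" * 1000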
cdef class StringStore:
"""Look up strings by 64-bit hashes.
DOCS: https://spacy.io/api/stringstore
"""
def __init__(self, strings=None, freeze=False):
"""Create the StringStore.
strings (iterable): A sequence of unicode strings to add to the store.
"""
self.mem = Pool()
self._map = PreshMap()
if strings is not None:
for string in strings:
self.add(string)
def __getitem__(self, object string_or_id):
"""Retrieve a string from a given hash, or vice versa.
string_or_id (bytes, str or uint64): The value to encode.
Returns (str / uint64): The value to be retrieved.
"""
if isinstance(string_or_id, str) and len(string_or_id) == 0:
return 0
elif string_or_id == 0:
return ""
elif string_or_id in SYMBOLS_BY_STR:
return SYMBOLS_BY_STR[string_or_id]
cdef hash_t key
if isinstance(string_or_id, str):
key = hash_string(string_or_id)
return key
elif isinstance(string_or_id, bytes):
key = hash_utf8(string_or_id, len(string_or_id))
return key
elif string_or_id < len(SYMBOLS_BY_INT):
return SYMBOLS_BY_INT[string_or_id]
else:
key = string_or_id
utf8str = <Utf8Str*>self._map.get(key)
if utf8str is NULL:
raise KeyError(Errors.E018.format(hash_value=string_or_id))
else:
return decode_Utf8Str(utf8str)
def as_int(self, key):
"""If key is an int, return it; otherwise, get the int value."""
if not isinstance(key, str):
return key
else:
return self[key]
def as_string(self, key):
"""If key is a string, return it; otherwise, get the string value."""
if isinstance(key, str):
return key
else:
return self[key]
def add(self, string):
"""Add a string to the StringStore.
string (str): The string to add.
RETURNS (uint64): The string's hash value.
"""
if isinstance(string, str):
if string in SYMBOLS_BY_STR:
return SYMBOLS_BY_STR[string]
key = hash_string(string)
self.intern_unicode(string)
elif isinstance(string, bytes):
if string in SYMBOLS_BY_STR:
return SYMBOLS_BY_STR[string]
key = hash_utf8(string, len(string))
self._intern_utf8(string, len(string))
else:
raise TypeError(Errors.E017.format(value_type=type(string)))
return key
def __len__(self):
"""The number of strings in the store.
RETURNS (int): The number of strings in the store.
"""
return self.keys.size()
def __contains__(self, string not None):
"""Check whether a string is in the store.
string (str): The string to check.
RETURNS (bool): Whether the store contains the string.
"""
cdef hash_t key
if isinstance(string, int) or isinstance(string, long):
if string == 0:
return True
key = string
elif len(string) == 0:
return True
elif string in SYMBOLS_BY_STR:
return True
elif isinstance(string, str):
key = hash_string(string)
else:
string = string.encode("utf8")
key = hash_utf8(string, len(string))
if key < len(SYMBOLS_BY_INT):
return True
else:
return self._map.get(key) is not NULL
def __iter__(self):
"""Iterate over the strings in the store, in order.
YIELDS (str): A string in the store.
"""
cdef int i
cdef hash_t key
for i in range(self.keys.size()):
key = self.keys[i]
utf8str = <Utf8Str*>self._map.get(key)
yield decode_Utf8Str(utf8str)
# TODO: Iterate OOV here?
def __reduce__(self):
strings = list(self)
return (StringStore, (strings,), None, None, None)
def to_disk(self, path):
"""Save the current state to a directory.
path (str / Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or Path-like objects.
"""
path = util.ensure_path(path)
strings = sorted(self)
srsly.write_json(path, strings)
def from_disk(self, path):
"""Loads state from a directory. Modifies the object in place and
returns it.
path (str / Path): A path to a directory. Paths may be either
strings or `Path`-like objects.
RETURNS (StringStore): The modified `StringStore` object.
"""
path = util.ensure_path(path)
strings = srsly.read_json(path)
prev = list(self)
self._reset_and_load(strings)
for word in prev:
self.add(word)
return self
def to_bytes(self, **kwargs):
"""Serialize the current state to a binary string.
RETURNS (bytes): The serialized form of the `StringStore` object.
"""
return srsly.json_dumps(sorted(self))
def from_bytes(self, bytes_data, **kwargs):
"""Load state from a binary string.
bytes_data (bytes): The data to load from.
RETURNS (StringStore): The `StringStore` object.
"""
strings = srsly.json_loads(bytes_data)
prev = list(self)
self._reset_and_load(strings)
for word in prev:
self.add(word)
return self
def _reset_and_load(self, strings):
self.mem = Pool()
self._map = PreshMap()
self.keys.clear()
for string in strings:
self.add(string)
cdef const Utf8Str* intern_unicode(self, str py_string):
# 0 means missing, but we don't bother offsetting the index.
cdef bytes byte_string = py_string.encode("utf8")
return self._intern_utf8(byte_string, len(byte_string))
@cython.final
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length):
# TODO: This function's API/behaviour is an unholy mess...
# 0 means missing, but we don't bother offsetting the index.
cdef hash_t key = hash_utf8(utf8_string, length)
cdef Utf8Str* value = <Utf8Str*>self._map.get(key)
if value is not NULL:
return value
value = _allocate(self.mem, <unsigned char*>utf8_string, length)
self._map.set(key, value)
self.keys.push_back(key)
return value
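# Editor's note: illustrative usage sketch, not part of the original module; the
# strings below are arbitrary examples of the hash <-> string round trip.
_store = StringStore(["apple"])
_key = _store.add("coffee")        # 64-bit hash of the UTF-8 string
assert _store[_key] == "coffee"    # hash -> string
assert _store["coffee"] == _key    # string -> hash
assert "apple" in _store and len(_store) == 2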
from libc.stdint cimport uint8_t, uint32_t, int32_t, uint64_t
from libcpp.vector cimport vector
from libcpp.unordered_set cimport unordered_set
from libcpp.unordered_map cimport unordered_map
from libc.stdint cimport int32_t, int64_t
from .typedefs cimport flags_t, attr_t, hash_t
from .parts_of_speech cimport univ_pos_t
cdef struct LexemeC:
flags_t flags
attr_t lang
attr_t id
attr_t length
attr_t orth
attr_t lower
attr_t norm
attr_t shape
attr_t prefix
attr_t suffix
cdef struct SpanC:
hash_t id
int start
int end
int start_char
int end_char
attr_t label
attr_t kb_id
cdef struct TokenC:
const LexemeC* lex
uint64_t morph
univ_pos_t pos
bint spacy
attr_t tag
int idx
attr_t lemma
attr_t norm
int head
attr_t dep
uint32_t l_kids
uint32_t r_kids
uint32_t l_edge
uint32_t r_edge
int sent_start
int ent_iob
attr_t ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
attr_t ent_kb_id
hash_t ent_id
cdef struct MorphAnalysisC:
hash_t key
int length
attr_t* fields
attr_t* features
# Internal struct, for storage and disambiguation of entities.
cdef struct KBEntryC:
# The hash of this entry's unique ID/name in the kB
hash_t entity_hash
# Allows retrieval of the entity vector, as an index into a vectors table of the KB.
# Can be expanded later to refer to multiple rows (compositional model to reduce storage footprint).
int32_t vector_index
# Allows retrieval of a struct of non-vector features.
# This is currently not implemented and set to -1 for the common case where there are no features.
int32_t feats_row
# log probability of entity, based on corpus frequency
float freq
# Each alias struct stores a list of Entry pointers with their prior probabilities
# for this specific mention/alias.
cdef struct AliasC:
# All entry candidates for this alias
vector[int64_t] entry_indices
# Prior probability P(entity|alias) - should sum up to (at most) 1.
vector[float] probs
cdef struct EdgeC:
hash_t label
int32_t head
int32_t tail
cdef struct GraphC:
vector[vector[int32_t]] nodes
vector[EdgeC] edges
vector[float] weights
vector[int] n_heads
vector[int] n_tails
vector[int] first_head
vector[int] first_tail
unordered_set[int]* roots
unordered_map[hash_t, int]* node_map
unordered_map[hash_t, int]* edge_map
cdef enum symbol_t:
NIL
IS_ALPHA
IS_ASCII
IS_DIGIT
IS_LOWER
IS_PUNCT
IS_SPACE
IS_TITLE
IS_UPPER
LIKE_URL
LIKE_NUM
LIKE_EMAIL
IS_STOP
IS_OOV_DEPRECATED
IS_BRACKET
IS_QUOTE
IS_LEFT_PUNCT
IS_RIGHT_PUNCT
IS_CURRENCY
FLAG19 = 19
FLAG20
FLAG21
FLAG22
FLAG23
FLAG24
FLAG25
FLAG26
FLAG27
FLAG28
FLAG29
FLAG30
FLAG31
FLAG32
FLAG33
FLAG34
FLAG35
FLAG36
FLAG37
FLAG38
FLAG39
FLAG40
FLAG41
FLAG42
FLAG43
FLAG44
FLAG45
FLAG46
FLAG47
FLAG48
FLAG49
FLAG50
FLAG51
FLAG52
FLAG53
FLAG54
FLAG55
FLAG56
FLAG57
FLAG58
FLAG59
FLAG60
FLAG61
FLAG62
FLAG63
ID
ORTH
LOWER
NORM
SHAPE
PREFIX
SUFFIX
LENGTH
CLUSTER
LEMMA
POS
TAG
DEP
ENT_IOB
ENT_TYPE
HEAD
SENT_START
SPACY
PROB
LANG
ADJ
ADP
ADV
AUX
CONJ
CCONJ # U20
DET
INTJ
NOUN
NUM
PART
PRON
PROPN
PUNCT
SCONJ
SYM
VERB
X
EOL
SPACE
DEPRECATED001
DEPRECATED002
DEPRECATED003
DEPRECATED004
DEPRECATED005
DEPRECATED006
DEPRECATED007
DEPRECATED008
DEPRECATED009
DEPRECATED010
DEPRECATED011
DEPRECATED012
DEPRECATED013
DEPRECATED014
DEPRECATED015
DEPRECATED016
DEPRECATED017
DEPRECATED018
DEPRECATED019
DEPRECATED020
DEPRECATED021
DEPRECATED022
DEPRECATED023
DEPRECATED024
DEPRECATED025
DEPRECATED026
DEPRECATED027
DEPRECATED028
DEPRECATED029
DEPRECATED030
DEPRECATED031
DEPRECATED032
DEPRECATED033
DEPRECATED034
DEPRECATED035
DEPRECATED036
DEPRECATED037
DEPRECATED038
DEPRECATED039
DEPRECATED040
DEPRECATED041
DEPRECATED042
DEPRECATED043
DEPRECATED044
DEPRECATED045
DEPRECATED046
DEPRECATED047
DEPRECATED048
DEPRECATED049
DEPRECATED050
DEPRECATED051
DEPRECATED052
DEPRECATED053
DEPRECATED054
DEPRECATED055
DEPRECATED056
DEPRECATED057
DEPRECATED058
DEPRECATED059
DEPRECATED060
DEPRECATED061
DEPRECATED062
DEPRECATED063
DEPRECATED064
DEPRECATED065
DEPRECATED066
DEPRECATED067
DEPRECATED068
DEPRECATED069
DEPRECATED070
DEPRECATED071
DEPRECATED072
DEPRECATED073
DEPRECATED074
DEPRECATED075
DEPRECATED076
DEPRECATED077
DEPRECATED078
DEPRECATED079
DEPRECATED080
DEPRECATED081
DEPRECATED082
DEPRECATED083
DEPRECATED084
DEPRECATED085
DEPRECATED086
DEPRECATED087
DEPRECATED088
DEPRECATED089
DEPRECATED090
DEPRECATED091
DEPRECATED092
DEPRECATED093
DEPRECATED094
DEPRECATED095
DEPRECATED096
DEPRECATED097
DEPRECATED098
DEPRECATED099
DEPRECATED100
DEPRECATED101
DEPRECATED102
DEPRECATED103
DEPRECATED104
DEPRECATED105
DEPRECATED106
DEPRECATED107
DEPRECATED108
DEPRECATED109
DEPRECATED110
DEPRECATED111
DEPRECATED112
DEPRECATED113
DEPRECATED114
DEPRECATED115
DEPRECATED116
DEPRECATED117
DEPRECATED118
DEPRECATED119
DEPRECATED120
DEPRECATED121
DEPRECATED122
DEPRECATED123
DEPRECATED124
DEPRECATED125
DEPRECATED126
DEPRECATED127
DEPRECATED128
DEPRECATED129
DEPRECATED130
DEPRECATED131
DEPRECATED132
DEPRECATED133
DEPRECATED134
DEPRECATED135
DEPRECATED136
DEPRECATED137
DEPRECATED138
DEPRECATED139
DEPRECATED140
DEPRECATED141
DEPRECATED142
DEPRECATED143
DEPRECATED144
DEPRECATED145
DEPRECATED146
DEPRECATED147
DEPRECATED148
DEPRECATED149
DEPRECATED150
DEPRECATED151
DEPRECATED152
DEPRECATED153
DEPRECATED154
DEPRECATED155
DEPRECATED156
DEPRECATED157
DEPRECATED158
DEPRECATED159
DEPRECATED160
DEPRECATED161
DEPRECATED162
DEPRECATED163
DEPRECATED164
DEPRECATED165
DEPRECATED166
DEPRECATED167
DEPRECATED168
DEPRECATED169
DEPRECATED170
DEPRECATED171
DEPRECATED172
DEPRECATED173
DEPRECATED174
DEPRECATED175
DEPRECATED176
DEPRECATED177
DEPRECATED178
DEPRECATED179
DEPRECATED180
DEPRECATED181
DEPRECATED182
DEPRECATED183
DEPRECATED184
DEPRECATED185
DEPRECATED186
DEPRECATED187
DEPRECATED188
DEPRECATED189
DEPRECATED190
DEPRECATED191
DEPRECATED192
DEPRECATED193
DEPRECATED194
DEPRECATED195
DEPRECATED196
DEPRECATED197
DEPRECATED198
DEPRECATED199
DEPRECATED200
DEPRECATED201
DEPRECATED202
DEPRECATED203
DEPRECATED204
DEPRECATED205
DEPRECATED206
DEPRECATED207
DEPRECATED208
DEPRECATED209
DEPRECATED210
DEPRECATED211
DEPRECATED212
DEPRECATED213
DEPRECATED214
DEPRECATED215
DEPRECATED216
DEPRECATED217
DEPRECATED218
DEPRECATED219
DEPRECATED220
DEPRECATED221
DEPRECATED222
DEPRECATED223
DEPRECATED224
DEPRECATED225
DEPRECATED226
DEPRECATED227
DEPRECATED228
DEPRECATED229
DEPRECATED230
DEPRECATED231
DEPRECATED232
DEPRECATED233
DEPRECATED234
DEPRECATED235
DEPRECATED236
DEPRECATED237
DEPRECATED238
DEPRECATED239
DEPRECATED240
DEPRECATED241
DEPRECATED242
DEPRECATED243
DEPRECATED244
DEPRECATED245
DEPRECATED246
DEPRECATED247
DEPRECATED248
DEPRECATED249
DEPRECATED250
DEPRECATED251
DEPRECATED252
DEPRECATED253
DEPRECATED254
DEPRECATED255
DEPRECATED256
DEPRECATED257
DEPRECATED258
DEPRECATED259
DEPRECATED260
DEPRECATED261
DEPRECATED262
DEPRECATED263
DEPRECATED264
DEPRECATED265
DEPRECATED266
DEPRECATED267
DEPRECATED268
DEPRECATED269
DEPRECATED270
DEPRECATED271
DEPRECATED272
DEPRECATED273
DEPRECATED274
DEPRECATED275
DEPRECATED276
PERSON
NORP
FACILITY
ORG
GPE
LOC
PRODUCT
EVENT
WORK_OF_ART
LANGUAGE
LAW
DATE
TIME
PERCENT
MONEY
QUANTITY
ORDINAL
CARDINAL
acomp
advcl
advmod
agent
amod
appos
attr
aux
auxpass
cc
ccomp
complm
conj
cop # U20
csubj
csubjpass
dep
det
dobj
expl
hmod
hyph
infmod
intj
iobj
mark
meta
neg
nmod
nn
npadvmod
nsubj
nsubjpass
num
number
oprd
obj # U20
obl # U20
parataxis
partmod
pcomp
pobj
poss
possessive
preconj
prep
prt
punct
quantmod
relcl
rcmod
root
xcomp
acl
ENT_KB_ID
MORPH
ENT_ID
IDX
_
# cython: optimize.unpack_method_calls=False
IDS = {
"": NIL,
"IS_ALPHA": IS_ALPHA,
"IS_ASCII": IS_ASCII,
"IS_DIGIT": IS_DIGIT,
"IS_LOWER": IS_LOWER,
"IS_PUNCT": IS_PUNCT,
"IS_SPACE": IS_SPACE,
"IS_TITLE": IS_TITLE,
"IS_UPPER": IS_UPPER,
"LIKE_URL": LIKE_URL,
"LIKE_NUM": LIKE_NUM,
"LIKE_EMAIL": LIKE_EMAIL,
"IS_STOP": IS_STOP,
"IS_OOV_DEPRECATED": IS_OOV_DEPRECATED,
"IS_BRACKET": IS_BRACKET,
"IS_QUOTE": IS_QUOTE,
"IS_LEFT_PUNCT": IS_LEFT_PUNCT,
"IS_RIGHT_PUNCT": IS_RIGHT_PUNCT,
"IS_CURRENCY": IS_CURRENCY,
"FLAG19": FLAG19,
"FLAG20": FLAG20,
"FLAG21": FLAG21,
"FLAG22": FLAG22,
"FLAG23": FLAG23,
"FLAG24": FLAG24,
"FLAG25": FLAG25,
"FLAG26": FLAG26,
"FLAG27": FLAG27,
"FLAG28": FLAG28,
"FLAG29": FLAG29,
"FLAG30": FLAG30,
"FLAG31": FLAG31,
"FLAG32": FLAG32,
"FLAG33": FLAG33,
"FLAG34": FLAG34,
"FLAG35": FLAG35,
"FLAG36": FLAG36,
"FLAG37": FLAG37,
"FLAG38": FLAG38,
"FLAG39": FLAG39,
"FLAG40": FLAG40,
"FLAG41": FLAG41,
"FLAG42": FLAG42,
"FLAG43": FLAG43,
"FLAG44": FLAG44,
"FLAG45": FLAG45,
"FLAG46": FLAG46,
"FLAG47": FLAG47,
"FLAG48": FLAG48,
"FLAG49": FLAG49,
"FLAG50": FLAG50,
"FLAG51": FLAG51,
"FLAG52": FLAG52,
"FLAG53": FLAG53,
"FLAG54": FLAG54,
"FLAG55": FLAG55,
"FLAG56": FLAG56,
"FLAG57": FLAG57,
"FLAG58": FLAG58,
"FLAG59": FLAG59,
"FLAG60": FLAG60,
"FLAG61": FLAG61,
"FLAG62": FLAG62,
"FLAG63": FLAG63,
"ID": ID,
"ORTH": ORTH,
"LOWER": LOWER,
"NORM": NORM,
"SHAPE": SHAPE,
"PREFIX": PREFIX,
"SUFFIX": SUFFIX,
"LENGTH": LENGTH,
"CLUSTER": CLUSTER,
"LEMMA": LEMMA,
"POS": POS,
"TAG": TAG,
"DEP": DEP,
"ENT_IOB": ENT_IOB,
"ENT_TYPE": ENT_TYPE,
"ENT_ID": ENT_ID,
"ENT_KB_ID": ENT_KB_ID,
"HEAD": HEAD,
"SENT_START": SENT_START,
"SPACY": SPACY,
"PROB": PROB,
"LANG": LANG,
"IDX": IDX,
"ADJ": ADJ,
"ADP": ADP,
"ADV": ADV,
"AUX": AUX,
"CONJ": CONJ,
"CCONJ": CCONJ, # U20
"DET": DET,
"INTJ": INTJ,
"NOUN": NOUN,
"NUM": NUM,
"PART": PART,
"PRON": PRON,
"PROPN": PROPN,
"PUNCT": PUNCT,
"SCONJ": SCONJ,
"SYM": SYM,
"VERB": VERB,
"X": X,
"EOL": EOL,
"SPACE": SPACE,
"DEPRECATED001": DEPRECATED001,
"DEPRECATED002": DEPRECATED002,
"DEPRECATED003": DEPRECATED003,
"DEPRECATED004": DEPRECATED004,
"DEPRECATED005": DEPRECATED005,
"DEPRECATED006": DEPRECATED006,
"DEPRECATED007": DEPRECATED007,
"DEPRECATED008": DEPRECATED008,
"DEPRECATED009": DEPRECATED009,
"DEPRECATED010": DEPRECATED010,
"DEPRECATED011": DEPRECATED011,
"DEPRECATED012": DEPRECATED012,
"DEPRECATED013": DEPRECATED013,
"DEPRECATED014": DEPRECATED014,
"DEPRECATED015": DEPRECATED015,
"DEPRECATED016": DEPRECATED016,
"DEPRECATED017": DEPRECATED017,
"DEPRECATED018": DEPRECATED018,
"DEPRECATED019": DEPRECATED019,
"DEPRECATED020": DEPRECATED020,
"DEPRECATED021": DEPRECATED021,
"DEPRECATED022": DEPRECATED022,
"DEPRECATED023": DEPRECATED023,
"DEPRECATED024": DEPRECATED024,
"DEPRECATED025": DEPRECATED025,
"DEPRECATED026": DEPRECATED026,
"DEPRECATED027": DEPRECATED027,
"DEPRECATED028": DEPRECATED028,
"DEPRECATED029": DEPRECATED029,
"DEPRECATED030": DEPRECATED030,
"DEPRECATED031": DEPRECATED031,
"DEPRECATED032": DEPRECATED032,
"DEPRECATED033": DEPRECATED033,
"DEPRECATED034": DEPRECATED034,
"DEPRECATED035": DEPRECATED035,
"DEPRECATED036": DEPRECATED036,
"DEPRECATED037": DEPRECATED037,
"DEPRECATED038": DEPRECATED038,
"DEPRECATED039": DEPRECATED039,
"DEPRECATED040": DEPRECATED040,
"DEPRECATED041": DEPRECATED041,
"DEPRECATED042": DEPRECATED042,
"DEPRECATED043": DEPRECATED043,
"DEPRECATED044": DEPRECATED044,
"DEPRECATED045": DEPRECATED045,
"DEPRECATED046": DEPRECATED046,
"DEPRECATED047": DEPRECATED047,
"DEPRECATED048": DEPRECATED048,
"DEPRECATED049": DEPRECATED049,
"DEPRECATED050": DEPRECATED050,
"DEPRECATED051": DEPRECATED051,
"DEPRECATED052": DEPRECATED052,
"DEPRECATED053": DEPRECATED053,
"DEPRECATED054": DEPRECATED054,
"DEPRECATED055": DEPRECATED055,
"DEPRECATED056": DEPRECATED056,
"DEPRECATED057": DEPRECATED057,
"DEPRECATED058": DEPRECATED058,
"DEPRECATED059": DEPRECATED059,
"DEPRECATED060": DEPRECATED060,
"DEPRECATED061": DEPRECATED061,
"DEPRECATED062": DEPRECATED062,
"DEPRECATED063": DEPRECATED063,
"DEPRECATED064": DEPRECATED064,
"DEPRECATED065": DEPRECATED065,
"DEPRECATED066": DEPRECATED066,
"DEPRECATED067": DEPRECATED067,
"DEPRECATED068": DEPRECATED068,
"DEPRECATED069": DEPRECATED069,
"DEPRECATED070": DEPRECATED070,
"DEPRECATED071": DEPRECATED071,
"DEPRECATED072": DEPRECATED072,
"DEPRECATED073": DEPRECATED073,
"DEPRECATED074": DEPRECATED074,
"DEPRECATED075": DEPRECATED075,
"DEPRECATED076": DEPRECATED076,
"DEPRECATED077": DEPRECATED077,
"DEPRECATED078": DEPRECATED078,
"DEPRECATED079": DEPRECATED079,
"DEPRECATED080": DEPRECATED080,
"DEPRECATED081": DEPRECATED081,
"DEPRECATED082": DEPRECATED082,
"DEPRECATED083": DEPRECATED083,
"DEPRECATED084": DEPRECATED084,
"DEPRECATED085": DEPRECATED085,
"DEPRECATED086": DEPRECATED086,
"DEPRECATED087": DEPRECATED087,
"DEPRECATED088": DEPRECATED088,
"DEPRECATED089": DEPRECATED089,
"DEPRECATED090": DEPRECATED090,
"DEPRECATED091": DEPRECATED091,
"DEPRECATED092": DEPRECATED092,
"DEPRECATED093": DEPRECATED093,
"DEPRECATED094": DEPRECATED094,
"DEPRECATED095": DEPRECATED095,
"DEPRECATED096": DEPRECATED096,
"DEPRECATED097": DEPRECATED097,
"DEPRECATED098": DEPRECATED098,
"DEPRECATED099": DEPRECATED099,
"DEPRECATED100": DEPRECATED100,
"DEPRECATED101": DEPRECATED101,
"DEPRECATED102": DEPRECATED102,
"DEPRECATED103": DEPRECATED103,
"DEPRECATED104": DEPRECATED104,
"DEPRECATED105": DEPRECATED105,
"DEPRECATED106": DEPRECATED106,
"DEPRECATED107": DEPRECATED107,
"DEPRECATED108": DEPRECATED108,
"DEPRECATED109": DEPRECATED109,
"DEPRECATED110": DEPRECATED110,
"DEPRECATED111": DEPRECATED111,
"DEPRECATED112": DEPRECATED112,
"DEPRECATED113": DEPRECATED113,
"DEPRECATED114": DEPRECATED114,
"DEPRECATED115": DEPRECATED115,
"DEPRECATED116": DEPRECATED116,
"DEPRECATED117": DEPRECATED117,
"DEPRECATED118": DEPRECATED118,
"DEPRECATED119": DEPRECATED119,
"DEPRECATED120": DEPRECATED120,
"DEPRECATED121": DEPRECATED121,
"DEPRECATED122": DEPRECATED122,
"DEPRECATED123": DEPRECATED123,
"DEPRECATED124": DEPRECATED124,
"DEPRECATED125": DEPRECATED125,
"DEPRECATED126": DEPRECATED126,
"DEPRECATED127": DEPRECATED127,
"DEPRECATED128": DEPRECATED128,
"DEPRECATED129": DEPRECATED129,
"DEPRECATED130": DEPRECATED130,
"DEPRECATED131": DEPRECATED131,
"DEPRECATED132": DEPRECATED132,
"DEPRECATED133": DEPRECATED133,
"DEPRECATED134": DEPRECATED134,
"DEPRECATED135": DEPRECATED135,
"DEPRECATED136": DEPRECATED136,
"DEPRECATED137": DEPRECATED137,
"DEPRECATED138": DEPRECATED138,
"DEPRECATED139": DEPRECATED139,
"DEPRECATED140": DEPRECATED140,
"DEPRECATED141": DEPRECATED141,
"DEPRECATED142": DEPRECATED142,
"DEPRECATED143": DEPRECATED143,
"DEPRECATED144": DEPRECATED144,
"DEPRECATED145": DEPRECATED145,
"DEPRECATED146": DEPRECATED146,
"DEPRECATED147": DEPRECATED147,
"DEPRECATED148": DEPRECATED148,
"DEPRECATED149": DEPRECATED149,
"DEPRECATED150": DEPRECATED150,
"DEPRECATED151": DEPRECATED151,
"DEPRECATED152": DEPRECATED152,
"DEPRECATED153": DEPRECATED153,
"DEPRECATED154": DEPRECATED154,
"DEPRECATED155": DEPRECATED155,
"DEPRECATED156": DEPRECATED156,
"DEPRECATED157": DEPRECATED157,
"DEPRECATED158": DEPRECATED158,
"DEPRECATED159": DEPRECATED159,
"DEPRECATED160": DEPRECATED160,
"DEPRECATED161": DEPRECATED161,
"DEPRECATED162": DEPRECATED162,
"DEPRECATED163": DEPRECATED163,
"DEPRECATED164": DEPRECATED164,
"DEPRECATED165": DEPRECATED165,
"DEPRECATED166": DEPRECATED166,
"DEPRECATED167": DEPRECATED167,
"DEPRECATED168": DEPRECATED168,
"DEPRECATED169": DEPRECATED169,
"DEPRECATED170": DEPRECATED170,
"DEPRECATED171": DEPRECATED171,
"DEPRECATED172": DEPRECATED172,
"DEPRECATED173": DEPRECATED173,
"DEPRECATED174": DEPRECATED174,
"DEPRECATED175": DEPRECATED175,
"DEPRECATED176": DEPRECATED176,
"DEPRECATED177": DEPRECATED177,
"DEPRECATED178": DEPRECATED178,
"DEPRECATED179": DEPRECATED179,
"DEPRECATED180": DEPRECATED180,
"DEPRECATED181": DEPRECATED181,
"DEPRECATED182": DEPRECATED182,
"DEPRECATED183": DEPRECATED183,
"DEPRECATED184": DEPRECATED184,
"DEPRECATED185": DEPRECATED185,
"DEPRECATED186": DEPRECATED186,
"DEPRECATED187": DEPRECATED187,
"DEPRECATED188": DEPRECATED188,
"DEPRECATED189": DEPRECATED189,
"DEPRECATED190": DEPRECATED190,
"DEPRECATED191": DEPRECATED191,
"DEPRECATED192": DEPRECATED192,
"DEPRECATED193": DEPRECATED193,
"DEPRECATED194": DEPRECATED194,
"DEPRECATED195": DEPRECATED195,
"DEPRECATED196": DEPRECATED196,
"DEPRECATED197": DEPRECATED197,
"DEPRECATED198": DEPRECATED198,
"DEPRECATED199": DEPRECATED199,
"DEPRECATED200": DEPRECATED200,
"DEPRECATED201": DEPRECATED201,
"DEPRECATED202": DEPRECATED202,
"DEPRECATED203": DEPRECATED203,
"DEPRECATED204": DEPRECATED204,
"DEPRECATED205": DEPRECATED205,
"DEPRECATED206": DEPRECATED206,
"DEPRECATED207": DEPRECATED207,
"DEPRECATED208": DEPRECATED208,
"DEPRECATED209": DEPRECATED209,
"DEPRECATED210": DEPRECATED210,
"DEPRECATED211": DEPRECATED211,
"DEPRECATED212": DEPRECATED212,
"DEPRECATED213": DEPRECATED213,
"DEPRECATED214": DEPRECATED214,
"DEPRECATED215": DEPRECATED215,
"DEPRECATED216": DEPRECATED216,
"DEPRECATED217": DEPRECATED217,
"DEPRECATED218": DEPRECATED218,
"DEPRECATED219": DEPRECATED219,
"DEPRECATED220": DEPRECATED220,
"DEPRECATED221": DEPRECATED221,
"DEPRECATED222": DEPRECATED222,
"DEPRECATED223": DEPRECATED223,
"DEPRECATED224": DEPRECATED224,
"DEPRECATED225": DEPRECATED225,
"DEPRECATED226": DEPRECATED226,
"DEPRECATED227": DEPRECATED227,
"DEPRECATED228": DEPRECATED228,
"DEPRECATED229": DEPRECATED229,
"DEPRECATED230": DEPRECATED230,
"DEPRECATED231": DEPRECATED231,
"DEPRECATED232": DEPRECATED232,
"DEPRECATED233": DEPRECATED233,
"DEPRECATED234": DEPRECATED234,
"DEPRECATED235": DEPRECATED235,
"DEPRECATED236": DEPRECATED236,
"DEPRECATED237": DEPRECATED237,
"DEPRECATED238": DEPRECATED238,
"DEPRECATED239": DEPRECATED239,
"DEPRECATED240": DEPRECATED240,
"DEPRECATED241": DEPRECATED241,
"DEPRECATED242": DEPRECATED242,
"DEPRECATED243": DEPRECATED243,
"DEPRECATED244": DEPRECATED244,
"DEPRECATED245": DEPRECATED245,
"DEPRECATED246": DEPRECATED246,
"DEPRECATED247": DEPRECATED247,
"DEPRECATED248": DEPRECATED248,
"DEPRECATED249": DEPRECATED249,
"DEPRECATED250": DEPRECATED250,
"DEPRECATED251": DEPRECATED251,
"DEPRECATED252": DEPRECATED252,
"DEPRECATED253": DEPRECATED253,
"DEPRECATED254": DEPRECATED254,
"DEPRECATED255": DEPRECATED255,
"DEPRECATED256": DEPRECATED256,
"DEPRECATED257": DEPRECATED257,
"DEPRECATED258": DEPRECATED258,
"DEPRECATED259": DEPRECATED259,
"DEPRECATED260": DEPRECATED260,
"DEPRECATED261": DEPRECATED261,
"DEPRECATED262": DEPRECATED262,
"DEPRECATED263": DEPRECATED263,
"DEPRECATED264": DEPRECATED264,
"DEPRECATED265": DEPRECATED265,
"DEPRECATED266": DEPRECATED266,
"DEPRECATED267": DEPRECATED267,
"DEPRECATED268": DEPRECATED268,
"DEPRECATED269": DEPRECATED269,
"DEPRECATED270": DEPRECATED270,
"DEPRECATED271": DEPRECATED271,
"DEPRECATED272": DEPRECATED272,
"DEPRECATED273": DEPRECATED273,
"DEPRECATED274": DEPRECATED274,
"DEPRECATED275": DEPRECATED275,
"DEPRECATED276": DEPRECATED276,
"PERSON": PERSON,
"NORP": NORP,
"FACILITY": FACILITY,
"ORG": ORG,
"GPE": GPE,
"LOC": LOC,
"PRODUCT": PRODUCT,
"EVENT": EVENT,
"WORK_OF_ART": WORK_OF_ART,
"LANGUAGE": LANGUAGE,
"DATE": DATE,
"TIME": TIME,
"PERCENT": PERCENT,
"MONEY": MONEY,
"QUANTITY": QUANTITY,
"ORDINAL": ORDINAL,
"CARDINAL": CARDINAL,
"acomp": acomp,
"advcl": advcl,
"advmod": advmod,
"agent": agent,
"amod": amod,
"appos": appos,
"attr": attr,
"aux": aux,
"auxpass": auxpass,
"cc": cc,
"ccomp": ccomp,
"complm": complm,
"conj": conj,
"cop": cop, # U20
"csubj": csubj,
"csubjpass": csubjpass,
"dep": dep,
"det": det,
"dobj": dobj,
"expl": expl,
"hmod": hmod,
"hyph": hyph,
"infmod": infmod,
"intj": intj,
"iobj": iobj,
"mark": mark,
"meta": meta,
"neg": neg,
"nmod": nmod,
"nn": nn,
"npadvmod": npadvmod,
"nsubj": nsubj,
"nsubjpass": nsubjpass,
"num": num,
"number": number,
"oprd": oprd,
"obj": obj, # U20
"obl": obl, # U20
"parataxis": parataxis,
"partmod": partmod,
"pcomp": pcomp,
"pobj": pobj,
"poss": poss,
"possessive": possessive,
"preconj": preconj,
"prep": prep,
"prt": prt,
"punct": punct,
"quantmod": quantmod,
"rcmod": rcmod,
"relcl": relcl,
"root": root,
"xcomp": xcomp,
"acl": acl,
"LAW": LAW,
"MORPH": MORPH,
"_": _,
}
def sort_nums(x):
return x[1]
NAMES = [it[0] for it in sorted(IDS.items(), key=sort_nums)]
# Unfortunate hack here, to work around problem with long cpdef enum
# (which is generating an enormous amount of C++ in Cython 0.24+)
# We keep the enum cdef, and just make sure the names are available to Python
locals().update(IDS)
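# Illustrative note (assuming the contiguous enum values defined above):
# IDS maps symbol names to their integer values and NAMES inverts the
# mapping by value, so e.g. NAMES[IDS["ORTH"]] == "ORTH".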
from libcpp.vector cimport vector
from preshed.maps cimport PreshMap
from cymem.cymem cimport Pool
from .typedefs cimport hash_t
from .structs cimport LexemeC, SpanC, TokenC
from .strings cimport StringStore
from .tokens.doc cimport Doc
from .vocab cimport Vocab, LexemesOrTokens, _Cached
from .matcher.phrasematcher cimport PhraseMatcher
cdef class Tokenizer:
cdef Pool mem
cdef PreshMap _cache
cdef PreshMap _specials
cdef readonly Vocab vocab
cdef object _token_match
cdef object _url_match
cdef object _prefix_search
cdef object _suffix_search
cdef object _infix_finditer
cdef object _rules
cdef PhraseMatcher _special_matcher
# TODO next two are unused and should be removed in v4
# https://github.com/explosion/spaCy/pull/9150
cdef int _unused_int1
cdef int _unused_int2
cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)
cdef int _apply_special_cases(self, Doc doc) except -1
cdef void _filter_special_spans(self, vector[SpanC] &original,
vector[SpanC] &filtered, int doc_len) nogil
cdef object _prepare_special_spans(self, Doc doc,
vector[SpanC] &filtered)
cdef int _retokenize_special_spans(self, Doc doc, TokenC* tokens,
object span_data)
cdef int _try_specials_and_cache(self, hash_t key, Doc tokens,
int* has_special,
bint with_special_cases) except -1
cdef int _tokenize(self, Doc tokens, str span, hash_t key,
int* has_special, bint with_special_cases) except -1
cdef str _split_affixes(self, Pool mem, str string,
vector[LexemeC*] *prefixes,
vector[LexemeC*] *suffixes, int* has_special,
bint with_special_cases)
cdef int _attach_tokens(self, Doc tokens, str string,
vector[LexemeC*] *prefixes,
vector[LexemeC*] *suffixes, int* has_special,
bint with_special_cases) except -1
cdef int _save_cached(self, const TokenC* tokens, hash_t key,
int* has_special, int n) except -1
# cython: embedsignature=True, profile=True, binding=True
from cython.operator cimport dereference as deref
from cython.operator cimport preincrement as preinc
from libc.string cimport memcpy, memset
from libcpp.set cimport set as stdset
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
cimport cython
import re
import warnings
from .tokens.doc cimport Doc
from .strings cimport hash_string
from .lexeme cimport EMPTY_LEXEME
from .attrs import intify_attrs
from .symbols import ORTH, NORM
from .errors import Errors, Warnings
from . import util
from .util import registry, get_words_and_spaces
from .scorer import Scorer
from .training import validate_examples
from .tokens import Span
cdef class Tokenizer:
"""Segment text, and create Doc objects with the discovered segment
boundaries.
DOCS: https://spacy.io/api/tokenizer
"""
def __init__(self, Vocab vocab, rules=None, prefix_search=None,
suffix_search=None, infix_finditer=None, token_match=None,
url_match=None):
"""Create a `Tokenizer`, to create `Doc` objects given unicode text.
vocab (Vocab): A storage container for lexical types.
rules (dict): Exceptions and special-cases for the tokenizer.
prefix_search (callable): A function matching the signature of
`re.compile(string).search` to match prefixes.
suffix_search (callable): A function matching the signature of
`re.compile(string).search` to match suffixes.
infix_finditer (callable): A function matching the signature of
`re.compile(string).finditer` to find infixes.
token_match (callable): A boolean function matching strings to be
recognized as tokens.
url_match (callable): A boolean function matching strings to be
recognized as tokens after considering prefixes and suffixes.
EXAMPLE:
>>> tokenizer = Tokenizer(nlp.vocab)
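A fuller construction (illustrative sketch; `prefix_re`, `suffix_re` and
`infix_re` are assumed to be compiled regexes, e.g. built with the
helpers in spacy.util such as compile_prefix_regex):
>>> tokenizer = Tokenizer(nlp.vocab, rules={},
...                       prefix_search=prefix_re.search,
...                       suffix_search=suffix_re.search,
...                       infix_finditer=infix_re.finditer)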
DOCS: https://spacy.io/api/tokenizer#init
"""
self.mem = Pool()
self._cache = PreshMap()
self._specials = PreshMap()
self.token_match = token_match
self.url_match = url_match
self.prefix_search = prefix_search
self.suffix_search = suffix_search
self.infix_finditer = infix_finditer
self.vocab = vocab
self._rules = {}
self._special_matcher = PhraseMatcher(self.vocab)
self._load_special_cases(rules)
property token_match:
def __get__(self):
return self._token_match
def __set__(self, token_match):
self._token_match = token_match
self._reload_special_cases()
property url_match:
def __get__(self):
return self._url_match
def __set__(self, url_match):
self._url_match = url_match
self._reload_special_cases()
property prefix_search:
def __get__(self):
return self._prefix_search
def __set__(self, prefix_search):
self._prefix_search = prefix_search
self._reload_special_cases()
property suffix_search:
def __get__(self):
return self._suffix_search
def __set__(self, suffix_search):
self._suffix_search = suffix_search
self._reload_special_cases()
property infix_finditer:
def __get__(self):
return self._infix_finditer
def __set__(self, infix_finditer):
self._infix_finditer = infix_finditer
self._reload_special_cases()
property rules:
def __get__(self):
return self._rules
def __set__(self, rules):
self._rules = {}
self._flush_cache()
self._flush_specials()
self._cache = PreshMap()
self._specials = PreshMap()
self._load_special_cases(rules)
def __reduce__(self):
args = (self.vocab,
self.rules,
self.prefix_search,
self.suffix_search,
self.infix_finditer,
self.token_match,
self.url_match)
return (self.__class__, args, None, None)
def __call__(self, str string):
"""Tokenize a string.
string (str): The string to tokenize.
RETURNS (Doc): A container for linguistic annotations.
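EXAMPLE (illustrative; assumes an existing `nlp` pipeline):
>>> doc = nlp.tokenizer("Give it back! He pleaded.")
>>> [t.text for t in doc][:3]
['Give', 'it', 'back']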
DOCS: https://spacy.io/api/tokenizer#call
"""
doc = self._tokenize_affixes(string, True)
self._apply_special_cases(doc)
return doc
@cython.boundscheck(False)
cdef Doc _tokenize_affixes(self, str string, bint with_special_cases):
"""Tokenize according to affix and token_match settings.
string (str): The string to tokenize.
RETURNS (Doc): A container for linguistic annotations.
"""
if len(string) >= (2 ** 30):
raise ValueError(Errors.E025.format(length=len(string)))
cdef int length = len(string)
cdef Doc doc = Doc(self.vocab)
if length == 0:
return doc
cdef int i = 0
cdef int start = 0
cdef int has_special = 0
cdef bint in_ws = string[0].isspace()
cdef str span
# The task here is much like string.split, but not quite
# We find spans of whitespace and non-space characters, and ignore
# spans that are exactly ' '. So, our sequences will all be separated
# by either ' ' or nothing.
for uc in string:
if uc.isspace() != in_ws:
if start < i:
# When we want to make this fast, get the data buffer once
# with PyUnicode_AS_DATA, and then maintain a start_byte
# and end_byte, so we can call hash64 directly. That way
# we don't have to create the slice when we hit the cache.
span = string[start:i]
key = hash_string(span)
if not self._try_specials_and_cache(key, doc, &has_special, with_special_cases):
self._tokenize(doc, span, key, &has_special, with_special_cases)
if uc == ' ':
doc.c[doc.length - 1].spacy = True
start = i + 1
else:
start = i
in_ws = not in_ws
i += 1
if start < i:
span = string[start:]
key = hash_string(span)
if not self._try_specials_and_cache(key, doc, &has_special, with_special_cases):
self._tokenize(doc, span, key, &has_special, with_special_cases)
doc.c[doc.length - 1].spacy = string[-1] == " " and not in_ws
return doc
def pipe(self, texts, batch_size=1000):
"""Tokenize a stream of texts.
texts: A sequence of unicode texts.
batch_size (int): Number of texts to accumulate in an internal buffer.
Defaults to 1000.
YIELDS (Doc): A sequence of Doc objects, in order.
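EXAMPLE (illustrative; assumes an existing `nlp` pipeline and an iterable
`texts` of strings):
>>> for doc in nlp.tokenizer.pipe(texts, batch_size=50):
...     pass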
DOCS: https://spacy.io/api/tokenizer#pipe
"""
for text in texts:
yield self(text)
def _flush_cache(self):
self._reset_cache([key for key in self._cache])
def _reset_cache(self, keys):
for k in keys:
cached = <_Cached*>self._cache.get(k)
del self._cache[k]
if cached is not NULL:
self.mem.free(cached)
def _flush_specials(self):
self._special_matcher = PhraseMatcher(self.vocab)
for k in self._specials:
cached = <_Cached*>self._specials.get(k)
del self._specials[k]
if cached is not NULL:
self.mem.free(cached)
cdef int _apply_special_cases(self, Doc doc) except -1:
"""Retokenize doc according to special cases.
doc (Doc): Document.
"""
cdef int i
cdef int max_length = 0
cdef bint modify_in_place
cdef Pool mem = Pool()
cdef vector[SpanC] c_matches
cdef vector[SpanC] c_filtered
cdef int offset
cdef int modified_doc_length
# Find matches for special cases
self._special_matcher.find_matches(doc, 0, doc.length, &c_matches)
# Skip processing if no matches
if c_matches.size() == 0:
return True
self._filter_special_spans(c_matches, c_filtered, doc.length)
# Put span info in span.start-indexed dict and calculate maximum
# intermediate document size
(span_data, max_length, modify_in_place) = self._prepare_special_spans(doc, c_filtered)
# If modifications never increase doc length, can modify in place
if modify_in_place:
tokens = doc.c
# Otherwise create a separate array to store modified tokens
else:
assert max_length > 0
tokens = <TokenC*>mem.alloc(max_length, sizeof(TokenC))
# Modify tokenization according to filtered special cases
offset = self._retokenize_special_spans(doc, tokens, span_data)
# Allocate more memory for doc if needed
modified_doc_length = doc.length + offset
while modified_doc_length >= doc.max_length:
doc._realloc(doc.max_length * 2)
# If not modified in place, copy tokens back to doc
if not modify_in_place:
memcpy(doc.c, tokens, max_length * sizeof(TokenC))
for i in range(doc.length + offset, doc.length):
memset(&doc.c[i], 0, sizeof(TokenC))
doc.c[i].lex = &EMPTY_LEXEME
doc.length = doc.length + offset
return True
cdef void _filter_special_spans(self, vector[SpanC] &original, vector[SpanC] &filtered, int doc_len) nogil:
cdef int seen_i
cdef SpanC span
cdef stdset[int] seen_tokens
stdsort(original.begin(), original.end(), len_start_cmp)
cdef int orig_i = original.size() - 1
while orig_i >= 0:
span = original[orig_i]
if not seen_tokens.count(span.start) and not seen_tokens.count(span.end - 1):
filtered.push_back(span)
for seen_i in range(span.start, span.end):
seen_tokens.insert(seen_i)
orig_i -= 1
stdsort(filtered.begin(), filtered.end(), start_cmp)
cdef object _prepare_special_spans(self, Doc doc, vector[SpanC] &filtered):
spans = [doc[match.start:match.end] for match in filtered]
cdef bint modify_in_place = True
cdef int curr_length = doc.length
cdef int max_length = 0
cdef int span_length_diff = 0
span_data = {}
for span in spans:
rule = self._rules.get(span.text, None)
span_length_diff = 0
if rule:
span_length_diff = len(rule) - (span.end - span.start)
if span_length_diff > 0:
modify_in_place = False
curr_length += span_length_diff
if curr_length > max_length:
max_length = curr_length
span_data[span.start] = (span.text, span.start, span.end, span_length_diff)
return (span_data, max_length, modify_in_place)
cdef int _retokenize_special_spans(self, Doc doc, TokenC* tokens, object span_data):
cdef int i = 0
cdef int j = 0
cdef int offset = 0
cdef _Cached* cached
cdef int idx_offset = 0
cdef int orig_final_spacy
cdef int orig_idx
cdef int span_start
cdef int span_end
while i < doc.length:
if not i in span_data:
tokens[i + offset] = doc.c[i]
i += 1
else:
span = span_data[i]
span_start = span[1]
span_end = span[2]
cached = <_Cached*>self._specials.get(hash_string(span[0]))
if cached == NULL:
# Copy original tokens if no rule found
for j in range(span_end - span_start):
tokens[i + offset + j] = doc.c[i + j]
i += span_end - span_start
else:
# Copy special case tokens into doc and adjust token and
# character offsets
idx_offset = 0
orig_final_spacy = doc.c[span_end - 1].spacy
orig_idx = doc.c[i].idx
for j in range(cached.length):
tokens[i + offset + j] = cached.data.tokens[j]
tokens[i + offset + j].idx = orig_idx + idx_offset
idx_offset += cached.data.tokens[j].lex.length
if cached.data.tokens[j].spacy:
idx_offset += 1
tokens[i + offset + cached.length - 1].spacy = orig_final_spacy
i += span_end - span_start
offset += span[3]
return offset
cdef int _try_specials_and_cache(self, hash_t key, Doc tokens, int* has_special, bint with_special_cases) except -1:
cdef bint specials_hit = 0
cdef bint cache_hit = 0
cdef int i
if with_special_cases:
cached = <_Cached*>self._specials.get(key)
if cached == NULL:
specials_hit = False
else:
for i in range(cached.length):
tokens.push_back(&cached.data.tokens[i], False)
has_special[0] = 1
specials_hit = True
if not specials_hit:
cached = <_Cached*>self._cache.get(key)
if cached == NULL:
cache_hit = False
else:
if cached.is_lex:
for i in range(cached.length):
tokens.push_back(cached.data.lexemes[i], False)
else:
for i in range(cached.length):
tokens.push_back(&cached.data.tokens[i], False)
cache_hit = True
if not specials_hit and not cache_hit:
return False
return True
cdef int _tokenize(self, Doc tokens, str span, hash_t orig_key, int* has_special, bint with_special_cases) except -1:
cdef vector[LexemeC*] prefixes
cdef vector[LexemeC*] suffixes
cdef int orig_size
orig_size = tokens.length
span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes,
has_special, with_special_cases)
self._attach_tokens(tokens, span, &prefixes, &suffixes, has_special,
with_special_cases)
self._save_cached(&tokens.c[orig_size], orig_key, has_special,
tokens.length - orig_size)
cdef str _split_affixes(self, Pool mem, str string,
vector[const LexemeC*] *prefixes,
vector[const LexemeC*] *suffixes,
int* has_special,
bint with_special_cases):
cdef size_t i
cdef str prefix
cdef str suffix
cdef str minus_pre
cdef str minus_suf
cdef size_t last_size = 0
while string and len(string) != last_size:
if self.token_match and self.token_match(string):
break
if with_special_cases and self._specials.get(hash_string(string)) != NULL:
break
last_size = len(string)
pre_len = self.find_prefix(string)
if pre_len != 0:
prefix = string[:pre_len]
minus_pre = string[pre_len:]
if minus_pre and with_special_cases and self._specials.get(hash_string(minus_pre)) != NULL:
string = minus_pre
prefixes.push_back(self.vocab.get(mem, prefix))
break
suf_len = self.find_suffix(string[pre_len:])
if suf_len != 0:
suffix = string[-suf_len:]
minus_suf = string[:-suf_len]
if minus_suf and with_special_cases and self._specials.get(hash_string(minus_suf)) != NULL:
string = minus_suf
suffixes.push_back(self.vocab.get(mem, suffix))
break
if pre_len and suf_len and (pre_len + suf_len) <= len(string):
string = string[pre_len:-suf_len]
prefixes.push_back(self.vocab.get(mem, prefix))
suffixes.push_back(self.vocab.get(mem, suffix))
elif pre_len:
string = minus_pre
prefixes.push_back(self.vocab.get(mem, prefix))
elif suf_len:
string = minus_suf
suffixes.push_back(self.vocab.get(mem, suffix))
return string
cdef int _attach_tokens(self, Doc tokens, str string,
vector[const LexemeC*] *prefixes,
vector[const LexemeC*] *suffixes,
int* has_special,
bint with_special_cases) except -1:
cdef bint specials_hit = 0
cdef bint cache_hit = 0
cdef int split, end
cdef const LexemeC* const* lexemes
cdef const LexemeC* lexeme
cdef str span
cdef int i
if prefixes.size():
for i in range(prefixes.size()):
tokens.push_back(prefixes[0][i], False)
if string:
if self._try_specials_and_cache(hash_string(string), tokens, has_special, with_special_cases):
pass
elif (self.token_match and self.token_match(string)) or \
(self.url_match and \
self.url_match(string)):
# We're always saying 'no' to spaces here -- the caller will
# fix up the outermost one, with reference to the original.
# See Issue #859
tokens.push_back(self.vocab.get(tokens.mem, string), False)
else:
matches = self.find_infix(string)
if not matches:
tokens.push_back(self.vocab.get(tokens.mem, string), False)
else:
# Let's say we have dyn-o-mite-dave - the regex finds the
# start and end positions of the hyphens
start = 0
start_before_infixes = start
for match in matches:
infix_start = match.start()
infix_end = match.end()
if infix_start == start_before_infixes:
continue
if infix_start != start:
span = string[start:infix_start]
tokens.push_back(self.vocab.get(tokens.mem, span), False)
if infix_start != infix_end:
# If infix_start != infix_end, it means the infix
# token is non-empty. Empty infix tokens are useful
# for tokenization in some languages (see
# https://github.com/explosion/spaCy/issues/768)
infix_span = string[infix_start:infix_end]
tokens.push_back(self.vocab.get(tokens.mem, infix_span), False)
start = infix_end
span = string[start:]
if span:
tokens.push_back(self.vocab.get(tokens.mem, span), False)
cdef vector[const LexemeC*].reverse_iterator it = suffixes.rbegin()
while it != suffixes.rend():
lexeme = deref(it)
preinc(it)
tokens.push_back(lexeme, False)
cdef int _save_cached(self, const TokenC* tokens, hash_t key,
int* has_special, int n) except -1:
cdef int i
if n <= 0:
# avoid mem alloc of zero length
return 0
for i in range(n):
if self.vocab._by_orth.get(tokens[i].lex.orth) == NULL:
return 0
# See #1250
if has_special[0]:
return 0
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
cached.length = n
cached.is_lex = True
lexemes = <const LexemeC**>self.mem.alloc(n, sizeof(LexemeC**))
for i in range(n):
lexemes[i] = tokens[i].lex
cached.data.lexemes = <const LexemeC* const*>lexemes
self._cache.set(key, cached)
def find_infix(self, str string):
"""Find internal split points of the string, such as hyphens.
string (str): The string to segment.
RETURNS (list): A list of `re.Match` objects that have `.start()`
and `.end()` methods, denoting the placement of internal segment
separators, e.g. hyphens.
DOCS: https://spacy.io/api/tokenizer#find_infix
"""
if self.infix_finditer is None:
return 0
return list(self.infix_finditer(string))
def find_prefix(self, str string):
"""Find the length of a prefix that should be segmented from the
string, or None if no prefix rules match.
string (str): The string to segment.
RETURNS (int): The length of the prefix if present, otherwise `None`.
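EXAMPLE (illustrative; with the default English punctuation rules the
leading quote is matched as a one-character prefix):
>>> nlp.tokenizer.find_prefix('"Hello')
1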
DOCS: https://spacy.io/api/tokenizer#find_prefix
"""
if self.prefix_search is None:
return 0
match = self.prefix_search(string)
return (match.end() - match.start()) if match is not None else 0
def find_suffix(self, str string):
"""Find the length of a suffix that should be segmented from the
string, or None if no suffix rules match.
string (str): The string to segment.
Returns (int): The length of the suffix if present, otherwise `None`.
DOCS: https://spacy.io/api/tokenizer#find_suffix
"""
if self.suffix_search is None:
return 0
match = self.suffix_search(string)
return (match.end() - match.start()) if match is not None else 0
def _load_special_cases(self, special_cases):
"""Add special-case tokenization rules."""
if special_cases is not None:
for chunk, substrings in sorted(special_cases.items()):
self.add_special_case(chunk, substrings)
def _validate_special_case(self, chunk, substrings):
"""Check whether the `ORTH` fields match the string. Check that
additional features beyond `ORTH` and `NORM` are not set by the
exception.
chunk (str): The string to specially tokenize.
substrings (iterable): A sequence of dicts, where each dict describes
a token and its attributes.
"""
attrs = [intify_attrs(spec, _do_deprecated=True) for spec in substrings]
orth = "".join([spec[ORTH] for spec in attrs])
if chunk != orth:
raise ValueError(Errors.E997.format(chunk=chunk, orth=orth, token_attrs=substrings))
for substring in attrs:
for attr in substring:
if attr not in (ORTH, NORM):
raise ValueError(Errors.E1005.format(attr=self.vocab.strings[attr], chunk=chunk))
def add_special_case(self, str string, substrings):
"""Add a special-case tokenization rule.
string (str): The string to specially tokenize.
substrings (iterable): A sequence of dicts, where each dict describes
a token and its attributes. The `ORTH` fields of the attributes
must exactly match the string when they are concatenated.
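EXAMPLE (illustrative):
>>> from spacy.attrs import ORTH
>>> nlp.tokenizer.add_special_case("don't", [{ORTH: "do"}, {ORTH: "n't"}])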
DOCS: https://spacy.io/api/tokenizer#add_special_case
"""
self._validate_special_case(string, substrings)
substrings = list(substrings)
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
cached.length = len(substrings)
cached.is_lex = False
cached.data.tokens = self.vocab.make_fused_token(substrings)
key = hash_string(string)
stale_special = <_Cached*>self._specials.get(key)
self._specials.set(key, cached)
if stale_special is not NULL:
self.mem.free(stale_special)
self._rules[string] = substrings
self._flush_cache()
if self.find_prefix(string) or self.find_infix(string) or self.find_suffix(string) or " " in string:
self._special_matcher.add(string, None, self._tokenize_affixes(string, False))
def _reload_special_cases(self):
self._flush_cache()
self._flush_specials()
self._load_special_cases(self._rules)
def explain(self, text):
"""A debugging tokenizer that provides information about which
tokenizer rule or pattern was matched for each token. The tokens
produced are identical to `nlp.tokenizer()` except for whitespace
tokens.
text (str): The string to tokenize.
RETURNS (list): A list of (pattern_string, token_string) tuples.
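EXAMPLE (illustrative; output shown for the default English rules):
>>> nlp.tokenizer.explain("(don't)")
[('PREFIX', '('), ('SPECIAL-1', 'do'), ('SPECIAL-2', "n't"), ('SUFFIX', ')')]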
DOCS: https://spacy.io/api/tokenizer#explain
"""
prefix_search = self.prefix_search
if prefix_search is None:
prefix_search = re.compile("a^").search
suffix_search = self.suffix_search
if suffix_search is None:
suffix_search = re.compile("a^").search
infix_finditer = self.infix_finditer
if infix_finditer is None:
infix_finditer = re.compile("a^").finditer
token_match = self.token_match
if token_match is None:
token_match = re.compile("a^").match
url_match = self.url_match
if url_match is None:
url_match = re.compile("a^").match
special_cases = {}
for orth, special_tokens in self.rules.items():
special_cases[orth] = [intify_attrs(special_token, strings_map=self.vocab.strings, _do_deprecated=True) for special_token in special_tokens]
tokens = []
for substring in text.split():
suffixes = []
while substring:
while prefix_search(substring) or suffix_search(substring):
if token_match(substring):
tokens.append(("TOKEN_MATCH", substring))
substring = ''
break
if substring in special_cases:
tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
substring = ''
break
if prefix_search(substring):
split = prefix_search(substring).end()
# break if pattern matches the empty string
if split == 0:
break
tokens.append(("PREFIX", substring[:split]))
substring = substring[split:]
if substring in special_cases:
continue
if suffix_search(substring):
split = suffix_search(substring).start()
# break if pattern matches the empty string
if split == len(substring):
break
suffixes.append(("SUFFIX", substring[split:]))
substring = substring[:split]
if len(substring) == 0:
continue
if token_match(substring):
tokens.append(("TOKEN_MATCH", substring))
substring = ''
elif url_match(substring):
tokens.append(("URL_MATCH", substring))
substring = ''
elif substring in special_cases:
tokens.extend((f"SPECIAL-{i + 1}", self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
substring = ''
elif list(infix_finditer(substring)):
infixes = infix_finditer(substring)
offset = 0
for match in infixes:
if substring[offset : match.start()]:
tokens.append(("TOKEN", substring[offset : match.start()]))
if substring[match.start() : match.end()]:
tokens.append(("INFIX", substring[match.start() : match.end()]))
offset = match.end()
if substring[offset:]:
tokens.append(("TOKEN", substring[offset:]))
substring = ''
elif substring:
tokens.append(("TOKEN", substring))
substring = ''
tokens.extend(reversed(suffixes))
# Find matches for special cases handled by special matcher
words, spaces = get_words_and_spaces([t[1] for t in tokens], text)
t_words = []
t_spaces = []
for word, space in zip(words, spaces):
if not word.isspace():
t_words.append(word)
t_spaces.append(space)
doc = Doc(self.vocab, words=t_words, spaces=t_spaces)
matches = self._special_matcher(doc)
spans = [Span(doc, s, e, label=m_id) for m_id, s, e in matches]
spans = util.filter_spans(spans)
# Replace matched tokens with their exceptions
i = 0
final_tokens = []
spans_by_start = {s.start: s for s in spans}
while i < len(tokens):
if i in spans_by_start:
span = spans_by_start[i]
exc = [d[ORTH] for d in special_cases[span.label_]]
for j, orth in enumerate(exc):
final_tokens.append((f"SPECIAL-{j + 1}", self.vocab.strings[orth]))
i += len(span)
else:
final_tokens.append(tokens[i])
i += 1
return final_tokens
def score(self, examples, **kwargs):
validate_examples(examples, "Tokenizer.score")
return Scorer.score_tokenization(examples)
def to_disk(self, path, **kwargs):
"""Save the current state to a directory.
path (str / Path): A path to a directory, which will be created if
it doesn't exist.
exclude (list): String names of serialization fields to exclude.
DOCS: https://spacy.io/api/tokenizer#to_disk
"""
path = util.ensure_path(path)
with path.open("wb") as file_:
file_.write(self.to_bytes(**kwargs))
def from_disk(self, path, *, exclude=tuple()):
"""Loads state from a directory. Modifies the object in place and
returns it.
path (str / Path): A path to a directory.
exclude (list): String names of serialization fields to exclude.
RETURNS (Tokenizer): The modified `Tokenizer` object.
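EXAMPLE (illustrative sketch; the file name is arbitrary):
>>> nlp.tokenizer.to_disk("/tmp/tokenizer.bin")
>>> nlp.tokenizer.from_disk("/tmp/tokenizer.bin")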
DOCS: https://spacy.io/api/tokenizer#from_disk
"""
path = util.ensure_path(path)
with path.open("rb") as file_:
bytes_data = file_.read()
self.from_bytes(bytes_data, exclude=exclude)
return self
def to_bytes(self, *, exclude=tuple()):
"""Serialize the current state to a binary string.
exclude (list): String names of serialization fields to exclude.
RETURNS (bytes): The serialized form of the `Tokenizer` object.
DOCS: https://spacy.io/api/tokenizer#to_bytes
"""
serializers = {
"vocab": lambda: self.vocab.to_bytes(exclude=exclude),
"prefix_search": lambda: _get_regex_pattern(self.prefix_search),
"suffix_search": lambda: _get_regex_pattern(self.suffix_search),
"infix_finditer": lambda: _get_regex_pattern(self.infix_finditer),
"token_match": lambda: _get_regex_pattern(self.token_match),
"url_match": lambda: _get_regex_pattern(self.url_match),
"exceptions": lambda: dict(sorted(self._rules.items()))
}
return util.to_bytes(serializers, exclude)
def from_bytes(self, bytes_data, *, exclude=tuple()):
"""Load state from a binary string.
bytes_data (bytes): The data to load from.
exclude (list): String names of serialization fields to exclude.
RETURNS (Tokenizer): The `Tokenizer` object.
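EXAMPLE (illustrative sketch):
>>> tokenizer_bytes = nlp.tokenizer.to_bytes()
>>> nlp.tokenizer.from_bytes(tokenizer_bytes)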
DOCS: https://spacy.io/api/tokenizer#from_bytes
"""
data = {}
deserializers = {
"vocab": lambda b: self.vocab.from_bytes(b, exclude=exclude),
"prefix_search": lambda b: data.setdefault("prefix_search", b),
"suffix_search": lambda b: data.setdefault("suffix_search", b),
"infix_finditer": lambda b: data.setdefault("infix_finditer", b),
"token_match": lambda b: data.setdefault("token_match", b),
"url_match": lambda b: data.setdefault("url_match", b),
"exceptions": lambda b: data.setdefault("rules", b)
}
# reset all properties and flush all caches (through rules),
# reset rules first so that _reload_special_cases is trivial/fast as
# the other properties are reset
self.rules = {}
self.prefix_search = None
self.suffix_search = None
self.infix_finditer = None
self.token_match = None
self.url_match = None
msg = util.from_bytes(bytes_data, deserializers, exclude)
if "prefix_search" in data and isinstance(data["prefix_search"], str):
self.prefix_search = re.compile(data["prefix_search"]).search
if "suffix_search" in data and isinstance(data["suffix_search"], str):
self.suffix_search = re.compile(data["suffix_search"]).search
if "infix_finditer" in data and isinstance(data["infix_finditer"], str):
self.infix_finditer = re.compile(data["infix_finditer"]).finditer
if "token_match" in data and isinstance(data["token_match"], str):
self.token_match = re.compile(data["token_match"]).match
if "url_match" in data and isinstance(data["url_match"], str):
self.url_match = re.compile(data["url_match"]).match
if "rules" in data and isinstance(data["rules"], dict):
self.rules = data["rules"]
return self
def _get_regex_pattern(regex):
"""Get a pattern string for a regex, or None if the pattern is None."""
return None if regex is None else regex.__self__.pattern
cdef extern from "<algorithm>" namespace "std" nogil:
void stdsort "sort"(vector[SpanC].iterator,
vector[SpanC].iterator,
bint (*)(SpanC, SpanC))
cdef bint len_start_cmp(SpanC a, SpanC b) nogil:
if a.end - a.start == b.end - b.start:
return b.start < a.start
return a.end - a.start < b.end - b.start
cdef bint start_cmp(SpanC a, SpanC b) nogil:
return a.start < b.start
from typing import TYPE_CHECKING
from typing import Optional, Any, Iterable, Dict, Callable, Sequence, List
from .compat import Protocol, runtime_checkable
from thinc.api import Optimizer, Model
if TYPE_CHECKING:
from .language import Language  # noqa: F401
from .training import Example
@runtime_checkable
class TrainableComponent(Protocol):
model: Any
is_trainable: bool
def update(
self,
examples: Iterable["Example"],
*,
drop: float = 0.0,
sgd: Optional[Optimizer] = None,
losses: Optional[Dict[str, float]] = None
) -> Dict[str, float]:
...
def finish_update(self, sgd: Optimizer) -> None:
...
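# Illustrative usage sketch (assumes an `nlp` pipeline with a trainable
# "ner" component): these protocols are runtime-checkable, so e.g.
# isinstance(nlp.get_pipe("ner"), TrainableComponent) can be used to test
# whether a pipe exposes update()/finish_update().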
@runtime_checkable
class InitializableComponent(Protocol):
def initialize(
self,
get_examples: Callable[[], Iterable["Example"]],
nlp: "Language",
**kwargs: Any
):
...
@runtime_checkable
class ListenedToComponent(Protocol):
model: Any
listeners: Sequence[Model]
listener_map: Dict[str, Sequence[Model]]
listening_components: List[str]
def add_listener(self, listener: Model, component_name: str) -> None:
...
def remove_listener(self, listener: Model, component_name: str) -> bool:
...
def find_listeners(self, component) -> None:
...
from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t, int32_t
from libc.stdint cimport uint8_t
ctypedef float weight_t
ctypedef uint64_t hash_t
ctypedef uint64_t class_t
ctypedef uint64_t attr_t
ctypedef uint64_t flags_t
ctypedef uint16_t len_t
ctypedef uint16_t tag_t
from typing import List, Mapping, NoReturn, Union, Dict, Any, Set
from typing import Optional, Iterable, Callable, Tuple, Type
from typing import Iterator, Type, Pattern, Generator, TYPE_CHECKING
from types import ModuleType
import os
import importlib
import importlib.util
import re
from pathlib import Path
import thinc
from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer
from thinc.api import ConfigValidationError, Model
import functools
import itertools
import numpy.random
import numpy
import srsly
import catalogue
from catalogue import RegistryError, Registry
import langcodes
import sys
import warnings
from packaging.specifiers import SpecifierSet, InvalidSpecifier
from packaging.version import Version, InvalidVersion
from packaging.requirements import Requirement
import subprocess
from contextlib import contextmanager
from collections import defaultdict
import tempfile
import shutil
import shlex
import inspect
import pkgutil
import logging
try:
import cupy.random
except ImportError:
cupy = None
# These are functions that were previously (v2.x) available from spacy.util
# and have since moved to Thinc. We're importing them here so people's code
# doesn't break, but they should always be imported from Thinc from now on,
# not from spacy.util.
from thinc.api import fix_random_seed, compounding, decaying # noqa: F401
from .symbols import ORTH
from .compat import cupy, CudaStream, is_windows, importlib_metadata
from .errors import Errors, Warnings, OLD_MODEL_SHORTCUTS
from . import about
if TYPE_CHECKING:
# This lets us add type hints for mypy etc. without causing circular imports
from .language import Language # noqa: F401
from .pipeline import Pipe # noqa: F401
from .tokens import Doc, Span # noqa: F401
from .vocab import Vocab # noqa: F401
# fmt: off
OOV_RANK = numpy.iinfo(numpy.uint64).max
DEFAULT_OOV_PROB = -20
LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"]
# Default order of sections in the config.cfg. Not all sections need to exist,
# and additional sections are added at the end, in alphabetical order.
CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "corpora", "training", "pretraining", "initialize"]
# fmt: on
logger = logging.getLogger("spacy")
logger_stream_handler = logging.StreamHandler()
logger_stream_handler.setFormatter(
logging.Formatter("[%(asctime)s] [%(levelname)s] %(message)s")
)
logger.addHandler(logger_stream_handler)
class ENV_VARS:
CONFIG_OVERRIDES = "SPACY_CONFIG_OVERRIDES"
PROJECT_USE_GIT_VERSION = "SPACY_PROJECT_USE_GIT_VERSION"
class registry(thinc.registry):
languages = catalogue.create("spacy", "languages", entry_points=True)
architectures = catalogue.create("spacy", "architectures", entry_points=True)
tokenizers = catalogue.create("spacy", "tokenizers", entry_points=True)
lemmatizers = catalogue.create("spacy", "lemmatizers", entry_points=True)
lookups = catalogue.create("spacy", "lookups", entry_points=True)
displacy_colors = catalogue.create("spacy", "displacy_colors", entry_points=True)
misc = catalogue.create("spacy", "misc", entry_points=True)
# Callback functions used to manipulate nlp object etc.
callbacks = catalogue.create("spacy", "callbacks", entry_points=True)
batchers = catalogue.create("spacy", "batchers", entry_points=True)
readers = catalogue.create("spacy", "readers", entry_points=True)
augmenters = catalogue.create("spacy", "augmenters", entry_points=True)
loggers = catalogue.create("spacy", "loggers", entry_points=True)
scorers = catalogue.create("spacy", "scorers", entry_points=True)
# These are factories registered via third-party packages and the
# spacy_factories entry point. This registry only exists so we can easily
# load them via the entry points. The "true" factories are added via the
# Language.factory decorator (in the spaCy code base and user code) and those
# are the factories used to initialize components via registry.resolve.
_entry_point_factories = catalogue.create("spacy", "factories", entry_points=True)
factories = catalogue.create("spacy", "internal_factories")
# This is mostly used to get a list of all installed models in the current
# environment. spaCy models packaged with `spacy package` will "advertise"
# themselves via entry points.
models = catalogue.create("spacy", "models", entry_points=True)
cli = catalogue.create("spacy", "cli", entry_points=True)
@classmethod
def get_registry_names(cls) -> List[str]:
"""List all available registries."""
names = []
for name, value in inspect.getmembers(cls):
if not name.startswith("_") and isinstance(value, Registry):
names.append(name)
return sorted(names)
@classmethod
def get(cls, registry_name: str, func_name: str) -> Callable:
"""Get a registered function from the registry."""
# We're overwriting this classmethod so we're able to provide more
# specific error messages and implement a fallback to spacy-legacy.
if not hasattr(cls, registry_name):
names = ", ".join(cls.get_registry_names()) or "none"
raise RegistryError(Errors.E892.format(name=registry_name, available=names))
reg = getattr(cls, registry_name)
try:
func = reg.get(func_name)
except RegistryError:
if func_name.startswith("spacy."):
legacy_name = func_name.replace("spacy.", "spacy-legacy.")
try:
return reg.get(legacy_name)
except catalogue.RegistryError:
pass
available = ", ".join(sorted(reg.get_all().keys())) or "none"
raise RegistryError(
Errors.E893.format(
name=func_name, reg_name=registry_name, available=available
)
) from None
return func
@classmethod
def find(cls, registry_name: str, func_name: str) -> Callable:
"""Get info about a registered function from the registry."""
# We're overwriting this classmethod so we're able to provide more
# specific error messages and implement a fallback to spacy-legacy.
if not hasattr(cls, registry_name):
names = ", ".join(cls.get_registry_names()) or "none"
raise RegistryError(Errors.E892.format(name=registry_name, available=names))
reg = getattr(cls, registry_name)
try:
func_info = reg.find(func_name)
except RegistryError:
if func_name.startswith("spacy."):
legacy_name = func_name.replace("spacy.", "spacy-legacy.")
try:
return reg.find(legacy_name)
except catalogue.RegistryError:
pass
available = ", ".join(sorted(reg.get_all().keys())) or "none"
raise RegistryError(
Errors.E893.format(
name=func_name, reg_name=registry_name, available=available
)
) from None
return func_info
@classmethod
def has(cls, registry_name: str, func_name: str) -> bool:
"""Check whether a function is available in a registry."""
if not hasattr(cls, registry_name):
return False
reg = getattr(cls, registry_name)
if func_name.startswith("spacy."):
legacy_name = func_name.replace("spacy.", "spacy-legacy.")
return func_name in reg or legacy_name in reg
return func_name in reg
class SimpleFrozenDict(dict):
"""Simplified implementation of a frozen dict, mainly used as default
function or method argument (for arguments that should default to empty
dictionary). Will raise an error if the user or spaCy attempts to add to the dict.
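EXAMPLE (illustrative):
>>> frozen = SimpleFrozenDict(foo="bar")
>>> frozen["baz"] = 1  # raises NotImplementedError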
"""
def __init__(self, *args, error: str = Errors.E095, **kwargs) -> None:
"""Initialize the frozen dict. Can be initialized with pre-defined
values.
error (str): The error message when the user tries to assign to the dict.
"""
super().__init__(*args, **kwargs)
self.error = error
def __setitem__(self, key, value):
raise NotImplementedError(self.error)
def pop(self, key, default=None):
raise NotImplementedError(self.error)
def update(self, other):
raise NotImplementedError(self.error)
class SimpleFrozenList(list):
"""Wrapper class around a list that lets us raise custom errors if certain
attributes/methods are accessed. Mostly used for properties like
Language.pipeline that return an immutable list (and that we don't want to
convert to a tuple, to avoid breaking too much backwards compatibility). If a
user accidentally calls nlp.pipeline.append(), we can raise a more helpful error.
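EXAMPLE (illustrative):
>>> pipeline = SimpleFrozenList(["tagger", "parser"])
>>> pipeline.append("ner")  # raises NotImplementedError with Errors.E927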
"""
def __init__(self, *args, error: str = Errors.E927) -> None:
"""Initialize the frozen list.
error (str): The error message when the user tries to mutate the list.
"""
self.error = error
super().__init__(*args)
def append(self, *args, **kwargs):
raise NotImplementedError(self.error)
def clear(self, *args, **kwargs):
raise NotImplementedError(self.error)
def extend(self, *args, **kwargs):
raise NotImplementedError(self.error)
def insert(self, *args, **kwargs):
raise NotImplementedError(self.error)
def pop(self, *args, **kwargs):
raise NotImplementedError(self.error)
def remove(self, *args, **kwargs):
raise NotImplementedError(self.error)
def reverse(self, *args, **kwargs):
raise NotImplementedError(self.error)
def sort(self, *args, **kwargs):
raise NotImplementedError(self.error)
def lang_class_is_loaded(lang: str) -> bool:
"""Check whether a Language class is already loaded. Language classes are
loaded lazily, to avoid expensive setup code associated with the language
data.
lang (str): Two-letter language code, e.g. 'en'.
RETURNS (bool): Whether a Language class has been loaded.
"""
return lang in registry.languages
def find_matching_language(lang: str) -> Optional[str]:
"""
Given an IETF language code, find a supported spaCy language that is a
close match for it (according to Unicode CLDR language-matching rules).
This allows for language aliases, ISO 639-2 codes, more detailed language
tags, and close matches.
Returns the language code if a matching language is available, or None
if there is no matching language.
>>> find_matching_language('en')
'en'
>>> find_matching_language('pt-BR') # Brazilian Portuguese
'pt'
>>> find_matching_language('fra') # an ISO 639-2 code for French
'fr'
>>> find_matching_language('iw') # obsolete alias for Hebrew
'he'
>>> find_matching_language('no') # Norwegian
'nb'
>>> find_matching_language('mo') # old code for ro-MD
'ro'
>>> find_matching_language('zh-Hans') # Simplified Chinese
'zh'
>>> find_matching_language('zxx')
None
"""
import spacy.lang # noqa: F401
if lang == "xx":
return "xx"
# Find out which language modules we have
possible_languages = []
for modinfo in pkgutil.iter_modules(spacy.lang.__path__): # type: ignore
code = modinfo.name
if code == "xx":
# Temporarily make 'xx' into a valid language code
possible_languages.append("mul")
elif langcodes.tag_is_valid(code):
possible_languages.append(code)
# Distances from 1-9 allow near misses like Bosnian -> Croatian and
# Norwegian -> Norwegian Bokmål. A distance of 10 would include several
# more possibilities, like variants of Chinese like 'wuu', but text that
# is labeled that way is probably trying to be distinct from 'zh' and
# shouldn't automatically match.
match = langcodes.closest_supported_match(lang, possible_languages, max_distance=9)
if match == "mul":
# Convert 'mul' back to spaCy's 'xx'
return "xx"
else:
return match
def get_lang_class(lang: str) -> Type["Language"]:
"""Import and load a Language class.
lang (str): IETF language code, such as 'en'.
RETURNS (Language): Language class.
"""
# Check if language is registered / entry point is available
if lang in registry.languages:
return registry.languages.get(lang)
else:
# Find the language in the spacy.lang subpackage
try:
module = importlib.import_module(f".lang.{lang}", "spacy")
except ImportError as err:
# Find a matching language. For example, if the language 'no' is
# requested, we can use language-matching to load `spacy.lang.nb`.
try:
match = find_matching_language(lang)
except langcodes.tag_parser.LanguageTagError:
# proceed to raising an import error
match = None
if match:
lang = match
module = importlib.import_module(f".lang.{lang}", "spacy")
else:
raise ImportError(Errors.E048.format(lang=lang, err=err)) from err
set_lang_class(lang, getattr(module, module.__all__[0])) # type: ignore[attr-defined]
return registry.languages.get(lang)
def set_lang_class(name: str, cls: Type["Language"]) -> None:
"""Set a custom Language class name that can be loaded via get_lang_class.
name (str): Name of Language class.
cls (Language): Language class.
"""
registry.languages.register(name, func=cls)
def ensure_path(path: Any) -> Any:
"""Ensure string is converted to a Path.
path (Any): Anything. If string, it's converted to Path.
RETURNS: Path or original argument.
"""
if isinstance(path, str):
return Path(path)
else:
return path
def load_language_data(path: Union[str, Path]) -> Union[dict, list]:
"""Load JSON language data using the given path as a base. If the provided
path isn't present, will attempt to load a gzipped version before giving up.
path (str / Path): The data to load.
RETURNS: The loaded data.
"""
path = ensure_path(path)
if path.exists():
return srsly.read_json(path)
path = path.with_suffix(path.suffix + ".gz")
if path.exists():
return srsly.read_gzip_json(path)
raise ValueError(Errors.E160.format(path=path))
def get_module_path(module: ModuleType) -> Path:
"""Get the path of a Python module.
module (ModuleType): The Python module.
RETURNS (Path): The path.
"""
if not hasattr(module, "__module__"):
raise ValueError(Errors.E169.format(module=repr(module)))
return Path(sys.modules[module.__module__].__file__).parent
def load_model(
name: Union[str, Path],
*,
vocab: Union["Vocab", bool] = True,
disable: Iterable[str] = SimpleFrozenList(),
exclude: Iterable[str] = SimpleFrozenList(),
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
) -> "Language":
"""Load a model from a package or data path.
name (str): Package name or model path.
vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
a new Vocab object will be created.
disable (Iterable[str]): Names of pipeline components to disable.
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
components won't be loaded.
config (Dict[str, Any] / Config): Config overrides as nested dict or dict
keyed by section values in dot notation.
RETURNS (Language): The loaded nlp object.
"""
kwargs = {"vocab": vocab, "disable": disable, "exclude": exclude, "config": config}
if isinstance(name, str): # name or string path
if name.startswith("blank:"): # shortcut for blank model
return get_lang_class(name.replace("blank:", ""))()
if is_package(name): # installed as package
return load_model_from_package(name, **kwargs) # type: ignore[arg-type]
if Path(name).exists(): # path to model data directory
return load_model_from_path(Path(name), **kwargs) # type: ignore[arg-type]
elif hasattr(name, "exists"): # Path or Path-like to model data
return load_model_from_path(name, **kwargs) # type: ignore[arg-type]
if name in OLD_MODEL_SHORTCUTS:
raise IOError(Errors.E941.format(name=name, full=OLD_MODEL_SHORTCUTS[name])) # type: ignore[index]
raise IOError(Errors.E050.format(name=name))
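# Dispatch sketch for load_model: a "blank:" prefix builds an empty pipeline,
# a package name loads an installed model, and a filesystem path loads model
# data from disk. The package and path in the comments are assumptions used
# only for illustration.
def _example_load_model_dispatch() -> "Language":
    nlp = load_model("blank:en")
    # nlp = load_model("en_core_web_sm")        # installed package (if available)
    # nlp = load_model(Path("/path/to/model"))  # model data directory (if it exists)
    return nlp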
def load_model_from_package(
name: str,
*,
vocab: Union["Vocab", bool] = True,
disable: Iterable[str] = SimpleFrozenList(),
exclude: Iterable[str] = SimpleFrozenList(),
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
) -> "Language":
"""Load a model from an installed package.
name (str): The package name.
vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
a new Vocab object will be created.
disable (Iterable[str]): Names of pipeline components to disable. Disabled
pipes will be loaded but they won't be run unless you explicitly
enable them by calling nlp.enable_pipe.
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
components won't be loaded.
config (Dict[str, Any] / Config): Config overrides as nested dict or dict
keyed by section values in dot notation.
RETURNS (Language): The loaded nlp object.
"""
cls = importlib.import_module(name)
return cls.load(vocab=vocab, disable=disable, exclude=exclude, config=config) # type: ignore[attr-defined]
def load_model_from_path(
model_path: Path,
*,
meta: Optional[Dict[str, Any]] = None,
vocab: Union["Vocab", bool] = True,
disable: Iterable[str] = SimpleFrozenList(),
exclude: Iterable[str] = SimpleFrozenList(),
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
) -> "Language":
"""Load a model from a data directory path. Creates Language class with
pipeline from config.cfg and then calls from_disk() with path.
model_path (Path): Model path.
meta (Dict[str, Any]): Optional model meta.
vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
a new Vocab object will be created.
disable (Iterable[str]): Names of pipeline components to disable. Disabled
pipes will be loaded but they won't be run unless you explicitly
enable them by calling nlp.enable_pipe.
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
components won't be loaded.
config (Dict[str, Any] / Config): Config overrides as nested dict or dict
keyed by section values in dot notation.
RETURNS (Language): The loaded nlp object.
"""
if not model_path.exists():
raise IOError(Errors.E052.format(path=model_path))
if not meta:
meta = get_model_meta(model_path)
config_path = model_path / "config.cfg"
overrides = dict_to_dot(config)
config = load_config(config_path, overrides=overrides)
nlp = load_model_from_config(config, vocab=vocab, disable=disable, exclude=exclude)
return nlp.from_disk(model_path, exclude=exclude, overrides=overrides)
def load_model_from_config(
config: Union[Dict[str, Any], Config],
*,
vocab: Union["Vocab", bool] = True,
disable: Iterable[str] = SimpleFrozenList(),
exclude: Iterable[str] = SimpleFrozenList(),
auto_fill: bool = False,
validate: bool = True,
) -> "Language":
"""Create an nlp object from a config. Expects the full config file including
a section "nlp" containing the settings for the nlp object.
config (Dict[str, Any] / Config): The full config dict or Config, including
an "nlp" section with the pipeline settings.
vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
a new Vocab object will be created.
disable (Iterable[str]): Names of pipeline components to disable. Disabled
pipes will be loaded but they won't be run unless you explicitly
enable them by calling nlp.enable_pipe.
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
components won't be loaded.
auto_fill (bool): Whether to auto-fill config with missing defaults.
validate (bool): Whether to show config validation errors.
RETURNS (Language): The loaded nlp object.
"""
if "nlp" not in config:
raise ValueError(Errors.E985.format(config=config))
nlp_config = config["nlp"]
if "lang" not in nlp_config or nlp_config["lang"] is None:
raise ValueError(Errors.E993.format(config=nlp_config))
# This will automatically handle all codes registered via the languages
# registry, including custom subclasses provided via entry points
lang_cls = get_lang_class(nlp_config["lang"])
nlp = lang_cls.from_config(
config,
vocab=vocab,
disable=disable,
exclude=exclude,
auto_fill=auto_fill,
validate=validate,
)
return nlp
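# Minimal sketch of creating a pipeline from a config dict; assumes that
# auto_fill=True covers the remaining defaults, so only the language and an
# empty pipeline are given here (similar in spirit to spacy.blank).
def _example_nlp_from_config() -> "Language":
    config = {"nlp": {"lang": "en", "pipeline": []}}
    return load_model_from_config(config, auto_fill=True)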
def get_sourced_components(
config: Union[Dict[str, Any], Config]
) -> Dict[str, Dict[str, Any]]:
"""RETURNS (List[str]): All sourced components in the original config,
e.g. {"source": "en_core_web_sm"}. If the config contains a key
"factory", we assume it refers to a component factory.
"""
return {
name: cfg
for name, cfg in config.get("components", {}).items()
if "factory" not in cfg and "source" in cfg
}
def resolve_dot_names(
config: Config, dot_names: List[Optional[str]]
) -> Tuple[Any, ...]:
"""Resolve one or more "dot notation" names, e.g. corpora.train.
The paths could point anywhere into the config, so we don't know which
top-level section we'll be looking within.
We resolve the whole top-level section, although we could resolve less --
we could find the lowest part of the tree.
"""
# TODO: include schema?
resolved = {}
output: List[Any] = []
errors = []
for name in dot_names:
if name is None:
output.append(name)
else:
section = name.split(".")[0]
# We want to avoid resolving the same thing twice
if section not in resolved:
if registry.is_promise(config[section]):
# Otherwise we can't resolve [corpus] if it's a promise
result = registry.resolve({"config": config[section]})["config"]
else:
result = registry.resolve(config[section])
resolved[section] = result
try:
output.append(dot_to_object(resolved, name)) # type: ignore[arg-type]
except KeyError:
msg = f"not a valid section reference: {name}"
errors.append({"loc": name.split("."), "msg": msg})
if errors:
raise ConfigValidationError(config=config, errors=errors)
return tuple(output)
def load_model_from_init_py(
init_file: Union[Path, str],
*,
vocab: Union["Vocab", bool] = True,
disable: Iterable[str] = SimpleFrozenList(),
exclude: Iterable[str] = SimpleFrozenList(),
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
) -> "Language":
"""Helper function to use in the `load()` method of a model package's
__init__.py.
vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
a new Vocab object will be created.
disable (Iterable[str]): Names of pipeline components to disable. Disabled
pipes will be loaded but they won't be run unless you explicitly
enable them by calling nlp.enable_pipe.
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
components won't be loaded.
config (Dict[str, Any] / Config): Config overrides as nested dict or dict
keyed by section values in dot notation.
RETURNS (Language): The loaded nlp object.
"""
model_path = Path(init_file).parent
meta = get_model_meta(model_path)
data_dir = f"{meta['lang']}_{meta['name']}-{meta['version']}"
data_path = model_path / data_dir
if not model_path.exists():
raise IOError(Errors.E052.format(path=data_path))
return load_model_from_path(
data_path,
vocab=vocab,
meta=meta,
disable=disable,
exclude=exclude,
config=config,
)
def load_config(
path: Union[str, Path],
overrides: Dict[str, Any] = SimpleFrozenDict(),
interpolate: bool = False,
) -> Config:
"""Load a config file. Takes care of path validation and section order.
path (Union[str, Path]): Path to the config file or "-" to read from stdin.
overrides: (Dict[str, Any]): Config overrides as nested dict or
dict keyed by section values in dot notation.
interpolate (bool): Whether to interpolate and resolve variables.
RETURNS (Config): The loaded config.
"""
config_path = ensure_path(path)
config = Config(section_order=CONFIG_SECTION_ORDER)
if str(config_path) == "-": # read from standard input
return config.from_str(
sys.stdin.read(), overrides=overrides, interpolate=interpolate
)
else:
if not config_path or not config_path.exists() or not config_path.is_file():
raise IOError(Errors.E053.format(path=config_path, name="config.cfg"))
return config.from_disk(
config_path, overrides=overrides, interpolate=interpolate
)
def load_config_from_str(
text: str, overrides: Dict[str, Any] = SimpleFrozenDict(), interpolate: bool = False
):
"""Load a full config from a string. Wrapper around Thinc's Config.from_str.
text (str): The string config to load.
overrides (Dict[str, Any]): Config overrides as nested dict or dict keyed
by section values in dot notation.
interpolate (bool): Whether to interpolate and resolve variables.
RETURNS (Config): The loaded config.
"""
return Config(section_order=CONFIG_SECTION_ORDER).from_str(
text, overrides=overrides, interpolate=interpolate
)
def get_installed_models() -> List[str]:
"""List all model packages currently installed in the environment.
RETURNS (List[str]): The string names of the models.
"""
return list(registry.models.get_all().keys())
def get_package_version(name: str) -> Optional[str]:
"""Get the version of an installed package. Typically used to get model
package versions.
name (str): The name of the installed Python package.
RETURNS (str / None): The version or None if package not installed.
"""
try:
return importlib_metadata.version(name) # type: ignore[attr-defined]
except importlib_metadata.PackageNotFoundError: # type: ignore[attr-defined]
return None
def is_compatible_version(
version: str, constraint: str, prereleases: bool = True
) -> Optional[bool]:
"""Check if a version (e.g. "2.0.0") is compatible given a version
constraint (e.g. ">=1.9.0,<2.2.1"). If the constraint is a specific version,
it's interpreted as =={version}.
version (str): The version to check.
constraint (str): The constraint string.
prereleases (bool): Whether to allow prereleases. If set to False,
prerelease versions will be considered incompatible.
RETURNS (bool / None): Whether the version is compatible, or None if the
version or constraint are invalid.
"""
# Handle cases where exact version is provided as constraint
if constraint[0].isdigit():
constraint = f"=={constraint}"
try:
spec = SpecifierSet(constraint)
version = Version(version) # type: ignore[assignment]
except (InvalidSpecifier, InvalidVersion):
return None
spec.prereleases = prereleases
return version in spec
def is_unconstrained_version(
constraint: str, prereleases: bool = True
) -> Optional[bool]:
# An exact version pin is the most constrained case
if constraint[0].isdigit():
return False
try:
spec = SpecifierSet(constraint)
except InvalidSpecifier:
return None
spec.prereleases = prereleases
specs = [sp for sp in spec]
# We only have one version spec and it defines > or >=
if len(specs) == 1 and specs[0].operator in (">", ">="):
return True
# One specifier is exact version
if any(sp.operator == "==" for sp in specs):
return False
has_upper = any(sp.operator in ("<", "<=") for sp in specs)
has_lower = any(sp.operator in (">", ">=") for sp in specs)
# We have a version spec that defines an upper and lower bound
if has_upper and has_lower:
return False
# Everything else, like only an upper version, only a lower version etc.
return True
def split_requirement(requirement: str) -> Tuple[str, str]:
"""Split a requirement like spacy>=1.2.3 into ("spacy", ">=1.2.3")."""
req = Requirement(requirement)
return (req.name, str(req.specifier))
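# Version-constraint sketches for the helpers above; the version strings are
# illustrative.
def _example_version_constraints() -> None:
    assert is_compatible_version("3.2.0", ">=3.0.0,<4.0.0") is True
    assert is_compatible_version("3.2.0", "3.2.0") is True   # exact pin -> ==3.2.0
    assert is_compatible_version("not-a-version", ">=3.0.0") is None
    assert is_unconstrained_version(">=3.0.0") is True        # no upper bound
    assert is_unconstrained_version(">=3.0.0,<3.1.0") is False
    assert split_requirement("spacy>=3.2.0") == ("spacy", ">=3.2.0")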
def get_minor_version_range(version: str) -> str:
"""Generate a version range like >=1.2.3,<1.3.0 based on a given version
(e.g. of spaCy).
"""
release = Version(version).release
return f">={version},<{release[0]}.{release[1] + 1}.0"
def get_model_lower_version(constraint: str) -> Optional[str]:
"""From a version range like >=1.2.3,<1.3.0 return the lower pin."""
try:
specset = SpecifierSet(constraint)
for spec in specset:
if spec.operator in (">=", "==", "~="):
return spec.version
except Exception:
pass
return None
def get_base_version(version: str) -> str:
"""Generate the base version without any prerelease identifiers.
version (str): The version, e.g. "3.0.0.dev1".
RETURNS (str): The base version, e.g. "3.0.0".
"""
return Version(version).base_version
def get_minor_version(version: str) -> Optional[str]:
"""Get the major + minor version (without patch or prerelease identifiers).
version (str): The version.
RETURNS (str): The major + minor version or None if version is invalid.
"""
try:
v = Version(version)
except (TypeError, InvalidVersion):
return None
return f"{v.major}.{v.minor}"
def is_minor_version_match(version_a: str, version_b: str) -> bool:
"""Compare two versions and check if they match in major and minor, without
patch or prerelease identifiers. Used internally for compatibility checks
that should be insensitive to patch releases.
version_a (str): The first version
version_b (str): The second version.
RETURNS (bool): Whether the versions match.
"""
a = get_minor_version(version_a)
b = get_minor_version(version_b)
return a is not None and b is not None and a == b
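# Sketch of the minor-version helpers above; version strings are illustrative.
def _example_minor_versions() -> None:
    assert get_minor_version_range("3.2.0") == ">=3.2.0,<3.3.0"
    assert get_model_lower_version(">=3.2.0,<3.3.0") == "3.2.0"
    assert get_base_version("3.2.0.dev1") == "3.2.0"
    assert get_minor_version("3.2.1") == "3.2"
    assert is_minor_version_match("3.2.1", "3.2.4") is True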
def load_meta(path: Union[str, Path]) -> Dict[str, Any]:
"""Load a model meta.json from a path and validate its contents.
path (Union[str, Path]): Path to meta.json.
RETURNS (Dict[str, Any]): The loaded meta.
"""
path = ensure_path(path)
if not path.parent.exists():
raise IOError(Errors.E052.format(path=path.parent))
if not path.exists() or not path.is_file():
raise IOError(Errors.E053.format(path=path.parent, name="meta.json"))
meta = srsly.read_json(path)
for setting in ["lang", "name", "version"]:
if setting not in meta or not meta[setting]:
raise ValueError(Errors.E054.format(setting=setting))
if "spacy_version" in meta:
if not is_compatible_version(about.__version__, meta["spacy_version"]):
lower_version = get_model_lower_version(meta["spacy_version"])
lower_version = get_minor_version(lower_version) # type: ignore[arg-type]
if lower_version is not None:
lower_version = "v" + lower_version
elif "spacy_git_version" in meta:
lower_version = "git commit " + meta["spacy_git_version"]
else:
lower_version = "version unknown"
warn_msg = Warnings.W095.format(
model=f"{meta['lang']}_{meta['name']}",
model_version=meta["version"],
version=lower_version,
current=about.__version__,
)
warnings.warn(warn_msg)
if is_unconstrained_version(meta["spacy_version"]):
warn_msg = Warnings.W094.format(
model=f"{meta['lang']}_{meta['name']}",
model_version=meta["version"],
version=meta["spacy_version"],
example=get_minor_version_range(about.__version__),
)
warnings.warn(warn_msg)
return meta
def get_model_meta(path: Union[str, Path]) -> Dict[str, Any]:
"""Get model meta.json from a directory path and validate its contents.
path (str / Path): Path to model directory.
RETURNS (Dict[str, Any]): The model's meta data.
"""
model_path = ensure_path(path)
return load_meta(model_path / "meta.json")
def is_package(name: str) -> bool:
"""Check if string maps to a package installed via pip.
name (str): Name of package.
RETURNS (bool): True if installed package, False if not.
"""
try:
importlib_metadata.distribution(name) # type: ignore[attr-defined]
return True
except: # noqa: E722
return False
def get_package_path(name: str) -> Path:
"""Get the path to an installed package.
name (str): Package name.
RETURNS (Path): Path to installed package.
"""
name = name.lower() # use lowercase version to be safe
# Here we're importing the module just to find it. This is worryingly
# indirect, but it's otherwise very difficult to find the package.
pkg = importlib.import_module(name)
return Path(pkg.__file__).parent
def replace_model_node(model: Model, target: Model, replacement: Model) -> None:
"""Replace a node within a model with a new one, updating refs.
model (Model): The parent model.
target (Model): The target node.
replacement (Model): The node to replace the target with.
"""
# Place the node into the sublayers
for node in model.walk():
if target in node.layers:
node.layers[node.layers.index(target)] = replacement
# Now fix any node references
for node in model.walk():
for ref_name in node.ref_names:
if node.maybe_get_ref(ref_name) is target:
node.set_ref(ref_name, replacement)
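# Sketch of swapping a node inside a composed Thinc model; the layer sizes
# and layer choice are illustrative.
def _example_replace_model_node() -> None:
    from thinc.api import Linear, chain

    old_node = Linear(2, 2)
    new_node = Linear(2, 2)
    model = chain(Linear(2, 2), old_node)
    replace_model_node(model, old_node, new_node)
    assert new_node in model.layers and old_node not in model.layers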
def split_command(command: str) -> List[str]:
"""Split a string command using shlex. Handles platform compatibility.
command (str) : The command to split
RETURNS (List[str]): The split command.
"""
return shlex.split(command, posix=not is_windows)
def join_command(command: List[str]) -> str:
"""Join a command using shlex. shlex.join is only available for Python 3.8+,
so we're using a workaround here.
command (List[str]): The command to join.
RETURNS (str): The joined command
"""
return " ".join(shlex.quote(cmd) for cmd in command)
def run_command(
command: Union[str, List[str]],
*,
stdin: Optional[Any] = None,
capture: bool = False,
) -> subprocess.CompletedProcess:
"""Run a command on the command line as a subprocess. If the subprocess
returns a non-zero exit code, a system exit is performed.
command (str / List[str]): The command. If provided as a string, the
string will be split using shlex.split.
stdin (Optional[Any]): stdin to read from or None.
capture (bool): Whether to capture the output and errors. If False,
the stdout and stderr will not be redirected, and if there's an error,
sys.exit will be called with the return code. You should use capture=False
when you want to turn over execution to the command, and capture=True
when you want to run the command more like a function.
RETURNS (Optional[CompletedProcess]): The process object.
"""
if isinstance(command, str):
cmd_list = split_command(command)
cmd_str = command
else:
cmd_list = command
cmd_str = " ".join(command)
try:
ret = subprocess.run(
cmd_list,
env=os.environ.copy(),
input=stdin,
encoding="utf8",
check=False,
stdout=subprocess.PIPE if capture else None,
stderr=subprocess.STDOUT if capture else None,
)
except FileNotFoundError:
# Indicates the *command* wasn't found, it's an error before the command
# is run.
raise FileNotFoundError(
Errors.E970.format(str_command=cmd_str, tool=cmd_list[0])
) from None
if ret.returncode != 0 and capture:
message = f"Error running command:\n\n{cmd_str}\n\n"
message += f"Subprocess exited with status {ret.returncode}"
if ret.stdout is not None:
message += f"\n\nProcess log (stdout and stderr):\n\n"
message += ret.stdout
error = subprocess.SubprocessError(message)
error.ret = ret # type: ignore[attr-defined]
error.command = cmd_str # type: ignore[attr-defined]
raise error
elif ret.returncode != 0:
sys.exit(ret.returncode)
return ret
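# Usage sketch: with capture=True the output is available on ret.stdout and a
# non-zero exit code raises instead of calling sys.exit. The command is a
# harmless illustration.
def _example_run_command() -> subprocess.CompletedProcess:
    return run_command("python --version", capture=True)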
@contextmanager
def working_dir(path: Union[str, Path]) -> Iterator[Path]:
"""Change current working directory and returns to previous on exit.
path (str / Path): The directory to navigate to.
YIELDS (Path): The absolute path to the current working directory. This
should be used if the block needs to perform actions within the working
directory, to prevent mismatches with relative paths.
"""
prev_cwd = Path.cwd()
current = Path(path).resolve()
os.chdir(str(current))
try:
yield current
finally:
os.chdir(str(prev_cwd))
@contextmanager
def make_tempdir() -> Generator[Path, None, None]:
"""Execute a block in a temporary directory and remove the directory and
its contents at the end of the with block.
YIELDS (Path): The path of the temp directory.
"""
d = Path(tempfile.mkdtemp())
yield d
try:
shutil.rmtree(str(d))
except PermissionError as e:
warnings.warn(Warnings.W091.format(dir=d, msg=e))
def is_cwd(path: Union[Path, str]) -> bool:
"""Check whether a path is the current working directory.
path (Union[Path, str]): The directory path.
RETURNS (bool): Whether the path is the current working directory.
"""
return str(Path(path).resolve()).lower() == str(Path.cwd().resolve()).lower()
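# Sketch combining the directory helpers above: run a block inside a fresh
# temporary directory and restore the previous working directory afterwards.
def _example_directory_helpers() -> None:
    with make_tempdir() as tmp_dir:
        with working_dir(tmp_dir) as cwd:
            assert is_cwd(cwd)
            (cwd / "output.txt").write_text("example", encoding="utf8")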
def is_in_jupyter() -> bool:
"""Check if user is running spaCy from a Jupyter notebook by detecting the
IPython kernel. Mainly used for the displaCy visualizer.
RETURNS (bool): True if in Jupyter, False if not.
"""
# https://stackoverflow.com/a/39662359/6400719
try:
shell = get_ipython().__class__.__name__ # type: ignore[name-defined]
if shell == "ZMQInteractiveShell":
return True # Jupyter notebook or qtconsole
except NameError:
return False # Probably standard Python interpreter
return False
def get_object_name(obj: Any) -> str:
"""Get a human-readable name of a Python object, e.g. a pipeline component.
obj (Any): The Python object, typically a function or class.
RETURNS (str): A human-readable name.
"""
if hasattr(obj, "name") and obj.name is not None:
return obj.name
if hasattr(obj, "__name__"):
return obj.__name__
if hasattr(obj, "__class__") and hasattr(obj.__class__, "__name__"):
return obj.__class__.__name__
return repr(obj)
def is_same_func(func1: Callable, func2: Callable) -> bool:
"""Approximately decide whether two functions are the same, even if their
identity is different (e.g. after they have been live reloaded). Mostly
used in the @Language.component and @Language.factory decorators to decide
whether to raise if a factory already exists. Allows decorator to run
multiple times with the same function.
func1 (Callable): The first function.
func2 (Callable): The second function.
RETURNS (bool): Whether it's the same function (most likely).
"""
if not callable(func1) or not callable(func2):
return False
if not hasattr(func1, "__qualname__") or not hasattr(func2, "__qualname__"):
return False
same_name = func1.__qualname__ == func2.__qualname__
same_file = inspect.getfile(func1) == inspect.getfile(func2)
same_code = inspect.getsourcelines(func1) == inspect.getsourcelines(func2)
return same_name and same_file and same_code
def get_cuda_stream(
require: bool = False, non_blocking: bool = True
) -> Optional[CudaStream]:
ops = get_current_ops()
if CudaStream is None:
return None
elif isinstance(ops, NumpyOps):
return None
else:
return CudaStream(non_blocking=non_blocking)
def get_async(stream, numpy_array):
if cupy is None:
return numpy_array
else:
array = cupy.ndarray(numpy_array.shape, order="C", dtype=numpy_array.dtype)
array.set(numpy_array, stream=stream)
return array
def read_regex(path: Union[str, Path]) -> Pattern:
"""Read a file of newline-delimited regex pieces and compile them into a
single anchored regex.
path (str / Path): Path to the file.
RETURNS (Pattern): The compiled regex.
"""
path = ensure_path(path)
with path.open(encoding="utf8") as file_:
entries = file_.read().split("\n")
expression = "|".join(
["^" + re.escape(piece) for piece in entries if piece.strip()]
)
return re.compile(expression)
def compile_prefix_regex(entries: Iterable[Union[str, Pattern]]) -> Pattern:
"""Compile a sequence of prefix rules into a regex object.
entries (Iterable[Union[str, Pattern]]): The prefix rules, e.g.
spacy.lang.punctuation.TOKENIZER_PREFIXES.
RETURNS (Pattern): The regex object to be used for Tokenizer.prefix_search.
"""
expression = "|".join(["^" + piece for piece in entries if piece.strip()]) # type: ignore[operator, union-attr]
return re.compile(expression)
def compile_suffix_regex(entries: Iterable[Union[str, Pattern]]) -> Pattern:
"""Compile a sequence of suffix rules into a regex object.
entries (Iterable[Union[str, Pattern]]): The suffix rules, e.g.
spacy.lang.punctuation.TOKENIZER_SUFFIXES.
RETURNS (Pattern): The regex object to be used for Tokenizer.suffix_search.
"""
expression = "|".join([piece + "$" for piece in entries if piece.strip()]) # type: ignore[operator, union-attr]
return re.compile(expression)
def compile_infix_regex(entries: Iterable[Union[str, Pattern]]) -> Pattern:
"""Compile a sequence of infix rules into a regex object.
entries (Iterable[Union[str, Pattern]]): The infix rules, e.g.
spacy.lang.punctuation.TOKENIZER_INFIXES.
RETURNS (Pattern): The regex object to be used for Tokenizer.infix_finditer.
"""
expression = "|".join([piece for piece in entries if piece.strip()]) # type: ignore[misc, union-attr]
return re.compile(expression)
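# Sketch of compiling tokenizer punctuation rules; the rules below are small
# illustrative stand-ins for spacy.lang.punctuation.
def _example_compile_punct_rules() -> None:
    prefix_re = compile_prefix_regex([r"\(", '"'])
    suffix_re = compile_suffix_regex([r"\)", "!", '"'])
    infix_re = compile_infix_regex(["--", r"(?<=[0-9])[+\-\*^](?=[0-9])"])
    assert prefix_re.search('"hello') is not None
    assert suffix_re.search("world!") is not None
    assert infix_re.search("2+2") is not None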
def add_lookups(default_func: Callable[[str], Any], *lookups) -> Callable[[str], Any]:
"""Extend an attribute function with special cases. If a word is in the
lookups, the value is returned. Otherwise the previous function is used.
default_func (callable): The default function to execute.
*lookups (dict): Lookup dictionary mapping string to attribute value.
RETURNS (callable): Lexical attribute getter.
"""
# This is implemented as functools.partial instead of a closure, to allow
# pickle to work.
return functools.partial(_get_attr_unless_lookup, default_func, lookups)
def _get_attr_unless_lookup(
default_func: Callable[[str], Any], lookups: Dict[str, Any], string: str
) -> Any:
for lookup in lookups:
if string in lookup:
return lookup[string] # type: ignore[index]
return default_func(string)
def update_exc(
base_exceptions: Dict[str, List[dict]], *addition_dicts
) -> Dict[str, List[dict]]:
"""Update and validate tokenizer exceptions. Will overwrite exceptions.
base_exceptions (Dict[str, List[dict]]): Base exceptions.
*addition_dicts (Dict[str, List[dict]]): Exceptions to add to the base dict, in order.
RETURNS (Dict[str, List[dict]]): Combined tokenizer exceptions.
"""
exc = dict(base_exceptions)
for additions in addition_dicts:
for orth, token_attrs in additions.items():
if not all(isinstance(attr[ORTH], str) for attr in token_attrs):
raise ValueError(Errors.E055.format(key=orth, orths=token_attrs))
described_orth = "".join(attr[ORTH] for attr in token_attrs)
if orth != described_orth:
raise ValueError(Errors.E056.format(key=orth, orths=described_orth))
exc.update(additions)
exc = expand_exc(exc, "'", "’")
return exc
def expand_exc(
excs: Dict[str, List[dict]], search: str, replace: str
) -> Dict[str, List[dict]]:
"""Find string in tokenizer exceptions, duplicate entry and replace string.
For example, to add additional versions with typographic apostrophes.
excs (Dict[str, List[dict]]): Tokenizer exceptions.
search (str): String to find and replace.
replace (str): Replacement.
RETURNS (Dict[str, List[dict]]): Combined tokenizer exceptions.
"""
def _fix_token(token, search, replace):
fixed = dict(token)
fixed[ORTH] = fixed[ORTH].replace(search, replace)
return fixed
new_excs = dict(excs)
for token_string, tokens in excs.items():
if search in token_string:
new_key = token_string.replace(search, replace)
new_value = [_fix_token(t, search, replace) for t in tokens]
new_excs[new_key] = new_value
return new_excs
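# Sketch of combining tokenizer exceptions; expand_exc (called by update_exc)
# also duplicates entries using the typographic apostrophe.
def _example_tokenizer_exceptions() -> None:
    base = {"a.": [{ORTH: "a."}]}
    additions = {"don't": [{ORTH: "do"}, {ORTH: "n't"}]}
    exc = update_exc(base, additions)
    assert "don't" in exc and "don’t" in exc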
def normalize_slice(
length: int, start: int, stop: int, step: Optional[int] = None
) -> Tuple[int, int]:
if not (step is None or step == 1):
raise ValueError(Errors.E057)
if start is None:
start = 0
elif start < 0:
start += length
start = min(length, max(0, start))
if stop is None:
stop = length
elif stop < 0:
stop += length
stop = min(length, max(start, stop))
return start, stop
def filter_spans(spans: Iterable["Span"]) -> List["Span"]:
"""Filter a sequence of spans and remove duplicates or overlaps. Useful for
creating named entities (where one token can only be part of one entity) or
when merging spans with `Retokenizer.merge`. When spans overlap, the (first)
longest span is preferred over shorter spans.
spans (Iterable[Span]): The spans to filter.
RETURNS (List[Span]): The filtered spans.
"""
get_sort_key = lambda span: (span.end - span.start, -span.start)
sorted_spans = sorted(spans, key=get_sort_key, reverse=True)
result = []
seen_tokens: Set[int] = set()
for span in sorted_spans:
# Check for end - 1 here because boundaries are inclusive
if span.start not in seen_tokens and span.end - 1 not in seen_tokens:
result.append(span)
seen_tokens.update(range(span.start, span.end))
result = sorted(result, key=lambda span: span.start)
return result
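# Sketch of filtering overlapping spans; assumes spacy is importable so a
# blank English pipeline can produce a Doc.
def _example_filter_spans() -> List["Span"]:
    import spacy

    doc = spacy.blank("en")("New York City is in New York State")
    spans = [doc[0:3], doc[0:2], doc[5:7]]
    # keeps the longest span for each overlap: doc[0:3] and doc[5:7]
    return filter_spans(spans)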
def to_bytes(getters: Dict[str, Callable[[], bytes]], exclude: Iterable[str]) -> bytes:
return srsly.msgpack_dumps(to_dict(getters, exclude))
def from_bytes(
bytes_data: bytes,
setters: Dict[str, Callable[[bytes], Any]],
exclude: Iterable[str],
) -> None:
return from_dict(srsly.msgpack_loads(bytes_data), setters, exclude) # type: ignore[return-value]
def to_dict(
getters: Dict[str, Callable[[], Any]], exclude: Iterable[str]
) -> Dict[str, Any]:
serialized = {}
for key, getter in getters.items():
# Split to support file names like meta.json
if key.split(".")[0] not in exclude:
serialized[key] = getter()
return serialized
def from_dict(
msg: Dict[str, Any],
setters: Dict[str, Callable[[Any], Any]],
exclude: Iterable[str],
) -> Dict[str, Any]:
for key, setter in setters.items():
# Split to support file names like meta.json
if key.split(".")[0] not in exclude and key in msg:
setter(msg[key])
return msg
def to_disk(
path: Union[str, Path],
writers: Dict[str, Callable[[Path], None]],
exclude: Iterable[str],
) -> Path:
path = ensure_path(path)
if not path.exists():
path.mkdir()
for key, writer in writers.items():
# Split to support file names like meta.json
if key.split(".")[0] not in exclude:
writer(path / key)
return path
def from_disk(
path: Union[str, Path],
readers: Dict[str, Callable[[Path], None]],
exclude: Iterable[str],
) -> Path:
path = ensure_path(path)
for key, reader in readers.items():
# Split to support file names like meta.json
if key.split(".")[0] not in exclude:
reader(path / key)
return path
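# Sketch of the byte-serialization helpers above: getters produce bytes per
# named section, and excluded sections are skipped.
def _example_serialization_helpers() -> None:
    getters = {"data": lambda: b"payload", "skipped": lambda: b"ignored"}
    msg = srsly.msgpack_loads(to_bytes(getters, exclude=["skipped"]))
    assert msg == {"data": b"payload"}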
def import_file(name: str, loc: Union[str, Path]) -> ModuleType:
"""Import module from a file. Used to load models from a directory.
name (str): Name of module to load.
loc (str / Path): Path to the file.
RETURNS: The loaded module.
"""
spec = importlib.util.spec_from_file_location(name, str(loc))
module = importlib.util.module_from_spec(spec) # type: ignore[arg-type]
spec.loader.exec_module(module) # type: ignore[union-attr]
return module
def minify_html(html: str) -> str:
"""Perform a template-specific, rudimentary HTML minification for displaCy.
Disclaimer: NOT a general-purpose solution, only removes indentation and
newlines.
html (str): Markup to minify.
RETURNS (str): "Minified" HTML.
"""
return html.strip().replace("    ", "").replace("\n", "")
def escape_html(text: str) -> str:
"""Replace <, >, &, " with their HTML encoded representation. Intended to
prevent HTML errors in rendered displaCy markup.
text (str): The original text.
RETURNS (str): Equivalent text to be safely used within HTML.
"""
text = text.replace("&", "&amp;")
text = text.replace("<", "&lt;")
text = text.replace(">", "&gt;")
text = text.replace('"', "&quot;")
return text
def get_words_and_spaces(
words: Iterable[str], text: str
) -> Tuple[List[str], List[bool]]:
"""Given a list of words and a text, reconstruct the original tokens and
return a list of words and spaces that can be used to create a Doc. This
can help recover destructive tokenization that didn't preserve any
whitespace information.
words (Iterable[str]): The words.
text (str): The original text.
RETURNS (Tuple[List[str], List[bool]]): The words and spaces.
"""
if "".join("".join(words).split()) != "".join(text.split()):
raise ValueError(Errors.E194.format(text=text, words=words))
text_words = []
text_spaces = []
text_pos = 0
# normalize words to remove all whitespace tokens
norm_words = [word for word in words if not word.isspace()]
# align words with text
for word in norm_words:
try:
word_start = text[text_pos:].index(word)
except ValueError:
raise ValueError(Errors.E194.format(text=text, words=words)) from None
if word_start > 0:
text_words.append(text[text_pos : text_pos + word_start])
text_spaces.append(False)
text_pos += word_start
text_words.append(word)
text_spaces.append(False)
text_pos += len(word)
if text_pos < len(text) and text[text_pos] == " ":
text_spaces[-1] = True
text_pos += 1
if text_pos < len(text):
text_words.append(text[text_pos:])
text_spaces.append(False)
return (text_words, text_spaces)
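# Sketch of reconstructing Doc words/spaces from a destructive tokenization.
def _example_words_and_spaces() -> None:
    words, spaces = get_words_and_spaces(["hello", "world", "!"], "hello world!")
    assert words == ["hello", "world", "!"]
    assert spaces == [True, False, False]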
def copy_config(config: Union[Dict[str, Any], Config]) -> Config:
"""Deep copy a Config. Will raise an error if the config contents are not
JSON-serializable.
config (Config): The config to copy.
RETURNS (Config): The copied config.
"""
try:
return Config(config).copy()
except ValueError:
raise ValueError(Errors.E961.format(config=config)) from None
def dot_to_dict(values: Dict[str, Any]) -> Dict[str, dict]:
"""Convert dot notation to a dict. For example: {"token.pos": True,
"token._.xyz": True} becomes {"token": {"pos": True, "_": {"xyz": True }}}.
values (Dict[str, Any]): The key/value pairs to convert.
RETURNS (Dict[str, dict]): The converted values.
"""
result: Dict[str, dict] = {}
for key, value in values.items():
path = result
parts = key.lower().split(".")
for i, item in enumerate(parts):
is_last = i == len(parts) - 1
path = path.setdefault(item, value if is_last else {})
return result
def dict_to_dot(obj: Dict[str, dict]) -> Dict[str, Any]:
"""Convert dot notation to a dict. For example: {"token": {"pos": True,
"_": {"xyz": True }}} becomes {"token.pos": True, "token._.xyz": True}.
values (Dict[str, dict]): The dict to convert.
RETURNS (Dict[str, Any]): The key/value pairs.
"""
return {".".join(key): value for key, value in walk_dict(obj)}
def dot_to_object(config: Config, section: str):
"""Convert dot notation of a "section" to a specific part of the Config.
e.g. "training.optimizer" would return the Optimizer object.
Throws an error if the section is not defined in this config.
config (Config): The config.
section (str): The dot notation of the section in the config.
RETURNS: The object denoted by the section
"""
component = config
parts = section.split(".")
for item in parts:
try:
component = component[item]
except (KeyError, TypeError):
raise KeyError(Errors.E952.format(name=section)) from None
return component
def set_dot_to_object(config: Config, section: str, value: Any) -> None:
"""Update a config at a given position from a dot notation.
config (Config): The config.
section (str): The dot notation of the section in the config.
value (Any): The value to set in the config.
"""
component = config
parts = section.split(".")
for i, item in enumerate(parts):
try:
if i == len(parts) - 1:
component[item] = value
else:
component = component[item]
except (KeyError, TypeError):
raise KeyError(Errors.E952.format(name=section)) from None
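# Sketch of reading and updating a nested section by dot notation; a plain
# dict stands in for a Config here.
def _example_dot_access() -> None:
    cfg = {"training": {"optimizer": {"learn_rate": 0.001}}}
    assert dot_to_object(cfg, "training.optimizer.learn_rate") == 0.001
    set_dot_to_object(cfg, "training.optimizer.learn_rate", 0.01)
    assert cfg["training"]["optimizer"]["learn_rate"] == 0.01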
def walk_dict(
node: Dict[str, Any], parent: List[str] = []
) -> Iterator[Tuple[List[str], Any]]:
"""Walk a dict and yield the path and values of the leaves."""
for key, value in node.items():
key_parent = [*parent, key]
if isinstance(value, dict):
yield from walk_dict(value, key_parent)
else:
yield (key_parent, value)
def get_arg_names(func: Callable) -> List[str]:
"""Get a list of all named arguments of a function (regular,
keyword-only).
func (Callable): The function
RETURNS (List[str]): The argument names.
"""
argspec = inspect.getfullargspec(func)
return list(dict.fromkeys([*argspec.args, *argspec.kwonlyargs]))
def combine_score_weights(
weights: List[Dict[str, Optional[float]]],
overrides: Dict[str, Optional[float]] = SimpleFrozenDict(),
) -> Dict[str, Optional[float]]:
"""Combine and normalize score weights defined by components, e.g.
{"ents_r": 0.2, "ents_p": 0.3, "ents_f": 0.5} and {"some_other_score": 1.0}.
weights (List[dict]): The weights defined by the components.
overrides (Dict[str, Optional[Union[float, int]]]): Existing scores that
should be preserved.
RETURNS (Dict[str, float]): The combined and normalized weights.
"""
# We divide each weight by the total weight sum.
# We first need to extract all None/null values for score weights that
# shouldn't be shown in the table *or* be weighted
result: Dict[str, Optional[float]] = {
key: value for w_dict in weights for (key, value) in w_dict.items()
}
result.update(overrides)
weight_sum = sum([v if v else 0.0 for v in result.values()])
for key, value in result.items():
if value and weight_sum > 0:
result[key] = round(value / weight_sum, 2)
return result
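# Sketch of combining per-component score weights; None values are kept but
# never weighted.
def _example_combine_score_weights() -> None:
    weights = [{"ents_f": 0.5, "ents_p": 0.25, "ents_r": 0.25}, {"speed": None}]
    combined = combine_score_weights(weights)
    assert combined == {"ents_f": 0.5, "ents_p": 0.25, "ents_r": 0.25, "speed": None}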
class DummyTokenizer:
def __call__(self, text):
raise NotImplementedError
def pipe(self, texts, **kwargs):
for text in texts:
yield self(text)
# add dummy methods for to_bytes, from_bytes, to_disk and from_disk to
# allow serialization (see #1557)
def to_bytes(self, **kwargs):
return b""
def from_bytes(self, data: bytes, **kwargs) -> "DummyTokenizer":
return self
def to_disk(self, path: Union[str, Path], **kwargs) -> None:
return None
def from_disk(self, path: Union[str, Path], **kwargs) -> "DummyTokenizer":
return self
def create_default_optimizer() -> Optimizer:
return Adam()
def minibatch(items, size):
"""Iterate over batches of items. `size` may be an iterator,
so that batch-size can vary on each step.
"""
if isinstance(size, int):
size_ = itertools.repeat(size)
else:
size_ = size
items = iter(items)
while True:
batch_size = next(size_)
batch = list(itertools.islice(items, int(batch_size)))
if len(batch) == 0:
break
yield list(batch)
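# Sketch of batching with a fixed size; `size` may also be an iterator such
# as a thinc compounding schedule.
def _example_minibatch() -> None:
    assert list(minibatch(range(5), size=2)) == [[0, 1], [2, 3], [4]]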
def is_cython_func(func: Callable) -> bool:
"""Slightly hacky check for whether a callable is implemented in Cython.
Can be used to implement slightly different behaviors, especially around
introspection and parameter annotations. Note that this will only return True
for actual cdef functions and methods, not regular Python functions defined
in Python modules.
func (Callable): The callable to check.
RETURNS (bool): Whether the callable is Cython (probably).
"""
attr = "__pyx_vtable__"
if hasattr(func, attr): # function or class instance
return True
# https://stackoverflow.com/a/55767059
if (
hasattr(func, "__qualname__")
and hasattr(func, "__module__")
and func.__module__ in sys.modules
): # method
cls_func = vars(sys.modules[func.__module__])[func.__qualname__.split(".")[0]]
return hasattr(cls_func, attr)
return False
def check_bool_env_var(env_var: str) -> bool:
"""Convert the value of an environment variable to a boolean. Add special
check for "0" (falsy) and consider everything else truthy, except unset.
env_var (str): The name of the environment variable to check.
RETURNS (bool): Its boolean value.
"""
value = os.environ.get(env_var, False)
if value == "0":
return False
return bool(value)
def _pipe(
docs: Iterable["Doc"],
proc: "Pipe",
name: str,
default_error_handler: Callable[[str, "Pipe", List["Doc"], Exception], NoReturn],
kwargs: Mapping[str, Any],
) -> Iterator["Doc"]:
if hasattr(proc, "pipe"):
yield from proc.pipe(docs, **kwargs)
else:
# We added some args for pipe that __call__ doesn't expect.
kwargs = dict(kwargs)
error_handler = default_error_handler
if hasattr(proc, "get_error_handler"):
error_handler = proc.get_error_handler()
for arg in ["batch_size"]:
if arg in kwargs:
kwargs.pop(arg)
for doc in docs:
try:
doc = proc(doc, **kwargs) # type: ignore[call-arg]
yield doc
except Exception as e:
error_handler(name, proc, [doc], e)
def raise_error(proc_name, proc, docs, e):
raise e
def ignore_error(proc_name, proc, docs, e):
pass
def warn_if_jupyter_cupy():
"""Warn about require_gpu if a jupyter notebook + cupy + mismatched
contextvars vs. thread ops are detected
"""
if is_in_jupyter():
from thinc.backends.cupy_ops import CupyOps
if CupyOps.xp is not None:
from thinc.backends import contextvars_eq_thread_ops
if not contextvars_eq_thread_ops():
warnings.warn(Warnings.W111)
def check_lexeme_norms(vocab, component_name):
lexeme_norms = vocab.lookups.get_table("lexeme_norm", {})
if len(lexeme_norms) == 0 and vocab.lang in LEXEME_NORM_LANGS:
langs = ", ".join(LEXEME_NORM_LANGS)
logger.debug(Warnings.W033.format(model=component_name, langs=langs))
def to_ternary_int(val) -> int:
"""Convert a value to the ternary 1/0/-1 int used for True/None/False in
attributes such as SENT_START: True/1/1.0 is 1 (True), None/0/0.0 is 0
(None), any other values are -1 (False).
"""
if val is True:
return 1
elif val is None:
return 0
elif val is False:
return -1
elif val == 1:
return 1
elif val == 0:
return 0
else:
return -1
# The following implementation of packages_distributions() is adapted from
# importlib_metadata, which is distributed under the Apache 2.0 License.
# Copyright (c) 2017-2019 Jason R. Coombs, Barry Warsaw
# See licenses/3rd_party_licenses.txt
def packages_distributions() -> Dict[str, List[str]]:
"""Return a mapping of top-level packages to their distributions. We're
inlining this helper from the importlib_metadata "backport" here, since
it's not available in the builtin importlib.metadata.
"""
pkg_to_dist = defaultdict(list)
for dist in importlib_metadata.distributions(): # type: ignore[attr-defined]
for pkg in (dist.read_text("top_level.txt") or "").split():
pkg_to_dist[pkg].append(dist.metadata["Name"])
return dict(pkg_to_dist)
cimport numpy as np
from libc.stdint cimport uint32_t
from cython.operator cimport dereference as deref
from libcpp.set cimport set as cppset
from murmurhash.mrmr cimport hash128_x64
import functools
import numpy
from typing import cast
import warnings
from enum import Enum
import srsly
from thinc.api import get_array_module, get_current_ops
from thinc.backends import get_array_ops
from thinc.types import Floats2d
from .strings cimport StringStore
from .strings import get_string_id
from .errors import Errors, Warnings
from . import util
def unpickle_vectors(bytes_data):
return Vectors().from_bytes(bytes_data)
class Mode(str, Enum):
default = "default"
floret = "floret"
@classmethod
def values(cls):
return list(cls.__members__.keys())
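# Sketch of the Mode enum above: it is a str-backed Enum, so members compare
# equal to their string values.
def _example_vectors_mode() -> None:
    assert Mode.default == "default"
    assert Mode.values() == ["default", "floret"]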
cdef class Vectors:
"""Store, save and load word vectors.
Vectors data is kept in the vectors.data attribute, which should be an
instance of numpy.ndarray (for CPU vectors) or cupy.ndarray
(for GPU vectors).
In the default mode, `vectors.key2row` is a dictionary mapping word hashes
to rows in the vectors.data table. Multiple keys can be mapped to the same
vector, and not all of the rows in the table need to be assigned - so
len(list(vectors.keys())) may be greater or smaller than vectors.shape[0].
In floret mode, the floret settings (minn, maxn, etc.) are used to
calculate the vector from the rows corresponding to the key's ngrams.
DOCS: https://spacy.io/api/vectors
"""
cdef public object strings
cdef public object name
cdef readonly object mode
cdef public object data
cdef public object key2row
cdef cppset[int] _unset
cdef readonly uint32_t minn
cdef readonly uint32_t maxn
cdef readonly uint32_t hash_count
cdef readonly uint32_t hash_seed
cdef readonly unicode bow
cdef readonly unicode eow
def __init__(self, *, strings=None, shape=None, data=None, keys=None, name=None, mode=Mode.default, minn=0, maxn=0, hash_count=1, hash_seed=0, bow="<", eow=">"):
"""Create a new vector store.
strings (StringStore): The string store.
shape (tuple): Size of the table, as (# entries, # columns)
data (numpy.ndarray or cupy.ndarray): The vector data.
keys (iterable): A sequence of keys, aligned with the data.
name (str): A name to identify the vectors table.
mode (str): Vectors mode: "default" or "floret" (default: "default").
minn (int): The floret char ngram minn (default: 0).
maxn (int): The floret char ngram maxn (default: 0).
hash_count (int): The floret hash count (1-4, default: 1).
hash_seed (int): The floret hash seed (default: 0).
bow (str): The floret BOW string (default: "<").
eow (str): The floret EOW string (default: ">").
DOCS: https://spacy.io/api/vectors#init
"""
self.strings = strings
if self.strings is None:
self.strings = StringStore()
self.name = name
if mode not in Mode.values():
raise ValueError(
Errors.E202.format(
name="vectors",
mode=mode,
modes=str(Mode.values())
)
)
self.mode = Mode(mode).value
self.key2row = {}
self.minn = minn
self.maxn = maxn
self.hash_count = hash_count
self.hash_seed = hash_seed
self.bow = bow
self.eow = eow
if self.mode == Mode.default:
if data is None:
if shape is None:
shape = (0,0)
ops = get_current_ops()
data = ops.xp.zeros(shape, dtype="f")
self._unset = cppset[int]({i for i in range(data.shape[0])})
else:
self._unset = cppset[int]()
self.data = data
if keys is not None:
for i, key in enumerate(keys):
self.add(key, row=i)
elif self.mode == Mode.floret:
if maxn < minn:
raise ValueError(Errors.E863)
if hash_count < 1 or hash_count >= 5:
raise ValueError(Errors.E862)
if data is None:
raise ValueError(Errors.E864)
if keys is not None:
raise ValueError(Errors.E861)
self.data = data
self._unset = cppset[int]()
@property
def shape(self):
"""Get `(rows, dims)` tuples of number of rows and number of dimensions
in the vector table.
RETURNS (tuple): A `(rows, dims)` pair.
DOCS: https://spacy.io/api/vectors#shape
"""
return self.data.shape
@property
def size(self):
"""The vector size i,e. rows * dims.
RETURNS (int): The vector size.
DOCS: https://spacy.io/api/vectors#size
"""
return self.data.shape[0] * self.data.shape[1]
@property
def is_full(self):
"""Whether the vectors table is full.
RETURNS (bool): `True` if no slots are available for new keys.
DOCS: https://spacy.io/api/vectors#is_full
"""
if self.mode == Mode.floret:
return True
return self._unset.size() == 0
@property
def n_keys(self):
"""Get the number of keys in the table. Note that this is the number
of all keys, not just unique vectors.
RETURNS (int): The number of keys in the table for default vectors.
For floret vectors, return -1.
DOCS: https://spacy.io/api/vectors#n_keys
"""
if self.mode == Mode.floret:
return -1
return len(self.key2row)
def __reduce__(self):
return (unpickle_vectors, (self.to_bytes(),))
def __getitem__(self, key):
"""Get a vector by key. If the key is not found, a KeyError is raised.
key (str/int): The key to get the vector for.
RETURNS (ndarray): The vector for the key.
DOCS: https://spacy.io/api/vectors#getitem
"""
if self.mode == Mode.default:
i = self.key2row.get(get_string_id(key), None)
if i is None:
raise KeyError(Errors.E058.format(key=key))
else:
return self.data[i]
elif self.mode == Mode.floret:
return self.get_batch([key])[0]
raise KeyError(Errors.E058.format(key=key))
def __setitem__(self, key, vector):
"""Set a vector for the given key.
key (str/int): The key to set the vector for.
vector (ndarray): The vector to set.
DOCS: https://spacy.io/api/vectors#setitem
"""
if self.mode == Mode.floret:
warnings.warn(Warnings.W115.format(method="Vectors.__setitem__"))
return
key = get_string_id(key)
i = self.key2row[key]
self.data[i] = vector
if self._unset.count(i):
self._unset.erase(self._unset.find(i))
def __iter__(self):
"""Iterate over the keys in the table.
YIELDS (int): A key in the table.
DOCS: https://spacy.io/api/vectors#iter
"""
yield from self.key2row
def __len__(self):
"""Return the number of vectors in the table.
RETURNS (int): The number of vectors in the data.
DOCS: https://spacy.io/api/vectors#len
"""
return self.data.shape[0]
def __contains__(self, key):
"""Check whether a key has been mapped to a vector entry in the table.
key (int): The key to check.
RETURNS (bool): Whether the key has a vector entry.
DOCS: https://spacy.io/api/vectors#contains
"""
if self.mode == Mode.floret:
return True
else:
return key in self.key2row
def resize(self, shape, inplace=False):
"""Resize the underlying vectors array. If inplace=True, the memory
is reallocated. This may cause other references to the data to become
invalid, so only use inplace=True if you're sure that's what you want.
If the number of vectors is reduced, keys mapped to rows that have been
deleted are removed. These removed items are returned as a list of
`(key, row)` tuples.
shape (tuple): A `(rows, dims)` tuple.
inplace (bool): Reallocate the memory.
RETURNS (list): The removed items as a list of `(key, row)` tuples.
DOCS: https://spacy.io/api/vectors#resize
"""
if self.mode == Mode.floret:
warnings.warn(Warnings.W115.format(method="Vectors.resize"))
return -1
xp = get_array_module(self.data)
if inplace:
if shape[1] != self.data.shape[1]:
raise ValueError(Errors.E193.format(new_dim=shape[1], curr_dim=self.data.shape[1]))
if xp == numpy:
self.data.resize(shape, refcheck=False)
else:
raise ValueError(Errors.E192)
else:
resized_array = xp.zeros(shape, dtype=self.data.dtype)
copy_shape = (min(shape[0], self.data.shape[0]), min(shape[1], self.data.shape[1]))
resized_array[:copy_shape[0], :copy_shape[1]] = self.data[:copy_shape[0], :copy_shape[1]]
self.data = resized_array
self._sync_unset()
removed_items = []
for key, row in list(self.key2row.items()):
if row >= shape[0]:
self.key2row.pop(key)
removed_items.append((key, row))
return removed_items
def keys(self):
"""RETURNS (iterable): A sequence of keys in the table."""
return self.key2row.keys()
def values(self):
"""Iterate over vectors that have been assigned to at least one key.
Note that some vectors may be unassigned, so the number of vectors
returned may be less than the length of the vectors table.
YIELDS (ndarray): A vector in the table.
DOCS: https://spacy.io/api/vectors#values
"""
for row, vector in enumerate(self.data):
if not self._unset.count(row):
yield vector
def items(self):
"""Iterate over `(key, vector)` pairs.
YIELDS (tuple): A key/vector pair.
DOCS: https://spacy.io/api/vectors#items
"""
for key, row in self.key2row.items():
yield key, self.data[row]
def find(self, *, key=None, keys=None, row=None, rows=None):
"""Look up one or more keys by row, or vice versa.
key (Union[int, str]): Find the row that the given key points to.
Returns int, -1 if missing.
keys (Iterable[Union[int, str]]): Find rows that the keys point to.
Returns ndarray.
row (int): Find the first key that points to the row.
Returns int.
rows (Iterable[int]): Find the keys that point to the rows.
Returns ndarray.
RETURNS: The requested key, keys, row or rows.
"""
if self.mode == Mode.floret:
raise ValueError(
Errors.E858.format(
mode=self.mode,
alternative="Use Vectors[key] instead.",
)
)
if sum(arg is None for arg in (key, keys, row, rows)) != 3:
bad_kwargs = {"key": key, "keys": keys, "row": row, "rows": rows}
raise ValueError(Errors.E059.format(kwargs=bad_kwargs))
xp = get_array_module(self.data)
if key is not None:
key = get_string_id(key)
return self.key2row.get(key, -1)
elif keys is not None:
keys = [get_string_id(key) for key in keys]
rows = [self.key2row.get(key, -1.) for key in keys]
return xp.asarray(rows, dtype="i")
else:
row2key = {row: key for key, row in self.key2row.items()}
if row is not None:
return row2key[row]
else:
results = [row2key[row] for row in rows]
return xp.asarray(results, dtype="uint64")
def _get_ngram_hashes(self, unicode s):
"""Calculate up to 4 32-bit hash values with MurmurHash3_x64_128 using
the floret hash settings.
s (str): The string key.
RETURNS: A list of the integer hashes.
"""
cdef uint32_t[4] out
chars = s.encode("utf8")
cdef char* utf8_string = chars
hash128_x64(utf8_string, len(chars), self.hash_seed, &out)
rows = [out[i] for i in range(min(self.hash_count, 4))]
return rows
def _get_ngrams(self, unicode key):
"""Get all padded ngram strings using the ngram settings.
key (str): The string key.
RETURNS: A list of the ngram strings for the padded key.
"""
key = self.bow + key + self.eow
ngrams = [key] + [
key[start:start+ngram_size]
for ngram_size in range(self.minn, self.maxn + 1)
for start in range(0, len(key) - ngram_size + 1)
]
return ngrams
def get_batch(self, keys):
"""Get the vectors for the provided keys efficiently as a batch.
keys (Iterable[Union[int, str]]): The keys.
RETURNS: The requested vectors from the vector table.
"""
ops = get_array_ops(self.data)
if self.mode == Mode.default:
rows = self.find(keys=keys)
vecs = self.data[rows]
elif self.mode == Mode.floret:
keys = [self.strings.as_string(key) for key in keys]
if sum(len(key) for key in keys) == 0:
return ops.xp.zeros((len(keys), self.data.shape[1]))
unique_keys = tuple(set(keys))
row_index = {key: i for i, key in enumerate(unique_keys)}
rows = [row_index[key] for key in keys]
indices = []
lengths = []
for key in unique_keys:
if key == "":
ngram_rows = []
else:
ngram_rows = [
h % self.data.shape[0]
for ngram in self._get_ngrams(key)
for h in self._get_ngram_hashes(ngram)
]
indices.extend(ngram_rows)
lengths.append(len(ngram_rows))
indices = ops.asarray(indices, dtype="int32")
lengths = ops.asarray(lengths, dtype="int32")
vecs = ops.reduce_mean(cast(Floats2d, self.data[indices]), lengths)
vecs = vecs[rows]
return ops.as_contig(vecs)
def add(self, key, *, vector=None, row=None):
"""Add a key to the table. Keys can be mapped to an existing vector
by setting `row`, or a new vector can be added.
key (int): The key to add.
vector (ndarray / None): A vector to add for the key.
row (int / None): The row number of a vector to map the key to.
RETURNS (int): The row the vector was added to.
DOCS: https://spacy.io/api/vectors#add
"""
if self.mode == Mode.floret:
warnings.warn(Warnings.W115.format(method="Vectors.add"))
return -1
# use int for all keys and rows in key2row for more efficient access
# and serialization
key = int(get_string_id(key))
if row is not None:
row = int(row)
if row is None and key in self.key2row:
row = self.key2row[key]
elif row is None:
if self.is_full:
raise ValueError(Errors.E060.format(rows=self.data.shape[0],
cols=self.data.shape[1]))
row = deref(self._unset.begin())
if row < self.data.shape[0]:
self.key2row[key] = row
else:
raise ValueError(Errors.E197.format(row=row, key=key))
if vector is not None:
xp = get_array_module(self.data)
vector = xp.asarray(vector)
self.data[row] = vector
if self._unset.count(row):
self._unset.erase(self._unset.find(row))
return row
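# A minimal usage sketch for add() in the default mode, continuing the small
# illustrative table sketched after find() above.
#     >>> import numpy
#     >>> row = vecs.add("cat", vector=numpy.ones((4,), dtype="f"))
#     >>> vecs.add("feline", row=row)   # map a second key to the same row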
def most_similar(self, queries, *, batch_size=1024, n=1, sort=True):
"""For each of the given vectors, find the n most similar entries
to it, by cosine.
Queries are by vector. Results are returned as a `(keys, best_rows,
scores)` tuple. If `queries` is large, the calculations are performed in
chunks, to avoid consuming too much memory. You can set the `batch_size`
to control the size/space trade-off during the calculations.
queries (ndarray): An array with one or more vectors.
batch_size (int): The batch size to use.
n (int): The number of entries to return for each query.
sort (bool): Whether to sort the n entries returned by score.
RETURNS (tuple): The most similar entries as a `(keys, best_rows, scores)`
tuple.
"""
if self.mode == Mode.floret:
raise ValueError(Errors.E858.format(
mode=self.mode,
alternative="",
))
xp = get_array_module(self.data)
filled = sorted(list({row for row in self.key2row.values()}))
if len(filled) < n:
raise ValueError(Errors.E198.format(n=n, n_rows=len(filled)))
filled = xp.asarray(filled)
norms = xp.linalg.norm(self.data[filled], axis=1, keepdims=True)
norms[norms == 0] = 1
vectors = self.data[filled] / norms
best_rows = xp.zeros((queries.shape[0], n), dtype='i')
scores = xp.zeros((queries.shape[0], n), dtype='f')
# Work in batches, to avoid memory problems.
for i in range(0, queries.shape[0], batch_size):
batch = queries[i : i+batch_size]
batch_norms = xp.linalg.norm(batch, axis=1, keepdims=True)
batch_norms[batch_norms == 0] = 1
batch /= batch_norms
# batch e.g. (1024, 300)
# vectors e.g. (10000, 300)
# sims e.g. (1024, 10000)
sims = xp.dot(batch, vectors.T)
best_rows[i:i+batch_size] = xp.argpartition(sims, -n, axis=1)[:,-n:]
scores[i:i+batch_size] = xp.partition(sims, -n, axis=1)[:,-n:]
if sort and n >= 2:
sorted_index = (xp.arange(scores.shape[0])[:, None][i:i+batch_size], xp.argsort(scores[i:i+batch_size], axis=1)[:, ::-1])
scores[i:i+batch_size] = scores[sorted_index]
best_rows[i:i+batch_size] = best_rows[sorted_index]
for i, j in numpy.ndindex(best_rows.shape):
best_rows[i, j] = filled[best_rows[i, j]]
# Round values really close to 1 or -1
scores = xp.around(scores, decimals=4, out=scores)
# Account for numerical error: we want to return scores in the range [-1, 1]
scores = xp.clip(scores, a_min=-1, a_max=1, out=scores)
row2key = {row: key for key, row in self.key2row.items()}
numpy_rows = get_current_ops().to_numpy(best_rows)
keys = xp.asarray(
[[row2key[row] for row in numpy_rows[i] if row in row2key]
for i in range(len(queries)) ], dtype="uint64")
return (keys, best_rows, scores)
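# A minimal usage sketch for most_similar(), assuming a pipeline with a loaded
# default-mode vectors table containing at least n assigned rows; the slicing
# is illustrative.
#     >>> queries = nlp.vocab.vectors.data[:2]          # two query vectors
#     >>> keys, best_rows, scores = nlp.vocab.vectors.most_similar(queries, n=3)
#     >>> keys.shape == (2, 3)                          # scores are cosines in [-1, 1]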
def _get_cfg(self):
if self.mode == Mode.default:
return {
"mode": Mode(self.mode).value,
}
elif self.mode == Mode.floret:
return {
"mode": Mode(self.mode).value,
"minn": self.minn,
"maxn": self.maxn,
"hash_count": self.hash_count,
"hash_seed": self.hash_seed,
"bow": self.bow,
"eow": self.eow,
}
def _set_cfg(self, cfg):
self.mode = Mode(cfg.get("mode", Mode.default)).value
self.minn = cfg.get("minn", 0)
self.maxn = cfg.get("maxn", 0)
self.hash_count = cfg.get("hash_count", 0)
self.hash_seed = cfg.get("hash_seed", 0)
self.bow = cfg.get("bow", "<")
self.eow = cfg.get("eow", ">")
def to_disk(self, path, *, exclude=tuple()):
"""Save the current state to a directory.
path (str / Path): A path to a directory, which will be created if
it doesn't exist.
DOCS: https://spacy.io/api/vectors#to_disk
"""
xp = get_array_module(self.data)
if xp is numpy:
save_array = lambda arr, file_: xp.save(file_, arr, allow_pickle=False)
else:
save_array = lambda arr, file_: xp.save(file_, arr)
def save_vectors(path):
# The source of numpy.save indicates that the file object is closed after
# use, but ResourceWarnings are still raised here, suggesting that it isn't.
# To avoid relying on that behavior, wrap the call in a context manager.
with path.open("wb") as _file:
save_array(self.data, _file)
serializers = {
"strings": lambda p: self.strings.to_disk(p.with_suffix(".json")),
"vectors": lambda p: save_vectors(p),
"key2row": lambda p: srsly.write_msgpack(p, self.key2row),
"vectors.cfg": lambda p: srsly.write_json(p, self._get_cfg()),
}
return util.to_disk(path, serializers, exclude)
def from_disk(self, path, *, exclude=tuple()):
"""Loads state from a directory. Modifies the object in place and
returns it.
path (str / Path): Directory path, string or Path-like object.
RETURNS (Vectors): The modified object.
DOCS: https://spacy.io/api/vectors#from_disk
"""
def load_key2row(path):
if path.exists():
self.key2row = srsly.read_msgpack(path)
for key, row in self.key2row.items():
if self._unset.count(row):
self._unset.erase(self._unset.find(row))
def load_keys(path):
if path.exists():
keys = numpy.load(str(path))
for i, key in enumerate(keys):
self.add(key, row=i)
def load_vectors(path):
ops = get_current_ops()
if path.exists():
self.data = ops.xp.load(str(path))
def load_settings(path):
if path.exists():
self._set_cfg(srsly.read_json(path))
serializers = {
"strings": lambda p: self.strings.from_disk(p.with_suffix(".json")),
"vectors": load_vectors,
"keys": load_keys,
"key2row": load_key2row,
"vectors.cfg": load_settings,
}
util.from_disk(path, serializers, exclude)
self._sync_unset()
return self
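# A round-trip sketch for the two methods above; the directory path is
# illustrative.
#     >>> nlp.vocab.vectors.to_disk("/tmp/my_vectors")
#     >>> nlp.vocab.vectors.from_disk("/tmp/my_vectors")   # modifies in place, returns self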
def to_bytes(self, *, exclude=tuple()):
"""Serialize the current state to a binary string.
exclude (list): String names of serialization fields to exclude.
RETURNS (bytes): The serialized form of the `Vectors` object.
DOCS: https://spacy.io/api/vectors#to_bytes
"""
def serialize_weights():
if hasattr(self.data, "to_bytes"):
return self.data.to_bytes()
else:
return srsly.msgpack_dumps(self.data)
serializers = {
"strings": lambda: self.strings.to_bytes(),
"key2row": lambda: srsly.msgpack_dumps(self.key2row),
"vectors": serialize_weights,
"vectors.cfg": lambda: srsly.json_dumps(self._get_cfg()),
}
return util.to_bytes(serializers, exclude)
def from_bytes(self, data, *, exclude=tuple()):
"""Load state from a binary string.
data (bytes): The data to load from.
exclude (list): String names of serialization fields to exclude.
RETURNS (Vectors): The `Vectors` object.
DOCS: https://spacy.io/api/vectors#from_bytes
"""
def deserialize_weights(b):
if hasattr(self.data, "from_bytes"):
self.data.from_bytes(b)
else:
xp = get_array_module(self.data)
self.data = xp.asarray(srsly.msgpack_loads(b))
deserializers = {
"strings": lambda b: self.strings.from_bytes(b),
"key2row": lambda b: self.key2row.update(srsly.msgpack_loads(b)),
"vectors": deserialize_weights,
"vectors.cfg": lambda b: self._set_cfg(srsly.json_loads(b))
}
util.from_bytes(data, deserializers, exclude)
self._sync_unset()
return self
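# A round-trip sketch for byte serialization of the vectors table.
#     >>> vector_bytes = nlp.vocab.vectors.to_bytes()
#     >>> nlp.vocab.vectors.from_bytes(vector_bytes)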
def clear(self):
"""Clear all entries in the vector table.
DOCS: https://spacy.io/api/vectors#clear
"""
if self.mode == Mode.floret:
raise ValueError(Errors.E859)
self.key2row = {}
self._sync_unset()
def _sync_unset(self):
filled = {row for row in self.key2row.values()}
self._unset = cppset[int]({row for row in range(self.data.shape[0]) if row not in filled})
from libcpp.vector cimport vector
from preshed.maps cimport PreshMap
from cymem.cymem cimport Pool
from murmurhash.mrmr cimport hash64
from .structs cimport LexemeC, TokenC
from .typedefs cimport attr_t, hash_t
from .strings cimport StringStore
from .morphology cimport Morphology
cdef LexemeC EMPTY_LEXEME
cdef union LexemesOrTokens:
const LexemeC* const* lexemes
const TokenC* tokens
cdef struct _Cached:
LexemesOrTokens data
bint is_lex
int length
cdef class Vocab:
cdef Pool mem
cdef readonly StringStore strings
cdef public Morphology morphology
cdef public object _vectors
cdef public object _lookups
cdef public object writing_system
cdef public object get_noun_chunks
cdef readonly int length
cdef public object _unused_object # TODO remove in v4, see #9150
cdef public object lex_attr_getters
cdef public object cfg
cdef const LexemeC* get(self, Pool mem, str string) except NULL
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
cdef const TokenC* make_fused_token(self, substrings) except NULL
cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
cdef PreshMap _by_orth
from typing import Callable, Iterator, Optional, Union, List, Dict
from typing import Any, Iterable
from thinc.types import Floats1d, FloatsXd
from . import Language
from .strings import StringStore
from .lexeme import Lexeme
from .lookups import Lookups
from .morphology import Morphology
from .tokens import Doc, Span
from .vectors import Vectors
from pathlib import Path
def create_vocab(
lang: Optional[str], defaults: Any, vectors_name: Optional[str] = ...
) -> Vocab: ...
class Vocab:
cfg: Dict[str, Any]
get_noun_chunks: Optional[Callable[[Union[Doc, Span]], Iterator[Span]]]
lookups: Lookups
morphology: Morphology
strings: StringStore
vectors: Vectors
writing_system: Dict[str, Any]
def __init__(
self,
lex_attr_getters: Optional[Dict[str, Callable[[str], Any]]] = ...,
strings: Optional[Union[List[str], StringStore]] = ...,
lookups: Optional[Lookups] = ...,
oov_prob: float = ...,
vectors_name: Optional[str] = ...,
writing_system: Dict[str, Any] = ...,
get_noun_chunks: Optional[Callable[[Union[Doc, Span]], Iterator[Span]]] = ...,
) -> None: ...
@property
def lang(self) -> str: ...
def __len__(self) -> int: ...
def add_flag(
self, flag_getter: Callable[[str], bool], flag_id: int = ...
) -> int: ...
def __contains__(self, key: str) -> bool: ...
def __iter__(self) -> Iterator[Lexeme]: ...
def __getitem__(self, id_or_string: Union[str, int]) -> Lexeme: ...
@property
def vectors_length(self) -> int: ...
def reset_vectors(
self, *, width: Optional[int] = ..., shape: Optional[int] = ...
) -> None: ...
def prune_vectors(self, nr_row: int, batch_size: int = ...) -> Dict[str, float]: ...
def get_vector(
self,
orth: Union[int, str],
minn: Optional[int] = ...,
maxn: Optional[int] = ...,
) -> FloatsXd: ...
def set_vector(self, orth: Union[int, str], vector: Floats1d) -> None: ...
def has_vector(self, orth: Union[int, str]) -> bool: ...
def to_disk(
self, path: Union[str, Path], *, exclude: Iterable[str] = ...
) -> None: ...
def from_disk(
self, path: Union[str, Path], *, exclude: Iterable[str] = ...
) -> Vocab: ...
def to_bytes(self, *, exclude: Iterable[str] = ...) -> bytes: ...
def from_bytes(
self, bytes_data: bytes, *, exclude: Iterable[str] = ...
) -> Vocab: ...
def pickle_vocab(vocab: Vocab) -> Any: ...
def unpickle_vocab(
sstore: StringStore,
vectors: Any,
morphology: Any,
_unused_object: Any,
lex_attr_getters: Any,
lookups: Any,
get_noun_chunks: Any,
) -> Vocab: ...
# cython: profile=True
from libc.string cimport memcpy
import srsly
from thinc.api import get_array_module, get_current_ops
import functools
from .lexeme cimport EMPTY_LEXEME, OOV_RANK
from .lexeme cimport Lexeme
from .typedefs cimport attr_t
from .tokens.token cimport Token
from .attrs cimport LANG, ORTH
from .compat import copy_reg
from .errors import Errors
from .attrs import intify_attrs, NORM, IS_STOP
from .vectors import Vectors, Mode as VectorsMode
from .util import registry
from .lookups import Lookups
from . import util
from .lang.norm_exceptions import BASE_NORMS
from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang
def create_vocab(lang, defaults, vectors_name=None):
# If the spacy-lookups-data package is installed, we pre-populate the lookups
# with lexeme data, if available
lex_attrs = {**LEX_ATTRS, **defaults.lex_attr_getters}
# This is messy, but it's the minimal working fix to Issue #639.
lex_attrs[IS_STOP] = functools.partial(is_stop, stops=defaults.stop_words)
# Ensure that getter can be pickled
lex_attrs[LANG] = functools.partial(get_lang, lang=lang)
lex_attrs[NORM] = util.add_lookups(
lex_attrs.get(NORM, LEX_ATTRS[NORM]),
BASE_NORMS,
)
return Vocab(
lex_attr_getters=lex_attrs,
writing_system=defaults.writing_system,
get_noun_chunks=defaults.syntax_iterators.get("noun_chunks"),
vectors_name=vectors_name,
)
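# A minimal usage sketch for create_vocab(): it is normally invoked when a
# Language object is constructed, but calling it directly looks roughly like
# this (the language code is illustrative).
#     >>> from spacy.util import get_lang_class
#     >>> from spacy.vocab import create_vocab
#     >>> vocab = create_vocab("en", get_lang_class("en").Defaults)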
cdef class Vocab:
"""A look-up table that allows you to access `Lexeme` objects. The `Vocab`
instance also provides access to the `StringStore`, and owns underlying
C-data that is shared between `Doc` objects.
DOCS: https://spacy.io/api/vocab
"""
def __init__(self, lex_attr_getters=None, strings=tuple(), lookups=None,
oov_prob=-20., vectors_name=None, writing_system={},
get_noun_chunks=None, **deprecated_kwargs):
"""Create the vocabulary.
lex_attr_getters (dict): A dictionary mapping attribute IDs to
functions to compute them. Defaults to `None`.
strings (StringStore): StringStore that maps strings to integers, and
vice versa.
lookups (Lookups): Container for large lookup tables and dictionaries.
oov_prob (float): Default OOV probability.
vectors_name (str): Optional name to identify the vectors table.
get_noun_chunks (Optional[Callable[[Union[Doc, Span]], Iterator[Tuple[int, int, int]]]]):
A function that yields base noun phrases used for Doc.noun_chunks.
"""
lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
if lookups in (None, True, False):
lookups = Lookups()
self.cfg = {'oov_prob': oov_prob}
self.mem = Pool()
self._by_orth = PreshMap()
self.strings = StringStore()
self.length = 0
if strings:
for string in strings:
_ = self[string]
self.lex_attr_getters = lex_attr_getters
self.morphology = Morphology(self.strings)
self.vectors = Vectors(strings=self.strings, name=vectors_name)
self.lookups = lookups
self.writing_system = writing_system
self.get_noun_chunks = get_noun_chunks
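# A minimal usage sketch for the constructor: building a bare vocabulary and
# pre-seeding its string store (the strings are illustrative).
#     >>> from spacy.vocab import Vocab
#     >>> vocab = Vocab(strings=["apple", "orange"])
#     >>> "apple" in vocab.strings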
property vectors:
def __get__(self):
return self._vectors
def __set__(self, vectors):
for s in vectors.strings:
self.strings.add(s)
self._vectors = vectors
self._vectors.strings = self.strings
@property
def lang(self):
langfunc = None
if self.lex_attr_getters:
langfunc = self.lex_attr_getters.get(LANG, None)
return langfunc("_") if langfunc else ""
def __len__(self):
"""The current number of lexemes stored.
RETURNS (int): The current number of lexemes stored.
"""
return self.length
def add_flag(self, flag_getter, int flag_id=-1):
"""Set a new boolean flag to words in the vocabulary.
The flag_getter function will be called over the words currently in the
vocab, and then applied to new words as they occur. You'll then be able
to access the flag value on each token using token.check_flag(flag_id).
See also: `Lexeme.set_flag`, `Lexeme.check_flag`, `Token.set_flag`,
`Token.check_flag`.
flag_getter (callable): A function `f(str) -> bool`, to get the
flag value.
flag_id (int): An integer between 1 and 63 (inclusive), specifying
the bit at which the flag will be stored. If -1, the lowest
available bit will be chosen.
RETURNS (int): The integer ID by which the flag value can be checked.
DOCS: https://spacy.io/api/vocab#add_flag
"""
if flag_id == -1:
for bit in range(1, 64):
if bit not in self.lex_attr_getters:
flag_id = bit
break
else:
raise ValueError(Errors.E062)
elif flag_id >= 64 or flag_id < 1:
raise ValueError(Errors.E063.format(value=flag_id))
for lex in self:
lex.set_flag(flag_id, flag_getter(lex.orth_))
self.lex_attr_getters[flag_id] = flag_getter
return flag_id
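# A minimal usage sketch for add_flag(); the getter function is illustrative.
#     >>> import spacy
#     >>> nlp = spacy.blank("en")
#     >>> is_shouting = nlp.vocab.add_flag(lambda text: text.isupper())
#     >>> doc = nlp("STOP right there")
#     >>> doc[0].check_flag(is_shouting)   # True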
cdef const LexemeC* get(self, Pool mem, str string) except NULL:
"""Get a pointer to a `LexemeC` from the lexicon, creating a new
`Lexeme` if necessary using memory acquired from the given pool. If the
pool is the lexicon's own memory, the lexeme is saved in the lexicon.
"""
if string == "":
return &EMPTY_LEXEME
cdef LexemeC* lex
cdef hash_t key = self.strings[string]
lex = <LexemeC*>self._by_orth.get(key)
cdef size_t addr
if lex != NULL:
assert lex.orth in self.strings
if lex.orth != key:
raise KeyError(Errors.E064.format(string=lex.orth,
orth=key, orth_id=string))
return lex
else:
return self._new_lexeme(mem, string)
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
"""Get a pointer to a `LexemeC` from the lexicon, creating a new
`Lexeme` if necessary using memory acquired from the given pool. If the
pool is the lexicon's own memory, the lexeme is saved in the lexicon.
"""
if orth == 0:
return &EMPTY_LEXEME
cdef LexemeC* lex
lex = <LexemeC*>self._by_orth.get(orth)
if lex != NULL:
return lex
else:
return self._new_lexeme(mem, self.strings[orth])
cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL:
# I think this heuristic is bad, and the Vocab should always
# own the lexemes. It avoids weird bugs this way, as it's how the thing
# was originally supposed to work. The best solution to the growing
# memory use is to periodically reset the vocab, which is an action
# that should be up to the user to do (so we don't need to keep track
# of the doc ownership).
# TODO: Change the C API so that the mem isn't passed in here.
mem = self.mem
#if len(string) < 3 or self.length < 10000:
# mem = self.mem
cdef bint is_oov = mem is not self.mem
lex = <LexemeC*>mem.alloc(1, sizeof(LexemeC))
lex.orth = self.strings.add(string)
lex.length = len(string)
if self.vectors is not None:
lex.id = self.vectors.key2row.get(lex.orth, OOV_RANK)
else:
lex.id = OOV_RANK
if self.lex_attr_getters is not None:
for attr, func in self.lex_attr_getters.items():
value = func(string)
if isinstance(value, str):
value = self.strings.add(value)
if value is not None:
Lexeme.set_struct_attr(lex, attr, value)
if not is_oov:
self._add_lex_to_vocab(lex.orth, lex)
if lex == NULL:
raise ValueError(Errors.E085.format(string=string))
return lex
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
self._by_orth.set(lex.orth, <void*>lex)
self.length += 1
def __contains__(self, key):
"""Check whether the string or int key has an entry in the vocabulary.
key (str / int / bytes): The string, hash or UTF-8 encoded key to check.
RETURNS (bool): Whether the key has an entry in the vocabulary.
DOCS: https://spacy.io/api/vocab#contains
"""
cdef hash_t int_key
if isinstance(key, bytes):
int_key = self.strings[key.decode("utf8")]
elif isinstance(key, str):
int_key = self.strings[key]
else:
int_key = key
lex = self._by_orth.get(int_key)
return lex is not NULL
def __iter__(self):
"""Iterate over the lexemes in the vocabulary.
YIELDS (Lexeme): An entry in the vocabulary.
DOCS: https://spacy.io/api/vocab#iter
"""
cdef attr_t key
cdef size_t addr
for key, addr in self._by_orth.items():
lex = Lexeme(self, key)
yield lex
def __getitem__(self, id_or_string):
"""Retrieve a lexeme, given an int ID or a unicode string. If a
previously unseen unicode string is given, a new lexeme is created and
stored.
id_or_string (int or str): The integer ID of a word, or its unicode
string. If `int >= Lexicon.size`, `IndexError` is raised. If
`id_or_string` is neither an int nor a unicode string, `ValueError`
is raised.
RETURNS (Lexeme): The lexeme indicated by the given ID.
EXAMPLE:
>>> apple = nlp.vocab.strings["apple"]
>>> assert nlp.vocab[apple] == nlp.vocab[u"apple"]
DOCS: https://spacy.io/api/vocab#getitem
"""
cdef attr_t orth
if isinstance(id_or_string, str):
orth = self.strings.add(id_or_string)
else:
orth = id_or_string
return Lexeme(self, orth)
cdef const TokenC* make_fused_token(self, substrings) except NULL:
cdef int i
tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
for i, props in enumerate(substrings):
props = intify_attrs(props, strings_map=self.strings,
_do_deprecated=True)
token = &tokens[i]
# Set the special tokens up to have arbitrary attributes
lex = <LexemeC*>self.get_by_orth(self.mem, props[ORTH])
token.lex = lex
for attr_id, value in props.items():
Token.set_struct_attr(token, attr_id, value)
# NORM is the only one that overlaps between the two
# (which is maybe not great?)
if attr_id != NORM:
Lexeme.set_struct_attr(lex, attr_id, value)
return tokens
@property
def vectors_length(self):
return self.vectors.data.shape[1]
def reset_vectors(self, *, width=None, shape=None):
"""Drop the current vector table. Because all vectors must be the same
width, you have to call this to change the size of the vectors.
"""
if width is not None and shape is not None:
raise ValueError(Errors.E065.format(width=width, shape=shape))
elif shape is not None:
self.vectors = Vectors(strings=self.strings, shape=shape)
else:
width = width if width is not None else self.vectors.data.shape[1]
self.vectors = Vectors(strings=self.strings, shape=(self.vectors.shape[0], width))
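# A minimal usage sketch for reset_vectors(), assuming the vocab already has a
# vectors table; the width is illustrative.
#     >>> nlp.vocab.reset_vectors(width=300)
#     >>> nlp.vocab.vectors.shape[1] == 300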
def prune_vectors(self, nr_row, batch_size=1024):
"""Reduce the current vector table to `nr_row` unique entries. Words
mapped to the discarded vectors will be remapped to the closest vector
among those remaining.
For example, suppose the original table had vectors for the words:
['sat', 'cat', 'feline', 'reclined']. If we prune the vector table to
two rows, we would discard the vectors for 'feline' and 'reclined'.
These words would then be remapped to the closest remaining vector
-- so "feline" would have the same vector as "cat", and "reclined"
would have the same vector as "sat".
The similarities are judged by cosine. The original vectors may
be large, so the cosines are calculated in minibatches, to reduce
memory usage.
nr_row (int): The number of rows to keep in the vector table.
batch_size (int): Batch of vectors for calculating the similarities.
Larger batch sizes might be faster, while temporarily requiring
more memory.
RETURNS (dict): A dictionary keyed by removed words mapped to
`(string, score)` tuples, where `string` is the entry the removed
word was mapped to, and `score` the similarity score between the
two words.
DOCS: https://spacy.io/api/vocab#prune_vectors
"""
if self.vectors.mode != VectorsMode.default:
raise ValueError(Errors.E866)
ops = get_current_ops()
xp = get_array_module(self.vectors.data)
# Make sure all vectors are in the vocab
for orth in self.vectors:
self[orth]
# Make prob negative so it sorts by rank ascending
# (key2row contains the rank)
priority = [(-lex.prob, self.vectors.key2row[lex.orth], lex.orth)
for lex in self if lex.orth in self.vectors.key2row]
priority.sort()
indices = xp.asarray([i for (prob, i, key) in priority], dtype="uint64")
keys = xp.asarray([key for (prob, i, key) in priority], dtype="uint64")
keep = xp.ascontiguousarray(self.vectors.data[indices[:nr_row]])
toss = xp.ascontiguousarray(self.vectors.data[indices[nr_row:]])
self.vectors = Vectors(strings=self.strings, data=keep, keys=keys[:nr_row], name=self.vectors.name)
syn_keys, syn_rows, scores = self.vectors.most_similar(toss, batch_size=batch_size)
syn_keys = ops.to_numpy(syn_keys)
remap = {}
for i, key in enumerate(ops.to_numpy(keys[nr_row:])):
self.vectors.add(key, row=syn_rows[i][0])
word = self.strings[key]
synonym = self.strings[syn_keys[i][0]]
score = scores[i][0]
remap[word] = (synonym, score)
return remap
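# A minimal usage sketch for prune_vectors(), assuming a pipeline with a large
# vectors table loaded; the row count is illustrative.
#     >>> remap = nlp.vocab.prune_vectors(10000)
#     >>> remap.get("feline")   # e.g. ("cat", 0.83), as described in the docstring above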
def get_vector(self, orth):
"""Retrieve a vector for a word in the vocabulary. Words can be looked
up by string or int ID. If no vectors data is loaded, ValueError is
raised.
orth (int / unicode): The hash value of a word, or its unicode string.
RETURNS (numpy.ndarray or cupy.ndarray): A word vector. Size
and shape determined by the `vocab.vectors` instance. Usually, a
numpy ndarray of shape (300,) and dtype float32.
DOCS: https://spacy.io/api/vocab#get_vector
"""
if isinstance(orth, str):
orth = self.strings.add(orth)
if self.has_vector(orth):
return self.vectors[orth]
xp = get_array_module(self.vectors.data)
vectors = xp.zeros((self.vectors_length,), dtype="f")
return vectors
def set_vector(self, orth, vector):
"""Set a vector for a word in the vocabulary. Words can be referenced
by string or int ID.
orth (int / str): The word.
vector (numpy.ndarray or cupy.ndarray[ndim=1, dtype='float32']): The vector to set.
DOCS: https://spacy.io/api/vocab#set_vector
"""
if isinstance(orth, str):
orth = self.strings.add(orth)
if self.vectors.is_full and orth not in self.vectors:
new_rows = max(100, int(self.vectors.shape[0]*1.3))
if self.vectors.shape[1] == 0:
width = vector.size
else:
width = self.vectors.shape[1]
self.vectors.resize((new_rows, width))
lex = self[orth] # Add word to vocab if necessary
row = self.vectors.add(orth, vector=vector)
if row >= 0:
lex.rank = row
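# A minimal sketch for set_vector()/get_vector(); the key and vector values
# are illustrative.
#     >>> import numpy
#     >>> nlp.vocab.set_vector("apple", numpy.random.uniform(-1, 1, (300,)).astype("float32"))
#     >>> nlp.vocab.has_vector("apple")          # True
#     >>> nlp.vocab.get_vector("apple").shape    # (300,)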
def has_vector(self, orth):
"""Check whether a word has a vector. Returns False if no vectors have
been loaded. Words can be looked up by string or int ID.
orth (int / str): The word.
RETURNS (bool): Whether the word has a vector.
DOCS: https://spacy.io/api/vocab#has_vector
"""
if isinstance(orth, str):
orth = self.strings.add(orth)
return orth in self.vectors
property lookups:
def __get__(self):
return self._lookups
def __set__(self, lookups):
self._lookups = lookups
if lookups.has_table("lexeme_norm"):
self.lex_attr_getters[NORM] = util.add_lookups(
self.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]),
self.lookups.get_table("lexeme_norm"),
)
def to_disk(self, path, *, exclude=tuple()):
"""Save the current state to a directory.
path (str or Path): A path to a directory, which will be created if
it doesn't exist.
exclude (Iterable[str]): String names of serialization fields to exclude.
DOCS: https://spacy.io/api/vocab#to_disk
"""
path = util.ensure_path(path)
if not path.exists():
path.mkdir()
setters = ["strings", "vectors"]
if "strings" not in exclude:
self.strings.to_disk(path / "strings.json")
if "vectors" not in "exclude":
self.vectors.to_disk(path, exclude=["strings"])
if "lookups" not in "exclude":
self.lookups.to_disk(path)
def from_disk(self, path, *, exclude=tuple()):
"""Loads state from a directory. Modifies the object in place and
returns it.
path (str or Path): A path to a directory.
exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (Vocab): The modified `Vocab` object.
DOCS: https://spacy.io/api/vocab#from_disk
"""
path = util.ensure_path(path)
getters = ["strings", "vectors"]
if "strings" not in exclude:
self.strings.from_disk(path / "strings.json") # TODO: add exclude?
if "vectors" not in exclude:
if self.vectors is not None:
self.vectors.from_disk(path, exclude=["strings"])
if "lookups" not in exclude:
self.lookups.from_disk(path)
if "lexeme_norm" in self.lookups:
self.lex_attr_getters[NORM] = util.add_lookups(
self.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]), self.lookups.get_table("lexeme_norm")
)
self.length = 0
self._by_orth = PreshMap()
return self
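# A round-trip sketch for the two methods above; the directory path is
# illustrative.
#     >>> nlp.vocab.to_disk("/tmp/vocab")
#     >>> from spacy.vocab import Vocab
#     >>> vocab = Vocab().from_disk("/tmp/vocab")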
def to_bytes(self, *, exclude=tuple()):
"""Serialize the current state to a binary string.
exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (bytes): The serialized form of the `Vocab` object.
DOCS: https://spacy.io/api/vocab#to_bytes
"""
def serialize_vectors():
if self.vectors is None:
return None
else:
return self.vectors.to_bytes(exclude=["strings"])
getters = {
"strings": lambda: self.strings.to_bytes(),
"vectors": deserialize_vectors,
"lookups": lambda: self.lookups.to_bytes(),
}
return util.to_bytes(getters, exclude)
def from_bytes(self, bytes_data, *, exclude=tuple()):
"""Load state from a binary string.
bytes_data (bytes): The data to load from.
exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (Vocab): The `Vocab` object.
DOCS: https://spacy.io/api/vocab#from_bytes
"""
def deserialize_vectors(b):
if self.vectors is None:
return None
else:
return self.vectors.from_bytes(b, exclude=["strings"])
setters = {
"strings": lambda b: self.strings.from_bytes(b),
"vectors": lambda b: serialize_vectors(b),
"lookups": lambda b: self.lookups.from_bytes(b),
}
util.from_bytes(bytes_data, setters, exclude)
if "lexeme_norm" in self.lookups:
self.lex_attr_getters[NORM] = util.add_lookups(
self.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]), self.lookups.get_table("lexeme_norm")
)
self.length = 0
self._by_orth = PreshMap()
return self
def _reset_cache(self, keys, strings):
# I'm not sure this made sense. Disable it for now.
raise NotImplementedError
def pickle_vocab(vocab):
sstore = vocab.strings
vectors = vocab.vectors
morph = vocab.morphology
_unused_object = vocab._unused_object
lex_attr_getters = srsly.pickle_dumps(vocab.lex_attr_getters)
lookups = vocab.lookups
get_noun_chunks = vocab.get_noun_chunks
return (unpickle_vocab,
(sstore, vectors, morph, _unused_object, lex_attr_getters, lookups, get_noun_chunks))
def unpickle_vocab(sstore, vectors, morphology, _unused_object,
lex_attr_getters, lookups, get_noun_chunks):
cdef Vocab vocab = Vocab()
vocab.vectors = vectors
vocab.strings = sstore
vocab.morphology = morphology
vocab._unused_object = _unused_object
vocab.lex_attr_getters = srsly.pickle_loads(lex_attr_getters)
vocab.lookups = lookups
vocab.get_noun_chunks = get_noun_chunks
return vocab
copy_reg.pickle(Vocab, pickle_vocab, unpickle_vocab)
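# Because Vocab is registered with copy_reg above, a standard pickle round-trip
# works; a minimal sketch:
#     >>> import pickle
#     >>> vocab2 = pickle.loads(pickle.dumps(nlp.vocab))
#     >>> len(vocab2.strings) == len(nlp.vocab.strings)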