from typing import Union, Iterable, Dict, Any
from pathlib import Path
import sys
# set library-specific custom warning handling before doing anything else
from .errors import setup_default_warnings
setup_default_warnings() # noqa: E402
# These are imported as part of the API
from thinc.api import prefer_gpu, require_gpu, require_cpu # noqa: F401
from thinc.api import Config
from . import pipeline # noqa: F401
from .cli.info import info # noqa: F401
from .glossary import explain # noqa: F401
from .about import __version__ # noqa: F401
from .util import registry, logger # noqa: F401
from .errors import Errors
from .language import Language
from .vocab import Vocab
from . import util
if sys.maxunicode == 65535:
raise SystemError(Errors.E130)
def load(
name: Union[str, Path],
*,
vocab: Union[Vocab, bool] = True,
disable: Iterable[str] = util.SimpleFrozenList(),
exclude: Iterable[str] = util.SimpleFrozenList(),
config: Union[Dict[str, Any], Config] = util.SimpleFrozenDict(),
) -> Language:
"""Load a spaCy model from an installed package or a local path.
name (str): Package name or model path.
vocab (Vocab): A Vocab object. If True, a vocab is created.
disable (Iterable[str]): Names of pipeline components to disable. Disabled
pipes will be loaded but they won't be run unless you explicitly
enable them by calling nlp.enable_pipe.
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
components won't be loaded.
config (Dict[str, Any] / Config): Config overrides as nested dict or dict
keyed by section values in dot notation.
RETURNS (Language): The loaded nlp object.
"""
return util.load_model(
name, vocab=vocab, disable=disable, exclude=exclude, config=config
)
def blank(
name: str,
*,
vocab: Union[Vocab, bool] = True,
config: Union[Dict[str, Any], Config] = util.SimpleFrozenDict(),
meta: Dict[str, Any] = util.SimpleFrozenDict(),
) -> Language:
"""Create a blank nlp object for a given language code.
name (str): The language code, e.g. "en".
vocab (Vocab): A Vocab object. If True, a vocab is created.
config (Dict[str, Any] / Config): Optional config overrides.
meta (Dict[str, Any]): Overrides for nlp.meta.
RETURNS (Language): The nlp object.
"""
LangClass = util.get_lang_class(name)
# We should accept both dot notation and nested dict here for consistency
config = util.dot_to_dict(config)
return LangClass.from_config(config, vocab=vocab, meta=meta)
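# Example (sketch): spacy.blank only needs a language code; config overrides may
# use dot notation, which is expanded by util.dot_to_dict above.
#
#     import spacy
#     nlp = spacy.blank("en", config={"nlp.batch_size": 256})
#     nlp.add_pipe("sentencizer")
#     doc = nlp("Hello world. This is spaCy.")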
if __name__ == "__main__":
from spacy.cli import setup_cli
setup_cli()
# fmt: off
__title__ = "spacy"
__version__ = "3.2.0"
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
__projects__ = "https://github.com/explosion/projects"
__projects_branch__ = "v3"
# Reserve 64 values for flag features
from . cimport symbols
cdef enum attr_id_t:
NULL_ATTR
IS_ALPHA
IS_ASCII
IS_DIGIT
IS_LOWER
IS_PUNCT
IS_SPACE
IS_TITLE
IS_UPPER
LIKE_URL
LIKE_NUM
LIKE_EMAIL
IS_STOP
IS_OOV_DEPRECATED
IS_BRACKET
IS_QUOTE
IS_LEFT_PUNCT
IS_RIGHT_PUNCT
IS_CURRENCY
FLAG19 = 19
FLAG20
FLAG21
FLAG22
FLAG23
FLAG24
FLAG25
FLAG26
FLAG27
FLAG28
FLAG29
FLAG30
FLAG31
FLAG32
FLAG33
FLAG34
FLAG35
FLAG36
FLAG37
FLAG38
FLAG39
FLAG40
FLAG41
FLAG42
FLAG43
FLAG44
FLAG45
FLAG46
FLAG47
FLAG48
FLAG49
FLAG50
FLAG51
FLAG52
FLAG53
FLAG54
FLAG55
FLAG56
FLAG57
FLAG58
FLAG59
FLAG60
FLAG61
FLAG62
FLAG63
ID
ORTH
LOWER
NORM
SHAPE
PREFIX
SUFFIX
LENGTH
CLUSTER
LEMMA
POS
TAG
DEP
ENT_IOB
ENT_TYPE
HEAD
SENT_START
SPACY
PROB
LANG
ENT_KB_ID = symbols.ENT_KB_ID
MORPH
ENT_ID = symbols.ENT_ID
IDX
SENT_END
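# Example (sketch, assuming `doc` is an existing Doc): these integer IDs are what
# the Python API accepts wherever attribute IDs are expected, e.g. when exporting
# a Doc to a NumPy array:
#
#     from spacy.attrs import ORTH, IS_ALPHA
#     array = doc.to_array([ORTH, IS_ALPHA])   # one column per attribute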
IDS = {
"": NULL_ATTR,
"IS_ALPHA": IS_ALPHA,
"IS_ASCII": IS_ASCII,
"IS_DIGIT": IS_DIGIT,
"IS_LOWER": IS_LOWER,
"IS_PUNCT": IS_PUNCT,
"IS_SPACE": IS_SPACE,
"IS_TITLE": IS_TITLE,
"IS_UPPER": IS_UPPER,
"LIKE_URL": LIKE_URL,
"LIKE_NUM": LIKE_NUM,
"LIKE_EMAIL": LIKE_EMAIL,
"IS_STOP": IS_STOP,
"IS_OOV_DEPRECATED": IS_OOV_DEPRECATED,
"IS_BRACKET": IS_BRACKET,
"IS_QUOTE": IS_QUOTE,
"IS_LEFT_PUNCT": IS_LEFT_PUNCT,
"IS_RIGHT_PUNCT": IS_RIGHT_PUNCT,
"IS_CURRENCY": IS_CURRENCY,
"FLAG19": FLAG19,
"FLAG20": FLAG20,
"FLAG21": FLAG21,
"FLAG22": FLAG22,
"FLAG23": FLAG23,
"FLAG24": FLAG24,
"FLAG25": FLAG25,
"FLAG26": FLAG26,
"FLAG27": FLAG27,
"FLAG28": FLAG28,
"FLAG29": FLAG29,
"FLAG30": FLAG30,
"FLAG31": FLAG31,
"FLAG32": FLAG32,
"FLAG33": FLAG33,
"FLAG34": FLAG34,
"FLAG35": FLAG35,
"FLAG36": FLAG36,
"FLAG37": FLAG37,
"FLAG38": FLAG38,
"FLAG39": FLAG39,
"FLAG40": FLAG40,
"FLAG41": FLAG41,
"FLAG42": FLAG42,
"FLAG43": FLAG43,
"FLAG44": FLAG44,
"FLAG45": FLAG45,
"FLAG46": FLAG46,
"FLAG47": FLAG47,
"FLAG48": FLAG48,
"FLAG49": FLAG49,
"FLAG50": FLAG50,
"FLAG51": FLAG51,
"FLAG52": FLAG52,
"FLAG53": FLAG53,
"FLAG54": FLAG54,
"FLAG55": FLAG55,
"FLAG56": FLAG56,
"FLAG57": FLAG57,
"FLAG58": FLAG58,
"FLAG59": FLAG59,
"FLAG60": FLAG60,
"FLAG61": FLAG61,
"FLAG62": FLAG62,
"FLAG63": FLAG63,
"ID": ID,
"ORTH": ORTH,
"LOWER": LOWER,
"NORM": NORM,
"SHAPE": SHAPE,
"PREFIX": PREFIX,
"SUFFIX": SUFFIX,
"LENGTH": LENGTH,
"LEMMA": LEMMA,
"POS": POS,
"TAG": TAG,
"DEP": DEP,
"ENT_IOB": ENT_IOB,
"ENT_TYPE": ENT_TYPE,
"ENT_ID": ENT_ID,
"ENT_KB_ID": ENT_KB_ID,
"HEAD": HEAD,
"SENT_START": SENT_START,
"SPACY": SPACY,
"LANG": LANG,
"MORPH": MORPH,
"IDX": IDX
}
# ATTR IDs, in order of the symbol
NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
locals().update(IDS)
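# Example (sketch): IDS maps attribute names to integer IDs and NAMES lists the
# names sorted by ID, so lookups work in both directions:
#
#     IDS["ORTH"]    # same integer as the module-level ORTH constant
#     NAMES[:3]      # ['', 'IS_ALPHA', 'IS_ASCII'] -- names in ID order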
def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
"""
Normalize a dictionary of attributes, converting them to ints.
stringy_attrs (dict): Dictionary keyed by attribute string names. Values
can be ints or strings.
strings_map (StringStore): Defaults to None. If provided, encodes string
values into ints.
RETURNS (dict): Attributes dictionary with keys and optionally values
converted to ints.
"""
inty_attrs = {}
if _do_deprecated:
if 'F' in stringy_attrs:
stringy_attrs["ORTH"] = stringy_attrs.pop("F")
if 'L' in stringy_attrs:
stringy_attrs["LEMMA"] = stringy_attrs.pop("L")
if 'pos' in stringy_attrs:
stringy_attrs["TAG"] = stringy_attrs.pop("pos")
if 'morph' in stringy_attrs:
morphs = stringy_attrs.pop('morph')
if 'number' in stringy_attrs:
stringy_attrs.pop('number')
if 'tenspect' in stringy_attrs:
stringy_attrs.pop('tenspect')
morph_keys = [
'PunctType', 'PunctSide', 'Other', 'Degree', 'AdvType', 'Number',
'VerbForm', 'PronType', 'Aspect', 'Tense', 'PartType', 'Poss',
'Hyph', 'ConjType', 'NumType', 'Foreign', 'VerbType', 'NounType',
'Gender', 'Mood', 'Negative', 'Tense', 'Voice', 'Abbr',
'Derivation', 'Echo', 'Foreign', 'NameType', 'NounType', 'NumForm',
'NumValue', 'PartType', 'Polite', 'StyleVariant',
'PronType', 'AdjType', 'Person', 'Variant', 'AdpType',
'Reflex', 'Negative', 'Mood', 'Aspect', 'Case',
'Polarity', 'PrepCase', 'Animacy' # U20
]
for key in morph_keys:
if key in stringy_attrs:
stringy_attrs.pop(key)
elif key.lower() in stringy_attrs:
stringy_attrs.pop(key.lower())
elif key.upper() in stringy_attrs:
stringy_attrs.pop(key.upper())
for name, value in stringy_attrs.items():
int_key = intify_attr(name)
if int_key is not None:
if strings_map is not None and isinstance(value, str):
if hasattr(strings_map, 'add'):
value = strings_map.add(value)
else:
value = strings_map[value]
inty_attrs[int_key] = value
return inty_attrs
def intify_attr(name):
"""
Normalize an attribute name, converting it to int.
name (str / int): Attribute string name. Can also be an int (will then be left unchanged).
RETURNS (int): int representation of the attribute, or None if it couldn't be converted.
"""
if isinstance(name, int):
return name
elif name in IDS:
return IDS[name]
elif name.upper() in IDS:
return IDS[name.upper()]
return None
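# Example (sketch): intify_attr/intify_attrs normalize attribute names (and
# optionally string values) to integer IDs. `strings_map` can be a StringStore,
# e.g. nlp.vocab.strings for some existing `nlp` object:
#
#     intify_attr("lemma")                # -> LEMMA (lookup is case-insensitive)
#     intify_attrs({"ORTH": "hello"})     # -> {ORTH: "hello"}
#     intify_attrs({"orth": "hello"}, strings_map=nlp.vocab.strings)
#     # -> {ORTH: <hash of "hello">}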
"""Helpers for Python and platform compatibility."""
import sys
from thinc.util import copy_array
try:
import cPickle as pickle
except ImportError:
import pickle # type: ignore[no-redef]
try:
import copy_reg
except ImportError:
import copyreg as copy_reg # type: ignore[no-redef]
try:
from cupy.cuda.stream import Stream as CudaStream
except ImportError:
CudaStream = None
try:
import cupy
except ImportError:
cupy = None
if sys.version_info[:2] >= (3, 8): # Python 3.8+
from typing import Literal, Protocol, runtime_checkable
else:
from typing_extensions import Literal, Protocol, runtime_checkable # noqa: F401
# Important note: The importlib_metadata "backport" includes functionality
# that's not part of the built-in importlib.metadata. We should treat this
# import like the built-in and only use what's available there.
try: # Python 3.8+
import importlib.metadata as importlib_metadata
except ImportError:
from catalogue import _importlib_metadata as importlib_metadata # type: ignore[no-redef] # noqa: F401
from thinc.api import Optimizer # noqa: F401
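# Bind the chosen implementations at module level so downstream code can simply
# import them from spacy.compat, regardless of which branch was taken above.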
pickle = pickle
copy_reg = copy_reg
CudaStream = CudaStream
cupy = cupy
copy_array = copy_array
is_windows = sys.platform.startswith("win")
is_linux = sys.platform.startswith("linux")
is_osx = sys.platform == "darwin"
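# Example (sketch): downstream code can rely on these names without caring which
# implementation or backport was picked above:
#
#     from spacy.compat import pickle, importlib_metadata, is_windows
#     data = pickle.dumps({"lang": "en"})
#     version = importlib_metadata.version("spacy")   # same API as importlib.metadata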
[paths]
train = null
dev = null
vectors = null
init_tok2vec = null
[system]
seed = 0
gpu_allocator = null
[nlp]
lang = null
# List of pipeline component names, in order. The names should correspond to
# components defined in the [components] block.
pipeline = []
# Components that are loaded but disabled by default
disabled = []
# Optional callbacks to modify the nlp object before it's initialized, after
# it's created and after the pipeline has been set up
before_creation = null
after_creation = null
after_pipeline_creation = null
# Default batch size to use with nlp.pipe and nlp.evaluate
batch_size = 1000
[nlp.tokenizer]
@tokenizers = "spacy.Tokenizer.v1"
# The pipeline components and their models
[components]
# Readers for corpora like dev and train.
[corpora]
[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
# Whether to train on sequences with 'gold standard' sentence boundaries
# and tokens. If you set this to true, take care to ensure your run-time
# data is passed in sentence-by-sentence via some prior preprocessing.
gold_preproc = false
# Limit on training document length (0 = no limit)
max_length = 0
# Limit on the number of training examples (0 = no limit)
limit = 0
# Apply some simple data augmentation, where we replace tokens with variations.
# This is especially useful for punctuation and case replacement, to help
# generalize beyond corpora that contain only one style of quotes, casing etc.
augmenter = null
[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
# Whether to train on sequences with 'gold standard' sentence boundaries
# and tokens. If you set this to true, take care to ensure your run-time
# data is passed in sentence-by-sentence via some prior preprocessing.
gold_preproc = false
# Limit on training document length (0 = no limit)
max_length = 0
# Limit on the number of training examples (0 = no limit)
limit = 0
# Optional callback for data augmentation
augmenter = null
# Training hyper-parameters and additional features.
[training]
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.1
accumulate_gradient = 1
# Controls early-stopping. 0 disables early stopping.
patience = 1600
# Number of epochs. 0 means unlimited. If >= 0, the train corpus is loaded once
# into memory and shuffled within the training loop. -1 means the train corpus is
# streamed rather than loaded into memory, with no shuffling within the training loop.
max_epochs = 0
max_steps = 20000
eval_frequency = 200
# Control how scores are printed and checkpoints are evaluated.
score_weights = {}
# Names of pipeline components that shouldn't be updated during training
frozen_components = []
# Names of pipeline components that should set annotations during training
annotating_components = []
# Location in the config where the dev corpus is defined
dev_corpus = "corpora.dev"
# Location in the config where the train corpus is defined
train_corpus = "corpora.train"
# Optional callback before nlp object is saved to disk after training
before_to_disk = null
[training.logger]
@loggers = "spacy.ConsoleLogger.v1"
[training.batcher]
@batchers = "spacy.batch_by_words.v1"
discard_oversize = false
tolerance = 0.2
[training.batcher.size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001
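# Note (sketch): "compounding.v1" grows the batch size from `start` toward `stop`,
# multiplying by `compound` at each step, roughly equivalent to:
#
#     def compounding(start, stop, compound):
#         size = start
#         while True:
#             yield min(size, stop)
#             size *= compound
#
# With the values above the sizes run 100, 100.1, 100.2001, ... capped at 1000.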
[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = false
eps = 1e-8
learn_rate = 0.001
# These settings are used when nlp.initialize() is called (typically before
# training or pretraining). Components and the tokenizer can each define their
# own arguments via their initialize methods that are populated by the config.
# This lets them gather data resources, build label sets etc.
[initialize]
vectors = ${paths.vectors}
# Extra resources for transfer-learning or pseudo-rehearsal
init_tok2vec = ${paths.init_tok2vec}
# Data and lookups for vocabulary
vocab_data = null
lookups = null
# Arguments passed to the tokenizer's initialize method
tokenizer = {}
# Arguments for initialize methods of the components (keyed by component)
components = {}
before_init = null
after_init = null
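# Note (sketch): values in a config like this can be overridden without editing
# the file, using dot notation that mirrors the section names. For training, pass
# overrides on the command line; at runtime, a loaded pipeline's config can be
# overridden via spacy.load(..., config=...):
#
#     python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy
#     nlp = spacy.load("en_core_web_sm", config={"nlp.batch_size": 500})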
[paths]
raw_text = null
[pretraining]
max_epochs = 1000
dropout = 0.2
n_save_every = null
n_save_epoch = null
component = "tok2vec"
layer = ""
corpus = "corpora.pretrain"
[pretraining.batcher]
@batchers = "spacy.batch_by_words.v1"
size = 3000
discard_oversize = false
tolerance = 0.2
get_length = null
[pretraining.objective]
@architectures = "spacy.PretrainCharacters.v1"
maxout_pieces = 3
hidden_size = 300
n_characters = 4
[pretraining.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = true
eps = 1e-8
learn_rate = 0.001
[corpora]
[corpora.pretrain]
@readers = "spacy.JsonlCorpus.v1"
path = ${paths.raw_text}
min_length = 5
max_length = 500
limit = 0
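# Note (sketch): a pretraining config like this is typically run via the CLI, with
# paths.raw_text pointing at a JSONL file of raw texts (one {"text": "..."} object
# per line), e.g.:
#
#     python -m spacy pretrain config.cfg ./pretrain_output --paths.raw_text ./texts.jsonl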
import warnings
class ErrorsWithCodes(type):
def __getattribute__(self, code):
msg = super().__getattribute__(code)
if code.startswith("__"): # python system attributes like __class__
return msg
else:
return "[{code}] {msg}".format(code=code, msg=msg)
def setup_default_warnings():
# ignore certain numpy warnings
filter_warning("ignore", error_msg="numpy.dtype size changed") # noqa
filter_warning("ignore", error_msg="numpy.ufunc size changed") # noqa
# warn about entity_ruler & matcher having no patterns only once
for pipe in ["matcher", "entity_ruler"]:
filter_warning("once", error_msg=Warnings.W036.format(name=pipe))
# warn once about lemmatizer without required POS
filter_warning("once", error_msg=Warnings.W108)
# floret vector table cannot be modified
filter_warning("once", error_msg="[W114]")
def filter_warning(action: str, error_msg: str):
"""Customize how spaCy should handle a certain warning.
error_msg (str): e.g. "W006", or a full error message
action (str): "default", "error", "ignore", "always", "module" or "once"
"""
warnings.filterwarnings(action, message=_escape_warning_msg(error_msg))
def _escape_warning_msg(msg):
"""To filter with warnings.filterwarnings, the [] brackets need to be escaped"""
return msg.replace("[", "\\[").replace("]", "\\]")
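# Example (sketch): escaping is needed because warnings.filterwarnings treats
# `message` as a regex matched against the start of the warning text, and spaCy
# codes are wrapped in literal brackets:
#
#     filter_warning("ignore", error_msg="[W108]")
#     warnings.warn("[W108] example message")   # now silently ignored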
# fmt: off
class Warnings(metaclass=ErrorsWithCodes):
W005 = ("Doc object not parsed. This means displaCy won't be able to "
"generate a dependency visualization for it. Make sure the Doc "
"was processed with a model that supports dependency parsing, and "
"not just a language class like `English()`. For more info, see "
"the docs:\nhttps://spacy.io/usage/models")
W006 = ("No entities to visualize found in Doc object. If this is "
"surprising to you, make sure the Doc was processed using a model "
"that supports named entity recognition, and check the `doc.ents` "
"property manually if necessary.")
W007 = ("The model you're using has no word vectors loaded, so the result "
"of the {obj}.similarity method will be based on the tagger, "
"parser and NER, which may not give useful similarity judgements. "
"This may happen if you're using one of the small models, e.g. "
"`en_core_web_sm`, which don't ship with word vectors and only "
"use context-sensitive tensors. You can always add your own word "
"vectors, or use one of the larger models instead if available.")
W008 = ("Evaluating {obj}.similarity based on empty vectors.")
W011 = ("It looks like you're calling displacy.serve from within a "
"Jupyter notebook or a similar environment. This likely means "
"you're already running a local web server, so there's no need to "
"make displaCy start another one. Instead, you should be able to "
"replace displacy.serve with displacy.render to show the "
"visualization.")
W012 = ("A Doc object you're adding to the PhraseMatcher for pattern "
"'{key}' is parsed and/or tagged, but to match on '{attr}', you "
"don't actually need this information. This means that creating "
"the patterns is potentially much slower, because all pipeline "
"components are applied. To only create tokenized Doc objects, "
"try using `nlp.make_doc(text)` or process all texts as a stream "
"using `list(nlp.tokenizer.pipe(all_texts))`.")
W017 = ("Alias '{alias}' already exists in the Knowledge Base.")
W018 = ("Entity '{entity}' already exists in the Knowledge Base - "
"ignoring the duplicate entry.")
W021 = ("Unexpected hash collision in PhraseMatcher. Matches may be "
"incorrect. Modify PhraseMatcher._terminal_hash to fix.")
W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
"the Knowledge Base.")
W026 = ("Unable to set all sentence boundaries from dependency parses. If "
"you are constructing a parse tree incrementally by setting "
"token.head values, you can probably ignore this warning. Consider "
"using Doc(words, ..., heads=heads, deps=deps) instead.")
W027 = ("Found a large training file of {size} bytes. Note that it may "
"be more efficient to split your training data into multiple "
"smaller JSON files instead.")
W028 = ("Doc.from_array was called with a vector of type '{type}', "
"but is expecting one of type uint64 instead. This may result "
"in problems with the vocab further on in the pipeline.")
W030 = ("Some entities could not be aligned in the text \"{text}\" with "
"entities \"{entities}\". Use "
"`spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)`"
" to check the alignment. Misaligned entities ('-') will be "
"ignored during training.")
W033 = ("Training a new {model} using a model with no lexeme normalization "
"table. This may degrade the performance of the model to some "
"degree. If this is intentional or the language you're using "
"doesn't have a normalization table, please ignore this warning. "
"If this is surprising, make sure you have the spacy-lookups-data "
"package installed and load the table in your config. The "
"languages with lexeme normalization tables are currently: "
"{langs}\n\nLoad the table in your config with:\n\n"
"[initialize.lookups]\n"
"@misc = \"spacy.LookupsDataLoader.v1\"\n"
"lang = ${{nlp.lang}}\n"
"tables = [\"lexeme_norm\"]\n")
W035 = ("Discarding subpattern '{pattern}' due to an unrecognized "
"attribute or operator.")
W036 = ("The component '{name}' does not have any patterns defined.")
# New warnings added in v3.x
W086 = ("Component '{listener}' will be (re)trained, but it needs the component "
"'{name}' which is frozen. If you want to prevent retraining '{name}' "
"but want to train '{listener}' on top of it, you should add '{name}' to the "
"list of 'annotating_components' in the 'training' block in the config. "
"See the documentation for details: "
"https://spacy.io/usage/training#annotating-components")
W087 = ("Component '{name}' will be (re)trained, but the component '{listener}' "
"depends on it via a listener and is frozen. This means that the "
"performance of '{listener}' will be degraded. You can either freeze "
"both, or neither of the two. If you're sourcing the component from "
"an existing pipeline, you can use the `replace_listeners` setting in "
"the config block to replace its token-to-vector listener with a copy "
"and make it independent. For example, `replace_listeners = "
"[\"model.tok2vec\"]` See the documentation for details: "
"https://spacy.io/usage/training#config-components-listeners")
W088 = ("The pipeline component {name} implements a `begin_training` "
"method, which won't be called by spaCy. As of v3.0, `begin_training` "
"has been renamed to `initialize`, so you likely want to rename the "
"component method. See the documentation for details: "
"https://spacy.io/api/language#initialize")
W089 = ("As of spaCy v3.0, the `nlp.begin_training` method has been renamed "
"to `nlp.initialize`.")
W090 = ("Could not locate any {format} files in path '{path}'.")
W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.")
W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.")
W093 = ("Could not find any data to train the {name} on. Is your "
"input data correctly formatted?")
W094 = ("Model '{model}' ({model_version}) specifies an under-constrained "
"spaCy version requirement: {version}. This can lead to compatibility "
"problems with older versions, or as new spaCy versions are "
"released, because the model may say it's compatible when it's "
'not. Consider changing the "spacy_version" in your meta.json to a '
"version range, with a lower and upper pin. For example: {example}")
W095 = ("Model '{model}' ({model_version}) was trained with spaCy "
"{version} and may not be 100% compatible with the current version "
"({current}). If you see errors or degraded performance, download "
"a newer compatible model or retrain your custom model with the "
"current spaCy version. For more details and available updates, "
"run: python -m spacy validate")
W096 = ("The method `nlp.disable_pipes` is now deprecated - use "
"`nlp.select_pipes` instead.")
W100 = ("Skipping unsupported morphological feature(s): '{feature}'. "
"Provide features as a dict {{\"Field1\": \"Value1,Value2\"}} or "
"string \"Field1=Value1,Value2|Field2=Value3\".")
W101 = ("Skipping Doc custom extension '{name}' while merging docs.")
W102 = ("Skipping unsupported user data '{key}: {value}' while merging docs.")
W103 = ("Unknown {lang} word segmenter '{segmenter}'. Supported "
"word segmenters: {supported}. Defaulting to {default}.")
W104 = ("Skipping modifications for '{target}' segmenter. The current "
"segmenter is '{current}'.")
W105 = ("As of spaCy v3.0, the `{matcher}.pipe` method is deprecated. If you "
"need to match on a stream of documents, you can use `nlp.pipe` and "
"call the {matcher} on each Doc object.")
W107 = ("The property `Doc.{prop}` is deprecated. Use "
"`Doc.has_annotation(\"{attr}\")` instead.")
W108 = ("The rule-based lemmatizer did not find POS annotation for one or "
"more tokens. Check that your pipeline includes components that "
"assign token.pos, typically 'tagger'+'attribute_ruler' or "
"'morphologizer'.")
W109 = ("Unable to save user hooks while serializing the doc. Re-add any "
"required user hooks to the doc after processing.")
W110 = ("The DependencyMatcher token pattern {pattern} matched a span "
"{tokens} that is 2+ tokens long. Only the first token in the span "
"will be included in the results. For better results, token "
"patterns should return matches that are each exactly one token "
"long.")
W111 = ("Jupyter notebook detected: if using `prefer_gpu()` or "
"`require_gpu()`, include it in the same cell right before "
"`spacy.load()` to ensure that the model is loaded on the correct "
"device. More information: "
"http://spacy.io/usage/v3#jupyter-notebook-gpu")
W112 = ("The model specified to use for initial vectors ({name}) has no "
"vectors. This is almost certainly a mistake.")
W113 = ("Sourced component '{name}' may not work as expected: source "
"vectors are not identical to current pipeline vectors.")
W114 = ("Using multiprocessing with GPU models is not recommended and may "
"lead to errors.")
W115 = ("Skipping {method}: the floret vector table cannot be modified. "
"Vectors are calculated from character ngrams.")
class Errors(metaclass=ErrorsWithCodes):
E001 = ("No component '{name}' found in pipeline. Available names: {opts}")
E002 = ("Can't find factory for '{name}' for language {lang} ({lang_code}). "
"This usually happens when spaCy calls `nlp.{method}` with a custom "
"component name that's not registered on the current language class. "
"If you're using a Transformer, make sure to install 'spacy-transformers'. "
"If you're using a custom component, make sure you've added the "
"decorator `@Language.component` (for function components) or "
"`@Language.factory` (for class components).\n\nAvailable "
"factories: {opts}")
E003 = ("Not a valid pipeline component. Expected callable, but "
"got {component} (name: '{name}'). If you're using a custom "
"component factory, double-check that it correctly returns your "
"initialized component.")
E004 = ("Can't set up pipeline component: a factory for '{name}' already "
"exists. Existing factory: {func}. New factory: {new_func}")
E005 = ("Pipeline component '{name}' returned None. If you're using a "
"custom component, maybe you forgot to return the processed Doc?")
E006 = ("Invalid constraints for adding pipeline component. You can only "
"set one of the following: before (component name or index), "
"after (component name or index), first (True) or last (True). "
"Invalid configuration: {args}. Existing components: {opts}")
E007 = ("'{name}' already exists in pipeline. Existing names: {opts}")
E008 = ("Can't restore disabled pipeline component '{name}' because it "
"doesn't exist in the pipeline anymore. If you want to remove "
"components from the pipeline, you should do it before calling "
"`nlp.select_pipes` or after restoring the disabled components.")
E010 = ("Word vectors set to length 0. This may be because you don't have "
"a model installed or loaded, or because your model doesn't "
"include word vectors. For more info, see the docs:\n"
"https://spacy.io/usage/models")
E011 = ("Unknown operator: '{op}'. Options: {opts}")
E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}")
E016 = ("MultitaskObjective target should be function or one of: dep, "
"tag, ent, dep_tag_offset, ent_tag.")
E017 = ("Can only add unicode or bytes. Got type: {value_type}")
E018 = ("Can't retrieve string for hash '{hash_value}'. This usually "
"refers to an issue with the `Vocab` or `StringStore`.")
E019 = ("Can't create transition with unknown action ID: {action}. Action "
"IDs are enumerated in spacy/syntax/{src}.pyx.")
E022 = ("Could not find a transition with the name '{name}' in the NER "
"model.")
E024 = ("Could not find an optimal move to supervise the parser. Usually, "
"this means that the model can't be updated in a way that's valid "
"and satisfies the correct annotations specified in the GoldParse. "
"For example, are all labels added to the model? If you're "
"training a named entity recognizer, also make sure that none of "
"your annotated entity spans have leading or trailing whitespace "
"or punctuation. You can also use the `debug data` command to "
"validate your JSON-formatted training data. For details, run:\n"
"python -m spacy debug data --help")
E025 = ("String is too long: {length} characters. Max is 2**30.")
E026 = ("Error accessing token at position {i}: out of bounds in Doc of "
"length {length}.")
E027 = ("Arguments `words` and `spaces` should be sequences of the same "
"length, or `spaces` should be left default at None. `spaces` "
"should be a sequence of booleans, with True meaning that the "
"word owns a ' ' character following it.")
E028 = ("`words` expects a list of unicode strings, but got bytes instance: {value}")
E029 = ("`noun_chunks` requires the dependency parse, which requires a "
"statistical model to be installed and loaded. For more info, see "
"the documentation:\nhttps://spacy.io/usage/models")
E030 = ("Sentence boundaries unset. You can add the 'sentencizer' "
"component to the pipeline with: `nlp.add_pipe('sentencizer')`. "
"Alternatively, add the dependency parser or sentence recognizer, "
"or set sentence boundaries by setting `doc[i].is_sent_start`.")
E031 = ("Invalid token: empty string ('') at position {i}.")
E033 = ("Cannot load into non-empty Doc of length {length}.")
E035 = ("Error creating span with start {start} and end {end} for Doc of "
"length {length}.")
E036 = ("Error calculating span: Can't find a token starting at character "
"offset {start}.")
E037 = ("Error calculating span: Can't find a token ending at character "
"offset {end}.")
E039 = ("Array bounds exceeded while searching for root word. This likely "
"means the parse tree is in an invalid state. Please report this "
"issue here: http://github.com/explosion/spaCy/issues")
E040 = ("Attempt to access token at {i}, max length {max_length}.")
E041 = ("Invalid comparison operator: {op}. Likely a Cython bug?")
E042 = ("Error accessing `doc[{i}].nbor({j})`, for doc of length {length}.")
E043 = ("Refusing to write to token.sent_start if its document is parsed, "
"because this may cause inconsistent state.")
E044 = ("Invalid value for token.sent_start: {value}. Must be one of: "
"None, True, False")
E045 = ("Possibly infinite loop encountered while looking for {attr}.")
E046 = ("Can't retrieve unregistered extension attribute '{name}'. Did "
"you forget to call the `set_extension` method?")
E047 = ("Can't assign a value to unregistered extension attribute "
"'{name}'. Did you forget to call the `set_extension` method?")
E048 = ("Can't import language {lang} or any matching language from spacy.lang: {err}")
E050 = ("Can't find model '{name}'. It doesn't seem to be a Python "
"package or a valid path to a data directory.")
E052 = ("Can't find model directory: {path}")
E053 = ("Could not read {name} from {path}")
E054 = ("No valid '{setting}' setting found in model meta.json.")
E055 = ("Invalid ORTH value in exception:\nKey: {key}\nOrths: {orths}")
E056 = ("Invalid tokenizer exception: ORTH values combined don't match "
"original string.\nKey: {key}\nOrths: {orths}")
E057 = ("Stepped slices not supported in Span objects. Try: "
"`list(tokens)[start:stop:step]` instead.")
E058 = ("Could not retrieve vector for key {key}.")
E059 = ("One (and only one) keyword arg must be set. Got: {kwargs}")
E060 = ("Cannot add new key to vectors: the table is full. Current shape: "
"({rows}, {cols}).")
E062 = ("Cannot find empty bit for new lexical flag. All bits between 0 "
"and 63 are occupied. You can replace one by specifying the "
"`flag_id` explicitly, e.g. "
"`nlp.vocab.add_flag(your_func, flag_id=IS_ALPHA`.")
E063 = ("Invalid value for `flag_id`: {value}. Flag IDs must be between 1 "
"and 63 (inclusive).")
E064 = ("Error fetching a Lexeme from the Vocab. When looking up a "
"string, the lexeme returned had an orth ID that did not match "
"the query string. This means that the cached lexeme structs are "
"mismatched to the string encoding table. The mismatched:\n"
"Query string: {string}\nOrth cached: {orth}\nOrth ID: {orth_id}")
E065 = ("Only one of the vector table's width and shape can be specified. "
"Got width {width} and shape {shape}.")
E067 = ("Invalid BILUO tag sequence: Got a tag starting with {start} "
"without a preceding 'B' (beginning of an entity). "
"Tag sequence:\n{tags}")
E068 = ("Invalid BILUO tag: '{tag}'.")
E071 = ("Error creating lexeme: specified orth ID ({orth}) does not "
"match the one in the vocab ({vocab_orth}).")
E073 = ("Cannot assign vector of length {new_length}. Existing vectors "
"are of length {length}. You can use `vocab.reset_vectors` to "
"clear the existing vectors and resize the table.")
E074 = ("Error interpreting compiled match pattern: patterns are expected "
"to end with the attribute {attr}. Got: {bad_attr}.")
E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), "
"projective heads ({n_proj_heads}) and labels ({n_labels}) do not "
"match.")
E083 = ("Error setting extension: only one of `default`, `method`, or "
"`getter` (plus optional `setter`) is allowed. Got: {nr_defined}")
E084 = ("Error assigning label ID {label} to span: not in StringStore.")
E085 = ("Can't create lexeme for string '{string}'.")
E087 = ("Unknown displaCy style: {style}.")
E088 = ("Text of length {length} exceeds maximum of {max_length}. The "
"parser and NER models require roughly 1GB of temporary "
"memory per 100,000 characters in the input. This means long "
"texts may cause memory allocation errors. If you're not using "
"the parser or NER, it's probably safe to increase the "
"`nlp.max_length` limit. The limit is in number of characters, so "
"you can check whether your inputs are too long by checking "
"`len(text)`.")
E089 = ("Extensions can't have a setter argument without a getter "
"argument. Check the keyword arguments on `set_extension`.")
E090 = ("Extension '{name}' already exists on {obj}. To overwrite the "
"existing extension, set `force=True` on `{obj}.set_extension`.")
E091 = ("Invalid extension attribute {name}: expected callable or None, "
"but got: {value}")
E093 = ("token.ent_iob values make invalid sequence: I without B\n{seq}")
E094 = ("Error reading line {line_num} in vectors file {loc}.")
E095 = ("Can't write to frozen dictionary. This is likely an internal "
"error. Are you writing to a default function argument?")
E096 = ("Invalid object passed to displaCy: Can only visualize `Doc` or "
"Span objects, or dicts if set to `manual=True`.")
E097 = ("Invalid pattern: expected token pattern (list of dicts) or "
"phrase pattern (string) but got:\n{pattern}")
E098 = ("Invalid pattern: expected both RIGHT_ID and RIGHT_ATTRS.")
E099 = ("Invalid pattern: the first node of pattern should be an anchor "
"node. The node should only contain RIGHT_ID and RIGHT_ATTRS.")
E100 = ("Nodes other than the anchor node should all contain {required}, "
"but these are missing: {missing}")
E101 = ("RIGHT_ID should be a new node and LEFT_ID should already have "
"have been declared in previous edges.")
E102 = ("Can't merge non-disjoint spans. '{token}' is already part of "
"tokens to merge. If you want to find the longest non-overlapping "
"spans, you can use the util.filter_spans helper:\n"
"https://spacy.io/api/top-level#util.filter_spans")
E103 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A "
"token can only be part of one entity, so make sure the entities "
"you're setting don't overlap. To work with overlapping entities, "
"consider using doc.spans instead.")
E106 = ("Can't find `doc._.{attr}` attribute specified in the underscore "
"settings: {opts}")
E107 = ("Value of `doc._.{attr}` is not JSON-serializable: {value}")
E109 = ("Component '{name}' could not be run. Did you forget to "
"call `initialize()`?")
E110 = ("Invalid displaCy render wrapper. Expected callable, got: {obj}")
E111 = ("Pickling a token is not supported, because tokens are only views "
"of the parent Doc and can't exist on their own. A pickled token "
"would always have to include its Doc and Vocab, which has "
"practically no advantage over pickling the parent Doc directly. "
"So instead of pickling the token, pickle the Doc it belongs to.")
E112 = ("Pickling a span is not supported, because spans are only views "
"of the parent Doc and can't exist on their own. A pickled span "
"would always have to include its Doc and Vocab, which has "
"practically no advantage over pickling the parent Doc directly. "
"So instead of pickling the span, pickle the Doc it belongs to or "
"use Span.as_doc to convert the span to a standalone Doc object.")
E115 = ("All subtokens must have associated heads.")
E117 = ("The newly split tokens must match the text of the original token. "
"New orths: {new}. Old text: {old}.")
E118 = ("The custom extension attribute '{attr}' is not registered on the "
"`Token` object so it can't be set during retokenization. To "
"register an attribute, use the `Token.set_extension` classmethod.")
E119 = ("Can't set custom extension attribute '{attr}' during "
"retokenization because it's not writable. This usually means it "
"was registered with a getter function (and no setter) or as a "
"method extension, so the value is computed dynamically. To "
"overwrite a custom attribute manually, it should be registered "
"with a default value or with a getter AND setter.")
E120 = ("Can't set custom extension attributes during retokenization. "
"Expected dict mapping attribute names to values, but got: {value}")
E121 = ("Can't bulk merge spans. Attribute length {attr_len} should be "
"equal to span length ({span_len}).")
E122 = ("Cannot find token to be split. Did it get merged?")
E123 = ("Cannot find head of token to be split. Did it get merged?")
E125 = ("Unexpected value: {value}")
E126 = ("Unexpected matcher predicate: '{bad}'. Expected one of: {good}. "
"This is likely a bug in spaCy, so feel free to open an issue.")
E130 = ("You are running a narrow unicode build, which is incompatible "
"with spacy >= 2.1.0. To fix this, reinstall Python and use a wide "
"unicode build instead. You can also rebuild Python and set the "
"`--enable-unicode=ucs4 flag`.")
E132 = ("The vectors for entities and probabilities for alias '{alias}' "
"should have equal length, but found {entities_length} and "
"{probabilities_length} respectively.")
E133 = ("The sum of prior probabilities for alias '{alias}' should not "
"exceed 1, but found {sum}.")
E134 = ("Entity '{entity}' is not defined in the Knowledge Base.")
E139 = ("Knowledge base for component '{name}' is empty. Use the methods "
"`kb.add_entity` and `kb.add_alias` to add entries.")
E140 = ("The list of entities, prior probabilities and entity vectors "
"should be of equal length.")
E141 = ("Entity vectors should be of length {required} instead of the "
"provided {found}.")
E143 = ("Labels for component '{name}' not initialized. This can be fixed "
"by calling add_label, or by providing a representative batch of "
"examples to the component's `initialize` method.")
E145 = ("Error reading `{param}` from input file.")
E146 = ("Could not access {path}.")
E147 = ("Unexpected error in the {method} functionality of the "
"EntityLinker: {msg}. This is likely a bug in spaCy, so feel free "
"to open an issue: https://github.com/explosion/spaCy/issues")
E148 = ("Expected {ents} KB identifiers but got {ids}. Make sure that "
"each entity in `doc.ents` is assigned to a KB identifier.")
E149 = ("Error deserializing model. Check that the config used to create "
"the component matches the model being loaded.")
E150 = ("The language of the `nlp` object and the `vocab` should be the "
"same, but found '{nlp}' and '{vocab}' respectively.")
E152 = ("The attribute {attr} is not supported for token patterns. "
"Please use the option `validate=True` with the Matcher, PhraseMatcher, "
"or EntityRuler for more details.")
E153 = ("The value type {vtype} is not supported for token patterns. "
"Please use the option validate=True with Matcher, PhraseMatcher, "
"or EntityRuler for more details.")
E154 = ("One of the attributes or values is not supported for token "
"patterns. Please use the option `validate=True` with the Matcher, "
"PhraseMatcher, or EntityRuler for more details.")
E155 = ("The pipeline needs to include a {pipe} in order to use "
"Matcher or PhraseMatcher with the attribute {attr}. "
"Try using `nlp()` instead of `nlp.make_doc()` or `list(nlp.pipe())` "
"instead of `list(nlp.tokenizer.pipe())`.")
E157 = ("Can't render negative values for dependency arc start or end. "
"Make sure that you're passing in absolute token indices, not "
"relative token offsets.\nstart: {start}, end: {end}, label: "
"{label}, direction: {dir}")
E158 = ("Can't add table '{name}' to lookups because it already exists.")
E159 = ("Can't find table '{name}' in lookups. Available tables: {tables}")
E160 = ("Can't find language data file: {path}")
E161 = ("Found an internal inconsistency when predicting entity links. "
"This is likely a bug in spaCy, so feel free to open an issue: "
"https://github.com/explosion/spaCy/issues")
E163 = ("cumsum was found to be unstable: its last element does not "
"correspond to sum")
E164 = ("x is neither increasing nor decreasing: {x}.")
E165 = ("Only one class present in the gold labels: {label}. "
"ROC AUC score is not defined in that case.")
E166 = ("Can only merge DocBins with the same value for '{param}'.\n"
"Current DocBin: {current}\nOther DocBin: {other}")
E169 = ("Can't find module: {module}")
E170 = ("Cannot apply transition {name}: invalid for the current state.")
E171 = ("Matcher.add received invalid 'on_match' callback argument: expected "
"callable or None, but got: {arg_type}")
E175 = ("Can't remove rule for unknown match pattern ID: {key}")
E176 = ("Alias '{alias}' is not defined in the Knowledge Base.")
E177 = ("Ill-formed IOB input detected: {tag}")
E178 = ("Each pattern should be a list of dicts, but got: {pat}. Maybe you "
"accidentally passed a single pattern to Matcher.add instead of a "
"list of patterns? If you only want to add one pattern, make sure "
"to wrap it in a list. For example: `matcher.add('{key}', [pattern])`")
E179 = ("Invalid pattern. Expected a list of Doc objects but got a single "
"Doc. If you only want to add one pattern, make sure to wrap it "
"in a list. For example: `matcher.add('{key}', [doc])`")
E180 = ("Span attributes can't be declared as required or assigned by "
"components, since spans are only views of the Doc. Use Doc and "
"Token attributes (or custom extension attributes) only and remove "
"the following: {attrs}")
E181 = ("Received invalid attributes for unkown object {obj}: {attrs}. "
"Only Doc and Token attributes are supported.")
E182 = ("Received invalid attribute declaration: {attr}\nDid you forget "
"to define the attribute? For example: `{attr}.???`")
E183 = ("Received invalid attribute declaration: {attr}\nOnly top-level "
"attributes are supported, for example: {solution}")
E184 = ("Only attributes without underscores are supported in component "
"attribute declarations (because underscore and non-underscore "
"attributes are connected anyways): {attr} -> {solution}")
E185 = ("Received invalid attribute in component attribute declaration: "
"`{obj}.{attr}`\nAttribute '{attr}' does not exist on {obj}.")
E187 = ("Only unicode strings are supported as labels.")
E189 = ("Each argument to `Doc.__init__` should be of equal length.")
E190 = ("Token head out of range in `Doc.from_array()` for token index "
"'{index}' with value '{value}' (equivalent to relative head "
"index: '{rel_head_index}'). The head indices should be relative "
"to the current token index rather than absolute indices in the "
"array.")
E191 = ("Invalid head: the head token must be from the same doc as the "
"token itself.")
E192 = ("Unable to resize vectors in place with cupy.")
E193 = ("Unable to resize vectors in place if the resized vector dimension "
"({new_dim}) is not the same as the current vector dimension "
"({curr_dim}).")
E194 = ("Unable to aligned mismatched text '{text}' and words '{words}'.")
E195 = ("Matcher can be called on {good} only, got {got}.")
E196 = ("Refusing to write to `token.is_sent_end`. Sentence boundaries can "
"only be fixed with `token.is_sent_start`.")
E197 = ("Row out of bounds, unable to add row {row} for key {key}.")
E198 = ("Unable to return {n} most similar vectors for the current vectors "
"table, which contains {n_rows} vectors.")
E199 = ("Unable to merge 0-length span at `doc[{start}:{end}]`.")
E200 = ("Can't yet set {attr} from Span. Vote for this feature on the "
"issue tracker: http://github.com/explosion/spaCy/issues")
E202 = ("Unsupported {name} mode '{mode}'. Supported modes: {modes}.")
# New errors added in v3.x
E858 = ("The {mode} vector table does not support this operation. "
"{alternative}")
E859 = ("The floret vector table cannot be modified.")
E860 = ("Can't truncate fasttext-bloom vectors.")
E861 = ("No 'keys' should be provided when initializing floret vectors "
"with 'minn' and 'maxn'.")
E862 = ("'hash_count' must be between 1-4 for floret vectors.")
E863 = ("'maxn' must be greater than or equal to 'minn'.")
E864 = ("The complete vector table 'data' is required to initialize floret "
"vectors.")
E865 = ("A SpanGroup is not functional after the corresponding Doc has "
"been garbage collected. To keep using the spans, make sure that "
"the corresponding Doc object is still available in the scope of "
"your function.")
E866 = ("Expected a string or 'Doc' as input, but got: {type}.")
E867 = ("The 'textcat' component requires at least two labels because it "
"uses mutually exclusive classes where exactly one label is True "
"for each doc. For binary classification tasks, you can use two "
"labels with 'textcat' (LABEL / NOT_LABEL) or alternatively, you "
"can use the 'textcat_multilabel' component with one label.")
E868 = ("Found a conflicting gold annotation in a reference document, "
"with the following char-based span occurring both in the gold ents "
"as well as in the negative spans: {span}.")
E869 = ("The notation '{label}' is not supported anymore. To annotate "
"negative NER samples, use `doc.spans[key]` instead, and "
"specify the key as 'incorrect_spans_key' when constructing "
"the NER component.")
E870 = ("Could not serialize the DocBin because it is too large. Consider "
"splitting up your documents into several doc bins and serializing "
"each separately. spacy.Corpus.v1 will search recursively for all "
"*.spacy files if you provide a directory instead of a filename as "
"the 'path'.")
E871 = ("Error encountered in nlp.pipe with multiprocessing:\n\n{error}")
E872 = ("Unable to copy tokenizer from base model due to different "
'tokenizer settings: current tokenizer config "{curr_config}" '
'vs. base model "{base_config}"')
E873 = ("Unable to merge a span from doc.spans with key '{key}' and text "
"'{text}'. This is likely a bug in spaCy, so feel free to open an "
"issue: https://github.com/explosion/spaCy/issues")
E874 = ("Could not initialize the tok2vec model from component "
"'{component}' and layer '{layer}'.")
E875 = ("To use the PretrainVectors objective, make sure that static vectors are loaded. "
"In the config, these are defined by the initialize.vectors setting.")
E879 = ("Unexpected type for 'spans' data. Provide a dictionary mapping keys to "
"a list of spans, with each span represented by a tuple (start_char, end_char). "
"The tuple can be optionally extended with a label and a KB ID.")
E880 = ("The 'wandb' library could not be found - did you install it? "
"Alternatively, specify the 'ConsoleLogger' in the 'training.logger' "
"config section, instead of the 'WandbLogger'.")
E884 = ("The pipeline could not be initialized because the vectors "
"could not be found at '{vectors}'. If your pipeline was already "
"initialized/trained before, call 'resume_training' instead of 'initialize', "
"or initialize only the components that are new.")
E885 = ("entity_linker.set_kb received an invalid 'kb_loader' argument: expected "
"a callable function, but got: {arg_type}")
E886 = ("Can't replace {name} -> {tok2vec} listeners: path '{path}' not "
"found in config for component '{name}'.")
E887 = ("Can't replace {name} -> {tok2vec} listeners: the paths to replace "
"({paths}) don't match the available listeners in the model ({n_listeners}).")
E888 = ("Can't replace listeners for '{name}' ({pipe}): invalid upstream "
"component that doesn't seem to support listeners. Expected Tok2Vec "
"or Transformer component. If you didn't call nlp.replace_listeners "
"manually, this is likely a bug in spaCy.")
E889 = ("Can't replace '{tok2vec}' listeners of component '{name}' because "
"'{unknown}' is not in the pipeline. Available components: {opts}. "
"If you didn't call nlp.replace_listeners manually, this is likely "
"a bug in spaCy.")
E890 = ("Cannot add the alias '{alias}' to the Knowledge base. "
"Each alias should be a meaningful string.")
E891 = ("Alias '{alias}' could not be added to the Knowledge base. "
"This is likely a bug in spaCy.")
E892 = ("Unknown function registry: '{name}'.\n\nAvailable names: {available}")
E893 = ("Could not find function '{name}' in function registry '{reg_name}'. "
"If you're using a custom function, make sure the code is available. "
"If the function is provided by a third-party package, e.g. "
"spacy-transformers, make sure the package is installed in your "
"environment.\n\nAvailable names: {available}")
E894 = ("The 'noun_chunks' syntax iterator is not implemented for language "
"'{lang}'.")
E895 = ("The 'textcat' component received gold-standard annotations with "
"multiple labels per document. In spaCy 3 you should use the "
"'textcat_multilabel' component for this instead. "
"Example of an offending annotation: {value}")
E896 = ("There was an error using the static vectors. Ensure that the vectors "
"of the vocab are properly initialized, or set 'include_static_vectors' "
"to False.")
E897 = ("Field '{field}' should be a dot-notation string referring to the "
"relevant section in the config, but found type {type} instead.")
E898 = ("Can't serialize trainable pipe '{name}': the `model` attribute "
"is not set or None. If you've implemented a custom component, make "
"sure to store the component model as `self.model` in your "
"component's __init__ method.")
E899 = ("Can't serialize trainable pipe '{name}': the `vocab` attribute "
"is not set or None. If you've implemented a custom component, make "
"sure to store the current `nlp` object's vocab as `self.vocab` in "
"your component's __init__ method.")
E900 = ("Could not run the full pipeline for evaluation. If you specified "
"frozen components, make sure they were already initialized and "
"trained. Full pipeline: {pipeline}")
E901 = ("Failed to remove existing output directory: {path}. If your "
"config and the components you train change between runs, a "
"non-empty output directory can lead to stale pipeline data. To "
"solve this, remove the existing directories in the output directory.")
E902 = ("The sentence-per-line IOB/IOB2 file is not formatted correctly. "
"Try checking whitespace and delimiters. See "
"https://spacy.io/api/cli#convert")
E903 = ("The token-per-line NER file is not formatted correctly. Try checking "
"whitespace and delimiters. See https://spacy.io/api/cli#convert")
E904 = ("Cannot initialize StaticVectors layer: nO dimension unset. This "
"dimension refers to the output width, after the linear projection "
"has been applied.")
E905 = ("Cannot initialize StaticVectors layer: nM dimension unset. This "
"dimension refers to the width of the vectors table.")
E906 = ("Unexpected `loss` value in pretraining objective: '{found}'. Supported values "
"are: {supported}")
E908 = ("Can't set `spaces` without `words` in `Doc.__init__`.")
E909 = ("Expected {name} in parser internals. This is likely a bug in spaCy.")
E910 = ("Encountered NaN value when computing loss for component '{name}'.")
E911 = ("Invalid feature: {feat}. Must be a token attribute.")
E912 = ("Failed to initialize lemmatizer. Missing lemmatizer table(s) found "
"for mode '{mode}'. Required tables: {tables}. Found: {found}.")
E913 = ("Corpus path can't be None. Maybe you forgot to define it in your "
"config.cfg or override it on the CLI?")
E914 = ("Executing {name} callback failed. Expected the function to "
"return the nlp object but got: {value}. Maybe you forgot to return "
"the modified object in your function?")
E915 = ("Can't use score '{name}' to calculate final weighted score. Expected "
"float or int but got: {score_type}. To exclude the score from the "
"final score, set its weight to null in the [training.score_weights] "
"section of your training config.")
E916 = ("Can't log score for '{name}' in table: not a valid score ({score_type})")
E917 = ("Received invalid value {value} for `state_type` in "
"TransitionBasedParser: only 'parser' or 'ner' are valid options.")
E918 = ("Received invalid value for vocab: {vocab} ({vocab_type}). Valid "
"values are an instance of `spacy.vocab.Vocab` or True to create one"
" (default).")
E919 = ("A textcat `positive_label` '{pos_label}' was provided for training "
"data that does not appear to be a binary classification problem "
"with two labels. Labels found: {labels}")
E920 = ("The textcat's `positive_label` setting '{pos_label}' "
"does not match any label in the training data or provided during "
"initialization. Available labels: {labels}")
E921 = ("The method `set_output` can only be called on components that have "
"a Model with a `resize_output` attribute. Otherwise, the output "
"layer can not be dynamically changed.")
E922 = ("Component '{name}' has been initialized with an output dimension of "
"{nO} - cannot add any more labels.")
E923 = ("It looks like there is no proper sample data to initialize the "
"Model of component '{name}'. To check your input data paths and "
"annotation, run: python -m spacy debug data config.cfg "
"and include the same config override values you would specify "
"for the 'spacy train' command.")
E924 = ("The '{name}' component does not seem to be initialized properly. "
"This is likely a bug in spaCy, so feel free to open an issue: "
"https://github.com/explosion/spaCy/issues")
E925 = ("Invalid color values for displaCy visualizer: expected dictionary "
"mapping label names to colors but got: {obj}")
E926 = ("It looks like you're trying to modify `nlp.{attr}` directly. This "
"doesn't work because it's an immutable computed property. If you "
"need to modify the pipeline, use the built-in methods like "
"`nlp.add_pipe`, `nlp.remove_pipe`, `nlp.disable_pipe` or "
"`nlp.enable_pipe` instead.")
E927 = ("Can't write to frozen list Maybe you're trying to modify a computed "
"property or default function argument?")
E928 = ("A KnowledgeBase can only be serialized to/from from a directory, "
"but the provided argument {loc} points to a file.")
E929 = ("Couldn't read KnowledgeBase from {loc}. The path does not seem to exist.")
E930 = ("Received invalid get_examples callback in `{method}`. "
"Expected function that returns an iterable of Example objects but "
"got: {obj}")
E931 = ("Encountered {parent} subclass without `{parent}.{method}` "
"method in component '{name}'. If you want to use this "
"method, make sure it's overwritten on the subclass.")
E940 = ("Found NaN values in scores.")
E941 = ("Can't find model '{name}'. It looks like you're trying to load a "
"model from a shortcut, which is obsolete as of spaCy v3.0. To "
"load the model, use its full name instead:\n\n"
"nlp = spacy.load(\"{full}\")\n\nFor more details on the available "
"models, see the models directory: https://spacy.io/models. If you "
"want to create a blank model, use spacy.blank: "
"nlp = spacy.blank(\"{name}\")")
E942 = ("Executing `after_{name}` callback failed. Expected the function to "
"return an initialized nlp object but got: {value}. Maybe "
"you forgot to return the modified object in your function?")
E943 = ("Executing `before_creation` callback failed. Expected the function to "
"return an uninitialized Language subclass but got: {value}. Maybe "
"you forgot to return the modified object in your function or "
"returned the initialized nlp object instead?")
E944 = ("Can't copy pipeline component '{name}' from source '{model}': "
"not found in pipeline. Available components: {opts}")
E945 = ("Can't copy pipeline component '{name}' from source. Expected "
"loaded nlp object, but got: {source}")
E947 = ("`Matcher.add` received invalid `greedy` argument: expected "
"a string value from {expected} but got: '{arg}'")
E948 = ("`Matcher.add` received invalid 'patterns' argument: expected "
"a list, but got: {arg_type}")
E949 = ("Unable to align tokens for the predicted and reference docs. It "
"is only possible to align the docs when both texts are the same "
"except for whitespace and capitalization. The predicted tokens "
"start with: {x}. The reference tokens start with: {y}.")
E952 = ("The section '{name}' is not a valid section in the provided config.")
E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}")
E954 = ("The Tok2Vec listener did not receive any valid input from an upstream "
"component.")
E955 = ("Can't find table(s) {table} for language '{lang}' in "
"spacy-lookups-data. Make sure you have the package installed or "
"provide your own lookup tables if no default lookups are available "
"for your language.")
E956 = ("Can't find component '{name}' in [components] block in the config. "
"Available components: {opts}")
E957 = ("Writing directly to `Language.factories` isn't needed anymore in "
"spaCy v3. Instead, you can use the `@Language.factory` decorator "
"to register your custom component factory or `@Language.component` "
"to register a simple stateless function component that just takes "
"a Doc and returns it.")
E958 = ("Language code defined in config ({bad_lang_code}) does not match "
"language code of current Language subclass {lang} ({lang_code}). "
"If you want to create an nlp object from a config, make sure to "
"use the matching subclass with the language-specific settings and "
"data.")
E959 = ("Can't insert component {dir} index {idx}. Existing components: {opts}")
E960 = ("No config data found for component '{name}'. This is likely a bug "
"in spaCy.")
E961 = ("Found non-serializable Python object in config. Configs should "
"only include values that can be serialized to JSON. If you need "
"to pass models or other objects to your component, use a reference "
"to a registered function or initialize the object in your "
"component.\n\n{config}")
E962 = ("Received incorrect {style} for pipe '{name}'. Expected dict, "
"got: {cfg_type}.")
E963 = ("Can't read component info from `@Language.{decorator}` decorator. "
"Maybe you forgot to call it? Make sure you're using "
"`@Language.{decorator}()` instead of `@Language.{decorator}`.")
E964 = ("The pipeline component factory for '{name}' needs to have the "
"following named arguments, which are passed in by spaCy:\n- nlp: "
"receives the current nlp object and lets you access the vocab\n- "
"name: the name of the component instance, can be used to identify "
"the component, output losses etc.")
E965 = ("It looks like you're using the `@Language.component` decorator to "
"register '{name}' on a class instead of a function component. If "
"you need to register a class or function that *returns* a component "
"function, use the `@Language.factory` decorator instead.")
E966 = ("`nlp.add_pipe` now takes the string name of the registered component "
"factory, not a callable component. Expected string, but got "
"{component} (name: '{name}').\n\n- If you created your component "
"with `nlp.create_pipe('name')`: remove nlp.create_pipe and call "
"`nlp.add_pipe('name')` instead.\n\n- If you passed in a component "
"like `TextCategorizer()`: call `nlp.add_pipe` with the string name "
"instead, e.g. `nlp.add_pipe('textcat')`.\n\n- If you're using a custom "
"component: Add the decorator `@Language.component` (for function "
"components) or `@Language.factory` (for class components / factories) "
"to your custom component and assign it a name, e.g. "
"`@Language.component('your_name')`. You can then run "
"`nlp.add_pipe('your_name')` to add it to the pipeline.")
E967 = ("No {meta} meta information found for '{name}'. This is likely a bug in spaCy.")
E968 = ("`nlp.replace_pipe` now takes the string name of the registered component "
"factory, not a callable component. Expected string, but got "
"{component}.\n\n- If you created your component with"
"with `nlp.create_pipe('name')`: remove `nlp.create_pipe` and call "
"`nlp.replace_pipe('{name}', 'name')` instead.\n\n- If you passed in a "
"component like `TextCategorizer()`: call `nlp.replace_pipe` with the "
"string name instead, e.g. `nlp.replace_pipe('{name}', 'textcat')`.\n\n"
"- If you're using a custom component: Add the decorator "
"`@Language.component` (for function components) or `@Language.factory` "
"(for class components / factories) to your custom component and "
"assign it a name, e.g. `@Language.component('your_name')`. You can "
"then run `nlp.replace_pipe('{name}', 'your_name')`.")
E969 = ("Expected string values for field '{field}', but received {types} instead. ")
E970 = ("Can not execute command '{str_command}'. Do you have '{tool}' installed?")
E971 = ("Found incompatible lengths in `Doc.from_array`: {array_length} for the "
"array and {doc_length} for the Doc itself.")
E972 = ("`Example.__init__` got None for '{arg}'. Requires Doc.")
E973 = ("Unexpected type for NER data")
E974 = ("Unknown {obj} attribute: {key}")
E976 = ("The method `Example.from_dict` expects a {type} as {n} argument, "
"but received None.")
E977 = ("Can not compare a MorphAnalysis with a string object. "
"This is likely a bug in spaCy, so feel free to open an issue: "
"https://github.com/explosion/spaCy/issues")
E978 = ("The {name} method takes a list of Example objects, but got: {types}")
E980 = ("Each link annotation should refer to a dictionary with at most one "
"identifier mapping to 1.0, and all others to 0.0.")
E981 = ("The offsets of the annotations for `links` could not be aligned "
"to token boundaries.")
E982 = ("The `Token.ent_iob` attribute should be an integer indexing "
"into {values}, but found {value}.")
E983 = ("Invalid key(s) for '{dict}': {key}. Available keys: "
"{keys}")
E984 = ("Invalid component config for '{name}': component block needs either "
"a key `factory` specifying the registered function used to "
"initialize the component, or a key `source` key specifying a "
"spaCy model to copy the component from. For example, `factory = "
"\"ner\"` will use the 'ner' factory and all other settings in the "
"block will be passed to it as arguments. Alternatively, `source = "
"\"en_core_web_sm\"` will copy the component from that model.\n\n{config}")
E985 = ("Can't load model from config file: no [nlp] section found.\n\n{config}")
E986 = ("Could not create any training batches: check your input. "
"Are the train and dev paths defined? Is `discard_oversize` set appropriately? ")
E989 = ("`nlp.update()` was called with two positional arguments. This "
"may be due to a backwards-incompatible change to the format "
"of the training data in spaCy 3.0 onwards. The 'update' "
"function should now be called with a batch of Example "
"objects, instead of `(text, annotation)` tuples. ")
E991 = ("The function `nlp.select_pipes` should be called with either a "
"`disable` argument to list the names of the pipe components "
"that should be disabled, or with an 'enable' argument that "
"specifies which pipes should not be disabled.")
E992 = ("The function `select_pipes` was called with `enable`={enable} "
"and `disable`={disable} but that information is conflicting "
"for the `nlp` pipeline with components {names}.")
E993 = ("The config for the nlp object needs to include a key `lang` specifying "
"the code of the language to initialize it with (for example "
"'en' for English) - this can't be None.\n\n{config}")
E997 = ("Tokenizer special cases are not allowed to modify the text. "
"This would map '{chunk}' to '{orth}' given token attributes "
"'{token_attrs}'.")
E999 = ("Unable to merge the Doc objects because they do not all share "
"the same `Vocab`.")
E1000 = ("The Chinese word segmenter is pkuseg but no pkuseg model was "
"loaded. Provide the name of a pretrained model or the path to "
"a model and initialize the pipeline:\n\n"
'nlp.tokenizer.initialize(pkuseg_model="default")')
E1001 = ("Target token outside of matched span for match with tokens "
"'{span}' and offset '{index}' matched by patterns '{patterns}'.")
E1002 = ("Span index out of range.")
E1003 = ("Unsupported lemmatizer mode '{mode}'.")
E1004 = ("Missing lemmatizer table(s) found for lemmatizer mode '{mode}'. "
"Required tables: {tables}. Found: {found}. Maybe you forgot to "
"call `nlp.initialize()` to load in the data?")
E1005 = ("Unable to set attribute '{attr}' in tokenizer exception for "
"'{chunk}'. Tokenizer exceptions are only allowed to specify "
"ORTH and NORM.")
E1007 = ("Unsupported DependencyMatcher operator '{op}'.")
E1008 = ("Invalid pattern: each pattern should be a list of dicts. Check "
"that you are providing a list of patterns as `List[List[dict]]`.")
E1010 = ("Unable to set entity information for token {i} which is included "
"in more than one span in entities, blocked, missing or outside.")
E1011 = ("Unsupported default '{default}' in `doc.set_ents`. Available "
"options: {modes}")
E1012 = ("Entity spans and blocked/missing/outside spans should be "
"provided to `doc.set_ents` as lists of Span objects.")
E1013 = ("Invalid morph: the MorphAnalysis must have the same vocab as the "
"token itself. To set the morph from this MorphAnalysis, set from "
"the string value with: `token.set_morph(str(other_morph))`.")
E1014 = ("Error loading DocBin data. It doesn't look like the data is in "
"DocBin (.spacy) format. If your data is in spaCy v2's JSON "
"training format, convert it using `python -m spacy convert "
"file.json .`.")
E1015 = ("Can't initialize model from config: no {value} found. For more "
"information, run: python -m spacy debug config config.cfg")
E1016 = ("The operators 'OP': '?', '*', and '+' are not supported in "
"DependencyMatcher token patterns. The token pattern in "
"RIGHT_ATTR should return matches that are each exactly one token "
"long. Invalid pattern:\n{node}")
E1017 = ("A Doc object requires both 'deps' and 'heads' for dependency "
"parses. If no dependency labels are available, provide "
"placeholder deps such as `deps=[\"dep\"]*len(heads)`.")
E1018 = ("Knowledge base for component '{name}' is not set. "
"Make sure either `nel.initialize` or `nel.set_kb` "
"is called with a `kb_loader` function.")
E1019 = ("`noun_chunks` requires the pos tagging, which requires a "
"statistical model to be installed and loaded. For more info, see "
"the documentation:\nhttps://spacy.io/usage/models")
E1020 = ("No `epoch_resume` value specified and could not infer one from "
"filename. Specify an epoch to resume from.")
E1021 = ("`pos` value \"{pp}\" is not a valid Universal Dependencies tag. "
"Non-UD tags should use the `tag` property.")
E1022 = ("Words must be of type str or int, but input is of type '{wtype}'")
# Deprecated model shortcuts, only used in errors and warnings
OLD_MODEL_SHORTCUTS = {
"en": "en_core_web_sm", "de": "de_core_news_sm", "es": "es_core_news_sm",
"pt": "pt_core_news_sm", "fr": "fr_core_news_sm", "it": "it_core_news_sm",
"nl": "nl_core_news_sm", "el": "el_core_news_sm", "nb": "nb_core_news_sm",
"lt": "lt_core_news_sm", "xx": "xx_ent_wiki_sm"
}
# fmt: on
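# A compact sketch of the registration pattern that E957, E964-E966 and E968
# above describe. The component name "your_name" is a placeholder; shown as a
# comment because importing Language in this module would be circular.
#
#     from spacy.language import Language
#
#     @Language.component("your_name")
#     def your_component(doc):
#         # stateless function component: receives a Doc and returns it
#         return doc
#
#     nlp.add_pipe("your_name")  # add by registered string name, not a callable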
class MatchPatternError(ValueError):
def __init__(self, key, errors):
"""Custom error for validating match patterns.
key (str): The name of the matcher rule.
errors (dict): Validation errors (sequence of strings) mapped to pattern
ID, i.e. the index of the added pattern.
"""
msg = f"Invalid token patterns for matcher rule '{key}'\n"
for pattern_idx, error_msgs in errors.items():
pattern_errors = "\n".join([f"- {e}" for e in error_msgs])
msg += f"\nPattern {pattern_idx}:\n{pattern_errors}\n"
ValueError.__init__(self, msg)
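# Usage sketch for MatchPatternError; the `errors` mapping is a made-up example
# of what pattern validation might report for pattern index 0:
#
#     errors = {0: ["Additional properties are not allowed ('FOO' was unexpected)"]}
#     raise MatchPatternError("BAD_RULE", errors)
#
# which produces a message like:
#
#     Invalid token patterns for matcher rule 'BAD_RULE'
#
#     Pattern 0:
#     - Additional properties are not allowed ('FOO' was unexpected)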
# THIS FILE IS GENERATED FROM SPACY SETUP.PY
#
GIT_VERSION = "0fc3dee77"
def explain(term):
"""Get a description for a given POS tag, dependency label or entity type.
term (str): The term to explain.
RETURNS (str): The explanation, or `None` if not found in the glossary.
EXAMPLE:
>>> spacy.explain(u'NORP')
>>> doc = nlp(u'Hello world')
>>> print([(w.text, w.tag_, spacy.explain(w.tag_)) for w in doc])
"""
if term in GLOSSARY:
return GLOSSARY[term]
GLOSSARY = {
# POS tags
# Universal POS Tags
# http://universaldependencies.org/u/pos/
"ADJ": "adjective",
"ADP": "adposition",
"ADV": "adverb",
"AUX": "auxiliary",
"CONJ": "conjunction",
"CCONJ": "coordinating conjunction",
"DET": "determiner",
"INTJ": "interjection",
"NOUN": "noun",
"NUM": "numeral",
"PART": "particle",
"PRON": "pronoun",
"PROPN": "proper noun",
"PUNCT": "punctuation",
"SCONJ": "subordinating conjunction",
"SYM": "symbol",
"VERB": "verb",
"X": "other",
"EOL": "end of line",
"SPACE": "space",
# POS tags (English)
# OntoNotes 5 / Penn Treebank
# https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
".": "punctuation mark, sentence closer",
",": "punctuation mark, comma",
"-LRB-": "left round bracket",
"-RRB-": "right round bracket",
"``": "opening quotation mark",
'""': "closing quotation mark",
"''": "closing quotation mark",
":": "punctuation mark, colon or ellipsis",
"$": "symbol, currency",
"#": "symbol, number sign",
"AFX": "affix",
"CC": "conjunction, coordinating",
"CD": "cardinal number",
"DT": "determiner",
"EX": "existential there",
"FW": "foreign word",
"HYPH": "punctuation mark, hyphen",
"IN": "conjunction, subordinating or preposition",
"JJ": "adjective (English), other noun-modifier (Chinese)",
"JJR": "adjective, comparative",
"JJS": "adjective, superlative",
"LS": "list item marker",
"MD": "verb, modal auxiliary",
"NIL": "missing tag",
"NN": "noun, singular or mass",
"NNP": "noun, proper singular",
"NNPS": "noun, proper plural",
"NNS": "noun, plural",
"PDT": "predeterminer",
"POS": "possessive ending",
"PRP": "pronoun, personal",
"PRP$": "pronoun, possessive",
"RB": "adverb",
"RBR": "adverb, comparative",
"RBS": "adverb, superlative",
"RP": "adverb, particle",
"TO": 'infinitival "to"',
"UH": "interjection",
"VB": "verb, base form",
"VBD": "verb, past tense",
"VBG": "verb, gerund or present participle",
"VBN": "verb, past participle",
"VBP": "verb, non-3rd person singular present",
"VBZ": "verb, 3rd person singular present",
"WDT": "wh-determiner",
"WP": "wh-pronoun, personal",
"WP$": "wh-pronoun, possessive",
"WRB": "wh-adverb",
"SP": "space (English), sentence-final particle (Chinese)",
"ADD": "email",
"NFP": "superfluous punctuation",
"GW": "additional word in multi-word expression",
"XX": "unknown",
"BES": 'auxiliary "be"',
"HVS": 'forms of "have"',
"_SP": "whitespace",
# POS Tags (German)
# TIGER Treebank
# http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/tiger_introduction.pdf
"$(": "other sentence-internal punctuation mark",
"$,": "comma",
"$.": "sentence-final punctuation mark",
"ADJA": "adjective, attributive",
"ADJD": "adjective, adverbial or predicative",
"APPO": "postposition",
"APPR": "preposition; circumposition left",
"APPRART": "preposition with article",
"APZR": "circumposition right",
"ART": "definite or indefinite article",
"CARD": "cardinal number",
"FM": "foreign language material",
"ITJ": "interjection",
"KOKOM": "comparative conjunction",
"KON": "coordinate conjunction",
"KOUI": 'subordinate conjunction with "zu" and infinitive',
"KOUS": "subordinate conjunction with sentence",
"NE": "proper noun",
"NNE": "proper noun",
"PAV": "pronominal adverb",
"PROAV": "pronominal adverb",
"PDAT": "attributive demonstrative pronoun",
"PDS": "substituting demonstrative pronoun",
"PIAT": "attributive indefinite pronoun without determiner",
"PIDAT": "attributive indefinite pronoun with determiner",
"PIS": "substituting indefinite pronoun",
"PPER": "non-reflexive personal pronoun",
"PPOSAT": "attributive possessive pronoun",
"PPOSS": "substituting possessive pronoun",
"PRELAT": "attributive relative pronoun",
"PRELS": "substituting relative pronoun",
"PRF": "reflexive personal pronoun",
"PTKA": "particle with adjective or adverb",
"PTKANT": "answer particle",
"PTKNEG": "negative particle",
"PTKVZ": "separable verbal particle",
"PTKZU": '"zu" before infinitive',
"PWAT": "attributive interrogative pronoun",
"PWAV": "adverbial interrogative or relative pronoun",
"PWS": "substituting interrogative pronoun",
"TRUNC": "word remnant",
"VAFIN": "finite verb, auxiliary",
"VAIMP": "imperative, auxiliary",
"VAINF": "infinitive, auxiliary",
"VAPP": "perfect participle, auxiliary",
"VMFIN": "finite verb, modal",
"VMINF": "infinitive, modal",
"VMPP": "perfect participle, modal",
"VVFIN": "finite verb, full",
"VVIMP": "imperative, full",
"VVINF": "infinitive, full",
"VVIZU": 'infinitive with "zu", full',
"VVPP": "perfect participle, full",
"XY": "non-word containing non-letter",
# POS Tags (Chinese)
# OntoNotes / Chinese Penn Treebank
# https://repository.upenn.edu/cgi/viewcontent.cgi?article=1039&context=ircs_reports
"AD": "adverb",
"AS": "aspect marker",
"BA": "把 in ba-construction",
# "CD": "cardinal number",
"CS": "subordinating conjunction",
"DEC": "的 in a relative clause",
"DEG": "associative 的",
"DER": "得 in V-de const. and V-de-R",
"DEV": "地 before VP",
"ETC": "for words 等, 等等",
# "FW": "foreign words"
"IJ": "interjection",
# "JJ": "other noun-modifier",
"LB": "被 in long bei-const",
"LC": "localizer",
"M": "measure word",
"MSP": "other particle",
# "NN": "common noun",
"NR": "proper noun",
"NT": "temporal noun",
"OD": "ordinal number",
"ON": "onomatopoeia",
"P": "preposition excluding 把 and 被",
"PN": "pronoun",
"PU": "punctuation",
"SB": "被 in short bei-const",
# "SP": "sentence-final particle",
"VA": "predicative adjective",
"VC": "是 (copula)",
"VE": "有 as the main verb",
"VV": "other verb",
# Noun chunks
"NP": "noun phrase",
"PP": "prepositional phrase",
"VP": "verb phrase",
"ADVP": "adverb phrase",
"ADJP": "adjective phrase",
"SBAR": "subordinating conjunction",
"PRT": "particle",
"PNP": "prepositional noun phrase",
# Dependency Labels (English)
# ClearNLP / Universal Dependencies
# https://github.com/clir/clearnlp-guidelines/blob/master/md/specifications/dependency_labels.md
"acl": "clausal modifier of noun (adjectival clause)",
"acomp": "adjectival complement",
"advcl": "adverbial clause modifier",
"advmod": "adverbial modifier",
"agent": "agent",
"amod": "adjectival modifier",
"appos": "appositional modifier",
"attr": "attribute",
"aux": "auxiliary",
"auxpass": "auxiliary (passive)",
"case": "case marking",
"cc": "coordinating conjunction",
"ccomp": "clausal complement",
"clf": "classifier",
"complm": "complementizer",
"compound": "compound",
"conj": "conjunct",
"cop": "copula",
"csubj": "clausal subject",
"csubjpass": "clausal subject (passive)",
"dative": "dative",
"dep": "unclassified dependent",
"det": "determiner",
"discourse": "discourse element",
"dislocated": "dislocated elements",
"dobj": "direct object",
"expl": "expletive",
"fixed": "fixed multiword expression",
"flat": "flat multiword expression",
"goeswith": "goes with",
"hmod": "modifier in hyphenation",
"hyph": "hyphen",
"infmod": "infinitival modifier",
"intj": "interjection",
"iobj": "indirect object",
"list": "list",
"mark": "marker",
"meta": "meta modifier",
"neg": "negation modifier",
"nmod": "modifier of nominal",
"nn": "noun compound modifier",
"npadvmod": "noun phrase as adverbial modifier",
"nsubj": "nominal subject",
"nsubjpass": "nominal subject (passive)",
"nounmod": "modifier of nominal",
"npmod": "noun phrase as adverbial modifier",
"num": "number modifier",
"number": "number compound modifier",
"nummod": "numeric modifier",
"oprd": "object predicate",
"obj": "object",
"obl": "oblique nominal",
"orphan": "orphan",
"parataxis": "parataxis",
"partmod": "participal modifier",
"pcomp": "complement of preposition",
"pobj": "object of preposition",
"poss": "possession modifier",
"possessive": "possessive modifier",
"preconj": "pre-correlative conjunction",
"prep": "prepositional modifier",
"prt": "particle",
"punct": "punctuation",
"quantmod": "modifier of quantifier",
"rcmod": "relative clause modifier",
"relcl": "relative clause modifier",
"reparandum": "overridden disfluency",
"root": "root",
"vocative": "vocative",
"xcomp": "open clausal complement",
# Dependency labels (German)
# TIGER Treebank
# http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/tiger_introduction.pdf
# currently missing: 'cc' (comparative complement) because of conflict
# with English labels
"ac": "adpositional case marker",
"adc": "adjective component",
"ag": "genitive attribute",
"ams": "measure argument of adjective",
"app": "apposition",
"avc": "adverbial phrase component",
"cd": "coordinating conjunction",
"cj": "conjunct",
"cm": "comparative conjunction",
"cp": "complementizer",
"cvc": "collocational verb construction",
"da": "dative",
"dh": "discourse-level head",
"dm": "discourse marker",
"ep": "expletive es",
"hd": "head",
"ju": "junctor",
"mnr": "postnominal modifier",
"mo": "modifier",
"ng": "negation",
"nk": "noun kernel element",
"nmc": "numerical component",
"oa": "accusative object",
"oc": "clausal object",
"og": "genitive object",
"op": "prepositional object",
"par": "parenthetical element",
"pd": "predicate",
"pg": "phrasal genitive",
"ph": "placeholder",
"pm": "morphological particle",
"pnc": "proper noun component",
"rc": "relative clause",
"re": "repeated element",
"rs": "reported speech",
"sb": "subject",
"sb": "subject",
"sbp": "passivized subject (PP)",
"sp": "subject or predicate",
"svp": "separable verb prefix",
"uc": "unit component",
"vo": "vocative",
# Named Entity Recognition
# OntoNotes 5
# https://catalog.ldc.upenn.edu/docs/LDC2013T19/OntoNotes-Release-5.0.pdf
"PERSON": "People, including fictional",
"NORP": "Nationalities or religious or political groups",
"FACILITY": "Buildings, airports, highways, bridges, etc.",
"FAC": "Buildings, airports, highways, bridges, etc.",
"ORG": "Companies, agencies, institutions, etc.",
"GPE": "Countries, cities, states",
"LOC": "Non-GPE locations, mountain ranges, bodies of water",
"PRODUCT": "Objects, vehicles, foods, etc. (not services)",
"EVENT": "Named hurricanes, battles, wars, sports events, etc.",
"WORK_OF_ART": "Titles of books, songs, etc.",
"LAW": "Named documents made into laws.",
"LANGUAGE": "Any named language",
"DATE": "Absolute or relative dates or periods",
"TIME": "Times smaller than a day",
"PERCENT": 'Percentage, including "%"',
"MONEY": "Monetary values, including unit",
"QUANTITY": "Measurements, as of weight or distance",
"ORDINAL": '"first", "second", etc.',
"CARDINAL": "Numerals that do not fall under another type",
# Named Entity Recognition
# Wikipedia
# http://www.sciencedirect.com/science/article/pii/S0004370212000276
# https://pdfs.semanticscholar.org/5744/578cc243d92287f47448870bb426c66cc941.pdf
"PER": "Named person or family.",
"MISC": "Miscellaneous entities, e.g. events, nationalities, products or works of art",
# https://github.com/ltgoslo/norne
"EVT": "Festivals, cultural events, sports events, weather phenomena, wars, etc.",
"PROD": "Product, i.e. artificially produced entities including speeches, radio shows, programming languages, contracts, laws and ideas",
"DRV": "Words (and phrases?) that are dervied from a name, but not a name in themselves, e.g. 'Oslo-mannen' ('the man from Oslo')",
"GPE_LOC": "Geo-political entity, with a locative sense, e.g. 'John lives in Spain'",
"GPE_ORG": "Geo-political entity, with an organisation sense, e.g. 'Spain declined to meet with Belgium'",
}
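# Usage sketch for explain() with the glossary above (return values are taken
# directly from GLOSSARY; unknown terms fall through and return None):
#
#     >>> explain("NORP")
#     'Nationalities or religious or political groups'
#     >>> explain("amod")
#     'adjectival modifier'
#     >>> explain("not-a-term") is None
#     True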
"""Knowledge-base for entity or concept linking."""
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
from libcpp.vector cimport vector
from libc.stdint cimport int32_t, int64_t
from libc.stdio cimport FILE
from .vocab cimport Vocab
from .typedefs cimport hash_t
from .structs cimport KBEntryC, AliasC
ctypedef vector[KBEntryC] entry_vec
ctypedef vector[AliasC] alias_vec
ctypedef vector[float] float_vec
ctypedef vector[float_vec] float_matrix
# Object used by the Entity Linker that summarizes one entity-alias candidate combination.
cdef class Candidate:
cdef readonly KnowledgeBase kb
cdef hash_t entity_hash
cdef float entity_freq
cdef vector[float] entity_vector
cdef hash_t alias_hash
cdef float prior_prob
cdef class KnowledgeBase:
cdef Pool mem
cdef readonly Vocab vocab
cdef int64_t entity_vector_length
# This maps 64bit keys (hash of unique entity string)
# to 64bit values (position of the _KBEntryC struct in the _entries vector).
# The PreshMap is pretty space efficient, as it uses open addressing. So
# the only overhead is the vacancy rate, which is approximately 30%.
cdef PreshMap _entry_index
# Each entry takes 128 bits, and again we'll have a 30% or so overhead for
# over allocation.
# In total we end up with (N*128*1.3)+(N*128*1.3) bits for N entries.
# Storing 1m entries would take 41.6mb under this scheme.
cdef entry_vec _entries
# This maps 64bit keys (hash of unique alias string)
# to 64bit values (position of the _AliasC struct in the _aliases_table vector).
cdef PreshMap _alias_index
# This should map mention hashes to (entry_id, prob) tuples. The probability
# should be P(entity | mention), which is pretty important to know.
# We can pack both pieces of information into a 64-bit value, to keep things
# efficient.
cdef alias_vec _aliases_table
# This is the part which might take more space: storing various
# categorical features for the entries, and storing vectors for disambiguation
# and possibly usage.
# If each entry gets a 300-dimensional vector, for 1m entries we would need
# 1.2gb. That gets expensive fast. What might be better is to avoid learning
# a unique vector for every entity. We could instead have a compositional
# model, that embeds different features of the entities into vectors. We'll
# still want some per-entity features, like the Wikipedia text or entity
# co-occurrence. Hopefully those vectors can be narrow, e.g. 64 dimensions.
cdef float_matrix _vectors_table
# It's very useful to track categorical features, at least for output, even
# if they're not useful in the model itself. For instance, we should be
# able to track stuff like a person's date of birth or whatever. This can
# easily make the KB bigger, but if this isn't needed by the model, and it's
# optional data, we can let users configure a DB as the backend for this.
cdef object _features_table
cdef inline int64_t c_add_vector(self, vector[float] entity_vector) nogil:
"""Add an entity vector to the vectors table."""
cdef int64_t new_index = self._vectors_table.size()
self._vectors_table.push_back(entity_vector)
return new_index
cdef inline int64_t c_add_entity(self, hash_t entity_hash, float freq,
int32_t vector_index, int feats_row) nogil:
"""Add an entry to the vector of entries.
After calling this method, make sure to also update the _entry_index using the return value."""
# This is what we'll map the entity hash key to. It's where the entry will sit
# in the vector of entries, so we can get it later.
cdef int64_t new_index = self._entries.size()
# Avoid struct initializer to enable nogil, cf https://github.com/cython/cython/issues/1642
cdef KBEntryC entry
entry.entity_hash = entity_hash
entry.vector_index = vector_index
entry.feats_row = feats_row
entry.freq = freq
self._entries.push_back(entry)
return new_index
cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs) nogil:
"""Connect a mention to a list of potential entities with their prior probabilities .
After calling this method, make sure to update also the _alias_index using the return value"""
# This is what we'll map the alias hash key to. It's where the alias will be defined
# in the vector of aliases.
cdef int64_t new_index = self._aliases_table.size()
# Avoid struct initializer to enable nogil
cdef AliasC alias
alias.entry_indices = entry_indices
alias.probs = probs
self._aliases_table.push_back(alias)
return new_index
cdef inline void _create_empty_vectors(self, hash_t dummy_hash) nogil:
"""
Initialize the vectors and make sure the first element of each vector is a dummy,
because the PreshMaps pointing to indices in these vectors cannot contain 0 as a value,
cf. https://github.com/explosion/preshed/issues/17
"""
cdef int32_t dummy_value = 0
# Avoid struct initializer to enable nogil
cdef KBEntryC entry
entry.entity_hash = dummy_hash
entry.vector_index = dummy_value
entry.feats_row = dummy_value
entry.freq = dummy_value
# Avoid struct initializer to enable nogil
cdef vector[int64_t] dummy_entry_indices
dummy_entry_indices.push_back(0)
cdef vector[float] dummy_probs
dummy_probs.push_back(0)
cdef AliasC alias
alias.entry_indices = dummy_entry_indices
alias.probs = dummy_probs
self._entries.push_back(entry)
self._aliases_table.push_back(alias)
cpdef set_entities(self, entity_list, freq_list, vector_list)
cdef class Writer:
cdef FILE* _fp
cdef int write_header(self, int64_t nr_entries, int64_t entity_vector_length) except -1
cdef int write_vector_element(self, float element) except -1
cdef int write_entry(self, hash_t entry_hash, float entry_freq, int32_t vector_index) except -1
cdef int write_alias_length(self, int64_t alias_length) except -1
cdef int write_alias_header(self, hash_t alias_hash, int64_t candidate_length) except -1
cdef int write_alias(self, int64_t entry_index, float prob) except -1
cdef int _write(self, void* value, size_t size) except -1
cdef class Reader:
cdef FILE* _fp
cdef int read_header(self, int64_t* nr_entries, int64_t* entity_vector_length) except -1
cdef int read_vector_element(self, float* element) except -1
cdef int read_entry(self, hash_t* entity_hash, float* freq, int32_t* vector_index) except -1
cdef int read_alias_length(self, int64_t* alias_length) except -1
cdef int read_alias_header(self, hash_t* alias_hash, int64_t* candidate_length) except -1
cdef int read_alias(self, int64_t* entry_index, float* prob) except -1
cdef int _read(self, void* value, size_t size) except -1
# cython: infer_types=True, profile=True
from typing import Iterator, Iterable, Callable, Dict, Any
import srsly
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
from cpython.exc cimport PyErr_SetFromErrno
from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek
from libc.stdint cimport int32_t, int64_t
from libcpp.vector cimport vector
from pathlib import Path
import warnings
from .typedefs cimport hash_t
from .errors import Errors, Warnings
from . import util
from .util import SimpleFrozenList, ensure_path
cdef class Candidate:
"""A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking
algorithm which will disambiguate the various candidates to the correct one.
Each candidate (alias, entity) pair is assigned a certain prior probability.
DOCS: https://spacy.io/api/kb/#candidate_init
"""
def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob):
self.kb = kb
self.entity_hash = entity_hash
self.entity_freq = entity_freq
self.entity_vector = entity_vector
self.alias_hash = alias_hash
self.prior_prob = prior_prob
@property
def entity(self):
"""RETURNS (uint64): hash of the entity's KB ID/name"""
return self.entity_hash
@property
def entity_(self):
"""RETURNS (str): ID/name of this entity in the KB"""
return self.kb.vocab.strings[self.entity_hash]
@property
def alias(self):
"""RETURNS (uint64): hash of the alias"""
return self.alias_hash
@property
def alias_(self):
"""RETURNS (str): ID of the original alias"""
return self.kb.vocab.strings[self.alias_hash]
@property
def entity_freq(self):
return self.entity_freq
@property
def entity_vector(self):
return self.entity_vector
@property
def prior_prob(self):
return self.prior_prob
def get_candidates(KnowledgeBase kb, span) -> Iterator[Candidate]:
"""
Return candidate entities for a given span by using the text of the span as the alias
and fetching appropriate entries from the index.
This particular function is optimized to work with the built-in KB functionality,
but any other custom candidate generation method can be used in combination with the KB as well.
"""
return kb.get_alias_candidates(span.text)
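# Minimal end-to-end sketch of this candidate API (the entity ID, frequency,
# vector values and prior probability below are made up for illustration):
#
#     from spacy.kb import KnowledgeBase
#     from spacy.vocab import Vocab
#
#     vocab = Vocab()
#     kb = KnowledgeBase(vocab, entity_vector_length=3)
#     kb.add_entity("Q42", freq=12, entity_vector=[1.0, 0.0, 0.0])
#     kb.add_alias("Douglas", entities=["Q42"], probabilities=[0.8])
#     for candidate in kb.get_alias_candidates("Douglas"):
#         print(candidate.entity_, candidate.alias_, candidate.prior_prob)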
cdef class KnowledgeBase:
"""A `KnowledgeBase` instance stores unique identifiers for entities and their textual aliases,
to support entity linking of named entities to real-world concepts.
DOCS: https://spacy.io/api/kb
"""
def __init__(self, Vocab vocab, entity_vector_length):
"""Create a KnowledgeBase."""
self.mem = Pool()
self.entity_vector_length = entity_vector_length
self._entry_index = PreshMap()
self._alias_index = PreshMap()
self.vocab = vocab
self._create_empty_vectors(dummy_hash=self.vocab.strings[""])
def initialize_entities(self, int64_t nr_entities):
self._entry_index = PreshMap(nr_entities + 1)
self._entries = entry_vec(nr_entities + 1)
def initialize_vectors(self, int64_t nr_entities):
self._vectors_table = float_matrix(nr_entities + 1)
def initialize_aliases(self, int64_t nr_aliases):
self._alias_index = PreshMap(nr_aliases + 1)
self._aliases_table = alias_vec(nr_aliases + 1)
@property
def entity_vector_length(self):
"""RETURNS (uint64): length of the entity vectors"""
return self.entity_vector_length
def __len__(self):
return self.get_size_entities()
def get_size_entities(self):
return len(self._entry_index)
def get_entity_strings(self):
return [self.vocab.strings[x] for x in self._entry_index]
def get_size_aliases(self):
return len(self._alias_index)
def get_alias_strings(self):
return [self.vocab.strings[x] for x in self._alias_index]
def add_entity(self, str entity, float freq, vector[float] entity_vector):
"""
Add an entity to the KB, optionally specifying its log probability based on corpus frequency.
Return the hash of the entity ID/name at the end.
"""
cdef hash_t entity_hash = self.vocab.strings.add(entity)
# Return if this entity was added before
if entity_hash in self._entry_index:
warnings.warn(Warnings.W018.format(entity=entity))
return
# Raise an error if the provided entity vector is not of the correct length
if len(entity_vector) != self.entity_vector_length:
raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length))
vector_index = self.c_add_vector(entity_vector=entity_vector)
new_index = self.c_add_entity(entity_hash=entity_hash,
freq=freq,
vector_index=vector_index,
feats_row=-1) # Features table currently not implemented
self._entry_index[entity_hash] = new_index
return entity_hash
cpdef set_entities(self, entity_list, freq_list, vector_list):
if len(entity_list) != len(freq_list) or len(entity_list) != len(vector_list):
raise ValueError(Errors.E140)
nr_entities = len(set(entity_list))
self.initialize_entities(nr_entities)
self.initialize_vectors(nr_entities)
i = 0
cdef KBEntryC entry
cdef hash_t entity_hash
while i < len(entity_list):
# only process this entity if its unique ID hadn't been added before
entity_hash = self.vocab.strings.add(entity_list[i])
if entity_hash in self._entry_index:
warnings.warn(Warnings.W018.format(entity=entity_list[i]))
else:
entity_vector = vector_list[i]
if len(entity_vector) != self.entity_vector_length:
raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length))
entry.entity_hash = entity_hash
entry.freq = freq_list[i]
self._vectors_table[i] = entity_vector
entry.vector_index = i
entry.feats_row = -1 # Features table currently not implemented
self._entries[i+1] = entry
self._entry_index[entity_hash] = i+1
i += 1
def contains_entity(self, str entity):
cdef hash_t entity_hash = self.vocab.strings.add(entity)
return entity_hash in self._entry_index
def contains_alias(self, str alias):
cdef hash_t alias_hash = self.vocab.strings.add(alias)
return alias_hash in self._alias_index
def add_alias(self, str alias, entities, probabilities):
"""
For a given alias, add its potential entities and prior probabilities to the KB.
Return the alias_hash at the end.
"""
if alias is None or len(alias) == 0:
raise ValueError(Errors.E890.format(alias=alias))
previous_alias_nr = self.get_size_aliases()
# Throw an error if the lengths of entities and probabilities are not the same
if not len(entities) == len(probabilities):
raise ValueError(Errors.E132.format(alias=alias,
entities_length=len(entities),
probabilities_length=len(probabilities)))
# Throw an error if the probabilities sum up to more than 1 (allow for some rounding errors)
prob_sum = sum(probabilities)
if prob_sum > 1.00001:
raise ValueError(Errors.E133.format(alias=alias, sum=prob_sum))
cdef hash_t alias_hash = self.vocab.strings.add(alias)
# Check whether this alias was added before
if alias_hash in self._alias_index:
warnings.warn(Warnings.W017.format(alias=alias))
return
cdef vector[int64_t] entry_indices
cdef vector[float] probs
for entity, prob in zip(entities, probabilities):
entity_hash = self.vocab.strings[entity]
if not entity_hash in self._entry_index:
raise ValueError(Errors.E134.format(entity=entity))
entry_index = <int64_t>self._entry_index.get(entity_hash)
entry_indices.push_back(int(entry_index))
probs.push_back(float(prob))
new_index = self.c_add_aliases(alias_hash=alias_hash, entry_indices=entry_indices, probs=probs)
self._alias_index[alias_hash] = new_index
if previous_alias_nr + 1 != self.get_size_aliases():
raise RuntimeError(Errors.E891.format(alias=alias))
return alias_hash
def append_alias(self, str alias, str entity, float prior_prob, ignore_warnings=False):
"""
For an alias already existing in the KB, extend its potential entities with one more.
Throw a warning if either the alias or the entity is unknown,
or when the combination has already been recorded.
Throw an error if adding this entity with its prior probability would make the total exceed 1.
For efficiency, it's best to use the method `add_alias` as much as possible instead of this one.
"""
# Check if the alias exists in the KB
cdef hash_t alias_hash = self.vocab.strings[alias]
if not alias_hash in self._alias_index:
raise ValueError(Errors.E176.format(alias=alias))
# Check if the entity exists in the KB
cdef hash_t entity_hash = self.vocab.strings[entity]
if not entity_hash in self._entry_index:
raise ValueError(Errors.E134.format(entity=entity))
entry_index = <int64_t>self._entry_index.get(entity_hash)
# Throw an error if the prior probabilities (including the new one) sum up to more than 1
alias_index = <int64_t>self._alias_index.get(alias_hash)
alias_entry = self._aliases_table[alias_index]
current_sum = sum([p for p in alias_entry.probs])
new_sum = current_sum + prior_prob
if new_sum > 1.00001:
raise ValueError(Errors.E133.format(alias=alias, sum=new_sum))
entry_indices = alias_entry.entry_indices
is_present = False
for i in range(entry_indices.size()):
if entry_indices[i] == int(entry_index):
is_present = True
if is_present:
if not ignore_warnings:
warnings.warn(Warnings.W024.format(entity=entity, alias=alias))
else:
entry_indices.push_back(int(entry_index))
alias_entry.entry_indices = entry_indices
probs = alias_entry.probs
probs.push_back(float(prior_prob))
alias_entry.probs = probs
self._aliases_table[alias_index] = alias_entry
def get_alias_candidates(self, str alias) -> Iterator[Candidate]:
"""
Return candidate entities for an alias. Each candidate defines the entity, the original alias,
and the prior probability of that alias resolving to that entity.
If the alias is not known in the KB, an empty list is returned.
"""
cdef hash_t alias_hash = self.vocab.strings[alias]
if not alias_hash in self._alias_index:
return []
alias_index = <int64_t>self._alias_index.get(alias_hash)
alias_entry = self._aliases_table[alias_index]
return [Candidate(kb=self,
entity_hash=self._entries[entry_index].entity_hash,
entity_freq=self._entries[entry_index].freq,
entity_vector=self._vectors_table[self._entries[entry_index].vector_index],
alias_hash=alias_hash,
prior_prob=prior_prob)
for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs)
if entry_index != 0]
def get_vector(self, str entity):
cdef hash_t entity_hash = self.vocab.strings[entity]
# Return a vector of zeros if this entity is unknown in this KB
if entity_hash not in self._entry_index:
return [0] * self.entity_vector_length
entry_index = self._entry_index[entity_hash]
return self._vectors_table[self._entries[entry_index].vector_index]
def get_prior_prob(self, str entity, str alias):
""" Return the prior probability of a given alias being linked to a given entity,
or return 0.0 when this combination is not known in the knowledge base"""
cdef hash_t alias_hash = self.vocab.strings[alias]
cdef hash_t entity_hash = self.vocab.strings[entity]
if entity_hash not in self._entry_index or alias_hash not in self._alias_index:
return 0.0
alias_index = <int64_t>self._alias_index.get(alias_hash)
entry_index = self._entry_index[entity_hash]
alias_entry = self._aliases_table[alias_index]
for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs):
if self._entries[entry_index].entity_hash == entity_hash:
return prior_prob
return 0.0
def to_bytes(self, **kwargs):
"""Serialize the current state to a binary string.
"""
def serialize_header():
header = (self.get_size_entities(), self.get_size_aliases(), self.entity_vector_length)
return srsly.json_dumps(header)
def serialize_entries():
i = 1
tuples = []
for entry_hash, entry_index in sorted(self._entry_index.items(), key=lambda x: x[1]):
entry = self._entries[entry_index]
assert entry.entity_hash == entry_hash
assert entry_index == i
tuples.append((entry.entity_hash, entry.freq, entry.vector_index))
i = i + 1
return srsly.json_dumps(tuples)
def serialize_aliases():
i = 1
headers = []
indices_lists = []
probs_lists = []
for alias_hash, alias_index in sorted(self._alias_index.items(), key=lambda x: x[1]):
alias = self._aliases_table[alias_index]
assert alias_index == i
candidate_length = len(alias.entry_indices)
headers.append((alias_hash, candidate_length))
indices_lists.append(alias.entry_indices)
probs_lists.append(alias.probs)
i = i + 1
headers_dump = srsly.json_dumps(headers)
indices_dump = srsly.json_dumps(indices_lists)
probs_dump = srsly.json_dumps(probs_lists)
return srsly.json_dumps((headers_dump, indices_dump, probs_dump))
serializers = {
"header": serialize_header,
"entity_vectors": lambda: srsly.json_dumps(self._vectors_table),
"entries": serialize_entries,
"aliases": serialize_aliases,
}
return util.to_bytes(serializers, [])
def from_bytes(self, bytes_data, *, exclude=tuple()):
"""Load state from a binary string.
"""
def deserialize_header(b):
header = srsly.json_loads(b)
nr_entities = header[0]
nr_aliases = header[1]
entity_vector_length = header[2]
self.initialize_entities(nr_entities)
self.initialize_vectors(nr_entities)
self.initialize_aliases(nr_aliases)
self.entity_vector_length = entity_vector_length
def deserialize_vectors(b):
self._vectors_table = srsly.json_loads(b)
def deserialize_entries(b):
cdef KBEntryC entry
tuples = srsly.json_loads(b)
i = 1
for (entity_hash, freq, vector_index) in tuples:
entry.entity_hash = entity_hash
entry.freq = freq
entry.vector_index = vector_index
entry.feats_row = -1 # Features table currently not implemented
self._entries[i] = entry
self._entry_index[entity_hash] = i
i += 1
def deserialize_aliases(b):
cdef AliasC alias
i = 1
all_data = srsly.json_loads(b)
headers = srsly.json_loads(all_data[0])
indices = srsly.json_loads(all_data[1])
probs = srsly.json_loads(all_data[2])
for header, entry_indices, prior_probs in zip(headers, indices, probs):
alias_hash, candidate_length = header
alias.entry_indices = entry_indices
alias.probs = prior_probs
self._aliases_table[i] = alias
self._alias_index[alias_hash] = i
i += 1
setters = {
"header": deserialize_header,
"entity_vectors": deserialize_vectors,
"entries": deserialize_entries,
"aliases": deserialize_aliases,
}
util.from_bytes(bytes_data, setters, exclude)
return self
def to_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()):
path = ensure_path(path)
if not path.exists():
path.mkdir(parents=True)
if not path.is_dir():
raise ValueError(Errors.E928.format(loc=path))
serialize = {}
serialize["contents"] = lambda p: self.write_contents(p)
serialize["strings.json"] = lambda p: self.vocab.strings.to_disk(p)
util.to_disk(path, serialize, exclude)
def from_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()):
path = ensure_path(path)
if not path.exists():
raise ValueError(Errors.E929.format(loc=path))
if not path.is_dir():
raise ValueError(Errors.E928.format(loc=path))
deserialize: Dict[str, Callable[[Any], Any]] = {}
deserialize["contents"] = lambda p: self.read_contents(p)
deserialize["strings.json"] = lambda p: self.vocab.strings.from_disk(p)
util.from_disk(path, deserialize, exclude)
def write_contents(self, file_path):
cdef Writer writer = Writer(file_path)
writer.write_header(self.get_size_entities(), self.entity_vector_length)
# dumping the entity vectors in their original order
i = 0
for entity_vector in self._vectors_table:
for element in entity_vector:
writer.write_vector_element(element)
i = i+1
# dumping the entry records in the order in which they are in the _entries vector.
# index 0 is a dummy object not stored in the _entry_index and can be ignored.
i = 1
for entry_hash, entry_index in sorted(self._entry_index.items(), key=lambda x: x[1]):
entry = self._entries[entry_index]
assert entry.entity_hash == entry_hash
assert entry_index == i
writer.write_entry(entry.entity_hash, entry.freq, entry.vector_index)
i = i+1
writer.write_alias_length(self.get_size_aliases())
# dumping the aliases in the order in which they are in the _aliases_table vector.
# index 0 is a dummy object not stored in the _alias_index and can be ignored.
i = 1
for alias_hash, alias_index in sorted(self._alias_index.items(), key=lambda x: x[1]):
alias = self._aliases_table[alias_index]
assert alias_index == i
candidate_length = len(alias.entry_indices)
writer.write_alias_header(alias_hash, candidate_length)
for j in range(0, candidate_length):
writer.write_alias(alias.entry_indices[j], alias.probs[j])
i = i+1
writer.close()
def read_contents(self, file_path):
cdef hash_t entity_hash
cdef hash_t alias_hash
cdef int64_t entry_index
cdef float freq, prob
cdef int32_t vector_index
cdef KBEntryC entry
cdef AliasC alias
cdef float vector_element
cdef Reader reader = Reader(file_path)
# STEP 0: load header and initialize KB
cdef int64_t nr_entities
cdef int64_t entity_vector_length
reader.read_header(&nr_entities, &entity_vector_length)
self.initialize_entities(nr_entities)
self.initialize_vectors(nr_entities)
self.entity_vector_length = entity_vector_length
# STEP 1: load entity vectors
cdef int i = 0
cdef int j = 0
while i < nr_entities:
entity_vector = float_vec(entity_vector_length)
j = 0
while j < entity_vector_length:
reader.read_vector_element(&vector_element)
entity_vector[j] = vector_element
j = j+1
self._vectors_table[i] = entity_vector
i = i+1
# STEP 2: load entities
# we assume that the entity data was written in sequence
# index 0 is a dummy object not stored in the _entry_index and can be ignored.
i = 1
while i <= nr_entities:
reader.read_entry(&entity_hash, &freq, &vector_index)
entry.entity_hash = entity_hash
entry.freq = freq
entry.vector_index = vector_index
entry.feats_row = -1 # Features table currently not implemented
self._entries[i] = entry
self._entry_index[entity_hash] = i
i += 1
# check that all entities were read in properly
assert nr_entities == self.get_size_entities()
# STEP 3: load aliases
cdef int64_t nr_aliases
reader.read_alias_length(&nr_aliases)
self.initialize_aliases(nr_aliases)
cdef int64_t nr_candidates
cdef vector[int64_t] entry_indices
cdef vector[float] probs
i = 1
# we assume the alias data was written in sequence
# index 0 is a dummy object not stored in the _alias_index and can be ignored.
while i <= nr_aliases:
reader.read_alias_header(&alias_hash, &nr_candidates)
entry_indices = vector[int64_t](nr_candidates)
probs = vector[float](nr_candidates)
for j in range(0, nr_candidates):
reader.read_alias(&entry_index, &prob)
entry_indices[j] = entry_index
probs[j] = prob
alias.entry_indices = entry_indices
alias.probs = probs
self._aliases_table[i] = alias
self._alias_index[alias_hash] = i
i += 1
# check that all aliases were read in properly
assert nr_aliases == self.get_size_aliases()
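# Round-trip sketch for the to_disk/from_disk serialization above (the path is
# hypothetical; a fresh KB instance reloads the stored strings, entries and
# aliases, and entity_vector_length is overwritten from the stored header):
#
#     kb.to_disk("/tmp/my_kb")
#     kb2 = KnowledgeBase(Vocab(), entity_vector_length=1)
#     kb2.from_disk("/tmp/my_kb")
#     assert kb2.get_size_entities() == kb.get_size_entities()
#     assert kb2.get_size_aliases() == kb.get_size_aliases()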
cdef class Writer:
def __init__(self, path):
assert isinstance(path, Path)
content = bytes(path)
cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content
self._fp = fopen(<char*>bytes_loc, 'wb')
if not self._fp:
raise IOError(Errors.E146.format(path=path))
fseek(self._fp, 0, 0)
def close(self):
cdef size_t status = fclose(self._fp)
assert status == 0
cdef int write_header(self, int64_t nr_entries, int64_t entity_vector_length) except -1:
self._write(&nr_entries, sizeof(nr_entries))
self._write(&entity_vector_length, sizeof(entity_vector_length))
cdef int write_vector_element(self, float element) except -1:
self._write(&element, sizeof(element))
cdef int write_entry(self, hash_t entry_hash, float entry_freq, int32_t vector_index) except -1:
self._write(&entry_hash, sizeof(entry_hash))
self._write(&entry_freq, sizeof(entry_freq))
self._write(&vector_index, sizeof(vector_index))
# Features table currently not implemented and not written to file
cdef int write_alias_length(self, int64_t alias_length) except -1:
self._write(&alias_length, sizeof(alias_length))
cdef int write_alias_header(self, hash_t alias_hash, int64_t candidate_length) except -1:
self._write(&alias_hash, sizeof(alias_hash))
self._write(&candidate_length, sizeof(candidate_length))
cdef int write_alias(self, int64_t entry_index, float prob) except -1:
self._write(&entry_index, sizeof(entry_index))
self._write(&prob, sizeof(prob))
cdef int _write(self, void* value, size_t size) except -1:
status = fwrite(value, size, 1, self._fp)
assert status == 1, status
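# On-disk layout produced by Writer (via write_contents above) and consumed by
# Reader below: a header with the number of entries and the entity vector
# length, then all vector elements in order, then one (entity_hash, freq,
# vector_index) record per entry, then the number of aliases, and per alias a
# (alias_hash, candidate_length) header followed by candidate_length
# (entry_index, prior_prob) pairs. Values are written as raw C types in native
# byte order via fwrite, so the format is tied to the writing platform.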
cdef class Reader:
def __init__(self, path):
content = bytes(path)
cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content
self._fp = fopen(<char*>bytes_loc, 'rb')
if not self._fp:
PyErr_SetFromErrno(IOError)
status = fseek(self._fp, 0, 0) # this can be 0 if there is no header
def __dealloc__(self):
fclose(self._fp)
cdef int read_header(self, int64_t* nr_entries, int64_t* entity_vector_length) except -1:
status = self._read(nr_entries, sizeof(int64_t))
if status < 1:
if feof(self._fp):
return 0 # end of file
raise IOError(Errors.E145.format(param="header"))
status = self._read(entity_vector_length, sizeof(int64_t))
if status < 1:
if feof(self._fp):
return 0 # end of file
raise IOError(Errors.E145.format(param="vector length"))
cdef int read_vector_element(self, float* element) except -1:
status = self._read(element, sizeof(float))
if status < 1:
if feof(self._fp):
return 0 # end of file
raise IOError(Errors.E145.format(param="vector element"))
cdef int read_entry(self, hash_t* entity_hash, float* freq, int32_t* vector_index) except -1:
status = self._read(entity_hash, sizeof(hash_t))
if status < 1:
if feof(self._fp):
return 0 # end of file
raise IOError(Errors.E145.format(param="entity hash"))
status = self._read(freq, sizeof(float))
if status < 1:
if feof(self._fp):
return 0 # end of file
raise IOError(Errors.E145.format(param="entity freq"))
status = self._read(vector_index, sizeof(int32_t))
if status < 1:
if feof(self._fp):
return 0 # end of file
raise IOError(Errors.E145.format(param="vector index"))
if feof(self._fp):
return 0
else:
return 1
cdef int read_alias_length(self, int64_t* alias_length) except -1:
status = self._read(alias_length, sizeof(int64_t))
if status < 1:
if feof(self._fp):
return 0 # end of file
raise IOError(Errors.E145.format(param="alias length"))
cdef int read_alias_header(self, hash_t* alias_hash, int64_t* candidate_length) except -1:
status = self._read(alias_hash, sizeof(hash_t))
if status < 1:
if feof(self._fp):
return 0 # end of file
raise IOError(Errors.E145.format(param="alias hash"))
status = self._read(candidate_length, sizeof(int64_t))
if status < 1:
if feof(self._fp):
return 0 # end of file
raise IOError(Errors.E145.format(param="candidate length"))
cdef int read_alias(self, int64_t* entry_index, float* prob) except -1:
status = self._read(entry_index, sizeof(int64_t))
if status < 1:
if feof(self._fp):
return 0 # end of file
raise IOError(Errors.E145.format(param="entry index"))
status = self._read(prob, sizeof(float))
if status < 1:
if feof(self._fp):
return 0 # end of file
raise IOError(Errors.E145.format(param="prior probability"))
cdef int _read(self, void* value, size_t size) except -1:
status = fread(value, size, 1, self._fp)
return status
from typing import Iterator, Optional, Any, Dict, Callable, Iterable
from typing import Union, Tuple, List, Set, Pattern, Sequence
from typing import NoReturn, TYPE_CHECKING, TypeVar, cast, overload
from dataclasses import dataclass
import random
import itertools
import functools
from contextlib import contextmanager
from copy import deepcopy
from pathlib import Path
import warnings
from thinc.api import get_current_ops, Config, CupyOps, Optimizer
import srsly
import multiprocessing as mp
from itertools import chain, cycle
from timeit import default_timer as timer
import traceback
from . import ty
from .tokens.underscore import Underscore
from .vocab import Vocab, create_vocab
from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
from .training import Example, validate_examples
from .training.initialize import init_vocab, init_tok2vec
from .scorer import Scorer
from .util import registry, SimpleFrozenList, _pipe, raise_error
from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER
from .util import warn_if_jupyter_cupy
from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .lang.punctuation import TOKENIZER_INFIXES
from .tokens import Doc
from .tokenizer import Tokenizer
from .errors import Errors, Warnings
from .schemas import ConfigSchema, ConfigSchemaNlp, ConfigSchemaInit
from .schemas import ConfigSchemaPretrain, validate_init_settings
from .git_info import GIT_VERSION
from . import util
from . import about
from .lookups import load_lookups
from .compat import Literal
if TYPE_CHECKING:
from .pipeline import Pipe # noqa: F401
# This is the base config with all settings (training etc.)
DEFAULT_CONFIG_PATH = Path(__file__).parent / "default_config.cfg"
DEFAULT_CONFIG = util.load_config(DEFAULT_CONFIG_PATH)
# This is the base config for the [pretraining] block. It's currently not included
# in the main config and is only added via the 'init fill-config' command
DEFAULT_CONFIG_PRETRAIN_PATH = Path(__file__).parent / "default_config_pretraining.cfg"
# Type variable for contexts piped with documents
_AnyContext = TypeVar("_AnyContext")
class BaseDefaults:
"""Language data defaults, available via Language.Defaults. Can be
overwritten by language subclasses by defining their own subclasses of
Language.Defaults.
"""
config: Config = Config(section_order=CONFIG_SECTION_ORDER)
tokenizer_exceptions: Dict[str, List[dict]] = BASE_EXCEPTIONS
prefixes: Optional[Sequence[Union[str, Pattern]]] = TOKENIZER_PREFIXES
suffixes: Optional[Sequence[Union[str, Pattern]]] = TOKENIZER_SUFFIXES
infixes: Optional[Sequence[Union[str, Pattern]]] = TOKENIZER_INFIXES
token_match: Optional[Callable] = None
url_match: Optional[Callable] = URL_MATCH
syntax_iterators: Dict[str, Callable] = {}
lex_attr_getters: Dict[int, Callable[[str], Any]] = {}
stop_words: Set[str] = set()
writing_system = {"direction": "ltr", "has_case": True, "has_letters": True}
@registry.tokenizers("spacy.Tokenizer.v1")
def create_tokenizer() -> Callable[["Language"], Tokenizer]:
"""Registered function to create a tokenizer. Returns a factory that takes
the nlp object and returns a Tokenizer instance using the language defaults.
"""
def tokenizer_factory(nlp: "Language") -> Tokenizer:
prefixes = nlp.Defaults.prefixes
suffixes = nlp.Defaults.suffixes
infixes = nlp.Defaults.infixes
prefix_search = util.compile_prefix_regex(prefixes).search if prefixes else None
suffix_search = util.compile_suffix_regex(suffixes).search if suffixes else None
infix_finditer = util.compile_infix_regex(infixes).finditer if infixes else None
return Tokenizer(
nlp.vocab,
rules=nlp.Defaults.tokenizer_exceptions,
prefix_search=prefix_search,
suffix_search=suffix_search,
infix_finditer=infix_finditer,
token_match=nlp.Defaults.token_match,
url_match=nlp.Defaults.url_match,
)
return tokenizer_factory
@registry.misc("spacy.LookupsDataLoader.v1")
def load_lookups_data(lang, tables):
util.logger.debug(f"Loading lookups from spacy-lookups-data: {tables}")
lookups = load_lookups(lang=lang, tables=tables)
return lookups
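# A registered loader like the one above is typically referenced from the
# config by its registry name; a sketch (the section path and table name are
# illustrative):
#
#     [initialize.lookups]
#     @misc = "spacy.LookupsDataLoader.v1"
#     lang = ${nlp.lang}
#     tables = ["lexeme_norm"]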
class Language:
"""A text-processing pipeline. Usually you'll load this once per process,
and pass the instance around your application.
Defaults (class): Settings, data and factory methods for creating the `nlp`
object and processing pipeline.
lang (str): IETF language code, such as 'en'.
DOCS: https://spacy.io/api/language
"""
Defaults = BaseDefaults
lang: Optional[str] = None
default_config = DEFAULT_CONFIG
factories = SimpleFrozenDict(error=Errors.E957)
_factory_meta: Dict[str, "FactoryMeta"] = {} # meta by factory
def __init__(
self,
vocab: Union[Vocab, bool] = True,
*,
max_length: int = 10 ** 6,
meta: Dict[str, Any] = {},
create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None,
batch_size: int = 1000,
**kwargs,
) -> None:
"""Initialise a Language object.
vocab (Vocab): A `Vocab` object. If `True`, a vocab is created.
meta (dict): Custom meta data for the Language class. Is written to by
models to add model meta data.
max_length (int): Maximum number of characters in a single text. The
current models may run out of memory on extremely long texts, due to
large internal allocations. You should segment these texts into
meaningful units, e.g. paragraphs, subsections etc., before passing
them to spaCy. The default maximum length is 1,000,000 characters (1 MB). As
a rule of thumb, if all pipeline components are enabled, spaCy's
default models currently require roughly 1 GB of temporary memory per
100,000 characters in one text.
create_tokenizer (Callable): Function that takes the nlp object and
returns a tokenizer.
batch_size (int): Default batch size for pipe and evaluate.
DOCS: https://spacy.io/api/language#init
"""
# We're only calling this to import all factories provided via entry
# points. The factory decorator applied to these functions takes care
# of the rest.
util.registry._entry_point_factories.get_all()
self._config = DEFAULT_CONFIG.merge(self.default_config)
self._meta = dict(meta)
self._path = None
self._optimizer: Optional[Optimizer] = None
# Component meta and configs are only needed on the instance
self._pipe_meta: Dict[str, "FactoryMeta"] = {} # meta by component
self._pipe_configs: Dict[str, Config] = {} # config by component
if not isinstance(vocab, Vocab) and vocab is not True:
raise ValueError(Errors.E918.format(vocab=vocab, vocab_type=type(vocab)))
if vocab is True:
vectors_name = meta.get("vectors", {}).get("name")
vocab = create_vocab(self.lang, self.Defaults, vectors_name=vectors_name)
else:
if (self.lang and vocab.lang) and (self.lang != vocab.lang):
raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang))
self.vocab: Vocab = vocab
if self.lang is None:
self.lang = self.vocab.lang
self._components: List[Tuple[str, "Pipe"]] = []
self._disabled: Set[str] = set()
self.max_length = max_length
# Create the default tokenizer from the default config
if not create_tokenizer:
tokenizer_cfg = {"tokenizer": self._config["nlp"]["tokenizer"]}
create_tokenizer = registry.resolve(tokenizer_cfg)["tokenizer"]
self.tokenizer = create_tokenizer(self)
self.batch_size = batch_size
self.default_error_handler = raise_error
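# EXAMPLE (illustrative sketch): constructing an nlp object from a language
# subclass with the keyword arguments accepted above. English is used here
# purely as an example subclass.
#
#     >>> from spacy.lang.en import English
#     >>> nlp = English(max_length=2 * 10 ** 6, batch_size=500)
#     >>> doc = nlp("This is a sentence.")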
def __init_subclass__(cls, **kwargs):
super().__init_subclass__(**kwargs)
cls.default_config = DEFAULT_CONFIG.merge(cls.Defaults.config)
cls.default_config["nlp"]["lang"] = cls.lang
@property
def path(self):
return self._path
@property
def meta(self) -> Dict[str, Any]:
"""Custom meta data of the language class. If a model is loaded, this
includes details from the model's meta.json.
RETURNS (Dict[str, Any]): The meta.
DOCS: https://spacy.io/api/language#meta
"""
spacy_version = util.get_minor_version_range(about.__version__)
if self.vocab.lang:
self._meta.setdefault("lang", self.vocab.lang)
else:
self._meta.setdefault("lang", self.lang)
self._meta.setdefault("name", "pipeline")
self._meta.setdefault("version", "0.0.0")
self._meta.setdefault("spacy_version", spacy_version)
self._meta.setdefault("description", "")
self._meta.setdefault("author", "")
self._meta.setdefault("email", "")
self._meta.setdefault("url", "")
self._meta.setdefault("license", "")
self._meta.setdefault("spacy_git_version", GIT_VERSION)
self._meta["vectors"] = {
"width": self.vocab.vectors_length,
"vectors": len(self.vocab.vectors),
"keys": self.vocab.vectors.n_keys,
"name": self.vocab.vectors.name,
"mode": self.vocab.vectors.mode,
}
self._meta["labels"] = dict(self.pipe_labels)
# TODO: Adding this back to prevent breaking people's code etc., but
# we should consider removing it
self._meta["pipeline"] = list(self.pipe_names)
self._meta["components"] = list(self.component_names)
self._meta["disabled"] = list(self.disabled)
return self._meta
@meta.setter
def meta(self, value: Dict[str, Any]) -> None:
self._meta = value
@property
def config(self) -> Config:
"""Trainable config for the current language instance. Includes the
current pipeline components, as well as default training config.
RETURNS (thinc.api.Config): The config.
DOCS: https://spacy.io/api/language#config
"""
self._config.setdefault("nlp", {})
self._config.setdefault("training", {})
self._config["nlp"]["lang"] = self.lang
# We're storing the filled config for each pipeline component so that
# we can populate the config again later
pipeline = {}
score_weights = []
for pipe_name in self.component_names:
pipe_meta = self.get_pipe_meta(pipe_name)
pipe_config = self.get_pipe_config(pipe_name)
pipeline[pipe_name] = {"factory": pipe_meta.factory, **pipe_config}
if pipe_meta.default_score_weights:
score_weights.append(pipe_meta.default_score_weights)
self._config["nlp"]["pipeline"] = list(self.component_names)
self._config["nlp"]["disabled"] = list(self.disabled)
self._config["components"] = pipeline
# We're merging the existing score weights back into the combined
# weights to make sure we're preserving custom settings in the config
# but also reflect updates (e.g. new components added)
prev_weights = self._config["training"].get("score_weights", {})
combined_score_weights = combine_score_weights(score_weights, prev_weights)
self._config["training"]["score_weights"] = combined_score_weights
if not srsly.is_json_serializable(self._config):
raise ValueError(Errors.E961.format(config=self._config))
return self._config
@config.setter
def config(self, value: Config) -> None:
self._config = value
@property
def disabled(self) -> List[str]:
"""Get the names of all disabled components.
RETURNS (List[str]): The disabled components.
"""
# Make sure the disabled components are returned in the order they
# appear in the pipeline (which isn't guaranteed by the set)
names = [name for name, _ in self._components if name in self._disabled]
return SimpleFrozenList(names, error=Errors.E926.format(attr="disabled"))
@property
def factory_names(self) -> List[str]:
"""Get names of all available factories.
RETURNS (List[str]): The factory names.
"""
names = list(self.factories.keys())
return SimpleFrozenList(names)
@property
def components(self) -> List[Tuple[str, "Pipe"]]:
"""Get all (name, component) tuples in the pipeline, including the
currently disabled components.
"""
return SimpleFrozenList(
self._components, error=Errors.E926.format(attr="components")
)
@property
def component_names(self) -> List[str]:
"""Get the names of the available pipeline components. Includes all
active and inactive pipeline components.
RETURNS (List[str]): List of component name strings, in order.
"""
names = [pipe_name for pipe_name, _ in self._components]
return SimpleFrozenList(names, error=Errors.E926.format(attr="component_names"))
@property
def pipeline(self) -> List[Tuple[str, "Pipe"]]:
"""The processing pipeline consisting of (name, component) tuples. The
components are called on the Doc in order as it passes through the
pipeline.
RETURNS (List[Tuple[str, Pipe]]): The pipeline.
"""
pipes = [(n, p) for n, p in self._components if n not in self._disabled]
return SimpleFrozenList(pipes, error=Errors.E926.format(attr="pipeline"))
@property
def pipe_names(self) -> List[str]:
"""Get names of available active pipeline components.
RETURNS (List[str]): List of component name strings, in order.
"""
names = [pipe_name for pipe_name, _ in self.pipeline]
return SimpleFrozenList(names, error=Errors.E926.format(attr="pipe_names"))
@property
def pipe_factories(self) -> Dict[str, str]:
"""Get the component factories for the available pipeline components.
RETURNS (Dict[str, str]): Factory names, keyed by component names.
"""
factories = {}
for pipe_name, pipe in self._components:
factories[pipe_name] = self.get_pipe_meta(pipe_name).factory
return SimpleFrozenDict(factories)
@property
def pipe_labels(self) -> Dict[str, List[str]]:
"""Get the labels set by the pipeline components, if available (if
the component exposes a labels property).
RETURNS (Dict[str, List[str]]): Labels keyed by component name.
"""
labels = {}
for name, pipe in self._components:
if hasattr(pipe, "labels"):
labels[name] = list(pipe.labels)
return SimpleFrozenDict(labels)
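# EXAMPLE (illustrative sketch): the introspection properties above can be
# used to inspect a pipeline without touching its internal state.
#
#     >>> nlp.pipe_names        # active components, in order
#     >>> nlp.component_names   # active and disabled components
#     >>> nlp.disabled          # currently disabled components
#     >>> nlp.pipe_labels       # labels per component, if the component exposes them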
@classmethod
def has_factory(cls, name: str) -> bool:
"""RETURNS (bool): Whether a factory of that name is registered."""
internal_name = cls.get_factory_name(name)
return name in registry.factories or internal_name in registry.factories
@classmethod
def get_factory_name(cls, name: str) -> str:
"""Get the internal factory name based on the language subclass.
name (str): The factory name.
RETURNS (str): The internal factory name.
"""
if cls.lang is None:
return name
return f"{cls.lang}.{name}"
@classmethod
def get_factory_meta(cls, name: str) -> "FactoryMeta":
"""Get the meta information for a given factory name.
name (str): The component factory name.
RETURNS (FactoryMeta): The meta for the given factory name.
"""
internal_name = cls.get_factory_name(name)
if internal_name in cls._factory_meta:
return cls._factory_meta[internal_name]
if name in cls._factory_meta:
return cls._factory_meta[name]
raise ValueError(Errors.E967.format(meta="factory", name=name))
@classmethod
def set_factory_meta(cls, name: str, value: "FactoryMeta") -> None:
"""Set the meta information for a given factory name.
name (str): The component factory name.
value (FactoryMeta): The meta to set.
"""
cls._factory_meta[cls.get_factory_name(name)] = value
def get_pipe_meta(self, name: str) -> "FactoryMeta":
"""Get the meta information for a given component name.
name (str): The component name.
RETURNS (FactoryMeta): The meta for the given component name.
"""
if name not in self._pipe_meta:
raise ValueError(Errors.E967.format(meta="component", name=name))
return self._pipe_meta[name]
def get_pipe_config(self, name: str) -> Config:
"""Get the config used to create a pipeline component.
name (str): The component name.
RETURNS (Config): The config used to create the pipeline component.
"""
if name not in self._pipe_configs:
raise ValueError(Errors.E960.format(name=name))
pipe_config = self._pipe_configs[name]
return pipe_config
@classmethod
def factory(
cls,
name: str,
*,
default_config: Dict[str, Any] = SimpleFrozenDict(),
assigns: Iterable[str] = SimpleFrozenList(),
requires: Iterable[str] = SimpleFrozenList(),
retokenizes: bool = False,
default_score_weights: Dict[str, Optional[float]] = SimpleFrozenDict(),
func: Optional[Callable] = None,
) -> Callable:
"""Register a new pipeline component factory. Can be used as a decorator
on a function or classmethod, or called as a function with the factory
provided as the func keyword argument. To create a component and add
it to the pipeline, you can use nlp.add_pipe(name).
name (str): The name of the component factory.
default_config (Dict[str, Any]): Default configuration, describing the
default values of the factory arguments.
assigns (Iterable[str]): Doc/Token attributes assigned by this component,
e.g. "token.ent_id". Used for pipeline analysis.
requires (Iterable[str]): Doc/Token attributes required by this component,
e.g. "token.ent_id". Used for pipeline analysis.
retokenizes (bool): Whether the component changes the tokenization.
Used for pipeline analysis.
default_score_weights (Dict[str, Optional[float]]): The scores to report during
training, and their default weight towards the final score used to
select the best model. Weights should sum to 1.0 per component and
will be combined and normalized for the whole pipeline. If None,
the score won't be shown in the logs or be weighted.
func (Optional[Callable]): Factory function if not used as a decorator.
DOCS: https://spacy.io/api/language#factory
"""
if not isinstance(name, str):
raise ValueError(Errors.E963.format(decorator="factory"))
if not isinstance(default_config, dict):
err = Errors.E962.format(
style="default config", name=name, cfg_type=type(default_config)
)
raise ValueError(err)
def add_factory(factory_func: Callable) -> Callable:
internal_name = cls.get_factory_name(name)
if internal_name in registry.factories:
# We only check for the internal name here – it's okay if it's a
# subclass and the base class has a factory of the same name. We
# also only raise if the function is different to prevent raising
# if the module is reloaded.
existing_func = registry.factories.get(internal_name)
if not util.is_same_func(factory_func, existing_func):
err = Errors.E004.format(
name=name, func=existing_func, new_func=factory_func
)
raise ValueError(err)
arg_names = util.get_arg_names(factory_func)
if "nlp" not in arg_names or "name" not in arg_names:
raise ValueError(Errors.E964.format(name=name))
# Officially register the factory so we can later call
# registry.resolve and refer to it in the config as
# @factories = "spacy.Language.xyz". We use the class name here so
# different classes can have different factories.
registry.factories.register(internal_name, func=factory_func)
factory_meta = FactoryMeta(
factory=name,
default_config=default_config,
assigns=validate_attrs(assigns),
requires=validate_attrs(requires),
scores=list(default_score_weights.keys()),
default_score_weights=default_score_weights,
retokenizes=retokenizes,
)
cls.set_factory_meta(name, factory_meta)
# We're overwriting the class attr with a frozen dict to handle
# backwards-compat (writing to Language.factories directly). This
# wouldn't work with an instance property and would just produce a
# confusing error – here we can show a custom error
cls.factories = SimpleFrozenDict(
registry.factories.get_all(), error=Errors.E957
)
return factory_func
if func is not None: # Support non-decorator use cases
return add_factory(func)
return add_factory
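# EXAMPLE (illustrative sketch): registering a stateful component factory. The
# factory function must accept `nlp` and `name`; "my_component" and
# CustomComponent are placeholders.
#
#     >>> @Language.factory("my_component", default_config={"case_sensitive": False})
#     ... def create_my_component(nlp: Language, name: str, case_sensitive: bool):
#     ...     return CustomComponent(nlp.vocab, case_sensitive=case_sensitive)
#     >>> nlp.add_pipe("my_component", config={"case_sensitive": True})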
@classmethod
def component(
cls,
name: str,
*,
assigns: Iterable[str] = SimpleFrozenList(),
requires: Iterable[str] = SimpleFrozenList(),
retokenizes: bool = False,
func: Optional["Pipe"] = None,
) -> Callable:
"""Register a new pipeline component. Can be used for stateless function
components that don't require a separate factory. Can be used as a
decorator on a function or classmethod, or called as a function with the
factory provided as the func keyword argument. To create a component and
add it to the pipeline, you can use nlp.add_pipe(name).
name (str): The name of the component factory.
assigns (Iterable[str]): Doc/Token attributes assigned by this component,
e.g. "token.ent_id". Used for pipeline analysis.
requires (Iterable[str]): Doc/Token attributes required by this component,
e.g. "token.ent_id". Used for pipeline analysis.
retokenizes (bool): Whether the component changes the tokenization.
Used for pipeline analysis.
func (Optional[Callable]): Factory function if not used as a decorator.
DOCS: https://spacy.io/api/language#component
"""
if name is not None and not isinstance(name, str):
raise ValueError(Errors.E963.format(decorator="component"))
component_name = name if name is not None else util.get_object_name(func)
def add_component(component_func: "Pipe") -> Callable:
if isinstance(func, type): # function is a class
raise ValueError(Errors.E965.format(name=component_name))
def factory_func(nlp, name: str) -> "Pipe":
return component_func
internal_name = cls.get_factory_name(name)
if internal_name in registry.factories:
# We only check for the internal name here – it's okay if it's a
# subclass and the base class has a factory of the same name. We
# also only raise if the function is different to prevent raising
# if the module is reloaded. It's hacky, but we need to check the
# existing function for a closure and whether that's identical
# to the component function (because factory_func created above
# will always be different, even for the same function)
existing_func = registry.factories.get(internal_name)
closure = existing_func.__closure__
wrapped = [c.cell_contents for c in closure][0] if closure else None
if util.is_same_func(wrapped, component_func):
factory_func = existing_func # noqa: F811
cls.factory(
component_name,
assigns=assigns,
requires=requires,
retokenizes=retokenizes,
func=factory_func,
)
return component_func
if func is not None: # Support non-decorator use cases
return add_component(func)
return add_component
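# EXAMPLE (illustrative sketch): registering a stateless function component.
# The name "strip_metadata" is a placeholder; the function just needs to take
# a Doc and return it.
#
#     >>> @Language.component("strip_metadata")
#     ... def strip_metadata(doc):
#     ...     return doc
#     >>> nlp.add_pipe("strip_metadata", last=True)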
def analyze_pipes(
self,
*,
keys: List[str] = ["assigns", "requires", "scores", "retokenizes"],
pretty: bool = False,
) -> Optional[Dict[str, Any]]:
"""Analyze the current pipeline components, print a summary of what
they assign or require and check that all requirements are met.
keys (List[str]): The meta values to display in the table. Corresponds
to values in FactoryMeta, defined by @Language.factory decorator.
pretty (bool): Pretty-print the results.
RETURNS (dict): The data.
"""
analysis = analyze_pipes(self, keys=keys)
if pretty:
print_pipe_analysis(analysis, keys=keys)
return analysis
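# EXAMPLE (illustrative sketch): analyzing an assembled pipeline. With
# pretty=True a summary table is printed; the returned dict summarizes the
# per-component meta and any detected problems.
#
#     >>> analysis = nlp.analyze_pipes(pretty=True)
#     >>> analysis = nlp.analyze_pipes(keys=["assigns", "requires"])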
def get_pipe(self, name: str) -> "Pipe":
"""Get a pipeline component for a given component name.
name (str): Name of pipeline component to get.
RETURNS (callable): The pipeline component.
DOCS: https://spacy.io/api/language#get_pipe
"""
for pipe_name, component in self._components:
if pipe_name == name:
return component
raise KeyError(Errors.E001.format(name=name, opts=self.component_names))
def create_pipe(
self,
factory_name: str,
name: Optional[str] = None,
*,
config: Dict[str, Any] = SimpleFrozenDict(),
raw_config: Optional[Config] = None,
validate: bool = True,
) -> "Pipe":
"""Create a pipeline component. Mostly used internally. To create and
add a component to the pipeline, you can use nlp.add_pipe.
factory_name (str): Name of component factory.
name (Optional[str]): Optional name to assign to component instance.
Defaults to factory name if not set.
config (Dict[str, Any]): Config parameters to use for this component.
Will be merged with default config, if available.
raw_config (Optional[Config]): Internals: the non-interpolated config.
validate (bool): Whether to validate the component config against the
arguments and types expected by the factory.
RETURNS (Pipe): The pipeline component.
DOCS: https://spacy.io/api/language#create_pipe
"""
name = name if name is not None else factory_name
if not isinstance(config, dict):
err = Errors.E962.format(style="config", name=name, cfg_type=type(config))
raise ValueError(err)
if not srsly.is_json_serializable(config):
raise ValueError(Errors.E961.format(config=config))
if not self.has_factory(factory_name):
err = Errors.E002.format(
name=factory_name,
opts=", ".join(self.factory_names),
method="create_pipe",
lang=util.get_object_name(self),
lang_code=self.lang,
)
raise ValueError(err)
pipe_meta = self.get_factory_meta(factory_name)
# This is not ideal, but the alternative would mean you always need to
# specify the full config settings, which is not really viable.
if pipe_meta.default_config:
config = Config(pipe_meta.default_config).merge(config)
internal_name = self.get_factory_name(factory_name)
# If the language-specific factory doesn't exist, try again with the
# non-language-specific name
if internal_name not in registry.factories:
internal_name = factory_name
# The name allows components to know their pipe name and use it in the
# losses etc. (even if multiple instances of the same factory are used)
config = {"nlp": self, "name": name, **config, "@factories": internal_name}
# We need to create a top-level key because Thinc doesn't allow resolving
# top-level references to registered functions. Also gives nicer errors.
cfg = {factory_name: config}
# We're calling the internal _fill here to avoid constructing the
# registered functions twice
resolved = registry.resolve(cfg, validate=validate)
filled = registry.fill({"cfg": cfg[factory_name]}, validate=validate)["cfg"]
filled = Config(filled)
filled["factory"] = factory_name
filled.pop("@factories", None)
# Remove the extra values we added because we don't want to keep passing
# them around, copying them etc.
filled.pop("nlp", None)
filled.pop("name", None)
# Merge the final filled config with the raw config (including non-
# interpolated variables)
if raw_config:
filled = filled.merge(raw_config)
self._pipe_configs[name] = filled
return resolved[factory_name]
def create_pipe_from_source(
self, source_name: str, source: "Language", *, name: str
) -> Tuple["Pipe", str]:
"""Create a pipeline component by copying it from an existing model.
source_name (str): Name of the component in the source pipeline.
source (Language): The source nlp object to copy from.
name (str): Optional alternative name to use in current pipeline.
RETURNS (Tuple[Callable, str]): The component and its factory name.
"""
# Check source type
if not isinstance(source, Language):
raise ValueError(Errors.E945.format(name=source_name, source=type(source)))
# Check vectors, with faster checks first
if (
self.vocab.vectors.shape != source.vocab.vectors.shape
or self.vocab.vectors.key2row != source.vocab.vectors.key2row
or self.vocab.vectors.to_bytes() != source.vocab.vectors.to_bytes()
):
warnings.warn(Warnings.W113.format(name=source_name))
if source_name not in source.component_names:
raise KeyError(
Errors.E944.format(
name=source_name,
model=f"{source.meta['lang']}_{source.meta['name']}",
opts=", ".join(source.component_names),
)
)
pipe = source.get_pipe(source_name)
# Make sure the source config is interpolated so we don't end up with
# orphaned variables in our final config
source_config = source.config.interpolate()
pipe_config = util.copy_config(source_config["components"][source_name])
self._pipe_configs[name] = pipe_config
if self.vocab.strings != source.vocab.strings:
for s in source.vocab.strings:
self.vocab.strings.add(s)
return pipe, pipe_config["factory"]
def add_pipe(
self,
factory_name: str,
name: Optional[str] = None,
*,
before: Optional[Union[str, int]] = None,
after: Optional[Union[str, int]] = None,
first: Optional[bool] = None,
last: Optional[bool] = None,
source: Optional["Language"] = None,
config: Dict[str, Any] = SimpleFrozenDict(),
raw_config: Optional[Config] = None,
validate: bool = True,
) -> "Pipe":
"""Add a component to the processing pipeline. Valid components are
callables that take a `Doc` object, modify it and return it. Only one
of before/after/first/last can be set. Default behaviour is "last".
factory_name (str): Name of the component factory.
name (str): Name of pipeline component. Overwrites existing
component.name attribute if available. If no name is set and
the component exposes no name attribute, component.__name__ is
used. An error is raised if a name already exists in the pipeline.
before (Union[str, int]): Name or index of the component to insert new
component directly before.
after (Union[str, int]): Name or index of the component to insert new
component directly after.
first (bool): If True, insert component first in the pipeline.
last (bool): If True, insert component last in the pipeline.
source (Language): Optional loaded nlp object to copy the pipeline
component from.
config (Dict[str, Any]): Config parameters to use for this component.
Will be merged with default config, if available.
raw_config (Optional[Config]): Internals: the non-interpolated config.
validate (bool): Whether to validate the component config against the
arguments and types expected by the factory.
RETURNS (Pipe): The pipeline component.
DOCS: https://spacy.io/api/language#add_pipe
"""
if not isinstance(factory_name, str):
bad_val = repr(factory_name)
err = Errors.E966.format(component=bad_val, name=name)
raise ValueError(err)
name = name if name is not None else factory_name
if name in self.component_names:
raise ValueError(Errors.E007.format(name=name, opts=self.component_names))
if source is not None:
# We're loading the component from a model. After loading the
# component, we know its real factory name
pipe_component, factory_name = self.create_pipe_from_source(
factory_name, source, name=name
)
else:
if not self.has_factory(factory_name):
err = Errors.E002.format(
name=factory_name,
opts=", ".join(self.factory_names),
method="add_pipe",
lang=util.get_object_name(self),
lang_code=self.lang,
)
raise ValueError(err)
pipe_component = self.create_pipe(
factory_name,
name=name,
config=config,
raw_config=raw_config,
validate=validate,
)
pipe_index = self._get_pipe_index(before, after, first, last)
self._pipe_meta[name] = self.get_factory_meta(factory_name)
self._components.insert(pipe_index, (name, pipe_component))
return pipe_component
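# EXAMPLE (illustrative sketch): adding components by factory name, assuming
# the built-in factories are registered and `other_nlp` is a loaded pipeline.
#
#     >>> nlp.add_pipe("sentencizer", first=True)
#     >>> nlp.add_pipe("ner", name="my_ner", before="parser")
#     >>> nlp.add_pipe("textcat", source=other_nlp)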
def _get_pipe_index(
self,
before: Optional[Union[str, int]] = None,
after: Optional[Union[str, int]] = None,
first: Optional[bool] = None,
last: Optional[bool] = None,
) -> int:
"""Determine where to insert a pipeline component based on the before/
after/first/last values.
before (str): Name or index of the component to insert directly before.
after (str): Name or index of component to insert directly after.
first (bool): If True, insert component first in the pipeline.
last (bool): If True, insert component last in the pipeline.
RETURNS (int): The index of the new pipeline component.
"""
all_args = {"before": before, "after": after, "first": first, "last": last}
if sum(arg is not None for arg in [before, after, first, last]) >= 2:
raise ValueError(
Errors.E006.format(args=all_args, opts=self.component_names)
)
if last or not any(value is not None for value in [first, before, after]):
return len(self._components)
elif first:
return 0
elif isinstance(before, str):
if before not in self.component_names:
raise ValueError(
Errors.E001.format(name=before, opts=self.component_names)
)
return self.component_names.index(before)
elif isinstance(after, str):
if after not in self.component_names:
raise ValueError(
Errors.E001.format(name=after, opts=self.component_names)
)
return self.component_names.index(after) + 1
# We're only accepting indices referring to components that exist
# (can't just do isinstance here because bools are instance of int, too)
elif type(before) == int:
if before >= len(self._components) or before < 0:
err = Errors.E959.format(
dir="before", idx=before, opts=self.component_names
)
raise ValueError(err)
return before
elif type(after) == int:
if after >= len(self._components) or after < 0:
err = Errors.E959.format(
dir="after", idx=after, opts=self.component_names
)
raise ValueError(err)
return after + 1
raise ValueError(Errors.E006.format(args=all_args, opts=self.component_names))
def has_pipe(self, name: str) -> bool:
"""Check if a component name is present in the pipeline. Equivalent to
`name in nlp.pipe_names`.
name (str): Name of the component.
RETURNS (bool): Whether a component of the name exists in the pipeline.
DOCS: https://spacy.io/api/language#has_pipe
"""
return name in self.pipe_names
def replace_pipe(
self,
name: str,
factory_name: str,
*,
config: Dict[str, Any] = SimpleFrozenDict(),
validate: bool = True,
) -> "Pipe":
"""Replace a component in the pipeline.
name (str): Name of the component to replace.
factory_name (str): Factory name of replacement component.
config (Optional[Dict[str, Any]]): Config parameters to use for this
component. Will be merged with default config, if available.
validate (bool): Whether to validate the component config against the
arguments and types expected by the factory.
RETURNS (Pipe): The new pipeline component.
DOCS: https://spacy.io/api/language#replace_pipe
"""
if name not in self.component_names:
raise ValueError(Errors.E001.format(name=name, opts=self.pipe_names))
if hasattr(factory_name, "__call__"):
err = Errors.E968.format(component=repr(factory_name), name=name)
raise ValueError(err)
# We need to delegate to Language.add_pipe here instead of just writing
# to Language.pipeline to make sure the configs are handled correctly
pipe_index = self.component_names.index(name)
self.remove_pipe(name)
if not len(self._components) or pipe_index == len(self._components):
# we have no components to insert before/after, or we're replacing the last component
return self.add_pipe(
factory_name, name=name, config=config, validate=validate
)
else:
return self.add_pipe(
factory_name,
name=name,
before=pipe_index,
config=config,
validate=validate,
)
def rename_pipe(self, old_name: str, new_name: str) -> None:
"""Rename a pipeline component.
old_name (str): Name of the component to rename.
new_name (str): New name of the component.
DOCS: https://spacy.io/api/language#rename_pipe
"""
if old_name not in self.component_names:
raise ValueError(
Errors.E001.format(name=old_name, opts=self.component_names)
)
if new_name in self.component_names:
raise ValueError(
Errors.E007.format(name=new_name, opts=self.component_names)
)
i = self.component_names.index(old_name)
self._components[i] = (new_name, self._components[i][1])
self._pipe_meta[new_name] = self._pipe_meta.pop(old_name)
self._pipe_configs[new_name] = self._pipe_configs.pop(old_name)
# Make sure [initialize] config is adjusted
if old_name in self._config["initialize"]["components"]:
init_cfg = self._config["initialize"]["components"].pop(old_name)
self._config["initialize"]["components"][new_name] = init_cfg
def remove_pipe(self, name: str) -> Tuple[str, "Pipe"]:
"""Remove a component from the pipeline.
name (str): Name of the component to remove.
RETURNS (tuple): A `(name, component)` tuple of the removed component.
DOCS: https://spacy.io/api/language#remove_pipe
"""
if name not in self.component_names:
raise ValueError(Errors.E001.format(name=name, opts=self.component_names))
removed = self._components.pop(self.component_names.index(name))
# We're only removing the component itself from the metas/configs here
# because factory may be used for something else
self._pipe_meta.pop(name)
self._pipe_configs.pop(name)
self.meta.get("_sourced_vectors_hashes", {}).pop(name, None)
# Make sure name is removed from the [initialize] config
if name in self._config["initialize"]["components"]:
self._config["initialize"]["components"].pop(name)
# Make sure the name is also removed from the set of disabled components
if name in self.disabled:
self._disabled.remove(name)
return removed
def disable_pipe(self, name: str) -> None:
"""Disable a pipeline component. The component will still exist on
the nlp object, but it won't be run as part of the pipeline. Does
nothing if the component is already disabled.
name (str): The name of the component to disable.
"""
if name not in self.component_names:
raise ValueError(Errors.E001.format(name=name, opts=self.component_names))
self._disabled.add(name)
def enable_pipe(self, name: str) -> None:
"""Enable a previously disabled pipeline component so it's run as part
of the pipeline. Does nothing if the component is already enabled.
name (str): The name of the component to enable.
"""
if name not in self.component_names:
raise ValueError(Errors.E001.format(name=name, opts=self.component_names))
if name in self.disabled:
self._disabled.remove(name)
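# EXAMPLE (illustrative sketch): toggling a component without removing it,
# assuming an "ner" component exists in the pipeline.
#
#     >>> nlp.disable_pipe("ner")
#     >>> assert "ner" in nlp.component_names and "ner" not in nlp.pipe_names
#     >>> nlp.enable_pipe("ner")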
def __call__(
self,
text: Union[str, Doc],
*,
disable: Iterable[str] = SimpleFrozenList(),
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
) -> Doc:
"""Apply the pipeline to some text. The text can span multiple sentences,
and can contain arbitrary whitespace. Alignment into the original string
is preserved.
text (Union[str, Doc]): If `str`, the text to be processed. If `Doc`,
the doc will be passed directly to the pipeline, skipping
`Language.make_doc`.
disable (List[str]): Names of the pipeline components to disable.
component_cfg (Dict[str, dict]): An optional dictionary with extra
keyword arguments for specific components.
RETURNS (Doc): A container for accessing the annotations.
DOCS: https://spacy.io/api/language#call
"""
doc = self._ensure_doc(text)
if component_cfg is None:
component_cfg = {}
for name, proc in self.pipeline:
if name in disable:
continue
if not hasattr(proc, "__call__"):
raise ValueError(Errors.E003.format(component=type(proc), name=name))
error_handler = self.default_error_handler
if hasattr(proc, "get_error_handler"):
error_handler = proc.get_error_handler()
try:
doc = proc(doc, **component_cfg.get(name, {})) # type: ignore[call-arg]
except KeyError as e:
# This typically happens if a component is not initialized
raise ValueError(Errors.E109.format(name=name)) from e
except Exception as e:
error_handler(name, proc, [doc], e)
if doc is None:
raise ValueError(Errors.E005.format(name=name))
return doc
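# EXAMPLE (illustrative sketch): applying the pipeline to a single text, or to
# a pre-made Doc to skip tokenization.
#
#     >>> doc = nlp("Apple is looking at buying a U.K. startup.")
#     >>> doc = nlp(nlp.make_doc("Pre-tokenized input."), disable=["ner"])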
def disable_pipes(self, *names) -> "DisabledPipes":
"""Disable one or more pipeline components. If used as a context
manager, the pipeline will be restored to the initial state at the end
of the block. Otherwise, a DisabledPipes object is returned, that has
a `.restore()` method you can use to undo your changes.
This method has been deprecated since 3.0
"""
warnings.warn(Warnings.W096, DeprecationWarning)
if len(names) == 1 and isinstance(names[0], (list, tuple)):
names = names[0] # type: ignore[assignment] # support list of names instead of spread
return self.select_pipes(disable=names)
def select_pipes(
self,
*,
disable: Optional[Union[str, Iterable[str]]] = None,
enable: Optional[Union[str, Iterable[str]]] = None,
) -> "DisabledPipes":
"""Disable one or more pipeline components. If used as a context
manager, the pipeline will be restored to the initial state at the end
of the block. Otherwise, a DisabledPipes object is returned, that has
a `.restore()` method you can use to undo your changes.
disable (str or iterable): The name(s) of the pipes to disable
enable (str or iterable): The name(s) of the pipes to enable - all others will be disabled
DOCS: https://spacy.io/api/language#select_pipes
"""
if enable is None and disable is None:
raise ValueError(Errors.E991)
if disable is not None and isinstance(disable, str):
disable = [disable]
if enable is not None:
if isinstance(enable, str):
enable = [enable]
to_disable = [pipe for pipe in self.pipe_names if pipe not in enable]
# raise an error if the enable and disable keywords are not consistent
if disable is not None and disable != to_disable:
raise ValueError(
Errors.E992.format(
enable=enable, disable=disable, names=self.pipe_names
)
)
disable = to_disable
assert disable is not None
# DisabledPipes will restore the pipes in 'disable' when it's done, so we need to exclude
# those pipes that were already disabled.
disable = [d for d in disable if d not in self._disabled]
return DisabledPipes(self, disable)
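# EXAMPLE (illustrative sketch): temporarily disabling components, either as a
# context manager or via the returned DisabledPipes object.
#
#     >>> with nlp.select_pipes(disable=["tagger", "parser"]):
#     ...     doc = nlp("Only the remaining components run here.")
#     >>> disabled = nlp.select_pipes(enable=["ner"])
#     >>> disabled.restore()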
def make_doc(self, text: str) -> Doc:
"""Turn a text into a Doc object.
text (str): The text to process.
RETURNS (Doc): The processed doc.
"""
if len(text) > self.max_length:
raise ValueError(
Errors.E088.format(length=len(text), max_length=self.max_length)
)
return self.tokenizer(text)
def _ensure_doc(self, doc_like: Union[str, Doc]) -> Doc:
"""Create a Doc if need be, or raise an error if the input is not a Doc or a string."""
if isinstance(doc_like, Doc):
return doc_like
if isinstance(doc_like, str):
return self.make_doc(doc_like)
raise ValueError(Errors.E866.format(type=type(doc_like)))
def _ensure_doc_with_context(self, doc_like: Union[str, Doc], context: Any) -> Doc:
"""Create a Doc if need be and add as_tuples context, or raise an error if the input is not a Doc or a string."""
doc = self._ensure_doc(doc_like)
doc._context = context
return doc
def update(
self,
examples: Iterable[Example],
_: Optional[Any] = None,
*,
drop: float = 0.0,
sgd: Optional[Optimizer] = None,
losses: Optional[Dict[str, float]] = None,
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
exclude: Iterable[str] = SimpleFrozenList(),
annotates: Iterable[str] = SimpleFrozenList(),
):
"""Update the models in the pipeline.
examples (Iterable[Example]): A batch of examples
_: Should not be set - serves to catch backwards-incompatible scripts.
drop (float): The dropout rate.
sgd (Optimizer): An optimizer.
losses (Dict[str, float]): Dictionary to update with the loss, keyed by
component.
component_cfg (Dict[str, Dict]): Config parameters for specific pipeline
components, keyed by component name.
exclude (Iterable[str]): Names of components that shouldn't be updated.
annotates (Iterable[str]): Names of components that should set
annotations on the predicted examples after updating.
RETURNS (Dict[str, float]): The updated losses dictionary
DOCS: https://spacy.io/api/language#update
"""
if _ is not None:
raise ValueError(Errors.E989)
if losses is None:
losses = {}
if isinstance(examples, list) and len(examples) == 0:
return losses
validate_examples(examples, "Language.update")
examples = _copy_examples(examples)
if sgd is None:
if self._optimizer is None:
self._optimizer = self.create_optimizer()
sgd = self._optimizer
if component_cfg is None:
component_cfg = {}
pipe_kwargs = {}
for i, (name, proc) in enumerate(self.pipeline):
component_cfg.setdefault(name, {})
pipe_kwargs[name] = deepcopy(component_cfg[name])
component_cfg[name].setdefault("drop", drop)
pipe_kwargs[name].setdefault("batch_size", self.batch_size)
for name, proc in self.pipeline:
# ignore statements are used here because mypy ignores hasattr
if name not in exclude and hasattr(proc, "update"):
proc.update(examples, sgd=None, losses=losses, **component_cfg[name]) # type: ignore
if sgd not in (None, False):
if (
name not in exclude
and isinstance(proc, ty.TrainableComponent)
and proc.is_trainable
and proc.model not in (True, False, None)
):
proc.finish_update(sgd)
if name in annotates:
for doc, eg in zip(
_pipe(
(eg.predicted for eg in examples),
proc=proc,
name=name,
default_error_handler=self.default_error_handler,
kwargs=pipe_kwargs[name],
),
examples,
):
eg.predicted = doc
return losses
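# EXAMPLE (illustrative sketch): a minimal training loop, assuming
# `train_examples` is a list of Example objects and util.minibatch is used
# for batching.
#
#     >>> optimizer = nlp.initialize(lambda: train_examples)
#     >>> losses = {}
#     >>> for batch in util.minibatch(train_examples, size=8):
#     ...     nlp.update(batch, sgd=optimizer, losses=losses, drop=0.2)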
def rehearse(
self,
examples: Iterable[Example],
*,
sgd: Optional[Optimizer] = None,
losses: Optional[Dict[str, float]] = None,
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
exclude: Iterable[str] = SimpleFrozenList(),
) -> Dict[str, float]:
"""Make a "rehearsal" update to the models in the pipeline, to prevent
forgetting. Rehearsal updates run an initial copy of the model over some
data, and update the model so its current predictions are more like the
initial ones. This is useful for keeping a pretrained model on-track,
even if you're updating it with a smaller set of examples.
examples (Iterable[Example]): A batch of `Example` objects.
sgd (Optional[Optimizer]): An optimizer.
component_cfg (Dict[str, Dict]): Config parameters for specific pipeline
components, keyed by component name.
exclude (Iterable[str]): Names of components that shouldn't be updated.
RETURNS (dict): Results from the update.
EXAMPLE:
>>> raw_text_batches = minibatch(raw_texts)
>>> for labelled_batch in minibatch(examples):
>>> nlp.update(labelled_batch)
>>> raw_batch = [Example.from_dict(nlp.make_doc(text), {}) for text in next(raw_text_batches)]
>>> nlp.rehearse(raw_batch)
DOCS: https://spacy.io/api/language#rehearse
"""
if losses is None:
losses = {}
if isinstance(examples, list) and len(examples) == 0:
return losses
validate_examples(examples, "Language.rehearse")
if sgd is None:
if self._optimizer is None:
self._optimizer = self.create_optimizer()
sgd = self._optimizer
pipes = list(self.pipeline)
random.shuffle(pipes)
if component_cfg is None:
component_cfg = {}
grads = {}
def get_grads(W, dW, key=None):
grads[key] = (W, dW)
get_grads.learn_rate = sgd.learn_rate # type: ignore[attr-defined, union-attr]
get_grads.b1 = sgd.b1 # type: ignore[attr-defined, union-attr]
get_grads.b2 = sgd.b2 # type: ignore[attr-defined, union-attr]
for name, proc in pipes:
if name in exclude or not hasattr(proc, "rehearse"):
continue
grads = {}
proc.rehearse( # type: ignore[attr-defined]
examples, sgd=get_grads, losses=losses, **component_cfg.get(name, {})
)
for key, (W, dW) in grads.items():
sgd(W, dW, key=key) # type: ignore[call-arg, misc]
return losses
def begin_training(
self,
get_examples: Optional[Callable[[], Iterable[Example]]] = None,
*,
sgd: Optional[Optimizer] = None,
) -> Optimizer:
warnings.warn(Warnings.W089, DeprecationWarning)
return self.initialize(get_examples, sgd=sgd)
def initialize(
self,
get_examples: Optional[Callable[[], Iterable[Example]]] = None,
*,
sgd: Optional[Optimizer] = None,
) -> Optimizer:
"""Initialize the pipe for training, using data examples if available.
get_examples (Callable[[], Iterable[Example]]): Optional function that
returns gold-standard Example objects.
sgd (Optional[Optimizer]): An optimizer to use for updates. If not
provided, will be created using the .create_optimizer() method.
RETURNS (thinc.api.Optimizer): The optimizer.
DOCS: https://spacy.io/api/language#initialize
"""
if get_examples is None:
util.logger.debug(
"No 'get_examples' callback provided to 'Language.initialize', creating dummy examples"
)
doc = Doc(self.vocab, words=["x", "y", "z"])
get_examples = lambda: [Example.from_dict(doc, {})]
if not hasattr(get_examples, "__call__"):
err = Errors.E930.format(
method="Language.initialize", obj=type(get_examples)
)
raise TypeError(err)
# Make sure the config is interpolated so we can resolve subsections
config = self.config.interpolate()
# These are the settings provided in the [initialize] block in the config
I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
before_init = I["before_init"]
if before_init is not None:
before_init(self)
try:
init_vocab(
self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"]
)
except IOError:
raise IOError(Errors.E884.format(vectors=I["vectors"]))
if self.vocab.vectors.data.shape[1] >= 1:
ops = get_current_ops()
self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
if hasattr(self.tokenizer, "initialize"):
tok_settings = validate_init_settings(
self.tokenizer.initialize, # type: ignore[union-attr]
I["tokenizer"],
section="tokenizer",
name="tokenizer",
)
self.tokenizer.initialize(get_examples, nlp=self, **tok_settings) # type: ignore[union-attr]
for name, proc in self.pipeline:
if isinstance(proc, ty.InitializableComponent):
p_settings = I["components"].get(name, {})
p_settings = validate_init_settings(
proc.initialize, p_settings, section="components", name=name
)
proc.initialize(get_examples, nlp=self, **p_settings)
pretrain_cfg = config.get("pretraining")
if pretrain_cfg:
P = registry.resolve(pretrain_cfg, schema=ConfigSchemaPretrain)
init_tok2vec(self, P, I)
self._link_components()
self._optimizer = sgd
if sgd is not None:
self._optimizer = sgd
elif self._optimizer is None:
self._optimizer = self.create_optimizer()
after_init = I["after_init"]
if after_init is not None:
after_init(self)
return self._optimizer
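# EXAMPLE (illustrative sketch): initializing with and without gold-standard
# examples; `train_examples` is assumed to be an iterable of Example objects.
#
#     >>> optimizer = nlp.initialize()                        # dummy examples
#     >>> optimizer = nlp.initialize(lambda: train_examples)  # real examples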
def resume_training(self, *, sgd: Optional[Optimizer] = None) -> Optimizer:
"""Continue training a pretrained model.
Create and return an optimizer, and initialize "rehearsal" for any pipeline
component that has a .rehearse() method. Rehearsal is used to prevent
models from "forgetting" their initialized "knowledge". To perform
rehearsal, collect samples of text you want the models to retain performance
on, and call nlp.rehearse() with a batch of Example objects.
RETURNS (Optimizer): The optimizer.
DOCS: https://spacy.io/api/language#resume_training
"""
ops = get_current_ops()
if self.vocab.vectors.data.shape[1] >= 1:
self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
for name, proc in self.pipeline:
if hasattr(proc, "_rehearsal_model"):
proc._rehearsal_model = deepcopy(proc.model) # type: ignore[attr-defined]
if sgd is not None:
self._optimizer = sgd
elif self._optimizer is None:
self._optimizer = self.create_optimizer()
return self._optimizer
def set_error_handler(
self,
error_handler: Callable[[str, "Pipe", List[Doc], Exception], NoReturn],
):
"""Set an error handler object for all the components in the pipeline that implement
a set_error_handler function.
error_handler (Callable[[str, Pipe, List[Doc], Exception], NoReturn]):
Function that deals with a failing batch of documents. This callable function should take in
the component's name, the component itself, the offending batch of documents, and the exception
that was thrown.
DOCS: https://spacy.io/api/language#set_error_handler
"""
self.default_error_handler = error_handler
for name, pipe in self.pipeline:
if hasattr(pipe, "set_error_handler"):
pipe.set_error_handler(error_handler)
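# EXAMPLE (illustrative sketch): installing a custom error handler that
# reports the failing component before re-raising.
#
#     >>> def warn_and_raise(name, proc, docs, e):
#     ...     print(f"component {name!r} failed on a batch of {len(docs)} doc(s)")
#     ...     raise e
#     >>> nlp.set_error_handler(warn_and_raise)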
def evaluate(
self,
examples: Iterable[Example],
*,
batch_size: Optional[int] = None,
scorer: Optional[Scorer] = None,
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
scorer_cfg: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
"""Evaluate a model's pipeline components.
examples (Iterable[Example]): `Example` objects.
batch_size (Optional[int]): Batch size to use.
scorer (Optional[Scorer]): Scorer to use. If not passed in, a new one
will be created.
component_cfg (dict): An optional dictionary with extra keyword
arguments for specific components.
scorer_cfg (dict): An optional dictionary with extra keyword arguments
for the scorer.
RETURNS (Dict[str, Any]): A dictionary of evaluation scores.
DOCS: https://spacy.io/api/language#evaluate
"""
examples = list(examples)
validate_examples(examples, "Language.evaluate")
examples = _copy_examples(examples)
if batch_size is None:
batch_size = self.batch_size
if component_cfg is None:
component_cfg = {}
if scorer_cfg is None:
scorer_cfg = {}
if scorer is None:
kwargs = dict(scorer_cfg)
kwargs.setdefault("nlp", self)
scorer = Scorer(**kwargs)
# reset annotation in predicted docs and time tokenization
start_time = timer()
# this is purely for timing
for eg in examples:
self.make_doc(eg.reference.text)
# apply all pipeline components
for name, pipe in self.pipeline:
kwargs = component_cfg.get(name, {})
kwargs.setdefault("batch_size", batch_size)
for doc, eg in zip(
_pipe(
(eg.predicted for eg in examples),
proc=pipe,
name=name,
default_error_handler=self.default_error_handler,
kwargs=kwargs,
),
examples,
):
eg.predicted = doc
end_time = timer()
results = scorer.score(examples)
n_words = sum(len(eg.predicted) for eg in examples)
results["speed"] = n_words / (end_time - start_time)
return results
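# EXAMPLE (illustrative sketch): evaluating on a development set, assuming
# `dev_examples` is a list of Example objects.
#
#     >>> scores = nlp.evaluate(dev_examples, batch_size=64)
#     >>> scores["speed"]   # words per second over the evaluation run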
def create_optimizer(self):
"""Create an optimizer, usually using the [training.optimizer] config."""
subconfig = {"optimizer": self.config["training"]["optimizer"]}
return registry.resolve(subconfig)["optimizer"]
@contextmanager
def use_params(self, params: Optional[dict]):
"""Replace weights of models in the pipeline with those provided in the
params dictionary. Can be used as a contextmanager, in which case,
models go back to their original weights after the block.
params (dict): A dictionary of parameters keyed by model ID.
EXAMPLE:
>>> with nlp.use_params(optimizer.averages):
>>> nlp.to_disk("/tmp/checkpoint")
DOCS: https://spacy.io/api/language#use_params
"""
if not params:
yield
else:
contexts = [
pipe.use_params(params) # type: ignore[attr-defined]
for name, pipe in self.pipeline
if hasattr(pipe, "use_params") and hasattr(pipe, "model")
]
# TODO: Having trouble with contextlib
# Workaround: these aren't actually context managers atm.
for context in contexts:
try:
next(context)
except StopIteration:
pass
yield
for context in contexts:
try:
next(context)
except StopIteration:
pass
@overload
def pipe(
self,
texts: Iterable[Union[str, Doc]],
*,
as_tuples: Literal[False] = ...,
batch_size: Optional[int] = ...,
disable: Iterable[str] = ...,
component_cfg: Optional[Dict[str, Dict[str, Any]]] = ...,
n_process: int = ...,
) -> Iterator[Doc]:
...
@overload
def pipe( # noqa: F811
self,
texts: Iterable[Tuple[Union[str, Doc], _AnyContext]],
*,
as_tuples: Literal[True] = ...,
batch_size: Optional[int] = ...,
disable: Iterable[str] = ...,
component_cfg: Optional[Dict[str, Dict[str, Any]]] = ...,
n_process: int = ...,
) -> Iterator[Tuple[Doc, _AnyContext]]:
...
def pipe( # noqa: F811
self,
texts: Union[
Iterable[Union[str, Doc]], Iterable[Tuple[Union[str, Doc], _AnyContext]]
],
*,
as_tuples: bool = False,
batch_size: Optional[int] = None,
disable: Iterable[str] = SimpleFrozenList(),
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
n_process: int = 1,
) -> Union[Iterator[Doc], Iterator[Tuple[Doc, _AnyContext]]]:
"""Process texts as a stream, and yield `Doc` objects in order.
texts (Iterable[Union[str, Doc]]): A sequence of texts or docs to
process.
as_tuples (bool): If set to True, inputs should be a sequence of
(text, context) tuples. Output will then be a sequence of
(doc, context) tuples. Defaults to False.
batch_size (Optional[int]): The number of texts to buffer.
disable (List[str]): Names of the pipeline components to disable.
component_cfg (Dict[str, Dict]): An optional dictionary with extra keyword
arguments for specific components.
n_process (int): Number of processes to use when processing texts. If -1, `multiprocessing.cpu_count()` is used.
YIELDS (Doc): Documents in the order of the original text.
DOCS: https://spacy.io/api/language#pipe
"""
# Handle texts with context as tuples
if as_tuples:
texts = cast(Iterable[Tuple[Union[str, Doc], _AnyContext]], texts)
docs_with_contexts = (
self._ensure_doc_with_context(text, context) for text, context in texts
)
docs = self.pipe(
docs_with_contexts,
batch_size=batch_size,
disable=disable,
n_process=n_process,
component_cfg=component_cfg,
)
for doc in docs:
context = doc._context
doc._context = None
yield (doc, context)
return
texts = cast(Iterable[Union[str, Doc]], texts)
# Set argument defaults
if n_process == -1:
n_process = mp.cpu_count()
if component_cfg is None:
component_cfg = {}
if batch_size is None:
batch_size = self.batch_size
pipes = (
[]
)  # contains functools.partial objects used to create multiprocess workers.
for name, proc in self.pipeline:
if name in disable:
continue
kwargs = component_cfg.get(name, {})
# Allow component_cfg to overwrite the top-level kwargs.
kwargs.setdefault("batch_size", batch_size)
f = functools.partial(
_pipe,
proc=proc,
name=name,
kwargs=kwargs,
default_error_handler=self.default_error_handler,
)
pipes.append(f)
if n_process != 1:
if self._has_gpu_model(disable):
warnings.warn(Warnings.W114)
docs = self._multiprocessing_pipe(texts, pipes, n_process, batch_size)
else:
# if n_process == 1, no processes are forked.
docs = (self._ensure_doc(text) for text in texts)
for pipe in pipes:
docs = pipe(docs)
for doc in docs:
yield doc
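# EXAMPLE (illustrative sketch): streaming texts through the pipeline, with
# and without per-text context.
#
#     >>> for doc in nlp.pipe(["First text.", "Second text."], batch_size=50):
#     ...     pass
#     >>> data = [("A text.", {"id": 1}), ("Another text.", {"id": 2})]
#     >>> for doc, context in nlp.pipe(data, as_tuples=True):
#     ...     pass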
def _has_gpu_model(self, disable: Iterable[str]):
for name, proc in self.pipeline:
is_trainable = hasattr(proc, "is_trainable") and proc.is_trainable # type: ignore
if name in disable or not is_trainable:
continue
if hasattr(proc, "model") and hasattr(proc.model, "ops") and isinstance(proc.model.ops, CupyOps): # type: ignore
return True
return False
def _multiprocessing_pipe(
self,
texts: Iterable[Union[str, Doc]],
pipes: Iterable[Callable[..., Iterator[Doc]]],
n_process: int,
batch_size: int,
) -> Iterator[Doc]:
# raw_texts is used later to stop iteration.
texts, raw_texts = itertools.tee(texts)
# for sending texts to worker
texts_q: List[mp.Queue] = [mp.Queue() for _ in range(n_process)]
# for receiving byte-encoded docs from worker
bytedocs_recv_ch, bytedocs_send_ch = zip(
*[mp.Pipe(False) for _ in range(n_process)]
)
batch_texts = util.minibatch(texts, batch_size)
# The sender sends texts to the workers.
# This is necessary to properly handle streams of texts of unbounded length,
# in which case all of the data cannot be sent to the workers at once.
sender = _Sender(batch_texts, texts_q, chunk_size=n_process)
# send twice to make process busy
sender.send()
sender.send()
procs = [
mp.Process(
target=_apply_pipes,
args=(self._ensure_doc, pipes, rch, sch, Underscore.get_state()),
)
for rch, sch in zip(texts_q, bytedocs_send_ch)
]
for proc in procs:
proc.start()
# Cycle through the channels so the order of the docs isn't broken.
# Each received object is a batch of byte-encoded docs, so flatten them with chain.from_iterable.
byte_tuples = chain.from_iterable(
recv.recv() for recv in cycle(bytedocs_recv_ch)
)
try:
for i, (_, (byte_doc, byte_context, byte_error)) in enumerate(
zip(raw_texts, byte_tuples), 1
):
if byte_doc is not None:
doc = Doc(self.vocab).from_bytes(byte_doc)
doc._context = byte_context
yield doc
elif byte_error is not None:
error = srsly.msgpack_loads(byte_error)
self.default_error_handler(
None, None, None, ValueError(Errors.E871.format(error=error))
)
if i % batch_size == 0:
# tell `sender` that one batch was consumed.
sender.step()
finally:
for proc in procs:
proc.terminate()
def _link_components(self) -> None:
"""Register 'listeners' within pipeline components, to allow them to
effectively share weights.
"""
# I had thought, "Why do we do this inside the Language object? Shouldn't
# it be the tok2vec/transformer/etc.'s job?"
# The problem is we need to do it during deserialization... And the
# components don't receive the pipeline then. So this does have to be
# here :(
for i, (name1, proc1) in enumerate(self.pipeline):
if isinstance(proc1, ty.ListenedToComponent):
for name2, proc2 in self.pipeline[i + 1 :]:
proc1.find_listeners(proc2)
@classmethod
def from_config(
cls,
config: Union[Dict[str, Any], Config] = {},
*,
vocab: Union[Vocab, bool] = True,
disable: Iterable[str] = SimpleFrozenList(),
exclude: Iterable[str] = SimpleFrozenList(),
meta: Dict[str, Any] = SimpleFrozenDict(),
auto_fill: bool = True,
validate: bool = True,
) -> "Language":
"""Create the nlp object from a loaded config. Will set up the tokenizer
and language data, add pipeline components etc. If no config is provided,
the default config of the given language is used.
config (Dict[str, Any] / Config): The loaded config.
vocab (Vocab): A Vocab object. If True, a vocab is created.
disable (Iterable[str]): Names of pipeline components to disable.
Disabled pipes will be loaded but they won't be run unless you
explicitly enable them by calling nlp.enable_pipe.
exclude (Iterable[str]): Names of pipeline components to exclude.
Excluded components won't be loaded.
meta (Dict[str, Any]): Meta overrides for nlp.meta.
auto_fill (bool): Automatically fill in missing values in config based
on defaults and function argument annotations.
validate (bool): Validate the component config and arguments against
the types expected by the factory.
RETURNS (Language): The initialized Language class.
DOCS: https://spacy.io/api/language#from_config
"""
if auto_fill:
config = Config(
cls.default_config, section_order=CONFIG_SECTION_ORDER
).merge(config)
if "nlp" not in config:
raise ValueError(Errors.E985.format(config=config))
config_lang = config["nlp"].get("lang")
if config_lang is not None and config_lang != cls.lang:
raise ValueError(
Errors.E958.format(
bad_lang_code=config["nlp"]["lang"],
lang_code=cls.lang,
lang=util.get_object_name(cls),
)
)
config["nlp"]["lang"] = cls.lang
# This isn't very elegant, but we remove the [components] block here to prevent
# it from getting resolved (causes problems because we expect to pass in
# the nlp and name args for each component). If we're auto-filling, we're
# using the nlp.config with all defaults.
config = util.copy_config(config)
orig_pipeline = config.pop("components", {})
orig_pretraining = config.pop("pretraining", None)
config["components"] = {}
if auto_fill:
filled = registry.fill(config, validate=validate, schema=ConfigSchema)
else:
filled = config
filled["components"] = orig_pipeline
config["components"] = orig_pipeline
if orig_pretraining is not None:
filled["pretraining"] = orig_pretraining
config["pretraining"] = orig_pretraining
resolved_nlp = registry.resolve(
filled["nlp"], validate=validate, schema=ConfigSchemaNlp
)
create_tokenizer = resolved_nlp["tokenizer"]
before_creation = resolved_nlp["before_creation"]
after_creation = resolved_nlp["after_creation"]
after_pipeline_creation = resolved_nlp["after_pipeline_creation"]
lang_cls = cls
if before_creation is not None:
lang_cls = before_creation(cls)
if (
not isinstance(lang_cls, type)
or not issubclass(lang_cls, cls)
or lang_cls is not cls
):
raise ValueError(Errors.E943.format(value=type(lang_cls)))
# Warn about require_gpu usage in jupyter notebook
warn_if_jupyter_cupy()
# Note that we don't load vectors here, instead they get loaded explicitly
# inside stuff like the spacy train function. If we loaded them here,
# then we would load them twice at runtime: once when we make from config,
# and then again when we load from disk.
nlp = lang_cls(vocab=vocab, create_tokenizer=create_tokenizer, meta=meta)
if after_creation is not None:
nlp = after_creation(nlp)
if not isinstance(nlp, cls):
raise ValueError(Errors.E942.format(name="creation", value=type(nlp)))
# To create the components we need to use the final interpolated config
# so all values are available (if component configs use variables).
# Later we replace the component config with the raw config again.
interpolated = filled.interpolate() if not filled.is_interpolated else filled
pipeline = interpolated.get("components", {})
sourced = util.get_sourced_components(interpolated)
# If components are loaded from a source (existing models), we cache
# them here so they're only loaded once
source_nlps = {}
source_nlp_vectors_hashes = {}
vocab_b = None
for pipe_name in config["nlp"]["pipeline"]:
if pipe_name not in pipeline:
opts = ", ".join(pipeline.keys())
raise ValueError(Errors.E956.format(name=pipe_name, opts=opts))
pipe_cfg = util.copy_config(pipeline[pipe_name])
raw_config = Config(filled["components"][pipe_name])
if pipe_name not in exclude:
if "factory" not in pipe_cfg and "source" not in pipe_cfg:
err = Errors.E984.format(name=pipe_name, config=pipe_cfg)
raise ValueError(err)
if "factory" in pipe_cfg:
factory = pipe_cfg.pop("factory")
# The pipe name (key in the config) here is the unique name
# of the component, not necessarily the factory
nlp.add_pipe(
factory,
name=pipe_name,
config=pipe_cfg,
validate=validate,
raw_config=raw_config,
)
else:
# We need the sourced components to reference the same
# vocab without modifying the current vocab state **AND**
# we still want to load the source model vectors to perform
# the vectors check. Since the source vectors clobber the
# current ones, we save the original vocab state and
# restore after this loop. Existing strings are preserved
# during deserialization, so they do not need any
# additional handling.
if vocab_b is None:
vocab_b = nlp.vocab.to_bytes(exclude=["lookups", "strings"])
model = pipe_cfg["source"]
if model not in source_nlps:
# Load with the same vocab, adding any strings
source_nlps[model] = util.load_model(
model, vocab=nlp.vocab, exclude=["lookups"]
)
source_name = pipe_cfg.get("component", pipe_name)
listeners_replaced = False
if "replace_listeners" in pipe_cfg:
for name, proc in source_nlps[model].pipeline:
if source_name in getattr(proc, "listening_components", []):
source_nlps[model].replace_listeners(
name, source_name, pipe_cfg["replace_listeners"]
)
listeners_replaced = True
with warnings.catch_warnings():
warnings.filterwarnings("ignore", message="\\[W113\\]")
nlp.add_pipe(
source_name, source=source_nlps[model], name=pipe_name
)
if model not in source_nlp_vectors_hashes:
source_nlp_vectors_hashes[model] = hash(
source_nlps[model].vocab.vectors.to_bytes()
)
if "_sourced_vectors_hashes" not in nlp.meta:
nlp.meta["_sourced_vectors_hashes"] = {}
nlp.meta["_sourced_vectors_hashes"][
pipe_name
] = source_nlp_vectors_hashes[model]
# Delete from cache if listeners were replaced
if listeners_replaced:
del source_nlps[model]
# Restore the original vocab after sourcing if necessary
if vocab_b is not None:
nlp.vocab.from_bytes(vocab_b)
disabled_pipes = [*config["nlp"]["disabled"], *disable]
nlp._disabled = set(p for p in disabled_pipes if p not in exclude)
nlp.batch_size = config["nlp"]["batch_size"]
nlp.config = filled if auto_fill else config
if after_pipeline_creation is not None:
nlp = after_pipeline_creation(nlp)
if not isinstance(nlp, cls):
raise ValueError(
Errors.E942.format(name="pipeline_creation", value=type(nlp))
)
# Detect components with listeners that are not frozen consistently
for name, proc in nlp.pipeline:
if isinstance(proc, ty.ListenedToComponent):
# Remove listeners not in the pipeline
listener_names = proc.listening_components
unused_listener_names = [
ll for ll in listener_names if ll not in nlp.pipe_names
]
for listener_name in unused_listener_names:
for listener in proc.listener_map.get(listener_name, []):
proc.remove_listener(listener, listener_name)
for listener_name in proc.listening_components:
# e.g. tok2vec/transformer
# If it's a component sourced from another pipeline, we check if
# the tok2vec listeners should be replaced with standalone tok2vec
# models (e.g. so the component can be frozen without its performance
# degrading when other components/tok2vec are updated)
paths = sourced.get(listener_name, {}).get("replace_listeners", [])
if paths:
nlp.replace_listeners(name, listener_name, paths)
return nlp
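# A minimal usage sketch for from_config, assuming a complete spaCy v3 config
# at a hypothetical path "./config.cfg":
#
#     from thinc.api import Config
#     from spacy.lang.en import English
#
#     config = Config().from_disk("./config.cfg")
#     nlp = English.from_config(config, auto_fill=True, validate=True)
#     print(nlp.pipe_names)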
def replace_listeners(
self,
tok2vec_name: str,
pipe_name: str,
listeners: Iterable[str],
) -> None:
"""Find listener layers (connecting to a token-to-vector embedding
component) of a given pipeline component model and replace
them with a standalone copy of the token-to-vector layer. This can be
useful when training a pipeline with components sourced from an existing
pipeline: if multiple components (e.g. tagger, parser, NER) listen to
the same tok2vec component, but some of them are frozen and not updated,
their performance may degrade significantly as the tok2vec component is
updated with new data. To prevent this, listeners can be replaced with
a standalone tok2vec layer that is owned by the component and doesn't
change if the component isn't updated.
tok2vec_name (str): Name of the token-to-vector component, typically
"tok2vec" or "transformer".
pipe_name (str): Name of pipeline component to replace listeners for.
listeners (Iterable[str]): The paths to the listeners, relative to the
component config, e.g. ["model.tok2vec"]. Typically, implementations
will only connect to one tok2vec component, [model.tok2vec], but in
theory, custom models can use multiple listeners. The value here can
either be an empty list to not replace any listeners, or a complete
(!) list of the paths to all listener layers used by the model.
DOCS: https://spacy.io/api/language#replace_listeners
"""
if tok2vec_name not in self.pipe_names:
err = Errors.E889.format(
tok2vec=tok2vec_name,
name=pipe_name,
unknown=tok2vec_name,
opts=", ".join(self.pipe_names),
)
raise ValueError(err)
if pipe_name not in self.pipe_names:
err = Errors.E889.format(
tok2vec=tok2vec_name,
name=pipe_name,
unknown=pipe_name,
opts=", ".join(self.pipe_names),
)
raise ValueError(err)
tok2vec = self.get_pipe(tok2vec_name)
tok2vec_cfg = self.get_pipe_config(tok2vec_name)
if not isinstance(tok2vec, ty.ListenedToComponent):
raise ValueError(Errors.E888.format(name=tok2vec_name, pipe=type(tok2vec)))
tok2vec_model = tok2vec.model
pipe_listeners = tok2vec.listener_map.get(pipe_name, [])
pipe = self.get_pipe(pipe_name)
pipe_cfg = self._pipe_configs[pipe_name]
if listeners:
util.logger.debug(f"Replacing listeners of component '{pipe_name}'")
if len(list(listeners)) != len(pipe_listeners):
# The number of listeners defined in the component model doesn't
# match the listeners to replace, so we won't be able to update
# the nodes and generate a matching config
err = Errors.E887.format(
name=pipe_name,
tok2vec=tok2vec_name,
paths=listeners,
n_listeners=len(pipe_listeners),
)
raise ValueError(err)
# Update the config accordingly by copying the tok2vec model to all
# sections defined in the listener paths
for listener_path in listeners:
# Check if the path actually exists in the config
try:
util.dot_to_object(pipe_cfg, listener_path)
except KeyError:
err = Errors.E886.format(
name=pipe_name, tok2vec=tok2vec_name, path=listener_path
)
raise ValueError(err)
new_config = tok2vec_cfg["model"]
if "replace_listener_cfg" in tok2vec_model.attrs:
replace_func = tok2vec_model.attrs["replace_listener_cfg"]
new_config = replace_func(
tok2vec_cfg["model"], pipe_cfg["model"]["tok2vec"]
)
util.set_dot_to_object(pipe_cfg, listener_path, new_config)
# Go over the listener layers and replace them
for listener in pipe_listeners:
new_model = tok2vec_model.copy()
if "replace_listener" in tok2vec_model.attrs:
new_model = tok2vec_model.attrs["replace_listener"](new_model)
util.replace_model_node(pipe.model, listener, new_model) # type: ignore[attr-defined]
tok2vec.remove_listener(listener, pipe_name)
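# A hedged usage sketch for replace_listeners, assuming a pipeline in which
# "ner" listens to a "tok2vec" component and exposes the listener at the
# config path "model.tok2vec":
#
#     nlp.replace_listeners("tok2vec", "ner", ["model.tok2vec"])
#
# A sourced component can request the same replacement declaratively via
# replace_listeners = ["model.tok2vec"] in its training config block.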
def to_disk(
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
) -> None:
"""Save the current state to a directory. If a model is loaded, this
will include the model.
path (str / Path): Path to a directory, which will be created if
it doesn't exist.
exclude (Iterable[str]): Names of components or serialization fields to exclude.
DOCS: https://spacy.io/api/language#to_disk
"""
path = util.ensure_path(path)
serializers = {}
serializers["tokenizer"] = lambda p: self.tokenizer.to_disk( # type: ignore[union-attr]
p, exclude=["vocab"]
)
serializers["meta.json"] = lambda p: srsly.write_json(p, self.meta)
serializers["config.cfg"] = lambda p: self.config.to_disk(p)
for name, proc in self._components:
if name in exclude:
continue
if not hasattr(proc, "to_disk"):
continue
serializers[name] = lambda p, proc=proc: proc.to_disk(p, exclude=["vocab"]) # type: ignore[misc]
serializers["vocab"] = lambda p: self.vocab.to_disk(p, exclude=exclude)
util.to_disk(path, serializers, exclude)
def from_disk(
self,
path: Union[str, Path],
*,
exclude: Iterable[str] = SimpleFrozenList(),
overrides: Dict[str, Any] = SimpleFrozenDict(),
) -> "Language":
"""Loads state from a directory. Modifies the object in place and
returns it. If the saved `Language` object contains a model, the
model will be loaded.
path (str / Path): A path to a directory.
exclude (Iterable[str]): Names of components or serialization fields to exclude.
RETURNS (Language): The modified `Language` object.
DOCS: https://spacy.io/api/language#from_disk
"""
def deserialize_meta(path: Path) -> None:
if path.exists():
data = srsly.read_json(path)
self.meta.update(data)
# self.meta always overrides meta["vectors"] with the metadata
# from self.vocab.vectors, so set the name directly
self.vocab.vectors.name = data.get("vectors", {}).get("name")
def deserialize_vocab(path: Path) -> None:
if path.exists():
self.vocab.from_disk(path, exclude=exclude)
path = util.ensure_path(path)
deserializers = {}
if Path(path / "config.cfg").exists(): # type: ignore[operator]
deserializers["config.cfg"] = lambda p: self.config.from_disk(
p, interpolate=False, overrides=overrides
)
deserializers["meta.json"] = deserialize_meta # type: ignore[assignment]
deserializers["vocab"] = deserialize_vocab # type: ignore[assignment]
deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk( # type: ignore[union-attr]
p, exclude=["vocab"]
)
for name, proc in self._components:
if name in exclude:
continue
if not hasattr(proc, "from_disk"):
continue
deserializers[name] = lambda p, proc=proc: proc.from_disk( # type: ignore[misc]
p, exclude=["vocab"]
)
if not (path / "vocab").exists() and "vocab" not in exclude: # type: ignore[operator]
# Convert to list here in case exclude is (default) tuple
exclude = list(exclude) + ["vocab"]
util.from_disk(path, deserializers, exclude) # type: ignore[arg-type]
self._path = path # type: ignore[assignment]
self._link_components()
return self
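# A small round-trip sketch for to_disk/from_disk; "./my_pipeline" is a
# hypothetical directory:
#
#     nlp.to_disk("./my_pipeline")
#     nlp2 = spacy.load("./my_pipeline")  # re-creates the pipeline from config.cfg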
def to_bytes(self, *, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
"""Serialize the current state to a binary string.
exclude (Iterable[str]): Names of components or serialization fields to exclude.
RETURNS (bytes): The serialized form of the `Language` object.
DOCS: https://spacy.io/api/language#to_bytes
"""
serializers: Dict[str, Callable[[], bytes]] = {}
serializers["vocab"] = lambda: self.vocab.to_bytes(exclude=exclude)
serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"]) # type: ignore[union-attr]
serializers["meta.json"] = lambda: srsly.json_dumps(self.meta)
serializers["config.cfg"] = lambda: self.config.to_bytes()
for name, proc in self._components:
if name in exclude:
continue
if not hasattr(proc, "to_bytes"):
continue
serializers[name] = lambda proc=proc: proc.to_bytes(exclude=["vocab"]) # type: ignore[misc]
return util.to_bytes(serializers, exclude)
def from_bytes(
self, bytes_data: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
) -> "Language":
"""Load state from a binary string.
bytes_data (bytes): The data to load from.
exclude (Iterable[str]): Names of components or serialization fields to exclude.
RETURNS (Language): The `Language` object.
DOCS: https://spacy.io/api/language#from_bytes
"""
def deserialize_meta(b):
data = srsly.json_loads(b)
self.meta.update(data)
# self.meta always overrides meta["vectors"] with the metadata
# from self.vocab.vectors, so set the name directly
self.vocab.vectors.name = data.get("vectors", {}).get("name")
deserializers: Dict[str, Callable[[bytes], Any]] = {}
deserializers["config.cfg"] = lambda b: self.config.from_bytes(
b, interpolate=False
)
deserializers["meta.json"] = deserialize_meta
deserializers["vocab"] = lambda b: self.vocab.from_bytes(b, exclude=exclude)
deserializers["tokenizer"] = lambda b: self.tokenizer.from_bytes( # type: ignore[union-attr]
b, exclude=["vocab"]
)
for name, proc in self._components:
if name in exclude:
continue
if not hasattr(proc, "from_bytes"):
continue
deserializers[name] = lambda b, proc=proc: proc.from_bytes( # type: ignore[misc]
b, exclude=["vocab"]
)
util.from_bytes(bytes_data, deserializers, exclude)
self._link_components()
return self
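# A hedged byte-serialization sketch. from_bytes restores state into an
# existing object, and component data is only applied to components that
# already exist on the receiving nlp:
#
#     nlp_bytes = nlp.to_bytes()
#     nlp2 = spacy.blank(nlp.lang)   # or an nlp built from the same config
#     nlp2.from_bytes(nlp_bytes)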
@dataclass
class FactoryMeta:
"""Dataclass containing information about a component and its defaults
provided by the @Language.component or @Language.factory decorator. It's
created whenever a component is defined and stored on the Language class for
each component instance and factory instance.
"""
factory: str
default_config: Optional[Dict[str, Any]] = None # noqa: E704
assigns: Iterable[str] = tuple()
requires: Iterable[str] = tuple()
retokenizes: bool = False
scores: Iterable[str] = tuple()
default_score_weights: Optional[Dict[str, Optional[float]]] = None # noqa: E704
class DisabledPipes(list):
"""Manager for temporary pipeline disabling."""
def __init__(self, nlp: Language, names: List[str]) -> None:
self.nlp = nlp
self.names = names
for name in self.names:
self.nlp.disable_pipe(name)
list.__init__(self)
self.extend(self.names)
def __enter__(self):
return self
def __exit__(self, *args):
self.restore()
def restore(self) -> None:
"""Restore the pipeline to its state when DisabledPipes was created."""
for name in self.names:
if name not in self.nlp.component_names:
raise ValueError(Errors.E008.format(name=name))
self.nlp.enable_pipe(name)
self[:] = []
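# A usage sketch: DisabledPipes is what nlp.select_pipes() returns, so it is
# typically used as a context manager:
#
#     with nlp.select_pipes(disable=["tagger", "parser"]):
#         doc = nlp("Only the remaining components run here.")
#     # on exit, restore() re-enables the disabled pipes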
def _copy_examples(examples: Iterable[Example]) -> List[Example]:
"""Make a copy of a batch of examples, copying the predicted Doc as well.
This is used in contexts where we need to take ownership of the examples
so that they can be mutated, for instance during Language.evaluate and
Language.update.
"""
return [Example(eg.x.copy(), eg.y) for eg in examples]
def _apply_pipes(
ensure_doc: Callable[[Union[str, Doc]], Doc],
pipes: Iterable[Callable[..., Iterator[Doc]]],
receiver,
sender,
underscore_state: Tuple[dict, dict, dict],
) -> None:
"""Worker for Language.pipe
ensure_doc (Callable[[Union[str, Doc]], Doc]): Function to create Doc from text
or raise an error if the input is neither a Doc nor a string.
pipes (Iterable[Pipe]): The components to apply.
receiver (multiprocessing.Connection): Pipe to receive text. Usually
created by `multiprocessing.Pipe()`
sender (multiprocessing.Connection): Pipe to send doc. Usually created by
`multiprocessing.Pipe()`
underscore_state (Tuple[dict, dict, dict]): The data in the Underscore class
of the parent.
"""
Underscore.load_state(underscore_state)
while True:
try:
texts = receiver.get()
docs = (ensure_doc(text) for text in texts)
for pipe in pipes:
docs = pipe(docs) # type: ignore[arg-type, assignment]
# Connection does not accept unpicklable objects, so send a list instead.
byte_docs = [(doc.to_bytes(), doc._context, None) for doc in docs]
padding = [(None, None, None)] * (len(texts) - len(byte_docs))
sender.send(byte_docs + padding) # type: ignore[operator]
except Exception:
error_msg = [(None, None, srsly.msgpack_dumps(traceback.format_exc()))]
padding = [(None, None, None)] * (len(texts) - 1)
sender.send(error_msg + padding)
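# _apply_pipes is the worker target used by Language.pipe when n_process > 1.
# A sketch of the corresponding user-facing call:
#
#     texts = ["First text.", "Second text.", "Third text."]
#     for doc in nlp.pipe(texts, n_process=2, batch_size=50):
#         print(len(doc))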
class _Sender:
"""Util for sending data to multiprocessing workers in Language.pipe"""
def __init__(
self, data: Iterable[Any], queues: List[mp.Queue], chunk_size: int
) -> None:
self.data = iter(data)
self.queues = iter(cycle(queues))
self.chunk_size = chunk_size
self.count = 0
def send(self) -> None:
"""Send chunk_size items from self.data to channels."""
for item, q in itertools.islice(
zip(self.data, cycle(self.queues)), self.chunk_size
):
# cycle the channels so that the texts are distributed evenly
q.put(item)
def step(self) -> None:
"""Tell sender that comsumed one item. Data is sent to the workers after
every chunk_size calls.
"""
self.count += 1
if self.count >= self.chunk_size:
self.count = 0
self.send()
from numpy cimport ndarray
from .typedefs cimport attr_t, hash_t, flags_t, len_t, tag_t
from .attrs cimport attr_id_t
from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, LANG
from .structs cimport LexemeC
from .strings cimport StringStore
from .vocab cimport Vocab
cdef LexemeC EMPTY_LEXEME
cdef attr_t OOV_RANK
cdef class Lexeme:
cdef LexemeC* c
cdef readonly Vocab vocab
cdef readonly attr_t orth
@staticmethod
cdef inline Lexeme from_ptr(LexemeC* lex, Vocab vocab):
cdef Lexeme self = Lexeme.__new__(Lexeme, vocab, lex.orth)
self.c = lex
self.vocab = vocab
self.orth = lex.orth
return self
@staticmethod
cdef inline void set_struct_attr(LexemeC* lex, attr_id_t name, attr_t value) nogil:
if name < (sizeof(flags_t) * 8):
Lexeme.c_set_flag(lex, name, value)
elif name == ID:
lex.id = value
elif name == LOWER:
lex.lower = value
elif name == NORM:
lex.norm = value
elif name == SHAPE:
lex.shape = value
elif name == PREFIX:
lex.prefix = value
elif name == SUFFIX:
lex.suffix = value
elif name == LANG:
lex.lang = value
@staticmethod
cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
if feat_name < (sizeof(flags_t) * 8):
if Lexeme.c_check_flag(lex, feat_name):
return 1
else:
return 0
elif feat_name == ID:
return lex.id
elif feat_name == ORTH:
return lex.orth
elif feat_name == LOWER:
return lex.lower
elif feat_name == NORM:
return lex.norm
elif feat_name == SHAPE:
return lex.shape
elif feat_name == PREFIX:
return lex.prefix
elif feat_name == SUFFIX:
return lex.suffix
elif feat_name == LENGTH:
return lex.length
elif feat_name == LANG:
return lex.lang
else:
return 0
@staticmethod
cdef inline bint c_check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
cdef flags_t one = 1
if lexeme.flags & (one << flag_id):
return True
else:
return False
@staticmethod
cdef inline bint c_set_flag(LexemeC* lex, attr_id_t flag_id, bint value) nogil:
cdef flags_t one = 1
if value:
lex.flags |= one << flag_id
else:
lex.flags &= ~(one << flag_id)
from typing import (
Union,
Any,
)
from thinc.types import Floats1d
from .tokens import Doc, Span, Token
from .vocab import Vocab
class Lexeme:
def __init__(self, vocab: Vocab, orth: int) -> None: ...
def __richcmp__(self, other: Lexeme, op: int) -> bool: ...
def __hash__(self) -> int: ...
def set_attrs(self, **attrs: Any) -> None: ...
def set_flag(self, flag_id: int, value: bool) -> None: ...
def check_flag(self, flag_id: int) -> bool: ...
def similarity(self, other: Union[Doc, Span, Token, Lexeme]) -> float: ...
@property
def has_vector(self) -> bool: ...
@property
def vector_norm(self) -> float: ...
vector: Floats1d
rank: int
sentiment: float
@property
def orth_(self) -> str: ...
@property
def text(self) -> str: ...
lower: int
norm: int
shape: int
prefix: int
suffix: int
cluster: int
lang: int
prob: float
lower_: str
norm_: str
shape_: str
prefix_: str
suffix_: str
lang_: str
flags: int
@property
def is_oov(self) -> bool: ...
is_stop: bool
is_alpha: bool
is_ascii: bool
is_digit: bool
is_lower: bool
is_upper: bool
is_title: bool
is_punct: bool
is_space: bool
is_bracket: bool
is_quote: bool
is_left_punct: bool
is_right_punct: bool
is_currency: bool
like_url: bool
like_num: bool
like_email: bool
# cython: embedsignature=True
# Compiler crashes on memory view coercion without this. Should report bug.
from cython.view cimport array as cvarray
from libc.string cimport memset
cimport numpy as np
np.import_array()
import numpy
from thinc.api import get_array_module
import warnings
from .typedefs cimport attr_t, flags_t
from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT
from .attrs cimport IS_CURRENCY
from .attrs import intify_attrs
from .errors import Errors, Warnings
OOV_RANK = 0xffffffffffffffff # UINT64_MAX
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
EMPTY_LEXEME.id = OOV_RANK
cdef class Lexeme:
"""An entry in the vocabulary. A `Lexeme` has no string context – it's a
word-type, as opposed to a word token. It therefore has no part-of-speech
tag, dependency parse, or lemma (lemmatization depends on the
part-of-speech tag).
DOCS: https://spacy.io/api/lexeme
"""
def __init__(self, Vocab vocab, attr_t orth):
"""Create a Lexeme object.
vocab (Vocab): The parent vocabulary.
orth (uint64): The orth id of the lexeme.
RETURNS (Lexeme): The newly constructed object.
"""
self.vocab = vocab
self.orth = orth
self.c = <LexemeC*><void*>vocab.get_by_orth(vocab.mem, orth)
if self.c.orth != orth:
raise ValueError(Errors.E071.format(orth=orth, vocab_orth=self.c.orth))
def __richcmp__(self, other, int op):
if other is None:
if op == 0 or op == 1 or op == 2:
return False
else:
return True
if isinstance(other, Lexeme):
a = self.orth
b = other.orth
elif isinstance(other, long):
a = self.orth
b = other
elif isinstance(other, str):
a = self.orth_
b = other
else:
a = 0
b = 1
if op == 2: # ==
return a == b
elif op == 3: # !=
return a != b
elif op == 0: # <
return a < b
elif op == 1: # <=
return a <= b
elif op == 4: # >
return a > b
elif op == 5: # >=
return a >= b
else:
raise NotImplementedError(op)
def __hash__(self):
return self.c.orth
def set_attrs(self, **attrs):
cdef attr_id_t attr
attrs = intify_attrs(attrs)
for attr, value in attrs.items():
# skip PROB, e.g. from lexemes.jsonl
if isinstance(value, float):
continue
elif isinstance(value, (int, long)):
Lexeme.set_struct_attr(self.c, attr, value)
else:
Lexeme.set_struct_attr(self.c, attr, self.vocab.strings.add(value))
def set_flag(self, attr_id_t flag_id, bint value):
"""Change the value of a boolean flag.
flag_id (int): The attribute ID of the flag to set.
value (bool): The new value of the flag.
"""
Lexeme.c_set_flag(self.c, flag_id, value)
def check_flag(self, attr_id_t flag_id):
"""Check the value of a boolean flag.
flag_id (int): The attribute ID of the flag to query.
RETURNS (bool): The value of the flag.
"""
return True if Lexeme.c_check_flag(self.c, flag_id) else False
def similarity(self, other):
"""Compute a semantic similarity estimate. Defaults to cosine over
vectors.
other (object): The object to compare with. By default, accepts `Doc`,
`Span`, `Token` and `Lexeme` objects.
RETURNS (float): A scalar similarity score. Higher is more similar.
"""
# Return 1.0 similarity for matches
if hasattr(other, "orth"):
if self.c.orth == other.orth:
return 1.0
elif hasattr(other, "__len__") and len(other) == 1 \
and hasattr(other[0], "orth"):
if self.c.orth == other[0].orth:
return 1.0
if self.vector_norm == 0 or other.vector_norm == 0:
warnings.warn(Warnings.W008.format(obj="Lexeme"))
return 0.0
vector = self.vector
xp = get_array_module(vector)
return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm))
@property
def has_vector(self):
"""RETURNS (bool): Whether a word vector is associated with the object.
"""
return self.vocab.has_vector(self.c.orth)
@property
def vector_norm(self):
"""RETURNS (float): The L2 norm of the vector representation."""
vector = self.vector
return numpy.sqrt((vector**2).sum())
property vector:
"""A real-valued meaning representation.
RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
representing the lexeme's semantics.
"""
def __get__(self):
cdef int length = self.vocab.vectors_length
if length == 0:
raise ValueError(Errors.E010)
return self.vocab.get_vector(self.c.orth)
def __set__(self, vector):
if len(vector) != self.vocab.vectors_length:
raise ValueError(Errors.E073.format(new_length=len(vector),
length=self.vocab.vectors_length))
self.vocab.set_vector(self.c.orth, vector)
property rank:
"""RETURNS (str): Sequential ID of the lexeme's lexical type, used
to index into tables, e.g. for word vectors."""
def __get__(self):
return self.c.id
def __set__(self, value):
self.c.id = value
property sentiment:
"""RETURNS (float): A scalar value indicating the positivity or
negativity of the lexeme."""
def __get__(self):
sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment", {})
return sentiment_table.get(self.c.orth, 0.0)
def __set__(self, float x):
if "lexeme_sentiment" not in self.vocab.lookups:
self.vocab.lookups.add_table("lexeme_sentiment")
sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment")
sentiment_table[self.c.orth] = x
@property
def orth_(self):
"""RETURNS (str): The original verbatim text of the lexeme
(identical to `Lexeme.text`). Exists mostly for consistency with
the other attributes."""
return self.vocab.strings[self.c.orth]
@property
def text(self):
"""RETURNS (str): The original verbatim text of the lexeme."""
return self.orth_
property lower:
"""RETURNS (str): Lowercase form of the lexeme."""
def __get__(self):
return self.c.lower
def __set__(self, attr_t x):
self.c.lower = x
property norm:
"""RETURNS (uint64): The lexeme's norm, i.e. a normalised form of the
lexeme text.
"""
def __get__(self):
return self.c.norm
def __set__(self, attr_t x):
if "lexeme_norm" not in self.vocab.lookups:
self.vocab.lookups.add_table("lexeme_norm")
norm_table = self.vocab.lookups.get_table("lexeme_norm")
norm_table[self.c.orth] = self.vocab.strings[x]
self.c.norm = x
property shape:
"""RETURNS (uint64): Transform of the word's string, to show
orthographic features.
"""
def __get__(self):
return self.c.shape
def __set__(self, attr_t x):
self.c.shape = x
property prefix:
"""RETURNS (uint64): Length-N substring from the start of the word.
Defaults to `N=1`.
"""
def __get__(self):
return self.c.prefix
def __set__(self, attr_t x):
self.c.prefix = x
property suffix:
"""RETURNS (uint64): Length-N substring from the end of the word.
Defaults to `N=3`.
"""
def __get__(self):
return self.c.suffix
def __set__(self, attr_t x):
self.c.suffix = x
property cluster:
"""RETURNS (int): Brown cluster ID."""
def __get__(self):
cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {})
return cluster_table.get(self.c.orth, 0)
def __set__(self, int x):
cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {})
cluster_table[self.c.orth] = x
property lang:
"""RETURNS (uint64): Language of the parent vocabulary."""
def __get__(self):
return self.c.lang
def __set__(self, attr_t x):
self.c.lang = x
property prob:
"""RETURNS (float): Smoothed log probability estimate of the lexeme's
type."""
def __get__(self):
prob_table = self.vocab.lookups.get_table("lexeme_prob", {})
settings_table = self.vocab.lookups.get_table("lexeme_settings", {})
default_oov_prob = settings_table.get("oov_prob", -20.0)
return prob_table.get(self.c.orth, default_oov_prob)
def __set__(self, float x):
prob_table = self.vocab.lookups.get_table("lexeme_prob", {})
prob_table[self.c.orth] = x
property lower_:
"""RETURNS (str): Lowercase form of the word."""
def __get__(self):
return self.vocab.strings[self.c.lower]
def __set__(self, str x):
self.c.lower = self.vocab.strings.add(x)
property norm_:
"""RETURNS (str): The lexeme's norm, i.e. a normalised form of the
lexeme text.
"""
def __get__(self):
return self.vocab.strings[self.c.norm]
def __set__(self, str x):
self.norm = self.vocab.strings.add(x)
property shape_:
"""RETURNS (str): Transform of the word's string, to show
orthographic features.
"""
def __get__(self):
return self.vocab.strings[self.c.shape]
def __set__(self, str x):
self.c.shape = self.vocab.strings.add(x)
property prefix_:
"""RETURNS (str): Length-N substring from the start of the word.
Defaults to `N=1`.
"""
def __get__(self):
return self.vocab.strings[self.c.prefix]
def __set__(self, str x):
self.c.prefix = self.vocab.strings.add(x)
property suffix_:
"""RETURNS (str): Length-N substring from the end of the word.
Defaults to `N=3`.
"""
def __get__(self):
return self.vocab.strings[self.c.suffix]
def __set__(self, str x):
self.c.suffix = self.vocab.strings.add(x)
property lang_:
"""RETURNS (str): Language of the parent vocabulary."""
def __get__(self):
return self.vocab.strings[self.c.lang]
def __set__(self, str x):
self.c.lang = self.vocab.strings.add(x)
property flags:
"""RETURNS (uint64): Container of the lexeme's binary flags."""
def __get__(self):
return self.c.flags
def __set__(self, flags_t x):
self.c.flags = x
@property
def is_oov(self):
"""RETURNS (bool): Whether the lexeme is out-of-vocabulary."""
return self.orth not in self.vocab.vectors
property is_stop:
"""RETURNS (bool): Whether the lexeme is a stop word."""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_STOP)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_STOP, x)
property is_alpha:
"""RETURNS (bool): Whether the lexeme consists of alphabetic
characters. Equivalent to `lexeme.text.isalpha()`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_ALPHA)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_ALPHA, x)
property is_ascii:
"""RETURNS (bool): Whether the lexeme consists of ASCII characters.
Equivalent to `all(ord(c) < 128 for c in lexeme.text)`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_ASCII)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_ASCII, x)
property is_digit:
"""RETURNS (bool): Whether the lexeme consists of digits. Equivalent
to `lexeme.text.isdigit()`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_DIGIT)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_DIGIT, x)
property is_lower:
"""RETURNS (bool): Whether the lexeme is in lowercase. Equivalent to
`lexeme.text.islower()`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_LOWER)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_LOWER, x)
property is_upper:
"""RETURNS (bool): Whether the lexeme is in uppercase. Equivalent to
`lexeme.text.isupper()`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_UPPER)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_UPPER, x)
property is_title:
"""RETURNS (bool): Whether the lexeme is in titlecase. Equivalent to
`lexeme.text.istitle()`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_TITLE)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_TITLE, x)
property is_punct:
"""RETURNS (bool): Whether the lexeme is punctuation."""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_PUNCT)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_PUNCT, x)
property is_space:
"""RETURNS (bool): Whether the lexeme consist of whitespace characters.
Equivalent to `lexeme.text.isspace()`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_SPACE)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_SPACE, x)
property is_bracket:
"""RETURNS (bool): Whether the lexeme is a bracket."""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_BRACKET)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_BRACKET, x)
property is_quote:
"""RETURNS (bool): Whether the lexeme is a quotation mark."""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_QUOTE)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_QUOTE, x)
property is_left_punct:
"""RETURNS (bool): Whether the lexeme is left punctuation, e.g. (."""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x)
property is_right_punct:
"""RETURNS (bool): Whether the lexeme is right punctuation, e.g. )."""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)
property is_currency:
"""RETURNS (bool): Whether the lexeme is a currency symbol, e.g. $, €."""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_CURRENCY)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_CURRENCY, x)
property like_url:
"""RETURNS (bool): Whether the lexeme resembles a URL."""
def __get__(self):
return Lexeme.c_check_flag(self.c, LIKE_URL)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, LIKE_URL, x)
property like_num:
"""RETURNS (bool): Whether the lexeme represents a number, e.g. "10.9",
"10", "ten", etc.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, LIKE_NUM)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, LIKE_NUM, x)
property like_email:
"""RETURNS (bool): Whether the lexeme resembles an email address."""
def __get__(self):
return Lexeme.c_check_flag(self.c, LIKE_EMAIL)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, LIKE_EMAIL, x)
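# A hedged sketch of typical Lexeme access: lexemes come from the vocab, and
# the boolean attributes above map onto the stored bit flags:
#
#     import spacy
#     nlp = spacy.blank("en")
#     lex = nlp.vocab["apple"]
#     lex.is_alpha, lex.is_digit, lex.lower_   # (True, False, "apple")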
from typing import Any, List, Union, Optional, Dict
from pathlib import Path
import srsly
from preshed.bloom import BloomFilter
from collections import OrderedDict
from .errors import Errors
from .util import SimpleFrozenDict, ensure_path, registry, load_language_data
from .strings import get_string_id
UNSET = object()
def load_lookups(lang: str, tables: List[str], strict: bool = True) -> "Lookups":
"""Load the data from the spacy-lookups-data package for a given language,
if available. Returns an empty `Lookups` container if there's no data or if the package
is not installed.
lang (str): The language code (corresponds to entry point exposed by
the spacy-lookups-data package).
tables (List[str]): Name of tables to load, e.g. ["lemma_lookup", "lemma_exc"]
strict (bool): Whether to raise an error if a table doesn't exist.
RETURNS (Lookups): The lookups container containing the loaded tables.
"""
# TODO: import spacy_lookups_data instead of going via entry points here?
lookups = Lookups()
if lang not in registry.lookups:
if strict and len(tables) > 0:
raise ValueError(Errors.E955.format(table=", ".join(tables), lang=lang))
return lookups
data = registry.lookups.get(lang)
for table in tables:
if table not in data:
if strict:
raise ValueError(Errors.E955.format(table=table, lang=lang))
language_data = {} # type: ignore[var-annotated]
else:
language_data = load_language_data(data[table]) # type: ignore[assignment]
lookups.add_table(table, language_data)
return lookups
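# A hedged example of load_lookups; it assumes the optional
# spacy-lookups-data package is installed for the requested language:
#
#     from spacy.lookups import load_lookups
#     lookups = load_lookups("en", ["lexeme_norm"], strict=False)
#     "lexeme_norm" in lookups   # True if the table was available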
class Table(OrderedDict):
"""A table in the lookups. Subclass of builtin dict that implements a
slightly more consistent and unified API.
Includes a Bloom filter to speed up missed lookups.
"""
@classmethod
def from_dict(cls, data: dict, name: Optional[str] = None) -> "Table":
"""Initialize a new table from a dict.
data (dict): The dictionary.
name (str): Optional table name for reference.
DOCS: https://spacy.io/api/lookups#table.from_dict
"""
self = cls(name=name)
self.update(data)
return self
def __init__(self, name: Optional[str] = None, data: Optional[dict] = None) -> None:
"""Initialize a new table.
name (str): Optional table name for reference.
data (dict): Initial data, used to hint Bloom Filter.
DOCS: https://spacy.io/api/lookups#table.init
"""
OrderedDict.__init__(self)
self.name = name
# Assume a default size of 1M items
self.default_size = 1e6
size = max(len(data), 1) if data is not None else self.default_size
self.bloom = BloomFilter.from_error_rate(size)
if data:
self.update(data)
def __setitem__(self, key: Union[str, int], value: Any) -> None:
"""Set new key/value pair. String keys will be hashed.
key (str / int): The key to set.
value: The value to set.
"""
key = get_string_id(key)
OrderedDict.__setitem__(self, key, value)
self.bloom.add(key)
def set(self, key: Union[str, int], value: Any) -> None:
"""Set new key/value pair. String keys will be hashed.
Same as table[key] = value.
key (str / int): The key to set.
value: The value to set.
"""
self[key] = value
def __getitem__(self, key: Union[str, int]) -> Any:
"""Get the value for a given key. String keys will be hashed.
key (str / int): The key to get.
RETURNS: The value.
"""
key = get_string_id(key)
return OrderedDict.__getitem__(self, key)
def get(self, key: Union[str, int], default: Optional[Any] = None) -> Any:
"""Get the value for a given key. String keys will be hashed.
key (str / int): The key to get.
default: The default value to return.
RETURNS: The value.
"""
key = get_string_id(key)
return OrderedDict.get(self, key, default)
def __contains__(self, key: Union[str, int]) -> bool: # type: ignore[override]
"""Check whether a key is in the table. String keys will be hashed.
key (str / int): The key to check.
RETURNS (bool): Whether the key is in the table.
"""
key = get_string_id(key)
# The Bloom filter can return false positives (but never false negatives),
# so a positive hit still needs to be confirmed against the dict below
if key not in self.bloom:
return False
return OrderedDict.__contains__(self, key)
def to_bytes(self) -> bytes:
"""Serialize table to a bytestring.
RETURNS (bytes): The serialized table.
DOCS: https://spacy.io/api/lookups#table.to_bytes
"""
data = {
"name": self.name,
"dict": dict(self.items()),
"bloom": self.bloom.to_bytes(),
}
return srsly.msgpack_dumps(data)
def from_bytes(self, bytes_data: bytes) -> "Table":
"""Load a table from a bytestring.
bytes_data (bytes): The data to load.
RETURNS (Table): The loaded table.
DOCS: https://spacy.io/api/lookups#table.from_bytes
"""
loaded = srsly.msgpack_loads(bytes_data)
data = loaded.get("dict", {})
self.name = loaded["name"]
self.bloom = BloomFilter().from_bytes(loaded["bloom"])
self.clear()
self.update(data)
return self
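# A small usage sketch for Table: string keys are hashed on the way in and
# out, and the Bloom filter only short-circuits definite misses:
#
#     from spacy.lookups import Table
#     table = Table(name="plurals")
#     table["cat"] = "cats"
#     "cat" in table    # True
#     table.get("dog")  # None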
class Lookups:
"""Container for large lookup tables and dictionaries, e.g. lemmatization
data or tokenizer exception lists. Lookups are available via vocab.lookups,
so they can be accessed before the pipeline components are applied (e.g.
in the tokenizer and lemmatizer), as well as within the pipeline components
via doc.vocab.lookups.
"""
def __init__(self) -> None:
"""Initialize the Lookups object.
DOCS: https://spacy.io/api/lookups#init
"""
self._tables: Dict[str, Table] = {}
def __contains__(self, name: str) -> bool:
"""Check if the lookups contain a table of a given name. Delegates to
Lookups.has_table.
name (str): Name of the table.
RETURNS (bool): Whether a table of that name is in the lookups.
"""
return self.has_table(name)
def __len__(self) -> int:
"""RETURNS (int): The number of tables in the lookups."""
return len(self._tables)
@property
def tables(self) -> List[str]:
"""RETURNS (List[str]): Names of all tables in the lookups."""
return list(self._tables.keys())
def add_table(self, name: str, data: dict = SimpleFrozenDict()) -> Table:
"""Add a new table to the lookups. Raises an error if the table exists.
name (str): Unique name of table.
data (dict): Optional data to add to the table.
RETURNS (Table): The newly added table.
DOCS: https://spacy.io/api/lookups#add_table
"""
if name in self.tables:
raise ValueError(Errors.E158.format(name=name))
table = Table(name=name, data=data)
self._tables[name] = table
return table
def set_table(self, name: str, table: Table) -> None:
"""Set a table.
name (str): Name of the table to set.
table (Table): The Table to set.
DOCS: https://spacy.io/api/lookups#set_table
"""
self._tables[name] = table
def get_table(self, name: str, default: Any = UNSET) -> Table:
"""Get a table. Raises an error if the table doesn't exist and no
default value is provided.
name (str): Name of the table.
default (Any): Optional default value to return if table doesn't exist.
RETURNS (Table): The table.
DOCS: https://spacy.io/api/lookups#get_table
"""
if name not in self._tables:
if default == UNSET:
raise KeyError(Errors.E159.format(name=name, tables=self.tables))
return default
return self._tables[name]
def remove_table(self, name: str) -> Table:
"""Remove a table. Raises an error if the table doesn't exist.
name (str): Name of the table to remove.
RETURNS (Table): The removed table.
DOCS: https://spacy.io/api/lookups#remove_table
"""
if name not in self._tables:
raise KeyError(Errors.E159.format(name=name, tables=self.tables))
return self._tables.pop(name)
def has_table(self, name: str) -> bool:
"""Check if the lookups contain a table of a given name.
name (str): Name of the table.
RETURNS (bool): Whether a table of that name exists.
DOCS: https://spacy.io/api/lookups#has_table
"""
return name in self._tables
def to_bytes(self, **kwargs) -> bytes:
"""Serialize the lookups to a bytestring.
RETURNS (bytes): The serialized Lookups.
DOCS: https://spacy.io/api/lookups#to_bytes
"""
return srsly.msgpack_dumps(self._tables)
def from_bytes(self, bytes_data: bytes, **kwargs) -> "Lookups":
"""Load the lookups from a bytestring.
bytes_data (bytes): The data to load.
RETURNS (Lookups): The loaded Lookups.
DOCS: https://spacy.io/api/lookups#from_bytes
"""
self._tables = {}
for key, value in srsly.msgpack_loads(bytes_data).items():
self._tables[key] = Table(key, value)
return self
def to_disk(
self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs
) -> None:
"""Save the lookups to a directory as lookups.bin. Expects a path to a
directory, which will be created if it doesn't exist.
path (str / Path): The directory path.
DOCS: https://spacy.io/api/lookups#to_disk
"""
path = ensure_path(path)
if not path.exists():
path.mkdir()
filepath = path / filename
with filepath.open("wb") as file_:
file_.write(self.to_bytes())
def from_disk(
self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs
) -> "Lookups":
"""Load lookups from a directory containing a lookups.bin. Will skip
loading if the file doesn't exist.
path (str / Path): The directory path.
RETURNS (Lookups): The loaded lookups.
DOCS: https://spacy.io/api/lookups#from_disk
"""
path = ensure_path(path)
filepath = path / filename
if filepath.exists():
with filepath.open("rb") as file_:
data = file_.read()
return self.from_bytes(data)
return self
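# A hedged round-trip sketch for Lookups; "./output" is a hypothetical
# directory:
#
#     from spacy.lookups import Lookups
#     lookups = Lookups()
#     lookups.add_table("my_table", {"foo": "bar"})
#     lookups.to_disk("./output")                # writes ./output/lookups.bin
#     lookups2 = Lookups().from_disk("./output")
#     lookups2.get_table("my_table")["foo"]      # "bar"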
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
cimport numpy as np
from libc.stdint cimport uint64_t
from .structs cimport MorphAnalysisC
from .strings cimport StringStore
from .typedefs cimport attr_t, hash_t
cdef class Morphology:
cdef readonly Pool mem
cdef readonly StringStore strings
cdef PreshMap tags # Keyed by hash, value is pointer to tag
cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *
cdef int insert(self, MorphAnalysisC tag) except -1
cdef int check_feature(const MorphAnalysisC* morph, attr_t feature) nogil
cdef list list_features(const MorphAnalysisC* morph)
cdef np.ndarray get_by_field(const MorphAnalysisC* morph, attr_t field)
cdef int get_n_by_field(attr_t* results, const MorphAnalysisC* morph, attr_t field) nogil
# cython: infer_types
import numpy
import warnings
from .attrs cimport POS
from .parts_of_speech import IDS as POS_IDS
from .errors import Warnings
from . import symbols
cdef class Morphology:
"""Store the possible morphological analyses for a language, and index them
by hash.
To save space on each token, tokens only know the hash of their
morphological analysis, so queries of morphological attributes are delegated
to this class.
"""
FEATURE_SEP = "|"
FIELD_SEP = "="
VALUE_SEP = ","
# not an empty string so we can distinguish unset morph from empty morph
EMPTY_MORPH = symbols.NAMES[symbols._]
def __init__(self, StringStore strings):
self.mem = Pool()
self.strings = strings
self.tags = PreshMap()
def __reduce__(self):
tags = set([self.get(self.strings[s]) for s in self.strings])
tags -= set([""])
return (unpickle_morphology, (self.strings, sorted(tags)), None, None)
def add(self, features):
"""Insert a morphological analysis in the morphology table, if not
already present. The morphological analysis may be provided in the UD
FEATS format as a string or in the tag map dict format.
Returns the hash of the new analysis.
"""
cdef MorphAnalysisC* tag_ptr
if isinstance(features, str):
if features == "":
features = self.EMPTY_MORPH
tag_ptr = <MorphAnalysisC*>self.tags.get(<hash_t>self.strings[features])
if tag_ptr != NULL:
return tag_ptr.key
features = self.feats_to_dict(features)
if not isinstance(features, dict):
warnings.warn(Warnings.W100.format(feature=features))
features = {}
string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()}
# intified ("Field", "Field=Value") pairs
field_feature_pairs = []
for field in sorted(string_features):
values = string_features[field]
for value in values.split(self.VALUE_SEP):
field_feature_pairs.append((
self.strings.add(field),
self.strings.add(field + self.FIELD_SEP + value),
))
cdef MorphAnalysisC tag = self.create_morph_tag(field_feature_pairs)
# the hash key for the tag is either the hash of the normalized UFEATS
# string or the hash of an empty placeholder
norm_feats_string = self.normalize_features(features)
tag.key = self.strings.add(norm_feats_string)
self.insert(tag)
return tag.key
def normalize_features(self, features):
"""Create a normalized FEATS string from a features string or dict.
features (Union[dict, str]): Features as dict or UFEATS string.
RETURNS (str): Features as normalized UFEATS string.
"""
if isinstance(features, str):
features = self.feats_to_dict(features)
if not isinstance(features, dict):
warnings.warn(Warnings.W100.format(feature=features))
features = {}
features = self.normalize_attrs(features)
string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()}
# normalized UFEATS string with sorted fields and values
norm_feats_string = self.FEATURE_SEP.join(sorted([
self.FIELD_SEP.join([field, values])
for field, values in string_features.items()
]))
return norm_feats_string or self.EMPTY_MORPH
def normalize_attrs(self, attrs):
"""Convert attrs dict so that POS is always by ID, other features are
by string. Values separated by VALUE_SEP are sorted.
"""
out = {}
attrs = dict(attrs)
for key, value in attrs.items():
# convert POS value to ID
if key == POS or (isinstance(key, str) and key.upper() == "POS"):
if isinstance(value, str) and value.upper() in POS_IDS:
value = POS_IDS[value.upper()]
elif isinstance(value, int) and value not in POS_IDS.values():
warnings.warn(Warnings.W100.format(feature={key: value}))
continue
out[POS] = value
# accept any string or ID fields and values and convert to strings
elif isinstance(key, (int, str)) and isinstance(value, (int, str)):
key = self.strings.as_string(key)
value = self.strings.as_string(value)
# sort values
if self.VALUE_SEP in value:
value = self.VALUE_SEP.join(sorted(value.split(self.VALUE_SEP)))
out[key] = value
else:
warnings.warn(Warnings.W100.format(feature={key: value}))
return out
cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *:
"""Creates a MorphAnalysisC from a list of intified
("Field", "Field=Value") tuples where fields with multiple values have
been split into individual tuples, e.g.:
[("Field1", "Field1=Value1"), ("Field1", "Field1=Value2"),
("Field2", "Field2=Value3")]
"""
cdef MorphAnalysisC tag
tag.length = len(field_feature_pairs)
if tag.length > 0:
tag.fields = <attr_t*>self.mem.alloc(tag.length, sizeof(attr_t))
tag.features = <attr_t*>self.mem.alloc(tag.length, sizeof(attr_t))
for i, (field, feature) in enumerate(field_feature_pairs):
tag.fields[i] = field
tag.features[i] = feature
return tag
cdef int insert(self, MorphAnalysisC tag) except -1:
cdef hash_t key = tag.key
if self.tags.get(key) == NULL:
tag_ptr = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
tag_ptr[0] = tag
self.tags.set(key, <void*>tag_ptr)
def get(self, hash_t morph):
tag = <MorphAnalysisC*>self.tags.get(morph)
if tag == NULL:
return ""
else:
return self.strings[tag.key]
@staticmethod
def feats_to_dict(feats):
if not feats or feats == Morphology.EMPTY_MORPH:
return {}
return {field: Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP))) for field, values in
[feat.split(Morphology.FIELD_SEP) for feat in feats.split(Morphology.FEATURE_SEP)]}
@staticmethod
def dict_to_feats(feats_dict):
if len(feats_dict) == 0:
return ""
return Morphology.FEATURE_SEP.join(sorted([Morphology.FIELD_SEP.join([field, Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP)))]) for field, values in feats_dict.items()]))
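# A short example of the FEATS helpers above:
#
#     from spacy.morphology import Morphology
#     Morphology.feats_to_dict("Case=Nom|Number=Sing")
#     # {'Case': 'Nom', 'Number': 'Sing'}
#     Morphology.dict_to_feats({"Number": "Sing", "Case": "Nom"})
#     # 'Case=Nom|Number=Sing'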
cdef int check_feature(const MorphAnalysisC* morph, attr_t feature) nogil:
cdef int i
for i in range(morph.length):
if morph.features[i] == feature:
return True
return False
cdef list list_features(const MorphAnalysisC* morph):
cdef int i
features = []
for i in range(morph.length):
features.append(morph.features[i])
return features
cdef np.ndarray get_by_field(const MorphAnalysisC* morph, attr_t field):
cdef np.ndarray results = numpy.zeros((morph.length,), dtype="uint64")
n = get_n_by_field(<uint64_t*>results.data, morph, field)
return results[:n]
cdef int get_n_by_field(attr_t* results, const MorphAnalysisC* morph, attr_t field) nogil:
cdef int n_results = 0
cdef int i
for i in range(morph.length):
if morph.fields[i] == field:
results[n_results] = morph.features[i]
n_results += 1
return n_results
def unpickle_morphology(strings, tags):
cdef Morphology morphology = Morphology(strings)
for tag in tags:
morphology.add(tag)
return morphology
from . cimport symbols
cpdef enum univ_pos_t:
NO_TAG = 0
ADJ = symbols.ADJ
ADP
ADV
AUX
CONJ
CCONJ # U20
DET
INTJ
NOUN
NUM
PART
PRON
PROPN
PUNCT
SCONJ
SYM
VERB
X
EOL
SPACE
IDS = {
"": NO_TAG,
"ADJ": ADJ,
"ADP": ADP,
"ADV": ADV,
"AUX": AUX,
"CONJ": CONJ, # U20
"CCONJ": CCONJ,
"DET": DET,
"INTJ": INTJ,
"NOUN": NOUN,
"NUM": NUM,
"PART": PART,
"PRON": PRON,
"PROPN": PROPN,
"PUNCT": PUNCT,
"SCONJ": SCONJ,
"SYM": SYM,
"VERB": VERB,
"X": X,
"EOL": EOL,
"SPACE": SPACE
}
NAMES = {value: key for key, value in IDS.items()}
from typing import List, Set, Dict, Iterable, ItemsView, Union, TYPE_CHECKING
from wasabi import msg
from .tokens import Doc, Token, Span
from .errors import Errors
from .util import dot_to_dict
if TYPE_CHECKING:
# This lets us add type hints for mypy etc. without causing circular imports
from .language import Language # noqa: F401
DEFAULT_KEYS = ["requires", "assigns", "scores", "retokenizes"]
def validate_attrs(values: Iterable[str]) -> Iterable[str]:
"""Validate component attributes provided to "assigns", "requires" etc.
Raises error for invalid attributes and formatting. Doesn't check if
custom extension attributes are registered, since this is something the
user might want to do themselves later in the component.
values (Iterable[str]): The string attributes to check, e.g. `["token.pos"]`.
RETURNS (Iterable[str]): The checked attributes.
"""
data = dot_to_dict({value: True for value in values})
objs = {"doc": Doc, "token": Token, "span": Span}
for obj_key, attrs in data.items():
if obj_key == "span":
# Support Span only for custom extension attributes
span_attrs = [attr for attr in values if attr.startswith("span.")]
span_attrs = [attr for attr in span_attrs if not attr.startswith("span._.")]
if span_attrs:
raise ValueError(Errors.E180.format(attrs=", ".join(span_attrs)))
if obj_key not in objs: # first element is not doc/token/span
invalid_attrs = ", ".join(a for a in values if a.startswith(obj_key))
raise ValueError(Errors.E181.format(obj=obj_key, attrs=invalid_attrs))
if not isinstance(attrs, dict): # attr is something like "doc"
raise ValueError(Errors.E182.format(attr=obj_key))
for attr, value in attrs.items():
if attr == "_":
if value is True: # attr is something like "doc._"
raise ValueError(Errors.E182.format(attr="{}._".format(obj_key)))
for ext_attr, ext_value in value.items():
# We don't check whether the attribute actually exists
if ext_value is not True: # attr is something like doc._.x.y
good = f"{obj_key}._.{ext_attr}"
bad = f"{good}.{'.'.join(ext_value)}"
raise ValueError(Errors.E183.format(attr=bad, solution=good))
continue # we can't validate those further
if attr.endswith("_"): # attr is something like "token.pos_"
raise ValueError(Errors.E184.format(attr=attr, solution=attr[:-1]))
if value is not True: # attr is something like doc.x.y
good = f"{obj_key}.{attr}"
bad = f"{good}.{'.'.join(value)}"
raise ValueError(Errors.E183.format(attr=bad, solution=good))
obj = objs[obj_key]
if not hasattr(obj, attr):
raise ValueError(Errors.E185.format(obj=obj_key, attr=attr))
return values
def get_attr_info(nlp: "Language", attr: str) -> Dict[str, List[str]]:
"""Check which components in the pipeline assign or require an attribute.
nlp (Language): The current nlp object.
attr (str): The attribute, e.g. "doc.tensor".
RETURNS (Dict[str, List[str]]): A dict keyed by "assigns" and "requires",
mapped to a list of component names.
"""
result: Dict[str, List[str]] = {"assigns": [], "requires": []}
for pipe_name in nlp.pipe_names:
meta = nlp.get_pipe_meta(pipe_name)
if attr in meta.assigns:
result["assigns"].append(pipe_name)
if attr in meta.requires:
result["requires"].append(pipe_name)
return result
def analyze_pipes(
nlp: "Language", *, keys: List[str] = DEFAULT_KEYS
) -> Dict[str, Dict[str, Union[List[str], Dict]]]:
"""Print a formatted summary for the current nlp object's pipeline. Shows
a table with the pipeline components and why they assign and require, as
well as any problems if available.
nlp (Language): The nlp object.
keys (List[str]): The meta keys to show in the table.
RETURNS (dict): A dict with "summary" and "problems".
"""
result: Dict[str, Dict[str, Union[List[str], Dict]]] = {
"summary": {},
"problems": {},
}
all_attrs: Set[str] = set()
for i, name in enumerate(nlp.pipe_names):
meta = nlp.get_pipe_meta(name)
all_attrs.update(meta.assigns)
all_attrs.update(meta.requires)
result["summary"][name] = {key: getattr(meta, key, None) for key in keys}
prev_pipes = nlp.pipeline[:i]
requires = {annot: False for annot in meta.requires}
if requires:
for prev_name, prev_pipe in prev_pipes:
prev_meta = nlp.get_pipe_meta(prev_name)
for annot in prev_meta.assigns:
requires[annot] = True
result["problems"][name] = [
annot for annot, fulfilled in requires.items() if not fulfilled
]
result["attrs"] = {attr: get_attr_info(nlp, attr) for attr in all_attrs}
return result
def print_pipe_analysis(
analysis: Dict[str, Dict[str, Union[List[str], Dict]]],
*,
keys: List[str] = DEFAULT_KEYS,
) -> None:
"""Print a formatted version of the pipe analysis produced by analyze_pipes.
analysis (Dict[str, Union[List[str], Dict[str, List[str]]]]): The analysis.
keys (List[str]): The meta keys to show in the table.
"""
msg.divider("Pipeline Overview")
header = ["#", "Component", *[key.capitalize() for key in keys]]
summary: ItemsView = analysis["summary"].items()
body = [[i, n, *[v for v in m.values()]] for i, (n, m) in enumerate(summary)]
msg.table(body, header=header, divider=True, multiline=True)
n_problems = sum(len(p) for p in analysis["problems"].values())
if any(p for p in analysis["problems"].values()):
msg.divider(f"Problems ({n_problems})")
for name, problem in analysis["problems"].items():
if problem:
msg.warn(f"'{name}' requirements not met: {', '.join(problem)}")
else:
msg.good("No problems found.")
from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple
from typing import Iterable, TypeVar, TYPE_CHECKING
from enum import Enum
from pydantic import BaseModel, Field, ValidationError, validator, create_model
from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
from pydantic.main import ModelMetaclass
from thinc.api import Optimizer, ConfigValidationError, Model
from thinc.config import Promise
from collections import defaultdict
import inspect
from .attrs import NAMES
from .lookups import Lookups
from .util import is_cython_func
if TYPE_CHECKING:
# This lets us add type hints for mypy etc. without causing circular imports
from .language import Language # noqa: F401
from .training import Example # noqa: F401
from .vocab import Vocab # noqa: F401
# fmt: off
ItemT = TypeVar("ItemT")
Batcher = Union[Callable[[Iterable[ItemT]], Iterable[List[ItemT]]], Promise]
Reader = Union[Callable[["Language", str], Iterable["Example"]], Promise]
Logger = Union[Callable[["Language"], Tuple[Callable[[Dict[str, Any]], None], Callable]], Promise]
# fmt: on
def validate(schema: Type[BaseModel], obj: Dict[str, Any]) -> List[str]:
"""Validate data against a given pydantic schema.
obj (Dict[str, Any]): JSON-serializable data to validate.
schema (pydantic.BaseModel): The schema to validate against.
RETURNS (List[str]): A list of error messages, if available.
"""
try:
schema(**obj)
return []
except ValidationError as e:
errors = e.errors()
data = defaultdict(list)
for error in errors:
err_loc = " -> ".join([str(p) for p in error.get("loc", [])])
data[err_loc].append(error.get("msg"))
return [f"[{loc}] {', '.join(msg)}" for loc, msg in data.items()] # type: ignore[arg-type]
# Initialization
class ArgSchemaConfig:
extra = "forbid"
arbitrary_types_allowed = True
class ArgSchemaConfigExtra:
extra = "forbid"
arbitrary_types_allowed = True
def get_arg_model(
func: Callable,
*,
exclude: Iterable[str] = tuple(),
name: str = "ArgModel",
strict: bool = True,
) -> ModelMetaclass:
"""Generate a pydantic model for function arguments.
func (Callable): The function to generate the schema for.
exclude (Iterable[str]): Parameter names to ignore.
name (str): Name of created model class.
strict (bool): Don't allow extra arguments if no variable keyword arguments
are allowed on the function.
RETURNS (ModelMetaclass): A pydantic model.
"""
sig_args = {}
try:
sig = inspect.signature(func)
except ValueError:
# Typically happens if the method is part of a Cython module without
# binding=True. Here we just use an empty model that allows everything.
return create_model(name, __config__=ArgSchemaConfigExtra) # type: ignore[arg-type, return-value]
has_variable = False
for param in sig.parameters.values():
if param.name in exclude:
continue
if param.kind == param.VAR_KEYWORD:
# The function allows variable keyword arguments, so we shouldn't
# include **kwargs etc. in the schema; instead we switch to non-strict
# mode and pass through all other values
has_variable = True
continue
# If no annotation is specified assume it's anything
annotation = param.annotation if param.annotation != param.empty else Any
# If no default value is specified assume that it's required. Cython
# functions/methods will have param.empty for default value None so we
# need to treat them differently
default_empty = None if is_cython_func(func) else ...
default = param.default if param.default != param.empty else default_empty
sig_args[param.name] = (annotation, default)
is_strict = strict and not has_variable
sig_args["__config__"] = ArgSchemaConfig if is_strict else ArgSchemaConfigExtra # type: ignore[assignment]
return create_model(name, **sig_args) # type: ignore[arg-type, return-value]
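# Editor's note: illustrative sketch, not part of the original module. It shows
# how get_arg_model() turns a plain signature into a pydantic model; _demo_init
# and its arguments are hypothetical.
def _demo_init(x: int, y: str = "hello"):
    ...

_DemoArgs = get_arg_model(_demo_init, name="DemoArgModel")
assert _DemoArgs(x=1).dict() == {"x": 1, "y": "hello"}
# In strict mode (no **kwargs on the function), unexpected keywords or values of
# the wrong type raise pydantic.ValidationError.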
def validate_init_settings(
func: Callable,
settings: Dict[str, Any],
*,
section: Optional[str] = None,
name: str = "",
exclude: Iterable[str] = ("get_examples", "nlp"),
) -> Dict[str, Any]:
"""Validate initialization settings against the expected arguments in
the method signature. Will parse values if possible (e.g. int to string)
and return the updated settings dict. Will raise a ConfigValidationError
if types don't match or required values are missing.
func (Callable): The initialize method of a given component etc.
settings (Dict[str, Any]): The settings from the respective [initialize] block.
section (str): Initialize section, for error message.
name (str): Name of the block in the section.
exclude (Iterable[str]): Parameter names to exclude from schema.
RETURNS (Dict[str, Any]): The validated settings.
"""
schema = get_arg_model(func, exclude=exclude, name="InitArgModel")
try:
return schema(**settings).dict()
except ValidationError as e:
block = "initialize" if not section else f"initialize.{section}"
title = f"Error validating initialization settings in [{block}]"
raise ConfigValidationError(
title=title, errors=e.errors(), config=settings, parent=name
) from None
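# Editor's note: illustrative sketch, not part of the original module.
# _demo_initialize stands in for a component's initialize method; "get_examples"
# and "nlp" are excluded from the generated schema by default.
def _demo_initialize(get_examples, *, nlp=None, labels: List[str] = []):
    ...

assert validate_init_settings(_demo_initialize, {"labels": ["A", "B"]}) == {
    "labels": ["A", "B"]
}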
# Matcher token patterns
def validate_token_pattern(obj: list) -> List[str]:
# Try to convert non-string keys (e.g. {ORTH: "foo"} -> {"ORTH": "foo"})
get_key = lambda k: NAMES[k] if isinstance(k, int) and k < len(NAMES) else k
if isinstance(obj, list):
converted = []
for pattern in obj:
if isinstance(pattern, dict):
pattern = {get_key(k): v for k, v in pattern.items()}
converted.append(pattern)
obj = converted
return validate(TokenPatternSchema, {"pattern": obj})
class TokenPatternString(BaseModel):
REGEX: Optional[StrictStr] = Field(None, alias="regex")
IN: Optional[List[StrictStr]] = Field(None, alias="in")
NOT_IN: Optional[List[StrictStr]] = Field(None, alias="not_in")
IS_SUBSET: Optional[List[StrictStr]] = Field(None, alias="is_subset")
IS_SUPERSET: Optional[List[StrictStr]] = Field(None, alias="is_superset")
INTERSECTS: Optional[List[StrictStr]] = Field(None, alias="intersects")
class Config:
extra = "forbid"
allow_population_by_field_name = True # allow alias and field name
@validator("*", pre=True, each_item=True, allow_reuse=True)
def raise_for_none(cls, v):
if v is None:
raise ValueError("None / null is not allowed")
return v
class TokenPatternNumber(BaseModel):
REGEX: Optional[StrictStr] = Field(None, alias="regex")
IN: Optional[List[StrictInt]] = Field(None, alias="in")
NOT_IN: Optional[List[StrictInt]] = Field(None, alias="not_in")
IS_SUBSET: Optional[List[StrictInt]] = Field(None, alias="is_subset")
IS_SUPERSET: Optional[List[StrictInt]] = Field(None, alias="is_superset")
INTERSECTS: Optional[List[StrictInt]] = Field(None, alias="intersects")
EQ: Union[StrictInt, StrictFloat] = Field(None, alias="==")
NEQ: Union[StrictInt, StrictFloat] = Field(None, alias="!=")
GEQ: Union[StrictInt, StrictFloat] = Field(None, alias=">=")
LEQ: Union[StrictInt, StrictFloat] = Field(None, alias="<=")
GT: Union[StrictInt, StrictFloat] = Field(None, alias=">")
LT: Union[StrictInt, StrictFloat] = Field(None, alias="<")
class Config:
extra = "forbid"
allow_population_by_field_name = True # allow alias and field name
@validator("*", pre=True, each_item=True, allow_reuse=True)
def raise_for_none(cls, v):
if v is None:
raise ValueError("None / null is not allowed")
return v
class TokenPatternOperator(str, Enum):
plus: StrictStr = StrictStr("+")
start: StrictStr = StrictStr("*")
question: StrictStr = StrictStr("?")
exclamation: StrictStr = StrictStr("!")
StringValue = Union[TokenPatternString, StrictStr]
NumberValue = Union[TokenPatternNumber, StrictInt, StrictFloat]
UnderscoreValue = Union[
TokenPatternString, TokenPatternNumber, str, int, float, list, bool
]
class TokenPattern(BaseModel):
orth: Optional[StringValue] = None
text: Optional[StringValue] = None
lower: Optional[StringValue] = None
pos: Optional[StringValue] = None
tag: Optional[StringValue] = None
morph: Optional[StringValue] = None
dep: Optional[StringValue] = None
lemma: Optional[StringValue] = None
shape: Optional[StringValue] = None
ent_type: Optional[StringValue] = None
norm: Optional[StringValue] = None
length: Optional[NumberValue] = None
spacy: Optional[StrictBool] = None
is_alpha: Optional[StrictBool] = None
is_ascii: Optional[StrictBool] = None
is_digit: Optional[StrictBool] = None
is_lower: Optional[StrictBool] = None
is_upper: Optional[StrictBool] = None
is_title: Optional[StrictBool] = None
is_punct: Optional[StrictBool] = None
is_space: Optional[StrictBool] = None
is_bracket: Optional[StrictBool] = None
is_quote: Optional[StrictBool] = None
is_left_punct: Optional[StrictBool] = None
is_right_punct: Optional[StrictBool] = None
is_currency: Optional[StrictBool] = None
is_stop: Optional[StrictBool] = None
is_sent_start: Optional[StrictBool] = None
sent_start: Optional[StrictBool] = None
like_num: Optional[StrictBool] = None
like_url: Optional[StrictBool] = None
like_email: Optional[StrictBool] = None
op: Optional[TokenPatternOperator] = None
underscore: Optional[Dict[StrictStr, UnderscoreValue]] = Field(None, alias="_")
class Config:
extra = "forbid"
allow_population_by_field_name = True
alias_generator = lambda value: value.upper()
@validator("*", pre=True, allow_reuse=True)
def raise_for_none(cls, v):
if v is None:
raise ValueError("None / null is not allowed")
return v
class TokenPatternSchema(BaseModel):
pattern: List[TokenPattern] = Field(..., min_items=1)
class Config:
extra = "forbid"
# Model meta
class ModelMetaSchema(BaseModel):
# fmt: off
lang: StrictStr = Field(..., title="Two-letter language code, e.g. 'en'")
name: StrictStr = Field(..., title="Model name")
version: StrictStr = Field(..., title="Model version")
spacy_version: StrictStr = Field("", title="Compatible spaCy version identifier")
parent_package: StrictStr = Field("spacy", title="Name of parent spaCy package, e.g. spacy or spacy-nightly")
requirements: List[StrictStr] = Field([], title="Additional Python package dependencies, used for the Python package setup")
pipeline: List[StrictStr] = Field([], title="Names of pipeline components")
description: StrictStr = Field("", title="Model description")
license: StrictStr = Field("", title="Model license")
author: StrictStr = Field("", title="Model author name")
email: StrictStr = Field("", title="Model author email")
url: StrictStr = Field("", title="Model author URL")
sources: Optional[Union[List[StrictStr], List[Dict[str, str]]]] = Field(None, title="Training data sources")
vectors: Dict[str, Any] = Field({}, title="Included word vectors")
labels: Dict[str, List[str]] = Field({}, title="Component labels, keyed by component name")
performance: Dict[str, Any] = Field({}, title="Accuracy and speed numbers")
spacy_git_version: StrictStr = Field("", title="Commit of spaCy version used")
# fmt: on
# Config schema
# We're not setting any defaults here (which is too messy) and are making all
# fields required, so we can raise validation errors for missing values. To
# provide a default, we include a separate .cfg file with all values and
# check that against this schema in the test suite to make sure it's always
# up to date.
class ConfigSchemaTraining(BaseModel):
# fmt: off
dev_corpus: StrictStr = Field(..., title="Path in the config to the dev data")
train_corpus: StrictStr = Field(..., title="Path in the config to the training data")
batcher: Batcher = Field(..., title="Batcher for the training data")
dropout: StrictFloat = Field(..., title="Dropout rate")
patience: StrictInt = Field(..., title="How many steps to continue without improvement in evaluation score")
max_epochs: StrictInt = Field(..., title="Maximum number of epochs to train for")
max_steps: StrictInt = Field(..., title="Maximum number of update steps to train for")
eval_frequency: StrictInt = Field(..., title="How often to evaluate during training (steps)")
seed: Optional[StrictInt] = Field(..., title="Random seed")
gpu_allocator: Optional[StrictStr] = Field(..., title="Memory allocator when running on GPU")
accumulate_gradient: StrictInt = Field(..., title="Number of substeps to divide each batch into for gradient accumulation")
score_weights: Dict[StrictStr, Optional[Union[StrictFloat, StrictInt]]] = Field(..., title="Scores to report and their weights for selecting final model")
optimizer: Optimizer = Field(..., title="The optimizer to use")
logger: Logger = Field(..., title="The logger to track training progress")
frozen_components: List[str] = Field(..., title="Pipeline components that shouldn't be updated during training")
annotating_components: List[str] = Field(..., title="Pipeline components that should set annotations during training")
before_to_disk: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after training, before it's saved to disk")
# fmt: on
class Config:
extra = "forbid"
arbitrary_types_allowed = True
class ConfigSchemaNlp(BaseModel):
# fmt: off
lang: StrictStr = Field(..., title="The base language to use")
pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order")
disabled: List[StrictStr] = Field(..., title="Pipeline components to disable by default")
tokenizer: Callable = Field(..., title="The tokenizer to use")
before_creation: Optional[Callable[[Type["Language"]], Type["Language"]]] = Field(..., title="Optional callback to modify Language class before initialization")
after_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after creation and before the pipeline is constructed")
after_pipeline_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after the pipeline is constructed")
batch_size: Optional[int] = Field(..., title="Default batch size")
# fmt: on
class Config:
extra = "forbid"
arbitrary_types_allowed = True
class ConfigSchemaPretrainEmpty(BaseModel):
class Config:
extra = "forbid"
class ConfigSchemaPretrain(BaseModel):
# fmt: off
max_epochs: StrictInt = Field(..., title="Maximum number of epochs to train for")
dropout: StrictFloat = Field(..., title="Dropout rate")
n_save_every: Optional[StrictInt] = Field(..., title="Save an additional temporary model after every n batches within an epoch")
n_save_epoch: Optional[StrictInt] = Field(..., title="Save the model after every n epochs")
optimizer: Optimizer = Field(..., title="The optimizer to use")
corpus: StrictStr = Field(..., title="Path in the config to the training data")
batcher: Batcher = Field(..., title="Batcher for the training data")
component: str = Field(..., title="Component to find the layer to pretrain")
layer: str = Field(..., title="Layer to pretrain. Whole model if empty.")
objective: Callable[["Vocab", Model], Model] = Field(..., title="A function that creates the pretraining objective.")
# fmt: on
class Config:
extra = "forbid"
arbitrary_types_allowed = True
class ConfigSchemaInit(BaseModel):
# fmt: off
vocab_data: Optional[StrictStr] = Field(..., title="Path to JSON-formatted vocabulary file")
lookups: Optional[Lookups] = Field(..., title="Vocabulary lookups, e.g. lexeme normalization")
vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
tokenizer: Dict[StrictStr, Any] = Field(..., help="Arguments to be passed into Tokenizer.initialize")
components: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., help="Arguments for TrainablePipe.initialize methods of pipeline components, keyed by component")
before_init: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object before initialization")
after_init: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after initialization")
# fmt: on
class Config:
extra = "forbid"
arbitrary_types_allowed = True
class ConfigSchema(BaseModel):
training: ConfigSchemaTraining
nlp: ConfigSchemaNlp
pretraining: Union[ConfigSchemaPretrain, ConfigSchemaPretrainEmpty] = {} # type: ignore[assignment]
components: Dict[str, Dict[str, Any]]
corpora: Dict[str, Reader]
initialize: ConfigSchemaInit
class Config:
extra = "allow"
arbitrary_types_allowed = True
CONFIG_SCHEMAS = {
"nlp": ConfigSchemaNlp,
"training": ConfigSchemaTraining,
"pretraining": ConfigSchemaPretrain,
"initialize": ConfigSchemaInit,
}
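# Editor's note: illustrative sketch, not part of the original module. Because
# every field is required, validating a deliberately incomplete [nlp] block
# (the hypothetical overrides below) reports the missing keys instead of
# returning an empty error list.
_missing = validate(CONFIG_SCHEMAS["nlp"], {"lang": "en", "pipeline": ["tok2vec", "ner"]})
assert any(msg.endswith("field required") for msg in _missing)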
# Project config Schema
class ProjectConfigAssetGitItem(BaseModel):
# fmt: off
repo: StrictStr = Field(..., title="URL of Git repo to download from")
path: StrictStr = Field(..., title="File path or sub-directory to download (used for sparse checkout)")
branch: StrictStr = Field("master", title="Branch to clone from")
# fmt: on
class ProjectConfigAssetURL(BaseModel):
# fmt: off
dest: StrictStr = Field(..., title="Destination of downloaded asset")
url: Optional[StrictStr] = Field(None, title="URL of asset")
checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
description: StrictStr = Field("", title="Description of asset")
# fmt: on
class ProjectConfigAssetGit(BaseModel):
# fmt: off
git: ProjectConfigAssetGitItem = Field(..., title="Git repo information")
checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
description: Optional[StrictStr] = Field(None, title="Description of asset")
# fmt: on
class ProjectConfigCommand(BaseModel):
# fmt: off
name: StrictStr = Field(..., title="Name of command")
help: Optional[StrictStr] = Field(None, title="Command description")
script: List[StrictStr] = Field([], title="List of CLI commands to run, in order")
deps: List[StrictStr] = Field([], title="File dependencies required by this command")
outputs: List[StrictStr] = Field([], title="Outputs produced by this command")
outputs_no_cache: List[StrictStr] = Field([], title="Outputs not tracked by DVC (DVC only)")
no_skip: bool = Field(False, title="Never skip this command, even if nothing changed")
# fmt: on
class Config:
title = "A single named command specified in a project config"
extra = "forbid"
class ProjectConfigSchema(BaseModel):
# fmt: off
vars: Dict[StrictStr, Any] = Field({}, title="Optional variables to substitute in commands")
env: Dict[StrictStr, Any] = Field({}, title="Optional variable names to substitute in commands, mapped to environment variable names")
assets: List[Union[ProjectConfigAssetURL, ProjectConfigAssetGit]] = Field([], title="Data assets")
workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order")
commands: List[ProjectConfigCommand] = Field([], title="Project command shortcuts")
title: Optional[str] = Field(None, title="Project title")
spacy_version: Optional[StrictStr] = Field(None, title="spaCy version range that the project is compatible with")
# fmt: on
class Config:
title = "Schema for project configuration file"
# Recommendations for init config workflows
class RecommendationTrfItem(BaseModel):
name: str
size_factor: int
class RecommendationTrf(BaseModel):
efficiency: RecommendationTrfItem
accuracy: RecommendationTrfItem
class RecommendationSchema(BaseModel):
word_vectors: Optional[str] = None
transformer: Optional[RecommendationTrf] = None
has_letters: bool = True
from typing import Optional, Iterable, Dict, Set, List, Any, Callable, Tuple
from typing import TYPE_CHECKING
import numpy as np
from collections import defaultdict
from .training import Example
from .tokens import Token, Doc, Span
from .errors import Errors
from .util import get_lang_class, SimpleFrozenList
from .morphology import Morphology
if TYPE_CHECKING:
# This lets us add type hints for mypy etc. without causing circular imports
from .language import Language # noqa: F401
DEFAULT_PIPELINE = ("senter", "tagger", "morphologizer", "parser", "ner", "textcat")
MISSING_VALUES = frozenset([None, 0, ""])
class PRFScore:
"""A precision / recall / F score."""
def __init__(
self,
*,
tp: int = 0,
fp: int = 0,
fn: int = 0,
) -> None:
self.tp = tp
self.fp = fp
self.fn = fn
def __len__(self) -> int:
return self.tp + self.fp + self.fn
def __iadd__(self, other):
self.tp += other.tp
self.fp += other.fp
self.fn += other.fn
return self
def __add__(self, other):
return PRFScore(
tp=self.tp + other.tp, fp=self.fp + other.fp, fn=self.fn + other.fn
)
def score_set(self, cand: set, gold: set) -> None:
self.tp += len(cand.intersection(gold))
self.fp += len(cand - gold)
self.fn += len(gold - cand)
@property
def precision(self) -> float:
return self.tp / (self.tp + self.fp + 1e-100)
@property
def recall(self) -> float:
return self.tp / (self.tp + self.fn + 1e-100)
@property
def fscore(self) -> float:
p = self.precision
r = self.recall
return 2 * ((p * r) / (p + r + 1e-100))
def to_dict(self) -> Dict[str, float]:
return {"p": self.precision, "r": self.recall, "f": self.fscore}
class ROCAUCScore:
"""An AUC ROC score. This is only defined for binary classification.
Use the method is_binary before calculating the score, otherwise it
may throw an error."""
def __init__(self) -> None:
self.golds: List[Any] = []
self.cands: List[Any] = []
self.saved_score = 0.0
self.saved_score_at_len = 0
def score_set(self, cand, gold) -> None:
self.cands.append(cand)
self.golds.append(gold)
def is_binary(self):
return len(np.unique(self.golds)) == 2
@property
def score(self):
if not self.is_binary():
raise ValueError(Errors.E165.format(label=set(self.golds)))
if len(self.golds) == self.saved_score_at_len:
return self.saved_score
self.saved_score = _roc_auc_score(self.golds, self.cands)
self.saved_score_at_len = len(self.golds)
return self.saved_score
class Scorer:
"""Compute evaluation scores."""
def __init__(
self,
nlp: Optional["Language"] = None,
default_lang: str = "xx",
default_pipeline: Iterable[str] = DEFAULT_PIPELINE,
**cfg,
) -> None:
"""Initialize the Scorer.
DOCS: https://spacy.io/api/scorer#init
"""
self.cfg = cfg
if nlp:
self.nlp = nlp
else:
nlp = get_lang_class(default_lang)()
for pipe in default_pipeline:
nlp.add_pipe(pipe)
self.nlp = nlp
def score(self, examples: Iterable[Example]) -> Dict[str, Any]:
"""Evaluate a list of Examples.
examples (Iterable[Example]): The predicted annotations + correct annotations.
RETURNS (Dict): A dictionary of scores.
DOCS: https://spacy.io/api/scorer#score
"""
scores = {}
if hasattr(self.nlp.tokenizer, "score"):
scores.update(self.nlp.tokenizer.score(examples, **self.cfg)) # type: ignore
for name, component in self.nlp.pipeline:
if hasattr(component, "score"):
scores.update(component.score(examples, **self.cfg))
return scores
@staticmethod
def score_tokenization(examples: Iterable[Example], **cfg) -> Dict[str, Any]:
"""Returns accuracy and PRF scores for tokenization.
* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for token character spans
examples (Iterable[Example]): Examples to score
RETURNS (Dict[str, Any]): A dictionary containing the scores
token_acc/p/r/f.
DOCS: https://spacy.io/api/scorer#score_tokenization
"""
acc_score = PRFScore()
prf_score = PRFScore()
for example in examples:
gold_doc = example.reference
pred_doc = example.predicted
if gold_doc.has_unknown_spaces:
continue
align = example.alignment
gold_spans = set()
pred_spans = set()
for token in gold_doc:
if token.orth_.isspace():
continue
gold_spans.add((token.idx, token.idx + len(token)))
for token in pred_doc:
if token.orth_.isspace():
continue
pred_spans.add((token.idx, token.idx + len(token)))
if align.x2y.lengths[token.i] != 1:
acc_score.fp += 1
else:
acc_score.tp += 1
prf_score.score_set(pred_spans, gold_spans)
if len(acc_score) > 0:
return {
"token_acc": acc_score.fscore,
"token_p": prf_score.precision,
"token_r": prf_score.recall,
"token_f": prf_score.fscore,
}
else:
return {
"token_acc": None,
"token_p": None,
"token_r": None,
"token_f": None,
}
@staticmethod
def score_token_attr(
examples: Iterable[Example],
attr: str,
*,
getter: Callable[[Token, str], Any] = getattr,
missing_values: Set[Any] = MISSING_VALUES, # type: ignore[assignment]
**cfg,
) -> Dict[str, Any]:
"""Returns an accuracy score for a token-level attribute.
examples (Iterable[Example]): Examples to score
attr (str): The attribute to score.
getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
getter(token, attr) should return the value of the attribute for an
individual token.
missing_values (Set[Any]): Attribute values to treat as missing annotation
in the reference annotation.
RETURNS (Dict[str, Any]): A dictionary containing the accuracy score
under the key attr_acc.
DOCS: https://spacy.io/api/scorer#score_token_attr
"""
tag_score = PRFScore()
for example in examples:
gold_doc = example.reference
pred_doc = example.predicted
align = example.alignment
gold_tags = set()
missing_indices = set()
for gold_i, token in enumerate(gold_doc):
value = getter(token, attr)
if value not in missing_values:
gold_tags.add((gold_i, getter(token, attr)))
else:
missing_indices.add(gold_i)
pred_tags = set()
for token in pred_doc:
if token.orth_.isspace():
continue
if align.x2y.lengths[token.i] == 1:
gold_i = align.x2y[token.i].dataXd[0, 0]
if gold_i not in missing_indices:
pred_tags.add((gold_i, getter(token, attr)))
tag_score.score_set(pred_tags, gold_tags)
score_key = f"{attr}_acc"
if len(tag_score) == 0:
return {score_key: None}
else:
return {score_key: tag_score.fscore}
@staticmethod
def score_token_attr_per_feat(
examples: Iterable[Example],
attr: str,
*,
getter: Callable[[Token, str], Any] = getattr,
missing_values: Set[Any] = MISSING_VALUES, # type: ignore[assignment]
**cfg,
) -> Dict[str, Any]:
"""Return micro PRF and PRF scores per feat for a token attribute in
UFEATS format.
examples (Iterable[Example]): Examples to score
attr (str): The attribute to score.
getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
getter(token, attr) should return the value of the attribute for an
individual token.
missing_values (Set[Any]): Attribute values to treat as missing
annotation in the reference annotation.
RETURNS (dict): A dictionary containing the micro PRF scores under the
key attr_micro_p/r/f and the per-feat PRF scores under
attr_per_feat.
"""
micro_score = PRFScore()
per_feat = {}
for example in examples:
pred_doc = example.predicted
gold_doc = example.reference
align = example.alignment
gold_per_feat: Dict[str, Set] = {}
missing_indices = set()
for gold_i, token in enumerate(gold_doc):
value = getter(token, attr)
morph = gold_doc.vocab.strings[value]
if value not in missing_values and morph != Morphology.EMPTY_MORPH:
for feat in morph.split(Morphology.FEATURE_SEP):
field, values = feat.split(Morphology.FIELD_SEP)
if field not in per_feat:
per_feat[field] = PRFScore()
if field not in gold_per_feat:
gold_per_feat[field] = set()
gold_per_feat[field].add((gold_i, feat))
else:
missing_indices.add(gold_i)
pred_per_feat: Dict[str, Set] = {}
for token in pred_doc:
if token.orth_.isspace():
continue
if align.x2y.lengths[token.i] == 1:
gold_i = align.x2y[token.i].dataXd[0, 0]
if gold_i not in missing_indices:
value = getter(token, attr)
morph = gold_doc.vocab.strings[value]
if (
value not in missing_values
and morph != Morphology.EMPTY_MORPH
):
for feat in morph.split(Morphology.FEATURE_SEP):
field, values = feat.split(Morphology.FIELD_SEP)
if field not in per_feat:
per_feat[field] = PRFScore()
if field not in pred_per_feat:
pred_per_feat[field] = set()
pred_per_feat[field].add((gold_i, feat))
for field in per_feat:
micro_score.score_set(
pred_per_feat.get(field, set()), gold_per_feat.get(field, set())
)
per_feat[field].score_set(
pred_per_feat.get(field, set()), gold_per_feat.get(field, set())
)
result: Dict[str, Any] = {}
if len(micro_score) > 0:
result[f"{attr}_micro_p"] = micro_score.precision
result[f"{attr}_micro_r"] = micro_score.recall
result[f"{attr}_micro_f"] = micro_score.fscore
result[f"{attr}_per_feat"] = {k: v.to_dict() for k, v in per_feat.items()}
else:
result[f"{attr}_micro_p"] = None
result[f"{attr}_micro_r"] = None
result[f"{attr}_micro_f"] = None
result[f"{attr}_per_feat"] = None
return result
@staticmethod
def score_spans(
examples: Iterable[Example],
attr: str,
*,
getter: Callable[[Doc, str], Iterable[Span]] = getattr,
has_annotation: Optional[Callable[[Doc], bool]] = None,
labeled: bool = True,
allow_overlap: bool = False,
**cfg,
) -> Dict[str, Any]:
"""Returns PRF scores for labeled spans.
examples (Iterable[Example]): Examples to score
attr (str): The attribute to score.
getter (Callable[[Doc, str], Iterable[Span]]): Defaults to getattr. If
provided, getter(doc, attr) should return the spans for the
individual doc.
has_annotation (Optional[Callable[[Doc], bool]]) should return whether a `Doc`
has annotation for this `attr`. Docs without annotation are skipped for
scoring purposes.
labeled (bool): Whether or not to include label information in
the evaluation. If set to 'False', two spans will be considered
equal if their start and end match, irrespective of their label.
allow_overlap (bool): Whether or not to allow overlapping spans.
If set to 'False', the alignment will automatically resolve conflicts.
RETURNS (Dict[str, Any]): A dictionary containing the PRF scores under
the keys attr_p/r/f and the per-type PRF scores under attr_per_type.
DOCS: https://spacy.io/api/scorer#score_spans
"""
score = PRFScore()
score_per_type = dict()
for example in examples:
pred_doc = example.predicted
gold_doc = example.reference
# Option to handle docs without annotation for this attribute
if has_annotation is not None:
if not has_annotation(gold_doc):
continue
# Find all labels in gold and doc
labels = set(
[k.label_ for k in getter(gold_doc, attr)]
+ [k.label_ for k in getter(pred_doc, attr)]
)
# Set up all labels for per type scoring and prepare gold per type
gold_per_type: Dict[str, Set] = {label: set() for label in labels}
for label in labels:
if label not in score_per_type:
score_per_type[label] = PRFScore()
# Collect gold and predicted spans, for all labels and per type
gold_spans = set()
pred_spans = set()
for span in getter(gold_doc, attr):
gold_span: Tuple
if labeled:
gold_span = (span.label_, span.start, span.end - 1)
else:
gold_span = (span.start, span.end - 1)
gold_spans.add(gold_span)
gold_per_type[span.label_].add(gold_span)
pred_per_type: Dict[str, Set] = {label: set() for label in labels}
for span in example.get_aligned_spans_x2y(
getter(pred_doc, attr), allow_overlap
):
pred_span: Tuple
if labeled:
pred_span = (span.label_, span.start, span.end - 1)
else:
pred_span = (span.start, span.end - 1)
pred_spans.add(pred_span)
pred_per_type[span.label_].add(pred_span)
# Scores per label
if labeled:
for k, v in score_per_type.items():
if k in pred_per_type:
v.score_set(pred_per_type[k], gold_per_type[k])
# Score for all labels
score.score_set(pred_spans, gold_spans)
# Assemble final result
final_scores: Dict[str, Any] = {
f"{attr}_p": None,
f"{attr}_r": None,
f"{attr}_f": None,
}
if labeled:
final_scores[f"{attr}_per_type"] = None
if len(score) > 0:
final_scores[f"{attr}_p"] = score.precision
final_scores[f"{attr}_r"] = score.recall
final_scores[f"{attr}_f"] = score.fscore
if labeled:
final_scores[f"{attr}_per_type"] = {
k: v.to_dict() for k, v in score_per_type.items()
}
return final_scores
@staticmethod
def score_cats(
examples: Iterable[Example],
attr: str,
*,
getter: Callable[[Doc, str], Any] = getattr,
labels: Iterable[str] = SimpleFrozenList(),
multi_label: bool = True,
positive_label: Optional[str] = None,
threshold: Optional[float] = None,
**cfg,
) -> Dict[str, Any]:
"""Returns PRF and ROC AUC scores for a doc-level attribute with a
dict with scores for each label like Doc.cats. The reported overall
score depends on the scorer settings.
examples (Iterable[Example]): Examples to score
attr (str): The attribute to score.
getter (Callable[[Doc, str], Any]): Defaults to getattr. If provided,
getter(doc, attr) should return the values for the individual doc.
labels (Iterable[str]): The set of possible labels. Defaults to [].
multi_label (bool): Whether the attribute allows multiple labels.
Defaults to True.
positive_label (str): The positive label for a binary task with
exclusive classes. Defaults to None.
threshold (float): Cutoff to consider a prediction "positive". Defaults
to 0.5 for multi-label, and 0.0 (i.e. whatever's highest scoring)
otherwise.
RETURNS (Dict[str, Any]): A dictionary containing the scores, with
inapplicable scores as None:
for all:
attr_score (one of attr_micro_f / attr_macro_f / attr_macro_auc),
attr_score_desc (text description of the overall score),
attr_micro_p,
attr_micro_r,
attr_micro_f,
attr_macro_p,
attr_macro_r,
attr_macro_f,
attr_macro_auc,
attr_f_per_type,
attr_auc_per_type
DOCS: https://spacy.io/api/scorer#score_cats
"""
if threshold is None:
threshold = 0.5 if multi_label else 0.0
f_per_type = {label: PRFScore() for label in labels}
auc_per_type = {label: ROCAUCScore() for label in labels}
labels = set(labels)
if labels:
for eg in examples:
labels.update(eg.predicted.cats.keys())
labels.update(eg.reference.cats.keys())
for example in examples:
# Throughout this loop, None in gold_cats indicates a missing label.
pred_cats = getter(example.predicted, attr)
gold_cats = getter(example.reference, attr)
for label in labels:
pred_score = pred_cats.get(label, 0.0)
gold_score = gold_cats.get(label, 0.0)
if gold_score is not None:
auc_per_type[label].score_set(pred_score, gold_score)
if multi_label:
for label in labels:
pred_score = pred_cats.get(label, 0.0)
gold_score = gold_cats.get(label, 0.0)
if gold_score is not None:
if pred_score >= threshold and gold_score > 0:
f_per_type[label].tp += 1
elif pred_score >= threshold and gold_score == 0:
f_per_type[label].fp += 1
elif pred_score < threshold and gold_score > 0:
f_per_type[label].fn += 1
elif pred_cats and gold_cats:
# Get the highest-scoring for each.
pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1])
gold_label, gold_score = max(gold_cats.items(), key=lambda it: it[1])
if gold_score is not None:
if pred_label == gold_label and pred_score >= threshold:
f_per_type[pred_label].tp += 1
else:
f_per_type[gold_label].fn += 1
if pred_score >= threshold:
f_per_type[pred_label].fp += 1
elif gold_cats:
gold_label, gold_score = max(gold_cats.items(), key=lambda it: it[1])
if gold_score is not None and gold_score > 0:
f_per_type[gold_label].fn += 1
elif pred_cats:
pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1])
if pred_score >= threshold:
f_per_type[pred_label].fp += 1
micro_prf = PRFScore()
for label_prf in f_per_type.values():
micro_prf.tp += label_prf.tp
micro_prf.fn += label_prf.fn
micro_prf.fp += label_prf.fp
n_cats = len(f_per_type) + 1e-100
macro_p = sum(prf.precision for prf in f_per_type.values()) / n_cats
macro_r = sum(prf.recall for prf in f_per_type.values()) / n_cats
macro_f = sum(prf.fscore for prf in f_per_type.values()) / n_cats
# Limit macro_auc to those labels with gold annotations,
# but still divide by all cats to avoid artificial boosting of datasets with missing labels
macro_auc = (
sum(auc.score if auc.is_binary() else 0.0 for auc in auc_per_type.values())
/ n_cats
)
results: Dict[str, Any] = {
f"{attr}_score": None,
f"{attr}_score_desc": None,
f"{attr}_micro_p": micro_prf.precision,
f"{attr}_micro_r": micro_prf.recall,
f"{attr}_micro_f": micro_prf.fscore,
f"{attr}_macro_p": macro_p,
f"{attr}_macro_r": macro_r,
f"{attr}_macro_f": macro_f,
f"{attr}_macro_auc": macro_auc,
f"{attr}_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()},
f"{attr}_auc_per_type": {
k: v.score if v.is_binary() else None for k, v in auc_per_type.items()
},
}
if len(labels) == 2 and not multi_label and positive_label:
positive_label_f = results[f"{attr}_f_per_type"][positive_label]["f"]
results[f"{attr}_score"] = positive_label_f
results[f"{attr}_score_desc"] = f"F ({positive_label})"
elif not multi_label:
results[f"{attr}_score"] = results[f"{attr}_macro_f"]
results[f"{attr}_score_desc"] = "macro F"
else:
results[f"{attr}_score"] = results[f"{attr}_macro_auc"]
results[f"{attr}_score_desc"] = "macro AUC"
return results
@staticmethod
def score_links(
examples: Iterable[Example], *, negative_labels: Iterable[str], **cfg
) -> Dict[str, Any]:
"""Returns PRF for predicted links on the entity level.
To disentangle the performance of the NEL from the NER,
this method only evaluates NEL links for entities that overlap
between the gold reference and the predictions.
examples (Iterable[Example]): Examples to score
negative_labels (Iterable[str]): The string values that refer to no annotation (e.g. "NIL")
RETURNS (Dict[str, Any]): A dictionary containing the scores.
DOCS: https://spacy.io/api/scorer#score_links
"""
f_per_type = {}
for example in examples:
gold_ent_by_offset = {}
for gold_ent in example.reference.ents:
gold_ent_by_offset[(gold_ent.start_char, gold_ent.end_char)] = gold_ent
for pred_ent in example.predicted.ents:
gold_span = gold_ent_by_offset.get(
(pred_ent.start_char, pred_ent.end_char), None
)
if gold_span is not None:
label = gold_span.label_
if label not in f_per_type:
f_per_type[label] = PRFScore()
gold = gold_span.kb_id_
# only evaluating entities that overlap between gold and pred,
# to disentangle the performance of the NEL from the NER
if gold is not None:
pred = pred_ent.kb_id_
if gold in negative_labels and pred in negative_labels:
# ignore true negatives
pass
elif gold == pred:
f_per_type[label].tp += 1
elif gold in negative_labels:
f_per_type[label].fp += 1
elif pred in negative_labels:
f_per_type[label].fn += 1
else:
# a wrong prediction (e.g. Q42 != Q3) counts as both a FP as well as a FN
f_per_type[label].fp += 1
f_per_type[label].fn += 1
micro_prf = PRFScore()
for label_prf in f_per_type.values():
micro_prf.tp += label_prf.tp
micro_prf.fn += label_prf.fn
micro_prf.fp += label_prf.fp
n_labels = len(f_per_type) + 1e-100
macro_p = sum(prf.precision for prf in f_per_type.values()) / n_labels
macro_r = sum(prf.recall for prf in f_per_type.values()) / n_labels
macro_f = sum(prf.fscore for prf in f_per_type.values()) / n_labels
results = {
f"nel_score": micro_prf.fscore,
f"nel_score_desc": "micro F",
f"nel_micro_p": micro_prf.precision,
f"nel_micro_r": micro_prf.recall,
f"nel_micro_f": micro_prf.fscore,
f"nel_macro_p": macro_p,
f"nel_macro_r": macro_r,
f"nel_macro_f": macro_f,
f"nel_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()},
}
return results
@staticmethod
def score_deps(
examples: Iterable[Example],
attr: str,
*,
getter: Callable[[Token, str], Any] = getattr,
head_attr: str = "head",
head_getter: Callable[[Token, str], Token] = getattr,
ignore_labels: Iterable[str] = SimpleFrozenList(),
missing_values: Set[Any] = MISSING_VALUES, # type: ignore[assignment]
**cfg,
) -> Dict[str, Any]:
"""Returns the UAS, LAS, and LAS per type scores for dependency
parses.
examples (Iterable[Example]): Examples to score
attr (str): The attribute containing the dependency label.
getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
getter(token, attr) should return the value of the attribute for an
individual token.
head_attr (str): The attribute containing the head token. Defaults to
'head'.
head_getter (Callable[[Token, str], Token]): Defaults to getattr. If provided,
head_getter(token, attr) should return the value of the head for an
individual token.
ignore_labels (Tuple): Labels to ignore while scoring (e.g., punct).
missing_values (Set[Any]): Attribute values to treat as missing annotation
in the reference annotation.
RETURNS (Dict[str, Any]): A dictionary containing the scores:
attr_uas, attr_las, and attr_las_per_type.
DOCS: https://spacy.io/api/scorer#score_deps
"""
unlabelled = PRFScore()
labelled = PRFScore()
labelled_per_dep = dict()
missing_indices = set()
for example in examples:
gold_doc = example.reference
pred_doc = example.predicted
align = example.alignment
gold_deps = set()
gold_deps_per_dep: Dict[str, Set] = {}
for gold_i, token in enumerate(gold_doc):
dep = getter(token, attr)
head = head_getter(token, head_attr)
if dep not in missing_values:
if dep not in ignore_labels:
gold_deps.add((gold_i, head.i, dep))
if dep not in labelled_per_dep:
labelled_per_dep[dep] = PRFScore()
if dep not in gold_deps_per_dep:
gold_deps_per_dep[dep] = set()
gold_deps_per_dep[dep].add((gold_i, head.i, dep))
else:
missing_indices.add(gold_i)
pred_deps = set()
pred_deps_per_dep: Dict[str, Set] = {}
for token in pred_doc:
if token.orth_.isspace():
continue
if align.x2y.lengths[token.i] != 1:
gold_i = None # type: ignore
else:
gold_i = align.x2y[token.i].dataXd[0, 0]
if gold_i not in missing_indices:
dep = getter(token, attr)
head = head_getter(token, head_attr)
if dep not in ignore_labels and token.orth_.strip():
if align.x2y.lengths[head.i] == 1:
gold_head = align.x2y[head.i].dataXd[0, 0]
else:
gold_head = None
# None is indistinct, so we can't just add it to the set
# Multiple (None, None) deps are possible
if gold_i is None or gold_head is None:
unlabelled.fp += 1
labelled.fp += 1
else:
pred_deps.add((gold_i, gold_head, dep))
if dep not in labelled_per_dep:
labelled_per_dep[dep] = PRFScore()
if dep not in pred_deps_per_dep:
pred_deps_per_dep[dep] = set()
pred_deps_per_dep[dep].add((gold_i, gold_head, dep))
labelled.score_set(pred_deps, gold_deps)
for dep in labelled_per_dep:
labelled_per_dep[dep].score_set(
pred_deps_per_dep.get(dep, set()), gold_deps_per_dep.get(dep, set())
)
unlabelled.score_set(
set(item[:2] for item in pred_deps), set(item[:2] for item in gold_deps)
)
if len(unlabelled) > 0:
return {
f"{attr}_uas": unlabelled.fscore,
f"{attr}_las": labelled.fscore,
f"{attr}_las_per_type": {
k: v.to_dict() for k, v in labelled_per_dep.items()
},
}
else:
return {
f"{attr}_uas": None,
f"{attr}_las": None,
f"{attr}_las_per_type": None,
}
def get_ner_prf(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
"""Compute micro-PRF and per-entity PRF scores for a sequence of examples."""
score_per_type = defaultdict(PRFScore)
for eg in examples:
if not eg.y.has_annotation("ENT_IOB"):
continue
golds = {(e.label_, e.start, e.end) for e in eg.y.ents}
align_x2y = eg.alignment.x2y
for pred_ent in eg.x.ents:
if pred_ent.label_ not in score_per_type:
score_per_type[pred_ent.label_] = PRFScore()
indices = align_x2y[pred_ent.start : pred_ent.end].dataXd.ravel()
if len(indices):
g_span = eg.y[indices[0] : indices[-1] + 1]
# Check we aren't missing annotation on this span. If so,
# our prediction is neither right nor wrong, we just
# ignore it.
if all(token.ent_iob != 0 for token in g_span):
key = (pred_ent.label_, indices[0], indices[-1] + 1)
if key in golds:
score_per_type[pred_ent.label_].tp += 1
golds.remove(key)
else:
score_per_type[pred_ent.label_].fp += 1
for label, start, end in golds:
score_per_type[label].fn += 1
totals = PRFScore()
for prf in score_per_type.values():
totals += prf
if len(totals) > 0:
return {
"ents_p": totals.precision,
"ents_r": totals.recall,
"ents_f": totals.fscore,
"ents_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
}
else:
return {
"ents_p": None,
"ents_r": None,
"ents_f": None,
"ents_per_type": None,
}
# The following implementation of roc_auc_score() is adapted from
# scikit-learn, which is distributed under the New BSD License.
# Copyright (c) 2007–2019 The scikit-learn developers.
# See licenses/3rd_party_licenses.txt
def _roc_auc_score(y_true, y_score):
"""Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC)
from prediction scores.
Note: this implementation is restricted to the binary classification task
Parameters
----------
y_true : array, shape = [n_samples] or [n_samples, n_classes]
True binary labels or binary label indicators.
The multiclass case expects shape = [n_samples] and labels
with values in ``range(n_classes)``.
y_score : array, shape = [n_samples] or [n_samples, n_classes]
Target scores, can either be probability estimates of the positive
class, confidence values, or non-thresholded measure of decisions
(as returned by "decision_function" on some classifiers). For binary
y_true, y_score is supposed to be the score of the class with greater
label. The multiclass case expects shape = [n_samples, n_classes]
where the scores correspond to probability estimates.
Returns
-------
auc : float
References
----------
.. [1] `Wikipedia entry for the Receiver operating characteristic
<https://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_
.. [2] Fawcett T. An introduction to ROC analysis[J]. Pattern Recognition
Letters, 2006, 27(8):861-874.
.. [3] `Analyzing a portion of the ROC curve. McClish, 1989
<https://www.ncbi.nlm.nih.gov/pubmed/2668680>`_
"""
if len(np.unique(y_true)) != 2:
raise ValueError(Errors.E165.format(label=np.unique(y_true)))
fpr, tpr, _ = _roc_curve(y_true, y_score)
return _auc(fpr, tpr)
def _roc_curve(y_true, y_score):
"""Compute Receiver operating characteristic (ROC)
Note: this implementation is restricted to the binary classification task.
Parameters
----------
y_true : array, shape = [n_samples]
True binary labels. If labels are not either {-1, 1} or {0, 1}, then
pos_label should be explicitly given.
y_score : array, shape = [n_samples]
Target scores, can either be probability estimates of the positive
class, confidence values, or non-thresholded measure of decisions
(as returned by "decision_function" on some classifiers).
Returns
-------
fpr : array, shape = [>2]
Increasing false positive rates such that element i is the false
positive rate of predictions with score >= thresholds[i].
tpr : array, shape = [>2]
Increasing true positive rates such that element i is the true
positive rate of predictions with score >= thresholds[i].
thresholds : array, shape = [n_thresholds]
Decreasing thresholds on the decision function used to compute
fpr and tpr. `thresholds[0]` represents no instances being predicted
and is arbitrarily set to `max(y_score) + 1`.
Notes
-----
Since the thresholds are sorted from low to high values, they
are reversed upon returning them to ensure they correspond to both ``fpr``
and ``tpr``, which are sorted in reversed order during their calculation.
References
----------
.. [1] `Wikipedia entry for the Receiver operating characteristic
<https://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_
.. [2] Fawcett T. An introduction to ROC analysis[J]. Pattern Recognition
Letters, 2006, 27(8):861-874.
"""
fps, tps, thresholds = _binary_clf_curve(y_true, y_score)
# Add an extra threshold position
# to make sure that the curve starts at (0, 0)
tps = np.r_[0, tps]
fps = np.r_[0, fps]
thresholds = np.r_[thresholds[0] + 1, thresholds]
if fps[-1] <= 0:
fpr = np.repeat(np.nan, fps.shape)
else:
fpr = fps / fps[-1]
if tps[-1] <= 0:
tpr = np.repeat(np.nan, tps.shape)
else:
tpr = tps / tps[-1]
return fpr, tpr, thresholds
def _binary_clf_curve(y_true, y_score):
"""Calculate true and false positives per binary classification threshold.
Parameters
----------
y_true : array, shape = [n_samples]
True targets of binary classification
y_score : array, shape = [n_samples]
Estimated probabilities or decision function
Returns
-------
fps : array, shape = [n_thresholds]
A count of false positives, at index i being the number of negative
samples assigned a score >= thresholds[i]. The total number of
negative samples is equal to fps[-1] (thus true negatives are given by
fps[-1] - fps).
tps : array, shape = [n_thresholds <= len(np.unique(y_score))]
An increasing count of true positives, at index i being the number
of positive samples assigned a score >= thresholds[i]. The total
number of positive samples is equal to tps[-1] (thus false negatives
are given by tps[-1] - tps).
thresholds : array, shape = [n_thresholds]
Decreasing score values.
"""
pos_label = 1.0
y_true = np.ravel(y_true)
y_score = np.ravel(y_score)
# make y_true a boolean vector
y_true = y_true == pos_label
# sort scores and corresponding truth values
desc_score_indices = np.argsort(y_score, kind="mergesort")[::-1]
y_score = y_score[desc_score_indices]
y_true = y_true[desc_score_indices]
weight = 1.0
# y_score typically has many tied values. Here we extract
# the indices associated with the distinct values. We also
# concatenate a value for the end of the curve.
distinct_value_indices = np.where(np.diff(y_score))[0]
threshold_idxs = np.r_[distinct_value_indices, y_true.size - 1]
# accumulate the true positives with decreasing threshold
tps = _stable_cumsum(y_true * weight)[threshold_idxs]
fps = 1 + threshold_idxs - tps
return fps, tps, y_score[threshold_idxs]
def _stable_cumsum(arr, axis=None, rtol=1e-05, atol=1e-08):
"""Use high precision for cumsum and check that final value matches sum
Parameters
----------
arr : array-like
To be cumulatively summed as flat
axis : int, optional
Axis along which the cumulative sum is computed.
The default (None) is to compute the cumsum over the flattened array.
rtol : float
Relative tolerance, see ``np.allclose``
atol : float
Absolute tolerance, see ``np.allclose``
"""
out = np.cumsum(arr, axis=axis, dtype=np.float64)
expected = np.sum(arr, axis=axis, dtype=np.float64)
if not np.all(
np.isclose(
out.take(-1, axis=axis), expected, rtol=rtol, atol=atol, equal_nan=True
)
):
raise ValueError(Errors.E163)
return out
def _auc(x, y):
"""Compute Area Under the Curve (AUC) using the trapezoidal rule
This is a general function, given points on a curve. For computing the
area under the ROC-curve, see :func:`roc_auc_score`.
Parameters
----------
x : array, shape = [n]
x coordinates. These must be either monotonic increasing or monotonic
decreasing.
y : array, shape = [n]
y coordinates.
Returns
-------
auc : float
"""
x = np.ravel(x)
y = np.ravel(y)
direction = 1
dx = np.diff(x)
if np.any(dx < 0):
if np.all(dx <= 0):
direction = -1
else:
raise ValueError(Errors.E164.format(x=x))
area = direction * np.trapz(y, x)
if isinstance(area, np.memmap):
# Reductions such as .sum used internally in np.trapz do not return a
# scalar by default for numpy.memmap instances contrary to
# regular numpy.ndarray instances.
area = area.dtype.type(area)
return area
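# Editor's note: illustrative checks, not part of the original module. The first
# is the classic four-point ROC AUC example (the two positives outrank one of the
# two negatives, so 3 of 4 positive/negative pairs are ordered correctly, i.e.
# AUC = 0.75); the second checks the trapezoidal rule directly.
assert abs(_roc_auc_score([0, 0, 1, 1], [0.1, 0.4, 0.35, 0.8]) - 0.75) < 1e-8
assert abs(_auc([0.0, 0.5, 1.0], [0.0, 0.75, 1.0]) - 0.625) < 1e-8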
from libc.stdint cimport int64_t
from libcpp.vector cimport vector
from libcpp.set cimport set
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
from murmurhash.mrmr cimport hash64
from .typedefs cimport attr_t, hash_t
cpdef hash_t hash_string(str string) except 0
cdef hash_t hash_utf8(char* utf8_string, int length) nogil
cdef str decode_Utf8Str(const Utf8Str* string)
ctypedef union Utf8Str:
unsigned char[8] s
unsigned char* p
cdef class StringStore:
cdef Pool mem
cdef vector[hash_t] keys
cdef public PreshMap _map
cdef const Utf8Str* intern_unicode(self, str py_string)
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length)
from typing import Optional, Iterable, Iterator, Union, Any
from pathlib import Path
def get_string_id(key: Union[str, int]) -> int: ...
class StringStore:
def __init__(
self, strings: Optional[Iterable[str]] = ..., freeze: bool = ...
) -> None: ...
def __getitem__(self, string_or_id: Union[bytes, str, int]) -> Union[str, int]: ...
def as_int(self, key: Union[bytes, str, int]) -> int: ...
def as_string(self, key: Union[bytes, str, int]) -> str: ...
def add(self, string: str) -> int: ...
def __len__(self) -> int: ...
def __contains__(self, string: str) -> bool: ...
def __iter__(self) -> Iterator[str]: ...
def __reduce__(self) -> Any: ...
def to_disk(self, path: Union[str, Path]) -> None: ...
def from_disk(self, path: Union[str, Path]) -> StringStore: ...
def to_bytes(self, **kwargs: Any) -> bytes: ...
def from_bytes(self, bytes_data: bytes, **kwargs: Any) -> StringStore: ...
def _reset_and_load(self, strings: Iterable[str]) -> None: ...
# cython: infer_types=True
cimport cython
from libc.string cimport memcpy
from libcpp.set cimport set
from libc.stdint cimport uint32_t
from murmurhash.mrmr cimport hash64, hash32
import srsly
from .typedefs cimport hash_t
from .symbols import IDS as SYMBOLS_BY_STR
from .symbols import NAMES as SYMBOLS_BY_INT
from .errors import Errors
from . import util
def get_string_id(key):
"""Get a string ID, handling the reserved symbols correctly. If the key is
already an ID, return it.
This function optimises for convenience over performance, so shouldn't be
used in tight loops.
"""
if not isinstance(key, str):
return key
elif key in SYMBOLS_BY_STR:
return SYMBOLS_BY_STR[key]
elif not key:
return 0
else:
chars = key.encode("utf8")
return hash_utf8(chars, len(chars))
cpdef hash_t hash_string(str string) except 0:
chars = string.encode("utf8")
return hash_utf8(chars, len(chars))
cdef hash_t hash_utf8(char* utf8_string, int length) nogil:
return hash64(utf8_string, length, 1)
cdef uint32_t hash32_utf8(char* utf8_string, int length) nogil:
return hash32(utf8_string, length, 1)
cdef str decode_Utf8Str(const Utf8Str* string):
cdef int i, length
if string.s[0] < sizeof(string.s) and string.s[0] != 0:
return string.s[1:string.s[0]+1].decode("utf8")
elif string.p[0] < 255:
return string.p[1:string.p[0]+1].decode("utf8")
else:
i = 0
length = 0
while string.p[i] == 255:
i += 1
length += 255
length += string.p[i]
i += 1
return string.p[i:length + i].decode("utf8")
cdef Utf8Str* _allocate(Pool mem, const unsigned char* chars, uint32_t length) except *:
cdef int n_length_bytes
cdef int i
cdef Utf8Str* string = <Utf8Str*>mem.alloc(1, sizeof(Utf8Str))
cdef uint32_t ulength = length
if length < sizeof(string.s):
string.s[0] = <unsigned char>length
memcpy(&string.s[1], chars, length)
return string
elif length < 255:
string.p = <unsigned char*>mem.alloc(length + 1, sizeof(unsigned char))
string.p[0] = length
memcpy(&string.p[1], chars, length)
return string
else:
i = 0
n_length_bytes = (length // 255) + 1
string.p = <unsigned char*>mem.alloc(length + n_length_bytes, sizeof(unsigned char))
for i in range(n_length_bytes-1):
string.p[i] = 255
string.p[n_length_bytes-1] = length % 255
memcpy(&string.p[n_length_bytes], chars, length)
return string
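# Editor's note: pure-Python sketch, not part of the original module. It mirrors
# the heap-allocated length-prefix format used by _allocate()/decode_Utf8Str above
# (the small-string case stored inline in Utf8Str.s is not modelled): lengths
# under 255 get a single length byte, longer strings get a run of 255-valued
# bytes followed by the remainder.
def _encode_length_prefixed(chars: bytes) -> bytes:
    length = len(chars)
    if length < 255:
        return bytes([length]) + chars
    n_full = length // 255
    return bytes([255] * n_full + [length % 255]) + chars

def _decode_length_prefixed(buf: bytes) -> bytes:
    i = 0
    length = 0
    while buf[i] == 255:
        length += 255
        i += 1
    length += buf[i]
    i += 1
    return buf[i:length + i]

assert _decode_length_prefixed(_encode_length_prefixed(b"x" * 1000)) == b"x" * 1000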
cdef class StringStore:
"""Look up strings by 64-bit hashes.
DOCS: https://spacy.io/api/stringstore
"""
def __init__(self, strings=None, freeze=False):
"""Create the StringStore.
strings (iterable): A sequence of unicode strings to add to the store.
"""
self.mem = Pool()
self._map = PreshMap()
if strings is not None:
for string in strings:
self.add(string)
def __getitem__(self, object string_or_id):
"""Retrieve a string from a given hash, or vice versa.
string_or_id (bytes, str or uint64): The value to encode.
Returns (str / uint64): The value to be retrieved.
"""
if isinstance(string_or_id, str) and len(string_or_id) == 0:
return 0
elif string_or_id == 0:
return ""
elif string_or_id in SYMBOLS_BY_STR:
return SYMBOLS_BY_STR[string_or_id]
cdef hash_t key
if isinstance(string_or_id, str):
key = hash_string(string_or_id)
return key
elif isinstance(string_or_id, bytes):
key = hash_utf8(string_or_id, len(string_or_id))
return key
elif string_or_id < len(SYMBOLS_BY_INT):
return SYMBOLS_BY_INT[string_or_id]
else:
key = string_or_id
utf8str = <Utf8Str*>self._map.get(key)
if utf8str is NULL:
raise KeyError(Errors.E018.format(hash_value=string_or_id))
else:
return decode_Utf8Str(utf8str)
def as_int(self, key):
"""If key is an int, return it; otherwise, get the int value."""
if not isinstance(key, str):
return key
else:
return self[key]
def as_string(self, key):
"""If key is a string, return it; otherwise, get the string value."""
if isinstance(key, str):
return key
else:
return self[key]
def add(self, string):
"""Add a string to the StringStore.
string (str): The string to add.
RETURNS (uint64): The string's hash value.
"""
if isinstance(string, str):
if string in SYMBOLS_BY_STR:
return SYMBOLS_BY_STR[string]
key = hash_string(string)
self.intern_unicode(string)
elif isinstance(string, bytes):
if string in SYMBOLS_BY_STR:
return SYMBOLS_BY_STR[string]
key = hash_utf8(string, len(string))
self._intern_utf8(string, len(string))
else:
raise TypeError(Errors.E017.format(value_type=type(string)))
return key
def __len__(self):
"""The number of strings in the store.
RETURNS (int): The number of strings in the store.
"""
return self.keys.size()
def __contains__(self, string not None):
"""Check whether a string is in the store.
string (str): The string to check.
RETURNS (bool): Whether the store contains the string.
"""
cdef hash_t key
if isinstance(string, int) or isinstance(string, long):
if string == 0:
return True
key = string
elif len(string) == 0:
return True
elif string in SYMBOLS_BY_STR:
return True
elif isinstance(string, str):
key = hash_string(string)
else:
string = string.encode("utf8")
key = hash_utf8(string, len(string))
if key < len(SYMBOLS_BY_INT):
return True
else:
return self._map.get(key) is not NULL
def __iter__(self):
"""Iterate over the strings in the store, in order.
YIELDS (str): A string in the store.
"""
cdef int i
cdef hash_t key
for i in range(self.keys.size()):
key = self.keys[i]
utf8str = <Utf8Str*>self._map.get(key)
yield decode_Utf8Str(utf8str)
# TODO: Iterate OOV here?
def __reduce__(self):
strings = list(self)
return (StringStore, (strings,), None, None, None)
def to_disk(self, path):
"""Save the current state to a directory.
path (str / Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or Path-like objects.
"""
path = util.ensure_path(path)
strings = sorted(self)
srsly.write_json(path, strings)
def from_disk(self, path):
"""Loads state from a directory. Modifies the object in place and
returns it.
path (str / Path): A path to a directory. Paths may be either
strings or `Path`-like objects.
RETURNS (StringStore): The modified `StringStore` object.
"""
path = util.ensure_path(path)
strings = srsly.read_json(path)
prev = list(self)
self._reset_and_load(strings)
for word in prev:
self.add(word)
return self
def to_bytes(self, **kwargs):
"""Serialize the current state to a binary string.
RETURNS (bytes): The serialized form of the `StringStore` object.
"""
return srsly.json_dumps(sorted(self))
def from_bytes(self, bytes_data, **kwargs):
"""Load state from a binary string.
bytes_data (bytes): The data to load from.
RETURNS (StringStore): The `StringStore` object.
"""
strings = srsly.json_loads(bytes_data)
prev = list(self)
self._reset_and_load(strings)
for word in prev:
self.add(word)
return self
def _reset_and_load(self, strings):
self.mem = Pool()
self._map = PreshMap()
self.keys.clear()
for string in strings:
self.add(string)
cdef const Utf8Str* intern_unicode(self, str py_string):
# 0 means missing, but we don't bother offsetting the index.
cdef bytes byte_string = py_string.encode("utf8")
return self._intern_utf8(byte_string, len(byte_string))
@cython.final
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length):
# TODO: This function's API/behaviour is an unholy mess...
# 0 means missing, but we don't bother offsetting the index.
cdef hash_t key = hash_utf8(utf8_string, length)
cdef Utf8Str* value = <Utf8Str*>self._map.get(key)
if value is not NULL:
return value
value = _allocate(self.mem, <unsigned char*>utf8_string, length)
self._map.set(key, value)
self.keys.push_back(key)
return value
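# Editor's note: illustrative usage sketch, not part of the original module; the
# strings below are arbitrary examples of the hash <-> string round trip.
_store = StringStore(["apple"])
_key = _store.add("coffee")        # 64-bit hash of the UTF-8 string
assert _store[_key] == "coffee"    # hash -> string
assert _store["coffee"] == _key    # string -> hash
assert "apple" in _store and len(_store) == 2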
from libc.stdint cimport uint8_t, uint32_t, int32_t, uint64_t
from libcpp.vector cimport vector
from libcpp.unordered_set cimport unordered_set
from libcpp.unordered_map cimport unordered_map
from libc.stdint cimport int32_t, int64_t
from .typedefs cimport flags_t, attr_t, hash_t
from .parts_of_speech cimport univ_pos_t
cdef struct LexemeC:
flags_t flags
attr_t lang
attr_t id
attr_t length
attr_t orth
attr_t lower
attr_t norm
attr_t shape
attr_t prefix
attr_t suffix
cdef struct SpanC:
hash_t id
int start
int end
int start_char
int end_char
attr_t label
attr_t kb_id
cdef struct TokenC:
const LexemeC* lex
uint64_t morph
univ_pos_t pos
bint spacy
attr_t tag
int idx
attr_t lemma
attr_t norm
int head
attr_t dep
uint32_t l_kids
uint32_t r_kids
uint32_t l_edge
uint32_t r_edge
int sent_start
int ent_iob
attr_t ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
attr_t ent_kb_id
hash_t ent_id
cdef struct MorphAnalysisC:
hash_t key
int length
attr_t* fields
attr_t* features
# Internal struct, for storage and disambiguation of entities.
cdef struct KBEntryC:
# The hash of this entry's unique ID/name in the kB
hash_t entity_hash
# Allows retrieval of the entity vector, as an index into a vectors table of the KB.
# Can be expanded later to refer to multiple rows (compositional model to reduce storage footprint).
int32_t vector_index
# Allows retrieval of a struct of non-vector features.
# This is currently not implemented and set to -1 for the common case where there are no features.
int32_t feats_row
# log probability of entity, based on corpus frequency
float freq
# Each alias struct stores a list of Entry pointers with their prior probabilities
# for this specific mention/alias.
cdef struct AliasC:
# All entry candidates for this alias
vector[int64_t] entry_indices
# Prior probability P(entity|alias) - should sum up to (at most) 1.
vector[float] probs
cdef struct EdgeC:
hash_t label
int32_t head
int32_t tail
cdef struct GraphC:
vector[vector[int32_t]] nodes
vector[EdgeC] edges
vector[float] weights
vector[int] n_heads
vector[int] n_tails
vector[int] first_head
vector[int] first_tail
unordered_set[int]* roots
unordered_map[hash_t, int]* node_map
unordered_map[hash_t, int]* edge_map
cdef enum symbol_t:
NIL
IS_ALPHA
IS_ASCII
IS_DIGIT
IS_LOWER
IS_PUNCT
IS_SPACE
IS_TITLE
IS_UPPER
LIKE_URL
LIKE_NUM
LIKE_EMAIL
IS_STOP
IS_OOV_DEPRECATED
IS_BRACKET
IS_QUOTE
IS_LEFT_PUNCT
IS_RIGHT_PUNCT
IS_CURRENCY
FLAG19 = 19
FLAG20
FLAG21
FLAG22
FLAG23
FLAG24
FLAG25
FLAG26
FLAG27
FLAG28
FLAG29
FLAG30
FLAG31
FLAG32
FLAG33
FLAG34
FLAG35
FLAG36
FLAG37
FLAG38
FLAG39
FLAG40
FLAG41
FLAG42
FLAG43
FLAG44
FLAG45
FLAG46
FLAG47
FLAG48
FLAG49
FLAG50
FLAG51
FLAG52
FLAG53
FLAG54
FLAG55
FLAG56
FLAG57
FLAG58
FLAG59
FLAG60
FLAG61
FLAG62
FLAG63
ID
ORTH
LOWER
NORM
SHAPE
PREFIX
SUFFIX
LENGTH
CLUSTER
LEMMA
POS
TAG
DEP
ENT_IOB
ENT_TYPE
HEAD
SENT_START
SPACY
PROB
LANG
ADJ
ADP
ADV
AUX
CONJ
CCONJ # U20
DET
INTJ
NOUN
NUM
PART
PRON
PROPN
PUNCT
SCONJ
SYM
VERB
X
EOL
SPACE
DEPRECATED001
DEPRECATED002
DEPRECATED003
DEPRECATED004
DEPRECATED005
DEPRECATED006
DEPRECATED007
DEPRECATED008
DEPRECATED009
DEPRECATED010
DEPRECATED011
DEPRECATED012
DEPRECATED013
DEPRECATED014
DEPRECATED015
DEPRECATED016
DEPRECATED017
DEPRECATED018
DEPRECATED019
DEPRECATED020
DEPRECATED021
DEPRECATED022
DEPRECATED023
DEPRECATED024
DEPRECATED025
DEPRECATED026
DEPRECATED027
DEPRECATED028
DEPRECATED029
DEPRECATED030
DEPRECATED031
DEPRECATED032
DEPRECATED033
DEPRECATED034
DEPRECATED035
DEPRECATED036
DEPRECATED037
DEPRECATED038
DEPRECATED039
DEPRECATED040
DEPRECATED041
DEPRECATED042
DEPRECATED043
DEPRECATED044
DEPRECATED045
DEPRECATED046
DEPRECATED047
DEPRECATED048
DEPRECATED049
DEPRECATED050
DEPRECATED051
DEPRECATED052
DEPRECATED053
DEPRECATED054
DEPRECATED055
DEPRECATED056
DEPRECATED057
DEPRECATED058
DEPRECATED059
DEPRECATED060
DEPRECATED061
DEPRECATED062
DEPRECATED063
DEPRECATED064
DEPRECATED065
DEPRECATED066
DEPRECATED067
DEPRECATED068
DEPRECATED069
DEPRECATED070
DEPRECATED071
DEPRECATED072
DEPRECATED073
DEPRECATED074
DEPRECATED075
DEPRECATED076
DEPRECATED077
DEPRECATED078
DEPRECATED079
DEPRECATED080
DEPRECATED081
DEPRECATED082
DEPRECATED083
DEPRECATED084
DEPRECATED085
DEPRECATED086
DEPRECATED087
DEPRECATED088
DEPRECATED089
DEPRECATED090
DEPRECATED091
DEPRECATED092
DEPRECATED093
DEPRECATED094
DEPRECATED095
DEPRECATED096
DEPRECATED097
DEPRECATED098
DEPRECATED099
DEPRECATED100
DEPRECATED101
DEPRECATED102
DEPRECATED103
DEPRECATED104
DEPRECATED105
DEPRECATED106
DEPRECATED107
DEPRECATED108
DEPRECATED109
DEPRECATED110
DEPRECATED111
DEPRECATED112
DEPRECATED113
DEPRECATED114
DEPRECATED115
DEPRECATED116
DEPRECATED117
DEPRECATED118
DEPRECATED119
DEPRECATED120
DEPRECATED121
DEPRECATED122
DEPRECATED123
DEPRECATED124
DEPRECATED125
DEPRECATED126
DEPRECATED127
DEPRECATED128
DEPRECATED129
DEPRECATED130
DEPRECATED131
DEPRECATED132
DEPRECATED133
DEPRECATED134
DEPRECATED135
DEPRECATED136
DEPRECATED137
DEPRECATED138
DEPRECATED139
DEPRECATED140
DEPRECATED141
DEPRECATED142
DEPRECATED143
DEPRECATED144
DEPRECATED145
DEPRECATED146
DEPRECATED147
DEPRECATED148
DEPRECATED149
DEPRECATED150
DEPRECATED151
DEPRECATED152
DEPRECATED153
DEPRECATED154
DEPRECATED155
DEPRECATED156
DEPRECATED157
DEPRECATED158
DEPRECATED159
DEPRECATED160
DEPRECATED161
DEPRECATED162
DEPRECATED163
DEPRECATED164
DEPRECATED165
DEPRECATED166
DEPRECATED167
DEPRECATED168
DEPRECATED169
DEPRECATED170
DEPRECATED171
DEPRECATED172
DEPRECATED173
DEPRECATED174
DEPRECATED175
DEPRECATED176
DEPRECATED177
DEPRECATED178
DEPRECATED179
DEPRECATED180
DEPRECATED181
DEPRECATED182
DEPRECATED183
DEPRECATED184
DEPRECATED185
DEPRECATED186
DEPRECATED187
DEPRECATED188
DEPRECATED189
DEPRECATED190
DEPRECATED191
DEPRECATED192
DEPRECATED193
DEPRECATED194
DEPRECATED195
DEPRECATED196
DEPRECATED197
DEPRECATED198
DEPRECATED199
DEPRECATED200
DEPRECATED201
DEPRECATED202
DEPRECATED203
DEPRECATED204
DEPRECATED205
DEPRECATED206
DEPRECATED207
DEPRECATED208
DEPRECATED209
DEPRECATED210
DEPRECATED211
DEPRECATED212
DEPRECATED213
DEPRECATED214
DEPRECATED215
DEPRECATED216
DEPRECATED217
DEPRECATED218
DEPRECATED219
DEPRECATED220
DEPRECATED221
DEPRECATED222
DEPRECATED223
DEPRECATED224
DEPRECATED225
DEPRECATED226
DEPRECATED227
DEPRECATED228
DEPRECATED229
DEPRECATED230
DEPRECATED231
DEPRECATED232
DEPRECATED233
DEPRECATED234
DEPRECATED235
DEPRECATED236
DEPRECATED237
DEPRECATED238
DEPRECATED239
DEPRECATED240
DEPRECATED241
DEPRECATED242
DEPRECATED243
DEPRECATED244
DEPRECATED245
DEPRECATED246
DEPRECATED247
DEPRECATED248
DEPRECATED249
DEPRECATED250
DEPRECATED251
DEPRECATED252
DEPRECATED253
DEPRECATED254
DEPRECATED255
DEPRECATED256
DEPRECATED257
DEPRECATED258
DEPRECATED259
DEPRECATED260
DEPRECATED261
DEPRECATED262
DEPRECATED263
DEPRECATED264
DEPRECATED265
DEPRECATED266
DEPRECATED267
DEPRECATED268
DEPRECATED269
DEPRECATED270
DEPRECATED271
DEPRECATED272
DEPRECATED273
DEPRECATED274
DEPRECATED275
DEPRECATED276
PERSON
NORP
FACILITY
ORG
GPE
LOC
PRODUCT
EVENT
WORK_OF_ART
LANGUAGE
LAW
DATE
TIME
PERCENT
MONEY
QUANTITY
ORDINAL
CARDINAL
acomp
advcl
advmod
agent
amod
appos
attr
aux
auxpass
cc
ccomp
complm
conj
cop # U20
csubj
csubjpass
dep
det
dobj
expl
hmod
hyph
infmod
intj
iobj
mark
meta
neg
nmod
nn
npadvmod
nsubj
nsubjpass
num
number
oprd
obj # U20
obl # U20
parataxis
partmod
pcomp
pobj
poss
possessive
preconj
prep
prt
punct
quantmod
relcl
rcmod
root
xcomp
acl
ENT_KB_ID
MORPH
ENT_ID
IDX
_
# cython: optimize.unpack_method_calls=False
IDS = {
"": NIL,
"IS_ALPHA": IS_ALPHA,
"IS_ASCII": IS_ASCII,
"IS_DIGIT": IS_DIGIT,
"IS_LOWER": IS_LOWER,
"IS_PUNCT": IS_PUNCT,
"IS_SPACE": IS_SPACE,
"IS_TITLE": IS_TITLE,
"IS_UPPER": IS_UPPER,
"LIKE_URL": LIKE_URL,
"LIKE_NUM": LIKE_NUM,
"LIKE_EMAIL": LIKE_EMAIL,
"IS_STOP": IS_STOP,
"IS_OOV_DEPRECATED": IS_OOV_DEPRECATED,
"IS_BRACKET": IS_BRACKET,
"IS_QUOTE": IS_QUOTE,
"IS_LEFT_PUNCT": IS_LEFT_PUNCT,
"IS_RIGHT_PUNCT": IS_RIGHT_PUNCT,
"IS_CURRENCY": IS_CURRENCY,
"FLAG19": FLAG19,
"FLAG20": FLAG20,
"FLAG21": FLAG21,
"FLAG22": FLAG22,
"FLAG23": FLAG23,
"FLAG24": FLAG24,
"FLAG25": FLAG25,
"FLAG26": FLAG26,
"FLAG27": FLAG27,
"FLAG28": FLAG28,
"FLAG29": FLAG29,
"FLAG30": FLAG30,
"FLAG31": FLAG31,
"FLAG32": FLAG32,
"FLAG33": FLAG33,
"FLAG34": FLAG34,
"FLAG35": FLAG35,
"FLAG36": FLAG36,
"FLAG37": FLAG37,
"FLAG38": FLAG38,
"FLAG39": FLAG39,
"FLAG40": FLAG40,
"FLAG41": FLAG41,
"FLAG42": FLAG42,
"FLAG43": FLAG43,
"FLAG44": FLAG44,
"FLAG45": FLAG45,
"FLAG46": FLAG46,
"FLAG47": FLAG47,
"FLAG48": FLAG48,
"FLAG49": FLAG49,
"FLAG50": FLAG50,
"FLAG51": FLAG51,
"FLAG52": FLAG52,
"FLAG53": FLAG53,
"FLAG54": FLAG54,
"FLAG55": FLAG55,
"FLAG56": FLAG56,
"FLAG57": FLAG57,
"FLAG58": FLAG58,
"FLAG59": FLAG59,
"FLAG60": FLAG60,
"FLAG61": FLAG61,
"FLAG62": FLAG62,
"FLAG63": FLAG63,
"ID": ID,
"ORTH": ORTH,
"LOWER": LOWER,
"NORM": NORM,
"SHAPE": SHAPE,
"PREFIX": PREFIX,
"SUFFIX": SUFFIX,
"LENGTH": LENGTH,
"CLUSTER": CLUSTER,
"LEMMA": LEMMA,
"POS": POS,
"TAG": TAG,
"DEP": DEP,
"ENT_IOB": ENT_IOB,
"ENT_TYPE": ENT_TYPE,
"ENT_ID": ENT_ID,
"ENT_KB_ID": ENT_KB_ID,
"HEAD": HEAD,
"SENT_START": SENT_START,
"SPACY": SPACY,
"PROB": PROB,
"LANG": LANG,
"IDX": IDX,
"ADJ": ADJ,
"ADP": ADP,
"ADV": ADV,
"AUX": AUX,
"CONJ": CONJ,
"CCONJ": CCONJ, # U20
"DET": DET,
"INTJ": INTJ,
"NOUN": NOUN,
"NUM": NUM,
"PART": PART,
"PRON": PRON,
"PROPN": PROPN,
"PUNCT": PUNCT,
"SCONJ": SCONJ,
"SYM": SYM,
"VERB": VERB,
"X": X,
"EOL": EOL,
"SPACE": SPACE,
"DEPRECATED001": DEPRECATED001,
"DEPRECATED002": DEPRECATED002,
"DEPRECATED003": DEPRECATED003,
"DEPRECATED004": DEPRECATED004,
"DEPRECATED005": DEPRECATED005,
"DEPRECATED006": DEPRECATED006,
"DEPRECATED007": DEPRECATED007,
"DEPRECATED008": DEPRECATED008,
"DEPRECATED009": DEPRECATED009,
"DEPRECATED010": DEPRECATED010,
"DEPRECATED011": DEPRECATED011,
"DEPRECATED012": DEPRECATED012,
"DEPRECATED013": DEPRECATED013,
"DEPRECATED014": DEPRECATED014,
"DEPRECATED015": DEPRECATED015,
"DEPRECATED016": DEPRECATED016,
"DEPRECATED017": DEPRECATED017,
"DEPRECATED018": DEPRECATED018,
"DEPRECATED019": DEPRECATED019,
"DEPRECATED020": DEPRECATED020,
"DEPRECATED021": DEPRECATED021,
"DEPRECATED022": DEPRECATED022,
"DEPRECATED023": DEPRECATED023,
"DEPRECATED024": DEPRECATED024,
"DEPRECATED025": DEPRECATED025,
"DEPRECATED026": DEPRECATED026,
"DEPRECATED027": DEPRECATED027,
"DEPRECATED028": DEPRECATED028,
"DEPRECATED029": DEPRECATED029,
"DEPRECATED030": DEPRECATED030,
"DEPRECATED031": DEPRECATED031,
"DEPRECATED032": DEPRECATED032,
"DEPRECATED033": DEPRECATED033,
"DEPRECATED034": DEPRECATED034,
"DEPRECATED035": DEPRECATED035,
"DEPRECATED036": DEPRECATED036,
"DEPRECATED037": DEPRECATED037,
"DEPRECATED038": DEPRECATED038,
"DEPRECATED039": DEPRECATED039,
"DEPRECATED040": DEPRECATED040,
"DEPRECATED041": DEPRECATED041,
"DEPRECATED042": DEPRECATED042,
"DEPRECATED043": DEPRECATED043,
"DEPRECATED044": DEPRECATED044,
"DEPRECATED045": DEPRECATED045,
"DEPRECATED046": DEPRECATED046,
"DEPRECATED047": DEPRECATED047,
"DEPRECATED048": DEPRECATED048,
"DEPRECATED049": DEPRECATED049,
"DEPRECATED050": DEPRECATED050,
"DEPRECATED051": DEPRECATED051,
"DEPRECATED052": DEPRECATED052,
"DEPRECATED053": DEPRECATED053,
"DEPRECATED054": DEPRECATED054,
"DEPRECATED055": DEPRECATED055,
"DEPRECATED056": DEPRECATED056,
"DEPRECATED057": DEPRECATED057,
"DEPRECATED058": DEPRECATED058,
"DEPRECATED059": DEPRECATED059,
"DEPRECATED060": DEPRECATED060,
"DEPRECATED061": DEPRECATED061,
"DEPRECATED062": DEPRECATED062,
"DEPRECATED063": DEPRECATED063,
"DEPRECATED064": DEPRECATED064,
"DEPRECATED065": DEPRECATED065,
"DEPRECATED066": DEPRECATED066,
"DEPRECATED067": DEPRECATED067,
"DEPRECATED068": DEPRECATED068,
"DEPRECATED069": DEPRECATED069,
"DEPRECATED070": DEPRECATED070,
"DEPRECATED071": DEPRECATED071,
"DEPRECATED072": DEPRECATED072,
"DEPRECATED073": DEPRECATED073,
"DEPRECATED074": DEPRECATED074,
"DEPRECATED075": DEPRECATED075,
"DEPRECATED076": DEPRECATED076,
"DEPRECATED077": DEPRECATED077,
"DEPRECATED078": DEPRECATED078,
"DEPRECATED079": DEPRECATED079,
"DEPRECATED080": DEPRECATED080,
"DEPRECATED081": DEPRECATED081,
"DEPRECATED082": DEPRECATED082,
"DEPRECATED083": DEPRECATED083,
"DEPRECATED084": DEPRECATED084,
"DEPRECATED085": DEPRECATED085,
"DEPRECATED086": DEPRECATED086,
"DEPRECATED087": DEPRECATED087,
"DEPRECATED088": DEPRECATED088,
"DEPRECATED089": DEPRECATED089,
"DEPRECATED090": DEPRECATED090,
"DEPRECATED091": DEPRECATED091,
"DEPRECATED092": DEPRECATED092,
"DEPRECATED093": DEPRECATED093,
"DEPRECATED094": DEPRECATED094,
"DEPRECATED095": DEPRECATED095,
"DEPRECATED096": DEPRECATED096,
"DEPRECATED097": DEPRECATED097,
"DEPRECATED098": DEPRECATED098,
"DEPRECATED099": DEPRECATED099,
"DEPRECATED100": DEPRECATED100,
"DEPRECATED101": DEPRECATED101,
"DEPRECATED102": DEPRECATED102,
"DEPRECATED103": DEPRECATED103,
"DEPRECATED104": DEPRECATED104,
"DEPRECATED105": DEPRECATED105,
"DEPRECATED106": DEPRECATED106,
"DEPRECATED107": DEPRECATED107,
"DEPRECATED108": DEPRECATED108,
"DEPRECATED109": DEPRECATED109,
"DEPRECATED110": DEPRECATED110,
"DEPRECATED111": DEPRECATED111,
"DEPRECATED112": DEPRECATED112,
"DEPRECATED113": DEPRECATED113,
"DEPRECATED114": DEPRECATED114,
"DEPRECATED115": DEPRECATED115,
"DEPRECATED116": DEPRECATED116,
"DEPRECATED117": DEPRECATED117,
"DEPRECATED118": DEPRECATED118,
"DEPRECATED119": DEPRECATED119,
"DEPRECATED120": DEPRECATED120,
"DEPRECATED121": DEPRECATED121,
"DEPRECATED122": DEPRECATED122,
"DEPRECATED123": DEPRECATED123,
"DEPRECATED124": DEPRECATED124,
"DEPRECATED125": DEPRECATED125,
"DEPRECATED126": DEPRECATED126,
"DEPRECATED127": DEPRECATED127,
"DEPRECATED128": DEPRECATED128,
"DEPRECATED129": DEPRECATED129,
"DEPRECATED130": DEPRECATED130,
"DEPRECATED131": DEPRECATED131,
"DEPRECATED132": DEPRECATED132,
"DEPRECATED133": DEPRECATED133,
"DEPRECATED134": DEPRECATED134,
"DEPRECATED135": DEPRECATED135,
"DEPRECATED136": DEPRECATED136,
"DEPRECATED137": DEPRECATED137,
"DEPRECATED138": DEPRECATED138,
"DEPRECATED139": DEPRECATED139,
"DEPRECATED140": DEPRECATED140,
"DEPRECATED141": DEPRECATED141,
"DEPRECATED142": DEPRECATED142,
"DEPRECATED143": DEPRECATED143,
"DEPRECATED144": DEPRECATED144,
"DEPRECATED145": DEPRECATED145,
"DEPRECATED146": DEPRECATED146,
"DEPRECATED147": DEPRECATED147,
"DEPRECATED148": DEPRECATED148,
"DEPRECATED149": DEPRECATED149,
"DEPRECATED150": DEPRECATED150,
"DEPRECATED151": DEPRECATED151,
"DEPRECATED152": DEPRECATED152,
"DEPRECATED153": DEPRECATED153,
"DEPRECATED154": DEPRECATED154,
"DEPRECATED155": DEPRECATED155,
"DEPRECATED156": DEPRECATED156,
"DEPRECATED157": DEPRECATED157,
"DEPRECATED158": DEPRECATED158,
"DEPRECATED159": DEPRECATED159,
"DEPRECATED160": DEPRECATED160,
"DEPRECATED161": DEPRECATED161,
"DEPRECATED162": DEPRECATED162,
"DEPRECATED163": DEPRECATED163,
"DEPRECATED164": DEPRECATED164,
"DEPRECATED165": DEPRECATED165,
"DEPRECATED166": DEPRECATED166,
"DEPRECATED167": DEPRECATED167,
"DEPRECATED168": DEPRECATED168,
"DEPRECATED169": DEPRECATED169,
"DEPRECATED170": DEPRECATED170,
"DEPRECATED171": DEPRECATED171,
"DEPRECATED172": DEPRECATED172,
"DEPRECATED173": DEPRECATED173,
"DEPRECATED174": DEPRECATED174,
"DEPRECATED175": DEPRECATED175,
"DEPRECATED176": DEPRECATED176,
"DEPRECATED177": DEPRECATED177,
"DEPRECATED178": DEPRECATED178,
"DEPRECATED179": DEPRECATED179,
"DEPRECATED180": DEPRECATED180,
"DEPRECATED181": DEPRECATED181,
"DEPRECATED182": DEPRECATED182,
"DEPRECATED183": DEPRECATED183,
"DEPRECATED184": DEPRECATED184,
"DEPRECATED185": DEPRECATED185,
"DEPRECATED186": DEPRECATED186,
"DEPRECATED187": DEPRECATED187,
"DEPRECATED188": DEPRECATED188,
"DEPRECATED189": DEPRECATED189,
"DEPRECATED190": DEPRECATED190,
"DEPRECATED191": DEPRECATED191,
"DEPRECATED192": DEPRECATED192,
"DEPRECATED193": DEPRECATED193,
"DEPRECATED194": DEPRECATED194,
"DEPRECATED195": DEPRECATED195,
"DEPRECATED196": DEPRECATED196,
"DEPRECATED197": DEPRECATED197,
"DEPRECATED198": DEPRECATED198,
"DEPRECATED199": DEPRECATED199,
"DEPRECATED200": DEPRECATED200,
"DEPRECATED201": DEPRECATED201,
"DEPRECATED202": DEPRECATED202,
"DEPRECATED203": DEPRECATED203,
"DEPRECATED204": DEPRECATED204,
"DEPRECATED205": DEPRECATED205,
"DEPRECATED206": DEPRECATED206,
"DEPRECATED207": DEPRECATED207,
"DEPRECATED208": DEPRECATED208,
"DEPRECATED209": DEPRECATED209,
"DEPRECATED210": DEPRECATED210,
"DEPRECATED211": DEPRECATED211,
"DEPRECATED212": DEPRECATED212,
"DEPRECATED213": DEPRECATED213,
"DEPRECATED214": DEPRECATED214,
"DEPRECATED215": DEPRECATED215,
"DEPRECATED216": DEPRECATED216,
"DEPRECATED217": DEPRECATED217,
"DEPRECATED218": DEPRECATED218,
"DEPRECATED219": DEPRECATED219,
"DEPRECATED220": DEPRECATED220,
"DEPRECATED221": DEPRECATED221,
"DEPRECATED222": DEPRECATED222,
"DEPRECATED223": DEPRECATED223,
"DEPRECATED224": DEPRECATED224,
"DEPRECATED225": DEPRECATED225,
"DEPRECATED226": DEPRECATED226,
"DEPRECATED227": DEPRECATED227,
"DEPRECATED228": DEPRECATED228,
"DEPRECATED229": DEPRECATED229,
"DEPRECATED230": DEPRECATED230,
"DEPRECATED231": DEPRECATED231,
"DEPRECATED232": DEPRECATED232,
"DEPRECATED233": DEPRECATED233,
"DEPRECATED234": DEPRECATED234,
"DEPRECATED235": DEPRECATED235,
"DEPRECATED236": DEPRECATED236,
"DEPRECATED237": DEPRECATED237,
"DEPRECATED238": DEPRECATED238,
"DEPRECATED239": DEPRECATED239,
"DEPRECATED240": DEPRECATED240,
"DEPRECATED241": DEPRECATED241,
"DEPRECATED242": DEPRECATED242,
"DEPRECATED243": DEPRECATED243,
"DEPRECATED244": DEPRECATED244,
"DEPRECATED245": DEPRECATED245,
"DEPRECATED246": DEPRECATED246,
"DEPRECATED247": DEPRECATED247,
"DEPRECATED248": DEPRECATED248,
"DEPRECATED249": DEPRECATED249,
"DEPRECATED250": DEPRECATED250,
"DEPRECATED251": DEPRECATED251,
"DEPRECATED252": DEPRECATED252,
"DEPRECATED253": DEPRECATED253,
"DEPRECATED254": DEPRECATED254,
"DEPRECATED255": DEPRECATED255,
"DEPRECATED256": DEPRECATED256,
"DEPRECATED257": DEPRECATED257,
"DEPRECATED258": DEPRECATED258,
"DEPRECATED259": DEPRECATED259,
"DEPRECATED260": DEPRECATED260,
"DEPRECATED261": DEPRECATED261,
"DEPRECATED262": DEPRECATED262,
"DEPRECATED263": DEPRECATED263,
"DEPRECATED264": DEPRECATED264,
"DEPRECATED265": DEPRECATED265,
"DEPRECATED266": DEPRECATED266,
"DEPRECATED267": DEPRECATED267,
"DEPRECATED268": DEPRECATED268,
"DEPRECATED269": DEPRECATED269,
"DEPRECATED270": DEPRECATED270,
"DEPRECATED271": DEPRECATED271,
"DEPRECATED272": DEPRECATED272,
"DEPRECATED273": DEPRECATED273,
"DEPRECATED274": DEPRECATED274,
"DEPRECATED275": DEPRECATED275,
"DEPRECATED276": DEPRECATED276,
"PERSON": PERSON,
"NORP": NORP,
"FACILITY": FACILITY,
"ORG": ORG,
"GPE": GPE,
"LOC": LOC,
"PRODUCT": PRODUCT,
"EVENT": EVENT,
"WORK_OF_ART": WORK_OF_ART,
"LANGUAGE": LANGUAGE,
"DATE": DATE,
"TIME": TIME,
"PERCENT": PERCENT,
"MONEY": MONEY,
"QUANTITY": QUANTITY,
"ORDINAL": ORDINAL,
"CARDINAL": CARDINAL,
"acomp": acomp,
"advcl": advcl,
"advmod": advmod,
"agent": agent,
"amod": amod,
"appos": appos,
"attr": attr,
"aux": aux,
"auxpass": auxpass,
"cc": cc,
"ccomp": ccomp,
"complm": complm,
"conj": conj,
"cop": cop, # U20
"csubj": csubj,
"csubjpass": csubjpass,
"dep": dep,
"det": det,
"dobj": dobj,
"expl": expl,
"hmod": hmod,
"hyph": hyph,
"infmod": infmod,
"intj": intj,
"iobj": iobj,
"mark": mark,
"meta": meta,
"neg": neg,
"nmod": nmod,
"nn": nn,
"npadvmod": npadvmod,
"nsubj": nsubj,
"nsubjpass": nsubjpass,
"num": num,
"number": number,
"oprd": oprd,
"obj": obj, # U20
"obl": obl, # U20
"parataxis": parataxis,
"partmod": partmod,
"pcomp": pcomp,
"pobj": pobj,
"poss": poss,
"possessive": possessive,
"preconj": preconj,
"prep": prep,
"prt": prt,
"punct": punct,
"quantmod": quantmod,
"rcmod": rcmod,
"relcl": relcl,
"root": root,
"xcomp": xcomp,
"acl": acl,
"LAW": LAW,
"MORPH": MORPH,
"_": _,
}
def sort_nums(x):
return x[1]
NAMES = [it[0] for it in sorted(IDS.items(), key=sort_nums)]
# Unfortunate hack here, to work around problem with long cpdef enum
# (which is generating an enormous amount of C++ in Cython 0.24+)
# We keep the enum cdef, and just make sure the names are available to Python
locals().update(IDS)
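# Illustrative note (assuming the contiguous enum values defined above):
# IDS maps symbol names to their integer values and NAMES inverts the
# mapping by value, so e.g. NAMES[IDS["ORTH"]] == "ORTH".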
from libcpp.vector cimport vector
from preshed.maps cimport PreshMap
from cymem.cymem cimport Pool
from .typedefs cimport hash_t
from .structs cimport LexemeC, SpanC, TokenC
from .strings cimport StringStore
from .tokens.doc cimport Doc
from .vocab cimport Vocab, LexemesOrTokens, _Cached
from .matcher.phrasematcher cimport PhraseMatcher
cdef class Tokenizer:
cdef Pool mem
cdef PreshMap _cache
cdef PreshMap _specials
cdef readonly Vocab vocab
cdef object _token_match
cdef object _url_match
cdef object _prefix_search
cdef object _suffix_search
cdef object _infix_finditer
cdef object _rules
cdef PhraseMatcher _special_matcher
# TODO next two are unused and should be removed in v4
# https://github.com/explosion/spaCy/pull/9150
cdef int _unused_int1
cdef int _unused_int2
cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)
cdef int _apply_special_cases(self, Doc doc) except -1
cdef void _filter_special_spans(self, vector[SpanC] &original,
vector[SpanC] &filtered, int doc_len) nogil
cdef object _prepare_special_spans(self, Doc doc,
vector[SpanC] &filtered)
cdef int _retokenize_special_spans(self, Doc doc, TokenC* tokens,
object span_data)
cdef int _try_specials_and_cache(self, hash_t key, Doc tokens,
int* has_special,
bint with_special_cases) except -1
cdef int _tokenize(self, Doc tokens, str span, hash_t key,
int* has_special, bint with_special_cases) except -1
cdef str _split_affixes(self, Pool mem, str string,
vector[LexemeC*] *prefixes,
vector[LexemeC*] *suffixes, int* has_special,
bint with_special_cases)
cdef int _attach_tokens(self, Doc tokens, str string,
vector[LexemeC*] *prefixes,
vector[LexemeC*] *suffixes, int* has_special,
bint with_special_cases) except -1
cdef int _save_cached(self, const TokenC* tokens, hash_t key,
int* has_special, int n) except -1
# cython: embedsignature=True, profile=True, binding=True
from cython.operator cimport dereference as deref
from cython.operator cimport preincrement as preinc
from libc.string cimport memcpy, memset
from libcpp.set cimport set as stdset
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
cimport cython
import re
import warnings
from .tokens.doc cimport Doc
from .strings cimport hash_string
from .lexeme cimport EMPTY_LEXEME
from .attrs import intify_attrs
from .symbols import ORTH, NORM
from .errors import Errors, Warnings
from . import util
from .util import registry, get_words_and_spaces
from .scorer import Scorer
from .training import validate_examples
from .tokens import Span
cdef class Tokenizer:
"""Segment text, and create Doc objects with the discovered segment
boundaries.
DOCS: https://spacy.io/api/tokenizer
"""
def __init__(self, Vocab vocab, rules=None, prefix_search=None,
suffix_search=None, infix_finditer=None, token_match=None,
url_match=None):
"""Create a `Tokenizer`, to create `Doc` objects given unicode text.
vocab (Vocab): A storage container for lexical types.
rules (dict): Exceptions and special-cases for the tokenizer.
prefix_search (callable): A function matching the signature of
`re.compile(string).search` to match prefixes.
suffix_search (callable): A function matching the signature of
`re.compile(string).search` to match suffixes.
infix_finditer (callable): A function matching the signature of
`re.compile(string).finditer` to find infixes.
token_match (callable): A boolean function matching strings to be
recognized as tokens.
url_match (callable): A boolean function matching strings to be
recognized as tokens after considering prefixes and suffixes.
EXAMPLE:
>>> tokenizer = Tokenizer(nlp.vocab)
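A fuller construction (illustrative sketch; `prefix_re`, `suffix_re` and
`infix_re` are assumed to be compiled regexes, e.g. built with the
helpers in spacy.util such as compile_prefix_regex):
>>> tokenizer = Tokenizer(nlp.vocab, rules={},
...                       prefix_search=prefix_re.search,
...                       suffix_search=suffix_re.search,
...                       infix_finditer=infix_re.finditer)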
DOCS: https://spacy.io/api/tokenizer#init
"""
self.mem = Pool()
self._cache = PreshMap()
self._specials = PreshMap()
self.token_match = token_match
self.url_match = url_match
self.prefix_search = prefix_search
self.suffix_search = suffix_search
self.infix_finditer = infix_finditer
self.vocab = vocab
self._rules = {}
self._special_matcher = PhraseMatcher(self.vocab)
self._load_special_cases(rules)
property token_match:
def __get__(self):
return self._token_match
def __set__(self, token_match):
self._token_match = token_match
self._reload_special_cases()
property url_match:
def __get__(self):
return self._url_match
def __set__(self, url_match):
self._url_match = url_match
self._reload_special_cases()
property prefix_search:
def __get__(self):
return self._prefix_search
def __set__(self, prefix_search):
self._prefix_search = prefix_search
self._reload_special_cases()
property suffix_search:
def __get__(self):
return self._suffix_search
def __set__(self, suffix_search):
self._suffix_search = suffix_search
self._reload_special_cases()
property infix_finditer:
def __get__(self):
return self._infix_finditer
def __set__(self, infix_finditer):
self._infix_finditer = infix_finditer
self._reload_special_cases()
property rules:
def __get__(self):
return self._rules
def __set__(self, rules):
self._rules = {}
self._flush_cache()
self._flush_specials()
self._cache = PreshMap()
self._specials = PreshMap()
self._load_special_cases(rules)
def __reduce__(self):
args = (self.vocab,
self.rules,
self.prefix_search,
self.suffix_search,
self.infix_finditer,
self.token_match,
self.url_match)
return (self.__class__, args, None, None)
def __call__(self, str string):
"""Tokenize a string.
string (str): The string to tokenize.
RETURNS (Doc): A container for linguistic annotations.
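EXAMPLE (illustrative; assumes an existing `nlp` pipeline):
>>> doc = nlp.tokenizer("Give it back! He pleaded.")
>>> [t.text for t in doc][:3]
['Give', 'it', 'back']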
DOCS: https://spacy.io/api/tokenizer#call
"""
doc = self._tokenize_affixes(string, True)
self._apply_special_cases(doc)
return doc
@cython.boundscheck(False)
cdef Doc _tokenize_affixes(self, str string, bint with_special_cases):
"""Tokenize according to affix and token_match settings.
string (str): The string to tokenize.
RETURNS (Doc): A container for linguistic annotations.
"""
if len(string) >= (2 ** 30):
raise ValueError(Errors.E025.format(length=len(string)))
cdef int length = len(string)
cdef Doc doc = Doc(self.vocab)
if length == 0:
return doc
cdef int i = 0
cdef int start = 0
cdef int has_special = 0
cdef bint in_ws = string[0].isspace()
cdef str span
# The task here is much like string.split, but not quite
# We find spans of whitespace and non-space characters, and ignore
# spans that are exactly ' '. So, our sequences will all be separated
# by either ' ' or nothing.
for uc in string:
if uc.isspace() != in_ws:
if start < i:
# When we want to make this fast, get the data buffer once
# with PyUnicode_AS_DATA, and then maintain a start_byte
# and end_byte, so we can call hash64 directly. That way
# we don't have to create the slice when we hit the cache.
span = string[start:i]
key = hash_string(span)
if not self._try_specials_and_cache(key, doc, &has_special, with_special_cases):
self._tokenize(doc, span, key, &has_special, with_special_cases)
if uc == ' ':
doc.c[doc.length - 1].spacy = True
start = i + 1
else:
start = i
in_ws = not in_ws
i += 1
if start < i:
span = string[start:]
key = hash_string(span)
if not self._try_specials_and_cache(key, doc, &has_special, with_special_cases):
self._tokenize(doc, span, key, &has_special, with_special_cases)
doc.c[doc.length - 1].spacy = string[-1] == " " and not in_ws
return doc
def pipe(self, texts, batch_size=1000):
"""Tokenize a stream of texts.
texts: A sequence of unicode texts.
batch_size (int): Number of texts to accumulate in an internal buffer.
Defaults to 1000.
YIELDS (Doc): A sequence of Doc objects, in order.
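EXAMPLE (illustrative; assumes an existing `nlp` pipeline and an iterable
`texts` of strings):
>>> for doc in nlp.tokenizer.pipe(texts, batch_size=50):
...     pass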
DOCS: https://spacy.io/api/tokenizer#pipe
"""
for text in texts:
yield self(text)
def _flush_cache(self):
self._reset_cache([key for key in self._cache])
def _reset_cache(self, keys):
for k in keys:
cached = <_Cached*>self._cache.get(k)
del self._cache[k]
if cached is not NULL:
self.mem.free(cached)
def _flush_specials(self):
self._special_matcher = PhraseMatcher(self.vocab)
for k in self._specials:
cached = <_Cached*>self._specials.get(k)
del self._specials[k]
if cached is not NULL:
self.mem.free(cached)
cdef int _apply_special_cases(self, Doc doc) except -1:
"""Retokenize doc according to special cases.
doc (Doc): Document.
"""
cdef int i
cdef int max_length = 0
cdef bint modify_in_place
cdef Pool mem = Pool()
cdef vector[SpanC] c_matches
cdef vector[SpanC] c_filtered
cdef int offset
cdef int modified_doc_length
# Find matches for special cases
self._special_matcher.find_matches(doc, 0, doc.length, &c_matches)
# Skip processing if no matches
if c_matches.size() == 0:
return True
self._filter_special_spans(c_matches, c_filtered, doc.length)
# Put span info in span.start-indexed dict and calculate maximum
# intermediate document size
(span_data, max_length, modify_in_place) = self._prepare_special_spans(doc, c_filtered)
# If modifications never increase doc length, can modify in place
if modify_in_place:
tokens = doc.c
# Otherwise create a separate array to store modified tokens
else:
assert max_length > 0
tokens = <TokenC*>mem.alloc(max_length, sizeof(TokenC))
# Modify tokenization according to filtered special cases
offset = self._retokenize_special_spans(doc, tokens, span_data)
# Allocate more memory for doc if needed
modified_doc_length = doc.length + offset
while modified_doc_length >= doc.max_length:
doc._realloc(doc.max_length * 2)
# If not modified in place, copy tokens back to doc
if not modify_in_place:
memcpy(doc.c, tokens, max_length * sizeof(TokenC))
for i in range(doc.length + offset, doc.length):
memset(&doc.c[i], 0, sizeof(TokenC))
doc.c[i].lex = &EMPTY_LEXEME
doc.length = doc.length + offset
return True
cdef void _filter_special_spans(self, vector[SpanC] &original, vector[SpanC] &filtered, int doc_len) nogil:
cdef int seen_i
cdef SpanC span
cdef stdset[int] seen_tokens
stdsort(original.begin(), original.end(), len_start_cmp)
cdef int orig_i = original.size() - 1
while orig_i >= 0:
span = original[orig_i]
if not seen_tokens.count(span.start) and not seen_tokens.count(span.end - 1):
filtered.push_back(span)
for seen_i in range(span.start, span.end):
seen_tokens.insert(seen_i)
orig_i -= 1
stdsort(filtered.begin(), filtered.end(), start_cmp)
cdef object _prepare_special_spans(self, Doc doc, vector[SpanC] &filtered):
spans = [doc[match.start:match.end] for match in filtered]
cdef bint modify_in_place = True
cdef int curr_length = doc.length
cdef int max_length = 0
cdef int span_length_diff = 0
span_data = {}
for span in spans:
rule = self._rules.get(span.text, None)
span_length_diff = 0
if rule:
span_length_diff = len(rule) - (span.end - span.start)
if span_length_diff > 0:
modify_in_place = False
curr_length += span_length_diff
if curr_length > max_length:
max_length = curr_length
span_data[span.start] = (span.text, span.start, span.end, span_length_diff)
return (span_data, max_length, modify_in_place)
cdef int _retokenize_special_spans(self, Doc doc, TokenC* tokens, object span_data):
cdef int i = 0
cdef int j = 0
cdef int offset = 0
cdef _Cached* cached
cdef int idx_offset = 0
cdef int orig_final_spacy
cdef int orig_idx
cdef int span_start
cdef int span_end
while i < doc.length:
if not i in span_data:
tokens[i + offset] = doc.c[i]
i += 1
else:
span = span_data[i]
span_start = span[1]
span_end = span[2]
cached = <_Cached*>self._specials.get(hash_string(span[0]))
if cached == NULL:
# Copy original tokens if no rule found
for j in range(span_end - span_start):
tokens[i + offset + j] = doc.c[i + j]
i += span_end - span_start
else:
# Copy special case tokens into doc and adjust token and
# character offsets
idx_offset = 0
orig_final_spacy = doc.c[span_end - 1].spacy
orig_idx = doc.c[i].idx
for j in range(cached.length):
tokens[i + offset + j] = cached.data.tokens[j]
tokens[i + offset + j].idx = orig_idx + idx_offset
idx_offset += cached.data.tokens[j].lex.length
if cached.data.tokens[j].spacy:
idx_offset += 1
tokens[i + offset + cached.length - 1].spacy = orig_final_spacy
i += span_end - span_start
offset += span[3]
return offset
cdef int _try_specials_and_cache(self, hash_t key, Doc tokens, int* has_special, bint with_special_cases) except -1:
cdef bint specials_hit = 0
cdef bint cache_hit = 0
cdef int i
if with_special_cases:
cached = <_Cached*>self._specials.get(key)
if cached == NULL:
specials_hit = False
else:
for i in range(cached.length):
tokens.push_back(&cached.data.tokens[i], False)
has_special[0] = 1
specials_hit = True
if not specials_hit:
cached = <_Cached*>self._cache.get(key)
if cached == NULL:
cache_hit = False
else:
if cached.is_lex:
for i in range(cached.length):
tokens.push_back(cached.data.lexemes[i], False)
else:
for i in range(cached.length):
tokens.push_back(&cached.data.tokens[i], False)
cache_hit = True
if not specials_hit and not cache_hit:
return False
return True
cdef int _tokenize(self, Doc tokens, str span, hash_t orig_key, int* has_special, bint with_special_cases) except -1:
cdef vector[LexemeC*] prefixes
cdef vector[LexemeC*] suffixes
cdef int orig_size
orig_size = tokens.length
span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes,
has_special, with_special_cases)
self._attach_tokens(tokens, span, &prefixes, &suffixes, has_special,
with_special_cases)
self._save_cached(&tokens.c[orig_size], orig_key, has_special,
tokens.length - orig_size)
cdef str _split_affixes(self, Pool mem, str string,
vector[const LexemeC*] *prefixes,
vector[const LexemeC*] *suffixes,
int* has_special,
bint with_special_cases):
cdef size_t i
cdef str prefix
cdef str suffix
cdef str minus_pre
cdef str minus_suf
cdef size_t last_size = 0
while string and len(string) != last_size:
if self.token_match and self.token_match(string):
break
if with_special_cases and self._specials.get(hash_string(string)) != NULL:
break
last_size = len(string)
pre_len = self.find_prefix(string)
if pre_len != 0:
prefix = string[:pre_len]
minus_pre = string[pre_len:]
if minus_pre and with_special_cases and self._specials.get(hash_string(minus_pre)) != NULL:
string = minus_pre
prefixes.push_back(self.vocab.get(mem, prefix))
break
suf_len = self.find_suffix(string[pre_len:])
if suf_len != 0:
suffix = string[-suf_len:]
minus_suf = string[:-suf_len]
if minus_suf and with_special_cases and self._specials.get(hash_string(minus_suf)) != NULL:
string = minus_suf
suffixes.push_back(self.vocab.get(mem, suffix))
break
if pre_len and suf_len and (pre_len + suf_len) <= len(string):
string = string[pre_len:-suf_len]
prefixes.push_back(self.vocab.get(mem, prefix))
suffixes.push_back(self.vocab.get(mem, suffix))
elif pre_len:
string = minus_pre
prefixes.push_back(self.vocab.get(mem, prefix))
elif suf_len:
string = minus_suf
suffixes.push_back(self.vocab.get(mem, suffix))
return string
cdef int _attach_tokens(self, Doc tokens, str string,
vector[const LexemeC*] *prefixes,
vector[const LexemeC*] *suffixes,
int* has_special,
bint with_special_cases) except -1:
cdef bint specials_hit = 0
cdef bint cache_hit = 0
cdef int split, end
cdef const LexemeC* const* lexemes
cdef const LexemeC* lexeme
cdef str span
cdef int i
if prefixes.size():
for i in range(prefixes.size()):
tokens.push_back(prefixes[0][i], False)
if string:
if self._try_specials_and_cache(hash_string(string), tokens, has_special, with_special_cases):
pass
elif (self.token_match and self.token_match(string)) or \
(self.url_match and \
self.url_match(string)):
# We're always saying 'no' to spaces here -- the caller will
# fix up the outermost one, with reference to the original.
# See Issue #859
tokens.push_back(self.vocab.get(tokens.mem, string), False)
else:
matches = self.find_infix(string)
if not matches:
tokens.push_back(self.vocab.get(tokens.mem, string), False)
else:
# Let's say we have dyn-o-mite-dave - the regex finds the
# start and end positions of the hyphens
start = 0
start_before_infixes = start
for match in matches:
infix_start = match.start()
infix_end = match.end()
if infix_start == start_before_infixes:
continue
if infix_start != start:
span = string[start:infix_start]
tokens.push_back(self.vocab.get(tokens.mem, span), False)
if infix_start != infix_end:
# If infix_start != infix_end, it means the infix
# token is non-empty. Empty infix tokens are useful
# for tokenization in some languages (see
# https://github.com/explosion/spaCy/issues/768)
infix_span = string[infix_start:infix_end]
tokens.push_back(self.vocab.get(tokens.mem, infix_span), False)
start = infix_end
span = string[start:]
if span:
tokens.push_back(self.vocab.get(tokens.mem, span), False)
cdef vector[const LexemeC*].reverse_iterator it = suffixes.rbegin()
while it != suffixes.rend():
lexeme = deref(it)
preinc(it)
tokens.push_back(lexeme, False)
cdef int _save_cached(self, const TokenC* tokens, hash_t key,
int* has_special, int n) except -1:
cdef int i
if n <= 0:
# avoid mem alloc of zero length
return 0
for i in range(n):
if self.vocab._by_orth.get(tokens[i].lex.orth) == NULL:
return 0
# See #1250
if has_special[0]:
return 0
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
cached.length = n
cached.is_lex = True
lexemes = <const LexemeC**>self.mem.alloc(n, sizeof(LexemeC**))
for i in range(n):
lexemes[i] = tokens[i].lex
cached.data.lexemes = <const LexemeC* const*>lexemes
self._cache.set(key, cached)
def find_infix(self, str string):
"""Find internal split points of the string, such as hyphens.
string (str): The string to segment.
RETURNS (list): A list of `re.Match` objects that have `.start()`
and `.end()` methods, denoting the placement of internal segment
separators, e.g. hyphens.
DOCS: https://spacy.io/api/tokenizer#find_infix
"""
if self.infix_finditer is None:
return 0
return list(self.infix_finditer(string))
def find_prefix(self, str string):
"""Find the length of a prefix that should be segmented from the
string, or None if no prefix rules match.
string (str): The string to segment.
RETURNS (int): The length of the prefix if present, otherwise `None`.
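EXAMPLE (illustrative; with the default English punctuation rules the
leading quote is matched as a one-character prefix):
>>> nlp.tokenizer.find_prefix('"Hello')
1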
DOCS: https://spacy.io/api/tokenizer#find_prefix
"""
if self.prefix_search is None:
return 0
match = self.prefix_search(string)
return (match.end() - match.start()) if match is not None else 0
def find_suffix(self, str string):
"""Find the length of a suffix that should be segmented from the
string, or None if no suffix rules match.
string (str): The string to segment.
Returns (int): The length of the suffix if present, otherwise `None`.
DOCS: https://spacy.io/api/tokenizer#find_suffix
"""
if self.suffix_search is None:
return 0
match = self.suffix_search(string)
return (match.end() - match.start()) if match is not None else 0
def _load_special_cases(self, special_cases):
"""Add special-case tokenization rules."""
if special_cases is not None:
for chunk, substrings in sorted(special_cases.items()):
self.add_special_case(chunk, substrings)
def _validate_special_case(self, chunk, substrings):
"""Check whether the `ORTH` fields match the string. Check that
additional features beyond `ORTH` and `NORM` are not set by the
exception.
chunk (str): The string to specially tokenize.
substrings (iterable): A sequence of dicts, where each dict describes
a token and its attributes.
"""
attrs = [intify_attrs(spec, _do_deprecated=True) for spec in substrings]
orth = "".join([spec[ORTH] for spec in attrs])
if chunk != orth:
raise ValueError(Errors.E997.format(chunk=chunk, orth=orth, token_attrs=substrings))
for substring in attrs:
for attr in substring:
if attr not in (ORTH, NORM):
raise ValueError(Errors.E1005.format(attr=self.vocab.strings[attr], chunk=chunk))
def add_special_case(self, str string, substrings):
"""Add a special-case tokenization rule.
string (str): The string to specially tokenize.
substrings (iterable): A sequence of dicts, where each dict describes
a token and its attributes. The `ORTH` fields of the attributes
must exactly match the string when they are concatenated.
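EXAMPLE (illustrative):
>>> from spacy.attrs import ORTH
>>> nlp.tokenizer.add_special_case("don't", [{ORTH: "do"}, {ORTH: "n't"}])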
DOCS: https://spacy.io/api/tokenizer#add_special_case
"""
self._validate_special_case(string, substrings)
substrings = list(substrings)
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
cached.length = len(substrings)
cached.is_lex = False
cached.data.tokens = self.vocab.make_fused_token(substrings)
key = hash_string(string)
stale_special = <_Cached*>self._specials.get(key)
self._specials.set(key, cached)
if stale_special is not NULL:
self.mem.free(stale_special)
self._rules[string] = substrings
self._flush_cache()
if self.find_prefix(string) or self.find_infix(string) or self.find_suffix(string) or " " in string:
self._special_matcher.add(string, None, self._tokenize_affixes(string, False))
def _reload_special_cases(self):
self._flush_cache()
self._flush_specials()
self._load_special_cases(self._rules)
def explain(self, text):
"""A debugging tokenizer that provides information about which
tokenizer rule or pattern was matched for each token. The tokens
produced are identical to `nlp.tokenizer()` except for whitespace
tokens.
text (str): The string to tokenize.
RETURNS (list): A list of (pattern_string, token_string) tuples.
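EXAMPLE (illustrative; output shown for the default English rules):
>>> nlp.tokenizer.explain("(don't)")
[('PREFIX', '('), ('SPECIAL-1', 'do'), ('SPECIAL-2', "n't"), ('SUFFIX', ')')]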
DOCS: https://spacy.io/api/tokenizer#explain
"""
prefix_search = self.prefix_search
if prefix_search is None:
prefix_search = re.compile("a^").search
suffix_search = self.suffix_search
if suffix_search is None:
suffix_search = re.compile("a^").search
infix_finditer = self.infix_finditer
if infix_finditer is None:
infix_finditer = re.compile("a^").finditer
token_match = self.token_match
if token_match is None:
token_match = re.compile("a^").match
url_match = self.url_match
if url_match is None:
url_match = re.compile("a^").match
special_cases = {}
for orth, special_tokens in self.rules.items():
special_cases[orth] = [intify_attrs(special_token, strings_map=self.vocab.strings, _do_deprecated=True) for special_token in special_tokens]
tokens = []
for substring in text.split():
suffixes = []
while substring:
while prefix_search(substring) or suffix_search(substring):
if token_match(substring):
tokens.append(("TOKEN_MATCH", substring))
substring = ''
break
if substring in special_cases:
tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
substring = ''
break
if prefix_search(substring):
split = prefix_search(substring).end()
# break if pattern matches the empty string
if split == 0:
break
tokens.append(("PREFIX", substring[:split]))
substring = substring[split:]
if substring in special_cases:
continue
if suffix_search(substring):
split = suffix_search(substring).start()
# break if pattern matches the empty string
if split == len(substring):
break
suffixes.append(("SUFFIX", substring[split:]))
substring = substring[:split]
if len(substring) == 0:
continue
if token_match(substring):
tokens.append(("TOKEN_MATCH", substring))
substring = ''
elif url_match(substring):
tokens.append(("URL_MATCH", substring))
substring = ''
elif substring in special_cases:
tokens.extend((f"SPECIAL-{i + 1}", self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
substring = ''
elif list(infix_finditer(substring)):
infixes = infix_finditer(substring)
offset = 0
for match in infixes:
if substring[offset : match.start()]:
tokens.append(("TOKEN", substring[offset : match.start()]))
if substring[match.start() : match.end()]:
tokens.append(("INFIX", substring[match.start() : match.end()]))
offset = match.end()
if substring[offset:]:
tokens.append(("TOKEN", substring[offset:]))
substring = ''
elif substring:
tokens.append(("TOKEN", substring))
substring = ''
tokens.extend(reversed(suffixes))
# Find matches for special cases handled by special matcher
words, spaces = get_words_and_spaces([t[1] for t in tokens], text)
t_words = []
t_spaces = []
for word, space in zip(words, spaces):
if not word.isspace():
t_words.append(word)
t_spaces.append(space)
doc = Doc(self.vocab, words=t_words, spaces=t_spaces)
matches = self._special_matcher(doc)
spans = [Span(doc, s, e, label=m_id) for m_id, s, e in matches]
spans = util.filter_spans(spans)
# Replace matched tokens with their exceptions
i = 0
final_tokens = []
spans_by_start = {s.start: s for s in spans}
while i < len(tokens):
if i in spans_by_start:
span = spans_by_start[i]
exc = [d[ORTH] for d in special_cases[span.label_]]
for j, orth in enumerate(exc):
final_tokens.append((f"SPECIAL-{j + 1}", self.vocab.strings[orth]))
i += len(span)
else:
final_tokens.append(tokens[i])
i += 1
return final_tokens
def score(self, examples, **kwargs):
validate_examples(examples, "Tokenizer.score")
return Scorer.score_tokenization(examples)
def to_disk(self, path, **kwargs):
"""Save the current state to a directory.
path (str / Path): A path to a directory, which will be created if
it doesn't exist.
exclude (list): String names of serialization fields to exclude.
DOCS: https://spacy.io/api/tokenizer#to_disk
"""
path = util.ensure_path(path)
with path.open("wb") as file_:
file_.write(self.to_bytes(**kwargs))
def from_disk(self, path, *, exclude=tuple()):
"""Loads state from a directory. Modifies the object in place and
returns it.
path (str / Path): A path to a directory.
exclude (list): String names of serialization fields to exclude.
RETURNS (Tokenizer): The modified `Tokenizer` object.
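EXAMPLE (illustrative sketch; the file name is arbitrary):
>>> nlp.tokenizer.to_disk("/tmp/tokenizer.bin")
>>> nlp.tokenizer.from_disk("/tmp/tokenizer.bin")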
DOCS: https://spacy.io/api/tokenizer#from_disk
"""
path = util.ensure_path(path)
with path.open("rb") as file_:
bytes_data = file_.read()
self.from_bytes(bytes_data, exclude=exclude)
return self
def to_bytes(self, *, exclude=tuple()):
"""Serialize the current state to a binary string.
exclude (list): String names of serialization fields to exclude.
RETURNS (bytes): The serialized form of the `Tokenizer` object.
DOCS: https://spacy.io/api/tokenizer#to_bytes
"""
serializers = {
"vocab": lambda: self.vocab.to_bytes(exclude=exclude),
"prefix_search": lambda: _get_regex_pattern(self.prefix_search),
"suffix_search": lambda: _get_regex_pattern(self.suffix_search),
"infix_finditer": lambda: _get_regex_pattern(self.infix_finditer),
"token_match": lambda: _get_regex_pattern(self.token_match),
"url_match": lambda: _get_regex_pattern(self.url_match),
"exceptions": lambda: dict(sorted(self._rules.items()))
}
return util.to_bytes(serializers, exclude)
def from_bytes(self, bytes_data, *, exclude=tuple()):
"""Load state from a binary string.
bytes_data (bytes): The data to load from.
exclude (list): String names of serialization fields to exclude.
RETURNS (Tokenizer): The `Tokenizer` object.
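EXAMPLE (illustrative sketch):
>>> tokenizer_bytes = nlp.tokenizer.to_bytes()
>>> nlp.tokenizer.from_bytes(tokenizer_bytes)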
DOCS: https://spacy.io/api/tokenizer#from_bytes
"""
data = {}
deserializers = {
"vocab": lambda b: self.vocab.from_bytes(b, exclude=exclude),
"prefix_search": lambda b: data.setdefault("prefix_search", b),
"suffix_search": lambda b: data.setdefault("suffix_search", b),
"infix_finditer": lambda b: data.setdefault("infix_finditer", b),
"token_match": lambda b: data.setdefault("token_match", b),
"url_match": lambda b: data.setdefault("url_match", b),
"exceptions": lambda b: data.setdefault("rules", b)
}
# reset all properties and flush all caches (through rules),
# reset rules first so that _reload_special_cases is trivial/fast as
# the other properties are reset
self.rules = {}
self.prefix_search = None
self.suffix_search = None
self.infix_finditer = None
self.token_match = None
self.url_match = None
msg = util.from_bytes(bytes_data, deserializers, exclude)
if "prefix_search" in data and isinstance(data["prefix_search"], str):
self.prefix_search = re.compile(data["prefix_search"]).search
if "suffix_search" in data and isinstance(data["suffix_search"], str):
self.suffix_search = re.compile(data["suffix_search"]).search
if "infix_finditer" in data and isinstance(data["infix_finditer"], str):
self.infix_finditer = re.compile(data["infix_finditer"]).finditer
if "token_match" in data and isinstance(data["token_match"], str):
self.token_match = re.compile(data["token_match"]).match
if "url_match" in data and isinstance(data["url_match"], str):
self.url_match = re.compile(data["url_match"]).match
if "rules" in data and isinstance(data["rules"], dict):
self.rules = data["rules"]
return self
def _get_regex_pattern(regex):
"""Get a pattern string for a regex, or None if the pattern is None."""
return None if regex is None else regex.__self__.pattern
cdef extern from "<algorithm>" namespace "std" nogil:
void stdsort "sort"(vector[SpanC].iterator,
vector[SpanC].iterator,
bint (*)(SpanC, SpanC))
cdef bint len_start_cmp(SpanC a, SpanC b) nogil:
if a.end - a.start == b.end - b.start:
return b.start < a.start
return a.end - a.start < b.end - b.start
cdef bint start_cmp(SpanC a, SpanC b) nogil:
return a.start < b.start
from typing import TYPE_CHECKING
from typing import Optional, Any, Iterable, Dict, Callable, Sequence, List
from .compat import Protocol, runtime_checkable
from thinc.api import Optimizer, Model
if TYPE_CHECKING:
from .language import Language  # noqa: F401
from .training import Example
@runtime_checkable
class TrainableComponent(Protocol):
model: Any
is_trainable: bool
def update(
self,
examples: Iterable["Example"],
*,
drop: float = 0.0,
sgd: Optional[Optimizer] = None,
losses: Optional[Dict[str, float]] = None
) -> Dict[str, float]:
...
def finish_update(self, sgd: Optimizer) -> None:
...
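# Illustrative usage sketch (assumes an `nlp` pipeline with a trainable
# "ner" component): these protocols are runtime-checkable, so e.g.
# isinstance(nlp.get_pipe("ner"), TrainableComponent) can be used to test
# whether a pipe exposes update()/finish_update().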
@runtime_checkable
class InitializableComponent(Protocol):
def initialize(
self,
get_examples: Callable[[], Iterable["Example"]],
nlp: "Language",
**kwargs: Any
):
...
@runtime_checkable
class ListenedToComponent(Protocol):
model: Any
listeners: Sequence[Model]
listener_map: Dict[str, Sequence[Model]]
listening_components: List[str]
def add_listener(self, listener: Model, component_name: str) -> None:
...
def remove_listener(self, listener: Model, component_name: str) -> bool:
...
def find_listeners(self, component) -> None:
...
from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t, int32_t
from libc.stdint cimport uint8_t
ctypedef float weight_t
ctypedef uint64_t hash_t
ctypedef uint64_t class_t
ctypedef uint64_t attr_t
ctypedef uint64_t flags_t
ctypedef uint16_t len_t
ctypedef uint16_t tag_t
from typing import List, Mapping, NoReturn, Union, Dict, Any, Set
from typing import Optional, Iterable, Callable, Tuple, Type
from typing import Iterator, Type, Pattern, Generator, TYPE_CHECKING
from types import ModuleType
import os
import importlib
import importlib.util
import re
from pathlib import Path
import thinc
from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer
from thinc.api import ConfigValidationError, Model
import functools
import itertools
import numpy.random
import numpy
import srsly
import catalogue
from catalogue import RegistryError, Registry
import langcodes
import sys
import warnings
from packaging.specifiers import SpecifierSet, InvalidSpecifier
from packaging.version import Version, InvalidVersion
from packaging.requirements import Requirement
import subprocess
from contextlib import contextmanager
from collections import defaultdict
import tempfile
import shutil
import shlex
import inspect
import pkgutil
import logging
try:
import cupy.random
except ImportError:
cupy = None
# These are functions that were previously (v2.x) available from spacy.util
# and have since moved to Thinc. We're importing them here so people's code
# doesn't break, but they should always be imported from Thinc from now on,
# not from spacy.util.
from thinc.api import fix_random_seed, compounding, decaying # noqa: F401
from .symbols import ORTH
from .compat import cupy, CudaStream, is_windows, importlib_metadata
from .errors import Errors, Warnings, OLD_MODEL_SHORTCUTS
from . import about
if TYPE_CHECKING:
# This lets us add type hints for mypy etc. without causing circular imports
from .language import Language # noqa: F401
from .pipeline import Pipe # noqa: F401
from .tokens import Doc, Span # noqa: F401
from .vocab import Vocab # noqa: F401
# fmt: off
OOV_RANK = numpy.iinfo(numpy.uint64).max
DEFAULT_OOV_PROB = -20
LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"]
# Default order of sections in the config.cfg. Not all sections need to exist,
# and additional sections are added at the end, in alphabetical order.
CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "corpora", "training", "pretraining", "initialize"]
# fmt: on
logger = logging.getLogger("spacy")
logger_stream_handler = logging.StreamHandler()
logger_stream_handler.setFormatter(
logging.Formatter("[%(asctime)s] [%(levelname)s] %(message)s")
)
logger.addHandler(logger_stream_handler)
class ENV_VARS:
CONFIG_OVERRIDES = "SPACY_CONFIG_OVERRIDES"
PROJECT_USE_GIT_VERSION = "SPACY_PROJECT_USE_GIT_VERSION"
class registry(thinc.registry):
languages = catalogue.create("spacy", "languages", entry_points=True)
architectures = catalogue.create("spacy", "architectures", entry_points=True)
tokenizers = catalogue.create("spacy", "tokenizers", entry_points=True)
lemmatizers = catalogue.create("spacy", "lemmatizers", entry_points=True)
lookups = catalogue.create("spacy", "lookups", entry_points=True)
displacy_colors = catalogue.create("spacy", "displacy_colors", entry_points=True)
misc = catalogue.create("spacy", "misc", entry_points=True)
# Callback functions used to manipulate nlp object etc.
callbacks = catalogue.create("spacy", "callbacks", entry_points=True)
batchers = catalogue.create("spacy", "batchers", entry_points=True)
readers = catalogue.create("spacy", "readers", entry_points=True)
augmenters = catalogue.create("spacy", "augmenters", entry_points=True)
loggers = catalogue.create("spacy", "loggers", entry_points=True)
scorers = catalogue.create("spacy", "scorers", entry_points=True)
# These are factories registered via third-party packages and the
# spacy_factories entry point. This registry only exists so we can easily
# load them via the entry points. The "true" factories are added via the
# Language.factory decorator (in the spaCy code base and user code) and those
# are the factories used to initialize components via registry.resolve.
_entry_point_factories = catalogue.create("spacy", "factories", entry_points=True)
factories = catalogue.create("spacy", "internal_factories")
# This is mostly used to get a list of all installed models in the current
# environment. spaCy models packaged with `spacy package` will "advertise"
# themselves via entry points.
models = catalogue.create("spacy", "models", entry_points=True)
cli = catalogue.create("spacy", "cli", entry_points=True)
@classmethod
def get_registry_names(cls) -> List[str]:
"""List all available registries."""
names = []
for name, value in inspect.getmembers(cls):
if not name.startswith("_") and isinstance(value, Registry):
names.append(name)
return sorted(names)
@classmethod
def get(cls, registry_name: str, func_name: str) -> Callable:
"""Get a registered function from the registry."""
# We're overwriting this classmethod so we're able to provide more
# specific error messages and implement a fallback to spacy-legacy.
if not hasattr(cls, registry_name):
names = ", ".join(cls.get_registry_names()) or "none"
raise RegistryError(Errors.E892.format(name=registry_name, available=names))
reg = getattr(cls, registry_name)
try:
func = reg.get(func_name)
except RegistryError:
if func_name.startswith("spacy."):
legacy_name = func_name.replace("spacy.", "spacy-legacy.")
try:
return reg.get(legacy_name)
except catalogue.RegistryError:
pass
available = ", ".join(sorted(reg.get_all().keys())) or "none"
raise RegistryError(
Errors.E893.format(
name=func_name, reg_name=registry_name, available=available
)
) from None
return func
@classmethod
def find(cls, registry_name: str, func_name: str) -> Callable:
"""Get info about a registered function from the registry."""
# We're overwriting this classmethod so we're able to provide more
# specific error messages and implement a fallback to spacy-legacy.
if not hasattr(cls, registry_name):
names = ", ".join(cls.get_registry_names()) or "none"
raise RegistryError(Errors.E892.format(name=registry_name, available=names))
reg = getattr(cls, registry_name)
try:
func_info = reg.find(func_name)
except RegistryError:
if func_name.startswith("spacy."):
legacy_name = func_name.replace("spacy.", "spacy-legacy.")
try:
return reg.find(legacy_name)
except catalogue.RegistryError:
pass
available = ", ".join(sorted(reg.get_all().keys())) or "none"
raise RegistryError(
Errors.E893.format(
name=func_name, reg_name=registry_name, available=available
)
) from None
return func_info
@classmethod
def has(cls, registry_name: str, func_name: str) -> bool:
"""Check whether a function is available in a registry."""
if not hasattr(cls, registry_name):
return False
reg = getattr(cls, registry_name)
if func_name.startswith("spacy."):
legacy_name = func_name.replace("spacy.", "spacy-legacy.")
return func_name in reg or legacy_name in reg
return func_name in reg
class SimpleFrozenDict(dict):
"""Simplified implementation of a frozen dict, mainly used as default
function or method argument (for arguments that should default to empty
dictionary). Will raise an error if the user or spaCy attempts to add to the dict.
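EXAMPLE (illustrative):
>>> frozen = SimpleFrozenDict(foo="bar")
>>> frozen["baz"] = 1  # raises NotImplementedError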
"""
def __init__(self, *args, error: str = Errors.E095, **kwargs) -> None:
"""Initialize the frozen dict. Can be initialized with pre-defined
values.
error (str): The error message when the user tries to assign to the dict.
"""
super().__init__(*args, **kwargs)
self.error = error
def __setitem__(self, key, value):
raise NotImplementedError(self.error)
def pop(self, key, default=None):
raise NotImplementedError(self.error)
def update(self, other):
raise NotImplementedError(self.error)
class SimpleFrozenList(list):
"""Wrapper class around a list that lets us raise custom errors if certain
attributes/methods are accessed. Mostly used for properties like
Language.pipeline that return an immutable list (and that we don't want to
convert to a tuple, to avoid breaking too much backwards compatibility). If a
user accidentally calls nlp.pipeline.append(), we can raise a more helpful error.
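EXAMPLE (illustrative):
>>> pipeline = SimpleFrozenList(["tagger", "parser"])
>>> pipeline.append("ner")  # raises NotImplementedError with Errors.E927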
"""
def __init__(self, *args, error: str = Errors.E927) -> None:
"""Initialize the frozen list.
error (str): The error message when the user tries to mutate the list.
"""
self.error = error
super().__init__(*args)
def append(self, *args, **kwargs):
raise NotImplementedError(self.error)
def clear(self, *args, **kwargs):
raise NotImplementedError(self.error)
def extend(self, *args, **kwargs):
raise NotImplementedError(self.error)
def insert(self, *args, **kwargs):
raise NotImplementedError(self.error)
def pop(self, *args, **kwargs):
raise NotImplementedError(self.error)
def remove(self, *args, **kwargs):
raise NotImplementedError(self.error)
def reverse(self, *args, **kwargs):
raise NotImplementedError(self.error)
def sort(self, *args, **kwargs):
raise NotImplementedError(self.error)
def lang_class_is_loaded(lang: str) -> bool:
"""Check whether a Language class is already loaded. Language classes are
loaded lazily, to avoid expensive setup code associated with the language
data.
lang (str): Two-letter language code, e.g. 'en'.
RETURNS (bool): Whether a Language class has been loaded.
"""
return lang in registry.languages
def find_matching_language(lang: str) -> Optional[str]:
"""
Given an IETF language code, find a supported spaCy language that is a
close match for it (according to Unicode CLDR language-matching rules).
This allows for language aliases, ISO 639-2 codes, more detailed language
tags, and close matches.
Returns the language code if a matching language is available, or None
if there is no matching language.
>>> find_matching_language('en')
'en'
>>> find_matching_language('pt-BR') # Brazilian Portuguese
'pt'
>>> find_matching_language('fra') # an ISO 639-2 code for French
'fr'
>>> find_matching_language('iw') # obsolete alias for Hebrew
'he'
>>> find_matching_language('no') # Norwegian
'nb'
>>> find_matching_language('mo') # old code for ro-MD
'ro'
>>> find_matching_language('zh-Hans') # Simplified Chinese
'zh'
>>> find_matching_language('zxx')
None
"""
import spacy.lang # noqa: F401
if lang == "xx":
return "xx"
# Find out which language modules we have
possible_languages = []
for modinfo in pkgutil.iter_modules(spacy.lang.__path__): # type: ignore
code = modinfo.name
if code == "xx":
# Temporarily make 'xx' into a valid language code
possible_languages.append("mul")
elif langcodes.tag_is_valid(code):
possible_languages.append(code)
# Distances from 1-9 allow near misses like Bosnian -> Croatian and
# Norwegian -> Norwegian Bokmål. A distance of 10 would include several
# more possibilities, like variants of Chinese like 'wuu', but text that
# is labeled that way is probably trying to be distinct from 'zh' and
# shouldn't automatically match.
match = langcodes.closest_supported_match(lang, possible_languages, max_distance=9)
if match == "mul":
# Convert 'mul' back to spaCy's 'xx'
return "xx"
else:
return match
def get_lang_class(lang: str) -> Type["Language"]:
"""Import and load a Language class.
lang (str): IETF language code, such as 'en'.
RETURNS (Language): Language class.
"""
# Check if language is registered / entry point is available
if lang in registry.languages:
return registry.languages.get(lang)
else:
# Find the language in the spacy.lang subpackage
try:
module = importlib.import_module(f".lang.{lang}", "spacy")
except ImportError as err:
# Find a matching language. For example, if the language 'no' is
# requested, we can use language-matching to load `spacy.lang.nb`.
try:
match = find_matching_language(lang)
except langcodes.tag_parser.LanguageTagError:
# proceed to raising an import error
match = None
if match:
lang = match
module = importlib.import_module(f".lang.{lang}", "spacy")
else:
raise ImportError(Errors.E048.format(lang=lang, err=err)) from err
set_lang_class(lang, getattr(module, module.__all__[0])) # type: ignore[attr-defined]
return registry.languages.get(lang)
def set_lang_class(name: str, cls: Type["Language"]) -> None:
"""Set a custom Language class name that can be loaded via get_lang_class.
name (str): Name of Language class.
cls (Language): Language class.
"""
registry.languages.register(name, func=cls)
def ensure_path(path: Any) -> Any:
"""Ensure string is converted to a Path.
path (Any): Anything. If string, it's converted to Path.
RETURNS: Path or original argument.
"""
if isinstance(path, str):
return Path(path)
else:
return path
def load_language_data(path: Union[str, Path]) -> Union[dict, list]:
"""Load JSON language data using the given path as a base. If the provided
path isn't present, will attempt to load a gzipped version before giving up.
path (str / Path): The data to load.
RETURNS: The loaded data.
"""
path = ensure_path(path)
if path.exists():
return srsly.read_json(path)
path = path.with_suffix(path.suffix + ".gz")
if path.exists():
return srsly.read_gzip_json(path)
raise ValueError(Errors.E160.format(path=path))
def get_module_path(module: ModuleType) -> Path:
"""Get the path of a Python module.
module (ModuleType): The Python module.
RETURNS (Path): The path.
"""
if not hasattr(module, "__module__"):
raise ValueError(Errors.E169.format(module=repr(module)))
return Path(sys.modules[module.__module__].__file__).parent
def load_model(
name: Union[str, Path],
*,
vocab: Union["Vocab", bool] = True,
disable: Iterable[str] = SimpleFrozenList(),
exclude: Iterable[str] = SimpleFrozenList(),
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
) -> "Language":
"""Load a model from a package or data path.
name (str): Package name or model path.
vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
a new Vocab object will be created.
disable (Iterable[str]): Names of pipeline components to disable.
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
components won't be loaded.
config (Dict[str, Any] / Config): Config overrides as nested dict or dict
keyed by section values in dot notation.
RETURNS (Language): The loaded nlp object.
"""
kwargs = {"vocab": vocab, "disable": disable, "exclude": exclude, "config": config}
if isinstance(name, str): # name or string path
if name.startswith("blank:"): # shortcut for blank model
return get_lang_class(name.replace("blank:", ""))()
if is_package(name): # installed as package
return load_model_from_package(name, **kwargs) # type: ignore[arg-type]
if Path(name).exists(): # path to model data directory
return load_model_from_path(Path(name), **kwargs) # type: ignore[arg-type]
elif hasattr(name, "exists"): # Path or Path-like to model data
return load_model_from_path(name, **kwargs) # type: ignore[arg-type]
if name in OLD_MODEL_SHORTCUTS:
raise IOError(Errors.E941.format(name=name, full=OLD_MODEL_SHORTCUTS[name])) # type: ignore[index]
raise IOError(Errors.E050.format(name=name))
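# Dispatch sketch for load_model: a "blank:" prefix builds an empty pipeline,
# a package name loads an installed model, and a filesystem path loads model
# data from disk. The package and path in the comments are assumptions used
# only for illustration.
def _example_load_model_dispatch() -> "Language":
    nlp = load_model("blank:en")
    # nlp = load_model("en_core_web_sm")        # installed package (if available)
    # nlp = load_model(Path("/path/to/model"))  # model data directory (if it exists)
    return nlp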
def load_model_from_package(
name: str,
*,
vocab: Union["Vocab", bool] = True,
disable: Iterable[str] = SimpleFrozenList(),
exclude: Iterable[str] = SimpleFrozenList(),
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
) -> "Language":
"""Load a model from an installed package.
name (str): The package name.
vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
a new Vocab object will be created.
disable (Iterable[str]): Names of pipeline components to disable. Disabled
pipes will be loaded but they won't be run unless you explicitly
enable them by calling nlp.enable_pipe.
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
components won't be loaded.
config (Dict[str, Any] / Config): Config overrides as nested dict or dict
keyed by section values in dot notation.
RETURNS (Language): The loaded nlp object.
"""
cls = importlib.import_module(name)
return cls.load(vocab=vocab, disable=disable, exclude=exclude, config=config) # type: ignore[attr-defined]
def load_model_from_path(
model_path: Path,
*,
meta: Optional[Dict[str, Any]] = None,
vocab: Union["Vocab", bool] = True,
disable: Iterable[str] = SimpleFrozenList(),
exclude: Iterable[str] = SimpleFrozenList(),
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
) -> "Language":
"""Load a model from a data directory path. Creates Language class with
pipeline from config.cfg and then calls from_disk() with path.
model_path (Path): Model path.
meta (Dict[str, Any]): Optional model meta.
vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
a new Vocab object will be created.
disable (Iterable[str]): Names of pipeline components to disable. Disabled
pipes will be loaded but they won't be run unless you explicitly
enable them by calling nlp.enable_pipe.
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
components won't be loaded.
config (Dict[str, Any] / Config): Config overrides as nested dict or dict
keyed by section values in dot notation.
RETURNS (Language): The loaded nlp object.
"""
if not model_path.exists():
raise IOError(Errors.E052.format(path=model_path))
if not meta:
meta = get_model_meta(model_path)
config_path = model_path / "config.cfg"
overrides = dict_to_dot(config)
config = load_config(config_path, overrides=overrides)
nlp = load_model_from_config(config, vocab=vocab, disable=disable, exclude=exclude)
return nlp.from_disk(model_path, exclude=exclude, overrides=overrides)
def load_model_from_config(
config: Union[Dict[str, Any], Config],
*,
vocab: Union["Vocab", bool] = True,
disable: Iterable[str] = SimpleFrozenList(),
exclude: Iterable[str] = SimpleFrozenList(),
auto_fill: bool = False,
validate: bool = True,
) -> "Language":
"""Create an nlp object from a config. Expects the full config file including
a section "nlp" containing the settings for the nlp object.
config (Dict[str, Any] / Config): The full config dict or Config, including
an "nlp" section with the pipeline settings.
vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
a new Vocab object will be created.
disable (Iterable[str]): Names of pipeline components to disable. Disabled
pipes will be loaded but they won't be run unless you explicitly
enable them by calling nlp.enable_pipe.
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
components won't be loaded.
auto_fill (bool): Whether to auto-fill config with missing defaults.
validate (bool): Whether to show config validation errors.
RETURNS (Language): The loaded nlp object.
"""
if "nlp" not in config:
raise ValueError(Errors.E985.format(config=config))
nlp_config = config["nlp"]
if "lang" not in nlp_config or nlp_config["lang"] is None:
raise ValueError(Errors.E993.format(config=nlp_config))
# This will automatically handle all codes registered via the languages
# registry, including custom subclasses provided via entry points
lang_cls = get_lang_class(nlp_config["lang"])
nlp = lang_cls.from_config(
config,
vocab=vocab,
disable=disable,
exclude=exclude,
auto_fill=auto_fill,
validate=validate,
)
return nlp
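# Minimal sketch of creating a pipeline from a config dict; assumes that
# auto_fill=True covers the remaining defaults, so only the language and an
# empty pipeline are given here (similar in spirit to spacy.blank).
def _example_nlp_from_config() -> "Language":
    config = {"nlp": {"lang": "en", "pipeline": []}}
    return load_model_from_config(config, auto_fill=True)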
def get_sourced_components(
config: Union[Dict[str, Any], Config]
) -> Dict[str, Dict[str, Any]]:
"""RETURNS (List[str]): All sourced components in the original config,
e.g. {"source": "en_core_web_sm"}. If the config contains a key
"factory", we assume it refers to a component factory.
"""
return {
name: cfg
for name, cfg in config.get("components", {}).items()
if "factory" not in cfg and "source" in cfg
}
def resolve_dot_names(
config: Config, dot_names: List[Optional[str]]
) -> Tuple[Any, ...]:
"""Resolve one or more "dot notation" names, e.g. corpora.train.
The paths could point anywhere into the config, so we don't know which
top-level section we'll be looking within.
We resolve the whole top-level section, although we could resolve less --
we could find the lowest part of the tree.
"""
# TODO: include schema?
resolved = {}
output: List[Any] = []
errors = []
for name in dot_names:
if name is None:
output.append(name)
else:
section = name.split(".")[0]
# We want to avoid resolving the same thing twice
if section not in resolved:
if registry.is_promise(config[section]):
# Otherwise we can't resolve [corpus] if it's a promise
result = registry.resolve({"config": config[section]})["config"]
else:
result = registry.resolve(config[section])
resolved[section] = result
try:
output.append(dot_to_object(resolved, name)) # type: ignore[arg-type]
except KeyError:
msg = f"not a valid section reference: {name}"
errors.append({"loc": name.split("."), "msg": msg})
if errors:
raise ConfigValidationError(config=config, errors=errors)
return tuple(output)
def load_model_from_init_py(
init_file: Union[Path, str],
*,
vocab: Union["Vocab", bool] = True,
disable: Iterable[str] = SimpleFrozenList(),
exclude: Iterable[str] = SimpleFrozenList(),
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
) -> "Language":
"""Helper function to use in the `load()` method of a model package's
__init__.py.
vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
a new Vocab object will be created.
disable (Iterable[str]): Names of pipeline components to disable. Disabled
pipes will be loaded but they won't be run unless you explicitly
enable them by calling nlp.enable_pipe.
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
components won't be loaded.
config (Dict[str, Any] / Config): Config overrides as nested dict or dict
keyed by section values in dot notation.
RETURNS (Language): The loaded nlp object.
"""
model_path = Path(init_file).parent
meta = get_model_meta(model_path)
data_dir = f"{meta['lang']}_{meta['name']}-{meta['version']}"
data_path = model_path / data_dir
if not model_path.exists():
raise IOError(Errors.E052.format(path=data_path))
return load_model_from_path(
data_path,
vocab=vocab,
meta=meta,
disable=disable,
exclude=exclude,
config=config,
)
def load_config(
path: Union[str, Path],
overrides: Dict[str, Any] = SimpleFrozenDict(),
interpolate: bool = False,
) -> Config:
"""Load a config file. Takes care of path validation and section order.
path (Union[str, Path]): Path to the config file or "-" to read from stdin.
overrides: (Dict[str, Any]): Config overrides as nested dict or
dict keyed by section values in dot notation.
interpolate (bool): Whether to interpolate and resolve variables.
RETURNS (Config): The loaded config.
"""
config_path = ensure_path(path)
config = Config(section_order=CONFIG_SECTION_ORDER)
if str(config_path) == "-": # read from standard input
return config.from_str(
sys.stdin.read(), overrides=overrides, interpolate=interpolate
)
else:
if not config_path or not config_path.exists() or not config_path.is_file():
raise IOError(Errors.E053.format(path=config_path, name="config.cfg"))
return config.from_disk(
config_path, overrides=overrides, interpolate=interpolate
)
def load_config_from_str(
text: str, overrides: Dict[str, Any] = SimpleFrozenDict(), interpolate: bool = False
):
"""Load a full config from a string. Wrapper around Thinc's Config.from_str.
text (str): The string config to load.
overrides (Dict[str, Any]): Config overrides as nested dict or dict keyed
by section values in dot notation.
interpolate (bool): Whether to interpolate and resolve variables.
RETURNS (Config): The loaded config.
"""
return Config(section_order=CONFIG_SECTION_ORDER).from_str(
text, overrides=overrides, interpolate=interpolate
)
def get_installed_models() -> List[str]:
"""List all model packages currently installed in the environment.
RETURNS (List[str]): The string names of the models.
"""
return list(registry.models.get_all().keys())
def get_package_version(name: str) -> Optional[str]:
"""Get the version of an installed package. Typically used to get model
package versions.
name (str): The name of the installed Python package.
RETURNS (str / None): The version or None if package not installed.
"""
try:
return importlib_metadata.version(name) # type: ignore[attr-defined]
except importlib_metadata.PackageNotFoundError: # type: ignore[attr-defined]
return None
def is_compatible_version(
version: str, constraint: str, prereleases: bool = True
) -> Optional[bool]:
"""Check if a version (e.g. "2.0.0") is compatible given a version
constraint (e.g. ">=1.9.0,<2.2.1"). If the constraint is a specific version,
it's interpreted as =={version}.
version (str): The version to check.
constraint (str): The constraint string.
prereleases (bool): Whether to allow prereleases. If set to False,
prerelease versions will be considered incompatible.
RETURNS (bool / None): Whether the version is compatible, or None if the
version or constraint are invalid.
"""
# Handle cases where exact version is provided as constraint
if constraint[0].isdigit():
constraint = f"=={constraint}"
try:
spec = SpecifierSet(constraint)
version = Version(version) # type: ignore[assignment]
except (InvalidSpecifier, InvalidVersion):
return None
spec.prereleases = prereleases
return version in spec
def is_unconstrained_version(
constraint: str, prereleases: bool = True
) -> Optional[bool]:
# An exact version pin is the most constrained case
if constraint[0].isdigit():
return False
try:
spec = SpecifierSet(constraint)
except InvalidSpecifier:
return None
spec.prereleases = prereleases
specs = [sp for sp in spec]
# We only have one version spec and it defines > or >=
if len(specs) == 1 and specs[0].operator in (">", ">="):
return True
# One specifier is exact version
if any(sp.operator == "==" for sp in specs):
return False
has_upper = any(sp.operator in ("<", "<=") for sp in specs)
has_lower = any(sp.operator in (">", ">=") for sp in specs)
# We have a version spec that defines an upper and lower bound
if has_upper and has_lower:
return False
# Everything else, like only an upper version, only a lower version etc.
return True
def split_requirement(requirement: str) -> Tuple[str, str]:
"""Split a requirement like spacy>=1.2.3 into ("spacy", ">=1.2.3")."""
req = Requirement(requirement)
return (req.name, str(req.specifier))
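# Version-constraint sketches for the helpers above; the version strings are
# illustrative.
def _example_version_constraints() -> None:
    assert is_compatible_version("3.2.0", ">=3.0.0,<4.0.0") is True
    assert is_compatible_version("3.2.0", "3.2.0") is True   # exact pin -> ==3.2.0
    assert is_compatible_version("not-a-version", ">=3.0.0") is None
    assert is_unconstrained_version(">=3.0.0") is True        # no upper bound
    assert is_unconstrained_version(">=3.0.0,<3.1.0") is False
    assert split_requirement("spacy>=3.2.0") == ("spacy", ">=3.2.0")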
def get_minor_version_range(version: str) -> str:
"""Generate a version range like >=1.2.3,<1.3.0 based on a given version
(e.g. of spaCy).
"""
release = Version(version).release
return f">={version},<{release[0]}.{release[1] + 1}.0"
def get_model_lower_version(constraint: str) -> Optional[str]:
"""From a version range like >=1.2.3,<1.3.0 return the lower pin."""
try:
specset = SpecifierSet(constraint)
for spec in specset:
if spec.operator in (">=", "==", "~="):
return spec.version
except Exception:
pass
return None
def get_base_version(version: str) -> str:
"""Generate the base version without any prerelease identifiers.
version (str): The version, e.g. "3.0.0.dev1".
RETURNS (str): The base version, e.g. "3.0.0".
"""
return Version(version).base_version
def get_minor_version(version: str) -> Optional[str]:
"""Get the major + minor version (without patch or prerelease identifiers).
version (str): The version.
RETURNS (str): The major + minor version or None if version is invalid.
"""
try:
v = Version(version)
except (TypeError, InvalidVersion):
return None
return f"{v.major}.{v.minor}"
def is_minor_version_match(version_a: str, version_b: str) -> bool:
"""Compare two versions and check if they match in major and minor, without
patch or prerelease identifiers. Used internally for compatibility checks
that should be insensitive to patch releases.
version_a (str): The first version
version_b (str): The second version.
RETURNS (bool): Whether the versions match.
"""
a = get_minor_version(version_a)
b = get_minor_version(version_b)
return a is not None and b is not None and a == b
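# Sketch of the minor-version helpers above; version strings are illustrative.
def _example_minor_versions() -> None:
    assert get_minor_version_range("3.2.0") == ">=3.2.0,<3.3.0"
    assert get_model_lower_version(">=3.2.0,<3.3.0") == "3.2.0"
    assert get_base_version("3.2.0.dev1") == "3.2.0"
    assert get_minor_version("3.2.1") == "3.2"
    assert is_minor_version_match("3.2.1", "3.2.4") is True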
def load_meta(path: Union[str, Path]) -> Dict[str, Any]:
"""Load a model meta.json from a path and validate its contents.
path (Union[str, Path]): Path to meta.json.
RETURNS (Dict[str, Any]): The loaded meta.
"""
path = ensure_path(path)
if not path.parent.exists():
raise IOError(Errors.E052.format(path=path.parent))
if not path.exists() or not path.is_file():
raise IOError(Errors.E053.format(path=path.parent, name="meta.json"))
meta = srsly.read_json(path)
for setting in ["lang", "name", "version"]:
if setting not in meta or not meta[setting]:
raise ValueError(Errors.E054.format(setting=setting))
if "spacy_version" in meta:
if not is_compatible_version(about.__version__, meta["spacy_version"]):
lower_version = get_model_lower_version(meta["spacy_version"])
lower_version = get_minor_version(lower_version) # type: ignore[arg-type]
if lower_version is not None:
lower_version = "v" + lower_version
elif "spacy_git_version" in meta:
lower_version = "git commit " + meta["spacy_git_version"]
else:
lower_version = "version unknown"
warn_msg = Warnings.W095.format(
model=f"{meta['lang']}_{meta['name']}",
model_version=meta["version"],
version=lower_version,
current=about.__version__,
)
warnings.warn(warn_msg)
if is_unconstrained_version(meta["spacy_version"]):
warn_msg = Warnings.W094.format(
model=f"{meta['lang']}_{meta['name']}",
model_version=meta["version"],
version=meta["spacy_version"],
example=get_minor_version_range(about.__version__),
)
warnings.warn(warn_msg)
return meta
def get_model_meta(path: Union[str, Path]) -> Dict[str, Any]:
"""Get model meta.json from a directory path and validate its contents.
path (str / Path): Path to model directory.
RETURNS (Dict[str, Any]): The model's meta data.
"""
model_path = ensure_path(path)
return load_meta(model_path / "meta.json")
def is_package(name: str) -> bool:
"""Check if string maps to a package installed via pip.
name (str): Name of package.
RETURNS (bool): True if installed package, False if not.
"""
try:
importlib_metadata.distribution(name) # type: ignore[attr-defined]
return True
except: # noqa: E722
return False
def get_package_path(name: str) -> Path:
"""Get the path to an installed package.
name (str): Package name.
RETURNS (Path): Path to installed package.
"""
name = name.lower() # use lowercase version to be safe
# Here we're importing the module just to find it. This is worryingly
# indirect, but it's otherwise very difficult to find the package.
pkg = importlib.import_module(name)
return Path(pkg.__file__).parent
def replace_model_node(model: Model, target: Model, replacement: Model) -> None:
"""Replace a node within a model with a new one, updating refs.
model (Model): The parent model.
target (Model): The target node.
replacement (Model): The node to replace the target with.
"""
# Place the node into the sublayers
for node in model.walk():
if target in node.layers:
node.layers[node.layers.index(target)] = replacement
# Now fix any node references
for node in model.walk():
for ref_name in node.ref_names:
if node.maybe_get_ref(ref_name) is target:
node.set_ref(ref_name, replacement)
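# Sketch of swapping a node inside a composed Thinc model; the layer sizes
# and layer choice are illustrative.
def _example_replace_model_node() -> None:
    from thinc.api import Linear, chain

    old_node = Linear(2, 2)
    new_node = Linear(2, 2)
    model = chain(Linear(2, 2), old_node)
    replace_model_node(model, old_node, new_node)
    assert new_node in model.layers and old_node not in model.layers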
def split_command(command: str) -> List[str]:
"""Split a string command using shlex. Handles platform compatibility.
command (str) : The command to split
RETURNS (List[str]): The split command.
"""
return shlex.split(command, posix=not is_windows)
def join_command(command: List[str]) -> str:
"""Join a command using shlex. shlex.join is only available for Python 3.8+,
so we're using a workaround here.
command (List[str]): The command to join.
RETURNS (str): The joined command
"""
return " ".join(shlex.quote(cmd) for cmd in command)
def run_command(
command: Union[str, List[str]],
*,
stdin: Optional[Any] = None,
capture: bool = False,
) -> subprocess.CompletedProcess:
"""Run a command on the command line as a subprocess. If the subprocess
returns a non-zero exit code, a system exit is performed.
command (str / List[str]): The command. If provided as a string, the
string will be split using shlex.split.
stdin (Optional[Any]): stdin to read from or None.
capture (bool): Whether to capture the output and errors. If False,
the stdout and stderr will not be redirected, and if there's an error,
sys.exit will be called with the return code. You should use capture=False
when you want to turn over execution to the command, and capture=True
when you want to run the command more like a function.
RETURNS (Optional[CompletedProcess]): The process object.
"""
if isinstance(command, str):
cmd_list = split_command(command)
cmd_str = command
else:
cmd_list = command
cmd_str = " ".join(command)
try:
ret = subprocess.run(
cmd_list,
env=os.environ.copy(),
input=stdin,
encoding="utf8",
check=False,
stdout=subprocess.PIPE if capture else None,
stderr=subprocess.STDOUT if capture else None,
)
except FileNotFoundError:
# Indicates the *command* wasn't found, it's an error before the command
# is run.
raise FileNotFoundError(
Errors.E970.format(str_command=cmd_str, tool=cmd_list[0])
) from None
if ret.returncode != 0 and capture:
message = f"Error running command:\n\n{cmd_str}\n\n"
message += f"Subprocess exited with status {ret.returncode}"
if ret.stdout is not None:
message += f"\n\nProcess log (stdout and stderr):\n\n"
message += ret.stdout
error = subprocess.SubprocessError(message)
error.ret = ret # type: ignore[attr-defined]
error.command = cmd_str # type: ignore[attr-defined]
raise error
elif ret.returncode != 0:
sys.exit(ret.returncode)
return ret
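# Usage sketch: with capture=True the output is available on ret.stdout and a
# non-zero exit code raises instead of calling sys.exit. The command is a
# harmless illustration.
def _example_run_command() -> subprocess.CompletedProcess:
    return run_command("python --version", capture=True)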
@contextmanager
def working_dir(path: Union[str, Path]) -> Iterator[Path]:
"""Change current working directory and returns to previous on exit.
path (str / Path): The directory to navigate to.
YIELDS (Path): The absolute path to the current working directory. This
should be used if the block needs to perform actions within the working
directory, to prevent mismatches with relative paths.
"""
prev_cwd = Path.cwd()
current = Path(path).resolve()
os.chdir(str(current))
try:
yield current
finally:
os.chdir(str(prev_cwd))
@contextmanager
def make_tempdir() -> Generator[Path, None, None]:
"""Execute a block in a temporary directory and remove the directory and
its contents at the end of the with block.
YIELDS (Path): The path of the temp directory.
"""
d = Path(tempfile.mkdtemp())
yield d
try:
shutil.rmtree(str(d))
except PermissionError as e:
warnings.warn(Warnings.W091.format(dir=d, msg=e))
def is_cwd(path: Union[Path, str]) -> bool:
"""Check whether a path is the current working directory.
path (Union[Path, str]): The directory path.
RETURNS (bool): Whether the path is the current working directory.
"""
return str(Path(path).resolve()).lower() == str(Path.cwd().resolve()).lower()
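# Sketch combining the directory helpers above: run a block inside a fresh
# temporary directory and restore the previous working directory afterwards.
def _example_directory_helpers() -> None:
    with make_tempdir() as tmp_dir:
        with working_dir(tmp_dir) as cwd:
            assert is_cwd(cwd)
            (cwd / "output.txt").write_text("example", encoding="utf8")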
def is_in_jupyter() -> bool:
"""Check if user is running spaCy from a Jupyter notebook by detecting the
IPython kernel. Mainly used for the displaCy visualizer.
RETURNS (bool): True if in Jupyter, False if not.
"""
# https://stackoverflow.com/a/39662359/6400719
try:
shell = get_ipython().__class__.__name__ # type: ignore[name-defined]
if shell == "ZMQInteractiveShell":
return True # Jupyter notebook or qtconsole
except NameError:
return False # Probably standard Python interpreter
return False
def get_object_name(obj: Any) -> str:
"""Get a human-readable name of a Python object, e.g. a pipeline component.
obj (Any): The Python object, typically a function or class.
RETURNS (str): A human-readable name.
"""
if hasattr(obj, "name") and obj.name is not None:
return obj.name
if hasattr(obj, "__name__"):
return obj.__name__
if hasattr(obj, "__class__") and hasattr(obj.__class__, "__name__"):
return obj.__class__.__name__
return repr(obj)
def is_same_func(func1: Callable, func2: Callable) -> bool:
"""Approximately decide whether two functions are the same, even if their
identity is different (e.g. after they have been live reloaded). Mostly
used in the @Language.component and @Language.factory decorators to decide
whether to raise if a factory already exists. Allows decorator to run
multiple times with the same function.
func1 (Callable): The first function.
func2 (Callable): The second function.
RETURNS (bool): Whether it's the same function (most likely).
"""
if not callable(func1) or not callable(func2):
return False
if not hasattr(func1, "__qualname__") or not hasattr(func2, "__qualname__"):
return False
same_name = func1.__qualname__ == func2.__qualname__
same_file = inspect.getfile(func1) == inspect.getfile(func2)
same_code = inspect.getsourcelines(func1) == inspect.getsourcelines(func2)
return same_name and same_file and same_code
def get_cuda_stream(
require: bool = False, non_blocking: bool = True
) -> Optional[CudaStream]:
ops = get_current_ops()
if CudaStream is None:
return None
elif isinstance(ops, NumpyOps):
return None
else:
return CudaStream(non_blocking=non_blocking)
def get_async(stream, numpy_array):
if cupy is None:
return numpy_array
else:
array = cupy.ndarray(numpy_array.shape, order="C", dtype=numpy_array.dtype)
array.set(numpy_array, stream=stream)
return array
def read_regex(path: Union[str, Path]) -> Pattern:
"""Read a file of newline-delimited regex pieces and compile them into a
single anchored regex.
path (str / Path): Path to the file.
RETURNS (Pattern): The compiled regex.
"""
path = ensure_path(path)
with path.open(encoding="utf8") as file_:
entries = file_.read().split("\n")
expression = "|".join(
["^" + re.escape(piece) for piece in entries if piece.strip()]
)
return re.compile(expression)
def compile_prefix_regex(entries: Iterable[Union[str, Pattern]]) -> Pattern:
"""Compile a sequence of prefix rules into a regex object.
entries (Iterable[Union[str, Pattern]]): The prefix rules, e.g.
spacy.lang.punctuation.TOKENIZER_PREFIXES.
RETURNS (Pattern): The regex object to be used for Tokenizer.prefix_search.
"""
expression = "|".join(["^" + piece for piece in entries if piece.strip()]) # type: ignore[operator, union-attr]
return re.compile(expression)
def compile_suffix_regex(entries: Iterable[Union[str, Pattern]]) -> Pattern:
"""Compile a sequence of suffix rules into a regex object.
entries (Iterable[Union[str, Pattern]]): The suffix rules, e.g.
spacy.lang.punctuation.TOKENIZER_SUFFIXES.
RETURNS (Pattern): The regex object to be used for Tokenizer.suffix_search.
"""
expression = "|".join([piece + "$" for piece in entries if piece.strip()]) # type: ignore[operator, union-attr]
return re.compile(expression)
def compile_infix_regex(entries: Iterable[Union[str, Pattern]]) -> Pattern:
"""Compile a sequence of infix rules into a regex object.
entries (Iterable[Union[str, Pattern]]): The infix rules, e.g.
spacy.lang.punctuation.TOKENIZER_INFIXES.
RETURNS (Pattern): The regex object to be used for Tokenizer.infix_finditer.
"""
expression = "|".join([piece for piece in entries if piece.strip()]) # type: ignore[misc, union-attr]
return re.compile(expression)
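# Sketch of compiling tokenizer punctuation rules; the rules below are small
# illustrative stand-ins for spacy.lang.punctuation.
def _example_compile_punct_rules() -> None:
    prefix_re = compile_prefix_regex([r"\(", '"'])
    suffix_re = compile_suffix_regex([r"\)", "!", '"'])
    infix_re = compile_infix_regex(["--", r"(?<=[0-9])[+\-\*^](?=[0-9])"])
    assert prefix_re.search('"hello') is not None
    assert suffix_re.search("world!") is not None
    assert infix_re.search("2+2") is not None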
def add_lookups(default_func: Callable[[str], Any], *lookups) -> Callable[[str], Any]:
"""Extend an attribute function with special cases. If a word is in the
lookups, the value is returned. Otherwise the previous function is used.
default_func (callable): The default function to execute.
*lookups (dict): Lookup dictionary mapping string to attribute value.
RETURNS (callable): Lexical attribute getter.
"""
# This is implemented as functools.partial instead of a closure, to allow
# pickle to work.
return functools.partial(_get_attr_unless_lookup, default_func, lookups)
def _get_attr_unless_lookup(
default_func: Callable[[str], Any], lookups: Dict[str, Any], string: str
) -> Any:
for lookup in lookups:
if string in lookup:
return lookup[string] # type: ignore[index]
return default_func(string)
def update_exc(
base_exceptions: Dict[str, List[dict]], *addition_dicts
) -> Dict[str, List[dict]]:
"""Update and validate tokenizer exceptions. Will overwrite exceptions.
base_exceptions (Dict[str, List[dict]]): Base exceptions.
*addition_dicts (Dict[str, List[dict]]): Exceptions to add to the base dict, in order.
RETURNS (Dict[str, List[dict]]): Combined tokenizer exceptions.
"""
exc = dict(base_exceptions)
for additions in addition_dicts:
for orth, token_attrs in additions.items():
if not all(isinstance(attr[ORTH], str) for attr in token_attrs):
raise ValueError(Errors.E055.format(key=orth, orths=token_attrs))
described_orth = "".join(attr[ORTH] for attr in token_attrs)
if orth != described_orth:
raise ValueError(Errors.E056.format(key=orth, orths=described_orth))
exc.update(additions)
exc = expand_exc(exc, "'", "’")
return exc
def expand_exc(
excs: Dict[str, List[dict]], search: str, replace: str
) -> Dict[str, List[dict]]:
"""Find string in tokenizer exceptions, duplicate entry and replace string.
For example, to add additional versions with typographic apostrophes.
excs (Dict[str, List[dict]]): Tokenizer exceptions.
search (str): String to find and replace.
replace (str): Replacement.
RETURNS (Dict[str, List[dict]]): Combined tokenizer exceptions.
"""
def _fix_token(token, search, replace):
fixed = dict(token)
fixed[ORTH] = fixed[ORTH].replace(search, replace)
return fixed
new_excs = dict(excs)
for token_string, tokens in excs.items():
if search in token_string:
new_key = token_string.replace(search, replace)
new_value = [_fix_token(t, search, replace) for t in tokens]
new_excs[new_key] = new_value
return new_excs
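# Sketch of combining tokenizer exceptions; expand_exc (called by update_exc)
# also duplicates entries using the typographic apostrophe.
def _example_tokenizer_exceptions() -> None:
    base = {"a.": [{ORTH: "a."}]}
    additions = {"don't": [{ORTH: "do"}, {ORTH: "n't"}]}
    exc = update_exc(base, additions)
    assert "don't" in exc and "don’t" in exc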
def normalize_slice(
length: int, start: int, stop: int, step: Optional[int] = None
) -> Tuple[int, int]:
if not (step is None or step == 1):
raise ValueError(Errors.E057)
if start is None:
start = 0
elif start < 0:
start += length
start = min(length, max(0, start))
if stop is None:
stop = length
elif stop < 0:
stop += length
stop = min(length, max(start, stop))
return start, stop
def filter_spans(spans: Iterable["Span"]) -> List["Span"]:
"""Filter a sequence of spans and remove duplicates or overlaps. Useful for
creating named entities (where one token can only be part of one entity) or
when merging spans with `Retokenizer.merge`. When spans overlap, the (first)
longest span is preferred over shorter spans.
spans (Iterable[Span]): The spans to filter.
RETURNS (List[Span]): The filtered spans.
"""
get_sort_key = lambda span: (span.end - span.start, -span.start)
sorted_spans = sorted(spans, key=get_sort_key, reverse=True)
result = []
seen_tokens: Set[int] = set()
for span in sorted_spans:
# Check for end - 1 here because boundaries are inclusive
if span.start not in seen_tokens and span.end - 1 not in seen_tokens:
result.append(span)
seen_tokens.update(range(span.start, span.end))
result = sorted(result, key=lambda span: span.start)
return result
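# Sketch of filtering overlapping spans; assumes spacy is importable so a
# blank English pipeline can produce a Doc.
def _example_filter_spans() -> List["Span"]:
    import spacy

    doc = spacy.blank("en")("New York City is in New York State")
    spans = [doc[0:3], doc[0:2], doc[5:7]]
    # keeps the longest span for each overlap: doc[0:3] and doc[5:7]
    return filter_spans(spans)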
def to_bytes(getters: Dict[str, Callable[[], bytes]], exclude: Iterable[str]) -> bytes:
return srsly.msgpack_dumps(to_dict(getters, exclude))
def from_bytes(
bytes_data: bytes,
setters: Dict[str, Callable[[bytes], Any]],
exclude: Iterable[str],
) -> None:
return from_dict(srsly.msgpack_loads(bytes_data), setters, exclude) # type: ignore[return-value]
def to_dict(
getters: Dict[str, Callable[[], Any]], exclude: Iterable[str]
) -> Dict[str, Any]:
serialized = {}
for key, getter in getters.items():
# Split to support file names like meta.json
if key.split(".")[0] not in exclude:
serialized[key] = getter()
return serialized
def from_dict(
msg: Dict[str, Any],
setters: Dict[str, Callable[[Any], Any]],
exclude: Iterable[str],
) -> Dict[str, Any]:
for key, setter in setters.items():
# Split to support file names like meta.json
if key.split(".")[0] not in exclude and key in msg:
setter(msg[key])
return msg
def to_disk(
path: Union[str, Path],
writers: Dict[str, Callable[[Path], None]],
exclude: Iterable[str],
) -> Path:
path = ensure_path(path)
if not path.exists():
path.mkdir()
for key, writer in writers.items():
# Split to support file names like meta.json
if key.split(".")[0] not in exclude:
writer(path / key)
return path
def from_disk(
path: Union[str, Path],
readers: Dict[str, Callable[[Path], None]],
exclude: Iterable[str],
) -> Path:
path = ensure_path(path)
for key, reader in readers.items():
# Split to support file names like meta.json
if key.split(".")[0] not in exclude:
reader(path / key)
return path
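# Sketch of the byte-serialization helpers above: getters produce bytes per
# named section, and excluded sections are skipped.
def _example_serialization_helpers() -> None:
    getters = {"data": lambda: b"payload", "skipped": lambda: b"ignored"}
    msg = srsly.msgpack_loads(to_bytes(getters, exclude=["skipped"]))
    assert msg == {"data": b"payload"}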
def import_file(name: str, loc: Union[str, Path]) -> ModuleType:
"""Import module from a file. Used to load models from a directory.
name (str): Name of module to load.
loc (str / Path): Path to the file.
RETURNS: The loaded module.
"""
spec = importlib.util.spec_from_file_location(name, str(loc))
module = importlib.util.module_from_spec(spec) # type: ignore[arg-type]
spec.loader.exec_module(module) # type: ignore[union-attr]
return module
def minify_html(html: str) -> str:
"""Perform a template-specific, rudimentary HTML minification for displaCy.
Disclaimer: NOT a general-purpose solution, only removes indentation and
newlines.
html (str): Markup to minify.
RETURNS (str): "Minified" HTML.
"""
return html.strip().replace("    ", "").replace("\n", "")
def escape_html(text: str) -> str:
"""Replace <, >, &, " with their HTML encoded representation. Intended to
prevent HTML errors in rendered displaCy markup.
text (str): The original text.
RETURNS (str): Equivalent text to be safely used within HTML.
"""
text = text.replace("&", "&amp;")
text = text.replace("<", "&lt;")
text = text.replace(">", "&gt;")
text = text.replace('"', "&quot;")
return text
def get_words_and_spaces(
words: Iterable[str], text: str
) -> Tuple[List[str], List[bool]]:
"""Given a list of words and a text, reconstruct the original tokens and
return a list of words and spaces that can be used to create a Doc. This
can help recover destructive tokenization that didn't preserve any
whitespace information.
words (Iterable[str]): The words.
text (str): The original text.
RETURNS (Tuple[List[str], List[bool]]): The words and spaces.
"""
if "".join("".join(words).split()) != "".join(text.split()):
raise ValueError(Errors.E194.format(text=text, words=words))
text_words = []
text_spaces = []
text_pos = 0
# normalize words to remove all whitespace tokens
norm_words = [word for word in words if not word.isspace()]
# align words with text
for word in norm_words:
try:
word_start = text[text_pos:].index(word)
except ValueError:
raise ValueError(Errors.E194.format(text=text, words=words)) from None
if word_start > 0:
text_words.append(text[text_pos : text_pos + word_start])
text_spaces.append(False)
text_pos += word_start
text_words.append(word)
text_spaces.append(False)
text_pos += len(word)
if text_pos < len(text) and text[text_pos] == " ":
text_spaces[-1] = True
text_pos += 1
if text_pos < len(text):
text_words.append(text[text_pos:])
text_spaces.append(False)
return (text_words, text_spaces)
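# Sketch of reconstructing Doc words/spaces from a destructive tokenization.
def _example_words_and_spaces() -> None:
    words, spaces = get_words_and_spaces(["hello", "world", "!"], "hello world!")
    assert words == ["hello", "world", "!"]
    assert spaces == [True, False, False]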
def copy_config(config: Union[Dict[str, Any], Config]) -> Config:
"""Deep copy a Config. Will raise an error if the config contents are not
JSON-serializable.
config (Config): The config to copy.
RETURNS (Config): The copied config.
"""
try:
return Config(config).copy()
except ValueError:
raise ValueError(Errors.E961.format(config=config)) from None
def dot_to_dict(values: Dict[str, Any]) -> Dict[str, dict]:
"""Convert dot notation to a dict. For example: {"token.pos": True,
"token._.xyz": True} becomes {"token": {"pos": True, "_": {"xyz": True }}}.
values (Dict[str, Any]): The key/value pairs to convert.
RETURNS (Dict[str, dict]): The converted values.
"""
result: Dict[str, dict] = {}
for key, value in values.items():
path = result
parts = key.lower().split(".")
for i, item in enumerate(parts):
is_last = i == len(parts) - 1
path = path.setdefault(item, value if is_last else {})
return result
def dict_to_dot(obj: Dict[str, dict]) -> Dict[str, Any]:
"""Convert dot notation to a dict. For example: {"token": {"pos": True,
"_": {"xyz": True }}} becomes {"token.pos": True, "token._.xyz": True}.
values (Dict[str, dict]): The dict to convert.
RETURNS (Dict[str, Any]): The key/value pairs.
"""
return {".".join(key): value for key, value in walk_dict(obj)}
def dot_to_object(config: Config, section: str):
"""Convert dot notation of a "section" to a specific part of the Config.
e.g. "training.optimizer" would return the Optimizer object.
Throws an error if the section is not defined in this config.
config (Config): The config.
section (str): The dot notation of the section in the config.
RETURNS: The object denoted by the section
"""
component = config
parts = section.split(".")
for item in parts:
try:
component = component[item]
except (KeyError, TypeError):
raise KeyError(Errors.E952.format(name=section)) from None
return component
def set_dot_to_object(config: Config, section: str, value: Any) -> None:
"""Update a config at a given position from a dot notation.
config (Config): The config.
section (str): The dot notation of the section in the config.
value (Any): The value to set in the config.
"""
component = config
parts = section.split(".")
for i, item in enumerate(parts):
try:
if i == len(parts) - 1:
component[item] = value
else:
component = component[item]
except (KeyError, TypeError):
raise KeyError(Errors.E952.format(name=section)) from None
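# Sketch of reading and updating a nested section by dot notation; a plain
# dict stands in for a Config here.
def _example_dot_access() -> None:
    cfg = {"training": {"optimizer": {"learn_rate": 0.001}}}
    assert dot_to_object(cfg, "training.optimizer.learn_rate") == 0.001
    set_dot_to_object(cfg, "training.optimizer.learn_rate", 0.01)
    assert cfg["training"]["optimizer"]["learn_rate"] == 0.01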
def walk_dict(
node: Dict[str, Any], parent: List[str] = []
) -> Iterator[Tuple[List[str], Any]]:
"""Walk a dict and yield the path and values of the leaves."""
for key, value in node.items():
key_parent = [*parent, key]
if isinstance(value, dict):
yield from walk_dict(value, key_parent)
else:
yield (key_parent, value)
def get_arg_names(func: Callable) -> List[str]:
"""Get a list of all named arguments of a function (regular,
keyword-only).
func (Callable): The function
RETURNS (List[str]): The argument names.
"""
argspec = inspect.getfullargspec(func)
return list(dict.fromkeys([*argspec.args, *argspec.kwonlyargs]))
def combine_score_weights(
weights: List[Dict[str, Optional[float]]],
overrides: Dict[str, Optional[float]] = SimpleFrozenDict(),
) -> Dict[str, Optional[float]]:
"""Combine and normalize score weights defined by components, e.g.
{"ents_r": 0.2, "ents_p": 0.3, "ents_f": 0.5} and {"some_other_score": 1.0}.
weights (List[dict]): The weights defined by the components.
overrides (Dict[str, Optional[Union[float, int]]]): Existing scores that
should be preserved.
RETURNS (Dict[str, float]): The combined and normalized weights.
"""
# We divide each weight by the total weight sum.
# We first need to extract all None/null values for score weights that
# shouldn't be shown in the table *or* be weighted
result: Dict[str, Optional[float]] = {
key: value for w_dict in weights for (key, value) in w_dict.items()
}
result.update(overrides)
weight_sum = sum([v if v else 0.0 for v in result.values()])
for key, value in result.items():
if value and weight_sum > 0:
result[key] = round(value / weight_sum, 2)
return result
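# Sketch of combining per-component score weights; None values are kept but
# never weighted.
def _example_combine_score_weights() -> None:
    weights = [{"ents_f": 0.5, "ents_p": 0.25, "ents_r": 0.25}, {"speed": None}]
    combined = combine_score_weights(weights)
    assert combined == {"ents_f": 0.5, "ents_p": 0.25, "ents_r": 0.25, "speed": None}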
class DummyTokenizer:
def __call__(self, text):
raise NotImplementedError
def pipe(self, texts, **kwargs):
for text in texts:
yield self(text)
# add dummy methods for to_bytes, from_bytes, to_disk and from_disk to
# allow serialization (see #1557)
def to_bytes(self, **kwargs):
return b""
def from_bytes(self, data: bytes, **kwargs) -> "DummyTokenizer":
return self
def to_disk(self, path: Union[str, Path], **kwargs) -> None:
return None
def from_disk(self, path: Union[str, Path], **kwargs) -> "DummyTokenizer":
return self
def create_default_optimizer() -> Optimizer:
return Adam()
def minibatch(items, size):
"""Iterate over batches of items. `size` may be an iterator,
so that batch-size can vary on each step.
"""
if isinstance(size, int):
size_ = itertools.repeat(size)
else:
size_ = size
items = iter(items)
while True:
batch_size = next(size_)
batch = list(itertools.islice(items, int(batch_size)))
if len(batch) == 0:
break
yield list(batch)
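# Sketch of batching with a fixed size; `size` may also be an iterator such
# as a thinc compounding schedule.
def _example_minibatch() -> None:
    assert list(minibatch(range(5), size=2)) == [[0, 1], [2, 3], [4]]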
def is_cython_func(func: Callable) -> bool:
"""Slightly hacky check for whether a callable is implemented in Cython.
Can be used to implement slightly different behaviors, especially around
introspection and parameter annotations. Note that this will only return True
for actual cdef functions and methods, not regular Python functions defined
in Python modules.
func (Callable): The callable to check.
RETURNS (bool): Whether the callable is Cython (probably).
"""
attr = "__pyx_vtable__"
if hasattr(func, attr): # function or class instance
return True
# https://stackoverflow.com/a/55767059
if (
hasattr(func, "__qualname__")
and hasattr(func, "__module__")
and func.__module__ in sys.modules
): # method
cls_func = vars(sys.modules[func.__module__])[func.__qualname__.split(".")[0]]
return hasattr(cls_func, attr)
return False
def check_bool_env_var(env_var: str) -> bool:
"""Convert the value of an environment variable to a boolean. Add special
check for "0" (falsy) and consider everything else truthy, except unset.
env_var (str): The name of the environment variable to check.
RETURNS (bool): Its boolean value.
"""
value = os.environ.get(env_var, False)
if value == "0":
return False
return bool(value)
def _pipe(
docs: Iterable["Doc"],
proc: "Pipe",
name: str,
default_error_handler: Callable[[str, "Pipe", List["Doc"], Exception], NoReturn],
kwargs: Mapping[str, Any],
) -> Iterator["Doc"]:
if hasattr(proc, "pipe"):
yield from proc.pipe(docs, **kwargs)
else:
# We added some args for pipe that __call__ doesn't expect.
kwargs = dict(kwargs)
error_handler = default_error_handler
if hasattr(proc, "get_error_handler"):
error_handler = proc.get_error_handler()
for arg in ["batch_size"]:
if arg in kwargs:
kwargs.pop(arg)
for doc in docs:
try:
doc = proc(doc, **kwargs) # type: ignore[call-arg]
yield doc
except Exception as e:
error_handler(name, proc, [doc], e)
def raise_error(proc_name, proc, docs, e):
raise e
def ignore_error(proc_name, proc, docs, e):
pass
def warn_if_jupyter_cupy():
"""Warn about require_gpu if a jupyter notebook + cupy + mismatched
contextvars vs. thread ops are detected
"""
if is_in_jupyter():
from thinc.backends.cupy_ops import CupyOps
if CupyOps.xp is not None:
from thinc.backends import contextvars_eq_thread_ops
if not contextvars_eq_thread_ops():
warnings.warn(Warnings.W111)
def check_lexeme_norms(vocab, component_name):
lexeme_norms = vocab.lookups.get_table("lexeme_norm", {})
if len(lexeme_norms) == 0 and vocab.lang in LEXEME_NORM_LANGS:
langs = ", ".join(LEXEME_NORM_LANGS)
logger.debug(Warnings.W033.format(model=component_name, langs=langs))
def to_ternary_int(val) -> int:
"""Convert a value to the ternary 1/0/-1 int used for True/None/False in
attributes such as SENT_START: True/1/1.0 is 1 (True), None/0/0.0 is 0
(None), any other values are -1 (False).
"""
if val is True:
return 1
elif val is None:
return 0
elif val is False:
return -1
elif val == 1:
return 1
elif val == 0:
return 0
else:
return -1
# The following implementation of packages_distributions() is adapted from
# importlib_metadata, which is distributed under the Apache 2.0 License.
# Copyright (c) 2017-2019 Jason R. Coombs, Barry Warsaw
# See licenses/3rd_party_licenses.txt
def packages_distributions() -> Dict[str, List[str]]:
"""Return a mapping of top-level packages to their distributions. We're
inlining this helper from the importlib_metadata "backport" here, since
it's not available in the builtin importlib.metadata.
"""
pkg_to_dist = defaultdict(list)
for dist in importlib_metadata.distributions(): # type: ignore[attr-defined]
for pkg in (dist.read_text("top_level.txt") or "").split():
pkg_to_dist[pkg].append(dist.metadata["Name"])
return dict(pkg_to_dist)
cimport numpy as np
from libc.stdint cimport uint32_t
from cython.operator cimport dereference as deref
from libcpp.set cimport set as cppset
from murmurhash.mrmr cimport hash128_x64
import functools
import numpy
from typing import cast
import warnings
from enum import Enum
import srsly
from thinc.api import get_array_module, get_current_ops
from thinc.backends import get_array_ops
from thinc.types import Floats2d
from .strings cimport StringStore
from .strings import get_string_id
from .errors import Errors, Warnings
from . import util
def unpickle_vectors(bytes_data):
return Vectors().from_bytes(bytes_data)
class Mode(str, Enum):
default = "default"
floret = "floret"
@classmethod
def values(cls):
return list(cls.__members__.keys())
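# Sketch of the Mode enum above: it is a str-backed Enum, so members compare
# equal to their string values.
def _example_vectors_mode() -> None:
    assert Mode.default == "default"
    assert Mode.values() == ["default", "floret"]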
cdef class Vectors:
"""Store, save and load word vectors.
Vectors data is kept in the vectors.data attribute, which should be an
instance of numpy.ndarray (for CPU vectors) or cupy.ndarray
(for GPU vectors).
In the default mode, `vectors.key2row` is a dictionary mapping word hashes
to rows in the vectors.data table. Multiple keys can be mapped to the same
vector, and not all of the rows in the table need to be assigned - so
len(list(vectors.keys())) may be greater or smaller than vectors.shape[0].
In floret mode, the floret settings (minn, maxn, etc.) are used to
calculate the vector from the rows corresponding to the key's ngrams.
DOCS: https://spacy.io/api/vectors
"""
cdef public object strings
cdef public object name
cdef readonly object mode
cdef public object data
cdef public object key2row
cdef cppset[int] _unset
cdef readonly uint32_t minn
cdef readonly uint32_t maxn
cdef readonly uint32_t hash_count
cdef readonly uint32_t hash_seed
cdef readonly unicode bow
cdef readonly unicode eow
def __init__(self, *, strings=None, shape=None, data=None, keys=None, name=None, mode=Mode.default, minn=0, maxn=0, hash_count=1, hash_seed=0, bow="<", eow=">"):
"""Create a new vector store.
strings (StringStore): The string store.
shape (tuple): Size of the table, as (# entries, # columns)
data (numpy.ndarray or cupy.ndarray): The vector data.
keys (iterable): A sequence of keys, aligned with the data.
name (str): A name to identify the vectors table.
mode (str): Vectors mode: "default" or "floret" (default: "default").
minn (int): The floret char ngram minn (default: 0).
maxn (int): The floret char ngram maxn (default: 0).
hash_count (int): The floret hash count (1-4, default: 1).
hash_seed (int): The floret hash seed (default: 0).
bow (str): The floret BOW string (default: "<").
eow (str): The floret EOW string (default: ">").
DOCS: https://spacy.io/api/vectors#init
"""
self.strings = strings
if self.strings is None:
self.strings = StringStore()
self.name = name
if mode not in Mode.values():
raise ValueError(
Errors.E202.format(
name="vectors",
mode=mode,
modes=str(Mode.values())
)
)
self.mode = Mode(mode).value
self.key2row = {}
self.minn = minn
self.maxn = maxn
self.hash_count = hash_count
self.hash_seed = hash_seed
self.bow = bow
self.eow = eow
if self.mode == Mode.default:
if data is None:
if shape is None:
shape = (0,0)
ops = get_current_ops()
data = ops.xp.zeros(shape, dtype="f")
self._unset = cppset[int]({i for i in range(data.shape[0])})
else:
self._unset = cppset[int]()
self.data = data
if keys is not None:
for i, key in enumerate(keys):
self.add(key, row=i)
elif self.mode == Mode.floret:
if maxn < minn:
raise ValueError(Errors.E863)
if hash_count < 1 or hash_count >= 5:
raise ValueError(Errors.E862)
if data is None:
raise ValueError(Errors.E864)
if keys is not None:
raise ValueError(Errors.E861)
self.data = data
self._unset = cppset[int]()
@property
def shape(self):
"""Get `(rows, dims)` tuples of number of rows and number of dimensions
in the vector table.
RETURNS (tuple): A `(rows, dims)` pair.
DOCS: https://spacy.io/api/vectors#shape
"""
return self.data.shape
@property
def size(self):
"""The vector size i,e. rows * dims.
RETURNS (int): The vector size.
DOCS: https://spacy.io/api/vectors#size
"""
return self.data.shape[0] * self.data.shape[1]
@property
def is_full(self):
"""Whether the vectors table is full.
RETURNS (bool): `True` if no slots are available for new keys.
DOCS: https://spacy.io/api/vectors#is_full
"""
if self.mode == Mode.floret:
return True
return self._unset.size() == 0
@property
def n_keys(self):
"""Get the number of keys in the table. Note that this is the number
of all keys, not just unique vectors.
RETURNS (int): The number of keys in the table for default vectors.
For floret vectors, return -1.
DOCS: https://spacy.io/api/vectors#n_keys
"""
if self.mode == Mode.floret:
return -1
return len(self.key2row)
def __reduce__(self):
return (unpickle_vectors, (self.to_bytes(),))
def __getitem__(self, key):
"""Get a vector by key. If the key is not found, a KeyError is raised.
key (str/int): The key to get the vector for.
RETURNS (ndarray): The vector for the key.
DOCS: https://spacy.io/api/vectors#getitem
"""
if self.mode == Mode.default:
i = self.key2row.get(get_string_id(key), None)
if i is None:
raise KeyError(Errors.E058.format(key=key))
else:
return self.data[i]
elif self.mode == Mode.floret:
return self.get_batch([key])[0]
raise KeyError(Errors.E058.format(key=key))
def __setitem__(self, key, vector):
"""Set a vector for the given key.
key (str/int): The key to set the vector for.
vector (ndarray): The vector to set.
DOCS: https://spacy.io/api/vectors#setitem
"""
if self.mode == Mode.floret:
warnings.warn(Warnings.W115.format(method="Vectors.__setitem__"))
return
key = get_string_id(key)
i = self.key2row[key]
self.data[i] = vector
if self._unset.count(i):
self._unset.erase(self._unset.find(i))
def __iter__(self):
"""Iterate over the keys in the table.
YIELDS (int): A key in the table.
DOCS: https://spacy.io/api/vectors#iter
"""
yield from self.key2row
def __len__(self):
"""Return the number of vectors in the table.
RETURNS (int): The number of vectors in the data.
DOCS: https://spacy.io/api/vectors#len
"""
return self.data.shape[0]
def __contains__(self, key):
"""Check whether a key has been mapped to a vector entry in the table.
key (int): The key to check.
RETURNS (bool): Whether the key has a vector entry.
DOCS: https://spacy.io/api/vectors#contains
"""
if self.mode == Mode.floret:
return True
else:
return key in self.key2row
def resize(self, shape, inplace=False):
"""Resize the underlying vectors array. If inplace=True, the memory
is reallocated. This may cause other references to the data to become
invalid, so only use inplace=True if you're sure that's what you want.
If the number of vectors is reduced, keys mapped to rows that have been
deleted are removed. These removed items are returned as a list of
`(key, row)` tuples.
shape (tuple): A `(rows, dims)` tuple.
inplace (bool): Reallocate the memory.
RETURNS (list): The removed items as a list of `(key, row)` tuples.
DOCS: https://spacy.io/api/vectors#resize
"""
if self.mode == Mode.floret:
warnings.warn(Warnings.W115.format(method="Vectors.resize"))
return -1
xp = get_array_module(self.data)
if inplace:
if shape[1] != self.data.shape[1]:
raise ValueError(Errors.E193.format(new_dim=shape[1], curr_dim=self.data.shape[1]))
if xp == numpy:
self.data.resize(shape, refcheck=False)
else:
raise ValueError(Errors.E192)
else:
resized_array = xp.zeros(shape, dtype=self.data.dtype)
copy_shape = (min(shape[0], self.data.shape[0]), min(shape[1], self.data.shape[1]))
resized_array[:copy_shape[0], :copy_shape[1]] = self.data[:copy_shape[0], :copy_shape[1]]
self.data = resized_array
self._sync_unset()
removed_items = []
for key, row in list(self.key2row.items()):
if row >= shape[0]:
self.key2row.pop(key)
removed_items.append((key, row))
return removed_items
def keys(self):
"""RETURNS (iterable): A sequence of keys in the table."""
return self.key2row.keys()
def values(self):
"""Iterate over vectors that have been assigned to at least one key.
Note that some vectors may be unassigned, so the number of vectors
returned may be less than the length of the vectors table.
YIELDS (ndarray): A vector in the table.
DOCS: https://spacy.io/api/vectors#values
"""
for row, vector in enumerate(self.data):
if not self._unset.count(row):
yield vector
def items(self):
"""Iterate over `(key, vector)` pairs.
YIELDS (tuple): A key/vector pair.
DOCS: https://spacy.io/api/vectors#items
"""
for key, row in self.key2row.items():
yield key, self.data[row]
def find(self, *, key=None, keys=None, row=None, rows=None):
"""Look up one or more keys by row, or vice versa.
key (Union[int, str]): Find the row that the given key points to.
Returns int, -1 if missing.
keys (Iterable[Union[int, str]]): Find rows that the keys point to.
Returns ndarray.
row (int): Find the first key that points to the row.
Returns int.
rows (Iterable[int]): Find the keys that point to the rows.
Returns ndarray.
RETURNS: The requested key, keys, row or rows.
"""
if self.mode == Mode.floret:
raise ValueError(
Errors.E858.format(
mode=self.mode,
alternative="Use Vectors[key] instead.",
)
)
if sum(arg is None for arg in (key, keys, row, rows)) != 3:
bad_kwargs = {"key": key, "keys": keys, "row": row, "rows": rows}
raise ValueError(Errors.E059.format(kwargs=bad_kwargs))
xp = get_array_module(self.data)
if key is not None:
key = get_string_id(key)
return self.key2row.get(key, -1)
elif keys is not None:
keys = [get_string_id(key) for key in keys]
rows = [self.key2row.get(key, -1.) for key in keys]
return xp.asarray(rows, dtype="i")
else:
row2key = {row: key for key, row in self.key2row.items()}
if row is not None:
return row2key[row]
else:
results = [row2key[row] for row in rows]
return xp.asarray(results, dtype="uint64")
def _get_ngram_hashes(self, unicode s):
"""Calculate up to 4 32-bit hash values with MurmurHash3_x64_128 using
the floret hash settings.
s (str): The string key.
RETURNS: A list of the integer hashes.
"""
cdef uint32_t[4] out
chars = s.encode("utf8")
cdef char* utf8_string = chars
hash128_x64(utf8_string, len(chars), self.hash_seed, &out)
rows = [out[i] for i in range(min(self.hash_count, 4))]
return rows
def _get_ngrams(self, unicode key):
"""Get all padded ngram strings using the ngram settings.
key (str): The string key.
RETURNS: A list of the ngram strings for the padded key.
"""
key = self.bow + key + self.eow
ngrams = [key] + [
key[start:start+ngram_size]
for ngram_size in range(self.minn, self.maxn + 1)
for start in range(0, len(key) - ngram_size + 1)
]
return ngrams
def get_batch(self, keys):
"""Get the vectors for the provided keys efficiently as a batch.
keys (Iterable[Union[int, str]]): The keys.
RETURNS: The requested vectors from the vector table.
"""
ops = get_array_ops(self.data)
if self.mode == Mode.default:
rows = self.find(keys=keys)
vecs = self.data[rows]
elif self.mode == Mode.floret:
keys = [self.strings.as_string(key) for key in keys]
if sum(len(key) for key in keys) == 0:
return ops.xp.zeros((len(keys), self.data.shape[1]))
unique_keys = tuple(set(keys))
row_index = {key: i for i, key in enumerate(unique_keys)}
rows = [row_index[key] for key in keys]
indices = []
lengths = []
for key in unique_keys:
if key == "":
ngram_rows = []
else:
ngram_rows = [
h % self.data.shape[0]
for ngram in self._get_ngrams(key)
for h in self._get_ngram_hashes(ngram)
]
indices.extend(ngram_rows)
lengths.append(len(ngram_rows))
indices = ops.asarray(indices, dtype="int32")
lengths = ops.asarray(lengths, dtype="int32")
vecs = ops.reduce_mean(cast(Floats2d, self.data[indices]), lengths)
vecs = vecs[rows]
return ops.as_contig(vecs)
def add(self, key, *, vector=None, row=None):
"""Add a key to the table. Keys can be mapped to an existing vector
by setting `row`, or a new vector can be added.
key (int): The key to add.
vector (ndarray / None): A vector to add for the key.
row (int / None): The row number of a vector to map the key to.
RETURNS (int): The row the vector was added to.
DOCS: https://spacy.io/api/vectors#add
"""
if self.mode == Mode.floret:
warnings.warn(Warnings.W115.format(method="Vectors.add"))
return -1
# use int for all keys and rows in key2row for more efficient access
# and serialization
key = int(get_string_id(key))
if row is not None:
row = int(row)
if row is None and key in self.key2row:
row = self.key2row[key]
elif row is None:
if self.is_full:
raise ValueError(Errors.E060.format(rows=self.data.shape[0],
cols=self.data.shape[1]))
row = deref(self._unset.begin())
if row < self.data.shape[0]:
self.key2row[key] = row
else:
raise ValueError(Errors.E197.format(row=row, key=key))
if vector is not None:
xp = get_array_module(self.data)
vector = xp.asarray(vector)
self.data[row] = vector
if self._unset.count(row):
self._unset.erase(self._unset.find(row))
return row
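# A minimal usage sketch for add() in the default mode, continuing the small
# illustrative table sketched after find() above.
#     >>> import numpy
#     >>> row = vecs.add("cat", vector=numpy.ones((4,), dtype="f"))
#     >>> vecs.add("feline", row=row)   # map a second key to the same row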
def most_similar(self, queries, *, batch_size=1024, n=1, sort=True):
"""For each of the given vectors, find the n most similar entries
to it, by cosine.
Queries are by vector. Results are returned as a `(keys, best_rows,
scores)` tuple. If `queries` is large, the calculations are performed in
chunks, to avoid consuming too much memory. You can set the `batch_size`
to control the size/space trade-off during the calculations.
queries (ndarray): An array with one or more vectors.
batch_size (int): The batch size to use.
n (int): The number of entries to return for each query.
sort (bool): Whether to sort the n entries returned by score.
RETURNS (tuple): The most similar entries as a `(keys, best_rows, scores)`
tuple.
"""
if self.mode == Mode.floret:
raise ValueError(Errors.E858.format(
mode=self.mode,
alternative="",
))
xp = get_array_module(self.data)
filled = sorted(list({row for row in self.key2row.values()}))
if len(filled) < n:
raise ValueError(Errors.E198.format(n=n, n_rows=len(filled)))
filled = xp.asarray(filled)
norms = xp.linalg.norm(self.data[filled], axis=1, keepdims=True)
norms[norms == 0] = 1
vectors = self.data[filled] / norms
best_rows = xp.zeros((queries.shape[0], n), dtype='i')
scores = xp.zeros((queries.shape[0], n), dtype='f')
# Work in batches, to avoid memory problems.
for i in range(0, queries.shape[0], batch_size):
batch = queries[i : i+batch_size]
batch_norms = xp.linalg.norm(batch, axis=1, keepdims=True)
batch_norms[batch_norms == 0] = 1
batch /= batch_norms
# batch e.g. (1024, 300)
# vectors e.g. (10000, 300)
# sims e.g. (1024, 10000)
sims = xp.dot(batch, vectors.T)
best_rows[i:i+batch_size] = xp.argpartition(sims, -n, axis=1)[:,-n:]
scores[i:i+batch_size] = xp.partition(sims, -n, axis=1)[:,-n:]
if sort and n >= 2:
sorted_index = (xp.arange(scores.shape[0])[:, None][i:i+batch_size], xp.argsort(scores[i:i+batch_size], axis=1)[:, ::-1])
scores[i:i+batch_size] = scores[sorted_index]
best_rows[i:i+batch_size] = best_rows[sorted_index]
for i, j in numpy.ndindex(best_rows.shape):
best_rows[i, j] = filled[best_rows[i, j]]
# Round values really close to 1 or -1
scores = xp.around(scores, decimals=4, out=scores)
# Account for numerical error: we want to return scores in the range [-1, 1]
scores = xp.clip(scores, a_min=-1, a_max=1, out=scores)
row2key = {row: key for key, row in self.key2row.items()}
numpy_rows = get_current_ops().to_numpy(best_rows)
keys = xp.asarray(
[[row2key[row] for row in numpy_rows[i] if row in row2key]
for i in range(len(queries)) ], dtype="uint64")
return (keys, best_rows, scores)
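# A minimal usage sketch for most_similar(), assuming a pipeline with a loaded
# default-mode vectors table containing at least n assigned rows; the slicing
# is illustrative.
#     >>> queries = nlp.vocab.vectors.data[:2]          # two query vectors
#     >>> keys, best_rows, scores = nlp.vocab.vectors.most_similar(queries, n=3)
#     >>> keys.shape == (2, 3)                          # scores are cosines in [-1, 1]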
def _get_cfg(self):
if self.mode == Mode.default:
return {
"mode": Mode(self.mode).value,
}
elif self.mode == Mode.floret:
return {
"mode": Mode(self.mode).value,
"minn": self.minn,
"maxn": self.maxn,
"hash_count": self.hash_count,
"hash_seed": self.hash_seed,
"bow": self.bow,
"eow": self.eow,
}
def _set_cfg(self, cfg):
self.mode = Mode(cfg.get("mode", Mode.default)).value
self.minn = cfg.get("minn", 0)
self.maxn = cfg.get("maxn", 0)
self.hash_count = cfg.get("hash_count", 0)
self.hash_seed = cfg.get("hash_seed", 0)
self.bow = cfg.get("bow", "<")
self.eow = cfg.get("eow", ">")
def to_disk(self, path, *, exclude=tuple()):
"""Save the current state to a directory.
path (str / Path): A path to a directory, which will be created if
it doesn't exist.
DOCS: https://spacy.io/api/vectors#to_disk
"""
xp = get_array_module(self.data)
if xp is numpy:
save_array = lambda arr, file_: xp.save(file_, arr, allow_pickle=False)
else:
save_array = lambda arr, file_: xp.save(file_, arr)
def save_vectors(path):
# The source of numpy.save indicates that the file object is closed after
# use, but ResourceWarnings are still raised here, suggesting that it isn't.
# To avoid relying on that behavior, wrap the call in a context manager.
with path.open("wb") as _file:
save_array(self.data, _file)
serializers = {
"strings": lambda p: self.strings.to_disk(p.with_suffix(".json")),
"vectors": lambda p: save_vectors(p),
"key2row": lambda p: srsly.write_msgpack(p, self.key2row),
"vectors.cfg": lambda p: srsly.write_json(p, self._get_cfg()),
}
return util.to_disk(path, serializers, exclude)
def from_disk(self, path, *, exclude=tuple()):
"""Loads state from a directory. Modifies the object in place and
returns it.
path (str / Path): Directory path, string or Path-like object.
RETURNS (Vectors): The modified object.
DOCS: https://spacy.io/api/vectors#from_disk
"""
def load_key2row(path):
if path.exists():
self.key2row = srsly.read_msgpack(path)
for key, row in self.key2row.items():
if self._unset.count(row):
self._unset.erase(self._unset.find(row))
def load_keys(path):
if path.exists():
keys = numpy.load(str(path))
for i, key in enumerate(keys):
self.add(key, row=i)
def load_vectors(path):
ops = get_current_ops()
if path.exists():
self.data = ops.xp.load(str(path))
def load_settings(path):
if path.exists():
self._set_cfg(srsly.read_json(path))
serializers = {
"strings": lambda p: self.strings.from_disk(p.with_suffix(".json")),
"vectors": load_vectors,
"keys": load_keys,
"key2row": load_key2row,
"vectors.cfg": load_settings,
}
util.from_disk(path, serializers, exclude)
self._sync_unset()
return self
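# A round-trip sketch for the two methods above; the directory path is
# illustrative.
#     >>> nlp.vocab.vectors.to_disk("/tmp/my_vectors")
#     >>> nlp.vocab.vectors.from_disk("/tmp/my_vectors")   # modifies in place, returns self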
def to_bytes(self, *, exclude=tuple()):
"""Serialize the current state to a binary string.
exclude (list): String names of serialization fields to exclude.
RETURNS (bytes): The serialized form of the `Vectors` object.
DOCS: https://spacy.io/api/vectors#to_bytes
"""
def serialize_weights():
if hasattr(self.data, "to_bytes"):
return self.data.to_bytes()
else:
return srsly.msgpack_dumps(self.data)
serializers = {
"strings": lambda: self.strings.to_bytes(),
"key2row": lambda: srsly.msgpack_dumps(self.key2row),
"vectors": serialize_weights,
"vectors.cfg": lambda: srsly.json_dumps(self._get_cfg()),
}
return util.to_bytes(serializers, exclude)
def from_bytes(self, data, *, exclude=tuple()):
"""Load state from a binary string.
data (bytes): The data to load from.
exclude (list): String names of serialization fields to exclude.
RETURNS (Vectors): The `Vectors` object.
DOCS: https://spacy.io/api/vectors#from_bytes
"""
def deserialize_weights(b):
if hasattr(self.data, "from_bytes"):
self.data.from_bytes(b)
else:
xp = get_array_module(self.data)
self.data = xp.asarray(srsly.msgpack_loads(b))
deserializers = {
"strings": lambda b: self.strings.from_bytes(b),
"key2row": lambda b: self.key2row.update(srsly.msgpack_loads(b)),
"vectors": deserialize_weights,
"vectors.cfg": lambda b: self._set_cfg(srsly.json_loads(b))
}
util.from_bytes(data, deserializers, exclude)
self._sync_unset()
return self
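# A round-trip sketch for byte serialization of the vectors table.
#     >>> vector_bytes = nlp.vocab.vectors.to_bytes()
#     >>> nlp.vocab.vectors.from_bytes(vector_bytes)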
def clear(self):
"""Clear all entries in the vector table.
DOCS: https://spacy.io/api/vectors#clear
"""
if self.mode == Mode.floret:
raise ValueError(Errors.E859)
self.key2row = {}
self._sync_unset()
def _sync_unset(self):
filled = {row for row in self.key2row.values()}
self._unset = cppset[int]({row for row in range(self.data.shape[0]) if row not in filled})
from libcpp.vector cimport vector
from preshed.maps cimport PreshMap
from cymem.cymem cimport Pool
from murmurhash.mrmr cimport hash64
from .structs cimport LexemeC, TokenC
from .typedefs cimport attr_t, hash_t
from .strings cimport StringStore
from .morphology cimport Morphology
cdef LexemeC EMPTY_LEXEME
cdef union LexemesOrTokens:
const LexemeC* const* lexemes
const TokenC* tokens
cdef struct _Cached:
LexemesOrTokens data
bint is_lex
int length
cdef class Vocab:
cdef Pool mem
cdef readonly StringStore strings
cdef public Morphology morphology
cdef public object _vectors
cdef public object _lookups
cdef public object writing_system
cdef public object get_noun_chunks
cdef readonly int length
cdef public object _unused_object # TODO remove in v4, see #9150
cdef public object lex_attr_getters
cdef public object cfg
cdef const LexemeC* get(self, Pool mem, str string) except NULL
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
cdef const TokenC* make_fused_token(self, substrings) except NULL
cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
cdef PreshMap _by_orth
from typing import Callable, Iterator, Optional, Union, List, Dict
from typing import Any, Iterable
from thinc.types import Floats1d, FloatsXd
from . import Language
from .strings import StringStore
from .lexeme import Lexeme
from .lookups import Lookups
from .morphology import Morphology
from .tokens import Doc, Span
from .vectors import Vectors
from pathlib import Path
def create_vocab(
lang: Optional[str], defaults: Any, vectors_name: Optional[str] = ...
) -> Vocab: ...
class Vocab:
cfg: Dict[str, Any]
get_noun_chunks: Optional[Callable[[Union[Doc, Span]], Iterator[Span]]]
lookups: Lookups
morphology: Morphology
strings: StringStore
vectors: Vectors
writing_system: Dict[str, Any]
def __init__(
self,
lex_attr_getters: Optional[Dict[str, Callable[[str], Any]]] = ...,
strings: Optional[Union[List[str], StringStore]] = ...,
lookups: Optional[Lookups] = ...,
oov_prob: float = ...,
vectors_name: Optional[str] = ...,
writing_system: Dict[str, Any] = ...,
get_noun_chunks: Optional[Callable[[Union[Doc, Span]], Iterator[Span]]] = ...,
) -> None: ...
@property
def lang(self) -> str: ...
def __len__(self) -> int: ...
def add_flag(
self, flag_getter: Callable[[str], bool], flag_id: int = ...
) -> int: ...
def __contains__(self, key: str) -> bool: ...
def __iter__(self) -> Iterator[Lexeme]: ...
def __getitem__(self, id_or_string: Union[str, int]) -> Lexeme: ...
@property
def vectors_length(self) -> int: ...
def reset_vectors(
self, *, width: Optional[int] = ..., shape: Optional[int] = ...
) -> None: ...
def prune_vectors(self, nr_row: int, batch_size: int = ...) -> Dict[str, float]: ...
def get_vector(
self,
orth: Union[int, str],
minn: Optional[int] = ...,
maxn: Optional[int] = ...,
) -> FloatsXd: ...
def set_vector(self, orth: Union[int, str], vector: Floats1d) -> None: ...
def has_vector(self, orth: Union[int, str]) -> bool: ...
def to_disk(
self, path: Union[str, Path], *, exclude: Iterable[str] = ...
) -> None: ...
def from_disk(
self, path: Union[str, Path], *, exclude: Iterable[str] = ...
) -> Vocab: ...
def to_bytes(self, *, exclude: Iterable[str] = ...) -> bytes: ...
def from_bytes(
self, bytes_data: bytes, *, exclude: Iterable[str] = ...
) -> Vocab: ...
def pickle_vocab(vocab: Vocab) -> Any: ...
def unpickle_vocab(
sstore: StringStore,
vectors: Any,
morphology: Any,
_unused_object: Any,
lex_attr_getters: Any,
lookups: Any,
get_noun_chunks: Any,
) -> Vocab: ...
# cython: profile=True
from libc.string cimport memcpy
import srsly
from thinc.api import get_array_module, get_current_ops
import functools
from .lexeme cimport EMPTY_LEXEME, OOV_RANK
from .lexeme cimport Lexeme
from .typedefs cimport attr_t
from .tokens.token cimport Token
from .attrs cimport LANG, ORTH
from .compat import copy_reg
from .errors import Errors
from .attrs import intify_attrs, NORM, IS_STOP
from .vectors import Vectors, Mode as VectorsMode
from .util import registry
from .lookups import Lookups
from . import util
from .lang.norm_exceptions import BASE_NORMS
from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang
def create_vocab(lang, defaults, vectors_name=None):
# If the spacy-lookups-data package is installed, we pre-populate the lookups
# with lexeme data, if available
lex_attrs = {**LEX_ATTRS, **defaults.lex_attr_getters}
# This is messy, but it's the minimal working fix to Issue #639.
lex_attrs[IS_STOP] = functools.partial(is_stop, stops=defaults.stop_words)
# Ensure that getter can be pickled
lex_attrs[LANG] = functools.partial(get_lang, lang=lang)
lex_attrs[NORM] = util.add_lookups(
lex_attrs.get(NORM, LEX_ATTRS[NORM]),
BASE_NORMS,
)
return Vocab(
lex_attr_getters=lex_attrs,
writing_system=defaults.writing_system,
get_noun_chunks=defaults.syntax_iterators.get("noun_chunks"),
vectors_name=vectors_name,
)
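# A minimal usage sketch for create_vocab(): it is normally invoked when a
# Language object is constructed, but calling it directly looks roughly like
# this (the language code is illustrative).
#     >>> from spacy.util import get_lang_class
#     >>> from spacy.vocab import create_vocab
#     >>> vocab = create_vocab("en", get_lang_class("en").Defaults)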
cdef class Vocab:
"""A look-up table that allows you to access `Lexeme` objects. The `Vocab`
instance also provides access to the `StringStore`, and owns underlying
C-data that is shared between `Doc` objects.
DOCS: https://spacy.io/api/vocab
"""
def __init__(self, lex_attr_getters=None, strings=tuple(), lookups=None,
oov_prob=-20., vectors_name=None, writing_system={},
get_noun_chunks=None, **deprecated_kwargs):
"""Create the vocabulary.
lex_attr_getters (dict): A dictionary mapping attribute IDs to
functions to compute them. Defaults to `None`.
strings (StringStore): StringStore that maps strings to integers, and
vice versa.
lookups (Lookups): Container for large lookup tables and dictionaries.
oov_prob (float): Default OOV probability.
vectors_name (str): Optional name to identify the vectors table.
get_noun_chunks (Optional[Callable[[Union[Doc, Span]], Iterator[Tuple[int, int, int]]]]):
A function that yields base noun phrases used for Doc.noun_chunks.
"""
lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
if lookups in (None, True, False):
lookups = Lookups()
self.cfg = {'oov_prob': oov_prob}
self.mem = Pool()
self._by_orth = PreshMap()
self.strings = StringStore()
self.length = 0
if strings:
for string in strings:
_ = self[string]
self.lex_attr_getters = lex_attr_getters
self.morphology = Morphology(self.strings)
self.vectors = Vectors(strings=self.strings, name=vectors_name)
self.lookups = lookups
self.writing_system = writing_system
self.get_noun_chunks = get_noun_chunks
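# A minimal usage sketch for the constructor: building a bare vocabulary and
# pre-seeding its string store (the strings are illustrative).
#     >>> from spacy.vocab import Vocab
#     >>> vocab = Vocab(strings=["apple", "orange"])
#     >>> "apple" in vocab.strings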
property vectors:
def __get__(self):
return self._vectors
def __set__(self, vectors):
for s in vectors.strings:
self.strings.add(s)
self._vectors = vectors
self._vectors.strings = self.strings
@property
def lang(self):
langfunc = None
if self.lex_attr_getters:
langfunc = self.lex_attr_getters.get(LANG, None)
return langfunc("_") if langfunc else ""
def __len__(self):
"""The current number of lexemes stored.
RETURNS (int): The current number of lexemes stored.
"""
return self.length
def add_flag(self, flag_getter, int flag_id=-1):
"""Set a new boolean flag to words in the vocabulary.
The flag_getter function will be called over the words currently in the
vocab, and then applied to new words as they occur. You'll then be able
to access the flag value on each token using token.check_flag(flag_id).
See also: `Lexeme.set_flag`, `Lexeme.check_flag`, `Token.set_flag`,
`Token.check_flag`.
flag_getter (callable): A function `f(str) -> bool`, to get the
flag value.
flag_id (int): An integer between 1 and 63 (inclusive), specifying
the bit at which the flag will be stored. If -1, the lowest
available bit will be chosen.
RETURNS (int): The integer ID by which the flag value can be checked.
DOCS: https://spacy.io/api/vocab#add_flag
"""
if flag_id == -1:
for bit in range(1, 64):
if bit not in self.lex_attr_getters:
flag_id = bit
break
else:
raise ValueError(Errors.E062)
elif flag_id >= 64 or flag_id < 1:
raise ValueError(Errors.E063.format(value=flag_id))
for lex in self:
lex.set_flag(flag_id, flag_getter(lex.orth_))
self.lex_attr_getters[flag_id] = flag_getter
return flag_id
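# A minimal usage sketch for add_flag(); the getter function is illustrative.
#     >>> import spacy
#     >>> nlp = spacy.blank("en")
#     >>> is_shouting = nlp.vocab.add_flag(lambda text: text.isupper())
#     >>> doc = nlp("STOP right there")
#     >>> doc[0].check_flag(is_shouting)   # True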
cdef const LexemeC* get(self, Pool mem, str string) except NULL:
"""Get a pointer to a `LexemeC` from the lexicon, creating a new
`Lexeme` if necessary using memory acquired from the given pool. If the
pool is the lexicon's own memory, the lexeme is saved in the lexicon.
"""
if string == "":
return &EMPTY_LEXEME
cdef LexemeC* lex
cdef hash_t key = self.strings[string]
lex = <LexemeC*>self._by_orth.get(key)
cdef size_t addr
if lex != NULL:
assert lex.orth in self.strings
if lex.orth != key:
raise KeyError(Errors.E064.format(string=lex.orth,
orth=key, orth_id=string))
return lex
else:
return self._new_lexeme(mem, string)
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
"""Get a pointer to a `LexemeC` from the lexicon, creating a new
`Lexeme` if necessary using memory acquired from the given pool. If the
pool is the lexicon's own memory, the lexeme is saved in the lexicon.
"""
if orth == 0:
return &EMPTY_LEXEME
cdef LexemeC* lex
lex = <LexemeC*>self._by_orth.get(orth)
if lex != NULL:
return lex
else:
return self._new_lexeme(mem, self.strings[orth])
cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL:
# I think this heuristic is bad, and the Vocab should always
# own the lexemes. It avoids weird bugs this way, as it's how the thing
# was originally supposed to work. The best solution to the growing
# memory use is to periodically reset the vocab, which is an action
# that should be up to the user to do (so we don't need to keep track
# of the doc ownership).
# TODO: Change the C API so that the mem isn't passed in here.
mem = self.mem
#if len(string) < 3 or self.length < 10000:
# mem = self.mem
cdef bint is_oov = mem is not self.mem
lex = <LexemeC*>mem.alloc(1, sizeof(LexemeC))
lex.orth = self.strings.add(string)
lex.length = len(string)
if self.vectors is not None:
lex.id = self.vectors.key2row.get(lex.orth, OOV_RANK)
else:
lex.id = OOV_RANK
if self.lex_attr_getters is not None:
for attr, func in self.lex_attr_getters.items():
value = func(string)
if isinstance(value, str):
value = self.strings.add(value)
if value is not None:
Lexeme.set_struct_attr(lex, attr, value)
if not is_oov:
self._add_lex_to_vocab(lex.orth, lex)
if lex == NULL:
raise ValueError(Errors.E085.format(string=string))
return lex
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
self._by_orth.set(lex.orth, <void*>lex)
self.length += 1
def __contains__(self, key):
"""Check whether the string or int key has an entry in the vocabulary.
key (str / int / bytes): The string, hash or UTF-8 encoded key to check.
RETURNS (bool): Whether the key has an entry in the vocabulary.
DOCS: https://spacy.io/api/vocab#contains
"""
cdef hash_t int_key
if isinstance(key, bytes):
int_key = self.strings[key.decode("utf8")]
elif isinstance(key, str):
int_key = self.strings[key]
else:
int_key = key
lex = self._by_orth.get(int_key)
return lex is not NULL
def __iter__(self):
"""Iterate over the lexemes in the vocabulary.
YIELDS (Lexeme): An entry in the vocabulary.
DOCS: https://spacy.io/api/vocab#iter
"""
cdef attr_t key
cdef size_t addr
for key, addr in self._by_orth.items():
lex = Lexeme(self, key)
yield lex
def __getitem__(self, id_or_string):
"""Retrieve a lexeme, given an int ID or a unicode string. If a
previously unseen unicode string is given, a new lexeme is created and
stored.
id_or_string (int or str): The integer ID of a word, or its unicode
string. If `int >= Lexicon.size`, `IndexError` is raised. If
`id_or_string` is neither an int nor a unicode string, `ValueError`
is raised.
RETURNS (Lexeme): The lexeme indicated by the given ID.
EXAMPLE:
>>> apple = nlp.vocab.strings["apple"]
>>> assert nlp.vocab[apple] == nlp.vocab[u"apple"]
DOCS: https://spacy.io/api/vocab#getitem
"""
cdef attr_t orth
if isinstance(id_or_string, str):
orth = self.strings.add(id_or_string)
else:
orth = id_or_string
return Lexeme(self, orth)
cdef const TokenC* make_fused_token(self, substrings) except NULL:
cdef int i
tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
for i, props in enumerate(substrings):
props = intify_attrs(props, strings_map=self.strings,
_do_deprecated=True)
token = &tokens[i]
# Set the special tokens up to have arbitrary attributes
lex = <LexemeC*>self.get_by_orth(self.mem, props[ORTH])
token.lex = lex
for attr_id, value in props.items():
Token.set_struct_attr(token, attr_id, value)
# NORM is the only one that overlaps between the two
# (which is maybe not great?)
if attr_id != NORM:
Lexeme.set_struct_attr(lex, attr_id, value)
return tokens
@property
def vectors_length(self):
return self.vectors.data.shape[1]
def reset_vectors(self, *, width=None, shape=None):
"""Drop the current vector table. Because all vectors must be the same
width, you have to call this to change the size of the vectors.
"""
if width is not None and shape is not None:
raise ValueError(Errors.E065.format(width=width, shape=shape))
elif shape is not None:
self.vectors = Vectors(strings=self.strings, shape=shape)
else:
width = width if width is not None else self.vectors.data.shape[1]
self.vectors = Vectors(strings=self.strings, shape=(self.vectors.shape[0], width))
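# A minimal usage sketch for reset_vectors(), assuming the vocab already has a
# vectors table; the width is illustrative.
#     >>> nlp.vocab.reset_vectors(width=300)
#     >>> nlp.vocab.vectors.shape[1] == 300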
def prune_vectors(self, nr_row, batch_size=1024):
"""Reduce the current vector table to `nr_row` unique entries. Words
mapped to the discarded vectors will be remapped to the closest vector
among those remaining.
For example, suppose the original table had vectors for the words:
['sat', 'cat', 'feline', 'reclined']. If we prune the vector table to
two rows, we would discard the vectors for 'feline' and 'reclined'.
These words would then be remapped to the closest remaining vector
-- so "feline" would have the same vector as "cat", and "reclined"
would have the same vector as "sat".
The similarities are judged by cosine. The original vectors may
be large, so the cosines are calculated in minibatches, to reduce
memory usage.
nr_row (int): The number of rows to keep in the vector table.
batch_size (int): Batch of vectors for calculating the similarities.
Larger batch sizes might be faster, while temporarily requiring
more memory.
RETURNS (dict): A dictionary keyed by removed words mapped to
`(string, score)` tuples, where `string` is the entry the removed
word was mapped to, and `score` the similarity score between the
two words.
DOCS: https://spacy.io/api/vocab#prune_vectors
"""
if self.vectors.mode != VectorsMode.default:
raise ValueError(Errors.E866)
ops = get_current_ops()
xp = get_array_module(self.vectors.data)
# Make sure all vectors are in the vocab
for orth in self.vectors:
self[orth]
# Make prob negative so it sorts by rank ascending
# (key2row contains the rank)
priority = [(-lex.prob, self.vectors.key2row[lex.orth], lex.orth)
for lex in self if lex.orth in self.vectors.key2row]
priority.sort()
indices = xp.asarray([i for (prob, i, key) in priority], dtype="uint64")
keys = xp.asarray([key for (prob, i, key) in priority], dtype="uint64")
keep = xp.ascontiguousarray(self.vectors.data[indices[:nr_row]])
toss = xp.ascontiguousarray(self.vectors.data[indices[nr_row:]])
self.vectors = Vectors(strings=self.strings, data=keep, keys=keys[:nr_row], name=self.vectors.name)
syn_keys, syn_rows, scores = self.vectors.most_similar(toss, batch_size=batch_size)
syn_keys = ops.to_numpy(syn_keys)
remap = {}
for i, key in enumerate(ops.to_numpy(keys[nr_row:])):
self.vectors.add(key, row=syn_rows[i][0])
word = self.strings[key]
synonym = self.strings[syn_keys[i][0]]
score = scores[i][0]
remap[word] = (synonym, score)
return remap
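# A minimal usage sketch for prune_vectors(), assuming a pipeline with a large
# vectors table loaded; the row count is illustrative.
#     >>> remap = nlp.vocab.prune_vectors(10000)
#     >>> remap.get("feline")   # e.g. ("cat", 0.83), as described in the docstring above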
def get_vector(self, orth):
"""Retrieve a vector for a word in the vocabulary. Words can be looked
up by string or int ID. If no vectors data is loaded, ValueError is
raised.
orth (int / unicode): The hash value of a word, or its unicode string.
RETURNS (numpy.ndarray or cupy.ndarray): A word vector. Size
and shape determined by the `vocab.vectors` instance. Usually, a
numpy ndarray of shape (300,) and dtype float32.
DOCS: https://spacy.io/api/vocab#get_vector
"""
if isinstance(orth, str):
orth = self.strings.add(orth)
if self.has_vector(orth):
return self.vectors[orth]
xp = get_array_module(self.vectors.data)
vectors = xp.zeros((self.vectors_length,), dtype="f")
return vectors
def set_vector(self, orth, vector):
"""Set a vector for a word in the vocabulary. Words can be referenced
by string or int ID.
orth (int / str): The word.
vector (numpy.ndarray or cupy.ndarray[ndim=1, dtype='float32']): The vector to set.
DOCS: https://spacy.io/api/vocab#set_vector
"""
if isinstance(orth, str):
orth = self.strings.add(orth)
if self.vectors.is_full and orth not in self.vectors:
new_rows = max(100, int(self.vectors.shape[0]*1.3))
if self.vectors.shape[1] == 0:
width = vector.size
else:
width = self.vectors.shape[1]
self.vectors.resize((new_rows, width))
lex = self[orth] # Add word to vocab if necessary
row = self.vectors.add(orth, vector=vector)
if row >= 0:
lex.rank = row
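# A minimal sketch for set_vector()/get_vector(); the key and vector values
# are illustrative.
#     >>> import numpy
#     >>> nlp.vocab.set_vector("apple", numpy.random.uniform(-1, 1, (300,)).astype("float32"))
#     >>> nlp.vocab.has_vector("apple")          # True
#     >>> nlp.vocab.get_vector("apple").shape    # (300,)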
def has_vector(self, orth):
"""Check whether a word has a vector. Returns False if no vectors have
been loaded. Words can be looked up by string or int ID.
orth (int / str): The word.
RETURNS (bool): Whether the word has a vector.
DOCS: https://spacy.io/api/vocab#has_vector
"""
if isinstance(orth, str):
orth = self.strings.add(orth)
return orth in self.vectors
property lookups:
def __get__(self):
return self._lookups
def __set__(self, lookups):
self._lookups = lookups
if lookups.has_table("lexeme_norm"):
self.lex_attr_getters[NORM] = util.add_lookups(
self.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]),
self.lookups.get_table("lexeme_norm"),
)
def to_disk(self, path, *, exclude=tuple()):
"""Save the current state to a directory.
path (str or Path): A path to a directory, which will be created if
it doesn't exist.
exclude (Iterable[str]): String names of serialization fields to exclude.
DOCS: https://spacy.io/api/vocab#to_disk
"""
path = util.ensure_path(path)
if not path.exists():
path.mkdir()
setters = ["strings", "vectors"]
if "strings" not in exclude:
self.strings.to_disk(path / "strings.json")
if "vectors" not in "exclude":
self.vectors.to_disk(path, exclude=["strings"])
if "lookups" not in "exclude":
self.lookups.to_disk(path)
def from_disk(self, path, *, exclude=tuple()):
"""Loads state from a directory. Modifies the object in place and
returns it.
path (str or Path): A path to a directory.
exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (Vocab): The modified `Vocab` object.
DOCS: https://spacy.io/api/vocab#from_disk
"""
path = util.ensure_path(path)
getters = ["strings", "vectors"]
if "strings" not in exclude:
self.strings.from_disk(path / "strings.json") # TODO: add exclude?
if "vectors" not in exclude:
if self.vectors is not None:
self.vectors.from_disk(path, exclude=["strings"])
if "lookups" not in exclude:
self.lookups.from_disk(path)
if "lexeme_norm" in self.lookups:
self.lex_attr_getters[NORM] = util.add_lookups(
self.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]), self.lookups.get_table("lexeme_norm")
)
self.length = 0
self._by_orth = PreshMap()
return self
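# A round-trip sketch for the two methods above; the directory path is
# illustrative.
#     >>> nlp.vocab.to_disk("/tmp/vocab")
#     >>> from spacy.vocab import Vocab
#     >>> vocab = Vocab().from_disk("/tmp/vocab")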
def to_bytes(self, *, exclude=tuple()):
"""Serialize the current state to a binary string.
exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (bytes): The serialized form of the `Vocab` object.
DOCS: https://spacy.io/api/vocab#to_bytes
"""
def serialize_vectors():
if self.vectors is None:
return None
else:
return self.vectors.to_bytes(exclude=["strings"])
getters = {
"strings": lambda: self.strings.to_bytes(),
"vectors": deserialize_vectors,
"lookups": lambda: self.lookups.to_bytes(),
}
return util.to_bytes(getters, exclude)
def from_bytes(self, bytes_data, *, exclude=tuple()):
"""Load state from a binary string.
bytes_data (bytes): The data to load from.
exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (Vocab): The `Vocab` object.
DOCS: https://spacy.io/api/vocab#from_bytes
"""
def deserialize_vectors(b):
if self.vectors is None:
return None
else:
return self.vectors.from_bytes(b, exclude=["strings"])
setters = {
"strings": lambda b: self.strings.from_bytes(b),
"vectors": lambda b: serialize_vectors(b),
"lookups": lambda b: self.lookups.from_bytes(b),
}
util.from_bytes(bytes_data, setters, exclude)
if "lexeme_norm" in self.lookups:
self.lex_attr_getters[NORM] = util.add_lookups(
self.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]), self.lookups.get_table("lexeme_norm")
)
self.length = 0
self._by_orth = PreshMap()
return self
def _reset_cache(self, keys, strings):
# I'm not sure this made sense. Disable it for now.
raise NotImplementedError
def pickle_vocab(vocab):
sstore = vocab.strings
vectors = vocab.vectors
morph = vocab.morphology
_unused_object = vocab._unused_object
lex_attr_getters = srsly.pickle_dumps(vocab.lex_attr_getters)
lookups = vocab.lookups
get_noun_chunks = vocab.get_noun_chunks
return (unpickle_vocab,
(sstore, vectors, morph, _unused_object, lex_attr_getters, lookups, get_noun_chunks))
def unpickle_vocab(sstore, vectors, morphology, _unused_object,
lex_attr_getters, lookups, get_noun_chunks):
cdef Vocab vocab = Vocab()
vocab.vectors = vectors
vocab.strings = sstore
vocab.morphology = morphology
vocab._unused_object = _unused_object
vocab.lex_attr_getters = srsly.pickle_loads(lex_attr_getters)
vocab.lookups = lookups
vocab.get_noun_chunks = get_noun_chunks
return vocab
copy_reg.pickle(Vocab, pickle_vocab, unpickle_vocab)
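# Because Vocab is registered with copy_reg above, a standard pickle round-trip
# works; a minimal sketch:
#     >>> import pickle
#     >>> vocab2 = pickle.loads(pickle.dumps(nlp.vocab))
#     >>> len(vocab2.strings) == len(nlp.vocab.strings)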