GITENV file updated

parent 6e58b382
from .attributeruler import AttributeRuler
from .dep_parser import DependencyParser
from .entity_linker import EntityLinker
from .ner import EntityRecognizer
from .entityruler import EntityRuler
from .lemmatizer import Lemmatizer
from .morphologizer import Morphologizer
from .pipe import Pipe
from .trainable_pipe import TrainablePipe
from .senter import SentenceRecognizer
from .sentencizer import Sentencizer
from .tagger import Tagger
from .textcat import TextCategorizer
from .spancat import SpanCategorizer
from .textcat_multilabel import MultiLabel_TextCategorizer
from .tok2vec import Tok2Vec
from .functions import merge_entities, merge_noun_chunks, merge_subtokens
__all__ = [
"AttributeRuler",
"DependencyParser",
"EntityLinker",
"EntityRecognizer",
"EntityRuler",
"Morphologizer",
"Lemmatizer",
"MultiLabel_TextCategorizer",
"Pipe",
"SentenceRecognizer",
"Sentencizer",
"SpanCategorizer",
"Tagger",
"TextCategorizer",
"Tok2Vec",
"TrainablePipe",
"merge_entities",
"merge_noun_chunks",
"merge_subtokens",
]
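These classes are normally added to a pipeline through their registered factory names rather than instantiated directly. A minimal sketch, assuming a blank English pipeline:

import spacy

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")        # rule-based Sentencizer
nlp.add_pipe("merge_entities")     # stateless function component from .functions
print(nlp.pipe_names)              # ['sentencizer', 'merge_entities']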
from typing import Dict, Any
import srsly
from ..language import Language
from ..matcher import Matcher
from ..tokens import Doc
from .. import util
@Language.component(
"merge_noun_chunks",
requires=["token.dep", "token.tag", "token.pos"],
retokenizes=True,
)
def merge_noun_chunks(doc: Doc) -> Doc:
"""Merge noun chunks into a single token.
doc (Doc): The Doc object.
RETURNS (Doc): The Doc object with merged noun chunks.
DOCS: https://spacy.io/api/pipeline-functions#merge_noun_chunks
"""
if not doc.has_annotation("DEP"):
return doc
with doc.retokenize() as retokenizer:
for np in doc.noun_chunks:
attrs = {"tag": np.root.tag, "dep": np.root.dep}
retokenizer.merge(np, attrs=attrs) # type: ignore[arg-type]
return doc
@Language.component(
"merge_entities",
requires=["doc.ents", "token.ent_iob", "token.ent_type"],
retokenizes=True,
)
def merge_entities(doc: Doc):
"""Merge entities into a single token.
doc (Doc): The Doc object.
RETURNS (Doc): The Doc object with merged entities.
DOCS: https://spacy.io/api/pipeline-functions#merge_entities
"""
with doc.retokenize() as retokenizer:
for ent in doc.ents:
attrs = {"tag": ent.root.tag, "dep": ent.root.dep, "ent_type": ent.label}
retokenizer.merge(ent, attrs=attrs) # type: ignore[arg-type]
return doc
@Language.component("merge_subtokens", requires=["token.dep"], retokenizes=True)
def merge_subtokens(doc: Doc, label: str = "subtok") -> Doc:
"""Merge subtokens into a single token.
doc (Doc): The Doc object.
label (str): The subtoken dependency label.
RETURNS (Doc): The Doc object with merged subtokens.
DOCS: https://spacy.io/api/pipeline-functions#merge_subtokens
"""
# TODO: make stateful component with "label" config
merger = Matcher(doc.vocab)
merger.add("SUBTOK", [[{"DEP": label, "op": "+"}]])
matches = merger(doc)
spans = util.filter_spans([doc[start : end + 1] for _, start, end in matches]) # type: ignore[misc, operator]
with doc.retokenize() as retokenizer:
for span in spans:
retokenizer.merge(span)
return doc
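For illustration, a rough usage sketch of the merge components above (assuming a trained English pipeline such as en_core_web_sm is installed, since they rely on parses and entities):

import spacy

nlp = spacy.load("en_core_web_sm")      # assumed to be installed
nlp.add_pipe("merge_entities")          # retokenizes: one token per entity
nlp.add_pipe("merge_noun_chunks")       # retokenizes: one token per noun chunk
doc = nlp("Apple is looking at buying a U.K. startup.")
print([t.text for t in doc])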
@Language.factory(
"token_splitter",
default_config={"min_length": 25, "split_length": 10},
retokenizes=True,
)
def make_token_splitter(
nlp: Language, name: str, *, min_length: int = 0, split_length: int = 0
):
return TokenSplitter(min_length=min_length, split_length=split_length)
class TokenSplitter:
def __init__(self, min_length: int = 0, split_length: int = 0):
self.min_length = min_length
self.split_length = split_length
def __call__(self, doc: Doc) -> Doc:
if self.min_length > 0 and self.split_length > 0:
with doc.retokenize() as retokenizer:
for t in doc:
if len(t.text) >= self.min_length:
orths = []
heads = []
attrs = {} # type: ignore[var-annotated]
for i in range(0, len(t.text), self.split_length):
orths.append(t.text[i : i + self.split_length])
heads.append((t, i / self.split_length))
retokenizer.split(t, orths, heads, attrs) # type: ignore[arg-type]
return doc
def _get_config(self) -> Dict[str, Any]:
return {
"min_length": self.min_length,
"split_length": self.split_length,
}
def _set_config(self, config: Dict[str, Any] = {}) -> None:
self.min_length = config.get("min_length", 0)
self.split_length = config.get("split_length", 0)
def to_bytes(self, **kwargs):
serializers = {
"cfg": lambda: srsly.json_dumps(self._get_config()),
}
return util.to_bytes(serializers, [])
def from_bytes(self, data, **kwargs):
deserializers = {
"cfg": lambda b: self._set_config(srsly.json_loads(b)),
}
util.from_bytes(data, deserializers, [])
return self
def to_disk(self, path, **kwargs):
path = util.ensure_path(path)
serializers = {
"cfg": lambda p: srsly.write_json(p, self._get_config()),
}
return util.to_disk(path, serializers, [])
    def from_disk(self, path, **kwargs):
        path = util.ensure_path(path)
        deserializers = {
            "cfg": lambda p: self._set_config(srsly.read_json(p)),
        }
        util.from_disk(path, deserializers, [])
        return self
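A minimal usage sketch of the token_splitter factory defined above; the lengths are illustrative:

import spacy

nlp = spacy.blank("en")
# split any token of 10+ characters into pieces of at most 5 characters
nlp.add_pipe("token_splitter", config={"min_length": 10, "split_length": 5})
doc = nlp("antidisestablishmentarianism is long")
print([t.text for t in doc])
# e.g. ['antid', 'isest', 'ablis', 'hment', 'arian', 'ism', 'is', 'long']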
# cython: infer_types=True, profile=True, binding=True
from typing import Optional
import numpy
from thinc.api import CosineDistance, to_categorical, Model, Config
from thinc.api import set_dropout_rate
from ..tokens.doc cimport Doc
from .trainable_pipe import TrainablePipe
from .tagger import Tagger
from ..training import validate_examples
from ..language import Language
from ._parser_internals import nonproj
from ..attrs import POS, ID
from ..errors import Errors
default_model_config = """
[model]
@architectures = "spacy.MultiTask.v1"
maxout_pieces = 3
token_vector_width = 96
[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v2"
pretrained_vectors = null
width = 96
depth = 4
embed_size = 2000
window_size = 1
maxout_pieces = 2
subword_features = true
"""
DEFAULT_MT_MODEL = Config().from_str(default_model_config)["model"]
@Language.factory(
"nn_labeller",
default_config={"labels": None, "target": "dep_tag_offset", "model": DEFAULT_MT_MODEL}
)
def make_nn_labeller(nlp: Language, name: str, model: Model, labels: Optional[dict], target: str):
    return MultitaskObjective(nlp.vocab, model, name, target=target)
class MultitaskObjective(Tagger):
"""Experimental: Assist training of a parser or tagger, by training a
side-objective.
"""
def __init__(self, vocab, model, name="nn_labeller", *, target):
self.vocab = vocab
self.model = model
self.name = name
if target == "dep":
self.make_label = self.make_dep
elif target == "tag":
self.make_label = self.make_tag
elif target == "ent":
self.make_label = self.make_ent
elif target == "dep_tag_offset":
self.make_label = self.make_dep_tag_offset
elif target == "ent_tag":
self.make_label = self.make_ent_tag
elif target == "sent_start":
self.make_label = self.make_sent_start
elif hasattr(target, "__call__"):
self.make_label = target
else:
raise ValueError(Errors.E016)
cfg = {"labels": {}, "target": target}
self.cfg = dict(cfg)
@property
def labels(self):
return self.cfg.setdefault("labels", {})
@labels.setter
def labels(self, value):
self.cfg["labels"] = value
def set_annotations(self, docs, dep_ids):
pass
def initialize(self, get_examples, nlp=None, labels=None):
if not hasattr(get_examples, "__call__"):
err = Errors.E930.format(name="MultitaskObjective", obj=type(get_examples))
raise ValueError(err)
if labels is not None:
self.labels = labels
else:
for example in get_examples():
for token in example.y:
label = self.make_label(token)
if label is not None and label not in self.labels:
self.labels[label] = len(self.labels)
self.model.initialize() # TODO: fix initialization by defining X and Y
def predict(self, docs):
tokvecs = self.model.get_ref("tok2vec")(docs)
scores = self.model.get_ref("softmax")(tokvecs)
return tokvecs, scores
def get_loss(self, examples, scores):
cdef int idx = 0
correct = numpy.zeros((scores.shape[0],), dtype="i")
guesses = scores.argmax(axis=1)
docs = [eg.predicted for eg in examples]
for i, eg in enumerate(examples):
# Handles alignment for tokenization differences
doc_annots = eg.get_aligned() # TODO
for j in range(len(eg.predicted)):
                tok_annots = {key: values[j] for key, values in doc_annots.items()}
label = self.make_label(j, tok_annots)
if label is None or label not in self.labels:
correct[idx] = guesses[idx]
else:
correct[idx] = self.labels[label]
idx += 1
correct = self.model.ops.xp.array(correct, dtype="i")
d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
loss = (d_scores**2).sum()
return float(loss), d_scores
@staticmethod
def make_dep(token):
return token.dep_
@staticmethod
def make_tag(token):
return token.tag_
@staticmethod
def make_ent(token):
if token.ent_iob_ == "O":
return "O"
else:
return token.ent_iob_ + "-" + token.ent_type_
@staticmethod
def make_dep_tag_offset(token):
dep = token.dep_
tag = token.tag_
offset = token.head.i - token.i
offset = min(offset, 2)
offset = max(offset, -2)
return f"{dep}-{tag}:{offset}"
@staticmethod
def make_ent_tag(token):
if token.ent_iob_ == "O":
ent = "O"
else:
ent = token.ent_iob_ + "-" + token.ent_type_
tag = token.tag_
return f"{tag}-{ent}"
@staticmethod
def make_sent_start(token):
"""A multi-task objective for representing sentence boundaries,
        using the BILU scheme. (O is impossible)
"""
if token.is_sent_start and token.is_sent_end:
return "U-SENT"
elif token.is_sent_start:
return "B-SENT"
else:
return "I-SENT"
class ClozeMultitask(TrainablePipe):
def __init__(self, vocab, model, **cfg):
self.vocab = vocab
self.model = model
self.cfg = cfg
self.distance = CosineDistance(ignore_zeros=True, normalize=False) # TODO: in config
def set_annotations(self, docs, dep_ids):
pass
def initialize(self, get_examples, nlp=None):
self.model.initialize() # TODO: fix initialization by defining X and Y
X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO")))
self.model.output_layer.initialize(X)
def predict(self, docs):
tokvecs = self.model.get_ref("tok2vec")(docs)
vectors = self.model.get_ref("output_layer")(tokvecs)
return tokvecs, vectors
def get_loss(self, examples, vectors, prediction):
validate_examples(examples, "ClozeMultitask.get_loss")
# The simplest way to implement this would be to vstack the
# token.vector values, but that's a bit inefficient, especially on GPU.
# Instead we fetch the index into the vectors table for each of our tokens,
# and look them up all at once. This prevents data copying.
ids = self.model.ops.flatten([eg.predicted.to_array(ID).ravel() for eg in examples])
target = vectors[ids]
gradient = self.distance.get_grad(prediction, target)
loss = self.distance.get_loss(prediction, target)
return float(loss), gradient
def update(self, examples, *, drop=0., sgd=None, losses=None):
pass
def rehearse(self, examples, drop=0., sgd=None, losses=None):
if losses is not None and self.name not in losses:
losses[self.name] = 0.
set_dropout_rate(self.model, drop)
validate_examples(examples, "ClozeMultitask.rehearse")
docs = [eg.predicted for eg in examples]
        predictions, bp_predictions = self.model.begin_update(docs)
loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions)
bp_predictions(d_predictions)
if sgd is not None:
self.finish_update(sgd)
if losses is not None:
losses[self.name] += loss
return losses
def add_label(self, label):
raise NotImplementedError
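For intuition, a small sketch of what the experimental dep_tag_offset objective produces per token, using a hand-constructed Doc (the words, tags and deps are made up) so no trained parser is needed:

import spacy
from spacy.tokens import Doc
from spacy.pipeline.multitask import MultitaskObjective

vocab = spacy.blank("en").vocab
doc = Doc(vocab, words=["She", "sleeps"], heads=[1, 1], deps=["nsubj", "ROOT"], tags=["PRP", "VBZ"])
for token in doc:
    # the head offset is clipped to [-2, 2] before being combined with dep and tag
    print(MultitaskObjective.make_dep_tag_offset(token))
# nsubj-PRP:1
# ROOT-VBZ:0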
cdef class Pipe:
cdef public str name
from pathlib import Path
from typing import Any, Callable, Dict, Iterable, Iterator, List
from typing import NoReturn, Optional, Tuple, Union
from ..tokens.doc import Doc
from ..training import Example
from ..language import Language
class Pipe:
def __call__(self, doc: Doc) -> Doc: ...
def pipe(
self, stream: Iterable[Doc], *, batch_size: int = ...
) -> Iterator[Doc]: ...
def initialize(
self,
get_examples: Callable[[], Iterable[Example]],
*,
nlp: Language = ...,
) -> None: ...
def score(
self, examples: Iterable[Example], **kwargs: Any
) -> Dict[str, Union[float, Dict[str, float]]]: ...
@property
def is_trainable(self) -> bool: ...
@property
def labels(self) -> Tuple[str, ...]: ...
@property
def label_data(self) -> Any: ...
def _require_labels(self) -> None: ...
def set_error_handler(
self, error_handler: Callable[[str, "Pipe", List[Doc], Exception], NoReturn]
) -> None: ...
def get_error_handler(
self,
) -> Callable[[str, "Pipe", List[Doc], Exception], NoReturn]: ...
def deserialize_config(path: Path) -> Any: ...
# cython: infer_types=True, profile=True
from typing import Optional, Tuple, Iterable, Iterator, Callable, Union, Dict
import srsly
import warnings
from ..tokens.doc cimport Doc
from ..training import Example
from ..errors import Errors, Warnings
from ..language import Language
from ..util import raise_error
cdef class Pipe:
"""This class is a base class and not instantiated directly. It provides
an interface for pipeline components to implement.
Trainable pipeline components like the EntityRecognizer or TextCategorizer
should inherit from the subclass 'TrainablePipe'.
DOCS: https://spacy.io/api/pipe
"""
@classmethod
def __init_subclass__(cls, **kwargs):
"""Raise a warning if an inheriting class implements 'begin_training'
(from v2) instead of the new 'initialize' method (from v3)"""
if hasattr(cls, "begin_training"):
warnings.warn(Warnings.W088.format(name=cls.__name__))
def __call__(self, Doc doc) -> Doc:
"""Apply the pipe to one document. The document is modified in place,
and returned. This usually happens under the hood when the nlp object
is called on a text and all components are applied to the Doc.
        doc (Doc): The Doc to process.
RETURNS (Doc): The processed Doc.
DOCS: https://spacy.io/api/pipe#call
"""
raise NotImplementedError(Errors.E931.format(parent="Pipe", method="__call__", name=self.name))
def pipe(self, stream: Iterable[Doc], *, batch_size: int=128) -> Iterator[Doc]:
"""Apply the pipe to a stream of documents. This usually happens under
the hood when the nlp object is called on a text and all components are
applied to the Doc.
stream (Iterable[Doc]): A stream of documents.
batch_size (int): The number of documents to buffer.
YIELDS (Doc): Processed documents in order.
DOCS: https://spacy.io/api/pipe#pipe
"""
error_handler = self.get_error_handler()
for doc in stream:
try:
doc = self(doc)
yield doc
except Exception as e:
error_handler(self.name, self, [doc], e)
def initialize(self, get_examples: Callable[[], Iterable[Example]], *, nlp: Language=None):
"""Initialize the pipe. For non-trainable components, this method
is optional. For trainable components, which should inherit
from the subclass TrainablePipe, the provided data examples
should be used to ensure that the internal model is initialized
properly and all input/output dimensions throughout the network are
inferred.
get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects.
nlp (Language): The current nlp object the component is part of.
DOCS: https://spacy.io/api/pipe#initialize
"""
pass
def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Union[float, Dict[str, float]]]:
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores.
DOCS: https://spacy.io/api/pipe#score
"""
if hasattr(self, "scorer") and self.scorer is not None:
scorer_kwargs = {}
# use default settings from cfg (e.g., threshold)
if hasattr(self, "cfg") and isinstance(self.cfg, dict):
scorer_kwargs.update(self.cfg)
# override self.cfg["labels"] with self.labels
if hasattr(self, "labels"):
scorer_kwargs["labels"] = self.labels
# override with kwargs settings
scorer_kwargs.update(kwargs)
return self.scorer(examples, **scorer_kwargs)
return {}
@property
def is_trainable(self) -> bool:
return False
@property
def labels(self) -> Tuple[str, ...]:
return tuple()
@property
def label_data(self):
"""Optional JSON-serializable data that would be sufficient to recreate
the label set if provided to the `pipe.initialize()` method.
"""
return None
def _require_labels(self) -> None:
"""Raise an error if this component has no labels defined."""
if not self.labels or list(self.labels) == [""]:
raise ValueError(Errors.E143.format(name=self.name))
def set_error_handler(self, error_handler: Callable) -> None:
"""Set an error handler function.
error_handler (Callable[[str, Callable[[Doc], Doc], List[Doc], Exception], None]):
Function that deals with a failing batch of documents. This callable function should take in
the component's name, the component itself, the offending batch of documents, and the exception
that was thrown.
DOCS: https://spacy.io/api/pipe#set_error_handler
"""
self.error_handler = error_handler
def get_error_handler(self) -> Callable:
"""Retrieve the error handler function.
        RETURNS (Callable): The error handler, or a default function that simply reraises the error if none is set.
DOCS: https://spacy.io/api/pipe#get_error_handler
"""
if hasattr(self, "error_handler"):
return self.error_handler
return raise_error
def deserialize_config(path):
if path.exists():
return srsly.read_json(path)
else:
return {}
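To illustrate the interface, a toy subclass of Pipe (names are illustrative; simple stateless components would normally be registered with @Language.component instead):

import spacy
from spacy.language import Language
from spacy.pipeline import Pipe
from spacy.tokens import Doc

class TokenCounter(Pipe):
    """Toy component: stores the token count in a custom Doc extension."""
    def __init__(self, nlp, name="token_counter"):
        self.name = name

    def __call__(self, doc: Doc) -> Doc:
        doc._.n_tokens = len(doc)
        return doc

@Language.factory("token_counter")
def make_token_counter(nlp, name):
    return TokenCounter(nlp, name)

Doc.set_extension("n_tokens", default=None)
nlp = spacy.blank("en")
nlp.add_pipe("token_counter")
print(nlp("Hello there world")._.n_tokens)   # 3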
# cython: infer_types=True, profile=True, binding=True
from typing import Optional, List, Callable
import srsly
from ..tokens.doc cimport Doc
from .pipe import Pipe
from .senter import senter_score
from ..language import Language
from ..scorer import Scorer
from .. import util
# see #9050
BACKWARD_OVERWRITE = False
@Language.factory(
"sentencizer",
assigns=["token.is_sent_start", "doc.sents"],
default_config={"punct_chars": None, "overwrite": False, "scorer": {"@scorers": "spacy.senter_scorer.v1"}},
default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
)
def make_sentencizer(
nlp: Language,
name: str,
punct_chars: Optional[List[str]],
overwrite: bool,
scorer: Optional[Callable],
):
return Sentencizer(name, punct_chars=punct_chars, overwrite=overwrite, scorer=scorer)
class Sentencizer(Pipe):
"""Segment the Doc into sentences using a rule-based strategy.
DOCS: https://spacy.io/api/sentencizer
"""
default_punct_chars = ['!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹',
'।', '॥', '၊', '။', '።', '፧', '፨', '᙮', '᜵', '᜶', '᠃', '᠉', '᥄',
'᥅', '᪨', '᪩', '᪪', '᪫', '᭚', '᭛', '᭞', '᭟', '᰻', '᰼', '᱾', '᱿',
'‼', '‽', '⁇', '⁈', '⁉', '⸮', '⸼', '꓿', '꘎', '꘏', '꛳', '꛷', '꡶',
'꡷', '꣎', '꣏', '꤯', '꧈', '꧉', '꩝', '꩞', '꩟', '꫰', '꫱', '꯫', '﹒',
'﹖', '﹗', '!', '.', '?', '𐩖', '𐩗', '𑁇', '𑁈', '𑂾', '𑂿', '𑃀',
'𑃁', '𑅁', '𑅂', '𑅃', '𑇅', '𑇆', '𑇍', '𑇞', '𑇟', '𑈸', '𑈹', '𑈻', '𑈼',
'𑊩', '𑑋', '𑑌', '𑗂', '𑗃', '𑗉', '𑗊', '𑗋', '𑗌', '𑗍', '𑗎', '𑗏', '𑗐',
'𑗑', '𑗒', '𑗓', '𑗔', '𑗕', '𑗖', '𑗗', '𑙁', '𑙂', '𑜼', '𑜽', '𑜾', '𑩂',
'𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈',
'。', '。']
def __init__(
self,
name="sentencizer",
*,
punct_chars=None,
overwrite=BACKWARD_OVERWRITE,
scorer=senter_score,
):
"""Initialize the sentencizer.
punct_chars (list): Punctuation characters to split on. Will be
serialized with the nlp object.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_spans for the attribute "sents".
DOCS: https://spacy.io/api/sentencizer#init
"""
self.name = name
if punct_chars:
self.punct_chars = set(punct_chars)
else:
self.punct_chars = set(self.default_punct_chars)
self.overwrite = overwrite
self.scorer = scorer
def __call__(self, doc):
"""Apply the sentencizer to a Doc and set Token.is_sent_start.
doc (Doc): The document to process.
RETURNS (Doc): The processed Doc.
DOCS: https://spacy.io/api/sentencizer#call
"""
error_handler = self.get_error_handler()
try:
tags = self.predict([doc])
self.set_annotations([doc], tags)
return doc
except Exception as e:
error_handler(self.name, self, [doc], e)
def predict(self, docs):
"""Apply the pipe to a batch of docs, without modifying them.
docs (Iterable[Doc]): The documents to predict.
RETURNS: The predictions for each document.
"""
if not any(len(doc) for doc in docs):
# Handle cases where there are no tokens in any docs.
guesses = [[] for doc in docs]
return guesses
guesses = []
for doc in docs:
doc_guesses = [False] * len(doc)
if len(doc) > 0:
start = 0
seen_period = False
doc_guesses[0] = True
for i, token in enumerate(doc):
is_in_punct_chars = token.text in self.punct_chars
if seen_period and not token.is_punct and not is_in_punct_chars:
doc_guesses[start] = True
start = token.i
seen_period = False
elif is_in_punct_chars:
seen_period = True
if start < len(doc):
doc_guesses[start] = True
guesses.append(doc_guesses)
return guesses
def set_annotations(self, docs, batch_tag_ids):
"""Modify a batch of documents, using pre-computed scores.
docs (Iterable[Doc]): The documents to modify.
        batch_tag_ids: The boolean guesses produced by Sentencizer.predict.
"""
if isinstance(docs, Doc):
docs = [docs]
cdef Doc doc
cdef int idx = 0
for i, doc in enumerate(docs):
doc_tag_ids = batch_tag_ids[i]
for j, tag_id in enumerate(doc_tag_ids):
if doc.c[j].sent_start == 0 or self.overwrite:
if tag_id:
doc.c[j].sent_start = 1
else:
doc.c[j].sent_start = -1
def to_bytes(self, *, exclude=tuple()):
"""Serialize the sentencizer to a bytestring.
RETURNS (bytes): The serialized object.
DOCS: https://spacy.io/api/sentencizer#to_bytes
"""
return srsly.msgpack_dumps({"punct_chars": list(self.punct_chars), "overwrite": self.overwrite})
def from_bytes(self, bytes_data, *, exclude=tuple()):
"""Load the sentencizer from a bytestring.
bytes_data (bytes): The data to load.
        RETURNS (Sentencizer): The loaded object.
DOCS: https://spacy.io/api/sentencizer#from_bytes
"""
cfg = srsly.msgpack_loads(bytes_data)
self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars))
self.overwrite = cfg.get("overwrite", self.overwrite)
return self
def to_disk(self, path, *, exclude=tuple()):
"""Serialize the sentencizer to disk.
DOCS: https://spacy.io/api/sentencizer#to_disk
"""
path = util.ensure_path(path)
path = path.with_suffix(".json")
srsly.write_json(path, {"punct_chars": list(self.punct_chars), "overwrite": self.overwrite})
def from_disk(self, path, *, exclude=tuple()):
"""Load the sentencizer from disk.
DOCS: https://spacy.io/api/sentencizer#from_disk
"""
path = util.ensure_path(path)
path = path.with_suffix(".json")
cfg = srsly.read_json(path)
self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars))
self.overwrite = cfg.get("overwrite", self.overwrite)
return self
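A brief usage sketch of the sentencizer with a custom punctuation set (the characters here are chosen purely for illustration):

import spacy

nlp = spacy.blank("en")
# split sentences on '.', '!', '?' and the pipe character
nlp.add_pipe("sentencizer", config={"punct_chars": [".", "!", "?", "|"]})
doc = nlp("First part | second part. Third part!")
print([sent.text for sent in doc.sents])
# ['First part |', 'second part.', 'Third part!']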
# cython: infer_types=True, profile=True, binding=True
from itertools import islice
from typing import Optional, Callable
import srsly
from thinc.api import Model, SequenceCategoricalCrossentropy, Config
from ..tokens.doc cimport Doc
from .tagger import Tagger
from ..language import Language
from ..errors import Errors
from ..scorer import Scorer
from ..training import validate_examples, validate_get_examples
from ..util import registry
from .. import util
# See #9050
BACKWARD_OVERWRITE = False
default_model_config = """
[model]
@architectures = "spacy.Tagger.v1"
[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v2"
pretrained_vectors = null
width = 12
depth = 1
embed_size = 2000
window_size = 1
maxout_pieces = 2
subword_features = true
"""
DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"]
@Language.factory(
"senter",
assigns=["token.is_sent_start"],
default_config={"model": DEFAULT_SENTER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.senter_scorer.v1"}},
default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
)
def make_senter(nlp: Language, name: str, model: Model, overwrite: bool, scorer: Optional[Callable]):
return SentenceRecognizer(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer)
def senter_score(examples, **kwargs):
def has_sents(doc):
return doc.has_annotation("SENT_START")
results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
del results["sents_per_type"]
return results
@registry.scorers("spacy.senter_scorer.v1")
def make_senter_scorer():
return senter_score
class SentenceRecognizer(Tagger):
"""Pipeline component for sentence segmentation.
DOCS: https://spacy.io/api/sentencerecognizer
"""
def __init__(
self,
vocab,
model,
name="senter",
*,
overwrite=BACKWARD_OVERWRITE,
scorer=senter_score,
):
"""Initialize a sentence recognizer.
vocab (Vocab): The shared vocabulary.
model (thinc.api.Model): The Thinc Model powering the pipeline component.
name (str): The component instance name, used to add entries to the
losses during training.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_spans for the attribute "sents".
DOCS: https://spacy.io/api/sentencerecognizer#init
"""
self.vocab = vocab
self.model = model
self.name = name
self._rehearsal_model = None
self.cfg = {"overwrite": overwrite}
self.scorer = scorer
@property
def labels(self):
"""RETURNS (Tuple[str]): The labels."""
# labels are numbered by index internally, so this matches GoldParse
# and Example where the sentence-initial tag is 1 and other positions
# are 0
return tuple(["I", "S"])
@property
def label_data(self):
return None
def set_annotations(self, docs, batch_tag_ids):
"""Modify a batch of documents, using pre-computed scores.
docs (Iterable[Doc]): The documents to modify.
batch_tag_ids: The IDs to set, produced by SentenceRecognizer.predict.
DOCS: https://spacy.io/api/sentencerecognizer#set_annotations
"""
if isinstance(docs, Doc):
docs = [docs]
cdef Doc doc
cdef bint overwrite = self.cfg["overwrite"]
for i, doc in enumerate(docs):
doc_tag_ids = batch_tag_ids[i]
if hasattr(doc_tag_ids, "get"):
doc_tag_ids = doc_tag_ids.get()
for j, tag_id in enumerate(doc_tag_ids):
if doc.c[j].sent_start == 0 or overwrite:
if tag_id == 1:
doc.c[j].sent_start = 1
else:
doc.c[j].sent_start = -1
def get_loss(self, examples, scores):
"""Find the loss and gradient of loss for the batch of documents and
their predicted scores.
        examples (Iterable[Example]): The batch of examples.
scores: Scores representing the model's predictions.
RETURNS (Tuple[float, float]): The loss and the gradient.
DOCS: https://spacy.io/api/sentencerecognizer#get_loss
"""
validate_examples(examples, "SentenceRecognizer.get_loss")
labels = self.labels
loss_func = SequenceCategoricalCrossentropy(names=labels, normalize=False)
truths = []
for eg in examples:
eg_truth = []
for x in eg.get_aligned("SENT_START"):
if x is None:
eg_truth.append(None)
elif x == 1:
eg_truth.append(labels[1])
else:
                    # anything other than 1: 0, -1, or -1 cast to uint64
eg_truth.append(labels[0])
truths.append(eg_truth)
d_scores, loss = loss_func(scores, truths)
if self.model.ops.xp.isnan(loss):
raise ValueError(Errors.E910.format(name=self.name))
return float(loss), d_scores
def initialize(self, get_examples, *, nlp=None):
"""Initialize the pipe for training, using a representative set
of data examples.
get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects.
nlp (Language): The current nlp object the component is part of.
DOCS: https://spacy.io/api/sentencerecognizer#initialize
"""
validate_get_examples(get_examples, "SentenceRecognizer.initialize")
util.check_lexeme_norms(self.vocab, "senter")
doc_sample = []
label_sample = []
assert self.labels, Errors.E924.format(name=self.name)
for example in islice(get_examples(), 10):
doc_sample.append(example.x)
gold_tags = example.get_aligned("SENT_START")
gold_array = [[1.0 if tag == gold_tag else 0.0 for tag in self.labels] for gold_tag in gold_tags]
label_sample.append(self.model.ops.asarray(gold_array, dtype="float32"))
assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
assert len(label_sample) > 0, Errors.E923.format(name=self.name)
self.model.initialize(X=doc_sample, Y=label_sample)
def add_label(self, label, values=None):
raise NotImplementedError
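Unlike the rule-based sentencizer, the senter predicts token.is_sent_start statistically. A usage sketch, assuming the small English pipeline (which ships the senter disabled by default) is installed:

import spacy

nlp = spacy.load("en_core_web_sm", exclude=["parser"])
nlp.enable_pipe("senter")     # use the trained senter instead of the parser
doc = nlp("This is a sentence. This is another one.")
print([sent.text for sent in doc.sents])
# e.g. ['This is a sentence.', 'This is another one.']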
from itertools import islice
from typing import Iterable, Optional, Dict, List, Callable, Any
from thinc.api import Model, Config
from thinc.types import Floats2d
from ..language import Language
from ..training import Example, validate_get_examples
from ..errors import Errors
from ..scorer import Scorer
from ..tokens import Doc
from ..util import registry
from ..vocab import Vocab
from .textcat import TextCategorizer
multi_label_default_config = """
[model]
@architectures = "spacy.TextCatEnsemble.v2"
[model.tok2vec]
@architectures = "spacy.Tok2Vec.v1"
[model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = 64
rows = [2000, 2000, 1000, 1000, 1000, 1000]
attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
include_static_vectors = false
[model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
width = ${model.tok2vec.embed.width}
window_size = 1
maxout_pieces = 3
depth = 2
[model.linear_model]
@architectures = "spacy.TextCatBOW.v2"
exclusive_classes = false
ngram_size = 1
no_output_layer = false
"""
DEFAULT_MULTI_TEXTCAT_MODEL = Config().from_str(multi_label_default_config)["model"]
multi_label_bow_config = """
[model]
@architectures = "spacy.TextCatBOW.v2"
exclusive_classes = false
ngram_size = 1
no_output_layer = false
"""
multi_label_cnn_config = """
[model]
@architectures = "spacy.TextCatCNN.v2"
exclusive_classes = false
[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v2"
pretrained_vectors = null
width = 96
depth = 4
embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
"""
@Language.factory(
"textcat_multilabel",
assigns=["doc.cats"],
default_config={
"threshold": 0.5,
"model": DEFAULT_MULTI_TEXTCAT_MODEL,
"scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v1"},
},
default_score_weights={
"cats_score": 1.0,
"cats_score_desc": None,
"cats_micro_p": None,
"cats_micro_r": None,
"cats_micro_f": None,
"cats_macro_p": None,
"cats_macro_r": None,
"cats_macro_f": None,
"cats_macro_auc": None,
"cats_f_per_type": None,
"cats_macro_auc_per_type": None,
},
)
def make_multilabel_textcat(
nlp: Language,
name: str,
model: Model[List[Doc], List[Floats2d]],
threshold: float,
scorer: Optional[Callable],
) -> "TextCategorizer":
"""Create a TextCategorizer component. The text categorizer predicts categories
over a whole document. It can learn one or more labels, and the labels are considered
to be non-mutually exclusive, which means that there can be zero or more labels
per doc).
model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
scores for each category.
threshold (float): Cutoff to consider a prediction "positive".
"""
return MultiLabel_TextCategorizer(
nlp.vocab, model, name, threshold=threshold, scorer=scorer
)
def textcat_multilabel_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
return Scorer.score_cats(
examples,
"cats",
multi_label=True,
**kwargs,
)
@registry.scorers("spacy.textcat_multilabel_scorer.v1")
def make_textcat_multilabel_scorer():
return textcat_multilabel_score
class MultiLabel_TextCategorizer(TextCategorizer):
"""Pipeline component for multi-label text classification.
DOCS: https://spacy.io/api/textcategorizer
"""
def __init__(
self,
vocab: Vocab,
model: Model,
name: str = "textcat_multilabel",
*,
threshold: float,
scorer: Optional[Callable] = textcat_multilabel_score,
) -> None:
"""Initialize a text categorizer for multi-label classification.
vocab (Vocab): The shared vocabulary.
model (thinc.api.Model): The Thinc Model powering the pipeline component.
name (str): The component instance name, used to add entries to the
losses during training.
threshold (float): Cutoff to consider a prediction "positive".
DOCS: https://spacy.io/api/textcategorizer#init
"""
self.vocab = vocab
self.model = model
self.name = name
self._rehearsal_model = None
cfg = {"labels": [], "threshold": threshold}
self.cfg = dict(cfg)
self.scorer = scorer
def initialize( # type: ignore[override]
self,
get_examples: Callable[[], Iterable[Example]],
*,
nlp: Optional[Language] = None,
labels: Optional[Iterable[str]] = None,
):
"""Initialize the pipe for training, using a representative set
of data examples.
get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects.
nlp (Language): The current nlp object the component is part of.
labels: The labels to add to the component, typically generated by the
`init labels` command. If no labels are provided, the get_examples
callback is used to extract the labels from the data.
DOCS: https://spacy.io/api/textcategorizer#initialize
"""
validate_get_examples(get_examples, "MultiLabel_TextCategorizer.initialize")
if labels is None:
for example in get_examples():
for cat in example.y.cats:
self.add_label(cat)
else:
for label in labels:
self.add_label(label)
subbatch = list(islice(get_examples(), 10))
doc_sample = [eg.reference for eg in subbatch]
label_sample, _ = self._examples_to_truth(subbatch)
self._require_labels()
assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
assert len(label_sample) > 0, Errors.E923.format(name=self.name)
self.model.initialize(X=doc_sample, Y=label_sample)
def _validate_categories(self, examples: Iterable[Example]):
"""This component allows any type of single- or multi-label annotations.
        This method overrides the stricter one from 'textcat'."""
pass
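A minimal end-to-end sketch of the multi-label text categorizer (labels, texts and scores below are purely illustrative; real use needs proper training data):

import spacy
from spacy.training import Example

nlp = spacy.blank("en")
nlp.add_pipe("textcat_multilabel", config={"threshold": 0.5})
train_data = [
    ("cheap flights to Berlin", {"cats": {"TRAVEL": 1.0, "COOKING": 0.0}}),
    ("how to bake sourdough", {"cats": {"TRAVEL": 0.0, "COOKING": 1.0}}),
]
examples = [Example.from_dict(nlp.make_doc(text), ann) for text, ann in train_data]
nlp.initialize(get_examples=lambda: examples)
for _ in range(20):              # toy training loop
    nlp.update(examples)
doc = nlp("weekend trip ideas")
print(doc.cats)                  # e.g. {'TRAVEL': 0.87, 'COOKING': 0.12}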
from .pipe cimport Pipe
from ..vocab cimport Vocab
cdef class TrainablePipe(Pipe):
cdef public Vocab vocab
cdef public object model
cdef public object cfg
cdef public object scorer
from cymem.cymem cimport Pool
from ..vocab cimport Vocab
from .trainable_pipe cimport TrainablePipe
from ._parser_internals.transition_system cimport Transition, TransitionSystem
from ._parser_internals._state cimport StateC
from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC
cdef class Parser(TrainablePipe):
cdef public object _rehearsal_model
cdef readonly TransitionSystem moves
cdef public object _multitasks
cdef void _parseC(self, StateC** states,
WeightsC weights, SizesC sizes) nogil
cdef void c_transition_batch(self, StateC** states, const float* scores,
int nr_class, int batch_size) nogil