GITENV file updated 19

parent 327c3ee6

Too many changes to show.

To preserve performance only 1000 of 1000+ files are displayed.

from wasabi import msg
from ._util import app, setup_cli # noqa: F401
# These are the actual functions, NOT the wrapped CLI commands. The CLI commands
# are registered automatically and won't have to be imported here.
from .download import download # noqa: F401
from .info import info # noqa: F401
from .package import package # noqa: F401
from .profile import profile # noqa: F401
from .train import train_cli # noqa: F401
from .assemble import assemble_cli # noqa: F401
from .pretrain import pretrain # noqa: F401
from .debug_data import debug_data # noqa: F401
from .debug_config import debug_config # noqa: F401
from .debug_model import debug_model # noqa: F401
from .evaluate import evaluate # noqa: F401
from .convert import convert # noqa: F401
from .init_pipeline import init_pipeline_cli # noqa: F401
from .init_config import init_config, fill_config # noqa: F401
from .validate import validate # noqa: F401
from .project.clone import project_clone # noqa: F401
from .project.assets import project_assets # noqa: F401
from .project.run import project_run # noqa: F401
from .project.dvc import project_update_dvc # noqa: F401
from .project.push import project_push # noqa: F401
from .project.pull import project_pull # noqa: F401
from .project.document import project_document # noqa: F401
@app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
def link(*args, **kwargs):
"""As of spaCy v3.0, symlinks like "en" are not supported anymore. You can load trained
pipeline packages using their full names or from a directory path."""
msg.warn(
"As of spaCy v3.0, model symlinks are not supported anymore. You can load trained "
"pipeline packages using their full names or from a directory path."
)
This diff is collapsed.
from typing import Optional
from pathlib import Path
from wasabi import msg
import typer
import logging
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import import_code
from .. import util
from ..util import get_sourced_components, load_model_from_config
@app.command(
"assemble",
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def assemble_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
output_path: Path = Arg(..., help="Output directory to store assembled pipeline in"),
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
# fmt: on
):
"""
Assemble a spaCy pipeline from a config file. The config file includes
all settings for initializing the pipeline. To override settings in the
config, e.g. settings that point to local paths or that you want to
experiment with, you can override them as command line options. The
--code argument lets you pass in a Python file that can be used to
register custom functions that are referenced in the config.
DOCS: https://spacy.io/api/cli#assemble
"""
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
# Make sure all files and paths exists if they are needed
if not config_path or (str(config_path) != "-" and not config_path.exists()):
msg.fail("Config file not found", config_path, exits=1)
overrides = parse_config_overrides(ctx.args)
import_code(code_path)
with show_validation_error(config_path):
config = util.load_config(config_path, overrides=overrides, interpolate=False)
msg.divider("Initializing pipeline")
nlp = load_model_from_config(config, auto_fill=True)
config = config.interpolate()
sourced = get_sourced_components(config)
# Make sure that listeners are defined before initializing further
nlp._link_components()
with nlp.select_pipes(disable=[*sourced]):
nlp.initialize()
msg.good("Initialized pipeline")
msg.divider("Serializing to disk")
if output_path is not None and not output_path.exists():
output_path.mkdir(parents=True)
msg.good(f"Created output directory: {output_path}")
nlp.to_disk(output_path)
This diff is collapsed.
from typing import Optional, Dict, Any, Union, List
from pathlib import Path
from wasabi import msg, table
from thinc.api import Config
from thinc.config import VARIABLE_RE
import typer
from ._util import Arg, Opt, show_validation_error, parse_config_overrides
from ._util import import_code, debug_cli
from ..schemas import ConfigSchemaInit, ConfigSchemaTraining
from ..util import registry
from .. import util
@debug_cli.command(
"config",
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def debug_config_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
show_funcs: bool = Opt(False, "--show-functions", "-F", help="Show an overview of all registered functions used in the config and where they come from (modules, files etc.)"),
show_vars: bool = Opt(False, "--show-variables", "-V", help="Show an overview of all variables referenced in the config and their values. This will also reflect variables overwritten on the CLI.")
# fmt: on
):
"""Debug a config.cfg file and show validation errors. The command will
create all objects in the tree and validate them. Note that some config
validation errors are blocking and will prevent the rest of the config from
being resolved. This means that you may not see all validation errors at
once and some issues are only shown once previous errors have been fixed.
Similar as with the 'train' command, you can override settings from the config
as command line options. For instance, --training.batch_size 128 overrides
the value of "batch_size" in the block "[training]".
DOCS: https://spacy.io/api/cli#debug-config
"""
overrides = parse_config_overrides(ctx.args)
import_code(code_path)
debug_config(
config_path, overrides=overrides, show_funcs=show_funcs, show_vars=show_vars
)
def debug_config(
config_path: Path,
*,
overrides: Dict[str, Any] = {},
show_funcs: bool = False,
show_vars: bool = False,
):
msg.divider("Config validation")
with show_validation_error(config_path):
config = util.load_config(config_path, overrides=overrides)
nlp = util.load_model_from_config(config)
config = nlp.config.interpolate()
msg.divider("Config validation for [initialize]")
with show_validation_error(config_path):
T = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
msg.divider("Config validation for [training]")
with show_validation_error(config_path):
T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
dot_names = [T["train_corpus"], T["dev_corpus"]]
util.resolve_dot_names(config, dot_names)
msg.good("Config is valid")
if show_vars:
variables = get_variables(config)
msg.divider(f"Variables ({len(variables)})")
head = ("Variable", "Value")
msg.table(variables, header=head, divider=True, widths=(41, 34), spacing=2)
if show_funcs:
funcs = get_registered_funcs(config)
msg.divider(f"Registered functions ({len(funcs)})")
for func in funcs:
func_data = {
"Registry": f"@{func['registry']}",
"Name": func["name"],
"Module": func["module"],
"File": f"{func['file']} (line {func['line_no']})",
}
msg.info(f"[{func['path']}]")
print(table(func_data).strip())
def get_registered_funcs(config: Config) -> List[Dict[str, Optional[Union[str, int]]]]:
result = []
for key, value in util.walk_dict(config):
if not key[-1].startswith("@"):
continue
# We have a reference to a registered function
reg_name = key[-1][1:]
registry = getattr(util.registry, reg_name)
path = ".".join(key[:-1])
info = registry.find(value)
result.append({"name": value, "registry": reg_name, "path": path, **info})
return result
def get_variables(config: Config) -> Dict[str, Any]:
result = {}
for variable in sorted(set(VARIABLE_RE.findall(config.to_str()))):
path = variable[2:-1].replace(":", ".")
value = util.dot_to_object(config, path)
result[variable] = repr(value)
return result
This diff is collapsed.
from typing import Dict, Any, Optional
from pathlib import Path
import itertools
from spacy.training import Example
from spacy.util import resolve_dot_names
from wasabi import msg
from thinc.api import fix_random_seed, set_dropout_rate
from thinc.api import Model, data_validation, set_gpu_allocator
import typer
from ._util import Arg, Opt, debug_cli, show_validation_error
from ._util import parse_config_overrides, string_to_list, setup_gpu
from ..schemas import ConfigSchemaTraining
from ..util import registry
from .. import util
@debug_cli.command(
"model",
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def debug_model_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
component: str = Arg(..., help="Name of the pipeline component of which the model should be analysed"),
layers: str = Opt("", "--layers", "-l", help="Comma-separated names of layer IDs to print"),
dimensions: bool = Opt(False, "--dimensions", "-DIM", help="Show dimensions"),
parameters: bool = Opt(False, "--parameters", "-PAR", help="Show parameters"),
gradients: bool = Opt(False, "--gradients", "-GRAD", help="Show gradients"),
attributes: bool = Opt(False, "--attributes", "-ATTR", help="Show attributes"),
P0: bool = Opt(False, "--print-step0", "-P0", help="Print model before training"),
P1: bool = Opt(False, "--print-step1", "-P1", help="Print model after initialization"),
P2: bool = Opt(False, "--print-step2", "-P2", help="Print model after training"),
P3: bool = Opt(False, "--print-step3", "-P3", help="Print final predictions"),
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
# fmt: on
):
"""
Analyze a Thinc model implementation. Includes checks for internal structure
and activations during training.
DOCS: https://spacy.io/api/cli#debug-model
"""
setup_gpu(use_gpu)
layers = string_to_list(layers, intify=True)
print_settings = {
"dimensions": dimensions,
"parameters": parameters,
"gradients": gradients,
"attributes": attributes,
"layers": layers,
"print_before_training": P0,
"print_after_init": P1,
"print_after_training": P2,
"print_prediction": P3,
}
config_overrides = parse_config_overrides(ctx.args)
with show_validation_error(config_path):
raw_config = util.load_config(
config_path, overrides=config_overrides, interpolate=False
)
config = raw_config.interpolate()
allocator = config["training"]["gpu_allocator"]
if use_gpu >= 0 and allocator:
set_gpu_allocator(allocator)
with show_validation_error(config_path):
nlp = util.load_model_from_config(raw_config)
config = nlp.config.interpolate()
T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
seed = T["seed"]
if seed is not None:
msg.info(f"Fixing random seed: {seed}")
fix_random_seed(seed)
pipe = nlp.get_pipe(component)
debug_model(config, T, nlp, pipe, print_settings=print_settings)
def debug_model(
config,
resolved_train_config,
nlp,
pipe,
*,
print_settings: Optional[Dict[str, Any]] = None,
):
if not hasattr(pipe, "model"):
msg.fail(
f"The component '{pipe}' does not specify an object that holds a Model.",
exits=1,
)
model = pipe.model
if not isinstance(model, Model):
msg.fail(
f"Requires a Thinc Model to be analysed, but found {type(model)} instead.",
exits=1,
)
if print_settings is None:
print_settings = {}
# STEP 0: Printing before training
msg.info(f"Analysing model with ID {model.id}")
if print_settings.get("print_before_training"):
msg.divider(f"STEP 0 - before training")
_print_model(model, print_settings)
# STEP 1: Initializing the model and printing again
with data_validation(False):
try:
dot_names = [resolved_train_config["train_corpus"]]
with show_validation_error():
(train_corpus,) = resolve_dot_names(config, dot_names)
nlp.initialize(lambda: train_corpus(nlp))
msg.info("Initialized the model with the training corpus.")
examples = list(itertools.islice(train_corpus(nlp), 5))
except ValueError:
try:
_set_output_dim(nO=7, model=model)
with show_validation_error():
examples = [Example.from_dict(x, {}) for x in _get_docs()]
nlp.initialize(lambda: examples)
msg.info("Initialized the model with dummy data.")
except Exception:
msg.fail(
"Could not initialize the model: you'll have to provide a valid 'train_corpus' argument in the config file.",
exits=1,
)
if print_settings.get("print_after_init"):
msg.divider(f"STEP 1 - after initialization")
_print_model(model, print_settings)
# STEP 2: Updating the model and printing again
set_dropout_rate(model, 0.2)
# ugly hack to deal with Tok2Vec/Transformer listeners
upstream_component = None
if model.has_ref("tok2vec") and "tok2vec-listener" in model.get_ref("tok2vec").name:
upstream_component = nlp.get_pipe("tok2vec")
if (
model.has_ref("tok2vec")
and "transformer-listener" in model.get_ref("tok2vec").name
):
upstream_component = nlp.get_pipe("transformer")
for e in range(3):
if upstream_component:
upstream_component.update(examples)
pipe.update(examples)
if print_settings.get("print_after_training"):
msg.divider(f"STEP 2 - after training")
_print_model(model, print_settings)
# STEP 3: the final prediction
prediction = model.predict([ex.predicted for ex in examples])
if print_settings.get("print_prediction"):
msg.divider(f"STEP 3 - prediction")
msg.info(str(prediction))
msg.good(f"Succesfully ended analysis - model looks good.")
def _sentences():
return [
"Apple is looking at buying U.K. startup for $1 billion",
"Autonomous cars shift insurance liability toward manufacturers",
"San Francisco considers banning sidewalk delivery robots",
"London is a big city in the United Kingdom.",
]
def _get_docs(lang: str = "en"):
nlp = util.get_lang_class(lang)()
return list(nlp.pipe(_sentences()))
def _set_output_dim(model, nO):
# simulating dim inference by directly setting the nO argument of the model
if model.has_dim("nO") is None:
model.set_dim("nO", nO)
if model.has_ref("output_layer"):
if model.get_ref("output_layer").has_dim("nO") is None:
model.get_ref("output_layer").set_dim("nO", nO)
def _print_model(model, print_settings):
layers = print_settings.get("layers", "")
parameters = print_settings.get("parameters", False)
dimensions = print_settings.get("dimensions", False)
gradients = print_settings.get("gradients", False)
attributes = print_settings.get("attributes", False)
for i, node in enumerate(model.walk()):
if not layers or i in layers:
msg.info(f"Layer {i}: model ID {node.id}: '{node.name}'")
if dimensions:
for name in node.dim_names:
msg.info(f" - dim {name}: {node.maybe_get_dim(name)}")
if parameters:
for name in node.param_names:
if node.has_param(name):
print_value = _print_matrix(node.get_param(name))
msg.info(f" - param {name}: {print_value}")
else:
msg.info(f" - param {name}: {node.has_param(name)}")
if gradients:
for name in node.param_names:
if node.has_grad(name):
print_value = _print_matrix(node.get_grad(name))
msg.info(f" - grad {name}: {print_value}")
else:
msg.info(f" - grad {name}: {node.has_grad(name)}")
if attributes:
attrs = node.attrs
for name, value in attrs.items():
msg.info(f" - attr {name}: {value}")
def _print_matrix(value):
if value is None or isinstance(value, bool):
return value
result = str(value.shape) + " - sample: "
sample_matrix = value
for d in range(value.ndim - 1):
sample_matrix = sample_matrix[0]
sample_matrix = sample_matrix[0:5]
result = result + str(sample_matrix)
return result
from typing import Optional, Sequence
import requests
import sys
from wasabi import msg
import typer
from ._util import app, Arg, Opt, WHEEL_SUFFIX, SDIST_SUFFIX
from .. import about
from ..util import is_package, get_minor_version, run_command
from ..errors import OLD_MODEL_SHORTCUTS
@app.command(
"download",
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def download_cli(
# fmt: off
ctx: typer.Context,
model: str = Arg(..., help="Name of pipeline package to download"),
direct: bool = Opt(False, "--direct", "-d", "-D", help="Force direct download of name + version"),
sdist: bool = Opt(False, "--sdist", "-S", help="Download sdist (.tar.gz) archive instead of pre-built binary wheel")
# fmt: on
):
"""
Download compatible trained pipeline from the default download path using
pip. If --direct flag is set, the command expects the full package name with
version. For direct downloads, the compatibility check will be skipped. All
additional arguments provided to this command will be passed to `pip install`
on package installation.
DOCS: https://spacy.io/api/cli#download
AVAILABLE PACKAGES: https://spacy.io/models
"""
download(model, direct, sdist, *ctx.args)
def download(model: str, direct: bool = False, sdist: bool = False, *pip_args) -> None:
if (
not (is_package("spacy") or is_package("spacy-nightly"))
and "--no-deps" not in pip_args
):
msg.warn(
"Skipping pipeline package dependencies and setting `--no-deps`. "
"You don't seem to have the spaCy package itself installed "
"(maybe because you've built from source?), so installing the "
"package dependencies would cause spaCy to be downloaded, which "
"probably isn't what you want. If the pipeline package has other "
"dependencies, you'll have to install them manually."
)
pip_args = pip_args + ("--no-deps",)
suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX
dl_tpl = "{m}-{v}/{m}-{v}{s}#egg={m}=={v}"
if direct:
components = model.split("-")
model_name = "".join(components[:-1])
version = components[-1]
download_model(dl_tpl.format(m=model_name, v=version, s=suffix), pip_args)
else:
model_name = model
if model in OLD_MODEL_SHORTCUTS:
msg.warn(
f"As of spaCy v3.0, shortcuts like '{model}' are deprecated. Please "
f"use the full pipeline package name '{OLD_MODEL_SHORTCUTS[model]}' instead."
)
model_name = OLD_MODEL_SHORTCUTS[model]
compatibility = get_compatibility()
version = get_version(model_name, compatibility)
download_model(dl_tpl.format(m=model_name, v=version, s=suffix), pip_args)
msg.good(
"Download and installation successful",
f"You can now load the package via spacy.load('{model_name}')",
)
def get_compatibility() -> dict:
version = get_minor_version(about.__version__)
r = requests.get(about.__compatibility__)
if r.status_code != 200:
msg.fail(
f"Server error ({r.status_code})",
f"Couldn't fetch compatibility table. Please find a package for your spaCy "
f"installation (v{about.__version__}), and download it manually. "
f"For more details, see the documentation: "
f"https://spacy.io/usage/models",
exits=1,
)
comp_table = r.json()
comp = comp_table["spacy"]
if version not in comp:
msg.fail(f"No compatible packages found for v{version} of spaCy", exits=1)
return comp[version]
def get_version(model: str, comp: dict) -> str:
if model not in comp:
msg.fail(
f"No compatible package found for '{model}' (spaCy v{about.__version__})",
exits=1,
)
return comp[model][0]
def download_model(
filename: str, user_pip_args: Optional[Sequence[str]] = None
) -> None:
download_url = about.__download_url__ + "/" + filename
pip_args = list(user_pip_args) if user_pip_args is not None else []
cmd = [sys.executable, "-m", "pip", "install"] + pip_args + [download_url]
run_command(cmd)
from typing import Optional, List, Dict, Any, Union
from wasabi import Printer
from pathlib import Path
import re
import srsly
from thinc.api import fix_random_seed
from ..training import Corpus
from ..tokens import Doc
from ._util import app, Arg, Opt, setup_gpu, import_code
from ..scorer import Scorer
from .. import util
from .. import displacy
@app.command("evaluate")
def evaluate_cli(
# fmt: off
model: str = Arg(..., help="Model name or path"),
data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True),
output: Optional[Path] = Opt(None, "--output", "-o", help="Output JSON file for metrics", dir_okay=False),
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"),
# fmt: on
):
"""
Evaluate a trained pipeline. Expects a loadable spaCy pipeline and evaluation
data in the binary .spacy format. The --gold-preproc option sets up the
evaluation examples with gold-standard sentences and tokens for the
predictions. Gold preprocessing helps the annotations align to the
tokenization, and may result in sequences of more consistent length. However,
it may reduce runtime accuracy due to train/test skew. To render a sample of
dependency parses in a HTML file, set as output directory as the
displacy_path argument.
DOCS: https://spacy.io/api/cli#evaluate
"""
import_code(code_path)
evaluate(
model,
data_path,
output=output,
use_gpu=use_gpu,
gold_preproc=gold_preproc,
displacy_path=displacy_path,
displacy_limit=displacy_limit,
silent=False,
)
def evaluate(
model: str,
data_path: Path,
output: Optional[Path] = None,
use_gpu: int = -1,
gold_preproc: bool = False,
displacy_path: Optional[Path] = None,
displacy_limit: int = 25,
silent: bool = True,
spans_key: str = "sc",
) -> Dict[str, Any]:
msg = Printer(no_print=silent, pretty=not silent)
fix_random_seed()
setup_gpu(use_gpu, silent=silent)
data_path = util.ensure_path(data_path)
output_path = util.ensure_path(output)
displacy_path = util.ensure_path(displacy_path)
if not data_path.exists():
msg.fail("Evaluation data not found", data_path, exits=1)
if displacy_path and not displacy_path.exists():
msg.fail("Visualization output directory not found", displacy_path, exits=1)
corpus = Corpus(data_path, gold_preproc=gold_preproc)
nlp = util.load_model(model)
dev_dataset = list(corpus(nlp))
scores = nlp.evaluate(dev_dataset)
metrics = {
"TOK": "token_acc",
"TAG": "tag_acc",
"POS": "pos_acc",
"MORPH": "morph_acc",
"LEMMA": "lemma_acc",
"UAS": "dep_uas",
"LAS": "dep_las",
"NER P": "ents_p",
"NER R": "ents_r",
"NER F": "ents_f",
"TEXTCAT": "cats_score",
"SENT P": "sents_p",
"SENT R": "sents_r",
"SENT F": "sents_f",
"SPAN P": f"spans_{spans_key}_p",
"SPAN R": f"spans_{spans_key}_r",
"SPAN F": f"spans_{spans_key}_f",
"SPEED": "speed",
}
results = {}
data = {}
for metric, key in metrics.items():
if key in scores:
if key == "cats_score":
metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")"
if isinstance(scores[key], (int, float)):
if key == "speed":
results[metric] = f"{scores[key]:.0f}"
else:
results[metric] = f"{scores[key]*100:.2f}"
else:
results[metric] = "-"
data[re.sub(r"[\s/]", "_", key.lower())] = scores[key]
msg.table(results, title="Results")
data = handle_scores_per_type(scores, data, spans_key=spans_key, silent=silent)
if displacy_path:
factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
docs = list(nlp.pipe(ex.reference.text for ex in dev_dataset[:displacy_limit]))
render_deps = "parser" in factory_names
render_ents = "ner" in factory_names
render_parses(
docs,
displacy_path,
model_name=model,
limit=displacy_limit,
deps=render_deps,
ents=render_ents,
)
msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path)
if output_path is not None:
srsly.write_json(output_path, data)
msg.good(f"Saved results to {output_path}")
return data
def handle_scores_per_type(
scores: Dict[str, Any],
data: Dict[str, Any] = {},
*,
spans_key: str = "sc",
silent: bool = False,
) -> Dict[str, Any]:
msg = Printer(no_print=silent, pretty=not silent)
if "morph_per_feat" in scores:
if scores["morph_per_feat"]:
print_prf_per_type(msg, scores["morph_per_feat"], "MORPH", "feat")
data["morph_per_feat"] = scores["morph_per_feat"]
if "dep_las_per_type" in scores:
if scores["dep_las_per_type"]:
print_prf_per_type(msg, scores["dep_las_per_type"], "LAS", "type")
data["dep_las_per_type"] = scores["dep_las_per_type"]
if "ents_per_type" in scores:
if scores["ents_per_type"]:
print_prf_per_type(msg, scores["ents_per_type"], "NER", "type")
data["ents_per_type"] = scores["ents_per_type"]
if f"spans_{spans_key}_per_type" in scores:
if scores[f"spans_{spans_key}_per_type"]:
print_prf_per_type(
msg, scores[f"spans_{spans_key}_per_type"], "SPANS", "type"
)
data[f"spans_{spans_key}_per_type"] = scores[f"spans_{spans_key}_per_type"]
if "cats_f_per_type" in scores:
if scores["cats_f_per_type"]:
print_prf_per_type(msg, scores["cats_f_per_type"], "Textcat F", "label")
data["cats_f_per_type"] = scores["cats_f_per_type"]
if "cats_auc_per_type" in scores:
if scores["cats_auc_per_type"]:
print_textcats_auc_per_cat(msg, scores["cats_auc_per_type"])
data["cats_auc_per_type"] = scores["cats_auc_per_type"]
return scores
def render_parses(
docs: List[Doc],
output_path: Path,
model_name: str = "",
limit: int = 250,
deps: bool = True,
ents: bool = True,
):
docs[0].user_data["title"] = model_name
if ents:
html = displacy.render(docs[:limit], style="ent", page=True)
with (output_path / "entities.html").open("w", encoding="utf8") as file_:
file_.write(html)
if deps:
html = displacy.render(
docs[:limit], style="dep", page=True, options={"compact": True}
)
with (output_path / "parses.html").open("w", encoding="utf8") as file_:
file_.write(html)
def print_prf_per_type(
msg: Printer, scores: Dict[str, Dict[str, float]], name: str, type: str
) -> None:
data = []
for key, value in scores.items():
row = [key]
for k in ("p", "r", "f"):
v = value[k]
row.append(f"{v * 100:.2f}" if isinstance(v, (int, float)) else v)
data.append(row)
msg.table(
data,
header=("", "P", "R", "F"),
aligns=("l", "r", "r", "r"),
title=f"{name} (per {type})",
)
def print_textcats_auc_per_cat(
msg: Printer, scores: Dict[str, Dict[str, float]]
) -> None:
msg.table(
[
(k, f"{v:.2f}" if isinstance(v, (float, int)) else v)
for k, v in scores.items()
],
header=("", "ROC AUC"),
aligns=("l", "r"),
title="Textcat ROC AUC (per label)",
)
from typing import Optional, Dict, Any, Union, List
import platform
from pathlib import Path
from wasabi import Printer, MarkdownRenderer
import srsly
from ._util import app, Arg, Opt, string_to_list
from .. import util
from .. import about
@app.command("info")
def info_cli(
# fmt: off
model: Optional[str] = Arg(None, help="Optional loadable spaCy pipeline"),
markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues"),
silent: bool = Opt(False, "--silent", "-s", "-S", help="Don't print anything (just return)"),
exclude: str = Opt("labels", "--exclude", "-e", help="Comma-separated keys to exclude from the print-out"),
# fmt: on
):
"""
Print info about spaCy installation. If a pipeline is specified as an argument,
print its meta information. Flag --markdown prints details in Markdown for easy
copy-pasting to GitHub issues.
DOCS: https://spacy.io/api/cli#info
"""
exclude = string_to_list(exclude)
info(model, markdown=markdown, silent=silent, exclude=exclude)
def info(
model: Optional[str] = None,
*,
markdown: bool = False,
silent: bool = True,
exclude: Optional[List[str]] = None,
) -> Union[str, dict]:
msg = Printer(no_print=silent, pretty=not silent)
if not exclude:
exclude = []
if model:
title = f"Info about pipeline '{model}'"
data = info_model(model, silent=silent)
else:
title = "Info about spaCy"
data = info_spacy()
raw_data = {k.lower().replace(" ", "_"): v for k, v in data.items()}
if "Pipelines" in data and isinstance(data["Pipelines"], dict):
data["Pipelines"] = ", ".join(
f"{n} ({v})" for n, v in data["Pipelines"].items()
)
markdown_data = get_markdown(data, title=title, exclude=exclude)
if markdown:
if not silent:
print(markdown_data)
return markdown_data
if not silent:
table_data = {k: v for k, v in data.items() if k not in exclude}
msg.table(table_data, title=title)
return raw_data
def info_spacy() -> Dict[str, Any]:
"""Generate info about the current spaCy intallation.
RETURNS (dict): The spaCy info.
"""
all_models = {}
for pkg_name in util.get_installed_models():
package = pkg_name.replace("-", "_")
all_models[package] = util.get_package_version(pkg_name)
return {
"spaCy version": about.__version__,
"Location": str(Path(__file__).parent.parent),
"Platform": platform.platform(),
"Python version": platform.python_version(),
"Pipelines": all_models,
}
def info_model(model: str, *, silent: bool = True) -> Dict[str, Any]:
"""Generate info about a specific model.
model (str): Model name of path.
silent (bool): Don't print anything, just return.
RETURNS (dict): The model meta.
"""
msg = Printer(no_print=silent, pretty=not silent)
if util.is_package(model):
model_path = util.get_package_path(model)
else:
model_path = Path(model)
meta_path = model_path / "meta.json"
if not meta_path.is_file():
msg.fail("Can't find pipeline meta.json", meta_path, exits=1)
meta = srsly.read_json(meta_path)
if model_path.resolve() != model_path:
meta["source"] = str(model_path.resolve())
else:
meta["source"] = str(model_path)
return {
k: v for k, v in meta.items() if k not in ("accuracy", "performance", "speed")
}
def get_markdown(
data: Dict[str, Any],
title: Optional[str] = None,
exclude: Optional[List[str]] = None,
) -> str:
"""Get data in GitHub-flavoured Markdown format for issues etc.
data (Dict[str, Any]): Label/value pairs.
title (str): Optional title, will be rendered as headline 2.
exclude (List[str]): Names of keys to exclude.
RETURNS (str): The Markdown string.
"""
md = MarkdownRenderer()
if title:
md.add(md.title(2, title))
items = []
for key, value in data.items():
if exclude and key in exclude:
continue
if isinstance(value, str):
try:
existing_path = Path(value).exists()
except Exception:
# invalid Path, like a URL string
existing_path = False
if existing_path:
continue
items.append(f"{md.bold(f'{key}:')} {value}")
md.add(md.list(items))
return f"\n{md.text}\n"
from typing import Optional, List, Tuple
from enum import Enum
from pathlib import Path
from wasabi import Printer, diff_strings
from thinc.api import Config
import srsly
import re
from jinja2 import Template
from .. import util
from ..language import DEFAULT_CONFIG_PRETRAIN_PATH
from ..schemas import RecommendationSchema
from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
from ._util import string_to_list, import_code
ROOT = Path(__file__).parent / "templates"
TEMPLATE_PATH = ROOT / "quickstart_training.jinja"
RECOMMENDATIONS = srsly.read_yaml(ROOT / "quickstart_training_recommendations.yml")
class Optimizations(str, Enum):
efficiency = "efficiency"
accuracy = "accuracy"
@init_cli.command("config")
def init_config_cli(
# fmt: off
output_file: Path = Arg(..., help="File to save config.cfg to or - for stdout (will only output config and no additional logging info)", allow_dash=True),
lang: str = Opt("en", "--lang", "-l", help="Two-letter code of the language to use"),
pipeline: str = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"),
optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
gpu: bool = Opt(False, "--gpu", "-G", help="Whether the model can run on GPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
force_overwrite: bool = Opt(False, "--force", "-F", help="Force overwriting the output file"),
# fmt: on
):
"""
Generate a starter config.cfg for training. Based on your requirements
specified via the CLI arguments, this command generates a config with the
optimal settings for your use case. This includes the choice of architecture,
pretrained weights and related hyperparameters.
DOCS: https://spacy.io/api/cli#init-config
"""
pipeline = string_to_list(pipeline)
is_stdout = str(output_file) == "-"
if not is_stdout and output_file.exists() and not force_overwrite:
msg = Printer()
msg.fail(
"The provided output file already exists. To force overwriting the config file, set the --force or -F flag.",
exits=1,
)
config = init_config(
lang=lang,
pipeline=pipeline,
optimize=optimize.value,
gpu=gpu,
pretraining=pretraining,
silent=is_stdout,
)
save_config(config, output_file, is_stdout=is_stdout)
@init_cli.command("fill-config")
def init_fill_config_cli(
# fmt: off
base_path: Path = Arg(..., help="Base config to fill", exists=True, dir_okay=False),
output_file: Path = Arg("-", help="File to save config.cfg to (or - for stdout)", allow_dash=True),
pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
diff: bool = Opt(False, "--diff", "-D", help="Print a visual diff highlighting the changes"),
code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
# fmt: on
):
"""
Fill partial config.cfg with default values. Will add all missing settings
from the default config and will create all objects, check the registered
functions for their default values and update the base config. This command
can be used with a config generated via the training quickstart widget:
https://spacy.io/usage/training#quickstart
DOCS: https://spacy.io/api/cli#init-fill-config
"""
import_code(code_path)
fill_config(output_file, base_path, pretraining=pretraining, diff=diff)
def fill_config(
output_file: Path,
base_path: Path,
*,
pretraining: bool = False,
diff: bool = False,
silent: bool = False,
) -> Tuple[Config, Config]:
is_stdout = str(output_file) == "-"
no_print = is_stdout or silent
msg = Printer(no_print=no_print)
with show_validation_error(hint_fill=False):
config = util.load_config(base_path)
nlp = util.load_model_from_config(config, auto_fill=True, validate=False)
# Load a second time with validation to be extra sure that the produced
# config result is a valid config
nlp = util.load_model_from_config(nlp.config)
filled = nlp.config
# If we have sourced components in the base config, those will have been
# replaced with their actual config after loading, so we have to re-add them
sourced = util.get_sourced_components(config)
filled["components"].update(sourced)
if pretraining:
validate_config_for_pretrain(filled, msg)
pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
filled = pretrain_config.merge(filled)
before = config.to_str()
after = filled.to_str()
if before == after:
msg.warn("Nothing to auto-fill: base config is already complete")
else:
msg.good("Auto-filled config with all values")
if diff and not no_print:
if before == after:
msg.warn("No diff to show: nothing was auto-filled")
else:
msg.divider("START CONFIG DIFF")
print("")
print(diff_strings(before, after))
msg.divider("END CONFIG DIFF")
print("")
save_config(filled, output_file, is_stdout=is_stdout, silent=silent)
return config, filled
def init_config(
*,
lang: str,
pipeline: List[str],
optimize: str,
gpu: bool,
pretraining: bool = False,
silent: bool = True,
) -> Config:
msg = Printer(no_print=silent)
with TEMPLATE_PATH.open("r") as f:
template = Template(f.read())
# Filter out duplicates since tok2vec and transformer are added by template
pipeline = [pipe for pipe in pipeline if pipe not in ("tok2vec", "transformer")]
defaults = RECOMMENDATIONS["__default__"]
reco = RecommendationSchema(**RECOMMENDATIONS.get(lang, defaults)).dict()
variables = {
"lang": lang,
"components": pipeline,
"optimize": optimize,
"hardware": "gpu" if gpu else "cpu",
"transformer_data": reco["transformer"],
"word_vectors": reco["word_vectors"],
"has_letters": reco["has_letters"],
}
if variables["transformer_data"] and not has_spacy_transformers():
msg.warn(
"To generate a more effective transformer-based config (GPU-only), "
"install the spacy-transformers package and re-run this command. "
"The config generated now does not use transformers."
)
variables["transformer_data"] = None
base_template = template.render(variables).strip()
# Giving up on getting the newlines right in jinja for now
base_template = re.sub(r"\n\n\n+", "\n\n", base_template)
# Access variables declared in templates
template_vars = template.make_module(variables)
use_case = {
"Language": lang,
"Pipeline": ", ".join(pipeline),
"Optimize for": optimize,
"Hardware": variables["hardware"].upper(),
"Transformer": template_vars.transformer.get("name") # type: ignore[attr-defined]
if template_vars.use_transformer # type: ignore[attr-defined]
else None,
}
msg.info("Generated config template specific for your use case")
for label, value in use_case.items():
msg.text(f"- {label}: {value}")
with show_validation_error(hint_fill=False):
config = util.load_config_from_str(base_template)
nlp = util.load_model_from_config(config, auto_fill=True)
config = nlp.config
if pretraining:
validate_config_for_pretrain(config, msg)
pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
config = pretrain_config.merge(config)
msg.good("Auto-filled config with all values")
return config
def save_config(
config: Config, output_file: Path, is_stdout: bool = False, silent: bool = False
) -> None:
no_print = is_stdout or silent
msg = Printer(no_print=no_print)
if is_stdout:
print(config.to_str())
else:
if not output_file.parent.exists():
output_file.parent.mkdir(parents=True)
config.to_disk(output_file, interpolate=False)
msg.good("Saved config", output_file)
msg.text("You can now add your data and train your pipeline:")
variables = ["--paths.train ./train.spacy", "--paths.dev ./dev.spacy"]
if not no_print:
print(f"{COMMAND} train {output_file.parts[-1]} {' '.join(variables)}")
def has_spacy_transformers() -> bool:
try:
import spacy_transformers # noqa: F401
return True
except ImportError:
return False
def validate_config_for_pretrain(config: Config, msg: Printer) -> None:
if "tok2vec" not in config["nlp"]["pipeline"]:
msg.warn(
"No tok2vec component found in the pipeline. If your tok2vec "
"component has a different name, you may need to adjust the "
"tok2vec_model reference in the [pretraining] block. If you don't "
"have a tok2vec component, make sure to add it to your [components] "
"and the pipeline specified in the [nlp] block, so you can pretrain "
"weights for it."
)
from typing import Optional
import logging
from pathlib import Path
from wasabi import msg
import typer
import srsly
from .. import util
from ..training.initialize import init_nlp, convert_vectors
from ..language import Language
from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import import_code, setup_gpu
@init_cli.command("vectors")
def init_vectors_cli(
# fmt: off
lang: str = Arg(..., help="The language of the nlp object to create"),
vectors_loc: Path = Arg(..., help="Vectors file in Word2Vec format", exists=True),
output_dir: Path = Arg(..., help="Pipeline output directory"),
prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
mode: str = Opt("default", "--mode", "-m", help="Vectors mode: default or floret"),
name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
# fmt: on
):
"""Convert word vectors for use with spaCy. Will export an nlp object that
you can use in the [initialize] block of your config to initialize
a model with vectors.
"""
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
msg.info(f"Creating blank nlp object for language '{lang}'")
nlp = util.get_lang_class(lang)()
if jsonl_loc is not None:
update_lexemes(nlp, jsonl_loc)
convert_vectors(
nlp,
vectors_loc,
truncate=truncate,
prune=prune,
name=name,
mode=mode,
)
msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
nlp.to_disk(output_dir)
msg.good(
"Saved nlp object with vectors to output directory. You can now use the "
"path to it in your config as the 'vectors' setting in [initialize].",
output_dir.resolve(),
)
def update_lexemes(nlp: Language, jsonl_loc: Path) -> None:
# Mostly used for backwards-compatibility and may be removed in the future
lex_attrs = srsly.read_jsonl(jsonl_loc)
for attrs in lex_attrs:
if "settings" in attrs:
continue
lexeme = nlp.vocab[attrs["orth"]]
lexeme.set_attrs(**attrs)
@init_cli.command(
"nlp",
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
hidden=True,
)
def init_pipeline_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
output_path: Path = Arg(..., help="Output directory for the prepared data"),
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
# fmt: on
):
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
overrides = parse_config_overrides(ctx.args)
import_code(code_path)
setup_gpu(use_gpu)
with show_validation_error(config_path):
config = util.load_config(config_path, overrides=overrides)
with show_validation_error(hint_fill=False):
nlp = init_nlp(config, use_gpu=use_gpu)
nlp.to_disk(output_path)
msg.good(f"Saved initialized pipeline to {output_path}")
@init_cli.command(
"labels",
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def init_labels_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
output_path: Path = Arg(..., help="Output directory for the labels"),
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
# fmt: on
):
"""Generate JSON files for the labels in the data. This helps speed up the
training process, since spaCy won't have to preprocess the data to
extract the labels."""
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
if not output_path.exists():
output_path.mkdir(parents=True)
overrides = parse_config_overrides(ctx.args)
import_code(code_path)
setup_gpu(use_gpu)
with show_validation_error(config_path):
config = util.load_config(config_path, overrides=overrides)
with show_validation_error(hint_fill=False):
nlp = init_nlp(config, use_gpu=use_gpu)
_init_labels(nlp, output_path)
def _init_labels(nlp, output_path):
for name, component in nlp.pipeline:
if getattr(component, "label_data", None) is not None:
output_file = output_path / f"{name}.json"
srsly.write_json(output_file, component.label_data)
msg.good(f"Saving label data for component '{name}' to {output_file}")
else:
msg.info(f"No label data found for component '{name}'")
This diff is collapsed.
from typing import Optional
from pathlib import Path
from wasabi import msg
import typer
import re
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import import_code, setup_gpu
from ..training.pretrain import pretrain
from ..util import load_config
@app.command(
"pretrain",
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def pretrain_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False, allow_dash=True),
output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"),
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
# fmt: on
):
"""
Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
using an approximate language-modelling objective. Two objective types
are available, vector-based and character-based.
In the vector-based objective, we load word vectors that have been trained
using a word2vec-style distributional similarity algorithm, and train a
component like a CNN, BiLSTM, etc to predict vectors which match the
pretrained ones. The weights are saved to a directory after each epoch. You
can then pass a path to one of these pretrained weights files to the
'spacy train' command.
This technique may be especially helpful if you have little labelled data.
However, it's still quite experimental, so your mileage may vary.
To load the weights back in during 'spacy train', you need to ensure
all settings are the same between pretraining and training. Ideally,
this is done by using the same config file for both commands.
DOCS: https://spacy.io/api/cli#pretrain
"""
config_overrides = parse_config_overrides(ctx.args)
import_code(code_path)
verify_cli_args(config_path, output_dir, resume_path, epoch_resume)
setup_gpu(use_gpu)
msg.info(f"Loading config from: {config_path}")
with show_validation_error(config_path):
raw_config = load_config(
config_path, overrides=config_overrides, interpolate=False
)
config = raw_config.interpolate()
if not config.get("pretraining"):
# TODO: What's the solution here? How do we handle optional blocks?
msg.fail("The [pretraining] block in your config is empty", exits=1)
if not output_dir.exists():
output_dir.mkdir()
msg.good(f"Created output directory: {output_dir}")
# Save non-interpolated config
raw_config.to_disk(output_dir / "config.cfg")
msg.good("Saved config file in the output directory")
pretrain(
config,
output_dir,
resume_path=resume_path,
epoch_resume=epoch_resume,
use_gpu=use_gpu,
silent=False,
)
msg.good("Successfully finished pretrain")
def verify_cli_args(config_path, output_dir, resume_path, epoch_resume):
if not config_path or (str(config_path) != "-" and not config_path.exists()):
msg.fail("Config file not found", config_path, exits=1)
if output_dir.exists() and [p for p in output_dir.iterdir()]:
if resume_path:
msg.warn(
"Output directory is not empty.",
"If you're resuming a run in this directory, the old weights "
"for the consecutive epochs will be overwritten with the new ones.",
)
else:
msg.warn(
"Output directory is not empty. ",
"It is better to use an empty directory or refer to a new output path, "
"then the new directory will be created for you.",
)
if resume_path is not None:
if resume_path.is_dir():
# This is necessary because Windows gives a Permission Denied when we
# try to open the directory later, which is confusing. See #7878
msg.fail(
"--resume-path should be a weights file, but {resume_path} is a directory.",
exits=True,
)
model_name = re.search(r"model\d+\.bin", str(resume_path))
if not model_name and not epoch_resume:
msg.fail(
"You have to use the --epoch-resume setting when using a renamed weight file for --resume-path",
exits=True,
)
elif not model_name and epoch_resume < 0:
msg.fail(
f"The argument --epoch-resume has to be greater or equal to 0. {epoch_resume} is invalid",
exits=True,
)
from typing import Optional, Sequence, Union, Iterator
import tqdm
from pathlib import Path
import srsly
import cProfile
import pstats
import sys
import itertools
from wasabi import msg, Printer
import typer
from ._util import app, debug_cli, Arg, Opt, NAME
from ..language import Language
from ..util import load_model
@debug_cli.command("profile")
@app.command("profile", hidden=True)
def profile_cli(
# fmt: off
ctx: typer.Context, # This is only used to read current calling context
model: str = Arg(..., help="Trained pipeline to load"),
inputs: Optional[Path] = Arg(None, help="Location of input file. '-' for stdin.", exists=True, allow_dash=True),
n_texts: int = Opt(10000, "--n-texts", "-n", help="Maximum number of texts to use if available"),
# fmt: on
):
"""
Profile which functions take the most time in a spaCy pipeline.
Input should be formatted as one JSON object per line with a key "text".
It can either be provided as a JSONL file, or be read from sys.sytdin.
If no input file is specified, the IMDB dataset is loaded via Thinc.
DOCS: https://spacy.io/api/cli#debug-profile
"""
if ctx.parent.command.name == NAME: # type: ignore[union-attr] # called as top-level command
msg.warn(
"The profile command is now available via the 'debug profile' "
"subcommand. You can run python -m spacy debug --help for an "
"overview of the other available debugging commands."
)
profile(model, inputs=inputs, n_texts=n_texts)
def profile(model: str, inputs: Optional[Path] = None, n_texts: int = 10000) -> None:
if inputs is not None:
texts = _read_inputs(inputs, msg)
texts = list(itertools.islice(texts, n_texts))
if inputs is None:
try:
import ml_datasets
except ImportError:
msg.fail(
"This command, when run without an input file, "
"requires the ml_datasets library to be installed: "
"pip install ml_datasets",
exits=1,
)
with msg.loading("Loading IMDB dataset via ml_datasets..."):
imdb_train, _ = ml_datasets.imdb(train_limit=n_texts, dev_limit=0)
texts, _ = zip(*imdb_train)
msg.info(f"Loaded IMDB dataset and using {n_texts} examples")
with msg.loading(f"Loading pipeline '{model}'..."):
nlp = load_model(model)
msg.good(f"Loaded pipeline '{model}'")
cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
s = pstats.Stats("Profile.prof")
msg.divider("Profile stats")
s.strip_dirs().sort_stats("time").print_stats()
def parse_texts(nlp: Language, texts: Sequence[str]) -> None:
for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=16):
pass
def _read_inputs(loc: Union[Path, str], msg: Printer) -> Iterator[str]:
if loc == "-":
msg.info("Reading input from sys.stdin")
file_ = sys.stdin
file_ = (line.encode("utf8") for line in file_)
else:
input_path = Path(loc)
if not input_path.exists() or not input_path.is_file():
msg.fail("Not a valid input data file", loc, exits=1)
msg.info(f"Using data from {input_path.parts[-1]}")
file_ = input_path.open() # type: ignore[assignment]
for line in file_:
data = srsly.json_loads(line)
text = data["text"]
yield text
from typing import Any, Dict, Optional
from pathlib import Path
from wasabi import msg
import re
import shutil
import requests
import typer
from ...util import ensure_path, working_dir
from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config
from .._util import get_checksum, download_file, git_checkout, get_git_version
from .._util import SimpleFrozenDict, parse_config_overrides
@project_cli.command(
"assets",
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def project_assets_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse checkout for assets provided via Git, to only check out and clone the files needed. Requires Git v22.2+.")
# fmt: on
):
"""Fetch project assets like datasets and pretrained weights. Assets are
defined in the "assets" section of the project.yml. If a checksum is
provided in the project.yml, the file is only downloaded if no local file
with the same checksum exists.
DOCS: https://spacy.io/api/cli#project-assets
"""
overrides = parse_config_overrides(ctx.args)
project_assets(project_dir, overrides=overrides, sparse_checkout=sparse_checkout)
def project_assets(
project_dir: Path,
*,
overrides: Dict[str, Any] = SimpleFrozenDict(),
sparse_checkout: bool = False,
) -> None:
"""Fetch assets for a project using DVC if possible.
project_dir (Path): Path to project directory.
"""
project_path = ensure_path(project_dir)
config = load_project_config(project_path, overrides=overrides)
assets = config.get("assets", {})
if not assets:
msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0)
msg.info(f"Fetching {len(assets)} asset(s)")
for asset in assets:
dest = (project_dir / asset["dest"]).resolve()
checksum = asset.get("checksum")
if "git" in asset:
git_err = (
f"Cloning spaCy project templates requires Git and the 'git' command. "
f"Make sure it's installed and that the executable is available."
)
get_git_version(error=git_err)
if dest.exists():
# If there's already a file, check for checksum
if checksum and checksum == get_checksum(dest):
msg.good(
f"Skipping download with matching checksum: {asset['dest']}"
)
continue
else:
if dest.is_dir():
shutil.rmtree(dest)
else:
dest.unlink()
if "repo" not in asset["git"] or asset["git"]["repo"] is None:
msg.fail(
"A git asset must include 'repo', the repository address.", exits=1
)
if "path" not in asset["git"] or asset["git"]["path"] is None:
msg.fail(
"A git asset must include 'path' - use \"\" to get the entire repository.",
exits=1,
)
git_checkout(
asset["git"]["repo"],
asset["git"]["path"],
dest,
branch=asset["git"].get("branch"),
sparse=sparse_checkout,
)
msg.good(f"Downloaded asset {dest}")
else:
url = asset.get("url")
if not url:
# project.yml defines asset without URL that the user has to place
check_private_asset(dest, checksum)
continue
fetch_asset(project_path, url, dest, checksum)
def check_private_asset(dest: Path, checksum: Optional[str] = None) -> None:
"""Check and validate assets without a URL (private assets that the user
has to provide themselves) and give feedback about the checksum.
dest (Path): Destination path of the asset.
checksum (Optional[str]): Optional checksum of the expected file.
"""
if not Path(dest).exists():
err = f"No URL provided for asset. You need to add this file yourself: {dest}"
msg.warn(err)
else:
if not checksum:
msg.good(f"Asset already exists: {dest}")
elif checksum == get_checksum(dest):
msg.good(f"Asset exists with matching checksum: {dest}")
else:
msg.fail(f"Asset available but with incorrect checksum: {dest}")
def fetch_asset(
project_path: Path, url: str, dest: Path, checksum: Optional[str] = None
) -> None:
"""Fetch an asset from a given URL or path. If a checksum is provided and a
local file exists, it's only re-downloaded if the checksum doesn't match.
project_path (Path): Path to project directory.
url (str): URL or path to asset.
checksum (Optional[str]): Optional expected checksum of local file.
RETURNS (Optional[Path]): The path to the fetched asset or None if fetching
the asset failed.
"""
dest_path = (project_path / dest).resolve()
if dest_path.exists() and checksum:
# If there's already a file, check for checksum
if checksum == get_checksum(dest_path):
msg.good(f"Skipping download with matching checksum: {dest}")
# We might as well support the user here and create parent directories in
# case the asset dir isn't listed as a dir to create in the project.yml
if not dest_path.parent.exists():
dest_path.parent.mkdir(parents=True)
with working_dir(project_path):
url = convert_asset_url(url)
try:
download_file(url, dest_path)
msg.good(f"Downloaded asset {dest}")
except requests.exceptions.RequestException as e:
if Path(url).exists() and Path(url).is_file():
# If it's a local file, copy to destination
shutil.copy(url, str(dest_path))
msg.good(f"Copied local asset {dest}")
else:
msg.fail(f"Download failed: {dest}", e)
if checksum and checksum != get_checksum(dest_path):
msg.fail(f"Checksum doesn't match value defined in {PROJECT_FILE}: {dest}")
def convert_asset_url(url: str) -> str:
"""Check and convert the asset URL if needed.
url (str): The asset URL.
RETURNS (str): The converted URL.
"""
# If the asset URL is a regular GitHub URL it's likely a mistake
if re.match(r"(http(s?)):\/\/github.com", url) and "releases/download" not in url:
converted = url.replace("github.com", "raw.githubusercontent.com")
converted = re.sub(r"/(tree|blob)/", "/", converted)
msg.warn(
"Downloading from a regular GitHub URL. This will only download "
"the source of the page, not the actual file. Converting the URL "
"to a raw URL.",
converted,
)
return converted
return url
from typing import Optional
from pathlib import Path
from wasabi import msg
import subprocess
import re
from ... import about
from ...util import ensure_path
from .._util import project_cli, Arg, Opt, COMMAND, PROJECT_FILE
from .._util import git_checkout, get_git_version
DEFAULT_REPO = about.__projects__
DEFAULT_PROJECTS_BRANCH = about.__projects_branch__
DEFAULT_BRANCH = "master"
@project_cli.command("clone")
def project_clone_cli(
# fmt: off
name: str = Arg(..., help="The name of the template to clone"),
dest: Optional[Path] = Arg(None, help="Where to clone the project. Defaults to current working directory", exists=False),
repo: str = Opt(DEFAULT_REPO, "--repo", "-r", help="The repository to clone from"),
branch: Optional[str] = Opt(None, "--branch", "-b", help="The branch to clone from"),
sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse Git checkout to only check out and clone the files needed. Requires Git v22.2+.")
# fmt: on
):
"""Clone a project template from a repository. Calls into "git" and will
only download the files from the given subdirectory. The GitHub repo
defaults to the official spaCy template repo, but can be customized
(including using a private repo).
DOCS: https://spacy.io/api/cli#project-clone
"""
if dest is None:
dest = Path.cwd() / Path(name).parts[-1]
if branch is None:
# If it's a user repo, we want to default to other branch
branch = DEFAULT_PROJECTS_BRANCH if repo == DEFAULT_REPO else DEFAULT_BRANCH
project_clone(name, dest, repo=repo, branch=branch, sparse_checkout=sparse_checkout)
def project_clone(
name: str,
dest: Path,
*,
repo: str = about.__projects__,
branch: str = about.__projects_branch__,
sparse_checkout: bool = False,
) -> None:
"""Clone a project template from a repository.
name (str): Name of subdirectory to clone.
dest (Path): Destination path of cloned project.
repo (str): URL of Git repo containing project templates.
branch (str): The branch to clone from
"""
dest = ensure_path(dest)
check_clone(name, dest, repo)
project_dir = dest.resolve()
repo_name = re.sub(r"(http(s?)):\/\/github.com/", "", repo)
try:
git_checkout(repo, name, dest, branch=branch, sparse=sparse_checkout)
except subprocess.CalledProcessError:
err = f"Could not clone '{name}' from repo '{repo_name}'"
msg.fail(err, exits=1)
msg.good(f"Cloned '{name}' from {repo_name}", project_dir)
if not (project_dir / PROJECT_FILE).exists():
msg.warn(f"No {PROJECT_FILE} found in directory")
else:
msg.good(f"Your project is now ready!")
print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}")
def check_clone(name: str, dest: Path, repo: str) -> None:
"""Check and validate that the destination path can be used to clone. Will
check that Git is available and that the destination path is suitable.
name (str): Name of the directory to clone from the repo.
dest (Path): Local destination of cloned directory.
repo (str): URL of the repo to clone from.
"""
git_err = (
f"Cloning spaCy project templates requires Git and the 'git' command. "
f"To clone a project without Git, copy the files from the '{name}' "
f"directory in the {repo} to {dest} manually."
)
get_git_version(error=git_err)
if not dest:
msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
if dest.exists():
# Directory already exists (not allowed, clone needs to create it)
msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1)
if not dest.parent.exists():
# We're not creating parents, parent dir should exist
msg.fail(
f"Can't clone project, parent directory doesn't exist: {dest.parent}. "
f"Create the necessary folder(s) first before continuing.",
exits=1,
)
from pathlib import Path
from wasabi import msg, MarkdownRenderer
from ...util import working_dir
from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config
DOCS_URL = "https://spacy.io"
INTRO_PROJECT = f"""The [`{PROJECT_FILE}`]({PROJECT_FILE}) defines the data assets required by the
project, as well as the available commands and workflows. For details, see the
[spaCy projects documentation]({DOCS_URL}/usage/projects)."""
INTRO_COMMANDS = f"""The following commands are defined by the project. They
can be executed using [`spacy project run [name]`]({DOCS_URL}/api/cli#project-run).
Commands are only re-run if their inputs have changed."""
INTRO_WORKFLOWS = f"""The following workflows are defined by the project. They
can be executed using [`spacy project run [name]`]({DOCS_URL}/api/cli#project-run)
and will run the specified commands in order. Commands are only re-run if their
inputs have changed."""
INTRO_ASSETS = f"""The following assets are defined by the project. They can
be fetched by running [`spacy project assets`]({DOCS_URL}/api/cli#project-assets)
in the project directory."""
# These markers are added to the Markdown and can be used to update the file in
# place if it already exists. Only the auto-generated part will be replaced.
MARKER_START = "<!-- SPACY PROJECT: AUTO-GENERATED DOCS START (do not remove) -->"
MARKER_END = "<!-- SPACY PROJECT: AUTO-GENERATED DOCS END (do not remove) -->"
# If this marker is used in an existing README, it's ignored and not replaced
MARKER_IGNORE = "<!-- SPACY PROJECT: IGNORE -->"
@project_cli.command("document")
def project_document_cli(
# fmt: off
project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
output_file: Path = Opt("-", "--output", "-o", help="Path to output Markdown file for output. Defaults to - for standard output"),
no_emoji: bool = Opt(False, "--no-emoji", "-NE", help="Don't use emoji")
# fmt: on
):
"""
Auto-generate a README.md for a project. If the content is saved to a file,
hidden markers are added so you can add custom content before or after the
auto-generated section and only the auto-generated docs will be replaced
when you re-run the command.
DOCS: https://spacy.io/api/cli#project-document
"""
project_document(project_dir, output_file, no_emoji=no_emoji)
def project_document(
project_dir: Path, output_file: Path, *, no_emoji: bool = False
) -> None:
is_stdout = str(output_file) == "-"
config = load_project_config(project_dir)
md = MarkdownRenderer(no_emoji=no_emoji)
md.add(MARKER_START)
title = config.get("title")
description = config.get("description")
md.add(md.title(1, f"spaCy Project{f': {title}' if title else ''}", "🪐"))
if description:
md.add(description)
md.add(md.title(2, PROJECT_FILE, "📋"))
md.add(INTRO_PROJECT)
# Commands
cmds = config.get("commands", [])
data = [(md.code(cmd["name"]), cmd.get("help", "")) for cmd in cmds]
if data:
md.add(md.title(3, "Commands", "⏯"))
md.add(INTRO_COMMANDS)
md.add(md.table(data, ["Command", "Description"]))
# Workflows
wfs = config.get("workflows", {}).items()
data = [(md.code(n), " &rarr; ".join(md.code(w) for w in stp)) for n, stp in wfs]
if data:
md.add(md.title(3, "Workflows", "⏭"))
md.add(INTRO_WORKFLOWS)
md.add(md.table(data, ["Workflow", "Steps"]))
# Assets
assets = config.get("assets", [])
data = []
for a in assets:
source = "Git" if a.get("git") else "URL" if a.get("url") else "Local"
dest_path = a["dest"]
dest = md.code(dest_path)
if source == "Local":
# Only link assets if they're in the repo
with working_dir(project_dir) as p:
if (p / dest_path).exists():
dest = md.link(dest, dest_path)
data.append((dest, source, a.get("description", "")))
if data:
md.add(md.title(3, "Assets", "🗂"))
md.add(INTRO_ASSETS)
md.add(md.table(data, ["File", "Source", "Description"]))
md.add(MARKER_END)
# Output result
if is_stdout:
print(md.text)
else:
content = md.text
if output_file.exists():
with output_file.open("r", encoding="utf8") as f:
existing = f.read()
if MARKER_IGNORE in existing:
msg.warn("Found ignore marker in existing file: skipping", output_file)
return
if MARKER_START in existing and MARKER_END in existing:
msg.info("Found existing file: only replacing auto-generated docs")
before = existing.split(MARKER_START)[0]
after = existing.split(MARKER_END)[1]
content = f"{before}{content}{after}"
else:
msg.warn("Replacing existing file")
with output_file.open("w", encoding="utf8") as f:
f.write(content)
msg.good("Saved project documentation", output_file)
"""This module contains helpers and subcommands for integrating spaCy projects
with Data Version Controk (DVC). https://dvc.org"""
from typing import Dict, Any, List, Optional, Iterable
import subprocess
from pathlib import Path
from wasabi import msg
from .._util import PROJECT_FILE, load_project_config, get_hash, project_cli
from .._util import Arg, Opt, NAME, COMMAND
from ...util import working_dir, split_command, join_command, run_command
from ...util import SimpleFrozenList
DVC_CONFIG = "dvc.yaml"
DVC_DIR = ".dvc"
UPDATE_COMMAND = "dvc"
DVC_CONFIG_COMMENT = f"""# This file is auto-generated by spaCy based on your {PROJECT_FILE}. If you've
# edited your {PROJECT_FILE}, you can regenerate this file by running:
# {COMMAND} project {UPDATE_COMMAND}"""
@project_cli.command(UPDATE_COMMAND)
def project_update_dvc_cli(
# fmt: off
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
workflow: Optional[str] = Arg(None, help=f"Name of workflow defined in {PROJECT_FILE}. Defaults to first workflow if not set."),
verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
# fmt: on
):
"""Auto-generate Data Version Control (DVC) config. A DVC
project can only define one pipeline, so you need to specify one workflow
defined in the project.yml. If no workflow is specified, the first defined
workflow is used. The DVC config will only be updated if the project.yml
changed.
DOCS: https://spacy.io/api/cli#project-dvc
"""
project_update_dvc(project_dir, workflow, verbose=verbose, force=force)
def project_update_dvc(
project_dir: Path,
workflow: Optional[str] = None,
*,
verbose: bool = False,
force: bool = False,
) -> None:
"""Update the auto-generated Data Version Control (DVC) config file. A DVC
project can only define one pipeline, so you need to specify one workflow
defined in the project.yml. Will only update the file if the checksum changed.
project_dir (Path): The project directory.
workflow (Optional[str]): Optional name of workflow defined in project.yml.
If not set, the first workflow will be used.
verbose (bool): Print more info.
force (bool): Force update DVC config.
"""
config = load_project_config(project_dir)
updated = update_dvc_config(
project_dir, config, workflow, verbose=verbose, force=force
)
help_msg = "To execute the workflow with DVC, run: dvc repro"
if updated:
msg.good(f"Updated DVC config from {PROJECT_FILE}", help_msg)
else:
msg.info(f"No changes found in {PROJECT_FILE}, no update needed", help_msg)
def update_dvc_config(
path: Path,
config: Dict[str, Any],
workflow: Optional[str] = None,
verbose: bool = False,
silent: bool = False,
force: bool = False,
) -> bool:
"""Re-run the DVC commands in dry mode and update dvc.yaml file in the
project directory. The file is auto-generated based on the config. The
first line of the auto-generated file specifies the hash of the config
dict, so if any of the config values change, the DVC config is regenerated.
path (Path): The path to the project directory.
config (Dict[str, Any]): The loaded project.yml.
verbose (bool): Whether to print additional info (via DVC).
silent (bool): Don't output anything (via DVC).
force (bool): Force update, even if hashes match.
RETURNS (bool): Whether the DVC config file was updated.
"""
ensure_dvc(path)
workflows = config.get("workflows", {})
workflow_names = list(workflows.keys())
check_workflows(workflow_names, workflow)
if not workflow:
workflow = workflow_names[0]
config_hash = get_hash(config)
path = path.resolve()
dvc_config_path = path / DVC_CONFIG
if dvc_config_path.exists():
# Check if the file was generated using the current config, if not, redo
with dvc_config_path.open("r", encoding="utf8") as f:
ref_hash = f.readline().strip().replace("# ", "")
if ref_hash == config_hash and not force:
return False # Nothing has changed in project.yml, don't need to update
dvc_config_path.unlink()
dvc_commands = []
config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
for name in workflows[workflow]:
command = config_commands[name]
deps = command.get("deps", [])
outputs = command.get("outputs", [])
outputs_no_cache = command.get("outputs_no_cache", [])
if not deps and not outputs and not outputs_no_cache:
continue
# Default to the working dir as the project path since dvc.yaml is auto-generated
# and we don't want arbitrary paths in there
project_cmd = ["python", "-m", NAME, "project", "run", name]
deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
dvc_cmd = ["run", "-n", name, "-w", str(path), "--no-exec"]
if command.get("no_skip"):
dvc_cmd.append("--always-changed")
full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
dvc_commands.append(join_command(full_cmd))
with working_dir(path):
dvc_flags = {"--verbose": verbose, "--quiet": silent}
run_dvc_commands(dvc_commands, flags=dvc_flags)
with dvc_config_path.open("r+", encoding="utf8") as f:
content = f.read()
f.seek(0, 0)
f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}")
return True
def run_dvc_commands(
commands: Iterable[str] = SimpleFrozenList(), flags: Dict[str, bool] = {}
) -> None:
"""Run a sequence of DVC commands in a subprocess, in order.
commands (List[str]): The string commands without the leading "dvc".
flags (Dict[str, bool]): Conditional flags to be added to command. Makes it
easier to pass flags like --quiet that depend on a variable or
command-line setting while avoiding lots of nested conditionals.
"""
for c in commands:
command = split_command(c)
dvc_command = ["dvc", *command]
# Add the flags if they are set to True
for flag, is_active in flags.items():
if is_active:
dvc_command.append(flag)
run_command(dvc_command)
def check_workflows(workflows: List[str], workflow: Optional[str] = None) -> None:
"""Validate workflows provided in project.yml and check that a given
workflow can be used to generate a DVC config.
workflows (List[str]): Names of the available workflows.
workflow (Optional[str]): The name of the workflow to convert.
"""
if not workflows:
msg.fail(
f"No workflows defined in {PROJECT_FILE}. To generate a DVC config, "
f"define at least one list of commands.",
exits=1,
)
if workflow is not None and workflow not in workflows:
msg.fail(
f"Workflow '{workflow}' not defined in {PROJECT_FILE}. "
f"Available workflows: {', '.join(workflows)}",
exits=1,
)
if not workflow:
msg.warn(
f"No workflow specified for DVC pipeline. Using the first workflow "
f"defined in {PROJECT_FILE}: '{workflows[0]}'"
)
def ensure_dvc(project_dir: Path) -> None:
"""Ensure that the "dvc" command is available and that the current project
directory is an initialized DVC project.
"""
try:
subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL)
except Exception:
msg.fail(
"To use spaCy projects with DVC (Data Version Control), DVC needs "
"to be installed and the 'dvc' command needs to be available",
"You can install the Python package from pip (pip install dvc) or "
"conda (conda install -c conda-forge dvc). For more details, see the "
"documentation: https://dvc.org/doc/install",
exits=1,
)
if not (project_dir / ".dvc").exists():
msg.fail(
"Project not initialized as a DVC project",
"To initialize a DVC project, you can run 'dvc init' in the project "
"directory. For more details, see the documentation: "
"https://dvc.org/doc/command-reference/init",
exits=1,
)
from pathlib import Path
from wasabi import msg
from .remote_storage import RemoteStorage
from .remote_storage import get_command_hash
from .._util import project_cli, Arg, logger
from .._util import load_project_config
from .run import update_lockfile
@project_cli.command("pull")
def project_pull_cli(
# fmt: off
remote: str = Arg("default", help="Name or path of remote storage"),
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
# fmt: on
):
"""Retrieve available precomputed outputs from a remote storage.
You can alias remotes in your project.yml by mapping them to storage paths.
A storage can be anything that the smart-open library can upload to, e.g.
AWS, Google Cloud Storage, SSH, local directories etc.
DOCS: https://spacy.io/api/cli#project-pull
"""
for url, output_path in project_pull(project_dir, remote):
if url is not None:
msg.good(f"Pulled {output_path} from {url}")
def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
# TODO: We don't have tests for this :(. It would take a bit of mockery to
# set up. I guess see if it breaks first?
config = load_project_config(project_dir)
if remote in config.get("remotes", {}):
remote = config["remotes"][remote]
storage = RemoteStorage(project_dir, remote)
commands = list(config.get("commands", []))
# We use a while loop here because we don't know how the commands
# will be ordered. A command might need dependencies from one that's later
# in the list.
while commands:
for i, cmd in enumerate(list(commands)):
logger.debug(f"CMD: {cmd['name']}.")
deps = [project_dir / dep for dep in cmd.get("deps", [])]
if all(dep.exists() for dep in deps):
cmd_hash = get_command_hash("", "", deps, cmd["script"])
for output_path in cmd.get("outputs", []):
url = storage.pull(output_path, command_hash=cmd_hash)
logger.debug(
f"URL: {url} for {output_path} with command hash {cmd_hash}"
)
yield url, output_path
out_locs = [project_dir / out for out in cmd.get("outputs", [])]
if all(loc.exists() for loc in out_locs):
update_lockfile(project_dir, cmd)
# We remove the command from the list here, and break, so that
# we iterate over the loop again.
commands.pop(i)
break
else:
logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs.")
else:
# If we didn't break the for loop, break the while loop.
break
from pathlib import Path
from wasabi import msg
from .remote_storage import RemoteStorage
from .remote_storage import get_content_hash, get_command_hash
from .._util import load_project_config
from .._util import project_cli, Arg, logger
@project_cli.command("push")
def project_push_cli(
# fmt: off
remote: str = Arg("default", help="Name or path of remote storage"),
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
# fmt: on
):
"""Persist outputs to a remote storage. You can alias remotes in your
project.yml by mapping them to storage paths. A storage can be anything that
the smart-open library can upload to, e.g. AWS, Google Cloud Storage, SSH,
local directories etc.
DOCS: https://spacy.io/api/cli#project-push
"""
for output_path, url in project_push(project_dir, remote):
if url is None:
msg.info(f"Skipping {output_path}")
else:
msg.good(f"Pushed {output_path} to {url}")
def project_push(project_dir: Path, remote: str):
"""Persist outputs to a remote storage. You can alias remotes in your project.yml
by mapping them to storage paths. A storage can be anything that the smart-open
library can upload to, e.g. gcs, aws, ssh, local directories etc
"""
config = load_project_config(project_dir)
if remote in config.get("remotes", {}):
remote = config["remotes"][remote]
storage = RemoteStorage(project_dir, remote)
for cmd in config.get("commands", []):
logger.debug(f"CMD: cmd['name']")
deps = [project_dir / dep for dep in cmd.get("deps", [])]
if any(not dep.exists() for dep in deps):
logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs")
continue
cmd_hash = get_command_hash(
"", "", [project_dir / dep for dep in cmd.get("deps", [])], cmd["script"]
)
logger.debug(f"CMD_HASH: {cmd_hash}")
for output_path in cmd.get("outputs", []):
output_loc = project_dir / output_path
if output_loc.exists() and _is_not_empty_dir(output_loc):
url = storage.push(
output_path,
command_hash=cmd_hash,
content_hash=get_content_hash(output_loc),
)
logger.debug(
f"URL: {url} for output {output_path} with cmd_hash {cmd_hash}"
)
yield output_path, url
def _is_not_empty_dir(loc: Path):
if not loc.is_dir():
return True
elif any(_is_not_empty_dir(child) for child in loc.iterdir()):
return True
else:
return False
from typing import Optional, List, Dict, TYPE_CHECKING
import os
import site
import hashlib
import urllib.parse
import tarfile
from pathlib import Path
from .._util import get_hash, get_checksum, download_file, ensure_pathy
from ...util import make_tempdir, get_minor_version, ENV_VARS, check_bool_env_var
from ...git_info import GIT_VERSION
from ... import about
if TYPE_CHECKING:
from pathy import Pathy # noqa: F401
class RemoteStorage:
"""Push and pull outputs to and from a remote file storage.
Remotes can be anything that `smart-open` can support: AWS, GCS, file system,
ssh, etc.
"""
def __init__(self, project_root: Path, url: str, *, compression="gz"):
self.root = project_root
self.url = ensure_pathy(url)
self.compression = compression
def push(self, path: Path, command_hash: str, content_hash: str) -> "Pathy":
"""Compress a file or directory within a project and upload it to a remote
storage. If an object exists at the full URL, nothing is done.
Within the remote storage, files are addressed by their project path
(url encoded) and two user-supplied hashes, representing their creation
context and their file contents. If the URL already exists, the data is
not uploaded. Paths are archived and compressed prior to upload.
"""
loc = self.root / path
if not loc.exists():
raise IOError(f"Cannot push {loc}: does not exist.")
url = self.make_url(path, command_hash, content_hash)
if url.exists():
return url
tmp: Path
with make_tempdir() as tmp:
tar_loc = tmp / self.encode_name(str(path))
mode_string = f"w:{self.compression}" if self.compression else "w"
with tarfile.open(tar_loc, mode=mode_string) as tar_file:
tar_file.add(str(loc), arcname=str(path))
with tar_loc.open(mode="rb") as input_file:
with url.open(mode="wb") as output_file:
output_file.write(input_file.read())
return url
def pull(
self,
path: Path,
*,
command_hash: Optional[str] = None,
content_hash: Optional[str] = None,
) -> Optional["Pathy"]:
"""Retrieve a file from the remote cache. If the file already exists,
nothing is done.
If the command_hash and/or content_hash are specified, only matching
results are returned. If no results are available, an error is raised.
"""
dest = self.root / path
if dest.exists():
return None
url = self.find(path, command_hash=command_hash, content_hash=content_hash)
if url is None:
return url
else:
# Make sure the destination exists
if not dest.parent.exists():
dest.parent.mkdir(parents=True)
tmp: Path
with make_tempdir() as tmp:
tar_loc = tmp / url.parts[-1]
download_file(url, tar_loc)
mode_string = f"r:{self.compression}" if self.compression else "r"
with tarfile.open(tar_loc, mode=mode_string) as tar_file:
# This requires that the path is added correctly, relative
# to root. This is how we set things up in push()
tar_file.extractall(self.root)
return url
def find(
self,
path: Path,
*,
command_hash: Optional[str] = None,
content_hash: Optional[str] = None,
) -> Optional["Pathy"]:
"""Find the best matching version of a file within the storage,
or `None` if no match can be found. If both the creation and content hash
are specified, only exact matches will be returned. Otherwise, the most
recent matching file is preferred.
"""
name = self.encode_name(str(path))
if command_hash is not None and content_hash is not None:
url = self.make_url(path, command_hash, content_hash)
urls = [url] if url.exists() else []
elif command_hash is not None:
urls = list((self.url / name / command_hash).iterdir())
else:
urls = list((self.url / name).iterdir())
if content_hash is not None:
urls = [url for url in urls if url.parts[-1] == content_hash]
return urls[-1] if urls else None
def make_url(self, path: Path, command_hash: str, content_hash: str) -> "Pathy":
"""Construct a URL from a subpath, a creation hash and a content hash."""
return self.url / self.encode_name(str(path)) / command_hash / content_hash
def encode_name(self, name: str) -> str:
"""Encode a subpath into a URL-safe name."""
return urllib.parse.quote_plus(name)
def get_content_hash(loc: Path) -> str:
return get_checksum(loc)
def get_command_hash(
site_hash: str, env_hash: str, deps: List[Path], cmd: List[str]
) -> str:
"""Create a hash representing the execution of a command. This includes the
currently installed packages, whatever environment variables have been marked
as relevant, and the command.
"""
if check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION):
spacy_v = GIT_VERSION
else:
spacy_v = str(get_minor_version(about.__version__) or "")
dep_checksums = [get_checksum(dep) for dep in sorted(deps)]
hashes = [spacy_v, site_hash, env_hash] + dep_checksums
hashes.extend(cmd)
creation_bytes = "".join(hashes).encode("utf8")
return hashlib.md5(creation_bytes).hexdigest()
def get_site_hash():
"""Hash the current Python environment's site-packages contents, including
the name and version of the libraries. The list we're hashing is what
`pip freeze` would output.
"""
site_dirs = site.getsitepackages()
if site.ENABLE_USER_SITE:
site_dirs.extend(site.getusersitepackages())
packages = set()
for site_dir in site_dirs:
site_dir = Path(site_dir)
for subpath in site_dir.iterdir():
if subpath.parts[-1].endswith("dist-info"):
packages.add(subpath.parts[-1].replace(".dist-info", ""))
package_bytes = "".join(sorted(packages)).encode("utf8")
return hashlib.md5sum(package_bytes).hexdigest()
def get_env_hash(env: Dict[str, str]) -> str:
"""Construct a hash of the environment variables that will be passed into
the commands.
Values in the env dict may be references to the current os.environ, using
the syntax $ENV_VAR to mean os.environ[ENV_VAR]
"""
env_vars = {}
for key, value in env.items():
if value.startswith("$"):
env_vars[key] = os.environ.get(value[1:], "")
else:
env_vars[key] = value
return get_hash(env_vars)
This diff is collapsed.
from typing import Optional, Dict, Any, Union
from pathlib import Path
from wasabi import msg
import typer
import logging
import sys
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import import_code, setup_gpu
from ..training.loop import train as train_nlp
from ..training.initialize import init_nlp
from .. import util
@app.command(
"train", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
)
def train_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"),
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
# fmt: on
):
"""
Train or update a spaCy pipeline. Requires data in spaCy's binary format. To
convert data from other formats, use the `spacy convert` command. The
config file includes all settings and hyperparameters used during training.
To override settings in the config, e.g. settings that point to local
paths or that you want to experiment with, you can override them as
command line options. For instance, --training.batch_size 128 overrides
the value of "batch_size" in the block "[training]". The --code argument
lets you pass in a Python file that's imported before training. It can be
used to register custom functions and architectures that can then be
referenced in the config.
DOCS: https://spacy.io/api/cli#train
"""
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
overrides = parse_config_overrides(ctx.args)
import_code(code_path)
train(config_path, output_path, use_gpu=use_gpu, overrides=overrides)
def train(
config_path: Union[str, Path],
output_path: Optional[Union[str, Path]] = None,
*,
use_gpu: int = -1,
overrides: Dict[str, Any] = util.SimpleFrozenDict(),
):
config_path = util.ensure_path(config_path)
output_path = util.ensure_path(output_path)
# Make sure all files and paths exists if they are needed
if not config_path or (str(config_path) != "-" and not config_path.exists()):
msg.fail("Config file not found", config_path, exits=1)
if not output_path:
msg.info("No output directory provided")
else:
if not output_path.exists():
output_path.mkdir(parents=True)
msg.good(f"Created output directory: {output_path}")
msg.info(f"Saving to output directory: {output_path}")
setup_gpu(use_gpu)
with show_validation_error(config_path):
config = util.load_config(config_path, overrides=overrides, interpolate=False)
msg.divider("Initializing pipeline")
with show_validation_error(config_path, hint_fill=False):
nlp = init_nlp(config, use_gpu=use_gpu)
msg.good("Initialized pipeline")
msg.divider("Training pipeline")
train_nlp(nlp, output_path, use_gpu=use_gpu, stdout=sys.stdout, stderr=sys.stderr)
from typing import Tuple
from pathlib import Path
import sys
import requests
from wasabi import msg, Printer
import warnings
from ._util import app
from .. import about
from ..util import get_package_version, get_installed_models, get_minor_version
from ..util import get_package_path, get_model_meta, is_compatible_version
@app.command("validate")
def validate_cli():
"""
Validate the currently installed pipeline packages and spaCy version. Checks
if the installed packages are compatible and shows upgrade instructions if
available. Should be run after `pip install -U spacy`.
DOCS: https://spacy.io/api/cli#validate
"""
validate()
def validate() -> None:
model_pkgs, compat = get_model_pkgs()
spacy_version = get_minor_version(about.__version__)
current_compat = compat.get(spacy_version, {})
if not current_compat:
msg.warn(f"No compatible packages found for v{spacy_version} of spaCy")
incompat_models = {d["name"] for _, d in model_pkgs.items() if not d["compat"]}
na_models = [m for m in incompat_models if m not in current_compat]
update_models = [m for m in incompat_models if m in current_compat]
spacy_dir = Path(__file__).parent.parent
msg.divider(f"Installed pipeline packages (spaCy v{about.__version__})")
msg.info(f"spaCy installation: {spacy_dir}")
if model_pkgs:
header = ("NAME", "SPACY", "VERSION", "")
rows = []
for name, data in model_pkgs.items():
if data["compat"]:
comp = msg.text("", color="green", icon="good", no_print=True)
version = msg.text(data["version"], color="green", no_print=True)
else:
version = msg.text(data["version"], color="yellow", no_print=True)
comp = f"--> {current_compat.get(data['name'], ['n/a'])[0]}"
rows.append((data["name"], data["spacy"], version, comp))
msg.table(rows, header=header)
else:
msg.text("No pipeline packages found in your current environment.", exits=0)
if update_models:
msg.divider("Install updates")
msg.text("Use the following commands to update the packages:")
cmd = "python -m spacy download {}"
print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n")
if na_models:
msg.info(
f"The following packages are custom spaCy pipelines or not "
f"available for spaCy v{about.__version__}:",
", ".join(na_models),
)
if incompat_models:
sys.exit(1)
def get_model_pkgs(silent: bool = False) -> Tuple[dict, dict]:
msg = Printer(no_print=silent, pretty=not silent)
with msg.loading("Loading compatibility table..."):
r = requests.get(about.__compatibility__)
if r.status_code != 200:
msg.fail(
f"Server error ({r.status_code})",
"Couldn't fetch compatibility table.",
exits=1,
)
msg.good("Loaded compatibility table")
compat = r.json()["spacy"]
all_models = set()
with warnings.catch_warnings():
warnings.filterwarnings("ignore", message="\\[W09[45]")
installed_models = get_installed_models()
for spacy_v, models in dict(compat).items():
all_models.update(models.keys())
for model, model_vs in models.items():
compat[spacy_v][model] = [reformat_version(v) for v in model_vs]
pkgs = {}
for pkg_name in installed_models:
package = pkg_name.replace("-", "_")
version = get_package_version(pkg_name)
if package in compat:
is_compat = version in compat[package]
spacy_version = about.__version__
else:
model_path = get_package_path(package)
with warnings.catch_warnings():
warnings.filterwarnings("ignore", message="\\[W09[45]")
model_meta = get_model_meta(model_path)
spacy_version = model_meta.get("spacy_version", "n/a")
is_compat = is_compatible_version(about.__version__, spacy_version) # type: ignore[assignment]
pkgs[pkg_name] = {
"name": package,
"version": version,
"spacy": spacy_version,
"compat": is_compat,
}
return pkgs, compat
def reformat_version(version: str) -> str:
"""Hack to reformat old versions ending on '-alpha' to match pip format."""
if version.endswith("-alpha"):
return version.replace("-alpha", "a0")
return version.replace("-alpha", "a")
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
from .stop_words import STOP_WORDS
from ...language import Language, BaseDefaults
class AfrikaansDefaults(BaseDefaults):
stop_words = STOP_WORDS
class Afrikaans(Language):
lang = "af"
Defaults = AfrikaansDefaults
__all__ = ["Afrikaans"]
This diff is collapsed.
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language, BaseDefaults
from ...attrs import LANG
from ...util import update_exc
class AmharicDefaults(BaseDefaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: "am"
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
suffixes = TOKENIZER_SUFFIXES
writing_system = {"direction": "ltr", "has_case": False, "has_letters": True}
class Amharic(Language):
lang = "am"
Defaults = AmharicDefaults
__all__ = ["Amharic"]
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment