GITENV file updated 21

parent 69c5f5f6
This source diff could not be displayed because it is too large. You can view the blob instead.
from ...typedefs cimport class_t, hash_t
# These are passed as callbacks to thinc.search.Beam
cdef int transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1
cdef int check_final_state(void* _state, void* extra_args) except -1
# cython: infer_types=True
# cython: profile=True
cimport numpy as np
import numpy
from cpython.ref cimport PyObject, Py_XDECREF
from thinc.extra.search cimport Beam
from thinc.extra.search import MaxViolation
from thinc.extra.search cimport MaxViolation
from ...typedefs cimport hash_t, class_t
from .transition_system cimport TransitionSystem, Transition
from ...errors import Errors
from .stateclass cimport StateC, StateClass
# These are passed as callbacks to thinc.search.Beam
cdef int transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:
dest = <StateC*>_dest
src = <StateC*>_src
moves = <const Transition*>_moves
dest.clone(src)
moves[clas].do(dest, moves[clas].label)
cdef int check_final_state(void* _state, void* extra_args) except -1:
state = <StateC*>_state
return state.is_final()
cdef class BeamBatch(object):
cdef public TransitionSystem moves
cdef public object states
cdef public object docs
cdef public object golds
cdef public object beams
def __init__(self, TransitionSystem moves, states, golds,
int width, float density=0.):
cdef StateClass state
self.moves = moves
self.states = states
self.docs = [state.doc for state in states]
self.golds = golds
self.beams = []
cdef Beam beam
cdef StateC* st
for state in states:
beam = Beam(self.moves.n_moves, width, min_density=density)
beam.initialize(self.moves.init_beam_state,
self.moves.del_beam_state, state.c.length,
<void*>state.c._sent)
for i in range(beam.width):
st = <StateC*>beam.at(i)
st.offset = state.c.offset
beam.check_done(check_final_state, NULL)
self.beams.append(beam)
@property
def is_done(self):
return all(b.is_done for b in self.beams)
def __getitem__(self, i):
return self.beams[i]
def __len__(self):
return len(self.beams)
def get_states(self):
cdef Beam beam
cdef StateC* state
cdef StateClass stcls
states = []
for beam, doc in zip(self, self.docs):
for i in range(beam.size):
state = <StateC*>beam.at(i)
stcls = StateClass.borrow(state, doc)
states.append(stcls)
return states
def get_unfinished_states(self):
return [st for st in self.get_states() if not st.is_final()]
def advance(self, float[:, ::1] scores, follow_gold=False):
cdef Beam beam
cdef int nr_class = scores.shape[1]
cdef const float* c_scores = &scores[0, 0]
docs = self.docs
for i, beam in enumerate(self):
if not beam.is_done:
nr_state = self._set_scores(beam, c_scores, nr_class)
assert nr_state
if self.golds is not None:
self._set_costs(
beam,
docs[i],
self.golds[i],
follow_gold=follow_gold
)
c_scores += nr_state * nr_class
beam.advance(transition_state, NULL, <void*>self.moves.c)
beam.check_done(check_final_state, NULL)
cdef int _set_scores(self, Beam beam, const float* scores, int nr_class) except -1:
cdef int nr_state = 0
for i in range(beam.size):
state = <StateC*>beam.at(i)
if not state.is_final():
for j in range(nr_class):
beam.scores[i][j] = scores[nr_state * nr_class + j]
self.moves.set_valid(beam.is_valid[i], state)
nr_state += 1
else:
for j in range(beam.nr_class):
beam.scores[i][j] = 0
beam.costs[i][j] = 0
return nr_state
def _set_costs(self, Beam beam, doc, gold, int follow_gold=False):
cdef const StateC* state
for i in range(beam.size):
state = <const StateC*>beam.at(i)
if state.is_final():
for j in range(beam.nr_class):
beam.is_valid[i][j] = 0
beam.costs[i][j] = 9000
else:
self.moves.set_costs(beam.is_valid[i], beam.costs[i],
state, gold)
if follow_gold:
min_cost = 0
for j in range(beam.nr_class):
if beam.is_valid[i][j] and beam.costs[i][j] < min_cost:
min_cost = beam.costs[i][j]
for j in range(beam.nr_class):
if beam.costs[i][j] > min_cost:
beam.is_valid[i][j] = 0
def update_beam(TransitionSystem moves, states, golds, model, int width, beam_density=0.0):
cdef MaxViolation violn
pbeam = BeamBatch(moves, states, golds, width=width, density=beam_density)
gbeam = BeamBatch(moves, states, golds, width=width, density=0.0)
cdef StateClass state
beam_maps = []
backprops = []
violns = [MaxViolation() for _ in range(len(states))]
dones = [False for _ in states]
while not pbeam.is_done or not gbeam.is_done:
# The beam maps let us find the right row in the flattened scores
# array for each state. States are identified by (example id,
# history). We keep a different beam map for each step (since we'll
# have a flat scores array for each step). The beam map will let us
# take the per-state losses, and compute the gradient for each (step,
# state, class).
# Gather all states from the two beams in a list. Some stats may occur
# in both beams. To figure out which beam each state belonged to,
# we keep two lists of indices, p_indices and g_indices
states, p_indices, g_indices, beam_map = get_unique_states(pbeam, gbeam)
beam_maps.append(beam_map)
if not states:
break
# Now that we have our flat list of states, feed them through the model
scores, bp_scores = model.begin_update(states)
assert scores.size != 0
# Store the callbacks for the backward pass
backprops.append(bp_scores)
# Unpack the scores for the two beams. The indices arrays
# tell us which example and state the scores-row refers to.
# Now advance the states in the beams. The gold beam is constrained to
# to follow only gold analyses.
if not pbeam.is_done:
pbeam.advance(model.ops.as_contig(scores[p_indices]))
if not gbeam.is_done:
gbeam.advance(model.ops.as_contig(scores[g_indices]), follow_gold=True)
# Track the "maximum violation", to use in the update.
for i, violn in enumerate(violns):
if not dones[i]:
violn.check_crf(pbeam[i], gbeam[i])
if pbeam[i].is_done and gbeam[i].is_done:
dones[i] = True
histories = []
grads = []
for violn in violns:
if violn.p_hist:
histories.append(violn.p_hist + violn.g_hist)
d_loss = [d_l * violn.cost for d_l in violn.p_probs + violn.g_probs]
grads.append(d_loss)
else:
histories.append([])
grads.append([])
loss = 0.0
states_d_scores = get_gradient(moves.n_moves, beam_maps, histories, grads)
for i, (d_scores, bp_scores) in enumerate(zip(states_d_scores, backprops)):
loss += (d_scores**2).mean()
bp_scores(d_scores)
return loss
def collect_states(beams, docs):
cdef StateClass state
cdef Beam beam
states = []
for state_or_beam, doc in zip(beams, docs):
if isinstance(state_or_beam, StateClass):
states.append(state_or_beam)
else:
beam = state_or_beam
state = StateClass.borrow(<StateC*>beam.at(0), doc)
states.append(state)
return states
def get_unique_states(pbeams, gbeams):
seen = {}
states = []
p_indices = []
g_indices = []
beam_map = {}
docs = pbeams.docs
cdef Beam pbeam, gbeam
if len(pbeams) != len(gbeams):
raise ValueError(Errors.E079.format(pbeams=len(pbeams), gbeams=len(gbeams)))
for eg_id, (pbeam, gbeam, doc) in enumerate(zip(pbeams, gbeams, docs)):
if not pbeam.is_done:
for i in range(pbeam.size):
state = StateClass.borrow(<StateC*>pbeam.at(i), doc)
if not state.is_final():
key = tuple([eg_id] + pbeam.histories[i])
if key in seen:
raise ValueError(Errors.E080.format(key=key))
seen[key] = len(states)
p_indices.append(len(states))
states.append(state)
beam_map.update(seen)
if not gbeam.is_done:
for i in range(gbeam.size):
state = StateClass.borrow(<StateC*>gbeam.at(i), doc)
if not state.is_final():
key = tuple([eg_id] + gbeam.histories[i])
if key in seen:
g_indices.append(seen[key])
else:
g_indices.append(len(states))
beam_map[key] = len(states)
states.append(state)
p_indices = numpy.asarray(p_indices, dtype='i')
g_indices = numpy.asarray(g_indices, dtype='i')
return states, p_indices, g_indices, beam_map
def get_gradient(nr_class, beam_maps, histories, losses):
"""The global model assigns a loss to each parse. The beam scores
are additive, so the same gradient is applied to each action
in the history. This gives the gradient of a single *action*
for a beam state -- so we have "the gradient of loss for taking
action i given history H."
Histories: Each history is a list of actions
Each candidate has a history
Each beam has multiple candidates
Each batch has multiple beams
So history is list of lists of lists of ints
"""
grads = []
nr_steps = []
for eg_id, hists in enumerate(histories):
nr_step = 0
for loss, hist in zip(losses[eg_id], hists):
assert not numpy.isnan(loss)
if loss != 0.0:
nr_step = max(nr_step, len(hist))
nr_steps.append(nr_step)
for i in range(max(nr_steps)):
grads.append(numpy.zeros((max(beam_maps[i].values())+1, nr_class),
dtype='f'))
if len(histories) != len(losses):
raise ValueError(Errors.E081.format(n_hist=len(histories), losses=len(losses)))
for eg_id, hists in enumerate(histories):
for loss, hist in zip(losses[eg_id], hists):
assert not numpy.isnan(loss)
if loss == 0.0:
continue
key = tuple([eg_id])
# Adjust loss for length
# We need to do this because each state in a short path is scored
# multiple times, as we add in the average cost when we run out
# of actions.
avg_loss = loss / len(hist)
loss += avg_loss * (nr_steps[eg_id] - len(hist))
for step, clas in enumerate(hist):
i = beam_maps[step][key]
# In step j, at state i action clas
# resulted in loss
grads[step][i, clas] += loss
key = key + tuple([clas])
return grads
This source diff could not be displayed because it is too large. You can view the blob instead.
from libc.string cimport memcpy, memset
from libc.stdlib cimport calloc, free
from libc.stdint cimport uint32_t, uint64_t
cimport libcpp
from libcpp.vector cimport vector
from libcpp.set cimport set
from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
from murmurhash.mrmr cimport hash64
from ...vocab cimport EMPTY_LEXEME
from ...structs cimport TokenC, SpanC
from ...lexeme cimport Lexeme
from ...attrs cimport IS_SPACE
from ...typedefs cimport attr_t
cdef inline bint is_space_token(const TokenC* token) nogil:
return Lexeme.c_check_flag(token.lex, IS_SPACE)
cdef struct ArcC:
int head
int child
attr_t label
cdef cppclass StateC:
int* _heads
const TokenC* _sent
vector[int] _stack
vector[int] _rebuffer
vector[SpanC] _ents
vector[ArcC] _left_arcs
vector[ArcC] _right_arcs
vector[libcpp.bool] _unshiftable
set[int] _sent_starts
TokenC _empty_token
int length
int offset
int _b_i
__init__(const TokenC* sent, int length) nogil:
this._sent = sent
this._heads = <int*>calloc(length, sizeof(int))
if not (this._sent and this._heads):
with gil:
PyErr_SetFromErrno(MemoryError)
PyErr_CheckSignals()
this.offset = 0
this.length = length
this._b_i = 0
for i in range(length):
this._heads[i] = -1
this._unshiftable.push_back(0)
memset(&this._empty_token, 0, sizeof(TokenC))
this._empty_token.lex = &EMPTY_LEXEME
__dealloc__():
free(this._heads)
void set_context_tokens(int* ids, int n) nogil:
cdef int i, j
if n == 1:
if this.B(0) >= 0:
ids[0] = this.B(0)
else:
ids[0] = -1
elif n == 2:
ids[0] = this.B(0)
ids[1] = this.S(0)
elif n == 3:
if this.B(0) >= 0:
ids[0] = this.B(0)
else:
ids[0] = -1
# First word of entity, if any
if this.entity_is_open():
ids[1] = this.E(0)
else:
ids[1] = -1
# Last word of entity, if within entity
if ids[0] == -1 or ids[1] == -1:
ids[2] = -1
else:
ids[2] = ids[0] - 1
elif n == 8:
ids[0] = this.B(0)
ids[1] = this.B(1)
ids[2] = this.S(0)
ids[3] = this.S(1)
ids[4] = this.S(2)
ids[5] = this.L(this.B(0), 1)
ids[6] = this.L(this.S(0), 1)
ids[7] = this.R(this.S(0), 1)
elif n == 13:
ids[0] = this.B(0)
ids[1] = this.B(1)
ids[2] = this.S(0)
ids[3] = this.S(1)
ids[4] = this.S(2)
ids[5] = this.L(this.S(0), 1)
ids[6] = this.L(this.S(0), 2)
ids[6] = this.R(this.S(0), 1)
ids[7] = this.L(this.B(0), 1)
ids[8] = this.R(this.S(0), 2)
ids[9] = this.L(this.S(1), 1)
ids[10] = this.L(this.S(1), 2)
ids[11] = this.R(this.S(1), 1)
ids[12] = this.R(this.S(1), 2)
elif n == 6:
for i in range(6):
ids[i] = -1
if this.B(0) >= 0:
ids[0] = this.B(0)
if this.entity_is_open():
ent = this.get_ent()
j = 1
for i in range(ent.start, this.B(0)):
ids[j] = i
j += 1
if j >= 6:
break
else:
# TODO error =/
pass
for i in range(n):
if ids[i] >= 0:
ids[i] += this.offset
else:
ids[i] = -1
int S(int i) nogil const:
if i >= this._stack.size():
return -1
elif i < 0:
return -1
return this._stack.at(this._stack.size() - (i+1))
int B(int i) nogil const:
if i < 0:
return -1
elif i < this._rebuffer.size():
return this._rebuffer.at(this._rebuffer.size() - (i+1))
else:
b_i = this._b_i + (i - this._rebuffer.size())
if b_i >= this.length:
return -1
else:
return b_i
const TokenC* B_(int i) nogil const:
return this.safe_get(this.B(i))
const TokenC* E_(int i) nogil const:
return this.safe_get(this.E(i))
const TokenC* safe_get(int i) nogil const:
if i < 0 or i >= this.length:
return &this._empty_token
else:
return &this._sent[i]
void get_arcs(vector[ArcC]* arcs) nogil const:
for i in range(this._left_arcs.size()):
arc = this._left_arcs.at(i)
if arc.head != -1 and arc.child != -1:
arcs.push_back(arc)
for i in range(this._right_arcs.size()):
arc = this._right_arcs.at(i)
if arc.head != -1 and arc.child != -1:
arcs.push_back(arc)
int H(int child) nogil const:
if child >= this.length or child < 0:
return -1
else:
return this._heads[child]
int E(int i) nogil const:
if this._ents.size() == 0:
return -1
else:
return this._ents.back().start
int L(int head, int idx) nogil const:
if idx < 1 or this._left_arcs.size() == 0:
return -1
cdef vector[int] lefts
for i in range(this._left_arcs.size()):
arc = this._left_arcs.at(i)
if arc.head == head and arc.child != -1 and arc.child < head:
lefts.push_back(arc.child)
idx = (<int>lefts.size()) - idx
if idx < 0:
return -1
else:
return lefts.at(idx)
int R(int head, int idx) nogil const:
if idx < 1 or this._right_arcs.size() == 0:
return -1
cdef vector[int] rights
for i in range(this._right_arcs.size()):
arc = this._right_arcs.at(i)
if arc.head == head and arc.child != -1 and arc.child > head:
rights.push_back(arc.child)
idx = (<int>rights.size()) - idx
if idx < 0:
return -1
else:
return rights.at(idx)
bint empty() nogil const:
return this._stack.size() == 0
bint eol() nogil const:
return this.buffer_length() == 0
bint is_final() nogil const:
return this.stack_depth() <= 0 and this.eol()
int cannot_sent_start(int word) nogil const:
if word < 0 or word >= this.length:
return 0
elif this._sent[word].sent_start == -1:
return 1
else:
return 0
int is_sent_start(int word) nogil const:
if word < 0 or word >= this.length:
return 0
elif this._sent[word].sent_start == 1:
return 1
elif this._sent_starts.count(word) >= 1:
return 1
else:
return 0
void set_sent_start(int word, int value) nogil:
if value >= 1:
this._sent_starts.insert(word)
bint has_head(int child) nogil const:
return this._heads[child] >= 0
int l_edge(int word) nogil const:
return word
int r_edge(int word) nogil const:
return word
int n_L(int head) nogil const:
cdef int n = 0
for i in range(this._left_arcs.size()):
arc = this._left_arcs.at(i)
if arc.head == head and arc.child != -1 and arc.child < arc.head:
n += 1
return n
int n_R(int head) nogil const:
cdef int n = 0
for i in range(this._right_arcs.size()):
arc = this._right_arcs.at(i)
if arc.head == head and arc.child != -1 and arc.child > arc.head:
n += 1
return n
bint stack_is_connected() nogil const:
return False
bint entity_is_open() nogil const:
if this._ents.size() == 0:
return False
else:
return this._ents.back().end == -1
int stack_depth() nogil const:
return this._stack.size()
int buffer_length() nogil const:
return (this.length - this._b_i) + this._rebuffer.size()
void push() nogil:
b0 = this.B(0)
if this._rebuffer.size():
b0 = this._rebuffer.back()
this._rebuffer.pop_back()
else:
b0 = this._b_i
this._b_i += 1
this._stack.push_back(b0)
void pop() nogil:
this._stack.pop_back()
void force_final() nogil:
# This should only be used in desperate situations, as it may leave
# the analysis in an unexpected state.
this._stack.clear()
this._b_i = this.length
void unshift() nogil:
s0 = this._stack.back()
this._unshiftable[s0] = 1
this._rebuffer.push_back(s0)
this._stack.pop_back()
int is_unshiftable(int item) nogil const:
if item >= this._unshiftable.size():
return 0
else:
return this._unshiftable.at(item)
void set_reshiftable(int item) nogil:
if item < this._unshiftable.size():
this._unshiftable[item] = 0
void add_arc(int head, int child, attr_t label) nogil:
if this.has_head(child):
this.del_arc(this.H(child), child)
cdef ArcC arc
arc.head = head
arc.child = child
arc.label = label
if head > child:
this._left_arcs.push_back(arc)
else:
this._right_arcs.push_back(arc)
this._heads[child] = head
void del_arc(int h_i, int c_i) nogil:
cdef vector[ArcC]* arcs
if h_i > c_i:
arcs = &this._left_arcs
else:
arcs = &this._right_arcs
if arcs.size() == 0:
return
arc = arcs.back()
if arc.head == h_i and arc.child == c_i:
arcs.pop_back()
else:
for i in range(arcs.size()-1):
arc = arcs.at(i)
if arc.head == h_i and arc.child == c_i:
arc.head = -1
arc.child = -1
arc.label = 0
break
SpanC get_ent() nogil const:
cdef SpanC ent
if this._ents.size() == 0:
ent.start = 0
ent.end = 0
ent.label = 0
return ent
else:
return this._ents.back()
void open_ent(attr_t label) nogil:
cdef SpanC ent
ent.start = this.B(0)
ent.label = label
ent.end = -1
this._ents.push_back(ent)
void close_ent() nogil:
this._ents.back().end = this.B(0)+1
void clone(const StateC* src) nogil:
this.length = src.length
this._sent = src._sent
this._stack = src._stack
this._rebuffer = src._rebuffer
this._sent_starts = src._sent_starts
this._unshiftable = src._unshiftable
memcpy(this._heads, src._heads, this.length * sizeof(this._heads[0]))
this._ents = src._ents
this._left_arcs = src._left_arcs
this._right_arcs = src._right_arcs
this._b_i = src._b_i
this.offset = src.offset
this._empty_token = src._empty_token
This source diff could not be displayed because it is too large. You can view the blob instead.
from ._state cimport StateC
from ...typedefs cimport weight_t, attr_t
from .transition_system cimport Transition, TransitionSystem
cdef class ArcEager(TransitionSystem):
cdef get_arcs(self, StateC* state)
# cython: profile=True, cdivision=True, infer_types=True
from cymem.cymem cimport Pool, Address
from libc.stdint cimport int32_t
from libcpp.vector cimport vector
from collections import defaultdict, Counter
from ...typedefs cimport hash_t, attr_t
from ...strings cimport hash_string
from ...structs cimport TokenC
from ...tokens.doc cimport Doc, set_children_from_heads
from ...tokens.token cimport MISSING_DEP
from ...training.example cimport Example
from .stateclass cimport StateClass
from ._state cimport StateC, ArcC
from ...errors import Errors
from thinc.extra.search cimport Beam
cdef weight_t MIN_SCORE = -90000
cdef attr_t SUBTOK_LABEL = hash_string('subtok')
DEF NON_MONOTONIC = True
cdef enum:
SHIFT
REDUCE
LEFT
RIGHT
BREAK
N_MOVES
MOVE_NAMES = [None] * N_MOVES
MOVE_NAMES[SHIFT] = 'S'
MOVE_NAMES[REDUCE] = 'D'
MOVE_NAMES[LEFT] = 'L'
MOVE_NAMES[RIGHT] = 'R'
MOVE_NAMES[BREAK] = 'B'
cdef enum:
HEAD_IN_STACK = 0
HEAD_IN_BUFFER
HEAD_UNKNOWN
IS_SENT_START
SENT_START_UNKNOWN
cdef struct GoldParseStateC:
char* state_bits
int32_t* n_kids_in_buffer
int32_t* n_kids_in_stack
int32_t* heads
attr_t* labels
int32_t** kids
int32_t* n_kids
int32_t length
int32_t stride
weight_t push_cost
weight_t pop_cost
cdef GoldParseStateC create_gold_state(Pool mem, const StateC* state,
heads, labels, sent_starts) except *:
cdef GoldParseStateC gs
gs.length = len(heads)
gs.stride = 1
assert gs.length > 0
gs.labels = <attr_t*>mem.alloc(gs.length, sizeof(gs.labels[0]))
gs.heads = <int32_t*>mem.alloc(gs.length, sizeof(gs.heads[0]))
gs.n_kids = <int32_t*>mem.alloc(gs.length, sizeof(gs.n_kids[0]))
gs.state_bits = <char*>mem.alloc(gs.length, sizeof(gs.state_bits[0]))
gs.n_kids_in_buffer = <int32_t*>mem.alloc(gs.length, sizeof(gs.n_kids_in_buffer[0]))
gs.n_kids_in_stack = <int32_t*>mem.alloc(gs.length, sizeof(gs.n_kids_in_stack[0]))
for i, is_sent_start in enumerate(sent_starts):
if is_sent_start == True:
gs.state_bits[i] = set_state_flag(
gs.state_bits[i],
IS_SENT_START,
1
)
gs.state_bits[i] = set_state_flag(
gs.state_bits[i],
SENT_START_UNKNOWN,
0
)
elif is_sent_start is None:
gs.state_bits[i] = set_state_flag(
gs.state_bits[i],
SENT_START_UNKNOWN,
1
)
gs.state_bits[i] = set_state_flag(
gs.state_bits[i],
IS_SENT_START,
0
)
else:
gs.state_bits[i] = set_state_flag(
gs.state_bits[i],
SENT_START_UNKNOWN,
0
)
gs.state_bits[i] = set_state_flag(
gs.state_bits[i],
IS_SENT_START,
0
)
for i, (head, label) in enumerate(zip(heads, labels)):
if head is not None:
gs.heads[i] = head
gs.labels[i] = label
if i != head:
gs.n_kids[head] += 1
gs.state_bits[i] = set_state_flag(
gs.state_bits[i],
HEAD_UNKNOWN,
0
)
else:
gs.state_bits[i] = set_state_flag(
gs.state_bits[i],
HEAD_UNKNOWN,
1
)
# Make an array of pointers, pointing into the gs_kids_flat array.
assert gs.length > 0
gs.kids = <int32_t**>mem.alloc(gs.length, sizeof(int32_t*))
for i in range(gs.length):
if gs.n_kids[i] != 0:
gs.kids[i] = <int32_t*>mem.alloc(gs.n_kids[i], sizeof(int32_t))
# This is a temporary buffer
js_addr = Address(gs.length, sizeof(int32_t))
js = <int32_t*>js_addr.ptr
for i in range(gs.length):
if not is_head_unknown(&gs, i):
head = gs.heads[i]
if head != i:
gs.kids[head][js[head]] = i
js[head] += 1
gs.push_cost = push_cost(state, &gs)
gs.pop_cost = pop_cost(state, &gs)
return gs
cdef void update_gold_state(GoldParseStateC* gs, const StateC* s) nogil:
for i in range(gs.length):
gs.state_bits[i] = set_state_flag(
gs.state_bits[i],
HEAD_IN_BUFFER,
0
)
gs.state_bits[i] = set_state_flag(
gs.state_bits[i],
HEAD_IN_STACK,
0
)
gs.n_kids_in_stack[i] = 0
gs.n_kids_in_buffer[i] = 0
for i in range(s.stack_depth()):
s_i = s.S(i)
if not is_head_unknown(gs, s_i) and gs.heads[s_i] != s_i:
gs.n_kids_in_stack[gs.heads[s_i]] += 1
for kid in gs.kids[s_i][:gs.n_kids[s_i]]:
gs.state_bits[kid] = set_state_flag(
gs.state_bits[kid],
HEAD_IN_STACK,
1
)
for i in range(s.buffer_length()):
b_i = s.B(i)
if s.is_sent_start(b_i):
break
if not is_head_unknown(gs, b_i) and gs.heads[b_i] != b_i:
gs.n_kids_in_buffer[gs.heads[b_i]] += 1
for kid in gs.kids[b_i][:gs.n_kids[b_i]]:
gs.state_bits[kid] = set_state_flag(
gs.state_bits[kid],
HEAD_IN_BUFFER,
1
)
gs.push_cost = push_cost(s, gs)
gs.pop_cost = pop_cost(s, gs)
cdef class ArcEagerGold:
cdef GoldParseStateC c
cdef Pool mem
def __init__(self, ArcEager moves, StateClass stcls, Example example):
self.mem = Pool()
heads, labels = example.get_aligned_parse(projectivize=True)
labels = [example.x.vocab.strings.add(label) if label is not None else MISSING_DEP for label in labels]
sent_starts = _get_aligned_sent_starts(example)
assert len(heads) == len(labels) == len(sent_starts), (len(heads), len(labels), len(sent_starts))
self.c = create_gold_state(self.mem, stcls.c, heads, labels, sent_starts)
def update(self, StateClass stcls):
update_gold_state(&self.c, stcls.c)
def _get_aligned_sent_starts(example):
"""Get list of SENT_START attributes aligned to the predicted tokenization.
If the reference has not sentence starts, return a list of None values.
This function is slightly different from the one on Example, because we also
check whether the reference sentences align across multiple sentences,
and return missing values if they do. This prevents a problem where you have
the start of a sentence merged onto a token that belongs to two sentences.
"""
if example.y.has_annotation("SENT_START"):
align = example.alignment.y2x
sent_starts = [False] * len(example.x)
seen_words = set()
for y_sent in example.y.sents:
x_indices = list(align[y_sent.start : y_sent.end].dataXd)
if any(x_idx in seen_words for x_idx in x_indices):
# If there are any tokens in X that align across two sentences,
# regard the sentence annotations as missing, as we can't
# reliably use them.
return [None] * len(example.x)
seen_words.update(x_indices)
sent_starts[x_indices[0]] = True
return sent_starts
else:
return [None] * len(example.x)
cdef int check_state_gold(char state_bits, char flag) nogil:
cdef char one = 1
return 1 if (state_bits & (one << flag)) else 0
cdef int set_state_flag(char state_bits, char flag, int value) nogil:
cdef char one = 1
if value:
return state_bits | (one << flag)
else:
return state_bits & ~(one << flag)
cdef int is_head_in_stack(const GoldParseStateC* gold, int i) nogil:
return check_state_gold(gold.state_bits[i], HEAD_IN_STACK)
cdef int is_head_in_buffer(const GoldParseStateC* gold, int i) nogil:
return check_state_gold(gold.state_bits[i], HEAD_IN_BUFFER)
cdef int is_head_unknown(const GoldParseStateC* gold, int i) nogil:
return check_state_gold(gold.state_bits[i], HEAD_UNKNOWN)
cdef int is_sent_start(const GoldParseStateC* gold, int i) nogil:
return check_state_gold(gold.state_bits[i], IS_SENT_START)
cdef int is_sent_start_unknown(const GoldParseStateC* gold, int i) nogil:
return check_state_gold(gold.state_bits[i], SENT_START_UNKNOWN)
# Helper functions for the arc-eager oracle
cdef weight_t push_cost(const StateC* state, const GoldParseStateC* gold) nogil:
cdef weight_t cost = 0
b0 = state.B(0)
if b0 < 0:
return 9000
if is_head_in_stack(gold, b0):
cost += 1
cost += gold.n_kids_in_stack[b0]
if Break.is_valid(state, 0) and is_sent_start(gold, state.B(1)):
cost += 1
return cost
cdef weight_t pop_cost(const StateC* state, const GoldParseStateC* gold) nogil:
cdef weight_t cost = 0
s0 = state.S(0)
if s0 < 0:
return 9000
if is_head_in_buffer(gold, s0):
cost += 1
cost += gold.n_kids_in_buffer[s0]
return cost
cdef bint arc_is_gold(const GoldParseStateC* gold, int head, int child) nogil:
if is_head_unknown(gold, child):
return True
elif gold.heads[child] == head:
return True
else:
return False
cdef bint label_is_gold(const GoldParseStateC* gold, int child, attr_t label) nogil:
if is_head_unknown(gold, child):
return True
elif label == 0:
return True
elif gold.labels[child] == label:
return True
else:
return False
cdef bint _is_gold_root(const GoldParseStateC* gold, int word) nogil:
return gold.heads[word] == word or is_head_unknown(gold, word)
cdef class Shift:
"""Move the first word of the buffer onto the stack and mark it as "shifted"
Validity:
* If stack is empty
* At least two words in sentence
* Word has not been shifted before
Cost: push_cost
Action:
* Mark B[0] as 'shifted'
* Push stack
* Advance buffer
"""
@staticmethod
cdef bint is_valid(const StateC* st, attr_t label) nogil:
if st.stack_depth() == 0:
return 1
elif st.buffer_length() < 2:
return 0
elif st.is_sent_start(st.B(0)):
return 0
elif st.is_unshiftable(st.B(0)):
return 0
else:
return 1
@staticmethod
cdef int transition(StateC* st, attr_t label) nogil:
st.push()
@staticmethod
cdef weight_t cost(const StateC* state, const void* _gold, attr_t label) nogil:
gold = <const GoldParseStateC*>_gold
return gold.push_cost
cdef class Reduce:
"""
Pop from the stack. If it has no head and the stack isn't empty, place
it back on the buffer.
Validity:
* Stack not empty
* Buffer nt empty
* Stack depth 1 and cannot sent start l_edge(st.B(0))
Cost:
* If B[0] is the start of a sentence, cost is 0
* Arcs between stack and buffer
* If arc has no head, we're saving arcs between S[0] and S[1:], so decrement
cost by those arcs.
"""
@staticmethod
cdef bint is_valid(const StateC* st, attr_t label) nogil:
if st.stack_depth() == 0:
return False
elif st.buffer_length() == 0:
return True
elif st.stack_depth() == 1 and st.cannot_sent_start(st.l_edge(st.B(0))):
return False
else:
return True
@staticmethod
cdef int transition(StateC* st, attr_t label) nogil:
if st.has_head(st.S(0)) or st.stack_depth() == 1:
st.pop()
else:
st.unshift()
@staticmethod
cdef weight_t cost(const StateC* state, const void* _gold, attr_t label) nogil:
gold = <const GoldParseStateC*>_gold
if state.is_sent_start(state.B(0)):
return 0
s0 = state.S(0)
cost = gold.pop_cost
if not state.has_head(s0):
# Decrement cost for the arcs we save, as we'll be putting this
# back to the buffer
if is_head_in_stack(gold, s0):
cost -= 1
cost -= gold.n_kids_in_stack[s0]
return cost
cdef class LeftArc:
"""Add an arc between B[0] and S[0], replacing the previous head of S[0] if
one is set. Pop S[0] from the stack.
Validity:
* len(S) >= 1
* len(B) >= 1
* not is_sent_start(B[0])
Cost:
pop_cost - Arc(B[0], S[0], label) + (Arc(S[1], S[0]) if H(S[0]) else Arcs(S, S[0]))
"""
@staticmethod
cdef bint is_valid(const StateC* st, attr_t label) nogil:
if st.stack_depth() == 0:
return 0
elif st.buffer_length() == 0:
return 0
elif st.is_sent_start(st.B(0)):
return 0
elif label == SUBTOK_LABEL and st.S(0) != (st.B(0)-1):
return 0
else:
return 1
@staticmethod
cdef int transition(StateC* st, attr_t label) nogil:
st.add_arc(st.B(0), st.S(0), label)
# If we change the stack, it's okay to remove the shifted mark, as
# we can't get in an infinite loop this way.
st.set_reshiftable(st.B(0))
st.pop()
@staticmethod
cdef inline weight_t cost(const StateC* state, const void* _gold, attr_t label) nogil:
gold = <const GoldParseStateC*>_gold
cdef weight_t cost = gold.pop_cost
s0 = state.S(0)
s1 = state.S(1)
b0 = state.B(0)
if state.has_head(s0):
# Increment cost if we're clobbering a correct arc
cost += gold.heads[s0] == s1
else:
# If there's no head, we're losing arcs between S0 and S[1:].
cost += is_head_in_stack(gold, s0)
cost += gold.n_kids_in_stack[s0]
if b0 != -1 and s0 != -1 and gold.heads[s0] == b0:
cost -= 1
cost += not label_is_gold(gold, s0, label)
return cost
cdef class RightArc:
"""
Add an arc from S[0] to B[0]. Push B[0].
Validity:
* len(S) >= 1
* len(B) >= 1
* not is_sent_start(B[0])
Cost:
push_cost + (not shifted[b0] and Arc(B[1:], B[0])) - Arc(S[0], B[0], label)
"""
@staticmethod
cdef bint is_valid(const StateC* st, attr_t label) nogil:
if st.stack_depth() == 0:
return 0
elif st.buffer_length() == 0:
return 0
elif st.is_sent_start(st.B(0)):
return 0
elif label == SUBTOK_LABEL and st.S(0) != (st.B(0)-1):
# If there's (perhaps partial) parse pre-set, don't allow cycle.
return 0
else:
return 1
@staticmethod
cdef int transition(StateC* st, attr_t label) nogil:
st.add_arc(st.S(0), st.B(0), label)
st.push()
@staticmethod
cdef inline weight_t cost(const StateC* state, const void* _gold, attr_t label) nogil:
gold = <const GoldParseStateC*>_gold
cost = gold.push_cost
s0 = state.S(0)
b0 = state.B(0)
if s0 != -1 and b0 != -1 and gold.heads[b0] == s0:
cost -= 1
cost += not label_is_gold(gold, b0, label)
elif is_head_in_buffer(gold, b0) and not state.is_unshiftable(b0):
cost += 1
return cost
cdef class Break:
"""Mark the second word of the buffer as the start of a
sentence.
Validity:
* len(buffer) >= 2
* B[1] == B[0] + 1
* not is_sent_start(B[1])
* not cannot_sent_start(B[1])
Action:
* mark_sent_start(B[1])
Cost:
* not is_sent_start(B[1])
* Arcs between B[0] and B[1:]
* Arcs between S and B[1]
"""
@staticmethod
cdef bint is_valid(const StateC* st, attr_t label) nogil:
cdef int i
if st.buffer_length() < 2:
return False
elif st.B(1) != st.B(0) + 1:
return False
elif st.is_sent_start(st.B(1)):
return False
elif st.cannot_sent_start(st.B(1)):
return False
else:
return True
@staticmethod
cdef int transition(StateC* st, attr_t label) nogil:
st.set_sent_start(st.B(1), 1)
@staticmethod
cdef weight_t cost(const StateC* state, const void* _gold, attr_t label) nogil:
gold = <const GoldParseStateC*>_gold
cdef int b0 = state.B(0)
cdef int cost = 0
cdef int si
for i in range(state.stack_depth()):
si = state.S(i)
if is_head_in_buffer(gold, si):
cost += 1
cost += gold.n_kids_in_buffer[si]
# We need to score into B[1:], so subtract deps that are at b0
if gold.heads[b0] == si:
cost -= 1
if gold.heads[si] == b0:
cost -= 1
if not is_sent_start(gold, state.B(1)) \
and not is_sent_start_unknown(gold, state.B(1)):
cost += 1
return cost
cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:
st = new StateC(<const TokenC*>tokens, length)
return <void*>st
cdef int _del_state(Pool mem, void* state, void* x) except -1:
cdef StateC* st = <StateC*>state
del st
cdef class ArcEager(TransitionSystem):
def __init__(self, *args, **kwargs):
TransitionSystem.__init__(self, *args, **kwargs)
self.init_beam_state = _init_state
self.del_beam_state = _del_state
@classmethod
def get_actions(cls, **kwargs):
min_freq = kwargs.get('min_freq', None)
actions = defaultdict(lambda: Counter())
actions[SHIFT][''] = 1
actions[REDUCE][''] = 1
for label in kwargs.get('left_labels', []):
actions[LEFT][label] = 1
actions[SHIFT][label] = 1
for label in kwargs.get('right_labels', []):
actions[RIGHT][label] = 1
actions[REDUCE][label] = 1
for example in kwargs.get('examples', []):
heads, labels = example.get_aligned_parse(projectivize=True)
for child, (head, label) in enumerate(zip(heads, labels)):
if head is None or label is None:
continue
if label.upper() == 'ROOT' :
label = 'ROOT'
if head == child:
actions[BREAK][label] += 1
if head < child:
actions[RIGHT][label] += 1
actions[REDUCE][''] += 1
elif head > child:
actions[LEFT][label] += 1
actions[SHIFT][''] += 1
if min_freq is not None:
for action, label_freqs in actions.items():
for label, freq in list(label_freqs.items()):
if freq < min_freq:
label_freqs.pop(label)
# Ensure these actions are present
actions[BREAK].setdefault('ROOT', 0)
if kwargs.get("learn_tokens") is True:
actions[RIGHT].setdefault('subtok', 0)
actions[LEFT].setdefault('subtok', 0)
# Used for backoff
actions[RIGHT].setdefault('dep', 0)
actions[LEFT].setdefault('dep', 0)
return actions
@property
def builtin_labels(self):
return ["ROOT", "dep"]
@property
def action_types(self):
return (SHIFT, REDUCE, LEFT, RIGHT, BREAK)
def get_doc_labels(self, doc):
"""Get the labels required for a given Doc."""
labels = set(self.builtin_labels)
for token in doc:
if token.dep_:
labels.add(token.dep_)
return labels
def transition(self, StateClass state, action):
cdef Transition t = self.lookup_transition(action)
t.do(state.c, t.label)
return state
def is_gold_parse(self, StateClass state, ArcEagerGold gold):
for i in range(state.c.length):
token = state.c.safe_get(i)
if not arc_is_gold(&gold.c, i, i+token.head):
return False
elif not label_is_gold(&gold.c, i, token.dep):
return False
return True
def init_gold(self, StateClass state, Example example):
gold = ArcEagerGold(self, state, example)
self._replace_unseen_labels(gold)
return gold
def init_gold_batch(self, examples):
# TODO: Projectivity?
all_states = self.init_batch([eg.predicted for eg in examples])
golds = []
states = []
for state, eg in zip(all_states, examples):
if self.has_gold(eg) and not state.is_final():
golds.append(self.init_gold(state, eg))
states.append(state)
n_steps = sum([len(s.queue) for s in states])
return states, golds, n_steps
def _replace_unseen_labels(self, ArcEagerGold gold):
backoff_label = self.strings["dep"]
root_label = self.strings["ROOT"]
left_labels = self.labels[LEFT]
right_labels = self.labels[RIGHT]
break_labels = self.labels[BREAK]
for i in range(gold.c.length):
if not is_head_unknown(&gold.c, i):
head = gold.c.heads[i]
label = self.strings[gold.c.labels[i]]
if head > i and label not in left_labels:
gold.c.labels[i] = backoff_label
elif head < i and label not in right_labels:
gold.c.labels[i] = backoff_label
elif head == i and label not in break_labels:
gold.c.labels[i] = root_label
return gold
cdef Transition lookup_transition(self, object name_or_id) except *:
if isinstance(name_or_id, int):
return self.c[name_or_id]
name = name_or_id
if '-' in name:
move_str, label_str = name.split('-', 1)
label = self.strings[label_str]
else:
move_str = name
label = 0
move = MOVE_NAMES.index(move_str)
for i in range(self.n_moves):
if self.c[i].move == move and self.c[i].label == label:
return self.c[i]
raise KeyError(f"Unknown transition: {name}")
def move_name(self, int move, attr_t label):
label_str = self.strings[label]
if label_str:
return MOVE_NAMES[move] + '-' + label_str
else:
return MOVE_NAMES[move]
def class_name(self, int i):
return self.move_name(self.c[i].move, self.c[i].label)
cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
# TODO: Apparent Cython bug here when we try to use the Transition()
# constructor with the function pointers
cdef Transition t
t.score = 0
t.clas = clas
t.move = move
t.label = label
if move == SHIFT:
t.is_valid = Shift.is_valid
t.do = Shift.transition
t.get_cost = Shift.cost
elif move == REDUCE:
t.is_valid = Reduce.is_valid
t.do = Reduce.transition
t.get_cost = Reduce.cost
elif move == LEFT:
t.is_valid = LeftArc.is_valid
t.do = LeftArc.transition
t.get_cost = LeftArc.cost
elif move == RIGHT:
t.is_valid = RightArc.is_valid
t.do = RightArc.transition
t.get_cost = RightArc.cost
elif move == BREAK:
t.is_valid = Break.is_valid
t.do = Break.transition
t.get_cost = Break.cost
else:
raise ValueError(Errors.E019.format(action=move, src='arc_eager'))
return t
def set_annotations(self, StateClass state, Doc doc):
for arc in state.arcs:
doc.c[arc["child"]].head = arc["head"] - arc["child"]
doc.c[arc["child"]].dep = arc["label"]
for i in range(doc.length):
if doc.c[i].head == 0:
doc.c[i].dep = self.root_label
set_children_from_heads(doc.c, 0, doc.length)
def get_beam_parses(self, Beam beam):
parses = []
probs = beam.probs
for i in range(beam.size):
state = <StateC*>beam.at(i)
if state.is_final():
prob = probs[i]
parse = []
arcs = self.get_arcs(state)
if arcs:
for arc in arcs:
dep = arc["label"]
label = self.strings[dep]
parse.append((arc["head"], arc["child"], label))
parses.append((prob, parse))
return parses
cdef get_arcs(self, StateC* state):
cdef vector[ArcC] arcs
state.get_arcs(&arcs)
return list(arcs)
def has_gold(self, Example eg, start=0, end=None):
for word in eg.y[start:end]:
if word.dep != 0:
return True
else:
return False
cdef int set_valid(self, int* output, const StateC* st) nogil:
cdef int[N_MOVES] is_valid
is_valid[SHIFT] = Shift.is_valid(st, 0)
is_valid[REDUCE] = Reduce.is_valid(st, 0)
is_valid[LEFT] = LeftArc.is_valid(st, 0)
is_valid[RIGHT] = RightArc.is_valid(st, 0)
is_valid[BREAK] = Break.is_valid(st, 0)
cdef int i
for i in range(self.n_moves):
if self.c[i].label == SUBTOK_LABEL:
output[i] = self.c[i].is_valid(st, self.c[i].label)
else:
output[i] = is_valid[self.c[i].move]
def get_cost(self, StateClass stcls, gold, int i):
if not isinstance(gold, ArcEagerGold):
raise TypeError(Errors.E909.format(name="ArcEagerGold"))
cdef ArcEagerGold gold_ = gold
gold_state = gold_.c
n_gold = 0
if self.c[i].is_valid(stcls.c, self.c[i].label):
cost = self.c[i].get_cost(stcls.c, &gold_state, self.c[i].label)
else:
cost = 9000
return cost
cdef int set_costs(self, int* is_valid, weight_t* costs,
const StateC* state, gold) except -1:
if not isinstance(gold, ArcEagerGold):
raise TypeError(Errors.E909.format(name="ArcEagerGold"))
cdef ArcEagerGold gold_ = gold
gold_state = gold_.c
update_gold_state(&gold_state, state)
self.set_valid(is_valid, state)
cdef int n_gold = 0
for i in range(self.n_moves):
if is_valid[i]:
costs[i] = self.c[i].get_cost(state, &gold_state, self.c[i].label)
if costs[i] <= 0:
n_gold += 1
else:
costs[i] = 9000
if n_gold < 1:
for i in range(self.n_moves):
print(self.get_class_name(i), is_valid[i], costs[i])
print("Gold sent starts?", is_sent_start(&gold_state, state.B(0)), is_sent_start(&gold_state, state.B(1)))
raise ValueError("Could not find gold transition - see logs above.")
def get_oracle_sequence_from_state(self, StateClass state, ArcEagerGold gold, _debug=None):
cdef int i
cdef Pool mem = Pool()
# n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
assert self.n_moves > 0
costs = <float*>mem.alloc(self.n_moves, sizeof(float))
is_valid = <int*>mem.alloc(self.n_moves, sizeof(int))
history = []
debug_log = []
failed = False
while not state.is_final():
try:
self.set_costs(is_valid, costs, state.c, gold)
except ValueError:
failed = True
break
min_cost = min(costs[i] for i in range(self.n_moves))
for i in range(self.n_moves):
if is_valid[i] and costs[i] <= min_cost:
action = self.c[i]
history.append(i)
s0 = state.S(0)
b0 = state.B(0)
if _debug:
example = _debug
debug_log.append(" ".join((
self.get_class_name(i),
state.print_state()
)))
action.do(state.c, action.label)
break
else:
failed = False
break
if failed and _debug not in (False, None):
example = _debug
print("Actions")
for i in range(self.n_moves):
print(self.get_class_name(i))
print("Gold")
for token in example.y:
print(token.i, token.text, token.dep_, token.head.text)
aligned_heads, aligned_labels = example.get_aligned_parse()
print("Aligned heads")
for i, head in enumerate(aligned_heads):
print(example.x[i], example.x[head] if head is not None else "__")
print("Aligned sent starts")
print(example.get_aligned_sent_starts())
print("Predicted tokens")
print([(w.i, w.text) for w in example.x])
s0 = state.S(0)
b0 = state.B(0)
debug_log.append(" ".join((
"?",
"S0=", (example.x[s0].text if s0 >= 0 else "-"),
"B0=", (example.x[b0].text if b0 >= 0 else "-"),
"S0 head?", str(state.has_head(state.S(0))),
)))
s0 = state.S(0)
b0 = state.B(0)
print("\n".join(debug_log))
print("Arc is gold B0, S0?", arc_is_gold(&gold.c, b0, s0))
print("Arc is gold S0, B0?", arc_is_gold(&gold.c, s0, b0))
print("is_head_unknown(s0)", is_head_unknown(&gold.c, s0))
print("is_head_unknown(b0)", is_head_unknown(&gold.c, b0))
print("b0", b0, "gold.heads[s0]", gold.c.heads[s0])
print("Stack", [example.x[i] for i in state.stack])
print("Buffer", [example.x[i] for i in state.queue])
raise ValueError(Errors.E024)
return history
This source diff could not be displayed because it is too large. You can view the blob instead.
from .transition_system cimport TransitionSystem
cdef class BiluoPushDown(TransitionSystem):
pass
import os
import random
from libc.stdint cimport int32_t
from cymem.cymem cimport Pool
from collections import Counter
from thinc.extra.search cimport Beam
from ...tokens.doc cimport Doc
from ...tokens.span import Span
from ...tokens.span cimport Span
from ...typedefs cimport weight_t, attr_t
from ...lexeme cimport Lexeme
from ...attrs cimport IS_SPACE
from ...structs cimport TokenC, SpanC
from ...training.example cimport Example
from .stateclass cimport StateClass
from ._state cimport StateC
from .transition_system cimport Transition, do_func_t
from ...errors import Errors
cdef enum:
MISSING
BEGIN
IN
LAST
UNIT
OUT
N_MOVES
MOVE_NAMES = [None] * N_MOVES
MOVE_NAMES[MISSING] = 'M'
MOVE_NAMES[BEGIN] = 'B'
MOVE_NAMES[IN] = 'I'
MOVE_NAMES[LAST] = 'L'
MOVE_NAMES[UNIT] = 'U'
MOVE_NAMES[OUT] = 'O'
cdef struct GoldNERStateC:
Transition* ner
SpanC* negs
int32_t length
int32_t nr_neg
cdef class BiluoGold:
cdef Pool mem
cdef GoldNERStateC c
def __init__(self, BiluoPushDown moves, StateClass stcls, Example example, neg_key):
self.mem = Pool()
self.c = create_gold_state(self.mem, moves, stcls.c, example, neg_key)
def update(self, StateClass stcls):
update_gold_state(&self.c, stcls.c)
cdef GoldNERStateC create_gold_state(
Pool mem,
BiluoPushDown moves,
const StateC* stcls,
Example example,
neg_key
) except *:
cdef GoldNERStateC gs
cdef Span neg
if neg_key is not None:
negs = example.get_aligned_spans_y2x(
example.y.spans.get(neg_key, []),
allow_overlap=True
)
else:
negs = []
assert example.x.length > 0
gs.ner = <Transition*>mem.alloc(example.x.length, sizeof(Transition))
gs.negs = <SpanC*>mem.alloc(len(negs), sizeof(SpanC))
gs.nr_neg = len(negs)
ner_ents, ner_tags = example.get_aligned_ents_and_ner()
for i, ner_tag in enumerate(ner_tags):
gs.ner[i] = moves.lookup_transition(ner_tag)
# Prevent conflicting spans in the data. For NER, spans are equal if they have the same offsets and label.
neg_span_triples = {(neg_ent.start_char, neg_ent.end_char, neg_ent.label) for neg_ent in negs}
for pos_span in ner_ents:
if (pos_span.start_char, pos_span.end_char, pos_span.label) in neg_span_triples:
raise ValueError(Errors.E868.format(span=(pos_span.start_char, pos_span.end_char, pos_span.label_)))
# In order to handle negative samples, we need to maintain the full
# (start, end, label) triple. If we break it down to the 'isnt B-LOC'
# thing, we'll get blocked if there's an incorrect prefix.
for i, neg in enumerate(negs):
gs.negs[i] = neg.c
return gs
cdef void update_gold_state(GoldNERStateC* gs, const StateC* state) except *:
# We don't need to update each time, unlike the parser.
pass
cdef do_func_t[N_MOVES] do_funcs
cdef bint _entity_is_sunk(const StateC* state, Transition* golds) nogil:
if not state.entity_is_open():
return False
cdef const Transition* gold = &golds[state.E(0)]
ent = state.get_ent()
if gold.move != BEGIN and gold.move != UNIT:
return True
elif gold.label != ent.label:
return True
else:
return False
cdef class BiluoPushDown(TransitionSystem):
def __init__(self, *args, **kwargs):
TransitionSystem.__init__(self, *args, **kwargs)
@classmethod
def get_actions(cls, **kwargs):
actions = {
MISSING: Counter(),
BEGIN: Counter(),
IN: Counter(),
LAST: Counter(),
UNIT: Counter(),
OUT: Counter()
}
actions[OUT][''] = 1 # Represents a token predicted to be outside of any entity
actions[UNIT][''] = 1 # Represents a token prohibited to be in an entity
for entity_type in kwargs.get('entity_types', []):
for action in (BEGIN, IN, LAST, UNIT):
actions[action][entity_type] = 1
moves = ('M', 'B', 'I', 'L', 'U')
for example in kwargs.get('examples', []):
for token in example.y:
ent_type = token.ent_type_
if ent_type:
for action in (BEGIN, IN, LAST, UNIT):
actions[action][ent_type] += 1
return actions
@property
def action_types(self):
return (BEGIN, IN, LAST, UNIT, OUT)
def get_doc_labels(self, doc):
labels = set()
for token in doc:
if token.ent_type:
labels.add(token.ent_type_)
return labels
def move_name(self, int move, attr_t label):
if move == OUT:
return 'O'
elif move == MISSING:
return 'M'
else:
return MOVE_NAMES[move] + '-' + self.strings[label]
def init_gold_batch(self, examples):
all_states = self.init_batch([eg.predicted for eg in examples])
golds = []
states = []
for state, eg in zip(all_states, examples):
if self.has_gold(eg) and not state.is_final():
golds.append(self.init_gold(state, eg))
states.append(state)
n_steps = sum([len(s.queue) for s in states])
return states, golds, n_steps
cdef Transition lookup_transition(self, object name) except *:
cdef attr_t label
if name == '-' or name == '' or name is None:
return Transition(clas=0, move=MISSING, label=0, score=0)
elif '-' in name:
move_str, label_str = name.split('-', 1)
# Deprecated, hacky way to denote 'not this entity'
if label_str.startswith('!'):
raise ValueError(Errors.E869.format(label=name))
label = self.strings.add(label_str)
else:
move_str = name
label = 0
move = MOVE_NAMES.index(move_str)
for i in range(self.n_moves):
if self.c[i].move == move and self.c[i].label == label:
return self.c[i]
raise KeyError(Errors.E022.format(name=name))
cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
# TODO: Apparent Cython bug here when we try to use the Transition()
# constructor with the function pointers
cdef Transition t
t.score = 0
t.clas = clas
t.move = move
t.label = label
if move == MISSING:
t.is_valid = Missing.is_valid
t.do = Missing.transition
t.get_cost = Missing.cost
elif move == BEGIN:
t.is_valid = Begin.is_valid
t.do = Begin.transition
t.get_cost = Begin.cost
elif move == IN:
t.is_valid = In.is_valid
t.do = In.transition
t.get_cost = In.cost
elif move == LAST:
t.is_valid = Last.is_valid
t.do = Last.transition
t.get_cost = Last.cost
elif move == UNIT:
t.is_valid = Unit.is_valid
t.do = Unit.transition
t.get_cost = Unit.cost
elif move == OUT:
t.is_valid = Out.is_valid
t.do = Out.transition
t.get_cost = Out.cost
else:
raise ValueError(Errors.E019.format(action=move, src='ner'))
return t
def add_action(self, int action, label_name, freq=None):
cdef attr_t label_id
if not isinstance(label_name, (int, long)):
label_id = self.strings.add(label_name)
else:
label_id = label_name
if action == OUT and label_id != 0:
return None
if action == MISSING:
return None
# Check we're not creating a move we already have, so that this is
# idempotent
for trans in self.c[:self.n_moves]:
if trans.move == action and trans.label == label_id:
return 0
if self.n_moves >= self._size:
self._size = self.n_moves
self._size *= 2
self.c = <Transition*>self.mem.realloc(self.c, self._size * sizeof(self.c[0]))
self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id)
self.n_moves += 1
if self.labels.get(action, []):
freq = min(0, min(self.labels[action].values()))
self.labels[action][label_name] = freq-1
else:
self.labels[action] = Counter()
self.labels[action][label_name] = -1
return 1
def set_annotations(self, StateClass state, Doc doc):
cdef int i
ents = []
for i in range(state.c._ents.size()):
ent = state.c._ents.at(i)
if ent.start != -1 and ent.end != -1:
ents.append(Span(doc, ent.start, ent.end, label=ent.label, kb_id=doc.c[ent.start].ent_kb_id))
doc.set_ents(ents, default="unmodified")
# Set non-blocked tokens to O
for i in range(doc.length):
if doc.c[i].ent_iob == 0:
doc.c[i].ent_iob = 2
def get_beam_parses(self, Beam beam):
parses = []
probs = beam.probs
for i in range(beam.size):
state = <StateC*>beam.at(i)
if state.is_final():
prob = probs[i]
parse = []
for j in range(state._ents.size()):
ent = state._ents.at(j)
if ent.start != -1 and ent.end != -1:
parse.append((ent.start, ent.end, self.strings[ent.label]))
parses.append((prob, parse))
return parses
def init_gold(self, StateClass state, Example example):
return BiluoGold(self, state, example, self.neg_key)
def has_gold(self, Example eg, start=0, end=None):
# We get x and y referring to X, we want to check relative to Y,
# the reference
y_spans = eg.get_aligned_spans_x2y([eg.x[start:end]])
if not y_spans:
y_spans = [eg.y[:]]
y_span = y_spans[0]
start = y_span.start
end = y_span.end
neg_key = self.neg_key
if neg_key is not None:
# If we have any negative samples, count that as having annotation.
for span in eg.y.spans.get(neg_key, []):
if span.start >= start and span.end <= end:
return True
for word in eg.y[start:end]:
if word.ent_iob != 0:
return True
else:
return False
def get_cost(self, StateClass stcls, gold, int i):
if not isinstance(gold, BiluoGold):
raise TypeError(Errors.E909.format(name="BiluoGold"))
cdef BiluoGold gold_ = gold
gold_state = gold_.c
n_gold = 0
if self.c[i].is_valid(stcls.c, self.c[i].label):
cost = self.c[i].get_cost(stcls.c, &gold_state, self.c[i].label)
else:
cost = 9000
return cost
cdef int set_costs(self, int* is_valid, weight_t* costs,
const StateC* state, gold) except -1:
if not isinstance(gold, BiluoGold):
raise TypeError(Errors.E909.format(name="BiluoGold"))
cdef BiluoGold gold_ = gold
gold_state = gold_.c
update_gold_state(&gold_state, state)
n_gold = 0
self.set_valid(is_valid, state)
for i in range(self.n_moves):
if is_valid[i]:
costs[i] = self.c[i].get_cost(state, &gold_state, self.c[i].label)
n_gold += costs[i] <= 0
else:
costs[i] = 9000
cdef class Missing:
@staticmethod
cdef bint is_valid(const StateC* st, attr_t label) nogil:
return False
@staticmethod
cdef int transition(StateC* s, attr_t label) nogil:
pass
@staticmethod
cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
return 9000
cdef class Begin:
@staticmethod
cdef bint is_valid(const StateC* st, attr_t label) nogil:
cdef int preset_ent_iob = st.B_(0).ent_iob
cdef attr_t preset_ent_label = st.B_(0).ent_type
if st.entity_is_open():
return False
if st.buffer_length() < 2:
# If we're the last token of the input, we can't B -- must U or O.
return False
elif label == 0:
return False
elif preset_ent_iob == 1:
# Ensure we don't clobber preset entities. If no entity preset,
# ent_iob is 0
return False
elif preset_ent_iob == 3:
# Okay, we're in a preset entity.
if label != preset_ent_label:
# If label isn't right, reject
return False
elif st.B_(1).ent_iob != 1:
# If next token isn't marked I, we need to make U, not B.
return False
else:
# Otherwise, force acceptance, even if we're across a sentence
# boundary or the token is whitespace.
return True
elif st.B_(1).ent_iob == 3:
# If the next word is B, we can't B now
return False
elif st.B_(1).sent_start == 1:
# Don't allow entities to extend across sentence boundaries
return False
# Don't allow entities to start on whitespace
elif Lexeme.get_struct_attr(st.B_(0).lex, IS_SPACE):
return False
else:
return True
@staticmethod
cdef int transition(StateC* st, attr_t label) nogil:
st.open_ent(label)
st.push()
st.pop()
@staticmethod
cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
gold = <GoldNERStateC*>_gold
b0 = s.B(0)
cdef int cost = 0
cdef int g_act = gold.ner[b0].move
cdef attr_t g_tag = gold.ner[b0].label
if g_act == MISSING:
pass
elif g_act == BEGIN:
# B, Gold B --> Label match
cost += label != g_tag
else:
# B, Gold I --> False (P)
# B, Gold L --> False (P)
# B, Gold O --> False (P)
# B, Gold U --> False (P)
cost += 1
if s.buffer_length() < 3:
# Handle negatives. In general we can't really do much to block
# B, because we don't know whether the whole entity is going to
# be correct or not. However, we can at least tell whether we're
# going to be opening an entity where there's only one possible
# L.
for span in gold.negs[:gold.nr_neg]:
if span.label == label and span.start == b0:
cost += 1
break
return cost
cdef class In:
@staticmethod
cdef bint is_valid(const StateC* st, attr_t label) nogil:
if not st.entity_is_open():
return False
if st.buffer_length() < 2:
# If we're at the end, we can't I.
return False
ent = st.get_ent()
cdef int preset_ent_iob = st.B_(0).ent_iob
cdef attr_t preset_ent_label = st.B_(0).ent_type
if label == 0:
return False
elif ent.label != label:
return False
elif preset_ent_iob == 3:
return False
elif st.B_(1).ent_iob == 3:
# If we know the next word is B, we can't be I (must be L)
return False
elif preset_ent_iob == 1:
if st.B_(1).ent_iob in (0, 2):
# if next preset is missing or O, this can't be I (must be L)
return False
elif label != preset_ent_label:
# If label isn't right, reject
return False
else:
# Otherwise, force acceptance, even if we're across a sentence
# boundary or the token is whitespace.
return True
elif st.B(1) != -1 and st.B_(1).sent_start == 1:
# Don't allow entities to extend across sentence boundaries
return False
else:
return True
@staticmethod
cdef int transition(StateC* st, attr_t label) nogil:
st.push()
st.pop()
@staticmethod
cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
gold = <GoldNERStateC*>_gold
move = IN
cdef int next_act = gold.ner[s.B(1)].move if s.B(1) >= 0 else OUT
cdef int g_act = gold.ner[s.B(0)].move
cdef attr_t g_tag = gold.ner[s.B(0)].label
cdef bint is_sunk = _entity_is_sunk(s, gold.ner)
if g_act == MISSING:
return 0
elif g_act == BEGIN:
# I, Gold B --> True
# (P of bad open entity sunk, R of this entity sunk)
return 0
elif g_act == IN:
# I, Gold I --> True
# (label forced by prev, if mismatch, P and R both sunk)
return 0
elif g_act == LAST:
# I, Gold L --> True iff this entity sunk and next tag == O
return not (is_sunk and (next_act == OUT or next_act == MISSING))
elif g_act == OUT:
# I, Gold O --> True iff next tag == O
return not (next_act == OUT or next_act == MISSING)
elif g_act == UNIT:
# I, Gold U --> True iff next tag == O
return next_act != OUT
else:
return 1
cdef class Last:
@staticmethod
cdef bint is_valid(const StateC* st, attr_t label) nogil:
cdef int preset_ent_iob = st.B_(0).ent_iob
cdef attr_t preset_ent_label = st.B_(0).ent_type
if label == 0:
return False
elif not st.entity_is_open():
return False
elif preset_ent_iob == 1 and st.B_(1).ent_iob != 1:
# If a preset entity has I followed by not-I, is L
if label != preset_ent_label:
# If label isn't right, reject
return False
else:
# Otherwise, force acceptance, even if we're across a sentence
# boundary or the token is whitespace.
return True
elif st.get_ent().label != label:
return False
elif st.B_(1).ent_iob == 1:
# If a preset entity has I next, we can't L here.
return False
else:
return True
@staticmethod
cdef int transition(StateC* st, attr_t label) nogil:
st.close_ent()
st.push()
st.pop()
@staticmethod
cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
gold = <GoldNERStateC*>_gold
move = LAST
b0 = s.B(0)
ent_start = s.E(0)
cdef int g_act = gold.ner[b0].move
cdef attr_t g_tag = gold.ner[b0].label
cdef int cost = 0
if g_act == MISSING:
pass
elif g_act == BEGIN:
# L, Gold B --> True
pass
elif g_act == IN:
# L, Gold I --> True iff this entity sunk
cost += not _entity_is_sunk(s, gold.ner)
elif g_act == LAST:
# L, Gold L --> True
pass
elif g_act == OUT:
# L, Gold O --> True
pass
elif g_act == UNIT:
# L, Gold U --> True
pass
else:
cost += 1
# If we have negative-example entities, integrate them into the objective,
# by marking actions that close an entity that we know is incorrect
# as costly.
for span in gold.negs[:gold.nr_neg]:
if span.label == label and (span.end-1) == b0 and span.start == ent_start:
cost += 1
break
return cost
cdef class Unit:
@staticmethod
cdef bint is_valid(const StateC* st, attr_t label) nogil:
cdef int preset_ent_iob = st.B_(0).ent_iob
cdef attr_t preset_ent_label = st.B_(0).ent_type
if label == 0:
# this is only allowed if it's a preset blocked annotation
if preset_ent_label == 0 and preset_ent_iob == 3:
return True
else:
return False
elif st.entity_is_open():
return False
elif st.B_(1).ent_iob == 1:
# If next token is In, we can't be Unit -- must be Begin
return False
elif preset_ent_iob == 3:
# Okay, there's a preset entity here
if label != preset_ent_label:
# Require labels to match
return False
else:
# Otherwise return True, ignoring the whitespace constraint.
return True
elif Lexeme.get_struct_attr(st.B_(0).lex, IS_SPACE):
return False
else:
return True
@staticmethod
cdef int transition(StateC* st, attr_t label) nogil:
st.open_ent(label)
st.close_ent()
st.push()
st.pop()
@staticmethod
cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
gold = <GoldNERStateC*>_gold
cdef int g_act = gold.ner[s.B(0)].move
cdef attr_t g_tag = gold.ner[s.B(0)].label
cdef int cost = 0
if g_act == MISSING:
pass
elif g_act == UNIT:
# U, Gold U --> True iff tag match
cost += label != g_tag
else:
# U, Gold B --> False
# U, Gold I --> False
# U, Gold L --> False
# U, Gold O --> False
cost += 1
# If we have negative-example entities, integrate them into the objective.
# This is fairly straight-forward for U- entities, as we have a single
# action
cdef int b0 = s.B(0)
for span in gold.negs[:gold.nr_neg]:
if span.label == label and span.start == b0 and span.end == (b0+1):
cost += 1
break
return cost
cdef class Out:
@staticmethod
cdef bint is_valid(const StateC* st, attr_t label) nogil:
cdef int preset_ent_iob = st.B_(0).ent_iob
if st.entity_is_open():
return False
elif preset_ent_iob == 3:
return False
elif preset_ent_iob == 1:
return False
else:
return True
@staticmethod
cdef int transition(StateC* st, attr_t label) nogil:
st.push()
st.pop()
@staticmethod
cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
gold = <GoldNERStateC*>_gold
cdef int g_act = gold.ner[s.B(0)].move
cdef attr_t g_tag = gold.ner[s.B(0)].label
cdef weight_t cost = 0
if g_act == MISSING:
pass
elif g_act == BEGIN:
# O, Gold B --> False
cost += 1
elif g_act == IN:
# O, Gold I --> True
pass
elif g_act == LAST:
# O, Gold L --> True
pass
elif g_act == OUT:
# O, Gold O --> True
pass
elif g_act == UNIT:
# O, Gold U --> False
cost += 1
else:
cost += 1
return cost
This source diff could not be displayed because it is too large. You can view the blob instead.
# cython: profile=True, infer_types=True
"""Implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005
for doing pseudo-projective parsing implementation uses the HEAD decoration
scheme.
"""
from copy import copy
from ...tokens.doc cimport Doc, set_children_from_heads
from ...errors import Errors
DELIMITER = '||'
def ancestors(tokenid, heads):
# Returns all words going from the word up the path to the root. The path
# to root cannot be longer than the number of words in the sentence. This
# function ends after at most len(heads) steps, because it would otherwise
# loop indefinitely on cycles.
head = tokenid
cnt = 0
while heads[head] != head and cnt < len(heads):
head = heads[head]
cnt += 1
yield head
if head is None:
break
def contains_cycle(heads):
# in an acyclic tree, the path from each word following the head relation
# upwards always ends at the root node
for tokenid in range(len(heads)):
seen = set([tokenid])
for ancestor in ancestors(tokenid, heads):
if ancestor in seen:
return seen
seen.add(ancestor)
return None
def is_nonproj_arc(tokenid, heads):
# definition (e.g. Havelka 2007): an arc h -> d, h < d is non-projective
# if there is a token k, h < k < d such that h is not
# an ancestor of k. Same for h -> d, h > d
head = heads[tokenid]
if head == tokenid: # root arcs cannot be non-projective
return False
elif head is None: # unattached tokens cannot be non-projective
return False
cdef int start, end
if head < tokenid:
start, end = (head+1, tokenid)
else:
start, end = (tokenid+1, head)
for k in range(start, end):
for ancestor in ancestors(k, heads):
if ancestor is None: # for unattached tokens/subtrees
break
elif ancestor == head: # normal case: k dominated by h
break
else: # head not in ancestors: d -> h is non-projective
return True
return False
def is_nonproj_tree(heads):
# a tree is non-projective if at least one arc is non-projective
return any(is_nonproj_arc(word, heads) for word in range(len(heads)))
def decompose(label):
return label.partition(DELIMITER)[::2]
def is_decorated(label):
return DELIMITER in label
def count_decorated_labels(gold_data):
freqs = {}
for example in gold_data:
proj_heads, deco_deps = projectivize(example.get_aligned("HEAD"),
example.get_aligned("DEP"))
# set the label to ROOT for each root dependent
deco_deps = ['ROOT' if head == i else deco_deps[i]
for i, head in enumerate(proj_heads)]
# count label frequencies
for label in deco_deps:
if is_decorated(label):
freqs[label] = freqs.get(label, 0) + 1
return freqs
def projectivize(heads, labels):
# Use the algorithm by Nivre & Nilsson 2005. Assumes heads to be a proper
# tree, i.e. connected and cycle-free. Returns a new pair (heads, labels)
# which encode a projective and decorated tree.
proj_heads = copy(heads)
smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)
if smallest_np_arc is None: # this sentence is already projective
return proj_heads, copy(labels)
while smallest_np_arc is not None:
_lift(smallest_np_arc, proj_heads)
smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)
deco_labels = _decorate(heads, proj_heads, labels)
return proj_heads, deco_labels
cpdef deprojectivize(Doc doc):
# Reattach arcs with decorated labels (following HEAD scheme). For each
# decorated arc X||Y, search top-down, left-to-right, breadth-first until
# hitting a Y then make this the new head.
for i in range(doc.length):
label = doc.vocab.strings[doc.c[i].dep]
if DELIMITER in label:
new_label, head_label = label.split(DELIMITER)
new_head = _find_new_head(doc[i], head_label)
doc.c[i].head = new_head.i - i
doc.c[i].dep = doc.vocab.strings.add(new_label)
set_children_from_heads(doc.c, 0, doc.length)
return doc
def _decorate(heads, proj_heads, labels):
# uses decoration scheme HEAD from Nivre & Nilsson 2005
if (len(heads) != len(proj_heads)) or (len(proj_heads) != len(labels)):
raise ValueError(Errors.E082.format(n_heads=len(heads),
n_proj_heads=len(proj_heads),
n_labels=len(labels)))
deco_labels = []
for tokenid, head in enumerate(heads):
if head != proj_heads[tokenid]:
deco_labels.append(f"{labels[tokenid]}{DELIMITER}{labels[head]}")
else:
deco_labels.append(labels[tokenid])
return deco_labels
def _get_smallest_nonproj_arc(heads):
# return the smallest non-proj arc or None
# where size is defined as the distance between dep and head
# and ties are broken left to right
smallest_size = float('inf')
smallest_np_arc = None
for tokenid, head in enumerate(heads):
size = abs(tokenid-head)
if size < smallest_size and is_nonproj_arc(tokenid, heads):
smallest_size = size
smallest_np_arc = tokenid
return smallest_np_arc
def _lift(tokenid, heads):
# reattaches a word to it's grandfather
head = heads[tokenid]
ghead = heads[head]
# attach to ghead if head isn't attached to root else attach to root
heads[tokenid] = ghead if head != ghead else tokenid
def _find_new_head(token, headlabel):
# search through the tree starting from the head of the given token
# returns the id of the first descendant with the given label
# if there is none, return the current head (no change)
queue = [token.head]
while queue:
next_queue = []
for qtoken in queue:
for child in qtoken.children:
if child.is_space:
continue
if child == token:
continue
if child.dep_ == headlabel:
return child
next_queue.append(child)
queue = next_queue
return token.head
This source diff could not be displayed because it is too large. You can view the blob instead.
from cymem.cymem cimport Pool
from ...structs cimport TokenC, SpanC
from ...typedefs cimport attr_t
from ...tokens.doc cimport Doc
from ._state cimport StateC
cdef class StateClass:
cdef StateC* c
cdef readonly Doc doc
cdef int _borrowed
@staticmethod
cdef inline StateClass borrow(StateC* ptr, Doc doc):
cdef StateClass self = StateClass()
self.c = ptr
self._borrowed = 1
self.doc = doc
return self
@staticmethod
cdef inline StateClass init_offset(const TokenC* sent, int length, int
offset):
cdef StateClass self = StateClass()
self.c = new StateC(sent, length)
self.c.offset = offset
return self
# cython: infer_types=True
import numpy
from libcpp.vector cimport vector
from ._state cimport ArcC
from ...tokens.doc cimport Doc
cdef class StateClass:
def __init__(self, Doc doc=None, int offset=0):
self._borrowed = 0
if doc is not None:
self.c = new StateC(doc.c, doc.length)
self.c.offset = offset
self.doc = doc
else:
self.doc = None
def __dealloc__(self):
if self._borrowed != 1:
del self.c
@property
def stack(self):
return [self.S(i) for i in range(self.c.stack_depth())]
@property
def queue(self):
return [self.B(i) for i in range(self.c.buffer_length())]
@property
def token_vector_lenth(self):
return self.doc.tensor.shape[1]
@property
def arcs(self):
cdef vector[ArcC] arcs
self.c.get_arcs(&arcs)
return list(arcs)
#py_arcs = []
#for arc in arcs:
# if arc.head != -1 and arc.child != -1:
# py_arcs.append((arc.head, arc.child, arc.label))
#return arcs
def add_arc(self, int head, int child, int label):
self.c.add_arc(head, child, label)
def del_arc(self, int head, int child):
self.c.del_arc(head, child)
def H(self, int child):
return self.c.H(child)
def L(self, int head, int idx):
return self.c.L(head, idx)
def R(self, int head, int idx):
return self.c.R(head, idx)
@property
def _b_i(self):
return self.c._b_i
@property
def length(self):
return self.c.length
def is_final(self):
return self.c.is_final()
def copy(self):
cdef StateClass new_state = StateClass(doc=self.doc, offset=self.c.offset)
new_state.c.clone(self.c)
return new_state
def print_state(self):
words = [token.text for token in self.doc]
words = list(words) + ['_']
bools = ["F", "T"]
sent_starts = [bools[self.c.is_sent_start(i)] for i in range(len(self.doc))]
shifted = [1 if self.c.is_unshiftable(i) else 0 for i in range(self.c.length)]
shifted.append("")
sent_starts.append("")
top = f"{self.S(0)}{words[self.S(0)]}_{words[self.H(self.S(0))]}_{shifted[self.S(0)]}"
second = f"{self.S(1)}{words[self.S(1)]}_{words[self.H(self.S(1))]}_{shifted[self.S(1)]}"
third = f"{self.S(2)}{words[self.S(2)]}_{words[self.H(self.S(2))]}_{shifted[self.S(2)]}"
n0 = f"{self.B(0)}{words[self.B(0)]}_{sent_starts[self.B(0)]}_{shifted[self.B(0)]}"
n1 = f"{self.B(1)}{words[self.B(1)]}_{sent_starts[self.B(1)]}_{shifted[self.B(1)]}"
return ' '.join((str(self.stack_depth()), str(self.buffer_length()), third, second, top, '|', n0, n1))
def S(self, int i):
return self.c.S(i)
def B(self, int i):
return self.c.B(i)
def H(self, int i):
return self.c.H(i)
def E(self, int i):
return self.c.E(i)
def L(self, int i, int idx):
return self.c.L(i, idx)
def R(self, int i, int idx):
return self.c.R(i, idx)
def S_(self, int i):
return self.doc[self.c.S(i)]
def B_(self, int i):
return self.doc[self.c.B(i)]
def H_(self, int i):
return self.doc[self.c.H(i)]
def E_(self, int i):
return self.doc[self.c.E(i)]
def L_(self, int i, int idx):
return self.doc[self.c.L(i, idx)]
def R_(self, int i, int idx):
return self.doc[self.c.R(i, idx)]
def empty(self):
return self.c.empty()
def eol(self):
return self.c.eol()
def at_break(self):
return False
#return self.c.at_break()
def has_head(self, int i):
return self.c.has_head(i)
def n_L(self, int i):
return self.c.n_L(i)
def n_R(self, int i):
return self.c.n_R(i)
def entity_is_open(self):
return self.c.entity_is_open()
def stack_depth(self):
return self.c.stack_depth()
def buffer_length(self):
return self.c.buffer_length()
def push(self):
self.c.push()
def pop(self):
self.c.pop()
def unshift(self):
self.c.unshift()
def add_arc(self, int head, int child, attr_t label):
self.c.add_arc(head, child, label)
def del_arc(self, int head, int child):
self.c.del_arc(head, child)
def open_ent(self, attr_t label):
self.c.open_ent(label)
def close_ent(self):
self.c.close_ent()
def clone(self, StateClass src):
self.c.clone(src.c)
This source diff could not be displayed because it is too large. You can view the blob instead.
from cymem.cymem cimport Pool
from ...typedefs cimport attr_t, weight_t
from ...structs cimport TokenC
from ...strings cimport StringStore
from ...training.example cimport Example
from .stateclass cimport StateClass
from ._state cimport StateC
cdef struct Transition:
int clas
int move
attr_t label
weight_t score
bint (*is_valid)(const StateC* state, attr_t label) nogil
weight_t (*get_cost)(const StateC* state, const void* gold, attr_t label) nogil
int (*do)(StateC* state, attr_t label) nogil
ctypedef weight_t (*get_cost_func_t)(const StateC* state, const void* gold,
attr_tlabel) nogil
ctypedef weight_t (*move_cost_func_t)(const StateC* state, const void* gold) nogil
ctypedef weight_t (*label_cost_func_t)(const StateC* state, const void*
gold, attr_t label) nogil
ctypedef int (*do_func_t)(StateC* state, attr_t label) nogil
ctypedef void* (*init_state_t)(Pool mem, int length, void* tokens) except NULL
ctypedef int (*del_state_t)(Pool mem, void* state, void* extra_args) except -1
cdef class TransitionSystem:
cdef Pool mem
cdef StringStore strings
cdef Transition* c
cdef readonly int n_moves
cdef int _size
cdef public attr_t root_label
cdef public freqs
cdef public object labels
cdef public object cfg
cdef init_state_t init_beam_state
cdef del_state_t del_beam_state
cdef Transition lookup_transition(self, object name) except *
cdef Transition init_transition(self, int clas, int move, attr_t label) except *
cdef int set_valid(self, int* output, const StateC* st) nogil
cdef int set_costs(self, int* is_valid, weight_t* costs,
const StateC* state, gold) except -1
# cython: infer_types=True
from __future__ import print_function
from cymem.cymem cimport Pool
from collections import Counter
import srsly
from . cimport _beam_utils
from ...typedefs cimport weight_t, attr_t
from ...tokens.doc cimport Doc
from ...structs cimport TokenC
from .stateclass cimport StateClass
from ...errors import Errors
from ... import util
cdef weight_t MIN_SCORE = -90000
class OracleError(Exception):
pass
cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:
cdef StateC* st = new StateC(<const TokenC*>tokens, length)
return <void*>st
cdef int _del_state(Pool mem, void* state, void* x) except -1:
cdef StateC* st = <StateC*>state
del st
cdef class TransitionSystem:
def __init__(
self,
StringStore string_table,
labels_by_action=None,
min_freq=None,
incorrect_spans_key=None
):
self.cfg = {"neg_key": incorrect_spans_key}
self.mem = Pool()
self.strings = string_table
self.n_moves = 0
self._size = 100
self.c = <Transition*>self.mem.alloc(self._size, sizeof(Transition))
self.labels = {}
if labels_by_action:
self.initialize_actions(labels_by_action, min_freq=min_freq)
self.root_label = self.strings.add('ROOT')
self.init_beam_state = _init_state
self.del_beam_state = _del_state
def __reduce__(self):
# TODO: This loses the 'cfg'
return (self.__class__, (self.strings, self.labels), None, None)
@property
def neg_key(self):
return self.cfg.get("neg_key")
def init_batch(self, docs):
cdef StateClass state
states = []
offset = 0
for doc in docs:
state = StateClass(doc, offset=offset)
states.append(state)
offset += len(doc)
return states
def get_oracle_sequence(self, Example example, _debug=False):
states, golds, _ = self.init_gold_batch([example])
if not states:
return []
state = states[0]
gold = golds[0]
if _debug:
return self.get_oracle_sequence_from_state(state, gold, _debug=example)
else:
return self.get_oracle_sequence_from_state(state, gold)
def get_oracle_sequence_from_state(self, StateClass state, gold, _debug=None):
cdef Pool mem = Pool()
# n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
assert self.n_moves > 0
costs = <float*>mem.alloc(self.n_moves, sizeof(float))
is_valid = <int*>mem.alloc(self.n_moves, sizeof(int))
history = []
debug_log = []
while not state.is_final():
self.set_costs(is_valid, costs, state.c, gold)
for i in range(self.n_moves):
if is_valid[i] and costs[i] <= 0:
action = self.c[i]
history.append(i)
if _debug:
s0 = state.S(0)
b0 = state.B(0)
example = _debug
debug_log.append(" ".join((
self.get_class_name(i),
"S0=", (example.x[s0].text if s0 >= 0 else "__"),
"B0=", (example.x[b0].text if b0 >= 0 else "__"),
"S0 head?", str(state.has_head(state.S(0))),
)))
action.do(state.c, action.label)
break
else:
if _debug:
example = _debug
print("Actions")
for i in range(self.n_moves):
print(self.get_class_name(i))
print("Gold")
for token in example.y:
print(token.text, token.dep_, token.head.text)
s0 = state.S(0)
b0 = state.B(0)
debug_log.append(" ".join((
"?",
"S0=", (example.x[s0].text if s0 >= 0 else "-"),
"B0=", (example.x[b0].text if b0 >= 0 else "-"),
"S0 head?", str(state.has_head(state.S(0))),
)))
print("\n".join(debug_log))
raise ValueError(Errors.E024)
return history
def apply_transition(self, StateClass state, name):
if not self.is_valid(state, name):
raise ValueError(Errors.E170.format(name=name))
action = self.lookup_transition(name)
action.do(state.c, action.label)
cdef Transition lookup_transition(self, object name) except *:
raise NotImplementedError
cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
raise NotImplementedError
def is_valid(self, StateClass stcls, move_name):
action = self.lookup_transition(move_name)
return action.is_valid(stcls.c, action.label)
cdef int set_valid(self, int* is_valid, const StateC* st) nogil:
cdef int i
for i in range(self.n_moves):
is_valid[i] = self.c[i].is_valid(st, self.c[i].label)
cdef int set_costs(self, int* is_valid, weight_t* costs,
const StateC* state, gold) except -1:
raise NotImplementedError
def get_class_name(self, int clas):
act = self.c[clas]
return self.move_name(act.move, act.label)
def initialize_actions(self, labels_by_action, min_freq=None):
self.labels = {}
self.n_moves = 0
added_labels = []
added_actions = {}
for action, label_freqs in sorted(labels_by_action.items()):
action = int(action)
# Make sure we take a copy here, and that we get a Counter
self.labels[action] = Counter()
# Have to be careful here: Sorting must be stable, or our model
# won't be read back in correctly.
sorted_labels = [(f, L) for L, f in label_freqs.items()]
sorted_labels.sort()
sorted_labels.reverse()
for freq, label_str in sorted_labels:
if freq < 0:
added_labels.append((freq, label_str))
added_actions.setdefault(label_str, []).append(action)
else:
self.add_action(int(action), label_str)
self.labels[action][label_str] = freq
added_labels.sort(reverse=True)
for freq, label_str in added_labels:
for action in added_actions[label_str]:
self.add_action(int(action), label_str)
self.labels[action][label_str] = freq
def add_action(self, int action, label_name):
cdef attr_t label_id
if not isinstance(label_name, int) and \
not isinstance(label_name, long):
label_id = self.strings.add(label_name)
else:
label_id = label_name
# Check we're not creating a move we already have, so that this is
# idempotent
for trans in self.c[:self.n_moves]:
if trans.move == action and trans.label == label_id:
return 0
if self.n_moves >= self._size:
self._size *= 2
self.c = <Transition*>self.mem.realloc(self.c, self._size * sizeof(self.c[0]))
self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id)
self.n_moves += 1
# Add the new (action, label) pair, making up a frequency for it if
# necessary. To preserve sort order, the frequency needs to be lower
# than previous frequencies.
if self.labels.get(action, []):
new_freq = min(self.labels[action].values())
else:
self.labels[action] = Counter()
new_freq = -1
if new_freq > 0:
new_freq = 0
self.labels[action][label_name] = new_freq-1
return 1
def to_disk(self, path, **kwargs):
with path.open('wb') as file_:
file_.write(self.to_bytes(**kwargs))
def from_disk(self, path, **kwargs):
with path.open('rb') as file_:
byte_data = file_.read()
self.from_bytes(byte_data, **kwargs)
return self
def to_bytes(self, exclude=tuple()):
transitions = []
serializers = {
'moves': lambda: srsly.json_dumps(self.labels),
'strings': lambda: self.strings.to_bytes(),
'cfg': lambda: self.cfg
}
return util.to_bytes(serializers, exclude)
def from_bytes(self, bytes_data, exclude=tuple()):
# We're adding a new field, 'cfg', here and we don't want to break
# previous models that don't have it.
msg = srsly.msgpack_loads(bytes_data)
labels = {}
if 'moves' not in exclude:
labels.update(srsly.json_loads(msg['moves']))
if 'strings' not in exclude:
self.strings.from_bytes(msg['strings'])
if 'cfg' not in exclude and 'cfg' in msg:
self.cfg.update(msg['cfg'])
self.initialize_actions(labels)
return self
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment