From b078e276e61ad238e368af1607d213e45fc1d72f Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 6 Dec 2017 13:40:51 +0100 Subject: [PATCH] Document offsets_from_biluo_tags --- website/api/goldparse.jade | 38 +++++++++++++++++++++++++++++++ website/usage/_training/_ner.jade | 7 ++++++ 2 files changed, 45 insertions(+) diff --git a/website/api/goldparse.jade b/website/api/goldparse.jade index 9fb47ccc4..fbc9f0a37 100644 --- a/website/api/goldparse.jade +++ b/website/api/goldparse.jade @@ -163,3 +163,41 @@ p +cell | Unicode strings, describing the | #[+a("/api/annotation#biluo") BILUO] tags. + ++h(3, "offsets_from_biluo_tags") gold.offsets_from_biluo_tags + +p + | Encode per-token tags following the + | #[+a("/api/annotation#biluo") BILUO scheme] into entity offsets. + ++aside-code("Example"). + from spacy.gold import offsets_from_biluo_tags + + doc = nlp('I like London.') + tags = ['O', 'O', 'U-LOC', 'O'] + entities = offsets_from_biluo_tags(doc, tags) + assert entities == [(7, 13, 'LOC')] + ++table(["Name", "Type", "Description"]) + +row + +cell #[code doc] + +cell #[code Doc] + +cell The document that the BILUO tags refer to. + + +row + +cell #[code tags] + +cell iterable + +cell + | A sequence of #[+a("/api/annotation#biluo") BILUO] tags with + | each tag describing one token. Each tag string will be of the + | form of either #[code ""], #[code "O"] or + | #[code "{action}-{label}"], where action is one of #[code "B"], + | #[code "I"], #[code "L"], #[code "U"]. + + +row("foot") + +cell returns + +cell list + +cell + | A sequence of #[code (start, end, label)] triples. #[code start] + | and #[code end] will be character-offset integers denoting the + | slice into the original string. 
diff --git a/website/usage/_training/_ner.jade b/website/usage/_training/_ner.jade index 383db0f29..ec01b769d 100644 --- a/website/usage/_training/_ner.jade +++ b/website/usage/_training/_ner.jade @@ -21,6 +21,13 @@ p | #[strong experiment on your data] to find a solution that works best | for you. ++aside("Tip: Converting entity annotations", "💡") + | You can train the entity recognizer with entity offsets or + | annotations in the #[+a("/api/annotation#biluo") BILUO scheme]. The + | #[code spacy.gold] module also exposes + | #[+a("/api/goldparse#util") two helper functions] to convert offsets to + | BILUO tags, and BILUO tags to entity offsets. + +h(3, "example-train-ner") Updating the Named Entity Recognizer p