Document offsets_from_biluo_tags

2017-12-06 13:40:51 +01:00 · 2017-12-06 13:40:51 +01:00 · b078e276e6
parent fb663f9b7d
commit b078e276e6
2 changed files with 45 additions and 0 deletions
--- a/website/api/goldparse.jade
+++ b/website/api/goldparse.jade
@@ -163,3 +163,41 @@ p
        +cell
            |  Unicode strings, describing the
            |  #[+a("/api/annotation#biluo") BILUO] tags.
+
++h(3, "offsets_from_biluo_tags") gold.offsets_from_biluo_tags
+
+p
+    |  Decode per-token tags following the
+    |  #[+a("/api/annotation#biluo") BILUO scheme] into entity offsets.
+
++aside-code("Example").
+    from spacy.gold import offsets_from_biluo_tags
+
+    doc = nlp('I like London.')
+    tags = ['O', 'O', 'U-LOC', 'O']
+    entities = offsets_from_biluo_tags(doc, tags)
+    assert entities == [(7, 13, 'LOC')]
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code doc]
+        +cell #[code Doc]
+        +cell The document that the BILUO tags refer to.
+
+    +row
+        +cell #[code tags]
+        +cell iterable
+        +cell
+            |  A sequence of #[+a("/api/annotation#biluo") BILUO] tags with
+            |  each tag describing one token. Each tag string will be of the
+            |  form #[code ""], #[code "O"] or
+            |  #[code "{action}-{label}"], where action is one of #[code "B"],
+            |  #[code "I"], #[code "L"], #[code "U"].
+
+    +row("foot")
+        +cell returns
+        +cell list
+        +cell
+            |  A sequence of #[code (start, end, label)] triples. #[code start]
+            |  and #[code end] will be character-offset integers denoting the
+            |  slice into the original string.
--- a/website/usage/_training/_ner.jade
+++ b/website/usage/_training/_ner.jade
@@ -21,6 +21,13 @@ p
    |  #[strong experiment on your data] to find a solution that works best
    |  for you.

+aside("Tip: Converting entity annotations", "💡")
+    |  You can train the entity recognizer with entity offsets or
+    |  annotations in the #[+a("/api/annotation#biluo") BILUO scheme]. The
+    |  #[code spacy.gold] module also exposes
+    |  #[+a("/api/goldparse#util") two helper functions] to convert offsets to
+    |  BILUO tags, and BILUO tags to entity offsets.
+
 +h(3, "example-train-ner") Updating the Named Entity Recognizer

 p