mirror of https://github.com/explosion/spaCy.git
Merge pull request #1913 from ohenrik/nb_syntax_iterator
Norwegian Language (nb) - Added french syntax iterator with explanation
This commit is contained in:
commit
0954e15dda
|
@ -13,6 +13,12 @@ from ...language import Language
|
|||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
|
||||
# Borrowing french syntax parser because both languages use
|
||||
# universal dependencies for tagging/parsing.
|
||||
# Read here for more:
|
||||
# https://github.com/explosion/spaCy/pull/1882#issuecomment-361409573
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
|
||||
|
||||
class NorwegianDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
|
@ -22,6 +28,7 @@ class NorwegianDefaults(Language.Defaults):
|
|||
stop_words = STOP_WORDS
|
||||
tag_map = TAG_MAP
|
||||
lemma_lookup = LOOKUP
|
||||
syntax_iterators = SYNTAX_ITERATORS
|
||||
|
||||
|
||||
class Norwegian(Language):
|
||||
|
|
|
@ -0,0 +1,42 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...symbols import NOUN, PROPN, PRON
|
||||
|
||||
|
||||
def noun_chunks(obj):
|
||||
"""
|
||||
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
||||
"""
|
||||
labels = ['nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT', 'appos', 'nmod', 'nmod:poss']
|
||||
doc = obj.doc # Ensure works on both Doc and Span.
|
||||
np_deps = [doc.vocab.strings[label] for label in labels]
|
||||
conj = doc.vocab.strings.add('conj')
|
||||
np_label = doc.vocab.strings.add('NP')
|
||||
seen = set()
|
||||
for i, word in enumerate(obj):
|
||||
if word.pos not in (NOUN, PROPN, PRON):
|
||||
continue
|
||||
# Prevent nested chunks from being produced
|
||||
if word.i in seen:
|
||||
continue
|
||||
if word.dep in np_deps:
|
||||
if any(w.i in seen for w in word.subtree):
|
||||
continue
|
||||
seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
|
||||
yield word.left_edge.i, word.right_edge.i+1, np_label
|
||||
elif word.dep == conj:
|
||||
head = word.head
|
||||
while head.dep == conj and head.head.i < head.i:
|
||||
head = head.head
|
||||
# If the head is an NP, and we're coordinated to it, we're an NP
|
||||
if head.dep in np_deps:
|
||||
if any(w.i in seen for w in word.subtree):
|
||||
continue
|
||||
seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
|
||||
yield word.left_edge.i, word.right_edge.i+1, np_label
|
||||
|
||||
|
||||
SYNTAX_ITERATORS = {
|
||||
'noun_chunks': noun_chunks
|
||||
}
|
Loading…
Reference in New Issue