spaCy/spacy/tests/lang/fr/test_noun_chunks.py

231 lines
8.2 KiB
Python
Raw Normal View History

from spacy.tokens import Doc
import pytest
# fmt: off
@pytest.mark.parametrize(
"words,heads,deps,pos,chunk_offsets",
[
# determiner + noun
# un nom -> un nom
(
["un", "nom"],
[1, 1],
["det", "ROOT"],
["DET", "NOUN"],
[(0, 2)],
),
# determiner + noun starting with vowel
# l'heure -> l'heure
(
["l'", "heure"],
[1, 1],
["det", "ROOT"],
["DET", "NOUN"],
[(0, 2)],
),
# determiner + plural noun
# les romans -> les romans
(
["les", "romans"],
[1, 1],
["det", "ROOT"],
["DET", "NOUN"],
[(0, 2)],
),
# det + adj + noun
# Le vieux Londres -> Le vieux Londres
(
['Les', 'vieux', 'Londres'],
[2, 2, 2],
["det", "amod", "ROOT"],
["DET", "ADJ", "NOUN"],
[(0,3)]
),
# det + noun + adj
# le nom propre -> le nom propre a proper noun
(
["le", "nom", "propre"],
[1, 1, 1],
["det", "ROOT", "amod"],
["DET", "NOUN", "ADJ"],
[(0, 3)],
),
# det + noun + adj plural
# Les chiens bruns -> les chiens bruns
(
["Les", "chiens", "bruns"],
[1, 1, 1],
["det", "ROOT", "amod"],
["DET", "NOUN", "ADJ"],
[(0, 3)],
),
# multiple adjectives: one adj before the noun, one adj after the noun
# un nouveau film intéressant -> un nouveau film intéressant
(
["un", "nouveau", "film", "intéressant"],
[2, 2, 2, 2],
["det", "amod", "ROOT", "amod"],
["DET", "ADJ", "NOUN", "ADJ"],
[(0,4)]
),
# multiple adjectives, both adjs after the noun
# une personne intelligente et drôle -> une personne intelligente et drôle
(
["une", "personne", "intelligente", "et", "drôle"],
[1, 1, 1, 4, 2],
["det", "ROOT", "amod", "cc", "conj"],
["DET", "NOUN", "ADJ", "CCONJ", "ADJ"],
[(0,5)]
),
# relative pronoun
# un bus qui va au ville -> un bus, qui, ville
(
['un', 'bus', 'qui', 'va', 'au', 'ville'],
[1, 1, 3, 1, 5, 3],
['det', 'ROOT', 'nsubj', 'acl:relcl', 'case', 'obl:arg'],
['DET', 'NOUN', 'PRON', 'VERB', 'ADP', 'NOUN'],
[(0,2), (2,3), (5,6)]
),
# relative subclause
# Voilà la maison que nous voulons acheter -> la maison, nous That's the house that we want to buy.
(
['Voilà', 'la', 'maison', 'que', 'nous', 'voulons', 'acheter'],
[0, 2, 0, 5, 5, 2, 5],
['ROOT', 'det', 'obj', 'mark', 'nsubj', 'acl:relcl', 'xcomp'],
['VERB', 'DET', 'NOUN', 'SCONJ', 'PRON', 'VERB', 'VERB'],
[(1,3), (4,5)]
),
# Person name and title by flat
# Louis XIV -> Louis XIV
(
["Louis", "XIV"],
[0, 0],
["ROOT", "flat:name"],
["PROPN", "PROPN"],
[(0,2)]
),
# Organization name by flat
# Nations Unies -> Nations Unies
(
["Nations", "Unies"],
[0, 0],
["ROOT", "flat:name"],
["PROPN", "PROPN"],
[(0,2)]
),
# Noun compound, person name created by two flats
# Louise de Bratagne -> Louise de Bratagne
(
["Louise", "de", "Bratagne"],
[0, 0, 0],
["ROOT", "flat:name", "flat:name"],
["PROPN", "PROPN", "PROPN"],
[(0,3)]
),
# Noun compound, person name created by two flats
# Louis François Joseph -> Louis François Joseph
(
["Louis", "François", "Joseph"],
[0, 0, 0],
["ROOT", "flat:name", "flat:name"],
["PROPN", "PROPN", "PROPN"],
[(0,3)]
),
# one determiner + one noun + one adjective qualified by an adverb
# quelques agriculteurs très riches -> quelques agriculteurs très riches
(
["quelques", "agriculteurs", "très", "riches"],
[1, 1, 3, 1],
['det', 'ROOT', 'advmod', 'amod'],
['DET', 'NOUN', 'ADV', 'ADJ'],
[(0,4)]
),
# Two NPs conjuncted
# Il a un chien et un chat -> Il, un chien, un chat
(
['Il', 'a', 'un', 'chien', 'et', 'un', 'chat'],
[1, 1, 3, 1, 6, 6, 3],
['nsubj', 'ROOT', 'det', 'obj', 'cc', 'det', 'conj'],
['PRON', 'VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'],
[(0,1), (2,4), (5,7)]
),
# Two NPs together
# l'écrivain brésilien Aníbal Machado -> l'écrivain brésilien, Aníbal Machado
(
["l'", 'écrivain', 'brésilien', 'Aníbal', 'Machado'],
[1, 1, 1, 1, 3],
['det', 'ROOT', 'amod', 'appos', 'flat:name'],
['DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN'],
[(0, 3), (3, 5)]
),
# nmod relation between NPs
# la destruction de la ville -> la destruction, la ville
(
['la', 'destruction', 'de', 'la', 'ville'],
[1, 1, 4, 4, 1],
['det', 'ROOT', 'case', 'det', 'nmod'],
['DET', 'NOUN', 'ADP', 'DET', 'NOUN'],
[(0,2), (3,5)]
),
# nmod relation between NPs
# Archiduchesse dAutriche -> Archiduchesse, Autriche
(
['Archiduchesse', 'd', 'Autriche'],
[0, 2, 0],
['ROOT', 'case', 'nmod'],
['NOUN', 'ADP', 'PROPN'],
[(0,1), (2,3)]
),
# Compounding by nmod, several NPs chained together
# la première usine de drogue du gouvernement -> la première usine, drogue, gouvernement
(
["la", "première", "usine", "de", "drogue", "du", "gouvernement"],
[2, 2, 2, 4, 2, 6, 2],
['det', 'amod', 'ROOT', 'case', 'nmod', 'case', 'nmod'],
['DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'],
[(0, 3), (4, 5), (6, 7)]
),
# several NPs
# Traduction du rapport de Susana -> Traduction, rapport, Susana
(
['Traduction', 'du', 'raport', 'de', 'Susana'],
[0, 2, 0, 4, 2],
['ROOT', 'case', 'nmod', 'case', 'nmod'],
['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'],
[(0,1), (2,3), (4,5)]
),
# Several NPs
# Le gros chat de Susana et son amie -> Le gros chat, Susana, son amie
(
['Le', 'gros', 'chat', 'de', 'Susana', 'et', 'son', 'amie'],
[2, 2, 2, 4, 2, 7, 7, 2],
['det', 'amod', 'ROOT', 'case', 'nmod', 'cc', 'det', 'conj'],
['DET', 'ADJ', 'NOUN', 'ADP', 'PROPN', 'CCONJ', 'DET', 'NOUN'],
[(0,3), (4,5), (6,8)]
),
# Passive subject
# Les nouvelles dépenses sont alimentées par le grand compte bancaire de Clinton -> Les nouvelles dépenses, le grand compte bancaire, Clinton
(
['Les', 'nouvelles', 'dépenses', 'sont', 'alimentées', 'par', 'le', 'grand', 'compte', 'bancaire', 'de', 'Clinton'],
[2, 2, 4, 4, 4, 8, 8, 8, 4, 8, 11, 8],
['det', 'amod', 'nsubj:pass', 'aux:pass', 'ROOT', 'case', 'det', 'amod', 'obl:agent', 'amod', 'case', 'nmod'],
['DET', 'ADJ', 'NOUN', 'AUX', 'VERB', 'ADP', 'DET', 'ADJ', 'NOUN', 'ADJ', 'ADP', 'PROPN'],
[(0, 3), (6, 10), (11, 12)]
)
],
)
# fmt: on
def test_fr_noun_chunks(fr_vocab, words, heads, deps, pos, chunk_offsets):
doc = Doc(fr_vocab, words=words, heads=heads, deps=deps, pos=pos)
assert [(c.start, c.end) for c in doc.noun_chunks] == chunk_offsets
def test_noun_chunks_is_parsed_fr(fr_tokenizer):
2020-09-29 19:39:28 +00:00
"""Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed."""
doc = fr_tokenizer("Je suis allé à l'école")
with pytest.raises(ValueError):
list(doc.noun_chunks)