diff --git a/.gitignore b/.gitignore
index b8a4a2fec..da7dde60c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -93,6 +93,9 @@ coverage.xml
 # Mac OS X
 *.DS_Store
 
+# Temporary files / Dropbox hack
+*.~*
+
 # Komodo project files
 *.komodoproject
 
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 23c9387e1..28fa4b2fe 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -14,9 +14,11 @@ This is a list of everyone who has made significant contributions to spaCy, in a
 * Kendrick Tan, [@kendricktan](https://github.com/kendricktan)
 * Kyle P. Johnson, [@kylepjohnson](https://github.com/kylepjohnson)
 * Liling Tan, [@alvations](https://github.com/alvations)
+* Mark Amery, [@ExplodingCabbage](https://github.com/ExplodingCabbage)
 * Matthew Honnibal, [@honnibal](https://github.com/honnibal)
 * Maxim Samsonov, [@maxirmx](https://github.com/maxirmx)
 * Oleg Zd, [@olegzd](https://github.com/olegzd)
+* Pokey Rule, [@pokey](https://github.com/pokey)
 * Sam Bozek, [@sambozek](https://github.com/sambozek)
 * Sasho Savkov [@savkov](https://github.com/savkov)
 * Tiago Rodrigues, [@TiagoMRodrigues](https://github.com/TiagoMRodrigues)
diff --git a/spacy/syntax/iterators.pyx b/spacy/syntax/iterators.pyx
index aeb4e635c..ee5e818c1 100644
--- a/spacy/syntax/iterators.pyx
+++ b/spacy/syntax/iterators.pyx
@@ -1,13 +1,16 @@
 from spacy.parts_of_speech cimport NOUN, PROPN, PRON
 
 
-def english_noun_chunks(doc):
+def english_noun_chunks(obj):
+    '''Detect base noun phrases from a dependency parse.
+    Works on both Doc and Span.'''
     labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj',
               'attr', 'ROOT', 'root']
+    doc = obj.doc # Ensure works on both Doc and Span.
     np_deps = [doc.vocab.strings[label] for label in labels]
     conj = doc.vocab.strings['conj']
     np_label = doc.vocab.strings['NP']
-    for i, word in enumerate(doc):
+    for i, word in enumerate(obj):
         if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps:
             yield word.left_edge.i, word.i+1, np_label
         elif word.pos == NOUN and word.dep == conj:
@@ -25,14 +28,15 @@ def english_noun_chunks(doc):
 # extended to the right of the NOUN
 # example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee" and not
 # just "eine Tasse", same for "das Thema Familie"
-def german_noun_chunks(doc):
+def german_noun_chunks(obj):
     labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'ROOT', 'root', 'cj', 'pd', 'og', 'app']
+    doc = obj.doc # Ensure works on both Doc and Span.
     np_label = doc.vocab.strings['NP']
     np_deps = set(doc.vocab.strings[label] for label in labels)
     close_app = doc.vocab.strings['nk']
 
     rbracket = 0
-    for i, word in enumerate(doc):
+    for i, word in enumerate(obj):
         if i < rbracket:
             continue
         if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps:
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 3d09b7ad0..8ce2c7fe4 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -223,6 +223,10 @@ cdef class Doc:
     def __repr__(self):
         return self.__str__()
 
+    @property
+    def doc(self):
+        return self
+
     def similarity(self, other):
         '''Make a semantic similarity estimate. The default estimate is cosine
         similarity using an average of word vectors.
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index e645c1a6f..a4f49555a 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -190,6 +190,33 @@ cdef class Span:
         def __get__(self):
             return u''.join([t.text_with_ws for t in self])
 
+    property noun_chunks:
+        '''
+        Yields base noun-phrase #[code Span] objects for this span,
+        if the document has been syntactically parsed. A base noun phrase, or
+        'NP chunk', is a noun phrase that does not permit other NPs to
+        be nested within it – so no NP-level coordination, no prepositional
+        phrases, and no relative clauses.
+        '''
+        def __get__(self):
+            if not self.doc.is_parsed:
+                raise ValueError(
+                    "noun_chunks requires the dependency parse, which "
+                    "requires data to be installed. If you haven't done so, run: "
+                    "\npython -m spacy.%s.download all\n"
+                    "to install the data" % self.doc.vocab.lang)
+            # Accumulate the result before beginning to iterate over it. This prevents
+            # the tokenisation from being changed out from under us during the iteration.
+            # The tricky thing here is that Span accepts its tokenisation changing,
+            # so it's okay once we have the Span objects. See Issue #375
+            spans = []
+            for start, end, label in self.doc.noun_chunks_iterator(self):
+                # Offsets come from Token.i, i.e. they index the parent Doc,
+                # so anchor each chunk on self.doc rather than on this Span.
+                spans.append(Span(self.doc, start, end, label=label))
+            for span in spans:
+                yield span
+
     property root:
         """The token within the span that's highest in the parse tree.
         If there's a tie, the earlist is prefered.
diff --git a/website/_harp.json b/website/_harp.json index c9cd5f02b..caa67a9f9 100644 --- a/website/_harp.json +++ b/website/_harp.json @@ -21,7 +21,8 @@ "SOCIAL": { "twitter": "spacy_io", "github": "explosion", - "reddit": "spacynlp" + "reddit": "spacynlp", + "codepen": "explosion" }, "NAVIGATION": { diff --git a/website/_includes/_mixins.jade b/website/_includes/_mixins.jade index 030e9a776..4874783f0 100644 --- a/website/_includes/_mixins.jade +++ b/website/_includes/_mixins.jade @@ -90,6 +90,19 @@ mixin code(label, language) block +//- CodePen embed + slug - [string] ID of CodePen demo (taken from URL) + height - [integer] height of demo embed iframe + default_tab - [string] code tab(s) visible on load (default: "result") + +mixin codepen(slug, height, default_tab) + figure.o-block(style="min-height: #{height}px")&attributes(attributes) + .codepen(data-height=height data-theme-id="26467" data-slug-hash=slug data-default-tab=(default_tab || "result") data-embed-version="2" data-user=SOCIAL.codepen) + +a("https://codepen.io/" + SOCIAL.codepen + "/" + slug) View on CodePen + + script(async src="https://assets.codepen.io/assets/embed/ei.js") + + //- Images / figures url - [string] url or path to image width - [integer] image width in px, for better rendering (default: 500) diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json index 200e22e9a..dce419d75 100644 --- a/website/docs/usage/_data.json +++ b/website/docs/usage/_data.json @@ -9,6 +9,7 @@ "Processing text": "processing-text", "spaCy's data model": "data-model", "Using the parse": "dependency-parse", + "Entity recognition": "entity-recognition", "Custom pipelines": "customizing-pipeline", "Rule-based matching": "rule-based-matching", "Word vectors": "word-vectors-similarities", @@ -51,7 +52,13 @@ }, "dependency-parse": { - "title": "Using the dependency parse" + "title": "Using the dependency parse", + "next": "entity-recognition" + }, + + "entity-recognition": { + "title": "Entity 
recognition", + "next": "rule-based-matching" }, "rule-based-matching": { @@ -232,6 +239,12 @@ }, "deep_dives": { + "Modern NLP in Python – What you can learn about food by analyzing a million Yelp reviews": { + "url": "http://nbviewer.jupyter.org/github/skipgram/modern-nlp-in-python/blob/master/executable/Modern_NLP_in_Python.ipynb", + "author": "Patrick Harrison (S&P Global)", + "tags": [ "jupyter", "gensim" ] + }, + "Deep Learning with custom pipelines and Keras": { "url": "https://explosion.ai/blog/spacy-deep-learning-keras", "author": "Matthew Honnibal", diff --git a/website/docs/usage/entity-recognition.jade b/website/docs/usage/entity-recognition.jade new file mode 100644 index 000000000..f8045778e --- /dev/null +++ b/website/docs/usage/entity-recognition.jade @@ -0,0 +1,290 @@ +//- 💫 DOCS > USAGE > NAMED ENTITY RECOGNITION + +include ../../_includes/_mixins + +p + | spaCy features an extremely fast statistical entity recognition system, + | that assigns labels to contiguous spans of tokens. The default model + | identifies a variety of named and numeric entities, including companies, + | locations, organizations and products. You can add arbitrary classes to + | the entity recognition system, and update the model with new examples. + ++aside-code("Example"). + import spacy + nlp = spacy.load('en') + doc = nlp(u'London is a big city in the United Kingdom.') + for ent in doc.ents: + print(ent.label_, ent.text) + # GPE London + # GPE United Kingdom + +p + | The standard way to access entity annotations is the + | #[+api("doc#ents") #[code doc.ents]] property, which produces a sequence + | of #[+api("span") #[code Span]] objects. The entity type is accessible + | either as an integer ID or as a string, using the attributes + | #[code ent.label] and #[code ent.label_]. The #[code Span] object acts + | as a sequence of tokens, so you can iterate over the entity or index into + | it. 
You can also get the text form of the whole entity, as though it were + | a single token. See the #[+api("span") API reference] for more details. + +p + | You can access token entity annotations using the #[code token.ent_iob] + | and #[code token.ent_type] attributes. The #[code token.ent_iob] + | attribute indicates whether the token is inside an entity, outside an + | entity, or begins one (1 = In, 2 = Out, 3 = Begin). + ++code("Example"). + doc = nlp(u'London is a big city in the United Kingdom.') + print(doc[0].text, doc[0].ent_iob, doc[0].ent_type_) + # (u'London', 3, u'GPE') + print(doc[1].text, doc[1].ent_iob, doc[1].ent_type_) + # (u'is', 2, u'') + ++h(2, "setting") Setting entity annotations + +p + | To ensure that the sequence of token annotations remains consistent, you + | have to set entity annotations at the document level — you can't write + | directly to the #[code token.ent_iob] or #[code token.ent_type] + | attributes. The easiest way to set entities is to assign to the + | #[code doc.ents] attribute. + ++code("Example"). + doc = nlp(u'London is a big city in the United Kingdom.') + doc.ents = [] + assert doc[0].ent_type_ == '' + doc.ents = [Span(doc, 0, 1, label=doc.vocab.strings[u'GPE'])] + assert doc[0].ent_type_ == 'GPE' + doc.ents = [] + doc.ents = [(u'LondonCity', doc.vocab.strings[u'GPE'], 0, 1)] + +p + | The value you assign should be a sequence, the values of which + | can either be #[code Span] objects, or #[code (ent_id, ent_type, start, end)] + | tuples, where #[code start] and #[code end] are token offsets that + | describe the slice of the document that should be annotated. + +p + | You can also assign entity annotations using the #[code doc.from_array()] + | method. To do this, you should include both the #[code ENT_TYPE] and the + | #[code ENT_IOB] attributes in the array you're importing from. + ++code("Example"). 
+ from spacy.attrs import ENT_IOB, ENT_TYPE + import numpy + + doc = nlp.make_doc(u'London is a big city in the United Kingdom.') + assert list(doc.ents) == [] + header = [ENT_IOB, ENT_TYPE] + attr_array = numpy.zeros((len(doc), len(header))) + attr_array[0, 0] = 3 # B + attr_array[0, 1] = doc.vocab.strings[u'GPE'] + doc.from_array(header, attr_array) + assert list(doc.ents)[0].text == u'London' + +p + | Finally, you can always write to the underlying struct, if you compile + | a Cython function. This is easy to do, and allows you to write efficient + | native code. + ++code("Example"). + # cython: infer_types=True + from spacy.tokens.doc cimport Doc + + cpdef set_entity(Doc doc, int start, int end, int ent_type): + for i in range(start, end): + doc.c[i].ent_type = ent_type + doc.c[start].ent_iob = 3 # B + for i in range(start+1, end): + doc.c[i].ent_iob = 1 # I + +p + | Obviously, if you write directly to the array of #[code TokenC*] structs, + | you'll have responsibility for ensuring that the data is left in a + | consistent state. + + ++h(2, "displacy") The displaCy #[sup ENT] visualizer + +p + | The #[+a(DEMOS_URL + "/displacy-ent/") displaCy #[sup ENT] visualizer] + | lets you explore an entity recognition model's behaviour interactively. + | If you're training a model, it's very useful to run the visualization + | server yourself. To help you do that, we've open-sourced both the + | #[+a(gh("spacy-services")) back-end service] and the + | #[+a(gh("displacy-ent")) front-end client]. + ++codepen("ALxpQO", 450) + ++h(2, "entity-types") Built-in entity types + ++h(3, "entity-types-named") Named types + ++table([ "Type", "Description" ]) + +row + +cell #[code PERSON] + +cell People, including fictional + + +row + +cell #[code NORP] + +cell Nationalities or religious or political groups + + +row + +cell #[code FACILITY] + +cell Buildings, airports, highways, bridges, etc. + + +row + +cell #[code ORG] + +cell Companies, agencies, institutions, etc. 
+ + +row + +cell #[code GPE] + +cell Countries, cities, states + + +row + +cell #[code LOC] + +cell Non-GPE locations, mountain ranges, bodies of water + + +row + +cell #[code PRODUCT] + +cell Objects, vehicles, foods, etc. (not services) + + +row + +cell #[code EVENT] + +cell Named hurricanes, battles, wars, sports events, etc. + + +row + +cell #[code WORK_OF_ART] + +cell Titles of books, songs, etc. + + +row + +cell #[code LANGUAGE] + +cell Any named language + ++h(3, "entity-types-numeric") Numeric types + ++table([ "Type", "Description" ]) + +row + +cell #[code DATE] + +cell Absolute or relative dates or periods + + +row + +cell #[code TIME] + +cell Times smaller than a day + + +row + +cell #[code PERCENT] + +cell Percentage, including "%" + + +row + +cell #[code MONEY] + +cell Monetary values, including unit + + +row + +cell #[code QUANTITY] + +cell Measurements, as of weight or distance + + +row + +cell #[code ORDINAL] + +cell "first", "second", etc. + + +row + +cell #[code CARDINAL] + +cell Numerals that do not fall under another type + ++aside("Install") + | The #[+api("load") spacy.load()] function configures a pipeline that + | includes all of the available annotators for the given ID. In the example + | above, the #[code 'en'] ID tells spaCy to load the default English + | pipeline. If you have installed the data with + | #[code python -m spacy.en.download] this will include the entity + | recognition model. + ++h(2, "updating") Training and updating + +p + | To provide training examples to the entity recogniser, you'll first need + | to create an instance of the #[code GoldParse] class. You can specify + | your annotations in a stand-off format or as token tags. + ++code. 
+ import spacy + from spacy.gold import GoldParse + + train_data = [ + ('Who is Chaka Khan?', [(7, 17, 'PERSON')]), + ('I like London and Berlin.', [(7, 13, 'LOC'), (18, 24, 'LOC')]) + ] + + nlp = spacy.load('en', entity=False, parser=False) + ner = EntityRecognizer(nlp.vocab, entity_types=['PERSON', 'LOC']) + + for itn in range(5): + random.shuffle(train_data) + for raw_text, entity_offsets in train_data: + doc = nlp.make_doc(raw_text) + gold = GoldParse(doc, entities=entity_offsets) + + nlp.tagger(doc) + ner.update(doc, gold) + ner.model.end_training() + +p + | If a character offset in your entity annotations doesn't fall on a token + | boundary, the #[code GoldParse] class will treat that annotation as a + | missing value. This allows for more realistic training, because the + | entity recogniser is allowed to learn from examples that may feature + | tokenizer errors. + ++aside-code("Example"). + doc = Doc(nlp.vocab, [u'rats', u'make', u'good', u'pets']) + gold = GoldParse(doc, entities=[u'U-ANIMAL', u'O', u'O', u'O']) + ner = EntityRecognizer(nlp.vocab, entity_types=['ANIMAL']) + ner.update(doc, gold) + +p + | You can also provide token-level entity annotation, using the + | following tagging scheme to describe the entity boundaries: + ++table([ "Tag", "Description" ]) + +row + +cell #[code #[span.u-color-theme B] EGIN] + +cell The first token of a multi-token entity + + +row + +cell #[code #[span.u-color-theme I] N] + +cell An inner token of a multi-token entity + + +row + +cell #[code #[span.u-color-theme L] AST] + +cell The final token of a multi-token entity + + +row + +cell #[code #[span.u-color-theme U] NIT] + +cell A single-token entity + + +row + +cell #[code #[span.u-color-theme O] UT] + +cell A non-entity token. + ++aside("Why BILUO, not IOB?") + | There are several coding schemes for encoding entity annotations as + | token tags. These coding schemes are equally expressive, but not + | necessarily equally learnable. 
+ | #[+a("http://www.aclweb.org/anthology/W09-1119") Ratinov and Roth] + | showed that the minimal #[strong Begin], #[strong In], #[strong Out] + | scheme was more difficult to learn than the #[strong BILUO] scheme that + | we use, which explicitly marks boundary tokens. + +p + | spaCy translates the character offsets into this scheme, in order to + | decide the cost of each action given the current state of the entity + | recogniser. The costs are then used to calculate the gradient of the + | loss, to train the model. The exact algorithm is a pastiche of + | well-known methods, and is not currently described in any single + | publication. The model is a greedy transition-based parser guided by a + | linear model whose weights are learned using the averaged perceptron + | loss, via the #[+a("http://www.aclweb.org/anthology/C12-1059") dynamic oracle] + | imitation learning strategy. The transition system is equivalent to the + | BILUO tagging scheme.