//- 💫 DOCS > USAGE > SPACY 101 > LIGHTNING TOUR p | The following examples and code snippets give you an overview of spaCy's | functionality and its usage. +h(3, "lightning-tour-models") Install models and process text +code(false, "bash"). python -m spacy download en python -m spacy download de +code. import spacy nlp = spacy.load('en') doc = nlp(u'Hello, world. Here are two sentences.') nlp_de = spacy.load('de') doc_de = nlp_de(u'Ich bin ein Berliner.') +infobox | #[+label-inline API:] #[+api("spacy#load") #[code spacy.load()]] | #[+label-inline Usage:] #[+a("/usage/models") Models], | #[+a("/usage/spacy-101") spaCy 101] +h(3, "lightning-tour-tokens-sentences") Get tokens, noun chunks & sentences +tag-model("dependency parse") +code. doc = nlp(u"Peach emoji is where it has always been. Peach is the superior " u"emoji. It's outranking eggplant 🍑 ") assert doc[0].text == u'Peach' assert doc[1].text == u'emoji' assert doc[-1].text == u'🍑' assert doc[17:19].text == u'outranking eggplant' assert list(doc.noun_chunks)[0].text == u'Peach emoji' sentences = list(doc.sents) assert len(sentences) == 3 assert sentences[1].text == u'Peach is the superior emoji.' +infobox | #[+label-inline API:] #[+api("doc") #[code Doc]], #[+api("token") #[code Token]] | #[+label-inline Usage:] #[+a("/usage/spacy-101") spaCy 101] +h(3, "lightning-tour-pos-tags") Get part-of-speech tags and flags +tag-model("tagger") +code. doc = nlp(u'Apple is looking at buying U.K. 
startup for $1 billion') apple = doc[0] assert [apple.pos_, apple.pos] == [u'PROPN', 17049293600679659579] assert [apple.tag_, apple.tag] == [u'NNP', 15794550382381185553] assert [apple.shape_, apple.shape] == [u'Xxxxx', 16072095006890171862] assert apple.is_alpha == True assert apple.is_punct == False billion = doc[10] assert billion.is_digit == False assert billion.like_num == True assert billion.like_email == False +infobox | #[+label-inline API:] #[+api("token") #[code Token]] | #[+label-inline Usage:] #[+a("/usage/linguistic-features#pos-tagging") Part-of-speech tagging] +h(3, "lightning-tour-hashes") Use hash values for any string +code. doc = nlp(u'I love coffee') coffee_hash = nlp.vocab.strings[u'coffee'] # 3197928453018144401 coffee_text = nlp.vocab.strings[coffee_hash] # 'coffee' assert doc[2].orth == coffee_hash == 3197928453018144401 assert doc[2].text == coffee_text == u'coffee' beer_hash = doc.vocab.strings.add(u'beer') # 3073001599257881079 beer_text = doc.vocab.strings[beer_hash] # 'beer' unicorn_hash = doc.vocab.strings.add(u'🦄 ') # 18234233413267120783 unicorn_text = doc.vocab.strings[unicorn_hash] # '🦄 ' +infobox | #[+label-inline API:] #[+api("stringstore") #[code StringStore]] | #[+label-inline Usage:] #[+a("/usage/spacy-101#vocab") Vocab, hashes and lexemes 101] +h(3, "lightning-tour-entities") Recognise and update named entities +tag-model("NER") +code. 
doc = nlp(u'San Francisco considers banning sidewalk delivery robots') ents = [(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in doc.ents] assert ents == [(u'San Francisco', 0, 13, u'GPE')] from spacy.tokens import Span doc = nlp(u'Netflix is hiring a new VP of global policy') doc.ents = [Span(doc, 0, 1, label=doc.vocab.strings[u'ORG'])] ents = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents] assert ents == [(0, 7, u'ORG')] +infobox | #[+label-inline Usage:] #[+a("/usage/linguistic-features#named-entities") Named entity recognition] +h(3, "lightning-tour-training") Train and update neural network models +tag-model +code. import spacy import random nlp = spacy.load('en') train_data = [("Uber blew through $1 million", {'entities': [(0, 4, 'ORG')]})] with nlp.disable_pipes(*[pipe for pipe in nlp.pipe_names if pipe != 'ner']): optimizer = nlp.begin_training() for i in range(10): random.shuffle(train_data) for text, annotations in train_data: nlp.update([text], [annotations], sgd=optimizer) nlp.to_disk('/model') +infobox | #[+label-inline API:] #[+api("language#update") #[code Language.update]] | #[+label-inline Usage:] #[+a("/usage/training") Training spaCy's statistical models] +h(3, "lightning-tour-displacy") Visualize a dependency parse and named entities in your browser +tag-model("dependency parse", "NER") +tag-new(2) +aside .u-text-center(style="overflow: auto"). This DT is VBZ a DT sentence. NN nsubj det attr +code. 
from spacy import displacy doc_dep = nlp(u'This is a sentence.') displacy.serve(doc_dep, style='dep') doc_ent = nlp(u'When Sebastian Thrun started working on self-driving cars at Google ' u'in 2007, few people outside of the company took him seriously.') displacy.serve(doc_ent, style='ent') +infobox | #[+label-inline API:] #[+api("top-level#displacy") #[code displacy]] | #[+label-inline Usage:] #[+a("/usage/visualizers") Visualizers] +h(3, "lightning-tour-word-vectors") Get word vectors and similarity +tag-model("word vectors") +code. doc = nlp(u"Apple and banana are similar. Pasta and hippo aren't.") apple = doc[0] banana = doc[2] pasta = doc[6] hippo = doc[8] assert apple.similarity(banana) > pasta.similarity(hippo) assert apple.has_vector and banana.has_vector and pasta.has_vector and hippo.has_vector p | For the best results, you should run this example using the | #[+a("/models/en#en_vectors_web_lg") #[code en_vectors_web_lg]] model. +infobox | #[+label-inline Usage:] #[+a("/usage/vectors-similarity") Word vectors and similarity] +h(3, "lightning-tour-serialization") Simple and efficient serialization +code. import spacy from spacy.tokens import Doc from spacy.vocab import Vocab nlp = spacy.load('en') customer_feedback = open('customer_feedback_627.txt').read() doc = nlp(customer_feedback) doc.to_disk('/tmp/customer_feedback_627.bin') new_doc = Doc(Vocab()).from_disk('/tmp/customer_feedback_627.bin') +infobox | #[+label-inline API:] #[+api("language") #[code Language]], | #[+api("doc") #[code Doc]] | #[+label-inline Usage:] #[+a("/usage/models#saving-loading") Saving and loading models] +h(3, "lightning-tour-rule-matcher") Match text with token rules +code. 
import spacy from spacy.matcher import Matcher nlp = spacy.load('en') matcher = Matcher(nlp.vocab) def set_sentiment(matcher, doc, i, matches): doc.sentiment += 0.1 pattern1 = [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}] pattern2 = [[{'ORTH': emoji, 'OP': '+'}] for emoji in ['😀', '😂', '🤣', '😍']] matcher.add('GoogleIO', None, pattern1) # match "Google I/O" or "Google i/o" matcher.add('HAPPY', set_sentiment, *pattern2) # match one or more happy emoji text = open('customer_feedback_627.txt').read() doc = nlp(text) matches = matcher(doc) +infobox | #[+label-inline API:] #[+api("matcher") #[code Matcher]] | #[+label-inline Usage:] #[+a("/usage/linguistic-features#rule-based-matching") Rule-based matching] +h(3, "lightning-tour-multi-threaded") Multi-threaded generator +code. texts = [u'One document.', u'...', u'Lots of documents'] # .pipe streams input, and produces streaming output iter_texts = (texts[i % 3] for i in xrange(100000000)) for i, doc in enumerate(nlp.pipe(iter_texts, batch_size=50, n_threads=4)): assert doc.is_parsed if i == 100: break +infobox | #[+label-inline API:] #[+api("doc") #[code Doc]] | #[+label-inline Usage:] #[+a("/usage/processing-pipelines#multithreading") Processing pipelines] +h(3, "lightning-tour-dependencies") Get syntactic dependencies +tag-model("dependency parse") +code. def dependency_labels_to_root(token): """Walk up the syntactic tree, collecting the arc labels.""" dep_labels = [] while token.head is not token: dep_labels.append(token.dep) token = token.head return dep_labels +infobox | #[+label-inline API:] #[+api("token") #[code Token]] | #[+label-inline Usage:] #[+a("/usage/linguistic-features#dependency-parse") Using the dependency parse] +h(3, "lightning-tour-numpy-arrays") Export to numpy arrays +code. 
from spacy.attrs import ORTH, LIKE_URL, IS_OOV attr_ids = [ORTH, LIKE_URL, IS_OOV] doc_array = doc.to_array(attr_ids) assert doc_array.shape == (len(doc), len(attr_ids)) assert doc[0].orth == doc_array[0, 0] assert doc[1].orth == doc_array[1, 0] assert doc[0].like_url == doc_array[0, 1] assert list(doc_array[:, 1]) == [t.like_url for t in doc] +h(3, "lightning-tour-inline") Calculate inline markup on original string +code. def put_spans_around_tokens(doc, get_classes): """Given some function to compute class names, put each token in a span element, with the appropriate classes computed. All whitespace is preserved, outside of the spans. (Of course, HTML won't display more than one whitespace character in a row – but the point is, no information is lost and you can calculate what you need, e.g. <br />, <p> etc.) """ output = [] html = '<span class="{classes}">{word}</span>{space}' for token in doc: if token.is_space: output.append(token.text) else: classes = ' '.join(get_classes(token)) output.append(html.format(classes=classes, word=token.text, space=token.whitespace_)) string = ''.join(output) string = string.replace('\n', '') string = string.replace('\t', ' ') return string