2017-10-03 12:26:20 +00:00
|
|
|
|
//- 💫 DOCS > USAGE > SPACY 101 > LIGHTNING TOUR
|
2016-10-31 18:04:15 +00:00
|
|
|
|
|
|
|
|
|
p
|
2016-12-25 14:23:30 +00:00
|
|
|
|
| The following examples and code snippets give you an overview of spaCy's
|
2017-11-01 18:49:36 +00:00
|
|
|
|
| functionality and its usage.
|
2016-10-31 18:04:15 +00:00
|
|
|
|
|
2017-10-03 12:26:20 +00:00
|
|
|
|
+h(3, "lightning-tour-models") Install models and process text
|
2017-03-17 12:11:00 +00:00
|
|
|
|
|
|
|
|
|
+code(false, "bash").
|
2018-04-29 00:06:46 +00:00
|
|
|
|
python -m spacy download en_core_web_sm
|
|
|
|
|
python -m spacy download de_core_news_sm
|
2017-03-17 12:11:00 +00:00
|
|
|
|
|
2018-04-29 00:06:46 +00:00
|
|
|
|
+code-exec.
|
2017-03-17 12:11:00 +00:00
|
|
|
|
import spacy
|
2018-04-29 00:06:46 +00:00
|
|
|
|
nlp = spacy.load('en_core_web_sm')
|
2017-05-24 23:58:33 +00:00
|
|
|
|
doc = nlp(u'Hello, world. Here are two sentences.')
|
2018-04-29 00:06:46 +00:00
|
|
|
|
print([t.text for t in doc])
|
2017-03-17 12:11:00 +00:00
|
|
|
|
|
2018-04-29 00:06:46 +00:00
|
|
|
|
nlp_de = spacy.load('de_core_news_sm')
|
2017-05-24 23:58:33 +00:00
|
|
|
|
doc_de = nlp_de(u'Ich bin ein Berliner.')
|
2018-04-29 00:06:46 +00:00
|
|
|
|
print([t.text for t in doc_de])
|
2016-10-31 18:04:15 +00:00
|
|
|
|
|
2017-05-24 23:58:33 +00:00
|
|
|
|
+infobox
|
2017-10-16 18:36:41 +00:00
|
|
|
|
| #[+label-inline API:] #[+api("spacy#load") #[code spacy.load()]]
|
|
|
|
|
| #[+label-inline Usage:] #[+a("/usage/models") Models],
|
2017-10-03 12:26:20 +00:00
|
|
|
|
| #[+a("/usage/spacy-101") spaCy 101]
|
2016-10-31 18:04:15 +00:00
|
|
|
|
|
2017-10-03 12:26:20 +00:00
|
|
|
|
+h(3, "lightning-tour-tokens-sentences") Get tokens, noun chunks & sentences
|
2017-05-24 23:58:33 +00:00
|
|
|
|
+tag-model("dependency parse")
|
2017-05-23 21:15:39 +00:00
|
|
|
|
|
2018-04-29 00:06:46 +00:00
|
|
|
|
+code-exec.
|
|
|
|
|
import spacy
|
|
|
|
|
|
|
|
|
|
nlp = spacy.load('en_core_web_sm')
|
2017-05-24 23:58:33 +00:00
|
|
|
|
doc = nlp(u"Peach emoji is where it has always been. Peach is the superior "
|
|
|
|
|
u"emoji. It's outranking eggplant 🍑 ")
|
2018-04-29 00:06:46 +00:00
|
|
|
|
print(doc[0].text) # Peach
|
|
|
|
|
print(doc[1].text) # emoji
|
|
|
|
|
print(doc[-1].text) # 🍑
|
|
|
|
|
print(doc[17:19].text) # outranking eggplant
|
2017-05-23 21:15:39 +00:00
|
|
|
|
|
2018-04-29 00:06:46 +00:00
|
|
|
|
noun_chunks = list(doc.noun_chunks)
|
|
|
|
|
print(noun_chunks[0].text) # Peach emoji
|
2017-05-23 21:15:39 +00:00
|
|
|
|
|
2017-05-24 23:58:33 +00:00
|
|
|
|
sentences = list(doc.sents)
|
|
|
|
|
assert len(sentences) == 3
|
2018-04-29 00:06:46 +00:00
|
|
|
|
print(sentences[1].text) # 'Peach is the superior emoji.'
|
2016-10-31 18:04:15 +00:00
|
|
|
|
|
2017-05-24 23:58:33 +00:00
|
|
|
|
+infobox
|
2017-10-16 18:36:41 +00:00
|
|
|
|
| #[+label-inline API:] #[+api("doc") #[code Doc]], #[+api("token") #[code Token]]
|
|
|
|
|
| #[+label-inline Usage:] #[+a("/usage/spacy-101") spaCy 101]
|
2016-10-31 18:04:15 +00:00
|
|
|
|
|
2017-10-03 12:26:20 +00:00
|
|
|
|
+h(3, "lightning-tour-pos-tags") Get part-of-speech tags and flags
|
2017-05-24 23:58:33 +00:00
|
|
|
|
+tag-model("tagger")
|
2016-10-31 18:04:15 +00:00
|
|
|
|
|
2018-04-29 00:06:46 +00:00
|
|
|
|
+code-exec.
|
|
|
|
|
import spacy
|
|
|
|
|
|
|
|
|
|
nlp = spacy.load('en_core_web_sm')
|
2017-05-24 23:58:33 +00:00
|
|
|
|
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
|
|
|
|
|
apple = doc[0]
|
2018-04-29 00:06:46 +00:00
|
|
|
|
print('Coarse-grained POS tag', apple.pos_, apple.pos)
|
|
|
|
|
print('Fine-grained POS tag', apple.tag_, apple.tag)
|
|
|
|
|
print('Word shape', apple.shape_, apple.shape)
|
|
|
|
|
print('Alphabetic characters?', apple.is_alpha)
|
|
|
|
|
print('Punctuation mark?', apple.is_punct)
|
2017-05-24 23:58:33 +00:00
|
|
|
|
|
|
|
|
|
billion = doc[10]
|
2018-04-29 00:06:46 +00:00
|
|
|
|
print('Digit?', billion.is_digit)
|
|
|
|
|
print('Like a number?', billion.like_num)
|
|
|
|
|
print('Like an email address?', billion.like_email)
|
2017-05-24 23:58:33 +00:00
|
|
|
|
|
|
|
|
|
+infobox
|
2017-10-16 18:36:41 +00:00
|
|
|
|
| #[+label-inline API:] #[+api("token") #[code Token]]
|
|
|
|
|
| #[+label-inline Usage:] #[+a("/usage/linguistic-features#pos-tagging") Part-of-speech tagging]
|
2016-10-31 18:04:15 +00:00
|
|
|
|
|
2017-10-03 12:26:20 +00:00
|
|
|
|
+h(3, "lightning-tour-hashes") Use hash values for any string
|
2016-10-31 18:04:15 +00:00
|
|
|
|
|
2018-04-29 00:06:46 +00:00
|
|
|
|
+code-exec.
|
|
|
|
|
import spacy
|
|
|
|
|
|
|
|
|
|
nlp = spacy.load('en_core_web_sm')
|
2017-05-28 16:19:11 +00:00
|
|
|
|
doc = nlp(u'I love coffee')
|
|
|
|
|
|
2018-04-29 00:06:46 +00:00
|
|
|
|
coffee_hash = nlp.vocab.strings[u'coffee'] # 3197928453018144401
|
|
|
|
|
coffee_text = nlp.vocab.strings[coffee_hash] # 'coffee'
|
|
|
|
|
print(coffee_hash, coffee_text)
|
|
|
|
|
print(doc[2].orth, coffee_hash) # 3197928453018144401
|
|
|
|
|
print(doc[2].text, coffee_text) # 'coffee'
|
2017-05-28 16:19:11 +00:00
|
|
|
|
|
2018-04-29 00:06:46 +00:00
|
|
|
|
beer_hash = doc.vocab.strings.add(u'beer') # 3073001599257881079
|
|
|
|
|
beer_text = doc.vocab.strings[beer_hash] # 'beer'
|
|
|
|
|
print(beer_hash, beer_text)
|
2016-10-31 18:04:15 +00:00
|
|
|
|
|
2018-04-29 00:06:46 +00:00
|
|
|
|
unicorn_hash = doc.vocab.strings.add(u'🦄 ') # 18234233413267120783
|
|
|
|
|
unicorn_text = doc.vocab.strings[unicorn_hash] # '🦄 '
|
|
|
|
|
print(unicorn_hash, unicorn_text)
|
2017-05-28 17:42:44 +00:00
|
|
|
|
|
|
|
|
|
+infobox
|
2017-11-26 17:03:44 +00:00
|
|
|
|
| #[+label-inline API:] #[+api("stringstore") #[code StringStore]]
|
2017-10-16 18:36:41 +00:00
|
|
|
|
| #[+label-inline Usage:] #[+a("/usage/spacy-101#vocab") Vocab, hashes and lexemes 101]
|
2017-05-28 17:42:44 +00:00
|
|
|
|
|
2017-11-09 12:55:13 +00:00
|
|
|
|
+h(3, "lightning-tour-entities") Recognise and update named entities
|
2017-05-24 23:58:33 +00:00
|
|
|
|
+tag-model("NER")
|
2016-10-31 18:04:15 +00:00
|
|
|
|
|
2018-04-29 00:06:46 +00:00
|
|
|
|
+code-exec.
|
|
|
|
|
import spacy
|
|
|
|
|
|
|
|
|
|
nlp = spacy.load('en_core_web_sm')
|
2017-05-24 23:58:33 +00:00
|
|
|
|
doc = nlp(u'San Francisco considers banning sidewalk delivery robots')
|
2018-04-29 00:06:46 +00:00
|
|
|
|
for ent in doc.ents:
|
|
|
|
|
print(ent.text, ent.start_char, ent.end_char, ent.label_)
|
2016-10-31 18:04:15 +00:00
|
|
|
|
|
2017-05-24 23:58:33 +00:00
|
|
|
|
from spacy.tokens import Span
|
2018-04-29 00:06:46 +00:00
|
|
|
|
doc = nlp(u'FB is hiring a new VP of global policy')
|
2017-05-24 23:58:33 +00:00
|
|
|
|
doc.ents = [Span(doc, 0, 1, label=doc.vocab.strings[u'ORG'])]
|
2018-04-29 00:06:46 +00:00
|
|
|
|
for ent in doc.ents:
|
|
|
|
|
print(ent.text, ent.start_char, ent.end_char, ent.label_)
|
2017-05-24 23:58:33 +00:00
|
|
|
|
|
|
|
|
|
+infobox
|
2017-10-16 18:36:41 +00:00
|
|
|
|
| #[+label-inline Usage:] #[+a("/usage/linguistic-features#named-entities") Named entity recognition]
|
2017-05-24 23:58:33 +00:00
|
|
|
|
|
2017-11-26 17:04:18 +00:00
|
|
|
|
+h(3, "lightning-tour-training") Train and update neural network models
|
|
|
|
|
+tag-model
|
|
|
|
|
|
|
|
|
|
+code.
|
|
|
|
|
import spacy
|
|
|
|
|
import random
|
|
|
|
|
|
|
|
|
|
nlp = spacy.load('en')
|
|
|
|
|
train_data = [("Uber blew through $1 million", {'entities': [(0, 4, 'ORG')]})]
|
|
|
|
|
|
2017-12-06 14:26:43 +00:00
|
|
|
|
with nlp.disable_pipes(*[pipe for pipe in nlp.pipe_names if pipe != 'ner']):
|
2017-11-26 17:04:18 +00:00
|
|
|
|
optimizer = nlp.begin_training()
|
|
|
|
|
for i in range(10):
|
|
|
|
|
random.shuffle(train_data)
|
|
|
|
|
for text, annotations in train_data:
|
2018-04-28 12:56:00 +00:00
|
|
|
|
nlp.update([text], [annotations], sgd=optimizer)
|
2017-11-26 17:04:18 +00:00
|
|
|
|
nlp.to_disk('/model')
|
|
|
|
|
|
|
|
|
|
+infobox
|
|
|
|
|
| #[+label-inline API:] #[+api("language#update") #[code Language.update]]
|
|
|
|
|
| #[+label-inline Usage:] #[+a("/usage/training") Training spaCy's statistical models]
|
|
|
|
|
|
2017-10-03 12:26:20 +00:00
|
|
|
|
+h(3, "lightning-tour-displacy") Visualize a dependency parse and named entities in your browser
|
2017-05-24 23:58:33 +00:00
|
|
|
|
+tag-model("dependency parse", "NER")
|
2017-10-28 23:18:09 +00:00
|
|
|
|
+tag-new(2)
|
2016-10-31 18:04:15 +00:00
|
|
|
|
|
2017-05-28 14:41:01 +00:00
|
|
|
|
+aside
|
|
|
|
|
.u-text-center(style="overflow: auto").
|
|
|
|
|
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" class="o-svg" viewBox="270 35 125 240" width="400" height="150" style="max-width: none; color: #fff; background: #1a1e23; font-family: inherit; font-size: 2rem">
|
|
|
|
|
<text fill="currentColor" text-anchor="middle" y="222.0">
|
|
|
|
|
<tspan style="font-weight: bold" fill="currentColor" x="50">This</tspan>
|
|
|
|
|
<tspan dy="2em" class="u-color-theme" style="font-weight: bold" fill="currentColor" x="50">DT</tspan>
|
|
|
|
|
</text>
|
|
|
|
|
<text fill="currentColor" text-anchor="middle" y="222.0">
|
|
|
|
|
<tspan style="font-weight: bold" fill="currentColor" x="225">is</tspan>
|
|
|
|
|
<tspan dy="2em" class="u-color-theme" style="font-weight: bold" fill="currentColor" x="225">VBZ</tspan>
|
|
|
|
|
</text>
|
|
|
|
|
<text fill="currentColor" text-anchor="middle" y="222.0">
|
|
|
|
|
<tspan style="font-weight: bold" fill="currentColor" x="400">a</tspan>
|
|
|
|
|
<tspan dy="2em" class="u-color-theme" style="font-weight: bold" fill="currentColor" x="400">DT</tspan>
|
|
|
|
|
</text>
|
|
|
|
|
<text fill="currentColor" text-anchor="middle" y="222.0">
|
|
|
|
|
<tspan style="font-weight: bold" fill="currentColor" x="575">sentence.</tspan>
|
|
|
|
|
<tspan dy="2em" class="u-color-theme" style="font-weight: bold" fill="currentColor" x="575">NN</tspan>
|
|
|
|
|
</text>
|
|
|
|
|
<path id="arrow-0-0" stroke-width="2px" d="M70,177.0 C70,89.5 220.0,89.5 220.0,177.0" fill="none" stroke="currentColor"/>
|
|
|
|
|
<text dy="1.25em" style="font-size: 0.9em; letter-spacing: 2px">
|
|
|
|
|
<textPath xlink:href="#arrow-0-0" startOffset="50%" fill="currentColor" text-anchor="middle">nsubj</textPath>
|
|
|
|
|
</text>
|
|
|
|
|
<path d="M70,179.0 L62,167.0 78,167.0" fill="currentColor"/>
|
|
|
|
|
<path id="arrow-0-1" stroke-width="2px" d="M420,177.0 C420,89.5 570.0,89.5 570.0,177.0" fill="none" stroke="currentColor"/>
|
|
|
|
|
<text dy="1.25em" style="font-size: 0.9em; letter-spacing: 2px">
|
|
|
|
|
<textPath xlink:href="#arrow-0-1" startOffset="50%" fill="currentColor" text-anchor="middle">det</textPath>
|
|
|
|
|
</text>
|
|
|
|
|
<path d="M420,179.0 L412,167.0 428,167.0" fill="currentColor"/>
|
|
|
|
|
<path id="arrow-0-2" stroke-width="2px" d="M245,177.0 C245,2.0 575.0,2.0 575.0,177.0" fill="none" stroke="currentColor"/>
|
|
|
|
|
<text dy="1.25em" style="font-size: 0.9em; letter-spacing: 2px">
|
|
|
|
|
<textPath xlink:href="#arrow-0-2" startOffset="50%" fill="currentColor" text-anchor="middle">attr</textPath>
|
|
|
|
|
</text>
|
|
|
|
|
<path d="M575.0,179.0 L583.0,167.0 567.0,167.0" fill="currentColor"/>
|
|
|
|
|
</svg>
|
|
|
|
|
|
2016-10-31 18:04:15 +00:00
|
|
|
|
+code.
|
2017-05-24 23:58:33 +00:00
|
|
|
|
from spacy import displacy
|
2016-10-31 18:04:15 +00:00
|
|
|
|
|
2017-05-24 23:58:33 +00:00
|
|
|
|
doc_dep = nlp(u'This is a sentence.')
|
|
|
|
|
displacy.serve(doc_dep, style='dep')
|
|
|
|
|
|
2017-05-25 09:15:56 +00:00
|
|
|
|
doc_ent = nlp(u'When Sebastian Thrun started working on self-driving cars at Google '
|
|
|
|
|
u'in 2007, few people outside of the company took him seriously.')
|
2017-05-24 23:58:33 +00:00
|
|
|
|
displacy.serve(doc_ent, style='ent')
|
|
|
|
|
|
|
|
|
|
+infobox
|
2017-11-01 20:11:10 +00:00
|
|
|
|
| #[+label-inline API:] #[+api("top-level#displacy") #[code displacy]]
|
2017-10-16 18:36:41 +00:00
|
|
|
|
| #[+label-inline Usage:] #[+a("/usage/visualizers") Visualizers]
|
2016-10-31 18:04:15 +00:00
|
|
|
|
|
2017-10-03 12:26:20 +00:00
|
|
|
|
+h(3, "lightning-tour-word-vectors") Get word vectors and similarity
|
2017-05-24 23:58:33 +00:00
|
|
|
|
+tag-model("word vectors")
|
2016-10-31 18:04:15 +00:00
|
|
|
|
|
2018-04-29 00:06:46 +00:00
|
|
|
|
+code-exec.
|
|
|
|
|
import spacy
|
|
|
|
|
|
|
|
|
|
nlp = spacy.load('en_core_web_md')
|
2017-05-24 23:58:33 +00:00
|
|
|
|
doc = nlp(u"Apple and banana are similar. Pasta and hippo aren't.")
|
2018-04-29 00:06:46 +00:00
|
|
|
|
|
2017-05-24 23:58:33 +00:00
|
|
|
|
apple = doc[0]
|
|
|
|
|
banana = doc[2]
|
|
|
|
|
pasta = doc[6]
|
|
|
|
|
hippo = doc[8]
|
2018-04-29 00:06:46 +00:00
|
|
|
|
|
|
|
|
|
print('apple <-> banana', apple.similarity(banana))
|
|
|
|
|
print('pasta <-> hippo', pasta.similarity(hippo))
|
|
|
|
|
print(apple.has_vector, banana.has_vector, pasta.has_vector, hippo.has_vector)
|
2016-10-31 18:04:15 +00:00
|
|
|
|
|
2017-10-16 18:36:41 +00:00
|
|
|
|
p
|
|
|
|
|
| For the best results, you should run this example using the
|
2018-04-29 00:06:46 +00:00
|
|
|
|
| #[+a("/models/en#en_vectors_web_lg") #[code en_vectors_web_lg]] model
|
|
|
|
|
| (currently not available in the live demo).
|
2017-10-16 18:36:41 +00:00
|
|
|
|
|
2017-05-24 23:58:33 +00:00
|
|
|
|
+infobox
|
2017-10-16 18:36:41 +00:00
|
|
|
|
| #[+label-inline Usage:] #[+a("/usage/vectors-similarity") Word vectors and similarity]
|
2016-10-31 18:04:15 +00:00
|
|
|
|
|
2017-10-03 12:26:20 +00:00
|
|
|
|
+h(3, "lightning-tour-serialization") Simple and efficient serialization
|
2016-10-31 18:04:15 +00:00
|
|
|
|
|
|
|
|
|
+code.
|
2017-05-24 23:58:33 +00:00
|
|
|
|
import spacy
|
2017-10-11 00:30:40 +00:00
|
|
|
|
from spacy.tokens import Doc
|
2017-05-27 15:58:06 +00:00
|
|
|
|
from spacy.vocab import Vocab
|
2016-10-31 18:04:15 +00:00
|
|
|
|
|
2017-05-24 23:58:33 +00:00
|
|
|
|
nlp = spacy.load('en')
|
2017-11-26 17:04:04 +00:00
|
|
|
|
customer_feedback = open('customer_feedback_627.txt').read()
|
|
|
|
|
doc = nlp(customer_feedback)
|
|
|
|
|
doc.to_disk('/tmp/customer_feedback_627.bin')
|
2016-10-31 18:04:15 +00:00
|
|
|
|
|
2017-11-26 17:04:04 +00:00
|
|
|
|
new_doc = Doc(Vocab()).from_disk('/tmp/customer_feedback_627.bin')
|
2016-10-31 18:04:15 +00:00
|
|
|
|
|
2017-05-24 23:58:33 +00:00
|
|
|
|
+infobox
|
2017-10-16 18:36:41 +00:00
|
|
|
|
| #[+label-inline API:] #[+api("language") #[code Language]],
|
2017-05-27 22:03:16 +00:00
|
|
|
|
| #[+api("doc") #[code Doc]]
|
2017-10-16 18:36:41 +00:00
|
|
|
|
| #[+label-inline Usage:] #[+a("/usage/models#saving-loading") Saving and loading models]
|
2016-10-31 18:04:15 +00:00
|
|
|
|
|
2017-10-03 12:26:20 +00:00
|
|
|
|
+h(3, "lightning-tour-rule-matcher") Match text with token rules
|
2017-05-25 09:15:56 +00:00
|
|
|
|
|
2018-04-29 00:06:46 +00:00
|
|
|
|
+code-exec.
|
2017-05-25 09:15:56 +00:00
|
|
|
|
import spacy
|
|
|
|
|
from spacy.matcher import Matcher
|
|
|
|
|
|
2018-04-29 00:06:46 +00:00
|
|
|
|
nlp = spacy.load('en_core_web_sm')
|
2017-05-25 09:15:56 +00:00
|
|
|
|
matcher = Matcher(nlp.vocab)
|
2017-05-27 18:02:20 +00:00
|
|
|
|
|
|
|
|
|
def set_sentiment(matcher, doc, i, matches):
|
|
|
|
|
doc.sentiment += 0.1
|
|
|
|
|
|
2018-02-08 10:29:27 +00:00
|
|
|
|
pattern1 = [{'ORTH': 'Google'}, {'ORTH': 'I'}, {'ORTH': '/'}, {'ORTH': 'O'}]
|
2017-05-27 18:02:20 +00:00
|
|
|
|
pattern2 = [[{'ORTH': emoji, 'OP': '+'}] for emoji in ['😀', '😂', '🤣', '😍']]
|
|
|
|
|
matcher.add('GoogleIO', None, pattern1)  # match "Google I/O" (ORTH is case-sensitive)
|
2017-05-28 14:41:01 +00:00
|
|
|
|
matcher.add('HAPPY', set_sentiment, *pattern2) # match one or more happy emoji
|
2018-04-29 00:06:46 +00:00
|
|
|
|
|
|
|
|
|
doc = nlp(u"A text about Google I/O 😀😀")
|
|
|
|
|
matches = matcher(doc)
|
|
|
|
|
|
|
|
|
|
for match_id, start, end in matches:
|
|
|
|
|
string_id = nlp.vocab.strings[match_id]
|
|
|
|
|
span = doc[start:end]
|
|
|
|
|
print(string_id, span.text)
|
|
|
|
|
print('Sentiment', doc.sentiment)
|
2017-05-25 09:15:56 +00:00
|
|
|
|
|
|
|
|
|
+infobox
|
2017-10-16 18:36:41 +00:00
|
|
|
|
| #[+label-inline API:] #[+api("matcher") #[code Matcher]]
|
|
|
|
|
| #[+label-inline Usage:] #[+a("/usage/linguistic-features#rule-based-matching") Rule-based matching]
|
2017-05-25 09:15:56 +00:00
|
|
|
|
|
2017-10-03 12:26:20 +00:00
|
|
|
|
+h(3, "lightning-tour-multi-threaded") Multi-threaded generator
|
2016-10-31 18:04:15 +00:00
|
|
|
|
|
2017-05-24 23:58:33 +00:00
|
|
|
|
+code.
|
|
|
|
|
texts = [u'One document.', u'...', u'Lots of documents']
|
|
|
|
|
# .pipe streams input, and produces streaming output
|
|
|
|
|
iter_texts = (texts[i % 3] for i in range(100000000))
|
|
|
|
|
for i, doc in enumerate(nlp.pipe(iter_texts, batch_size=50, n_threads=4)):
|
|
|
|
|
assert doc.is_parsed
|
|
|
|
|
if i == 100:
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
+infobox
|
2017-10-16 18:36:41 +00:00
|
|
|
|
| #[+label-inline API:] #[+api("doc") #[code Doc]]
|
|
|
|
|
| #[+label-inline Usage:] #[+a("/usage/processing-pipelines#multithreading") Processing pipelines]
|
2017-05-24 23:58:33 +00:00
|
|
|
|
|
2017-10-03 12:26:20 +00:00
|
|
|
|
+h(3, "lightning-tour-dependencies") Get syntactic dependencies
|
2017-05-24 23:58:33 +00:00
|
|
|
|
+tag-model("dependency parse")
|
2016-10-31 18:04:15 +00:00
|
|
|
|
|
2018-04-29 00:06:46 +00:00
|
|
|
|
+code-exec.
|
|
|
|
|
import spacy
|
|
|
|
|
|
|
|
|
|
nlp = spacy.load('en_core_web_sm')
|
|
|
|
|
doc = nlp(u"When Sebastian Thrun started working on self-driving cars at Google "
|
|
|
|
|
u"in 2007, few people outside of the company took him seriously.")
|
|
|
|
|
|
|
|
|
|
dep_labels = []
|
|
|
|
|
for token in doc:
|
2017-12-06 19:08:42 +00:00
|
|
|
|
while token.head != token:
|
2018-04-29 00:06:46 +00:00
|
|
|
|
dep_labels.append(token.dep_)
|
2016-10-31 18:04:15 +00:00
|
|
|
|
token = token.head
|
2018-04-29 00:06:46 +00:00
|
|
|
|
print(dep_labels)
|
2016-10-31 18:04:15 +00:00
|
|
|
|
|
2017-05-24 23:58:33 +00:00
|
|
|
|
+infobox
|
2017-10-16 18:36:41 +00:00
|
|
|
|
| #[+label-inline API:] #[+api("token") #[code Token]]
|
|
|
|
|
| #[+label-inline Usage:] #[+a("/usage/linguistic-features#dependency-parse") Using the dependency parse]
|
2017-05-24 23:58:33 +00:00
|
|
|
|
|
2017-10-03 12:26:20 +00:00
|
|
|
|
+h(3, "lightning-tour-numpy-arrays") Export to numpy arrays
|
2016-10-31 18:04:15 +00:00
|
|
|
|
|
2018-04-29 00:06:46 +00:00
|
|
|
|
+code-exec.
|
|
|
|
|
import spacy
|
|
|
|
|
from spacy.attrs import ORTH, LIKE_URL
|
|
|
|
|
|
|
|
|
|
nlp = spacy.load('en_core_web_sm')
|
|
|
|
|
doc = nlp(u"Check out https://spacy.io")
|
|
|
|
|
for token in doc:
|
|
|
|
|
print(token.text, token.orth, token.like_url)
|
2017-05-24 23:58:33 +00:00
|
|
|
|
|
2018-04-29 00:06:46 +00:00
|
|
|
|
attr_ids = [ORTH, LIKE_URL]
|
2017-05-24 23:58:33 +00:00
|
|
|
|
doc_array = doc.to_array(attr_ids)
|
2018-04-29 00:06:46 +00:00
|
|
|
|
print(doc_array.shape)
|
|
|
|
|
print(len(doc), len(attr_ids))
|
|
|
|
|
|
2017-05-24 23:58:33 +00:00
|
|
|
|
assert doc[0].orth == doc_array[0, 0]
|
|
|
|
|
assert doc[1].orth == doc_array[1, 0]
|
|
|
|
|
assert doc[0].like_url == doc_array[0, 1]
|
2018-04-29 00:06:46 +00:00
|
|
|
|
|
2017-05-24 23:58:33 +00:00
|
|
|
|
assert list(doc_array[:, 1]) == [t.like_url for t in doc]
|
2018-04-29 00:06:46 +00:00
|
|
|
|
print(list(doc_array[:, 1]))
|
2016-10-31 18:04:15 +00:00
|
|
|
|
|
2017-10-03 12:26:20 +00:00
|
|
|
|
+h(3, "lightning-tour-inline") Calculate inline markup on original string
|
2016-10-31 18:04:15 +00:00
|
|
|
|
|
2018-04-29 00:06:46 +00:00
|
|
|
|
+code-exec.
|
|
|
|
|
import spacy
|
|
|
|
|
|
|
|
|
|
def put_spans_around_tokens(doc):
|
|
|
|
|
"""Here, we're building a custom "syntax highlighter" for
|
|
|
|
|
part-of-speech tags and dependencies. We put each token in a
|
2017-05-25 09:15:56 +00:00
|
|
|
|
span element, with the appropriate classes computed. All whitespace is
|
2018-04-29 00:06:46 +00:00
|
|
|
|
preserved, outside of the spans. (Of course, HTML will only display
|
|
|
|
|
multiple whitespace if enabled – but the point is, no information is lost
|
2017-05-25 09:15:56 +00:00
|
|
|
|
and you can calculate what you need, e.g. <br />, <p> etc.)
|
|
|
|
|
"""
|
2016-10-31 18:04:15 +00:00
|
|
|
|
output = []
|
2017-05-25 09:15:56 +00:00
|
|
|
|
html = '<span class="{classes}">{word}</span>{space}'
|
2016-10-31 18:04:15 +00:00
|
|
|
|
for token in doc:
|
|
|
|
|
if token.is_space:
|
2017-05-25 09:15:56 +00:00
|
|
|
|
output.append(token.text)
|
2016-10-31 18:04:15 +00:00
|
|
|
|
else:
|
2018-04-29 00:06:46 +00:00
|
|
|
|
classes = 'pos-{} dep-{}'.format(token.pos_, token.dep_)
|
2017-05-25 09:15:56 +00:00
|
|
|
|
output.append(html.format(classes=classes, word=token.text, space=token.whitespace_))
|
2016-10-31 18:04:15 +00:00
|
|
|
|
string = ''.join(output)
|
|
|
|
|
string = string.replace('\n', '')
|
|
|
|
|
string = string.replace('\t', ' ')
|
2018-06-11 15:47:24 +00:00
|
|
|
|
return '<pre>{}</pre>'.format(string)
|
2018-04-29 00:06:46 +00:00
|
|
|
|
|
|
|
|
|
nlp = spacy.load('en_core_web_sm')
|
|
|
|
|
doc = nlp(u"This is a test.\n\nHello world.")
|
|
|
|
|
html = put_spans_around_tokens(doc)
|
|
|
|
|
print(html)
|