mirror of https://github.com/explosion/spaCy.git
1372 lines
72 KiB
JSON
1372 lines
72 KiB
JSON
{
|
||
"resources": [
|
||
{
|
||
"id": "spacymoji",
|
||
"slogan": "Emoji handling and meta data as a spaCy pipeline component",
|
||
"github": "ines/spacymoji",
|
||
"description": "spaCy v2.0 extension and pipeline component for adding emoji meta data to `Doc` objects. Detects emoji consisting of one or more unicode characters, and can optionally merge multi-char emoji (combined pictures, emoji with skin tone modifiers) into one token. Human-readable emoji descriptions are added as a custom attribute, and an optional lookup table can be provided for your own descriptions. The extension sets the custom `Doc`, `Token` and `Span` attributes `._.is_emoji`, `._.emoji_desc`, `._.has_emoji` and `._.emoji`.",
|
||
"pip": "spacymoji",
|
||
"category": ["pipeline"],
|
||
"tags": ["emoji", "unicode"],
|
||
"thumb": "https://i.imgur.com/XOTYIgn.jpg",
|
||
"code_example": [
|
||
"import spacy",
|
||
"from spacymoji import Emoji",
|
||
"",
|
||
"nlp = spacy.load('en')",
|
||
"emoji = Emoji(nlp)",
|
||
"nlp.add_pipe(emoji, first=True)",
|
||
"",
|
||
"doc = nlp(u'This is a test 😻 👍🏿')",
|
||
"assert doc._.has_emoji == True",
|
||
"assert doc[2:5]._.has_emoji == True",
|
||
"assert doc[0]._.is_emoji == False",
|
||
"assert doc[4]._.is_emoji == True",
|
||
"assert doc[5]._.emoji_desc == u'thumbs up dark skin tone'",
|
||
"assert len(doc._.emoji) == 2",
|
||
"assert doc._.emoji[1] == (u'👍🏿', 5, u'thumbs up dark skin tone')"
|
||
],
|
||
"author": "Ines Montani",
|
||
"author_links": {
|
||
"twitter": "_inesmontani",
|
||
"github": "ines",
|
||
"website": "https://ines.io"
|
||
}
|
||
},
|
||
{
|
||
"id": "spacy_hunspell",
|
||
"slogan": "Add spellchecking and spelling suggestions to your spaCy pipeline using Hunspell",
|
||
"description": "This package uses the [spaCy 2.0 extensions](https://spacy.io/usage/processing-pipelines#extensions) to add [Hunspell](http://hunspell.github.io) support for spellchecking.",
|
||
"github": "tokestermw/spacy_hunspell",
|
||
"pip": "spacy_hunspell",
|
||
"code_example": [
|
||
"import spacy",
|
||
"from spacy_hunspell import spaCyHunSpell",
|
||
"",
|
||
"nlp = spacy.load('en_core_web_sm')",
|
||
"hunspell = spaCyHunSpell(nlp, 'mac')",
|
||
"nlp.add_pipe(hunspell)",
|
||
"doc = nlp('I can haz cheezeburger.')",
|
||
"haz = doc[2]",
|
||
"haz._.hunspell_spell # False",
|
||
"haz._.hunspell_suggest # ['ha', 'haze', 'hazy', 'has', 'hat', 'had', 'hag', 'ham', 'hap', 'hay', 'haw', 'ha z']"
|
||
],
|
||
"author": "Motoki Wu",
|
||
"author_links": {
|
||
"github": "tokestermw",
|
||
"twitter": "plusepsilon"
|
||
},
|
||
"category": ["pipeline"],
|
||
"tags": ["spellcheck"]
|
||
},
|
||
{
|
||
"id": "spacy_grammar",
|
||
"slogan": "Language Tool style grammar handling with spaCy",
|
||
"description": "This package leverages the [Matcher API](https://spacy.io/docs/usage/rule-based-matching) in spaCy to quickly match on spaCy tokens not dissimilar to regex. It reads a `grammar.yml` file to load up custom patterns and returns the results inside `Doc`, `Span`, and `Token`. It is extensible through adding rules to `grammar.yml` (though currently only the simple string matching is implemented).",
|
||
"github": "tokestermw/spacy_grammar",
|
||
"code_example": [
|
||
"import spacy",
|
||
"from spacy_grammar.grammar import Grammar",
|
||
"",
|
||
"nlp = spacy.load('en')",
|
||
"grammar = Grammar(nlp)",
|
||
"nlp.add_pipe(grammar)",
|
||
"doc = nlp('I can haz cheeseburger.')",
|
||
"doc._.has_grammar_error # True"
|
||
],
|
||
"author": "Motoki Wu",
|
||
"author_links": {
|
||
"github": "tokestermw",
|
||
"twitter": "plusepsilon"
|
||
},
|
||
"category": ["pipeline"]
|
||
},
|
||
{
|
||
"id": "spacy_kenlm",
|
||
"slogan": "KenLM extension for spaCy 2.0",
|
||
"github": "tokestermw/spacy_kenlm",
|
||
"pip": "spacy_kenlm",
|
||
"code_example": [
|
||
"import spacy",
|
||
"from spacy_kenlm import spaCyKenLM",
|
||
"",
|
||
"nlp = spacy.load('en_core_web_sm')",
|
||
"spacy_kenlm = spaCyKenLM() # default model from test.arpa",
|
||
"nlp.add_pipe(spacy_kenlm)",
|
||
"doc = nlp('How are you?')",
|
||
"doc._.kenlm_score # doc score",
|
||
"doc[:2]._.kenlm_score # span score",
|
||
"doc[2]._.kenlm_score # token score"
|
||
],
|
||
"author": "Motoki Wu",
|
||
"author_links": {
|
||
"github": "tokestermw",
|
||
"twitter": "plusepsilon"
|
||
},
|
||
"category": ["pipeline"]
|
||
},
|
||
{
|
||
"id": "spacy_readability",
|
||
"slogan": "Add text readability meta data to Doc objects",
|
||
"description": "spaCy v2.0 pipeline component for calculating readability scores of text. Provides scores for Flesch-Kincaid grade level, Flesch-Kincaid reading ease, and Dale-Chall.",
|
||
"github": "mholtzscher/spacy_readability",
|
||
"pip": "spacy-readability",
|
||
"code_example": [
|
||
"import spacy",
|
||
"from spacy_readability import Readability",
|
||
"",
|
||
"nlp = spacy.load('en')",
|
||
"read = Readability(nlp)",
|
||
"nlp.add_pipe(read, last=True)",
|
||
"doc = nlp(\"I am some really difficult text to read because I use obnoxiously large words.\")",
|
||
"doc._.flesch_kincaid_grade_level",
|
||
"doc._.flesch_kincaid_reading_ease",
|
||
"doc._.dale_chall"
|
||
],
|
||
"author": "Michael Holtzscher",
|
||
"author_links": {
|
||
"github": "mholtzscher"
|
||
},
|
||
"category": ["pipeline"]
|
||
},
|
||
{
|
||
"id": "spacy-sentence-segmenter",
|
||
"title": "Sentence Segmenter",
|
||
"slogan": "Custom sentence segmentation for spaCy",
|
||
"code_example": [
|
||
"from seg.newline.segmenter import NewLineSegmenter",
|
||
"import spacy",
|
||
"",
|
||
"nlseg = NewLineSegmenter()",
|
||
"nlp = spacy.load('en')",
|
||
"nlp.add_pipe(nlseg.set_sent_starts, name='sentence_segmenter', before='parser')",
|
||
"doc = nlp(my_doc_text)"
|
||
],
|
||
"author": "tc64",
|
||
"author_links": {
|
||
"github": "tc64"
|
||
},
|
||
"category": ["pipeline"]
|
||
},
|
||
{
|
||
"id": "spacy_cld",
|
||
"title": "spaCy-CLD",
|
||
"slogan": "Add language detection to your spaCy pipeline using CLD2",
|
||
"description": "spaCy-CLD operates on `Doc` and `Span` spaCy objects. When called on a `Doc` or `Span`, the object is given two attributes: `languages` (a list of up to 3 language codes) and `language_scores` (a dictionary mapping language codes to confidence scores between 0 and 1).\n\nspacy-cld is a little extension that wraps the [PYCLD2](https://github.com/aboSamoor/pycld2) Python library, which in turn wraps the [Compact Language Detector 2](https://github.com/CLD2Owners/cld2) C library originally built at Google for the Chromium project. CLD2 uses character n-grams as features and a Naive Bayes classifier to identify 80+ languages from Unicode text strings (or XML/HTML). It can detect up to 3 different languages in a given document, and reports a confidence score for each language.",
|
||
"github": "nickdavidhaynes/spacy-cld",
|
||
"pip": "spacy_cld",
|
||
"code_example": [
|
||
"import spacy",
|
||
"from spacy_cld import LanguageDetector",
|
||
"",
|
||
"nlp = spacy.load('en')",
|
||
"language_detector = LanguageDetector()",
|
||
"nlp.add_pipe(language_detector)",
|
||
"doc = nlp('This is some English text.')",
|
||
"",
|
||
"doc._.languages # ['en']",
|
||
"doc._.language_scores['en'] # 0.96"
|
||
],
|
||
"author": "Nicholas D Haynes",
|
||
"author_links": {
|
||
"github": "nickdavidhaynes"
|
||
},
|
||
"category": ["pipeline"]
|
||
},
|
||
{
|
||
"id": "spacy-lookup",
|
||
"slogan": "A powerful entity matcher for very large dictionaries, using the FlashText module",
|
||
"description": "spaCy v2.0 extension and pipeline component for adding Named Entities metadata to `Doc` objects. Detects Named Entities using dictionaries. The extension sets the custom `Doc`, `Token` and `Span` attributes `._.is_entity`, `._.entity_type`, `._.has_entities` and `._.entities`. Named Entities are matched using the python module `flashtext`, and looked up in the data provided by different dictionaries.",
|
||
"github": "mpuig/spacy-lookup",
|
||
"pip": "spacy-lookup",
|
||
"code_example": [
|
||
"import spacy",
|
||
"from spacy_lookup import Entity",
|
||
"",
|
||
"nlp = spacy.load('en')",
|
||
"entity = Entity(keywords_list=['python', 'java platform'])",
|
||
"nlp.add_pipe(entity, last=True)",
|
||
"",
|
||
"doc = nlp(u\"I am a product manager for a java and python.\")",
|
||
"assert doc._.has_entities == True",
|
||
"assert doc[2:5]._.has_entities == True",
|
||
"assert doc[0]._.is_entity == False",
|
||
"assert doc[3]._.is_entity == True",
|
||
"print(doc._.entities)"
|
||
],
|
||
"author": "Marc Puig",
|
||
"author_links": {
|
||
"github": "mpuig"
|
||
},
|
||
"category": ["pipeline"]
|
||
},
|
||
{
|
||
"id": "spacy-iwnlp",
|
||
"slogan": "German lemmatization with IWNLP",
|
||
"description": "This package uses the [spaCy 2.0 extensions](https://spacy.io/usage/processing-pipelines#extensions) to add [IWNLP-py](https://github.com/Liebeck/iwnlp-py) as German lemmatizer directly into your spaCy pipeline.",
|
||
"github": "Liebeck/spacy-iwnlp",
|
||
"pip": "spacy-iwnlp",
|
||
"code_example": [
|
||
"import spacy",
|
||
"from spacy_iwnlp import spaCyIWNLP",
|
||
"",
|
||
"nlp = spacy.load('de')",
|
||
"iwnlp = spaCyIWNLP(lemmatizer_path='data/IWNLP.Lemmatizer_20170501.json')",
|
||
"nlp.add_pipe(iwnlp)",
|
||
"doc = nlp('Wir mögen Fußballspiele mit ausgedehnten Verlängerungen.')",
|
||
"for token in doc:",
|
||
" print('POS: {}\tIWNLP:{}'.format(token.pos_, token._.iwnlp_lemmas))"
|
||
],
|
||
"author": "Matthias Liebeck",
|
||
"author_links": {
|
||
"github": "Liebeck"
|
||
},
|
||
"category": ["pipeline"],
|
||
"tags": ["lemmatizer", "german"]
|
||
},
|
||
{
|
||
"id": "spacy-sentiws",
|
||
"slogan": "German sentiment scores with SentiWS",
|
||
"description": "This package uses the [spaCy 2.0 extensions](https://spacy.io/usage/processing-pipelines#extensions) to add [SentiWS](http://wortschatz.uni-leipzig.de/en/download) as German sentiment score directly into your spaCy pipeline.",
|
||
"github": "Liebeck/spacy-sentiws",
|
||
"pip": "spacy-sentiws",
|
||
"code_example": [
|
||
"import spacy",
|
||
"from spacy_sentiws import spaCySentiWS",
|
||
"",
|
||
"nlp = spacy.load('de')",
|
||
"sentiws = spaCySentiWS(sentiws_path='data/sentiws/')",
|
||
"nlp.add_pipe(sentiws)",
|
||
"doc = nlp('Die Dummheit der Unterwerfung blüht in hübschen Farben.')",
|
||
"",
|
||
"for token in doc:",
|
||
" print('{}, {}, {}'.format(token.text, token._.sentiws, token.pos_))"
|
||
],
|
||
"author": "Matthias Liebeck",
|
||
"author_links": {
|
||
"github": "Liebeck"
|
||
},
|
||
"category": ["pipeline"],
|
||
"tags": ["sentiment", "german"]
|
||
},
|
||
{
|
||
"id": "spacy-lefff",
|
||
"slogan": "POS and French lemmatization with Lefff",
|
||
"description": "spaCy v2.0 extension and pipeline component for adding a French POS tagger and lemmatizer based on [Lefff](https://hal.inria.fr/inria-00521242/).",
|
||
"github": "sammous/spacy-lefff",
|
||
"pip": "spacy-lefff",
|
||
"code_example": [
|
||
"import spacy",
|
||
"from spacy_lefff import LefffLemmatizer, POSTagger",
|
||
"",
|
||
"nlp = spacy.load('fr')",
|
||
"pos = POSTagger()",
|
||
"french_lemmatizer = LefffLemmatizer(after_melt=True)",
|
||
"nlp.add_pipe(pos, name='pos', after='parser')",
|
||
"nlp.add_pipe(french_lemmatizer, name='lefff', after='pos')",
|
||
"doc = nlp(u\"Paris est une ville très chère.\")",
|
||
"for d in doc:",
|
||
" print(d.text, d.pos_, d._.melt_tagger, d._.lefff_lemma, d.tag_, d.lemma_)"
|
||
],
|
||
"author": "Sami Moustachir",
|
||
"author_links": {
|
||
"github": "sammous"
|
||
},
|
||
"category": ["pipeline"],
|
||
"tags": ["pos", "lemmatizer", "french"]
|
||
},
|
||
{
|
||
"id": "lemmy",
|
||
"title": "Lemmy",
|
||
"slogan": "A Danish lemmatizer",
|
||
"description": "Lemmy is a lemmatizer for Danish 🇩🇰 . It comes already trained on Dansk Sprognævns (DSN) word list (‘fuldformliste’) and the Danish Universal Dependencies and is ready for use. Lemmy also supports training on your own dataset. The model currently included in Lemmy was evaluated on the Danish Universal Dependencies dev dataset and scored an accuracy > 99%.\n\nYou can use Lemmy as a spaCy extension, more specifically a spaCy pipeline component. This is highly recommended and makes the lemmas easily accessible from the spaCy tokens. Lemmy makes use of POS tags to predict the lemmas. When wired up to the spaCy pipeline, Lemmy has the benefit of using spaCy’s builtin POS tagger.",
|
||
"github": "sorenlind/lemmy",
|
||
"pip": "lemmy",
|
||
"code_example": [
|
||
"import da_custom_model as da # name of your spaCy model",
|
||
"import lemmy.pipe",
|
||
"nlp = da.load()",
|
||
"",
|
||
"# create an instance of Lemmy's pipeline component for spaCy",
|
||
"pipe = lemmy.pipe.load()",
|
||
"",
|
||
"# add the component to the spaCy pipeline.",
|
||
"nlp.add_pipe(pipe, after='tagger')",
|
||
"",
|
||
"# lemmas can now be accessed using the `._.lemma` attribute on the tokens",
|
||
"nlp(\"akvariernes\")[0]._.lemma"
|
||
],
|
||
"thumb": "https://i.imgur.com/RJVFRWm.jpg",
|
||
"author": "Søren Lind Kristiansen",
|
||
"author_links": {
|
||
"github": "sorenlind"
|
||
},
|
||
"category": ["pipeline"],
|
||
"tags": ["lemmatizer", "danish"]
|
||
},
|
||
{
|
||
"id": "wmd-relax",
|
||
"slogan": "Calculates word mover's distance insanely fast",
|
||
"description": "Calculates Word Mover's Distance as described in [From Word Embeddings To Document Distances](http://www.cs.cornell.edu/~kilian/papers/wmd_metric.pdf) by Matt Kusner, Yu Sun, Nicholas Kolkin and Kilian Weinberger.\n\n⚠️ **This package is currently only compatible with spaCy v1.x.**",
|
||
"github": "src-d/wmd-relax",
|
||
"thumb": "https://i.imgur.com/f91C3Lf.jpg",
|
||
"code_example": [
|
||
"import spacy",
|
||
"import wmd",
|
||
"",
|
||
"nlp = spacy.load('en', create_pipeline=wmd.WMD.create_spacy_pipeline)",
|
||
"doc1 = nlp(\"Politician speaks to the media in Illinois.\")",
|
||
"doc2 = nlp(\"The president greets the press in Chicago.\")",
|
||
"print(doc1.similarity(doc2))"
|
||
],
|
||
"author": "source{d}",
|
||
"author_links": {
|
||
"github": "src-d",
|
||
"twitter": "sourcedtech",
|
||
"website": "https://sourced.tech"
|
||
},
|
||
"category": ["pipeline"]
|
||
},
|
||
{
|
||
"id": "neuralcoref",
|
||
"slogan": "State-of-the-art coreference resolution based on neural nets and spaCy",
|
||
"description": "This coreference resolution module is based on the super fast [spaCy](https://spacy.io/) parser and uses the neural net scoring model described in [Deep Reinforcement Learning for Mention-Ranking Coreference Models](http://cs.stanford.edu/people/kevclark/resources/clark-manning-emnlp2016-deep.pdf) by Kevin Clark and Christopher D. Manning, EMNLP 2016. With ✨Neuralcoref v2.0, you should now be able to train the coreference resolution system on your own dataset — e.g., another language than English! — **provided you have an annotated dataset**.",
|
||
"github": "huggingface/neuralcoref",
|
||
"thumb": "https://i.imgur.com/j6FO9O6.jpg",
|
||
"code_example": [
|
||
"from neuralcoref import Coref",
|
||
"",
|
||
"coref = Coref()",
|
||
"clusters = coref.one_shot_coref(utterances=u\"She loves him.\", context=u\"My sister has a dog.\")",
|
||
"mentions = coref.get_mentions()",
|
||
"utterances = coref.get_utterances()",
|
||
"resolved_utterance_text = coref.get_resolved_utterances()"
|
||
],
|
||
"author": "Hugging Face",
|
||
"author_links": {
|
||
"github": "huggingface"
|
||
},
|
||
"category": ["standalone", "conversational"],
|
||
"tags": ["coref"]
|
||
},
|
||
{
|
||
"id": "neuralcoref-vizualizer",
|
||
"title": "Neuralcoref Visualizer",
|
||
"slogan": "State-of-the-art coreference resolution based on neural nets and spaCy",
|
||
"description": "In short, coreference is the fact that two or more expressions in a text – like pronouns or nouns – link to the same person or thing. It is a classical Natural language processing task, that has seen a revival of interest in the past two years as several research groups applied cutting-edge deep-learning and reinforcement-learning techniques to it. It is also one of the key building blocks to building conversational Artificial intelligences.",
|
||
"url": "https://huggingface.co/coref/",
|
||
"image": "https://i.imgur.com/3yy4Qyf.png",
|
||
"thumb": "https://i.imgur.com/j6FO9O6.jpg",
|
||
"github": "huggingface/neuralcoref",
|
||
"category": ["visualizers", "conversational"],
|
||
"tags": ["coref", "chatbots"],
|
||
"author": "Hugging Face",
|
||
"author_links": {
|
||
"github": "huggingface"
|
||
}
|
||
},
|
||
{
|
||
"id": "spacy-vis",
|
||
"slogan": "A visualisation tool for spaCy using Hierplane",
|
||
"description": "A visualiser for spaCy annotations. This visualisation uses the [Hierplane](https://allenai.github.io/hierplane/) Library to render the dependency parse from spaCy's models. It also includes visualisation of entities and POS tags within nodes.",
|
||
"github": "DeNeutoy/spacy-vis",
|
||
"url": "http://spacyvis.allennlp.org/spacy-parser",
|
||
"thumb": "https://i.imgur.com/DAG9QFd.jpg",
|
||
"image": "https://raw.githubusercontent.com/DeNeutoy/spacy-vis/master/img/example.gif",
|
||
"author": "Mark Neumann",
|
||
"author_links": {
|
||
"twitter": "MarkNeumannnn",
|
||
"github": "DeNeutoy"
|
||
},
|
||
"category": ["visualizers"]
|
||
},
|
||
{
|
||
"id": "matcher-explorer",
|
||
"title": "Rule-based Matcher Explorer",
|
||
"slogan": "Test spaCy's rule-based Matcher by creating token patterns interactively",
|
||
"description": "Test spaCy's rule-based `Matcher` by creating token patterns interactively and running them over your text. Each token can set multiple attributes like text value, part-of-speech tag or boolean flags. The token-based view lets you explore how spaCy processes your text – and why your pattern matches, or why it doesn't. For more details on rule-based matching, see the [documentation](https://spacy.io/usage/rule-based-matching).",
|
||
"image": "https://explosion.ai/assets/img/demos/matcher.png",
|
||
"thumb": "https://i.imgur.com/rPK4AGt.jpg",
|
||
"url": "https://explosion.ai/demos/matcher",
|
||
"author": "Ines Montani",
|
||
"author_links": {
|
||
"twitter": "_inesmontani",
|
||
"github": "ines",
|
||
"website": "https://ines.io"
|
||
},
|
||
"category": ["visualizers"]
|
||
},
|
||
{
|
||
"id": "displacy",
|
||
"title": "displaCy",
|
||
"slogan": "A modern syntactic dependency visualizer",
|
||
"description": "Visualize spaCy's guess at the syntactic structure of a sentence. Arrows point from children to heads, and are labelled by their relation type.",
|
||
"url": "https://explosion.ai/demos/displacy",
|
||
"thumb": "https://i.imgur.com/nxDcHaL.jpg",
|
||
"image": "https://explosion.ai/assets/img/demos/displacy.png",
|
||
"author": "Ines Montani",
|
||
"author_links": {
|
||
"twitter": "_inesmontani",
|
||
"github": "ines",
|
||
"website": "https://ines.io"
|
||
},
|
||
"category": ["visualizers"]
|
||
},
|
||
{
|
||
"id": "displacy-ent",
|
||
"title": "displaCy ENT",
|
||
"slogan": "A modern named entity visualizer",
|
||
"description": "Visualize spaCy's guess at the named entities in the document. You can filter the displayed types, to only show the annotations you're interested in.",
|
||
"url": "https://explosion.ai/demos/displacy-ent",
|
||
"thumb": "https://i.imgur.com/A77Ecbs.jpg",
|
||
"image": "https://explosion.ai/assets/img/demos/displacy-ent.png",
|
||
"author": "Ines Montani",
|
||
"author_links": {
|
||
"twitter": "_inesmontani",
|
||
"github": "ines",
|
||
"website": "https://ines.io"
|
||
},
|
||
"category": ["visualizers"]
|
||
},
|
||
{
|
||
"id": "explacy",
|
||
"slogan": "A small tool that explains spaCy parse results",
|
||
"github": "tylerneylon/explacy",
|
||
"thumb": "https://i.imgur.com/V1hCWmn.jpg",
|
||
"image": "https://raw.githubusercontent.com/tylerneylon/explacy/master/img/screenshot.png",
|
||
"code_example": [
|
||
"import spacy",
|
||
"import explacy",
|
||
"",
|
||
"nlp = spacy.load('en')",
|
||
"explacy.print_parse_info(nlp, 'The salad was surprisingly tasty.')"
|
||
],
|
||
"author": "Tyler Neylon",
|
||
"author_links": {
|
||
"github": "tylerneylon"
|
||
},
|
||
"category": ["visualizers"]
|
||
},
|
||
{
|
||
"id": "scattertext",
|
||
"slogan": "Beautiful visualizations of how language differs among document types",
|
||
"description": "A tool for finding distinguishing terms in small-to-medium-sized corpora, and presenting them in a sexy, interactive scatter plot with non-overlapping term labels. Exploratory data analysis just got more fun.",
|
||
"github": "JasonKessler/scattertext",
|
||
"image": "https://jasonkessler.github.io/2012conventions0.0.2.2.png",
|
||
"code_example": [
|
||
"import spacy",
|
||
"import scattertext as st",
|
||
"",
|
||
"nlp = spacy.load('en')",
|
||
"corpus = st.CorpusFromPandas(convention_df,",
|
||
" category_col='party',",
|
||
" text_col='text',",
|
||
" nlp=nlp).build()"
|
||
],
|
||
"author": "Jason Kessler",
|
||
"author_links": {
|
||
"github": "JasonKessler",
|
||
"twitter": "jasonkessler"
|
||
},
|
||
"category": ["visualizers"]
|
||
},
|
||
{
|
||
"id": "rasa",
|
||
"title": "Rasa NLU",
|
||
"slogan": "Turn natural language into structured data",
|
||
"description": "Rasa NLU (Natural Language Understanding) is a tool for understanding what is being said in short pieces of text. Rasa NLU is primarily used to build chatbots and voice apps, where this is called intent classification and entity extraction. To use Rasa, *you have to provide some training data*.",
|
||
"github": "RasaHQ/rasa_nlu",
|
||
"pip": "rasa_nlu",
|
||
"thumb": "https://i.imgur.com/ndCfKNq.png",
|
||
"url": "https://nlu.rasa.com/",
|
||
"author": "Rasa",
|
||
"author_links": {
|
||
"github": "RasaHQ"
|
||
},
|
||
"category": ["conversational"],
|
||
"tags": ["chatbots"]
|
||
},
|
||
{
|
||
"id": "tochtext",
|
||
"title": "torchtext",
|
||
"slogan": "Data loaders and abstractions for text and NLP",
|
||
"github": "pytorch/text",
|
||
"pip": "torchtext",
|
||
"thumb": "https://i.imgur.com/WFkxuPo.png",
|
||
"code_example": [
|
||
">>> pos = data.TabularDataset(",
|
||
"... path='data/pos/pos_wsj_train.tsv', format='tsv',",
|
||
"... fields=[('text', data.Field()),",
|
||
"... ('labels', data.Field())])",
|
||
"...",
|
||
">>> sentiment = data.TabularDataset(",
|
||
"... path='data/sentiment/train.json', format='json',",
|
||
"... fields={'sentence_tokenized': ('text', data.Field(sequential=True)),",
|
||
"... 'sentiment_gold': ('labels', data.Field(sequential=False))})"
|
||
],
|
||
"category": ["standalone", "research"],
|
||
"tags": ["pytorch"]
|
||
},
|
||
{
|
||
"id": "allennlp",
|
||
"title": "AllenNLP",
|
||
"slogan": "An open-source NLP research library, built on PyTorch and spaCy",
|
||
"description": "AllenNLP is a new library designed to accelerate NLP research, by providing a framework that supports modern deep learning workflows for cutting-edge language understanding problems. AllenNLP uses spaCy as a preprocessing component. You can also use Allen NLP to develop spaCy pipeline components, to add annotations to the `Doc` object.",
|
||
"github": "allenai/allennlp",
|
||
"pip": "allennlp",
|
||
"thumb": "https://i.imgur.com/U8opuDN.jpg",
|
||
"url": "http://allennlp.org",
|
||
"author": "Allen Institute for Artificial Intelligence",
|
||
"author_links": {
|
||
"github": "allenai",
|
||
"twitter": "allenai_org",
|
||
"website": "http://allenai.org"
|
||
},
|
||
"category": ["standalone", "research"]
|
||
},
|
||
{
|
||
"id": "scispacy",
|
||
"title": "scispaCy",
|
||
"slogan": "A full spaCy pipeline and models for scientific/biomedical documents",
|
||
"github": "allenai/scispacy",
|
||
"pip": "scispacy",
|
||
"thumb": "https://i.imgur.com/dJQSclW.png",
|
||
"url": "https://allenai.github.io/scispacy/",
|
||
"author": "Allen Institute for Artificial Intelligence",
|
||
"author_links": {
|
||
"github": "allenai",
|
||
"twitter": "allenai_org",
|
||
"website": "http://allenai.org"
|
||
},
|
||
"category": ["models", "research"]
|
||
},
|
||
{
|
||
"id": "textacy",
|
||
"slogan": "NLP, before and after spaCy",
|
||
"description": "`textacy` is a Python library for performing a variety of natural language processing (NLP) tasks, built on the high-performance `spacy` library. With the fundamentals – tokenization, part-of-speech tagging, dependency parsing, etc. – delegated to another library, `textacy` focuses on the tasks that come before and follow after.",
|
||
"github": "chartbeat-labs/textacy",
|
||
"pip": "textacy",
|
||
"url": "https://chartbeat-labs.github.io/textacy/",
|
||
"author": "Burton DeWilde",
|
||
"author_links": {
|
||
"github": "bdewilde",
|
||
"twitter": "bjdewilde"
|
||
},
|
||
"category": ["standalone"]
|
||
},
|
||
{
|
||
"id": "textpipe",
|
||
"slogan": "Clean and extract metadata from text",
|
||
"description": "`textpipe` is a Python package for converting raw text into clean, readable text and extracting metadata from that text. Its functionalities include transforming raw text into readable text by removing HTML tags and extracting metadata such as the number of words and named entities from the text.",
|
||
"github": "textpipe/textpipe",
|
||
"pip": "textpipe",
|
||
"author": "Textpipe Contributors",
|
||
"author_links": {
|
||
"github": "textpipe",
|
||
"website": "https://github.com/textpipe/textpipe/blob/master/CONTRIBUTORS.md"
|
||
},
|
||
"category": ["standalone"],
|
||
"tags": ["text-processing", "named-entity-recognition"],
|
||
"thumb": "https://avatars0.githubusercontent.com/u/40492530",
|
||
"code_example": [
|
||
"from textpipe import doc, pipeline",
|
||
"sample_text = 'Sample text! <!DOCTYPE>'",
|
||
"document = doc.Doc(sample_text)",
|
||
"print(document.clean)",
|
||
"# 'Sample text!'",
|
||
"print(document.language)",
|
||
"# 'en'",
|
||
"print(document.nwords)",
|
||
"# 2",
|
||
"",
|
||
"pipe = pipeline.Pipeline(['CleanText', 'NWords'])",
|
||
"print(pipe(sample_text))",
|
||
"# {'CleanText': 'Sample text!', 'NWords': 2}"
|
||
]
|
||
},
|
||
{
|
||
"id": "mordecai",
|
||
"slogan": "Full text geoparsing using spaCy, Geonames and Keras",
|
||
"description": "Extract the place names from a piece of text, resolve them to the correct place, and return their coordinates and structured geographic information.",
|
||
"github": "openeventdata/mordecai",
|
||
"pip": "mordecai",
|
||
"thumb": "https://i.imgur.com/gPJ9upa.jpg",
|
||
"code_example": [
|
||
"from mordecai import Geoparser",
|
||
"geo = Geoparser()",
|
||
"geo.geoparse(\"I traveled from Oxford to Ottawa.\")"
|
||
],
|
||
"author": "Andy Halterman",
|
||
"author_links": {
|
||
"github": "ahalterman",
|
||
"twitter": "ahalterman"
|
||
},
|
||
"category": ["standalone"]
|
||
},
|
||
{
|
||
"id": "kindred",
|
||
"title": "Kindred",
|
||
"slogan": "Biomedical relation extraction using spaCy",
|
||
"description": "Kindred is a package for relation extraction in biomedical texts. Given some training data, it can build a model to identify relations between entities (e.g. drugs, genes, etc) in a sentence.",
|
||
"github": "jakelever/kindred",
|
||
"pip": "kindred",
|
||
"code_example": [
|
||
"import kindred",
|
||
"",
|
||
"trainCorpus = kindred.bionlpst.load('2016-BB3-event-train')",
|
||
"devCorpus = kindred.bionlpst.load('2016-BB3-event-dev')",
|
||
"predictionCorpus = devCorpus.clone()",
|
||
"predictionCorpus.removeRelations()",
|
||
"classifier = kindred.RelationClassifier()",
|
||
"classifier.train(trainCorpus)",
|
||
"classifier.predict(predictionCorpus)",
|
||
"f1score = kindred.evaluate(devCorpus, predictionCorpus, metric='f1score')"
|
||
],
|
||
"author": "Jake Lever",
|
||
"author_links": {
|
||
"github": "jakelever"
|
||
},
|
||
"category": ["standalone"]
|
||
},
|
||
{
|
||
"id": "sense2vec",
|
||
"slogan": "Use NLP to go beyond vanilla word2vec",
|
||
"description": "sense2vec ([Trask et al.](https://arxiv.org/abs/1511.06388), 2015) is a nice twist on [word2vec](https://en.wikipedia.org/wiki/Word2vec) that lets you learn more interesting, detailed and context-sensitive word vectors. For an interactive example of the technology, see our [sense2vec demo](https://explosion.ai/demos/sense2vec) that lets you explore semantic similarities across all Reddit comments of 2015.",
|
||
"github": "explosion/sense2vec",
|
||
"pip": "sense2vec==1.0.0a0",
|
||
"thumb": "https://i.imgur.com/awfdhX6.jpg",
|
||
"image": "https://explosion.ai/assets/img/demos/sense2vec.png",
|
||
"url": "https://explosion.ai/demos/sense2vec",
|
||
"code_example": [
|
||
"import spacy",
|
||
"from sense2vec import Sense2VecComponent",
|
||
"",
|
||
"nlp = spacy.load('en')",
|
||
"s2v = Sense2VecComponent('/path/to/reddit_vectors-1.1.0')",
|
||
"nlp.add_pipe(s2v)",
|
||
"",
|
||
"doc = nlp(u\"A sentence about natural language processing.\")",
|
||
"assert doc[3].text == u'natural language processing'",
|
||
"freq = doc[3]._.s2v_freq",
|
||
"vector = doc[3]._.s2v_vec",
|
||
"most_similar = doc[3]._.s2v_most_similar(3)",
|
||
"# [(('natural language processing', 'NOUN'), 1.0),",
|
||
"# (('machine learning', 'NOUN'), 0.8986966609954834),",
|
||
"# (('computer vision', 'NOUN'), 0.8636297583580017)]"
|
||
],
|
||
"category": ["pipeline", "standalone", "visualizers"],
|
||
"tags": ["vectors"],
|
||
"author": "Explosion AI",
|
||
"author_links": {
|
||
"twitter": "explosion_ai",
|
||
"github": "explosion",
|
||
"website": "https://explosion.ai"
|
||
}
|
||
},
|
||
{
|
||
"id": "spacyr",
|
||
"slogan": "An R wrapper for spaCy",
|
||
"github": "quanteda/spacyr",
|
||
"cran": "spacyr",
|
||
"code_example": [
|
||
"library(\"spacyr\")",
|
||
"spacy_initialize()",
|
||
"",
|
||
"txt <- c(d1 = \"spaCy excels at large-scale information extraction tasks.\",",
|
||
" d2 = \"Mr. Smith goes to North Carolina.\")",
|
||
"",
|
||
"# process documents and obtain a data.table",
|
||
"parsedtxt <- spacy_parse(txt)"
|
||
],
|
||
"code_language": "r",
|
||
"author": "Kenneth Benoit & Aki Matsuo",
|
||
"category": ["nonpython"]
|
||
},
|
||
{
|
||
"id": "cleannlp",
|
||
"title": "CleanNLP",
|
||
"slogan": "A tidy data model for NLP in R",
|
||
"description": "The cleanNLP package is designed to make it as painless as possible to turn raw text into feature-rich data frames. The package offers four backends that can be used for parsing text: `tokenizers`, `udpipe`, `spacy` and `corenlp`.",
|
||
"github": "statsmaths/cleanNLP",
|
||
"cran": "cleanNLP",
|
||
"author": "Taylor B. Arnold",
|
||
"author_links": {
|
||
"github": "statsmaths"
|
||
},
|
||
"category": ["nonpython"]
|
||
},
|
||
{
|
||
"id": "spacy-cpp",
|
||
"slogan": "C++ wrapper library for spaCy",
|
||
"description": "The goal of spacy-cpp is to expose the functionality of spaCy to C++ applications, and to provide an API that is similar to that of spaCy, enabling rapid development in Python and simple porting to C++.",
|
||
"github": "d99kris/spacy-cpp",
|
||
"code_example": [
|
||
"Spacy::Spacy spacy;",
|
||
"auto nlp = spacy.load(\"en_core_web_sm\");",
|
||
"auto doc = nlp.parse(\"This is a sentence.\");",
|
||
"for (auto& token : doc.tokens())",
|
||
" std::cout << token.text() << \" [\" << token.pos_() << \"]\\n\";"
|
||
],
|
||
"code_language": "cpp",
|
||
"author": "Kristofer Berggren",
|
||
"author_links": {
|
||
"github": "d99kris"
|
||
},
|
||
"category": ["nonpython"]
|
||
},
|
||
{
|
||
"id": "spaCy.jl",
|
||
"slogan": "Julia interface for spaCy (work in progress)",
|
||
"github": "jekbradbury/SpaCy.jl",
|
||
"author": "James Bradbury",
|
||
"author_links": {
|
||
"github": "jekbradbury",
|
||
"twitter": "jekbradbury"
|
||
},
|
||
"category": ["nonpython"]
|
||
},
|
||
{
|
||
"id": "spacy_api",
|
||
"slogan": "Server/client to load models in a separate, dedicated process",
|
||
"github": "kootenpv/spacy_api",
|
||
"pip": "spacy_api",
|
||
"code_example": [
|
||
"from spacy_api import Client",
|
||
"",
|
||
"spacy_client = Client() # default args host/port",
|
||
"doc = spacy_client.single(\"How are you\")"
|
||
],
|
||
"author": "Pascal van Kooten",
|
||
"author_links": {
|
||
"github": "kootenpv"
|
||
},
|
||
"category": ["apis"]
|
||
},
|
||
{
|
||
"id": "spacy-api-docker",
|
||
"slogan": "spaCy REST API, wrapped in a Docker container",
|
||
"github": "jgontrum/spacy-api-docker",
|
||
"url": "https://hub.docker.com/r/jgontrum/spacyapi/",
|
||
"thumb": "https://i.imgur.com/NRnDKyj.jpg",
|
||
"code_example": [
|
||
"version: '2'",
|
||
"",
|
||
"services:",
|
||
" spacyapi:",
|
||
" image: jgontrum/spacyapi:en_v2",
|
||
" ports:",
|
||
" - \"127.0.0.1:8080:80\"",
|
||
" restart: always"
|
||
],
|
||
"code_language": "docker",
|
||
"author": "Johannes Gontrum",
|
||
"author_links": {
|
||
"github": "jgontrum"
|
||
},
|
||
"category": ["apis"]
|
||
},
|
||
{
|
||
"id": "languagecrunch",
|
||
"slogan": "NLP server for spaCy, WordNet and NeuralCoref as a Docker image",
|
||
"github": "artpar/languagecrunch",
|
||
"code_example": [
|
||
"docker run -it -p 8080:8080 artpar/languagecrunch",
|
||
"curl http://localhost:8080/nlp/parse?`echo -n \"The new twitter is so weird. Seriously. Why is there a new twitter? What was wrong with the old one? Fix it now.\" | python -c \"import urllib, sys; print(urllib.urlencode({'sentence': sys.stdin.read()}))\"`"
|
||
],
|
||
"code_language": "bash",
|
||
"author": "Parth Mudgal",
|
||
"author_links": {
|
||
"github": "artpar"
|
||
},
|
||
"category": ["apis"]
|
||
},
|
||
{
|
||
"id": "spacy-nlp",
|
||
"slogan": " Expose spaCy NLP text parsing to Node.js (and other languages) via Socket.IO",
|
||
"github": "kengz/spacy-nlp",
|
||
"thumb": "https://i.imgur.com/w41VSr7.jpg",
|
||
"code_example": [
|
||
"const spacyNLP = require(\"spacy-nlp\")",
|
||
"// default port 6466",
|
||
"// start the server with the python client that exposes spacyIO (or use an existing socketIO server at IOPORT)",
|
||
"var serverPromise = spacyNLP.server({ port: process.env.IOPORT });",
|
||
"// Loading spacy may take up to 15s"
|
||
],
|
||
"code_language": "javascript",
|
||
"author": "Wah Loon Keng",
|
||
"author_links": {
|
||
"github": "kengz"
|
||
},
|
||
"category": ["apis", "nonpython"]
|
||
},
|
||
{
|
||
"id": "prodigy",
|
||
"title": "Prodigy",
|
||
"slogan": "Radically efficient machine teaching, powered by active learning",
|
||
"description": "Prodigy is an annotation tool so efficient that data scientists can do the annotation themselves, enabling a new level of rapid iteration. Whether you're working on entity recognition, intent detection or image classification, Prodigy can help you train and evaluate your models faster. Stream in your own examples or real-world data from live APIs, update your model in real-time and chain models together to build more complex systems.",
|
||
"thumb": "https://i.imgur.com/UVRtP6g.jpg",
|
||
"image": "https://i.imgur.com/Dt5vrY6.png",
|
||
"url": "https://prodi.gy",
|
||
"code_example": [
|
||
"prodigy dataset ner_product \"Improve PRODUCT on Reddit data\"",
|
||
"✨ Created dataset 'ner_product'.",
|
||
"",
|
||
"prodigy ner.teach ner_product en_core_web_sm ~/data.jsonl --label PRODUCT",
|
||
"✨ Starting the web server on port 8080..."
|
||
],
|
||
"code_language": "bash",
|
||
"category": ["standalone", "training"],
|
||
"author": "Explosion AI",
|
||
"author_links": {
|
||
"twitter": "explosion_ai",
|
||
"github": "explosion",
|
||
"website": "https://explosion.ai"
|
||
}
|
||
},
|
||
{
|
||
"id": "dragonfire",
|
||
"title": "Dragonfire",
|
||
"slogan": "An open-source virtual assistant for Ubuntu based Linux distributions",
|
||
"github": "DragonComputer/Dragonfire",
|
||
"thumb": "https://i.imgur.com/5fqguKS.jpg",
|
||
"image": "https://raw.githubusercontent.com/DragonComputer/Dragonfire/master/docs/img/demo.gif",
|
||
"author": "Dragon Computer",
|
||
"author_links": {
|
||
"github": "DragonComputer",
|
||
"website": "http://dragon.computer"
|
||
},
|
||
"category": ["standalone"]
|
||
},
|
||
{
|
||
"type": "education",
|
||
"id": "oreilly-python-ds",
|
||
"title": "Introduction to Machine Learning with Python: A Guide for Data Scientists",
|
||
"slogan": "O'Reilly, 2016",
|
||
"description": "Machine learning has become an integral part of many commercial applications and research projects, but this field is not exclusive to large companies with extensive research teams. If you use Python, even as a beginner, this book will teach you practical ways to build your own machine learning solutions. With all the data available today, machine learning applications are limited only by your imagination.",
|
||
"cover": "https://covers.oreillystatic.com/images/0636920030515/lrg.jpg",
|
||
"url": "http://shop.oreilly.com/product/0636920030515.do",
|
||
"author": "Andreas Müller, Sarah Guido",
|
||
"category": ["books"]
|
||
},
|
||
{
|
||
"type": "education",
|
||
"id": "text-analytics-python",
|
||
"title": "Text Analytics with Python",
|
||
"slogan": "Apress / Springer, 2016",
|
||
"description": "*Text Analytics with Python* teaches you the techniques related to natural language processing and text analytics, and you will gain the skills to know which technique is best suited to solve a particular problem. You will look at each technique and algorithm with both a bird's eye view to understand how it can be used as well as with a microscopic view to understand the mathematical concepts and to implement them to solve your own problems.",
|
||
"github": "dipanjanS/text-analytics-with-python",
|
||
"cover": "https://i.imgur.com/AOmzZu8.png",
|
||
"url": "https://www.amazon.com/Text-Analytics-Python-Real-World-Actionable/dp/148422387X",
|
||
"author": "Dipanjan Sarkar",
|
||
"category": ["books"]
|
||
},
|
||
{
|
||
"type": "education",
|
||
"id": "practical-ml-python",
|
||
"title": "Practical Machine Learning with Python",
|
||
"slogan": "Apress, 2017",
|
||
"description": "Master the essential skills needed to recognize and solve complex problems with machine learning and deep learning. Using real-world examples that leverage the popular Python machine learning ecosystem, this book is your perfect companion for learning the art and science of machine learning to become a successful practitioner. The concepts, techniques, tools, frameworks, and methodologies used in this book will teach you how to think, design, build, and execute machine learning systems and projects successfully.",
|
||
"github": "dipanjanS/practical-machine-learning-with-python",
|
||
"cover": "https://i.imgur.com/5F4mkt7.jpg",
|
||
"url": "https://www.amazon.com/Practical-Machine-Learning-Python-Problem-Solvers/dp/1484232062",
|
||
"author": "Dipanjan Sarkar, Raghav Bali, Tushar Sharma",
|
||
"category": ["books"]
|
||
},
|
||
{
|
||
"type": "education",
|
||
"id": "packt-nlp-computational-linguistics",
|
||
"title": "Natural Language Processing and Computational Linguistics",
|
||
"slogan": "Packt, 2018",
|
||
"description": "This book shows you how to use natural language processing, and computational linguistics algorithms, to make inferences and gain insights about data you have. These algorithms are based on statistical machine learning and artificial intelligence techniques. The tools to work with these algorithms are available to you right now - with Python, and tools like Gensim and spaCy.",
|
||
"cover": "https://i.imgur.com/aleMf1Y.jpg",
|
||
"url": "https://www.amazon.com/Natural-Language-Processing-Computational-Linguistics-ebook/dp/B07BWH779J",
|
||
"author": "Bhargav Srinivasa-Desikan",
|
||
"category": ["books"]
|
||
},
|
||
{
|
||
"type": "education",
|
||
"id": "datacamp-nlp-fundamentals",
|
||
"title": "Natural Language Processing Fundamentals in Python",
|
||
"slogan": "Datacamp, 2017",
|
||
"description": "In this course, you'll learn Natural Language Processing (NLP) basics, such as how to identify and separate words, how to extract topics in a text, and how to build your own fake news classifier. You'll also learn how to use basic libraries such as NLTK, alongside libraries which utilize deep learning to solve common NLP problems. This course will give you the foundation to process and parse text as you move forward in your Python learning.",
|
||
"url": "https://www.datacamp.com/courses/natural-language-processing-fundamentals-in-python",
|
||
"thumb": "https://i.imgur.com/0Zks7c0.jpg",
|
||
"author": "Katharine Jarmul",
|
||
"author_links": {
|
||
"twitter": "kjam"
|
||
},
|
||
"category": ["courses"]
|
||
},
|
||
{
|
||
"type": "education",
|
||
"id": "datacamp-advanced-nlp",
|
||
"title": "Advanced Natural Language Processing with spaCy",
|
||
"slogan": "Datacamp, 2019",
|
||
"description": "If you're working with a lot of text, you'll eventually want to know more about it. For example, what's it about? What do the words mean in context? Who is doing what to whom? What companies and products are mentioned? Which texts are similar to each other? In this course, you'll learn how to use spaCy, a fast-growing industry standard library for NLP in Python, to build advanced natural language understanding systems, using both rule-based and machine learning approaches.",
|
||
"url": "https://www.datacamp.com/courses/advanced-nlp-with-spacy",
|
||
"thumb": "https://i.imgur.com/0Zks7c0.jpg",
|
||
"author": "Ines Montani",
|
||
"author_links": {
|
||
"twitter": "_inesmontani",
|
||
"github": "ines",
|
||
"website": "https://ines.io"
|
||
},
|
||
"category": ["courses"]
|
||
},
|
||
{
|
||
"type": "education",
|
||
"id": "learning-path-spacy",
|
||
"title": "Learning Path: Mastering spaCy for Natural Language Processing",
|
||
"slogan": "O'Reilly, 2017",
|
||
"description": "spaCy, a fast, user-friendly library for teaching computers to understand text, simplifies NLP techniques, such as speech tagging and syntactic dependencies, so you can easily extract information, attributes, and objects from massive amounts of text to then document, measure, and analyze. This Learning Path is a hands-on introduction to using spaCy to discover insights through natural language processing. While end-to-end natural language processing solutions can be complex, you’ll learn the linguistics, algorithms, and machine learning skills to get the job done.",
|
||
"url": "https://www.safaribooksonline.com/library/view/learning-path-mastering/9781491986653/",
|
||
"thumb": "https://i.imgur.com/9MIgMAc.jpg",
|
||
"author": "Aaron Kramer",
|
||
"category": ["courses"]
|
||
},
|
||
{
|
||
"type": "education",
|
||
"id": "video-spacys-ner-model",
|
||
"title": "spaCy's NER model",
|
||
"slogan": "Incremental parsing with bloom embeddings and residual CNNs",
|
||
"description": "spaCy v2.0's Named Entity Recognition system features a sophisticated word embedding strategy using subword features and \"Bloom\" embeddings, a deep convolutional neural network with residual connections, and a novel transition-based approach to named entity parsing. The system is designed to give a good balance of efficiency, accuracy and adaptability. In this talk, I sketch out the components of the system, explaining the intuition behind the various choices. I also give a brief introduction to the named entity recognition problem, with an overview of what else Explosion AI is working on, and why.",
|
||
"youtube": "sqDHBH9IjRU",
|
||
"author": "Matthew Honnibal",
|
||
"author_links": {
|
||
"twitter": "honnibal",
|
||
"github": "honnibal",
|
||
"website": "https://explosion.ai"
|
||
},
|
||
"category": ["videos"]
|
||
},
|
||
{
|
||
"type": "education",
|
||
"id": "video-new-nlp-solutions",
|
||
"title": "Building new NLP solutions with spaCy and Prodigy",
|
||
"slogan": "PyData Berlin 2018",
|
||
"description": "In this talk, I will discuss how to address some of the most likely causes of failure for new Natural Language Processing (NLP) projects. My main recommendation is to take an iterative approach: don't assume you know what your pipeline should look like, let alone your annotation schemes or model architectures.",
|
||
"author": "Matthew Honnibal",
|
||
"author_links": {
|
||
"twitter": "honnibal",
|
||
"github": "honnibal",
|
||
"website": "https://explosion.ai"
|
||
},
|
||
"youtube": "jpWqz85F_4Y",
|
||
"category": ["videos"]
|
||
},
|
||
{
|
||
"type": "education",
|
||
"id": "video-modern-nlp-in-python",
|
||
"title": "Modern NLP in Python",
|
||
"slogan": "PyData DC 2016",
|
||
"description": "Academic and industry research in Natural Language Processing (NLP) has progressed at an accelerating pace over the last several years. Members of the Python community have been hard at work moving cutting-edge research out of papers and into open source, \"batteries included\" software libraries that can be applied to practical problems. We'll explore some of these tools for modern NLP in Python.",
|
||
"author": "Patrick Harrison",
|
||
"youtube": "6zm9NC9uRkk",
|
||
"category": ["videos"]
|
||
},
|
||
{
|
||
"type": "education",
|
||
"id": "podcast-nlp-highlights",
|
||
"title": "NLP Highlights 78: Where do corpora come from?",
|
||
"slogan": "January 2019",
|
||
"description": "Most NLP projects rely crucially on the quality of annotations used for training and evaluating models. In this episode, Matt and Ines of Explosion AI tell us how Prodigy can improve data annotation and model development workflows. Prodigy is an annotation tool implemented as a python library, and it comes with a web application and a command line interface. A developer can define input data streams and design simple annotation interfaces. Prodigy can help break down complex annotation decisions into a series of binary decisions, and it provides easy integration with spaCy models. Developers can specify how models should be modified as new annotations come in in an active learning framework.",
|
||
"soundcloud": "559200912",
|
||
"thumb": "https://i.imgur.com/hOBQEzc.jpg",
|
||
"url": "https://soundcloud.com/nlp-highlights/78-where-do-corpora-come-from-with-matt-honnibal-and-ines-montani",
|
||
"author": "Matt Gardner, Waleed Ammar (Allen AI)",
|
||
"author_links": {
|
||
"website": "https://soundcloud.com/nlp-highlights"
|
||
},
|
||
"category": ["podcasts"]
|
||
},
|
||
{
|
||
"type": "education",
|
||
"id": "podcast-init",
|
||
"title": "Podcast.__init__ 87: spaCy with Matthew Honnibal",
|
||
"slogan": "December 2017",
|
||
"description": "As the amount of text available on the internet and in businesses continues to increase, the need for fast and accurate language analysis becomes more prominent. This week Matthew Honnibal, the creator of SpaCy, talks about his experiences researching natural language processing and creating a library to make his findings accessible to industry.",
|
||
"iframe": "https://www.pythonpodcast.com/wp-content/plugins/podlove-podcasting-plugin-for-wordpress/lib/modules/podlove_web_player/player_v4/dist/share.html?episode=https://www.pythonpodcast.com/?podlove_player4=176",
|
||
"iframe_height": 200,
|
||
"thumb": "https://i.imgur.com/rpo6BuY.png",
|
||
"url": "https://www.podcastinit.com/episode-87-spacy-with-matthew-honnibal/",
|
||
"author": "Tobias Macey",
|
||
"author_links": {
|
||
"website": "https://www.podcastinit.com"
|
||
},
|
||
"category": ["podcasts"]
|
||
},
|
||
{
|
||
"type": "education",
|
||
"id": "talk-python-podcast",
|
||
"title": "Talk Python 202: Building a software business",
|
||
"slogan": "March 2019",
|
||
"description": "One core question around open source is how do you fund it? Well, there is always that PayPal donate button. But that's been a tremendous failure for many projects. Often the go-to answer is consulting. But what if you don't want to trade time for money? You could take things up a notch and change the equation, exchanging value for money. That's what Ines Montani and her co-founder did when they started Explosion AI with spaCy as the foundation.",
|
||
"thumb": "https://i.imgur.com/q1twuK8.png",
|
||
"url": "https://talkpython.fm/episodes/show/202/building-a-software-business",
|
||
"soundcloud": "588364857",
|
||
"author": "Michael Kennedy",
|
||
"author_links": {
|
||
"website": "https://talkpython.fm/"
|
||
},
|
||
"category": ["podcasts"]
|
||
},
|
||
{
|
||
"id": "adam_qas",
|
||
"title": "ADAM: Question Answering System",
|
||
"slogan": "A question answering system that extracts answers from Wikipedia to questions posed in natural language.",
|
||
"github": "5hirish/adam_qas",
|
||
"pip": "qas",
|
||
"code_example": [
|
||
"git clone https://github.com/5hirish/adam_qas.git",
|
||
"cd adam_qas",
|
||
"pip install -r requirements.txt",
|
||
"python -m qas.adam 'When was linux kernel version 4.0 released ?'"
|
||
],
|
||
"code_language": "bash",
|
||
"thumb": "https://shirishkadam.files.wordpress.com/2018/04/mini_alleviate.png",
|
||
"author": "Shirish Kadam",
|
||
"author_links": {
|
||
"twitter": "5hirish",
|
||
"github": "5hirish",
|
||
"website": "https://shirishkadam.com/"
|
||
},
|
||
"category": ["standalone"],
|
||
"tags": ["question-answering", "elasticsearch"]
|
||
},
|
||
{
|
||
"id": "epitator",
|
||
"title": "EpiTator",
|
||
"thumb": "https://i.imgur.com/NYFY1Km.jpg",
|
||
"slogan": "Extracts case counts, resolved location/species/disease names, date ranges and more",
|
||
"description": "EcoHealth Alliance uses EpiTator to catalog the what, where and when of infectious disease case counts reported in online news. Each of these aspects is extracted using independent annotators than can be applied to other domains. EpiTator organizes annotations by creating \"AnnoTiers\" for each type. AnnoTiers have methods for manipulating, combining and searching annotations. For instance, the `with_following_spans_from()` method can be used to create a new tier that combines a tier of one type (such as numbers), with another (say, kitchenware). The resulting tier will contain all the phrases in the document that match that pattern, like \"5 plates\" or \"2 cups.\"\n\nAnother commonly used method is `group_spans_by_containing_span()` which can be used to do things like find all the spaCy tokens in all the GeoNames a document mentions. spaCy tokens, named entities, sentences and noun chunks are exposed through the spaCy annotator which will create a AnnoTier for each. These are basis of many of the other annotators. EpiTator also includes an annotator for extracting tables embedded in free text articles. Another neat feature is that the lexicons used for entity resolution are all stored in an embedded sqlite database so there is no need to run any external services in order to use EpiTator.",
|
||
"url": "https://github.com/ecohealthalliance/EpiTator",
|
||
"github": "ecohealthalliance/EpiTator",
|
||
"pip": "EpiTator",
|
||
"code_example": [
|
||
"from epitator.annotator import AnnoDoc",
|
||
"from epitator.geoname_annotator import GeonameAnnotator",
|
||
"",
|
||
"doc = AnnoDoc('Where is Chiang Mai?')",
|
||
"geoname_annotier = doc.require_tiers('geonames', via=GeonameAnnotator)",
|
||
"geoname = geoname_annotier.spans[0].metadata['geoname']",
|
||
"geoname['name']",
|
||
"# = 'Chiang Mai'",
|
||
"geoname['geonameid']",
|
||
"# = '1153671'",
|
||
"geoname['latitude']",
|
||
"# = 18.79038",
|
||
"geoname['longitude']",
|
||
"# = 98.98468",
|
||
"",
|
||
"from epitator.spacy_annotator import SpacyAnnotator",
|
||
"spacy_token_tier = doc.require_tiers('spacy.tokens', via=SpacyAnnotator)",
|
||
"list(geoname_annotier.group_spans_by_containing_span(spacy_token_tier))",
|
||
"# = [(AnnoSpan(9-19, Chiang Mai), [AnnoSpan(9-15, Chiang), AnnoSpan(16-19, Mai)])]"
|
||
],
|
||
"author": "EcoHealth Alliance",
|
||
"author_links": {
|
||
"github": "ecohealthalliance",
|
||
"website": " https://ecohealthalliance.org/"
|
||
},
|
||
"category": ["research", "standalone"]
|
||
},
|
||
{
|
||
"id": "self-attentive-parser",
|
||
"title": "Berkeley Neural Parser",
|
||
"slogan": "Constituency Parsing with a Self-Attentive Encoder (ACL 2018)",
|
||
"description": "A Python implementation of the parsers described in *\"Constituency Parsing with a Self-Attentive Encoder\"* from ACL 2018.",
|
||
"url": "https://arxiv.org/abs/1805.01052",
|
||
"github": "nikitakit/self-attentive-parser",
|
||
"pip": "benepar",
|
||
"code_example": [
|
||
"import spacy",
|
||
"from benepar.spacy_plugin import BeneparComponent",
|
||
"",
|
||
"nlp = spacy.load('en')",
|
||
"nlp.add_pipe(BeneparComponent('benepar_en'))",
|
||
"doc = nlp(u'The time for action is now. It's never too late to do something.')",
|
||
"sent = list(doc.sents)[0]",
|
||
"print(sent._.parse_string)",
|
||
"# (S (NP (NP (DT The) (NN time)) (PP (IN for) (NP (NN action)))) (VP (VBZ is) (ADVP (RB now))) (. .))",
|
||
"print(sent._.labels)",
|
||
"# ('S',)",
|
||
"print(list(sent._.children)[0])",
|
||
"# The time for action"
|
||
],
|
||
"author": "Nikita Kitaev",
|
||
"author_links": {
|
||
"github": "nikitakit",
|
||
"website": " http://kitaev.io"
|
||
},
|
||
"category": ["research", "pipeline"]
|
||
},
|
||
{
|
||
"id": "excelcy",
|
||
"title": "ExcelCy",
|
||
"slogan": "Excel Integration with spaCy. Training NER using XLSX from PDF, DOCX, PPT, PNG or JPG.",
|
||
"description": "ExcelCy is a toolkit to integrate Excel to spaCy NLP training experiences. Training NER using XLSX from PDF, DOCX, PPT, PNG or JPG. ExcelCy has pipeline to match Entity with PhraseMatcher or Matcher in regular expression.",
|
||
"url": "https://github.com/kororo/excelcy",
|
||
"github": "kororo/excelcy",
|
||
"pip": "excelcy",
|
||
"code_example": [
|
||
"from excelcy import ExcelCy",
|
||
"# collect sentences, annotate Entities and train NER using spaCy",
|
||
"excelcy = ExcelCy.execute(file_path='https://github.com/kororo/excelcy/raw/master/tests/data/test_data_01.xlsx')",
|
||
"# use the nlp object as per spaCy API",
|
||
"doc = excelcy.nlp('Google rebrands its business apps')",
|
||
"# or save it for faster bootstrap for application",
|
||
"excelcy.nlp.to_disk('/model')"
|
||
],
|
||
"author": "Robertus Johansyah",
|
||
"author_links": {
|
||
"github": "kororo"
|
||
},
|
||
"category": ["training"],
|
||
"tags": ["excel"]
|
||
},
|
||
{
|
||
"id": "spacy-graphql",
|
||
"title": "spacy-graphql",
|
||
"slogan": "Query spaCy's linguistic annotations using GraphQL",
|
||
"github": "ines/spacy-graphql",
|
||
"description": "A very simple and experimental app that lets you query spaCy's linguistic annotations using [GraphQL](https://graphql.org/). The API currently supports most token attributes, named entities, sentences and text categories (if available as `doc.cats`, i.e. if you added a text classifier to a model). The `meta` field will return the model meta data. Models are only loaded once and kept in memory.",
|
||
"url": "https://explosion.ai/demos/spacy-graphql",
|
||
"category": ["apis"],
|
||
"tags": ["graphql"],
|
||
"thumb": "https://i.imgur.com/xC7zpTO.png",
|
||
"code_example": [
|
||
"{",
|
||
" nlp(text: \"Zuckerberg is the CEO of Facebook.\", model: \"en_core_web_sm\") {",
|
||
" meta {",
|
||
" lang",
|
||
" description",
|
||
" }",
|
||
" doc {",
|
||
" text",
|
||
" tokens {",
|
||
" text",
|
||
" pos_",
|
||
" }",
|
||
" ents {",
|
||
" text",
|
||
" label_",
|
||
" }",
|
||
" }",
|
||
" }",
|
||
"}"
|
||
],
|
||
"code_language": "json",
|
||
"author": "Ines Montani",
|
||
"author_links": {
|
||
"twitter": "_inesmontani",
|
||
"github": "ines",
|
||
"website": "https://ines.io"
|
||
}
|
||
},
|
||
{
|
||
"id": "spacy-js",
|
||
"title": "spacy-js",
|
||
"slogan": "JavaScript API for spaCy with Python REST API",
|
||
"github": "ines/spacy-js",
|
||
"description": "JavaScript interface for accessing linguistic annotations provided by spaCy. This project is mostly experimental and was developed for fun to play around with different ways of mimicking spaCy's Python API.\n\nThe results will still be computed in Python and made available via a REST API. The JavaScript API resembles spaCy's Python API as closely as possible (with a few exceptions, as the values are all pre-computed and it's tricky to express complex recursive relationships).",
|
||
"code_language": "javascript",
|
||
"code_example": [
|
||
"const spacy = require('spacy');",
|
||
"",
|
||
"(async function() {",
|
||
" const nlp = spacy.load('en_core_web_sm');",
|
||
" const doc = await nlp('This is a text about Facebook.');",
|
||
" for (let ent of doc.ents) {",
|
||
" console.log(ent.text, ent.label);",
|
||
" }",
|
||
" for (let token of doc) {",
|
||
" console.log(token.text, token.pos, token.head.text);",
|
||
" }",
|
||
"})();"
|
||
],
|
||
"author": "Ines Montani",
|
||
"author_links": {
|
||
"twitter": "_inesmontani",
|
||
"github": "ines",
|
||
"website": "https://ines.io"
|
||
},
|
||
"category": ["nonpython"],
|
||
"tags": ["javascript"]
|
||
},
|
||
{
|
||
"id": "spacy-raspberry",
|
||
"title": "spacy-raspberry",
|
||
"slogan": "64bit Raspberry Pi image for spaCy and neuralcoref",
|
||
"github": "boehm-e/spacy-raspberry",
|
||
"thumb": "https://i.imgur.com/VCJMrE6.png",
|
||
"image": "https://raw.githubusercontent.com/boehm-e/spacy-raspberry/master/imgs/preview.png",
|
||
"author": "Erwan Boehm",
|
||
"author_links": {
|
||
"github": "boehm-e"
|
||
},
|
||
"category": ["apis"],
|
||
"tags": ["raspberrypi"]
|
||
},
|
||
{
|
||
"id": "spacy-wordnet",
|
||
"title": "spacy-wordnet",
|
||
"slogan": "WordNet meets spaCy",
|
||
"description": "`spacy-wordnet` creates annotations that easily allow the use of WordNet and [WordNet Domains](http://wndomains.fbk.eu/) by using the [NLTK WordNet interface](http://www.nltk.org/howto/wordnet.html)",
|
||
"github": "recognai/spacy-wordnet",
|
||
"tags": ["wordnet", "synsets"],
|
||
"thumb": "https://i.imgur.com/3y2uPUv.jpg",
|
||
"code_example": [
|
||
"import spacy",
|
||
"from spacy_wordnet.wornet_annotator import WordnetAnnotator ",
|
||
"",
|
||
"# Load an spacy model (supported models are \"es\" and \"en\") ",
|
||
"nlp = spacy.load('en')",
|
||
"nlp.add_pipe(WordnetAnnotator(nlp.lang), after='tagger')",
|
||
"token = nlp('prices')[0]",
|
||
"",
|
||
"# wordnet object link spacy token with nltk wordnet interface by giving acces to",
|
||
"# synsets and lemmas ",
|
||
"token._.wordnet.synsets()",
|
||
"token._.wordnet.lemmas()",
|
||
"",
|
||
"# And automatically tags with wordnet domains",
|
||
"token._.wordnet.wordnet_domains()"
|
||
],
|
||
"author": "recognai",
|
||
"author_links": {
|
||
"github": "recognai",
|
||
"twitter": "recogn_ai",
|
||
"website": "https://recogn.ai"
|
||
},
|
||
"category": ["pipeline"]
|
||
},
|
||
{
|
||
"id": "spacy-conll",
|
||
"title": "spacy_conll",
|
||
"slogan": "Parse text with spaCy and print the output in CoNLL-U format",
|
||
"description": "This module allows you to parse a text to CoNLL-U format. You can use it as a command line tool, or embed it in your own scripts.",
|
||
"code_example": [
|
||
"from spacy_conll import Spacy2ConllParser",
|
||
"spacyconll = Spacy2ConllParser()",
|
||
"",
|
||
"# `parse` returns a generator of the parsed sentences",
|
||
"for parsed_sent in spacyconll.parse(input_str='I like cookies.\nWhat about you?\nI don't like 'em!'):",
|
||
" do_something_(parsed_sent)",
|
||
"",
|
||
"# `parseprint` prints output to stdout (default) or a file (use `output_file` parameter)",
|
||
"# This method is called when using the command line",
|
||
"spacyconll.parseprint(input_str='I like cookies.')"
|
||
],
|
||
"code_language": "python",
|
||
"author": "Bram Vanroy",
|
||
"author_links": {
|
||
"github": "BramVanroy",
|
||
"website": "https://bramvanroy.be"
|
||
},
|
||
"github": "BramVanroy/spacy_conll",
|
||
"category": ["standalone"]
|
||
},
|
||
{
|
||
"id": "spacy-langdetect",
|
||
"title": "spacy-langdetect",
|
||
"slogan": "A fully customizable language detection pipeline for spaCy",
|
||
"description": "This module allows you to add language detection capabilites to your spaCy pipeline. Also supports custom language detectors!",
|
||
"pip": "spacy-langdetect",
|
||
"code_example": [
|
||
"import spacy",
|
||
"from spacy_langdetect import LanguageDetector",
|
||
"nlp = spacy.load('en')",
|
||
"nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)",
|
||
"text = 'This is an english text.'",
|
||
"doc = nlp(text)",
|
||
"# document level language detection. Think of it like average language of the document!",
|
||
"print(doc._.language)",
|
||
"# sentence level language detection",
|
||
"for sent in doc.sents:",
|
||
" print(sent, sent._.language)"
|
||
],
|
||
"code_language": "python",
|
||
"author": "Abhijit Balaji",
|
||
"author_links": {
|
||
"github": "Abhijit-2592",
|
||
"website": "https://abhijit-2592.github.io/"
|
||
},
|
||
"github": "Abhijit-2592/spacy-langdetect",
|
||
"category": ["pipeline"],
|
||
"tags": ["language-detection"]
|
||
}
|
||
],
|
||
"categories": [
|
||
{
|
||
"label": "Projects",
|
||
"items": [
|
||
{
|
||
"id": "pipeline",
|
||
"title": "Pipeline",
|
||
"description": "Custom pipeline components and extensions"
|
||
},
|
||
{
|
||
"id": "training",
|
||
"title": "Training",
|
||
"description": "Helpers and toolkits for training spaCy models"
|
||
},
|
||
{
|
||
"id": "conversational",
|
||
"title": "Conversational",
|
||
"description": "Frameworks and utilities for working with conversational text, e.g. for chat bots"
|
||
},
|
||
{
|
||
"id": "research",
|
||
"title": "Research",
|
||
"description": "Frameworks and utilities for developing better NLP models, especially using neural networks"
|
||
},
|
||
{
|
||
"id": "visualizers",
|
||
"title": "Visualizers",
|
||
"description": "Demos and tools to visualize NLP annotations or systems"
|
||
},
|
||
{
|
||
"id": "apis",
|
||
"title": "Containers & APIs",
|
||
"description": "Infrastructure tools for managing or deploying spaCy"
|
||
},
|
||
{
|
||
"id": "nonpython",
|
||
"title": "Non-Python",
|
||
"description": "Wrappers, bindings and implementations in other programming languages"
|
||
},
|
||
{
|
||
"id": "standalone",
|
||
"title": "Standalone",
|
||
"description": "Self-contained libraries or tools that use spaCy under the hood"
|
||
}
|
||
]
|
||
},
|
||
{
|
||
"label": "Education",
|
||
"items": [
|
||
{
|
||
"id": "books",
|
||
"title": "Books",
|
||
"description": "Books about or featuring spaCy"
|
||
},
|
||
{
|
||
"id": "courses",
|
||
"title": "Courses",
|
||
"description": "Online courses and interactive tutorials"
|
||
},
|
||
{
|
||
"id": "videos",
|
||
"title": "Videos",
|
||
"description": "Talks and tutorials in video format"
|
||
},
|
||
{
|
||
"id": "podcasts",
|
||
"title": "Podcasts",
|
||
"description": "Episodes about spaCy or interviews with the spaCy team"
|
||
}
|
||
]
|
||
}
|
||
]
|
||
}
|