From 3e30b5bef68240a3def1aef3914a9b8523258c00 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 19 Nov 2024 10:43:40 +0100 Subject: [PATCH] Add spacy-layout [ci skip] --- website/meta/universe.json | 42 ++++++++++++++++++++++++++++++++++ website/src/templates/index.js | 4 ++-- 2 files changed, 44 insertions(+), 2 deletions(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index b35423790..b65702885 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1394,6 +1394,48 @@ "website": "https://ines.io" } }, + { + "id": "spacy-layout", + "slogan": "Process PDFs, Word documents and more with spaCy", + "github": "explosion/spacy-layout", + "description": "This plugin integrates with [Docling](https://ds4sd.github.io/docling/) to bring structured processing of PDFs, Word documents and other input formats to your spaCy pipeline. It outputs clean, structured data in a text-based format and creates spaCy's familiar `Doc` objects that let you access labelled text spans like sections, headings, or footnotes.\n\nThis workflow makes it easy to apply powerful NLP techniques to your documents, including linguistic analysis, named entity recognition, text classification and more. 
It's also great for implementing chunking for RAG pipelines.", + "pip": "spacy-layout", + "category": [ + "pipeline" + ], + "code_example": [ + "import spacy", + "from spacy_layout import spaCyLayout", + "", + "nlp = spacy.blank(\"en\")", + "layout = spaCyLayout(nlp)", + "", + "# Process a document and create a spaCy Doc object", + "doc = layout(\"./starcraft.pdf\")", + "", + "# The text-based contents of the document", + "print(doc.text)", + "# Document layout including pages and page sizes", + "print(doc._.layout)", + "", + "# Layout spans for different sections", + "for span in doc.spans[\"layout\"]:", + " # Document section and token and character offsets into the text", + " print(span.text, span.start, span.end, span.start_char, span.end_char)", + " # Section type, e.g. \"text\", \"title\", \"section_header\" etc.", + " print(span.label_)", + " # Layout features of the section, including bounding box", + " print(span._.layout)", + " # Closest heading to the span (accuracy depends on document structure)", + " print(span._.heading)" + ], + "author": "Ines Montani", + "author_links": { + "twitter": "_inesmontani", + "github": "ines", + "website": "https://ines.io" + } + }, { "id": "spacyopentapioca", "title": "spaCyOpenTapioca", diff --git a/website/src/templates/index.js b/website/src/templates/index.js index 754cf47bf..b4c6f8e00 100644 --- a/website/src/templates/index.js +++ b/website/src/templates/index.js @@ -58,8 +58,8 @@ const AlertSpace = ({ nightly, legacy }) => { } const navAlert = ( - - 💥 New: Case study with S&P Global + + 💥 New: spaCy for PDFs and Word docs )