mirror of https://github.com/explosion/spaCy.git
Rename "English" section to "Language"
This commit is contained in:
parent e16e78a737
commit f8322a69e7
@@ -1,150 +1,134 @@
 //- ----------------------------------
-//- 💫 DOCS > API > ENGLISH
+//- 💫 DOCS > API > LANGUAGE
 //- ----------------------------------

-+section("english")
-    +h(2, "english", "https://github.com/" + SOCIAL.github + "/spaCy/blob/master/spacy/language.py")
-        | #[+tag class] English(Language)
++section("language")
+    +h(2, "language", "https://github.com/" + SOCIAL.github + "/spaCy/blob/master/spacy/language.py")
+        | #[+tag class] Language

     p.
-        The English analysis pipeline. Usually you'll load this once per process,
-        and pass the instance around your program.
+        A pipeline that transforms text strings into annotated spaCy Doc
+        objects. Usually you'll load the Language pipeline once and pass the
+        instance around your program.

     +code("python", "Overview").
         class Language:
-            lang = None
+            Defaults = BaseDefaults

-            def __init__(self, data_dir=None, tokenizer=None, tagger=None, parser=None, entity=None, matcher=None):
-                return self
+            def __init__(self, path=True, **overrides):
+                self.vocab = Vocab()
+                self.tokenizer = Tokenizer()
+                self.tagger = Tagger()
+                self.parser = DependencyParser()
+                self.entity = EntityRecognizer()
+                self.make_doc = lambda text: Doc()
+                self.pipeline = [self.tagger, self.parser, self.entity]

-            def __call__(self, text, tag=True, parse=True, entity=True):
-                return Doc()
+            def __call__(self, text, **toggle):
+                doc = self.make_doc(text)
+                for proc in self.pipeline:
+                    if toggle.get(proc.name, True):
+                        proc(doc)
+                return doc

-            def pipe(self, texts_iterator, batch_size=1000, n_threads=2):
-                yield Doc()
+            def pipe(self, texts_iterator, batch_size=1000, n_threads=2, **toggle):
+                docs = (self.make_doc(text) for text in texts_iterator)
+                for proc in self.pipeline:
+                    if toggle.get(proc.name, True):
+                        docs = proc.pipe(docs, batch_size=batch_size, n_threads=n_threads)
+                for doc in docs:
+                    yield doc

-            def end_training(self, data_dir=None):
+            def end_training(self, path=None):
                 return None

         class English(Language):
-            lang = "en"
+            class Defaults(BaseDefaults):
+                pass

         class German(Language):
-            lang = "de"
+            class Defaults(BaseDefaults):
+                pass
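For context, here is how the pipeline sketched in the Overview is typically driven. This is a minimal usage sketch, not part of the commit: it assumes the spaCy 1.x-era import path and installed English model data, and `texts` is an illustrative variable.

    # Minimal sketch: drive the Language pipeline described above.
    from spacy.en import English   # import path of this era; assumes model data is installed

    nlp = English()                # builds vocab, tokenizer, tagger, parser, entity recognizer
    doc = nlp(u'London is a big city in the United Kingdom.')

    texts = [u'One text.', u'Another text.']
    for doc in nlp.pipe(texts, batch_size=1000, n_threads=2):
        assert doc.is_parsed

`pipe` streams documents through the same pipeline as `__call__`, but lets each component batch its work across texts.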
     +section("english-init")
         +h(3, "english-init")
-            | #[+tag method] English.__init__
+            | #[+tag method] Language.__init__

         p
-            | Load the pipeline. Each component can be passed
-            | as an argument, or left as #[code None], in which case it will be loaded
-            | from a classmethod, named e.g. #[code default_vocab()].
+            | Load the pipeline. You can disable components by passing None as a value,
+            | e.g. pass parser=None, vectors=None to save memory if you're not using
+            | those components. You can also pass an object as the value.
+            | Pass a function create_pipeline to use a custom pipeline --- see
+            | the custom pipeline tutorial.

         +aside("Efficiency").
             Loading takes 10-20 seconds, and the instance consumes 2 to 3
             gigabytes of memory. Intended use is for one instance to be
             created for each language per process, but you can create more
-            if you"re doing something unusual. You may wish to make the
+            if you're doing something unusual. You may wish to make the
             instance a global variable or "singleton".

         +table(["Example", "Description"])
             +row
-                +cell #[code.lang-python nlp = English()]
-                +cell Load everything, from default package
+                +cell #[code nlp = English()]
+                +cell Load everything, from default path.

             +row
-                +cell #[code.lang-python nlp = English(data_dir='my_data')]
-                +cell Load everything, from specified dir
+                +cell #[code nlp = English(path='my_data')]
+                +cell Load everything, from specified path

             +row
-                +cell #[code.lang-python nlp = English(parser=False)]
-                +cell Load everything except the parser.
+                +cell #[code nlp = English(path=path_obj)]
+                +cell Load everything, from an object that follows the #[code pathlib.Path] protocol.

             +row
-                +cell #[code.lang-python nlp = English(parser=False, tagger=False)]
-                +cell Load everything except the parser and tagger.
+                +cell #[code nlp = English(parser=False, vectors=False)]
+                +cell Load everything except the parser and the word vectors.

             +row
-                +cell #[code.lang-python nlp = English(parser=MyParser())]
-                +cell Supply your own parser
+                +cell #[code nlp = English(parser=my_parser)]
+                +cell Load everything, and use a custom parser.

+            +row
+                +cell #[code nlp = English(create_pipeline=my_pipeline)]
+                +cell Load everything, and use a custom pipeline.
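The `create_pipeline` row above deserves a sketch. This is illustrative only (`my_pipeline` is a name invented here): the hook receives the loaded `nlp` instance and returns the list of processes to run, matching the `overrides['create_pipeline'](self)` branch in the Definition below.

    # Illustrative only: run the tagger and entity recognizer, skip the parser.
    def my_pipeline(nlp):
        return [nlp.tagger, nlp.entity]

    nlp = English(create_pipeline=my_pipeline)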
     +code("python", "Definition").
-        def __init__(self, data_dir=None, tokenizer=None, tagger=None, parser=None, entity=None, matcher=None):
-            return self
+        def __init__(self, path=True, **overrides):
+            D = self.Defaults
+            self.vocab = Vocab(path=path, parent=self, **D.vocab) \
+                         if 'vocab' not in overrides \
+                         else overrides['vocab']
+            self.tokenizer = Tokenizer(self.vocab, path=path, **D.tokenizer) \
+                             if 'tokenizer' not in overrides \
+                             else overrides['tokenizer']
+            self.tagger = Tagger(self.vocab, path=path, **D.tagger) \
+                          if 'tagger' not in overrides \
+                          else overrides['tagger']
+            self.parser = DependencyParser(self.vocab, path=path, **D.parser) \
+                          if 'parser' not in overrides \
+                          else overrides['parser']
+            self.entity = EntityRecognizer(self.vocab, path=path, **D.entity) \
+                          if 'entity' not in overrides \
+                          else overrides['entity']
+            self.matcher = Matcher(self.vocab, path=path, **D.matcher) \
+                           if 'matcher' not in overrides \
+                           else overrides['matcher']

-        +table(["Arg", "Type", "Description"])
-            +row
-                +cell data_dir
-                +cell str
-                +cell.
-                    The data directory. If None, value is obtained via the
-                    #[code default_data_dir()] method.
+            if 'make_doc' in overrides:
+                self.make_doc = overrides['make_doc']
+            elif 'create_make_doc' in overrides:
+                self.make_doc = overrides['create_make_doc'](self)
+            else:
+                self.make_doc = lambda text: self.tokenizer(text)
+            if 'pipeline' in overrides:
+                self.pipeline = overrides['pipeline']
+            elif 'create_pipeline' in overrides:
+                self.pipeline = overrides['create_pipeline'](self)
+            else:
+                self.pipeline = [self.tagger, self.parser, self.matcher, self.entity]
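The `make_doc` branches above admit the same treatment. A hedged sketch of the `create_make_doc` hook: the factory receives the `nlp` instance, so it can close over the loaded tokenizer. The `.strip()` tweak is invented here purely for illustration.

    # Illustrative only: customise how raw text becomes a Doc,
    # per the 'create_make_doc' branch in the definition above.
    def create_make_doc(nlp):
        return lambda text: nlp.tokenizer(text.strip())

    nlp = English(create_make_doc=create_make_doc)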
-            +row
-                +cell vocab
-                +cell #[code Vocab]
-                +cell.
-                    The vocab object, which should be an instance of class
-                    #[code spacy.vocab.Vocab]. If #[code None], the object is
-                    obtained from the #[code default_vocab()] class method. The
-                    vocab object manages all of the language specific rules and
-                    definitions, maintains the cache of lexical types, and manages
-                    the word vectors. Because the vocab owns this important data,
-                    most objects hold a reference to the vocab.
-
-            +row
-                +cell tokenizer
-                +cell #[code Tokenizer]
-                +cell.
-                    The tokenizer, which should be a callable that accepts a
-                    unicode string, and returns a #[code Doc] object. If set to
-                    #[code None], the default tokenizer is constructed from the
-                    #[code default_tokenizer()] method.
-
-            +row
-                +cell tagger
-                +cell #[code Tagger]
-                +cell.
-                    The part-of-speech tagger, which should be a callable that
-                    accepts a #[code Doc] object, and sets the part-of-speech
-                    tags in-place. If set to None, the default tagger is constructed
-                    from the #[code default_tagger()] method.
-
-            +row
-                +cell parser
-                +cell #[code Parser]
-                +cell.
-                    The dependency parser, which should be a callable that accepts
-                    a #[code Doc] object, and sets the sentence boundaries,
-                    syntactic heads and dependency labels in-place.
-                    If set to #[code None], the default parser is
-                    constructed from the #[code default_parser()] method. To disable
-                    the parser and prevent it from being loaded, pass #[code parser=False].
-
-            +row
-                +cell entity
-                +cell #[code Parser]
-                +cell.
-                    The named entity recognizer, which should be a callable that
-                    accepts a #[code Doc] object, and sets the named entity annotations
-                    in-place. If set to None, the default entity recognizer is
-                    constructed from the #[code default_entity()] method. To disable
-                    the entity recognizer and prevent it from being loaded, pass
-                    #[code entity=False].
-
-            +row
-                +cell matcher
-                +cell #[code Matcher]
-                +cell.
-                    The pattern matcher, which should be a callable that accepts
-                    a #[code Doc] object, and sets named entity annotations in-place
-                    using token-based rules. If set
-                    to None, the default matcher is constructed from the
-                    #[code default_matcher()] method.
-
-    +section("english-call")
-        +h(3, "english-call")
-            | #[+tag method] English.__call__
+    +section("language-call")
+        +h(3, "language-call")
+            | #[+tag method] Language.__call__

         p
             | The main entry point to spaCy. Takes raw unicode text, and returns

@@ -152,30 +136,30 @@
             | and #[code Span] objects.

         +aside("Efficiency").
-            spaCy"s algorithms are all linear-time, so you can supply
+            spaCy's algorithms are all linear-time, so you can supply
             documents of arbitrary length, e.g. whole novels.

         +table(["Example", "Description"], "code")
             +row
-                +cell #[code.lang-python doc = nlp(u'Some text.')]
+                +cell #[code doc = nlp(u'Some text.')]
                 +cell Apply the full pipeline.
             +row
-                +cell #[code.lang-python doc = nlp(u'Some text.', parse=False)]
+                +cell #[code doc = nlp(u'Some text.', parse=False)]
                 +cell Applies tagger and entity, not parser
             +row
-                +cell #[code.lang-python doc = nlp(u'Some text.', entity=False)]
+                +cell #[code doc = nlp(u'Some text.', entity=False)]
                 +cell Applies tagger and parser, not entity.
             +row
-                +cell #[code.lang-python doc = nlp(u'Some text.', tag=False)]
+                +cell #[code doc = nlp(u'Some text.', tag=False)]
                 +cell Does not apply tagger, entity or parser
             +row
-                +cell #[code.lang-python doc = nlp(u'')]
+                +cell #[code doc = nlp(u'')]
                 +cell Zero-length tokens, not an error
             +row
-                +cell #[code.lang-python doc = nlp(b'Some text')]
+                +cell #[code doc = nlp(b'Some text')]
                 +cell Error: need unicode
             +row
-                +cell #[code.lang-python doc = nlp(b'Some text'.decode('utf8'))]
+                +cell #[code doc = nlp(b'Some text'.decode('utf8'))]
                 +cell Decode bytes into unicode first.

         +code("python", "Definition").
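A short sketch of the per-call toggles from the table above, reusing the `nlp` instance from the earlier sketch; the keyword names follow the table, and input must be unicode:

    # Toggle pipeline stages per call; bytes must be decoded first.
    doc = nlp(u'Some text.', parse=False)     # tagger and entity recognizer only
    doc = nlp(b'Some text'.decode('utf8'))    # full pipeline, after decoding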
@@ -8,7 +8,7 @@
             ["Usage Examples", "#examples", "examples"]
         ],
         "API": [
-            ["English", "#english", "english"],
+            ["Language", "#language", "language"],
             ["Doc", "#doc", "doc"],
             ["Token", "#token", "token"],
             ["Span", "#span", "span"],
@@ -13,7 +13,7 @@ include _quickstart-examples

 +h(2, "api") API

-include _api-english
+include _api-language
 include _api-doc
 include _api-token
 include _api-span