From 94d8b711a3fe4f48b82f01ee4ac17b6b15e52f8c Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 12 Nov 2017 12:06:59 +0100 Subject: [PATCH 1/9] Update CONTRIBUTING.md --- CONTRIBUTING.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 010508b0a..0ec363f3a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -88,8 +88,10 @@ requests: | [`models`](https://github.com/explosion/spaCy/labels/models), `language / [name]` | Issues related to the specific [models](https://github.com/explosion/spacy-models), languages and data | | [`linux`](https://github.com/explosion/spaCy/labels/linux), [`osx`](https://github.com/explosion/spaCy/labels/osx), [`windows`](https://github.com/explosion/spaCy/labels/windows) | Issues related to the specific operating systems | | [`pip`](https://github.com/explosion/spaCy/labels/pip), [`conda`](https://github.com/explosion/spaCy/labels/conda) | Issues related to the specific package managers | -| [`wip`](https://github.com/explosion/spaCy/labels/wip) | Work in progress, mostly used for pull requests. | +| [`wip`](https://github.com/explosion/spaCy/labels/wip) | Work in progress, mostly used for pull requests | +| [`v1`](https://github.com/explosion/spaCy/labels/v1) | Reports related to spaCy v1.x | | [`duplicate`](https://github.com/explosion/spaCy/labels/duplicate) | Duplicates, i.e. issues that have been reported before | +| [`third-party`](https://github.com/explosion/spaCy/labels/third-party) | Issues related to third-party packages and services | | [`meta`](https://github.com/explosion/spaCy/labels/meta) | Meta topics, e.g. 
repo organisation and issue management | | [`help wanted`](https://github.com/explosion/spaCy/labels/help%20wanted), [`help wanted (easy)`](https://github.com/explosion/spaCy/labels/help%20wanted%20%28easy%29) | Requests for contributions | From f0e28e8ae5859966532e2eac4ddc0c8532a8edb9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 12 Nov 2017 12:07:13 +0100 Subject: [PATCH 2/9] Make fasttext reader accommodate whitespace --- examples/vectors_fast_text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/vectors_fast_text.py b/examples/vectors_fast_text.py index b6f303015..1544e1d5e 100644 --- a/examples/vectors_fast_text.py +++ b/examples/vectors_fast_text.py @@ -30,7 +30,7 @@ def main(vectors_loc, lang=None): nlp.vocab.reset_vectors(width=int(nr_dim)) for line in file_: line = line.decode('utf8') - pieces = line.split() + pieces = line.rsplit(' ', nr_dim) word = pieces[0] vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f') nlp.vocab.set_vector(word, vector) # add the vectors to the vocab From f2b6b98b75a5fcdeacd72f3305cb5ff1d17e3145 Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 13 Nov 2017 08:29:16 +0100 Subject: [PATCH 3/9] Fix typo in code example (resolves #1556) --- website/usage/_processing-pipelines/_custom-components.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/usage/_processing-pipelines/_custom-components.jade b/website/usage/_processing-pipelines/_custom-components.jade index d2100b5cd..71e88ca3d 100644 --- a/website/usage/_processing-pipelines/_custom-components.jade +++ b/website/usage/_processing-pipelines/_custom-components.jade @@ -39,7 +39,7 @@ p return doc nlp = spacy.load('en') - nlp.pipeline.add_pipe(my_component, name='print_info', first=True) + nlp.add_pipe(my_component, name='print_info', first=True) print(nlp.pipe_names) # ['print_info', 'tagger', 'parser', 'ner'] doc = nlp(u"This is a sentence.") From b3e502a076524b0ea50fb37bdbfdac95996050d8 Mon Sep 17 
00:00:00 2001 From: ines Date: Mon, 13 Nov 2017 08:29:57 +0100 Subject: [PATCH 4/9] Add videos section to resources --- website/_harp.json | 2 +- website/_includes/_mixins.jade | 8 ++++++++ website/assets/css/_base/_objects.sass | 16 ++++++++++++++++ website/usage/_data.json | 1 + website/usage/resources.jade | 5 +++++ 5 files changed, 31 insertions(+), 1 deletion(-) diff --git a/website/_harp.json b/website/_harp.json index 7edad1b0c..7da11afa4 100644 --- a/website/_harp.json +++ b/website/_harp.json @@ -82,7 +82,7 @@ } ], - "V_CSS": "2.0.0", + "V_CSS": "2.0.1", "V_JS": "2.0.1", "DEFAULT_SYNTAX": "python", "ANALYTICS": "UA-58931649-1", diff --git a/website/_includes/_mixins.jade b/website/_includes/_mixins.jade index c6f605cbc..158668de5 100644 --- a/website/_includes/_mixins.jade +++ b/website/_includes/_mixins.jade @@ -312,6 +312,14 @@ mixin github(repo, file, height, alt_file, language) +button(gh(repo, alt_file || file), false, "primary", "small") View on GitHub +//- YouTube video embed + id - [string] ID of YouTube video. + ratio - [string] Video ratio, "16x9" or "4x3". 
+ +mixin youtube(id, ratio) + figure.o-video.o-block(class="o-video--" + (ratio || "16x9")) + iframe.o-video__iframe(src="https://www.youtube.com/embed/#{id}" frameborder="0" height="500" allowfullscreen) + //- Images / figures url - [string] url or path to image diff --git a/website/assets/css/_base/_objects.sass b/website/assets/css/_base/_objects.sass index de5b4a322..b8a20f5dd 100644 --- a/website/assets/css/_base/_objects.sass +++ b/website/assets/css/_base/_objects.sass @@ -177,6 +177,22 @@ border-radius: $border-radius +//- Responsive Video embeds + +.o-video + position: relative + height: 0 + + @each $ratio1, $ratio2 in (16, 9), (4, 3) + &.o-video--#{$ratio1}x#{$ratio2} + padding-bottom: (100% * $ratio2 / $ratio1) + +.o-video__iframe + @include position(absolute, top, left, 0, 0) + @include size(100%) + border-radius: var(--border-radius) + + //- Form fields .o-field diff --git a/website/usage/_data.json b/website/usage/_data.json index a736af02f..6dd2a66a8 100644 --- a/website/usage/_data.json +++ b/website/usage/_data.json @@ -166,6 +166,7 @@ "Demos & Visualizations": "demos", "Books & Courses": "books", "Jupyter Notebooks": "notebooks", + "Videos": "videos", "Research": "research" } }, diff --git a/website/usage/resources.jade b/website/usage/resources.jade index 0e53c900a..d6afcd82f 100644 --- a/website/usage/resources.jade +++ b/website/usage/resources.jade @@ -114,6 +114,11 @@ include ../_includes/_mixins .u-text-right +button(gh("spacy-notebooks"), false, "primary", "small") See more notebooks on GitHub ++section("videos") + +h(2, "videos") Videos + + +youtube("sqDHBH9IjRU") + +section("research") +h(2, "research") Research systems From 7a7b01feb1435094ee63655b19b342f11a8d4af2 Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 13 Nov 2017 08:30:06 +0100 Subject: [PATCH 5/9] Update links --- website/usage/_install/_instructions.jade | 3 ++- website/usage/_spacy-101/_tokenization.jade | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git 
a/website/usage/_install/_instructions.jade b/website/usage/_install/_instructions.jade index 770aba35c..21b0533eb 100644 --- a/website/usage/_install/_instructions.jade +++ b/website/usage/_install/_instructions.jade @@ -184,7 +184,8 @@ p +h(4, "source-windows") Windows p - | Install a version of + | Install a version of the + | #[+a("http://landinghub.visualstudio.com/visual-cpp-build-tools") Visual C++ Build Tools] or | #[+a("https://www.visualstudio.com/vs/visual-studio-express/") Visual Studio Express] | that matches the version that was used to compile your Python | interpreter. For official distributions these are: diff --git a/website/usage/_spacy-101/_tokenization.jade b/website/usage/_spacy-101/_tokenization.jade index 602209ec8..c2a02a7a7 100644 --- a/website/usage/_spacy-101/_tokenization.jade +++ b/website/usage/_spacy-101/_tokenization.jade @@ -55,6 +55,6 @@ p p | While punctuation rules are usually pretty general, tokenizer exceptions | strongly depend on the specifics of the individual language. This is - | why each #[+a("/models/#languages") available language] has its + | why each #[+a("/usage/models#languages") available language] has its | own subclass like #[code English] or #[code German], that loads in lists | of hard-coded data and exception rules. 
From 59f5740edef9ac629b2caf6babf1e06d14df92e7 Mon Sep 17 00:00:00 2001 From: Abhinav Sharma Date: Mon, 13 Nov 2017 17:13:49 +0530 Subject: [PATCH 6/9] improved upon the list of included stop_words --- spacy/lang/hi/stop_words.py | 62 ++++++++++++++++++++++++++++++++++++- 1 file changed, 61 insertions(+), 1 deletion(-) diff --git a/spacy/lang/hi/stop_words.py b/spacy/lang/hi/stop_words.py index 2ff27c015..370060c51 100644 --- a/spacy/lang/hi/stop_words.py +++ b/spacy/lang/hi/stop_words.py @@ -5,14 +5,23 @@ from __future__ import unicode_literals # Source: https://github.com/taranjeet/hindi-tokenizer/blob/master/stopwords.txt STOP_WORDS = set(""" +अंदर अत +अदि +अप अपना +अपनि अपनी अपने +अभि अभी अंदर आदि आप +इंहिं +इंहें +इंहों +इतयादि इत्यादि इन इनका @@ -21,13 +30,19 @@ STOP_WORDS = set(""" इन्हों इस इसका +इसकि इसकी इसके इसमें +इसि इसी इसे +उंहिं +उंहें +उंहों उन उनका +उनकि उनकी उनके उनको @@ -36,13 +51,17 @@ STOP_WORDS = set(""" उन्हों उस उसके +उसि उसी उसे एक एवं एस +एसे ऐसे +ओर और +कइ कई कर करता @@ -53,14 +72,18 @@ STOP_WORDS = set(""" कहते कहा का +काफि काफ़ी कि +किंहें +किंहों कितना किन्हें किन्हों किया किर किस +किसि किसी किसे की @@ -68,27 +91,38 @@ STOP_WORDS = set(""" कुल के को +कोइ कोई +कोन +कोनसा कौन कौनसा गया घर जब जहाँ +जहां जा +जिंहें +जिंहों जितना +जिधर जिन जिन्हें जिन्हों जिस जिसे जीधर +जेसा +जेसे जैसा जैसे जो तक तब तरह +तिंहें +तिंहों तिन तिन्हें तिन्हों @@ -96,32 +130,41 @@ STOP_WORDS = set(""" तिसे तो था +थि थी थे दबारा +दवारा दिया दुसरा +दुसरे दूसरे दो द्वारा न -नके +नहिं नहीं ना +निचे निहायत नीचे ने पर पहले +पुरा पूरा पे फिर +बनि बनी +बहि बही बहुत बाद बाला बिलकुल +भि +भितर भी भीतर मगर @@ -131,11 +174,14 @@ STOP_WORDS = set(""" यदि यह यहाँ +यहां +यहि यही या यिह ये रखें +रवासा रहा रहे ऱ्वासा @@ -143,17 +189,24 @@ STOP_WORDS = set(""" लिये लेकिन व +वगेरह वग़ैरह +वरग वर्ग वह वहाँ +वहां +वहिं वहीं वाले वुह वे +वग़ैरह +संग सकता सकते सबसे +सभि सभी साथ साबुत @@ -162,16 +215,23 @@ STOP_WORDS = set(""" से सो संग +हि ही +हुअ हुआ +हुइ हुई हुए +हे +हें है हैं हो 
होता +होति होती होते होना होने + """.split()) From 4dd34058a2deb46386bf6d9627863b6d960db317 Mon Sep 17 00:00:00 2001 From: Abhinav Sharma Date: Mon, 13 Nov 2017 17:23:05 +0530 Subject: [PATCH 7/9] Create abhi18av.md --- .github/contributors/abhi18av.md | 106 +++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/abhi18av.md diff --git a/.github/contributors/abhi18av.md b/.github/contributors/abhi18av.md new file mode 100644 index 000000000..71a6671e9 --- /dev/null +++ b/.github/contributors/abhi18av.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. 
With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. 
Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Abhinav Sharma | +| Company name (if applicable) | Fourtek I.T. Solutions Pvt. Ltd. 
| +| Title or role (if applicable) | Machine Learning Engineer | +| Date | 3 November 2017 | +| GitHub username | abhi18av | +| Website (optional) | https://abhi18av.github.io/ | From c263c3acce76e08deb2540e9c7db0b007219ac18 Mon Sep 17 00:00:00 2001 From: Duygu Altinok Date: Mon, 13 Nov 2017 15:45:13 +0100 Subject: [PATCH 8/9] added contributor agreement for DuyguA --- .github/contributors/DuyguA.md | 106 +++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/DuyguA.md diff --git a/.github/contributors/DuyguA.md b/.github/contributors/DuyguA.md new file mode 100644 index 000000000..817d25ae1 --- /dev/null +++ b/.github/contributors/DuyguA.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. 
The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. 
Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Duygu Altinok | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 13 November 2017 | +| GitHub username | DuyguA | +| Website (optional) | | From bc792747067cc791ffe2261939f0bffb1b43d635 Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 13 Nov 2017 17:00:03 +0100 Subject: [PATCH 9/9] Fix typo --- website/usage/_adding-languages/_language-data.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/usage/_adding-languages/_language-data.jade b/website/usage/_adding-languages/_language-data.jade index 15e47cc8f..1b0ed241a 100644 --- a/website/usage/_adding-languages/_language-data.jade +++ b/website/usage/_adding-languages/_language-data.jade @@ -376,7 +376,7 @@ p p | Here's an example from the English - | #[+src(gh("spaCy", "spacy/en/lang/lex_attrs.py")) #[code lex_attrs.py]]: + | #[+src(gh("spaCy", "spacy/lang/en/lex_attrs.py")) #[code lex_attrs.py]]: +code("lex_attrs.py"). _num_words = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven',