diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d60b7d4bd..09e400e1f 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -41,7 +41,7 @@ To distinguish issues that are opened by us, the maintainers, we usually add a | [`duplicate`](https://github.com/explosion/spaCy/labels/duplicate) | Duplicates, i.e. issues that have been reported before | | [`meta`](https://github.com/explosion/spaCy/labels/meta) | Meta topics, e.g. repo organisation and issue management | | [`help wanted`](https://github.com/explosion/spaCy/labels/help%20wanted) | Requests for contributions | -| [`help wanted (easy)`](https://github.com/explosion/spaCy/labels/help%20wanted%20%28easy%29) | Requests for contributions suitable for begginners | +| [`help wanted (easy)`](https://github.com/explosion/spaCy/labels/help%20wanted%20%28easy%29) | Requests for contributions suitable for beginners | ## Contributing to the code base diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 274cbdd6d..fae00b5e6 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -9,6 +9,7 @@ This is a list of everyone who has made significant contributions to spaCy, in a * Chris DuBois, [@chrisdubois](https://github.com/chrisdubois) * Christoph Schwienheer, [@chssch](https://github.com/chssch) * Dafne van Kuppevelt, [@dafnevk](https://github.com/dafnevk) +* Daniel Rapp, [@rappdw](https://github.com/rappdw) * Dmytro Sadovnychyi, [@sadovnychyi](https://github.com/sadovnychyi) * György Orosz, [@oroszgy](https://github.com/oroszgy) * Henning Peters, [@henningpeters](https://github.com/henningpeters) diff --git a/spacy/bn/tokenizer_exceptions.py b/spacy/bn/tokenizer_exceptions.py index 7722c9dcc..a47b89280 100644 --- a/spacy/bn/tokenizer_exceptions.py +++ b/spacy/bn/tokenizer_exceptions.py @@ -27,10 +27,21 @@ ABBREVIATIONS = { "সে.": [ {ORTH: "সে.", LEMMA: "সেলসিয়াস"}, ], - "কি.মি": [ - {ORTH: "কি.মি", LEMMA: "কিলোমিটার"}, + "কি.মি.": [ {ORTH: "কি.মি.", LEMMA: "কিলোমিটার"}, ], + "কি.মি": [ + {ORTH: "কি.মি", LEMMA: "কিলোমিটার"}, + ], + "সে.মি.": [ + {ORTH: "সে.মি.", LEMMA: "সেন্টিমিটার"}, + ], + "সে.মি": [ + {ORTH: "সে.মি", LEMMA: "সেন্টিমিটার"}, + ], + "মি.লি.": [ + {ORTH: "মি.লি.", LEMMA: "মিলিলিটার"}, + ] } TOKENIZER_EXCEPTIONS.update(ABBREVIATIONS) diff --git a/spacy/language_data/tokenizer_exceptions.py b/spacy/language_data/tokenizer_exceptions.py index 7d623cbb3..f01c2fdf5 100644 --- a/spacy/language_data/tokenizer_exceptions.py +++ b/spacy/language_data/tokenizer_exceptions.py @@ -45,6 +45,6 @@ _URL_PATTERN = ( r"$" ).strip() -TOKEN_MATCH = re.compile(_URL_PATTERN).match +TOKEN_MATCH = re.compile(_URL_PATTERN, re.UNICODE).match __all__ = ['TOKEN_MATCH'] diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py index f4f9ef29e..316b25f12 100644 --- a/spacy/tests/tokenizer/test_urls.py +++ b/spacy/tests/tokenizer/test_urls.py @@ -21,11 +21,8 @@ URLS_FULL = URLS_BASIC + [ URLS_SHOULD_MATCH = [ "http://foo.com/blah_blah", "http://foo.com/blah_blah/", -# "http://foo.com/blah_blah_(wikipedia)", -# "http://foo.com/blah_blah_(wikipedia)_(again)", "http://www.example.com/wpstyle/?p=364", "https://www.example.com/foo/?bar=baz&inga=42&quux", - "http://✪df.ws/123", "http://userid:password@example.com:8080", "http://userid:password@example.com:8080/", "http://userid@example.com", @@ -36,7 +33,6 @@ URLS_SHOULD_MATCH = [ "http://userid:password@example.com/", "http://142.42.1.1/", "http://142.42.1.1:8080/", - "http://➡.ws/䨹", "http://⌘.ws", "http://⌘.ws/", "http://foo.com/blah_(wikipedia)#cite-1", @@ -48,13 +44,19 @@ URLS_SHOULD_MATCH = [ "http://j.mp", "ftp://foo.bar/baz", "http://foo.bar/?q=Test%20URL-encoded%20stuff", - "http://مثال.إختبار", - "http://例子.测试", -# "http://उदाहरण.परीक्षा", "http://-.~_!$&'()*+,;=:%40:80%2f::::::@example.com", "http://1337.net", "http://a.b-c.de", "http://223.255.255.254", + "http://a.b--c.de/", # this is a legit domain name see: https://gist.github.com/dperini/729294 comment on 9/9/2014 + "http://✪df.ws/123", + "http://➡.ws/䨹", + "http://مثال.إختبار", + "http://例子.测试", + + pytest.mark.xfail("http://उदाहरण.परीक्षा"), + pytest.mark.xfail("http://foo.com/blah_blah_(wikipedia)"), + pytest.mark.xfail("http://foo.com/blah_blah_(wikipedia)_(again)"), ] URLS_SHOULD_NOT_MATCH = [ @@ -74,7 +76,6 @@ URLS_SHOULD_NOT_MATCH = [ "///a", "///", "http:///a", -# "foo.com", "rdar://1234", "h://test", "http:// shouldfail.com", @@ -82,21 +83,22 @@ URLS_SHOULD_NOT_MATCH = [ "http://foo.bar/foo(bar)baz quux", "ftps://foo.bar/", "http://-error-.invalid/", -# "http://a.b--c.de/", (this is a legit domain name see: https://gist.github.com/dperini/729294 comment on 9/9/2014 "http://-a.b.co", "http://a.b-.co", "http://0.0.0.0", "http://10.1.1.0", "http://10.1.1.255", "http://224.1.1.1", -# "http://1.1.1.1.1", "http://123.123.123", "http://3628126748", "http://.www.foo.bar/", -# "http://www.foo.bar./", "http://.www.foo.bar./", "http://10.1.1.1", - "NASDAQ:GOOG" + "NASDAQ:GOOG", + + pytest.mark.xfail("foo.com"), + pytest.mark.xfail("http://1.1.1.1.1"), + pytest.mark.xfail("http://www.foo.bar./"), ]