mirror of https://github.com/explosion/spaCy.git
Merge branch 'master' of https://github.com/explosion/spaCy
This commit is contained in:
commit
ea2592879f
|
@ -41,7 +41,7 @@ To distinguish issues that are opened by us, the maintainers, we usually add a
|
|||
| [`duplicate`](https://github.com/explosion/spaCy/labels/duplicate) | Duplicates, i.e. issues that have been reported before |
|
||||
| [`meta`](https://github.com/explosion/spaCy/labels/meta) | Meta topics, e.g. repo organisation and issue management |
|
||||
| [`help wanted`](https://github.com/explosion/spaCy/labels/help%20wanted) | Requests for contributions |
|
||||
| [`help wanted (easy)`](https://github.com/explosion/spaCy/labels/help%20wanted%20%28easy%29) | Requests for contributions suitable for begginners |
|
||||
| [`help wanted (easy)`](https://github.com/explosion/spaCy/labels/help%20wanted%20%28easy%29) | Requests for contributions suitable for beginners |
|
||||
|
||||
## Contributing to the code base
|
||||
|
||||
|
|
|
@ -9,6 +9,7 @@ This is a list of everyone who has made significant contributions to spaCy, in a
|
|||
* Chris DuBois, [@chrisdubois](https://github.com/chrisdubois)
|
||||
* Christoph Schwienheer, [@chssch](https://github.com/chssch)
|
||||
* Dafne van Kuppevelt, [@dafnevk](https://github.com/dafnevk)
|
||||
* Daniel Rapp, [@rappdw](https://github.com/rappdw)
|
||||
* Dmytro Sadovnychyi, [@sadovnychyi](https://github.com/sadovnychyi)
|
||||
* György Orosz, [@oroszgy](https://github.com/oroszgy)
|
||||
* Henning Peters, [@henningpeters](https://github.com/henningpeters)
|
||||
|
|
|
@ -27,10 +27,21 @@ ABBREVIATIONS = {
|
|||
"সে.": [
|
||||
{ORTH: "সে.", LEMMA: "সেলসিয়াস"},
|
||||
],
|
||||
"কি.মি": [
|
||||
{ORTH: "কি.মি", LEMMA: "কিলোমিটার"},
|
||||
"কি.মি.": [
|
||||
{ORTH: "কি.মি.", LEMMA: "কিলোমিটার"},
|
||||
],
|
||||
"কি.মি": [
|
||||
{ORTH: "কি.মি", LEMMA: "কিলোমিটার"},
|
||||
],
|
||||
"সে.মি.": [
|
||||
{ORTH: "সে.মি.", LEMMA: "সেন্টিমিটার"},
|
||||
],
|
||||
"সে.মি": [
|
||||
{ORTH: "সে.মি", LEMMA: "সেন্টিমিটার"},
|
||||
],
|
||||
"মি.লি.": [
|
||||
{ORTH: "মি.লি.", LEMMA: "মিলিলিটার"},
|
||||
]
|
||||
}
|
||||
|
||||
TOKENIZER_EXCEPTIONS.update(ABBREVIATIONS)
|
||||
|
|
|
@ -45,6 +45,6 @@ _URL_PATTERN = (
|
|||
r"$"
|
||||
).strip()
|
||||
|
||||
TOKEN_MATCH = re.compile(_URL_PATTERN).match
|
||||
TOKEN_MATCH = re.compile(_URL_PATTERN, re.UNICODE).match
|
||||
|
||||
__all__ = ['TOKEN_MATCH']
|
||||
|
|
|
@ -21,11 +21,8 @@ URLS_FULL = URLS_BASIC + [
|
|||
URLS_SHOULD_MATCH = [
|
||||
"http://foo.com/blah_blah",
|
||||
"http://foo.com/blah_blah/",
|
||||
# "http://foo.com/blah_blah_(wikipedia)",
|
||||
# "http://foo.com/blah_blah_(wikipedia)_(again)",
|
||||
"http://www.example.com/wpstyle/?p=364",
|
||||
"https://www.example.com/foo/?bar=baz&inga=42&quux",
|
||||
"http://✪df.ws/123",
|
||||
"http://userid:password@example.com:8080",
|
||||
"http://userid:password@example.com:8080/",
|
||||
"http://userid@example.com",
|
||||
|
@ -36,7 +33,6 @@ URLS_SHOULD_MATCH = [
|
|||
"http://userid:password@example.com/",
|
||||
"http://142.42.1.1/",
|
||||
"http://142.42.1.1:8080/",
|
||||
"http://➡.ws/䨹",
|
||||
"http://⌘.ws",
|
||||
"http://⌘.ws/",
|
||||
"http://foo.com/blah_(wikipedia)#cite-1",
|
||||
|
@ -48,13 +44,19 @@ URLS_SHOULD_MATCH = [
|
|||
"http://j.mp",
|
||||
"ftp://foo.bar/baz",
|
||||
"http://foo.bar/?q=Test%20URL-encoded%20stuff",
|
||||
"http://مثال.إختبار",
|
||||
"http://例子.测试",
|
||||
# "http://उदाहरण.परीक्षा",
|
||||
"http://-.~_!$&'()*+,;=:%40:80%2f::::::@example.com",
|
||||
"http://1337.net",
|
||||
"http://a.b-c.de",
|
||||
"http://223.255.255.254",
|
||||
"http://a.b--c.de/", # this is a legit domain name see: https://gist.github.com/dperini/729294 comment on 9/9/2014
|
||||
"http://✪df.ws/123",
|
||||
"http://➡.ws/䨹",
|
||||
"http://مثال.إختبار",
|
||||
"http://例子.测试",
|
||||
|
||||
pytest.mark.xfail("http://उदाहरण.परीक्षा"),
|
||||
pytest.mark.xfail("http://foo.com/blah_blah_(wikipedia)"),
|
||||
pytest.mark.xfail("http://foo.com/blah_blah_(wikipedia)_(again)"),
|
||||
]
|
||||
|
||||
URLS_SHOULD_NOT_MATCH = [
|
||||
|
@ -74,7 +76,6 @@ URLS_SHOULD_NOT_MATCH = [
|
|||
"///a",
|
||||
"///",
|
||||
"http:///a",
|
||||
# "foo.com",
|
||||
"rdar://1234",
|
||||
"h://test",
|
||||
"http:// shouldfail.com",
|
||||
|
@ -82,21 +83,22 @@ URLS_SHOULD_NOT_MATCH = [
|
|||
"http://foo.bar/foo(bar)baz quux",
|
||||
"ftps://foo.bar/",
|
||||
"http://-error-.invalid/",
|
||||
# "http://a.b--c.de/", (this is a legit domain name see: https://gist.github.com/dperini/729294 comment on 9/9/2014
|
||||
"http://-a.b.co",
|
||||
"http://a.b-.co",
|
||||
"http://0.0.0.0",
|
||||
"http://10.1.1.0",
|
||||
"http://10.1.1.255",
|
||||
"http://224.1.1.1",
|
||||
# "http://1.1.1.1.1",
|
||||
"http://123.123.123",
|
||||
"http://3628126748",
|
||||
"http://.www.foo.bar/",
|
||||
# "http://www.foo.bar./",
|
||||
"http://.www.foo.bar./",
|
||||
"http://10.1.1.1",
|
||||
"NASDAQ:GOOG"
|
||||
"NASDAQ:GOOG",
|
||||
|
||||
pytest.mark.xfail("foo.com"),
|
||||
pytest.mark.xfail("http://1.1.1.1.1"),
|
||||
pytest.mark.xfail("http://www.foo.bar./"),
|
||||
]
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue