import plac
import tqdm
from pathlib import Path
import srsly
import cProfile
import pstats
import sys
import itertools
import thinc.extra.datasets
from wasabi import msg

from ..util import load_model


@plac.annotations(
    model=("Model to load", "positional", None, str),
    inputs=("Location of input file. '-' for stdin.", "positional", None, str),
    n_texts=("Maximum number of texts to use if available", "option", "n", int),
)
def profile(model, inputs=None, n_texts=10000):
    """
    Profile a spaCy pipeline, to find out which functions take the most time.
    Input should be formatted as one JSON object per line with a key "text".
    It can either be provided as a JSONL file, or be read from sys.stdin.
    If no input file is specified, the IMDB dataset is loaded via Thinc.
    """
    if inputs is not None:
        inputs = _read_inputs(inputs, msg)
    if inputs is None:
        # No input file given: fall back to the IMDB dataset shipped with Thinc
        n_inputs = 25000
        with msg.loading("Loading IMDB dataset via Thinc..."):
            imdb_train, _ = thinc.extra.datasets.imdb()
            inputs, _ = zip(*imdb_train)
        msg.info(f"Loaded IMDB dataset and using {n_inputs} examples")
        inputs = inputs[:n_inputs]
    with msg.loading(f"Loading model '{model}'..."):
        nlp = load_model(model)
    msg.good(f"Loaded model '{model}'")
    texts = list(itertools.islice(inputs, n_texts))
    # Run the pipeline under cProfile and dump the stats to "Profile.prof"
    cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
    s = pstats.Stats("Profile.prof")
    msg.divider("Profile stats")
    s.strip_dirs().sort_stats("time").print_stats()


def parse_texts(nlp, texts):
    # Stream the texts through the pipeline; the resulting Doc objects are
    # discarded, since we only care about the profiling data.
    for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=16):
        pass


def _read_inputs(loc, msg):
    if loc == "-":
        msg.info("Reading input from sys.stdin")
        file_ = sys.stdin
        file_ = (line.encode("utf8") for line in file_)
    else:
        input_path = Path(loc)
        if not input_path.exists() or not input_path.is_file():
            msg.fail("Not a valid input data file", loc, exits=1)
        msg.info(f"Using data from {input_path.parts[-1]}")
        file_ = input_path.open()
    for line in file_:
        # Each line is a JSON object with a "text" key (JSONL format)
        data = srsly.json_loads(line)
        text = data["text"]
        yield text
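
# Usage sketch: this profiler is normally run through spaCy's command line,
# roughly as in the commands below; the model name and file path here are
# placeholders.
#
#   python -m spacy profile en_core_web_sm inputs.jsonl -n 1000
#   cat inputs.jsonl | python -m spacy profile en_core_web_sm -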