mirror of https://github.com/explosion/spaCy.git
54 lines
1.5 KiB
Python
54 lines
1.5 KiB
Python
|
from pathlib import Path
|
||
|
import plac
|
||
|
import spacy
|
||
|
from spacy.gold import docs_to_json
|
||
|
import srsly
|
||
|
import sys
|
||
|
|
||
|
@plac.annotations(
|
||
|
model=("Model name. Defaults to 'en'.", "option", "m", str),
|
||
|
input_file=("Input file (jsonl)", "positional", None, Path),
|
||
|
output_dir=("Output directory", "positional", None, Path),
|
||
|
n_texts=("Number of texts to convert", "option", "t", int),
|
||
|
)
|
||
|
def convert(model='en', input_file=None, output_dir=None, n_texts=0):
|
||
|
# Load model with tokenizer + sentencizer only
|
||
|
nlp = spacy.load(model)
|
||
|
nlp.disable_pipes(*nlp.pipe_names)
|
||
|
sentencizer = nlp.create_pipe("sentencizer")
|
||
|
nlp.add_pipe(sentencizer, first=True)
|
||
|
|
||
|
texts = []
|
||
|
cats = []
|
||
|
count = 0
|
||
|
|
||
|
if not input_file.exists():
|
||
|
print("Input file not found:", input_file)
|
||
|
sys.exit(1)
|
||
|
else:
|
||
|
with open(input_file) as fileh:
|
||
|
for line in fileh:
|
||
|
data = srsly.json_loads(line)
|
||
|
texts.append(data["text"])
|
||
|
cats.append(data["cats"])
|
||
|
|
||
|
if output_dir is not None:
|
||
|
output_dir = Path(output_dir)
|
||
|
if not output_dir.exists():
|
||
|
output_dir.mkdir()
|
||
|
else:
|
||
|
output_dir = Path(".")
|
||
|
|
||
|
docs = []
|
||
|
for i, doc in enumerate(nlp.pipe(texts)):
|
||
|
doc.cats = cats[i]
|
||
|
docs.append(doc)
|
||
|
if n_texts > 0 and count == n_texts:
|
||
|
break
|
||
|
count += 1
|
||
|
|
||
|
srsly.write_json(output_dir / input_file.with_suffix(".json"), [docs_to_json(docs)])
|
||
|
|
||
|
if __name__ == "__main__":
|
||
|
plac.call(convert)
|