mirror of https://github.com/explosion/spaCy.git
parent
925a852bb6
commit
8718ca8b1f
|
@ -24,6 +24,7 @@ except ImportError:
|
||||||
ftfy = None
|
ftfy = None
|
||||||
|
|
||||||
|
|
||||||
|
DEFAULT_OOV_PROB = -20
|
||||||
msg = Printer()
|
msg = Printer()
|
||||||
|
|
||||||
|
|
||||||
|
@ -108,23 +109,30 @@ def open_file(loc):
|
||||||
|
|
||||||
|
|
||||||
def read_attrs_from_deprecated(freqs_loc, clusters_loc):
|
def read_attrs_from_deprecated(freqs_loc, clusters_loc):
|
||||||
with msg.loading("Counting frequencies..."):
|
if freqs_loc is not None:
|
||||||
probs, oov_prob = read_freqs(freqs_loc) if freqs_loc is not None else ({}, -20)
|
with msg.loading("Counting frequencies..."):
|
||||||
msg.good("Counted frequencies")
|
probs, _ = read_freqs(freqs_loc)
|
||||||
with msg.loading("Reading clusters..."):
|
msg.good("Counted frequencies")
|
||||||
clusters = read_clusters(clusters_loc) if clusters_loc else {}
|
else:
|
||||||
msg.good("Read clusters")
|
probs, _ = ({}, DEFAULT_OOV_PROB)
|
||||||
|
if clusters_loc:
|
||||||
|
with msg.loading("Reading clusters..."):
|
||||||
|
clusters = read_clusters(clusters_loc)
|
||||||
|
msg.good("Read clusters")
|
||||||
|
else:
|
||||||
|
clusters = {}
|
||||||
lex_attrs = []
|
lex_attrs = []
|
||||||
sorted_probs = sorted(probs.items(), key=lambda item: item[1], reverse=True)
|
sorted_probs = sorted(probs.items(), key=lambda item: item[1], reverse=True)
|
||||||
for i, (word, prob) in tqdm(enumerate(sorted_probs)):
|
if len(sorted_probs):
|
||||||
attrs = {"orth": word, "id": i, "prob": prob}
|
for i, (word, prob) in tqdm(enumerate(sorted_probs)):
|
||||||
# Decode as a little-endian string, so that we can do & 15 to get
|
attrs = {"orth": word, "id": i, "prob": prob}
|
||||||
# the first 4 bits. See _parse_features.pyx
|
# Decode as a little-endian string, so that we can do & 15 to get
|
||||||
if word in clusters:
|
# the first 4 bits. See _parse_features.pyx
|
||||||
attrs["cluster"] = int(clusters[word][::-1], 2)
|
if word in clusters:
|
||||||
else:
|
attrs["cluster"] = int(clusters[word][::-1], 2)
|
||||||
attrs["cluster"] = 0
|
else:
|
||||||
lex_attrs.append(attrs)
|
attrs["cluster"] = 0
|
||||||
|
lex_attrs.append(attrs)
|
||||||
return lex_attrs
|
return lex_attrs
|
||||||
|
|
||||||
|
|
||||||
|
@ -142,8 +150,11 @@ def create_model(lang, lex_attrs):
|
||||||
lexeme.is_oov = False
|
lexeme.is_oov = False
|
||||||
lex_added += 1
|
lex_added += 1
|
||||||
lex_added += 1
|
lex_added += 1
|
||||||
oov_prob = min(lex.prob for lex in nlp.vocab)
|
if len(nlp.vocab):
|
||||||
nlp.vocab.cfg.update({"oov_prob": oov_prob - 1})
|
oov_prob = min(lex.prob for lex in nlp.vocab) - 1
|
||||||
|
else:
|
||||||
|
oov_prob = DEFAULT_OOV_PROB
|
||||||
|
nlp.vocab.cfg.update({"oov_prob": oov_prob})
|
||||||
return nlp
|
return nlp
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue