From bea762ec04b8ca4b18b5c08e010b211d3efebb52 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 4 Nov 2014 01:06:00 +1100 Subject: [PATCH] * Update tokenization rules --- data/en/infix | 1 - data/en/tokenization | 96 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 96 insertions(+), 1 deletion(-) diff --git a/data/en/infix b/data/en/infix index b88818006..28169ecbb 100644 --- a/data/en/infix +++ b/data/en/infix @@ -1,2 +1 @@ -(?<=[^-])-(?=\w) (?<=[a-z])\.(?=[A-Z]) diff --git a/data/en/tokenization b/data/en/tokenization index 46365515a..4b50cfe3b 100644 --- a/data/en/tokenization +++ b/data/en/tokenization @@ -9,6 +9,7 @@ ain't are not aren't are not can't can not +cannot can not could've could have couldn't could not couldn't've could not have @@ -94,13 +95,108 @@ you'd've you would have you'll you will you're you are you've you have +'em them +'ol old 10km 10 km U.S. U.S. +non-U.S. non-U.S. U.N. U.N. Co. Co. +Corp. Corp. +Inc. Inc. +Rep. Rep. Ms. Ms. Mr. Mr. +a.m. a.m. +p.m. p.m. +Nos. Nos. +a.k.a. a.k.a. +A. A. +B. B. +C. C. +D. D. +E. E. +F. F. +G. G. +H. H. +J. J. +K. K. +L. L. +M. M. +N. N. +O. O. P. P. +Q. Q. +R. R. +S. S. +T. T. +U. U. +V. V. +W. W. +X. X. +Y. Y. +Z. Z. +Jan. Jan. +Feb. Feb. +Mar. Mar. +Apr. Apr. +May. May. +Jun. Jun. +Jul. Jul. +Aug. Aug. +Sep. Sep. +Sept. Sept. +Oct. Oct. +Nov. Nov. +Dec. Dec. +N.V. N.V. +Ala. Ala. +Ariz. Ariz. +Ark. Ark. +Calif. Calif. +Colo. Colo. +Conn. Conn. +Del. Del. +D.C. D.C. +Fla. Fla. +Ga. Ga. +Ill. Ill. +Ind. Ind. +Kans. Kans. +Kan. Kan. +Ky. Ky. +La. La. +Md. Md. +Mass. Mass. +Mich. Mich. +Minn. Minn. +Miss. Miss. +Mo. Mo. +Mont. Mont. +Nebr. Nebr. +Nev. Nev. +N.H. N.H. +N.J. N.J. +N.M. N.M. +N.Y. N.Y. +N.C. N.C. +N.D. N.D. +Okla. Okla. +Ore. Ore. +Pa. Pa. +P.R. P.R. +R.I. R.I. +S.C. S.C. +S.D. S.D. +Tenn. Tenn. +Tex. Tex. +Vt. Vt. +Va. Va. +V.I. V.I. +Wash. Wash. +W.Va. W.Va. +Wis. Wis. +Wyo. Wyo. '' '' :) :) <3 <3