From a85620a731ae54114b5129b50414cb153d61e783 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 2 Jul 2018 11:35:31 +0200
Subject: [PATCH] Note CoreNLP tokenizer correction on website

---
 website/usage/_facts-figures/_benchmarks.jade | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/website/usage/_facts-figures/_benchmarks.jade b/website/usage/_facts-figures/_benchmarks.jade
index dabf58795..d81dd4a98 100644
--- a/website/usage/_facts-figures/_benchmarks.jade
+++ b/website/usage/_facts-figures/_benchmarks.jade
@@ -157,7 +157,13 @@ p
 
 +infobox("Important note", "⚠️")
     | This evaluation was conducted in 2015. We're working on benchmarks on
-    | current CPU and GPU hardware.
+    | current CPU and GPU hardware. In the meantime, we're grateful to the
+    | Stanford folks for drawing our attention to what seems
+    | to be #[+a("https://nlp.stanford.edu/software/tokenizer.html#Speed") a long-standing error]
+    | in our CoreNLP benchmarks, especially for their
+    | tokenizer. Until we run corrected experiments, we have updated the table
+    | using their figures.
+
 
 +aside("Methodology")
     | #[strong Set up:] 100,000 plain-text documents were streamed from an
@@ -183,14 +189,14 @@ p
     +row
         +cell #[strong spaCy]
         each data in [ "0.2ms", "1ms", "19ms"]
-            +cell("num") #[strong=data]
+            +cell("num")=data
        each data in ["1x", "1x", "1x"]
             +cell("num")=data
 
     +row
         +cell CoreNLP
-        each data in ["2ms", "10ms", "49ms", "10x", "10x", "2.6x"]
+        each data in ["0.18ms", "10ms", "49ms", "0.9x", "10x", "2.6x"]
             +cell("num")=data
 
     +row
         +cell ZPar