[Java][FlexBuffers] Optimize Map access (#5735)

The original implementation of map access is very naive: - Encodes the String to a UTF-8 byte[] - Creates a new KeyVector - Performs a binary search to find the key - Returns the value. So on every access to the Map there were useless allocations of Keys and a KeyVector, plus a complete encoding of the search key, which for most comparisons is wasteful. This change completely removes the use of KeyVector and computes the key positions on the spot. Besides that, it compares keys codepoint-by-codepoint, avoiding unnecessary allocations and reducing encoding work in most cases. Some benchmarks show a 2.75x speedup.
2020-03-30 22:46:42 +02:00 · 2020-03-30 22:46:42 +02:00 · 925fab6b15
parent d9fecc3327
commit 925fab6b15
3 changed files with 210 additions and 9 deletions
--- a/java/com/google/flatbuffers/FlexBuffers.java
+++ b/java/com/google/flatbuffers/FlexBuffers.java
@ -818,6 +818,9 @@ public class FlexBuffers {
     */
    public static class Map extends Vector {
        private static final Map EMPTY_MAP = new Map(EMPTY_BB, 1, 1);
+        // scratch buffer that receives the UTF-8 bytes of a single code point
+        // of the searched key. Used to speed up String comparison
+        private final byte[] comparisonBuffer = new byte[4];

        Map(ReadBuf bb, int end, int byteWidth) {
            super(bb, end, byteWidth);
@ -836,7 +839,11 @@ public class FlexBuffers {
         * @return reference to value in map
         */
        public Reference get(String key) {
-            return get(key.getBytes(StandardCharsets.UTF_8));
+            int index = binarySearch(key);
+            if (index >= 0 && index < size) {
+                return get(index);
+            }
+            return Reference.NULL_REFERENCE;
        }

        /**
@ -844,9 +851,7 @@ public class FlexBuffers {
         * @return reference to value in map
         */
        public Reference get(byte[] key) {
-            KeyVector keys = keys();
-            int size = keys.size();
-            int index = binarySearch(keys, key);
+            int index = binarySearch(key);
            if (index >= 0 && index < size) {
                return get(index);
            }
@ -898,14 +903,17 @@ public class FlexBuffers {
        }

        // Performs a binary search on a key vector and return index of the key in key vector
-        private int binarySearch(KeyVector keys, byte[] searchedKey) {
+        private int binarySearch(CharSequence searchedKey) {
            int low = 0;
-            int high = keys.size() - 1;
-
+            int high = size - 1;
+            final int num_prefixed_fields = 3;
+            int keysOffset = end - (byteWidth * num_prefixed_fields);
+            int keysStart = indirect(bb, keysOffset, byteWidth);
+            int keyByteWidth = readInt(bb, keysOffset + byteWidth, byteWidth);
            while (low <= high) {
                int mid = (low + high) >>> 1;
-                Key k = keys.get(mid);
-                int cmp = k.compareTo(searchedKey);
+                int keyPos = indirect(bb, keysStart + mid * keyByteWidth, keyByteWidth);
+                int cmp = compareCharSequence(keyPos, searchedKey);
                if (cmp < 0)
                    low = mid + 1;
                else if (cmp > 0)
@ -915,6 +923,107 @@ public class FlexBuffers {
            }
            return -(low + 1);  // key not found
        }
+
+        /**
+         * Binary-searches the keys of this map for a key given as raw bytes.
+         *
+         * @param searchedKey bytes of the key to look for
+         * @return index of the matching key, or {@code -(insertionPoint + 1)} when no key matches
+         */
+        private int binarySearch(byte[] searchedKey) {
+            // the keys offset is read three byteWidth-sized slots before `end`,
+            // the width of each key entry from the slot right after it
+            final int prefixedFields = 3;
+            final int keysOffset = end - (byteWidth * prefixedFields);
+            final int keysStart = indirect(bb, keysOffset, byteWidth);
+            final int keyByteWidth = readInt(bb, keysOffset + byteWidth, byteWidth);

+            int lo = 0;
+            int hi = size - 1;
+            while (lo <= hi) {
+                final int middle = (lo + hi) >>> 1;
+                final int keyPos = indirect(bb, keysStart + middle * keyByteWidth, keyByteWidth);
+                final int order = compareBytes(bb, keyPos, searchedKey);
+                if (order == 0) {
+                    return middle; // key found
+                }
+                if (order < 0) {
+                    // stored key sorts before the searched one: continue in the upper half
+                    lo = middle + 1;
+                } else {
+                    hi = middle - 1;
+                }
+            }
+            return -(lo + 1);  // key not found
+        }
+
+        // compares a byte[] against a FBT_KEY
+        private int compareBytes(ReadBuf bb, int start, byte[] other) {
+            int l1 = start;
+            int l2 = 0;
+            byte c1, c2;
+            do {
+                c1 = bb.get(l1);
+                c2 = other[l2];
+                if (c1 == '\0')
+                    return c1 - c2;
+                l1++;
+                l2++;
+                if (l2 == other.length) {
+                    // in our buffer we have an additional \0 byte
+                    // but this does not exist in regular Java strings, so we return now
+                    return c1 - c2;
+                }
+            }
+            while (c1 == c2);
+            return c1 - c2;
+        }
+
+        // Compares the FBT_KEY stored in the buffer at 'start' against a CharSequence, encoding 'other' only as far as needed
+        private int compareCharSequence(int start, CharSequence other) { // result < 0: stored key sorts first, 0: equal, > 0: other sorts first
+            int bufferPos = start; // read position inside the stored key
+            int otherPos = 0; // index of the next char of other to compare
+            int limit = bb.limit(); // bound for the second loop
+            int otherLimit = other.length();

+            // Fast path for ASCII characters. Most keys are expected to be ASCII only, so this
+            // loop compares one char against one byte without any encoding step.
+            // It is left as soon as either side shows a multi-byte character.
+            while (otherPos < otherLimit) {
+                char c2 = other.charAt(otherPos);

+                if (c2 >= 0x80) {
+                    // other's char does not fit in a single UTF-8 byte
+                    break;
+                }

+                byte b = bb.get(bufferPos);

+                if (b == 0) { // stored key ended while other still has chars
+                    return -c2;
+                } else if (b < 0) { // high bit set in the stored byte: not ASCII, continue in the general loop
+                    break;
+                } else if ((char) b != c2) {
+                    return b - c2;
+                }
+                ++bufferPos;
+                ++otherPos;
+            }

+            while (bufferPos < limit) { // general path: one code point of other per iteration

+                int sizeInBuff = Utf8.encodeUtf8CodePoint(other, otherPos, comparisonBuffer);

+                if (sizeInBuff == 0) {
+                    // other is exhausted, so the next stored byte decides the result:
+                    // 0 when the stored key ends here too, otherwise the value of that byte.
+                    return bb.get(bufferPos);
+                }

+                for (int i = 0; i < sizeInBuff; i++) {
+                    byte bufferByte = bb.get(bufferPos++);
+                    byte otherByte = comparisonBuffer[i];
+                    if (bufferByte == 0) {
+                        // Stored key finished first; the result is the negated byte of other
+                        return -otherByte;
+                    } else if (bufferByte != otherByte) {
+                        return bufferByte - otherByte;
+                    }
+                }

+                otherPos += sizeInBuff == 4 ? 2 : 1; // a 4-byte encoding consumed a surrogate pair, i.e. two chars
+            }
+            return 0; // reached the buffer limit without finding a difference
+        }
    }

    /**
--- a/java/com/google/flatbuffers/Utf8.java
+++ b/java/com/google/flatbuffers/Utf8.java
@ -18,9 +18,13 @@ package com.google.flatbuffers;

 import java.nio.ByteBuffer;

+import static java.lang.Character.MAX_SURROGATE;
+import static java.lang.Character.MIN_SURROGATE;
 import static java.lang.Character.MIN_HIGH_SURROGATE;
 import static java.lang.Character.MIN_LOW_SURROGATE;
 import static java.lang.Character.MIN_SUPPLEMENTARY_CODE_POINT;
+import static java.lang.Character.isSurrogatePair;
+import static java.lang.Character.toCodePoint;

 public abstract class Utf8 {

@ -73,6 +77,56 @@ public abstract class Utf8 {
    DEFAULT = instance;
  }

+  /**
+   * Encodes the code point that starts at {@code start} in a CharSequence as UTF-8.
+   * @param in CharSequence to read the code point from
+   * @param start index of the first char of the code point
+   * @param out byte array of at least 4 bytes that receives the encoded bytes
+   * @return number of bytes written to {@code out}, or 0 when {@code start} is at or past
+   *         the end of {@code in}
+   * @throws UnpairedSurrogateException if the char at {@code start} is a surrogate that is
+   *         not the first half of a valid surrogate pair
+   */
+  public static int encodeUtf8CodePoint(CharSequence in, int start, byte[] out) {
+    // a UTF-8 encoded code point occupies at most 4 bytes
+    assert out.length >= 4;

+    final int length = in.length();
+    if (start >= length) {
+      return 0;
+    }

+    final char first = in.charAt(start);
+    if (first < 0x80) {
+      // One byte (0xxx xxxx)
+      out[0] = (byte) first;
+      return 1;
+    }
+    if (first < 0x800) {
+      // Two bytes (110x xxxx 10xx xxxx)
+      out[0] = (byte) (0xC0 | (first >>> 6));
+      out[1] = (byte) (0x80 | (0x3F & first));
+      return 2;
+    }
+    if (first < MIN_SURROGATE || first > MAX_SURROGATE) {
+      // Three bytes (1110 xxxx 10xx xxxx 10xx xxxx)
+      // Maximum single-char code point is 0xFFFF, 16 bits.
+      out[0] = (byte) (0xE0 | (first >>> 12));
+      out[1] = (byte) (0x80 | (0x3F & (first >>> 6)));
+      out[2] = (byte) (0x80 | (0x3F & first));
+      return 3;
+    }

+    // Four bytes (1111 xxxx 10xx xxxx 10xx xxxx 10xx xxxx)
+    // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8
+    // bytes
+    if (start + 1 == length) {
+      throw new UnpairedSurrogateException(start, length);
+    }
+    final char second = in.charAt(start + 1);
+    if (!isSurrogatePair(first, second)) {
+      throw new UnpairedSurrogateException(start, length);
+    }
+    final int codePoint = toCodePoint(first, second);
+    out[0] = (byte) (0xF0 | (codePoint >>> 18));
+    out[1] = (byte) (0x80 | (0x3F & (codePoint >>> 12)));
+    out[2] = (byte) (0x80 | (0x3F & (codePoint >>> 6)));
+    out[3] = (byte) (0x80 | (0x3F & codePoint));
+    return 4;
+  }
+
  /**
   * Utility methods for decoding bytes into {@link String}. Callers are responsible for extracting
   * bytes (possibly using Unsafe methods), and checking remaining bytes. All other UTF-8 validity
--- a/tests/JavaTest.java
+++ b/tests/JavaTest.java
@ -12,10 +12,12 @@ import com.google.flatbuffers.FlexBuffers;
 import com.google.flatbuffers.FlexBuffersBuilder;
 import com.google.flatbuffers.StringVector;
 import com.google.flatbuffers.UnionVector;
+
 import com.google.flatbuffers.FlexBuffers.FlexBufferException;
 import com.google.flatbuffers.FlexBuffers.Reference;
 import com.google.flatbuffers.FlexBuffers.Vector;
 import com.google.flatbuffers.ArrayReadWriteBuf;
+import com.google.flatbuffers.FlexBuffers.KeyVector;

 import java.io.*;
 import java.math.BigInteger;
@ -1025,6 +1027,41 @@ class JavaTest {
        }
    }
    
+    // Builds a map whose keys contain multi-byte UTF-8 characters and checks that every
+    // key can be looked up again, both by index and by String key.
+    public static void testFlexBuffersUtf8Map() {
+        final String[] utf8keys = { "😨 face1", "😩 face2", "😨 face3", "trademark ®", "€ euro"};

+        FlexBuffersBuilder builder = new FlexBuffersBuilder(ByteBuffer.allocate(512),
+                FlexBuffersBuilder.BUILDER_FLAG_SHARE_KEYS_AND_STRINGS);

+        int mapStart = builder.startMap();
+        for (String entry : utf8keys) {
+            builder.putString(entry, entry);  // Testing key and string reuse.
+        }
+        builder.endMap(null, mapStart);
+        builder.finish();

+        FlexBuffers.Map m = FlexBuffers.getRoot(builder.getBuffer()).asMap();

+        TestEq(m.size(), 5);

+        // each stored key must equal the value found at the same index
+        KeyVector storedKeys = m.keys();
+        for (int i = 0; i < utf8keys.length; i++) {
+            TestEq(storedKeys.get(i).toString(), m.get(i).asString());
+        }

+        // lookup by String key must return the value stored under that key
+        for (String key : utf8keys) {
+            TestEq(m.get(key).asString(), key);
+        }
+    }
+
    public static void TestFlexBuffers() {
        testSingleElementByte();
        testSingleElementShort();
@ -1047,6 +1084,7 @@ class JavaTest {
        testFlexBufferVectorStrings();
        testDeprecatedTypedVectorString();
        testBuilderGrowth();
+        testFlexBuffersUtf8Map();
    }

    static void TestVectorOfBytes() {