diff --git a/java/com/google/flatbuffers/FlexBuffers.java b/java/com/google/flatbuffers/FlexBuffers.java
index d7df08c1e..8263f9a0c 100644
--- a/java/com/google/flatbuffers/FlexBuffers.java
+++ b/java/com/google/flatbuffers/FlexBuffers.java
@@ -818,6 +818,11 @@ public class FlexBuffers {
      */
     public static class Map extends Vector {
         private static final Map EMPTY_MAP = new Map(EMPTY_BB, 1, 1);
+        // Scratch buffer that receives the UTF-8 bytes of one code point of the
+        // searched key. Reused between lookups to avoid allocating per comparison.
+        // NOTE(review): shared mutable state, so concurrent get(String) calls on the
+        // same Map instance can corrupt each other's comparisons - confirm intended.
+        private final byte[] comparisonBuffer = new byte[4];
 
         Map(ReadBuf bb, int end, int byteWidth) {
             super(bb, end, byteWidth);
@@ -836,7 +841,11 @@ public class FlexBuffers {
          * @return reference to value in map
          */
         public Reference get(String key) {
-            return get(key.getBytes(StandardCharsets.UTF_8));
+            int index = binarySearch(key);
+            if (index >= 0 && index < size) {
+                return get(index);
+            }
+            return Reference.NULL_REFERENCE;
         }
 
         /**
@@ -844,9 +853,7 @@ public class FlexBuffers {
          * @return reference to value in map
          */
         public Reference get(byte[] key) {
-            KeyVector keys = keys();
-            int size = keys.size();
-            int index = binarySearch(keys, key);
+            int index = binarySearch(key);
             if (index >= 0 && index < size) {
                 return get(index);
             }
@@ -898,14 +905,17 @@ public class FlexBuffers {
         }
 
         // Performs a binary search on a key vector and return index of the key in key vector
-        private int binarySearch(KeyVector keys, byte[] searchedKey) {
+        private int binarySearch(CharSequence searchedKey) {
             int low = 0;
-            int high = keys.size() - 1;
-
+            int high = size - 1;
+            final int num_prefixed_fields = 3;
+            int keysOffset = end - (byteWidth * num_prefixed_fields);
+            int keysStart = indirect(bb, keysOffset, byteWidth);
+            int keyByteWidth = readInt(bb, keysOffset + byteWidth, byteWidth);
             while (low <= high) {
                 int mid = (low + high) >>> 1;
-                Key k = keys.get(mid);
-                int cmp = k.compareTo(searchedKey);
+                int keyPos = indirect(bb, keysStart + mid * keyByteWidth, keyByteWidth);
+                int cmp = compareCharSequence(keyPos, searchedKey);
                 if (cmp < 0)
                     low = mid + 1;
                 else if (cmp > 0)
@@ -915,6 +925,105 @@ public class FlexBuffers {
             }
             return -(low + 1); // key not found
         }
+
+        // Same binary search as above for a key that is already encoded as UTF-8 bytes
+        private int binarySearch(byte[] searchedKey) {
+            int low = 0;
+            int high = size - 1;
+            final int num_prefixed_fields = 3;
+            int keysOffset = end - (byteWidth * num_prefixed_fields);
+            int keysStart = indirect(bb, keysOffset, byteWidth);
+            int keyByteWidth = readInt(bb, keysOffset + byteWidth, byteWidth);
+
+            while (low <= high) {
+                int mid = (low + high) >>> 1;
+                int keyPos = indirect(bb, keysStart + mid * keyByteWidth, keyByteWidth);
+                int cmp = compareBytes(bb, keyPos, searchedKey);
+                if (cmp < 0)
+                    low = mid + 1;
+                else if (cmp > 0)
+                    high = mid - 1;
+                else
+                    return mid; // key found
+            }
+            return -(low + 1); // key not found
+        }
+
+        // Compares a stored FBT_KEY (zero-terminated bytes at start) against a byte[].
+        // Returns a negative value, zero or a positive value when the stored key sorts
+        // before, equal to or after other. Bytes are compared as signed values.
+        private int compareBytes(ReadBuf bb, int start, byte[] other) {
+            int l1 = start;
+            for (int l2 = 0; l2 < other.length; l1++, l2++) {
+                byte c1 = bb.get(l1);
+                byte c2 = other[l2];
+                if (c1 == '\0' || c1 != c2) {
+                    // stored key ended first or the keys differ at this position
+                    return c1 - c2;
+                }
+            }
+            // other has no terminator of its own, so once it is exhausted the stored key
+            // only matches if it ends here as well. Returning its next byte (0 on a match)
+            // keeps "ab" from matching "abc" and mirrors compareCharSequence.
+            return bb.get(l1);
+        }
+
+        // compares a CharSequence against a FBT_KEY
+        private int compareCharSequence(int start, CharSequence other) {
+            int bufferPos = start;
+            int otherPos = 0;
+            int limit = bb.limit();
+            int otherLimit = other.length();
+
+            // special loop for ASCII characters. Most keys should be ASCII only, so this
+            // loop should be optimized for that.
+            // breaks if a multi-byte character is found
+            while (otherPos < otherLimit) {
+                char c2 = other.charAt(otherPos);
+
+                if (c2 >= 0x80) {
+                    // not a single byte codepoint
+                    break;
+                }
+
+                byte b = bb.get(bufferPos);
+
+                if (b == 0) {
+                    return -c2;
+                } else if (b < 0) {
+                    break;
+                } else if ((char) b != c2) {
+                    return b - c2;
+                }
+                ++bufferPos;
+                ++otherPos;
+            }
+
+            while (bufferPos < limit) {
+
+                int sizeInBuff = Utf8.encodeUtf8CodePoint(other, otherPos, comparisonBuffer);
+
+                if (sizeInBuff == 0) {
+                    // other is exhausted. The result is the next stored byte: 0 when the
+                    // stored key ends here too (equal), otherwise the stored key is longer.
+                    return bb.get(bufferPos);
+                }
+
+                for (int i = 0; i < sizeInBuff; i++) {
+                    byte bufferByte = bb.get(bufferPos++);
+                    byte otherByte = comparisonBuffer[i];
+                    if (bufferByte == 0) {
+                        // stored key is finished while other still has bytes left
+                        return -otherByte;
+                    } else if (bufferByte != otherByte) {
+                        return bufferByte - otherByte;
+                    }
+                }
+
+                otherPos += sizeInBuff == 4 ? 2 : 1;
+            }
+            return 0;
+        }
     }
 
     /**
diff --git a/java/com/google/flatbuffers/Utf8.java b/java/com/google/flatbuffers/Utf8.java
index efb6811f8..e8af8ad6c 100644
--- a/java/com/google/flatbuffers/Utf8.java
+++ b/java/com/google/flatbuffers/Utf8.java
@@ -18,9 +18,13 @@ package com.google.flatbuffers;
 
 import java.nio.ByteBuffer;
 
+import static java.lang.Character.MAX_SURROGATE;
+import static java.lang.Character.MIN_SURROGATE;
 import static java.lang.Character.MIN_HIGH_SURROGATE;
 import static java.lang.Character.MIN_LOW_SURROGATE;
 import static java.lang.Character.MIN_SUPPLEMENTARY_CODE_POINT;
+import static java.lang.Character.isSurrogatePair;
+import static java.lang.Character.toCodePoint;
 
 public abstract class Utf8 {
 
@@ -73,6 +77,59 @@ public abstract class Utf8 {
     DEFAULT = instance;
   }
 
+  /**
+   * Encodes the code point that starts at {@code start} in a CharSequence as UTF-8.
+   * @param in CharSequence to be encoded
+   * @param start start position of the first char in the codepoint
+   * @param out byte array of at least 4 bytes to be filled
+   * @return the number of bytes written to {@code out}, or 0 if {@code start} is at or
+   *     past the end of {@code in}
+   * @throws UnpairedSurrogateException if the char at {@code start} is a surrogate that
+   *     is not the high half of a valid surrogate pair
+   */
+  public static int encodeUtf8CodePoint(CharSequence in, int start, byte[] out) {
+    // a UTF-8 encoded code point needs at most 4 bytes
+    assert out.length >= 4;
+
+    final int inLength = in.length();
+    if (start >= inLength) {
+      return 0;
+    }
+
+    char c = in.charAt(start);
+    if (c < 0x80) {
+      // One byte (0xxx xxxx)
+      out[0] = (byte) c;
+      return 1;
+    } else if (c < 0x800) {
+      // Two bytes (110x xxxx 10xx xxxx)
+      out[0] = (byte) (0xC0 | (c >>> 6));
+      out[1] = (byte) (0x80 | (0x3F & c));
+      return 2;
+    } else if (c < MIN_SURROGATE || MAX_SURROGATE < c) {
+      // Three bytes (1110 xxxx 10xx xxxx 10xx xxxx)
+      // Maximum single-char code point is 0xFFFF, 16 bits.
+      out[0] = (byte) (0xE0 | (c >>> 12));
+      out[1] = (byte) (0x80 | (0x3F & (c >>> 6)));
+      out[2] = (byte) (0x80 | (0x3F & c));
+      return 3;
+    } else {
+      // Four bytes (1111 xxxx 10xx xxxx 10xx xxxx 10xx xxxx)
+      // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8
+      // bytes
+      final char low;
+      if (start + 1 == inLength || !isSurrogatePair(c, (low = in.charAt(start+1)))) {
+        throw new UnpairedSurrogateException(start, inLength);
+      }
+      int codePoint = toCodePoint(c, low);
+      out[0] = (byte) ((0xF << 4) | (codePoint >>> 18));
+      out[1] = (byte) (0x80 | (0x3F & (codePoint >>> 12)));
+      out[2] = (byte) (0x80 | (0x3F & (codePoint >>> 6)));
+      out[3] = (byte) (0x80 | (0x3F & codePoint));
+      return 4;
+    }
+  }
+
   /**
    * Utility methods for decoding bytes into {@link String}. Callers are responsible for extracting
    * bytes (possibly using Unsafe methods), and checking remaining bytes. All other UTF-8 validity
diff --git a/tests/JavaTest.java b/tests/JavaTest.java
index 5ea9baad8..afa1970af 100644
--- a/tests/JavaTest.java
+++ b/tests/JavaTest.java
@@ -12,10 +12,12 @@ import com.google.flatbuffers.FlexBuffers;
 import com.google.flatbuffers.FlexBuffersBuilder;
 import com.google.flatbuffers.StringVector;
 import com.google.flatbuffers.UnionVector;
+
 import com.google.flatbuffers.FlexBuffers.FlexBufferException;
 import com.google.flatbuffers.FlexBuffers.Reference;
 import com.google.flatbuffers.FlexBuffers.Vector;
 import com.google.flatbuffers.ArrayReadWriteBuf;
+import com.google.flatbuffers.FlexBuffers.KeyVector;
 
 import java.io.*;
 import java.math.BigInteger;
@@ -1024,6 +1026,41 @@ class JavaTest {
             // It should throw exception
         }
     }
+
+    public static void testFlexBuffersUtf8Map() {
+        FlexBuffersBuilder builder = new FlexBuffersBuilder(ByteBuffer.allocate(512),
+                FlexBuffersBuilder.BUILDER_FLAG_SHARE_KEYS_AND_STRINGS);
+
+        String key0 = "😨 face1";
+        String key1 = "😩 face2";
+        String key2 = "😨 face3";
+        String key3 = "trademark ®";
+        String key4 = "€ euro";
+        String utf8keys[] = { "😨 face1", "😩 face2", "😨 face3", "trademark ®", "€ euro"};
+
+        int map = builder.startMap();
+
+        for (int i=0; i< utf8keys.length; i++) {
+            builder.putString(utf8keys[i], utf8keys[i]);  // Testing key and string reuse.
+        }
+        builder.endMap(null, map);
+        builder.finish();
+
+        FlexBuffers.Map m = FlexBuffers.getRoot(builder.getBuffer()).asMap();
+
+        TestEq(m.size(), 5);
+
+        KeyVector kv = m.keys();
+        for (int i=0; i< utf8keys.length; i++) {
+            TestEq(kv.get(i).toString(), m.get(i).asString());
+        }
+
+        TestEq(m.get(key0).asString(), utf8keys[0]);
+        TestEq(m.get(key1).asString(), utf8keys[1]);
+        TestEq(m.get(key2).asString(), utf8keys[2]);
+        TestEq(m.get(key3).asString(), utf8keys[3]);
+        TestEq(m.get(key4).asString(), utf8keys[4]);
+    }
 
     public static void TestFlexBuffers() {
         testSingleElementByte();
@@ -1047,6 +1084,7 @@ class JavaTest {
         testFlexBufferVectorStrings();
         testDeprecatedTypedVectorString();
         testBuilderGrowth();
+        testFlexBuffersUtf8Map();
     }
 
     static void TestVectorOfBytes() {