From 3e8f15df908c31b4c52308b621fed490cbac2ccc Mon Sep 17 00:00:00 2001 From: Wouter van Oortmerssen Date: Mon, 23 Dec 2019 11:12:41 -0800 Subject: [PATCH] Fix for FlexBuffers FBT_VECTOR_STRING size bit-width. For details, test.cpp/FlexBuffersDeprecatedTest(), and also https://github.com/google/flatbuffers/issues/5627 Change-Id: I6e86e1138a5777e31055cfa2f79276d44732efbc --- include/flatbuffers/flexbuffers.h | 58 +++++++++++++++++++++------- tests/test.cpp | 63 +++++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+), 13 deletions(-) diff --git a/include/flatbuffers/flexbuffers.h b/include/flatbuffers/flexbuffers.h index 870bc83a2..1bc3e0f3b 100644 --- a/include/flatbuffers/flexbuffers.h +++ b/include/flatbuffers/flexbuffers.h @@ -65,7 +65,9 @@ enum Type { FBT_VECTOR_UINT = 12, FBT_VECTOR_FLOAT = 13, FBT_VECTOR_KEY = 14, - FBT_VECTOR_STRING = 15, + // DEPRECATED, use FBT_VECTOR or FBT_VECTOR_KEY instead. + // Read test.cpp/FlexBuffersDeprecatedTest() for details on why. + FBT_VECTOR_STRING_DEPRECATED = 15, FBT_VECTOR_INT2 = 16, // Typed tuple (no type table, no size field). FBT_VECTOR_UINT2 = 17, FBT_VECTOR_FLOAT2 = 18, @@ -88,7 +90,7 @@ inline bool IsTypedVectorElementType(Type t) { } inline bool IsTypedVector(Type t) { - return (t >= FBT_VECTOR_INT && t <= FBT_VECTOR_STRING) || + return (t >= FBT_VECTOR_INT && t <= FBT_VECTOR_STRING_DEPRECATED) || t == FBT_VECTOR_BOOL; } @@ -212,26 +214,40 @@ class Object { uint8_t byte_width_; }; -// Stores size in `byte_width_` bytes before data_ pointer. +// Object that has a size, obtained either from size prefix, or elsewhere. class Sized : public Object { public: - Sized(const uint8_t *data, uint8_t byte_width) : Object(data, byte_width) {} - size_t size() const { + // Size prefix. + Sized(const uint8_t *data, uint8_t byte_width) : + Object(data, byte_width), size_(read_size()) {} + // Manual size. + Sized(const uint8_t *data, uint8_t byte_width, size_t sz) : + Object(data, byte_width), size_(sz) {} + size_t size() const { return size_; } + // Access size stored in `byte_width_` bytes before data_ pointer. + size_t read_size() const { return static_cast(ReadUInt64(data_ - byte_width_, byte_width_)); } + protected: + size_t size_; }; class String : public Sized { public: - String(const uint8_t *data, uint8_t byte_width) : Sized(data, byte_width) {} + // Size prefix. + String(const uint8_t *data, uint8_t byte_width) + : Sized(data, byte_width) {} + // Manual size. + String(const uint8_t *data, uint8_t byte_width, size_t sz) + : Sized(data, byte_width, sz) {} size_t length() const { return size(); } const char *c_str() const { return reinterpret_cast(data_); } - std::string str() const { return std::string(c_str(), length()); } + std::string str() const { return std::string(c_str(), size()); } static String EmptyString() { - static const uint8_t empty_string[] = { 0 /*len*/, 0 /*terminator*/ }; - return String(empty_string + 1, 1); + static const char *empty_string = ""; + return String(reinterpret_cast(empty_string), 1, 0); } bool IsTheEmptyString() const { return data_ == EmptyString().data_; } }; @@ -279,6 +295,7 @@ class TypedVector : public Sized { Type ElementType() { return type_; } + friend Reference; private: Type type_; @@ -487,17 +504,22 @@ class Reference { float AsFloat() const { return static_cast(AsDouble()); } const char *AsKey() const { - if (type_ == FBT_KEY) { + if (type_ == FBT_KEY || type_ == FBT_STRING) { return reinterpret_cast(Indirect()); } else { return ""; } } - // This function returns the empty string if you try to read a not-string. + // This function returns the empty string if you try to read something that + // is not a string or key. String AsString() const { if (type_ == FBT_STRING) { return String(Indirect(), byte_width_); + } else if (type_ == FBT_KEY) { + auto key = Indirect(); + return String(key, byte_width_, + strlen(reinterpret_cast(key))); } else { return String::EmptyString(); } @@ -588,8 +610,18 @@ class Reference { TypedVector AsTypedVector() const { if (IsTypedVector()) { - return TypedVector(Indirect(), byte_width_, - ToTypedVectorElementType(type_)); + auto tv = TypedVector(Indirect(), byte_width_, + ToTypedVectorElementType(type_)); + if (tv.type_ == FBT_STRING) { + // These can't be accessed as strings, since we don't know the bit-width + // of the size field, see the declaration of + // FBT_VECTOR_STRING_DEPRECATED above for details. + // We change the type here to be keys, which are a subtype of strings, + // and will ignore the size field. This will truncate strings with + // embedded nulls. + tv.type_ = FBT_KEY; + } + return tv; } else { return TypedVector::EmptyTypedVector(); } diff --git a/tests/test.cpp b/tests/test.cpp index 80c18768e..cc6086a9d 100644 --- a/tests/test.cpp +++ b/tests/test.cpp @@ -2813,6 +2813,68 @@ void FlexBuffersTest() { TEST_EQ_STR(jsontest, jsonback.c_str()); } +void FlexBuffersDeprecatedTest() { + // FlexBuffers as originally designed had a flaw involving the + // FBT_VECTOR_STRING datatype, and this test documents/tests the fix for it. + // Discussion: https://github.com/google/flatbuffers/issues/5627 + flexbuffers::Builder slb; + // FBT_VECTOR_* are "typed vectors" where all elements are of the same type. + // Problem is, when storing FBT_STRING elements, it relies on that type to + // get the bit-width for the size field of the string, which in this case + // isn't present, and instead defaults to 8-bit. This means that any strings + // stored inside such a vector, when accessed thru the old API that returns + // a String reference, will appear to be truncated if the string stored is + // actually >=256 bytes. + std::string test_data(300, 'A'); + auto start = slb.StartVector(); + // This one will have a 16-bit size field. + slb.String(test_data); + // This one will have an 8-bit size field. + slb.String("hello"); + // We're asking this to be serialized as a typed vector (true), but not + // fixed size (false). The type will be FBT_VECTOR_STRING with a bit-width + // of whatever the offsets in the vector need, the bit-widths of the strings + // are not stored(!) <- the actual design flaw. + // Note that even in the fixed code, we continue to serialize the elements of + // FBT_VECTOR_STRING as FBT_STRING, since there may be old code out there + // reading new data that we want to continue to function. + // Thus, FBT_VECTOR_STRING, while deprecated, will always be represented the + // same way, the fix lies on the reading side. + slb.EndVector(start, true, false); + slb.Finish(); + // So now lets read this data back. + // For existing data, since we have no way of knowing what the actual + // bit-width of the size field of the string is, we are going to ignore this + // field, and instead treat these strings as FBT_KEY (null-terminated), so we + // can deal with strings of arbitrary length. This of course truncates strings + // with embedded nulls, but we think that that is preferrable over truncating + // strings >= 256 bytes. + auto vec = flexbuffers::GetRoot(slb.GetBuffer()).AsTypedVector(); + // Even though this was serialized as FBT_VECTOR_STRING, it is read as + // FBT_VECTOR_KEY: + TEST_EQ(vec.ElementType(), flexbuffers::FBT_KEY); + // Access the long string. Previously, this would return a string of size 1, + // since it would read the high-byte of the 16-bit length. + // This should now correctly test the full 300 bytes, using AsKey(): + TEST_EQ_STR(vec[0].AsKey(), test_data.c_str()); + // Old code that called AsString will continue to work, as the String + // accessor objects now use a cached size that can come from a key as well. + TEST_EQ_STR(vec[0].AsString().c_str(), test_data.c_str()); + // Short strings work as before: + TEST_EQ_STR(vec[1].AsKey(), "hello"); + TEST_EQ_STR(vec[1].AsString().c_str(), "hello"); + // So, while existing code and data mostly "just work" with the fixes applied + // to AsTypedVector and AsString, what do you do going forward? + // Code accessing existing data doesn't necessarily need to change, though + // you could consider using AsKey instead of AsString for a) documenting + // that you are accessing keys, or b) a speedup if you don't actually use + // the string size. + // For new data, or data that doesn't need to be backwards compatible, + // instead serialize as FBT_VECTOR (call EndVector with typed = false, then + // read elements with AsString), or, for maximum compactness, use + // FBT_VECTOR_KEY (call slb.Key above instead, read with AsKey or AsString). +} + void TypeAliasesTest() { flatbuffers::FlatBufferBuilder builder; @@ -3237,6 +3299,7 @@ int FlatBufferTests() { JsonDefaultTest(); JsonEnumsTest(); FlexBuffersTest(); + FlexBuffersDeprecatedTest(); UninitializedVectorTest(); EqualOperatorTest(); NumericUtilsTest();