Fix for FlexBuffers FBT_VECTOR_STRING size bit-width.

For details, test.cpp/FlexBuffersDeprecatedTest(), and also
https://github.com/google/flatbuffers/issues/5627

Change-Id: I6e86e1138a5777e31055cfa2f79276d44732efbc
This commit is contained in:
Wouter van Oortmerssen 2019-12-23 11:12:41 -08:00
parent 602721a735
commit 3e8f15df90
2 changed files with 108 additions and 13 deletions

View File

@ -65,7 +65,9 @@ enum Type {
FBT_VECTOR_UINT = 12,
FBT_VECTOR_FLOAT = 13,
FBT_VECTOR_KEY = 14,
FBT_VECTOR_STRING = 15,
// DEPRECATED, use FBT_VECTOR or FBT_VECTOR_KEY instead.
// Read test.cpp/FlexBuffersDeprecatedTest() for details on why.
FBT_VECTOR_STRING_DEPRECATED = 15,
FBT_VECTOR_INT2 = 16, // Typed tuple (no type table, no size field).
FBT_VECTOR_UINT2 = 17,
FBT_VECTOR_FLOAT2 = 18,
@ -88,7 +90,7 @@ inline bool IsTypedVectorElementType(Type t) {
}
inline bool IsTypedVector(Type t) {
return (t >= FBT_VECTOR_INT && t <= FBT_VECTOR_STRING) ||
return (t >= FBT_VECTOR_INT && t <= FBT_VECTOR_STRING_DEPRECATED) ||
t == FBT_VECTOR_BOOL;
}
@ -212,26 +214,40 @@ class Object {
uint8_t byte_width_;
};
// Stores size in `byte_width_` bytes before data_ pointer.
// Object that has a size, obtained either from size prefix, or elsewhere.
class Sized : public Object {
public:
Sized(const uint8_t *data, uint8_t byte_width) : Object(data, byte_width) {}
size_t size() const {
// Size prefix.
Sized(const uint8_t *data, uint8_t byte_width) :
Object(data, byte_width), size_(read_size()) {}
// Manual size.
Sized(const uint8_t *data, uint8_t byte_width, size_t sz) :
Object(data, byte_width), size_(sz) {}
size_t size() const { return size_; }
// Access size stored in `byte_width_` bytes before data_ pointer.
size_t read_size() const {
return static_cast<size_t>(ReadUInt64(data_ - byte_width_, byte_width_));
}
protected:
size_t size_;
};
class String : public Sized {
public:
String(const uint8_t *data, uint8_t byte_width) : Sized(data, byte_width) {}
// Size prefix.
String(const uint8_t *data, uint8_t byte_width)
: Sized(data, byte_width) {}
// Manual size.
String(const uint8_t *data, uint8_t byte_width, size_t sz)
: Sized(data, byte_width, sz) {}
size_t length() const { return size(); }
const char *c_str() const { return reinterpret_cast<const char *>(data_); }
std::string str() const { return std::string(c_str(), length()); }
std::string str() const { return std::string(c_str(), size()); }
static String EmptyString() {
static const uint8_t empty_string[] = { 0 /*len*/, 0 /*terminator*/ };
return String(empty_string + 1, 1);
static const char *empty_string = "";
return String(reinterpret_cast<const uint8_t *>(empty_string), 1, 0);
}
bool IsTheEmptyString() const { return data_ == EmptyString().data_; }
};
@ -279,6 +295,7 @@ class TypedVector : public Sized {
Type ElementType() { return type_; }
friend Reference;
private:
Type type_;
@ -487,17 +504,22 @@ class Reference {
float AsFloat() const { return static_cast<float>(AsDouble()); }
const char *AsKey() const {
if (type_ == FBT_KEY) {
if (type_ == FBT_KEY || type_ == FBT_STRING) {
return reinterpret_cast<const char *>(Indirect());
} else {
return "";
}
}
// This function returns the empty string if you try to read a not-string.
// This function returns the empty string if you try to read something that
// is not a string or key.
String AsString() const {
if (type_ == FBT_STRING) {
return String(Indirect(), byte_width_);
} else if (type_ == FBT_KEY) {
auto key = Indirect();
return String(key, byte_width_,
strlen(reinterpret_cast<const char *>(key)));
} else {
return String::EmptyString();
}
@ -588,8 +610,18 @@ class Reference {
TypedVector AsTypedVector() const {
if (IsTypedVector()) {
return TypedVector(Indirect(), byte_width_,
ToTypedVectorElementType(type_));
auto tv = TypedVector(Indirect(), byte_width_,
ToTypedVectorElementType(type_));
if (tv.type_ == FBT_STRING) {
// These can't be accessed as strings, since we don't know the bit-width
// of the size field, see the declaration of
// FBT_VECTOR_STRING_DEPRECATED above for details.
// We change the type here to be keys, which are a subtype of strings,
// and will ignore the size field. This will truncate strings with
// embedded nulls.
tv.type_ = FBT_KEY;
}
return tv;
} else {
return TypedVector::EmptyTypedVector();
}

View File

@ -2813,6 +2813,68 @@ void FlexBuffersTest() {
TEST_EQ_STR(jsontest, jsonback.c_str());
}
void FlexBuffersDeprecatedTest() {
// FlexBuffers as originally designed had a flaw involving the
// FBT_VECTOR_STRING datatype, and this test documents/tests the fix for it.
// Discussion: https://github.com/google/flatbuffers/issues/5627
flexbuffers::Builder slb;
// FBT_VECTOR_* are "typed vectors" where all elements are of the same type.
// Problem is, when storing FBT_STRING elements, it relies on that type to
// get the bit-width for the size field of the string, which in this case
// isn't present, and instead defaults to 8-bit. This means that any strings
// stored inside such a vector, when accessed thru the old API that returns
// a String reference, will appear to be truncated if the string stored is
// actually >=256 bytes.
std::string test_data(300, 'A');
auto start = slb.StartVector();
// This one will have a 16-bit size field.
slb.String(test_data);
// This one will have an 8-bit size field.
slb.String("hello");
// We're asking this to be serialized as a typed vector (true), but not
// fixed size (false). The type will be FBT_VECTOR_STRING with a bit-width
// of whatever the offsets in the vector need, the bit-widths of the strings
// are not stored(!) <- the actual design flaw.
// Note that even in the fixed code, we continue to serialize the elements of
// FBT_VECTOR_STRING as FBT_STRING, since there may be old code out there
// reading new data that we want to continue to function.
// Thus, FBT_VECTOR_STRING, while deprecated, will always be represented the
// same way, the fix lies on the reading side.
slb.EndVector(start, true, false);
slb.Finish();
// So now lets read this data back.
// For existing data, since we have no way of knowing what the actual
// bit-width of the size field of the string is, we are going to ignore this
// field, and instead treat these strings as FBT_KEY (null-terminated), so we
// can deal with strings of arbitrary length. This of course truncates strings
// with embedded nulls, but we think that that is preferrable over truncating
// strings >= 256 bytes.
auto vec = flexbuffers::GetRoot(slb.GetBuffer()).AsTypedVector();
// Even though this was serialized as FBT_VECTOR_STRING, it is read as
// FBT_VECTOR_KEY:
TEST_EQ(vec.ElementType(), flexbuffers::FBT_KEY);
// Access the long string. Previously, this would return a string of size 1,
// since it would read the high-byte of the 16-bit length.
// This should now correctly test the full 300 bytes, using AsKey():
TEST_EQ_STR(vec[0].AsKey(), test_data.c_str());
// Old code that called AsString will continue to work, as the String
// accessor objects now use a cached size that can come from a key as well.
TEST_EQ_STR(vec[0].AsString().c_str(), test_data.c_str());
// Short strings work as before:
TEST_EQ_STR(vec[1].AsKey(), "hello");
TEST_EQ_STR(vec[1].AsString().c_str(), "hello");
// So, while existing code and data mostly "just work" with the fixes applied
// to AsTypedVector and AsString, what do you do going forward?
// Code accessing existing data doesn't necessarily need to change, though
// you could consider using AsKey instead of AsString for a) documenting
// that you are accessing keys, or b) a speedup if you don't actually use
// the string size.
// For new data, or data that doesn't need to be backwards compatible,
// instead serialize as FBT_VECTOR (call EndVector with typed = false, then
// read elements with AsString), or, for maximum compactness, use
// FBT_VECTOR_KEY (call slb.Key above instead, read with AsKey or AsString).
}
void TypeAliasesTest() {
flatbuffers::FlatBufferBuilder builder;
@ -3237,6 +3299,7 @@ int FlatBufferTests() {
JsonDefaultTest();
JsonEnumsTest();
FlexBuffersTest();
FlexBuffersDeprecatedTest();
UninitializedVectorTest();
EqualOperatorTest();
NumericUtilsTest();