From 958fc6ec49199ba47dd2492bef51845a53ec27e2 Mon Sep 17 00:00:00 2001 From: Wouter van Oortmerssen Date: Mon, 29 Feb 2016 15:47:46 -0800 Subject: [PATCH] Added support for easy string pooling. Change-Id: I790cf681c1bffff800d77afb0e2f908d1c827679 Tested: on Linux. Bug: 26186542 --- include/flatbuffers/flatbuffers.h | 80 +++++++++++++++++++++++++++++-- include/flatbuffers/reflection.h | 6 ++- src/reflection.cpp | 11 +++-- tests/test.cpp | 20 +++++--- 4 files changed, 102 insertions(+), 15 deletions(-) diff --git a/include/flatbuffers/flatbuffers.h b/include/flatbuffers/flatbuffers.h index 14a055e8b..ff9f7269d 100644 --- a/include/flatbuffers/flatbuffers.h +++ b/include/flatbuffers/flatbuffers.h @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -499,7 +500,7 @@ class vector_downward { return cur_; } - uint8_t *data_at(size_t offset) { return buf_ + reserved_ - offset; } + uint8_t *data_at(size_t offset) const { return buf_ + reserved_ - offset; } // push() & fill() are most frequently called with small byte counts (<= 4), // which is why we're using loops rather than calling memcpy/memset. @@ -565,12 +566,17 @@ FLATBUFFERS_FINAL_CLASS explicit FlatBufferBuilder(uoffset_t initial_size = 1024, const simple_allocator *allocator = nullptr) : buf_(initial_size, allocator ? *allocator : default_allocator), - nested(false), finished(false), minalign_(1), force_defaults_(false) { + nested(false), finished(false), minalign_(1), force_defaults_(false), + string_pool(nullptr) { offsetbuf_.reserve(16); // Avoid first few reallocs. vtables_.reserve(16); EndianCheck(); } + ~FlatBufferBuilder() { + if (string_pool) delete string_pool; + } + /// @brief Reset all the state in this FlatBufferBuilder so it can be reused /// to construct another buffer. 
void Clear() { @@ -580,6 +586,7 @@ FLATBUFFERS_FINAL_CLASS finished = false; vtables_.clear(); minalign_ = 1; + if (string_pool) string_pool->clear(); } /// @brief The current size of the serialized buffer, counting from the end. @@ -829,7 +836,7 @@ FLATBUFFERS_FINAL_CLASS return Offset(GetSize()); } - /// @brief Store a string in the buffer, which can contain any binary data. + /// @brief Store a string in the buffer, which is null-terminated. /// @param[in] str A const char pointer to a C-string to add to the buffer. /// @return Returns the offset in the buffer where the string starts. Offset CreateString(const char *str) { @@ -850,6 +857,58 @@ FLATBUFFERS_FINAL_CLASS return CreateString(str->c_str(), str->Length()); } + /// @brief Store a string in the buffer, which can contain any binary data. + /// If a string with this exact contents has already been serialized before, + /// instead simply returns the offset of the existing string. + /// @param[in] str A const char pointer to the data to be stored as a string. + /// @param[in] len The number of bytes that should be stored from `str`. + /// @return Returns the offset in the buffer where the string starts. + Offset CreateSharedString(const char *str, size_t len) { + if (!string_pool) + string_pool = new StringOffsetMap(StringOffsetCompare(buf_)); + auto size_before_string = buf_.size(); + // Must first serialize the string, since the set is all offsets into + // buffer. + auto off = CreateString(str, len); + auto it = string_pool->find(off); + // If it exists we reuse existing serialized data! + if (it != string_pool->end()) { + // We can remove the string we serialized. + buf_.pop(buf_.size() - size_before_string); + return *it; + } + // Record this string for future use. + string_pool->insert(off); + return off; + } + + /// @brief Store a string in the buffer, which is null-terminated. 
+ /// If a string with this exact contents has already been serialized before, + /// instead simply returns the offset of the existing string. + /// @param[in] str A const char pointer to a C-string to add to the buffer. + /// @return Returns the offset in the buffer where the string starts. + Offset CreateSharedString(const char *str) { + return CreateSharedString(str, strlen(str)); + } + + /// @brief Store a string in the buffer, which can contain any binary data. + /// If a string with this exact contents has already been serialized before, + /// instead simply returns the offset of the existing string. + /// @param[in] str A const reference to a std::string to store in the buffer. + /// @return Returns the offset in the buffer where the string starts. + Offset CreateSharedString(const std::string &str) { + return CreateSharedString(str.c_str(), str.length()); + } + + /// @brief Store a string in the buffer, which can contain any binary data. + /// If a string with this exact contents has already been serialized before, + /// instead simply returns the offset of the existing string. + /// @param[in] str A const pointer to a `String` struct to add to the buffer. + /// @return Returns the offset in the buffer where the string starts + Offset CreateSharedString(const String *str) { + return CreateSharedString(str->c_str(), str->Length()); + } + /// @cond FLATBUFFERS_INTERNAL uoffset_t EndVector(size_t len) { assert(nested); // Hit if no corresponding StartVector. @@ -1048,6 +1107,21 @@ FLATBUFFERS_FINAL_CLASS size_t minalign_; bool force_defaults_; // Serialize values equal to their defaults anyway. 
+ + struct StringOffsetCompare { + StringOffsetCompare(const vector_downward &buf) : buf_(buf) {} + bool operator() (const Offset &a, const Offset &b) const { + auto stra = reinterpret_cast(buf_.data_at(a.o)); + auto strb = reinterpret_cast(buf_.data_at(b.o)); + return strncmp(stra->c_str(), strb->c_str(), + std::min(stra->size(), strb->size()) + 1) < 0; + } + const vector_downward &buf_; + }; + + // For use with CreateSharedString. Instantiated on first use only. + typedef std::set, StringOffsetCompare> StringOffsetMap; + StringOffsetMap *string_pool; }; /// @} diff --git a/include/flatbuffers/reflection.h b/include/flatbuffers/reflection.h index cb18b1437..ababe6ad4 100644 --- a/include/flatbuffers/reflection.h +++ b/include/flatbuffers/reflection.h @@ -415,12 +415,14 @@ inline bool SetFieldT(Table *table, const reflection::Field &field, // above resizing functionality has introduced garbage in a buffer you want // to remove. // Note: this does not deal with DAGs correctly. If the table passed forms a -// DAG, the copy will be a tree instead (with duplicates). +// DAG, the copy will be a tree instead (with duplicates). Strings can be +// shared however, by passing true for use_string_pooling. Offset CopyTable(FlatBufferBuilder &fbb, const reflection::Schema &schema, const reflection::Object &objectdef, - const Table &table); + const Table &table, + bool use_string_pooling = false); } // namespace flatbuffers diff --git a/src/reflection.cpp b/src/reflection.cpp index d82c046cc..ff499ae99 100644 --- a/src/reflection.cpp +++ b/src/reflection.cpp @@ -354,7 +354,8 @@ void CopyInline(FlatBufferBuilder &fbb, const reflection::Field &fielddef, Offset CopyTable(FlatBufferBuilder &fbb, const reflection::Schema &schema, const reflection::Object &objectdef, - const Table &table) { + const Table &table, + bool use_string_pooling) { // Before we can construct the table, we have to first generate any // subobjects, and collect their offsets. 
std::vector offsets; @@ -366,7 +367,9 @@ Offset CopyTable(FlatBufferBuilder &fbb, uoffset_t offset = 0; switch (fielddef.type()->base_type()) { case reflection::String: { - offset = fbb.CreateString(GetFieldS(table, fielddef)).o; + offset = use_string_pooling + ? fbb.CreateSharedString(GetFieldS(table, fielddef)).o + : fbb.CreateString(GetFieldS(table, fielddef)).o; break; } case reflection::Obj: { @@ -395,7 +398,9 @@ Offset CopyTable(FlatBufferBuilder &fbb, std::vector> elements(vec->size()); auto vec_s = reinterpret_cast> *>(vec); for (uoffset_t i = 0; i < vec_s->size(); i++) { - elements[i] = fbb.CreateString(vec_s->Get(i)).o; + elements[i] = use_string_pooling + ? fbb.CreateSharedString(vec_s->Get(i)).o + : fbb.CreateString(vec_s->Get(i)).o; } offset = fbb.CreateVector(elements).o; break; diff --git a/tests/test.cpp b/tests/test.cpp index fce249e99..52da4518f 100644 --- a/tests/test.cpp +++ b/tests/test.cpp @@ -113,11 +113,13 @@ flatbuffers::unique_ptr_t CreateFlatBufferTest(std::string &buffer) { mb3.add_name(wilma); mlocs[2] = mb3.Finish(); - // Create an array of strings: - flatbuffers::Offset strings[2]; - strings[0] = builder.CreateString("bob"); - strings[1] = builder.CreateString("fred"); - auto vecofstrings = builder.CreateVector(strings, 2); + // Create an array of strings. Also test string pooling. 
+ flatbuffers::Offset strings[4]; + strings[0] = builder.CreateSharedString("bob"); + strings[1] = builder.CreateSharedString("fred"); + strings[2] = builder.CreateSharedString("bob"); + strings[3] = builder.CreateSharedString("fred"); + auto vecofstrings = builder.CreateVector(strings, 4); // Create an array of sorted tables, can be used with binary search when read: auto vecoftables = builder.CreateVectorOfSortedTables(mlocs, 3); @@ -188,9 +190,12 @@ void AccessFlatBufferTest(const uint8_t *flatbuf, size_t length) { // Example of accessing a vector of strings: auto vecofstrings = monster->testarrayofstring(); - TEST_EQ(vecofstrings->Length(), 2U); + TEST_EQ(vecofstrings->Length(), 4U); TEST_EQ_STR(vecofstrings->Get(0)->c_str(), "bob"); TEST_EQ_STR(vecofstrings->Get(1)->c_str(), "fred"); + // These should have pointer equality because of string pooling. + TEST_EQ(vecofstrings->Get(0)->c_str(), vecofstrings->Get(2)->c_str()); + TEST_EQ(vecofstrings->Get(1)->c_str(), vecofstrings->Get(3)->c_str()); // Example of accessing a vector of tables: auto vecoftables = monster->testarrayoftables(); @@ -420,7 +425,8 @@ void ReflectionTest(uint8_t *flatbuf, size_t length) { // either part or whole. flatbuffers::FlatBufferBuilder fbb; auto root_offset = flatbuffers::CopyTable(fbb, schema, *root_table, - *flatbuffers::GetAnyRoot(flatbuf)); + *flatbuffers::GetAnyRoot(flatbuf), + true); fbb.Finish(root_offset, MonsterIdentifier()); // Test that it was copied correctly: AccessFlatBufferTest(fbb.GetBufferPointer(), fbb.GetSize());