From 06224212901a9d9b63fe6cab11e9d6d7de4c8153 Mon Sep 17 00:00:00 2001
From: Stefan Bucur <281483+stefanbucur@users.noreply.github.com>
Date: Wed, 21 Nov 2018 15:21:56 -0500
Subject: [PATCH] [libxml2] Add a libXML fuzzer that exercises its file-based
 parsing interface. (#1967)

---
 projects/libxml2/Dockerfile                   |   4 +-
 projects/libxml2/build.sh                     |   2 +-
 projects/libxml2/byte_stream.h                | 128 ++++++++++++++++++
 projects/libxml2/fuzzer_temp_file.h           |  81 +++++++++++
 .../libxml2_xml_reader_for_file_fuzzer.cc     |  50 +++++++
 5 files changed, 261 insertions(+), 4 deletions(-)
 create mode 100644 projects/libxml2/byte_stream.h
 create mode 100644 projects/libxml2/fuzzer_temp_file.h
 create mode 100644 projects/libxml2/libxml2_xml_reader_for_file_fuzzer.cc
diff --git a/projects/libxml2/Dockerfile b/projects/libxml2/Dockerfile
index 3f144d2ab..fc821c806 100644
--- a/projects/libxml2/Dockerfile
+++ b/projects/libxml2/Dockerfile
@@ -22,6 +22,4 @@ RUN git clone --depth 1 https://gitlab.gnome.org/GNOME/libxml2.git
 WORKDIR libxml2
 
 COPY build.sh $SRC/
-COPY libxml2_xml_read_memory_fuzzer.* \
-     libxml2_xml_regexp_compile_fuzzer.* \
-     xml.dict $SRC/
+COPY *.cc *.h *.options *.dict $SRC/
diff --git a/projects/libxml2/build.sh b/projects/libxml2/build.sh
index bbfcf4181..da97cad6c 100755
--- a/projects/libxml2/build.sh
+++ b/projects/libxml2/build.sh
@@ -21,7 +21,7 @@
 make -j$(nproc) clean
 make -j$(nproc) all
 
-for fuzzer in libxml2_xml_read_memory_fuzzer libxml2_xml_regexp_compile_fuzzer; do
+for fuzzer in libxml2_xml_read_memory_fuzzer libxml2_xml_reader_for_file_fuzzer libxml2_xml_regexp_compile_fuzzer; do
   $CXX $CXXFLAGS -std=c++11 -Iinclude/ \
       $SRC/$fuzzer.cc -o $OUT/$fuzzer \
       -lFuzzingEngine .libs/libxml2.a
diff --git a/projects/libxml2/byte_stream.h b/projects/libxml2/byte_stream.h
new file mode 100644
index 000000000..6a4257891
--- /dev/null
+++ b/projects/libxml2/byte_stream.h
@@ -0,0 +1,128 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef BYTE_STREAM_H_
+#define BYTE_STREAM_H_
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <string>
+
+// Wrapper for fuzzer input strings that helps consume and interpret the data
+// as a sequence of values, such as strings and PODs.
+class ByteStream {
+ public:
+  // Wraps the bytes [data, data + size). Does not take ownership of data.
+  ByteStream(const uint8_t* data, size_t size)
+      : data_(data), size_(size), position_(0) {}
+
+  ByteStream(const ByteStream&) = delete;
+  ByteStream& operator=(const ByteStream&) = delete;
+
+  // Returns a string. Strings are obtained from the byte stream by reading a
+  // size_t N followed by N char elements. If there are fewer than N bytes left
+  // in the stream, this returns as many bytes as are available.
+  std::string GetNextString();
+
+  // The following GetNext{integer type} functions all return the next
+  // sizeof(integer type) bytes in the stream. If fewer bytes remain, they
+  // return 0 and consume whatever is left, so later reads see an empty stream.
+  size_t GetNextSizeT() { return ConsumeCopyOrDefault<size_t>(0); }
+  int GetNextInt() { return ConsumeCopyOrDefault<int>(0); }
+  uint8_t GetNextUint8() { return ConsumeCopyOrDefault<uint8_t>(0); }
+  int64_t GetNextInt64() { return ConsumeCopyOrDefault<int64_t>(0); }
+
+  // Returns an integer in the range [0,n) for n > 0 and consumes up to
+  // sizeof(int) bytes. For n<=0, returns 0 and consumes 0 bytes.
+  int GetNextInt(int n);
+
+ private:
+  // The remaining capacity of the ByteStream, in bytes.
+  size_t capacity() const { return size_ - position_; }
+
+  // Returns data_ + position_ and then advances position_ by requested bytes.
+  //
+  // This is the canonical way for the class to request regions of memory
+  // or to advance the position by requested bytes. This operation is unchecked
+  // for maintaining that position_ <= size_; callers must clamp requested to
+  // capacity() themselves. Requesting 0 bytes always succeeds.
+  const uint8_t* UncheckedConsume(size_t requested) {
+    const uint8_t* region = data_ + position_;
+    position_ += requested;
+    return region;
+  }
+
+  // Initializes *result by copying sizeof(T) bytes out of the stream. If the
+  // stream is too short, *result is left unmodified and the remainder of the
+  // stream is consumed. Copies via std::copy_n, which <algorithm> provides.
+  template <class T>
+  void ConsumeBytesByCopy(T* result) {
+    constexpr size_t type_size = sizeof(T);
+    if (type_size <= capacity()) {
+      const uint8_t* region = UncheckedConsume(type_size);
+      std::copy_n(region, type_size, reinterpret_cast<uint8_t*>(result));
+    } else {
+      // Consume the remainder of data_.
+      UncheckedConsume(capacity());
+    }
+  }
+
+  // A helper function for using ConsumeBytesByCopy and returning a default
+  // value `t` if there is insufficient capacity to read a full `T`. T should
+  // probably be a primitive type.
+  template <class T>
+  T ConsumeCopyOrDefault(T t) {
+    ConsumeBytesByCopy(&t);
+    return t;
+  }
+
+  const uint8_t* data_;
+  const size_t size_;
+  size_t position_;
+};
+
+inline std::string ByteStream::GetNextString() {
+  const size_t wanted = GetNextSizeT();  // length prefix; 0 if it was cut short
+  const size_t length = std::min(wanted, capacity());  // clamp to what is left
+  const char* chars = reinterpret_cast<const char*>(UncheckedConsume(length));
+  return std::string(chars, length);
+}
+
+inline int ByteStream::GetNextInt(int n) {
+  // A non-positive bound has no valid range; report 0 and consume nothing.
+  if (n <= 0) return 0;
+
+  // n is usually a small constant, so read only the narrowest unsigned type
+  // that can cover it; only large bounds spend a full int of input.
+  int raw = 0;
+  if (n > std::numeric_limits<uint16_t>::max()) {
+    raw = GetNextInt();
+  } else if (n > std::numeric_limits<uint8_t>::max()) {
+    raw = ConsumeCopyOrDefault<uint16_t>(0);
+  } else {
+    raw = static_cast<int>(GetNextUint8());
+  }
+
+  // Fold raw onto the non-negative ints. INT_MIN cannot be negated without
+  // overflow, so it maps to 0 instead.
+  if (raw < 0) {
+    raw = (raw == std::numeric_limits<int>::min()) ? 0 : -raw;
+  }
+  // Reduce into [0, n).
+  return raw % n;
+}
+
+#endif  // BYTE_STREAM_H_
diff --git a/projects/libxml2/fuzzer_temp_file.h b/projects/libxml2/fuzzer_temp_file.h
new file mode 100644
index 000000000..fe25cabae
--- /dev/null
+++ b/projects/libxml2/fuzzer_temp_file.h
@@ -0,0 +1,81 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Adapter utility from fuzzer input to a temporary file, for fuzzing APIs that
+// require a file instead of an input buffer.
+
+#ifndef FUZZER_TEMP_FILE_H_
+#define FUZZER_TEMP_FILE_H_
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+// Pure-C interface for creating and cleaning up temporary files.
+
+// Dumps data into a new temp file; returns its malloc'd path or aborts.
+static char* fuzzer_get_tmpfile(const uint8_t* data, size_t size) {
+  char* filename_buffer = strdup("/tmp/generate_temporary_file.XXXXXX");
+  if (!filename_buffer) {
+    perror("Failed to allocate file name buffer.");
+    abort();
+  }
+  const int file_descriptor = mkstemp(filename_buffer);
+  if (file_descriptor < 0) {
+    perror("Failed to make temporary file.");
+    abort();
+  }
+  FILE* file = fdopen(file_descriptor, "wb");
+  if (!file) {
+    perror("Failed to open file descriptor.");
+    close(file_descriptor);
+    abort();
+  }
+  const size_t bytes_written = fwrite(data, sizeof(uint8_t), size, file);
+  // fwrite may only fill the stdio buffer, so fclose must succeed as well.
+  if (fclose(file) != 0 || bytes_written < size) {
+    fprintf(stderr, "Failed to write all bytes to file (%zu out of %zu)\n",
+            bytes_written, size);
+    abort();
+  }
+  return filename_buffer;
+}
+
+// Deletes the temp file (warning on failure) and frees the name buffer.
+static void fuzzer_release_tmpfile(char* filename) {
+  const int unlink_status = unlink(filename);
+  if (unlink_status != 0) perror("WARNING: Failed to delete temporary file.");
+  free(filename);
+}
+
+// C++ RAII object for creating temporary files.
+
+#ifdef __cplusplus
+class FuzzerTemporaryFile {  // Temp file created in ctor, removed in dtor.
+ public:
+  FuzzerTemporaryFile(const uint8_t* data, size_t size)
+      : filename_(fuzzer_get_tmpfile(data, size)) {}
+  // Non-copyable: a copy would unlink and free the same path twice.
+  FuzzerTemporaryFile(const FuzzerTemporaryFile&) = delete;
+  FuzzerTemporaryFile& operator=(const FuzzerTemporaryFile&) = delete;
+  ~FuzzerTemporaryFile() { fuzzer_release_tmpfile(filename_); }
+  const char* filename() const { return filename_; }  // owned by this object
+ private:
+  char* filename_;
+};
+#endif
+
+#endif  // FUZZER_TEMP_FILE_H_
diff --git a/projects/libxml2/libxml2_xml_reader_for_file_fuzzer.cc b/projects/libxml2/libxml2_xml_reader_for_file_fuzzer.cc
new file mode 100644
index 000000000..4f4cf6c35
--- /dev/null
+++ b/projects/libxml2/libxml2_xml_reader_for_file_fuzzer.cc
@@ -0,0 +1,50 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cstddef>
+#include <cstdint>
+#include <string>
+
+#include "byte_stream.h"
+#include "fuzzer_temp_file.h"
+
+#include "libxml/xmlreader.h"
+
+void ignore (void* ctx, const char* msg, ...) {
+  // No-op handler installed via xmlSetGenericErrorFunc to mute parser messages.
+}
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
+  // Mute libxml2 diagnostics by routing them to the no-op handler.
+  xmlSetGenericErrorFunc(nullptr, &ignore);
+
+  // Carve the input into parser options, an encoding name and a document.
+  ByteStream input(data, size);
+  const int parse_options = input.GetNextInt();
+  const std::string encoding_name = input.GetNextString();
+  const std::string document = input.GetNextString();
+  // The API under test takes a path, so spill the document to a temp file.
+  FuzzerTemporaryFile temp_file(
+      reinterpret_cast<const uint8_t*>(document.data()), document.size());
+  xmlTextReaderPtr reader = xmlReaderForFile(
+      temp_file.filename(), encoding_name.c_str(), parse_options);
+  // Visit each node for as long as libxml2 reports a successful read.
+  constexpr int kReadOk = 1;
+  while (xmlTextReaderRead(reader) == kReadOk) {
+    xmlTextReaderNodeType(reader);
+    xmlTextReaderConstValue(reader);
+  }
+  xmlFreeTextReader(reader);
+  return EXIT_SUCCESS;
+}