2020-01-17 14:24:15 +00:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
# Copyright 2020 Google Inc.
|
|
|
|
#
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
#
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
#
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
# limitations under the License.
|
|
|
|
#
|
|
|
|
################################################################################
|
|
|
|
"""Script for collecting dataflow traces using DFSan compiled binary. The script
|
|
|
|
imitates `CollectDataFlow` function from libFuzzer but provides some flexibility
|
|
|
|
for skipping long and/or slow corpus elements.
|
|
|
|
|
|
|
|
Follow https://github.com/google/oss-fuzz/issues/1632 for more details."""
|
|
|
|
import hashlib
|
|
|
|
import os
|
|
|
|
import subprocess
|
|
|
|
import sys
|
|
|
|
|
|
|
|
# These can be controlled by the runner in order to change the values without
# rebuilding OSS-Fuzz base images.
FILE_SIZE_LIMIT = int(os.getenv('DFT_FILE_SIZE_LIMIT', 32 * 1024))  # Max corpus element size traced, in bytes.
MIN_TIMEOUT = float(os.getenv('DFT_MIN_TIMEOUT', 1.0))  # Base per-file timeout, in seconds.
TIMEOUT_RANGE = float(os.getenv('DFT_TIMEOUT_RANGE', 3.0))  # Extra timeout added proportionally to file size, in seconds.

# Runtime options exported to the DFSan-instrumented target via the
# DFSAN_OPTIONS environment variable (see main()).
DFSAN_OPTIONS = 'fast16labels=1:warn_unimplemented=0'
|
|
|
|
|
|
|
|
|
|
|
|
def _error(msg):
|
|
|
|
sys.stderr.write(msg + '\n')
|
|
|
|
|
|
|
|
|
|
|
|
def _list_dir(dirpath):
|
|
|
|
for root, _, files in os.walk(dirpath):
|
|
|
|
for f in files:
|
|
|
|
yield os.path.join(root, f)
|
|
|
|
|
|
|
|
|
|
|
|
def _sha1(filepath):
|
|
|
|
h = hashlib.sha1()
|
|
|
|
with open(filepath, 'rb') as f:
|
|
|
|
h.update(f.read())
|
|
|
|
return h.hexdigest()
|
|
|
|
|
|
|
|
|
|
|
|
def _run(cmd, timeout=None):
|
|
|
|
result = None
|
|
|
|
try:
|
|
|
|
result = subprocess.run(cmd,
|
|
|
|
timeout=timeout,
|
|
|
|
stdout=subprocess.PIPE,
|
|
|
|
stderr=subprocess.PIPE)
|
|
|
|
if result.returncode:
|
|
|
|
_error('{command} finished with non-zero code: {code}'.format(
|
|
|
|
command=str(cmd), code=result.returncode))
|
|
|
|
|
|
|
|
except subprocess.TimeoutExpired:
|
|
|
|
raise
|
|
|
|
except Exception as e:
|
|
|
|
_error('Exception: ' + str(e))
|
|
|
|
|
|
|
|
return result
|
|
|
|
|
|
|
|
|
|
|
|
def _timeout(size):
  """Return a per-file timeout proportional to |size| in order to discard
  slow units.

  Scales linearly from MIN_TIMEOUT (zero-byte file) up to
  MIN_TIMEOUT + TIMEOUT_RANGE for a file of exactly FILE_SIZE_LIMIT bytes.
  """
  return MIN_TIMEOUT + size * TIMEOUT_RANGE / FILE_SIZE_LIMIT
|
|
|
|
|
|
|
|
|
|
|
|
def collect_traces(binary, corpus_dir, dft_dir):
  """Collect a dataflow trace for every eligible corpus element.

  Args:
    binary: Path to the DFSan-instrumented target; invoked as
        `binary <input> <output>`.
    corpus_dir: Directory tree containing corpus files to trace.
    dft_dir: Directory where trace files (named by the input's SHA-1) go.

  Returns:
    A dict of counters: 'total' (all inputs seen), 'traced' (successful),
    'long' (over FILE_SIZE_LIMIT, skipped), 'slow' (timed out), and
    'failed' (non-zero exit or failed to launch). Every input counted in
    'total' lands in exactly one of the other buckets, which main() relies
    on for its checksum.
  """
  stats = {
      'total': 0,
      'traced': 0,
      'long': 0,
      'slow': 0,
      'failed': 0,
  }

  files_and_sizes = {}
  for f in _list_dir(corpus_dir):
    stats['total'] += 1
    size = os.path.getsize(f)
    if size > FILE_SIZE_LIMIT:
      stats['long'] += 1
      print('Skipping large file ({size}b): {path}'.format(size=size, path=f))
      continue
    files_and_sizes[f] = size

  # Process files smallest-first so the quick units get traced even if the
  # overall run is interrupted or budget-limited.
  for f in sorted(files_and_sizes, key=files_and_sizes.get):
    output_path = os.path.join(dft_dir, _sha1(f))
    try:
      # Bug fix: the timeout must be based on *this* file's size. Previously
      # the stale `size` left over from the last iteration of the sizing loop
      # above was used for every file.
      result = _run([binary, f, output_path],
                    timeout=_timeout(files_and_sizes[f]))
      # _run returns None when the process could not be launched at all
      # (e.g. missing binary); count that as a failure rather than crashing.
      if result is None or result.returncode:
        stats['failed'] += 1
      else:
        stats['traced'] += 1
    except subprocess.TimeoutExpired as e:
      _error('Slow input: ' + str(e))
      stats['slow'] += 1

  return stats
|
|
|
|
|
|
|
|
|
|
|
|
def dump_functions(binary, dft_dir):
  """Run |binary| with no arguments and store its stdout in functions.txt.

  Returns:
    True when the binary ran and exited with status zero; False otherwise.
  """
  run_result = _run([binary])
  if run_result is None or run_result.returncode:
    return False

  functions_path = os.path.join(dft_dir, 'functions.txt')
  with open(functions_path, 'wb') as output_file:
    output_file.write(run_result.stdout)

  return True
|
|
|
|
|
|
|
|
|
|
|
|
def main():
  """CLI entry point: collect_dft <binary> <corpus_dir> <dft_dir>."""
  if len(sys.argv) < 4:
    _error('Usage: {0} <binary> <corpus_dir> <dft_dir>'.format(sys.argv[0]))
    sys.exit(1)

  binary, corpus_dir, dft_dir = sys.argv[1:4]

  # Configure DFSan before the instrumented target is launched.
  os.environ['DFSAN_OPTIONS'] = DFSAN_OPTIONS

  if not dump_functions(binary, dft_dir):
    _error('Failed to dump functions. Something is wrong.')
    sys.exit(1)

  stats = collect_traces(binary, corpus_dir, dft_dir)
  for key, value in stats.items():
    print('{0}: {1}'.format(key, value))

  # Checksum that we didn't lose track of any of the inputs.
  assert stats['total'] * 2 == sum(v for v in stats.values())
  sys.exit(0)
|
|
|
|
|
|
|
|
|
|
|
|
# Script entry point; keeps the module importable without side effects.
if __name__ == "__main__":
  main()
|