diff --git a/infra/base-images/base-runner/Dockerfile b/infra/base-images/base-runner/Dockerfile index ae1c0e8c1..ea29e1f6e 100644 --- a/infra/base-images/base-runner/Dockerfile +++ b/infra/base-images/base-runner/Dockerfile @@ -40,8 +40,10 @@ RUN git clone https://chromium.googlesource.com/chromium/src/tools/code_coverage RUN pip3 install -r /opt/code_coverage/requirements.txt COPY bad_build_check \ + collect_dft \ coverage \ coverage_helper \ + dataflow_tracer.py \ download_corpus \ minijail0 \ reproduce \ diff --git a/infra/base-images/base-runner/collect_dft b/infra/base-images/base-runner/collect_dft new file mode 100755 index 000000000..e316c0dbf --- /dev/null +++ b/infra/base-images/base-runner/collect_dft @@ -0,0 +1,65 @@ +#!/bin/bash -u +# Copyright 2020 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +################################################################################ +cd $OUT + +if (( $# > 0 )); then + FUZZ_TARGETS="$@" +else + FUZZ_TARGETS="$(find . -maxdepth 1 -type f -executable -printf '%P\n')" +fi + +# Timeout for running a single fuzz target. +TIMEOUT=1h + +# Number of CPUs available, this is needed for running targets in parallel. +NPROC=$(nproc) + +function run_one_target { + local target=$1 + local corpus="/corpus/${target}" + local traces="$OUT/${target}_dft" + + # Put the logs in $OUT as well for debugging purposes. 
+ local log="$OUT/${target}_dft.log" + + rm -rf $traces && mkdir -p $traces + + timeout $TIMEOUT dataflow_tracer.py $OUT/$target $corpus $traces &> $log + if (( $? != 0 )); then + echo "Error occurred while collecting data flow traces for $target:" + cat $log + fi +} + +# Run each fuzz target, write data flow traces into corresponding dir in $OUT. +for fuzz_target in $FUZZ_TARGETS; do + # Skip binaries that do not seem to be fuzz targets. + grep "LLVMFuzzerTestOneInput" $fuzz_target > /dev/null 2>&1 || continue + + echo "Running $fuzz_target" + run_one_target $fuzz_target & + + # Do not spawn more processes than the number of CPUs available. + n_child_proc=$(jobs -rp | wc -l) + while [ "$n_child_proc" -eq "$NPROC" ]; do + sleep 4 + n_child_proc=$(jobs -rp | wc -l) + done +done + +# Wait for background processes to finish. +wait diff --git a/infra/base-images/base-runner/dataflow_tracer.py b/infra/base-images/base-runner/dataflow_tracer.py new file mode 100755 index 000000000..b157d66c8 --- /dev/null +++ b/infra/base-images/base-runner/dataflow_tracer.py @@ -0,0 +1,147 @@ +#!/usr/bin/env python3 +# Copyright 2020 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +################################################################################ +"""Script for collecting dataflow traces using DFSan compiled binary. The script +imitates `CollectDataFlow` function from libFuzzer but provides some flexibility +for skipping long and/or slow corpus elements. 
+ +Follow https://github.com/google/oss-fuzz/issues/1632 for more details.""" +import hashlib +import os +import subprocess +import sys + +# These can be controlled by the runner in order to change the values without +# rebuilding OSS-Fuzz base images. +FILE_SIZE_LIMIT = int(os.getenv('DFT_FILE_SIZE_LIMIT', 32 * 1024)) +MIN_TIMEOUT = float(os.getenv('DFT_MIN_TIMEOUT', 1.0)) +TIMEOUT_RANGE = float(os.getenv('DFT_TIMEOUT_RANGE', 3.0)) + +DFSAN_OPTIONS = 'fast16labels=1:warn_unimplemented=0' + + +def _error(msg): + sys.stderr.write(msg + '\n') + + +def _list_dir(dirpath): + for root, _, files in os.walk(dirpath): + for f in files: + yield os.path.join(root, f) + + +def _sha1(filepath): + h = hashlib.sha1() + with open(filepath, 'rb') as f: + h.update(f.read()) + return h.hexdigest() + + +def _run(cmd, timeout=None): + result = None + try: + result = subprocess.run(cmd, + timeout=timeout, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + if result.returncode: + _error('{command} finished with non-zero code: {code}'.format( + command=str(cmd), code=result.returncode)) + + except subprocess.TimeoutExpired: + raise + except Exception as e: + _error('Exception: ' + str(e)) + + return result + + +def _timeout(size): + # Dynamic timeout value (proportional to file size) to discard slow units. 
+ timeout = MIN_TIMEOUT + timeout += size * TIMEOUT_RANGE / FILE_SIZE_LIMIT + return timeout + + +def collect_traces(binary, corpus_dir, dft_dir): + stats = { + 'total': 0, + 'traced': 0, + 'long': 0, + 'slow': 0, + 'failed': 0, + } + + for f in _list_dir(corpus_dir): + stats['total'] += 1 + size = os.path.getsize(f) + if size > FILE_SIZE_LIMIT: + stats['long'] += 1 + print('Skipping large file ({size}b): {path}'.format(size=size, path=f)) + continue + + output_path = os.path.join(dft_dir, _sha1(f)) + try: + result = _run([binary, f, output_path], timeout=_timeout(size)) + if result.returncode: + stats['failed'] += 1 + else: + stats['traced'] += 1 + + except subprocess.TimeoutExpired as e: + _error('Slow input: ' + str(e)) + stats['slow'] += 1 + + return stats + + +def dump_functions(binary, dft_dir): + result = _run([binary]) + if not result or result.returncode: + return False + + with open(os.path.join(dft_dir, 'functions.txt'), 'wb') as f: + f.write(result.stdout) + + return True + + +def main(): + if len(sys.argv) < 4: + _error('Usage: {0} <binary> <corpus_dir> <dft_dir>'.format(sys.argv[0])) + sys.exit(1) + + binary = sys.argv[1] + corpus_dir = sys.argv[2] + dft_dir = sys.argv[3] + + os.environ['DFSAN_OPTIONS'] = DFSAN_OPTIONS + + if not dump_functions(binary, dft_dir): + _error('Failed to dump functions. Something is wrong.') + sys.exit(1) + + stats = collect_traces(binary, corpus_dir, dft_dir) + for k, v in stats.items(): + print('{0}: {1}'.format(k, v)) + + # Checksum that we didn't lose track of any of the inputs. 
+ assert stats['total'] * 2 == sum(v for v in stats.values()) + sys.exit(0) + + +if __name__ == "__main__": + main() diff --git a/infra/gcb/build_and_run_coverage.py b/infra/gcb/build_and_run_coverage.py index f6b086660..d7599ba7c 100644 --- a/infra/gcb/build_and_run_coverage.py +++ b/infra/gcb/build_and_run_coverage.py @@ -11,29 +11,20 @@ import requests import sys import urlparse +import build_lib import build_project SANITIZER = 'coverage' CONFIGURATION = ['FUZZING_ENGINE=libfuzzer', 'SANITIZER=%s' % SANITIZER] PLATFORM = 'linux' -# Where corpus backups can be downloaded from. -CORPUS_BACKUP_URL = ('/{project}-backup.clusterfuzz-external.appspot.com/' - 'corpus/libFuzzer/{fuzzer}/latest.zip') - -# Cloud Builder has a limit of 100 build steps and 100 arguments for each step. -CORPUS_DOWNLOAD_BATCH_SIZE = 100 - COVERAGE_BUILD_TAG = 'coverage' -# Needed for reading public target.list.* files. -GCS_URL_BASENAME = 'https://storage.googleapis.com/' - # Where code coverage reports need to be uploaded to. COVERAGE_BUCKET_NAME = 'oss-fuzz-coverage' # Link to the code coverage report in HTML format. -HTML_REPORT_URL_FORMAT = (GCS_URL_BASENAME + COVERAGE_BUCKET_NAME + +HTML_REPORT_URL_FORMAT = (build_lib.GCS_URL_BASENAME + COVERAGE_BUCKET_NAME + '/{project}/reports/{date}/{platform}/index.html') # This is needed for ClusterFuzz to pick up the most recent reports data. @@ -74,10 +65,6 @@ def get_build_steps(project_dir): skip_build('Project "%s" uses go-fuzz, coverage is not supported yet.' % project_name) - fuzz_targets = get_targets_list(project_name) - if not fuzz_targets: - skip_build('No fuzz targets found for project "%s".' % project_name) - dockerfile_path = os.path.join(project_dir, 'Dockerfile') name = project_yaml['name'] image = project_yaml['image'] @@ -143,32 +130,11 @@ def get_build_steps(project_dir): ], }) - # Split fuzz targets into batches of CORPUS_DOWNLOAD_BATCH_SIZE. 
- for i in xrange(0, len(fuzz_targets), CORPUS_DOWNLOAD_BATCH_SIZE): - download_corpus_args = [] - for binary_name in fuzz_targets[i:i + CORPUS_DOWNLOAD_BATCH_SIZE]: - qualified_name = binary_name - qualified_name_prefix = '%s_' % project_name - if not binary_name.startswith(qualified_name_prefix): - qualified_name = qualified_name_prefix + binary_name + download_corpora_step = build_lib.download_corpora_step(project_name) + if not download_corpora_step: + skip_build("Skipping code coverage build for %s.\n" % project_name) - url = build_project.get_signed_url(CORPUS_BACKUP_URL.format( - project=project_name, fuzzer=qualified_name), - method='GET') - - corpus_archive_path = os.path.join('/corpus', binary_name + '.zip') - download_corpus_args.append('%s %s' % (corpus_archive_path, url)) - - # Download corpus. - build_steps.append({ - 'name': 'gcr.io/oss-fuzz-base/base-runner', - 'entrypoint': 'download_corpus', - 'args': download_corpus_args, - 'volumes': [{ - 'name': 'corpus', - 'path': '/corpus' - }], - }) + build_steps.append(download_corpora_step) failure_msg = ('*' * 80 + '\nCode coverage report generation failed.\n' 'To reproduce, run:\n' @@ -267,7 +233,7 @@ def get_build_steps(project_dir): }) # Update the latest report information file for ClusterFuzz. - latest_report_info_url = build_project.get_signed_url( + latest_report_info_url = build_lib.get_signed_url( LATEST_REPORT_INFO_URL.format(project=project_name), method='PUT', content_type='application/json') @@ -300,23 +266,6 @@ def get_build_steps(project_dir): return build_steps -def get_targets_list(project_name): - # libFuzzer ASan is the default configuration, get list of targets from it. 
- url = build_project.get_targets_list_url( - build_project.ENGINE_INFO['libfuzzer'].upload_bucket, project_name, - 'address') - - url = urlparse.urljoin(GCS_URL_BASENAME, url) - r = requests.get(url) - if not r.status_code == 200: - sys.stderr.write('Failed to get list of targets from "%s".\n' % url) - sys.stderr.write('Status code: %d \t\tText:\n%s\n' % - (r.status_code, r.text)) - return None - - return r.text.split() - - def main(): if len(sys.argv) != 2: usage() diff --git a/infra/gcb/build_lib.py b/infra/gcb/build_lib.py new file mode 100644 index 000000000..d3508730c --- /dev/null +++ b/infra/gcb/build_lib.py @@ -0,0 +1,134 @@ +"""Utility module for Google Cloud Build scripts.""" +import base64 +import collections +import os +import requests +import sys +import time +import urllib +import urlparse + +from oauth2client.service_account import ServiceAccountCredentials + +BUILD_TIMEOUT = 12 * 60 * 60 + +# Needed for reading public target.list.* files. +GCS_URL_BASENAME = 'https://storage.googleapis.com/' + +GCS_UPLOAD_URL_FORMAT = '/{0}/{1}/{2}' + +# Where corpus backups can be downloaded from. +CORPUS_BACKUP_URL = ('/{project}-backup.clusterfuzz-external.appspot.com/' + 'corpus/libFuzzer/{fuzzer}/latest.zip') + +# Cloud Builder has a limit of 100 build steps and 100 arguments for each step. 
+CORPUS_DOWNLOAD_BATCH_SIZE = 100 + +TARGETS_LIST_BASENAME = 'targets.list' + +EngineInfo = collections.namedtuple( + 'EngineInfo', + ['upload_bucket', 'supported_sanitizers', 'supported_architectures']) + +ENGINE_INFO = { + 'libfuzzer': + EngineInfo(upload_bucket='clusterfuzz-builds', + supported_sanitizers=['address', 'memory', 'undefined'], + supported_architectures=['x86_64', 'i386']), + 'afl': + EngineInfo(upload_bucket='clusterfuzz-builds-afl', + supported_sanitizers=['address'], + supported_architectures=['x86_64']), + 'honggfuzz': + EngineInfo(upload_bucket='clusterfuzz-builds-honggfuzz', + supported_sanitizers=['address', 'memory', 'undefined'], + supported_architectures=['x86_64']), + 'dataflow': + EngineInfo(upload_bucket='clusterfuzz-builds-dataflow', + supported_sanitizers=['dataflow'], + supported_architectures=['x86_64']), + 'none': + EngineInfo(upload_bucket='clusterfuzz-builds-no-engine', + supported_sanitizers=['address'], + supported_architectures=['x86_64']), +} + + +def get_targets_list_filename(sanitizer): + return TARGETS_LIST_BASENAME + '.' + sanitizer + + +def get_targets_list_url(bucket, project, sanitizer): + filename = get_targets_list_filename(sanitizer) + url = GCS_UPLOAD_URL_FORMAT.format(bucket, project, filename) + return url + + +def _get_targets_list(project_name): + # libFuzzer ASan is the default configuration, get list of targets from it. 
+ url = get_targets_list_url(ENGINE_INFO['libfuzzer'].upload_bucket, + project_name, 'address') + + url = urlparse.urljoin(GCS_URL_BASENAME, url) + response = requests.get(url) + if not response.status_code == 200: + sys.stderr.write('Failed to get list of targets from "%s".\n' % url) + sys.stderr.write('Status code: %d \t\tText:\n%s\n' % + (response.status_code, response.text)) + return None + + return response.text.split() + + +def get_signed_url(path, method='PUT', content_type=''): + timestamp = int(time.time() + BUILD_TIMEOUT) + blob = '{0}\n\n{1}\n{2}\n{3}'.format(method, content_type, timestamp, path) + + creds = ServiceAccountCredentials.from_json_keyfile_name( + os.environ['GOOGLE_APPLICATION_CREDENTIALS']) + client_id = creds.service_account_email + signature = base64.b64encode(creds.sign_blob(blob)[1]) + values = { + 'GoogleAccessId': client_id, + 'Expires': timestamp, + 'Signature': signature, + } + + return ('https://storage.googleapis.com{0}?'.format(path) + + urllib.urlencode(values)) + + +def download_corpora_step(project_name): + """Returns a GCB step for downloading corpora backups for the given project. + """ + fuzz_targets = _get_targets_list(project_name) + if not fuzz_targets: + sys.stderr.write('No fuzz targets found for project "%s".\n' % project_name) + return None + + # Split fuzz targets into batches of CORPUS_DOWNLOAD_BATCH_SIZE. 
+ for i in range(0, len(fuzz_targets), CORPUS_DOWNLOAD_BATCH_SIZE): + download_corpus_args = [] + for binary_name in fuzz_targets[i:i + CORPUS_DOWNLOAD_BATCH_SIZE]: + qualified_name = binary_name + qualified_name_prefix = '%s_' % project_name + if not binary_name.startswith(qualified_name_prefix): + qualified_name = qualified_name_prefix + binary_name + + url = get_signed_url(CORPUS_BACKUP_URL.format(project=project_name, + fuzzer=qualified_name), + method='GET') + + corpus_archive_path = os.path.join('/corpus', binary_name + '.zip') + download_corpus_args.append('%s %s' % (corpus_archive_path, url)) + + step = { + 'name': 'gcr.io/oss-fuzz-base/base-runner', + 'entrypoint': 'download_corpus', + 'args': download_corpus_args, + 'volumes': [{ + 'name': 'corpus', + 'path': '/corpus' + }], + } + return step diff --git a/infra/gcb/build_project.py b/infra/gcb/build_project.py index 0b6405424..642fce21e 100644 --- a/infra/gcb/build_project.py +++ b/infra/gcb/build_project.py @@ -6,22 +6,17 @@ Usage: build_project.py from __future__ import print_function -import base64 -import collections import datetime import json import os import re import sys -import time -import urllib import yaml from oauth2client.client import GoogleCredentials -from oauth2client.service_account import ServiceAccountCredentials from googleapiclient.discovery import build -BUILD_TIMEOUT = 12 * 60 * 60 +import build_lib FUZZING_BUILD_TAG = 'fuzzing' @@ -39,41 +34,10 @@ CONFIGURATIONS = { 'engine-none': ['FUZZING_ENGINE=none'], } -EngineInfo = collections.namedtuple( - 'EngineInfo', - ['upload_bucket', 'supported_sanitizers', 'supported_architectures']) - -ENGINE_INFO = { - 'libfuzzer': - EngineInfo(upload_bucket='clusterfuzz-builds', - supported_sanitizers=['address', 'memory', 'undefined'], - supported_architectures=['x86_64', 'i386']), - 'afl': - EngineInfo(upload_bucket='clusterfuzz-builds-afl', - supported_sanitizers=['address'], - supported_architectures=['x86_64']), - 'honggfuzz': - 
EngineInfo(upload_bucket='clusterfuzz-builds-honggfuzz', - supported_sanitizers=['address', 'memory', 'undefined'], - supported_architectures=['x86_64']), - 'dataflow': - EngineInfo(upload_bucket='clusterfuzz-builds-dataflow', - supported_sanitizers=['dataflow'], - supported_architectures=['x86_64']), - 'none': - EngineInfo(upload_bucket='clusterfuzz-builds-no-engine', - supported_sanitizers=['address'], - supported_architectures=['x86_64']), -} - DEFAULT_ARCHITECTURES = ['x86_64'] DEFAULT_ENGINES = ['libfuzzer', 'afl', 'honggfuzz'] DEFAULT_SANITIZERS = ['address', 'undefined'] -TARGETS_LIST_BASENAME = 'targets.list' - -UPLOAD_URL_FORMAT = '/{0}/{1}/{2}' - def usage(): sys.stderr.write('Usage: ' + sys.argv[0] + ' <project_dir>\n') @@ -97,26 +61,8 @@ def load_project_yaml(project_dir): return project_yaml -def get_signed_url(path, method='PUT', content_type=''): - timestamp = int(time.time() + BUILD_TIMEOUT) - blob = '{0}\n\n{1}\n{2}\n{3}'.format(method, content_type, timestamp, path) - - creds = ServiceAccountCredentials.from_json_keyfile_name( - os.environ['GOOGLE_APPLICATION_CREDENTIALS']) - client_id = creds.service_account_email - signature = base64.b64encode(creds.sign_blob(blob)[1]) - values = { - 'GoogleAccessId': client_id, - 'Expires': timestamp, - 'Signature': signature, - } - - return ('https://storage.googleapis.com{0}?'.format(path) + - urllib.urlencode(values)) - - def is_supported_configuration(fuzzing_engine, sanitizer, architecture): - fuzzing_engine_info = ENGINE_INFO[fuzzing_engine] + fuzzing_engine_info = build_lib.ENGINE_INFO[fuzzing_engine] if architecture == 'i386' and sanitizer != 'address': return False return (sanitizer in fuzzing_engine_info.supported_sanitizers and @@ -213,17 +159,18 @@ def get_build_steps(project_dir): stamped_name = '-'.join([name, sanitizer, ts]) zip_file = stamped_name + '.zip' stamped_srcmap_file = stamped_name + '.srcmap.json' - bucket = 
build_lib.ENGINE_INFO[fuzzing_engine].upload_bucket if architecture != 'x86_64': bucket += '-' + architecture - upload_url = get_signed_url( - UPLOAD_URL_FORMAT.format(bucket, name, zip_file)) - srcmap_url = get_signed_url( - UPLOAD_URL_FORMAT.format(bucket, name, stamped_srcmap_file)) + upload_url = build_lib.get_signed_url( + build_lib.GCS_UPLOAD_URL_FORMAT.format(bucket, name, zip_file)) + srcmap_url = build_lib.get_signed_url( + build_lib.GCS_UPLOAD_URL_FORMAT.format(bucket, name, + stamped_srcmap_file)) - targets_list_filename = get_targets_list_filename(sanitizer) - targets_list_url = get_signed_url( - get_targets_list_url(bucket, name, sanitizer)) + targets_list_filename = build_lib.get_targets_list_filename(sanitizer) + targets_list_url = build_lib.get_signed_url( + build_lib.get_targets_list_url(bucket, name, sanitizer)) env.append('OUT=' + out) env.append('MSAN_LIBS_PATH=/workspace/msan') @@ -320,6 +267,13 @@ def get_build_steps(project_dir): ], }) + if sanitizer == 'dataflow' and fuzzing_engine == 'dataflow': + dataflow_steps = dataflow_post_build_steps(name) + if dataflow_steps: + build_steps.extend(dataflow_steps) + else: + sys.stderr.write('Skipping dataflow post build steps.\n') + build_steps.extend([ # generate targets list { @@ -383,22 +337,34 @@ def get_build_steps(project_dir): return build_steps +def dataflow_post_build_steps(project_name): + steps = [] + download_corpora_step = build_lib.download_corpora_step(project_name) + if not download_corpora_step: + return None + + steps = [download_corpora_step] + steps.append({ + 'name': 'gcr.io/oss-fuzz-base/base-runner', + 'args': [ + 'bash', '-c', + ('for f in /corpus/*.zip; do unzip -q $f -d ${f%%.*}; done && ' + 'collect_dft || (echo "DFT collection failed." && false)') + ], + 'volumes': [{ + 'name': 'corpus', + 'path': '/corpus' + }], + }) + return steps + + def get_logs_url(build_id): URL_FORMAT = ('https://console.developers.google.com/logs/viewer?' 
'resource=build%2Fbuild_id%2F{0}&project=oss-fuzz') return URL_FORMAT.format(build_id) -def get_targets_list_filename(sanitizer): - return TARGETS_LIST_BASENAME + '.' + sanitizer - - -def get_targets_list_url(bucket, project, sanitizer): - filename = get_targets_list_filename(sanitizer) - url = UPLOAD_URL_FORMAT.format(bucket, project, filename) - return url - - def run_build(build_steps, project_name, tag): options = {} if 'GCB_OPTIONS' in os.environ: @@ -406,7 +372,7 @@ def run_build(build_steps, project_name, tag): build_body = { 'steps': build_steps, - 'timeout': str(BUILD_TIMEOUT) + 's', + 'timeout': str(build_lib.BUILD_TIMEOUT) + 's', 'options': options, 'logsBucket': GCB_LOGS_BUCKET, 'tags': [project_name + '-' + tag,], diff --git a/infra/gcb/cancel.py b/infra/gcb/cancel.py index 331244fed..8393a5144 100755 --- a/infra/gcb/cancel.py +++ b/infra/gcb/cancel.py @@ -15,7 +15,6 @@ import urllib import yaml from oauth2client.client import GoogleCredentials -from oauth2client.service_account import ServiceAccountCredentials from googleapiclient.discovery import build @@ -32,8 +31,9 @@ def main(): credentials = GoogleCredentials.get_application_default() cloudbuild = build('cloudbuild', 'v1', credentials=credentials) - print cloudbuild.projects().builds().cancel( - projectId='oss-fuzz', id=build_id, body={}).execute() + print cloudbuild.projects().builds().cancel(projectId='oss-fuzz', + id=build_id, + body={}).execute() if __name__ == '__main__':