[infra] Collect dataflow traces on the builder (#1632). (#3238)

* [infra] Skeleton of the changes needed for collecting DFT on the builder (#1632).

* move ENGINE_INFO to the helper as well

* make collect_dft +x

* syntax fixes

* add actual dataflow tracer script

* format

* more refactoring and cleanup

* format

* address Oliver's feedback

* format

* more fixes

* format

* do not redirect stderr to stdout

* add exit at the end of main

* address feedback from Oliver
Max Moroz 2020-01-17 06:24:15 -08:00 committed by GitHub
parent 3a9400d3a2
commit f6002f6139
7 changed files with 399 additions and 136 deletions

infra/base-images/base-runner/Dockerfile

@@ -40,8 +40,10 @@ RUN git clone https://chromium.googlesource.com/chromium/src/tools/code_coverage
 RUN pip3 install -r /opt/code_coverage/requirements.txt
 COPY bad_build_check \
+    collect_dft \
     coverage \
     coverage_helper \
+    dataflow_tracer.py \
     download_corpus \
     minijail0 \
     reproduce \

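With those helpers baked into the base-runner image, a local trace-collection run might look like the sketch below (the host paths and target name are hypothetical; collect_dft expects a dataflow build in $OUT and an unpacked corpus under /corpus/<target>):

    docker run --rm \
        -v /tmp/dft_corpus:/corpus \
        -v /tmp/dft_build:/out \
        gcr.io/oss-fuzz-base/base-runner \
        collect_dft my_fuzz_target
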
65
infra/base-images/base-runner/collect_dft Normal file

@@ -0,0 +1,65 @@
#!/bin/bash -u
# Copyright 2020 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
################################################################################
cd $OUT

if (( $# > 0 )); then
  FUZZ_TARGETS="$@"
else
  FUZZ_TARGETS="$(find . -maxdepth 1 -type f -executable -printf '%P\n')"
fi

# Timeout for running a single fuzz target.
TIMEOUT=1h

# Number of CPUs available, this is needed for running targets in parallel.
NPROC=$(nproc)

function run_one_target {
  local target=$1
  local corpus="/corpus/${target}"
  local traces="$OUT/${target}_dft"

  # Put the logs in $OUT as well for debugging purposes.
  local log="$OUT/${target}_dft.log"

  rm -rf $traces && mkdir -p $traces

  timeout $TIMEOUT dataflow_tracer.py $OUT/$target $corpus $traces &> $log
  if (( $? != 0 )); then
    echo "Error occurred while collecting data flow traces for $target:"
    cat $log
  fi
}

# Run each fuzz target, write data flow traces into corresponding dir in $OUT.
for fuzz_target in $FUZZ_TARGETS; do
  # Skip binaries that do not seem to be fuzz targets.
  grep "LLVMFuzzerTestOneInput" $fuzz_target > /dev/null 2>&1 || continue

  echo "Running $fuzz_target"
  run_one_target $fuzz_target &

  # Do not spawn more processes than the number of CPUs available.
  n_child_proc=$(jobs -rp | wc -l)
  while [ "$n_child_proc" -eq "$NPROC" ]; do
    sleep 4
    n_child_proc=$(jobs -rp | wc -l)
  done
done

# Wait for background processes to finish.
wait

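Usage note: invoked with no arguments, the script discovers every executable in $OUT and skips binaries without LLVMFuzzerTestOneInput; passing names restricts the run. A hypothetical session inside the runner container:

    collect_dft                      # trace every fuzz target found in $OUT
    collect_dft target_a target_b    # trace only the named targets
    ls /out/target_a_dft             # traces keyed by input SHA-1, plus functions.txt
    cat /out/target_a_dft.log        # per-target log kept in $OUT for debugging
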
147
infra/base-images/base-runner/dataflow_tracer.py Normal file

@@ -0,0 +1,147 @@
#!/usr/bin/env python3
# Copyright 2020 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
################################################################################
"""Script for collecting dataflow traces using DFSan compiled binary. The script
imitates `CollectDataFlow` function from libFuzzer but provides some flexibility
for skipping long and/or slow corpus elements.
Follow https://github.com/google/oss-fuzz/issues/1632 for more details."""
import hashlib
import os
import subprocess
import sys

# These can be controlled by the runner in order to change the values without
# rebuilding OSS-Fuzz base images.
FILE_SIZE_LIMIT = int(os.getenv('DFT_FILE_SIZE_LIMIT', 32 * 1024))
MIN_TIMEOUT = float(os.getenv('DFT_MIN_TIMEOUT', 1.0))
TIMEOUT_RANGE = float(os.getenv('DFT_TIMEOUT_RANGE', 3.0))

DFSAN_OPTIONS = 'fast16labels=1:warn_unimplemented=0'


def _error(msg):
  sys.stderr.write(msg + '\n')


def _list_dir(dirpath):
  for root, _, files in os.walk(dirpath):
    for f in files:
      yield os.path.join(root, f)


def _sha1(filepath):
  h = hashlib.sha1()
  with open(filepath, 'rb') as f:
    h.update(f.read())
  return h.hexdigest()


def _run(cmd, timeout=None):
  result = None
  try:
    result = subprocess.run(cmd,
                            timeout=timeout,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    if result.returncode:
      _error('{command} finished with non-zero code: {code}'.format(
          command=str(cmd), code=result.returncode))
  except subprocess.TimeoutExpired:
    raise
  except Exception as e:
    _error('Exception: ' + str(e))

  return result


def _timeout(size):
  # Dynamic timeout value (proportional to file size) to discard slow units.
  timeout = MIN_TIMEOUT
  timeout += size * TIMEOUT_RANGE / FILE_SIZE_LIMIT
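  # For example, with the default settings a 16 KiB input gets
  # 1.0 + 16384 * 3.0 / 32768 = 2.5 seconds, while an input right at the
  # 32 KiB size limit gets the full 1.0 + 3.0 = 4.0 seconds.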
  return timeout


def collect_traces(binary, corpus_dir, dft_dir):
  stats = {
      'total': 0,
      'traced': 0,
      'long': 0,
      'slow': 0,
      'failed': 0,
  }

  for f in _list_dir(corpus_dir):
    stats['total'] += 1
    size = os.path.getsize(f)
    if size > FILE_SIZE_LIMIT:
      stats['long'] += 1
      print('Skipping large file ({size}b): {path}'.format(size=size, path=f))
      continue

    output_path = os.path.join(dft_dir, _sha1(f))
    try:
      result = _run([binary, f, output_path], timeout=_timeout(size))
      if result.returncode:
        stats['failed'] += 1
      else:
        stats['traced'] += 1
    except subprocess.TimeoutExpired as e:
      _error('Slow input: ' + str(e))
      stats['slow'] += 1

  return stats


def dump_functions(binary, dft_dir):
  result = _run([binary])
  if not result or result.returncode:
    return False

  with open(os.path.join(dft_dir, 'functions.txt'), 'wb') as f:
    f.write(result.stdout)

  return True


def main():
  if len(sys.argv) < 4:
    _error('Usage: {0} <binary> <corpus_dir> <dft_dir>'.format(sys.argv[0]))
    sys.exit(1)

  binary = sys.argv[1]
  corpus_dir = sys.argv[2]
  dft_dir = sys.argv[3]

  os.environ['DFSAN_OPTIONS'] = DFSAN_OPTIONS

  if not dump_functions(binary, dft_dir):
    _error('Failed to dump functions. Something is wrong.')
    sys.exit(1)

  stats = collect_traces(binary, corpus_dir, dft_dir)
  for k, v in stats.items():
    print('{0}: {1}'.format(k, v))
  # Sanity check: every input adds 1 to 'total' and exactly 1 to one of the
  # other counters, so the counters must sum to twice the total.
  assert stats['total'] * 2 == sum(v for v in stats.values())
  sys.exit(0)


if __name__ == "__main__":
  main()

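For reference, a hypothetical standalone invocation (binary and directory paths are placeholders); the DFT_* variables let the runner tune the limits without rebuilding the base images:

    # Accept inputs up to 64 KiB and give every unit at least 2 seconds.
    DFT_FILE_SIZE_LIMIT=65536 DFT_MIN_TIMEOUT=2.0 \
        dataflow_tracer.py /out/my_target /corpus/my_target /out/my_target_dft
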
infra/gcb/build_and_run_coverage.py

@@ -11,29 +11,20 @@ import requests
 import sys
 import urlparse
+import build_lib
 import build_project
 SANITIZER = 'coverage'
 CONFIGURATION = ['FUZZING_ENGINE=libfuzzer', 'SANITIZER=%s' % SANITIZER]
 PLATFORM = 'linux'
-# Where corpus backups can be downloaded from.
-CORPUS_BACKUP_URL = ('/{project}-backup.clusterfuzz-external.appspot.com/'
-                     'corpus/libFuzzer/{fuzzer}/latest.zip')
-# Cloud Builder has a limit of 100 build steps and 100 arguments for each step.
-CORPUS_DOWNLOAD_BATCH_SIZE = 100
 COVERAGE_BUILD_TAG = 'coverage'
-# Needed for reading public target.list.* files.
-GCS_URL_BASENAME = 'https://storage.googleapis.com/'
 # Where code coverage reports need to be uploaded to.
 COVERAGE_BUCKET_NAME = 'oss-fuzz-coverage'
 # Link to the code coverage report in HTML format.
-HTML_REPORT_URL_FORMAT = (GCS_URL_BASENAME + COVERAGE_BUCKET_NAME +
+HTML_REPORT_URL_FORMAT = (build_lib.GCS_URL_BASENAME + COVERAGE_BUCKET_NAME +
                           '/{project}/reports/{date}/{platform}/index.html')
 # This is needed for ClusterFuzz to pick up the most recent reports data.
@@ -74,10 +65,6 @@ def get_build_steps(project_dir):
     skip_build('Project "%s" uses go-fuzz, coverage is not supported yet.' %
                project_name)
-  fuzz_targets = get_targets_list(project_name)
-  if not fuzz_targets:
-    skip_build('No fuzz targets found for project "%s".' % project_name)
   dockerfile_path = os.path.join(project_dir, 'Dockerfile')
   name = project_yaml['name']
   image = project_yaml['image']
@@ -143,32 +130,11 @@ def get_build_steps(project_dir):
       ],
   })
-  # Split fuzz targets into batches of CORPUS_DOWNLOAD_BATCH_SIZE.
-  for i in xrange(0, len(fuzz_targets), CORPUS_DOWNLOAD_BATCH_SIZE):
-    download_corpus_args = []
-    for binary_name in fuzz_targets[i:i + CORPUS_DOWNLOAD_BATCH_SIZE]:
-      qualified_name = binary_name
-      qualified_name_prefix = '%s_' % project_name
-      if not binary_name.startswith(qualified_name_prefix):
-        qualified_name = qualified_name_prefix + binary_name
+  download_corpora_step = build_lib.download_corpora_step(project_name)
+  if not download_corpora_step:
+    skip_build("Skipping code coverage build for %s.\n" % project_name)
-      url = build_project.get_signed_url(CORPUS_BACKUP_URL.format(
-          project=project_name, fuzzer=qualified_name),
-                                         method='GET')
-      corpus_archive_path = os.path.join('/corpus', binary_name + '.zip')
-      download_corpus_args.append('%s %s' % (corpus_archive_path, url))
-    # Download corpus.
-    build_steps.append({
-        'name': 'gcr.io/oss-fuzz-base/base-runner',
-        'entrypoint': 'download_corpus',
-        'args': download_corpus_args,
-        'volumes': [{
-            'name': 'corpus',
-            'path': '/corpus'
-        }],
-    })
+  build_steps.append(download_corpora_step)
   failure_msg = ('*' * 80 + '\nCode coverage report generation failed.\n'
                  'To reproduce, run:\n'
@@ -267,7 +233,7 @@ def get_build_steps(project_dir):
   })
   # Update the latest report information file for ClusterFuzz.
-  latest_report_info_url = build_project.get_signed_url(
+  latest_report_info_url = build_lib.get_signed_url(
       LATEST_REPORT_INFO_URL.format(project=project_name),
       method='PUT',
       content_type='application/json')
@@ -300,23 +266,6 @@ def get_build_steps(project_dir):
   return build_steps
-def get_targets_list(project_name):
-  # libFuzzer ASan is the default configuration, get list of targets from it.
-  url = build_project.get_targets_list_url(
-      build_project.ENGINE_INFO['libfuzzer'].upload_bucket, project_name,
-      'address')
-  url = urlparse.urljoin(GCS_URL_BASENAME, url)
-  r = requests.get(url)
-  if not r.status_code == 200:
-    sys.stderr.write('Failed to get list of targets from "%s".\n' % url)
-    sys.stderr.write('Status code: %d \t\tText:\n%s\n' %
-                     (r.status_code, r.text))
-    return None
-  return r.text.split()
 def main():
   if len(sys.argv) != 2:
     usage()

134
infra/gcb/build_lib.py Normal file

@@ -0,0 +1,134 @@
"""Utility module for Google Cloud Build scripts."""
import base64
import collections
import os
import requests
import sys
import time
import urllib
import urlparse
from oauth2client.service_account import ServiceAccountCredentials
BUILD_TIMEOUT = 12 * 60 * 60
# Needed for reading public target.list.* files.
GCS_URL_BASENAME = 'https://storage.googleapis.com/'
GCS_UPLOAD_URL_FORMAT = '/{0}/{1}/{2}'
# Where corpus backups can be downloaded from.
CORPUS_BACKUP_URL = ('/{project}-backup.clusterfuzz-external.appspot.com/'
'corpus/libFuzzer/{fuzzer}/latest.zip')
# Cloud Builder has a limit of 100 build steps and 100 arguments for each step.
CORPUS_DOWNLOAD_BATCH_SIZE = 100
TARGETS_LIST_BASENAME = 'targets.list'
EngineInfo = collections.namedtuple(
'EngineInfo',
['upload_bucket', 'supported_sanitizers', 'supported_architectures'])
ENGINE_INFO = {
'libfuzzer':
EngineInfo(upload_bucket='clusterfuzz-builds',
supported_sanitizers=['address', 'memory', 'undefined'],
supported_architectures=['x86_64', 'i386']),
'afl':
EngineInfo(upload_bucket='clusterfuzz-builds-afl',
supported_sanitizers=['address'],
supported_architectures=['x86_64']),
'honggfuzz':
EngineInfo(upload_bucket='clusterfuzz-builds-honggfuzz',
supported_sanitizers=['address', 'memory', 'undefined'],
supported_architectures=['x86_64']),
'dataflow':
EngineInfo(upload_bucket='clusterfuzz-builds-dataflow',
supported_sanitizers=['dataflow'],
supported_architectures=['x86_64']),
'none':
EngineInfo(upload_bucket='clusterfuzz-builds-no-engine',
supported_sanitizers=['address'],
supported_architectures=['x86_64']),
}
def get_targets_list_filename(sanitizer):
return TARGETS_LIST_BASENAME + '.' + sanitizer
def get_targets_list_url(bucket, project, sanitizer):
filename = get_targets_list_filename(sanitizer)
url = GCS_UPLOAD_URL_FORMAT.format(bucket, project, filename)
return url
def _get_targets_list(project_name):
# libFuzzer ASan is the default configuration, get list of targets from it.
url = get_targets_list_url(ENGINE_INFO['libfuzzer'].upload_bucket,
project_name, 'address')
url = urlparse.urljoin(GCS_URL_BASENAME, url)
response = requests.get(url)
if not response.status_code == 200:
sys.stderr.write('Failed to get list of targets from "%s".\n' % url)
sys.stderr.write('Status code: %d \t\tText:\n%s\n' %
(response.status_code, response.text))
return None
return response.text.split()
def get_signed_url(path, method='PUT', content_type=''):
timestamp = int(time.time() + BUILD_TIMEOUT)
blob = '{0}\n\n{1}\n{2}\n{3}'.format(method, content_type, timestamp, path)
creds = ServiceAccountCredentials.from_json_keyfile_name(
os.environ['GOOGLE_APPLICATION_CREDENTIALS'])
client_id = creds.service_account_email
signature = base64.b64encode(creds.sign_blob(blob)[1])
values = {
'GoogleAccessId': client_id,
'Expires': timestamp,
'Signature': signature,
}
return ('https://storage.googleapis.com{0}?'.format(path) +
urllib.urlencode(values))
def download_corpora_step(project_name):
"""Returns a GCB step for downloading corpora backups for the given project.
"""
fuzz_targets = _get_targets_list(project_name)
if not fuzz_targets:
sys.stderr.write('No fuzz targets found for project "%s".\n' % project_name)
return None
# Split fuzz targets into batches of CORPUS_DOWNLOAD_BATCH_SIZE.
for i in range(0, len(fuzz_targets), CORPUS_DOWNLOAD_BATCH_SIZE):
download_corpus_args = []
for binary_name in fuzz_targets[i:i + CORPUS_DOWNLOAD_BATCH_SIZE]:
qualified_name = binary_name
qualified_name_prefix = '%s_' % project_name
if not binary_name.startswith(qualified_name_prefix):
qualified_name = qualified_name_prefix + binary_name
url = get_signed_url(CORPUS_BACKUP_URL.format(project=project_name,
fuzzer=qualified_name),
method='GET')
corpus_archive_path = os.path.join('/corpus', binary_name + '.zip')
download_corpus_args.append('%s %s' % (corpus_archive_path, url))
step = {
'name': 'gcr.io/oss-fuzz-base/base-runner',
'entrypoint': 'download_corpus',
'args': download_corpus_args,
'volumes': [{
'name': 'corpus',
'path': '/corpus'
}],
}
return step

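Each entry in download_corpus_args packs a destination path and a signed URL into one space-separated argument, so the generated step behaves roughly like this hypothetical invocation inside the base-runner container (URLs abridged, target names illustrative):

    download_corpus \
        "/corpus/target_a.zip https://storage.googleapis.com/<signed-url>" \
        "/corpus/target_b.zip https://storage.googleapis.com/<signed-url>"
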
infra/gcb/build_project.py

@@ -6,22 +6,17 @@ Usage: build_project.py <project_dir>
 from __future__ import print_function
-import base64
 import collections
 import datetime
 import json
 import os
 import re
 import sys
-import time
-import urllib
 import yaml
 from oauth2client.client import GoogleCredentials
-from oauth2client.service_account import ServiceAccountCredentials
 from googleapiclient.discovery import build
-BUILD_TIMEOUT = 12 * 60 * 60
+import build_lib
 FUZZING_BUILD_TAG = 'fuzzing'
@@ -39,41 +34,10 @@ CONFIGURATIONS = {
     'engine-none': ['FUZZING_ENGINE=none'],
 }
-EngineInfo = collections.namedtuple(
-    'EngineInfo',
-    ['upload_bucket', 'supported_sanitizers', 'supported_architectures'])
-ENGINE_INFO = {
-    'libfuzzer':
-        EngineInfo(upload_bucket='clusterfuzz-builds',
-                   supported_sanitizers=['address', 'memory', 'undefined'],
-                   supported_architectures=['x86_64', 'i386']),
-    'afl':
-        EngineInfo(upload_bucket='clusterfuzz-builds-afl',
-                   supported_sanitizers=['address'],
-                   supported_architectures=['x86_64']),
-    'honggfuzz':
-        EngineInfo(upload_bucket='clusterfuzz-builds-honggfuzz',
-                   supported_sanitizers=['address', 'memory', 'undefined'],
-                   supported_architectures=['x86_64']),
-    'dataflow':
-        EngineInfo(upload_bucket='clusterfuzz-builds-dataflow',
-                   supported_sanitizers=['dataflow'],
-                   supported_architectures=['x86_64']),
-    'none':
-        EngineInfo(upload_bucket='clusterfuzz-builds-no-engine',
-                   supported_sanitizers=['address'],
-                   supported_architectures=['x86_64']),
-}
 DEFAULT_ARCHITECTURES = ['x86_64']
 DEFAULT_ENGINES = ['libfuzzer', 'afl', 'honggfuzz']
 DEFAULT_SANITIZERS = ['address', 'undefined']
-TARGETS_LIST_BASENAME = 'targets.list'
-UPLOAD_URL_FORMAT = '/{0}/{1}/{2}'
 def usage():
   sys.stderr.write('Usage: ' + sys.argv[0] + ' <project_dir>\n')
@@ -97,26 +61,8 @@ def load_project_yaml(project_dir):
   return project_yaml
-def get_signed_url(path, method='PUT', content_type=''):
-  timestamp = int(time.time() + BUILD_TIMEOUT)
-  blob = '{0}\n\n{1}\n{2}\n{3}'.format(method, content_type, timestamp, path)
-  creds = ServiceAccountCredentials.from_json_keyfile_name(
-      os.environ['GOOGLE_APPLICATION_CREDENTIALS'])
-  client_id = creds.service_account_email
-  signature = base64.b64encode(creds.sign_blob(blob)[1])
-  values = {
-      'GoogleAccessId': client_id,
-      'Expires': timestamp,
-      'Signature': signature,
-  }
-  return ('https://storage.googleapis.com{0}?'.format(path) +
-          urllib.urlencode(values))
 def is_supported_configuration(fuzzing_engine, sanitizer, architecture):
-  fuzzing_engine_info = ENGINE_INFO[fuzzing_engine]
+  fuzzing_engine_info = build_lib.ENGINE_INFO[fuzzing_engine]
   if architecture == 'i386' and sanitizer != 'address':
     return False
   return (sanitizer in fuzzing_engine_info.supported_sanitizers and
@@ -213,17 +159,18 @@ def get_build_steps(project_dir):
         stamped_name = '-'.join([name, sanitizer, ts])
         zip_file = stamped_name + '.zip'
         stamped_srcmap_file = stamped_name + '.srcmap.json'
-        bucket = ENGINE_INFO[fuzzing_engine].upload_bucket
+        bucket = build_lib.ENGINE_INFO[fuzzing_engine].upload_bucket
         if architecture != 'x86_64':
           bucket += '-' + architecture
-        upload_url = get_signed_url(
-            UPLOAD_URL_FORMAT.format(bucket, name, zip_file))
-        srcmap_url = get_signed_url(
-            UPLOAD_URL_FORMAT.format(bucket, name, stamped_srcmap_file))
+        upload_url = build_lib.get_signed_url(
+            build_lib.GCS_UPLOAD_URL_FORMAT.format(bucket, name, zip_file))
+        srcmap_url = build_lib.get_signed_url(
+            build_lib.GCS_UPLOAD_URL_FORMAT.format(bucket, name,
+                                                   stamped_srcmap_file))
-        targets_list_filename = get_targets_list_filename(sanitizer)
-        targets_list_url = get_signed_url(
-            get_targets_list_url(bucket, name, sanitizer))
+        targets_list_filename = build_lib.get_targets_list_filename(sanitizer)
+        targets_list_url = build_lib.get_signed_url(
+            build_lib.get_targets_list_url(bucket, name, sanitizer))
         env.append('OUT=' + out)
         env.append('MSAN_LIBS_PATH=/workspace/msan')
@@ -320,6 +267,13 @@ def get_build_steps(project_dir):
           ],
       })
+      if sanitizer == 'dataflow' and fuzzing_engine == 'dataflow':
+        dataflow_steps = dataflow_post_build_steps(name)
+        if dataflow_steps:
+          build_steps.extend(dataflow_steps)
+        else:
+          sys.stderr.write('Skipping dataflow post build steps.\n')
   build_steps.extend([
       # generate targets list
       {
@@ -383,22 +337,34 @@ def get_build_steps(project_dir):
   return build_steps
 def dataflow_post_build_steps(project_name):
-  steps = []
+  download_corpora_step = build_lib.download_corpora_step(project_name)
+  if not download_corpora_step:
+    return None
+  steps = [download_corpora_step]
   steps.append({
       'name': 'gcr.io/oss-fuzz-base/base-runner',
       'args': [
           'bash', '-c',
           ('for f in /corpus/*.zip; do unzip -q $f -d ${f%%.*}; done && '
            'collect_dft || (echo "DFT collection failed." && false)')
       ],
       'volumes': [{
           'name': 'corpus',
           'path': '/corpus'
       }],
   })
   return steps
 def get_logs_url(build_id):
   URL_FORMAT = ('https://console.developers.google.com/logs/viewer?'
                 'resource=build%2Fbuild_id%2F{0}&project=oss-fuzz')
   return URL_FORMAT.format(build_id)
-def get_targets_list_filename(sanitizer):
-  return TARGETS_LIST_BASENAME + '.' + sanitizer
-def get_targets_list_url(bucket, project, sanitizer):
-  filename = get_targets_list_filename(sanitizer)
-  url = UPLOAD_URL_FORMAT.format(bucket, project, filename)
-  return url
 def run_build(build_steps, project_name, tag):
   options = {}
   if 'GCB_OPTIONS' in os.environ:
@@ -406,7 +372,7 @@ def run_build(build_steps, project_name, tag):
   build_body = {
       'steps': build_steps,
-      'timeout': str(BUILD_TIMEOUT) + 's',
+      'timeout': str(build_lib.BUILD_TIMEOUT) + 's',
       'options': options,
       'logsBucket': GCB_LOGS_BUCKET,
       'tags': [project_name + '-' + tag,],

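A note on the unzip loop in dataflow_post_build_steps above: the parameter expansion ${f%%.*} strips everything from the first dot, so each corpus archive is unpacked into a directory named after its target before collect_dft runs. A minimal sketch with a hypothetical target name:

    # /corpus/my_target.zip is extracted into /corpus/my_target/
    for f in /corpus/*.zip; do unzip -q $f -d ${f%%.*}; done
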
infra/gcb/cancel.py

@@ -15,7 +15,6 @@ import urllib
 import yaml
 from oauth2client.client import GoogleCredentials
-from oauth2client.service_account import ServiceAccountCredentials
 from googleapiclient.discovery import build
@@ -32,8 +31,9 @@ def main():
   credentials = GoogleCredentials.get_application_default()
   cloudbuild = build('cloudbuild', 'v1', credentials=credentials)
-  print cloudbuild.projects().builds().cancel(
-      projectId='oss-fuzz', id=build_id, body={}).execute()
+  print cloudbuild.projects().builds().cancel(projectId='oss-fuzz',
+                                              id=build_id,
+                                              body={}).execute()
 if __name__ == '__main__':