oss-fuzz/infra/base-images/base-builder/bash_parser.py

230 lines
7.2 KiB
Python

#!/usr/bin/python3
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
from glob import glob
import bashlex
def find_all_bash_scripts_in_src():
  """Collects the shell scripts that live below /src/.

  Every directory under /src/ is searched for `*.sh` files. A script is
  skipped when its path contains one of the ignored substrings listed in the
  body. The surviving paths are printed and returned; they are used to
  identify scripts that may have to be read during AST parsing, which is the
  case when a build script calls another build script.

  Returns:
    A list with the paths of the scripts that were kept.
  """
  ignored_substrings = {'aflplusplus', 'honggfuzz', '/fuzztest', '/centipede'}

  kept_scripts = []
  for dirpath, _, _ in os.walk('/src/'):
    for script_path in glob(os.path.join(dirpath, '*.sh')):
      if any(marker in script_path for marker in ignored_substrings):
        continue
      kept_scripts.append(script_path)

  print(kept_scripts)
  return kept_scripts
def should_discard_command(ast_tree) -> bool:
  """Returns True if the command should be avoided, otherwise False.

  A command is avoided when its first word contains one of the names in
  `cmds_to_avoid_replaying` (substring match, so e.g. `./configure` is
  covered), or when its first word contains `make` and its second word
  contains `clean`.

  Args:
    ast_tree: Command node; `ast_tree.parts[i].word` is read for the first
      two parts.

  Returns:
    True when the command should be left out. False otherwise, including when
    the first or second part is missing or carries no `word` attribute.
  """
  cmds_to_avoid_replaying = {
      'configure', 'autoheader', 'autoconf', 'autoreconf', 'cmake', 'autogen.sh'
  }

  # Only lookup failures mean "no such word"; anything else (for instance
  # SystemExit or KeyboardInterrupt) must propagate instead of being swallowed.
  try:
    first_word = ast_tree.parts[0].word
  except (AttributeError, IndexError, TypeError):
    return False

  if any(cmd in first_word for cmd in cmds_to_avoid_replaying):
    return True

  # Avoid all "make clean" calls. We don't want to erase previously built
  # files.
  try:
    second_word = ast_tree.parts[1].word
  except (AttributeError, IndexError, TypeError):
    return False

  # If this does not match either, there is no indication the command should
  # be avoided.
  return 'make' in first_word and 'clean' in second_word
def is_local_redirection(ast_node, all_local_scripts):
  """Return the list of scripts corresponding to the command, in case
  the command is an execution of a local script.

  Two command shapes are recognised:
    * `. random/path/build.sh`: the first word is `.` and the second word
      names the script.
    * `$SRC/random/path/build.sh`: the first word contains `$SRC`.

  In both cases `$SRC` in the script word is replaced by `src`, and every
  entry of `all_local_scripts` that ends with the resulting string is
  reported.

  Args:
    ast_node: Command node; the `word` of its first (and possibly second)
      part is inspected.
    all_local_scripts: Script paths to match against.

  Returns:
    The list of matching script paths. Empty when the command is not a local
    script execution, when the node has no parts, or when the inspected part
    has no `word` attribute (instead of raising AttributeError).
  """
  parts = ast_node.parts
  if not parts:
    return []

  # Parts without a `word` attribute cannot name a script.
  first_word = getattr(parts[0], 'word', None)
  if first_word is None:
    return []

  # Capture a local script sourced with `. random/path/build.sh`
  if len(parts) >= 2 and first_word == '.':
    script_word = getattr(parts[1], 'word', None)
    if script_word is None:
      return []
    cmd_to_exec = script_word.replace('$SRC', 'src')
    return [
        bash_script for bash_script in all_local_scripts
        if bash_script.endswith(cmd_to_exec)
    ]

  # Capture a local script called with $SRC/random/path/build.sh
  if '$SRC' in first_word:
    print(first_word)
    cmd_to_exec = first_word.replace('$SRC', 'src')
    suffixes_matching = []
    for bash_script in all_local_scripts:
      print("- %s" % (bash_script))
      if bash_script.endswith(cmd_to_exec):
        suffixes_matching.append(bash_script)
    print(suffixes_matching)
    return suffixes_matching

  return []
def handle_ast_command(ast_node, all_scripts_in_fs, raw_script):
  """Generate bash script string for command node.

  Args:
    ast_node: Command node; `ast_node.pos` holds the start and end offsets of
      the command inside `raw_script`.
    all_scripts_in_fs: Script paths used to detect calls to local scripts.
    raw_script: Text of the script that `ast_node` was parsed from.

  Returns:
    '' when `should_discard_command` rejects the command. When the command
    resolves to exactly one local script, the result of running `parse_script`
    on that script followed by a newline. Otherwise the command's own text
    with every `mkdir` turned into `mkdir -p`.
  """
  if should_discard_command(ast_node):
    return ''

  matches = is_local_redirection(ast_node, all_scripts_in_fs)
  if len(matches) == 1:
    # The command runs another local build script: replace the call with the
    # replayable content of that script. Returning '' here would throw away
    # the script that was just parsed.
    return parse_script(matches[0], all_scripts_in_fs) + '\n'

  # Extract the command from the script string
  new_script = raw_script[ast_node.pos[0]:ast_node.pos[1]]

  # If mkdir is used, then ensure that '-p' is provided, as
  # otherwise we will run into failures. We don't have to worry
  # about multiple uses of -p as `mkdir -p -p -p` is valid.
  # NOTE: this is a plain textual replacement over the whole command.
  return new_script.replace('mkdir', 'mkdir -p')
def handle_ast_list(ast_node, all_scripts_in_fs, raw_script):
  """Handles bashlex AST list.

  Each part of the list is rendered in order: nested `list` parts go through
  `handle_ast_list`, `command` parts through `handle_ast_command`, and parts
  of any other kind are copied verbatim from `raw_script` using their `pos`
  offsets. Every rendered part is followed by a single space.

  Args:
    ast_node: List node whose `parts` are rendered.
    all_scripts_in_fs: Script paths, passed through to the command handler.
    raw_script: Text of the script that `ast_node` was parsed from.

  Returns:
    The rebuilt list as a string, or '' when bashlex fails to parse the
    rebuilt string.
  """
  rendered_parts = []
  for part in ast_node.parts:
    if part.kind == 'list':
      rendered = handle_ast_list(part, all_scripts_in_fs, raw_script)
    elif part.kind == 'command':
      rendered = handle_ast_command(part, all_scripts_in_fs, raw_script)
    else:
      rendered = raw_script[part.pos[0]:part.pos[1]]
    rendered_parts.append(rendered + ' ')
  new_script = ''.join(rendered_parts)

  # Make sure what was created is valid syntax, and otherwise return empty.
  # Dropping parts (discarded commands) and joining with spaces can leave an
  # invalid construct behind, so any parser failure is treated as "skip".
  try:
    bashlex.parse(new_script)
  except Exception:  # pylint: disable=broad-except
    # Maybe return the original here instead of skipping?
    return ''
  return new_script
def handle_ast_compound(ast_node, all_scripts_in_fs, raw_script):
  """Handles bashlex compound AST node.

  The node is not rewritten: the slice of `raw_script` delimited by
  `ast_node.pos` is returned as-is with a trailing newline appended.
  `all_scripts_in_fs` is accepted for signature parity but not used.
  """
  start, end = ast_node.pos[0], ast_node.pos[1]
  return raw_script[start:end] + '\n'
def handle_node(ast_node, all_scripts_in_fs, build_script):
  """Generates a bash script string for a given node.

  Dispatches on `ast_node.kind`: `command`, `list` and `compound` nodes are
  forwarded to their dedicated handlers, `pipeline` nodes yield '' and any
  other kind raises an Exception.
  """
  kind = ast_node.kind

  # Pipelines are not supported, so they contribute nothing.
  if kind == 'pipeline':
    return ''

  if kind == 'command':
    return handle_ast_command(ast_node, all_scripts_in_fs, build_script)

  if kind == 'list':
    return handle_ast_list(ast_node, all_scripts_in_fs, build_script)

  if kind == 'compound':
    print('todo: handle compound')
    return handle_ast_compound(ast_node, all_scripts_in_fs, build_script)

  raise Exception(f'Missing node handling: {ast_node.kind}')
def parse_script(bash_script, all_scripts) -> str:
  """Top-level bash script parser.

  Reads `bash_script`, parses it with bashlex and renders every top-level
  node through `handle_node`, appending a newline after each rendered node.
  For each node a separator line, the node kind and the node dump are
  printed.

  Args:
    bash_script: Path of the bash script to read (opened as UTF-8).
    all_scripts: Script paths, passed through to `handle_node`.

  Returns:
    The rendered script, or '' when bashlex reports a parsing error for the
    file.
  """
  with open(bash_script, 'r', encoding='utf-8') as f:
    build_script = f.read()

  try:
    parts = bashlex.parse(build_script)
  # The exception class lives in the `bashlex.errors` module; there is no
  # `bashlex.error`, so naming that here would raise AttributeError exactly
  # when a parse failure needs to be handled.
  except bashlex.errors.ParsingError:
    return ''

  rendered_nodes = []
  for part in parts:
    rendered_nodes.append(handle_node(part, all_scripts, build_script) + '\n')
    print("-" * 45)
    print(part.kind)
    print(part.dump())
  return ''.join(rendered_nodes)
def main():
  """Main function.

  Builds the replayable version of the script given as first command-line
  argument, prints it between two separator lines, and writes it both to
  /out/replay-build-script.sh and to replay_build.sh inside the directory
  named by the SRC environment variable (/src when unset).
  """
  known_scripts = find_all_bash_scripts_in_src()
  replay_bash_script = parse_script(sys.argv[1], known_scripts)

  separator = "#" * 60
  print("REPLAYABLE BASH SCRIPT")
  print(separator)
  print(replay_bash_script)
  print(separator)

  src_dir = os.getenv('SRC', '/src')
  destinations = ('/out/replay-build-script.sh', f'{src_dir}/replay_build.sh')
  for destination in destinations:
    with open(destination, 'w', encoding='utf-8') as out_file:
      out_file.write(replay_bash_script)


if __name__ == "__main__":
  main()