From bf5caa86c3a01c7efbb17a880074220f29dcefe8 Mon Sep 17 00:00:00 2001
From: Guido Vranken
Date: Thu, 7 Mar 2019 15:57:21 +0100
Subject: [PATCH] [tesseract-ocr] Add Tesseract (#2210)

* Add Tesseract

* Use -lz instead of static library path

* Disable Tesseract shared build

* Minimal repository cloning (--depth 1)

* Improve tessdata directory resolution syntax

* Don't hardcode TESSDATA_PREFIX into binary

* Don't move, but copy $SRC/tessdata to $OUT

Move sometimes results in "inter-device move failed"
---
 projects/tesseract-ocr/Dockerfile   | 40 +++++++++++++++++++++
 projects/tesseract-ocr/build.sh     | 55 +++++++++++++++++++++++++++++
 projects/tesseract-ocr/project.yaml |  2 ++
 3 files changed, 97 insertions(+)
 create mode 100644 projects/tesseract-ocr/Dockerfile
 create mode 100755 projects/tesseract-ocr/build.sh
 create mode 100644 projects/tesseract-ocr/project.yaml

diff --git a/projects/tesseract-ocr/Dockerfile b/projects/tesseract-ocr/Dockerfile
new file mode 100644
index 000000000..573afbc60
--- /dev/null
+++ b/projects/tesseract-ocr/Dockerfile
@@ -0,0 +1,40 @@
+# Copyright 2019 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+################################################################################
+
+FROM gcr.io/oss-fuzz-base/base-builder
+# MAINTAINER is deprecated; a label carries the same contact information.
+LABEL maintainer="guidovranken@gmail.com"
+
+# Autotools, wget, and the image/compression libraries build.sh links against.
+RUN apt-get update && apt-get install -y \
+    autoconf \
+    automake \
+    libjpeg8-dev \
+    libpng-dev \
+    libtiff5-dev \
+    libtool \
+    pkg-config \
+    wget \
+    zlib1g-dev
+
+# Leptonica 1.77.0 release tarball; build.sh unpacks and builds it.
+RUN wget https://github.com/DanBloomberg/leptonica/releases/download/1.77.0/leptonica-1.77.0.tar.gz
+
+# build.sh only uses the checked-out files, so fetch no history (--depth 1).
+RUN git clone --depth 1 https://github.com/tesseract-ocr/tesseract
+RUN git clone --depth 1 https://github.com/tesseract-ocr/tessdata
+RUN git clone --depth 1 https://github.com/guidovranken/tesseract-ocr-fuzzers
+COPY build.sh $SRC/
diff --git a/projects/tesseract-ocr/build.sh b/projects/tesseract-ocr/build.sh
new file mode 100755
index 000000000..ca6657270
--- /dev/null
+++ b/projects/tesseract-ocr/build.sh
@@ -0,0 +1,55 @@
+#!/bin/bash -eu
+# Copyright 2019 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+################################################################################
+
+# Build and install leptonica from the release tarball fetched by the Dockerfile.
+cd "$SRC"
+tar zxf leptonica-1.77.0.tar.gz
+cd leptonica-1.77.0
+./configure
+make -j"$(nproc)"
+make install
+ldconfig
+
+# Build tesseract as static libraries only; the fuzzer links libtesseract.a.
+cd "$SRC/tesseract"
+./autogen.sh
+CXXFLAGS="$CXXFLAGS -D_GLIBCXX_DEBUG" ./configure --disable-graphics --disable-shared
+make -j"$(nproc)"
+
+cd "$SRC/tesseract-ocr-fuzzers"
+
+# Copy rather than move: a move sometimes fails with "inter-device move failed".
+cp -R "$SRC/tessdata" "$OUT"
+
+# $CXXFLAGS is left unquoted on purpose so that it splits into separate flags.
+# The fuzzing engine comes from $LIB_FUZZING_ENGINE when the environment sets
+# it; otherwise the link falls back to -lFuzzingEngine.
+$CXX $CXXFLAGS \
+    -I "$SRC/tesseract/src/api" \
+    -I "$SRC/tesseract/src/ccstruct" \
+    -I "$SRC/tesseract/src/ccmain" \
+    -I "$SRC/tesseract/src/ccutil" \
+    "$SRC/tesseract-ocr-fuzzers/fuzzer-api.cpp" -o "$OUT/fuzzer-api" \
+    "$SRC/tesseract/src/api/.libs/libtesseract.a" \
+    /usr/local/lib/liblept.a \
+    /usr/lib/x86_64-linux-gnu/libtiff.a \
+    /usr/lib/x86_64-linux-gnu/libpng.a \
+    /usr/lib/x86_64-linux-gnu/libjpeg.a \
+    /usr/lib/x86_64-linux-gnu/libjbig.a \
+    /usr/lib/x86_64-linux-gnu/liblzma.a \
+    -lz \
+    ${LIB_FUZZING_ENGINE:--lFuzzingEngine}
diff --git a/projects/tesseract-ocr/project.yaml b/projects/tesseract-ocr/project.yaml
new file mode 100644
index 000000000..70b8e3a51
--- /dev/null
+++ b/projects/tesseract-ocr/project.yaml
@@ -0,0 +1,2 @@
+homepage: "https://github.com/tesseract-ocr/tesseract"
+primary_contact: "stjoweil@googlemail.com"