oss-fuzz/projects/lucene/CustomAnalyzerFuzzer.java

// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
//////////////////////////////////////////////////////////////////////////////////
import com.code_intelligence.jazzer.api.FuzzedDataProvider;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
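
/**
 * OSS-Fuzz harness for Lucene's {@link CustomAnalyzer}. Jazzer supplies the input: the harness
 * picks one tokenizer plus arbitrary subsets of the char filter and token filter factory names
 * below, builds the analyzer, and tokenizes fuzzer-generated text with it.
 */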
public class CustomAnalyzerFuzzer {
  // Tokenizer factory names understood by CustomAnalyzer.Builder#withTokenizer.
  static final String[] tokenizerArray = {
    "standard",
    "simplePattern",
    "classic",
    "whitespace",
    "uax29UrlEmail",
    "pathHierarchy",
    "wikipedia",
    "nGram",
    "edgeNGram",
    "thai",
    "pattern",
    "simplePatternSplit",
    "letter",
    "keyword"
  };
  // Char filter factory names accepted by CustomAnalyzer.Builder#addCharFilter.
  static final String[] charFilterArray = {
    "htmlStrip",
    "cjkWidth",
    "mapping",
    "patternReplace",
    "persian"
  };
  // Token filter factory names accepted by CustomAnalyzer.Builder#addTokenFilter.
  static final String[] tokenFilterArray = {
    "apostrophe",
    "arabicNormalization",
    "arabicStem",
    "asciiFolding",
    "bengaliNormalization",
    "bengaliStem",
    "brazilianStem",
    "bulgarianStem",
    "capitalization",
    "cjkBigram",
    "cjkWidth",
    "classic",
    "codepointCount",
    "commonGrams",
    "commonGramsQuery",
    "concatenateGraph",
    "czechStem",
    "dateRecognizer",
    "decimalDigit",
    "delimitedPayload",
    "delimitedTermFrequency",
    "dictionaryCompoundWord",
    "edgeNGram",
    "elision",
    "englishMinimalStem",
    "englishPossessive",
    "fingerprint",
    "finnishLightStem",
    "fixBrokenOffsets",
    "fixedShingle",
    "flattenGraph",
    "frenchLightStem",
    "frenchMinimalStem",
    "galicianMinimalStem",
    "galicianStem",
    "germanLightStem",
    "germanMinimalStem",
    "germanNormalization",
    "germanStem",
    "greekLowercase",
    "greekStem",
    "hindiNormalization",
    "hindiStem",
    "hungarianLightStem",
    "hunspellStem",
    "hyphenatedWords",
    "hyphenationCompoundWord",
    "indicNormalization",
    "indonesianStem",
    "irishLowercase",
    "italianLightStem",
    "kStem",
    "keepWord",
    "keywordMarker",
    "keywordRepeat",
    "latvianStem",
    "length",
    "limitTokenCount",
    "limitTokenOffset",
    "limitTokenPosition",
    "lowercase",
    "minHash",
    "nGram",
    "norwegianLightStem",
    "norwegianMinimalStem",
    "numericPayload",
    "patternCaptureGroup",
    "patternReplace",
    "persianNormalization",
    "porterStem",
    "portugueseLightStem",
    "portugueseMinimalStem",
    "portugueseStem",
    "protectedTerm",
    "removeDuplicates",
    "reverseString",
    "russianLightStem",
    "scandinavianFolding",
    "scandinavianNormalization",
    "serbianNormalization",
    "shingle",
    "snowballPorter",
    "soraniNormalization",
    "soraniStem",
    "spanishLightStem",
    "spanishMinimalStem",
    "stemmerOverride",
    "stop",
    "swedishLightStem",
    "synonym",
    "synonymGraph",
    "tokenOffsetPayload",
    "trim",
    "truncate",
    "turkishLowercase",
    "type",
    "typeAsPayload",
    "typeAsSynonym",
    "uppercase",
    "wordDelimiter",
    "wordDelimiterGraph"
  };
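
  /**
   * Fuzzer entry point invoked by Jazzer for each input. Documented or known-benign exceptions
   * are swallowed so that only unexpected throwables surface as findings.
   */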
  public static void fuzzerTestOneInput(FuzzedDataProvider data) {
    // The builder takes exactly one tokenizer (picked below via pickValue), while char
    // filters and token filters may be chained, so random subsets of those are selected.
    List<String> selectedCharFilters =
        data.pickValues(charFilterArray, data.consumeInt(0, charFilterArray.length));
    List<String> selectedTokenFilters =
        data.pickValues(tokenFilterArray, data.consumeInt(0, tokenFilterArray.length));
    String field = data.consumeString(100);
    String text = data.consumeRemainingAsString();
    CustomAnalyzer.Builder cb = CustomAnalyzer.builder();
    try {
      cb.withTokenizer(data.pickValue(tokenizerArray));
      for (String cf : selectedCharFilters) {
        cb.addCharFilter(cf);
      }
      for (String tf : selectedTokenFilters) {
        cb.addTokenFilter(tf);
      }
      try (Analyzer analyzer = cb.build()) {
        analyze(field, text, analyzer);
      }
    } catch (IOException e) {
      // The Analyzer APIs declare IOException, so it must be caught here.
    } catch (IllegalStateException e) {
      // Thrown for specific inputs by GraphTokenFilter:
      // https://github.com/apache/lucene/blob/main/lucene/core/src/java/org/apache/lucene/analysis/GraphTokenFilter.java#L185
    } catch (IllegalArgumentException e) {
      // Undocumented, but thrown for specific filter/input combinations.
    }
  }
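
  /**
   * Tokenizes {@code text} with {@code analyzer} for the given field name and returns the terms.
   * A minimal usage sketch (the analyzer choice here is illustrative only):
   *
   * <pre>{@code
   * Analyzer a = CustomAnalyzer.builder().withTokenizer("whitespace").build();
   * List<String> terms = analyze("body", "hello world", a); // ["hello", "world"]
   * }</pre>
   */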
  public static List<String> analyze(String field, String text, Analyzer analyzer)
      throws IOException {
    List<String> result = new ArrayList<>();
    try (TokenStream tokenStream = analyzer.tokenStream(field, text)) {
      CharTermAttribute attr = tokenStream.addAttribute(CharTermAttribute.class);
      tokenStream.reset();
      while (tokenStream.incrementToken()) {
        result.add(attr.toString());
      }
      // end() must be called after the last incrementToken() to satisfy the
      // TokenStream contract; close() is handled by try-with-resources.
      tokenStream.end();
    }
    return result;
  }
}