// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
//////////////////////////////////////////////////////////////////////////////////

import com.code_intelligence.jazzer.api.FuzzedDataProvider;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

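/**
 * Fuzzer for Lucene's {@code CustomAnalyzer} builder API: it assembles an
 * analyzer from a fuzzer-chosen tokenizer plus random sets of char filters and
 * token filters (all referenced by their SPI names), then runs the result over
 * fuzzer-generated text.
 */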
public class CustomAnalyzerFuzzer {
  // SPI names of the Lucene analysis factories (TokenizerFactory,
  // CharFilterFactory, TokenFilterFactory) that the fuzzer may combine.
  static String[] tokenizerArray = {
    "standard",
    "simplePattern",
    "classic",
    "whitespace",
    "uax29UrlEmail",
    "pathHierarchy",
    "wikipedia",
    "nGram",
    "edgeNGram",
    "thai",
    "pattern",
    "simplePatternSplit",
    "letter",
    "keyword"
  };

  static String[] charFilterArray = {
    "htmlStrip",
    "cjkWidth",
    "mapping",
    "patternReplace",
    "persian"
  };

  static String[] tokenFilterArray = {
    "apostrophe",
    "arabicNormalization",
    "arabicStem",
    "asciiFolding",
    "bengaliNormalization",
    "bengaliStem",
    "brazilianStem",
    "bulgarianStem",
    "capitalization",
    "cjkBigram",
    "cjkWidth",
    "classic",
    "codepointCount",
    "commonGrams",
    "commonGramsQuery",
    "concatenateGraph",
    "czechStem",
    "dateRecognizer",
    "decimalDigit",
    "delimitedPayload",
    "delimitedTermFrequency",
    "dictionaryCompoundWord",
    "edgeNGram",
    "elision",
    "englishMinimalStem",
    "englishPossessive",
    "fingerprint",
    "finnishLightStem",
    "fixBrokenOffsets",
    "fixedShingle",
    "flattenGraph",
    "frenchLightStem",
    "frenchMinimalStem",
    "galicianMinimalStem",
    "galicianStem",
    "germanLightStem",
    "germanMinimalStem",
    "germanNormalization",
    "germanStem",
    "greekLowercase",
    "greekStem",
    "hindiNormalization",
    "hindiStem",
    "hungarianLightStem",
    "hunspellStem",
    "hyphenatedWords",
    "hyphenationCompoundWord",
    "indicNormalization",
    "indonesianStem",
    "irishLowercase",
    "italianLightStem",
    "kStem",
    "keepWord",
    "keywordMarker",
    "keywordRepeat",
    "latvianStem",
    "length",
    "limitTokenCount",
    "limitTokenOffset",
    "limitTokenPosition",
    "lowercase",
    "minHash",
    "nGram",
    "norwegianLightStem",
    "norwegianMinimalStem",
    "numericPayload",
    "patternCaptureGroup",
    "patternReplace",
    "persianNormalization",
    "porterStem",
    "portugueseLightStem",
    "portugueseMinimalStem",
    "portugueseStem",
    "protectedTerm",
    "removeDuplicates",
    "reverseString",
    "russianLightStem",
    "scandinavianFolding",
    "scandinavianNormalization",
    "serbianNormalization",
    "shingle",
    "snowballPorter",
    "soraniNormalization",
    "soraniStem",
    "spanishLightStem",
    "spanishMinimalStem",
    "stemmerOverride",
    "stop",
    "swedishLightStem",
    "synonym",
    "synonymGraph",
    "tokenOffsetPayload",
    "trim",
    "truncate",
    "turkishLowercase",
    "type",
    "typeAsPayload",
    "typeAsSynonym",
    "uppercase",
    "wordDelimiter",
    "wordDelimiterGraph"
  };
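
  // The harness below drives CustomAnalyzer.Builder with fuzzer-chosen factory
  // names and fuzzer-generated text: pickValues() draws a random subset of each
  // array, so filter chains from empty up to the whole array are exercised.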

  public static void fuzzerTestOneInput(FuzzedDataProvider data) {
    List<String> selectedCharFilters =
        data.pickValues(charFilterArray, data.consumeInt(0, charFilterArray.length));
    List<String> selectedTokenFilters =
        data.pickValues(tokenFilterArray, data.consumeInt(0, tokenFilterArray.length));
    String str0 = data.consumeString(100);
    String str1 = data.consumeRemainingAsString();
    CustomAnalyzer.Builder cb = CustomAnalyzer.builder();

    try {
      cb.withTokenizer(data.pickValue(tokenizerArray));

      for (String cf : selectedCharFilters) {
        cb.addCharFilter(cf);
      }

      for (String tf : selectedTokenFilters) {
        cb.addTokenFilter(tf);
      }

      Analyzer analyzer = cb.build();
      analyze(str0, str1, analyzer);
    } catch (IOException e) {
      // IOException must be caught or declared to be thrown according to the docs.
    } catch (IllegalStateException e) {
      // IllegalStateException is caught because specific crashing input triggers it at line 185
      // of GraphTokenFilter.java:
      // https://github.com/apache/lucene/blob/main/lucene/core/src/java/org/apache/lucene/analysis/GraphTokenFilter.java#L185
    } catch (IllegalArgumentException e) {
      // IllegalArgumentException is not documented but is thrown for specific crashing input.
    }
  }
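
  // Illustrative sketch, not part of the harness: the same CustomAnalyzer
  // pipeline with one fixed configuration, handy for reproducing a finding
  // outside the fuzzer. "standard", "htmlStrip" and "lowercase" are standard
  // Lucene factory SPI names; the field name "field" is an arbitrary placeholder.
  static List<String> analyzeWithFixedPipeline(String text) throws IOException {
    Analyzer analyzer =
        CustomAnalyzer.builder()
            .withTokenizer("standard")
            .addCharFilter("htmlStrip")
            .addTokenFilter("lowercase")
            .build();
    try {
      return analyze("field", text, analyzer);
    } finally {
      analyzer.close();
    }
  }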

  public static List<String> analyze(String field, String text, Analyzer analyzer)
      throws IOException {
    List<String> result = new ArrayList<>();
    // Standard TokenStream consumption contract: reset(), incrementToken() in a loop,
    // then end(); try-with-resources guarantees close() so the analyzer can be reused.
    try (TokenStream tokenStream = analyzer.tokenStream(field, text)) {
      CharTermAttribute attr = tokenStream.addAttribute(CharTermAttribute.class);
      tokenStream.reset();
      while (tokenStream.incrementToken()) {
        result.add(attr.toString());
      }
      tokenStream.end();
    }
    return result;
  }
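
  // Illustrative only: a hypothetical entry point for running the fixed pipeline
  // by hand (Jazzer normally invokes fuzzerTestOneInput directly). Assumes
  // lucene-core and lucene-analysis-common are on the classpath.
  public static void main(String[] args) throws IOException {
    System.out.println(analyzeWithFixedPipeline("<b>Hello</b> Fuzzing World"));
  }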
}