From b9bbb0d450a67fcf331769558c32fdcb5647c396 Mon Sep 17 00:00:00 2001 From: Danrich Parrol Date: Tue, 3 Feb 2015 22:04:23 -0800 Subject: [PATCH] Fix segfault in Damerau-Levenshtein C code. If one of the characters had a value of 128 or above, this would be treated as a signed char and would result in an array lookup with a negative index. The somewhat contrived test case given here -- comparing a space with a non-breaking space -- reproduces the segmentation fault prior to the fix. This also makes a Clang warning go away. Thanks, compiler! :-) --- cjellyfish/damerau_levenshtein.c | 4 ++-- jellyfish/test.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/cjellyfish/damerau_levenshtein.c b/cjellyfish/damerau_levenshtein.c index c24e604..98bb938 100644 --- a/cjellyfish/damerau_levenshtein.c +++ b/cjellyfish/damerau_levenshtein.c @@ -40,7 +40,7 @@ int damerau_levenshtein_distance(const char *s1, const char *s2) for (i = 1; i <= len1; i++) { db = 0; for (j = 1; j <= len2; j++) { - i1 = da[(size_t)(s2[j-1])]; + i1 = da[(unsigned char)s2[j-1]]; j1 = db; if (s1[i - 1] == s2[j - 1]) { @@ -58,7 +58,7 @@ int damerau_levenshtein_distance(const char *s1, const char *s2) dist[((i+1)*cols) + j + 1] = MIN(MIN(d1, d2), MIN(d3, d4)); } - da[s1[i-1]] = i; + da[(unsigned char)s1[i-1]] = i; } result = dist[((len1+1) * cols) + len2 + 1]; diff --git a/jellyfish/test.py b/jellyfish/test.py index cce7c28..12a28a3 100644 --- a/jellyfish/test.py +++ b/jellyfish/test.py @@ -59,7 +59,8 @@ class JellyfishTests(object): ("abcd", "acb", 2), ("cape sand recycling ", "edith ann graham", 17), ("jellyifhs", "jellyfish", 2), - ("ifhs", "fish", 2)] + ("ifhs", "fish", 2), + ("Hello, world!", "Hello,\xc2\xa0world!", 2)] for (s1, s2, value) in cases: self.assertEqual(self.jf.damerau_levenshtein_distance(s1, s2), value)