142 lines
4.1 KiB
C
142 lines
4.1 KiB
C
#include <ctype.h>
|
|
#include <string.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include "jellyfish.h"
|
|
|
|
#define NOTNUM(c) ((c>57) || (c<48))
|
|
#define INRANGE(c) ((c>0) && (c<91))
|
|
|
|
#ifndef NaN
|
|
#define NaN (0.0 / 0.0)
|
|
#endif
|
|
|
|
/* borrowed heavily from strcmp95.c
|
|
* http://www.census.gov/geo/msb/stand/strcmp.c
|
|
*/
|
|
double _jaro_winkler(const char *ying, const char *yang,
|
|
bool long_tolerance, bool winklerize)
|
|
{
|
|
/* Arguments:
|
|
|
|
ying
|
|
yang
|
|
pointers to the 2 strings to be compared.
|
|
|
|
long_tolerance
|
|
Increase the probability of a match when the number of matched
|
|
characters is large. This option allows for a little more
|
|
tolerance when the strings are large. It is not an appropriate
|
|
test when comparing fixed length fields such as phone and
|
|
social security numbers.
|
|
*/
|
|
char *ying_flag=0, *yang_flag=0;
|
|
|
|
double weight;
|
|
|
|
long ying_length, yang_length, min_len;
|
|
long search_range;
|
|
long lowlim, hilim;
|
|
long trans_count, common_chars;
|
|
|
|
int i, j, k;
|
|
|
|
// ensure that neither string is blank
|
|
ying_length = strlen(ying);
|
|
yang_length = strlen(yang);
|
|
if (!ying_length || !yang_length) return 0;
|
|
|
|
search_range = min_len = (ying_length > yang_length) ? ying_length : yang_length;
|
|
|
|
// Blank out the flags
|
|
ying_flag = alloca(ying_length + 1);
|
|
if (!ying_flag) return NaN;
|
|
|
|
yang_flag = alloca(yang_length + 1);
|
|
if (!yang_flag) return NaN;
|
|
|
|
memset(ying_flag, 0, ying_length + 1);
|
|
memset(yang_flag, 0, yang_length + 1);
|
|
|
|
search_range = (search_range/2) - 1;
|
|
if (search_range < 0) search_range = 0;
|
|
|
|
|
|
// Looking only within the search range, count and flag the matched pairs.
|
|
common_chars = 0;
|
|
for (i = 0; i < ying_length; i++) {
|
|
lowlim = (i >= search_range) ? i - search_range : 0;
|
|
hilim = (i + search_range <= yang_length-1) ? (i + search_range) : yang_length-1;
|
|
for (j = lowlim; j <= hilim; j++) {
|
|
if (!yang_flag[j] && yang[j] == ying[i]) {
|
|
yang_flag[j] = 1;
|
|
ying_flag[i] = 1;
|
|
common_chars++;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
// If no characters in common - return
|
|
if (!common_chars) return 0;
|
|
|
|
// Count the number of transpositions
|
|
k = trans_count = 0;
|
|
for (i = 0; i < ying_length; i++) {
|
|
if (ying_flag[i]) {
|
|
for (j = k; j < yang_length; j++) {
|
|
if (yang_flag[j]) {
|
|
k = j + 1;
|
|
break;
|
|
}
|
|
}
|
|
if (ying[i] != yang[j]) {
|
|
trans_count++;
|
|
}
|
|
}
|
|
}
|
|
trans_count /= 2;
|
|
|
|
// adjust for similarities in nonmatched characters
|
|
|
|
// Main weight computation.
|
|
weight= common_chars / ((double) ying_length) + common_chars / ((double) yang_length)
|
|
+ ((double) (common_chars - trans_count)) / ((double) common_chars);
|
|
weight /= 3.0;
|
|
|
|
// Continue to boost the weight if the strings are similar
|
|
if (winklerize && weight > 0.7 && ying_length > 3 && yang_length > 3) {
|
|
|
|
// Adjust for having up to the first 4 characters in common
|
|
j = (min_len >= 4) ? 4 : min_len;
|
|
for (i=0; ((i<j) && (ying[i] == yang[i]) && (NOTNUM(ying[i]))); i++);
|
|
if (i) {
|
|
weight += i * 0.1 * (1.0 - weight);
|
|
}
|
|
|
|
/* Optionally adjust for long strings. */
|
|
/* After agreeing beginning chars, at least two more must agree and
|
|
the agreeing characters must be > .5 of remaining characters.
|
|
*/
|
|
if ((long_tolerance) && (min_len>4) && (common_chars>i+1) && (2*common_chars>=min_len+i)) {
|
|
if (NOTNUM(ying[0])) {
|
|
weight += (double) (1.0-weight) *
|
|
((double) (common_chars-i-1) / ((double) (ying_length+yang_length-i*2+2)));
|
|
}
|
|
}
|
|
}
|
|
|
|
return weight;
|
|
}
|
|
|
|
|
|
double jaro_winkler(const char *ying, const char *yang, bool long_tolerance)
|
|
{
|
|
return _jaro_winkler(ying, yang, long_tolerance, true);
|
|
}
|
|
|
|
double jaro_distance(const char *ying, const char *yang)
|
|
{
|
|
return _jaro_winkler(ying, yang, false, false);
|
|
}
|