Re: multibyte charater set in levenshtein function

From: Alexander Korotkov <aekorotkov(at)gmail(dot)com>
To: Robert Haas <robertmhaas(at)gmail(dot)com>
Cc: Itagaki Takahiro <itagaki(dot)takahiro(at)gmail(dot)com>, alvherre(at)commandprompt(dot)com, PostgreSQL Hackers <pgsql-hackers(at)postgresql(dot)org>
Subject: Re: multibyte charater set in levenshtein function
Date: 2010-07-22 08:23:35
Message-ID: AANLkTilQ5bzoOcA_A6mRC0U8BOyXcmwsd4BzZc_0QwlK@mail.gmail.com
Views: Raw Message | Whole Thread | Download mbox | Resend email
Thread:
Lists: pgsql-hackers

Such version with macros and includes can look like this:

#ifdef MULTIBYTE
#define NEXT_X (x+= char_lens[i-1])
#define NEXT_Y (y+= y_char_len)
#define CMP (char_cmp(x, char_lens[i-1], y, y_char_len))
#else
#define NEXT_X (x++)
#define NEXT_Y (y++)
#define CMP (*x == *y)
#endif

static int
levenshtein_internal(text *s, text *t, int ins_c, int del_c, int sub_c)
{
int m,
n,
d;
int *prev;
int *curr;
int i,
j;
const char *x;
const char *y;
#ifdef MULTIBYTE
int *char_lens;
int y_char_len;
#endif

/*
* We should calculate number of characters for multibyte encodings
*/
m = pg_mbstrlen_with_len(VARDATA_ANY(s), VARSIZE_ANY_EXHDR(s));
n = pg_mbstrlen_with_len(VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));

/*
* We can transform an empty s into t with n insertions, or a non-empty
t
* into an empty s with m deletions.
*/
if (m == 0)
return n * ins_c;
if (n == 0)
return m * del_c;

/*
* For security concerns, restrict excessive CPU+RAM usage. (This
* implementation uses O(m) memory and has O(mn) complexity.)
*/
if (m > MAX_LEVENSHTEIN_STRLEN ||
n > MAX_LEVENSHTEIN_STRLEN)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("argument exceeds the maximum length of %d bytes",
MAX_LEVENSHTEIN_STRLEN)));

/* One more cell for initialization column and row. */
++m;
++n;

/*
* Instead of building an (m+1)x(n+1) array, we'll use two different
* arrays of size m+1 for storing accumulated values. At each step one
* represents the "previous" row and one is the "current" row of the
* notional large array.
* For multibyte encoding we'll also store array of lengths of
* characters of first string in order to avoid great number of
* pg_mblen calls.
*/
#ifdef MULTIBYTE
prev = (int *) palloc(3 * m * sizeof(int));
curr = prev + m;
char_lens = prev + 2 * m;
for (i = 0, x = VARDATA_ANY(s); i < m - 1; i++)
{
char_lens[i] = pg_mblen(x);
x += char_lens[i];
}
char_lens[i] = 0;
#else
prev = (int *) palloc(2 * m * sizeof(int));
curr = prev + m;
#endif

/* Initialize the "previous" row to 0..cols */
for (i = 0; i < m; i++)
prev[i] = i * del_c;

/*
* For multibyte encoding we should increase x and y pointers by the
* results of pg_mblen function. Also we should use CHAR_CMP macros
* for character comparison.
*/
for (y = VARDATA_ANY(t), j = 1; j < n; NEXT_Y, j++)
{
int *temp;
#ifdef MULTIBYTE
y_char_len = pg_mblen(y);
#endif

curr[0] = j * ins_c;

for (x = VARDATA_ANY(s), i = 1; i < m; NEXT_X, i++)
{
int ins;
int del;
int sub;

ins = prev[i] + ins_c;
del = curr[i - 1] + del_c;
sub = prev[i - 1] + (CMP ? 0 : sub_c);
curr[i] = Min(ins, del);
curr[i] = Min(curr[i], sub);
}

temp = curr;
curr = prev;
prev = temp;
}

d = prev[m - 1];

/*
* Because the final value was swapped from the previous row to the
* current row, that's where we'll find it.
*/
return d;
}

What do you thing about it?

----
With best regards,
Alexander Korotkov.

In response to

Browse pgsql-hackers by date

  From Date Subject
Next Message Yeb Havinga 2010-07-22 08:37:12 Re: Synchronous replication
Previous Message Dave Page 2010-07-22 08:18:29 Re: git config user.email