/* The "bitap" algorithm for fuzzy string matching, based on the paper "Fast Text Searching with Errors," by Sun Wu and Udi Manber, 1991. Note that this implementation is "backwards and upside-down": the meanings of 0 and 1 are reversed, as is the direction of shifting. That is, where Wu and Manber write a = (b | c) & (d << 1), this implementation writes a = (b & c) | (d >> 1). This source code is released to the public domain by Arthur O'Dwyer, November 2005. A similar version is available under the GFDL; see Wikipedia, article "Bitap algorithm," 26 July 2005. */ #include #include #include #include /* This function only works for patterns up to 31 characters long! */ int bitap_search(const char *text, const char *pattern, int k) { int m = strlen(pattern); unsigned long *R; unsigned long pattern_mask[CHAR_MAX+1]; int i, d; if (pattern[0] == '\0') return 0; if (m > 31) return -3; /* Initialize the bit array R */ R = malloc((k+2) * sizeof *R); ++R; for (d=-1; d <= k; ++d) R[d] = ~((1<<(d+1))-1); /* d+1 ones */ /* Initialize the pattern bitmasks */ for (i=0; i <= CHAR_MAX; ++i) pattern_mask[i] = ~0; for (i=0; i < m; ++i) pattern_mask[pattern[i]] &= ~(1UL << i); for (i=0; text[i] != '\0'; ++i) { /* Update the bit arrays */ unsigned long old_Rd1 = ~0; for (d=0; d <= k; ++d) { unsigned long old_Rd = R[d]; /* Account for substitutions, insertions, deletions */ R[d] = ((old_Rd | pattern_mask[text[i]]) << 1) & ((old_Rd1 & R[d-1]) << 1) & old_Rd1; if (0 == (R[d] & (1UL << m))) { free(R-1); return d; } old_Rd1 = old_Rd; } } free(R-1); return -1; } int main() { int i; #define t(s,x) for (i=0; ; ++i) { \ int f = bitap_search(s,x,i); \ if (f >= 0) { \ printf("%s is %d steps from %s\n", s,f,x); \ break; \ } \ } t("perl", "perl"); t("erlangen", "perl"); t("prl", "perl"); t("per", "perl"); t("pern", "perl"); t("gnarl", "perl"); t("peerl", "perl"); t("peel", "perl"); t("peril", "perl"); t("opel", "perl"); t("appeal", "perl"); t("hyperbola", "perl"); t("merlin", "perl"); t("pearly", "perl"); t("parlance", "perl"); t("perk", "perl"); t("superappeal", "perl"); t("superlative", "perl"); t("magic", "perl"); t("marlin", "perl"); t("peeve", "perl"); t("pear-like", "perl"); return 0; }