#include #include #include #include #include #include "xdictlib.h" enum { POS_NORMAL, POS_PLURAL, POS_VERB, POS_VERBE, POS_VERBB, POS_COVERED }; #define is_consonant(k) (strchr("bcdfghjklmnpqrstvwxyz",k) != NULL) #define is_vowel(k) (strchr("aeiouy",k) != NULL) /* For the time being, the |xdict| data is stored to disk as a single gigantic text file, containing all the words in the dictionary in plain text separated by newlines. Update, 16 March 2005: Related words may be stored in the file as "foo/s", indicating that both "foo" and "foos" are words; as "foo/v", indicating that "foo", "foos", "fooed", and "fooing" are words; as "foo/w", indicating that "fooe", "fooes", "fooed", and "fooing" are words; or as "fop/x", indicating that "fop", "fops", "fopped", and "fopping" are words. It is allowed for "foo/v", for example, to indicate the presence of the shorter word forms, even when "fooing" has more than |XDICT_MAXLENGTH| letters. */ int xdict_load(struct xdict *d, const char *fname) { FILE *in = fopen(fname, "r"); char buffer[XDICT_MAXLENGTH+25]; int rc = 0; if (in == NULL) return -1; while (fgets(buffer, (sizeof buffer)-5, in) != NULL) { char *p = strchr(buffer, '\n'); int buflen, rc; if (p == NULL) { rc = -2; goto done; } *p = '\0'; buflen = (p - buffer); p = strchr(buffer, '/'); if (p != NULL) { switch (p[1]) { case 's': case 'S': rc = xdict_addword(d, buffer, p - buffer); if (rc < -1) goto done; strcpy(p, "s"); rc = xdict_addword(d, buffer, (p+1) - buffer); if (rc < -1) goto done; break; case 'v': case 'V': rc = xdict_addword(d, buffer, p - buffer); if (rc < -1) goto done; strcpy(p, "s"); rc = xdict_addword(d, buffer, (p+1) - buffer); if (rc < -1) goto done; strcpy(p, "ed"); rc = xdict_addword(d, buffer, (p+2) - buffer); if (rc < -1) goto done; strcpy(p, "ing"); rc = xdict_addword(d, buffer, (p+3) - buffer); if (rc < -1) goto done; break; case 'w': case 'W': strcpy(p, "e"); rc = xdict_addword(d, buffer, (p+1) - buffer); if (rc < -1) goto done; strcpy(p, "es"); rc = xdict_addword(d, buffer, (p+2) - buffer); if (rc < -1) goto done; strcpy(p, "ed"); rc = xdict_addword(d, buffer, (p+2) - buffer); if (rc < -1) goto done; strcpy(p, "ing"); rc = xdict_addword(d, buffer, (p+3) - buffer); if (rc < -1) goto done; break; case 'x': case 'X': rc = xdict_addword(d, buffer, p - buffer); if (rc < -1) goto done; strcpy(p, "s"); rc = xdict_addword(d, buffer, (p+1) - buffer); if (rc < -1) goto done; strcpy(p, "xed"); p[0] = p[-1]; rc = xdict_addword(d, buffer, (p+3) - buffer); if (rc < -1) goto done; strcpy(p, "xing"); p[0] = p[-1]; rc = xdict_addword(d, buffer, (p+4) - buffer); if (rc < -1) goto done; break; default: rc = xdict_addword(d, buffer, buflen); if (rc < -1) goto done; break; } } else { rc = xdict_addword(d, buffer, buflen); if (rc < -1) goto done; } } done: fclose(in); xdict_sort(d); return rc; } int xdict_save(struct xdict *d, const char *fname) { FILE *out = fopen(fname, "w"); size_t i; int k; if (out == NULL) return -1; for (k=0; k < XDICT_MAXLENGTH; ++k) { for (i=0; i < d->len[k]; ++i) { fprintf(out, "%s\n", d->words[k][i].word); } } fclose(out); return 0; } /* Given an entry in the dictionary, determine whether it is a "root" of a verb or plural construction ("bake"), or a normal word with no derivative words in the dictionary ("whoever"), or a word that will be covered by some other root word ("baking"). Some words may be covered twice; for example, "pol/v" and "pol/w" both cover "poling" in a dictionary containing "pol" and "pols" as well as "pole", "poles", and "poled". Some words may appear at first glance to be covered when they're not; for example, "princess" is not covered by "princes" if the dictionary also contains "prince". To deal with the "princes/princess" problem, we allow |pos_categorize| to call itself recursively, but only on /shorter/ words, never longer ones! Then consider the entry "fling/v", which is a root word even though it ends in "-ing". This shows that we must check each word for rootness as well as coveredness. Todo: Given "car, cars, care, cares, cared, caring", we incorrectly store both "car/v" and "car/w", leading to duplicate entries for "cared" and "caring". This is incorrect behavior. */ static int pos_categorize(struct xdict *d, const char *word, int k) { char buffer[XDICT_MAXLENGTH+4]; int ends_with_ing = (k >= 6 && !memcmp(word+k-3, "ing", 3)); int ends_with_ed = (k >= 5 && !memcmp(word+k-2, "ed", 2)); int ends_with_es = (k >= 4 && !memcmp(word+k-2, "es", 2)); int ends_with_s = (k >= 4 && (word[k-1] == 's')); int ends_with_e = (k >= 3 && (word[k-1] == 'e')); int rc; if (ends_with_s || ends_with_es) { sprintf(buffer, "%.*s", k-1, word); if (xdict_find(d, buffer, NULL, NULL) > 0) { rc = pos_categorize(d, buffer, k-1); if (rc == POS_VERB || rc == POS_VERBE || rc == POS_VERBB || rc == POS_PLURAL) return POS_COVERED; } } else if (ends_with_ed) { sprintf(buffer, "%.*s", k-2, word); if (xdict_find(d, buffer, NULL, NULL) > 0) { rc = pos_categorize(d, buffer, k-2); if (rc == POS_VERB) return POS_COVERED; } sprintf(buffer, "%.*se", k-2, word); if (xdict_find(d, buffer, NULL, NULL) > 0) { rc = pos_categorize(d, buffer, k-1); if (rc == POS_VERBE) return POS_COVERED; } if (word[k-4] == word[k-3]) { sprintf(buffer, "%.*s", k-3, word); if (xdict_find(d, buffer, NULL, NULL) > 0) { rc = pos_categorize(d, buffer, k-3); if (rc == POS_VERBB) return POS_COVERED; } } } else if (ends_with_ing) { sprintf(buffer, "%.*s", k-3, word); if (xdict_find(d, buffer, NULL, NULL) > 0) { rc = pos_categorize(d, buffer, k-3); if (rc == POS_VERB) return POS_COVERED; } sprintf(buffer, "%.*se", k-3, word); if (xdict_find(d, buffer, NULL, NULL) > 0) { rc = pos_categorize(d, buffer, k-2); if (rc == POS_VERBE) return POS_COVERED; } if (word[k-5] == word[k-4]) { sprintf(buffer, "%.*s", k-4, word); if (xdict_find(d, buffer, NULL, NULL) > 0) { rc = pos_categorize(d, buffer, k-4); if (rc == POS_VERBB) return POS_COVERED; } } } /* Okay, this word isn't covered by any shorter root word. Is it a root word itself? */ if (ends_with_e) { sprintf(buffer, "%.*ses", k-1, word); if (xdict_find(d, buffer, NULL, NULL) <= 0) return POS_NORMAL; sprintf(buffer, "%.*sing", k-1, word); if (xdict_find(d, buffer, NULL, NULL) == 0) return POS_PLURAL; sprintf(buffer, "%.*sed", k-1, word); if (xdict_find(d, buffer, NULL, NULL) == 0) return POS_PLURAL; return POS_VERBE; } else { int has_taping, has_taped; sprintf(buffer, "%ss", word); if (xdict_find(d, buffer, NULL, NULL) <= 0) return POS_NORMAL; /* Look for "taping", if the root word is "tap". */ sprintf(buffer, "%sing", word); has_taping = xdict_find(d, buffer, NULL, NULL); if (has_taping == 0) goto look_for_tapping; /* The entry "taping" was found (or too long). Look for "taped". */ sprintf(buffer, "%sed", word); has_taped = xdict_find(d, buffer, NULL, NULL); if (has_taped == 0) goto look_for_tapping; if (has_taped > 0 || has_taping > 0) return POS_VERB; return POS_PLURAL; look_for_tapping: sprintf(buffer, "%s%cing", word, word[k-1]); if (xdict_find(d, buffer, NULL, NULL) == 0) return POS_PLURAL; sprintf(buffer, "%s%ced", word, word[k-1]); if (xdict_find(d, buffer, NULL, NULL) <= 0) return POS_PLURAL; return POS_VERBB; } } /* This routine saves out the dictionary using the "updated" compression scheme: we look for plurals and verb tenses while saving the dictionary. This will run very slowly if the last operation on the dictionary didn't leave it sorted! */ int xdict_save_small(struct xdict *d, const char *fname) { FILE *out = fopen(fname, "w"); size_t i; int k; if (out == NULL) return -1; for (k=0; k < XDICT_MAXLENGTH; ++k) { for (i=0; i < d->len[k]; ++i) { char *word = d->words[k][i].word; switch (pos_categorize(d, word, k)) { case POS_NORMAL: fprintf(out, "%s\n", word); break; case POS_VERB: fprintf(out, "%s/v\n", word); break; case POS_VERBE: fprintf(out, "%.*s/w\n", k-1, word); break; case POS_VERBB: fprintf(out, "%s/x\n", word); break; case POS_PLURAL: fprintf(out, "%s/s\n", word); break; case POS_COVERED: break; } } } fclose(out); return 0; } void xdict_init(struct xdict *d) { int k; d->sorted = 1; for (k=0; k < XDICT_MAXLENGTH; ++k) { d->words[k] = NULL; d->len[k] = d->cap[k] = 0; } } static int xdict_sortcmp(const void *p, const void *q) { const struct word_entry *wp = p; const struct word_entry *wq = q; return strcmp(wp->word, wq->word); } void xdict_sort(struct xdict *d) { int k; for (k=0; k < XDICT_MAXLENGTH; ++k) { size_t i, n = d->len[k]; struct word_entry *w = d->words[k]; if (n < 2) continue; qsort(w, n, sizeof *w, xdict_sortcmp); /* Remove duplicates. */ for (i=n; i > 1; --i) { if (strcmp(w[i-1].word, w[i-2].word) == 0) { if (i < --n) memcpy(w[i-1].word, w[n].word, k); } } if (n < d->len[k]) { d->len[k] = n; qsort(w, n, sizeof *w, xdict_sortcmp); } } d->sorted = 1; } void xdict_free(struct xdict *d) { size_t i, k; for (k=0; k < XDICT_MAXLENGTH; ++k) { for (i=0; i < d->len[k]; ++i) free(d->words[k][i].word); free(d->words[k]); } } int xdict_addword(struct xdict *d, const char *word, int k) { char *tmp; if (k == 0) k = strlen(word); if (k >= XDICT_MAXLENGTH) return -1; if (k <= 2) return -1; if (d->len[k] >= d->cap[k]) { size_t newcap = d->cap[k] * 2 + 15; void *t = realloc(d->words[k], newcap * sizeof *d->words[k]); if (t == NULL) return -3; d->words[k] = t; d->cap[k] = newcap; } tmp = malloc(k+1); if (tmp == NULL) return -4; memcpy(tmp, word, k); tmp[k] = '\0'; d->words[k][d->len[k]++].word = tmp; d->sorted = 0; return 0; } int xdict_remword(struct xdict *d, const char *word, int k) { int count = 0; struct word_entry *w; size_t i; if (k == 0) k = strlen(word); if (k >= XDICT_MAXLENGTH) return -1; if (k <= 2) return -1; w = d->words[k]; for (i=0; i < d->len[k]; ++i) { if (memcmp(w[i].word, word, k) == 0) { d->len[k]--; if (i != d->len[k]) { strcpy(w[i].word, w[d->len[k]].word); d->sorted = 0; } free(w[d->len[k]].word); ++count; } } return count; } int xdict_remmatch(struct xdict *d, const char *pat, int k) { if (k != 0 && strchr(pat, '*') == NULL) { if (strchr(pat, '?') == NULL) return xdict_remword(d, pat, k); else { int count = 0; struct word_entry *w; size_t i; if (k == 0) k = strlen(pat); if (k >= XDICT_MAXLENGTH) return -1; if (k <= 2) return -1; w = d->words[k]; for (i=0; i < d->len[k]; ++i) { if (xdict_match(w[i].word, pat)) { d->len[k]--; if (i != d->len[k]) { strcpy(w[i].word, w[d->len[k]].word); d->sorted = 0; } free(w[d->len[k]].word); ++count; } } return count; } } else { /* The pattern does contain a '*' wildcard, and the client has not specified a word length. We must check all possible word lengths. */ int count = 0; size_t i; for (i=k=0; pat[i]; ++i) if (pat[i] != '*') ++k; for ( ; k < XDICT_MAXLENGTH; ++k) { struct word_entry *w = d->words[k]; for (i=0; i < d->len[k]; ++i) { if (xdict_match(w[i].word, pat)) { d->len[k]--; if (i != d->len[k]) { strcpy(w[i].word, w[d->len[k]].word); d->sorted = 0; } free(w[d->len[k]].word); ++count; } } } return count; } } int xdict_match(const char *w, const char *p) { int i, j; for (i=0; p[i]; ++i) { if (p[i] == '*') { for (j=i; w[j]; ++j) if (xdict_match(w+j, p+i+1)) return 1; if (xdict_match(w+j, p+i+1)) return 1; return 0; } else if (w[i] == '\0') return 0; else if (p[i] == '1') { if (!is_consonant(w[i])) return 0; } else if (p[i] == '0') { if (!is_vowel(w[i])) return 0; } else if (p[i] != '?') { if (p[i] != w[i]) return 0; } } return (w[i] == '\0'); } int xdict_match_simple(const char *w, const char *p) { int i; for (i=0; p[i]; ++i) { if (w[i] == '\0') return 0; else if (p[i] == '1') { if (!is_consonant(w[i])) return 0; } else if (p[i] == '0') { if (!is_vowel(w[i])) return 0; } else if (p[i] != '?') { if (p[i] != w[i]) return 0; } } return (w[i] == '\0'); } static int is_purely_alphabetic(const char *pattern) { size_t i; for (i=0; isalpha(pattern[i]); ++i) continue; return (pattern[i] == '\0'); } int xdict_find(struct xdict *d, const char *pattern, int (*f)(const char *, void *), void *info) { int count = 0; if (strchr(pattern, '*') == NULL) { size_t len = strlen(pattern); struct word_entry *w; if (len < 2 || len >= XDICT_MAXLENGTH) return -1; w = d->words[len]; if (d->sorted && is_purely_alphabetic(pattern)) { /* If the dictionary is sorted and the pattern is simply a word, we can perform a speedy binary search instead of our usual slow linear search. */ size_t low = 0; size_t high = d->len[len]; while (low < high) { size_t i = low + (high-low)/2; int rc = strcmp(w[i].word, pattern); if (rc == 0) { if (f != NULL) f(w[i].word, info); return 1; } else if (rc > 0) { high = i; } else { low = i+1; } } return 0; } else { size_t i; for (i=0; i < d->len[len]; ++i) { if (xdict_match_simple(w[i].word, pattern)) { ++count; if (f && f(w[i].word, info)) return count; } } return count; } } else { size_t k, len=0; for (k=0; pattern[k]; ++k) if (pattern[k] != '*') ++len; for (k=len; k < XDICT_MAXLENGTH; ++k) { struct word_entry *w = d->words[k]; size_t i; for (i=0; i < d->len[k]; ++i) { if (xdict_match(w[i].word, pattern)) { ++count; if (f && f(w[i].word, info)) return count; } } } return count; } }