#include #include #include "uniformats.h" /** * Prints a single 32-bit Unicode code point as UCS-4 * (big-endian 32-bit values). */ int putUCS4(unsigned long ucp, FILE *outfp) { putc((ucp >> 24) & 0xFF, outfp); putc((ucp >> 16) & 0xFF, outfp); putc((ucp >> 8) & 0xFF, outfp); putc(ucp & 0xFF, outfp); return 4; } int getUCS4(unsigned long *ucp, FILE *infp) { unsigned char buf[4]; if (fread(buf, 1, 4, infp) != 4) return EOF; *ucp = buf[0]<<24 | buf[1]<<16 | buf[2]<<8 | buf[3]; return 4; } int gobbleUCS4(FILE *infp) { unsigned char signature[] = {0x00, 0x00, 0xFE, 0xFF}; int k; int i; for (i=0; i < 4; ++i) { k = getc(infp); if (k == EOF) { while (i) ungetc(signature[--i], infp); return EOF; } if (k != signature[i]) { ungetc(k, infp); while (i && ungetc(signature[--i], infp) != EOF) continue; return i; } } return 4; } int spewUCS4(FILE *outfp) { unsigned char signature[] = {0x00, 0x00, 0xFE, 0xFF}; int i; for (i=0; i < 4; ++i) putc(signature[i], outfp); return 0; } /** * Prints a single 32-bit Unicode code point as UCS-2 * (big-endian 16-bit values). */ int putUCS2(unsigned long ucp, FILE *outfp) { putc((ucp >> 8) & 0xFF, outfp); putc(ucp & 0xFF, outfp); return 2; } int getUCS2(unsigned long *ucp, FILE *infp) { unsigned char buf[2]; if (fread(buf, 1, 2, infp) != 2) return EOF; *ucp = buf[0]<<8 | buf[1]; return 2; } int gobbleUCS2(FILE *infp) { unsigned char signature[] = {0xFE, 0xFF}; int k; int i; for (i=0; i < 2; ++i) { k = getc(infp); if (k == EOF) { while (i) ungetc(signature[--i], infp); return EOF; } if (k != signature[i]) { ungetc(k, infp); while (i && ungetc(signature[--i], infp) != EOF) continue; return i; } } return 2; } int spewUCS2(FILE *outfp) { unsigned char signature[] = {0xFE, 0xFF}; int i; for (i=0; i < 2; ++i) putc(signature[i], outfp); return 0; } /** * Prints a single 32-bit Unicode code point as UTF-8. */ int putUTF8(unsigned long ucp, FILE *outfp) { if (ucp < 0x80) { putc(ucp & 0xFF, outfp); return 1; } else if (ucp < 0x800) { putc((ucp >> 6) | 0xC0, outfp); putc((ucp & 0x3F) | 0x80, outfp); return 2; } else if (ucp < 0x10000) { putc((ucp >> 12) | 0xE0, outfp); putc(((ucp >> 6) & 0x3F) | 0x80, outfp); putc((ucp & 0x3F) | 0x80, outfp); return 3; } else { putc((ucp >> 18) | 0xF0, outfp); putc(((ucp >> 12) & 0x3F) | 0x80, outfp); putc(((ucp >> 6) & 0x3F) | 0x80, outfp); putc((ucp & 0x3F) | 0x80, outfp); return 4; } } int getUTF8(unsigned long *ucp, FILE *infp) { int tmp; int nbytes; int i; tmp = getc(infp); if (tmp == EOF) return EOF; for (nbytes = 0; (tmp << nbytes) & 0x80; ++nbytes) continue; if (nbytes == 0) { *ucp = tmp; return 1; } else if (nbytes == 1 || nbytes > 6) { ungetc(tmp, infp); return 0; } else { *ucp = (tmp & (0x7F >> nbytes)); for (i=0; i < nbytes-1; ++i) { if ((tmp = getc(infp)) == EOF) return EOF; *ucp = (*ucp << 6) | (tmp & 0x3F); } return nbytes; } } int gobbleUTF8(FILE *infp) { unsigned char signature[] = {0xEF, 0xBB, 0xBF}; int k; int i; for (i=0; i < 3; ++i) { k = getc(infp); if (k == EOF) { while (i) ungetc(signature[--i], infp); return EOF; } if (k != signature[i]) { ungetc(k, infp); while (i && ungetc(signature[--i], infp) != EOF) continue; return i; } } return 3; } int spewUTF8(FILE *outfp) { unsigned char signature[] = {0xEF, 0xBB, 0xBF}; int i; for (i=0; i < 3; ++i) putc(signature[i], outfp); return 0; } /** * Prints a single 32-bit Unicode code point as ASCII * (using the question mark '?' if the value is out of * the range 0..255). */ int putANSI(unsigned long ucp, FILE *outfp) { if (ucp > 0xFF) putc(0x3F, outfp); else putc(ucp, outfp); return 1; } int getANSI(unsigned long *ucp, FILE *infp) { int k = getc(infp); if (k == EOF) return EOF; if (k & 0x80) { ungetc(k, infp); return 0; } *ucp = k & 0x7F; return 1; } /** * Prints a single 32-bit Unicode code point as ASCII * (using the LaTeX Omega syntax ^^^^6f7e if the value * is out of the range [10, 13, 32..126]). */ int putOmega(unsigned long ucp, FILE *outfp) { if (ucp == 10 || ucp == 13 || (ucp >= 32 && ucp <= 126)) { putc(ucp, outfp); return 1; } else { return fprintf(outfp, "^^^^%04lx", ucp); } } int getOmega(unsigned long *ucp, FILE *infp) { static int caret_count = 0; int k; if (caret_count) { --caret_count; *ucp = '^'; return 1; } if ((k=getc(infp)) == EOF) return EOF; if (k & 0x80) { ungetc(k, infp); return 0; } else if (k != '^') { *ucp = k & 0x7F; return 1; } else { for (caret_count = 1; caret_count < 4; ++caret_count) { if ((k=getc(infp)) == EOF) return EOF; if (k != '^') { ungetc(k, infp); *ucp = '^'; return 1; } } if ((k=getc(infp)) == EOF) return EOF; ungetc(k, infp); if (!isxdigit(k)) { --caret_count; *ucp = '^'; return 1; } else { if (fscanf(infp, "%4lx", ucp) != 1) return 0; caret_count = 0; return 8; } } }