#include #include #include #include #include #include #include "uniformats.h" #define steq(x,y) (!strcmp((x),(y))) #define NELEM(arr) ((int)(sizeof arr / sizeof *arr)) static const char *Argv0; static int GobbleHeader = 1; static int SpewHeader = 0; static int ConvertCRLFPairs = 1; void do_error(const char *fmat, ...); void do_help(int man); struct Format { const char *code; int (*get)(unsigned long *, FILE *); int (*put)(unsigned long, FILE *); int (*gobble)(FILE *); int (*spew)(FILE *); } fmt_table[] = { { "Aa", getANSI, putANSI, 0, 0 }, { "Oo", getOmega, putOmega, 0, 0 }, { "Uu2", getUCS2, putUCS2, gobbleUCS2, spewUCS2 }, { "4", getUCS4, putUCS4, gobbleUCS4, spewUCS4 }, { "Ff", getUTF8, putUTF8, gobbleUTF8, spewUTF8 }, }; int main(int argc, char *argv[]) { int i, j; char *InputFilename = NULL; char *OutputFilename = NULL; FILE *infp, *outfp; unsigned long ucp; struct Format *infmt = &fmt_table[0]; struct Format *outfmt = &fmt_table[0]; Argv0 = argv[0]; for (i=1; i < argc; i++) { if (argv[i][0] != '-') break; if (argv[i][1] == '\0') break; if (steq(argv[i]+1, "-")) { ++i; break; } else if (steq(argv[i]+1, "?")) do_help(0); else if (steq(argv[i]+1, "-help")) do_help(0); else if (steq(argv[i]+1, "-man")) do_help(1); else if (steq(argv[i]+1, "o") || steq(argv[i]+1, "O")) { if (i >= argc-1) do_error("I need a filename with '%s'!\n", argv[i]); OutputFilename = argv[++i]; } else if (argv[i][1] == '-' && argv[i][3] == '2' && argv[i][5] == '\0') { for (j=0; j < NELEM(fmt_table); ++j) { if (strchr(fmt_table[j].code, argv[i][2]) != NULL) { infmt = &fmt_table[j]; break; } } if (j == NELEM(fmt_table)) do_error("Unrecognized input format '%c'\n", argv[i][2]); for (j=0; j < NELEM(fmt_table); ++j) { if (strchr(fmt_table[j].code, argv[i][4]) != NULL) { outfmt = &fmt_table[j]; break; } } if (j == NELEM(fmt_table)) do_error("Unrecognized output format '%c'\n", argv[i][4]); } else { for (j=1; argv[i][j]; ++j) { switch (argv[i][j]) { case 'B': ConvertCRLFPairs = 1; break; case 'b': ConvertCRLFPairs = 0; break; case 'G': GobbleHeader = 1; break; case 'g': GobbleHeader = 0; break; case 'S': SpewHeader = 1; break; case 's': SpewHeader = 0; break; default: do_error("Unrecognized option(s) %s\n", argv[i]); } } } } if (InputFilename == NULL && i < argc) { InputFilename = argv[i++]; } if (OutputFilename == NULL && i < argc) { OutputFilename = argv[i++]; } if (i != argc) do_error("Extra arguments at end of command line.\n"); if (InputFilename) { infp = fopen(InputFilename, "rb"); if (infp == NULL) do_error("Can't open input file '%s'!\n", InputFilename); } else { infp = stdin; } if (OutputFilename) { outfp = fopen(OutputFilename, "wb"); if (outfp == NULL) do_error("Can't open output file '%s'!\n", OutputFilename); } else { outfp = stdout; } if (GobbleHeader && infmt->gobble) infmt->gobble(infp); if (SpewHeader && outfmt->spew) outfmt->spew(outfp); while ((j = infmt->get(&ucp, infp)) > 0) { /* Handle some brokenness in popular converters */ if ((j == 0x0D0A || j == 0x0A0D) && ConvertCRLFPairs) { outfmt->put(0x0D, outfp); outfmt->put(0x0A, outfp); } else { outfmt->put(ucp, outfp); } } if (j != EOF) { #define I(s) ((InputFilename != NULL)? s: "") fprintf(stderr, "Invalid code found in input file%s%s%s; output" " file unfinished\n", I(" '"), I(InputFilename), I("'")); #undef I } fclose(infp); fclose(outfp); return 0; } void do_error(const char *fmat, ...) { va_list ap; printf("%s: ", Argv0); va_start(ap, fmat); vprintf(fmat, ap); va_end(ap); exit(EXIT_FAILURE); } void do_help(int man) { if (man) goto man; puts("unitrans [-?] [-BbGgSs] [--x2y] [input] [output]"); puts("Translates between incompatible Unicode encodings."); puts(" --x2y (e.g., --f2o, --42a): specify input and output formats"); puts(" -B[b]: turn on [off] CR/LF fixing"); puts(" -G[g]: turn on [off] input header gobbling"); puts(" -S[s]: turn on [off] output header spewing"); exit(0); man: puts("unitrans [-?] [-GgSs] [--x2y] [input] [output]"); puts(" Translates a file between Unicode codings,"); puts(" according to the value of the --x2y parameter."); puts(" x,y can be any combination of the following:"); puts(" A,a: Plain ANSI encoding, replacing multibyte"); puts(" Unicode values with '?' markers"); puts(" F,f: UTF-8 variable-length encoding"); puts(" O,o: Omega-style encoding: ASCII characters"); puts(" with ^^^^6a7e to indicate 32-bit values."); puts(" Excessive carets in the input stream will"); puts(" confuse this one; use with caution!"); puts(" U,u,2: UCS-2 16-bit encoding"); puts(" 4: UCS-4 32-bit encoding"); puts(""); puts(" The -G option tells the program to gobble up any header"); puts(" in the file; for example, the sequence of bytes FE FF"); puts(" at the start of a UCS-2 encoded file. If no header is"); puts(" detected, nothing is gobbled, and the transcription"); puts(" proceeds. This function will probably be confused by"); puts(" header-less UCS-4 files; use with caution!"); puts(" This option is turned on by default; use -g to turn it off."); puts(""); puts(" The -S option tells the program to spew out an appropriate"); puts(" header before beginning to write the output file; for"); puts(" example, the sequence of bytes FE FF at the start of a"); puts(" UCS-2 encoded file."); puts(" The -s option turns off header-spewing (the default)."); puts(""); puts(" The -B option tells the program to replace instances of the"); puts(" Unicode values U+0D0A and U+0A0D with the two-code sequence"); puts(" U+0D U+0A. This is useful because some programs that"); puts(" generate Unicode output are broken like that."); puts(" The Unicode glyphs U+0D0A and U+0A0D are a Malayam glyph and"); puts(" an invalid entry in the Gurmukhi code page, respectively."); puts(" You should not have any reason to turn off this behavior"); puts(" unless you are using one of those alphabets in your text."); puts(" This option is turned on by default; use -b to turn it off."); exit(0); }