#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define reg register
#define uns unsigned
#define byte unsigned char

/* I/O, arguments */
char *ifile = NULL;             /* Input file name; NULL until -i is seen */
FILE *ichan;                    /* Input stream (stdin unless -i is given) */
char *ofile = NULL;             /* Output file name; NULL until -o is seen */
FILE *ochan;                    /* Output stream (stdout unless -o is given) */

#define IBUFSIZ 16777216        /* Size of the input buffer */
char ibuf[IBUFSIZ];             /* Input buffer (one line) */
char tbuf[128];                 /* Token buffer */

/* Command line, file parse status */
uns cargc;                      /* Arguments left, counting the current one */
char **cargv;                   /* Points at the current argument */
byte showprog = 0;              /* Show progress */
long cline = 0;                 /* Current line */
long cmask = 32767;             /* Print a "*" when (cline & cmask) == 0 */
uns nstars = 0;                 /* Stars printed on the current progress row */

#define PBITS_MAX 30            /* Largest shift accepted for -pp */

/* Old-style (unprototyped) declarations.  Every routine in this file is
 * defined K&R-style and returns int, so these declarations agree with the
 * definitions while keeping the calls below from being implicit ones.
 */
int arg_doflag();
int arg_doparam();
int arg_dofeat();
int copydata();
int putitem();
int discitem();
int done();

/* Parse arguments; call main routine to actually parse the file.
 *
 * With no arguments the usage line is printed and the program exits with
 * status 1.  Leading arguments that start with '-' are handed to
 * arg_doflag() (which may consume the following argument as a value); the
 * first argument that does not start with '-' ends option processing.
 */
int main(argc, argv)
    int argc;
    char *argv[];
{
    if (argc <= 1) {
        printf("wkf -i {infile} -o {outfile} -fp -pp \n");
        exit(1);
    }

    ichan = stdin;
    ochan = stdout;

    cargv = argv;
    cargc = (uns) argc;
    for (cargc--, cargv++; (cargc > 0); cargc--, cargv++) {
        if (**cargv != '-')
            break;
        arg_doflag();
    }

    copydata();
    done(0);
    return 0;                   /* not reached: done() exits */
}

/* Deal with one flag argument (*cargv, which starts with '-').
 *
 *   -f<chars>   each character is passed to arg_dofeat()
 *   -n..., -p...  the text after the letter is passed to arg_doparam()
 *   -i <file>   open <file> for reading as the input stream
 *   -o <file>   open <file> for writing as the output stream
 *
 * -i and -o take their value from the next argument, so cargc/cargv are
 * advanced past it.  A missing value is reported instead of reading past
 * the end of argv.  All errors print a message and exit through done(1).
 */
int arg_doflag()
{
    reg char *arg;

    arg = *cargv;
    arg++;                      /* skip the '-' */
    switch (*arg++) {
    case 'f':
        while (*arg != '\0')
            arg_dofeat(arg++);
        break;
    case 'n':
    case 'p':
        arg_doparam(arg);
        break;
    case 'i':
        if (cargc < 2) {
            printf("Missing file name after -i\n");
            done(1);
        }
        cargc--;
        cargv++;
        if (ifile != NULL) {
            printf("Duplicate input file %s\n", *cargv);
            done(1);
        }
        ifile = *cargv;
        if ((ichan = fopen(*cargv, "r")) == NULL) {
            printf("Can't open file %s\n", *cargv);
            done(1);
        }
        break;
    case 'o':
        if (cargc < 2) {
            printf("Missing file name after -o\n");
            done(1);
        }
        cargc--;
        cargv++;
        if (ofile != NULL) {
            printf("Duplicate output file %s\n", *cargv);
            done(1);
        }
        ofile = *cargv;
        if ((ochan = fopen(*cargv, "w")) == NULL) {
            printf("Can't open file %s\n", *cargv);
            done(1);
        }
        break;
    default:
        printf("Bad option %c\n", *--arg);
        done(1);
    }
    return 0;
}

/* Handle parameters.
 *
 * arg points at the parameter letter.  'p' takes the next command-line
 * argument as a bit count N and sets cmask to 2^N - 1.  N must be a
 * decimal number in 0..PBITS_MAX so that the shift is well defined; a
 * missing or out-of-range value is reported and the program exits.
 */
int arg_doparam(arg)
    char *arg;
{
    char *end;
    long bits;

    switch (*arg) {
    case 'p':
        if (cargc < 2) {
            printf("Missing value for parameter option %c\n", *arg);
            done(1);
        }
        cargc--;
        cargv++;
        bits = strtol(*cargv, &end, 10);
        if ((end == *cargv) || (bits < 0) || (bits > PBITS_MAX)) {
            printf("Bad progress interval %s\n", *cargv);
            done(1);
        }
        cmask = (1L << bits) - 1;
        break;
    default:
        printf("Bad parameter option %c\n", *arg);
        done(1);
    }
    return 0;
}

/* Set feature flags.
*/ arg_dofeat(arg) char *arg; { switch (*arg) { case 'p': showprog++; break; default: printf("Bad feature option %c\n", *arg); done(1); } } /* Process input file. * * Only nested multi-line items are: * , , , : * those are handled specially. * * Yes, I know this is really ugly/kludgy, but it's *really* fast. */ copydata() { reg char *cp; while (1) { if (fgets(&ibuf[0], IBUFSIZ, ichan) == NULL) { return; } if (feof(ichan)) { return; } cline++; if (showprog && ((cline & cmask) == 0)) { printf("*"); fflush(stdout); if (nstars++ == 72) { printf("\n"); fflush(stdout); nstars = 0; } } for (cp = &ibuf[0]; ((*cp == ' ') || (*cp == '\t')); cp++); if (*cp++ != '<') { printf("unexpected char '%c' in '%s'", *--cp, &ibuf[0]); done(1); } if (*cp == '/') { cp++; if ((*cp == 'n') && (strncmp(cp, "namespaces>", 11) == 0)) { fputs(&ibuf[0], ochan); continue; } if ((*cp == 'p') && (strncmp(cp, "page>", 5) == 0)) { fputs(&ibuf[0], ochan); continue; } if ((*cp == 'r') && (strncmp(cp, "revision>", 9) == 0)) { fputs(&ibuf[0], ochan); continue; } if ((*cp == 's') && (strncmp(cp, "siteinfo>", 9) == 0)) { fputs(&ibuf[0], ochan); continue; } printf("unexpected endtoken '%s' in '%s'", cp, &ibuf[0]); done(1); } if (*cp != 't') { putitem(cp); continue; } if (strncmp(cp, "text ", 5) != 0) { putitem(cp); continue; } cp += 5; discitem(cp, "text>", 5); } } /* Copy complete entry, up to matching terminating token; possibly multiple * lines. Handles three cases: * * - at the end of the current line * at the start of a following line * at the end of a following line * * Has to make copy of token because calls to read will bash the copy * passed as an arg. (Makes the copy even in cases it's not needed * because the length is needed in all cases.) * * Yes, I know this (and the code) is really ugly/kludgy, but it's fast. 
*/
/* putitem(tnp): tnp points just past the '<' of the tag that starts the
 * line currently held in ibuf.
 *
 * The token (tag name up to and including '>') is copied into tbuf and
 * NUL-terminated so it can be printed in diagnostics.  The copy is
 * bounded: if the line ends, or tbuf would overflow, before a ' ' or '>'
 * is seen, the line is treated as a single-line item (it has already been
 * written to ochan) and the routine returns.
 *
 * On following lines the end-of-line comparison is only made when the
 * line is at least as long as the token, so the comparison never starts
 * before the beginning of ibuf.
 *
 * Returns 0; malformed input is reported and exits through done(1).
 */
int putitem(tnp)
    char *tnp;
{
    reg char *cp;
    uns tlen;
    char t;

    fputs(&ibuf[0], ochan);

    /* Copy the token, stopping after ' ' (single-line tag) or '>'. */
    cp = tnp;
    tlen = 0;
    for (;;) {
        t = *cp;
        if ((t == '\0') || (t == '\n') || (tlen >= sizeof tbuf - 1))
            return 0;
        cp++;
        tbuf[tlen++] = t;
        if (t == ' ')
            return 0;
        if (t == '>')
            break;
    }
    tbuf[tlen] = '\0';

    /* Does the current line already end with the token text? */
    while ((*cp != '\n') && (*cp != 0))
        cp++;
    if (*cp == 0) {
        printf("put unexpected EOF '%s' '%s'", tnp, &ibuf[0]);
        done(1);
    }
    cp -= tlen;                 /* cannot pass tnp: cp was >= tnp + tlen */
    if (strncmp(cp, tnp, tlen) == 0)
        return 0;

    /* Multi-line entry: copy lines until the terminating token shows up. */
    while (1) {
        if (fgets(&ibuf[0], IBUFSIZ, ichan) == NULL)
            return 0;
        if (feof(ichan))
            return 0;
        fputs(&ibuf[0], ochan);

        cline++;
        if (showprog && ((cline & cmask) == 0)) {
            printf("*");
            fflush(stdout);
            if (nstars++ == 72) {
                printf("\n");
                fflush(stdout);
                nstars = 0;
            }
        }

        /* A line starting with "</" must close this very token. */
        cp = &ibuf[0];
        if ((*cp++ == '<') && (*cp++ == '/')) {
            if (strncmp(cp, &tbuf[0], tlen) != 0) {
                printf("put <%s unexpected entity '%s' in '%s'",
                       &tbuf[0], cp, &ibuf[0]);
                done(1);
            }
            return 0;
        }

        for (cp = &ibuf[0]; ((*cp != '\n') && (*cp != 0)); cp++)
            ;
        if (*cp == 0) {
            printf("<%s unexpected EOF '%s'", &tbuf[0], &ibuf[0]);
            done(1);
        }
        /* Only look at the line's tail when the line is long enough. */
        if (((uns) (cp - &ibuf[0]) >= tlen) &&
            (strncmp(cp - tlen, &tbuf[0], tlen) == 0))
            return 0;
    }
}

/* Discard complete entry, up to matching terminating token; possibly multiple
 * lines. Nothing is written to the output. The entry is finished by one
 * of four cases:
 *
 *   - "/>" ends the current line (empty element)
 *   - the token text ("name>") ends the current line
 *   - "</name>" starts a following line
 *   - the token text ("name>") ends a following line
 *
 * Yes, I know this (and the code) is really ugly/kludgy, but it's fast.
*/ discitem(tnp, tok, tlen) char *tnp; char *tok; uns tlen; { reg char *cp; for (cp = tnp; ((*cp != '\n') && (*cp != 0)); cp++); if (*cp == 0) { printf("disc <%s unexpected EOF '%s' '%s'", tok, tnp, &ibuf[0]); done(1); } if ((*--cp == '>') && (*--cp == '/')) return; cp -= (tlen - 2); if (strncmp(cp, tok, tlen) == 0) return; while (1) { if (fgets(&ibuf[0], IBUFSIZ, ichan) == NULL) return; if (feof(ichan)) return; cline++; if (showprog && ((cline & cmask) == 0)) { printf("*"); fflush(stdout); if (nstars++ == 72) { printf("\n"); fflush(stdout); nstars = 0; } } cp = &ibuf[0]; if ((*cp++ == '<') && (*cp++ == '/')) { if (strncmp(cp, tok, tlen) != 0) { printf("disc <%s unexpected entity '%s' in '%s'", tok, cp, &ibuf[0]); done(1); } return; } for (cp = &ibuf[0]; ((*cp != '\n') && (*cp != 0)); cp++); if (*cp == 0) { printf("<%s unexpected EOF '%s'", tok, &ibuf[0]); done(1); } cp -= tlen; if (strncmp(cp, tok, tlen) == 0) return; } } /* Flush buffers, done. */ done(code) int code; { fflush(ochan); if (showprog) { printf("\nlines: %d\n", cline); fflush(stdout); } exit(code); }