Annotation of rpl/rplawk/b.c, revision 1.2

1.1       bertrand    1: /****************************************************************
                      2: Copyright (C) Lucent Technologies 1997
                      3: All Rights Reserved
                      4: 
                      5: Permission to use, copy, modify, and distribute this software and
                      6: its documentation for any purpose and without fee is hereby
                      7: granted, provided that the above copyright notice appear in all
                      8: copies and that both that the copyright notice and this
                      9: permission notice and warranty disclaimer appear in supporting
                     10: documentation, and that the name Lucent Technologies or any of
                     11: its entities not be used in advertising or publicity pertaining
                     12: to distribution of the software without specific, written prior
                     13: permission.
                     14: 
                     15: LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
                     16: INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
                     17: IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
                     18: SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
                     19: WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
                     20: IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
                     21: ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
                     22: THIS SOFTWARE.
                     23: ****************************************************************/
                     24: 
                     25: /* lasciate ogne speranza, voi ch'intrate. */
                     26: 
                     27: #define    DEBUG
                     28: 
                     29: #include <ctype.h>
                     30: #include <stdio.h>
                     31: #include <string.h>
                     32: #include <stdlib.h>
                     33: #include "awk.h"
                     34: #include "ytab.h"
                     35: 
                     36: #define    HAT (NCHARS+2)  /* matches ^ in regular expr */
                     37:                /* NCHARS is 2**n */
                     38: #define MAXLIN 22
                     39: 
                     40: #define type(v)        (v)->nobj   /* badly overloaded here */
                     41: #define info(v)        (v)->ntype  /* badly overloaded here */
                     42: #define left(v)        (v)->narg[0]
                     43: #define right(v)   (v)->narg[1]
                     44: #define parent(v)  (v)->nnext
                     45: 
                     46: #define LEAF   case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
                     47: #define ELEAF  case EMPTYRE:       /* empty string in regexp */
                     48: #define UNARY  case STAR: case PLUS: case QUEST:
                     49: 
                     50: /* encoding in tree Nodes:
                     51:    leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
                     52:        left is index, right contains value or pointer to value
                     53:    unary (STAR, PLUS, QUEST): left is child, right is null
                     54:    binary (CAT, OR): left and right are children
                     55:    parent contains pointer to parent
                     56: */
                     57: 
                     58: 
                     59: int    *setvec;
                     60: int    *tmpset;
                     61: int    maxsetvec = 0;
                     62: 
                     63: int    rtok;       /* next token in current re */
                     64: int    rlxval;
                     65: static uschar  *rlxstr;
                     66: static uschar  *prestr;    /* current position in current re */
                     67: static uschar  *lastre;    /* origin of last re */
                     68: 
                     69: static int setcnt;
                     70: static int poscnt;
                     71: 
                     72: char   *patbeg;
                     73: int    patlen;
                     74: 
                     75: #define    NFA 20  /* cache this many dynamic fa's */
                     76: fa *fatab[NFA];
                     77: int    nfatab  = 0;    /* entries in fatab */
                     78: 
                     79: fa *makedfa(const char *s, int anchor) /* returns dfa for reg expr s */
                     80: {
                     81:    int i, use, nuse;
                     82:    fa *pfa;
                     83:    static int now = 1;
                     84: 
                     85:    if (setvec == 0) {  /* first time through any RE */
                     86:        maxsetvec = MAXLIN;
                     87:        setvec = (int *) malloc(maxsetvec * sizeof(int));
                     88:        tmpset = (int *) malloc(maxsetvec * sizeof(int));
                     89:        if (setvec == 0 || tmpset == 0)
                     90:            overflo("out of space initializing makedfa");
                     91:    }
                     92: 
                     93:    if (compile_time)   /* a constant for sure */
                     94:        return mkdfa(s, anchor);
                     95:    for (i = 0; i < nfatab; i++)    /* is it there already? */
                     96:        if (fatab[i]->anchor == anchor
                     97:          && strcmp((const char *) fatab[i]->restr, s) == 0) {
                     98:            fatab[i]->use = now++;
                     99:            return fatab[i];
                    100:        }
                    101:    pfa = mkdfa(s, anchor);
                    102:    if (nfatab < NFA) { /* room for another */
                    103:        fatab[nfatab] = pfa;
                    104:        fatab[nfatab]->use = now++;
                    105:        nfatab++;
                    106:        return pfa;
                    107:    }
                    108:    use = fatab[0]->use;    /* replace least-recently used */
                    109:    nuse = 0;
                    110:    for (i = 1; i < nfatab; i++)
                    111:        if (fatab[i]->use < use) {
                    112:            use = fatab[i]->use;
                    113:            nuse = i;
                    114:        }
                    115:    freefa(fatab[nuse]);
                    116:    fatab[nuse] = pfa;
                    117:    pfa->use = now++;
                    118:    return pfa;
                    119: }
                    120: 
                    121: fa *mkdfa(const char *s, int anchor)   /* does the real work of making a dfa */
                    122:                /* anchor = 1 for anchored matches, else 0 */
                    123: {
                    124:    Node *p, *p1;
                    125:    fa *f;
                    126: 
                    127:    p = reparse(s);
                    128:    p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
                    129:        /* put ALL STAR in front of reg.  exp. */
                    130:    p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
                    131:        /* put FINAL after reg.  exp. */
                    132: 
                    133:    poscnt = 0;
                    134:    penter(p1); /* enter parent pointers and leaf indices */
                    135:    if ((f = (fa *) calloc(1, sizeof(fa) + poscnt*sizeof(rrow))) == NULL)
                    136:        overflo("out of space for fa");
                    137:    f->accept = poscnt-1;   /* penter has computed number of positions in re */
                    138:    cfoll(f, p1);   /* set up follow sets */
                    139:    freetr(p1);
                    140:    if ((f->posns[0] = (int *) calloc(1, *(f->re[0].lfollow)*sizeof(int))) == NULL)
                    141:            overflo("out of space in makedfa");
                    142:    if ((f->posns[1] = (int *) calloc(1, sizeof(int))) == NULL)
                    143:        overflo("out of space in makedfa");
                    144:    *f->posns[1] = 0;
                    145:    f->initstat = makeinit(f, anchor);
                    146:    f->anchor = anchor;
                    147:    f->restr = (uschar *) tostring(s);
                    148:    return f;
                    149: }
                    150: 
                    151: int makeinit(fa *f, int anchor)
                    152: {
                    153:    int i, k;
                    154: 
                    155:    f->curstat = 2;
                    156:    f->out[2] = 0;
                    157:    f->reset = 0;
                    158:    k = *(f->re[0].lfollow);
                    159:    xfree(f->posns[2]);         
                    160:    if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
                    161:        overflo("out of space in makeinit");
                    162:    for (i=0; i <= k; i++) {
                    163:        (f->posns[2])[i] = (f->re[0].lfollow)[i];
                    164:    }
                    165:    if ((f->posns[2])[1] == f->accept)
                    166:        f->out[2] = 1;
                    167:    for (i=0; i < NCHARS; i++)
                    168:        f->gototab[2][i] = 0;
                    169:    f->curstat = cgoto(f, 2, HAT);
                    170:    if (anchor) {
                    171:        *f->posns[2] = k-1; /* leave out position 0 */
                    172:        for (i=0; i < k; i++) {
                    173:            (f->posns[0])[i] = (f->posns[2])[i];
                    174:        }
                    175: 
                    176:        f->out[0] = f->out[2];
                    177:        if (f->curstat != 2)
                    178:            --(*f->posns[f->curstat]);
                    179:    }
                    180:    return f->curstat;
                    181: }
                    182: 
                    183: void penter(Node *p)   /* set up parent pointers and leaf indices */
                    184: {
                    185:    switch (type(p)) {
                    186:    ELEAF
                    187:    LEAF
                    188:        info(p) = poscnt;
                    189:        poscnt++;
                    190:        break;
                    191:    UNARY
                    192:        penter(left(p));
                    193:        parent(left(p)) = p;
                    194:        break;
                    195:    case CAT:
                    196:    case OR:
                    197:        penter(left(p));
                    198:        penter(right(p));
                    199:        parent(left(p)) = p;
                    200:        parent(right(p)) = p;
                    201:        break;
                    202:    default:    /* can't happen */
                    203:        FATAL("can't happen: unknown type %d in penter", type(p));
                    204:        break;
                    205:    }
                    206: }
                    207: 
                    208: void freetr(Node *p)   /* free parse tree */
                    209: {
                    210:    switch (type(p)) {
                    211:    ELEAF
                    212:    LEAF
                    213:        xfree(p);
                    214:        break;
                    215:    UNARY
                    216:        freetr(left(p));
                    217:        xfree(p);
                    218:        break;
                    219:    case CAT:
                    220:    case OR:
                    221:        freetr(left(p));
                    222:        freetr(right(p));
                    223:        xfree(p);
                    224:        break;
                    225:    default:    /* can't happen */
                    226:        FATAL("can't happen: unknown type %d in freetr", type(p));
                    227:        break;
                    228:    }
                    229: }
                    230: 
                    231: /* in the parsing of regular expressions, metacharacters like . have */
                    232: /* to be seen literally;  \056 is not a metacharacter. */
                    233: 
1.2     ! bertrand  234: int hexstr(uschar **pp)    /* find and eval hex string at pp, return new p */
1.1       bertrand  235: {          /* only pick up one 8-bit byte (2 chars) */
                    236:    uschar *p;
                    237:    int n = 0;
                    238:    int i;
                    239: 
                    240:    for (i = 0, p = (uschar *) *pp; i < 2 && isxdigit(*p); i++, p++) {
                    241:        if (isdigit(*p))
                    242:            n = 16 * n + *p - '0';
                    243:        else if (*p >= 'a' && *p <= 'f')
                    244:            n = 16 * n + *p - 'a' + 10;
                    245:        else if (*p >= 'A' && *p <= 'F')
                    246:            n = 16 * n + *p - 'A' + 10;
                    247:    }
1.2     ! bertrand  248:    *pp = (uschar *) p;
1.1       bertrand  249:    return n;
                    250: }
                    251: 
                    252: #define isoctdigit(c) ((c) >= '0' && (c) <= '7')   /* multiple use of arg */
                    253: 
1.2     ! bertrand  254: int quoted(uschar **pp)    /* pick up next thing after a \\ */
1.1       bertrand  255:            /* and increment *pp */
                    256: {
1.2     ! bertrand  257:    uschar *p = *pp;
1.1       bertrand  258:    int c;
                    259: 
                    260:    if ((c = *p++) == 't')
                    261:        c = '\t';
                    262:    else if (c == 'n')
                    263:        c = '\n';
                    264:    else if (c == 'f')
                    265:        c = '\f';
                    266:    else if (c == 'r')
                    267:        c = '\r';
                    268:    else if (c == 'b')
                    269:        c = '\b';
                    270:    else if (c == '\\')
                    271:        c = '\\';
                    272:    else if (c == 'x') {    /* hexadecimal goo follows */
                    273:        c = hexstr(&p); /* this adds a null if number is invalid */
                    274:    } else if (isoctdigit(c)) { /* \d \dd \ddd */
                    275:        int n = c - '0';
                    276:        if (isoctdigit(*p)) {
                    277:            n = 8 * n + *p++ - '0';
                    278:            if (isoctdigit(*p))
                    279:                n = 8 * n + *p++ - '0';
                    280:        }
                    281:        c = n;
                    282:    } /* else */
                    283:        /* c = c; */
                    284:    *pp = p;
                    285:    return c;
                    286: }
                    287: 
                    288: char *cclenter(const char *argp)   /* add a character class */
                    289: {
                    290:    int i, c, c2;
                    291:    uschar *p = (uschar *) argp;
                    292:    uschar *op, *bp;
                    293:    static uschar *buf = 0;
                    294:    static int bufsz = 100;
                    295: 
                    296:    op = p;
                    297:    if (buf == 0 && (buf = (uschar *) malloc(bufsz)) == NULL)
                    298:        FATAL("out of space for character class [%.10s...] 1", p);
                    299:    bp = buf;
                    300:    for (i = 0; (c = *p++) != 0; ) {
                    301:        if (c == '\\') {
1.2     ! bertrand  302:            c = quoted(&p);
1.1       bertrand  303:        } else if (c == '-' && i > 0 && bp[-1] != 0) {
                    304:            if (*p != 0) {
                    305:                c = bp[-1];
                    306:                c2 = *p++;
                    307:                if (c2 == '\\')
1.2     ! bertrand  308:                    c2 = quoted(&p);
1.1       bertrand  309:                if (c > c2) {   /* empty; ignore */
                    310:                    bp--;
                    311:                    i--;
                    312:                    continue;
                    313:                }
                    314:                while (c < c2) {
                    315:                    if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1"))
                    316:                        FATAL("out of space for character class [%.10s...] 2", p);
                    317:                    *bp++ = ++c;
                    318:                    i++;
                    319:                }
                    320:                continue;
                    321:            }
                    322:        }
                    323:        if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2"))
                    324:            FATAL("out of space for character class [%.10s...] 3", p);
                    325:        *bp++ = c;
                    326:        i++;
                    327:    }
                    328:    *bp = 0;
                    329:    dprintf( ("cclenter: in = |%s|, out = |%s|\n", op, buf) );
                    330:    xfree(op);
                    331:    return (char *) tostring((char *) buf);
                    332: }
                    333: 
                    334: void overflo(const char *s)
                    335: {
                    336:    FATAL("regular expression too big: %.30s...", s);
                    337: }
                    338: 
                    339: void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */
                    340: {
                    341:    int i;
                    342:    int *p;
                    343: 
                    344:    switch (type(v)) {
                    345:    ELEAF
                    346:    LEAF
                    347:        f->re[info(v)].ltype = type(v);
                    348:        f->re[info(v)].lval.np = right(v);
                    349:        while (f->accept >= maxsetvec) {    /* guessing here! */
                    350:            maxsetvec *= 4;
                    351:            setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
                    352:            tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
                    353:            if (setvec == 0 || tmpset == 0)
                    354:                overflo("out of space in cfoll()");
                    355:        }
                    356:        for (i = 0; i <= f->accept; i++)
                    357:            setvec[i] = 0;
                    358:        setcnt = 0;
                    359:        follow(v);  /* computes setvec and setcnt */
                    360:        if ((p = (int *) calloc(1, (setcnt+1)*sizeof(int))) == NULL)
                    361:            overflo("out of space building follow set");
                    362:        f->re[info(v)].lfollow = p;
                    363:        *p = setcnt;
                    364:        for (i = f->accept; i >= 0; i--)
                    365:            if (setvec[i] == 1)
                    366:                *++p = i;
                    367:        break;
                    368:    UNARY
                    369:        cfoll(f,left(v));
                    370:        break;
                    371:    case CAT:
                    372:    case OR:
                    373:        cfoll(f,left(v));
                    374:        cfoll(f,right(v));
                    375:        break;
                    376:    default:    /* can't happen */
                    377:        FATAL("can't happen: unknown type %d in cfoll", type(v));
                    378:    }
                    379: }
                    380: 
                    381: int first(Node *p) /* collects initially active leaves of p into setvec */
                    382:            /* returns 0 if p matches empty string */
                    383: {
                    384:    int b, lp;
                    385: 
                    386:    switch (type(p)) {
                    387:    ELEAF
                    388:    LEAF
                    389:        lp = info(p);   /* look for high-water mark of subscripts */
                    390:        while (setcnt >= maxsetvec || lp >= maxsetvec) {    /* guessing here! */
                    391:            maxsetvec *= 4;
                    392:            setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
                    393:            tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
                    394:            if (setvec == 0 || tmpset == 0)
                    395:                overflo("out of space in first()");
                    396:        }
                    397:        if (type(p) == EMPTYRE) {
                    398:            setvec[lp] = 0;
                    399:            return(0);
                    400:        }
                    401:        if (setvec[lp] != 1) {
                    402:            setvec[lp] = 1;
                    403:            setcnt++;
                    404:        }
                    405:        if (type(p) == CCL && (*(char *) right(p)) == '\0')
                    406:            return(0);      /* empty CCL */
                    407:        else return(1);
                    408:    case PLUS:
                    409:        if (first(left(p)) == 0) return(0);
                    410:        return(1);
                    411:    case STAR:
                    412:    case QUEST:
                    413:        first(left(p));
                    414:        return(0);
                    415:    case CAT:
                    416:        if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
                    417:        return(1);
                    418:    case OR:
                    419:        b = first(right(p));
                    420:        if (first(left(p)) == 0 || b == 0) return(0);
                    421:        return(1);
                    422:    }
                    423:    FATAL("can't happen: unknown type %d in first", type(p));   /* can't happen */
                    424:    return(-1);
                    425: }
                    426: 
                    427: void follow(Node *v)   /* collects leaves that can follow v into setvec */
                    428: {
                    429:    Node *p;
                    430: 
                    431:    if (type(v) == FINAL)
                    432:        return;
                    433:    p = parent(v);
                    434:    switch (type(p)) {
                    435:    case STAR:
                    436:    case PLUS:
                    437:        first(v);
                    438:        follow(p);
                    439:        return;
                    440: 
                    441:    case OR:
                    442:    case QUEST:
                    443:        follow(p);
                    444:        return;
                    445: 
                    446:    case CAT:
                    447:        if (v == left(p)) { /* v is left child of p */
                    448:            if (first(right(p)) == 0) {
                    449:                follow(p);
                    450:                return;
                    451:            }
                    452:        } else      /* v is right child */
                    453:            follow(p);
                    454:        return;
                    455:    }
                    456: }
                    457: 
                    458: int member(int c, const char *sarg)    /* is c in s? */
                    459: {
                    460:    uschar *s = (uschar *) sarg;
                    461: 
                    462:    while (*s)
                    463:        if (c == *s++)
                    464:            return(1);
                    465:    return(0);
                    466: }
                    467: 
                    468: int match(fa *f, const char *p0)   /* shortest match ? */
                    469: {
                    470:    int s, ns;
                    471:    uschar *p = (uschar *) p0;
                    472: 
                    473:    s = f->reset ? makeinit(f,0) : f->initstat;
                    474:    if (f->out[s])
                    475:        return(1);
                    476:    do {
                    477:        /* assert(*p < NCHARS); */
                    478:        if ((ns = f->gototab[s][*p]) != 0)
                    479:            s = ns;
                    480:        else
                    481:            s = cgoto(f, s, *p);
                    482:        if (f->out[s])
                    483:            return(1);
                    484:    } while (*p++ != 0);
                    485:    return(0);
                    486: }
                    487: 
                    488: int pmatch(fa *f, const char *p0)  /* longest match, for sub */
                    489: {
                    490:    int s, ns;
                    491:    uschar *p = (uschar *) p0;
                    492:    uschar *q;
                    493:    int i, k;
                    494: 
                    495:    /* s = f->reset ? makeinit(f,1) : f->initstat; */
                    496:    if (f->reset) {
                    497:        f->initstat = s = makeinit(f,1);
                    498:    } else {
                    499:        s = f->initstat;
                    500:    }
                    501:    patbeg = (char *) p;
                    502:    patlen = -1;
                    503:    do {
                    504:        q = p;
                    505:        do {
                    506:            if (f->out[s])      /* final state */
                    507:                patlen = q-p;
                    508:            /* assert(*q < NCHARS); */
                    509:            if ((ns = f->gototab[s][*q]) != 0)
                    510:                s = ns;
                    511:            else
                    512:                s = cgoto(f, s, *q);
                    513:            if (s == 1) {   /* no transition */
                    514:                if (patlen >= 0) {
                    515:                    patbeg = (char *) p;
                    516:                    return(1);
                    517:                }
                    518:                else
                    519:                    goto nextin;    /* no match */
                    520:            }
                    521:        } while (*q++ != 0);
                    522:        if (f->out[s])
                    523:            patlen = q-p-1; /* don't count $ */
                    524:        if (patlen >= 0) {
                    525:            patbeg = (char *) p;
                    526:            return(1);
                    527:        }
                    528:    nextin:
                    529:        s = 2;
                    530:        if (f->reset) {
                    531:            for (i = 2; i <= f->curstat; i++)
                    532:                xfree(f->posns[i]);
                    533:            k = *f->posns[0];           
                    534:            if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
                    535:                overflo("out of space in pmatch");
                    536:            for (i = 0; i <= k; i++)
                    537:                (f->posns[2])[i] = (f->posns[0])[i];
                    538:            f->initstat = f->curstat = 2;
                    539:            f->out[2] = f->out[0];
                    540:            for (i = 0; i < NCHARS; i++)
                    541:                f->gototab[2][i] = 0;
                    542:        }
                    543:    } while (*p++ != 0);
                    544:    return (0);
                    545: }
                    546: 
                    547: int nematch(fa *f, const char *p0) /* non-empty match, for sub */
                    548: {
                    549:    int s, ns;
                    550:    uschar *p = (uschar *) p0;
                    551:    uschar *q;
                    552:    int i, k;
                    553: 
                    554:    /* s = f->reset ? makeinit(f,1) : f->initstat; */
                    555:    if (f->reset) {
                    556:        f->initstat = s = makeinit(f,1);
                    557:    } else {
                    558:        s = f->initstat;
                    559:    }
                    560:    patlen = -1;
                    561:    while (*p) {
                    562:        q = p;
                    563:        do {
                    564:            if (f->out[s])      /* final state */
                    565:                patlen = q-p;
                    566:            /* assert(*q < NCHARS); */
                    567:            if ((ns = f->gototab[s][*q]) != 0)
                    568:                s = ns;
                    569:            else
                    570:                s = cgoto(f, s, *q);
                    571:            if (s == 1) {   /* no transition */
                    572:                if (patlen > 0) {
                    573:                    patbeg = (char *) p;
                    574:                    return(1);
                    575:                } else
                    576:                    goto nnextin;   /* no nonempty match */
                    577:            }
                    578:        } while (*q++ != 0);
                    579:        if (f->out[s])
                    580:            patlen = q-p-1; /* don't count $ */
                    581:        if (patlen > 0 ) {
                    582:            patbeg = (char *) p;
                    583:            return(1);
                    584:        }
                    585:    nnextin:
                    586:        s = 2;
                    587:        if (f->reset) {
                    588:            for (i = 2; i <= f->curstat; i++)
                    589:                xfree(f->posns[i]);
                    590:            k = *f->posns[0];           
                    591:            if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
                    592:                overflo("out of state space");
                    593:            for (i = 0; i <= k; i++)
                    594:                (f->posns[2])[i] = (f->posns[0])[i];
                    595:            f->initstat = f->curstat = 2;
                    596:            f->out[2] = f->out[0];
                    597:            for (i = 0; i < NCHARS; i++)
                    598:                f->gototab[2][i] = 0;
                    599:        }
                    600:        p++;
                    601:    }
                    602:    return (0);
                    603: }
                    604: 
                    605: Node *reparse(const char *p)   /* parses regular expression pointed to by p */
                    606: {          /* uses relex() to scan regular expression */
                    607:    Node *np;
                    608: 
                    609:    dprintf( ("reparse <%s>\n", p) );
                    610:    lastre = prestr = (uschar *) p; /* prestr points to string to be parsed */
                    611:    rtok = relex();
                    612:    /* GNU compatibility: an empty regexp matches anything */
                    613:    if (rtok == '\0') {
                    614:        /* FATAL("empty regular expression"); previous */
                    615:        return(op2(EMPTYRE, NIL, NIL));
                    616:    }
                    617:    np = regexp();
                    618:    if (rtok != '\0')
                    619:        FATAL("syntax error in regular expression %s at %s", lastre, prestr);
                    620:    return(np);
                    621: }
                    622: 
                    623: Node *regexp(void) /* top-level parse of reg expr */
                    624: {
                    625:    return (alt(concat(primary())));
                    626: }
                    627: 
                    628: Node *primary(void)
                    629: {
                    630:    Node *np;
                    631: 
                    632:    switch (rtok) {
                    633:    case CHAR:
                    634:        np = op2(CHAR, NIL, itonp(rlxval));
                    635:        rtok = relex();
                    636:        return (unary(np));
                    637:    case ALL:
                    638:        rtok = relex();
                    639:        return (unary(op2(ALL, NIL, NIL)));
                    640:    case EMPTYRE:
                    641:        rtok = relex();
                    642:        return (unary(op2(ALL, NIL, NIL)));
                    643:    case DOT:
                    644:        rtok = relex();
                    645:        return (unary(op2(DOT, NIL, NIL)));
                    646:    case CCL:
                    647:        np = op2(CCL, NIL, (Node*) cclenter((char *) rlxstr));
                    648:        rtok = relex();
                    649:        return (unary(np));
                    650:    case NCCL:
                    651:        np = op2(NCCL, NIL, (Node *) cclenter((char *) rlxstr));
                    652:        rtok = relex();
                    653:        return (unary(np));
                    654:    case '^':
                    655:        rtok = relex();
                    656:        return (unary(op2(CHAR, NIL, itonp(HAT))));
                    657:    case '$':
                    658:        rtok = relex();
                    659:        return (unary(op2(CHAR, NIL, NIL)));
                    660:    case '(':
                    661:        rtok = relex();
                    662:        if (rtok == ')') {  /* special pleading for () */
                    663:            rtok = relex();
                    664:            return unary(op2(CCL, NIL, (Node *) tostring("")));
                    665:        }
                    666:        np = regexp();
                    667:        if (rtok == ')') {
                    668:            rtok = relex();
                    669:            return (unary(np));
                    670:        }
                    671:        else
                    672:            FATAL("syntax error in regular expression %s at %s", lastre, prestr);
                    673:    default:
                    674:        FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
                    675:    }
                    676:    return 0;   /*NOTREACHED*/
                    677: }
                    678: 
                    679: Node *concat(Node *np)
                    680: {
                    681:    switch (rtok) {
                    682:    case CHAR: case DOT: case ALL: case EMPTYRE: case CCL: case NCCL: case '$': case '(':
                    683:        return (concat(op2(CAT, np, primary())));
                    684:    }
                    685:    return (np);
                    686: }
                    687: 
                    688: Node *alt(Node *np)
                    689: {
                    690:    if (rtok == OR) {
                    691:        rtok = relex();
                    692:        return (alt(op2(OR, np, concat(primary()))));
                    693:    }
                    694:    return (np);
                    695: }
                    696: 
                    697: Node *unary(Node *np)
                    698: {
                    699:    switch (rtok) {
                    700:    case STAR:
                    701:        rtok = relex();
                    702:        return (unary(op2(STAR, np, NIL)));
                    703:    case PLUS:
                    704:        rtok = relex();
                    705:        return (unary(op2(PLUS, np, NIL)));
                    706:    case QUEST:
                    707:        rtok = relex();
                    708:        return (unary(op2(QUEST, np, NIL)));
                    709:    default:
                    710:        return (np);
                    711:    }
                    712: }
                    713: 
                    714: /*
                    715:  * Character class definitions conformant to the POSIX locale as
                    716:  * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
                    717:  * and operating character sets are both ASCII (ISO646) or supersets
                    718:  * thereof.
                    719:  *
                    720:  * Note that to avoid overflowing the temporary buffer used in
                    721:  * relex(), the expanded character class (prior to range expansion)
                    722:  * must be less than twice the size of their full name.
                    723:  */
                    724: 
                    725: /* Because isblank doesn't show up in any of the header files on any
                    726:  * system i use, it's defined here.  if some other locale has a richer
                    727:  * definition of "blank", define HAS_ISBLANK and provide your own
                    728:  * version.
                    729:  * the parentheses here are an attempt to find a path through the maze
                    730:  * of macro definition and/or function and/or version provided.  thanks
                    731:  * to nelson beebe for the suggestion; let's see if it works everywhere.
                    732:  */
                    733: 
                    734: /* #define HAS_ISBLANK */
                    735: #ifndef HAS_ISBLANK
                    736: 
                    737: int (xisblank)(int c)
                    738: {
                    739:    return c==' ' || c=='\t';
                    740: }
                    741: 
                    742: #endif
                    743: 
                    744: struct charclass {
                    745:    const char *cc_name;
                    746:    int cc_namelen;
                    747:    int (*cc_func)(int);
                    748: } charclasses[] = {
                    749:    { "alnum",  5,  isalnum },
                    750:    { "alpha",  5,  isalpha },
1.2     ! bertrand  751: #ifndef HAS_ISBLANK
1.1       bertrand  752:    { "blank",  5,  isspace }, /* was isblank */
1.2     ! bertrand  753: #else
        !           754:    { "blank",  5,  isblank },
        !           755: #endif
1.1       bertrand  756:    { "cntrl",  5,  iscntrl },
                    757:    { "digit",  5,  isdigit },
                    758:    { "graph",  5,  isgraph },
                    759:    { "lower",  5,  islower },
                    760:    { "print",  5,  isprint },
                    761:    { "punct",  5,  ispunct },
                    762:    { "space",  5,  isspace },
                    763:    { "upper",  5,  isupper },
                    764:    { "xdigit", 6,  isxdigit },
                    765:    { NULL,     0,  NULL },
                    766: };
                    767: 
                    768: 
                    769: int relex(void)        /* lexical analyzer for reparse */
                    770: {
                    771:    int c, n;
                    772:    int cflag;
                    773:    static uschar *buf = 0;
                    774:    static int bufsz = 100;
                    775:    uschar *bp;
                    776:    struct charclass *cc;
                    777:    int i;
                    778: 
                    779:    switch (c = *prestr++) {
                    780:    case '|': return OR;
                    781:    case '*': return STAR;
                    782:    case '+': return PLUS;
                    783:    case '?': return QUEST;
                    784:    case '.': return DOT;
                    785:    case '\0': prestr--; return '\0';
                    786:    case '^':
                    787:    case '$':
                    788:    case '(':
                    789:    case ')':
                    790:        return c;
                    791:    case '\\':
1.2     ! bertrand  792:        rlxval = quoted(&prestr);
1.1       bertrand  793:        return CHAR;
                    794:    default:
                    795:        rlxval = c;
                    796:        return CHAR;
                    797:    case '[': 
                    798:        if (buf == 0 && (buf = (uschar *) malloc(bufsz)) == NULL)
                    799:            FATAL("out of space in reg expr %.10s..", lastre);
                    800:        bp = buf;
                    801:        if (*prestr == '^') {
                    802:            cflag = 1;
                    803:            prestr++;
                    804:        }
                    805:        else
                    806:            cflag = 0;
                    807:        n = 2 * strlen((const char *) prestr)+1;
                    808:        if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
                    809:            FATAL("out of space for reg expr %.10s...", lastre);
                    810:        for (; ; ) {
                    811:            if ((c = *prestr++) == '\\') {
                    812:                *bp++ = '\\';
                    813:                if ((c = *prestr++) == '\0')
                    814:                    FATAL("nonterminated character class %.20s...", lastre);
                    815:                *bp++ = c;
                    816:            /* } else if (c == '\n') { */
                    817:            /*  FATAL("newline in character class %.20s...", lastre); */
                    818:            } else if (c == '[' && *prestr == ':') {
                    819:                /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
                    820:                for (cc = charclasses; cc->cc_name; cc++)
                    821:                    if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
                    822:                        break;
                    823:                if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
                    824:                    prestr[2 + cc->cc_namelen] == ']') {
                    825:                    prestr += cc->cc_namelen + 3;
                    826:                    for (i = 0; i < NCHARS; i++) {
                    827:                        if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2"))
                    828:                            FATAL("out of space for reg expr %.10s...", lastre);
                    829:                        if (cc->cc_func(i)) {
                    830:                            *bp++ = i;
                    831:                            n++;
                    832:                        }
                    833:                    }
                    834:                } else
                    835:                    *bp++ = c;
                    836:            } else if (c == '\0') {
                    837:                FATAL("nonterminated character class %.20s", lastre);
                    838:            } else if (bp == buf) { /* 1st char is special */
                    839:                *bp++ = c;
                    840:            } else if (c == ']') {
                    841:                *bp++ = 0;
                    842:                rlxstr = (uschar *) tostring((char *) buf);
                    843:                if (cflag == 0)
                    844:                    return CCL;
                    845:                else
                    846:                    return NCCL;
                    847:            } else
                    848:                *bp++ = c;
                    849:        }
                    850:    }
                    851: }
                    852: 
                    853: int cgoto(fa *f, int s, int c)
                    854: {
                    855:    int i, j, k;
                    856:    int *p, *q;
                    857: 
                    858:    assert(c == HAT || c < NCHARS);
                    859:    while (f->accept >= maxsetvec) {    /* guessing here! */
                    860:        maxsetvec *= 4;
                    861:        setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
                    862:        tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
                    863:        if (setvec == 0 || tmpset == 0)
                    864:            overflo("out of space in cgoto()");
                    865:    }
                    866:    for (i = 0; i <= f->accept; i++)
                    867:        setvec[i] = 0;
                    868:    setcnt = 0;
                    869:    /* compute positions of gototab[s,c] into setvec */
                    870:    p = f->posns[s];
                    871:    for (i = 1; i <= *p; i++) {
                    872:        if ((k = f->re[p[i]].ltype) != FINAL) {
                    873:            if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
                    874:             || (k == DOT && c != 0 && c != HAT)
                    875:             || (k == ALL && c != 0)
                    876:             || (k == EMPTYRE && c != 0)
                    877:             || (k == CCL && member(c, (char *) f->re[p[i]].lval.up))
                    878:             || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) {
                    879:                q = f->re[p[i]].lfollow;
                    880:                for (j = 1; j <= *q; j++) {
                    881:                    if (q[j] >= maxsetvec) {
                    882:                        maxsetvec *= 4;
                    883:                        setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
                    884:                        tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
                    885:                        if (setvec == 0 || tmpset == 0)
                    886:                            overflo("cgoto overflow");
                    887:                    }
                    888:                    if (setvec[q[j]] == 0) {
                    889:                        setcnt++;
                    890:                        setvec[q[j]] = 1;
                    891:                    }
                    892:                }
                    893:            }
                    894:        }
                    895:    }
                    896:    /* determine if setvec is a previous state */
                    897:    tmpset[0] = setcnt;
                    898:    j = 1;
                    899:    for (i = f->accept; i >= 0; i--)
                    900:        if (setvec[i]) {
                    901:            tmpset[j++] = i;
                    902:        }
                    903:    /* tmpset == previous state? */
                    904:    for (i = 1; i <= f->curstat; i++) {
                    905:        p = f->posns[i];
                    906:        if ((k = tmpset[0]) != p[0])
                    907:            goto different;
                    908:        for (j = 1; j <= k; j++)
                    909:            if (tmpset[j] != p[j])
                    910:                goto different;
                    911:        /* setvec is state i */
                    912:        f->gototab[s][c] = i;
                    913:        return i;
                    914:      different:;
                    915:    }
                    916: 
                    917:    /* add tmpset to current set of states */
                    918:    if (f->curstat >= NSTATES-1) {
                    919:        f->curstat = 2;
                    920:        f->reset = 1;
                    921:        for (i = 2; i < NSTATES; i++)
                    922:            xfree(f->posns[i]);
                    923:    } else
                    924:        ++(f->curstat);
                    925:    for (i = 0; i < NCHARS; i++)
                    926:        f->gototab[f->curstat][i] = 0;
                    927:    xfree(f->posns[f->curstat]);
                    928:    if ((p = (int *) calloc(1, (setcnt+1)*sizeof(int))) == NULL)
                    929:        overflo("out of space in cgoto");
                    930: 
                    931:    f->posns[f->curstat] = p;
                    932:    f->gototab[s][c] = f->curstat;
                    933:    for (i = 0; i <= setcnt; i++)
                    934:        p[i] = tmpset[i];
                    935:    if (setvec[f->accept])
                    936:        f->out[f->curstat] = 1;
                    937:    else
                    938:        f->out[f->curstat] = 0;
                    939:    return f->curstat;
                    940: }
                    941: 
                    942: 
                    943: void freefa(fa *f) /* free a finite automaton */
                    944: {
                    945:    int i;
                    946: 
                    947:    if (f == NULL)
                    948:        return;
                    949:    for (i = 0; i <= f->curstat; i++)
                    950:        xfree(f->posns[i]);
                    951:    for (i = 0; i <= f->accept; i++) {
                    952:        xfree(f->re[i].lfollow);
                    953:        if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
                    954:            xfree((f->re[i].lval.np));
                    955:    }
                    956:    xfree(f->restr);
                    957:    xfree(f);
                    958: }

CVSweb interface <joel.bertrand@systella.fr>