lpcode.cc (31666B)
1 /* 2 ** $Id: lpcode.c,v 1.24 2016/09/15 17:46:13 roberto Exp $ 3 ** Copyright 2007, Lua.org & PUC-Rio (see 'lpeg.html' for license) 4 */ 5 6 #include <limits.h> 7 8 9 #include "lua.h" 10 #include "lauxlib.h" 11 12 #include "lptypes.h" 13 #include "lpcode.h" 14 15 16 /* signals a "no-instruction */ 17 #define NOINST -1 18 19 20 21 static const Charset fullset_ = 22 {{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 23 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 24 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 25 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}}; 26 27 static const Charset *fullset = &fullset_; 28 29 /* 30 ** {====================================================== 31 ** Analysis and some optimizations 32 ** ======================================================= 33 */ 34 35 /* 36 ** Check whether a charset is empty (returns IFail), singleton (IChar), 37 ** full (IAny), or none of those (ISet). When singleton, '*c' returns 38 ** which character it is. (When generic set, the set was the input, 39 ** so there is no need to return it.) 40 */ 41 static Opcode charsettype (const byte *cs, int *c) { 42 int count = 0; /* number of characters in the set */ 43 int i; 44 int candidate = -1; /* candidate position for the singleton char */ 45 for (i = 0; i < CHARSETSIZE; i++) { /* for each byte */ 46 int b = cs[i]; 47 if (b == 0) { /* is byte empty? */ 48 if (count > 1) /* was set neither empty nor singleton? */ 49 return ISet; /* neither full nor empty nor singleton */ 50 /* else set is still empty or singleton */ 51 } 52 else if (b == 0xFF) { /* is byte full? */ 53 if (count < (i * BITSPERCHAR)) /* was set not full? */ 54 return ISet; /* neither full nor empty nor singleton */ 55 else count += BITSPERCHAR; /* set is still full */ 56 } 57 else if ((b & (b - 1)) == 0) { /* has byte only one bit? */ 58 if (count > 0) /* was set not empty? */ 59 return ISet; /* neither full nor empty nor singleton */ 60 else { /* set has only one char till now; track it */ 61 count++; 62 candidate = i; 63 } 64 } 65 else return ISet; /* byte is neither empty, full, nor singleton */ 66 } 67 switch (count) { 68 case 0: return IFail; /* empty set */ 69 case 1: { /* singleton; find character bit inside byte */ 70 int b = cs[candidate]; 71 *c = candidate * BITSPERCHAR; 72 if ((b & 0xF0) != 0) { *c += 4; b >>= 4; } 73 if ((b & 0x0C) != 0) { *c += 2; b >>= 2; } 74 if ((b & 0x02) != 0) { *c += 1; } 75 return IChar; 76 } 77 default: { 78 assert(count == CHARSETSIZE * BITSPERCHAR); /* full set */ 79 return IAny; 80 } 81 } 82 } 83 84 85 /* 86 ** A few basic operations on Charsets 87 */ 88 static void cs_complement (Charset *cs) { 89 loopset(i, cs->cs[i] = ~cs->cs[i]); 90 } 91 92 static int cs_equal (const byte *cs1, const byte *cs2) { 93 loopset(i, if (cs1[i] != cs2[i]) return 0); 94 return 1; 95 } 96 97 static int cs_disjoint (const Charset *cs1, const Charset *cs2) { 98 loopset(i, if ((cs1->cs[i] & cs2->cs[i]) != 0) return 0;) 99 return 1; 100 } 101 102 103 /* 104 ** If 'tree' is a 'char' pattern (TSet, TChar, TAny), convert it into a 105 ** charset and return 1; else return 0. 106 */ 107 int tocharset (TTree *tree, Charset *cs) { 108 switch (tree->tag) { 109 case TSet: { /* copy set */ 110 loopset(i, cs->cs[i] = treebuffer(tree)[i]); 111 return 1; 112 } 113 case TChar: { /* only one char */ 114 assert(0 <= tree->u.n && tree->u.n <= UCHAR_MAX); 115 loopset(i, cs->cs[i] = 0); /* erase all chars */ 116 setchar(cs->cs, tree->u.n); /* add that one */ 117 return 1; 118 } 119 case TAny: { 120 loopset(i, cs->cs[i] = 0xFF); /* add all characters to the set */ 121 return 1; 122 } 123 default: return 0; 124 } 125 } 126 127 128 /* 129 ** Visit a TCall node taking care to stop recursion. If node not yet 130 ** visited, return 'f(sib2(tree))', otherwise return 'def' (default 131 ** value) 132 */ 133 static int callrecursive (TTree *tree, int f (TTree *t), int def) { 134 int key = tree->key; 135 assert(tree->tag == TCall); 136 assert(sib2(tree)->tag == TRule); 137 if (key == 0) /* node already visited? */ 138 return def; /* return default value */ 139 else { /* first visit */ 140 int result; 141 tree->key = 0; /* mark call as already visited */ 142 result = f(sib2(tree)); /* go to called rule */ 143 tree->key = key; /* restore tree */ 144 return result; 145 } 146 } 147 148 149 /* 150 ** Check whether a pattern tree has captures 151 */ 152 int hascaptures (TTree *tree) { 153 tailcall: 154 switch (tree->tag) { 155 case TCapture: case TRunTime: 156 return 1; 157 case TCall: 158 return callrecursive(tree, hascaptures, 0); 159 case TRule: /* do not follow siblings */ 160 tree = sib1(tree); goto tailcall; 161 case TOpenCall: assert(0); 162 default: { 163 switch (numsiblings[tree->tag]) { 164 case 1: /* return hascaptures(sib1(tree)); */ 165 tree = sib1(tree); goto tailcall; 166 case 2: 167 if (hascaptures(sib1(tree))) 168 return 1; 169 /* else return hascaptures(sib2(tree)); */ 170 tree = sib2(tree); goto tailcall; 171 default: assert(numsiblings[tree->tag] == 0); return 0; 172 } 173 } 174 } 175 } 176 177 178 /* 179 ** Checks how a pattern behaves regarding the empty string, 180 ** in one of two different ways: 181 ** A pattern is *nullable* if it can match without consuming any character; 182 ** A pattern is *nofail* if it never fails for any string 183 ** (including the empty string). 184 ** The difference is only for predicates and run-time captures; 185 ** for other patterns, the two properties are equivalent. 186 ** (With predicates, &'a' is nullable but not nofail. Of course, 187 ** nofail => nullable.) 188 ** These functions are all convervative in the following way: 189 ** p is nullable => nullable(p) 190 ** nofail(p) => p cannot fail 191 ** The function assumes that TOpenCall is not nullable; 192 ** this will be checked again when the grammar is fixed. 193 ** Run-time captures can do whatever they want, so the result 194 ** is conservative. 195 */ 196 int checkaux (TTree *tree, int pred) { 197 tailcall: 198 switch (tree->tag) { 199 case TChar: case TSet: case TAny: 200 case TFalse: case TOpenCall: 201 return 0; /* not nullable */ 202 case TRep: case TTrue: 203 return 1; /* no fail */ 204 case TNot: case TBehind: /* can match empty, but can fail */ 205 if (pred == PEnofail) return 0; 206 else return 1; /* PEnullable */ 207 case TAnd: /* can match empty; fail iff body does */ 208 if (pred == PEnullable) return 1; 209 /* else return checkaux(sib1(tree), pred); */ 210 tree = sib1(tree); goto tailcall; 211 case TRunTime: /* can fail; match empty iff body does */ 212 if (pred == PEnofail) return 0; 213 /* else return checkaux(sib1(tree), pred); */ 214 tree = sib1(tree); goto tailcall; 215 case TSeq: 216 if (!checkaux(sib1(tree), pred)) return 0; 217 /* else return checkaux(sib2(tree), pred); */ 218 tree = sib2(tree); goto tailcall; 219 case TChoice: 220 if (checkaux(sib2(tree), pred)) return 1; 221 /* else return checkaux(sib1(tree), pred); */ 222 tree = sib1(tree); goto tailcall; 223 case TCapture: case TGrammar: case TRule: 224 /* return checkaux(sib1(tree), pred); */ 225 tree = sib1(tree); goto tailcall; 226 case TCall: /* return checkaux(sib2(tree), pred); */ 227 tree = sib2(tree); goto tailcall; 228 default: assert(0); return 0; 229 } 230 } 231 232 233 /* 234 ** number of characters to match a pattern (or -1 if variable) 235 */ 236 int fixedlen (TTree *tree) { 237 int len = 0; /* to accumulate in tail calls */ 238 tailcall: 239 switch (tree->tag) { 240 case TChar: case TSet: case TAny: 241 return len + 1; 242 case TFalse: case TTrue: case TNot: case TAnd: case TBehind: 243 return len; 244 case TRep: case TRunTime: case TOpenCall: 245 return -1; 246 case TCapture: case TRule: case TGrammar: 247 /* return fixedlen(sib1(tree)); */ 248 tree = sib1(tree); goto tailcall; 249 case TCall: { 250 int n1 = callrecursive(tree, fixedlen, -1); 251 if (n1 < 0) 252 return -1; 253 else 254 return len + n1; 255 } 256 case TSeq: { 257 int n1 = fixedlen(sib1(tree)); 258 if (n1 < 0) 259 return -1; 260 /* else return fixedlen(sib2(tree)) + len; */ 261 len += n1; tree = sib2(tree); goto tailcall; 262 } 263 case TChoice: { 264 int n1 = fixedlen(sib1(tree)); 265 int n2 = fixedlen(sib2(tree)); 266 if (n1 != n2 || n1 < 0) 267 return -1; 268 else 269 return len + n1; 270 } 271 default: assert(0); return 0; 272 }; 273 } 274 275 276 /* 277 ** Computes the 'first set' of a pattern. 278 ** The result is a conservative aproximation: 279 ** match p ax -> x (for some x) ==> a belongs to first(p) 280 ** or 281 ** a not in first(p) ==> match p ax -> fail (for all x) 282 ** 283 ** The set 'follow' is the first set of what follows the 284 ** pattern (full set if nothing follows it). 285 ** 286 ** The function returns 0 when this resulting set can be used for 287 ** test instructions that avoid the pattern altogether. 288 ** A non-zero return can happen for two reasons: 289 ** 1) match p '' -> '' ==> return has bit 1 set 290 ** (tests cannot be used because they would always fail for an empty input); 291 ** 2) there is a match-time capture ==> return has bit 2 set 292 ** (optimizations should not bypass match-time captures). 293 */ 294 static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) { 295 tailcall: 296 switch (tree->tag) { 297 case TChar: case TSet: case TAny: { 298 tocharset(tree, firstset); 299 return 0; 300 } 301 case TTrue: { 302 loopset(i, firstset->cs[i] = follow->cs[i]); 303 return 1; /* accepts the empty string */ 304 } 305 case TFalse: { 306 loopset(i, firstset->cs[i] = 0); 307 return 0; 308 } 309 case TChoice: { 310 Charset csaux; 311 int e1 = getfirst(sib1(tree), follow, firstset); 312 int e2 = getfirst(sib2(tree), follow, &csaux); 313 loopset(i, firstset->cs[i] |= csaux.cs[i]); 314 return e1 | e2; 315 } 316 case TSeq: { 317 if (!nullable(sib1(tree))) { 318 /* when p1 is not nullable, p2 has nothing to contribute; 319 return getfirst(sib1(tree), fullset, firstset); */ 320 tree = sib1(tree); follow = fullset; goto tailcall; 321 } 322 else { /* FIRST(p1 p2, fl) = FIRST(p1, FIRST(p2, fl)) */ 323 Charset csaux; 324 int e2 = getfirst(sib2(tree), follow, &csaux); 325 int e1 = getfirst(sib1(tree), &csaux, firstset); 326 if (e1 == 0) return 0; /* 'e1' ensures that first can be used */ 327 else if ((e1 | e2) & 2) /* one of the children has a matchtime? */ 328 return 2; /* pattern has a matchtime capture */ 329 else return e2; /* else depends on 'e2' */ 330 } 331 } 332 case TRep: { 333 getfirst(sib1(tree), follow, firstset); 334 loopset(i, firstset->cs[i] |= follow->cs[i]); 335 return 1; /* accept the empty string */ 336 } 337 case TCapture: case TGrammar: case TRule: { 338 /* return getfirst(sib1(tree), follow, firstset); */ 339 tree = sib1(tree); goto tailcall; 340 } 341 case TRunTime: { /* function invalidates any follow info. */ 342 int e = getfirst(sib1(tree), fullset, firstset); 343 if (e) return 2; /* function is not "protected"? */ 344 else return 0; /* pattern inside capture ensures first can be used */ 345 } 346 case TCall: { 347 /* return getfirst(sib2(tree), follow, firstset); */ 348 tree = sib2(tree); goto tailcall; 349 } 350 case TAnd: { 351 int e = getfirst(sib1(tree), follow, firstset); 352 loopset(i, firstset->cs[i] &= follow->cs[i]); 353 return e; 354 } 355 case TNot: { 356 if (tocharset(sib1(tree), firstset)) { 357 cs_complement(firstset); 358 return 1; 359 } 360 /* else go through */ 361 } 362 case TBehind: { /* instruction gives no new information */ 363 /* call 'getfirst' only to check for math-time captures */ 364 int e = getfirst(sib1(tree), follow, firstset); 365 loopset(i, firstset->cs[i] = follow->cs[i]); /* uses follow */ 366 return e | 1; /* always can accept the empty string */ 367 } 368 default: assert(0); return 0; 369 } 370 } 371 372 373 /* 374 ** If 'headfail(tree)' true, then 'tree' can fail only depending on the 375 ** next character of the subject. 376 */ 377 static int headfail (TTree *tree) { 378 tailcall: 379 switch (tree->tag) { 380 case TChar: case TSet: case TAny: case TFalse: 381 return 1; 382 case TTrue: case TRep: case TRunTime: case TNot: 383 case TBehind: 384 return 0; 385 case TCapture: case TGrammar: case TRule: case TAnd: 386 tree = sib1(tree); goto tailcall; /* return headfail(sib1(tree)); */ 387 case TCall: 388 tree = sib2(tree); goto tailcall; /* return headfail(sib2(tree)); */ 389 case TSeq: 390 if (!nofail(sib2(tree))) return 0; 391 /* else return headfail(sib1(tree)); */ 392 tree = sib1(tree); goto tailcall; 393 case TChoice: 394 if (!headfail(sib1(tree))) return 0; 395 /* else return headfail(sib2(tree)); */ 396 tree = sib2(tree); goto tailcall; 397 default: assert(0); return 0; 398 } 399 } 400 401 402 /* 403 ** Check whether the code generation for the given tree can benefit 404 ** from a follow set (to avoid computing the follow set when it is 405 ** not needed) 406 */ 407 static int needfollow (TTree *tree) { 408 tailcall: 409 switch (tree->tag) { 410 case TChar: case TSet: case TAny: 411 case TFalse: case TTrue: case TAnd: case TNot: 412 case TRunTime: case TGrammar: case TCall: case TBehind: 413 return 0; 414 case TChoice: case TRep: 415 return 1; 416 case TCapture: 417 tree = sib1(tree); goto tailcall; 418 case TSeq: 419 tree = sib2(tree); goto tailcall; 420 default: assert(0); return 0; 421 } 422 } 423 424 /* }====================================================== */ 425 426 427 428 /* 429 ** {====================================================== 430 ** Code generation 431 ** ======================================================= 432 */ 433 434 435 /* 436 ** size of an instruction 437 */ 438 int sizei (const Instruction *i) { 439 switch((Opcode)i->i.code) { 440 case ISet: case ISpan: return CHARSETINSTSIZE; 441 case ITestSet: return CHARSETINSTSIZE + 1; 442 case ITestChar: case ITestAny: case IChoice: case IJmp: case ICall: 443 case IOpenCall: case ICommit: case IPartialCommit: case IBackCommit: 444 return 2; 445 default: return 1; 446 } 447 } 448 449 450 /* 451 ** state for the compiler 452 */ 453 typedef struct CompileState { 454 Pattern *p; /* pattern being compiled */ 455 int ncode; /* next position in p->code to be filled */ 456 lua_State *L; 457 } CompileState; 458 459 460 /* 461 ** code generation is recursive; 'opt' indicates that the code is being 462 ** generated as the last thing inside an optional pattern (so, if that 463 ** code is optional too, it can reuse the 'IChoice' already in place for 464 ** the outer pattern). 'tt' points to a previous test protecting this 465 ** code (or NOINST). 'fl' is the follow set of the pattern. 466 */ 467 static void codegen (CompileState *compst, TTree *tree, int opt, int tt, 468 const Charset *fl); 469 470 471 void realloccode (lua_State *L, Pattern *p, int nsize) { 472 void *ud; 473 lua_Alloc f = lua_getallocf(L, &ud); 474 void *newblock = f(ud, p->code, p->codesize * sizeof(Instruction), 475 nsize * sizeof(Instruction)); 476 if (newblock == NULL && nsize > 0) 477 luaL_error(L, "not enough memory"); 478 p->code = (Instruction *)newblock; 479 p->codesize = nsize; 480 } 481 482 483 static int nextinstruction (CompileState *compst) { 484 int size = compst->p->codesize; 485 if (compst->ncode >= size) 486 realloccode(compst->L, compst->p, size * 2); 487 return compst->ncode++; 488 } 489 490 491 #define getinstr(cs,i) ((cs)->p->code[i]) 492 493 494 static int addinstruction (CompileState *compst, Opcode op, int aux) { 495 int i = nextinstruction(compst); 496 getinstr(compst, i).i.code = op; 497 getinstr(compst, i).i.aux = aux; 498 return i; 499 } 500 501 502 /* 503 ** Add an instruction followed by space for an offset (to be set later) 504 */ 505 static int addoffsetinst (CompileState *compst, Opcode op) { 506 int i = addinstruction(compst, op, 0); /* instruction */ 507 addinstruction(compst, (Opcode)0, 0); /* open space for offset */ 508 assert(op == ITestSet || sizei(&getinstr(compst, i)) == 2); 509 return i; 510 } 511 512 513 /* 514 ** Set the offset of an instruction 515 */ 516 static void setoffset (CompileState *compst, int instruction, int offset) { 517 getinstr(compst, instruction + 1).offset = offset; 518 } 519 520 521 /* 522 ** Add a capture instruction: 523 ** 'op' is the capture instruction; 'cap' the capture kind; 524 ** 'key' the key into ktable; 'aux' is the optional capture offset 525 ** 526 */ 527 static int addinstcap (CompileState *compst, Opcode op, int cap, int key, 528 int aux) { 529 int i = addinstruction(compst, op, joinkindoff(cap, aux)); 530 getinstr(compst, i).i.key = key; 531 return i; 532 } 533 534 535 #define gethere(compst) ((compst)->ncode) 536 537 #define target(code,i) ((i) + code[i + 1].offset) 538 539 540 /* 541 ** Patch 'instruction' to jump to 'target' 542 */ 543 static void jumptothere (CompileState *compst, int instruction, int target) { 544 if (instruction >= 0) 545 setoffset(compst, instruction, target - instruction); 546 } 547 548 549 /* 550 ** Patch 'instruction' to jump to current position 551 */ 552 static void jumptohere (CompileState *compst, int instruction) { 553 jumptothere(compst, instruction, gethere(compst)); 554 } 555 556 557 /* 558 ** Code an IChar instruction, or IAny if there is an equivalent 559 ** test dominating it 560 */ 561 static void codechar (CompileState *compst, int c, int tt) { 562 if (tt >= 0 && getinstr(compst, tt).i.code == ITestChar && 563 getinstr(compst, tt).i.aux == c) 564 addinstruction(compst, IAny, 0); 565 else 566 addinstruction(compst, IChar, c); 567 } 568 569 570 /* 571 ** Add a charset posfix to an instruction 572 */ 573 static void addcharset (CompileState *compst, const byte *cs) { 574 int p = gethere(compst); 575 int i; 576 for (i = 0; i < (int)CHARSETINSTSIZE - 1; i++) 577 nextinstruction(compst); /* space for buffer */ 578 /* fill buffer with charset */ 579 loopset(j, getinstr(compst, p).buff[j] = cs[j]); 580 } 581 582 583 /* 584 ** code a char set, optimizing unit sets for IChar, "complete" 585 ** sets for IAny, and empty sets for IFail; also use an IAny 586 ** when instruction is dominated by an equivalent test. 587 */ 588 static void codecharset (CompileState *compst, const byte *cs, int tt) { 589 int c = 0; /* (=) to avoid warnings */ 590 Opcode op = charsettype(cs, &c); 591 switch (op) { 592 case IChar: codechar(compst, c, tt); break; 593 case ISet: { /* non-trivial set? */ 594 if (tt >= 0 && getinstr(compst, tt).i.code == ITestSet && 595 cs_equal(cs, getinstr(compst, tt + 2).buff)) 596 addinstruction(compst, IAny, 0); 597 else { 598 addinstruction(compst, ISet, 0); 599 addcharset(compst, cs); 600 } 601 break; 602 } 603 default: addinstruction(compst, op, c); break; 604 } 605 } 606 607 608 /* 609 ** code a test set, optimizing unit sets for ITestChar, "complete" 610 ** sets for ITestAny, and empty sets for IJmp (always fails). 611 ** 'e' is true iff test should accept the empty string. (Test 612 ** instructions in the current VM never accept the empty string.) 613 */ 614 static int codetestset (CompileState *compst, Charset *cs, int e) { 615 if (e) return NOINST; /* no test */ 616 else { 617 int c = 0; 618 Opcode op = charsettype(cs->cs, &c); 619 switch (op) { 620 case IFail: return addoffsetinst(compst, IJmp); /* always jump */ 621 case IAny: return addoffsetinst(compst, ITestAny); 622 case IChar: { 623 int i = addoffsetinst(compst, ITestChar); 624 getinstr(compst, i).i.aux = c; 625 return i; 626 } 627 case ISet: { 628 int i = addoffsetinst(compst, ITestSet); 629 addcharset(compst, cs->cs); 630 return i; 631 } 632 default: assert(0); return 0; 633 } 634 } 635 } 636 637 638 /* 639 ** Find the final destination of a sequence of jumps 640 */ 641 static int finaltarget (Instruction *code, int i) { 642 while (code[i].i.code == IJmp) 643 i = target(code, i); 644 return i; 645 } 646 647 648 /* 649 ** final label (after traversing any jumps) 650 */ 651 static int finallabel (Instruction *code, int i) { 652 return finaltarget(code, target(code, i)); 653 } 654 655 656 /* 657 ** <behind(p)> == behind n; <p> (where n = fixedlen(p)) 658 */ 659 static void codebehind (CompileState *compst, TTree *tree) { 660 if (tree->u.n > 0) 661 addinstruction(compst, IBehind, tree->u.n); 662 codegen(compst, sib1(tree), 0, NOINST, fullset); 663 } 664 665 666 /* 667 ** Choice; optimizations: 668 ** - when p1 is headfail or 669 ** when first(p1) and first(p2) are disjoint, than 670 ** a character not in first(p1) cannot go to p1, and a character 671 ** in first(p1) cannot go to p2 (at it is not in first(p2)). 672 ** (The optimization is not valid if p1 accepts the empty string, 673 ** as then there is no character at all...) 674 ** - when p2 is empty and opt is true; a IPartialCommit can reuse 675 ** the Choice already active in the stack. 676 */ 677 static void codechoice (CompileState *compst, TTree *p1, TTree *p2, int opt, 678 const Charset *fl) { 679 int emptyp2 = (p2->tag == TTrue); 680 Charset cs1, cs2; 681 int e1 = getfirst(p1, fullset, &cs1); 682 if (headfail(p1) || 683 (!e1 && (getfirst(p2, fl, &cs2), cs_disjoint(&cs1, &cs2)))) { 684 /* <p1 / p2> == test (fail(p1)) -> L1 ; p1 ; jmp L2; L1: p2; L2: */ 685 int test = codetestset(compst, &cs1, 0); 686 int jmp = NOINST; 687 codegen(compst, p1, 0, test, fl); 688 if (!emptyp2) 689 jmp = addoffsetinst(compst, IJmp); 690 jumptohere(compst, test); 691 codegen(compst, p2, opt, NOINST, fl); 692 jumptohere(compst, jmp); 693 } 694 else if (opt && emptyp2) { 695 /* p1? == IPartialCommit; p1 */ 696 jumptohere(compst, addoffsetinst(compst, IPartialCommit)); 697 codegen(compst, p1, 1, NOINST, fullset); 698 } 699 else { 700 /* <p1 / p2> == 701 test(first(p1)) -> L1; choice L1; <p1>; commit L2; L1: <p2>; L2: */ 702 int pcommit; 703 int test = codetestset(compst, &cs1, e1); 704 int pchoice = addoffsetinst(compst, IChoice); 705 codegen(compst, p1, emptyp2, test, fullset); 706 pcommit = addoffsetinst(compst, ICommit); 707 jumptohere(compst, pchoice); 708 jumptohere(compst, test); 709 codegen(compst, p2, opt, NOINST, fl); 710 jumptohere(compst, pcommit); 711 } 712 } 713 714 715 /* 716 ** And predicate 717 ** optimization: fixedlen(p) = n ==> <&p> == <p>; behind n 718 ** (valid only when 'p' has no captures) 719 */ 720 static void codeand (CompileState *compst, TTree *tree, int tt) { 721 int n = fixedlen(tree); 722 if (n >= 0 && n <= MAXBEHIND && !hascaptures(tree)) { 723 codegen(compst, tree, 0, tt, fullset); 724 if (n > 0) 725 addinstruction(compst, IBehind, n); 726 } 727 else { /* default: Choice L1; p1; BackCommit L2; L1: Fail; L2: */ 728 int pcommit; 729 int pchoice = addoffsetinst(compst, IChoice); 730 codegen(compst, tree, 0, tt, fullset); 731 pcommit = addoffsetinst(compst, IBackCommit); 732 jumptohere(compst, pchoice); 733 addinstruction(compst, IFail, 0); 734 jumptohere(compst, pcommit); 735 } 736 } 737 738 739 /* 740 ** Captures: if pattern has fixed (and not too big) length, and it 741 ** has no nested captures, use a single IFullCapture instruction 742 ** after the match; otherwise, enclose the pattern with OpenCapture - 743 ** CloseCapture. 744 */ 745 static void codecapture (CompileState *compst, TTree *tree, int tt, 746 const Charset *fl) { 747 int len = fixedlen(sib1(tree)); 748 if (len >= 0 && len <= MAXOFF && !hascaptures(sib1(tree))) { 749 codegen(compst, sib1(tree), 0, tt, fl); 750 addinstcap(compst, IFullCapture, tree->cap, tree->key, len); 751 } 752 else { 753 addinstcap(compst, IOpenCapture, tree->cap, tree->key, 0); 754 codegen(compst, sib1(tree), 0, tt, fl); 755 addinstcap(compst, ICloseCapture, Cclose, 0, 0); 756 } 757 } 758 759 760 static void coderuntime (CompileState *compst, TTree *tree, int tt) { 761 addinstcap(compst, IOpenCapture, Cgroup, tree->key, 0); 762 codegen(compst, sib1(tree), 0, tt, fullset); 763 addinstcap(compst, ICloseRunTime, Cclose, 0, 0); 764 } 765 766 767 /* 768 ** Repetion; optimizations: 769 ** When pattern is a charset, can use special instruction ISpan. 770 ** When pattern is head fail, or if it starts with characters that 771 ** are disjoint from what follows the repetions, a simple test 772 ** is enough (a fail inside the repetition would backtrack to fail 773 ** again in the following pattern, so there is no need for a choice). 774 ** When 'opt' is true, the repetion can reuse the Choice already 775 ** active in the stack. 776 */ 777 static void coderep (CompileState *compst, TTree *tree, int opt, 778 const Charset *fl) { 779 Charset st; 780 if (tocharset(tree, &st)) { 781 addinstruction(compst, ISpan, 0); 782 addcharset(compst, st.cs); 783 } 784 else { 785 int e1 = getfirst(tree, fullset, &st); 786 if (headfail(tree) || (!e1 && cs_disjoint(&st, fl))) { 787 /* L1: test (fail(p1)) -> L2; <p>; jmp L1; L2: */ 788 int jmp; 789 int test = codetestset(compst, &st, 0); 790 codegen(compst, tree, 0, test, fullset); 791 jmp = addoffsetinst(compst, IJmp); 792 jumptohere(compst, test); 793 jumptothere(compst, jmp, test); 794 } 795 else { 796 /* test(fail(p1)) -> L2; choice L2; L1: <p>; partialcommit L1; L2: */ 797 /* or (if 'opt'): partialcommit L1; L1: <p>; partialcommit L1; */ 798 int commit, l2; 799 int test = codetestset(compst, &st, e1); 800 int pchoice = NOINST; 801 if (opt) 802 jumptohere(compst, addoffsetinst(compst, IPartialCommit)); 803 else 804 pchoice = addoffsetinst(compst, IChoice); 805 l2 = gethere(compst); 806 codegen(compst, tree, 0, NOINST, fullset); 807 commit = addoffsetinst(compst, IPartialCommit); 808 jumptothere(compst, commit, l2); 809 jumptohere(compst, pchoice); 810 jumptohere(compst, test); 811 } 812 } 813 } 814 815 816 /* 817 ** Not predicate; optimizations: 818 ** In any case, if first test fails, 'not' succeeds, so it can jump to 819 ** the end. If pattern is headfail, that is all (it cannot fail 820 ** in other parts); this case includes 'not' of simple sets. Otherwise, 821 ** use the default code (a choice plus a failtwice). 822 */ 823 static void codenot (CompileState *compst, TTree *tree) { 824 Charset st; 825 int e = getfirst(tree, fullset, &st); 826 int test = codetestset(compst, &st, e); 827 if (headfail(tree)) /* test (fail(p1)) -> L1; fail; L1: */ 828 addinstruction(compst, IFail, 0); 829 else { 830 /* test(fail(p))-> L1; choice L1; <p>; failtwice; L1: */ 831 int pchoice = addoffsetinst(compst, IChoice); 832 codegen(compst, tree, 0, NOINST, fullset); 833 addinstruction(compst, IFailTwice, 0); 834 jumptohere(compst, pchoice); 835 } 836 jumptohere(compst, test); 837 } 838 839 840 /* 841 ** change open calls to calls, using list 'positions' to find 842 ** correct offsets; also optimize tail calls 843 */ 844 static void correctcalls (CompileState *compst, int *positions, 845 int from, int to) { 846 int i; 847 Instruction *code = compst->p->code; 848 for (i = from; i < to; i += sizei(&code[i])) { 849 if (code[i].i.code == IOpenCall) { 850 int n = code[i].i.key; /* rule number */ 851 int rule = positions[n]; /* rule position */ 852 assert(rule == from || code[rule - 1].i.code == IRet); 853 if (code[finaltarget(code, i + 2)].i.code == IRet) /* call; ret ? */ 854 code[i].i.code = IJmp; /* tail call */ 855 else 856 code[i].i.code = ICall; 857 jumptothere(compst, i, rule); /* call jumps to respective rule */ 858 } 859 } 860 assert(i == to); 861 } 862 863 864 /* 865 ** Code for a grammar: 866 ** call L1; jmp L2; L1: rule 1; ret; rule 2; ret; ...; L2: 867 */ 868 static void codegrammar (CompileState *compst, TTree *grammar) { 869 int positions[MAXRULES]; 870 int rulenumber = 0; 871 TTree *rule; 872 int firstcall = addoffsetinst(compst, ICall); /* call initial rule */ 873 int jumptoend = addoffsetinst(compst, IJmp); /* jump to the end */ 874 int start = gethere(compst); /* here starts the initial rule */ 875 jumptohere(compst, firstcall); 876 for (rule = sib1(grammar); rule->tag == TRule; rule = sib2(rule)) { 877 positions[rulenumber++] = gethere(compst); /* save rule position */ 878 codegen(compst, sib1(rule), 0, NOINST, fullset); /* code rule */ 879 addinstruction(compst, IRet, 0); 880 } 881 assert(rule->tag == TTrue); 882 jumptohere(compst, jumptoend); 883 correctcalls(compst, positions, start, gethere(compst)); 884 } 885 886 887 static void codecall (CompileState *compst, TTree *call) { 888 int c = addoffsetinst(compst, IOpenCall); /* to be corrected later */ 889 getinstr(compst, c).i.key = sib2(call)->cap; /* rule number */ 890 assert(sib2(call)->tag == TRule); 891 } 892 893 894 /* 895 ** Code first child of a sequence 896 ** (second child is called in-place to allow tail call) 897 ** Return 'tt' for second child 898 */ 899 static int codeseq1 (CompileState *compst, TTree *p1, TTree *p2, 900 int tt, const Charset *fl) { 901 if (needfollow(p1)) { 902 Charset fl1; 903 getfirst(p2, fl, &fl1); /* p1 follow is p2 first */ 904 codegen(compst, p1, 0, tt, &fl1); 905 } 906 else /* use 'fullset' as follow */ 907 codegen(compst, p1, 0, tt, fullset); 908 if (fixedlen(p1) != 0) /* can 'p1' consume anything? */ 909 return NOINST; /* invalidate test */ 910 else return tt; /* else 'tt' still protects sib2 */ 911 } 912 913 914 /* 915 ** Main code-generation function: dispatch to auxiliar functions 916 ** according to kind of tree. ('needfollow' should return true 917 ** only for consructions that use 'fl'.) 918 */ 919 static void codegen (CompileState *compst, TTree *tree, int opt, int tt, 920 const Charset *fl) { 921 tailcall: 922 switch (tree->tag) { 923 case TChar: codechar(compst, tree->u.n, tt); break; 924 case TAny: addinstruction(compst, IAny, 0); break; 925 case TSet: codecharset(compst, treebuffer(tree), tt); break; 926 case TTrue: break; 927 case TFalse: addinstruction(compst, IFail, 0); break; 928 case TChoice: codechoice(compst, sib1(tree), sib2(tree), opt, fl); break; 929 case TRep: coderep(compst, sib1(tree), opt, fl); break; 930 case TBehind: codebehind(compst, tree); break; 931 case TNot: codenot(compst, sib1(tree)); break; 932 case TAnd: codeand(compst, sib1(tree), tt); break; 933 case TCapture: codecapture(compst, tree, tt, fl); break; 934 case TRunTime: coderuntime(compst, tree, tt); break; 935 case TGrammar: codegrammar(compst, tree); break; 936 case TCall: codecall(compst, tree); break; 937 case TSeq: { 938 tt = codeseq1(compst, sib1(tree), sib2(tree), tt, fl); /* code 'p1' */ 939 /* codegen(compst, p2, opt, tt, fl); */ 940 tree = sib2(tree); goto tailcall; 941 } 942 default: assert(0); 943 } 944 } 945 946 947 /* 948 ** Optimize jumps and other jump-like instructions. 949 ** * Update labels of instructions with labels to their final 950 ** destinations (e.g., choice L1; ... L1: jmp L2: becomes 951 ** choice L2) 952 ** * Jumps to other instructions that do jumps become those 953 ** instructions (e.g., jump to return becomes a return; jump 954 ** to commit becomes a commit) 955 */ 956 static void peephole (CompileState *compst) { 957 Instruction *code = compst->p->code; 958 int i; 959 for (i = 0; i < compst->ncode; i += sizei(&code[i])) { 960 redo: 961 switch (code[i].i.code) { 962 case IChoice: case ICall: case ICommit: case IPartialCommit: 963 case IBackCommit: case ITestChar: case ITestSet: 964 case ITestAny: { /* instructions with labels */ 965 jumptothere(compst, i, finallabel(code, i)); /* optimize label */ 966 break; 967 } 968 case IJmp: { 969 int ft = finaltarget(code, i); 970 switch (code[ft].i.code) { /* jumping to what? */ 971 case IRet: case IFail: case IFailTwice: 972 case IEnd: { /* instructions with unconditional implicit jumps */ 973 code[i] = code[ft]; /* jump becomes that instruction */ 974 code[i + 1].i.code = IAny; /* 'no-op' for target position */ 975 break; 976 } 977 case ICommit: case IPartialCommit: 978 case IBackCommit: { /* inst. with unconditional explicit jumps */ 979 int fft = finallabel(code, ft); 980 code[i] = code[ft]; /* jump becomes that instruction... */ 981 jumptothere(compst, i, fft); /* but must correct its offset */ 982 goto redo; /* reoptimize its label */ 983 } 984 default: { 985 jumptothere(compst, i, ft); /* optimize label */ 986 break; 987 } 988 } 989 break; 990 } 991 default: break; 992 } 993 } 994 assert(code[i - 1].i.code == IEnd); 995 } 996 997 998 /* 999 ** Compile a pattern 1000 */ 1001 Instruction *compile (lua_State *L, Pattern *p) { 1002 CompileState compst; 1003 compst.p = p; compst.ncode = 0; compst.L = L; 1004 realloccode(L, p, 2); /* minimum initial size */ 1005 codegen(&compst, p->tree, 0, NOINST, fullset); 1006 addinstruction(&compst, IEnd, 0); 1007 realloccode(L, p, compst.ncode); /* set final size */ 1008 peephole(&compst); 1009 return p->code; 1010 } 1011 1012 1013 /* }====================================================== */ 1014