patterns.cc (12887B)
1 /* 2 * this is derived from OpenBSD's patterns.c, which is itself derived from 3 * Lua's lstrlib.c 4 */ 5 6 /* $OpenBSD: patterns.c,v 1.5 2016/02/14 18:20:59 semarie Exp $ */ 7 8 /* 9 * Copyright (c) 2015 Reyk Floeter <reyk@openbsd.org> 10 * Copyright (C) 1994-2015 Lua.org, PUC-Rio. 11 * 12 * Permission is hereby granted, free of charge, to any person obtaining 13 * a copy of this software and associated documentation files (the 14 * "Software"), to deal in the Software without restriction, including 15 * without limitation the rights to use, copy, modify, merge, publish, 16 * distribute, sublicense, and/or sell copies of the Software, and to 17 * permit persons to whom the Software is furnished to do so, subject to 18 * the following conditions: 19 * 20 * The above copyright notice and this permission notice shall be 21 * included in all copies or substantial portions of the Software. 22 * 23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 26 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 27 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 28 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 29 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 30 */ 31 32 /* 33 * Derived from Lua 5.3.1: 34 * $Id: patterns.c,v 1.5 2016/02/14 18:20:59 semarie Exp $ 35 * Standard library for string operations and pattern-matching 36 */ 37 38 #include <sys/types.h> 39 #include <ctype.h> 40 #include <errno.h> 41 #include <stddef.h> 42 #include <stdlib.h> 43 #include <string.h> 44 45 #include "patterns.h" 46 47 #define uchar(c) ((unsigned char)(c)) /* macro to 'unsign' a char */ 48 #define CAP_UNFINISHED (-1) 49 #define CAP_POSITION (-2) 50 #define L_ESC '%' 51 #define SPECIALS "^$*+?.([%-" 52 53 /* recursive function */ 54 static const char *match(struct match_state *, const char *, const char *); 55 56 static int 57 match_error(struct match_state *ms, const char *error) 58 { 59 ms->error = ms->error == NULL ? error : ms->error; 60 return (-1); 61 } 62 63 static int 64 check_capture(struct match_state *ms, int l) 65 { 66 l -= '1'; 67 if (l < 0 || l >= ms->level || ms->capture[l].len == CAP_UNFINISHED) 68 return match_error(ms, "invalid capture index"); 69 return (l); 70 } 71 72 static int 73 capture_to_close(struct match_state *ms) 74 { 75 int level = ms->level; 76 for (level--; level >= 0; level--) 77 if (ms->capture[level].len == CAP_UNFINISHED) 78 return (level); 79 return match_error(ms, "invalid pattern capture"); 80 } 81 82 static const char * 83 classend(struct match_state *ms, const char *p) 84 { 85 switch (*p++) { 86 case L_ESC: 87 if (p == ms->p_end) 88 match_error(ms, 89 "malformed pattern (ends with '%')"); 90 return p + 1; 91 case '[': 92 if (*p == '^') 93 p++; 94 do { 95 /* look for a ']' */ 96 if (p == ms->p_end) { 97 match_error(ms, 98 "malformed pattern (missing ']')"); 99 break; 100 } 101 if (*(p++) == L_ESC && p < ms->p_end) { 102 /* skip escapes (e.g. '%]') */ 103 p++; 104 } 105 } while (*p != ']'); 106 return p + 1; 107 default: 108 return p; 109 } 110 } 111 112 static int 113 match_class(int c, int cl) 114 { 115 int res; 116 switch (tolower(cl)) { 117 case 'a': 118 res = isalpha(c); 119 break; 120 case 'c': 121 res = iscntrl(c); 122 break; 123 case 'd': 124 res = isdigit(c); 125 break; 126 case 'g': 127 res = isgraph(c); 128 break; 129 case 'l': 130 res = islower(c); 131 break; 132 case 'p': 133 res = ispunct(c); 134 break; 135 case 's': 136 res = isspace(c); 137 break; 138 case 'u': 139 res = isupper(c); 140 break; 141 case 'w': 142 res = isalnum(c); 143 break; 144 case 'x': 145 res = isxdigit(c); 146 break; 147 default: 148 return (cl == c); 149 } 150 return (islower(cl) ? res : !res); 151 } 152 153 static int 154 matchbracketclass(int c, const char *p, const char *ec) 155 { 156 int sig = 1; 157 if (*(p + 1) == '^') { 158 sig = 0; 159 /* skip the '^' */ 160 p++; 161 } 162 while (++p < ec) { 163 if (*p == L_ESC) { 164 p++; 165 if (match_class(c, uchar(*p))) 166 return sig; 167 } else if ((*(p + 1) == '-') && (p + 2 < ec)) { 168 p += 2; 169 if (uchar(*(p - 2)) <= c && c <= uchar(*p)) 170 return sig; 171 } else if (uchar(*p) == c) 172 return sig; 173 } 174 return !sig; 175 } 176 177 static int 178 singlematch(struct match_state *ms, const char *s, const char *p, 179 const char *ep) 180 { 181 if (s >= ms->src_end) 182 return 0; 183 else { 184 int c = uchar(*s); 185 switch (*p) { 186 case '.': 187 /* matches any char */ 188 return (1); 189 case L_ESC: 190 return match_class(c, uchar(*(p + 1))); 191 case '[': 192 return matchbracketclass(c, p, ep - 1); 193 default: 194 return (uchar(*p) == c); 195 } 196 } 197 } 198 199 static const char * 200 matchbalance(struct match_state *ms, const char *s, const char *p) 201 { 202 if (p >= ms->p_end - 1) { 203 match_error(ms, 204 "malformed pattern (missing arguments to '%b')"); 205 return (NULL); 206 } 207 if (*s != *p) 208 return (NULL); 209 else { 210 int b = *p; 211 int e = *(p + 1); 212 int cont = 1; 213 while (++s < ms->src_end) { 214 if (*s == e) { 215 if (--cont == 0) 216 return s + 1; 217 } else if (*s == b) 218 cont++; 219 } 220 } 221 222 /* string ends out of balance */ 223 return (NULL); 224 } 225 226 static const char * 227 max_expand(struct match_state *ms, const char *s, const char *p, const char *ep) 228 { 229 ptrdiff_t i = 0; 230 /* counts maximum expand for item */ 231 while (singlematch(ms, s + i, p, ep)) 232 i++; 233 /* keeps trying to match with the maximum repetitions */ 234 while (i >= 0) { 235 const char *res = match(ms, (s + i), ep + 1); 236 if (res) 237 return res; 238 /* else didn't match; reduce 1 repetition to try again */ 239 i--; 240 } 241 return NULL; 242 } 243 244 static const char * 245 min_expand(struct match_state *ms, const char *s, const char *p, const char *ep) 246 { 247 for (;;) { 248 const char *res = match(ms, s, ep + 1); 249 if (res != NULL) 250 return res; 251 else if (singlematch(ms, s, p, ep)) 252 s++; /* try with one more repetition */ 253 else 254 return NULL; 255 } 256 } 257 258 static const char * 259 start_capture(struct match_state *ms, const char *s, const char *p, int what) 260 { 261 const char *res; 262 263 int level = ms->level; 264 if (level >= ms->maxcaptures) { 265 match_error(ms, "too many captures"); 266 return (NULL); 267 } 268 ms->capture[level].init = s; 269 ms->capture[level].len = what; 270 ms->level = level + 1; 271 /* undo capture if match failed */ 272 if ((res = match(ms, s, p)) == NULL) 273 ms->level--; 274 return res; 275 } 276 277 static const char * 278 end_capture(struct match_state *ms, const char *s, const char *p) 279 { 280 int l = capture_to_close(ms); 281 const char *res; 282 if (l == -1) 283 return NULL; 284 /* close capture */ 285 ms->capture[l].len = s - ms->capture[l].init; 286 /* undo capture if match failed */ 287 if ((res = match(ms, s, p)) == NULL) 288 ms->capture[l].len = CAP_UNFINISHED; 289 return res; 290 } 291 292 static const char * 293 match_capture(struct match_state *ms, const char *s, int l) 294 { 295 size_t len; 296 l = check_capture(ms, l); 297 if (l == -1) 298 return NULL; 299 len = ms->capture[l].len; 300 if ((size_t) (ms->src_end - s) >= len && 301 memcmp(ms->capture[l].init, s, len) == 0) 302 return s + len; 303 else 304 return NULL; 305 } 306 307 static const char * 308 match(struct match_state *ms, const char *s, const char *p) 309 { 310 const char *ep, *res; 311 char previous; 312 313 if (ms->matchdepth-- == 0) { 314 match_error(ms, "pattern too complex"); 315 return (NULL); 316 } 317 318 /* using goto's to optimize tail recursion */ 319 init: 320 /* end of pattern? */ 321 if (p != ms->p_end) { 322 switch (*p) { 323 case '(': 324 /* start capture */ 325 if (*(p + 1) == ')') 326 /* position capture? */ 327 s = start_capture(ms, s, p + 2, CAP_POSITION); 328 else 329 s = start_capture(ms, s, p + 1, CAP_UNFINISHED); 330 break; 331 case ')': 332 /* end capture */ 333 s = end_capture(ms, s, p + 1); 334 break; 335 case '$': 336 /* is the '$' the last char in pattern? */ 337 if ((p + 1) != ms->p_end) { 338 /* no; go to default */ 339 goto dflt; 340 } 341 /* check end of string */ 342 s = (s == ms->src_end) ? s : NULL; 343 break; 344 case L_ESC: 345 /* escaped sequences not in the format class[*+?-]? */ 346 switch (*(p + 1)) { 347 case 'b': 348 /* balanced string? */ 349 s = matchbalance(ms, s, p + 2); 350 if (s != NULL) { 351 p += 4; 352 /* return match(ms, s, p + 4); */ 353 goto init; 354 } /* else fail (s == NULL) */ 355 break; 356 case 'f': 357 /* frontier? */ 358 p += 2; 359 if (*p != '[') { 360 match_error(ms, "missing '['" 361 " after '%f' in pattern"); 362 break; 363 } 364 /* points to what is next */ 365 ep = classend(ms, p); 366 if (ms->error != NULL) 367 break; 368 previous = 369 (s == ms->src_init) ? '\0' : *(s - 1); 370 if (!matchbracketclass(uchar(previous), 371 p, ep - 1) && 372 matchbracketclass(uchar(*s), 373 p, ep - 1)) { 374 p = ep; 375 /* return match(ms, s, ep); */ 376 goto init; 377 } 378 /* match failed */ 379 s = NULL; 380 break; 381 case '0': 382 case '1': 383 case '2': 384 case '3': 385 case '4': 386 case '5': 387 case '6': 388 case '7': 389 case '8': 390 case '9': 391 /* capture results (%0-%9)? */ 392 s = match_capture(ms, s, uchar(*(p + 1))); 393 if (s != NULL) { 394 p += 2; 395 /* return match(ms, s, p + 2) */ 396 goto init; 397 } 398 break; 399 default: 400 goto dflt; 401 } 402 break; 403 default: 404 405 /* pattern class plus optional suffix */ 406 dflt: 407 /* points to optional suffix */ 408 ep = classend(ms, p); 409 if (ms->error != NULL) 410 break; 411 412 /* does not match at least once? */ 413 if (!singlematch(ms, s, p, ep)) { 414 if (ms->repetitioncounter-- == 0) { 415 match_error(ms, "max repetition items"); 416 s = NULL; /* fail */ 417 /* accept empty? */ 418 } else if 419 (*ep == '*' || *ep == '?' || *ep == '-') { 420 p = ep + 1; 421 /* return match(ms, s, ep + 1); */ 422 goto init; 423 } else { 424 /* '+' or no suffix */ 425 s = NULL; /* fail */ 426 } 427 } else { 428 /* matched once */ 429 /* handle optional suffix */ 430 switch (*ep) { 431 case '?': 432 /* optional */ 433 if ((res = 434 match(ms, s + 1, ep + 1)) != NULL) 435 s = res; 436 else { 437 /* 438 * else return 439 * match(ms, s, ep + 1); 440 */ 441 p = ep + 1; 442 goto init; 443 } 444 break; 445 case '+': 446 /* 1 or more repetitions */ 447 s++; /* 1 match already done */ 448 /* FALLTHROUGH */ 449 case '*': 450 /* 0 or more repetitions */ 451 s = max_expand(ms, s, p, ep); 452 break; 453 case '-': 454 /* 0 or more repetitions (minimum) */ 455 s = min_expand(ms, s, p, ep); 456 break; 457 default: 458 /* no suffix */ 459 s++; 460 p = ep; 461 /* return match(ms, s + 1, ep); */ 462 goto init; 463 } 464 } 465 break; 466 } 467 } 468 ms->matchdepth++; 469 return s; 470 } 471 472 int 473 gmatch_aux(gmatch_state * gm) 474 { 475 const char *src; 476 for (src = gm->src; src <= gm->ms.src_end;) { 477 const char *e; 478 gm->ms.level = 0; 479 if ((e = match(&gm->ms, src, gm->p)) != NULL && e != gm->lastmatch) { 480 gm->src = gm->lastmatch = e; 481 return 1; 482 } 483 else src++; 484 ASSERT( gm->ms.error == NULL ); 485 } 486 return 0; /* not found */ 487 } 488 489 void 490 gmatch_init(const char * str, const char * pattern, gmatch_state * gm) 491 { 492 memset(gm, 0, sizeof(*gm)); 493 494 gm->ms.maxcaptures = MAXCAPTURES; 495 gm->ms.matchdepth = MAXCCALLS; 496 gm->ms.repetitioncounter = MAXREPETITION; 497 gm->ms.src_init = str; 498 gm->ms.src_end = str + strlen(str); 499 gm->ms.p_end = pattern + strlen(pattern); 500 501 gm->src = str; 502 gm->p = pattern; 503 gm->lastmatch = NULL; 504 } 505 506 bool match( Matches * matches, array< const char > str, const char * pattern ) { 507 match_state ms; 508 memset( &ms, 0, sizeof( ms ) ); 509 510 bool anchor = pattern[ 0 ] == '^'; 511 if( anchor ) { 512 pattern++; 513 } 514 515 ms.maxcaptures = MAXCAPTURES; 516 ms.matchdepth = MAXCCALLS; 517 ms.repetitioncounter = MAXREPETITION; 518 ms.src_init = str.ptr(); 519 ms.src_end = str.ptr() + str.n; 520 ms.p_end = pattern + strlen( pattern ); 521 522 for( size_t i = 0; i < str.n; i++ ) { 523 ms.level = 0; 524 const char * end = match( &ms, str.ptr() + i, pattern ); 525 ASSERT( ms.error == NULL ); 526 if( end != NULL ) { 527 break; 528 } 529 if( anchor ) { 530 break; 531 } 532 } 533 534 matches->matches = matches->matches_data.slice( 0, ms.level ); 535 for( int i = 0; i < ms.level; i++ ) { 536 ssize_t len = ms.capture[ i ].len; 537 if( len == CAP_POSITION ) len = 0; 538 matches->matches_data[ i ] = array< const char >( ms.capture[ i ].init, checked_cast< size_t >( len ) ); 539 } 540 541 return ms.level > 0; 542 } 543 544 bool match( Matches * matches, const char * str, const char * pattern ) { 545 return match( matches, array< const char >( str, strlen( str ) ), pattern ); 546 } 547 548 gmatch::Iterator::Iterator( gmatch_state * gm_ ) { 549 if( gm_ == NULL ) { 550 done = true; 551 return; 552 } 553 554 gm = gm_; 555 done = false; 556 next(); 557 } 558 559 array< array< const char > > gmatch::Iterator::operator*() { 560 return matches.matches; 561 } 562 563 void gmatch::Iterator::operator++() { 564 next(); 565 } 566 567 bool gmatch::Iterator::operator!=( const Iterator & other ) { 568 return done != other.done; 569 } 570 571 void gmatch::Iterator::next() { 572 int ok = gmatch_aux( gm ); 573 if( ok == 0 ) { 574 done = true; 575 return; 576 } 577 578 matches.matches = matches.matches_data.slice( 0, gm->ms.level ); 579 for( int i = 0; i < gm->ms.level; i++ ) { 580 ssize_t len = gm->ms.capture[ i ].len; 581 if( len == CAP_POSITION ) len = 0; 582 matches.matches_data[ i ] = array< const char >( gm->ms.capture[ i ].init, checked_cast< size_t >( len ) ); 583 } 584 } 585 586 gmatch::gmatch( const char * str, const char * pattern ) { 587 gmatch_init( str, pattern, &gm ); 588 }