medfall

A super great game engine
Log | Files | Refs

patterns.cc (12887B)


      1 /*
      2  * this is derived from OpenBSD's patterns.c, which is itself derived from
      3  * Lua's lstrlib.c
      4  */
      5 
      6 /*	$OpenBSD: patterns.c,v 1.5 2016/02/14 18:20:59 semarie Exp $	*/
      7 
      8 /*
      9  * Copyright (c) 2015 Reyk Floeter <reyk@openbsd.org>
     10  * Copyright (C) 1994-2015 Lua.org, PUC-Rio.
     11  *
     12  * Permission is hereby granted, free of charge, to any person obtaining
     13  * a copy of this software and associated documentation files (the
     14  * "Software"), to deal in the Software without restriction, including
     15  * without limitation the rights to use, copy, modify, merge, publish,
     16  * distribute, sublicense, and/or sell copies of the Software, and to
     17  * permit persons to whom the Software is furnished to do so, subject to
     18  * the following conditions:
     19  *
     20  * The above copyright notice and this permission notice shall be
     21  * included in all copies or substantial portions of the Software.
     22  *
     23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
     24  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
     25  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
     26  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
     27  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
     28  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
     29  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
     30  */
     31 
     32 /*
     33  * Derived from Lua 5.3.1:
     34  * $Id: patterns.c,v 1.5 2016/02/14 18:20:59 semarie Exp $
     35  * Standard library for string operations and pattern-matching
     36  */
     37 
     38 #include <sys/types.h>
     39 #include <ctype.h>
     40 #include <errno.h>
     41 #include <stddef.h>
     42 #include <stdlib.h>
     43 #include <string.h>
     44 
     45 #include "patterns.h"
     46 
     47 #define uchar(c)	((unsigned char)(c)) /* macro to 'unsign' a char */
     48 #define CAP_UNFINISHED	(-1)
     49 #define CAP_POSITION	(-2)
     50 #define L_ESC		'%'
     51 #define SPECIALS	"^$*+?.([%-"
     52 
     53 /* recursive function */
     54 static const char *match(struct match_state *, const char *, const char *);
     55 
     56 static int
     57 match_error(struct match_state *ms, const char *error)
     58 {
     59 	ms->error = ms->error == NULL ? error : ms->error;
     60 	return (-1);
     61 }
     62 
     63 static int
     64 check_capture(struct match_state *ms, int l)
     65 {
     66 	l -= '1';
     67 	if (l < 0 || l >= ms->level || ms->capture[l].len == CAP_UNFINISHED)
     68 		return match_error(ms, "invalid capture index");
     69 	return (l);
     70 }
     71 
     72 static int
     73 capture_to_close(struct match_state *ms)
     74 {
     75 	int level = ms->level;
     76 	for (level--; level >= 0; level--)
     77 		if (ms->capture[level].len == CAP_UNFINISHED)
     78 			return (level);
     79 	return match_error(ms, "invalid pattern capture");
     80 }
     81 
     82 static const char *
     83 classend(struct match_state *ms, const char *p)
     84 {
     85 	switch (*p++) {
     86 	case L_ESC:
     87 		if (p == ms->p_end)
     88 			match_error(ms,
     89 			    "malformed pattern (ends with '%')");
     90 		return p + 1;
     91 	case '[':
     92 		if (*p == '^')
     93 			p++;
     94 		do {
     95 			/* look for a ']' */
     96 			if (p == ms->p_end) {
     97 				match_error(ms,
     98 				    "malformed pattern (missing ']')");
     99 				break;
    100 			}
    101 			if (*(p++) == L_ESC && p < ms->p_end) {
    102 				/* skip escapes (e.g. '%]') */
    103 				p++;
    104 			}
    105 		} while (*p != ']');
    106 		return p + 1;
    107 	default:
    108 		return p;
    109 	}
    110 }
    111 
    112 static int
    113 match_class(int c, int cl)
    114 {
    115 	int res;
    116 	switch (tolower(cl)) {
    117 	case 'a':
    118 		res = isalpha(c);
    119 		break;
    120 	case 'c':
    121 		res = iscntrl(c);
    122 		break;
    123 	case 'd':
    124 		res = isdigit(c);
    125 		break;
    126 	case 'g':
    127 		res = isgraph(c);
    128 		break;
    129 	case 'l':
    130 		res = islower(c);
    131 		break;
    132 	case 'p':
    133 		res = ispunct(c);
    134 		break;
    135 	case 's':
    136 		res = isspace(c);
    137 		break;
    138 	case 'u':
    139 		res = isupper(c);
    140 		break;
    141 	case 'w':
    142 		res = isalnum(c);
    143 		break;
    144 	case 'x':
    145 		res = isxdigit(c);
    146 		break;
    147 	default:
    148 		return (cl == c);
    149 	}
    150 	return (islower(cl) ? res : !res);
    151 }
    152 
    153 static int
    154 matchbracketclass(int c, const char *p, const char *ec)
    155 {
    156 	int sig = 1;
    157 	if (*(p + 1) == '^') {
    158 		sig = 0;
    159 		/* skip the '^' */
    160 		p++;
    161 	}
    162 	while (++p < ec) {
    163 		if (*p == L_ESC) {
    164 			p++;
    165 			if (match_class(c, uchar(*p)))
    166 				return sig;
    167 		} else if ((*(p + 1) == '-') && (p + 2 < ec)) {
    168 			p += 2;
    169 			if (uchar(*(p - 2)) <= c && c <= uchar(*p))
    170 				return sig;
    171 		} else if (uchar(*p) == c)
    172 			return sig;
    173 	}
    174 	return !sig;
    175 }
    176 
    177 static int
    178 singlematch(struct match_state *ms, const char *s, const char *p,
    179     const char *ep)
    180 {
    181 	if (s >= ms->src_end)
    182 		return 0;
    183 	else {
    184 		int c = uchar(*s);
    185 		switch (*p) {
    186 		case '.':
    187 			/* matches any char */
    188 			return (1);
    189 		case L_ESC:
    190 			return match_class(c, uchar(*(p + 1)));
    191 		case '[':
    192 			return matchbracketclass(c, p, ep - 1);
    193 		default:
    194 			return (uchar(*p) == c);
    195 		}
    196 	}
    197 }
    198 
    199 static const char *
    200 matchbalance(struct match_state *ms, const char *s, const char *p)
    201 {
    202 	if (p >= ms->p_end - 1) {
    203 		match_error(ms,
    204 		    "malformed pattern (missing arguments to '%b')");
    205 		return (NULL);
    206 	}
    207 	if (*s != *p)
    208 		return (NULL);
    209 	else {
    210 		int b = *p;
    211 		int e = *(p + 1);
    212 		int cont = 1;
    213 		while (++s < ms->src_end) {
    214 			if (*s == e) {
    215 				if (--cont == 0)
    216 					return s + 1;
    217 			} else if (*s == b)
    218 				cont++;
    219 		}
    220 	}
    221 
    222 	/* string ends out of balance */
    223 	return (NULL);
    224 }
    225 
    226 static const char *
    227 max_expand(struct match_state *ms, const char *s, const char *p, const char *ep)
    228 {
    229 	ptrdiff_t i = 0;
    230 	/* counts maximum expand for item */
    231 	while (singlematch(ms, s + i, p, ep))
    232 		i++;
    233 	/* keeps trying to match with the maximum repetitions */
    234 	while (i >= 0) {
    235 		const char *res = match(ms, (s + i), ep + 1);
    236 		if (res)
    237 			return res;
    238 		/* else didn't match; reduce 1 repetition to try again */
    239 		i--;
    240 	}
    241 	return NULL;
    242 }
    243 
    244 static const char *
    245 min_expand(struct match_state *ms, const char *s, const char *p, const char *ep)
    246 {
    247 	for (;;) {
    248 		const char *res = match(ms, s, ep + 1);
    249 		if (res != NULL)
    250 			return res;
    251 		else if (singlematch(ms, s, p, ep))
    252 			s++;	/* try with one more repetition */
    253 		else
    254 			return NULL;
    255 	}
    256 }
    257 
    258 static const char *
    259 start_capture(struct match_state *ms, const char *s, const char *p, int what)
    260 {
    261 	const char *res;
    262 
    263 	int level = ms->level;
    264 	if (level >= ms->maxcaptures) {
    265 		match_error(ms, "too many captures");
    266 		return (NULL);
    267 	}
    268 	ms->capture[level].init = s;
    269 	ms->capture[level].len = what;
    270 	ms->level = level + 1;
    271 	/* undo capture if match failed */
    272 	if ((res = match(ms, s, p)) == NULL)
    273 		ms->level--;
    274 	return res;
    275 }
    276 
    277 static const char *
    278 end_capture(struct match_state *ms, const char *s, const char *p)
    279 {
    280 	int l = capture_to_close(ms);
    281 	const char *res;
    282 	if (l == -1)
    283 		return NULL;
    284 	/* close capture */
    285 	ms->capture[l].len = s - ms->capture[l].init;
    286 	/* undo capture if match failed */
    287 	if ((res = match(ms, s, p)) == NULL)
    288 		ms->capture[l].len = CAP_UNFINISHED;
    289 	return res;
    290 }
    291 
    292 static const char *
    293 match_capture(struct match_state *ms, const char *s, int l)
    294 {
    295 	size_t len;
    296 	l = check_capture(ms, l);
    297 	if (l == -1)
    298 		return NULL;
    299 	len = ms->capture[l].len;
    300 	if ((size_t) (ms->src_end - s) >= len &&
    301 	    memcmp(ms->capture[l].init, s, len) == 0)
    302 		return s + len;
    303 	else
    304 		return NULL;
    305 }
    306 
    307 static const char *
    308 match(struct match_state *ms, const char *s, const char *p)
    309 {
    310 	const char *ep, *res;
    311 	char previous;
    312 
    313 	if (ms->matchdepth-- == 0) {
    314 		match_error(ms, "pattern too complex");
    315 		return (NULL);
    316 	}
    317 
    318 	/* using goto's to optimize tail recursion */
    319  init:
    320 	/* end of pattern? */
    321 	if (p != ms->p_end) {
    322 		switch (*p) {
    323 		case '(':
    324 			/* start capture */
    325 			if (*(p + 1) == ')')
    326 				/* position capture? */
    327 				s = start_capture(ms, s, p + 2, CAP_POSITION);
    328 			else
    329 				s = start_capture(ms, s, p + 1, CAP_UNFINISHED);
    330 			break;
    331 		case ')':
    332 			/* end capture */
    333 			s = end_capture(ms, s, p + 1);
    334 			break;
    335 		case '$':
    336 			/* is the '$' the last char in pattern? */
    337 			if ((p + 1) != ms->p_end) {
    338 				/* no; go to default */
    339 				goto dflt;
    340 			}
    341 			 /* check end of string */
    342 			s = (s == ms->src_end) ? s : NULL;
    343 			break;
    344 		case L_ESC:
    345 			/* escaped sequences not in the format class[*+?-]? */
    346 			switch (*(p + 1)) {
    347 			case 'b':
    348 				/* balanced string? */
    349 				s = matchbalance(ms, s, p + 2);
    350 				if (s != NULL) {
    351 					p += 4;
    352 					/* return match(ms, s, p + 4); */
    353 					goto init;
    354 				} /* else fail (s == NULL) */
    355 				break;
    356 			case 'f':
    357 				/* frontier? */
    358 				p += 2;
    359 				if (*p != '[') {
    360 					match_error(ms, "missing '['"
    361 					    " after '%f' in pattern");
    362 					break;
    363 				}
    364 				/* points to what is next */
    365 				ep = classend(ms, p);
    366 				if (ms->error != NULL)
    367 					break;
    368 				previous =
    369 				    (s == ms->src_init) ? '\0' : *(s - 1);
    370 				if (!matchbracketclass(uchar(previous),
    371 				    p, ep - 1) &&
    372 				    matchbracketclass(uchar(*s),
    373 				    p, ep - 1)) {
    374 					p = ep;
    375 					/* return match(ms, s, ep); */
    376 					goto init;
    377 				}
    378 				/* match failed */
    379 				s = NULL;
    380 				break;
    381 			case '0':
    382 			case '1':
    383 			case '2':
    384 			case '3':
    385 			case '4':
    386 			case '5':
    387 			case '6':
    388 			case '7':
    389 			case '8':
    390 			case '9':
    391 				/* capture results (%0-%9)? */
    392 				s = match_capture(ms, s, uchar(*(p + 1)));
    393 				if (s != NULL) {
    394 					p += 2;
    395 					/* return match(ms, s, p + 2) */
    396 					goto init;
    397 				}
    398 				break;
    399 			default:
    400 				goto dflt;
    401 			}
    402 			break;
    403 		default:
    404 
    405 			/* pattern class plus optional suffix */
    406 	dflt:
    407 			/* points to optional suffix */
    408 			ep = classend(ms, p);
    409 			if (ms->error != NULL)
    410 				break;
    411 
    412 			/* does not match at least once? */
    413 			if (!singlematch(ms, s, p, ep)) {
    414 				if (ms->repetitioncounter-- == 0) {
    415 					match_error(ms, "max repetition items");
    416 					s = NULL; /* fail */
    417 				/* accept empty? */
    418 				} else if
    419 				    (*ep == '*' || *ep == '?' || *ep == '-') {
    420 					 p = ep + 1;
    421 					/* return match(ms, s, ep + 1); */
    422 					 goto init;
    423 				} else {
    424 					/* '+' or no suffix */
    425 					s = NULL; /* fail */
    426 				}
    427 			} else {
    428 				/* matched once */
    429 				/* handle optional suffix */
    430 				switch (*ep) {
    431 				case '?':
    432 					/* optional */
    433 					if ((res =
    434 					    match(ms, s + 1, ep + 1)) != NULL)
    435 						s = res;
    436 					else {
    437 						/* 
    438 						 * else return
    439 						 *     match(ms, s, ep + 1);
    440 						 */
    441 						p = ep + 1;
    442 						goto init;
    443 					}
    444 					break;
    445 				case '+':
    446 					/* 1 or more repetitions */
    447 					s++; /* 1 match already done */
    448 					/* FALLTHROUGH */
    449 				case '*':
    450 					/* 0 or more repetitions */
    451 					s = max_expand(ms, s, p, ep);
    452 					break;
    453 				case '-':
    454 					/* 0 or more repetitions (minimum) */
    455 					s = min_expand(ms, s, p, ep);
    456 					break;
    457 				default:
    458 					/* no suffix */
    459 					s++;
    460 					p = ep;
    461 					/* return match(ms, s + 1, ep); */
    462 					goto init;
    463 				}
    464 			}
    465 			break;
    466 		}
    467 	}
    468 	ms->matchdepth++;
    469 	return s;
    470 }
    471 
    472 int
    473 gmatch_aux(gmatch_state * gm)
    474 {
    475 	const char *src;
    476 	for (src = gm->src; src <= gm->ms.src_end;) {
    477 		const char *e;
    478 		gm->ms.level = 0;
    479 		if ((e = match(&gm->ms, src, gm->p)) != NULL && e != gm->lastmatch) {
    480 			gm->src = gm->lastmatch = e;
    481 			return 1;
    482 		}
    483 		else src++;
    484 		ASSERT( gm->ms.error == NULL );
    485 	}
    486 	return 0;  /* not found */
    487 }
    488 
    489 void
    490 gmatch_init(const char * str, const char * pattern, gmatch_state * gm)
    491 {
    492 	memset(gm, 0, sizeof(*gm));
    493 
    494 	gm->ms.maxcaptures = MAXCAPTURES;
    495 	gm->ms.matchdepth = MAXCCALLS;
    496 	gm->ms.repetitioncounter = MAXREPETITION;
    497 	gm->ms.src_init = str;
    498 	gm->ms.src_end = str + strlen(str);
    499 	gm->ms.p_end = pattern + strlen(pattern);
    500 
    501 	gm->src = str;
    502 	gm->p = pattern;
    503 	gm->lastmatch = NULL;
    504 }
    505 
    506 bool match( Matches * matches, array< const char > str, const char * pattern ) {
    507 	match_state ms;
    508 	memset( &ms, 0, sizeof( ms ) );
    509 
    510 	bool anchor = pattern[ 0 ] == '^';
    511 	if( anchor ) {
    512 		pattern++;
    513 	}
    514 
    515 	ms.maxcaptures = MAXCAPTURES;
    516 	ms.matchdepth = MAXCCALLS;
    517 	ms.repetitioncounter = MAXREPETITION;
    518 	ms.src_init = str.ptr();
    519 	ms.src_end = str.ptr() + str.n;
    520 	ms.p_end = pattern + strlen( pattern );
    521 
    522 	for( size_t i = 0; i < str.n; i++ ) {
    523 		ms.level = 0;
    524 		const char * end = match( &ms, str.ptr() + i, pattern );
    525 		ASSERT( ms.error == NULL );
    526 		if( end != NULL ) {
    527 			break;
    528 		}
    529 		if( anchor ) {
    530 			break;
    531 		}
    532 	}
    533 
    534 	matches->matches = matches->matches_data.slice( 0, ms.level );
    535 	for( int i = 0; i < ms.level; i++ ) {
    536 		ssize_t len = ms.capture[ i ].len;
    537 		if( len == CAP_POSITION ) len = 0;
    538 		matches->matches_data[ i ] = array< const char >( ms.capture[ i ].init, checked_cast< size_t >( len ) );
    539 	}
    540 
    541 	return ms.level > 0;
    542 }
    543 
    544 bool match( Matches * matches, const char * str, const char * pattern ) {
    545 	return match( matches, array< const char >( str, strlen( str ) ), pattern );
    546 }
    547 
    548 gmatch::Iterator::Iterator( gmatch_state * gm_ ) {
    549 	if( gm_ == NULL ) {
    550 		done = true;
    551 		return;
    552 	}
    553 
    554 	gm = gm_;
    555 	done = false;
    556 	next();
    557 }
    558 
    559 array< array< const char > > gmatch::Iterator::operator*() {
    560 	return matches.matches;
    561 }
    562 
    563 void gmatch::Iterator::operator++() {
    564 	next();
    565 }
    566 
    567 bool gmatch::Iterator::operator!=( const Iterator & other ) {
    568 	return done != other.done;
    569 }
    570 
    571 void gmatch::Iterator::next() {
    572 	int ok = gmatch_aux( gm );
    573 	if( ok == 0 ) {
    574 		done = true;
    575 		return;
    576 	}
    577 
    578 	matches.matches = matches.matches_data.slice( 0, gm->ms.level );
    579 	for( int i = 0; i < gm->ms.level; i++ ) {
    580 		ssize_t len = gm->ms.capture[ i ].len;
    581 		if( len == CAP_POSITION ) len = 0;
    582 		matches.matches_data[ i ] = array< const char >( gm->ms.capture[ i ].init, checked_cast< size_t >( len ) );
    583 	}
    584 }
    585 
    586 gmatch::gmatch( const char * str, const char * pattern ) {
    587 	gmatch_init( str, pattern, &gm );
    588 }