This source file includes following definitions.
- btowc
- re_compile_pattern
- weak_alias
- weak_alias
- weak_alias
- re_compile_fastmap_iter
- regcomp
- weak_alias
- free_dfa_content
- regfree
- weak_alias
- libc_freeres_fn
- re_compile_internal
- init_dfa
- init_word_char
- free_workarea_compile
- create_initial_state
- optimize_utf8
- analyze
- postorder
- preorder
- optimize_subexps
- lower_subexps
- lower_subexp
- calc_first
- calc_next
- link_nfa_nodes
- duplicate_node_closure
- search_duplicated_node
- duplicate_node
- calc_inveclosure
- calc_eclosure
- calc_eclosure_iter
- fetch_token
- peek_token
- peek_token_bracket
- parse
- parse_reg_exp
- parse_branch
- parse_expression
- parse_sub_exp
- parse_dup_op
- build_range_exp
- build_collating_symbol
- parse_bracket_exp
- parse_bracket_element
- parse_bracket_symbol
- build_equiv_class
- build_charclass
- build_charclass_op
- fetch_number
- free_charset
- create_tree
- create_token_tree
- mark_opt_subexp
- free_token
- free_tree
- duplicate_tree
#include <stdint.h>
static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
size_t length, reg_syntax_t syntax);
static void re_compile_fastmap_iter (regex_t *bufp,
const re_dfastate_t *init_state,
char *fastmap);
static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
#ifdef RE_ENABLE_I18N
static void free_charset (re_charset_t *cset);
#endif
static void free_workarea_compile (regex_t *preg);
static reg_errcode_t create_initial_state (re_dfa_t *dfa);
#ifdef RE_ENABLE_I18N
static void optimize_utf8 (re_dfa_t *dfa);
#endif
static reg_errcode_t analyze (regex_t *preg);
static reg_errcode_t preorder (bin_tree_t *root,
reg_errcode_t (fn (void *, bin_tree_t *)),
void *extra);
static reg_errcode_t postorder (bin_tree_t *root,
reg_errcode_t (fn (void *, bin_tree_t *)),
void *extra);
static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
bin_tree_t *node);
static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
unsigned int constraint);
static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
int node, int root);
static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
static int fetch_number (re_string_t *input, re_token_t *token,
reg_syntax_t syntax);
static int peek_token (re_token_t *token, re_string_t *input,
reg_syntax_t syntax) internal_function;
static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
reg_syntax_t syntax, reg_errcode_t *err);
static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
re_token_t *token, reg_syntax_t syntax,
int nest, reg_errcode_t *err);
static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
re_token_t *token, reg_syntax_t syntax,
int nest, reg_errcode_t *err);
static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
re_token_t *token, reg_syntax_t syntax,
int nest, reg_errcode_t *err);
static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
re_token_t *token, reg_syntax_t syntax,
int nest, reg_errcode_t *err);
static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
re_dfa_t *dfa, re_token_t *token,
reg_syntax_t syntax, reg_errcode_t *err);
static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
re_token_t *token, reg_syntax_t syntax,
reg_errcode_t *err);
static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
re_string_t *regexp,
re_token_t *token, int token_len,
re_dfa_t *dfa,
reg_syntax_t syntax,
int accept_hyphen);
static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
re_string_t *regexp,
re_token_t *token);
#ifdef RE_ENABLE_I18N
static reg_errcode_t build_equiv_class (bitset_t sbcset,
re_charset_t *mbcset,
int *equiv_class_alloc,
const unsigned char *name);
static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
bitset_t sbcset,
re_charset_t *mbcset,
int *char_class_alloc,
const char *class_name,
reg_syntax_t syntax);
#else
static reg_errcode_t build_equiv_class (bitset_t sbcset,
const unsigned char *name);
static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
bitset_t sbcset,
const char *class_name,
reg_syntax_t syntax);
#endif
static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
RE_TRANSLATE_TYPE trans,
const char *class_name,
const char *extra,
int non_match, reg_errcode_t *err);
static bin_tree_t *create_tree (re_dfa_t *dfa,
bin_tree_t *left, bin_tree_t *right,
re_token_type_t type);
static bin_tree_t *create_token_tree (re_dfa_t *dfa,
bin_tree_t *left, bin_tree_t *right,
const re_token_t *token);
static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
static void free_token (re_token_t *node);
static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
const char __re_error_msgid[] attribute_hidden =
{
#define REG_NOERROR_IDX 0
gettext_noop ("Success")
"\0"
#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
gettext_noop ("No match")
"\0"
#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
gettext_noop ("Invalid regular expression")
"\0"
#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
gettext_noop ("Invalid collation character")
"\0"
#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
gettext_noop ("Invalid character class name")
"\0"
#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
gettext_noop ("Trailing backslash")
"\0"
#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
gettext_noop ("Invalid back reference")
"\0"
#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
gettext_noop ("Unmatched [ or [^")
"\0"
#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
gettext_noop ("Unmatched ( or \\(")
"\0"
#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
gettext_noop ("Unmatched \\{")
"\0"
#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
gettext_noop ("Invalid content of \\{\\}")
"\0"
#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
gettext_noop ("Invalid range end")
"\0"
#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
gettext_noop ("Memory exhausted")
"\0"
#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
gettext_noop ("Invalid preceding regular expression")
"\0"
#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
gettext_noop ("Premature end of regular expression")
"\0"
#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
gettext_noop ("Regular expression too big")
"\0"
#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
gettext_noop ("Unmatched ) or \\)")
};
const size_t __re_error_msgid_idx[] attribute_hidden =
{
REG_NOERROR_IDX,
REG_NOMATCH_IDX,
REG_BADPAT_IDX,
REG_ECOLLATE_IDX,
REG_ECTYPE_IDX,
REG_EESCAPE_IDX,
REG_ESUBREG_IDX,
REG_EBRACK_IDX,
REG_EPAREN_IDX,
REG_EBRACE_IDX,
REG_BADBR_IDX,
REG_ERANGE_IDX,
REG_ESPACE_IDX,
REG_BADRPT_IDX,
REG_EEND_IDX,
REG_ESIZE_IDX,
REG_ERPAREN_IDX
};
#ifdef ZOS_USS
wchar_t
btowc (int c)
{
wchar_t wtmp[2];
char tmp[2];
tmp[0] = c;
tmp[1] = 0;
mbtowc (wtmp, tmp, 1);
return wtmp[0];
}
#endif
const char *
re_compile_pattern (const char *pattern,
size_t length,
struct re_pattern_buffer *bufp)
{
reg_errcode_t ret;
bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
bufp->newline_anchor = 1;
ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
if (!ret)
return NULL;
return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
}
#ifdef _LIBC
weak_alias (__re_compile_pattern, re_compile_pattern)
#endif
reg_syntax_t re_syntax_options;
reg_syntax_t
re_set_syntax (reg_syntax_t syntax)
{
reg_syntax_t ret = re_syntax_options;
re_syntax_options = syntax;
return ret;
}
#ifdef _LIBC
weak_alias (__re_set_syntax, re_set_syntax)
#endif
int
re_compile_fastmap (struct re_pattern_buffer *bufp)
{
re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
char *fastmap = bufp->fastmap;
memset (fastmap, '\0', sizeof (char) * SBC_MAX);
re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
if (dfa->init_state != dfa->init_state_word)
re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
if (dfa->init_state != dfa->init_state_nl)
re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
if (dfa->init_state != dfa->init_state_begbuf)
re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
bufp->fastmap_accurate = 1;
return 0;
}
#ifdef _LIBC
weak_alias (__re_compile_fastmap, re_compile_fastmap)
#endif
static inline void
__attribute ((always_inline))
re_set_fastmap (char *fastmap, int icase, int ch)
{
fastmap[ch] = 1;
if (icase)
fastmap[tolower (ch)] = 1;
}
static void
re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
char *fastmap)
{
volatile re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
int node_cnt;
int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
{
int node = init_state->nodes.elems[node_cnt];
re_token_type_t type = dfa->nodes[node].type;
if (type == CHARACTER)
{
re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
#ifdef RE_ENABLE_I18N
if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
{
unsigned char *buf = re_malloc (unsigned char, dfa->mb_cur_max), *p;
wchar_t wc;
mbstate_t state;
p = buf;
*p++ = dfa->nodes[node].opr.c;
while (++node < dfa->nodes_len
&& dfa->nodes[node].type == CHARACTER
&& dfa->nodes[node].mb_partial)
*p++ = dfa->nodes[node].opr.c;
memset (&state, '\0', sizeof (state));
if (__mbrtowc (&wc, (const char *) buf, p - buf,
&state) == p - buf
&& (__wcrtomb ((char *) buf, towlower (wc), &state)
!= (size_t) -1))
re_set_fastmap (fastmap, 0, buf[0]);
re_free (buf);
}
#endif
}
else if (type == SIMPLE_BRACKET)
{
int i, ch;
for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
{
int j;
bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
if (w & ((bitset_word_t) 1 << j))
re_set_fastmap (fastmap, icase, ch);
}
}
#ifdef RE_ENABLE_I18N
else if (type == COMPLEX_BRACKET)
{
re_charset_t *cset = dfa->nodes[node].opr.mbcset;
int i;
# ifdef _LIBC
if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
&& (cset->ncoll_syms || cset->nranges))
{
const int32_t *table = (const int32_t *)
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
for (i = 0; i < SBC_MAX; ++i)
if (table[i] < 0)
re_set_fastmap (fastmap, icase, i);
}
# endif
if (dfa->mb_cur_max > 1
&& (cset->nchar_classes || cset->non_match || cset->nranges
# ifdef _LIBC
|| cset->nequiv_classes
# endif
))
{
unsigned char c = 0;
do
{
mbstate_t mbs;
memset (&mbs, 0, sizeof (mbs));
if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
re_set_fastmap (fastmap, false, (int) c);
}
while (++c != 0);
}
else
{
for (i = 0; i < cset->nmbchars; ++i)
{
char buf[256];
mbstate_t state;
memset (&state, '\0', sizeof (state));
if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
{
if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
!= (size_t) -1)
re_set_fastmap (fastmap, false, *(unsigned char *) buf);
}
}
}
}
#endif
else if (type == OP_PERIOD
#ifdef RE_ENABLE_I18N
|| type == OP_UTF8_PERIOD
#endif
|| type == END_OF_RE)
{
memset (fastmap, '\1', sizeof (char) * SBC_MAX);
if (type == END_OF_RE)
bufp->can_be_null = 1;
return;
}
}
}
int
regcomp (regex_t *__restrict preg,
const char *__restrict pattern,
int cflags)
{
reg_errcode_t ret;
reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
: RE_SYNTAX_POSIX_BASIC);
preg->buffer = NULL;
preg->allocated = 0;
preg->used = 0;
preg->fastmap = re_malloc (char, SBC_MAX);
if (BE (preg->fastmap == NULL, 0))
return REG_ESPACE;
syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
if (cflags & REG_NEWLINE)
{
syntax &= ~RE_DOT_NEWLINE;
syntax |= RE_HAT_LISTS_NOT_NEWLINE;
preg->newline_anchor = 1;
}
else
preg->newline_anchor = 0;
preg->no_sub = !!(cflags & REG_NOSUB);
preg->translate = NULL;
ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
if (ret == REG_ERPAREN)
ret = REG_EPAREN;
if (BE (ret == REG_NOERROR, 1))
(void) re_compile_fastmap (preg);
else
{
re_free (preg->fastmap);
preg->fastmap = NULL;
}
return (int) ret;
}
#ifdef _LIBC
weak_alias (__regcomp, regcomp)
#endif
size_t
regerror(int errcode, const regex_t *__restrict preg,
char *__restrict errbuf, size_t errbuf_size)
{
const char *msg;
size_t msg_size;
if (BE (errcode < 0
|| errcode >= (int) (sizeof (__re_error_msgid_idx)
/ sizeof (__re_error_msgid_idx[0])), 0))
abort ();
msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
msg_size = strlen (msg) + 1;
if (BE (errbuf_size != 0, 1))
{
if (BE (msg_size > errbuf_size, 0))
{
memcpy (errbuf, msg, errbuf_size - 1);
errbuf[errbuf_size - 1] = 0;
}
else
memcpy (errbuf, msg, msg_size);
}
return msg_size;
}
#ifdef _LIBC
weak_alias (__regerror, regerror)
#endif
#ifdef RE_ENABLE_I18N
#if __GNUC__ >= 3
static const bitset_t utf8_sb_map = {
[0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
};
#else
static bitset_t utf8_sb_map;
#endif
#endif
static void
free_dfa_content (re_dfa_t *dfa)
{
int i, j;
if (dfa->nodes)
for (i = 0; i < dfa->nodes_len; ++i)
free_token (dfa->nodes + i);
re_free (dfa->nexts);
for (i = 0; i < dfa->nodes_len; ++i)
{
if (dfa->eclosures != NULL)
re_node_set_free (dfa->eclosures + i);
if (dfa->inveclosures != NULL)
re_node_set_free (dfa->inveclosures + i);
if (dfa->edests != NULL)
re_node_set_free (dfa->edests + i);
}
re_free (dfa->edests);
re_free (dfa->eclosures);
re_free (dfa->inveclosures);
re_free (dfa->nodes);
if (dfa->state_table)
for (i = 0; i <= dfa->state_hash_mask; ++i)
{
struct re_state_table_entry *entry = dfa->state_table + i;
for (j = 0; j < entry->num; ++j)
{
re_dfastate_t *state = entry->array[j];
free_state (state);
}
re_free (entry->array);
}
re_free (dfa->state_table);
#ifdef RE_ENABLE_I18N
if (dfa->sb_char != utf8_sb_map)
re_free (dfa->sb_char);
#endif
re_free (dfa->subexp_map);
#ifdef DEBUG
re_free (dfa->re_str);
#endif
re_free (dfa);
}
void
regfree (regex_t *preg)
{
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
if (BE (dfa != NULL, 1))
free_dfa_content (dfa);
preg->buffer = NULL;
preg->allocated = 0;
re_free (preg->fastmap);
preg->fastmap = NULL;
re_free (preg->translate);
preg->translate = NULL;
}
#ifdef _LIBC
weak_alias (__regfree, regfree)
#endif
#if defined _REGEX_RE_COMP || defined _LIBC
static struct re_pattern_buffer re_comp_buf;
char *
# ifdef _LIBC
weak_function
# endif
re_comp (s)
const char *s;
{
reg_errcode_t ret;
char *fastmap;
if (!s)
{
if (!re_comp_buf.buffer)
return gettext ("No previous regular expression");
return 0;
}
if (re_comp_buf.buffer)
{
fastmap = re_comp_buf.fastmap;
re_comp_buf.fastmap = NULL;
__regfree (&re_comp_buf);
memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
re_comp_buf.fastmap = fastmap;
}
if (re_comp_buf.fastmap == NULL)
{
re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
if (re_comp_buf.fastmap == NULL)
return (char *) gettext (__re_error_msgid
+ __re_error_msgid_idx[(int) REG_ESPACE]);
}
re_comp_buf.newline_anchor = 1;
ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
if (!ret)
return NULL;
return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
}
#ifdef _LIBC
libc_freeres_fn (free_mem)
{
__regfree (&re_comp_buf);
}
#endif
#endif
static reg_errcode_t
re_compile_internal (regex_t *preg, const char * pattern, size_t length,
reg_syntax_t syntax)
{
reg_errcode_t err = REG_NOERROR;
re_dfa_t *dfa;
re_string_t regexp;
preg->fastmap_accurate = 0;
preg->syntax = syntax;
preg->not_bol = preg->not_eol = 0;
preg->used = 0;
preg->re_nsub = 0;
preg->can_be_null = 0;
preg->regs_allocated = REGS_UNALLOCATED;
dfa = (re_dfa_t *) preg->buffer;
if (BE (preg->allocated < sizeof (re_dfa_t), 0))
{
dfa = re_realloc (preg->buffer, re_dfa_t, 1);
if (dfa == NULL)
return REG_ESPACE;
preg->allocated = sizeof (re_dfa_t);
preg->buffer = (unsigned char *) dfa;
}
preg->used = sizeof (re_dfa_t);
err = init_dfa (dfa, length);
if (BE (err != REG_NOERROR, 0))
{
free_dfa_content (dfa);
preg->buffer = NULL;
preg->allocated = 0;
return err;
}
#ifdef DEBUG
dfa->re_str = re_malloc (char, length + 1);
strncpy (dfa->re_str, pattern, length + 1);
#endif
__libc_lock_init (dfa->lock);
err = re_string_construct (®exp, pattern, length, preg->translate,
syntax & RE_ICASE, dfa);
if (BE (err != REG_NOERROR, 0))
{
re_compile_internal_free_return:
free_workarea_compile (preg);
re_string_destruct (®exp);
free_dfa_content (dfa);
preg->buffer = NULL;
preg->allocated = 0;
return err;
}
preg->re_nsub = 0;
dfa->str_tree = parse (®exp, preg, syntax, &err);
if (BE (dfa->str_tree == NULL, 0))
goto re_compile_internal_free_return;
err = analyze (preg);
if (BE (err != REG_NOERROR, 0))
goto re_compile_internal_free_return;
#ifdef RE_ENABLE_I18N
if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
optimize_utf8 (dfa);
#endif
err = create_initial_state (dfa);
free_workarea_compile (preg);
re_string_destruct (®exp);
if (BE (err != REG_NOERROR, 0))
{
free_dfa_content (dfa);
preg->buffer = NULL;
preg->allocated = 0;
}
return err;
}
static reg_errcode_t
init_dfa (re_dfa_t *dfa, size_t pat_len)
{
unsigned int table_size;
#ifndef _LIBC
char *codeset_name;
#endif
memset (dfa, '\0', sizeof (re_dfa_t));
dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
if (pat_len == SIZE_MAX)
return REG_ESPACE;
dfa->nodes_alloc = pat_len + 1;
dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
for (table_size = 1; ; table_size <<= 1)
if (table_size > pat_len)
break;
dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
dfa->state_hash_mask = table_size - 1;
dfa->mb_cur_max = MB_CUR_MAX;
#ifdef _LIBC
if (dfa->mb_cur_max == 6
&& strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
dfa->is_utf8 = 1;
dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
!= 0);
#else
# ifdef HAVE_LANGINFO_CODESET
codeset_name = nl_langinfo (CODESET);
# else
codeset_name = getenv ("LC_ALL");
if (codeset_name == NULL || codeset_name[0] == '\0')
codeset_name = getenv ("LC_CTYPE");
if (codeset_name == NULL || codeset_name[0] == '\0')
codeset_name = getenv ("LANG");
if (codeset_name == NULL)
codeset_name = "";
else if (strchr (codeset_name, '.') != NULL)
codeset_name = strchr (codeset_name, '.') + 1;
# endif
#if 0
if (strcasecmp (codeset_name, "UTF-8") == 0
|| strcasecmp (codeset_name, "UTF8") == 0)
dfa->is_utf8 = 1;
#else
if ( (codeset_name[0] == 'U' || codeset_name[0] == 'u')
&& (codeset_name[1] == 'T' || codeset_name[1] == 't')
&& (codeset_name[2] == 'F' || codeset_name[2] == 'f')
&& (codeset_name[3] == '-'
? codeset_name[4] == '8' && codeset_name[5] == '\0'
: codeset_name[3] == '8' && codeset_name[4] == '\0'))
dfa->is_utf8 = 1;
#endif
dfa->map_notascii = 0;
#endif
#ifdef RE_ENABLE_I18N
if (dfa->mb_cur_max > 1)
{
if (dfa->is_utf8)
{
#if !defined(__GNUC__) || __GNUC__ < 3
static short utf8_sb_map_inited = 0;
if (! utf8_sb_map_inited)
{
int i;
utf8_sb_map_inited = 0;
for (i = 0; i <= 0x80 / BITSET_WORD_BITS - 1; i++)
utf8_sb_map[i] = BITSET_WORD_MAX;
}
#endif
dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
}
else
{
int i, j, ch;
dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
if (BE (dfa->sb_char == NULL, 0))
return REG_ESPACE;
for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
{
wint_t wch = __btowc (ch);
if (wch != WEOF)
dfa->sb_char[i] |= (bitset_word_t) 1 << j;
# ifndef _LIBC
if (isascii (ch) && wch != ch)
dfa->map_notascii = 1;
# endif
}
}
}
#endif
if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
return REG_ESPACE;
return REG_NOERROR;
}
static void
internal_function
init_word_char (re_dfa_t *dfa)
{
int i, j, ch;
dfa->word_ops_used = 1;
for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
if (isalnum (ch) || ch == '_')
dfa->word_char[i] |= (bitset_word_t) 1 << j;
}
static void
free_workarea_compile (regex_t *preg)
{
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
bin_tree_storage_t *storage, *next;
for (storage = dfa->str_tree_storage; storage; storage = next)
{
next = storage->next;
re_free (storage);
}
dfa->str_tree_storage = NULL;
dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
dfa->str_tree = NULL;
re_free (dfa->org_indices);
dfa->org_indices = NULL;
}
static reg_errcode_t
create_initial_state (re_dfa_t *dfa)
{
int first, i;
reg_errcode_t err;
re_node_set init_nodes;
first = dfa->str_tree->first->node_idx;
dfa->init_node = first;
err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
if (BE (err != REG_NOERROR, 0))
return err;
if (dfa->nbackref > 0)
for (i = 0; i < init_nodes.nelem; ++i)
{
int node_idx = init_nodes.elems[i];
re_token_type_t type = dfa->nodes[node_idx].type;
int clexp_idx;
if (type != OP_BACK_REF)
continue;
for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
{
re_token_t *clexp_node;
clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
if (clexp_node->type == OP_CLOSE_SUBEXP
&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
break;
}
if (clexp_idx == init_nodes.nelem)
continue;
if (type == OP_BACK_REF)
{
int dest_idx = dfa->edests[node_idx].elems[0];
if (!re_node_set_contains (&init_nodes, dest_idx))
{
reg_errcode_t err = re_node_set_merge (&init_nodes,
dfa->eclosures
+ dest_idx);
if (err != REG_NOERROR)
return err;
i = 0;
}
}
}
dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
if (BE (dfa->init_state == NULL, 0))
return err;
if (dfa->init_state->has_constraint)
{
dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
CONTEXT_WORD);
dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
CONTEXT_NEWLINE);
dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
&init_nodes,
CONTEXT_NEWLINE
| CONTEXT_BEGBUF);
if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
|| dfa->init_state_begbuf == NULL, 0))
return err;
}
else
dfa->init_state_word = dfa->init_state_nl
= dfa->init_state_begbuf = dfa->init_state;
re_node_set_free (&init_nodes);
return REG_NOERROR;
}
#ifdef RE_ENABLE_I18N
static void
optimize_utf8 (re_dfa_t *dfa)
{
int node, i, mb_chars = 0, has_period = 0;
for (node = 0; node < dfa->nodes_len; ++node)
switch (dfa->nodes[node].type)
{
case CHARACTER:
if (dfa->nodes[node].opr.c >= 0x80)
mb_chars = 1;
break;
case ANCHOR:
switch (dfa->nodes[node].opr.ctx_type)
{
case LINE_FIRST:
case LINE_LAST:
case BUF_FIRST:
case BUF_LAST:
break;
default:
return;
}
break;
case OP_PERIOD:
has_period = 1;
break;
case OP_BACK_REF:
case OP_ALT:
case END_OF_RE:
case OP_DUP_ASTERISK:
case OP_OPEN_SUBEXP:
case OP_CLOSE_SUBEXP:
break;
case COMPLEX_BRACKET:
return;
case SIMPLE_BRACKET:
assert (0x80 % BITSET_WORD_BITS == 0);
for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
if (dfa->nodes[node].opr.sbcset[i])
return;
break;
default:
abort ();
}
if (mb_chars || has_period)
for (node = 0; node < dfa->nodes_len; ++node)
{
if (dfa->nodes[node].type == CHARACTER
&& dfa->nodes[node].opr.c >= 0x80)
dfa->nodes[node].mb_partial = 0;
else if (dfa->nodes[node].type == OP_PERIOD)
dfa->nodes[node].type = OP_UTF8_PERIOD;
}
dfa->mb_cur_max = 1;
dfa->is_utf8 = 0;
dfa->has_mb_node = dfa->nbackref > 0 || has_period;
}
#endif
static reg_errcode_t
analyze (regex_t *preg)
{
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
reg_errcode_t ret;
dfa->nexts = re_malloc (int, dfa->nodes_alloc);
dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
|| dfa->eclosures == NULL, 0))
return REG_ESPACE;
dfa->subexp_map = re_malloc (int, preg->re_nsub);
if (dfa->subexp_map != NULL)
{
int i;
for (i = 0; i < preg->re_nsub; i++)
dfa->subexp_map[i] = i;
preorder (dfa->str_tree, optimize_subexps, dfa);
for (i = 0; i < preg->re_nsub; i++)
if (dfa->subexp_map[i] != i)
break;
if (i == preg->re_nsub)
{
free (dfa->subexp_map);
dfa->subexp_map = NULL;
}
}
ret = postorder (dfa->str_tree, lower_subexps, preg);
if (BE (ret != REG_NOERROR, 0))
return ret;
ret = postorder (dfa->str_tree, calc_first, dfa);
if (BE (ret != REG_NOERROR, 0))
return ret;
preorder (dfa->str_tree, calc_next, dfa);
ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
if (BE (ret != REG_NOERROR, 0))
return ret;
ret = calc_eclosure (dfa);
if (BE (ret != REG_NOERROR, 0))
return ret;
if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
|| dfa->nbackref)
{
dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
if (BE (dfa->inveclosures == NULL, 0))
return REG_ESPACE;
ret = calc_inveclosure (dfa);
}
return ret;
}
static reg_errcode_t
postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
void *extra)
{
bin_tree_t *node, *prev;
for (node = root; ; )
{
while (node->left || node->right)
if (node->left)
node = node->left;
else
node = node->right;
do
{
reg_errcode_t err = fn (extra, node);
if (BE (err != REG_NOERROR, 0))
return err;
if (node->parent == NULL)
return REG_NOERROR;
prev = node;
node = node->parent;
}
while (node->right == prev || node->right == NULL);
node = node->right;
}
}
static reg_errcode_t
preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
void *extra)
{
bin_tree_t *node;
for (node = root; ; )
{
reg_errcode_t err = fn (extra, node);
if (BE (err != REG_NOERROR, 0))
return err;
if (node->left)
node = node->left;
else
{
bin_tree_t *prev = NULL;
while (node->right == prev || node->right == NULL)
{
prev = node;
node = node->parent;
if (!node)
return REG_NOERROR;
}
node = node->right;
}
}
}
static reg_errcode_t
optimize_subexps (void *extra, bin_tree_t *node)
{
re_dfa_t *dfa = (re_dfa_t *) extra;
if (node->token.type == OP_BACK_REF && dfa->subexp_map)
{
int idx = node->token.opr.idx;
node->token.opr.idx = dfa->subexp_map[idx];
dfa->used_bkref_map |= 1 << node->token.opr.idx;
}
else if (node->token.type == SUBEXP
&& node->left && node->left->token.type == SUBEXP)
{
int other_idx = node->left->token.opr.idx;
node->left = node->left->left;
if (node->left)
node->left->parent = node;
dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
if (other_idx < BITSET_WORD_BITS)
dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
}
return REG_NOERROR;
}
static reg_errcode_t
lower_subexps (void *extra, bin_tree_t *node)
{
regex_t *preg = (regex_t *) extra;
reg_errcode_t err = REG_NOERROR;
if (node->left && node->left->token.type == SUBEXP)
{
node->left = lower_subexp (&err, preg, node->left);
if (node->left)
node->left->parent = node;
}
if (node->right && node->right->token.type == SUBEXP)
{
node->right = lower_subexp (&err, preg, node->right);
if (node->right)
node->right->parent = node;
}
return err;
}
static bin_tree_t *
lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
{
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
bin_tree_t *body = node->left;
bin_tree_t *op, *cls, *tree1, *tree;
if (preg->no_sub
&& node->left != NULL
&& (node->token.opr.idx >= BITSET_WORD_BITS
|| !(dfa->used_bkref_map
& ((bitset_word_t) 1 << node->token.opr.idx))))
return node->left;
op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
tree = create_tree (dfa, op, tree1, CONCAT);
if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
{
*err = REG_ESPACE;
return NULL;
}
op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
return tree;
}
static reg_errcode_t
calc_first (void *extra, bin_tree_t *node)
{
re_dfa_t *dfa = (re_dfa_t *) extra;
if (node->token.type == CONCAT)
{
node->first = node->left->first;
node->node_idx = node->left->node_idx;
}
else
{
node->first = node;
node->node_idx = re_dfa_add_node (dfa, node->token);
if (BE (node->node_idx == -1, 0))
return REG_ESPACE;
if (node->token.type == ANCHOR)
dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
}
return REG_NOERROR;
}
static reg_errcode_t
calc_next (void *extra, bin_tree_t *node)
{
switch (node->token.type)
{
case OP_DUP_ASTERISK:
node->left->next = node;
break;
case CONCAT:
node->left->next = node->right->first;
node->right->next = node->next;
break;
default:
if (node->left)
node->left->next = node->next;
if (node->right)
node->right->next = node->next;
break;
}
return REG_NOERROR;
}
static reg_errcode_t
link_nfa_nodes (void *extra, bin_tree_t *node)
{
re_dfa_t *dfa = (re_dfa_t *) extra;
int idx = node->node_idx;
reg_errcode_t err = REG_NOERROR;
switch (node->token.type)
{
case CONCAT:
break;
case END_OF_RE:
assert (node->next == NULL);
break;
case OP_DUP_ASTERISK:
case OP_ALT:
{
int left, right;
dfa->has_plural_match = 1;
if (node->left != NULL)
left = node->left->first->node_idx;
else
left = node->next->node_idx;
if (node->right != NULL)
right = node->right->first->node_idx;
else
right = node->next->node_idx;
assert (left > -1);
assert (right > -1);
err = re_node_set_init_2 (dfa->edests + idx, left, right);
}
break;
case ANCHOR:
case OP_OPEN_SUBEXP:
case OP_CLOSE_SUBEXP:
err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
break;
case OP_BACK_REF:
dfa->nexts[idx] = node->next->node_idx;
if (node->token.type == OP_BACK_REF)
err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
break;
default:
assert (!IS_EPSILON_NODE (node->token.type));
dfa->nexts[idx] = node->next->node_idx;
break;
}
return err;
}
static reg_errcode_t
internal_function
duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
int root_node, unsigned int init_constraint)
{
int org_node, clone_node, ret;
unsigned int constraint = init_constraint;
for (org_node = top_org_node, clone_node = top_clone_node;;)
{
int org_dest, clone_dest;
if (dfa->nodes[org_node].type == OP_BACK_REF)
{
org_dest = dfa->nexts[org_node];
re_node_set_empty (dfa->edests + clone_node);
clone_dest = duplicate_node (dfa, org_dest, constraint);
if (BE (clone_dest == -1, 0))
return REG_ESPACE;
dfa->nexts[clone_node] = dfa->nexts[org_node];
ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
if (BE (ret < 0, 0))
return REG_ESPACE;
}
else if (dfa->edests[org_node].nelem == 0)
{
dfa->nexts[clone_node] = dfa->nexts[org_node];
break;
}
else if (dfa->edests[org_node].nelem == 1)
{
org_dest = dfa->edests[org_node].elems[0];
re_node_set_empty (dfa->edests + clone_node);
if (org_node == root_node && clone_node != org_node)
{
ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
if (BE (ret < 0, 0))
return REG_ESPACE;
break;
}
constraint |= dfa->nodes[org_node].constraint;
clone_dest = duplicate_node (dfa, org_dest, constraint);
if (BE (clone_dest == -1, 0))
return REG_ESPACE;
ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
if (BE (ret < 0, 0))
return REG_ESPACE;
}
else
{
org_dest = dfa->edests[org_node].elems[0];
re_node_set_empty (dfa->edests + clone_node);
clone_dest = search_duplicated_node (dfa, org_dest, constraint);
if (clone_dest == -1)
{
reg_errcode_t err;
clone_dest = duplicate_node (dfa, org_dest, constraint);
if (BE (clone_dest == -1, 0))
return REG_ESPACE;
ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
if (BE (ret < 0, 0))
return REG_ESPACE;
err = duplicate_node_closure (dfa, org_dest, clone_dest,
root_node, constraint);
if (BE (err != REG_NOERROR, 0))
return err;
}
else
{
ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
if (BE (ret < 0, 0))
return REG_ESPACE;
}
org_dest = dfa->edests[org_node].elems[1];
clone_dest = duplicate_node (dfa, org_dest, constraint);
if (BE (clone_dest == -1, 0))
return REG_ESPACE;
ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
if (BE (ret < 0, 0))
return REG_ESPACE;
}
org_node = org_dest;
clone_node = clone_dest;
}
return REG_NOERROR;
}
static int
search_duplicated_node (const re_dfa_t *dfa, int org_node,
unsigned int constraint)
{
int idx;
for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
{
if (org_node == dfa->org_indices[idx]
&& constraint == dfa->nodes[idx].constraint)
return idx;
}
return -1;
}
static int
duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
{
int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
if (BE (dup_idx != -1, 1))
{
dfa->nodes[dup_idx].constraint = constraint;
dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
dfa->nodes[dup_idx].duplicated = 1;
dfa->org_indices[dup_idx] = org_idx;
}
return dup_idx;
}
static reg_errcode_t
calc_inveclosure (re_dfa_t *dfa)
{
int src, idx, ret;
for (idx = 0; idx < dfa->nodes_len; ++idx)
re_node_set_init_empty (dfa->inveclosures + idx);
for (src = 0; src < dfa->nodes_len; ++src)
{
int *elems = dfa->eclosures[src].elems;
for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
{
ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
if (BE (ret == -1, 0))
return REG_ESPACE;
}
}
return REG_NOERROR;
}
static reg_errcode_t
calc_eclosure (re_dfa_t *dfa)
{
int node_idx, incomplete;
#ifdef DEBUG
assert (dfa->nodes_len > 0);
#endif
incomplete = 0;
for (node_idx = 0; ; ++node_idx)
{
reg_errcode_t err;
re_node_set eclosure_elem;
if (node_idx == dfa->nodes_len)
{
if (!incomplete)
break;
incomplete = 0;
node_idx = 0;
}
#ifdef DEBUG
assert (dfa->eclosures[node_idx].nelem != -1);
#endif
if (dfa->eclosures[node_idx].nelem != 0)
continue;
err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
if (BE (err != REG_NOERROR, 0))
return err;
if (dfa->eclosures[node_idx].nelem == 0)
{
incomplete = 1;
re_node_set_free (&eclosure_elem);
}
}
return REG_NOERROR;
}
static reg_errcode_t
calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
{
reg_errcode_t err;
int i;
re_node_set eclosure;
int ret;
int incomplete = 0;
err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
if (BE (err != REG_NOERROR, 0))
return err;
dfa->eclosures[node].nelem = -1;
if (dfa->nodes[node].constraint
&& dfa->edests[node].nelem
&& !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
{
err = duplicate_node_closure (dfa, node, node, node,
dfa->nodes[node].constraint);
if (BE (err != REG_NOERROR, 0))
return err;
}
if (IS_EPSILON_NODE(dfa->nodes[node].type))
for (i = 0; i < dfa->edests[node].nelem; ++i)
{
re_node_set eclosure_elem;
int edest = dfa->edests[node].elems[i];
if (dfa->eclosures[edest].nelem == -1)
{
incomplete = 1;
continue;
}
if (dfa->eclosures[edest].nelem == 0)
{
err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
if (BE (err != REG_NOERROR, 0))
return err;
}
else
eclosure_elem = dfa->eclosures[edest];
err = re_node_set_merge (&eclosure, &eclosure_elem);
if (BE (err != REG_NOERROR, 0))
return err;
if (dfa->eclosures[edest].nelem == 0)
{
incomplete = 1;
re_node_set_free (&eclosure_elem);
}
}
ret = re_node_set_insert (&eclosure, node);
if (BE (ret < 0, 0))
return REG_ESPACE;
if (incomplete && !root)
dfa->eclosures[node].nelem = 0;
else
dfa->eclosures[node] = eclosure;
*new_set = eclosure;
return REG_NOERROR;
}
static void
internal_function
fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
{
re_string_skip_bytes (input, peek_token (result, input, syntax));
}
static int
internal_function
peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
{
unsigned char c;
if (re_string_eoi (input))
{
token->type = END_OF_RE;
return 0;
}
c = re_string_peek_byte (input, 0);
token->opr.c = c;
token->word_char = 0;
#ifdef RE_ENABLE_I18N
token->mb_partial = 0;
if (input->mb_cur_max > 1 &&
!re_string_first_byte (input, re_string_cur_idx (input)))
{
token->type = CHARACTER;
token->mb_partial = 1;
return 1;
}
#endif
if (c == '\\')
{
unsigned char c2;
if (re_string_cur_idx (input) + 1 >= re_string_length (input))
{
token->type = BACK_SLASH;
return 1;
}
c2 = re_string_peek_byte_case (input, 1);
token->opr.c = c2;
token->type = CHARACTER;
#ifdef RE_ENABLE_I18N
if (input->mb_cur_max > 1)
{
wint_t wc = re_string_wchar_at (input,
re_string_cur_idx (input) + 1);
token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
}
else
#endif
token->word_char = IS_WORD_CHAR (c2) != 0;
switch (c2)
{
case '|':
if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
token->type = OP_ALT;
break;
case '1': case '2': case '3': case '4': case '5':
case '6': case '7': case '8': case '9':
if (!(syntax & RE_NO_BK_REFS))
{
token->type = OP_BACK_REF;
token->opr.idx = c2 - '1';
}
break;
case '<':
if (!(syntax & RE_NO_GNU_OPS))
{
token->type = ANCHOR;
token->opr.ctx_type = WORD_FIRST;
}
break;
case '>':
if (!(syntax & RE_NO_GNU_OPS))
{
token->type = ANCHOR;
token->opr.ctx_type = WORD_LAST;
}
break;
case 'b':
if (!(syntax & RE_NO_GNU_OPS))
{
token->type = ANCHOR;
token->opr.ctx_type = WORD_DELIM;
}
break;
case 'B':
if (!(syntax & RE_NO_GNU_OPS))
{
token->type = ANCHOR;
token->opr.ctx_type = NOT_WORD_DELIM;
}
break;
case 'w':
if (!(syntax & RE_NO_GNU_OPS))
token->type = OP_WORD;
break;
case 'W':
if (!(syntax & RE_NO_GNU_OPS))
token->type = OP_NOTWORD;
break;
case 's':
if (!(syntax & RE_NO_GNU_OPS))
token->type = OP_SPACE;
break;
case 'S':
if (!(syntax & RE_NO_GNU_OPS))
token->type = OP_NOTSPACE;
break;
case '`':
if (!(syntax & RE_NO_GNU_OPS))
{
token->type = ANCHOR;
token->opr.ctx_type = BUF_FIRST;
}
break;
case '\'':
if (!(syntax & RE_NO_GNU_OPS))
{
token->type = ANCHOR;
token->opr.ctx_type = BUF_LAST;
}
break;
case '(':
if (!(syntax & RE_NO_BK_PARENS))
token->type = OP_OPEN_SUBEXP;
break;
case ')':
if (!(syntax & RE_NO_BK_PARENS))
token->type = OP_CLOSE_SUBEXP;
break;
case '+':
if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
token->type = OP_DUP_PLUS;
break;
case '?':
if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
token->type = OP_DUP_QUESTION;
break;
case '{':
if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
token->type = OP_OPEN_DUP_NUM;
break;
case '}':
if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
token->type = OP_CLOSE_DUP_NUM;
break;
default:
break;
}
return 2;
}
token->type = CHARACTER;
#ifdef RE_ENABLE_I18N
if (input->mb_cur_max > 1)
{
wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
}
else
#endif
token->word_char = IS_WORD_CHAR (token->opr.c);
switch (c)
{
case '\n':
if (syntax & RE_NEWLINE_ALT)
token->type = OP_ALT;
break;
case '|':
if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
token->type = OP_ALT;
break;
case '*':
token->type = OP_DUP_ASTERISK;
break;
case '+':
if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
token->type = OP_DUP_PLUS;
break;
case '?':
if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
token->type = OP_DUP_QUESTION;
break;
case '{':
if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
token->type = OP_OPEN_DUP_NUM;
break;
case '}':
if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
token->type = OP_CLOSE_DUP_NUM;
break;
case '(':
if (syntax & RE_NO_BK_PARENS)
token->type = OP_OPEN_SUBEXP;
break;
case ')':
if (syntax & RE_NO_BK_PARENS)
token->type = OP_CLOSE_SUBEXP;
break;
case '[':
token->type = OP_OPEN_BRACKET;
break;
case '.':
token->type = OP_PERIOD;
break;
case '^':
if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
re_string_cur_idx (input) != 0)
{
char prev = re_string_peek_byte (input, -1);
if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
break;
}
token->type = ANCHOR;
token->opr.ctx_type = LINE_FIRST;
break;
case '$':
if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
re_string_cur_idx (input) + 1 != re_string_length (input))
{
re_token_t next;
re_string_skip_bytes (input, 1);
peek_token (&next, input, syntax);
re_string_skip_bytes (input, -1);
if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
break;
}
token->type = ANCHOR;
token->opr.ctx_type = LINE_LAST;
break;
default:
break;
}
return 1;
}
static int
internal_function
peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
{
unsigned char c;
if (re_string_eoi (input))
{
token->type = END_OF_RE;
return 0;
}
c = re_string_peek_byte (input, 0);
token->opr.c = c;
#ifdef RE_ENABLE_I18N
if (input->mb_cur_max > 1 &&
!re_string_first_byte (input, re_string_cur_idx (input)))
{
token->type = CHARACTER;
return 1;
}
#endif
if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
&& re_string_cur_idx (input) + 1 < re_string_length (input))
{
unsigned char c2;
re_string_skip_bytes (input, 1);
c2 = re_string_peek_byte (input, 0);
token->opr.c = c2;
token->type = CHARACTER;
return 1;
}
if (c == '[')
{
unsigned char c2;
int token_len;
if (re_string_cur_idx (input) + 1 < re_string_length (input))
c2 = re_string_peek_byte (input, 1);
else
c2 = 0;
token->opr.c = c2;
token_len = 2;
switch (c2)
{
case '.':
token->type = OP_OPEN_COLL_ELEM;
break;
case '=':
token->type = OP_OPEN_EQUIV_CLASS;
break;
case ':':
if (syntax & RE_CHAR_CLASSES)
{
token->type = OP_OPEN_CHAR_CLASS;
break;
}
default:
token->type = CHARACTER;
token->opr.c = c;
token_len = 1;
break;
}
return token_len;
}
switch (c)
{
case '-':
token->type = OP_CHARSET_RANGE;
break;
case ']':
token->type = OP_CLOSE_BRACKET;
break;
case '^':
token->type = OP_NON_MATCH_LIST;
break;
default:
token->type = CHARACTER;
}
return 1;
}
static bin_tree_t *
parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
reg_errcode_t *err)
{
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
bin_tree_t *tree, *eor, *root;
re_token_t current_token;
dfa->syntax = syntax;
fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
if (BE (*err != REG_NOERROR && tree == NULL, 0))
return NULL;
eor = create_tree (dfa, NULL, NULL, END_OF_RE);
if (tree != NULL)
root = create_tree (dfa, tree, eor, CONCAT);
else
root = eor;
if (BE (eor == NULL || root == NULL, 0))
{
*err = REG_ESPACE;
return NULL;
}
return root;
}
static bin_tree_t *
parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
reg_syntax_t syntax, int nest, reg_errcode_t *err)
{
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
bin_tree_t *tree, *branch = NULL;
tree = parse_branch (regexp, preg, token, syntax, nest, err);
if (BE (*err != REG_NOERROR && tree == NULL, 0))
return NULL;
while (token->type == OP_ALT)
{
fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
if (token->type != OP_ALT && token->type != END_OF_RE
&& (nest == 0 || token->type != OP_CLOSE_SUBEXP))
{
branch = parse_branch (regexp, preg, token, syntax, nest, err);
if (BE (*err != REG_NOERROR && branch == NULL, 0))
return NULL;
}
else
branch = NULL;
tree = create_tree (dfa, tree, branch, OP_ALT);
if (BE (tree == NULL, 0))
{
*err = REG_ESPACE;
return NULL;
}
}
return tree;
}
static bin_tree_t *
parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
reg_syntax_t syntax, int nest, reg_errcode_t *err)
{
bin_tree_t *tree, *exp;
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
tree = parse_expression (regexp, preg, token, syntax, nest, err);
if (BE (*err != REG_NOERROR && tree == NULL, 0))
return NULL;
while (token->type != OP_ALT && token->type != END_OF_RE
&& (nest == 0 || token->type != OP_CLOSE_SUBEXP))
{
exp = parse_expression (regexp, preg, token, syntax, nest, err);
if (BE (*err != REG_NOERROR && exp == NULL, 0))
{
return NULL;
}
if (tree != NULL && exp != NULL)
{
tree = create_tree (dfa, tree, exp, CONCAT);
if (tree == NULL)
{
*err = REG_ESPACE;
return NULL;
}
}
else if (tree == NULL)
tree = exp;
}
return tree;
}
static bin_tree_t *
parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
reg_syntax_t syntax, int nest, reg_errcode_t *err)
{
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
bin_tree_t *tree;
switch (token->type)
{
case CHARACTER:
tree = create_token_tree (dfa, NULL, NULL, token);
if (BE (tree == NULL, 0))
{
*err = REG_ESPACE;
return NULL;
}
#ifdef RE_ENABLE_I18N
if (dfa->mb_cur_max > 1)
{
while (!re_string_eoi (regexp)
&& !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
{
bin_tree_t *mbc_remain;
fetch_token (token, regexp, syntax);
mbc_remain = create_token_tree (dfa, NULL, NULL, token);
tree = create_tree (dfa, tree, mbc_remain, CONCAT);
if (BE (mbc_remain == NULL || tree == NULL, 0))
{
*err = REG_ESPACE;
return NULL;
}
}
}
#endif
break;
case OP_OPEN_SUBEXP:
tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
if (BE (*err != REG_NOERROR && tree == NULL, 0))
return NULL;
break;
case OP_OPEN_BRACKET:
tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
if (BE (*err != REG_NOERROR && tree == NULL, 0))
return NULL;
break;
case OP_BACK_REF:
if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
{
*err = REG_ESUBREG;
return NULL;
}
dfa->used_bkref_map |= 1 << token->opr.idx;
tree = create_token_tree (dfa, NULL, NULL, token);
if (BE (tree == NULL, 0))
{
*err = REG_ESPACE;
return NULL;
}
++dfa->nbackref;
dfa->has_mb_node = 1;
break;
case OP_OPEN_DUP_NUM:
if (syntax & RE_CONTEXT_INVALID_DUP)
{
*err = REG_BADRPT;
return NULL;
}
case OP_DUP_ASTERISK:
case OP_DUP_PLUS:
case OP_DUP_QUESTION:
if (syntax & RE_CONTEXT_INVALID_OPS)
{
*err = REG_BADRPT;
return NULL;
}
else if (syntax & RE_CONTEXT_INDEP_OPS)
{
fetch_token (token, regexp, syntax);
return parse_expression (regexp, preg, token, syntax, nest, err);
}
case OP_CLOSE_SUBEXP:
if ((token->type == OP_CLOSE_SUBEXP) &&
!(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
{
*err = REG_ERPAREN;
return NULL;
}
case OP_CLOSE_DUP_NUM:
token->type = CHARACTER;
tree = create_token_tree (dfa, NULL, NULL, token);
if (BE (tree == NULL, 0))
{
*err = REG_ESPACE;
return NULL;
}
break;
case ANCHOR:
if ((token->opr.ctx_type
& (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
&& dfa->word_ops_used == 0)
init_word_char (dfa);
if (token->opr.ctx_type == WORD_DELIM
|| token->opr.ctx_type == NOT_WORD_DELIM)
{
bin_tree_t *tree_first, *tree_last;
if (token->opr.ctx_type == WORD_DELIM)
{
token->opr.ctx_type = WORD_FIRST;
tree_first = create_token_tree (dfa, NULL, NULL, token);
token->opr.ctx_type = WORD_LAST;
}
else
{
token->opr.ctx_type = INSIDE_WORD;
tree_first = create_token_tree (dfa, NULL, NULL, token);
token->opr.ctx_type = INSIDE_NOTWORD;
}
tree_last = create_token_tree (dfa, NULL, NULL, token);
tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
{
*err = REG_ESPACE;
return NULL;
}
}
else
{
tree = create_token_tree (dfa, NULL, NULL, token);
if (BE (tree == NULL, 0))
{
*err = REG_ESPACE;
return NULL;
}
}
fetch_token (token, regexp, syntax);
return tree;
case OP_PERIOD:
tree = create_token_tree (dfa, NULL, NULL, token);
if (BE (tree == NULL, 0))
{
*err = REG_ESPACE;
return NULL;
}
if (dfa->mb_cur_max > 1)
dfa->has_mb_node = 1;
break;
case OP_WORD:
case OP_NOTWORD:
tree = build_charclass_op (dfa, regexp->trans,
"alnum",
"_",
token->type == OP_NOTWORD, err);
if (BE (*err != REG_NOERROR && tree == NULL, 0))
return NULL;
break;
case OP_SPACE:
case OP_NOTSPACE:
tree = build_charclass_op (dfa, regexp->trans,
"space",
"",
token->type == OP_NOTSPACE, err);
if (BE (*err != REG_NOERROR && tree == NULL, 0))
return NULL;
break;
case OP_ALT:
case END_OF_RE:
return NULL;
case BACK_SLASH:
*err = REG_EESCAPE;
return NULL;
default:
#ifdef DEBUG
assert (0);
#endif
return NULL;
}
fetch_token (token, regexp, syntax);
while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
|| token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
{
tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
if (BE (*err != REG_NOERROR && tree == NULL, 0))
return NULL;
if ((syntax & RE_CONTEXT_INVALID_DUP)
&& (token->type == OP_DUP_ASTERISK
|| token->type == OP_OPEN_DUP_NUM))
{
*err = REG_BADRPT;
return NULL;
}
}
return tree;
}
static bin_tree_t *
parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
reg_syntax_t syntax, int nest, reg_errcode_t *err)
{
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
bin_tree_t *tree;
size_t cur_nsub;
cur_nsub = preg->re_nsub++;
fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
if (token->type == OP_CLOSE_SUBEXP)
tree = NULL;
else
{
tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
*err = REG_EPAREN;
if (BE (*err != REG_NOERROR, 0))
return NULL;
}
if (cur_nsub <= '9' - '1')
dfa->completed_bkref_map |= 1 << cur_nsub;
tree = create_tree (dfa, tree, NULL, SUBEXP);
if (BE (tree == NULL, 0))
{
*err = REG_ESPACE;
return NULL;
}
tree->token.opr.idx = cur_nsub;
return tree;
}
static bin_tree_t *
parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
{
bin_tree_t *tree = NULL, *old_tree = NULL;
int i, start, end, start_idx = re_string_cur_idx (regexp);
#ifndef RE_TOKEN_INIT_BUG
re_token_t start_token = *token;
#else
re_token_t start_token;
memcpy ((void *) &start_token, (void *) token, sizeof start_token);
#endif
if (token->type == OP_OPEN_DUP_NUM)
{
end = 0;
start = fetch_number (regexp, token, syntax);
if (start == -1)
{
if (token->type == CHARACTER && token->opr.c == ',')
start = 0;
else
{
*err = REG_BADBR;
return NULL;
}
}
if (BE (start != -2, 1))
{
end = ((token->type == OP_CLOSE_DUP_NUM) ? start
: ((token->type == CHARACTER && token->opr.c == ',')
? fetch_number (regexp, token, syntax) : -2));
}
if (BE (start == -2 || end == -2, 0))
{
if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
{
if (token->type == END_OF_RE)
*err = REG_EBRACE;
else
*err = REG_BADBR;
return NULL;
}
re_string_set_index (regexp, start_idx);
*token = start_token;
token->type = CHARACTER;
return elem;
}
if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
{
*err = REG_BADBR;
return NULL;
}
}
else
{
start = (token->type == OP_DUP_PLUS) ? 1 : 0;
end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
}
fetch_token (token, regexp, syntax);
if (BE (elem == NULL, 0))
return NULL;
if (BE (start == 0 && end == 0, 0))
{
postorder (elem, free_tree, NULL);
return NULL;
}
if (BE (start > 0, 0))
{
tree = elem;
for (i = 2; i <= start; ++i)
{
elem = duplicate_tree (elem, dfa);
tree = create_tree (dfa, tree, elem, CONCAT);
if (BE (elem == NULL || tree == NULL, 0))
goto parse_dup_op_espace;
}
if (start == end)
return tree;
elem = duplicate_tree (elem, dfa);
old_tree = tree;
}
else
old_tree = NULL;
if (elem->token.type == SUBEXP)
postorder (elem, mark_opt_subexp, (void *) (intptr_t) elem->token.opr.idx);
tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
if (BE (tree == NULL, 0))
goto parse_dup_op_espace;
for (i = start + 2; i <= end; ++i)
{
elem = duplicate_tree (elem, dfa);
tree = create_tree (dfa, tree, elem, CONCAT);
if (BE (elem == NULL || tree == NULL, 0))
goto parse_dup_op_espace;
tree = create_tree (dfa, tree, NULL, OP_ALT);
if (BE (tree == NULL, 0))
goto parse_dup_op_espace;
}
if (old_tree)
tree = create_tree (dfa, old_tree, tree, CONCAT);
return tree;
parse_dup_op_espace:
*err = REG_ESPACE;
return NULL;
}
#define BRACKET_NAME_BUF_SIZE 32
#ifndef _LIBC
static reg_errcode_t
internal_function
# ifdef RE_ENABLE_I18N
build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
bracket_elem_t *start_elem, bracket_elem_t *end_elem)
# else
build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
bracket_elem_t *end_elem)
# endif
{
unsigned int start_ch, end_ch;
if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
|| end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
0))
return REG_ERANGE;
if (BE ((start_elem->type == COLL_SYM
&& strlen ((char *) start_elem->opr.name) > 1)
|| (end_elem->type == COLL_SYM
&& strlen ((char *) end_elem->opr.name) > 1), 0))
return REG_ECOLLATE;
# ifdef RE_ENABLE_I18N
{
wchar_t wc;
wint_t start_wc;
wint_t end_wc;
wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
: 0));
end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
: ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
: 0));
#ifdef GAWK
start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
? start_ch : start_elem->opr.wch);
end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
? end_ch : end_elem->opr.wch);
#else
start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
? __btowc (start_ch) : start_elem->opr.wch);
end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
? __btowc (end_ch) : end_elem->opr.wch);
#endif
if (start_wc == WEOF || end_wc == WEOF)
return REG_ECOLLATE;
cmp_buf[0] = start_wc;
cmp_buf[4] = end_wc;
if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
return REG_ERANGE;
if (mbcset)
{
if (BE (*range_alloc == mbcset->nranges, 0))
{
wchar_t *new_array_start, *new_array_end;
int new_nranges;
new_nranges = 2 * mbcset->nranges + 1;
new_array_start = re_realloc (mbcset->range_starts, wchar_t,
new_nranges);
new_array_end = re_realloc (mbcset->range_ends, wchar_t,
new_nranges);
if (BE (new_array_start == NULL || new_array_end == NULL, 0))
return REG_ESPACE;
mbcset->range_starts = new_array_start;
mbcset->range_ends = new_array_end;
*range_alloc = new_nranges;
}
mbcset->range_starts[mbcset->nranges] = start_wc;
mbcset->range_ends[mbcset->nranges++] = end_wc;
}
for (wc = 0; wc < SBC_MAX; ++wc)
{
cmp_buf[2] = wc;
if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
&& wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
bitset_set (sbcset, wc);
}
}
# else
{
unsigned int ch;
start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
: 0));
end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
: ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
: 0));
if (start_ch > end_ch)
return REG_ERANGE;
for (ch = 0; ch < SBC_MAX; ++ch)
if (start_ch <= ch && ch <= end_ch)
bitset_set (sbcset, ch);
}
# endif
return REG_NOERROR;
}
#endif
#ifndef _LIBC
static reg_errcode_t
internal_function
# ifdef RE_ENABLE_I18N
build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
int *coll_sym_alloc, const unsigned char *name)
# else
build_collating_symbol (bitset_t sbcset, const unsigned char *name)
# endif
{
size_t name_len = strlen ((const char *) name);
if (BE (name_len != 1, 0))
return REG_ECOLLATE;
else
{
bitset_set (sbcset, name[0]);
return REG_NOERROR;
}
}
#endif
static bin_tree_t *
parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
reg_syntax_t syntax, reg_errcode_t *err)
{
#ifdef _LIBC
const unsigned char *collseqmb;
const char *collseqwc;
uint32_t nrules;
int32_t table_size;
const int32_t *symb_table;
const unsigned char *extra;
auto inline int32_t
__attribute ((always_inline))
seek_collating_symbol_entry (name, name_len)
const unsigned char *name;
size_t name_len;
{
int32_t hash = elem_hash ((const char *) name, name_len);
int32_t elem = hash % table_size;
if (symb_table[2 * elem] != 0)
{
int32_t second = hash % (table_size - 2) + 1;
do
{
if (symb_table[2 * elem] == hash
&& name_len == extra[symb_table[2 * elem + 1]]
&& memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
name_len) == 0)
{
break;
}
elem += second;
}
while (symb_table[2 * elem] != 0);
}
return elem;
}
auto inline unsigned int
__attribute ((always_inline))
lookup_collation_sequence_value (br_elem)
bracket_elem_t *br_elem;
{
if (br_elem->type == SB_CHAR)
{
if (nrules == 0)
return collseqmb[br_elem->opr.ch];
else
{
wint_t wc = __btowc (br_elem->opr.ch);
return __collseq_table_lookup (collseqwc, wc);
}
}
else if (br_elem->type == MB_CHAR)
{
if (nrules != 0)
return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
}
else if (br_elem->type == COLL_SYM)
{
size_t sym_name_len = strlen ((char *) br_elem->opr.name);
if (nrules != 0)
{
int32_t elem, idx;
elem = seek_collating_symbol_entry (br_elem->opr.name,
sym_name_len);
if (symb_table[2 * elem] != 0)
{
idx = symb_table[2 * elem + 1];
idx += 1 + extra[idx];
idx += 1 + extra[idx];
idx = (idx + 3) & ~3;
idx += sizeof (unsigned int);
idx += sizeof (unsigned int) *
(1 + *(unsigned int *) (extra + idx));
return *(unsigned int *) (extra + idx);
}
else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
{
return collseqmb[br_elem->opr.name[0]];
}
}
else if (sym_name_len == 1)
return collseqmb[br_elem->opr.name[0]];
}
return UINT_MAX;
}
auto inline reg_errcode_t
__attribute ((always_inline))
build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
re_charset_t *mbcset;
int *range_alloc;
bitset_t sbcset;
bracket_elem_t *start_elem, *end_elem;
{
unsigned int ch;
uint32_t start_collseq;
uint32_t end_collseq;
if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
|| end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
0))
return REG_ERANGE;
start_collseq = lookup_collation_sequence_value (start_elem);
end_collseq = lookup_collation_sequence_value (end_elem);
if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
return REG_ECOLLATE;
if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
return REG_ERANGE;
if (nrules > 0 || dfa->mb_cur_max > 1)
{
if (BE (*range_alloc == mbcset->nranges, 0))
{
uint32_t *new_array_start;
uint32_t *new_array_end;
int new_nranges;
new_nranges = 2 * mbcset->nranges + 1;
new_array_start = re_realloc (mbcset->range_starts, uint32_t,
new_nranges);
new_array_end = re_realloc (mbcset->range_ends, uint32_t,
new_nranges);
if (BE (new_array_start == NULL || new_array_end == NULL, 0))
return REG_ESPACE;
mbcset->range_starts = new_array_start;
mbcset->range_ends = new_array_end;
*range_alloc = new_nranges;
}
mbcset->range_starts[mbcset->nranges] = start_collseq;
mbcset->range_ends[mbcset->nranges++] = end_collseq;
}
for (ch = 0; ch < SBC_MAX; ch++)
{
uint32_t ch_collseq;
if (nrules == 0)
ch_collseq = collseqmb[ch];
else
ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
bitset_set (sbcset, ch);
}
return REG_NOERROR;
}
auto inline reg_errcode_t
__attribute ((always_inline))
build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
re_charset_t *mbcset;
int *coll_sym_alloc;
bitset_t sbcset;
const unsigned char *name;
{
int32_t elem, idx;
size_t name_len = strlen ((const char *) name);
if (nrules != 0)
{
elem = seek_collating_symbol_entry (name, name_len);
if (symb_table[2 * elem] != 0)
{
idx = symb_table[2 * elem + 1];
idx += 1 + extra[idx];
}
else if (symb_table[2 * elem] == 0 && name_len == 1)
{
bitset_set (sbcset, name[0]);
return REG_NOERROR;
}
else
return REG_ECOLLATE;
if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
{
int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
new_coll_sym_alloc);
if (BE (new_coll_syms == NULL, 0))
return REG_ESPACE;
mbcset->coll_syms = new_coll_syms;
*coll_sym_alloc = new_coll_sym_alloc;
}
mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
return REG_NOERROR;
}
else
{
if (BE (name_len != 1, 0))
return REG_ECOLLATE;
else
{
bitset_set (sbcset, name[0]);
return REG_NOERROR;
}
}
}
#endif
re_token_t br_token;
re_bitset_ptr_t sbcset;
#ifdef RE_ENABLE_I18N
re_charset_t *mbcset;
int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
int equiv_class_alloc = 0, char_class_alloc = 0;
#endif
int non_match = 0;
bin_tree_t *work_tree;
int token_len;
int first_round = 1;
#ifdef _LIBC
collseqmb = (const unsigned char *)
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
if (nrules)
{
collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
_NL_COLLATE_SYMB_TABLEMB);
extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
_NL_COLLATE_SYMB_EXTRAMB);
}
#endif
sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
#ifdef RE_ENABLE_I18N
mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
#endif
#ifdef RE_ENABLE_I18N
if (BE (sbcset == NULL || mbcset == NULL, 0))
#else
if (BE (sbcset == NULL, 0))
#endif
{
*err = REG_ESPACE;
return NULL;
}
token_len = peek_token_bracket (token, regexp, syntax);
if (BE (token->type == END_OF_RE, 0))
{
*err = REG_BADPAT;
goto parse_bracket_exp_free_return;
}
if (token->type == OP_NON_MATCH_LIST)
{
#ifdef RE_ENABLE_I18N
mbcset->non_match = 1;
#endif
non_match = 1;
if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
bitset_set (sbcset, '\n');
re_string_skip_bytes (regexp, token_len);
token_len = peek_token_bracket (token, regexp, syntax);
if (BE (token->type == END_OF_RE, 0))
{
*err = REG_BADPAT;
goto parse_bracket_exp_free_return;
}
}
if (token->type == OP_CLOSE_BRACKET)
token->type = CHARACTER;
while (1)
{
bracket_elem_t start_elem, end_elem;
unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
reg_errcode_t ret;
int token_len2 = 0, is_range_exp = 0;
re_token_t token2;
start_elem.opr.name = start_name_buf;
ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
syntax, first_round);
if (BE (ret != REG_NOERROR, 0))
{
*err = ret;
goto parse_bracket_exp_free_return;
}
first_round = 0;
token_len = peek_token_bracket (token, regexp, syntax);
if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
{
if (BE (token->type == END_OF_RE, 0))
{
*err = REG_EBRACK;
goto parse_bracket_exp_free_return;
}
if (token->type == OP_CHARSET_RANGE)
{
re_string_skip_bytes (regexp, token_len);
token_len2 = peek_token_bracket (&token2, regexp, syntax);
if (BE (token2.type == END_OF_RE, 0))
{
*err = REG_EBRACK;
goto parse_bracket_exp_free_return;
}
if (token2.type == OP_CLOSE_BRACKET)
{
re_string_skip_bytes (regexp, -token_len);
token->type = CHARACTER;
}
else
is_range_exp = 1;
}
}
if (is_range_exp == 1)
{
end_elem.opr.name = end_name_buf;
ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
dfa, syntax, 1);
if (BE (ret != REG_NOERROR, 0))
{
*err = ret;
goto parse_bracket_exp_free_return;
}
token_len = peek_token_bracket (token, regexp, syntax);
#ifdef _LIBC
*err = build_range_exp (sbcset, mbcset, &range_alloc,
&start_elem, &end_elem);
#else
# ifdef RE_ENABLE_I18N
*err = build_range_exp (sbcset,
dfa->mb_cur_max > 1 ? mbcset : NULL,
&range_alloc, &start_elem, &end_elem);
# else
*err = build_range_exp (sbcset, &start_elem, &end_elem);
# endif
#endif
if (BE (*err != REG_NOERROR, 0))
goto parse_bracket_exp_free_return;
}
else
{
switch (start_elem.type)
{
case SB_CHAR:
bitset_set (sbcset, start_elem.opr.ch);
break;
#ifdef RE_ENABLE_I18N
case MB_CHAR:
if (BE (mbchar_alloc == mbcset->nmbchars, 0))
{
wchar_t *new_mbchars;
mbchar_alloc = 2 * mbcset->nmbchars + 1;
new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
mbchar_alloc);
if (BE (new_mbchars == NULL, 0))
goto parse_bracket_exp_espace;
mbcset->mbchars = new_mbchars;
}
mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
break;
#endif
case EQUIV_CLASS:
*err = build_equiv_class (sbcset,
#ifdef RE_ENABLE_I18N
mbcset, &equiv_class_alloc,
#endif
start_elem.opr.name);
if (BE (*err != REG_NOERROR, 0))
goto parse_bracket_exp_free_return;
break;
case COLL_SYM:
*err = build_collating_symbol (sbcset,
#ifdef RE_ENABLE_I18N
mbcset, &coll_sym_alloc,
#endif
start_elem.opr.name);
if (BE (*err != REG_NOERROR, 0))
goto parse_bracket_exp_free_return;
break;
case CHAR_CLASS:
*err = build_charclass (regexp->trans, sbcset,
#ifdef RE_ENABLE_I18N
mbcset, &char_class_alloc,
#endif
(const char *) start_elem.opr.name, syntax);
if (BE (*err != REG_NOERROR, 0))
goto parse_bracket_exp_free_return;
break;
default:
assert (0);
break;
}
}
if (BE (token->type == END_OF_RE, 0))
{
*err = REG_EBRACK;
goto parse_bracket_exp_free_return;
}
if (token->type == OP_CLOSE_BRACKET)
break;
}
re_string_skip_bytes (regexp, token_len);
if (non_match)
bitset_not (sbcset);
#ifdef RE_ENABLE_I18N
if (dfa->mb_cur_max > 1)
bitset_mask (sbcset, dfa->sb_char);
if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
|| mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
|| mbcset->non_match)))
{
bin_tree_t *mbc_tree;
int sbc_idx;
dfa->has_mb_node = 1;
br_token.type = COMPLEX_BRACKET;
br_token.opr.mbcset = mbcset;
mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
if (BE (mbc_tree == NULL, 0))
goto parse_bracket_exp_espace;
for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
if (sbcset[sbc_idx])
break;
if (sbc_idx < BITSET_WORDS)
{
br_token.type = SIMPLE_BRACKET;
br_token.opr.sbcset = sbcset;
work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
if (BE (work_tree == NULL, 0))
goto parse_bracket_exp_espace;
work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
if (BE (work_tree == NULL, 0))
goto parse_bracket_exp_espace;
}
else
{
re_free (sbcset);
work_tree = mbc_tree;
}
}
else
#endif
{
#ifdef RE_ENABLE_I18N
free_charset (mbcset);
#endif
br_token.type = SIMPLE_BRACKET;
br_token.opr.sbcset = sbcset;
work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
if (BE (work_tree == NULL, 0))
goto parse_bracket_exp_espace;
}
return work_tree;
parse_bracket_exp_espace:
*err = REG_ESPACE;
parse_bracket_exp_free_return:
re_free (sbcset);
#ifdef RE_ENABLE_I18N
free_charset (mbcset);
#endif
return NULL;
}
static reg_errcode_t
parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
re_token_t *token, int token_len, re_dfa_t *dfa,
reg_syntax_t syntax, int accept_hyphen)
{
#ifdef RE_ENABLE_I18N
int cur_char_size;
cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
if (cur_char_size > 1)
{
elem->type = MB_CHAR;
elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
re_string_skip_bytes (regexp, cur_char_size);
return REG_NOERROR;
}
#endif
re_string_skip_bytes (regexp, token_len);
if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
|| token->type == OP_OPEN_EQUIV_CLASS)
return parse_bracket_symbol (elem, regexp, token);
if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
{
re_token_t token2;
(void) peek_token_bracket (&token2, regexp, syntax);
if (token2.type != OP_CLOSE_BRACKET)
return REG_ERANGE;
}
elem->type = SB_CHAR;
elem->opr.ch = token->opr.c;
return REG_NOERROR;
}
static reg_errcode_t
parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
re_token_t *token)
{
unsigned char ch, delim = token->opr.c;
int i = 0;
if (re_string_eoi(regexp))
return REG_EBRACK;
for (;; ++i)
{
if (i >= BRACKET_NAME_BUF_SIZE)
return REG_EBRACK;
if (token->type == OP_OPEN_CHAR_CLASS)
ch = re_string_fetch_byte_case (regexp);
else
ch = re_string_fetch_byte (regexp);
if (re_string_eoi(regexp))
return REG_EBRACK;
if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
break;
elem->opr.name[i] = ch;
}
re_string_skip_bytes (regexp, 1);
elem->opr.name[i] = '\0';
switch (token->type)
{
case OP_OPEN_COLL_ELEM:
elem->type = COLL_SYM;
break;
case OP_OPEN_EQUIV_CLASS:
elem->type = EQUIV_CLASS;
break;
case OP_OPEN_CHAR_CLASS:
elem->type = CHAR_CLASS;
break;
default:
break;
}
return REG_NOERROR;
}
static reg_errcode_t
#ifdef RE_ENABLE_I18N
build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
int *equiv_class_alloc, const unsigned char *name)
#else
build_equiv_class (bitset_t sbcset, const unsigned char *name)
#endif
{
#ifdef _LIBC
uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
if (nrules != 0)
{
const int32_t *table, *indirect;
const unsigned char *weights, *extra, *cp;
unsigned char char_buf[2];
int32_t idx1, idx2;
unsigned int ch;
size_t len;
# include <locale/weight.h>
cp = name;
table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
_NL_COLLATE_WEIGHTMB);
extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
_NL_COLLATE_EXTRAMB);
indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
_NL_COLLATE_INDIRECTMB);
idx1 = findidx (&cp);
if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
return REG_ECOLLATE;
char_buf[1] = (unsigned char) '\0';
len = weights[idx1 & 0xffffff];
for (ch = 0; ch < SBC_MAX; ++ch)
{
char_buf[0] = ch;
cp = char_buf;
idx2 = findidx (&cp);
if (idx2 == 0)
continue;
if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
{
int cnt = 0;
while (cnt <= len &&
weights[(idx1 & 0xffffff) + 1 + cnt]
== weights[(idx2 & 0xffffff) + 1 + cnt])
++cnt;
if (cnt > len)
bitset_set (sbcset, ch);
}
}
if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
{
int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
int32_t,
new_equiv_class_alloc);
if (BE (new_equiv_classes == NULL, 0))
return REG_ESPACE;
mbcset->equiv_classes = new_equiv_classes;
*equiv_class_alloc = new_equiv_class_alloc;
}
mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
}
else
#endif
{
if (BE (strlen ((const char *) name) != 1, 0))
return REG_ECOLLATE;
bitset_set (sbcset, *name);
}
return REG_NOERROR;
}
static reg_errcode_t
#ifdef RE_ENABLE_I18N
build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
re_charset_t *mbcset, int *char_class_alloc,
const char *class_name, reg_syntax_t syntax)
#else
build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
const char *class_name, reg_syntax_t syntax)
#endif
{
int i;
if ((syntax & RE_ICASE)
&& (strcmp (class_name, "upper") == 0 || strcmp (class_name, "lower") == 0))
class_name = "alpha";
#ifdef RE_ENABLE_I18N
if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
{
int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
new_char_class_alloc);
if (BE (new_char_classes == NULL, 0))
return REG_ESPACE;
mbcset->char_classes = new_char_classes;
*char_class_alloc = new_char_class_alloc;
}
mbcset->char_classes[mbcset->nchar_classes++] = __wctype (class_name);
#endif
#define BUILD_CHARCLASS_LOOP(ctype_func) \
do { \
if (BE (trans != NULL, 0)) \
{ \
for (i = 0; i < SBC_MAX; ++i) \
if (ctype_func (i)) \
bitset_set (sbcset, trans[i]); \
} \
else \
{ \
for (i = 0; i < SBC_MAX; ++i) \
if (ctype_func (i)) \
bitset_set (sbcset, i); \
} \
} while (0)
if (strcmp (class_name, "alnum") == 0)
BUILD_CHARCLASS_LOOP (isalnum);
else if (strcmp (class_name, "cntrl") == 0)
BUILD_CHARCLASS_LOOP (iscntrl);
else if (strcmp (class_name, "lower") == 0)
BUILD_CHARCLASS_LOOP (islower);
else if (strcmp (class_name, "space") == 0)
BUILD_CHARCLASS_LOOP (isspace);
else if (strcmp (class_name, "alpha") == 0)
BUILD_CHARCLASS_LOOP (isalpha);
else if (strcmp (class_name, "digit") == 0)
BUILD_CHARCLASS_LOOP (isdigit);
else if (strcmp (class_name, "print") == 0)
BUILD_CHARCLASS_LOOP (isprint);
else if (strcmp (class_name, "upper") == 0)
BUILD_CHARCLASS_LOOP (isupper);
else if (strcmp (class_name, "blank") == 0)
#ifndef GAWK
BUILD_CHARCLASS_LOOP (isblank);
#else
BUILD_CHARCLASS_LOOP (is_blank);
#endif
else if (strcmp (class_name, "graph") == 0)
BUILD_CHARCLASS_LOOP (isgraph);
else if (strcmp (class_name, "punct") == 0)
BUILD_CHARCLASS_LOOP (ispunct);
else if (strcmp (class_name, "xdigit") == 0)
BUILD_CHARCLASS_LOOP (isxdigit);
else
return REG_ECTYPE;
return REG_NOERROR;
}
static bin_tree_t *
build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
const char *class_name,
const char *extra, int non_match,
reg_errcode_t *err)
{
re_bitset_ptr_t sbcset;
#ifdef RE_ENABLE_I18N
re_charset_t *mbcset;
int alloc = 0;
#endif
reg_errcode_t ret;
re_token_t br_token;
bin_tree_t *tree;
sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
#ifdef RE_ENABLE_I18N
mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
#endif
#ifdef RE_ENABLE_I18N
if (BE (sbcset == NULL || mbcset == NULL, 0))
#else
if (BE (sbcset == NULL, 0))
#endif
{
*err = REG_ESPACE;
return NULL;
}
if (non_match)
{
#ifdef RE_ENABLE_I18N
mbcset->non_match = 1;
#endif
}
ret = build_charclass (trans, sbcset,
#ifdef RE_ENABLE_I18N
mbcset, &alloc,
#endif
class_name, 0);
if (BE (ret != REG_NOERROR, 0))
{
re_free (sbcset);
#ifdef RE_ENABLE_I18N
free_charset (mbcset);
#endif
*err = ret;
return NULL;
}
for (; *extra; extra++)
bitset_set (sbcset, *extra);
if (non_match)
bitset_not (sbcset);
#ifdef RE_ENABLE_I18N
if (dfa->mb_cur_max > 1)
bitset_mask (sbcset, dfa->sb_char);
#endif
br_token.type = SIMPLE_BRACKET;
br_token.opr.sbcset = sbcset;
tree = create_token_tree (dfa, NULL, NULL, &br_token);
if (BE (tree == NULL, 0))
goto build_word_op_espace;
#ifdef RE_ENABLE_I18N
if (dfa->mb_cur_max > 1)
{
bin_tree_t *mbc_tree;
br_token.type = COMPLEX_BRACKET;
br_token.opr.mbcset = mbcset;
dfa->has_mb_node = 1;
mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
if (BE (mbc_tree == NULL, 0))
goto build_word_op_espace;
tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
if (BE (mbc_tree != NULL, 1))
return tree;
}
else
{
free_charset (mbcset);
return tree;
}
#else
return tree;
#endif
build_word_op_espace:
re_free (sbcset);
#ifdef RE_ENABLE_I18N
free_charset (mbcset);
#endif
*err = REG_ESPACE;
return NULL;
}
static int
fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
{
int num = -1;
unsigned char c;
while (1)
{
fetch_token (token, input, syntax);
c = token->opr.c;
if (BE (token->type == END_OF_RE, 0))
return -2;
if (token->type == OP_CLOSE_DUP_NUM || c == ',')
break;
num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
num = (num > RE_DUP_MAX) ? -2 : num;
}
return num;
}
#ifdef RE_ENABLE_I18N
static void
free_charset (re_charset_t *cset)
{
re_free (cset->mbchars);
# ifdef _LIBC
re_free (cset->coll_syms);
re_free (cset->equiv_classes);
re_free (cset->range_starts);
re_free (cset->range_ends);
# endif
re_free (cset->char_classes);
re_free (cset);
}
#endif
static bin_tree_t *
create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
re_token_type_t type)
{
re_token_t t;
t.type = type;
return create_token_tree (dfa, left, right, &t);
}
static bin_tree_t *
create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
const re_token_t *token)
{
bin_tree_t *tree;
if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
{
bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
if (storage == NULL)
return NULL;
storage->next = dfa->str_tree_storage;
dfa->str_tree_storage = storage;
dfa->str_tree_storage_idx = 0;
}
tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
tree->parent = NULL;
tree->left = left;
tree->right = right;
tree->token = *token;
tree->token.duplicated = 0;
tree->token.opt_subexp = 0;
tree->first = NULL;
tree->next = NULL;
tree->node_idx = -1;
if (left != NULL)
left->parent = tree;
if (right != NULL)
right->parent = tree;
return tree;
}
static reg_errcode_t
mark_opt_subexp (void *extra, bin_tree_t *node)
{
int idx = (int) (intptr_t) extra;
if (node->token.type == SUBEXP && node->token.opr.idx == idx)
node->token.opt_subexp = 1;
return REG_NOERROR;
}
static void
free_token (re_token_t *node)
{
#ifdef RE_ENABLE_I18N
if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
free_charset (node->opr.mbcset);
else
#endif
if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
re_free (node->opr.sbcset);
}
static reg_errcode_t
free_tree (void *extra, bin_tree_t *node)
{
free_token (&node->token);
return REG_NOERROR;
}
static bin_tree_t *
duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
{
const bin_tree_t *node;
bin_tree_t *dup_root;
bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
for (node = root; ; )
{
*p_new = create_token_tree (dfa, NULL, NULL, &node->token);
if (*p_new == NULL)
return NULL;
(*p_new)->parent = dup_node;
(*p_new)->token.duplicated = 1;
dup_node = *p_new;
if (node->left)
{
node = node->left;
p_new = &dup_node->left;
}
else
{
const bin_tree_t *prev = NULL;
while (node->right == prev || node->right == NULL)
{
prev = node;
node = node->parent;
dup_node = dup_node->parent;
if (!node)
return dup_root;
}
node = node->right;
p_new = &dup_node->right;
}
}
}