xenocara/app/xedit/lisp/re/reo.c
2006-11-25 20:07:29 +00:00

686 lines
16 KiB
C

/*
* Copyright (c) 2002 by The XFree86 Project, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE XFREE86 PROJECT BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
* OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* Except as contained in this notice, the name of the XFree86 Project shall
* not be used in advertising or otherwise to promote the sale, use or other
* dealings in this Software without prior written authorization from the
* XFree86 Project.
*
* Author: Paulo César Pereira de Andrade
*/
/* $XFree86: xc/programs/xedit/lisp/re/reo.c,v 1.8 2002/09/29 02:55:01 paulo Exp $ */
#include "rep.h"
/*
* This file is a placeholder to add code to analyse and optimize the
* intermediate data structure generated in rep.c.
* Character ranges are optimized while being generated.
*/
/*
* Types
*/
typedef struct _orec_inf {
rec_alt *alt; /* Main alternatives list */
rec_grp *grp; /* Current group pointer */
int flags;
int ecode;
} orec_inf;
/*
* Prototypes
*/
static int orec_alt(orec_inf*, rec_alt*);
static int orec_pat(orec_inf*, rec_pat*);
static int orec_grp(orec_inf*, rec_grp*);
static int orec_pat_bad_rpt(orec_inf*, rec_pat*);
static int orec_pat_bad_forward_rpt(orec_inf*, rec_pat*);
static int orec_pat_rng(orec_inf*, rec_pat*);
static int orec_pat_cse(orec_inf*, rec_pat*);
static int orec_pat_cse_can(orec_inf*, rec_pat*);
static int orec_str_list(orec_inf*, rec_alt*, int, int);
/*
* Initialization
*/
extern unsigned char re__alnum[256];
extern unsigned char re__odigit[256];
extern unsigned char re__ddigit[256];
extern unsigned char re__xdigit[256];
extern unsigned char re__control[256];
/*
* Implementation
*/
int
orec_comp(rec_alt *alt, int flags)
{
orec_inf inf;
inf.alt = alt;
inf.grp = NULL;
inf.flags = flags;
inf.ecode = 0;
orec_alt(&inf, alt);
return (inf.ecode);
}
void
orec_free_stl(rec_stl *stl)
{
int i;
for (i = 0; i < stl->nstrs; i++) {
if (stl->lens[i] > 2)
free(stl->strs[i]);
}
free(stl->lens);
free(stl->strs);
free(stl);
}
static int
orec_alt(orec_inf *inf, rec_alt *alt)
{
if (alt) {
rec_alt *ptr = alt;
int ret, count = 0, str = 1, cstr = 1, lits = 0, clits = 0;
/* Check if can build a string list */
if (ptr->next) {
/* If more than one alternative */
while (ptr && (str || cstr)) {
if (ptr->pat == NULL || ptr->pat->rep != NULL) {
cstr = str = 0;
break;
}
if ((inf->flags & RE_ICASE)) {
if (!(ret = orec_pat_cse_can(inf, ptr->pat))) {
cstr = str = 0;
break;
}
if (ret == 1)
++lits;
else if (ret == 2)
++clits;
}
else if (ptr->pat->next == NULL) {
if (ptr->pat->type != Rep_String) {
if (ptr->pat->type != Rep_Literal) {
str = 0;
if (ptr->pat->type != Rep_CaseString) {
if (ptr->pat->type != Rep_CaseLiteral)
cstr = 0;
else
++clits;
}
else if (strlen((char*)ptr->pat->data.str) >= 255)
str = cstr = 0;
}
else
++lits;
}
else if (strlen((char*)ptr->pat->data.str) >= 255)
str = cstr = 0;
}
else {
str = cstr = 0;
break;
}
if (++count >= 255)
str = cstr = 0;
ptr = ptr->next;
}
if (str || cstr) {
if (inf->flags & RE_ICASE) {
for (ptr = alt; ptr; ptr = ptr->next) {
if (orec_pat_cse(inf, ptr->pat))
return (inf->ecode);
}
str = 0;
}
return (orec_str_list(inf, alt, str, count));
}
}
else if (alt == inf->alt && alt->pat && alt->pat->rep == NULL) {
/* If the toplevel single alternative */
switch (alt->pat->type) {
/* One of these will always be true for RE_NOSPEC,
* but can also be optimized for simple patterns */
case Rep_Literal:
alt->pat->type = Rep_SearchLiteral;
break;
case Rep_CaseLiteral:
alt->pat->type = Rep_SearchCaseLiteral;
break;
case Rep_String:
alt->pat->type = Rep_SearchString;
break;
case Rep_CaseString:
alt->pat->type = Rep_SearchCaseString;
break;
default:
break;
}
}
while (alt) {
orec_pat(inf, alt->pat);
alt = alt->next;
}
}
return (inf->ecode);
}
static int
orec_pat(orec_inf *inf, rec_pat *pat)
{
rec_pat *next;
while (pat) {
switch (pat->type) {
case Rep_AnyAnyTimes:
if (pat->next == NULL) {
rec_grp *grp = inf->grp;
next = NULL;
while (grp) {
next = grp->parent->next;
/* Cannot check if is .*$ as the input
* may be a substring */
if (next)
break;
grp = grp->pgrp;
}
if (next == NULL) {
/* <re>.* */
pat->type = Rep_AnyEatAnyTimes;
grp = inf->grp;
while (grp) {
--grp->comp;
next = grp->parent->next;
if (next)
break;
grp = grp->pgrp;
}
}
else if (orec_pat_bad_rpt(inf, next))
return (inf->ecode);
}
else if (orec_pat_bad_rpt(inf, pat->next))
return (inf->ecode);
break;
case Rep_AnyMaybe:
if (pat->next == NULL) {
rec_grp *grp = inf->grp;
next = NULL;
while (grp) {
next = grp->parent->next;
if (next)
break;
grp = grp->pgrp;
}
if (next == NULL) {
/* <re>.? */
pat->type = Rep_AnyEatMaybe;
grp = inf->grp;
while (grp) {
--grp->comp;
next = grp->parent->next;
if (next)
break;
grp = grp->pgrp;
}
}
else if (orec_pat_bad_rpt(inf, next))
return (inf->ecode);
}
else if (orec_pat_bad_rpt(inf, pat->next))
return (inf->ecode);
break;
case Rep_AnyAtLeast:
if (pat->next == NULL) {
rec_grp *grp = inf->grp;
next = NULL;
while (grp) {
next = grp->parent->next;
if (next)
break;
grp = grp->pgrp;
}
if (next == NULL) {
/* <re>.+ */
pat->type = Rep_AnyEatAtLeast;
grp = inf->grp;
while (grp) {
--grp->comp;
next = grp->parent->next;
if (next)
break;
grp = grp->pgrp;
}
}
else if (orec_pat_bad_rpt(inf, next))
return (inf->ecode);
}
else if (orec_pat_bad_rpt(inf, pat->next))
return (inf->ecode);
break;
case Rep_Range:
case Rep_RangeNot:
orec_pat_rng(inf, pat);
break;
case Rep_Group:
orec_grp(inf, pat->data.grp);
break;
default:
break;
}
pat = pat->next;
}
return (inf->ecode);
}
static int
orec_pat_bad_rpt(orec_inf *inf, rec_pat *pat)
{
switch (pat->type) {
/* Not really an error, but aren't supported by the library.
* Includes: .*.*, .+<re>? .*<re>*, (.*)(<re>*), etc.
*/
/* Not a repetition, but mathes anything... */
case Rep_Any:
/* Zero length matches */
case Rep_Eol:
if (!(inf->flags & RE_NEWLINE))
break;
case Rep_Bol:
case Rep_Bow:
case Rep_Eow:
/* Repetitions */
case Rep_AnyAnyTimes:
case Rep_AnyMaybe:
case Rep_AnyAtLeast:
inf->ecode = RE_BADRPT;
break;
/* Check if the first group element is a complex pattern */
case Rep_Group:
if (pat->rep == NULL) {
if (pat->data.grp->alt) {
for (pat = pat->data.grp->alt->pat; pat; pat = pat->next) {
if (orec_pat_bad_rpt(inf, pat))
break;
}
}
break;
}
/*FALLTHROUGH*/
default:
if (pat->rep)
inf->ecode = RE_BADRPT;
break;
}
if (!inf->ecode && pat && pat->next)
orec_pat_bad_forward_rpt(inf, pat->next);
return (inf->ecode);
}
static int
orec_pat_bad_forward_rpt(orec_inf *inf, rec_pat *pat)
{
if (pat->rep) {
switch (pat->rep->type) {
case Rer_MinMax:
if (pat->rep->mine > 0)
break;
case Rer_AnyTimes:
case Rer_Maybe:
case Rer_Max:
inf->ecode = RE_BADRPT;
default:
break;
}
}
else if (pat->type == Rep_Group &&
pat->data.grp->alt &&
pat->data.grp->alt->pat)
orec_pat_bad_forward_rpt(inf, pat->data.grp->alt->pat);
return (inf->ecode);
}
static int
orec_grp(orec_inf *inf, rec_grp *grp)
{
rec_grp *prev = inf->grp;
inf->grp = grp;
orec_alt(inf, grp->alt);
/* Could also just say: inf->grp = grp->gparent */
inf->grp = prev;
return (inf->ecode);
}
static int
orec_pat_rng(orec_inf *inf, rec_pat *pat)
{
int i, j[2], count;
rec_pat_t type = pat->type;
unsigned char *range = pat->data.rng->range;
for (i = count = j[0] = j[1] = 0; i < 256; i++) {
if (range[i]) {
if (count == 2) {
++count;
break;
}
j[count++] = i;
}
}
if (count == 1 ||
(count == 2 &&
((islower(j[0]) && toupper(j[0]) == j[1]) ||
(isupper(j[0]) && tolower(j[0]) == j[1])))) {
free(pat->data.rng);
if (count == 1) {
pat->data.chr = j[0];
pat->type = type == Rep_Range ? Rep_Literal : Rep_LiteralNot;
}
else {
pat->data.cse.upper = j[0];
pat->data.cse.lower = j[1];
pat->type = type == Rep_Range ? Rep_CaseLiteral : Rep_CaseLiteralNot;
}
}
else {
if (memcmp(re__alnum, range, 256) == 0)
type = type == Rep_Range ? Rep_Alnum : Rep_AlnumNot;
else if (memcmp(re__odigit, range, 256) == 0)
type = type == Rep_Range ? Rep_Odigit : Rep_OdigitNot;
else if (memcmp(re__ddigit, range, 256) == 0)
type = type == Rep_Range ? Rep_Digit : Rep_DigitNot;
else if (memcmp(re__xdigit, range, 256) == 0)
type = type == Rep_Range ? Rep_Xdigit : Rep_XdigitNot;
else if (memcmp(re__control, range, 256) == 0)
type = type == Rep_Range ? Rep_Control : Rep_ControlNot;
if (type != pat->type) {
free(pat->data.rng);
pat->type = type;
}
}
return (inf->ecode);
}
/* Join patterns if required, will only fail on memory allocation failure:
*/
static int
orec_pat_cse(orec_inf *inf, rec_pat *pat)
{
rec_pat_t type;
int i, len, length;
rec_pat *ptr, *next;
unsigned char *str, *tofree;
if (pat->next == NULL && pat->type == Rep_CaseString)
return (inf->ecode);
type = Rep_CaseString;
/* First calculate how many bytes will be required */
for (ptr = pat, length = 1; ptr; ptr = ptr->next) {
switch (ptr->type) {
case Rep_Literal:
length += 2;
break;
case Rep_String:
length += strlen((char*)ptr->data.str) << 1;
break;
case Rep_CaseLiteral:
length += 2;
break;
case Rep_CaseString:
length += strlen((char*)ptr->data.str);
break;
default:
break;
}
}
if ((str = malloc(length)) == NULL)
return (inf->ecode = RE_ESPACE);
for (ptr = pat, length = 0; ptr; ptr = next) {
tofree = NULL;
next = ptr->next;
switch (ptr->type) {
case Rep_Literal:
str[length++] = ptr->data.chr;
str[length++] = ptr->data.chr;
break;
case Rep_String:
tofree = ptr->data.str;
len = strlen((char*)tofree);
for (i = 0; i < len; i++) {
str[length++] = tofree[i];
str[length++] = tofree[i];
}
break;
case Rep_CaseLiteral:
str[length++] = ptr->data.cse.lower;
str[length++] = ptr->data.cse.upper;
break;
case Rep_CaseString:
tofree = ptr->data.str;
len = strlen((char*)tofree);
memcpy(str + length, tofree, len);
length += len;
break;
default:
break;
}
if (tofree)
free(tofree);
if (ptr != pat)
free(ptr);
}
str[length] = '\0';
pat->type = type;
pat->data.str = str;
pat->next = NULL;
return (inf->ecode);
}
/* Return 0 if the patterns in the list cannot be merged, 1 if will
* be a simple string, 2 if a case string.
* This is useful when building an alternative list that is composed
* only of strings, but the regex is case insensitive, in wich case
* the first pass may have splited some patterns, but if it is a member
* of an alternatives list, the cost of using a string list is smaller */
static int
orec_pat_cse_can(orec_inf *inf, rec_pat *pat)
{
int ret;
if (pat == NULL)
return (0);
for (ret = 1; pat; pat = pat->next) {
if (pat->rep)
return (0);
switch (pat->type) {
case Rep_Literal:
case Rep_String:
break;
case Rep_CaseLiteral:
case Rep_CaseString:
ret = 2;
break;
default:
return (0);
}
}
return (ret);
}
/* XXX If everything is a (case) byte, the pattern should be
* [abcde] instead of a|b|c|d|e (or [aAbBcCdDeE] instead of aA|bB|cC|dD|eE)
* as a string list works fine, but as a character range
* should be faster, and maybe could be converted here. But not
* very important, if performance is required, it should have already
* been done in the pattern.
*/
static int
orec_str_list(orec_inf *inf, rec_alt *alt, int str, int count)
{
rec_stl *stl;
rec_pat *pat;
rec_alt *ptr, *next;
int i, j, tlen, len, is;
if ((stl = calloc(1, sizeof(rec_stl))) == NULL)
return (inf->ecode = RE_ESPACE);
if ((stl->lens = malloc(sizeof(unsigned char) * count)) == NULL) {
free(stl);
return (inf->ecode = RE_ESPACE);
}
if ((stl->strs = malloc(sizeof(char*) * count)) == NULL) {
free(stl->lens);
free(stl);
return (inf->ecode = RE_ESPACE);
}
if ((pat = calloc(1, sizeof(rec_pat))) == NULL) {
free(stl->strs);
free(stl->lens);
free(stl);
return (inf->ecode = RE_ESPACE);
}
pat->data.stl = stl;
pat->type = Rep_StringList;
stl->type = str ? Resl_StringList : Resl_CaseStringList;
for (i = tlen = 0, ptr = alt; i < count; i++) {
next = ptr->next;
switch (ptr->pat->type) {
case Rep_Literal:
is = len = 1;
break;
case Rep_CaseLiteral:
is = len = 2;
break;
default:
is = 0;
len = strlen((char*)ptr->pat->data.str);
break;
}
tlen += len;
stl->lens[i] = len;
if (!is) {
if (len > 2)
stl->strs[i] = ptr->pat->data.str;
else {
if (len == 1)
stl->strs[i] = (void*)(long)(ptr->pat->data.str[0]);
else
stl->strs[i] = (void*)(long)
(ptr->pat->data.str[0] |
((int)ptr->pat->data.str[1] << 8));
free(ptr->pat->data.str);
}
}
else {
if (is == 1)
stl->strs[i] = (void*)(long)ptr->pat->data.chr;
else
stl->strs[i] = (void*)(long)
(ptr->pat->data.cse.lower |
(ptr->pat->data.cse.upper << 8));
}
free(ptr->pat);
if (i)
free(ptr);
ptr = next;
}
stl->tlen = tlen;
stl->nstrs = count;
alt->pat = pat;
alt->next = NULL;
{
int li, lj;
unsigned char ci, cj, *str;
/* Don't need a stable sort, there shouldn't be duplicated strings,
* but don't check for it either. Only need to make sure that all
* strings that start with the same byte are together */
for (i = 0; i < count; i++) {
li = stl->lens[i];
ci = li > 2 ? stl->strs[i][0] : (long)stl->strs[i] & 0xff;
for (j = i + 1; j < count; j++) {
lj = stl->lens[j];
cj = lj > 2 ? stl->strs[j][0] : (long)stl->strs[j] & 0xff;
if ((count >= LARGE_STL_COUNT && cj < ci) ||
(cj == ci && lj > li)) {
/* If both strings start with the same byte,
* put the longer first */
str = stl->strs[j];
stl->strs[j] = stl->strs[i];
stl->strs[i] = str;
stl->lens[j] = li;
stl->lens[i] = lj;
li ^= lj; lj ^= li; li ^= lj;
ci ^= cj; cj ^= ci; ci ^= cj;
}
}
}
}
return (inf->ecode);
}